This is my first full project. I have done plenty of small projects before, but this time the goal is to crawl book data from three major sites, Dangdang, Suning, and Douban Books, and store it locally.
With the project decided, the next step is to analyze it and get to work. Since the target is book data across three sites, the volume of data will clearly be large, so Scrapy is the framework of choice: first build a plain Scrapy project, then later optimize it into a Redis-based distributed crawler. The steps are as follows.
Inside the project, create three spiders, one each for Dangdang, Suning, and Douban.
```bash
cd book_project
scrapy genspider -t basic dangdang_spider dangdang.com
scrapy genspider -t basic suning_spider suning.com
scrapy genspider -t basic douban_spider douban.com
```
Building dangdang_spider:
```python
import json

import scrapy

from book_project.items import dangdang_spiderItem


class DangdangSpiderSpider(scrapy.Spider):
    name = 'dangdang_spider'
    allowed_domains = ['dangdang.com']
    start_urls = ['http://e.dangdang.com/list-DZS-dd_sale-0-1.html']
    url = 'http://e.dangdang.com/media/api.go?action=mediaCategoryLeaf&start={}&end={}&category={}&dimension=dd_sale'

    def parse(self, response):
        # Extract the category list from the response
        item_list = response.xpath("//ul/a/li")
        for i in item_list:
            item = {}
            item['data_type'] = i.xpath('./@data-type').get()
            item['dd_name'] = i.xpath('./@dd_name').get()
            url = self.url.format(0, 20, item['data_type'])  # request the first 20 records
            yield scrapy.Request(url, callback=self.parse_item)
            # Comment out the break to crawl the whole site
            break

    def parse_item(self, response):
        # Extract book data from the JSON API response
        dangdang_spideritem = dangdang_spiderItem()
        responses = json.loads(response.text)
        # total = responses['data']['total']  # total number of books in this sub-category
        saleList = responses['data']['saleList']
        for book in saleList:
            # Extract the fields of each book
            dangdang_spideritem['title'] = book['mediaList'][0]['title']
            dangdang_spideritem['authorpenname'] = book['mediaList'][0]['authorPenname']
            dangdang_spideritem['price'] = book['mediaList'][0]['price'] * 0.01
            dangdang_spideritem['categorys'] = book['mediaList'][0]['categorys']
            yield dangdang_spideritem
        if responses['status']['code'] == 0:
            # Schedule the remaining pages
            for i in range(21, responses['data']['total'], 20):
                url = self.url.format(i, i + 19, responses['data']['code'])
                yield scrapy.Request(url, callback=self.parse_item)
```
That is the spider code. Because the data comes from an API, the "next page" has to be handled explicitly; my skills here are still limited, so I only added a simple check on the returned data. This is just the first version and it will keep being optimized.
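One possible refinement, not in the current code: since every paginated response re-runs the scheduling loop, the same page URLs get generated over and over, and only Scrapy's duplicate filter keeps that in check. A minimal sketch of parse_item's pagination branch that schedules follow-up pages only from the first page (start=0) of each category, assuming the same url template and JSON layout as above:

```python
    def parse_item(self, response):
        responses = json.loads(response.text)
        # ... extract and yield the book items exactly as above ...
        # Only the very first page of a category (start=0) schedules the
        # remaining pages, so each page URL is generated exactly once
        # instead of once per response.
        if responses['status']['code'] == 0 and 'start=0&' in response.url:
            for i in range(21, responses['data']['total'], 20):
                url = self.url.format(i, i + 19, responses['data']['code'])
                yield scrapy.Request(url, callback=self.parse_item)
```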
Building douban_spider:
```python
import scrapy

from book_project.items import douban_spiderItem


class DoubanSpiderSpider(scrapy.Spider):
    name = 'douban_spider'
    allowed_domains = ['book.douban.com']
    start_urls = ['https://book.douban.com/tag/']

    def parse(self, response):
        # Extract the links of the tag (category) pages
        book_tags = response.xpath(
            "//div[@id='content']//b/../a/@href").extract()
        for book in book_tags:
            # print(response.urljoin(book))
            yield scrapy.Request(response.urljoin(book), callback=self.parse_item)
            # Comment out the break to crawl the whole site
            break

    def parse_item(self, response):
        douban_spideritem = douban_spiderItem()
        items = response.xpath('//div[@id="content"]')[0]
        book_tag = items.xpath('.//h1/text()').get()
        li_list = items.xpath('//div[@id="subject_list"]/ul//li')
        for li in li_list:
            # Walk through the book list on this page
            douban_spideritem['title'] = li.xpath('.//h2/a/text()').get()
            douban_spideritem['author_price'] = li.xpath('.//div[@class="pub"]/text()').get()
            douban_spideritem['book_tag'] = book_tag
            # Push each book into the pipeline for processing
            yield douban_spideritem
        # Handle the next-page link
        next_href = response.xpath('//span[@class="next"]/a/@href')
        if len(next_href) > 0:
            # There is another page
            next_href = next_href.get()
            yield scrapy.Request(response.urljoin(next_href), callback=self.parse_item)
```
This is the Douban part. The approach and the way information is extracted are the same as before; the only real difference is how the pipeline is set up, which is covered further down.
Building suning_spider:
```python
import json
import re

import scrapy

from book_project.items import suning_spiderItem


class SuningSpiderSpider(scrapy.Spider):
    name = 'suning_spider'
    allowed_domains = ['suning.com']
    start_urls = ['http://lib.suning.com/api/jsonp/cb/KfQ-cmsJsonpApi.jsonp']
    url = 'https://search.suning.com/emall/mobile/wap/clientSearch.jsonp?keyword={}&channel=99999972&cp={}&ps=10&set=5&ct=-1&v=99999999'
    tag_name = ''

    def parse(self, response):
        # Extract the category names from the JSONP response
        pattern = re.compile(r'\(.*\)')
        # Pull the payload out of the JSONP wrapper with a regex
        response_item = pattern.findall(response.body_as_unicode())
        response_eval = eval(response_item[0])  # the parsed payload is a list
        # Find the dict that holds the category information
        for d in response_eval:
            if 'fl8p_bt' in d:
                response_dict_list = d['fl8p_bt']['nodes'][0]['tag']
                for url_dict in response_dict_list:
                    url_name = url_dict['linkUrl']
                    pattern = re.compile(r'https://m\.suning\.com/search/(.*?)/&')
                    self.tag_name = pattern.findall(url_name)[0]
                    # The category name is extracted; build the search URL
                    # (ten results per page)
                    stat_url = self.url.format(self.tag_name, 0)
                    # Hand off to parse_item for data extraction
                    yield scrapy.Request(stat_url, callback=self.parse_item)
                    # Comment out the break to crawl the whole site
                    break

    def parse_item(self, response):
        response_json = json.loads(response.text)
        # Check whether any data came back
        if len(response_json['errorCode']) == 0:
            # Extract the data and request the following pages
            suning_spideritem = suning_spiderItem()
            book_list = response_json['goods']
            for book in book_list:
                # Extract the fields of each book
                suning_spideritem['catentdesc'] = book['catentdesc']
                suning_spideritem['author'] = book['author'] if len(book['author']) > 0 else None
                suning_spideritem['price'] = book['price']
                suning_spideritem['tag_type'] = self.tag_name
                yield suning_spideritem
            # Build the next-page URLs as a generator. The 601 comes from
            # analysing the site: Suning returns at most about 6,000 records
            # per category, so anything around 600 pages or more is enough.
            for x in range(1, 601):
                next_url = self.url.format(self.tag_name, x)
                yield scrapy.Request(next_url, callback=self.parse_item)
        else:
            pass
```
The Suning spider is the trickiest of the three, and I will update the code later to crawl the desktop site as well. Whether you hit the desktop or the mobile endpoints, Suning returns at most 6,000 records per category, and its anti-scraping measures are fairly thorough; response times during the crawl are also a problem. These issues will be dealt with in later updates.
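Until those fixes land, simply throttling the crawl already goes a long way against both the blocking and the slow responses. A rough settings.py sketch; these are standard Scrapy settings, but the concrete values are placeholders rather than anything tuned for Suning:

```python
# settings.py (excerpt): slow the crawl down so the site is less likely to
# cut us off; the numbers below are placeholder values, not tuned for Suning.
DOWNLOAD_DELAY = 1                  # pause between requests to the same site
CONCURRENT_REQUESTS_PER_DOMAIN = 4  # keep per-domain concurrency low
AUTOTHROTTLE_ENABLED = True         # let Scrapy adapt the delay to response times
AUTOTHROTTLE_START_DELAY = 1
RETRY_TIMES = 3                     # retry slow or failed responses a few times
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'  # a browser-like UA
```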
Pipelines and items:
```python
import re

import pymongo
import pymysql

from book_project.items import (dangdang_spiderItem, douban_spiderItem,
                                suning_spiderItem)


class BookProjectPipeline(object):
    def process_item(self, item, spider):
        return item


class suning_spiderPipeline(object):
    """Processes data from the Suning spider."""

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(
            host=spider.settings.get('MONGO_HOST'),
            port=spider.settings.get('MONGO_PORT'),
        )
        # If the database requires authentication:
        # self.client.admin.authenticate(
        #     settings['MONGO_USER'],
        #     settings['MONGO_PSW']
        # )
        self.db = self.client[spider.settings.get('MONGO_DB')]  # database handle
        self.coll = self.db['suning_spider_book']  # collection handle

    def process_item(self, item, spider):
        """Store the item if it comes from the Suning spider, otherwise pass it on."""
        if spider.name == 'suning_spider' and isinstance(item, suning_spiderItem):
            # Clean up the incoming item
            book = {}
            book['title'] = item['catentdesc']
            book['author'] = item['author']
            book['price'] = item['price']
            book['type'] = item['tag_type']
            self.coll.insert(book)  # insert one record (a dict) into the collection
            # print(item)
        return item


class douban_spiderPipeline(object):
    """Processes data from the Douban spider."""

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(
            host=spider.settings.get('MONGO_HOST'),
            port=spider.settings.get('MONGO_PORT'),
        )
        # If the database requires authentication:
        # self.client.admin.authenticate(
        #     settings['MONGO_USER'],
        #     settings['MONGO_PSW']
        # )
        self.db = self.client[spider.settings.get('MONGO_DB')]  # database handle
        # collection handle
        # self.coll = self.db[spider.settings.get('MONGO_COLL')]
        self.coll = self.db['douban_spider_book']

    def process_item(self, item, spider):
        """Store the item if it comes from the Douban spider, otherwise pass it on."""
        if spider.name == 'douban_spider' and isinstance(item, douban_spiderItem):
            # Clean up the incoming item
            book = {}
            book['title'] = item['title'].split()[0]
            # This part still needs more work
            pattern = re.compile(r'(.)|\[.\]|\(.\)')
            author_price = [x.split() for x in item['author_price'].split("/")]
            author = author_price[0]
            if len(author) > 0:
                if len(pattern.findall(author[0])) > 0:
                    if author[0] == pattern.findall(author[0])[0]:
                        author = author[0] + author[1]
                    else:
                        author = author[0]
            else:
                author = ''
            book['author'] = author
            book['price'] = author_price[-1][0]
            book['tag'] = item['book_tag'].split(":")[1]
            self.coll.insert(book)  # insert one record (a dict) into the collection
            # print(item)
        return item


class dangdang_spiderPipeline(object):
    """Processes data from the Dangdang spider."""

    def open_spider(self, spider):
        db = spider.settings.get('MYSQL_DB_NAME')
        host = spider.settings.get('MYSQL_HOST')
        port = spider.settings.get('MYSQL_PORT')
        user = spider.settings.get('MYSQL_USER')
        passwd = spider.settings.get('MYSQL_PASSWORD')
        self.db_conn = pymysql.connect(
            host=host, port=port, db=db, user=user, passwd=passwd,
            charset='utf8',
        )
        self.db_cur = self.db_conn.cursor()

    def process_item(self, item, spider):
        # Check the spider and the item type
        if spider.name == 'dangdang_spider' and isinstance(item, dangdang_spiderItem):
            self.insert_db(item)
        return item

    def insert_db(self, item):
        # Insert one record
        values = (
            item['title'],
            item['authorpenname'],
            item['price'],
            item['categorys'],
        )
        sql = 'INSERT INTO dangdang_books(title,authorpenname,price,categorys) VALUES(%s,%s,%s,%s)'
        self.db_cur.execute(sql, values)

    def close_spider(self, spider):
        # Commit and close the database connection
        self.db_conn.commit()
        self.db_conn.close()
```
Three pipelines are set up here, one responsible for each spider's data. Keeping them separate makes it easier to improve the whole project later on; after all, the project is not finished and this is only the initial build. The sketch below shows roughly how they are wired up in settings.py.
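For reference, a sketch of the settings.py side that the pipelines above expect: the class paths assume the default book_project package layout, and the hosts, ports and credentials are placeholders to replace with real values:

```python
# settings.py (excerpt): enable the three pipelines and define the database
# settings they read via spider.settings.get(); all values are placeholders.
ITEM_PIPELINES = {
    'book_project.pipelines.dangdang_spiderPipeline': 300,
    'book_project.pipelines.douban_spiderPipeline': 301,
    'book_project.pipelines.suning_spiderPipeline': 302,
}

# MongoDB (used by the Douban and Suning pipelines)
MONGO_HOST = 'localhost'
MONGO_PORT = 27017
MONGO_DB = 'book_project'

# MySQL (used by the Dangdang pipeline)
MYSQL_HOST = 'localhost'
MYSQL_PORT = 3306
MYSQL_DB_NAME = 'book_project'
MYSQL_USER = 'root'
MYSQL_PASSWORD = ''
```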
```python
import scrapy


class BookProjectItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class dangdang_spiderItem(scrapy.Item):
    title = scrapy.Field()
    authorpenname = scrapy.Field()
    price = scrapy.Field()
    categorys = scrapy.Field()


class douban_spiderItem(scrapy.Item):
    title = scrapy.Field()
    author_price = scrapy.Field()
    book_tag = scrapy.Field()


class suning_spiderItem(scrapy.Item):
    catentdesc = scrapy.Field()
    author = scrapy.Field()
    price = scrapy.Field()
    tag_type = scrapy.Field()
```
For the items, each spider likewise gets its own Item class, so the data definitions stay easy to change as development continues.
That is basically all there is to the implementation.
Finally, version 1.0 of the book project comes to a close, and the past few days of hard thinking have paid off. Overall the project skeleton is in place; what remains is detail work, including proper analysis and processing of the data and improving the crawl speed, most likely by deploying the project in a distributed way, which will also mean refining the spiders. Of the three, the Douban spider felt the smoothest to write, since I have crawled Douban back and forth more times than I can count, while the Suning spider still needs more effort to crack. My knowledge only goes so far; more work to come. That wraps up this summary.
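One last pointer on the distributed deployment mentioned above: with scrapy-redis, most of the change is a matter of settings. A minimal sketch, assuming the scrapy-redis package is installed and a local Redis server is available (none of this is in the current project yet):

```python
# settings.py (excerpt): hand scheduling and de-duplication over to Redis so
# several workers can share one request queue. Assumes scrapy-redis is
# installed and a Redis server is reachable at the URL below.
SCHEDULER = 'scrapy_redis.scheduler.Scheduler'
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
SCHEDULER_PERSIST = True              # keep the queue between runs
REDIS_URL = 'redis://127.0.0.1:6379'  # placeholder Redis address
```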