1. Create the Scrapy project
scrapy startproject Dushu
2. Enter the project directory and create a spider with the genspider command
scrapy genspider dushu dushu.com
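Assuming both commands succeed, the generated project skeleton should look roughly like this (a sketch of Scrapy's default layout; apart from dushu.py under spiders/, only the files edited in the following steps need to be touched):
Dushu/
    scrapy.cfg
    Dushu/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            dushu.py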
3. Define the data to scrape (edit the items.py file)
- import scrapy
-
- class DushuItem(scrapy.Item):
-     # book ID
-     book_id = scrapy.Field()
-     # link to the book's detail page
-     book_url = scrapy.Field()
-     # book title
-     book_name = scrapy.Field()
-     # author
-     book_author = scrapy.Field()
-     # summary
-     book_info = scrapy.Field()
-     # cover image
-     cover_img_url = scrapy.Field()
-     # price
-     book_price = scrapy.Field()
-     # tags
-     book_tag = scrapy.Field()
-     # ISBN (sometimes the value cannot be obtained)
-     book_isbn = scrapy.Field()
-     # first-level category name
-     firstTitle = scrapy.Field()
-     # second-level category name
-     secondTitle = scrapy.Field()
-     # third-level category name
-     threeTitle = scrapy.Field()
-     # fourth-level category name
-     fourTitle = scrapy.Field()
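A DushuItem behaves much like a dict: declared fields are read and written with item['field'], and assigning to a field that was not declared above raises a KeyError, which helps catch typos early. A minimal illustration (values are made up):
- item = DushuItem()
- item['book_name'] = 'Example Book'   # fine: book_name is a declared field
- item['price'] = 10                   # KeyError: 'price' was never declared in DushuItem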

4. Write the spider that extracts the item data (under the spiders folder: dushu.py)
- # -*- coding: utf-8 -*-
- # Starting from the first-level category pages, collect every book under the current
- # category; after fetching a book's details, use the breadcrumb navigation bar to
- # work out the book's four-level classification.
- import scrapy
- from ..items import DushuItem
-
- class DushuSpider(scrapy.Spider):
-     name = 'dushu'
-     allowed_domains = ['dushu.com']
-     start_urls = ['https://www.dushu.com/book/']
-     base_url = "https://www.dushu.com"
-
-     def parse(self, response):
-         # collect the links of all first-level categories that need to be crawled
-         firstUrls = response.xpath('//div/div[@class="row"]/div/div/dl/dt/a/@href').extract()
-         for url in firstUrls:
-             yield scrapy.Request(url=self.base_url + url, callback=self.parse_second)
-
-     def parse_second(self, response):
-         # pagination links of a first-level category
-         next_pages = response.xpath('//div/div[@class="pages"]/a[not(@class="disabled")]/@href').extract()
-         for page in next_pages:
-             yield scrapy.Request(url=self.base_url + page, callback=self.parse_book)
-
-     def parse_book(self, response):
-         title = response.xpath('//div/div[@class="row"]/div/div/dl[@class="active"]/dt/text()').get()
-         next_url = response.url
-         next_url_page = next_url.split('_')[-1].split('.')[0]
-         print("About to process [%s], page %s" % (title, next_url_page))
-         # book detail URLs on this listing page
-         all_books_url = response.xpath('//div[@class="bookslist"]/ul/li/div/h3/a/@href').extract()
-         for url in all_books_url:
-             yield scrapy.Request(url=self.base_url + url, callback=self.book_detail)
-
-     def book_detail(self, response):
-         item = DushuItem()
-         # book title
-         book_name = response.xpath('//div/div/div[@class="book-title"]/h1/text()').get() or "N/A"
-         # with open(book_name + ".html", "w+", encoding="utf-8") as f:
-         #     f.write(response.text)
-         # breadcrumb navigation of the book
-         navbar = response.xpath('//div[@class="crumbs"]/a[position()>2]/text()').extract()
-         length = len(navbar)
-         if length == 2:
-             print("Book with a first-level category: %s" % book_name)
-             item['firstTitle'] = navbar[0]
-             item['secondTitle'] = "-"
-             item['threeTitle'] = "-"
-             item['fourTitle'] = "-"
-         elif length == 3:
-             print("Book with a second-level category: %s" % book_name)
-             item['firstTitle'] = navbar[0]
-             item['secondTitle'] = navbar[1]
-             item['threeTitle'] = "-"
-             item['fourTitle'] = "-"
-         elif length == 4:
-             print("Book with a third-level category: %s" % book_name)
-             item['firstTitle'] = navbar[0]
-             item['secondTitle'] = navbar[1]
-             item['threeTitle'] = navbar[2]
-             item['fourTitle'] = "-"
-         elif length == 5:
-             print("Book with a fourth-level category: %s" % book_name)
-             item['firstTitle'] = navbar[0]
-             item['secondTitle'] = navbar[1]
-             item['threeTitle'] = navbar[2]
-             item['fourTitle'] = navbar[3]
-         else:
-             print("Abnormal category for book: %s" % book_name)
-             item['firstTitle'] = "-"
-             item['secondTitle'] = "-"
-             item['threeTitle'] = "-"
-             item['fourTitle'] = "-"
-
-         # author
-         book_author = response.xpath('//div/div/div[@class="book-details"]/div/table//tr[1]/td[2]/text()').get() or "N/A"
-         # tags
-         book_tag = response.xpath('//div/div/div[@class="book-details"]/div/table//tr[4]/td[2]/text()').get() or "N/A"
-         # ISBN
-         book_isbn = response.xpath('//div/div/div[@class="book-details"]/table//tr[1]/td[2]/text()').get() or "N/A"
-         # price
-         book_price = response.xpath('//div/div/div[@class="book-details"]/div/p/span/text()').get() or "N/A"
-         # summary
-         book_info = response.xpath('//div/div/div[@class="book-summary"][1]/div/div/text()').get() or "N/A"
-         # cover image
-         cover_img_url = response.xpath('//div/div/div[@class="book-pic"]/div/img/@src').get() or "N/A"
-         # URL of the book's detail page
-         book_url = response.url
-         # book ID
-         book_id = book_url.split("/")[-2]
-
-         item['book_id'] = book_id
-         item['book_name'] = book_name
-         item['book_author'] = book_author
-         item['book_tag'] = book_tag
-         item['book_isbn'] = book_isbn
-         # strip the leading currency symbol from the price, e.g. "¥35.00" -> "35.00"
-         item['book_price'] = book_price[1:] if book_price != "N/A" else book_price
-         item['book_info'] = book_info.strip()
-         item['cover_img_url'] = "No cover image" if "n200.png" in cover_img_url else cover_img_url
-         item['book_url'] = book_url
-         yield item
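While tweaking the XPath expressions, a single callback can be tested against one page with Scrapy's built-in parse command instead of running the whole crawl (the URL below is only a placeholder for a real book detail page):
scrapy parse --spider=dushu -c book_detail "https://www.dushu.com/book/xxxxxxx/"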

5. Process and save the data in the pipelines file; the results can be written out to files (pipelines.py)
- # -*- coding: utf-8 -*-
-
- import os
- import time
- import json
- import scrapy
- import pymysql
- import pymongo
- import redis
- from openpyxl import Workbook
- from scrapy.pipelines.images import ImagesPipeline
- from scrapy.utils.project import get_project_settings
- from scrapy import Item
- from scrapy.exceptions import DropItem
-
- class MyEncoder(json.JSONEncoder):
-     def default(self, o):
-         if isinstance(o, bytes):
-             return str(o, encoding='utf-8')
-         return json.JSONEncoder.default(self, o)
-
- class ImagePipeline(ImagesPipeline):
-     # directory where downloaded images are stored
-     IMAGES_STORE = get_project_settings().get('IMAGES_STORE')
-     # file names must not contain the following special characters
-     char_list = ['*', '|', ':', '?', '/', '<', '>', '"', '\\']
-
-     def get_media_requests(self, item, info):
-         # the Referer header must be set to the book's URL when downloading
-         # the cover image, otherwise the server returns 403
-         headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:56.0) Gecko/20100101 Firefox/56.0",
-                    "Referer": item['book_url']}
-         # skip books without a cover image
-         cover_img_url = item['cover_img_url']
-         if cover_img_url.startswith('http'):
-             yield scrapy.Request(cover_img_url, headers=headers)
-
-     def item_completed(self, results, item, info):
-         book_name = item['book_name']
-         if not os.path.exists(self.IMAGES_STORE):
-             os.makedirs(self.IMAGES_STORE)
-         # results: [(True, {'url': 'https://a.dushu.com/img/n200.png', 'path': 'full/783a2.jpg', 'checksum': '2792e5'})]
-         image_paths = [x['path'] for ok, x in results if ok]
-         if image_paths:
-             print("Cover image downloaded: %s" % book_name)
-             # replace characters that are illegal in file names
-             for i in self.char_list:
-                 if i in book_name:
-                     print("'%s' contains the special character '%s'; it has been escaped!" % (book_name, i))
-                     book_name = book_name.replace(i, "_")
-             # rename the downloaded file from its hash name to the book title
-             os.rename(self.IMAGES_STORE + '/' + image_paths[0], self.IMAGES_STORE + '/' + book_name + ".jpg")
-         else:
-             print("No cover image, nothing to download: %s" % book_name)
-         return item
-
-     def close_spider(self, spider):
-         print("Image download finished!")
-         # remove the 'full' directory (and the image store itself) if they end up empty
-         path = self.IMAGES_STORE + '/full'
-         if not os.listdir(path):
-             os.rmdir(path)
-         if not os.listdir(self.IMAGES_STORE):
-             os.rmdir(self.IMAGES_STORE)
-
- class XlsxPipeline(object):
-     def __init__(self):
-         self.wb = Workbook()
-         self.ws = self.wb.active
-         self.ws.title = "dushu.com book information"
-         self.ws.append(['book_id', 'first-level category', 'second-level category', 'third-level category',
-                         'fourth-level category', 'title', 'author', 'tags', 'ISBN', 'price (CNY)',
-                         'summary', 'cover image', 'book URL'])
-
-     def process_item(self, item, spider):
-         text = [item['book_id'], item['firstTitle'], item['secondTitle'], item['threeTitle'], item['fourTitle'],
-                 item['book_name'], item['book_author'], item['book_tag'], item['book_isbn'], item['book_price'],
-                 item['book_info'], item['cover_img_url'], item['book_url']]
-         self.ws.append(text)
-         return item
-
-     def close_spider(self, spider):
-         time_file = time.strftime("%Y-%m-%d", time.localtime())
-         self.wb.save(spider.name + time_file + ".xlsx")
-         print("Spreadsheet data processed, thanks for using!")
-
- class MysqlPipeline():
-     @classmethod
-     def from_crawler(cls, crawler):
-         cls.MYSQL_HOST = crawler.settings.get('MYSQL_HOST')
-         cls.MYSQL_PORT = crawler.settings.get('MYSQL_PORT')
-         cls.MYSQL_USER = crawler.settings.get('MYSQL_USER')
-         cls.MYSQL_PASSWD = crawler.settings.get('MYSQL_PASSWD')
-         cls.MYSQL_DBNAME = crawler.settings.get('MYSQL_DBNAME')
-         cls.MYSQL_CHARSET = crawler.settings.get('MYSQL_CHARSET')
-         return cls()
-
-     def open_spider(self, spider):
-         self.db = pymysql.connect(host=self.MYSQL_HOST, port=self.MYSQL_PORT, user=self.MYSQL_USER,
-                                   passwd=self.MYSQL_PASSWD, db=self.MYSQL_DBNAME, charset=self.MYSQL_CHARSET)
-         self.cursor = self.db.cursor()
-
-     def process_item(self, item, spider):
-         try:
-             sql = 'CREATE TABLE IF NOT EXISTS dushu(book_id BIGINT PRIMARY KEY NOT NULL, firstTitle VARCHAR(15),' \
-                   'secondTitle VARCHAR(20), threeTitle VARCHAR(20), fourTitle VARCHAR(20), book_name VARCHAR(200) NOT NULL,' \
-                   'book_author VARCHAR(200), book_tag VARCHAR(100), book_isbn VARCHAR(50), book_price VARCHAR(20), book_info TEXT,' \
-                   'cover_img_url VARCHAR(200), book_url VARCHAR(200)) ENGINE=InnoDB DEFAULT CHARSET="utf8mb4";'
-             self.cursor.execute(sql)
-         except:
-             pass
-         try:
-             self.cursor.execute("SELECT book_id FROM dushu WHERE book_id = %s;", (item['book_id'],))
-             switch = self.cursor.fetchone()
-             keys, values = zip(*item.items())
-             if switch:
-                 # the record already exists: insert-or-update keyed on the primary key
-                 sql = """INSERT INTO dushu({}) VALUES ({}) ON DUPLICATE KEY UPDATE {};""".format(
-                     ','.join(keys),
-                     ','.join(['%s'] * len(values)),
-                     ','.join(['{}=%s'.format(k) for k in keys])
-                 )
-                 self.cursor.execute(sql, values * 2)
-             else:
-                 sql = """INSERT INTO dushu({}) VALUES ({});""".format(
-                     ','.join(keys),
-                     ','.join(['%s'] * len(values))
-                 )
-                 self.cursor.execute(sql, values)
-             self.db.commit()
-             return item
-         except Exception as e:
-             print("Something went wrong:", e)
-             self.db.rollback()
-
-     def close_spider(self, spider):
-         print("MySQL processing finished")
-         self.cursor.close()
-         self.db.close()
-
- class MongoPipeline():
-     @classmethod
-     def from_crawler(cls, crawler):
-         cls.MONGO_HOST = crawler.settings.get('MONGO_HOST')
-         cls.MONGO_PORT = crawler.settings.get('MONGO_PORT')
-         cls.MONGO_DB = crawler.settings.get('MONGO_DB')
-         return cls()
-
-     def open_spider(self, spider):
-         self.client = pymongo.MongoClient(self.MONGO_HOST, self.MONGO_PORT)
-         # count how many documents have been inserted
-         self.num = 0
-
-     def process_item(self, item, spider):
-         try:
-             self.db = self.client[self.MONGO_DB]
-             self.book = self.db[spider.name]
-             # check whether the collection already holds data; if so, look the book up first
-             # (find().count() and insert() are legacy pymongo APIs)
-             count = self.book.find().count()
-             # the collection's insert method needs a dict (an Item object is not accepted)
-             data = dict(item) if isinstance(item, Item) else item
-             if count == 0:
-                 print("MongoDB collection is empty, inserting directly!")
-                 self.book.insert(data)
-                 self.num += 1
-             else:
-                 book_name = item['book_name']
-                 count = self.book.find({'book_name': book_name}).count()
-                 if count == 0:
-                     print("%s: adding to the database..." % book_name)
-                     self.book.insert(data)
-                     self.num += 1
-                 else:
-                     print("%s: already in the database, skipping!" % book_name)
-             return item
-         except Exception as e:
-             print("MongoDB error:", e)
-
-     def close_spider(self, spider):
-         print("Done, %s documents saved to MongoDB in this run!" % self.num)
-         self.client.close()
-
- class RedisPipeline():
-     @classmethod
-     def from_crawler(cls, crawler):
-         # cls.REDIS_HOST = crawler.settings.get('REDIS_HOST')
-         cls.REDIS_HOST = get_project_settings().get('REDIS_HOST')
-         cls.REDIS_PORT = crawler.settings.get('REDIS_PORT')
-         cls.REDIS_DBNAME = crawler.settings.get('REDIS_DBNAME')
-         cls.REDIS_decode_responses = crawler.settings.get('REDIS_decode_responses')
-         return cls()
-
-     def open_spider(self, spider):
-         try:
-             self.redis_client = redis.StrictRedis(self.REDIS_HOST, self.REDIS_PORT, self.REDIS_DBNAME,
-                                                   decode_responses=self.REDIS_decode_responses)
-         except Exception as e:
-             print("Redis error:", e)
-
-     def process_item(self, item, spider):
-         # use a Redis set for deduplication: sadd returns 1 only for new members
-         if self.redis_client.sadd('dushu:items', item['book_name']):
-             return item
-         raise DropItem("Duplicate book: %s" % item['book_name'])
-
-     def close_spider(self, spider):
-         print("Redis processing finished!")

6. Configure the settings file (settings.py)
-
- LOG_FILE = "dushu.log"
- LOG_LEVEL = "DEBUG"
- # redirect standard output (print statements) into the log file as well
- LOG_STDOUT = True
-
- # directory for downloaded cover images
- IMAGES_STORE = './images'
-
- # MySQL configuration
- MYSQL_HOST = "localhost"
- MYSQL_PORT = 3306
- MYSQL_USER = "root"
- MYSQL_PASSWD = "123456"
- MYSQL_DBNAME = "python5"
- MYSQL_CHARSET = "utf8mb4"
-
- # MongoDB configuration
- MONGO_HOST = 'localhost'
- MONGO_PORT = 27017
- MONGO_DB = 'py4'
-
- # Redis configuration
- REDIS_HOST = 'localhost'
- REDIS_PORT = 6379
- REDIS_DBNAME = 4
- REDIS_decode_responses = True
-
- ROBOTSTXT_OBEY = False
- DOWNLOAD_DELAY = 3
- DEFAULT_REQUEST_HEADERS = {
-     'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);',
-     # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-     # 'Accept-Language': 'en',
- }
- # lower numbers run earlier in the pipeline chain
- ITEM_PIPELINES = {
-     'Dushu.pipelines.XlsxPipeline': 301,
-     'Dushu.pipelines.ImagePipeline': 3,
-     'Dushu.pipelines.RedisPipeline': 300,
-     'Dushu.pipelines.MysqlPipeline': 302,
-     'Dushu.pipelines.MongoPipeline': 303,
- }
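With the priorities above an item passes through ImagePipeline (3) first, then RedisPipeline (300), XlsxPipeline (301), MysqlPipeline (302) and MongoPipeline (303), so the Redis deduplication can drop a repeated book before it reaches Excel or the databases. Individual settings can also be inspected or overridden from the command line, for example:
scrapy settings --get DOWNLOAD_DELAY
scrapy crawl dushu -s LOG_LEVEL=INFO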

7. Remember to start the MySQL/Redis/MongoDB services beforehand and create the corresponding database and table
- CREATE TABLE IF NOT EXISTS dushu(
-     book_id BIGINT PRIMARY KEY NOT NULL,
-     firstTitle VARCHAR(15), secondTitle VARCHAR(20), threeTitle VARCHAR(20), fourTitle VARCHAR(20),
-     book_name VARCHAR(200) NOT NULL, book_author VARCHAR(200),
-     book_tag VARCHAR(100), book_isbn VARCHAR(50),
-     book_price VARCHAR(20), book_info TEXT, cover_img_url VARCHAR(200), book_url VARCHAR(200)
- ) ENGINE=InnoDB DEFAULT CHARSET="utf8mb4";
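Since MysqlPipeline already executes CREATE TABLE IF NOT EXISTS on its own, strictly speaking only the database named by MYSQL_DBNAME in settings.py has to exist before the crawl, for example:
- CREATE DATABASE IF NOT EXISTS python5 DEFAULT CHARACTER SET utf8mb4;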
8. With all of the above configured, start crawling: run the project command crawl to launch the spider:
scrapy crawl dushu
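Scrapy's built-in feed export is also a handy way to double-check the scraped items without touching the pipelines; the output file name is arbitrary:
scrapy crawl dushu -o books.json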