
Scrapy: scraping book information from dushu.com and saving it to MongoDB, Redis, MySQL and a local spreadsheet, with an ImagePipeline for downloading cover images


1. Create the Scrapy project

scrapy startproject Dushu
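
If the command succeeds, Scrapy generates a project skeleton roughly like the one below (exact contents vary slightly between Scrapy versions); items.py, pipelines.py, settings.py and the spiders folder are the files edited in the following steps.

    Dushu/
        scrapy.cfg
        Dushu/
            __init__.py
            items.py
            middlewares.py
            pipelines.py
            settings.py
            spiders/
                __init__.py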

2. Enter the project directory and create the Spider with the genspider command

scrapy genspider dushu dushu.com
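
This creates spiders/dushu.py with a near-empty template, roughly as shown below (the placeholder start_urls value differs between Scrapy versions); step 4 replaces it with the real crawling logic.

    import scrapy


    class DushuSpider(scrapy.Spider):
        name = 'dushu'
        allowed_domains = ['dushu.com']
        start_urls = ['http://dushu.com/']

        def parse(self, response):
            pass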

3. Define the data to scrape (items.py)

    import scrapy


    class DushuItem(scrapy.Item):
        # book ID
        book_id = scrapy.Field()
        # URL of the book's detail page
        book_url = scrapy.Field()
        # title
        book_name = scrapy.Field()
        # author
        book_author = scrapy.Field()
        # summary
        book_info = scrapy.Field()
        # cover image URL
        cover_img_url = scrapy.Field()
        # price
        book_price = scrapy.Field()
        # tags
        book_tag = scrapy.Field()
        # ISBN (sometimes missing on the page)
        book_isbn = scrapy.Field()
        # first-level category name
        firstTitle = scrapy.Field()
        # second-level category name
        secondTitle = scrapy.Field()
        # third-level category name
        threeTitle = scrapy.Field()
        # fourth-level category name
        fourTitle = scrapy.Field()

4. Write the Spider that extracts the item data (in the spiders folder: dushu.py)

    # -*- coding: utf-8 -*-
    # Start from the first-level category pages to collect every book listed under them;
    # after fetching a book's details, use the breadcrumb navigation to determine its four category levels.
    import scrapy
    from ..items import DushuItem


    class DushuSpider(scrapy.Spider):
        name = 'dushu'
        allowed_domains = ['dushu.com']
        start_urls = ['https://www.dushu.com/book/']
        base_url = "https://www.dushu.com"

        def parse(self, response):
            # collect the links of all first-level categories to crawl
            firstUrls = response.xpath('//div/div[@class="row"]/div/div/dl/dt/a/@href').extract()
            for url in firstUrls:
                yield scrapy.Request(url=self.base_url + url, callback=self.parse_second)

        def parse_second(self, response):
            # pagination links within the first-level category
            next_pages = response.xpath('//div/div[@class="pages"]/a[not(@class="disabled")]/@href').extract()
            for page in next_pages:
                yield scrapy.Request(url=self.base_url + page, callback=self.parse_book)

        def parse_book(self, response):
            title = response.xpath('//div/div[@class="row"]/div/div/dl[@class="active"]/dt/text()').get()
            next_url = response.url
            next_url_page = next_url.split('_')[-1].split('.')[0]
            print("Processing [%s], page %s" % (title, next_url_page))
            # book detail URLs on this listing page
            all_books_url = response.xpath('//div[@class="bookslist"]/ul/li/div/h3/a/@href').extract()
            for url in all_books_url:
                yield scrapy.Request(url=self.base_url + url, callback=self.book_detail)

        def book_detail(self, response):
            item = DushuItem()
            # title; "该项为空" ("field is empty") is the placeholder used whenever an XPath matches nothing
            book_name = response.xpath('//div/div/div[@class="book-title"]/h1/text()').get() or "该项为空"
            # with open(book_name + ".html", "w+", encoding="utf-8") as f:
            #     f.write(response.text)
            # breadcrumb navigation of the book
            navbar = response.xpath('//div[@class="crumbs"]/a[position()>2]/text()').extract()
            length = len(navbar)
            if length == 2:
                print("Book has only a first-level category: %s" % book_name)
                item['firstTitle'] = navbar[0]
                item['secondTitle'] = "-"
                item['threeTitle'] = "-"
                item['fourTitle'] = "-"
            elif length == 3:
                print("Book has two category levels: %s" % book_name)
                item['firstTitle'] = navbar[0]
                item['secondTitle'] = navbar[1]
                item['threeTitle'] = "-"
                item['fourTitle'] = "-"
            elif length == 4:
                print("Book has three category levels: %s" % book_name)
                item['firstTitle'] = navbar[0]
                item['secondTitle'] = navbar[1]
                item['threeTitle'] = navbar[2]
                item['fourTitle'] = "-"
            elif length == 5:
                print("Book has four category levels: %s" % book_name)
                item['firstTitle'] = navbar[0]
                item['secondTitle'] = navbar[1]
                item['threeTitle'] = navbar[2]
                item['fourTitle'] = navbar[3]
            else:
                print("Unexpected breadcrumb for book: %s" % book_name)
                item['firstTitle'] = "-"
                item['secondTitle'] = "-"
                item['threeTitle'] = "-"
                item['fourTitle'] = "-"
            # author
            book_author = response.xpath('//div/div/div[@class="book-details"]/div/table//tr[1]/td[2]/text()').get() or "该项为空"
            # tags
            book_tag = response.xpath('//div/div/div[@class="book-details"]/div/table//tr[4]/td[2]/text()').get() or "该项为空"
            # ISBN
            book_isbn = response.xpath('//div/div/div[@class="book-details"]/table//tr[1]/td[2]/text()').get() or "该项为空"
            # price
            book_price = response.xpath('//div/div/div[@class="book-details"]/div/p/span/text()').get() or "该项为空"
            # summary
            book_info = response.xpath('//div/div/div[@class="book-summary"][1]/div/div/text()').get() or "该项为空"
            # cover image
            cover_img_url = response.xpath('//div/div/div[@class="book-pic"]/div/img/@src').get() or "该项为空"
            # book detail page URL
            book_url = response.url
            # book ID, taken from the URL
            book_id = book_url.split("/")[-2]
            item['book_id'] = book_id
            item['book_name'] = book_name
            item['book_author'] = book_author
            item['book_tag'] = book_tag
            item['book_isbn'] = book_isbn
            # strip the leading currency symbol from the price
            item['book_price'] = book_price[1:]
            item['book_info'] = book_info.strip()
            # n200.png is the site's default placeholder picture, so store "暂无封面图" ("no cover image") instead
            item['cover_img_url'] = "暂无封面图" if "n200.png" in cover_img_url else cover_img_url
            item['book_url'] = book_url
            yield item
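
The XPath expressions above match the dushu.com page layout at the time of writing; if the site's markup changes they will silently return nothing. scrapy shell is a convenient way to test a selector against a live page before editing the spider, for example:

    scrapy shell "https://www.dushu.com/book/"
    >>> response.xpath('//div/div[@class="row"]/div/div/dl/dt/a/@href').extract()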

5. Save the data in the item pipelines; the results can also be written to local files (pipelines.py)

    # -*- coding: utf-8 -*-
    import os
    import time
    import json
    import scrapy
    import pymysql
    import pymongo
    import redis
    from openpyxl import Workbook
    from scrapy.pipelines.images import ImagesPipeline
    from scrapy.utils.project import get_project_settings
    from scrapy import Item
    from scrapy.exceptions import DropItem


    class MyEncoder(json.JSONEncoder):
        def default(self, o):
            if isinstance(o, bytes):
                return str(o, encoding='utf-8')
            return json.JSONEncoder.default(self, o)


    class ImagePipeline(ImagesPipeline):
        # folder where the downloaded images are stored
        IMAGES_STORE = get_project_settings().get('IMAGES_STORE')
        # characters that are not allowed in file names
        char_list = ['*', '|', ':', '?', '/', '<', '>', '"', '\\']

        def get_media_requests(self, item, info):
            # the Referer must be set to the book's URL, otherwise the server answers 403
            headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:56.0) Gecko/20100101 Firefox/56.0",
                       "Referer": item['book_url']}
            # skip books that have no cover image
            cover_img_url = item['cover_img_url']
            if cover_img_url.startswith('http'):
                yield scrapy.Request(cover_img_url, headers=headers)

        def item_completed(self, results, item, info):
            book_name = item['book_name']
            if not os.path.exists(self.IMAGES_STORE):
                os.makedirs(self.IMAGES_STORE)
            # results: [(True, {'url': 'https://a.dushu.com/img/n200.png', 'path': 'full/783a2.jpg', 'checksum': '2792e5'})]
            if results:
                print("Downloaded cover image for: %s" % book_name)
                image_path = [x['path'] for ok, x in results if ok]
                for i in self.char_list:
                    if i in book_name:
                        print("'%s' contains the special character '%s', replacing it!" % (book_name, i))
                        book_name = book_name.replace(i, "_")
                # rename the hashed file to <book name>.jpg
                os.rename(self.IMAGES_STORE + '/' + image_path[0], self.IMAGES_STORE + '/' + book_name + ".jpg")
            else:
                print("No cover image to download for: %s" % book_name)
            return item

        def close_spider(self, spider):
            print("Image downloads finished!")
            # remove the 'full' folder (and the image folder itself) if they end up empty
            path = self.IMAGES_STORE + '/full'
            if os.path.exists(path) and not os.listdir(path):
                os.rmdir(path)
            if not os.listdir(self.IMAGES_STORE):
                os.rmdir(self.IMAGES_STORE)


    class XlsxPipeline(object):
        def __init__(self):
            self.wb = Workbook()
            self.ws = self.wb.active
            self.ws.title = "dushu网图书信息表"
            self.ws.append(['book_id', '一级分类', '二级分类', '三级分类', '四级分类', '书名',
                            '作者', '标签', 'ISBN', '价格(元)', '简介',
                            '封面图', '书的链接地址'])

        def process_item(self, item, spider):
            text = [item['book_id'], item['firstTitle'], item['secondTitle'], item['threeTitle'], item['fourTitle'], item['book_name'],
                    item['book_author'], item['book_tag'], item['book_isbn'], item['book_price'], item['book_info'],
                    item['cover_img_url'], item['book_url']]
            self.ws.append(text)
            return item

        def close_spider(self, spider):
            time_file = time.strftime("%Y-%m-%d", time.localtime())
            self.wb.save(spider.name + time_file + ".xlsx")
            print("Spreadsheet written, done!")


    class MysqlPipeline():
        @classmethod
        def from_crawler(cls, crawler):
            cls.MYSQL_HOST = crawler.settings.get('MYSQL_HOST')
            cls.MYSQL_PORT = crawler.settings.get('MYSQL_PORT')
            cls.MYSQL_USER = crawler.settings.get('MYSQL_USER')
            cls.MYSQL_PASSWD = crawler.settings.get('MYSQL_PASSWD')
            cls.MYSQL_DBNAME = crawler.settings.get('MYSQL_DBNAME')
            cls.MYSQL_CHARSET = crawler.settings.get('MYSQL_CHARSET')
            return cls()

        def open_spider(self, spider):
            self.db = pymysql.connect(host=self.MYSQL_HOST, port=self.MYSQL_PORT, user=self.MYSQL_USER,
                                      passwd=self.MYSQL_PASSWD, db=self.MYSQL_DBNAME, charset=self.MYSQL_CHARSET)
            self.cursor = self.db.cursor()

        def process_item(self, item, spider):
            try:
                # create the table on first use
                sql = 'CREATE TABLE IF NOT EXISTS dushu(book_id BIGINT PRIMARY KEY NOT NULL ,firstTitle VARCHAR(15),' \
                      'secondTitle VARCHAR(20),threeTitle VARCHAR(20),fourTitle VARCHAR(20),book_name VARCHAR(200) NOT NULL ,' \
                      'book_author VARCHAR(200),book_tag VARCHAR(100),book_isbn VARCHAR(50),book_price VARCHAR(20),book_info TEXT,' \
                      'cover_img_url VARCHAR(200),book_url VARCHAR(200))ENGINE=InnoDB DEFAULT CHARSET="utf8mb4";'
                self.cursor.execute(sql)
            except:
                pass
            try:
                # update the row if the book_id already exists, otherwise insert a new one
                self.cursor.execute("SELECT book_id FROM dushu WHERE book_id = %s;", item['book_id'])
                switch = self.cursor.fetchone()
                keys, values = zip(*item.items())
                if switch:
                    sql = """INSERT INTO dushu({}) VALUES ({}) ON DUPLICATE KEY UPDATE {};""".format(
                        ','.join(keys),
                        ','.join(['%s'] * len(values)),
                        ','.join(['{}=%s'.format(k) for k in keys])
                    )
                    self.cursor.execute(sql, values * 2)
                else:
                    sql = """INSERT INTO dushu({}) VALUES ({});""".format(
                        ','.join(keys),
                        ','.join(['%s'] * len(values))
                    )
                    self.cursor.execute(sql, values)
                self.db.commit()
                return item
            except Exception as e:
                print("MySQL error:", e)
                self.db.rollback()
                # still hand the item on to the remaining pipelines
                return item

        def close_spider(self, spider):
            print("MySQL processing finished")
            self.cursor.close()
            self.db.close()


    class MongoPipeline():
        @classmethod
        def from_crawler(cls, crawler):
            cls.MONGO_HOST = crawler.settings.get('MONGO_HOST')
            cls.MONGO_PORT = crawler.settings.get('MONGO_PORT')
            cls.MONGO_DB = crawler.settings.get('MONGO_DB')
            return cls()

        def open_spider(self, spider):
            self.client = pymongo.MongoClient(self.MONGO_HOST, self.MONGO_PORT)
            # number of documents inserted in this run
            self.num = 0

        def process_item(self, item, spider):
            try:
                self.db = self.client[self.MONGO_DB]
                self.book = self.db[spider.name]
                # if the collection already holds data, only insert books that are not stored yet
                count = self.book.find().count()
                # the collection's insert method expects a dict, not an Item object
                data = dict(item) if isinstance(item, Item) else item
                if count == 0:
                    print("MongoDB collection is empty, inserting directly!")
                    self.book.insert(data)
                    self.num += 1
                else:
                    book_name = item['book_name']
                    count = self.book.find({'book_name': book_name}).count()
                    if count == 0:
                        print("%s: inserting into MongoDB..." % book_name)
                        self.book.insert(data)
                        self.num += 1
                    else:
                        print("%s: already stored, skipping!" % book_name)
                return item
            except Exception as e:
                print("MongoDB error:", e)

        def close_spider(self, spider):
            print("Finished: %s documents saved to MongoDB in this run!" % self.num)
            self.client.close()


    class RedisPipeline():
        @classmethod
        def from_crawler(cls, crawler):
            # cls.REDIS_HOST = crawler.settings.get('REDIS_HOST')
            cls.REDIS_HOST = get_project_settings().get('REDIS_HOST')
            cls.REDIS_PORT = crawler.settings.get('REDIS_PORT')
            cls.REDIS_DBNAME = crawler.settings.get('REDIS_DBNAME')
            cls.REDIS_decode_responses = crawler.settings.get('REDIS_decode_responses')
            return cls()

        def open_spider(self, spider):
            try:
                self.redis_client = redis.StrictRedis(self.REDIS_HOST, self.REDIS_PORT, self.REDIS_DBNAME,
                                                      decode_responses=self.REDIS_decode_responses)
            except Exception as e:
                print("Redis error:", e)

        def process_item(self, item, spider):
            # de-duplicate by book name via a Redis set: sadd returns 0 if the name was already present
            if self.redis_client.sadd('dushu:items', item['book_name']):
                return item
            raise DropItem

        def close_spider(self, spider):
            print("Redis processing finished!")
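
After a crawl, the stored rows and documents can be spot-checked outside Scrapy, for example to confirm that cover_img_url was saved for each book. A minimal sketch using the connection values from the settings in the next step (run it only after the spider has finished):

    import pymysql
    import pymongo

    # MySQL: print a few book names with their cover image URLs
    db = pymysql.connect(host="localhost", port=3306, user="root",
                         passwd="123456", db="python5", charset="utf8mb4")
    cursor = db.cursor()
    cursor.execute("SELECT book_name, cover_img_url FROM dushu LIMIT 5;")
    for book_name, cover_img_url in cursor.fetchall():
        print(book_name, cover_img_url)
    db.close()

    # MongoDB: fetch one stored document from the 'dushu' collection
    client = pymongo.MongoClient('localhost', 27017)
    print(client['py4']['dushu'].find_one({}, {'book_name': 1, 'cover_img_url': 1}))
    client.close()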

6. Configure the settings (settings.py)

    LOG_FILE = "dushu.log"
    LOG_LEVEL = "DEBUG"
    LOG_STDOUT = True
    IMAGES_STORE = './images'

    # MySQL
    MYSQL_HOST = "localhost"
    MYSQL_PORT = 3306
    MYSQL_USER = "root"
    MYSQL_PASSWD = "123456"
    MYSQL_DBNAME = "python5"
    MYSQL_CHARSET = "utf8mb4"

    # MongoDB
    MONGO_HOST = 'localhost'
    MONGO_PORT = 27017
    MONGO_DB = 'py4'

    # Redis
    REDIS_HOST = 'localhost'
    REDIS_PORT = 6379
    REDIS_DBNAME = 4
    REDIS_decode_responses = True

    ROBOTSTXT_OBEY = False
    DOWNLOAD_DELAY = 3

    DEFAULT_REQUEST_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);',
        # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        # 'Accept-Language': 'en',
    }

    # lower numbers run first, so the ImagePipeline handles each item before the storage pipelines
    ITEM_PIPELINES = {
        'Dushu.pipelines.XlsxPipeline': 301,
        'Dushu.pipelines.ImagePipeline': 3,
        'Dushu.pipelines.RedisPipeline': 300,
        'Dushu.pipelines.MysqlPipeline': 302,
        'Dushu.pipelines.MongoPipeline': 303,
    }

7. Remember to start the MySQL/Redis/MongoDB services beforehand and create the required database and table. The MysqlPipeline also runs CREATE TABLE IF NOT EXISTS on first use, so the SQL below is optional as long as the database itself exists (a sketch for creating the database follows the SQL):

    CREATE TABLE IF NOT EXISTS dushu(book_id BIGINT PRIMARY KEY NOT NULL,
        firstTitle VARCHAR(15), secondTitle VARCHAR(20), threeTitle VARCHAR(20), fourTitle VARCHAR(20),
        book_name VARCHAR(200) NOT NULL, book_author VARCHAR(200),
        book_tag VARCHAR(100), book_isbn VARCHAR(50),
        book_price VARCHAR(20), book_info TEXT,
        cover_img_url VARCHAR(200), book_url VARCHAR(200)
    ) ENGINE=InnoDB DEFAULT CHARSET="utf8mb4";
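
The pipelines connect to an existing MySQL database (python5 in the settings) but never create it, so create the database once before the first run. A minimal sketch with pymysql, using the same credentials as settings.py:

    import pymysql

    # create the target database once, before the first crawl
    conn = pymysql.connect(host="localhost", port=3306, user="root",
                           passwd="123456", charset="utf8mb4")
    cursor = conn.cursor()
    cursor.execute("CREATE DATABASE IF NOT EXISTS python5 DEFAULT CHARACTER SET utf8mb4;")
    conn.close()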

8. With everything configured, run the project's crawl command to launch the Spider (because LOG_FILE is set, the run log goes to dushu.log):

scrapy crawl dushu
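
As a quick sanity check that items are actually flowing, Scrapy's built-in feed export can also dump them to a file alongside the pipelines (the file name here is arbitrary):

    scrapy crawl dushu -o books.csv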

 
