cd into the folder where we keep our projects, then create the project and the spider:

scrapy startproject BQG
cd BQG
scrapy genspider biquge biduo.cc
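If that worked, Scrapy generates its standard project skeleton (shown here for reference; everything except biquge.py is the default template):

BQG/
├── scrapy.cfg
└── BQG/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── biquge.py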
Below is my complete settings.py:
# -*- coding: utf-8 -*-
from fake_useragent import UserAgent

# Scrapy settings for BQG project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'BQG'

SPIDER_MODULES = ['BQG.spiders']
NEWSPIDER_MODULE = 'BQG.spiders'

LOG_LEVEL = 'WARNING'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = UserAgent().chrome

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#    'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'BQG.middlewares.BqgSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'BQG.middlewares.BqgDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'BQG.pipelines.BqgPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
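One note on the user agent: fake_useragent is a third-party package (pip install fake-useragent), and UserAgent().chrome returns a random Chrome UA string each time the settings are loaded. A minimal sketch to verify it locally; the library fetches its UA data remotely, so this assumes network access:

from fake_useragent import UserAgent

ua = UserAgent()
print(ua.chrome)  # a random Chrome user-agent string, e.g. 'Mozilla/5.0 ...'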
Here is the spider (biquge.py), with a brief analysis in the comments:
# -*- coding: utf-8 -*-
import scrapy
import re


class BiqugeSpider(scrapy.Spider):
    name = 'biquge'
    allowed_domains = ['biduo.cc']
    start_urls = ['https://www.biduo.cc/']

    def parse(self, response):
        # Category/list pages link to more list pages like /book_1_1/;
        # follow them and re-parse with this same method.
        pat = r'/book_\d+_\d+/'
        tab_lists = re.findall(pat, response.text)
        for li in tab_lists:
            yield scrapy.Request(
                url='https://www.biduo.cc' + li,
                callback=self.parse,
            )

        # Individual novel pages look like /biquge/12_3456/.
        pat1 = r'/biquge/\d+_\d+/'
        t_lists = re.findall(pat1, response.text)
        for li in t_lists:
            yield scrapy.Request(
                url='https://www.biduo.cc' + li,
                callback=self.get_novel,
            )

    def get_novel(self, response):
        # Novel detail page: grab the title and the first chapter link,
        # then hand both on to the chapter parser via meta.
        novel_url = response.url
        novel_title = response.xpath('//div[@id="info"]/h1/text()').extract_first()
        novel_first = 'https://www.biduo.cc' + response.xpath(
            '//div[@id="list"]/dl/dd[1]/a/@href').extract_first()
        yield scrapy.Request(
            url=novel_first,
            callback=self.get_page_content,
            meta={'novel_title': novel_title, 'novel_url': novel_url},
        )

    def get_page_content(self, response):
        # Chapter page: collect title and body text, then follow the
        # "next chapter" link.
        item = {}
        item['novel_title'] = response.meta['novel_title']
        item['novel_url'] = response.meta['novel_url']
        item['page_title'] = response.xpath('//h1/text()').extract_first()
        item['page_url'] = response.url
        item['page_content'] = ''.join(
            response.xpath('//div[@id="content"]/text()').extract()).replace('\xa0', '')
        yield item

        # The third link in the bottom nav bar is "next chapter"; on the last
        # chapter it points back to the novel's index page, so stop there.
        next1 = response.xpath('//div[@class="bottem2"]/a[3]/@href').extract_first()
        if next1:
            next_url = 'https://www.biduo.cc' + next1
            if next_url != item['novel_url']:
                yield scrapy.Request(
                    url=next_url,
                    callback=self.get_page_content,
                    meta={'novel_title': item['novel_title'],
                          'novel_url': item['novel_url']},
                )
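To sanity-check the two URL regexes, here is a minimal standalone sketch; the sample hrefs are invented for illustration, and real biduo.cc pages may differ:

import re

# Hypothetical fragment of a biduo.cc category page, just to exercise the patterns.
html = '<a href="/book_1_1/">category</a> <a href="/biquge/56_56606/">novel</a>'

print(re.findall(r'/book_\d+_\d+/', html))    # ['/book_1_1/'] -> list pages, fed back into parse()
print(re.findall(r'/biquge/\d+_\d+/', html))  # ['/biquge/56_56606/'] -> novel pages, sent to get_novel()

Finally, the pipeline (pipelines.py), which appends each chapter to a per-novel text file: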
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os


class BqgPipeline(object):

    def open_spider(self, spider):
        # Make sure the output directory exists before any chapter arrives.
        os.makedirs('novels', exist_ok=True)

    def process_item(self, item, spider):
        # Append each chapter to a per-novel text file; "with" closes the
        # file handle after every write, so nothing leaks between items.
        with open("novels/{}.txt".format(item['novel_title']), 'a+', encoding='utf-8') as f:
            f.write(item['page_title'] + '\n')
            f.write(item['page_url'] + '\n')
            f.write(item['page_content'] + '\n')
        return item
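With everything in place, run the crawl from the project root, using the spider name we passed to genspider:

scrapy crawl biquge

Each chapter is appended to novels/<novel_title>.txt as items flow through the pipeline.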