Key points covered:
- Paginated crawling with Scrapy.
- XPath expression syntax for extracting page elements in Scrapy (a short selector sketch follows this list).
- Saving the scraped data to MySQL with Scrapy and pymysql.
- Passing extra data to a callback via scrapy.Request(...).
- Checking for duplicates before writing to the database.
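Before diving into the spider, here is a minimal, standalone sketch of the XPath extraction style used throughout this post. It runs Scrapy's `Selector` against a hand-written HTML snippet, so the markup and URLs below are made up for illustration and are not taken from the target site:

```python
from scrapy.selector import Selector

# A made-up snippet shaped roughly like the category menu parsed later on.
html = '''
<div class="menutv">
  <ul>
    <li><a href="/dongzuopian/">Action</a></li>
    <li><a href="/xijupian/">Comedy</a></li>
  </ul>
</div>
'''

sel = Selector(text=html)
for a in sel.xpath('//div[@class="menutv"]/ul/li/a'):
    # text() grabs the link label, @href grabs the attribute value
    print(a.xpath('text()').extract_first(), a.xpath('@href').extract_first())
# Output:
# Action /dongzuopian/
# Comedy /xijupian/
```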
This post is a hands-on extension of the previous one, "Python scrapy使用入门，爬取拉勾网上万条职位信息" (getting started with Scrapy by crawling tens of thousands of job listings from Lagou); if you haven't read it, you may want to start there, because steps already explained there are not repeated here. This post is simply one exercise in applying the same ideas elsewhere: copying the code and getting it to run is not the point, the goal is to master the core techniques and make them your own. A movie download site happened to be a good fit for practice, so the code from the previous post is modified to crawl it.
second.py after the changes:
```python
# -*- coding: utf-8 -*-
import scrapy

from First.items import FirstItem


class SecondSpider(scrapy.Spider):
    name = 'second'
    allowed_domains = []
    start_urls = ['http://www.66ys.tv/']

    def parse(self, response):
        for item in response.xpath('//div[@class="menutv"]/ul/li/a'):
            movClass = item.xpath('text()').extract()        # category name
            movUrl = item.xpath("@href").extract_first()      # category URL
            oneItem = FirstItem()
            oneItem["movClass"] = movClass
            oneItem["movUrl"] = movUrl
            # brute-force pagination: request index_0.html ... index_149.html under each category
            for i in range(150):
                mvUrl2 = movUrl + str('index_%s.html' % i)
                try:
                    # the default argument binds the current category for the callback
                    yield scrapy.Request(url=mvUrl2,
                                         callback=lambda response, mvclass=movClass: self.parse_url(response, mvclass))
                except:
                    pass
            # yield scrapy.Request(url=movUrl, callback=lambda response, mvclass=movClass: self.parse_url(response, mvclass))

    def parse_url(self, response, mvclass):
        for sel2 in response.xpath('//div[@class="listBox"]/ul/li'):
            imgurl = sel2.xpath("div/a/img/@src").extract()         # movie poster URL
            mvname = sel2.xpath('div/h3/a/text()').extract()        # movie title
            mvurl = sel2.xpath("div/h3/a/@href").extract_first()    # movie detail-page URL
            yield scrapy.Request(url=mvurl,
                                 callback=lambda response, mvsclass=mvclass, img=imgurl, name=mvname:
                                     self.parse_mor(response, mvsclass, img, name))

    def parse_mor(self, response, mvsclass, img, name):
        for select in response.xpath('//div[@class="contentinfo"]'):
            mvdownloadUrl = select.xpath("div/table/tbody/.//tr/td/a/@href").extract()  # download links, possibly several
            mvdtilte = select.xpath("div/table/tbody/.//tr/td/a/text()").extract()      # text of the download links
            mvdesc = select.xpath("div[@id='text']/.//p/text()")                        # synopsis paragraphs
            desc = ""
            for p in mvdesc:
                desc = desc + p.extract().strip()
            desc = str(desc).replace('\u3000', ' ')   # replace full-width spaces with normal spaces

            Item = FirstItem()
            Item['movClass'] = mvsclass
            Item['downLoadName'] = name
            if not mvdtilte:               # no link text found, fall back to a default label
                mvdtilte = "点击下载"       # "click to download"
            Item['downdtitle'] = str(mvdtilte)
            Item['downimgurl'] = img
            Item['downLoadUrl'] = mvdownloadUrl
            Item['mvdesc'] = desc
            yield Item
```
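The spider above hands the category name to its callbacks through lambdas with default arguments. An equivalent, and arguably more conventional, way is to attach the data to the request's `meta` dict (newer Scrapy versions also offer `cb_kwargs`). The hypothetical `MetaDemoSpider` below is only a sketch of that pattern, not the spider this post actually runs:

```python
import scrapy


class MetaDemoSpider(scrapy.Spider):
    """Hypothetical spider showing the meta-based hand-off between callbacks."""
    name = 'meta_demo'
    start_urls = ['http://www.66ys.tv/']

    def parse(self, response):
        for a in response.xpath('//div[@class="menutv"]/ul/li/a'):
            mov_class = a.xpath('text()').extract_first()
            mov_url = a.xpath('@href').extract_first()
            yield scrapy.Request(
                url=response.urljoin(mov_url),
                meta={'mvclass': mov_class},     # extra data rides along with the request
                callback=self.parse_category)

    def parse_category(self, response):
        mov_class = response.meta['mvclass']     # read it back in the callback
        self.logger.info('category: %s, page: %s', mov_class, response.url)
```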
items.py after the changes:
```python
# -*- coding: utf-8 -*-
import scrapy


class FirstItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    movClass = scrapy.Field()      # movie category
    movUrl = scrapy.Field()        # URL of the category page
    mvName = scrapy.Field()
    mvUrl = scrapy.Field()
    downLoadUrl = scrapy.Field()   # download link(s)
    downLoadName = scrapy.Field()  # name of the movie being downloaded
    downimgurl = scrapy.Field()    # poster image URL
    mvdesc = scrapy.Field()        # movie synopsis
    downdtitle = scrapy.Field()    # title text of the download link
```
Add the database configuration to settings.py. The full file after the changes is shown first, followed by the MySQL block appended at the end:
```python
# -*- coding: utf-8 -*-

# Scrapy settings for First project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'First'

SPIDER_MODULES = ['First.spiders']
NEWSPIDER_MODULE = 'First.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'First (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'First.middlewares.FirstSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'First.middlewares.FirstDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'First.pipelines.FirstPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# custom exporter carried over from the previous post
FEED_EXPORTERS_BASE = {
    'json': 'First.recode.recodes',
    'jsonlines': 'scrapy.contrib.exporter.JsonLinesItemExporter',
}

MY_USER_AGENT = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0",
]

DOWNLOADER_MIDDLEWARES = {
    # note: the built-in module name is plural ("downloadermiddlewares")
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'First.middlewares.MyUserAgentMiddleware': 400,
}

ITEM_PIPELINES = {
    'First.pipelines.FirstPipeline': 1,
}
```
```python
# MySQL database configuration
MYSQL_HOST = '127.0.0.1'
MYSQL_DBNAME = 'movie'   # database name, change as needed
MYSQL_USER = 'root'      # database user, change as needed
MYSQL_PASSWD = 'root'    # database password, change as needed
MYSQL_PORT = 3306        # database port, used in dbhelper
```
For convenience I installed phpStudy locally, which bundles MySQL, and then used Navicat to create a database and a table with the corresponding fields. Since I'm not very comfortable writing SQL by hand, I'm especially fond of Navicat; it's a really friendly tool.
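If you would rather create the table from a script instead of Navicat, a minimal sketch using pymysql might look like the following. The post never shows the actual schema, so the column names are taken from the pipeline below while the types and lengths are assumptions; adjust them to your data:

```python
# One-off helper that creates the mybt table used by the pipeline.
# Column types/lengths are guesses; the real schema was built in Navicat.
import pymysql

CREATE_SQL = """
CREATE TABLE IF NOT EXISTS mybt (
    id           INT AUTO_INCREMENT PRIMARY KEY,
    movClass     VARCHAR(64),    -- movie category
    downLoadName VARCHAR(255),   -- movie title
    downLoadUrl  TEXT,           -- download link(s)
    mvdesc       TEXT,           -- synopsis
    downimgurl   VARCHAR(512),   -- poster image URL
    downdtitle   VARCHAR(255)    -- download link text
) DEFAULT CHARSET = utf8
"""

conn = pymysql.connect(host='127.0.0.1', user='root', passwd='root',
                       db='movie', charset='utf8')
try:
    with conn.cursor() as cursor:
        cursor.execute(CREATE_SQL)
    conn.commit()
finally:
    conn.close()
```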
Next, configure pipelines.py, which holds all the database-related logic; this is the module where the scraped data is written to MySQL:
```python
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql

from First import settings


class FirstPipeline(object):

    def __init__(self):
        # connect to the database
        self.connect = pymysql.connect(
            host=settings.MYSQL_HOST,
            db=settings.MYSQL_DBNAME,
            user=settings.MYSQL_USER,
            passwd=settings.MYSQL_PASSWD,
            charset='utf8',
            use_unicode=True)
        # all queries and inserts go through this cursor
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        try:
            # de-duplication: check whether this movie is already in the table
            self.cursor.execute(
                """select * from mybt where downLoadName = %s""",
                (item['downLoadName'],))
            repetition = self.cursor.fetchone()
            if repetition is not None:
                # already stored, skip the insert
                pass
            else:
                self.cursor.execute(
                    """insert into mybt(movClass, downLoadName, downLoadUrl, mvdesc, downimgurl, downdtitle)
                       values (%s, %s, %s, %s, %s, %s)""",
                    (item['movClass'],
                     item['downLoadName'],
                     item['downLoadUrl'],
                     item['mvdesc'],
                     item['downimgurl'],
                     item['downdtitle']))
                # commit the insert
                self.connect.commit()
        except Exception as error:
            # scrapy.log is deprecated; use the spider's logger instead
            spider.logger.error(error)
        return item
```
We don't want each crawl to keep piling records into the database, which would leave lots of duplicates, so before inserting we first check whether the record already exists and only insert it if it does not. This approach may not be ideal, but it works for now; there is plenty of room for improvement, and one possible alternative is sketched right below.
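One such improvement would be to let MySQL do the de-duplication: put a UNIQUE index on downLoadName (or another suitable key) and use INSERT IGNORE, so the SELECT-before-INSERT round trip disappears. The snippet below is a hypothetical drop-in replacement for FirstPipeline.process_item above, not what this post's pipeline does, and the choice of index column is an assumption:

```python
# Assumes a unique key exists, e.g.:
#   ALTER TABLE mybt ADD UNIQUE KEY uq_name (downLoadName);
# INSERT IGNORE silently skips rows that would violate the unique key.
def process_item(self, item, spider):
    try:
        self.cursor.execute(
            """INSERT IGNORE INTO mybt
               (movClass, downLoadName, downLoadUrl, mvdesc, downimgurl, downdtitle)
               VALUES (%s, %s, %s, %s, %s, %s)""",
            (item['movClass'], item['downLoadName'], item['downLoadUrl'],
             item['mvdesc'], item['downimgurl'], item['downdtitle']))
        self.connect.commit()
    except Exception as error:
        spider.logger.error(error)
    return item
```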
And that's it. Only a few places changed and the overall code is still quite compact, yet it can now crawl an entire movie site. In a test run of roughly 15 minutes it scraped more than 14,000 records, with the download links stored in the database.
With that database you could go on to build your own movie download app or download site.
Known issues
1. The detail-page data differs between movies and TV series. Movies are fairly uniform, but TV series have many episode downloads, so their format varies; for now everything is crudely stored in one long string, which I'll refine over time.
2. Some movies match the expected page structure yet still yield no data.
3. On some non-standard pages, such as ranking lists, the download links and titles are not matched correctly.
The full source code is on GitHub.