赞
踩
1、先在settings.py把配置配好
# --- Scrapy settings (settings.py) ---

# Present ourselves as a normal browser; grab any request's User-Agent
# from the browser dev tools (F12) if this one goes stale.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"

# Do not honour robots.txt restrictions.
ROBOTSTXT_OBEY = False

# Only emit WARNING-and-above log messages.
LOG_LEVEL = 'WARNING'

# Enable the pipeline that persists scraped movie items.
ITEM_PIPELINES = {
    'mini_resource_scrapy.pipelines.MiniResourceScrapyPipeline': 300,
}
2、在items.py文件中定义数据模型,把要爬的字段在这里定义,我这里贴代码给大家参考
import scrapy


class MiniResourceScrapyItem(scrapy.Item):
    """Data model for one scraped movie record."""
    _id = scrapy.Field()         # record id, parsed from the detail-page URL
    add_date = scrapy.Field()    # date the video was added on the site
    title = scrapy.Field()       # movie title
    director = scrapy.Field()    # director name
    stars = scrapy.Field()       # leading actors, stored as a list
    type = scrapy.Field()        # movie genres, stored as a list
    area = scrapy.Field()        # region / country
    year = scrapy.Field()        # release year
    describe = scrapy.Field()    # plot description
    source = scrapy.Field()      # origin of the video (the crawled site's name)
    video_url = scrapy.Field()   # playable video (m3u8) URL
    cover_img = scrapy.Field()   # cover image URL
    score = scrapy.Field()       # rating; the site exposes none, so initialised to 0
    play_count = scrapy.Field()  # play count, initialised to 0
    is_show = scrapy.Field()     # whether the front end should display it
    is_new = scrapy.Field()      # whether it counts as "new" (release year is 2021)
3、配置管道pipelines.py, 用于把爬取来的数据存起来,存文件或者数据库这里随你,这里我直接存文件
- import codecs,json
- # 用codecs提供的open方法来指定打开的文件的语言编码,它会在读 取的时候自动转换为内部unicode
-
class MiniResourceScrapyPipeline(object):
    """Persist every scraped item as one JSON object per line in movie.json."""

    def __init__(self):
        # Builtin open() with an explicit encoding replaces the older
        # codecs.open(); the file stays open for the whole crawl.
        # (Opening it in open_spider() would work equally well.)
        self.file = open('movie.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Append *item* as a JSON Lines record and pass it through unchanged."""
        # ensure_ascii=False keeps Chinese titles readable instead of \uXXXX escapes.
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + "\n")
        return item

    def close_spider(self, spider):
        # FIX: Scrapy invokes close_spider() on pipelines when the spider
        # finishes; the original method was named spider_closed, which is
        # never called on a pipeline, so the file was never closed/flushed.
        self.file.close()
4、正式开始我们的爬虫代码 爬的是 https://www.80mv.cc/ 这个视频网站,大家也可以用别的网站爬,xpath要写正确啊,逻辑都一样
- import scrapy
- from ..items import MiniResourceScrapyItem
- import re
-
-
class Mv80Spider(scrapy.Spider):
    """Crawl movie data from https://www.80mv.cc.

    Flow: listing page -> parse() -> detail page -> video_detail()
          -> play page -> video_play_page() -> yield the finished item.
    """
    name = 'mv80'
    total_page = 4   # range(1, 4) -> pages 1..3, i.e. three listing pages
    page_index = 1   # first listing page
    origin = 'https://www.80mv.cc'
    # Extracts the numeric id from detail URLs like .../80detail_123.html
    id_compile = re.compile(r'80detail_(\d+).html')
    # Matches the (backslash-escaped) m3u8 playback URL inside an inline <script>
    r_compile = re.compile(r'https:(.+)/index.m3u8', re.I)
    # allowed_domains = ['www.80mv.cc']
    start_urls = ['https://www.80mv.cc/80show_dianying--------{}---.html']

    def start_requests(self):
        """Kick off one request per listing page."""
        for page in range(self.page_index, self.total_page):
            yield scrapy.Request(url=self.start_urls[0].format(page))

    @staticmethod
    def _labelled_text(des, p_index, label):
        """Stripped text following a <span>label</span> inside ./p[p_index] of *des*.

        Returns '' when the node is absent instead of crashing on None.
        """
        xpath = ('string(./p[%d]/span[contains(text(), "%s")]'
                 '/following-sibling::text())' % (p_index, label))
        return (des.xpath(xpath).extract_first() or '').strip()

    def parse(self, response):
        """Parse one listing page and follow each movie's detail page."""
        for tag in response.xpath('//ul[contains(@class,"stui-vodlist clearfix")]/li'):
            thumb = tag.xpath('.//a[@class="stui-vodlist__thumb lazyload"]')
            page_url = thumb.xpath('./@href').extract_first()
            if not page_url:
                continue  # malformed <li> without a link - skip instead of crashing
            # FIX: guard the regex; the original called .search(...).groups()
            # unconditionally and raised AttributeError on non-matching links.
            id_match = self.id_compile.search(page_url)
            if id_match is None:
                continue  # link does not look like a detail page
            item = MiniResourceScrapyItem()
            item['_id'] = id_match.group(1)
            item['title'] = thumb.xpath('./@title').extract_first()
            item['cover_img'] = thumb.xpath('./@data-original').extract_first()
            # Continue on the detail page, carrying the partial item in meta.
            yield scrapy.Request(
                url='{}{}'.format(self.origin, page_url),
                callback=self.video_detail,
                meta={'item': item})

    def video_detail(self, response):
        """Parse the detail page, fill in the item, then follow the play page."""
        item = response.meta['item']
        des = response.xpath('//div[@class="stui-content__detail"]')

        year = self._labelled_text(des, 3, '年份')
        item['stars'] = des.xpath('./p[1]//a//text()').extract()
        item['director'] = des.xpath('./p[2]/a/text()').extract_first()
        item['describe'] = des.xpath('./p[5]/span[2]/text()').extract_first()
        item['type'] = self._labelled_text(des, 3, '类型').split(',')
        item['area'] = self._labelled_text(des, 3, '地区')
        item['year'] = year
        item['add_date'] = self._labelled_text(des, 4, '时间')
        item['score'] = 0        # the site exposes no rating
        item['play_count'] = 0   # play counter starts at zero
        item['is_show'] = True   # visible on the front end by default
        item['is_new'] = year == '2021'
        item['source'] = '80S影院'

        video_page_url = response.xpath(
            '//ul[@class="stui-content__playlist clearfix"]/li/a/@href').extract_first()
        if not video_page_url:
            # No playlist entry: nothing to play, drop the item with a warning.
            self.logger.warning('no play page found for item %s', item.get('_id'))
            return
        yield scrapy.Request(
            url='{}{}'.format(self.origin, video_page_url),
            callback=self.video_play_page,
            meta={'item': item})

    def video_play_page(self, response):
        """Extract the m3u8 playback URL from the play page's inline script."""
        item = response.meta['item']
        script_str = response.xpath(
            '//div[@class="stui-player__video clearfix"]/script[1]/text()').extract_first()
        # FIX: guard both a missing <script> (None) and a non-matching regex;
        # the original crashed with AttributeError in either case.
        g_url = self.r_compile.search(script_str or '')
        if g_url is None:
            self.logger.warning('no m3u8 url found for item %s', item.get('_id'))
            return
        # The URL is JSON-escaped in the script ("https:\/\/..."): drop backslashes.
        item['video_url'] = g_url.group().replace('\\', '')
        self.logger.debug('scraped item: %r', item)  # replaces the debug print()
        yield item
5、这是我爬的数据
代码结束,文字很少,全是代码,如果有不明白的地方可以留言,大家相互学习讨论
!!!!!!下面是我自己的小程序,请大家多多扫码打开给点支持吧!!!!
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。