Spider notes:
1. The spider is structured in an object-oriented style.
2. The scraped data is stored in a MongoDB database.
3. The code is commented in detail.
Code listing
import re
import time

import requests
from lxml import html
from pymongo import MongoClient


class BaBaiSpider():
    def __init__(self):
        self.start_url = 'https://movie.douban.com/subject/26754233/reviews'
        self.url_temp = 'https://movie.douban.com/subject/26754233/reviews?start={}'
        # Douban blocks anonymous requests by IP, so the Cookie from a
        # logged-in account is required; an IP proxy pool also works if available.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36",
            'Cookie': 'll="118160"; bid=jBJGzgkqoW0; _ga=GA1.2.299310750.1603415173; _vwo_uuid_v2=D02C810B09B328A9291DA2DE0215B1F4E|7b20627b7b4770d357d6251faaad13b7; __yadk_uid=NVdS10Z9dQ70V1AkBBbqmLR6Ny6AQC6R; UM_distinctid=175530c360058f-0cd5eb2121026b-3e604000-144000-175530c3601502; Hm_lvt_19fc7b106453f97b6a84d64302f21a04=1603416111; __utmv=30149280.22554; douban-fav-remind=1; __gads=ID=9b3fe7aa29748925-22a3ff1066c400c6:T=1603618426:RT=1603618426:S=ALNI_MZdkcEBUdorLQd-nNQm0ECaz6aPgQ; __utmc=30149280; __utmc=223695111; ap_v=0,6.0; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1610800679%2C%22https%3A%2F%2Faccounts.douban.com%2F%22%5D; _pk_ses.100001.4cf6=*; push_doumail_num=0; push_noty_num=0; dbcl2="225547599:+KzDIeqUyH8"; ck=S_qd; __utmt=1; douban-profile-remind=1; __utma=30149280.299310750.1603415173.1610800679.1610803327.13; __utmb=30149280.0.10.1610803327; __utmz=30149280.1610803327.13.11.utmcsr=baidu|utmccn=(organic)|utmcmd=organic|utmctr=%E8%B1%86%E7%93%A3%E7%94%B5%E5%BD%B1; __utma=223695111.299310750.1603415173.1610800679.1610803327.7; __utmb=223695111.0.10.1610803327; __utmz=223695111.1610803327.7.6.utmcsr=baidu|utmccn=(organic)|utmcmd=organic|utmctr=%E8%B1%86%E7%93%A3%E7%94%B5%E5%BD%B1; _pk_id.100001.4cf6=77003652978e8b92.1603415561.6.1610803542.1610797625.'
        }
        # Initialize the MongoDB client and target collection
        self.client = MongoClient()
        self.collection = self.client['test']['babai']

    # Build the list-page URLs (Douban shows 20 reviews per page)
    def get_url_list(self, total_page):
        return [self.url_temp.format(i * 20) for i in range(int(total_page) + 1)]

    # Request a URL and return the decoded response body
    def parse_url(self, url):
        rest = requests.get(url, headers=self.headers)
        time.sleep(2)
        return rest.content.decode()

    # Extract the review fields from one list page
    def get_item(self, str_html):
        new_html = html.etree.HTML(str_html)
        div_list = new_html.xpath('//div[@class="review-list "]/div')
        # Each field is guarded with a conditional expression so that a missing
        # node yields None instead of raising IndexError; these guards keep the
        # spider running across malformed entries.
        for i in div_list:
            item = {}
            title = i.xpath('.//div[@class="main-bd"]/h2/a/text()')
            item['评论标题'] = title[0] if len(title) > 0 else None
            name = i.xpath('.//a[@class="name"]/text()')
            item['评论人姓名'] = name[0] if len(name) > 0 else None
            rate = i.xpath('.//span[contains(@class,"main-title-rating")]/@title')
            item['评价'] = rate[0] if len(rate) > 0 else None
            # Named pub_time rather than time to avoid shadowing the time module
            pub_time = i.xpath('.//span[@class="main-meta"]/text()')
            item['评论时间'] = pub_time[0] if len(pub_time) > 0 else None
            favor = i.xpath('.//div[@class="action"]/a[1]/span/text()')
            item['赞成数'] = favor[0].strip() if len(favor) > 0 else None
            oppose = i.xpath('.//div[@class="action"]/a[2]/span/text()')
            item['反对数'] = oppose[0].strip() if len(oppose) > 0 else None
            reply = i.xpath('.//a[@class="reply "]/text()')
            item['回复数'] = reply[0].split('回应')[0] if len(reply) > 0 else None
            star = i.xpath('.//span[contains(@class,"main-title-rating")]/@class')
            item['评论得分'] = re.findall(r'allstar(\d)0 main-title-rating', star[0])[0] if len(star) > 0 else None
            print(item)
            self.save(item)

    # Persist one review document to MongoDB
    def save(self, item):
        # insert() is deprecated in pymongo 3+; insert_one() is the current API
        self.collection.insert_one(item)

    def run(self):
        # Get the total page count from the first list page.
        rest = requests.get(self.start_url, headers=self.headers)
        str_html = html.etree.HTML(rest.content.decode())
        # NOTE: the original listing was truncated after '//div[@class'; the
        # selector below is a reconstruction based on Douban's paginator markup,
        # where the current-page span carries a data-total-page attribute.
        total_page = str_html.xpath('//div[@class="paginator"]/span[@class="thispage"]/@data-total-page')
        total_page = total_page[0] if len(total_page) > 0 else 0
        # Build every list-page URL, then fetch and parse each one in turn.
        for url in self.get_url_list(total_page):
            self.get_item(self.parse_url(url))


if __name__ == '__main__':
    spider = BaBaiSpider()
    spider.run()
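After a run finishes, the stored documents can be checked directly with pymongo. A minimal verification sketch, assuming the same local MongoDB instance and the test/babai database and collection used in the spider above:

from pymongo import MongoClient

# Connect to the same local MongoDB instance the spider writes to.
client = MongoClient()
collection = client['test']['babai']

# Count the stored reviews and print a few sample documents.
print(collection.count_documents({}))
for doc in collection.find().limit(3):
    print(doc)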