赞
踩
urllib:
urllib 库是 Python 中一个功能强大、用于操作 URL 的标准库,做爬虫时经常用到。
基本模块:
4个模块,主要针对URL(统一资源定位符)
request http的请求模块 ====爬虫
error 异常处理模块
parse 工具模块
robotparser 识别robots.txt
爬取豆瓣的地址:
https://movie.douban.com/subject/26322774/comments?start=30&limit=20&sort=new_score&status=P
# Douban applies anti-scraping checks, so every request imitates a real browser.
import csv
import random
import urllib.request as request
from urllib.error import URLError

# Header sets to rotate between; one is picked at random per request.
_BROWSER_HEADERS = (
    # Firefox
    {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0',
        'Host': 'movie.douban.com',
    },
    # Chrome
    {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
        'Host': 'movie.douban.com',
    },
)


# 1. Build the request
def get_Request(url):
    """Return a request for *url* carrying a randomly chosen browser header set.

    Args:
        url: Address to fetch.

    Returns:
        A ``urllib.request.Request`` whose headers are one entry of
        ``_BROWSER_HEADERS``.
    """
    return request.Request(url=url, headers=random.choice(_BROWSER_HEADERS))


def _parse_comment_item(item):
    """Turn one ``.comment-item`` element into a dict, or None if markup is missing."""
    comment = item.select_one('.comment')
    if comment is None:
        return None
    info = comment.select_one('.comment-info')
    short = comment.select_one('.short')
    if info is None or short is None:
        return None
    author_link = info.select_one('a')
    if author_link is None:
        return None
    # The score is read from the ``title`` attribute of the second <span>.
    # NOTE(review): presumably that span is the star rating; when a user left
    # no rating a different span may land in this position -- confirm against
    # the live page markup.
    spans = info.select('span')
    score = spans[1].get('title', '') if len(spans) > 1 else ''
    return {
        'author': author_link.text,
        'score': score,
        'comment': short.text.replace('\n', ''),
    }


# 2. Fetch the data for a request
def getComment(url, commmentLists):
    """Fetch one comment page and append its comments to *commmentLists*.

    Args:
        url: Address of the comment page.
        commmentLists: List that receives one ``{'author', 'score',
            'comment'}`` dict per parsed comment. It is mutated in place.

    Returns:
        The same list object that was passed in.

    Raises:
        urllib.error.URLError: If the page cannot be fetched (propagated
            from ``urlopen``).
    """
    # Imported here, where it is used, so the request and CSV helpers stay
    # importable on machines that do not have bs4 installed.
    from bs4 import BeautifulSoup

    # The context manager closes the HTTP response even if reading fails.
    with request.urlopen(get_Request(url)) as response:
        print(response.getcode())
        html = response.read()

    soup = BeautifulSoup(html, 'html5lib')
    comments = soup.select_one('#comments')
    if comments is None:
        # No comment container in the page (select_one returns None when
        # nothing matches); leave the list untouched instead of crashing.
        return commmentLists

    for item in comments.select('.comment-item'):
        parsed = _parse_comment_item(item)
        if parsed is not None:
            # Append to the argument, not to a module-level name.
            commmentLists.append(parsed)
    return commmentLists


# 3. Save the comments
def downloadComment(commentLists, filename='逐梦.csv'):
    """Write the collected comments to a CSV file, one comment per row.

    Args:
        commentLists: Iterable of dicts with ``'author'``, ``'score'`` and
            ``'comment'`` keys.
        filename: Output path; defaults to the original hard-coded name.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as file:
        csv.writer(file).writerows(
            [item['author'], item['score'], item['comment']]
            for item in commentLists
        )


def main():
    """Crawl the first ten comment pages (20 per page) and save them as CSV."""
    # https://movie.douban.com/subject/26322774/comments?status=P
    commentLists = []
    for index in range(0, 10):
        url = 'https://movie.douban.com/subject/26322774/comments?start=%d&limit=20&sort=new_score&status=P' % (index * 20)
        try:
            commentLists = getComment(url, commentLists)
        except URLError as err:
            # Stop crawling but still save what was collected so far.
            print('request failed for %s: %s' % (url, err))
            break
    downloadComment(commentLists)


if __name__ == '__main__':
    main()
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。