赞
踩
Python - 爬取豆瓣短评评论
- import requests
- from bs4 import BeautifulSoup
- import re
- import time
-
# Path of the text file in which Douban book scores are saved.
source_douban_score = 'e:/test/txt/douban_book_score.txt'
-
def get_book_name(soup):
    """Return the book title read from the page's ``<a class="nbg">`` element.

    Args:
        soup: Parsed page object exposing ``find(name, attrs)``; the first
            ``<a>`` element with class ``nbg`` is looked up.

    Returns:
        The value of that element's ``title`` attribute.

    Raises:
        ValueError: If the page contains no ``<a class="nbg">`` element.
            Raised explicitly so the failure is not reported as an opaque
            "'NoneType' object is not subscriptable" error.
        KeyError: If the element exists but has no ``title`` attribute.
    """
    title_link = soup.find('a', 'nbg')
    if title_link is None:
        raise ValueError('<a class="nbg"> element not found; cannot read book title')
    return title_link['title']
def get_comment(soup):
    """Print every short comment found on the page.

    Each ``<span>`` element with class ``short`` in *soup* has its
    ``.string`` value printed on its own line. Nothing is returned.

    Args:
        soup: Parsed page object exposing ``find_all(name, attrs)``.
    """
    for comment_span in soup.find_all('span', 'short'):
        print(comment_span.string)
-
# Matches the literal text ``user-stars allstar<digits> rating`` and captures
# the digits. Only digits are captured (not ``.*``) so that two ratings on the
# same line are found separately instead of as one greedy, non-numeric match.
_USER_STAR_PATTERN = re.compile(r'user-stars allstar(\d+) rating')


def get_book_score(markup):
    """Return the average of the user star values found in *markup*.

    Args:
        markup: Page HTML as a string.

    Returns:
        The integer (floor-divided) mean of every number captured from a
        ``user-stars allstar<N> rating`` occurrence, or ``0`` when the markup
        contains none.
        NOTE(review): the captured numbers look like star ratings scaled by
        ten (e.g. ``allstar40``) - confirm against the live page.
    """
    stars = _USER_STAR_PATTERN.findall(markup)
    if not stars:
        return 0
    # Floor division is kept deliberately so the result stays an int.
    return sum(int(star) for star in stars) // len(stars)
-
-
# Request headers sent with every page fetch. The Cookie must be obtained by
# the user themselves.
# NOTE(review): the Cookie below looks like a captured browser session;
# replace it with your own and avoid committing real session values.
headers = {
    'Cookie': '; '.join((
        'll="118159"',
        'bid=Tlj9LZXK6qY',
        '__utma=30149280.1907998096.1644908381.1644908381.1644908381.1',
        '__utmc=30149280',
        '__utmz=30149280.1644908381.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic',
        '__utma=81379588.181811108.1644908384.1644908384.1644908384.1',
        '__utmc=81379588',
        '__utmz=81379588.1644908384.1.1.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/',
        'gr_user_id=4cc23846-8110-487a-9875-d2c22a01ffc5',
        'gr_cs1_beee101c-a4d0-4c07-acf1-64c7195fdd7b=user_id%3A0',
        'ap_v=0,6.0',
        '_pk_ref.100001.3ac3=%5B%22%22%2C%22%22%2C1644908384%2C%22https%3A%2F%2Fwww.douban.com%2F%22%5D',
        '_pk_ses.100001.3ac3=*',
        '_vwo_uuid_v2=DDF9B30D066068B64AEED05150ED2CC21|59ca475fd37ba868d8b940b30cb8051c',
        '__gads=ID=30286cce13d6257e-222fedbd9fd00006:T=1644908408:RT=1644908408:S=ALNI_Mbxvt-k0lb2MC6tAcI60A5qPobCCw',
        'viewed="35680544"',
        '__utmb=30149280.4.10.1644908381',
        '__utmb=81379588.3.10.1644908384',
        '__yadk_uid=OU6iPv3WVtSb7pup23vqweBrH4Gj0jdG',
        '_pk_id.100001.3ac3=5c3c19525f813f77.1644908384.1.1644908470.1644908384.',
        '_ga=GA1.1.1475140875.1644908471',
        'refer_url=https://read.douban.com/category/105',
        '_ga_RXNMP372GL=GS1.1.1644908471.1.1.1644909370.60',
    )),
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="96", "Google Chrome";v="96"',
    'Host': 'book.douban.com',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/96.0.4664.110 Safari/537.36'
    ),
}
-
-
def get_book(book_id, timeout=10):
    """Fetch a Douban book page and append its title and score to the score file.

    The page at ``https://book.douban.com/subject/<book_id>/`` is requested
    with the module-level ``headers``. On an HTTP 200 response the title
    (``get_book_name``) and average score (``get_book_score``) are extracted;
    when the score is positive, one ``"<title> <score>"`` line is appended to
    ``source_douban_score`` and echoed to stdout.

    Args:
        book_id: Douban subject id, interpolated into the page URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout the request could block indefinitely.

    Returns:
        None. Progress and failures are reported via ``print``; any exception
        raised while fetching, parsing or writing is caught and printed
        rather than propagated (best-effort behaviour).
    """
    douban_url = 'https://book.douban.com/subject/{}/'.format(book_id)
    print('获取书号:', book_id, douban_url)
    try:
        print()
        r = requests.get(douban_url, headers=headers, timeout=timeout)
        if r.status_code != 200:
            print('获取失败,', r.status_code)
            return
        # Short pause after a successful fetch, presumably to throttle requests.
        time.sleep(.6)
        markup = r.text
        soup = BeautifulSoup(markup, 'lxml')
        book_name = get_book_name(soup)
        score = get_book_score(markup)
        if score > 0:
            content = book_name + ' ' + str(score)
            # Explicit UTF-8 keeps non-ASCII titles independent of the platform
            # default encoding; the newline keeps each appended record on its
            # own line instead of running into the previous one.
            with open(source_douban_score, 'a', encoding='utf-8') as f:
                f.write(content + '\n')
            print(content)
    except Exception as err:
        # Deliberately broad: this is the script's top-level boundary, and a
        # failed book should be reported, not crash the run.
        print('获取错误', err)
-
# Fetch https://book.douban.com/subject/35680544/
get_book('35680544')

Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。