赞
踩
python-每日一练-抽取某本书的前50条短评内容并计算评分的平均值
- '''
- 抽取某本书的前50条短评内容并计算评分的平均值。提示:有的评论中并不包含评分。
- '''
- import requests
- import re
- from bs4 import BeautifulSoup
- import time
- from functools import reduce
# Browser identification string sent with every request.
_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'

# Cookie string captured from a browser session.
# NOTE(review): this appears to include logged-in session values (e.g. the
# `dbcl2` and `ck` entries) -- treat it as a secret, confirm, and consider
# loading it from configuration instead of keeping it in source.
_COOKIE = 'll="118159"; bid=Tlj9LZXK6qY; __utmc=30149280; __utmc=81379588; gr_user_id=4cc23846-8110-487a-9875-d2c22a01ffc5; _vwo_uuid_v2=DDF9B30D066068B64AEED05150ED2CC21|59ca475fd37ba868d8b940b30cb8051c; __gads=ID=30286cce13d6257e-222fedbd9fd00006:T=1644908408:RT=1644908408:S=ALNI_Mbxvt-k0lb2MC6tAcI60A5qPobCCw; __yadk_uid=OU6iPv3WVtSb7pup23vqweBrH4Gj0jdG; _ga=GA1.1.1475140875.1644908471; refer_url=https://read.douban.com/category/105; viewed="35630000_35620000_35610000_35690000_10790000_10800000_10900000_11000000_11000044_11000544"; dbcl2="191997283:aBzcj8HkRFg"; ck=7X40; gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03=9240ed32-5eb3-42cf-941a-95cdfa3e0efa; gr_cs1_9240ed32-5eb3-42cf-941a-95cdfa3e0efa=user_id%3A1; __utma=30149280.1907998096.1644908381.1644913515.1644932425.3; __utmz=30149280.1644932425.3.2.utmcsr=read.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmt_douban=1; __utmz=81379588.1644932425.3.2.utmcsr=read.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utma=81379588.181811108.1644908384.1644913517.1644932425.3; __utmt=1; _pk_ref.100001.3ac3=%5B%22%22%2C%22%22%2C1644932425%2C%22https%3A%2F%2Fread.douban.com%2F%22%5D; _pk_ses.100001.3ac3=*; gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03_9240ed32-5eb3-42cf-941a-95cdfa3e0efa=true; push_doumail_num=0; push_noty_num=0; _ga_RXNMP372GL=GS1.1.1644931471.4.1.1644932479.60; __utmt=1; __utmv=30149280.19199; __utmb=30149280.9.10.1644932425; __utmb=81379588.5.10.1644932425; _pk_id.100001.3ac3=5c3c19525f813f77.1644908384.3.1644932627.1644914955.'

# HTTP headers passed to requests.get() when fetching comment pages.
headers = {
    'User-Agent': _USER_AGENT,
    'Cookie': _COOKIE,
}
# Matches the rating class on a comment's star widget, e.g.
# class="user-stars allstar40 rating", and captures only the digits so the
# value can always be converted with int().
_SCORE_PATTERN = re.compile(r'"user-stars allstar(\d+) rating"')


def get_shorts(arr, num, start, subject_id='26873486'):
    """Fetch one page of short comments and append their ratings to ``arr``.

    Requests the Douban comment page of ``subject_id`` beginning at offset
    ``start`` (20 comments per page, sorted by ``new_score``) and appends the
    rating of every rated comment to ``arr`` until it holds ``num`` entries.
    Comments without a rating are skipped.

    Args:
        arr: List of ratings collected so far; modified in place.
        num: Maximum number of ratings ``arr`` may hold.
        start: Offset of the first comment on the requested page.
        subject_id: Douban book id used in the URL; defaults to the book the
            script was originally written for.

    Returns:
        The tuple ``(arr, start)`` -- the same list object and the unchanged
        offset. On a failed request a message is printed and ``arr`` is
        returned untouched.

    Note:
        Each rating is the integer that follows ``allstar`` in the element's
        class name (e.g. 40). Presumably this is the star count times ten --
        confirm against the live page.
    """
    if len(arr) >= num:
        # Quota already met: skip the delay and the network round trip.
        return arr, start

    # Pause before each request so pages are not fetched back to back.
    time.sleep(1)
    url = ('https://book.douban.com/subject/{}/comments/'
           '?start={}&limit=20&status=P&sort=new_score').format(subject_id, start)
    try:
        # A timeout keeps a stalled connection from hanging the script.
        r = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        print('获取失败', exc)
        return arr, start

    if r.status_code != 200:
        print('获取失败', r.status_code)
        return arr, start

    soup = BeautifulSoup(r.text, 'lxml')
    for item in soup.find_all('li', 'comment-item'):
        if len(arr) >= num:
            break
        match = _SCORE_PATTERN.search(str(item))
        if match:
            arr.append(int(match.group(1)))
    return arr, start
-
# Ratings collected so far; get_shorts appends to this list in place.
arr_score = []
# Number of ratings to collect before averaging.
score_num = 50
# Offset of the first comment on the page to request next.
start = 0
# Comments per page. This has to equal the `limit=20` value in the URL that
# get_shorts requests; stepping by anything smaller makes consecutive pages
# overlap, so the same ratings would be counted more than once.
page_size = 20
# Give up after this many consecutive pages that add no rating (failed
# request, only unrated comments, or no comments left) so the loop always
# terminates.
max_stalled_pages = 3

stalled_pages = 0
while len(arr_score) < score_num and stalled_pages < max_stalled_pages:
    collected_before = len(arr_score)
    arr_score, start = get_shorts(arr_score, score_num, start)
    start += page_size
    if len(arr_score) > collected_before:
        stalled_pages = 0
    else:
        stalled_pages += 1

if arr_score:
    # True division: floor division would silently truncate the mean.
    avg_score = sum(arr_score) / len(arr_score)
    print('短评评分平均分为:', avg_score)
else:
    # Nothing was collected, so there is no mean to report.
    avg_score = None
    print('未获取到任何评分,无法计算平均分')
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。