当前位置:   article > 正文

Python - 爬取豆瓣短评评论_python爬短评

python爬短评

Python - 爬取豆瓣短评评论

  1. import requests
  2. from bs4 import BeautifulSoup
  3. import re
  4. import time
# Path of the text file that receives the scraped Douban scores; get_book
# opens it in append mode and writes the book title followed by its score.
# NOTE(review): hard-coded Windows-style path — the directory must already exist.
source_douban_score = r'e:/test/txt/douban_book_score.txt'
  7. # 获取书名
  8. def get_book_name(soup):
  9. nbg = soup.find('a','nbg')
  10. book_tilte = nbg['title']
  11. return book_tilte
  12. # 获取评论
  13. def get_comment(soup):
  14. shorts = soup.find_all('span','short')
  15. for short in shorts:
  16. short_content = short.string
  17. #short_tag = short.name
  18. #short_attrs = short.attrs
  19. print(short_content)
  20. # 获取评分
  21. def get_book_score(markup):
  22. pattern_s = re.compile(pattern='user-stars allstar(.*) rating')
  23. stars = re.findall(pattern=pattern_s,string=markup)
  24. sum_star = 0
  25. sum_len = len(stars)
  26. avg_score = 0
  27. if sum_len>0:
  28. for i in range(0,sum_len):
  29. star = stars[i]
  30. sum_star+= int(star)
  31. avg_score = sum_star//sum_len
  32. return avg_score
# Request headers sent with every page fetch in get_book.
# The Cookie value is tied to one browser session; obtain your own from a
# logged-in browser and replace it here before running the script.
headers = {
    'Cookie': 'll="118159"; bid=Tlj9LZXK6qY; __utma=30149280.1907998096.1644908381.1644908381.1644908381.1; __utmc=30149280; __utmz=30149280.1644908381.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utma=81379588.181811108.1644908384.1644908384.1644908384.1; __utmc=81379588; __utmz=81379588.1644908384.1.1.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; gr_user_id=4cc23846-8110-487a-9875-d2c22a01ffc5; gr_cs1_beee101c-a4d0-4c07-acf1-64c7195fdd7b=user_id%3A0; ap_v=0,6.0; _pk_ref.100001.3ac3=%5B%22%22%2C%22%22%2C1644908384%2C%22https%3A%2F%2Fwww.douban.com%2F%22%5D; _pk_ses.100001.3ac3=*; _vwo_uuid_v2=DDF9B30D066068B64AEED05150ED2CC21|59ca475fd37ba868d8b940b30cb8051c; __gads=ID=30286cce13d6257e-222fedbd9fd00006:T=1644908408:RT=1644908408:S=ALNI_Mbxvt-k0lb2MC6tAcI60A5qPobCCw; viewed="35680544"; __utmb=30149280.4.10.1644908381; __utmb=81379588.3.10.1644908384; __yadk_uid=OU6iPv3WVtSb7pup23vqweBrH4Gj0jdG; _pk_id.100001.3ac3=5c3c19525f813f77.1644908384.1.1644908470.1644908384.; _ga=GA1.1.1475140875.1644908471; refer_url=https://read.douban.com/category/105; _ga_RXNMP372GL=GS1.1.1644908471.1.1.1644909370.60',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="96", "Google Chrome";v="96"',
    # Host matches the book.douban.com URLs built in get_book.
    'Host': 'book.douban.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
}
  40. def get_book(book_id):
  41. douban_url = 'https://book.douban.com/subject/{}/'.format(book_id)
  42. print('获取书号:',book_id,douban_url)
  43. try:
  44. print()
  45. r = requests.get(douban_url,headers = headers)
  46. if r.status_code == 200:
  47. time.sleep(.6)
  48. markup = r.text
  49. soup = BeautifulSoup(markup,'lxml')
  50. # 获取书名
  51. book_name = get_book_name(soup)
  52. # 获取评论
  53. # get_comment()
  54. # 获取评分
  55. score = get_book_score(markup)
  56. if score >0:
  57. with open(source_douban_score,'a') as f:
  58. content = book_name+' '+ str(score)
  59. f.write(content)
  60. print(content)
  61. else:
  62. print('获取失败,',r.status_code)
  63. except Exception as err:
  64. print('获取错误',err)
  65. # 获取 https://book.douban.com/subject/35680544/
  66. get_book(str(35680544))

本文内容由网友自发贡献,转载请注明出处:【wpsshop博客】
推荐阅读
相关标签
  

闽ICP备14008679号