基于天涯论坛的"新冠疫情"舆情分析











  1. from pyquery import PyQuery as pq
  2. import requests
  3. from urllib.parse import quote
  4. from time import sleep
  5. import json
# Number of search-result pages to request; passed to run() by the script entry point.
page: int = 75
# Search keyword; prase_all_content also uses it as the stem of the output file (key_word + ".json").
key_word: str = '时政'
  8. def prase_all_page(urls):
  9. """
  10. 解析所有搜索页,获取帖子url,过滤无评论帖子
  11. :param urls:
  12. :return: content_urls
  13. """
  14. content_urls = []
  15. for url in urls:
  16. sleep(1)
  17. print('正在抓取:', url)
  18. doc = pq(requests.get(url=url, timeout=30).text)
  19. # print(doc)
  20. doc('.searchListOne li:last-child').remove() # 删除最后一个无用li节点
  21. lis = doc('.searchListOne li').items() # 获取content节点生成器
  22. for li in lis:
  23. reverse = li('.source span:last-child').text()
  24. a = li('a:first-child')
  25. content_url = a.attr('href')
  26. # print(content_url)
  27. # print('评论数:', reverse)
  28. content_urls.append(content_url)
  29. return content_urls
  30. def prase_all_content(urls):
  31. """
  32. 获取网页相关信息
  33. :param urls:
  34. :return:
  35. """
  36. dic = []
  37. i = 0
  38. for url in urls:
  39. print(i)
  40. i = i + 1
  41. try:
  42. dic1 = {}
  43. print('正在解析:', url)
  44. doc = pq(requests.get(url=url, timeout=30).text)
  45. title = doc('.atl-head .atl-title').text()
  46. main_id = doc('.atl-head .atl-menu').attr('_host')
  47. replytime = doc('.atl-head .atl-menu').attr('js_replytime')
  48. loc = replytime.rfind('-')
  49. # print(replytime)
  50. print(replytime[0:4])
  51. if(replytime[0:4]!="2020 "):
  52. continue
  53. print(replytime)
  54. replycount = doc('.atl-head .atl-menu').attr('js_replycount')
  55. clickcount = doc('.atl-head .atl-menu').attr('js_clickcount')
  56. article = next(doc('.bbs-content').items()).text()
  57. dic1["title"] = str(title)
  58. dic1["main_id"] = main_id
  59. dic1["time"] = replytime
  60. dic1["replycount"] = replycount
  61. dic1["clickcount"] = clickcount
  62. dic1["article"] = article
  63. comments_replys = []
  64. comments = doc('.atl-main div:gt(1)').items() # 通栏广告后的评论列表
  65. for comment in comments: # 处理评论
  66. dic3 = {}
  67. dic4 = {}
  68. dic5 = {}
  69. host_id = comment.attr('_hostid')
  70. # user_name = comment.attr('_host')
  71. comment_text = comment('.bbs-content').text()
  72. replys = comment('.item-reply-view li').items() # 评论回复
  73. if replys != None:
  74. for reply in replys:
  75. rid = reply.attr('_rid')
  76. rtext = reply('.ir-content').text()
  77. if rid:
  78. if rid != main_id and rid != host_id:
  79. dic5[host_id] = rtext
  80. if host_id:
  81. k = comment_text.rfind("----------------------------")
  82. if (k != -1):
  83. comment_text = comment_text[k + 29:]
  84. dic4[host_id] = comment_text
  85. dic3['comment'] = dic4
  86. dic3['reply'] = dic5
  87. comments_replys.append(dic3)
  88. dic1["comments_replys"] = comments_replys
  89. dic.append(dic1)
  90. except:
  91. continue
  92. string = json.dumps(dic, ensure_ascii=False, indent=4)
  93. print(string)
  94. file_name = key_word + ".json"
  95. f = open(file_name,'w',encoding='utf-8')
  96. json.dump(dic,f,ensure_ascii=False, indent=4)
  97. def run(key, page):
  98. """
  99. :param key:
  100. :param page:
  101. :return:
  102. """
  103. start_urls = []
  104. for p in range(1, page+1):
  105. url = 'http://search.tianya.cn/bbs?q={}&pn={}'.format(quote(key), p)
  106. start_urls.append(url)
  107. content_urls = prase_all_page(start_urls)
  108. # print(content_urls)
  109. prase_all_content(content_urls)
  110. if __name__ == '__main__':
  111. run(key_word, page)




  1. import json
  2. from collections import Counter
  3. from pyecharts.charts import Bar
  4. import jieba
  5. from pyecharts import options as opts
  6. #去除停用词
  7. def get_stopwords():
  8. stopwords = [line.strip() for line in open("stopword.txt", 'r',encoding="utf-8").readlines()]
  9. stopwords_other = ['\n',' ']
  10. stopwords.extend(stopwords_other)
  11. return stopwords
  12. def get_article_count_plus():
  13. with open("data.json",'r',encoding='utf-8') as load_f:
  14. load_dict = json.load(load_f)
  15. stopwords = get_stopwords()
  16. list = []
  17. dic_word = {}
  18. for dic in load_dict:
  19. time = dic['time']
  20. loc = time.rfind('-')
  21. list.append(time[0:7])
  22. article = dic['article']
  23. seg_list = jieba.lcut(article)
  24. month = time[0:7]
  25. if month in dic_word.keys():
  26. dic_word[month].extend(seg_list)
  27. else:
  28. dic_word[month] = []
  29. dic = dict(Counter(list))
  30. d = sorted(dic.items(), key=lambda d:d[0])
  31. key_word_used = []
  32. key_word = []
  33. for k in d:
  34. m = k[0]
  35. list = [i for i in dic_word[m] if i not in stopwords]
  36. word_count = Counter(list)
  37. word_list = word_count.most_common(12)
  38. for i in word_list:
  39. if(i[0] not in key_word_used):
  40. key_word.append(i[0])
  41. key_word_used.append(i[0])
  42. break
  43. columns = [i[0] for i in d]
  44. data = [i[1] for i in d]
  45. col = []
  46. for i in range(len(columns)):
  47. c1 = columns[i].find('-')
  48. m = columns[i][c1+1:] + '(' + key_word[i] + ')'
  49. col.append(m)
  50. print(col)
  51. print(data)
  52. return col,data
  53. if __name__ == "__main__":
  54. col,data = get_article_count_plus()
  55. c = (
  56. Bar()
  57. .add_xaxis(col)
  58. .add_yaxis("发帖量", data)
  59. .set_global_opts(title_opts=opts.TitleOpts(title="发帖量及关键词统计", subtitle="柱状图"))
  60. )
  61. c.render("article_conut_plus.html")




