
Tianya Forum — Python Public-Opinion Analysis Roundup (4): Epidemic Sentiment Analysis Code


Public-opinion analysis of "COVID-19" based on the Tianya forum

Complete data and code: https://download.csdn.net/download/weixin_43906500/14935218

1. Tianya Forum Data Architecture

The Tianya forum homepage consists of four main parts: a recommendation board, recommended posts, a search board, and other links. Posts related to COVID-19 are collected mainly through the search board.

The search board supports keyword retrieval; a search returns up to 75 pages of results, with 10 posts per page.
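For reference, each search-result page is addressed by a simple URL pattern; the sketch below shows how the 75 pages for one keyword can be enumerated (it uses the same search endpoint that the crawler in section 2.1 builds in its run() function):

from urllib.parse import quote

# Search-result URL for one page of a keyword search
# (the same pattern the crawler in section 2.1 uses).
def search_url(keyword, page_no):
    return 'http://search.tianya.cn/bbs?q={}&pn={}'.format(quote(keyword), page_no)

# For example, all 75 result pages for the keyword "新冠疫情":
urls = [search_url('新冠疫情', p) for p in range(1, 76)]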

From the post data we can extract the poster, posting time, click count, reply count, and the post content.

From the comments below each post we can extract the comment information for subsequent text analysis.

A user's homepage contains basic profile information such as nickname, level, number of followings, number of fans, Tianya points, and registration date.
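No scraping code for user homepages is included in this article; the following is a purely hypothetical sketch of how such a page could be parsed with the same requests/pyquery approach used below. The profile URL pattern and every CSS selector here are placeholders and would need to be verified against the real page markup.

from pyquery import PyQuery as pq
import requests

def parse_user_profile(user_id):
    # Placeholder URL pattern -- verify against the actual Tianya profile URL.
    url = 'http://www.tianya.cn/{}'.format(user_id)
    doc = pq(requests.get(url, timeout=30).text)
    return {
        'nickname': doc('.nickname').text(),        # placeholder selector
        'level': doc('.level').text(),               # placeholder selector
        'follows': doc('.follow-num').text(),        # placeholder selector
        'fans': doc('.fans-num').text(),             # placeholder selector
        'tianya_points': doc('.score').text(),       # placeholder selector
        'register_date': doc('.reg-date').text(),    # placeholder selector
    }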

2. Technical Implementation of the Public-Opinion Analysis

2.1 Data Acquisition

The data is collected with a Python crawler:

from pyquery import PyQuery as pq
import requests
from urllib.parse import quote
from time import sleep
import json

page = 75
key_word = '时政'


def prase_all_page(urls):
    """
    Parse all search-result pages, collect the post URLs,
    and expose the reply count so comment-less posts can be filtered.
    :param urls: search-result page URLs
    :return: content_urls
    """
    content_urls = []
    for url in urls:
        sleep(1)
        print('Fetching:', url)
        doc = pq(requests.get(url=url, timeout=30).text)
        doc('.searchListOne li:last-child').remove()  # drop the last, useless li node
        lis = doc('.searchListOne li').items()  # generator over the result items
        for li in lis:
            reverse = li('.source span:last-child').text()  # reply count shown in the list
            a = li('a:first-child')
            content_url = a.attr('href')
            content_urls.append(content_url)
    return content_urls


def prase_all_content(urls):
    """
    Fetch every post page and extract its information.
    :param urls: post URLs
    :return:
    """
    dic = []
    i = 0
    for url in urls:
        print(i)
        i = i + 1
        try:
            dic1 = {}
            print('Parsing:', url)
            doc = pq(requests.get(url=url, timeout=30).text)
            title = doc('.atl-head .atl-title').text()
            main_id = doc('.atl-head .atl-menu').attr('_host')
            replytime = doc('.atl-head .atl-menu').attr('js_replytime')
            if replytime[0:4] != "2020":  # keep only posts from 2020
                continue
            replycount = doc('.atl-head .atl-menu').attr('js_replycount')
            clickcount = doc('.atl-head .atl-menu').attr('js_clickcount')
            article = next(doc('.bbs-content').items()).text()
            dic1["title"] = str(title)
            dic1["main_id"] = main_id
            dic1["time"] = replytime
            dic1["replycount"] = replycount
            dic1["clickcount"] = clickcount
            dic1["article"] = article
            comments_replys = []
            comments = doc('.atl-main div:gt(1)').items()  # comment blocks after the banner ad
            for comment in comments:
                dic3 = {}
                dic4 = {}
                dic5 = {}
                host_id = comment.attr('_hostid')
                comment_text = comment('.bbs-content').text()
                replys = comment('.item-reply-view li').items()  # replies attached to this comment
                for reply in replys:
                    rid = reply.attr('_rid')
                    rtext = reply('.ir-content').text()
                    if rid and rid != main_id and rid != host_id:
                        dic5[host_id] = rtext
                if host_id:
                    # strip the quoted text before the "----" separator, if present
                    k = comment_text.rfind("----------------------------")
                    if k != -1:
                        comment_text = comment_text[k + 29:]
                    dic4[host_id] = comment_text
                    dic3['comment'] = dic4
                    dic3['reply'] = dic5
                    comments_replys.append(dic3)
            dic1["comments_replys"] = comments_replys
            dic.append(dic1)
        except Exception:
            continue
    print(json.dumps(dic, ensure_ascii=False, indent=4))
    file_name = key_word + ".json"
    with open(file_name, 'w', encoding='utf-8') as f:
        json.dump(dic, f, ensure_ascii=False, indent=4)


def run(key, page):
    """
    Build the search URLs for the keyword, then crawl and parse every post.
    :param key: search keyword
    :param page: number of search-result pages
    :return:
    """
    start_urls = []
    for p in range(1, page + 1):
        url = 'http://search.tianya.cn/bbs?q={}&pn={}'.format(quote(key), p)
        start_urls.append(url)
    content_urls = prase_all_page(start_urls)
    prase_all_content(content_urls)


if __name__ == '__main__':
    run(key_word, page)

The results are as follows:
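The original screenshot of the crawl output is not reproduced here. For reference, each record written to the JSON file has roughly the following shape (the field names come from the code above; the values are placeholders):

[
    {
        "title": "<post title>",
        "main_id": "<poster id>",
        "time": "2020-<MM>-<DD> ...",
        "replycount": "<reply count>",
        "clickcount": "<click count>",
        "article": "<post text>",
        "comments_replys": [
            {
                "comment": {"<commenter id>": "<comment text>"},
                "reply": {"<commenter id>": "<reply text>"}
            }
        ]
    }
]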

2.2 Trend Analysis

By counting posting times we obtain the number of posts per month, and by counting the words in each month's posts we extract a representative keyword per month; trend analysis is then performed on these series.

import json
from collections import Counter
from pyecharts.charts import Bar
import jieba
from pyecharts import options as opts


def get_stopwords():
    # Load the stop-word list and add a few extra tokens to drop.
    with open("stopword.txt", 'r', encoding="utf-8") as f:
        stopwords = [line.strip() for line in f.readlines()]
    stopwords.extend(['\n', ' '])
    return stopwords


def get_article_count_plus():
    with open("data.json", 'r', encoding='utf-8') as load_f:
        load_dict = json.load(load_f)
    stopwords = get_stopwords()
    months = []     # one "YYYY-MM" entry per post
    dic_word = {}   # month -> segmented words of that month's posts
    for dic in load_dict:
        time = dic['time']
        month = time[0:7]
        months.append(month)
        seg_list = jieba.lcut(dic['article'])
        if month in dic_word:
            dic_word[month].extend(seg_list)
        else:
            dic_word[month] = seg_list
    month_count = dict(Counter(months))
    d = sorted(month_count.items(), key=lambda item: item[0])
    # Pick one representative keyword per month: the most frequent word
    # that has not already been used for an earlier month.
    key_word_used = []
    key_word = []
    for k in d:
        m = k[0]
        words = [i for i in dic_word[m] if i not in stopwords]
        word_list = Counter(words).most_common(12)
        for i in word_list:
            if i[0] not in key_word_used:
                key_word.append(i[0])
                key_word_used.append(i[0])
                break
        else:
            key_word.append('')  # keep the keyword list aligned with the months
    columns = [i[0] for i in d]
    data = [i[1] for i in d]
    col = []
    for i in range(len(columns)):
        c1 = columns[i].find('-')
        col.append(columns[i][c1 + 1:] + '(' + key_word[i] + ')')
    print(col)
    print(data)
    return col, data


if __name__ == "__main__":
    col, data = get_article_count_plus()
    c = (
        Bar()
        .add_xaxis(col)
        .add_yaxis("Post count", data)
        .set_global_opts(title_opts=opts.TitleOpts(title="Monthly post count and keywords",
                                                   subtitle="Bar chart"))
    )
    c.render("article_conut_plus.html")

The visualization, a bar chart of monthly post counts labelled with each month's keyword, is rendered to article_conut_plus.html and shown below.

2.3 Word Cloud

All articles are segmented with jieba, stop words are removed, and word frequencies are counted; a word cloud is then drawn with Python.
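The word-cloud script itself is not included in this excerpt. Below is a minimal sketch of how it could be written, assuming the wordcloud package is installed, the same data.json and stopword.txt files as in section 2.2 are used, and a font with Chinese glyphs is available locally (the path simhei.ttf is only a placeholder):

import json
from collections import Counter
import jieba
from wordcloud import WordCloud  # assumes the wordcloud package is installed

# Segment every article, drop stop words, and count word frequencies.
with open("data.json", 'r', encoding='utf-8') as f:
    posts = json.load(f)
with open("stopword.txt", 'r', encoding='utf-8') as f:
    stopwords = set(line.strip() for line in f) | {'\n', ' '}

words = []
for post in posts:
    words.extend(w for w in jieba.lcut(post['article']) if w not in stopwords)
freq = Counter(words)

# Render the word cloud; font_path must point to a font with Chinese glyphs.
wc = WordCloud(font_path='simhei.ttf', background_color='white',
               width=800, height=600)
wc.generate_from_frequencies(freq)
wc.to_file('wordcloud.png')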
