Full data and code: https://download.csdn.net/download/weixin_43906500/14935218
The Tianya forum homepage consists of four parts: recommended boards, recommended posts, the search panel, and other links. Posts related to the COVID-19 epidemic are collected mainly through the search panel.
The search panel supports keyword queries and returns up to 75 result pages, with 10 posts per page.
From each post we can extract the author, posting time, click count, reply count, and the post body.
From the comments under each post we can extract the comment text for later text analysis.
A user's profile page contains basic account information such as nickname, level, following count, follower count, Tianya points, and registration date.
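The crawler below does not collect these profile fields; as a rough sketch of how they could be fetched with the same requests + pyquery approach, here is a hypothetical helper. The profile URL and every CSS selector in it are assumptions and would have to be checked against the real Tianya profile page.

from pyquery import PyQuery as pq
import requests

def parse_user_profile(profile_url):
    """Fetch one user profile page and pull out the basic fields.
    All selectors below are placeholders, not verified against the live page."""
    doc = pq(requests.get(profile_url, timeout=30).text)
    return {
        'nickname': doc('.userinfo .nickname').text(),           # hypothetical selector
        'level': doc('.userinfo .level').text(),                 # hypothetical selector
        'following': doc('.userinfo .following-count').text(),   # hypothetical selector
        'followers': doc('.userinfo .fans-count').text(),        # hypothetical selector
        'tianya_points': doc('.userinfo .score').text(),         # hypothetical selector
        'register_date': doc('.userinfo .register-date').text(), # hypothetical selector
    }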
Fetch the data with a Python crawler:
from pyquery import PyQuery as pq
import requests
from urllib.parse import quote
from time import sleep
import json

page = 75          # number of search result pages to crawl
key_word = '时政'   # search keyword


def prase_all_page(urls):
    """
    Parse every search result page and collect the post URLs
    (posts without comments were meant to be filtered out here).
    :param urls: list of search page URLs
    :return: content_urls
    """
    content_urls = []
    for url in urls:
        sleep(1)
        print('Fetching:', url)
        doc = pq(requests.get(url=url, timeout=30).text)
        doc('.searchListOne li:last-child').remove()  # drop the last, useless li node
        lis = doc('.searchListOne li').items()        # generator over result items
        for li in lis:
            reverse = li('.source span:last-child').text()  # reply count shown in the list (not used for filtering)
            a = li('a:first-child')
            content_url = a.attr('href')
            content_urls.append(content_url)
    return content_urls


def prase_all_content(urls):
    """
    Fetch every post page and extract its fields, comments and replies.
    :param urls: list of post URLs
    :return:
    """
    dic = []
    i = 0
    for url in urls:
        print(i)
        i = i + 1
        try:
            dic1 = {}
            print('Parsing:', url)
            doc = pq(requests.get(url=url, timeout=30).text)
            title = doc('.atl-head .atl-title').text()
            main_id = doc('.atl-head .atl-menu').attr('_host')
            replytime = doc('.atl-head .atl-menu').attr('js_replytime')
            if replytime[0:4] != "2020":   # keep only posts from 2020
                continue
            print(replytime)
            replycount = doc('.atl-head .atl-menu').attr('js_replycount')
            clickcount = doc('.atl-head .atl-menu').attr('js_clickcount')
            article = next(doc('.bbs-content').items()).text()
            dic1["title"] = str(title)
            dic1["main_id"] = main_id
            dic1["time"] = replytime
            dic1["replycount"] = replycount
            dic1["clickcount"] = clickcount
            dic1["article"] = article

            comments_replys = []
            comments = doc('.atl-main div:gt(1)').items()  # comment blocks after the banner ad
            for comment in comments:
                dic3 = {}
                dic4 = {}
                dic5 = {}
                host_id = comment.attr('_hostid')
                comment_text = comment('.bbs-content').text()
                replys = comment('.item-reply-view li').items()  # replies to this comment
                for reply in replys:
                    rid = reply.attr('_rid')
                    rtext = reply('.ir-content').text()
                    if rid and rid != main_id and rid != host_id:
                        dic5[host_id] = rtext

                if host_id:
                    # strip the quoted text before the separator line
                    k = comment_text.rfind("----------------------------")
                    if k != -1:
                        comment_text = comment_text[k + 29:]
                    dic4[host_id] = comment_text
                    dic3['comment'] = dic4
                    dic3['reply'] = dic5
                    comments_replys.append(dic3)

            dic1["comments_replys"] = comments_replys
            dic.append(dic1)
        except Exception:
            continue

    string = json.dumps(dic, ensure_ascii=False, indent=4)
    print(string)
    file_name = key_word + ".json"
    with open(file_name, 'w', encoding='utf-8') as f:
        json.dump(dic, f, ensure_ascii=False, indent=4)


def run(key, page):
    """
    Build the search URLs, then parse the result pages and the post pages.
    :param key: search keyword
    :param page: number of search result pages
    :return:
    """
    start_urls = []
    for p in range(1, page + 1):
        url = 'http://search.tianya.cn/bbs?q={}&pn={}'.format(quote(key), p)
        start_urls.append(url)
    content_urls = prase_all_page(start_urls)
    prase_all_content(content_urls)


if __name__ == '__main__':
    run(key_word, page)
The results are as follows:
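As a quick check on what the crawler saved, a minimal sketch that loads the JSON written above and prints the fields of one record (the crawler writes key_word + ".json", i.e. 时政.json here; the analysis script further down reads data.json, so the file is presumably renamed in between):

import json

# Load the file written by prase_all_content()
with open('时政.json', 'r', encoding='utf-8') as f:
    posts = json.load(f)

print('posts collected:', len(posts))
first = posts[0]
# Fields stored for every post
print(first['title'], first['time'], first['replycount'], first['clickcount'])
# Each comment block holds a 'comment' dict and a 'reply' dict keyed by user id
print('comment blocks:', len(first['comments_replys']))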
By aggregating posting times we obtain the number of posts per month; by analyzing each month's posts we extract a representative keyword for that month; a trend analysis is then carried out.
import json
from collections import Counter
from pyecharts.charts import Bar
import jieba
from pyecharts import options as opts


# load the stop-word list
def get_stopwords():
    stopwords = [line.strip() for line in open("stopword.txt", 'r', encoding="utf-8").readlines()]
    stopwords.extend(['\n', ' '])
    return stopwords


def get_article_count_plus():
    with open("data.json", 'r', encoding='utf-8') as load_f:
        load_dict = json.load(load_f)

    stopwords = get_stopwords()
    months = []
    dic_word = {}
    for dic in load_dict:
        time = dic['time']
        month = time[0:7]                      # 'YYYY-MM'
        months.append(month)
        seg_list = jieba.lcut(dic['article'])  # segment the post body
        if month in dic_word.keys():
            dic_word[month].extend(seg_list)
        else:
            dic_word[month] = seg_list         # keep the first post of the month as well

    dic = dict(Counter(months))                        # posts per month
    d = sorted(dic.items(), key=lambda item: item[0])  # sort by month

    # pick one representative keyword per month, skipping words already used
    key_word_used = []
    key_word = []
    for k in d:
        m = k[0]
        words = [i for i in dic_word[m] if i not in stopwords]
        word_count = Counter(words)
        word_list = word_count.most_common(12)
        for i in word_list:
            if i[0] not in key_word_used:
                key_word.append(i[0])
                key_word_used.append(i[0])
                break

    columns = [i[0] for i in d]
    data = [i[1] for i in d]

    # x-axis labels in the form 'MM(keyword)'
    col = []
    for i in range(len(columns)):
        c1 = columns[i].find('-')
        m = columns[i][c1 + 1:] + '(' + key_word[i] + ')'
        col.append(m)

    print(col)
    print(data)
    return col, data


if __name__ == "__main__":
    col, data = get_article_count_plus()
    c = (
        Bar()
        .add_xaxis(col)
        .add_yaxis("Posts", data)
        .set_global_opts(title_opts=opts.TitleOpts(title="Posts per month and keywords", subtitle="Bar chart"))
    )
    c.render("article_count_plus.html")
The visualization result is as follows.
All posts are segmented with jieba, stop words are removed, and word frequencies are counted; Python is then used to visualize the result.
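The sentence above is cut off in the source; a word cloud of the overall word frequencies is one plausible continuation. The sketch below reuses the stopword.txt and data.json files from the analysis script and renders with pyecharts' WordCloud chart, which is an assumption, not something stated in the original.

import json
from collections import Counter
import jieba
from pyecharts.charts import WordCloud

# Stop-word list, same construction as in the analysis script above
stopwords = [line.strip() for line in open("stopword.txt", 'r', encoding="utf-8").readlines()] + ['\n', ' ']

with open("data.json", 'r', encoding='utf-8') as f:
    posts = json.load(f)

# Segment every post body, drop stop words, and count word frequencies
words = []
for post in posts:
    words.extend(w for w in jieba.lcut(post['article']) if w not in stopwords)
counts = Counter(words).most_common(100)

# Render the top words as a word cloud
wc = WordCloud()
wc.add("", counts, word_size_range=[12, 60])
wc.render("word_cloud.html")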