```python
# Data crawling
import time
import requests
from lxml import etree

def start_crawl(base_url, month, number):
    for i in range(2, month):            # month range to crawl
        for j in range(1, number):       # pages per month
            headers = request_header()   # request-header helper (see sketch below)
            url = base_url.format(i, i + 1, j)
            time.sleep(0.5)              # throttle requests
            response = requests.get(url=url, headers=headers)
            res = etree.HTML(response.text)
            yield res                    # yield the parsed HTML tree page by page
```
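The `request_header()` helper is called above but not shown in this excerpt. A minimal sketch of what it could look like, assuming it only rotates a random User-Agent (the headers the original code actually sends, e.g. a login Cookie, may differ):

```python
import random

# Hypothetical helper: the original post defines its own request_header();
# this sketch only picks a random User-Agent string.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.1 Safari/605.1.15",
]

def request_header():
    return {"User-Agent": random.choice(USER_AGENTS)}
```

If the target site requires a login, a `Cookie` entry would also have to be added to the returned dictionary.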
```python
# Fetch and clean the data
def data_clean(base_url, month, number):
    res = start_crawl(base_url, month, number)
    data = []
    for html in res:
        div_list = html.xpath('//div[@class="content"]')
        for div in div_list:
            info = []
            name = div.xpath('./div[@class="info"]//a[@class="name"]/text()')
            content = div.xpath('./p[@class="txt"]//text()')
            time = div.xpath('./p[@class="from"]/a[1]/text()')
            name = name[0] if len(name) > 0 else None            # media/account name
            time = time[0].strip() if len(time) > 0 else None    # publication time
            content = ''.join([i.strip() for i in content])      # post text
            info.append(name)
            info.append(time)
            info.append(content)
            if info[0] is not None and info[1] is not None:      # keep only complete rows
                data.append(info)
    return data
```
Here is a classic screenshot from inspecting the page elements:

Below is a sample of the resulting data table:
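The sample table appears as an image in the original post. As a rough sketch of how the cleaned rows could be turned into that table and saved to the wy.xlsx file read in the later steps, assuming the column names 媒体名 / 发布时间 / 文章 used below and a purely hypothetical base_url template:

```python
import pandas as pd

# Hypothetical driver step: base_url, month and number are placeholders,
# not the exact values used in the original post.
base_url = "https://example.com/search?start={}月&end={}月&page={}"
rows = data_clean(base_url, month=4, number=50)

# Column names follow the ones referenced in the analysis below.
df = pd.DataFrame(rows, columns=["媒体名", "发布时间", "文章"])
df.to_excel("C:/Users/雷神/Desktop/wy.xlsx")   # same file the word-cloud step reads back
```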
```python
import numpy as np
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['SimHei']     # render Chinese labels
plt.rcParams['axes.unicode_minus'] = False

# "data": DataFrame of the scraped posts with columns 媒体名 / 发布时间 / 文章
# Drop rows whose publication time is "今天" (today), which would distort the monthly trend
data = data.loc[data['发布时间'].str.contains('今天') == False, :]
data.index = np.arange(len(data))

# Keep only the month part of the publication time
data["发布时间"] = data["发布时间"].apply(lambda x: x[:3])

# Aggregate by publication time: number of posts per month
time_count = data.groupby(by="发布时间")["文章"].count()

# Line chart of the trend over time
plt.plot(time_count.index, time_count.values, "-g")
plt.title("随时间的热度走向图")
plt.xlabel("时间(月份)")
plt.ylabel("发布量(个)")
plt.show()
```
```python
# Bar-chart version of the same monthly counts
plt.bar(x=time_count.index, height=time_count.values.ravel(), align='center', color="g")
plt.title("随时间的热度走向图")
plt.xlabel("时间(月份)")
plt.ylabel("发布量(个)")
plt.show()
```
```python
import pandas as pd

# Hand-curated list of official media account names
OMN = ["青岛文明网", "880山西交通广播", "楚天交通广播", "央视影音", "抚州南丰发布",
       "珠江商报", "云南日报", "国家邮政局", "中国青年报", "武汉晚报", "长江日报",
       "新都资讯", "成都少先队", "成都发布", "潍坊政法", "杭锦发布", "四川党的建设杂志",
       "法治日报", "中国舆论场", "苏州发布", "新浪辽宁", "山西经济广播", "山西共青团",
       "北京朝阳", "江西共青团", "包头新闻网", "鄂尔多斯发布", "羊城晚报", "保定发布",
       "陇南礼县发布", "中国残联", "人民日报全国党媒平台", "四川文明网", "内蒙古团委",
       "武汉广播电视台", "湖北卫视", "潍坊市人民检察院", "湖北省妇联", "湖北日报",
       "人民日报", "江西卫视根据地", "陕西新闻广播", "南京晨报", "今晚报", "广东共青团",
       "南昌日报", "河北综合广播", "陕西都市快报", "楚天交通广播", "陕西新广", "青海网",
       "山东卫视", "渭南日报社", "国家应急广播", "央视新闻", "罗湖共青团", "内蒙古团委",
       "三秦青年", "共青团青岛市委", "中国共青团杂志", "重庆共青团", "延安青年", "共青团中央",
       "西藏共青团", "大同共青团", "中国青年报", "黑龙江晨报", "国家邮政局", "四川共青团",
       "合肥日报", "陕西日报", "南京晨报", "中国新闻网", "江苏共青团", "共青团南川区", "惠州共青团",
       "人民法院报", "天津日报", "青海共青团", "广西卫视", "贺州共青团", "共青团包头市委员会",
       "西藏共青团", "武汉发布", "中国网", "山西政法", "云南网", "吉林人民广播电台"]

# Split the account names into official media (OM) and self-media (PM)
OM = []
PM = []
for name in data["媒体名"]:
    if name in OMN:
        OM.append(name)
    else:
        PM.append(name)
OM = pd.Series(OM)
PM = pd.Series(PM)

# Pie chart: share of official media vs self-media
plt.figure(figsize=(10, 8))
x = (OM.size, PM.size)
labels = ["官媒", "自媒体"]
plt.pie(x=x, labels=labels, colors=['red', 'c'], autopct='%.1f%%', pctdistance=0.5,
        labeldistance=1.2, radius=1.2, explode=[0, 0.1],
        wedgeprops={'linewidth': 1.5, 'edgecolor': 'green'},
        textprops={'fontsize': 10, 'color': 'black'})
plt.title("官媒和自媒体占比图")
plt.show()
```
```python
# Split the data into official-media and self-media subsets
OM_data = data_.loc[data_['媒体名'].isin(OM)]
PM_data = data_.loc[~data_['媒体名'].isin(OM)]

# Aggregate each subset by publication time
om_plot = OM_data.groupby(by="发布时间")["文章"].count()
pm_plot = PM_data.groupby(by="发布时间")["文章"].count()

# Plot both trends on one axis
plt.figure(figsize=(10, 8))
ax = plt.subplot(111)
ax.plot(om_plot.index, om_plot.values, linewidth=3, label="官媒")
ax.plot(pm_plot.index, pm_plot.values, linewidth=3, label="自媒体")
plt.title("官媒和自媒体随时间的热度走向图")
plt.xlabel("时间(月份)")
plt.ylabel("发布量(个)")
plt.legend(loc="best")
plt.show()
```
```python
# Keep only the February and March dates
data_["发布时间"] = data_["发布时间"].apply(lambda x: x[:6])      # keep the "MM月DD日"-style prefix
data_1 = data_.loc[data_["发布时间"].str.contains("02月")]
data_2 = data_.loc[data_["发布时间"].str.contains("03月")]
data_ = pd.concat([data_1, data_2], axis=0)
data_.index = np.arange(len(data_))

# Split into official media and self-media again
OM_data_ = data_.loc[data_['媒体名'].isin(OM)]
PM_data_ = data_.loc[~data_['媒体名'].isin(OM)]

# Aggregate each subset by day
om_plot_ = OM_data_.groupby(by="发布时间")["文章"].count()
pm_plot_ = PM_data_.groupby(by="发布时间")["文章"].count()

# Plot the two subsets side by side
plt.figure(figsize=(15, 5))
ax = plt.subplot(121)
ax.plot(om_plot_.index, om_plot_.values, linewidth=3, label="官媒")
plt.title("2月~3月官媒随时间的热度走向图")
plt.xlabel("时间(天数)")
plt.ylabel("发布量(个)")
plt.legend(loc="best")

ax_2 = plt.subplot(122)
ax_2.plot(pm_plot_.index, pm_plot_.values, linewidth=3, label="自媒体")
plt.title("2月~3月自媒体随时间的热度走向图")
plt.xlabel("时间(天数)")
plt.ylabel("发布量(个)")
plt.xticks([])       # hide the crowded daily tick labels
plt.legend(loc="best")
plt.show()
```
```python
import jieba
import PIL.Image
from wordcloud import WordCloud

# Re-read the scraped table and drop the "今天" (today) rows again
data_ = pd.read_excel('C:/Users/雷神/Desktop/wy.xlsx', sheet_name=0, index_col=0)
data_ = data_.loc[data_['发布时间'].str.contains('今天') == False, :]

# Concatenate all post texts into one long string
article = data_["文章"]
text = []
for a in article:
    text.append(a)
text = ",".join(str(i) for i in text)

# Segment the text into words with jieba (full mode)
def chinese_jieba(text):
    wordlist_jieba = jieba.cut(text, cut_all=True)
    space_wordlist = '/'.join(wordlist_jieba)
    return space_wordlist

# Draw the word cloud from the saved article text
with open("F:/wangyong/article.txt", encoding="utf-8") as f:
    text = f.read()
text = chinese_jieba(text)

image = PIL.Image.open('F:/wangyong/bg2.jpg')     # mask image that shapes the cloud
MASK = np.array(image)
wordcloud = WordCloud(font_path="C:\\Windows\\Fonts\\simkai.ttf",
                      background_color="white", width=800,
                      repeat=False, mask=MASK,
                      height=500, max_words=180, min_font_size=8).generate(text)
# img = wordcloud.to_image()
# img.show()
wordcloud.to_file('F:/wangyong/wordcloud.png')

plt.figure(figsize=(10, 8), dpi=100)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
```
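Note that the word-cloud step reads the post text back from F:/wangyong/article.txt, a file whose creation is not shown above. A minimal sketch of that intermediate step, assuming the concatenated `text` string built earlier in the same script is simply written to that path:

```python
# Hypothetical intermediate step: persist the concatenated post text so the
# word-cloud section can read it back from F:/wangyong/article.txt.
with open("F:/wangyong/article.txt", "w", encoding="utf-8") as f:
    f.write(text)
```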