The Python version I used: Python 3.7.4
If any of the libraries below are unfamiliar, you can look them up on w3cschool.
Open the Baidu News site and press F12 to open the developer tools, or right-click and choose View Source, to see the page's HTML source.
1. Each news page is fetched with requests to get its HTML source, which is then parsed with bs4 (BeautifulSoup) to extract the information.
2. Titles and links are obtained the same way for every category. Inspecting the source shows that each news title sits in an a tag inside an li tag, and every such a tag carries a target="_blank" attribute, so BeautifulSoup's select() can pick out the title and the link (see the sketch after this list).
As shown below (HTML features of where the news titles and links are stored):
3. The homepage also carries news hot-search words and the sports page carries sports hot-search words; their titles and links are stored with the same features as above.
4. Apart from the homepage, the article pages of the other categories have nearly identical source layouts. With each category's article links already collected, step 1 is repeated on every link to extract the editor information (editor name, edit date, edit time). The page source shows this information sits in a div block of class author-txt (div class="author-txt"): the editor's name is in a p tag of class author-name, and the edit date and time are in span tags of classes date and time. A small share of article pages uses a different layout and does not match these features; for those, -1 is stored in the information lists instead.
5. Each kind of information is stored in its own list.
6. The information in each list is written to a CSV file with the pandas library.
7. For each category's analysis and visualization, datetime supplies today's and yesterday's dates; a dict and a little arithmetic then give the percentage of each category's articles whose edit date is today, yesterday, or some other time (a minimal sketch follows the date-distribution figure below), and matplotlib draws the bar chart.
8. A tkinter window is designed with one button per news category; clicking a button displays that category's news information. The homepage and sports hot-search words are placed in the lower-left and lower-right corners.
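To make the selector pattern of steps 2 and 4 concrete before the full scripts, here is a minimal, self-contained sketch run against a small hand-written HTML fragment. The fragment is made up for illustration; only the tag/class structure mirrors what the steps describe:

from bs4 import BeautifulSoup

# A made-up fragment imitating the structures described in steps 2 and 4.
sample = '''
<li><a target="_blank" href="https://example.com/n1">News title 1</a></li>
<li><a target="_blank" href="https://example.com/n2">News title 2</a></li>
<div class="author-txt">
    <p class="author-name">Editor A</p>
    <span class="date">2019-12-01</span><span class="time">08:30</span>
</div>
'''
bs = BeautifulSoup(sample, 'html.parser')

# Step 2: titles/links are <a target="_blank"> inside <li>.
for li in bs.select('li'):
    links = li.select('a[target="_blank"]')  # CSS attribute selector
    if len(links) > 0:
        print(links[0].text, links[0]['href'])

# Step 4: editor info lives in <div class="author-txt">.
for block in bs.select('div.author-txt'):
    name = block.select('p.author-name')[0].text
    date = block.select('span.date')[0].text  # e.g. '2019-12-01'
    time = block.select('span.time')[0].text
    print(name, date[5:], time)               # date[5:] keeps only 'MM-DD'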
Clicking the buttons retrieves each category's information:
Analysis of the publication-date distribution:
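A minimal sketch of the counting behind this chart (step 7), using a hand-made news_date list in place of the scraped one:

import datetime

today = datetime.date.today()
yesterday = today - datetime.timedelta(days=1)
s = str(today)[5:]       # today as 'MM-DD'
s2 = str(yesterday)[5:]  # yesterday as 'MM-DD'

news_date = [s, s, s2, '11-28', '-1']  # made-up sample data

dx = {s: 0, s2: 0, '-1': 0}  # '-1' buckets everything that is neither today nor yesterday
for d in news_date:
    if d in dx:
        dx[d] += 1
    else:
        dx['-1'] += 1
for k in dx:
    dx[k] = dx[k] / len(news_date)

li = list(dx.values())  # [today%, yesterday%, other%]
print(li)               # here: [0.4, 0.2, 0.4]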
main.py (main program):
from tkinter import *
import datetime
import numpy as np
import matplotlib.pyplot as plt
from hp import news_title, news_url, hot_title, hot_url
from inte import news_title2, news_url2, news_date, news_time, news_author, li
from mil import news_title3, news_url3, news_date2, news_time2, news_author2, li2
from finance import news_title4, news_url4, news_date3, news_time3, news_author3, li3
from ent import news_title5, news_url5, news_date4, news_time4, news_author4, li4
from sports import news_title6, news_url6, news_date5, news_time5, news_author5, hot_title2, hot_url2, li5
from tech import news_title7, news_url7, news_date6, news_time6, news_author6, li6
from game import news_title8, news_url8, news_date7, news_time7, news_author7, li7

def hp_print():  # display the homepage news
    txt.delete('1.0', 'end')  # clear the Text widget
    txt.insert(END, '首页新闻标题\t新闻链接\n')
    for x in range(len(news_title)):
        txt.insert(END, news_title[x])
        txt.insert(END, '\t')
        txt.insert(END, news_url[x])
        txt.insert(END, '\n')

def show_news(title, url, date, time, author):  # display any category other than the homepage
    txt.delete('1.0', 'end')  # clear the Text widget
    txt.insert(END, '新闻标题\t新闻链接\t编辑日期\t编辑时间\t编辑作者\n')
    for x in range(len(title)):
        txt.insert(END, title[x])
        txt.insert(END, '\t')
        txt.insert(END, url[x])
        txt.insert(END, '\t')
        txt.insert(END, date[x])
        txt.insert(END, '\t')
        txt.insert(END, time[x])
        txt.insert(END, '\t')
        txt.insert(END, author[x])
        txt.insert(END, '\n')

root = Tk()
root.title('百度新闻-我知道!')  # window title
root.geometry('1024x560')
lb = Label(root, text='点击按钮,获得各类新闻中新闻信息(-1表示不清楚)')
lb.place(relx=0.1, rely=0.01, relwidth=0.8, relheight=0.08)
txt = Text(root)  # output box for the news of each category

btn1 = Button(root, text='首页', command=hp_print)
btn1.place(relx=0.005, rely=0.1, relwidth=0.05, relheight=0.05)
btn2 = Button(root, text='int', command=lambda: show_news(news_title2, news_url2, news_date, news_time, news_author))
btn2.place(relx=0.08, rely=0.1, relwidth=0.05, relheight=0.05)
btn3 = Button(root, text='mil', command=lambda: show_news(news_title3, news_url3, news_date2, news_time2, news_author2))
btn3.place(relx=0.155, rely=0.1, relwidth=0.05, relheight=0.05)
btn4 = Button(root, text='财经', command=lambda: show_news(news_title4, news_url4, news_date3, news_time3, news_author3))
btn4.place(relx=0.23, rely=0.1, relwidth=0.05, relheight=0.05)
btn5 = Button(root, text='娱乐', command=lambda: show_news(news_title5, news_url5, news_date4, news_time4, news_author4))
btn5.place(relx=0.305, rely=0.1, relwidth=0.05, relheight=0.05)
btn6 = Button(root, text='体育', command=lambda: show_news(news_title6, news_url6, news_date5, news_time5, news_author5))
btn6.place(relx=0.38, rely=0.1, relwidth=0.05, relheight=0.05)
btn7 = Button(root, text='科技', command=lambda: show_news(news_title7, news_url7, news_date6, news_time6, news_author6))
btn7.place(relx=0.455, rely=0.1, relwidth=0.05, relheight=0.05)
btn8 = Button(root, text='游戏', command=lambda: show_news(news_title8, news_url8, news_date7, news_time7, news_author7))
btn8.place(relx=0.53, rely=0.1, relwidth=0.05, relheight=0.05)

txt2 = Text(root)  # box for the homepage hot-search words
txt2.insert(END, '新闻热搜词\t链接\n')
for x in range(len(hot_title)):
    txt2.insert(END, hot_title[x])
    txt2.insert(END, '\t')
    txt2.insert(END, hot_url[x])
    txt2.insert(END, '\n')
txt2.place(rely=0.8, relwidth=0.4, relheight=0.2)

txt3 = Text(root)  # box for the sports hot-search words
txt3.insert(END, '体育热搜词\t链接\n')
for x in range(len(hot_title2)):
    txt3.insert(END, hot_title2[x])
    txt3.insert(END, '\t')
    txt3.insert(END, hot_url2[x])
    txt3.insert(END, '\n')
txt3.place(relx=0.6, rely=0.8, relwidth=0.4, relheight=0.2)

txt.place(rely=0.2, relwidth=1, relheight=0.6)

today = datetime.date.today()
yesterday = today - datetime.timedelta(days=1)
s = str(today)[5:]       # today as 'MM-DD'
s2 = str(yesterday)[5:]  # yesterday as 'MM-DD'
ind = np.arange(7)
l1 = [li[0], li2[0], li3[0], li4[0], li5[0], li6[0], li7[0]]  # today's percentages
l2 = [li[1], li2[1], li3[1], li4[1], li5[1], li6[1], li7[1]]  # yesterday's percentages
l3 = [li[2], li2[2], li3[2], li4[2], li5[2], li6[2], li7[2]]  # other-time percentages
ax = plt.subplot()
rects1 = ax.bar(ind, l1, 0.3, color='SkyBlue', label=s)            # today
rects2 = ax.bar(ind + 0.3, l2, 0.3, color='IndianRed', label=s2)   # yesterday
rects3 = ax.bar(ind + 0.6, l3, 0.3, color='black', label='-1')     # other
ax.set_ylabel('Percent')
ax.set_title('Percentage of news date distribution')
plt.xticks(ind + 0.3, ('Civil', 'Mil', 'Fin', 'Ent', 'Sport', 'Tech', 'Game'))  # x-axis labels
ax.legend()
plt.show()

root.mainloop()
hp.py (homepage news):
import pandas as pd
import requests
from bs4 import BeautifulSoup

html = "https://news.baidu.com/"  # homepage news
resp = requests.get(html)
resp.encoding = 'utf-8'
content = resp.text
bs = BeautifulSoup(content, 'html.parser')
filename = '首页新闻.csv'
news_title = []  # titles
news_url = []    # links
hot_title = []   # hot-search words
hot_url = []     # hot-search links

for news in bs.select('li'):  # collect titles and links
    if len(news.select('a[target="_blank"]')) > 0:  # match on the tag attribute ([])
        title = news.select('a[target="_blank"]')[0].text
        url = news.select('a[target="_blank"]')[0]['href']
        news_title.append(title)
        news_url.append(url)

dataframe = pd.DataFrame({'首页新闻标题': news_title, '新闻链接': news_url})
dataframe.to_csv(filename, sep=',', encoding='utf-8-sig')  # export to CSV

for hotwords in bs.select('li'):  # collect the hot-search words
    if len(hotwords.select('a.hotwords_li_a')) > 0:
        hot_title.append(hotwords.select('a.hotwords_li_a')[0].text)
        hot_url.append(hotwords.select('a.hotwords_li_a')[0]['href'])
inte.py (domestic news):
import requests
import datetime
import pandas as pd
from bs4 import BeautifulSoup

html = "https://news.baidu.com/guonei"  # domestic (inte) news
resp = requests.get(html)
resp.encoding = 'utf-8'
content = resp.text
bs = BeautifulSoup(content, 'html.parser')
filename = 'inte新闻.csv'
news_title2 = []
news_url2 = []
news_date = []    # edit dates
news_time = []    # edit times
news_author = []  # editors

for news in bs.select('li'):  # collect titles and links
    if len(news.select('a[target="_blank"]')) > 0:  # match on the tag attribute ([])
        title = news.select('a[target="_blank"]')[0].text
        url = news.select('a[target="_blank"]')[0]['href']
        news_title2.append(title)
        news_url2.append(url)

for html2 in news_url2:  # follow each link to collect the editor info
    resp = requests.get(html2)
    resp.encoding = 'utf-8'
    content = resp.text
    bs = BeautifulSoup(content, 'html.parser')
    flag = 1  # marks article pages that store the editor info in different tags
    if len(bs.select('div.author-txt')) == 0:
        flag = 0
    for news in bs.select('div.author-txt'):  # most pages keep date/time/editor in this block
        author = ''
        if len(news.select('p.author-name')) > 0:
            author = news.select('p.author-name')[0].text
        date = news.select('span.date')[0].text
        time = news.select('span.time')[0].text
        news_date.append(date[5:])
        news_time.append(time)
        news_author.append(author)
    if flag == 0:
        news_date.append('-1')  # -1: the page stores the info in different tags, so it was not found
        news_time.append('-1')
        news_author.append('-1')

dataframe = pd.DataFrame({'inte新闻标题': news_title2, '新闻链接': news_url2, '编辑日期': news_date, '编辑时间': news_time, '编辑作者': news_author})
dataframe.to_csv(filename, sep=',', encoding='utf-8-sig')  # export to CSV

today = datetime.date.today()  # compute the three date-bucket percentages
yesterday = today - datetime.timedelta(days=1)
s = str(today)[5:]
s2 = str(yesterday)[5:]
dx = {s: 0, s2: 0, '-1': 0}  # count per date; '-1' stands for unknown/other dates
for i in news_date:
    if i in dx:
        dx[i] = dx[i] + 1
    else:
        dx['-1'] = dx['-1'] + 1
for x in dx:
    dx[x] = dx[x] / len(news_date)
li = list(dx.values())
mil.py (military news):
import requests
import datetime
import pandas as pd
from bs4 import BeautifulSoup

html = "https://news.baidu.com/mil"  # military news
resp = requests.get(html)
resp.encoding = 'utf-8'
content = resp.text
bs = BeautifulSoup(content, 'html.parser')
filename = '军事新闻.csv'
news_title3 = []
news_url3 = []
news_date2 = []
news_time2 = []
news_author2 = []

for news in bs.select('li'):  # collect titles and links
    if len(news.select('a[target="_blank"]')) > 0:  # match on the tag attribute ([])
        title = news.select('a[target="_blank"]')[0].text
        url = news.select('a[target="_blank"]')[0]['href']
        news_title3.append(title)
        news_url3.append(url)

for html2 in news_url3:  # follow each link to collect the editor info
    resp = requests.get(html2)
    resp.encoding = 'utf-8'
    content = resp.text
    bs = BeautifulSoup(content, 'html.parser')
    flag = 1  # marks article pages that store the editor info in different tags
    if len(bs.select('div.author-txt')) == 0:
        flag = 0
    for news in bs.select('div.author-txt'):  # most pages keep date/time/editor in this block
        author = ''
        if len(news.select('p.author-name')) > 0:
            author = news.select('p.author-name')[0].text
        date = news.select('span.date')[0].text
        time = news.select('span.time')[0].text
        news_date2.append(date[5:])
        news_time2.append(time)
        news_author2.append(author)
    if flag == 0:
        news_date2.append('-1')  # -1: the page stores the info in different tags, so it was not found
        news_time2.append('-1')
        news_author2.append('-1')

dataframe = pd.DataFrame({'军事新闻标题': news_title3, '新闻链接': news_url3, '编辑日期': news_date2, '编辑时间': news_time2, '编辑作者': news_author2})
dataframe.to_csv(filename, sep=',', encoding='utf-8-sig')  # export to CSV

today = datetime.date.today()  # compute the three date-bucket percentages
yesterday = today - datetime.timedelta(days=1)
s = str(today)[5:]
s2 = str(yesterday)[5:]
dx = {s: 0, s2: 0, '-1': 0}
for i in news_date2:
    if i in dx:
        dx[i] = dx[i] + 1
    else:
        dx['-1'] = dx['-1'] + 1
for x in dx:
    dx[x] = dx[x] / len(news_date2)
li2 = list(dx.values())
finance.py (finance news):
import requests
import datetime
import pandas as pd
from bs4 import BeautifulSoup

html = "https://news.baidu.com/finance"  # finance news
resp = requests.get(html)
resp.encoding = 'utf-8'
content = resp.text
bs = BeautifulSoup(content, 'html.parser')
filename = '财经新闻.csv'
news_title4 = []
news_url4 = []
news_date3 = []
news_time3 = []
news_author3 = []

for news in bs.select('li'):  # collect titles and links
    if len(news.select('a[target="_blank"]')) > 0:  # match on the tag attribute ([])
        title = news.select('a[target="_blank"]')[0].text
        url = news.select('a[target="_blank"]')[0]['href']
        news_title4.append(title)
        news_url4.append(url)

for html2 in news_url4:  # follow each link to collect the editor info
    resp = requests.get(html2)
    resp.encoding = 'utf-8'
    content = resp.text
    bs = BeautifulSoup(content, 'html.parser')
    flag = 1  # marks article pages that store the editor info in different tags
    if len(bs.select('div.author-txt')) == 0:
        flag = 0
    for news in bs.select('div.author-txt'):  # most pages keep date/time/editor in this block
        author = ''
        if len(news.select('p.author-name')) > 0:
            author = news.select('p.author-name')[0].text
        date = news.select('span.date')[0].text
        time = news.select('span.time')[0].text
        news_date3.append(date[5:])
        news_time3.append(time)
        news_author3.append(author)
    if flag == 0:
        news_date3.append('-1')  # -1: the page stores the info in different tags, so it was not found
        news_time3.append('-1')
        news_author3.append('-1')

dataframe = pd.DataFrame({'财经新闻标题': news_title4, '新闻链接': news_url4, '编辑日期': news_date3, '编辑时间': news_time3, '编辑作者': news_author3})
dataframe.to_csv(filename, sep=',', encoding='utf-8-sig')  # export to CSV

today = datetime.date.today()  # compute the three date-bucket percentages
yesterday = today - datetime.timedelta(days=1)
s = str(today)[5:]
s2 = str(yesterday)[5:]
dx = {s: 0, s2: 0, '-1': 0}
for i in news_date3:
    if i in dx:
        dx[i] = dx[i] + 1
    else:
        dx['-1'] = dx['-1'] + 1
for x in dx:
    dx[x] = dx[x] / len(news_date3)
li3 = list(dx.values())
ent.py (entertainment news):
import requests
import datetime
import pandas as pd
from bs4 import BeautifulSoup

html = "https://news.baidu.com/ent"  # entertainment news
resp = requests.get(html)
resp.encoding = 'utf-8'
content = resp.text
bs = BeautifulSoup(content, 'html.parser')
filename = '娱乐新闻.csv'
news_title5 = []
news_url5 = []
news_date4 = []
news_time4 = []
news_author4 = []

for news in bs.select('li'):  # collect titles and links
    if len(news.select('a[target="_blank"]')) > 0:  # match on the tag attribute ([])
        title = news.select('a[target="_blank"]')[0].text
        url = news.select('a[target="_blank"]')[0]['href']
        news_title5.append(title)
        news_url5.append(url)

for html2 in news_url5:  # follow each link to collect the editor info
    resp = requests.get(html2)
    resp.encoding = 'utf-8'
    content = resp.text
    bs = BeautifulSoup(content, 'html.parser')
    flag = 1  # marks article pages that store the editor info in different tags
    if len(bs.select('div.author-txt')) == 0:
        flag = 0
    for news in bs.select('div.author-txt'):  # most pages keep date/time/editor in this block
        author = ''
        if len(news.select('p.author-name')) > 0:
            author = news.select('p.author-name')[0].text
        date = news.select('span.date')[0].text
        time = news.select('span.time')[0].text
        news_date4.append(date[5:])
        news_time4.append(time)
        news_author4.append(author)
    if flag == 0:
        news_date4.append('-1')  # -1: the page stores the info in different tags, so it was not found
        news_time4.append('-1')
        news_author4.append('-1')

dataframe = pd.DataFrame({'娱乐新闻标题': news_title5, '新闻链接': news_url5, '编辑日期': news_date4, '编辑时间': news_time4, '编辑作者': news_author4})
dataframe.to_csv(filename, sep=',', encoding='utf-8-sig')  # export to CSV

today = datetime.date.today()  # compute the three date-bucket percentages
yesterday = today - datetime.timedelta(days=1)
s = str(today)[5:]
s2 = str(yesterday)[5:]
dx = {s: 0, s2: 0, '-1': 0}
for i in news_date4:
    if i in dx:
        dx[i] = dx[i] + 1
    else:
        dx['-1'] = dx['-1'] + 1
for x in dx:
    dx[x] = dx[x] / len(news_date4)
li4 = list(dx.values())
sports.py (sports news, which also carries the hot-search words):
import requests
import datetime
import pandas as pd
from bs4 import BeautifulSoup

html = "https://news.baidu.com/sports"  # sports news
resp = requests.get(html)
resp.encoding = 'utf-8'
content = resp.text
bs = BeautifulSoup(content, 'html.parser')
filename = '体育新闻.csv'
news_title6 = []
news_url6 = []
news_date5 = []
news_time5 = []
news_author5 = []
hot_title2 = []  # hot-search words on the sports page
hot_url2 = []

for news in bs.select('li'):  # collect titles and links
    if len(news.select('a[target="_blank"]')) > 0:  # match on the tag attribute ([])
        title = news.select('a[target="_blank"]')[0].text
        url = news.select('a[target="_blank"]')[0]['href']
        news_title6.append(title)
        news_url6.append(url)

for hotwords in bs.select('li'):  # pull the hot-search words out of the news lists
    if len(hotwords.select('a[title]')) > 0:
        news_title6.remove(hotwords.select('a[title]')[0].text)  # drop the hot-search entries
        news_url6.remove(hotwords.select('a[title]')[0]['href'])
        hot_title2.append(hotwords.select('a[title]')[0].text)
        hot_url2.append(hotwords.select('a[title]')[0]['href'])

for hotwords in bs.select('li'):  # drop the match-schedule links as well
    if len(hotwords.select('a[mon="col=schedule"]')) > 0:
        news_title6.remove(hotwords.select('a[mon="col=schedule"]')[0].text)
        news_url6.remove(hotwords.select('a[mon="col=schedule"]')[0]['href'])

for html2 in news_url6:  # follow each link to collect the editor info
    resp = requests.get(html2)
    resp.encoding = 'utf-8'
    content = resp.text
    bs = BeautifulSoup(content, 'html.parser')
    flag = 1  # marks article pages that store the editor info in different tags
    if len(bs.select('div.author-txt')) == 0:
        flag = 0
    for news in bs.select('div.author-txt'):  # most pages keep date/time/editor in this block
        author = ''
        if len(news.select('p.author-name')) > 0:
            author = news.select('p.author-name')[0].text
        date = news.select('span.date')[0].text
        time = news.select('span.time')[0].text
        news_date5.append(date[5:])
        news_time5.append(time)
        news_author5.append(author)
    if flag == 0:
        news_date5.append('-1')  # -1: the page stores the info in different tags, so it was not found
        news_time5.append('-1')
        news_author5.append('-1')

dataframe = pd.DataFrame({'体育新闻标题': news_title6, '新闻链接': news_url6, '编辑日期': news_date5, '编辑时间': news_time5, '编辑作者': news_author5})
dataframe.to_csv(filename, sep=',', encoding='utf-8-sig')  # export to CSV

today = datetime.date.today()  # compute the three date-bucket percentages
yesterday = today - datetime.timedelta(days=1)
s = str(today)[5:]
s2 = str(yesterday)[5:]
dx = {s: 0, s2: 0, '-1': 0}
for i in news_date5:
    if i in dx:
        dx[i] = dx[i] + 1
    else:
        dx['-1'] = dx['-1'] + 1
for x in dx:
    dx[x] = dx[x] / len(news_date5)
li5 = list(dx.values())
tech.py (technology news):
import requests
import datetime
import pandas as pd
from bs4 import BeautifulSoup

html = "https://news.baidu.com/tech"  # technology news
resp = requests.get(html)
resp.encoding = 'utf-8'
content = resp.text
bs = BeautifulSoup(content, 'html.parser')
filename = '科技新闻.csv'
news_title7 = []
news_url7 = []
news_date6 = []
news_time6 = []
news_author6 = []

for news in bs.select('li'):  # collect titles and links
    if len(news.select('a[target="_blank"]')) > 0:  # match on the tag attribute ([])
        title = news.select('a[target="_blank"]')[0].text
        url = news.select('a[target="_blank"]')[0]['href']
        news_title7.append(title)
        news_url7.append(url)

for html2 in news_url7:  # follow each link to collect the editor info
    resp = requests.get(html2)
    resp.encoding = 'utf-8'
    content = resp.text
    bs = BeautifulSoup(content, 'html.parser')
    flag = 1  # marks article pages that store the editor info in different tags
    if len(bs.select('div.author-txt')) == 0:
        flag = 0
    for news in bs.select('div.author-txt'):  # most pages keep date/time/editor in this block
        author = ''
        if len(news.select('p.author-name')) > 0:
            author = news.select('p.author-name')[0].text
        date = news.select('span.date')[0].text
        time = news.select('span.time')[0].text
        news_date6.append(date[5:])
        news_time6.append(time)
        news_author6.append(author)
    if flag == 0:
        news_date6.append('-1')  # -1: the page stores the info in different tags, so it was not found
        news_time6.append('-1')
        news_author6.append('-1')

dataframe = pd.DataFrame({'科技新闻标题': news_title7, '新闻链接': news_url7, '编辑日期': news_date6, '编辑时间': news_time6, '编辑作者': news_author6})
dataframe.to_csv(filename, sep=',', encoding='utf-8-sig')  # export to CSV

today = datetime.date.today()  # compute the three date-bucket percentages
yesterday = today - datetime.timedelta(days=1)
s = str(today)[5:]
s2 = str(yesterday)[5:]
dx = {s: 0, s2: 0, '-1': 0}
for i in news_date6:
    if i in dx:
        dx[i] = dx[i] + 1
    else:
        dx['-1'] = dx['-1'] + 1
for x in dx:
    dx[x] = dx[x] / len(news_date6)
li6 = list(dx.values())
game.py (game news):
import requests
import datetime
import pandas as pd
from bs4 import BeautifulSoup

html = "https://news.baidu.com/game"  # game news
resp = requests.get(html)
resp.encoding = 'utf-8'
content = resp.text
bs = BeautifulSoup(content, 'html.parser')
filename = '游戏新闻.csv'
news_title8 = []
news_url8 = []
news_date7 = []
news_time7 = []
news_author7 = []

for news in bs.select('h1'):  # the headline story sits in an h1 tag
    if len(news.select('a[target="_blank"]')) > 0:
        title = news.select('a[target="_blank"]')[0].text
        url = news.select('a[target="_blank"]')[0]['href']
        news_title8.append(title)
        news_url8.append(url)

for news in bs.select('li'):  # collect the remaining titles and links
    if len(news.select('a[target="_blank"]')) > 0:  # match on the tag attribute ([])
        title = news.select('a[target="_blank"]')[0].text
        url = news.select('a[target="_blank"]')[0]['href']
        news_title8.append(title)
        news_url8.append(url)

for html2 in news_url8:  # follow each link to collect the editor info
    resp = requests.get(html2)
    resp.encoding = 'utf-8'
    content = resp.text
    bs = BeautifulSoup(content, 'html.parser')
    flag = 1  # marks article pages that store the editor info in different tags
    if len(bs.select('div.author-txt')) == 0:
        flag = 0
    for news in bs.select('div.author-txt'):  # most pages keep date/time/editor in this block
        author = ''
        if len(news.select('p.author-name')) > 0:
            author = news.select('p.author-name')[0].text
        date = news.select('span.date')[0].text
        time = news.select('span.time')[0].text
        news_date7.append(date[5:])
        news_time7.append(time)
        news_author7.append(author)
    if flag == 0:
        news_date7.append('-1')  # -1: the page stores the info in different tags, so it was not found
        news_time7.append('-1')
        news_author7.append('-1')

dataframe = pd.DataFrame({'游戏新闻标题': news_title8, '新闻链接': news_url8, '编辑日期': news_date7, '编辑时间': news_time7, '编辑作者': news_author7})
dataframe.to_csv(filename, sep=',', encoding='utf-8-sig')  # export to CSV

today = datetime.date.today()  # compute the three date-bucket percentages
yesterday = today - datetime.timedelta(days=1)
s = str(today)[5:]
s2 = str(yesterday)[5:]
dx = {s: 0, s2: 0, '-1': 0}
for i in news_date7:
    if i in dx:
        dx[i] = dx[i] + 1
    else:
        dx['-1'] = dx['-1'] + 1
for x in dx:
    dx[x] = dx[x] / len(news_date7)
li7 = list(dx.values())
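The seven category scripts above differ only in the URL, the output filename, and the variable names. As a design note, here is a minimal sketch of how the shared logic could be factored into one reusable function; scrape_category and its parameters are hypothetical names, not part of the original project, and like the originals it assumes each article page contributes one author-txt block so all five lists stay the same length:

import datetime
import pandas as pd
import requests
from bs4 import BeautifulSoup

def scrape_category(url, csv_name, title_col):
    # Scrape one Baidu News category page.
    # Returns (titles, urls, dates, times, authors, percentages).
    bs = BeautifulSoup(requests.get(url).text, 'html.parser')
    titles, urls = [], []
    for li in bs.select('li'):                       # titles/links, as in the originals
        links = li.select('a[target="_blank"]')
        if links:
            titles.append(links[0].text)
            urls.append(links[0]['href'])
    dates, times, authors = [], [], []
    for article_url in urls:                         # editor info from each article page
        page = BeautifulSoup(requests.get(article_url).text, 'html.parser')
        blocks = page.select('div.author-txt')
        if not blocks:                               # layout differs: record -1
            dates.append('-1'); times.append('-1'); authors.append('-1')
            continue
        for block in blocks:
            name = block.select('p.author-name')
            authors.append(name[0].text if name else '')
            dates.append(block.select('span.date')[0].text[5:])
            times.append(block.select('span.time')[0].text)
    pd.DataFrame({title_col: titles, '新闻链接': urls, '编辑日期': dates,
                  '编辑时间': times, '编辑作者': authors}).to_csv(
        csv_name, sep=',', encoding='utf-8-sig')
    today = str(datetime.date.today())[5:]
    yesterday = str(datetime.date.today() - datetime.timedelta(days=1))[5:]
    dx = {today: 0, yesterday: 0, '-1': 0}           # today / yesterday / other buckets
    for d in dates:
        dx[d if d in dx else '-1'] += 1
    percentages = [v / len(dates) for v in dx.values()]
    return titles, urls, dates, times, authors, percentages

Each module then reduces to a single call, for example:

news_title2, news_url2, news_date, news_time, news_author, li = scrape_category(
    'https://news.baidu.com/guonei', 'inte新闻.csv', 'inte新闻标题')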
If you would like the full source code, join QQ group 1136192749.