
Scraping Baidu News headlines with Python, plus some simple data analysis (it's pretty easy)

Required libraries

The Python version I used: Python 3.7.4

  1. Fetching the news: beautifulsoup4, requests, re;
  2. Storing the results (the scraped information is written to CSV files): pandas;
  3. Data analysis: numpy, matplotlib;
  4. The interface: tkinter.
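
Everything except tkinter (which ships with CPython) is third-party; assuming pip is on your PATH, they can all be installed in one line:

pip install beautifulsoup4 requests pandas numpy matplotlib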

Some familiarity with HTML tags is required

You can pick up the basics at w3cschool.
Open the Baidu News site and press F12 to open the developer tools (or right-click and choose View Source) to see the page's HTML.

Design approach

1.For each news page, fetch the page source with requests, then extract the information from it with bs4 (BeautifulSoup);
2.Titles and links are extracted the same way for every category. Inspecting the source shows that each headline sits in an a tag inside an li tag, and that every such a tag carries a target="_blank" attribute, so both the title and the link can be pulled out with BeautifulSoup's select();
(Screenshot omitted: the HTML pattern in which the titles and links are stored; a runnable sketch of it follows this list.)
3.The homepage carries a list of hot-search news terms, and the sports page carries its own hot-search terms; their titles and links are stored in the same pattern as above;
4.Apart from the homepage, the article pages linked from each category share almost the same layout. Having collected each category's article links, each link is fetched again as in step 1 to extract the editor information (editor name, edit date, edit time). The page source shows this information sits in a div of class author-txt (div class="author-txt"): the editor's name in a p tag of class author-name, and the edit date and time in span tags of class date and class time. A small number of article pages use a different layout and don't match this pattern; for those, -1 is stored in the information lists instead;
5.Each kind of information is kept in its own list;
6.The lists are written to a CSV file with pandas;
7.For the per-category visualization, datetime supplies today's and yesterday's dates; a dict plus some simple arithmetic then computes, for each category, the percentage of articles edited today, yesterday, and at any other time, and matplotlib draws the result as a grouped bar chart;
8.tkinter provides the interface: one button per category displays that category's news, and the homepage and sports hot-search terms sit in the bottom-left and bottom-right corners.
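
As a standalone illustration of steps 1 and 2, here is a minimal sketch. The HTML fragment is made up for the example; the real pages are fetched with requests exactly as in the modules below:

from bs4 import BeautifulSoup

# A made-up fragment shaped like the pattern described above.
fragment = '''
<ul>
  <li><a target="_blank" href="https://example.com/a1">Headline one</a></li>
  <li><a href="#">a navigation link without target</a></li>
  <li><a target="_blank" href="https://example.com/a2">Headline two</a></li>
</ul>
'''
bs = BeautifulSoup(fragment, 'html.parser')
news_title, news_url = [], []
for li in bs.select('li'):
    links = li.select('a[target="_blank"]')  # CSS attribute selector
    if len(links) > 0:
        news_title.append(links[0].text)
        news_url.append(links[0]['href'])
print(news_title)  # ['Headline one', 'Headline two']
print(news_url)    # ['https://example.com/a1', 'https://example.com/a2']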

Results

Click a button to display each category's news information (screenshot omitted):


Analysis of the publication-date distribution (bar-chart screenshot omitted):

Source code

main.py (the main program):

from tkinter import *
import datetime
import numpy as np
import matplotlib.pyplot as plt
from hp import news_title,news_url,hot_title,hot_url
from inte import news_title2,news_url2,news_date,news_time,news_author,li
from mil import news_title3,news_url3,news_date2,news_time2,news_author2,li2
from finance import news_title4,news_url4,news_date3,news_time3,news_author3,li3
from ent import news_title5,news_url5,news_date4,news_time4,news_author4,li4
from sports import news_title6,news_url6,news_date5,news_time5,news_author5,hot_title2,hot_url2,li5
from tech import news_title7,news_url7,news_date6,news_time6,news_author6,li6
from game import news_title8,news_url8,news_date7,news_time7,news_author7,li7
def hp_print():#show the homepage news
    txt.delete('1.0','end')#clear the Text box
    txt.insert(END,'首页新闻标题\t新闻链接\n')
    for x in range(len(news_title)):
        txt.insert(END,news_title[x])
        txt.insert(END,'\t')
        txt.insert(END,news_url[x])
        txt.insert(END,'\n')
def show_news(title,url,date,time,author):#show any category other than the homepage; renamed so it does not shadow the built-in print
    txt.delete('1.0','end')#clear the Text box
    txt.insert(END,'新闻标题\t新闻链接\t编辑日期\t编辑时间\t编辑作者\n')
    for x in range(len(title)):
        txt.insert(END,title[x])
        txt.insert(END,'\t')
        txt.insert(END,url[x])
        txt.insert(END,'\t')
        txt.insert(END,date[x])
        txt.insert(END,'\t')
        txt.insert(END,time[x])
        txt.insert(END,'\t')
        txt.insert(END,author[x])
        txt.insert(END,'\n')        
root=Tk()
root.title('百度新闻-我知道!')#window title
root.geometry('1024x560')
lb=Label(root,text='点击按钮,获得各类新闻中新闻信息(-1表示不清楚)')
lb.place(relx=0.1,rely=0.01,relwidth=0.8,relheight=0.08)
txt = Text(root)#output box for the news listings
btn1=Button(root,text='首页',command=hp_print)
btn1.place(relx=0.005, rely=0.1, relwidth=0.05, relheight=0.05)
btn2=Button(root,text='int',command=lambda:show_news(news_title2,news_url2,news_date,news_time,news_author))
btn2.place(relx=0.08, rely=0.1, relwidth=0.05, relheight=0.05)
btn3=Button(root,text='mil',command=lambda:show_news(news_title3,news_url3,news_date2,news_time2,news_author2))
btn3.place(relx=0.155, rely=0.1, relwidth=0.05, relheight=0.05)
btn4=Button(root,text='财经',command=lambda:show_news(news_title4,news_url4,news_date3,news_time3,news_author3))
btn4.place(relx=0.23, rely=0.1, relwidth=0.05, relheight=0.05)
btn5=Button(root,text='娱乐',command=lambda:show_news(news_title5,news_url5,news_date4,news_time4,news_author4))
btn5.place(relx=0.305, rely=0.1, relwidth=0.05, relheight=0.05)
btn6=Button(root,text='体育',command=lambda:show_news(news_title6,news_url6,news_date5,news_time5,news_author5))
btn6.place(relx=0.38, rely=0.1, relwidth=0.05, relheight=0.05)
btn7=Button(root,text='科技',command=lambda:show_news(news_title7,news_url7,news_date6,news_time6,news_author6))
btn7.place(relx=0.455, rely=0.1, relwidth=0.05, relheight=0.05)
btn8=Button(root,text='游戏',command=lambda:show_news(news_title8,news_url8,news_date7,news_time7,news_author7))
btn8.place(relx=0.53, rely=0.1, relwidth=0.05, relheight=0.05)
txt2=Text(root)#box for the homepage hot-search terms
txt2.insert(END,'新闻热搜词\t链接\n')
for x in range(len(hot_title)):
    txt2.insert(END,hot_title[x])
    txt2.insert(END,'\t')
    txt2.insert(END,hot_url[x])
    txt2.insert(END,'\n')
txt2.place(rely=0.8, relwidth=0.4, relheight=0.2)
txt3=Text(root)#box for the sports hot-search terms
txt3.insert(END,'体育热搜词\t链接\n')
for x in range(len(hot_title2)):
    txt3.insert(END,hot_title2[x])
    txt3.insert(END,'\t')
    txt3.insert(END,hot_url2[x])
    txt3.insert(END,'\n')   
txt3.place(relx=0.6,rely=0.8, relwidth=0.4, relheight=0.2)
txt.place(rely=0.2, relwidth=1, relheight=0.6)
today=datetime.date.today()
yesterday=today - datetime.timedelta(days=1)
s=str(today)[5:]#today as MM-DD
s2=str(yesterday)[5:]#yesterday as MM-DD
ind=np.arange(7)
l1=[li[0],li2[0],li3[0],li4[0],li5[0],li6[0],li7[0]]#percentages for today
l2=[li[1],li2[1],li3[1],li4[1],li5[1],li6[1],li7[1]]#percentages for yesterday
l3=[li[2],li2[2],li3[2],li4[2],li5[2],li6[2],li7[2]]#percentages for other dates
ax=plt.subplot()
rects1=ax.bar(ind,l1,0.3,color='SkyBlue',label=s)#today
rects2=ax.bar(ind+0.3,l2,0.3,color='IndianRed',label=s2)#yesterday
rects3=ax.bar(ind+0.6,l3,0.3,color='black',label='-1')#other dates
ax.set_ylabel('Percent')
ax.set_title('Percentage of news date distribution')
plt.xticks(ind+0.3,('Civil','Mil','Fin','Ent','Sport','Tech','Game'))#x-axis category labels
ax.legend()
plt.show()
root.mainloop()

hp.py (homepage news):

import pandas as pd
import requests
import re
from bs4 import BeautifulSoup
html="https://news.baidu.com/"#首页新闻
resp=requests.get(html)
resp.encoding='utf-8'
content=resp.text
bs=BeautifulSoup(content,'html.parser')
filename='首页新闻.csv'
news_title=[]#titles
news_url=[]#links
hot_title=[]#hot-search terms
hot_url=[]#hot-search links
for news in bs.select('li'):#collect titles and links
    if len(news.select('a[target="_blank"]'))>0:#match on the a tag's target attribute
        title=news.select('a[target="_blank"]')[0].text
        url=news.select('a[target="_blank"]')[0]['href']
        news_title.append(title)
        news_url.append(url)
dataframe=pd.DataFrame({'首页新闻标题':news_title,'新闻链接':news_url})
dataframe.to_csv(filename,sep=',',encoding='utf-8-sig')#export to CSV
for hotwords in bs.select('li'):#collect the hot-search terms
    if(len(hotwords.select('a.hotwords_li_a'))>0):
        hot_title.append(hotwords.select('a.hotwords_li_a')[0].text)
        hot_url.append(hotwords.select('a.hotwords_li_a')[0]['href'])

inte.py (domestic news):

import re
import requests
import datetime
import pandas as pd
from bs4 import BeautifulSoup
html="https://news.baidu.com/guonei"##inte新闻
resp=requests.get(html)
resp.encoding='utf-8'
content=resp.text
bs=BeautifulSoup(content,'html.parser')
filename='inte新闻.csv'
news_title2=[]
news_url2=[]
news_date=[]#edit dates
news_time=[]#edit times
news_author=[]#editors
for news in bs.select('li'):#collect titles and links
    if len(news.select('a[target="_blank"]'))>0:#match on the a tag's target attribute
        title=news.select('a[target="_blank"]')[0].text
        url=news.select('a[target="_blank"]')[0]['href']
        news_title2.append(title)
        news_url2.append(url)
for html2 in news_url2:#visit each article page for its editor info
    resp=requests.get(html2)
    resp.encoding='utf-8'
    content=resp.text
    bs=BeautifulSoup(content,'html.parser')
    blocks=bs.select('div.author-txt')#most article pages keep date/time/editor in this div
    if len(blocks)==0:#the page uses a different layout, so record -1 for every field
        news_date.append('-1')
        news_time.append('-1')
        news_author.append('-1')
    for news in blocks:
        author=date=time='-1'#fall back to -1 when the expected tags are missing
        if len(news.select('p.author-name'))>0:
            author=news.select('p.author-name')[0].text
            date=news.select('span.date')[0].text[5:]#drop the leading year portion
            time=news.select('span.time')[0].text
        news_date.append(date)
        news_time.append(time)
        news_author.append(author)
dataframe=pd.DataFrame({'inte新闻标题':news_title2,'新闻链接':news_url2,'编辑日期':news_date,'编辑时间':news_time,'编辑作者':news_author})
dataframe.to_csv(filename,sep=',',encoding='utf-8-sig')#export to CSV
today=datetime.date.today()#compute the three date-bucket percentages
yesterday=today - datetime.timedelta(days=1)
s=str(today)[5:]
s2=str(yesterday)[5:]
dx={s:0,s2:0,'-1':0}#dict counts articles per date; '-1' stands for unknown/other dates
dx_keys=dx.keys()
for i in news_date:
    if i in dx_keys:
        dx[i]=dx[i]+1
    else:dx['-1']=dx['-1']+1
for x in dx:
    dx[x]=dx[x]/len(news_date)
li=list(dx.values())
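
As the remaining modules show, every category repeats the same fetch/extract/count pipeline, with only the URL, CSV name, and variable names changing. Here is a hedged sketch of how the whole pipeline could be collapsed into one reusable helper; scrape_category is a made-up name, not part of the original program:

import datetime
import pandas as pd
import requests
from bs4 import BeautifulSoup

def scrape_category(url, csv_name, title_col):  # hypothetical helper, not in the original code
    # Steps 1-2: fetch the category page and collect titles/links.
    resp = requests.get(url)
    resp.encoding = 'utf-8'
    bs = BeautifulSoup(resp.text, 'html.parser')
    titles, urls = [], []
    for li in bs.select('li'):
        links = li.select('a[target="_blank"]')
        if len(links) > 0:
            titles.append(links[0].text)
            urls.append(links[0]['href'])
    # Step 4: visit each article page for the editor info.
    dates, times, authors = [], [], []
    for article_url in urls:
        page = requests.get(article_url)
        page.encoding = 'utf-8'
        soup = BeautifulSoup(page.text, 'html.parser')
        blocks = soup.select('div.author-txt')
        if len(blocks) == 0:  # unfamiliar layout: record -1 for every field
            dates.append('-1'); times.append('-1'); authors.append('-1')
        for block in blocks:
            author = date = time = '-1'
            if len(block.select('p.author-name')) > 0:
                author = block.select('p.author-name')[0].text
                date = block.select('span.date')[0].text[5:]
                time = block.select('span.time')[0].text
            dates.append(date); times.append(time); authors.append(author)
    # Step 6: persist to CSV.
    pd.DataFrame({title_col: titles, '新闻链接': urls, '编辑日期': dates,
                  '编辑时间': times, '编辑作者': authors}).to_csv(
                      csv_name, sep=',', encoding='utf-8-sig')
    # Step 7: share of articles edited today / yesterday / any other time.
    today = str(datetime.date.today())[5:]
    yesterday = str(datetime.date.today() - datetime.timedelta(days=1))[5:]
    dx = {today: 0, yesterday: 0, '-1': 0}
    for d in dates:
        dx[d if d in dx else '-1'] += 1
    percentages = [n / len(dates) for n in dx.values()]
    return titles, urls, dates, times, authors, percentages

A module such as inte.py would then reduce to a single call, e.g. news_title2,news_url2,news_date,news_time,news_author,li=scrape_category('https://news.baidu.com/guonei','inte新闻.csv','inte新闻标题').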

mil.py (military news):

import re
import requests
import datetime
import pandas as pd
from bs4 import BeautifulSoup
html="https://news.baidu.com/mil"#新闻
resp=requests.get(html)
resp.encoding='utf-8'
content=resp.text
bs=BeautifulSoup(content,'html.parser')
filename='军事新闻.csv'
news_title3=[]
news_url3=[]
news_date2=[]
news_time2=[]
news_author2=[]
for news in bs.select('li'):#collect titles and links
    if len(news.select('a[target="_blank"]'))>0:#match on the a tag's target attribute
        title=news.select('a[target="_blank"]')[0].text
        url=news.select('a[target="_blank"]')[0]['href']
        news_title3.append(title)
        news_url3.append(url)
for html2 in news_url3:#visit each article page for its editor info
    resp=requests.get(html2)
    resp.encoding='utf-8'
    content=resp.text
    bs=BeautifulSoup(content,'html.parser')
    blocks=bs.select('div.author-txt')#most article pages keep date/time/editor in this div
    if len(blocks)==0:#the page uses a different layout, so record -1 for every field
        news_date2.append('-1')
        news_time2.append('-1')
        news_author2.append('-1')
    for news in blocks:
        author=date=time='-1'#fall back to -1 when the expected tags are missing
        if len(news.select('p.author-name'))>0:
            author=news.select('p.author-name')[0].text
            date=news.select('span.date')[0].text[5:]#drop the leading year portion
            time=news.select('span.time')[0].text
        news_date2.append(date)
        news_time2.append(time)
        news_author2.append(author)
dataframe=pd.DataFrame({'军事新闻标题':news_title3,'新闻链接':news_url3,'编辑日期':news_date2,'编辑时间':news_time2,'编辑作者':news_author2})
dataframe.to_csv(filename,sep=',',encoding='utf-8-sig')#export to CSV
today=datetime.date.today()#compute the three date-bucket percentages
yesterday=today - datetime.timedelta(days=1)
s=str(today)[5:]
s2=str(yesterday)[5:]
dx={s:0,s2:0,'-1':0}
dx_keys=dx.keys()
for i in news_date2:
    if i in dx_keys:
        dx[i]=dx[i]+1
    else:dx['-1']=dx['-1']+1
for x in dx:
    dx[x]=dx[x]/len(news_date2)
li2=list(dx.values())



finance.py (finance news):

import re
import requests
import datetime
import pandas as pd
from bs4 import BeautifulSoup
html="https://news.baidu.com/finance"#财经新闻
resp=requests.get(html)
resp.encoding='utf-8'
content=resp.text
bs=BeautifulSoup(content,'html.parser')
filename='财经新闻.csv'
news_title4=[]
news_url4=[]
news_date3=[]
news_time3=[]
news_author3=[]
for news in bs.select('li'):#collect titles and links
    if len(news.select('a[target="_blank"]'))>0:#match on the a tag's target attribute
        title=news.select('a[target="_blank"]')[0].text
        url=news.select('a[target="_blank"]')[0]['href']
        news_title4.append(title)
        news_url4.append(url)
for html2 in news_url4:#visit each article page for its editor info
    resp=requests.get(html2)
    resp.encoding='utf-8'
    content=resp.text
    bs=BeautifulSoup(content,'html.parser')
    blocks=bs.select('div.author-txt')#most article pages keep date/time/editor in this div
    if len(blocks)==0:#the page uses a different layout, so record -1 for every field
        news_date3.append('-1')
        news_time3.append('-1')
        news_author3.append('-1')
    for news in blocks:
        author=date=time='-1'#fall back to -1 when the expected tags are missing
        if len(news.select('p.author-name'))>0:
            author=news.select('p.author-name')[0].text
            date=news.select('span.date')[0].text[5:]#drop the leading year portion
            time=news.select('span.time')[0].text
        news_date3.append(date)
        news_time3.append(time)
        news_author3.append(author)
dataframe=pd.DataFrame({'财经新闻标题':news_title4,'新闻链接':news_url4,'编辑日期':news_date3,'编辑时间':news_time3,'编辑作者':news_author3})
dataframe.to_csv(filename,sep=',',encoding='utf-8-sig')#export to CSV
today=datetime.date.today()
yesterday=today - datetime.timedelta(days=1)
s=str(today)[5:]
s2=str(yesterday)[5:]
dx={s:0,s2:0,'-1':0}
dx_keys=dx.keys()
for i in news_date3:
    if i in dx_keys:
        dx[i]=dx[i]+1
    else:dx['-1']=dx['-1']+1
for x in dx:
    dx[x]=dx[x]/len(news_date3)
li3=list(dx.values())



ent.py (entertainment news):

import re
import requests
import datetime
import pandas as pd
from bs4 import BeautifulSoup
html="https://news.baidu.com/ent"#娱乐新闻
resp=requests.get(html)
resp.encoding='utf-8'
content=resp.text
bs=BeautifulSoup(content,'html.parser')
filename='娱乐新闻.csv'
news_title5=[]
news_url5=[]
news_date4=[]
news_time4=[]
news_author4=[]
for news in bs.select('li'):
    if len(news.select('a[target="_blank"]'))>0:#match on the a tag's target attribute
        title=news.select('a[target="_blank"]')[0].text
        url=news.select('a[target="_blank"]')[0]['href']
        news_title5.append(title)
        news_url5.append(url)
for html2 in news_url5:#visit each article page for its editor info
    resp=requests.get(html2)
    resp.encoding='utf-8'
    content=resp.text
    bs=BeautifulSoup(content,'html.parser')
    blocks=bs.select('div.author-txt')#most article pages keep date/time/editor in this div
    if len(blocks)==0:#the page uses a different layout, so record -1 for every field
        news_date4.append('-1')
        news_time4.append('-1')
        news_author4.append('-1')
    for news in blocks:
        author=date=time='-1'#fall back to -1 when the expected tags are missing
        if len(news.select('p.author-name'))>0:
            author=news.select('p.author-name')[0].text
            date=news.select('span.date')[0].text[5:]#drop the leading year portion
            time=news.select('span.time')[0].text
        news_date4.append(date)
        news_time4.append(time)
        news_author4.append(author)
dataframe=pd.DataFrame({'娱乐新闻标题':news_title5,'新闻链接':news_url5,'编辑日期':news_date4,'编辑时间':news_time4,'编辑作者':news_author4})
dataframe.to_csv(filename,sep=',',encoding='utf-8-sig')
today=datetime.date.today()
yesterday=today - datetime.timedelta(days=1)
s=str(today)[5:]
s2=str(yesterday)[5:]
dx={s:0,s2:0,'-1':0}
dx_keys=dx.keys()
for i in news_date4:
    if i in dx_keys:
        dx[i]=dx[i]+1
    else:dx['-1']=dx['-1']+1
for x in dx:
    dx[x]=dx[x]/len(news_date4)
li4=list(dx.values())

sports.py (sports news; this page also carries hot-search terms):

import re
import requests
import datetime
import pandas as pd
from bs4 import BeautifulSoup
html="https://news.baidu.com/sports"#体育新闻
resp=requests.get(html)
resp.encoding='utf-8'
content=resp.text
bs=BeautifulSoup(content,'html.parser')
filename='体育新闻.csv'
news_title6=[]
news_url6=[]
news_date5=[]
news_time5=[]
news_author5=[]
hot_title2=[]#hot-search terms on the sports page
hot_url2=[]
for news in bs.select('li'):
    if len(news.select('a[target="_blank"]'))>0:#match on the a tag's target attribute
        title=news.select('a[target="_blank"]')[0].text
        url=news.select('a[target="_blank"]')[0]['href']
        news_title6.append(title)
        news_url6.append(url)
for hotwords in bs.select('li'):
    if(len(hotwords.select('a[title]'))>0):
        news_title6.remove(hotwords.select('a[title]')[0].text)#drop the hot-search entries from the news lists
        news_url6.remove(hotwords.select('a[title]')[0]['href'])
        hot_title2.append(hotwords.select('a[title]')[0].text)
        hot_url2.append(hotwords.select('a[title]')[0]['href'])
for hotwords in bs.select('li'):
    if(len(hotwords.select('a[mon="col=schedule"]'))>0):
        news_title6.remove(hotwords.select('a[mon="col=schedule"]')[0].text)#drop the schedule-table links
        news_url6.remove(hotwords.select('a[mon="col=schedule"]')[0]['href'])
for html2 in news_url6:#visit each article page for its editor info
    resp=requests.get(html2)
    resp.encoding='utf-8'
    content=resp.text
    bs=BeautifulSoup(content,'html.parser')
    blocks=bs.select('div.author-txt')#most article pages keep date/time/editor in this div
    if len(blocks)==0:#the page uses a different layout, so record -1 for every field
        news_date5.append('-1')
        news_time5.append('-1')
        news_author5.append('-1')
    for news in blocks:
        author=date=time='-1'#fall back to -1 when the expected tags are missing
        if len(news.select('p.author-name'))>0:
            author=news.select('p.author-name')[0].text
            date=news.select('span.date')[0].text[5:]#drop the leading year portion
            time=news.select('span.time')[0].text
        news_date5.append(date)
        news_time5.append(time)
        news_author5.append(author)
dataframe=pd.DataFrame({'体育新闻标题':news_title6,'新闻链接':news_url6,'编辑日期':news_date5,'编辑时间':news_time5,'编辑作者':news_author5})
dataframe.to_csv(filename,sep=',',encoding='utf-8-sig')
today=datetime.date.today()
yesterday=today - datetime.timedelta(days=1)
s=str(today)[5:]
s2=str(yesterday)[5:]
dx={s:0,s2:0,'-1':0}
dx_keys=dx.keys()
for i in news_date5:
    if i in dx_keys:
        dx[i]=dx[i]+1
    else:dx['-1']=dx['-1']+1
for x in dx:
    dx[x]=dx[x]/len(news_date5)
li5=list(dx.values())

tech.py (technology news):

import re
import requests
import datetime
import pandas as pd
from bs4 import BeautifulSoup
html="https://news.baidu.com/tech"#科技新闻
resp=requests.get(html)
resp.encoding='utf-8'
content=resp.text
bs=BeautifulSoup(content,'html.parser')
filename='科技新闻.csv'
news_title7=[]
news_url7=[]
news_date6=[]
news_time6=[]
news_author6=[]
for news in bs.select('li'):
    if len(news.select('a[target="_blank"]'))>0:#match on the a tag's target attribute
        title=news.select('a[target="_blank"]')[0].text
        url=news.select('a[target="_blank"]')[0]['href']
        news_title7.append(title)
        news_url7.append(url)
for html2 in news_url7:#visit each article page for its editor info
    resp=requests.get(html2)
    resp.encoding='utf-8'
    content=resp.text
    bs=BeautifulSoup(content,'html.parser')
    blocks=bs.select('div.author-txt')#most article pages keep date/time/editor in this div
    if len(blocks)==0:#the page uses a different layout, so record -1 for every field
        news_date6.append('-1')
        news_time6.append('-1')
        news_author6.append('-1')
    for news in blocks:
        author=date=time='-1'#fall back to -1 when the expected tags are missing
        if len(news.select('p.author-name'))>0:
            author=news.select('p.author-name')[0].text
            date=news.select('span.date')[0].text[5:]#drop the leading year portion
            time=news.select('span.time')[0].text
        news_date6.append(date)
        news_time6.append(time)
        news_author6.append(author)
dataframe=pd.DataFrame({'科技新闻标题':news_title7,'新闻链接':news_url7,'编辑日期':news_date6,'编辑时间':news_time6,'编辑作者':news_author6})
dataframe.to_csv(filename,sep=',',encoding='utf-8-sig')
today=datetime.date.today()
yesterday=today - datetime.timedelta(days=1)
s=str(today)[5:]
s2=str(yesterday)[5:]
dx={s:0,s2:0,'-1':0}
dx_keys=dx.keys()
for i in news_date6:
    if i in dx_keys:
        dx[i]=dx[i]+1
    else:dx['-1']=dx['-1']+1
for x in dx:
    dx[x]=dx[x]/len(news_date6)
li6=list(dx.values())

game.py (game news):

import re
import requests
import datetime
import pandas as pd
from bs4 import BeautifulSoup
html="https://news.baidu.com/game"#游戏新闻
resp=requests.get(html)
resp.encoding='utf-8'
content=resp.text
bs=BeautifulSoup(content,'html.parser')
filename='游戏新闻.csv'
news_title8=[]
news_url8=[]
news_date7=[]
news_time7=[]
news_author7=[]
for news in bs.select('h1'):#the top headline on this page sits in an h1 tag
    title=news.select('a[target="_blank"]')[0].text
    url=news.select('a[target="_blank"]')[0]['href']
    news_title8.append(title)
    news_url8.append(url)
for news in bs.select('li'):
    if len(news.select('a[target="_blank"]'))>0:#match on the a tag's target attribute
        title=news.select('a[target="_blank"]')[0].text
        url=news.select('a[target="_blank"]')[0]['href']
        news_title8.append(title)
        news_url8.append(url)
for html2 in news_url8:#visit each article page for its editor info
    resp=requests.get(html2)
    resp.encoding='utf-8'
    content=resp.text
    bs=BeautifulSoup(content,'html.parser')
    blocks=bs.select('div.author-txt')#most article pages keep date/time/editor in this div
    if len(blocks)==0:#the page uses a different layout, so record -1 for every field
        news_date7.append('-1')
        news_time7.append('-1')
        news_author7.append('-1')
    for news in blocks:
        author=date=time='-1'#fall back to -1 when the expected tags are missing
        if len(news.select('p.author-name'))>0:
            author=news.select('p.author-name')[0].text
            date=news.select('span.date')[0].text[5:]#drop the leading year portion
            time=news.select('span.time')[0].text
        news_date7.append(date)
        news_time7.append(time)
        news_author7.append(author)
dataframe=pd.DataFrame({'游戏新闻标题':news_title8,'新闻链接':news_url8,'编辑日期':news_date7,'编辑时间':news_time7,'编辑作者':news_author7})
dataframe.to_csv(filename,sep=',',encoding='utf-8-sig')
today=datetime.date.today()
yesterday=today - datetime.timedelta(days=1)
s=str(today)[5:]
s2=str(yesterday)[5:]
dx={s:0,s2:0,'-1':0}
dx_keys=dx.keys()
for i in news_date7:
    if i in dx_keys:
        dx[i]=dx[i]+1
    else:dx['-1']=dx['-1']+1
for x in dx:
    dx[x]=dx[x]/len(news_date7)
li7=list(dx.values())
