赞
踩
信息展示:tkinter
爬取及请求:requests、BeautifulSoup
打开腾讯新闻网页->鼠标右键检查/键盘F12键->网络->刷新一下页面
然后在请求列表中找到返回新闻列表数据的那条请求,右键复制它的链接地址,即为程序中要填写的网址
注意设置本地文件路径!!!!
Myspider_news.py
import requests
from bs4 import BeautifulSoup
class MySpider:
    """Download a news list (JSON) and individual news pages (HTML).

    Attributes:
        ulist: Rows of ``[title, publish_time, url]`` produced by the most
            recent :meth:`getJSONText` call; empty until then.
    """

    def __init__(self):
        self.ulist = []

    def getResponse(self, url):
        """Issue a GET request for *url* and return the response object.

        A browser-like ``user-agent`` header is sent and the request times
        out after 30 seconds.

        Raises:
            requests.HTTPError: via ``raise_for_status()`` on a 4xx/5xx reply.
        """
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54'}
        r = requests.get(url, timeout=30, headers=headers)
        r.raise_for_status()
        # Use the encoding detected from the body rather than the header.
        r.encoding = r.apparent_encoding
        return r

    def getJSONText(self, r):
        """Extract ``[title, publish_time, url]`` rows from a JSON response.

        Args:
            r: Response-like object whose ``json()`` returns a mapping with
                the news items under ``['data']['list']``.

        Returns:
            The list of rows, which is also stored in ``self.ulist``.

        Raises:
            KeyError: if the payload or an item lacks an expected key.
        """
        # A null list in the payload is treated as "no news".
        news = r.json()['data']['list'] or []
        ulist = [[n['title'], n['publish_time'], n['url']] for n in news]
        self.ulist = ulist
        return ulist

    def writeFile(self, file='data.txt'):
        """Write ``self.ulist`` to *file*, one ``index::title::time::url`` line per row.

        Line breaks inside a field are replaced by a space so that record
        *i* is always line *i* of the file.
        """
        with open(file, "w", encoding='utf-8') as f:
            for i, item in enumerate(self.ulist):
                title, publish_time, href = (
                    " ".join(str(field).splitlines()) for field in item[:3]
                )
                f.write(f"{i}::{title}::{publish_time}::{href}\n")

    def getNewsContent(self, r):
        """Return ``(title, text)`` parsed from a news page response.

        The title is the text of the first ``<h1>`` (an empty string when the
        page has none). The text is every ``div#ArticleContent > p.one-p``
        paragraph, each followed by a newline.
        """
        soup = BeautifulSoup(r.text, 'lxml')
        heading = soup.select_one("h1")
        # A page without <h1> still yields its body text instead of failing.
        title = heading.get_text() if heading is not None else ""
        paragraphs = soup.select('div#ArticleContent>p.one-p')
        data = "".join(p.get_text() + "\n" for p in paragraphs)
        return title, data
MySpiderGui_news.py
from tkinter import *
from tkinter import messagebox
from Myspider_news import *
class MySpiderGUI_news:
    """Tkinter window that lists crawled news and shows a selected article.

    The list is fetched with ``MySpider``, saved to a data file, and shown
    in a text widget; a second window displays one article chosen by its
    line index in that file.

    Args:
        data_file: Path of the file the news list is saved to. Defaults to
            ``DATA_FILE``, which is a placeholder that must be adapted to
            the local machine.
    """

    # Placeholder path; set it (or pass ``data_file``) for the local machine.
    DATA_FILE = r"G:\(你本地的文件路径)test-file\data.txt"
    # Largest number of news rows the user may request.
    MAX_NEWS = 20
    # Titles are cut or padded to this many characters.
    TITLE_WIDTH = 15
    # Ideographic (full-width) space used for padding and header fill.
    FULLWIDTH_SPACE = chr(12288)
    # Half-width -> full-width mapping: space becomes U+3000, the other
    # printable ASCII characters (33..126) are shifted by 65248.
    _B2Q_TABLE = {32: 12288, **{code: code + 65248 for code in range(33, 127)}}

    def __init__(self, data_file=None):
        self.data_file = data_file or self.DATA_FILE
        self.window = Tk()
        self.window.title("新闻爬取")
        Label(self.window, text="腾讯新闻", font=("黑体", 26, 'bold')).pack()

        f1 = Frame(self.window)
        f1.pack(fill="both")
        Label(f1, text="请输入网址:", font=('黑体', 12)).pack(side="left")
        self.url = StringVar()
        self.url.set("https://i.news.qq.com/trpc.qqnews_web.kv_srv.kv_srv_http_proxy/"
                     "list?sub_srv_id=24hours&srv_id=pc&offset=0&limit=20&strategy=1&ext="
                     "{%22pool%22:[%22top%22,%22hot%22],%22is_filter%22:7,%22check_type%22:true}")
        # The prompt above needs an input field; it is prefilled with the
        # default list URL so the out-of-the-box behaviour is unchanged.
        Entry(f1, textvariable=self.url).pack(side="left", fill="x", expand=1)
        # This is a caption, so it must be a Label (an Entry would treat
        # ``text`` as an abbreviation of ``textvariable``).
        Label(f1, text="显示数量: ", font=('黑体', 12)).pack(side="left")
        self.num = IntVar()
        Entry(f1, textvariable=self.num).pack(side="left")
        Button(f1, text="确定", command=self.btOK, padx=10).pack(side="left")
        Button(f1, text="清空", command=self.btCancel, padx=10).pack(side="left")

        f2 = Frame(self.window)
        f2.pack(fill="both", expand=1)
        scrollbarx = Scrollbar(f2, orient="horizontal")
        scrollbary = Scrollbar(f2, orient="vertical")
        scrollbarx.pack(side="bottom", fill=X)
        scrollbary.pack(side="right", fill=Y)
        self.text = Text(f2, wrap='none', width=60,
                         xscrollcommand=scrollbarx.set,
                         yscrollcommand=scrollbary.set)
        scrollbarx.config(command=self.text.xview)
        scrollbary.config(command=self.text.yview)
        self.text.pack(fill="both", expand=1)
        Label(f2, text="新闻id :", font=('黑体', 12)).pack(side="left")
        self.news_id = IntVar()
        Entry(f2, textvariable=self.news_id).pack(side="left")
        Button(f2, text="显示新闻", command=self.btNews, padx=10).pack(side="left")

        # Path of the saved list; stays empty until a crawl has been saved.
        self.file = ""
        self.window.mainloop()

    def _insert_header(self):
        """Append the column header line to the list widget."""
        tplt = "{0:^5} {1:{3}^18} {2:<10}"
        self.text.insert('end', tplt.format("序 号", "新 闻", "时 间", self.FULLWIDTH_SPACE))
        self.text.insert('end', '\n')

    def _format_title(self, title):
        """Return *title* in full-width form, exactly TITLE_WIDTH + 3 characters long.

        Long titles are cut and suffixed with ``...``; short titles are
        padded with spaces instead, so they do not look truncated.
        """
        width = self.TITLE_WIDTH
        if len(title) > width:
            return self.strB2Q(title[:width]) + "..."
        wide = self.strB2Q(title)
        return wide + " " * 3 + self.FULLWIDTH_SPACE * (width - len(wide))

    @staticmethod
    def _parse_record(line):
        """Split one ``index::title::time::url`` line into ``(title, time, url)``.

        The line ending is stripped and the title may itself contain ``::``.

        Raises:
            ValueError: if the line does not have the expected fields.
        """
        _index, sep, rest = line.rstrip("\r\n").partition("::")
        fields = rest.rsplit("::", 2)
        if not sep or len(fields) != 3:
            raise ValueError(f"malformed record: {line!r}")
        return tuple(fields)

    def btOK(self):
        """Crawl the list URL, save it, and show the requested number of rows."""
        self.text.delete(1.0, END)
        self._insert_header()
        try:
            wanted = self.num.get()
        except TclError:
            # The count field does not hold an integer.
            messagebox.showerror("错误", "请输入整数")
            return
        if wanted > self.MAX_NEWS:
            messagebox.showerror("错误", "输入的新闻数太多啦")
            return

        messagebox.showinfo("提示", "开始爬取。。。")
        spider = MySpider()
        try:
            response = spider.getResponse(self.url.get())
            rows = spider.getJSONText(response)
        except Exception as ex:  # UI boundary: report the failure and stop.
            print("程序出错:", ex)
            messagebox.showerror("错误", f"爬取失败: {ex}")
            return

        # A failed save must not prevent the list from being displayed.
        try:
            spider.writeFile(self.data_file)
        except OSError as ex:
            print("程序出错:", ex)
            self.file = ""
        else:
            self.file = self.data_file

        # Never index past what was actually fetched.
        shown = rows[:max(wanted, 0)]
        tplt = "{0:^5} {1:^18} {2:<10}"
        for index, (title, publish_time, _href) in enumerate(shown):
            line = tplt.format(str(index), self._format_title(str(title)),
                               str(publish_time)[:10])
            self.text.insert('end', line)
            self.text.insert('end', "\n")
        self.text.insert('end', "\n")
        self.text.insert('end', "共有记录" + str(len(shown)) + "条")
        self.text.insert('end', '\n')

    def btCancel(self):
        """Reset the requested count and clear the list back to its header."""
        self.num.set(0)
        self.text.delete(1.0, END)
        self._insert_header()

    def btNews(self):
        """Open a child window and show the article selected by ``news_id``."""
        # A Toplevel shares the running main loop; no second Tk root needed.
        root = Toplevel(self.window)
        root.title("显示新闻")
        self.lbltitle = Label(root, text=" ", font=('黑体', 22, 'bold'))
        self.lbltitle.pack()
        f1 = Frame(root)
        f1.pack(fill="both", expand=1)
        scrollbarx = Scrollbar(f1, orient="horizontal")
        scrollbary = Scrollbar(f1, orient="vertical")
        scrollbarx.pack(side="bottom", fill=X)
        scrollbary.pack(side="right", fill=Y)
        self.news_text = Text(f1, wrap="none", width=60, height=10,
                              xscrollcommand=scrollbarx.set,
                              yscrollcommand=scrollbary.set)
        # The scrollbars drive the article text, not the list widget.
        scrollbarx.config(command=self.news_text.xview)
        scrollbary.config(command=self.news_text.yview)
        self.news_text.pack(fill="both", expand=1)
        Button(f1, text="关闭窗口", command=root.destroy, padx=10).pack()
        self.displayNews()

    def _load_news_url(self):
        """Return the URL stored on line ``news_id`` of the saved list file.

        Raises:
            ValueError: if no list has been saved yet or the line is malformed.
            IndexError: if ``news_id`` is not a valid line index.
            OSError: if the file cannot be read.
        """
        if not self.file:
            raise ValueError("请先爬取新闻列表")
        with open(self.file, "r", encoding='utf-8') as f:
            lines = f.readlines()
        news_id = self.news_id.get()
        if not 0 <= news_id < len(lines):
            raise IndexError(f"新闻id超出范围: {news_id}")
        return self._parse_record(lines[news_id])[2]

    def displayNews(self):
        """Fetch the selected article and write its title and text to the window."""
        title = ""
        content = ""
        newsSpider = MySpider()
        try:
            r = newsSpider.getResponse(self._load_news_url())
            title, content = newsSpider.getNewsContent(r)
        except Exception as ex:  # UI boundary: show the problem, keep the window.
            print("程序出错: ", ex)
            messagebox.showerror("错误", str(ex))
        self.lbltitle["text"] = title
        self.news_text.insert('end', "标题: " + title)
        self.news_text.insert('end', "\n")
        self.news_text.insert('end', "内容: ")
        self.news_text.insert('end', content)
        self.news_text.insert('end', "\n")

    def strB2Q(self, ustring):
        """Convert half-width characters in *ustring* to their full-width forms.

        A space becomes U+3000 and the other printable ASCII characters are
        shifted by 65248; everything else is returned unchanged.
        """
        return ustring.translate(self._B2Q_TABLE)
# Launch the GUI only when this file is run as a script, not on import.
if __name__ == "__main__":
    MySpiderGUI_news()
注意设置本地文件路径!!!!
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。