当前位置:   article > 正文

python实现新闻爬取系统_python 爬取qq新闻

python 爬取qq新闻

新闻爬取系统

在这里插入图片描述

在这里插入图片描述

信息展示:tkinter

爬取及请求:requests、BeautifulSoup

设置新闻列表API

在这里插入图片描述

打开腾讯新闻网页->鼠标右键检查/键盘F12键->网络->刷新一下页面

在这里插入图片描述

然后在网络面板中找到新闻列表请求,右键复制其链接地址,即为新闻列表API

程序运行效果

在这里插入图片描述

在这里插入图片描述

文件写入内容

在这里插入图片描述

参考代码部分(共两个文件)

注意:运行前请先在代码中把 data.txt 的保存路径改为你本地的文件路径!

数据爬取文件Myspider_news.py

import requests
from bs4 import BeautifulSoup


class MySpider:
    """Fetch the news list API and article pages, and save the list to disk.

    The list endpoint is expected to return JSON shaped like
    ``{"data": {"list": [{"title": ..., "publish_time": ..., "url": ...}]}}``;
    article pages are parsed as HTML.
    """

    def __init__(self):
        # Rows of [title, publish_time, url] produced by the most recent
        # successful getJSONText() call; writeFile() serialises this list.
        self.ulist = []

    def getResponse(self, url):
        """Download *url* with a browser-like User-Agent and return the response.

        Args:
            url: Address of the list API or of an article page.

        Returns:
            The ``requests`` response, with ``encoding`` replaced by the
            encoding detected from the body so that ``r.text`` decodes pages
            whose headers declare no (or a wrong) charset.

        Raises:
            requests.RequestException: on connection problems, when the
                30 second timeout expires, or - through
                ``raise_for_status()`` - on an HTTP error status.
        """
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54'}
        r = requests.get(url, timeout=30, headers=headers)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r

    def getJSONText(self, r):
        """Extract ``[title, publish_time, url]`` rows from a list response.

        Args:
            r: Response object whose ``json()`` returns the list payload.

        Returns:
            The list of rows; the same list is stored in ``self.ulist``.

        Raises:
            KeyError: if the payload lacks ``data.list`` or an entry lacks
                one of the three fields. ``self.ulist`` is left untouched
                in that case.
        """
        data = r.json()
        ulist = [[n['title'], n['publish_time'], n['url']]
                 for n in data['data']['list']]
        self.ulist = ulist
        return ulist

    @staticmethod
    def _single_line(value):
        """Return *value* as text with every line break replaced by a space."""
        return " ".join(str(value).splitlines())

    def writeFile(self, file='data.txt'):
        """Write ``self.ulist`` to *file*, one ``index::title::time::url`` line per row.

        Line breaks inside a field are flattened to spaces: consumers look a
        record up by its line number, so every record must occupy exactly
        one line.

        Args:
            file: Destination path; an existing file is overwritten.

        Raises:
            OSError: if the file cannot be opened for writing.
        """
        with open(file, "w", encoding='utf-8') as f:
            for i, item in enumerate(self.ulist):
                title, publish_time, href = (
                    self._single_line(item[k]) for k in range(3))
                f.write(f"{i}::{title}::{publish_time}::{href}\n")

    def getNewsContent(self, r):
        """Pull the headline and body text out of an article page.

        Args:
            r: Response whose ``text`` is the article HTML.

        Returns:
            A ``(title, body)`` tuple. ``title`` is the text of the first
            ``<h1>`` or ``""`` when the page has none; ``body`` is the text
            of every ``div#ArticleContent > p.one-p`` paragraph, each
            followed by a newline (``""`` when none match).
        """
        # The 'lxml' parser needs the third-party lxml package installed.
        soup = BeautifulSoup(r.text, 'lxml')
        paragraphs = soup.select('div#ArticleContent>p.one-p')
        headings = soup.select("h1")
        # A page without <h1> should still yield its body, not an IndexError.
        title = headings[0].get_text() if headings else ""
        data = "".join(p.get_text() + "\n" for p in paragraphs)
        return title, data
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43

窗口展示文件MySpiderGui_news.py

from tkinter import *
from tkinter import messagebox
from Myspider_news import *


class MySpiderGUI_news:
    """Tkinter window that lists news headlines and opens single articles.

    The list is downloaded through ``MySpider``, cached in ``DATA_FILE`` and
    shown in a text widget. A news id typed by the user selects the cached
    record whose article is then fetched and shown in a second window.
    """

    # Where the fetched list is cached. This is a placeholder: set it to a
    # writable local path before running.
    DATA_FILE = r"G:\(你本地的文件路径)test-file\data.txt"
    # Largest value accepted in the "number of items" field.
    MAX_NEWS = 20
    # Titles are clipped / padded to this many characters so columns align.
    TITLE_WIDTH = 15

    def __init__(self):
        """Build the main window and run the Tk main loop until it is closed."""
        self.window = Tk()
        self.window.title("新闻爬取")
        Label(self.window, text="腾讯新闻", font=("黑体", 26, 'bold')).pack()

        # --- top row: URL, item count, action buttons ---
        f1 = Frame(self.window)
        f1.pack(fill="both")
        Label(f1, text="请输入网址:", font=('黑体', 12)).pack(side="left")
        self.url = StringVar()
        self.url.set("https://i.news.qq.com/trpc.qqnews_web.kv_srv.kv_srv_http_proxy/"
                     "list?sub_srv_id=24hours&srv_id=pc&offset=0&limit=20&strategy=1&ext="
                     "{%22pool%22:[%22top%22,%22hot%22],%22is_filter%22:7,%22check_type%22:true}")
        # The entry is bound to self.url so the address read by btOK can be
        # edited; the caption that follows is a Label, not an input box.
        Entry(f1, textvariable=self.url).pack(side="left", fill="x", expand=1)
        Label(f1, text="显示数量: ", font=('黑体', 12)).pack(side="left")
        self.num = IntVar()
        Entry(f1, textvariable=self.num).pack(side="left")

        Button(f1, text="确定", command=self.btOK, padx=10).pack(side="left")
        Button(f1, text="清空", command=self.btCancel, padx=10).pack(side="left")

        # --- list area with both scrollbars, plus the news-id row ---
        f2 = Frame(self.window)
        f2.pack(fill="both", expand=1)
        scrollbarx = Scrollbar(f2, orient="horizontal")
        scrollbary = Scrollbar(f2, orient="vertical")
        scrollbarx.pack(side="bottom", fill=X)
        scrollbary.pack(side="right", fill=Y)

        self.text = Text(f2, wrap='none', width=60,
                         xscrollcommand=scrollbarx.set,
                         yscrollcommand=scrollbary.set)
        scrollbarx.config(command=self.text.xview)
        scrollbary.config(command=self.text.yview)
        self.text.pack(fill="both", expand=1)
        Label(f2, text="新闻id :", font=('黑体', 12)).pack(side="left")
        self.news_id = IntVar()
        Entry(f2, textvariable=self.news_id).pack(side="left")
        Button(f2, text="显示新闻", command=self.btNews, padx=10).pack(side="left")

        # Path of the cached list; empty until btOK has fetched one.
        self.file = ""
        self.window.mainloop()

    def _insert_header(self):
        """Append the column header line to the list view."""
        # chr(12288) is the full-width (ideographic) space, used as the fill
        # character so the CJK caption is padded with CJK-width blanks.
        tplt = "{0:^5} {1:{3}^18} {2:<10}"
        self.text.insert('end', tplt.format("序 号", "新 闻", "时 间", chr(12288)))
        self.text.insert('end', '\n')

    def _format_row(self, index, title, publish_time):
        """Return one aligned list line for a news item.

        The title is converted to full-width characters and brought to a
        fixed width: titles longer than ``TITLE_WIDTH`` are cut and marked
        with ``...``; shorter ones are padded (three plain spaces stand in
        for the ellipsis so both cases are equally wide). Only the first ten
        characters of *publish_time* are kept.
        """
        width = self.TITLE_WIDTH
        if len(title) > width:
            shown = self.strB2Q(title[:width]) + "..."
        else:
            shown = self.strB2Q(title) + "   " + chr(12288) * (width - len(title))
        return "{0:^5} {1:^18} {2:<10}".format(str(index), shown, publish_time[:10])

    def btOK(self):
        """Fetch the news list, cache it to ``DATA_FILE`` and display it.

        At most the requested number of rows is shown; when the download
        fails or returns fewer rows, only the rows actually available are
        listed and counted.
        """
        self.text.delete(1.0, END)
        self._insert_header()

        try:
            wanted = self.num.get()
        except TclError:
            # IntVar.get() raises when the entry does not hold an integer.
            messagebox.showerror("错误", "显示数量必须是整数")
            return
        if wanted > self.MAX_NEWS:
            messagebox.showerror("错误", "输入的新闻数太多啦")
            return

        ulist = []
        messagebox.showinfo("提示", "开始爬取。。。")
        spider = MySpider()
        try:
            r = spider.getResponse(self.url.get())
            ulist = spider.getJSONText(r)
            self.file = self.DATA_FILE
            spider.writeFile(self.file)
        except Exception as ex:
            # UI boundary: report and carry on with whatever was fetched.
            print("程序出错:", ex)

        # Never index past the rows that really exist (empty after a failed
        # fetch) and treat a negative request as zero.
        shown = max(0, min(wanted, len(ulist)))
        for i, u in enumerate(ulist[:shown]):
            self.text.insert('end', self._format_row(i, u[0], u[1]))
            self.text.insert('end', "\n")
            self.text.insert('end', "\n")

        self.text.insert('end', "共有记录" + str(shown) + "条")
        self.text.insert('end', '\n')

    def btCancel(self):
        """Reset the item count and clear the list view back to its header."""
        self.num.set(0)
        self.text.delete(1.0, END)
        self._insert_header()

    def btNews(self):
        """Open a child window and show the article selected by the news id."""
        # A Toplevel shares the main window's interpreter and event loop, so
        # no second Tk() root or nested mainloop() is needed.
        root = Toplevel(self.window)
        root.title("显示新闻")
        self.lbltitle = Label(root, text=" ", font=('黑体', 22, 'bold'))
        self.lbltitle.pack()
        f1 = Frame(root)
        f1.pack(fill="both", expand=1)
        scrollbarx = Scrollbar(f1, orient="horizontal")
        scrollbary = Scrollbar(f1, orient="vertical")
        scrollbarx.pack(side="bottom", fill=X)
        scrollbary.pack(side="right", fill=Y)
        self.news_text = Text(f1, wrap="none", width=60, height=10,
                              xscrollcommand=scrollbarx.set,
                              yscrollcommand=scrollbary.set)
        # The scrollbars must drive the article widget, not the list view.
        scrollbarx.config(command=self.news_text.xview)
        scrollbary.config(command=self.news_text.yview)
        self.news_text.pack(fill="both", expand=1)
        Button(f1, text="关闭窗口", command=root.destroy, padx=10).pack()
        self.displayNews()

    @staticmethod
    def _parse_record(line):
        """Split one cached ``index::title::time::url`` line into its fields.

        The index is taken from the left and time/url from the right, so a
        title that itself contains ``::`` stays intact. The trailing line
        break is removed.

        Raises:
            ValueError: if the line does not contain all four fields.
        """
        index, rest = line.rstrip("\r\n").split("::", 1)
        title, publish_time, url = rest.rsplit("::", 2)
        return index, title, publish_time, url

    def _load_record(self, news_id):
        """Return the ``(index, title, time, url)`` record stored at *news_id*.

        Raises:
            FileNotFoundError: if no list has been cached yet.
            IndexError: if *news_id* is negative or past the last record.
            OSError: if the cache file cannot be read.
            ValueError: if the stored line is malformed.
        """
        if not self.file:
            raise FileNotFoundError("尚未爬取新闻列表")
        with open(self.file, "r", encoding='utf-8') as f:
            lines = f.readlines()
        # Reject negatives explicitly: Python would otherwise count from the
        # end of the list and silently show a different article.
        if not 0 <= news_id < len(lines):
            raise IndexError(f"新闻id {news_id} 不存在")
        return self._parse_record(lines[news_id])

    def displayNews(self):
        """Fetch the selected article and fill the article window.

        Any failure (bad id, missing cache, network or parsing error) is
        printed and leaves title and content empty.
        """
        title = ""
        content = ""
        try:
            news_url = self._load_record(self.news_id.get())[3]
            newsSpider = MySpider()
            r = newsSpider.getResponse(news_url)
            title, content = newsSpider.getNewsContent(r)
        except Exception as ex:
            print("程序出错: ", ex)
        self.lbltitle["text"] = title
        self.news_text.insert('end', "标题: " + title)
        self.news_text.insert('end', "\n")
        self.news_text.insert('end', "内容: ")
        self.news_text.insert('end', content)
        self.news_text.insert('end', "\n")

    def strB2Q(self, ustring):
        """Convert half-width ASCII characters in *ustring* to full-width.

        A space (32) becomes the ideographic space (12288); the other
        printable ASCII characters up to 126 are shifted by 65248 into the
        full-width block. All other characters are returned unchanged.
        """
        converted = []
        for uchar in ustring:
            code = ord(uchar)
            if code == 32:
                code = 12288
            elif 32 < code <= 126:
                code += 65248
            converted.append(chr(code))
        return "".join(converted)


# Start the GUI only when this file is executed as a script, so that
# importing the module does not open a window and block in the Tk main loop.
if __name__ == "__main__":
    MySpiderGUI_news()
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
  • 72
  • 73
  • 74
  • 75
  • 76
  • 77
  • 78
  • 79
  • 80
  • 81
  • 82
  • 83
  • 84
  • 85
  • 86
  • 87
  • 88
  • 89
  • 90
  • 91
  • 92
  • 93
  • 94
  • 95
  • 96
  • 97
  • 98
  • 99
  • 100
  • 101
  • 102
  • 103
  • 104
  • 105
  • 106
  • 107
  • 108
  • 109
  • 110
  • 111
  • 112
  • 113
  • 114
  • 115
  • 116
  • 117
  • 118
  • 119
  • 120
  • 121
  • 122
  • 123
  • 124
  • 125
  • 126
  • 127
  • 128
  • 129
  • 130
  • 131
  • 132
  • 133
  • 134
  • 135
  • 136
  • 137
  • 138
  • 139
  • 140
  • 141
  • 142
  • 143
  • 144
  • 145
  • 146
  • 147
  • 148
  • 149
  • 150
  • 151
  • 152
  • 153
  • 154
  • 155
  • 156
  • 157

注意设置本地文件路径!!!!


在这里插入图片描述

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/小蓝xlanll/article/detail/687072
推荐阅读
相关标签
  

闽ICP备14008679号