python每天定时爬取微博热搜并保存到本地(表格、数据库)

作者：程序代码艺术家2 | 2024-02-03 15:19:32

踩

td rank()

微博热搜网站：https://s.weibo.com/top/summary/

就是这个样子：
在这里插入图片描述
pyquery提取：

保险起见headers里加个UA…

from pyquery import PyQuery as pq

# pq(url, <dict>) treats the second positional argument as request *data*
# (it ends up in the query string), so the User-Agent must be passed through
# the `headers` keyword to actually be sent as an HTTP header.
# NOTE(review): `headers` is forwarded by pyquery's requests-based opener;
# confirm `requests` is installed in the target environment.
html = pq(
    url="https://s.weibo.com/top/summary/",
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36'
    },
)

这里打印 html，可以看到已经拿到结果了：
在这里插入图片描述


# Print every ranked hot-search entry as a dict of rank / name / heat.
rows = html("#pl_top_realtimehot > table > tbody > tr")
for row in rows.items():
    # Rows whose text has fewer than three whitespace-separated tokens
    # are skipped.
    if len(row.text().split()) < 3:
        continue
    rank = int(row('td.td-01.ranktop').text())
    name = row('td.td-02 > a').text()
    heat = int(row('td.td-02 > span').text())
    print({'排名': rank, '名称': name, '热度': heat})

用上面这段代码就可以打印出所有结果：
在这里插入图片描述

import csv

# Open the file once for the whole batch instead of once per row.
# csv.writer quotes any field that contains a comma or a quote, so a topic
# name with a comma no longer shifts the columns. The file stays in normal
# text mode with "\n" as the row terminator, which keeps the on-disk line
# endings the same as the previous hand-written f.write(... + "\n").
with open("weibo.csv", "a", encoding="utf-8") as f:
    writer = csv.writer(f, lineterminator="\n")
    for item in html("#pl_top_realtimehot > table > tbody > tr").items():
        # Rows with fewer than three whitespace-separated tokens are skipped.
        if len(item.text().split()) >= 3:
            writer.writerow([
                item('td.td-01.ranktop').text(),
                item('td.td-02 > a').text(),
                item('td.td-02 > span').text(),
            ])

这一段代码就可以把数据存储为 csv 文件：
在这里插入图片描述

但是因为我们想要每天定时爬取，所以还要给每条记录加上抓取的日期和时间：

import csv
import time

# One timestamp for the whole batch, so every row of a single scrape carries
# the same date/time even if writing crosses a second boundary.
date_str = time.strftime("%Y-%m-%d")
time_str = time.strftime("%H:%M:%S")

with open("weibo.csv", "a", encoding="utf-8") as f:
    writer = csv.writer(f, lineterminator="\n")
    # In append mode the stream starts at the end of the file, so position 0
    # means the file is new/empty: write the header only then, instead of
    # repeating it on every scheduled run.
    if f.tell() == 0:
        writer.writerow(["日期", "时间", "排名", "名称", "热度"])
    for item in html("#pl_top_realtimehot > table > tbody > tr").items():
        # Rows with fewer than three whitespace-separated tokens are skipped.
        if len(item.text().split()) >= 3:
            writer.writerow([
                date_str,
                time_str,
                item('td.td-01.ranktop').text(),
                item('td.td-02 > a').text(),
                item('td.td-02 > span').text(),
            ])

这样就可以了：
在这里插入图片描述

接下来是保存到 Excel：

# Build the rows once; both the "append" and the "create" path write the
# same data, only the target sheet and the starting row differ.
date_str = time.strftime("%Y-%m-%d")
time_str = time.strftime("%H:%M:%S")
records = [
    [
        date_str,
        time_str,
        int(item('td.td-01.ranktop').text()),
        item('td.td-02 > a').text(),
        int(item('td.td-02 > span').text()),
    ]
    for item in html("#pl_top_realtimehot > table > tbody > tr").items()
    # Rows with fewer than three whitespace-separated tokens are skipped.
    if len(item.text().split()) >= 3
]

# Only opening the workbook can raise FileNotFoundError, so only that call
# sits inside the try.
try:
    r_xls = open_workbook("weibo.xls")
except FileNotFoundError:
    # First run: create the workbook and write the header row.
    excel = xlwt.Workbook()
    table = excel.add_sheet("sheet1")
    for col, title in enumerate(['日期', '时间', '排名', '名称', '热度']):
        table.write(0, col, title)
    row = 1
else:
    # Later runs: copy the read-only xlrd book into a writable xlwt book and
    # continue after the last used row.
    row = r_xls.sheets()[0].nrows
    excel = copy(r_xls)
    table = excel.get_sheet("sheet1")

for contents in records:
    for col, value in enumerate(contents):
        table.write(row, col, value)
    row += 1
excel.save("weibo.xls")

当 "weibo.xls" 已存在时，直接在原有数据之后追加写入；当文件不存在（触发 FileNotFoundError）时，先新建工作簿并写入表头，再写入数据。
在这里插入图片描述

接下来是保存数据到 MongoDB 数据库：

不会用的可以看这里入门：pymongo操作MongoDB基础教程

import pymongo

client = pymongo.MongoClient(host="localhost", port=27017)
try:
    db = client.test
    collection = db.weibo
    # One timestamp for the whole batch.
    date_str = time.strftime("%Y-%m-%d")
    time_str = time.strftime("%H:%M:%S")
    documents = [
        {
            '日期': date_str,
            '时间': time_str,
            '排名': int(item('td.td-01.ranktop').text()),
            '名称': item('td.td-02 > a').text(),
            '热度': int(item('td.td-02 > span').text()),
        }
        for item in html("#pl_top_realtimehot > table > tbody > tr").items()
        # Rows with fewer than three whitespace-separated tokens are skipped.
        if len(item.text().split()) >= 3
    ]
    # insert_many rejects an empty list, so only call it when there is data;
    # a single bulk insert replaces one round trip per row.
    if documents:
        collection.insert_many(documents)
finally:
    # Release the connection pool even if parsing or inserting fails.
    client.close()

在这里插入图片描述
保存数据到 MySQL 数据库（4.24 新增）：

def save_to_mysql(html):
    """Append the current hot-search rows to the MySQL table ``weibo``.

    The table is created on first use. Each stored row holds the capture
    date, capture time, rank, topic name and heat, all as text.

    Args:
        html: PyQuery document of the hot-search page.
    """
    # One timestamp for the whole batch.
    date_str = time.strftime("%Y-%m-%d")
    time_str = time.strftime("%H:%M:%S")
    rows = [
        (
            date_str,
            time_str,
            item('td.td-01.ranktop').text(),
            item('td.td-02 > a').text(),
            item('td.td-02 > span').text(),
        )
        for item in html("#pl_top_realtimehot > table > tbody > tr").items()
        # Rows with fewer than three whitespace-separated tokens are skipped.
        if len(item.text().split()) >= 3
    ]

    # NOTE(review): the credentials are hard-coded in source; move them to
    # configuration or environment variables before sharing this file.
    conn = pymysql.connect(
            host="localhost",
            user="root",
            passwd="19834044876lpy",
            port=3306,
            database="mydb",
            charset='utf8',
            cursorclass=pymysql.cursors.DictCursor)
    try:
        # A single cursor, closed by the context manager, replaces the
        # previously leaked create-table cursor and the trailing
        # cursor.close() that failed with UnboundLocalError whenever no row
        # had matched.
        with conn.cursor() as cursor:
            cursor.execute("""
    create table if not exists weibo (
    日期 char(30),
    时间 char(30),
    排名 char(30),
    名称 char(30),
    热度 char(30)
    );
    """)
            if rows:
                cursor.executemany(
                    "insert into weibo (日期,时间,排名,名称,热度) values (%s,%s,%s,%s,%s);",
                    rows)
        # Commit once for the whole batch.
        conn.commit()
    finally:
        # Always release the connection, including on errors.
        conn.close()

然后我们把它们封装成函数以便于调用，并在主函数里设置每到整点抓取一次数据。

接下来上源码：

import csv
import os
import threading
import time

import pymongo
import pymysql
import xlwt
from pyquery import PyQuery as pq
from xlrd import open_workbook
from xlutils.copy import copy


def get_data():
    """Download the Weibo hot-search page and return it as a PyQuery document.

    The User-Agent is passed through the ``headers`` keyword: a dict given as
    the second positional argument of ``pq(url, ...)`` is treated as request
    data (query string), not as HTTP headers, so it would never reach the
    server as a header.

    Returns:
        The parsed page as a PyQuery object.
    """
    # NOTE(review): `headers` is forwarded by pyquery's requests-based
    # opener; confirm `requests` is installed in the target environment.
    return pq(
        url="https://s.weibo.com/top/summary/",
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36'
        },
    )


def save_to_csv(html):
    """Append the current hot-search rows to ``weibo.csv``.

    A header row is written first when the file does not exist yet. Each data
    row holds capture date, capture time, rank, topic name and heat.

    Args:
        html: PyQuery document of the hot-search page.
    """
    write_header = not os.path.exists("weibo.csv")
    # One timestamp for the whole batch.
    date_str = time.strftime("%Y-%m-%d")
    time_str = time.strftime("%H:%M:%S")

    # The file is opened once per call rather than once per row.
    # csv.writer quotes fields containing commas or quotes, so a topic name
    # with a comma no longer shifts the columns. Normal text mode plus "\n"
    # as row terminator keeps line endings identical to the earlier
    # hand-written output.
    with open("weibo.csv", "a", encoding="utf-8") as f:
        writer = csv.writer(f, lineterminator="\n")
        if write_header:
            writer.writerow(["日期", "时间", "排名", "名称", "热度"])
        for item in html("#pl_top_realtimehot > table > tbody > tr").items():
            # Rows with fewer than three whitespace-separated tokens are
            # skipped.
            if len(item.text().split()) >= 3:
                writer.writerow([
                    date_str,
                    time_str,
                    item('td.td-01.ranktop').text(),
                    item('td.td-02 > a').text(),
                    item('td.td-02 > span').text(),
                ])


def save_to_excel(html):
    """Append the current hot-search rows to the workbook ``weibo.xls``.

    When the workbook exists, rows are added after the last used row of its
    first sheet; otherwise a new workbook with a header row is created.

    Note: the legacy .xls format holds at most 65,536 rows per sheet, so the
    file has to be rotated before that limit is reached.

    Args:
        html: PyQuery document of the hot-search page.

    Raises:
        ValueError: if a rank or heat cell does not contain a plain integer.
    """
    # Build the rows once; the append path and the create path write the same
    # data and differ only in target sheet and starting row.
    date_str = time.strftime("%Y-%m-%d")
    time_str = time.strftime("%H:%M:%S")
    records = [
        [
            date_str,
            time_str,
            int(item('td.td-01.ranktop').text()),
            item('td.td-02 > a').text(),
            int(item('td.td-02 > span').text()),
        ]
        for item in html("#pl_top_realtimehot > table > tbody > tr").items()
        # Rows with fewer than three whitespace-separated tokens are skipped.
        if len(item.text().split()) >= 3
    ]

    # Only opening the workbook can raise FileNotFoundError, so only that
    # call sits inside the try.
    try:
        r_xls = open_workbook("weibo.xls")
    except FileNotFoundError:
        # First run: create the workbook and write the header row.
        excel = xlwt.Workbook()
        table = excel.add_sheet("sheet1")
        for col, title in enumerate(['日期', '时间', '排名', '名称', '热度']):
            table.write(0, col, title)
        row = 1
    else:
        # Later runs: copy the read-only xlrd book into a writable xlwt book
        # and continue after the last used row.
        row = r_xls.sheets()[0].nrows
        excel = copy(r_xls)
        table = excel.get_sheet("sheet1")

    for contents in records:
        for col, value in enumerate(contents):
            table.write(row, col, value)
        row += 1
    # Save, overwriting the existing file.
    excel.save("weibo.xls")


def save_to_mongodb(html):
    """Insert the current hot-search rows into MongoDB (``test.weibo``).

    Args:
        html: PyQuery document of the hot-search page.

    Raises:
        ValueError: if a rank or heat cell does not contain a plain integer.
    """
    client = pymongo.MongoClient(host="localhost", port=27017)
    try:
        collection = client.test['weibo']
        # One timestamp for the whole batch.
        date_str = time.strftime("%Y-%m-%d")
        time_str = time.strftime("%H:%M:%S")
        documents = [
            {
                '日期': date_str,
                '时间': time_str,
                '排名': int(item('td.td-01.ranktop').text()),
                '名称': item('td.td-02 > a').text(),
                '热度': int(item('td.td-02 > span').text()),
            }
            for item in html("#pl_top_realtimehot > table > tbody > tr").items()
            # Rows with fewer than three whitespace-separated tokens are
            # skipped.
            if len(item.text().split()) >= 3
        ]
        # insert_many rejects an empty list, so only call it when there is
        # data; one bulk insert replaces one round trip per row.
        if documents:
            collection.insert_many(documents)
    finally:
        # A new client is created on every call, so close it every time.
        client.close()


def save_to_mysql(html):
    """Append the current hot-search rows to the MySQL table ``weibo``.

    The table is created on first use. Each stored row holds the capture
    date, capture time, rank, topic name and heat, all as text.

    Args:
        html: PyQuery document of the hot-search page.
    """
    # One timestamp for the whole batch.
    date_str = time.strftime("%Y-%m-%d")
    time_str = time.strftime("%H:%M:%S")
    rows = [
        (
            date_str,
            time_str,
            item('td.td-01.ranktop').text(),
            item('td.td-02 > a').text(),
            item('td.td-02 > span').text(),
        )
        for item in html("#pl_top_realtimehot > table > tbody > tr").items()
        # Rows with fewer than three whitespace-separated tokens are skipped.
        if len(item.text().split()) >= 3
    ]

    # NOTE(review): the credentials are hard-coded in source; move them to
    # configuration or environment variables before sharing this file.
    conn = pymysql.connect(
            host="localhost",
            user="root",
            passwd="19834044876lpy",
            port=3306,
            database="mydb",
            charset='utf8',
            cursorclass=pymysql.cursors.DictCursor)
    try:
        # A single cursor, closed by the context manager, replaces the
        # previously leaked create-table cursor and the trailing
        # cursor.close() that failed with UnboundLocalError whenever no row
        # had matched.
        with conn.cursor() as cursor:
            cursor.execute("""
    create table if not exists weibo (
    日期 char(30),
    时间 char(30),
    排名 char(30),
    名称 char(30),
    热度 char(30)
    );
    """)
            if rows:
                cursor.executemany(
                    "insert into weibo (日期,时间,排名,名称,热度) values (%s,%s,%s,%s,%s);",
                    rows)
        # Commit once for the whole batch.
        conn.commit()
    finally:
        # Always release the connection, including on errors.
        conn.close()


def _seconds_until_next_hour():
    """Return the whole seconds left until the next local top of the hour."""
    now = time.localtime()
    return 3600 - (now.tm_min * 60 + now.tm_sec)


def _fetch_and_save(saver):
    """Fetch a fresh hot-search page and pass it to *saver*."""
    saver(get_data())


if __name__ == '__main__':
    # Every hour, on the hour (local time), store one snapshot in each sink.
    savers = (save_to_csv, save_to_excel, save_to_mongodb, save_to_mysql)
    while True:
        # Sleep until the next full hour instead of spinning on strftime,
        # which kept a CPU core busy and could fire several times within the
        # matching second.
        time.sleep(_seconds_until_next_hour())
        # Pass the callable via target=/args=. Thread(save_to_csv(...)) ran
        # the saver immediately in the main thread and started a thread with
        # no target. Each worker fetches its own page, as before, so no
        # parsed document is shared between threads.
        workers = [
            threading.Thread(target=_fetch_and_save, args=(saver,))
            for saver in savers
        ]
        for worker in workers:
            worker.start()
        # Finish this round before scheduling the next one.
        for worker in workers:
            worker.join()
