针对某关键词爬取相关数据_csdn 关键词文章爬虫

作者：笔触狂放9 | 2024-04-16 18:06:04

踩

csdn 关键词文章爬虫

针对某关键词爬取相关数据

目标数据：2020.3.20微博用户发布的包含关键词“疫情”的微博文本数据。
针对上述目标数据，涉及到微博的https://weibo.com和https://weibo.cn站点，其中weibo.com的时间粒度是一个小时，weibo.cn的时间粒度是一天，为了抓取到尽可能多的关键词搜索结果，需要使用weibo.com的高级搜索功能来实现按照关键字和时间进行爬取。考虑到该站点查看数据的最小粒度是一个小时，一个小时内可以查看到的数据是50页，每页最多为20条微博数据，因此每个小时得到的最多数据量是1000条，24小时就是24000条数据。
爬取流程：发送请求—获得页面—解析页面—抽取并储存内容。
爬虫设计：设置随机user-agent模拟浏览器（通过请求头知道是通过哪个浏览器来请求的）；使用代理ip、携带cookie爬取数据；针对处理好的请求地址进行爬取，给网址发送请求，以二进制返回网页内容，对其进行解析获取需要的数据，并将其保存在csv文件中。
“疫情”：%25E7%2596%25AB%25E6%2583%2585
请求的地址：设置starttime和endtime得到请求网址（按照一小时的时间粒度进行爬取）
https://s.weibo.com/weibo/%25E7%2596%25AB%25E6%2583%2585?q=%E7%96%AB%E6%83%85&typeall=1&suball=1&timescope=custom:starttime:endtime&Refer=g&page=
使用到的Python库：urllib、requests（requests 负责连接网站，返回网页）、BeautifulSoup（解析网页内容）。

import os
import urllib
import urllib.request
import xlwt
import requests
import re
from bs4 import BeautifulSoup
import random

# Proxy address (host:port).
# NOTE(review): nothing in the visible code passes this to a request - confirm
# whether it is still needed.
proxy_addr = "122.241.72.191:808"

# Pool of User-Agent strings; one is picked at random for every request so the
# crawler looks like ordinary browser traffic.
header = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6',
    'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/44.0.2403.89 Chrome/44.0.2403.89 Safari/537.36',
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    # Closing parenthesis restored; the original entry was truncated.
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
    'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
]
          
def txt_csv(filename,csvname):
    """Convert the tab-separated crawl dump into a spreadsheet.

    Each line of ``filename`` becomes one row below a fixed header row, and
    each tab-separated field becomes one cell.

    Args:
        filename: Path of the UTF-8, tab-separated text file to read.
        csvname: Path the workbook is saved to. The workbook is produced with
            xlwt, so the saved file is an xls workbook whatever extension the
            path carries.

    Raises:
        OSError: If ``filename`` cannot be opened (e.g. it does not exist).
    """
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet('sheet1', cell_overwrite_ok=True)

    # Header row: page number, index within the page, user name, post text,
    # link, publication time.
    titles = ('爬取页数', '爬取当前页数的条数', '用户名', '微博内容', '链接', '微博发布时间')
    for col, title in enumerate(titles):
        sheet.write(0, col, title)

    with open(filename, 'r', encoding='utf-8') as f:
        # Data rows start at 1 because row 0 holds the titles.
        for row, line in enumerate(f, start=1):
            # Drop the line terminator so it does not end up inside the last
            # cell, and split only once per line.
            for col, item in enumerate(line.rstrip('\n').split('\t')):
                sheet.write(row, col, item)

    workbook.save(csvname)

# Seconds to wait for the server before giving up on a request.
_REQUEST_TIMEOUT = 10

# Characters removed from scraped text: spaces and newlines as before, plus
# tabs and carriage returns, which would corrupt the tab-separated output.
_STRIP_RE = re.compile(r"[ \t\r\n]+")


def _fetch_soup(url, header):
    """Download ``url`` with a random User-Agent and return the parsed page."""
    headers = {
        "User-Agent": random.choice(header),
        # `cookie值` is a placeholder for the logged-in Weibo cookie string. It
        # is not defined in the visible part of this file and must exist at
        # module level before the crawler runs.
        "cookie": cookie值,
    }
    resp = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
    # Decode explicitly as UTF-8 instead of relying on requests' guess.
    html = resp.content.decode("utf-8")
    return BeautifulSoup(html, 'html.parser')


def _parse_post(card):
    """Extract (nick name, text, link, time) from one ``div.content`` card."""
    nick_name = card.find("p", {"class": "txt"})["nick-name"]

    body = card.find("p", {"class": "txt", "node-type": "feed_list_content"})
    if "展开全文" in str(body):
        # Long posts are truncated; prefer the full-text node when present.
        full = card.find("p", {"class": "txt", "node-type": "feed_list_content_full"})
        if full is not None:
            body = full
    text = _STRIP_RE.sub("", body.text)

    from_tags = card.find_all("p", {"class": "from"})
    link = from_tags[0].find("a")["href"]
    # As before, the last `p.from` in the card supplies the time text.
    posted = _STRIP_RE.sub("", from_tags[-1].text)
    return nick_name, text, link, posted


def get_start_end_time(header,start,end,file):
    """Crawl one day of keyword search results hour by hour.

    For each hour 0..23 the search page is requested once to read the pager,
    then every result page is fetched and each post is appended to ``file`` as
    one tab-separated line: page, index in page, nick name, text, link, time.

    Args:
        header: Sequence of User-Agent strings to pick from at random.
        start: Start date string; ``-<hour>`` is appended to it.
        end: End date string; ``-<hour + 1>`` is appended to it.
        file: Path of the UTF-8 text file that records are appended to.

    Failures are reported with ``print`` and skipped: a failed first request
    skips the hour, a failed page skips that page, and a card that does not
    look like a post skips that card.
    """
    # `time` is not imported at module level, so import it once here rather
    # than on every loop iteration.
    import time

    base = "https://s.weibo.com/weibo/%25E7%2596%25AB%25E6%2583%2585?q=%E7%96%AB%E6%83%85&typeall=1&suball=1&timescope=custom:"

    for t1 in range(0, 24):
        print("==============================================================================================================")
        print("当前爬取"+str(t1)+"小时的数据:")
        starttime = start + "-" + str(t1)
        # NOTE(review): for t1 == 23 this yields "<end>-24"; confirm the site
        # accepts hour 24 rather than requiring the next day at hour 0.
        endtime = end + "-" + str(t1 + 1)
        url = base + starttime + ":" + endtime + "&Refer=g&page="

        try:
            soup = _fetch_soup(url, header)
        except Exception as e:
            print(e)
            continue

        pager = soup.find("div", {"class": "m-page"})
        # Presumably the pager is omitted when the results fit on one page;
        # fall back to a single page instead of raising AttributeError.
        num = len(pager.find_all("li")) if pager is not None else 1

        for i in range(1, num + 1):
            time.sleep(2)  # throttle so the crawler is not blocked
            try:
                # Single `page` parameter (the original sent "&page=&page=N").
                soup = _fetch_soup(url + str(i), header)
                with open(file, 'a', encoding='utf-8') as fh:
                    for j, card in enumerate(soup.find_all("div", {"class": "content"}), start=1):
                        print("-----正在爬取第"+str(i)+"页，第"+str(j)+"条微博------")
                        try:
                            nick_name, text, link, posted = _parse_post(card)
                        except (AttributeError, KeyError, TypeError, IndexError) as e:
                            # Not every `div.content` is a complete post; skip
                            # the card instead of dropping the whole page.
                            print(e)
                            continue
                        print(posted)
                        fh.write(str(i)+'\t'+str(j)+'\t'+str(nick_name)+'\t'+str(text)+'\t'+str(link)+'\t'+str(posted)+'\n')
                        print("保存第"+str(i)+"页，第"+str(j)+"条微博------")
                # Short pause after each page to limit load on the server.
                time.sleep(1)
            except Exception as e:
                # Best effort: report and move on to the next page. The
                # original never advanced `i` here, so one bad page was
                # retried forever and its rows were appended again each time.
                print(e)


if __name__=="__main__":
    # Crawl a single day (March <day>, 2020), convert the dump, then delete it.
    day = 20
    print("当前爬取的是3月"+str(day)+"日的数据")
    print("==============================================================================")
    start = "2020-03-"+str(day)
    end = "2020-03-"+str(day)

    # One name for the intermediate dump, shared by the crawler, the converter
    # and the cleanup. The original passed "020年..." (typo) to the crawler, so
    # the conversion step could not find the file.
    filename = "2020年3月"+str(day)+"日数据.txt"
    # NOTE(review): txt_csv saves an xlwt workbook, so this file holds xls
    # data despite the .csv extension - confirm the intended extension.
    csvname = "2020年3月"+str(day)+"日数据.csv"

    get_start_end_time(header,start,end,filename)

    # The dump only exists if at least one post was saved.
    if os.path.exists(filename):
        txt_csv(filename,csvname)
        os.remove(filename)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143

声明：本文内容由网友自发贡献，版权归原作者所有，本站不承担相应法律责任。如您发现有侵权的内容，请联系我们。转载请注明出处：【wpsshop博客】