```python
import requests
import re
import json
import datetime
from bs4 import BeautifulSoup
from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.triggers.interval import IntervalTrigger
from apscheduler.schedulers.background import BackgroundScheduler
import pandas as pd
from snownlp import SnowNLP
import numpy as np
from IPython.display import display  # display() is built into Jupyter; the explicit import also works outside it

base_url = ''  # the site to crawl
cookie = ''    # your own browser cookie
headers = {
    'User-Agent': '',  # your own browser's User-Agent
    'Cookie': cookie,
    'Accept-Encoding': 'gzip, deflate, br',  # note: requests can only decode 'br' if the brotli package is installed
}
```
```python
class spider():
    def __init__(self, base_url, cookie, headers):
        self.base_url = base_url
        self.cookie = cookie
        self.headers = headers

    def web_crawler(self):  # the crawler itself
        response = requests.get(self.base_url, headers=self.headers)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        data = response.text
        soup = str(BeautifulSoup(data, 'lxml'))  # parse, then work on the string form
        data1 = re.compile('target="_blank">(.+)</a>')
        data2 = data1.findall(soup)[1:-2]
        print(datetime.datetime.now(), len(data2))
        print(data2)
        print("******************************************************************")
        file = open('wb_result.txt', 'a', encoding='utf-8')
        for i in data2:
            file.write(str(datetime.datetime.now())[:19] + ",")
            file.write(i + "\n")
        # close the file
        file.close()

    def Timed_crawling(self):  # scheduled crawling; just change the times below
        scheduler = BlockingScheduler(timezone='Asia/Shanghai')
        scheduler.add_job(self.web_crawler, 'interval', seconds=900,
                          start_date='2023-04-16 12:17:00', end_date='2023-04-16 12:18:00')
        # scheduler.remove_job(0)
        scheduler.start()
        # scheduler.shutdown(wait=False)

    def data(self):  # read the crawled data back in
        df = pd.read_csv("wb_result.txt", sep=",", names=["time", "hot_word"])
        return df

    def Sentiment_analysis(self, df):  # sentiment analysis
        E_word = list(set(df["hot_word"]))
        E_result = {}
        for i in E_word:
            E_result[i] = SnowNLP(i).sentiments
        E_result = pd.Series(E_result)
        Most_negative = E_result.sort_values(ascending=False)[-3:].reset_index()
        most_positive = E_result.sort_values(ascending=False)[:3].reset_index()
        Most_negative.columns = ["Most_negative_hotword", "scores"]
        Most_negative = Most_negative.sort_values(by=['scores'], ascending=True)
        most_positive.columns = ["most_positive_hotword", "scores"]
        Most_negative.index = ["第一名", "第二名", "第三名"]
        most_positive.index = ["第一名", "第二名", "第三名"]
        print("最正面的3条和最负面的3条热搜如下")  # the 3 most positive and 3 most negative hot searches
        display(pd.concat([Most_negative, most_positive], axis=1, join='inner'))

    def Hot_search_queries(self, df):  # look up the hot-search list at each crawl time
        hot_search_statistics = pd.DataFrame()

        for i in list(set(df.time)):
            hot = df[df["time"] == i].hot_word
            hot = pd.DataFrame(hot.values, columns=[i])
            hot_search_statistics = pd.concat([hot_search_statistics, hot], axis=1)

        hot_search_statistics = hot_search_statistics.sort_index(axis=1)
        print("历史某节点热搜榜单:\n -----------------")  # the hot-search ranking at each historical time point
        hot_search_statistics.index = hot_search_statistics.index.values + 1
        hot_search_statistics.index.name = "rank"
        display(hot_search_statistics)

    def length_on_list(self, df):  # how long each hot word stayed on the list
        length_on_list_total = {}
        for t in list(set(df.hot_word)):
            L = df[df["hot_word"] == t].time.to_list()
            i = 0  # start from the first interval (starting at 1 would skip it)
            length_on_list = 0
            while i < len(L) - 1:
                end_time = datetime.datetime.strptime(L[i + 1], "%Y-%m-%d %H:%M:%S")
                start_time = datetime.datetime.strptime(L[i], "%Y-%m-%d %H:%M:%S")
                # two consecutive appearances exactly one crawl interval (900 s) apart
                if (end_time - start_time).seconds == 900:
                    length_on_list = length_on_list + 900
                i = i + 1
            if length_on_list == 0:
                length_on_list_total[t] = "小于15分钟"  # less than 15 minutes
            else:
                length_on_list_total[t] = length_on_list / 60

        print("在榜时长:\n-----------------")  # time on the list
        display(pd.DataFrame({"hot_word": length_on_list_total.keys(),
                              "on_list(min)": length_on_list_total.values()}))
```
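As a side note, `web_crawler` runs a regular expression over the stringified soup. If you would rather stay within BeautifulSoup, a roughly equivalent extraction could look like the sketch below. It assumes, just as the regex does, that the hot-search titles are the text of `<a target="_blank">` anchors, and it keeps the same `[1:-2]` slice to drop the surrounding non-hot-search links; treat it as an illustration rather than part of the original code.

```python
from bs4 import BeautifulSoup

def extract_hot_words(html):
    """Sketch: pull hot-search titles with BeautifulSoup instead of a regex.
    Assumes (like the original regex) that titles are <a target="_blank"> anchors."""
    soup = BeautifulSoup(html, 'lxml')
    anchors = soup.find_all('a', attrs={'target': '_blank'})
    # same trim as the original: drop the leading/trailing links that are not hot searches
    return [a.get_text(strip=True) for a in anchors][1:-2]
```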
```python
weibo_spider = spider(base_url, cookie, headers)
weibo_spider.Timed_crawling()
df = weibo_spider.data()
weibo_spider.Sentiment_analysis(df)
weibo_spider.Hot_search_queries(df)
weibo_spider.length_on_list(df)
```
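One thing to be aware of: `Timed_crawling` uses a `BlockingScheduler`, so this cell blocks until the scheduler stops and the statistics calls only run afterwards. The `BackgroundScheduler` that is already imported would let the notebook keep running while the crawler fires in the background; a minimal sketch of that variant (not part of the original code, with a hypothetical helper name) might look like this:

```python
from apscheduler.schedulers.background import BackgroundScheduler

def timed_crawling_background(sp, seconds=900):
    """Sketch: non-blocking alternative to Timed_crawling."""
    scheduler = BackgroundScheduler(timezone='Asia/Shanghai')
    # add start_date/end_date here if you want a fixed window, as in Timed_crawling
    scheduler.add_job(sp.web_crawler, 'interval', seconds=seconds)
    scheduler.start()
    return scheduler  # keep the reference so you can stop it later
```

After the crawling window you care about, call `scheduler.shutdown()` before reading `wb_result.txt` with `data()`.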
This code can be run in a Jupyter notebook. If you only want to get the crawler itself working, you can delete the three calls weibo_spider.Sentiment_analysis(df), weibo_spider.Hot_search_queries(df) and weibo_spider.length_on_list(df), since they are only there for the statistics.
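For that crawler-only case, a minimal sketch is to skip the scheduler entirely and call `web_crawler` once (assuming `base_url`, `cookie` and `headers` above have been filled in):

```python
weibo_spider = spider(base_url, cookie, headers)
weibo_spider.web_crawler()           # fetch one snapshot and append it to wb_result.txt
print(weibo_spider.data().head())    # optional: check what was written
```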