赞
踩
参考:https://blog.csdn.net/weixin_42555080/article/details/88363040
从经验来讲,爬取难度:微博网页端>手机端,参考 Blessy_Zhu 提出的方法,这里对微博移动端:https://m.weibo.cn 进行爬取。
单从界面上来讲就能看出爬取的难度了。下面选择一条感兴趣的微博,我选择的链接为:https://weibo.cn/comment/JcgPYxrNf?uid=1713926427
右键检查,进入开发者工具,选择 network 面板,我们就得到了需要的信息
import re
import time


def get_one_page(url):
    """Fetch one page of weibo.cn comment HTML.

    Returns the response body (str) on HTTP 200, otherwise None.
    """
    # Local import: lets parse_one_page() be used without requests installed.
    import requests

    # Request headers: User-agent, Cookie, etc. (replace '你的cookie' with a
    # real logged-in cookie, otherwise weibo.cn will refuse the request).
    headers = {
        'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
        'Host': 'weibo.cn',
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Cookie': '你的cookie',
        'DNT': '1',
        'Connection': 'keep-alive',
    }
    # NOTE(review): verify=False disables TLS certificate checking — kept from
    # the original, but consider removing it.
    response = requests.get(url, headers=headers, verify=False)
    if response.status_code == 200:  # 200 means the fetch succeeded
        return response.text
    return None


def parse_one_page(html):
    """Extract every <span class="ctt">…</span> fragment from *html*.

    Appends the stringified list to test.txt and returns the list of matched
    fragments (the original returned None; returning them is backward
    compatible and makes the function testable).
    """
    pattern = re.compile(r'<span class="ctt">.*?</span>', re.S)
    items = re.findall(pattern, html)
    with open('test.txt', 'a', encoding='utf-8') as fp:
        fp.write(str(items))
    return items


def main():
    """Crawl the comment pages of one weibo post (capped at 50 pages)."""
    for i in range(50):
        # Pages are 1-based on weibo.cn; the original requested page=i while
        # printing page i+1, and had a doubled '&&' in the query string.
        url = ("https://weibo.cn/comment/JcgPYxrNf"
               "?uid=1713926427&rl=0&page=" + str(i + 1))
        html = get_one_page(url)
        print(html)
        print('正在爬取第 %d 页评论' % (i + 1))
        if html is not None:  # skip parsing when the request failed
            parse_one_page(html)
        time.sleep(3)  # throttle so weibo does not block the crawler


if __name__ == "__main__":
    main()
test.txt内容:
下面要从爬取到的内容解析出评论,并把它们存入数据库
import re import pandas as pd import pymysql import emoji # 连接数据库 conn = pymysql.connect( host='127.0.0.1', port=3306, user='root', passwd='数据库密码', db='weibo', # 数据库名 charset='utf8', ) cursor = conn.cursor() with open("/Users/guo/Desktop/爬虫/微博爬取评论数据_新参数/test.txt", "r") as f: # 打开文件 content = f.read() # 读取文件 rawResults = re.findall(">.*?<",content,re.S) firstStepResults = [] for result in rawResults: #print(result) if ">\'][\'<" in result: continue if ">:<" in result: continue if "><" in result: continue if ">回复<" in result: continue if "><" in result: continue if ">\', \'<" in result: continue if "@" in result: continue if "> <" in result: continue else: result = emoji.demojize(result) # 去除评论中的表情包 a = re.findall('[\u4e00-\u9fa5a-zA-Z0-9]+', result, re.S) # 只要字符串中的中文,字母,数字 a = "".join(a) firstStepResults.append(a) subTextHead = re.compile(">") subTextFoot = re.compile("<") i = 0 for lastResult in firstStepResults: resultExcel1 = re.sub(subTextHead, '', lastResult) resultExcel = re.sub(subTextFoot, '', resultExcel1) sql = "insert into pinglun1(pinglun) values(%s)" # pinglun1:表明 pinglun:列名 cursor.execute(sql, resultExcel) with open('result.txt', 'a') as f: # 'a'表示append,即在原来文件内容后继续写数据(不清楚原有数据) f.write(resultExcel) print(i,resultExcel) i+=1 f.close() cursor.close() conn.commit() conn.close()
注意,评论中会出现表情,存入数据库时会出现问题,用以下方法过滤:
result = emoji.demojize(result) # 去除评论中的表情包
数据库的操作是:创建一张表 pinglun1,定义列名 pinglun
运行结果:
result.txt:
def readmysql():
    """Load every row of table pinglun1 and return them as a list of lists."""
    # Connect to the local MySQL server (database: weibo).
    conn = pymysql.connect(host='127.0.0.1',
                           user='root',
                           password='密码',
                           db='weibo',
                           charset="utf8")
    with conn:
        cur = conn.cursor()
        cur.execute("SELECT * FROM pinglun1")
        # fetchall() yields tuples; convert each row to a list before returning.
        return [list(row) for row in cur.fetchall()]
使用fetchall()提取出来的数据是元组,我们用list(row)转为列表形式
def snowanalysis(textlist):
    """Score each comment's sentiment with SnowNLP and plot the distribution.

    A score above 0.5 leans positive, below 0.5 leans negative.
    """
    # str() is required: SnowNLP expects text, not the list rows from the DB.
    sentimentslist = [SnowNLP(str(entry)).sentiments for entry in textlist]
    plt.figure("sentiment")
    plt.hist(sentimentslist, bins=np.arange(0, 1, 0.02))
    plt.show()
显示结果:
大部分的评论还是比较偏向中立的,并且积极的评论也要多于消极评论,说明在2020的上半年,尽管经历了疫情、洪水等一系列的考验,但是大家还是对国家充满信心,对未来充满希望。
def word2cloud(textlist):
    """Render a word cloud of all comments, colored from a background image.

    Segments each comment with jieba, draws the cloud masked and colored by
    "bjt.jpg", saves it to '评论词云.png' and shows it on screen.
    """
    back_coloring = imread("bjt.jpg")  # mask + color source for the cloud
    cloud = WordCloud(
        font_path='/System/Library/Fonts/Supplemental/Arial Unicode.ttf',
        background_color="white",
        max_words=2000,
        mask=back_coloring,
        max_font_size=100,
        random_state=42,
        width=1000, height=860, margin=2)
    # Join the segmented comments with spaces. The original concatenated the
    # per-comment strings with no separator, fusing the last word of one
    # comment onto the first word of the next; the unused isCN local is gone.
    fulltext = ' '.join(
        ' '.join(jieba.cut(str(li), cut_all=False)) for li in textlist)
    wc = cloud.generate(fulltext)
    image_colors = ImageColorGenerator(back_coloring)
    plt.figure("wordc")
    plt.imshow(wc.recolor(color_func=image_colors))
    wc.to_file('评论词云.png')
    image_produce = cloud.to_image()
    image_produce.show()
运行结果:
参考
[1]: https://blog.csdn.net/weixin_42555080/article/details/88363040
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。