当前位置:   article > 正文

网易云音乐评论爬取、情感分析一体化_基于音乐评论数据的情感分析

基于音乐评论数据的情感分析

开局一张图(原文此处为配图,占位文字:"在这里插入图片描述")

网易云音乐的评论区诞生了许多励志鸡汤、伤感流行句式和微甜情话,今天我们就把这些评论爬取下来,做一次情感分析,看个究竟。话不多说,直接上干货。

导入包、相关库

import requests
import math
import random
from Crypto.Cipher import AES
import codecs
import base64
import tkinter
from snownlp import SnowNLP
import matplotlib.pyplot as plt
import numpy as np
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10

获取窗体内输入的内容

def go():
    """Copy the current text of ``entry1`` into the global ``song_id`` and echo it."""
    global song_id
    song_id = str(entry1.get())
    # Echo the captured value on the console so the user can confirm it.
    print(song_id)
def go1():
    """Copy the current text of ``entry2`` into the global ``path2`` and echo it."""
    global path2
    path2 = str(entry2.get())
    # Echo the captured value on the console so the user can confirm it.
    print(path2)
def go2():
    """Copy the current text of ``entry3`` into the global ``file_name`` and echo it."""
    global file_name
    file_name = str(entry3.get())
    # Echo the captured value on the console so the user can confirm it.
    print(file_name)
def go3():
    """Copy the current text of ``entry4`` into the global ``cleaning_file`` and echo it."""
    global cleaning_file
    cleaning_file = str(entry4.get())
    # Echo the captured value on the console so the user can confirm it.
    print(cleaning_file)
def go4():
    """Copy the current text of ``entry5`` into the global ``cleaned_file`` and echo it."""
    global cleaned_file
    cleaned_file = str(entry5.get())
    # Echo the captured value on the console so the user can confirm it.
    print(cleaned_file)
def go5():
    """Copy the current text of ``entry6`` into the global ``analysis_path`` and echo it."""
    global analysis_path
    analysis_path = str(entry6.get())
    # Echo the captured value on the console so the user can confirm it.
    print(analysis_path)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30

设置窗体大小、标题

# Create the top-level tkinter window that hosts the input form.
win = tkinter.Tk()
# Initial window size: 500x500 pixels.
win.geometry('500x500')
# Title-bar text (runtime string, intentionally left in Chinese).
win.title('网易云音乐情感分析')

  • 1
  • 2
  • 3
  • 4

创建窗体的按钮,附带提示信息如输入规范

# Each input is an Entry box paired with a Button; the button's ``command``
# is the callback that copies the box's text into a module-level global.
# Widgets appear top-to-bottom in the order they are packed.

# Song id.
entry1 = tkinter.Entry(win, width=50, fg="black")
entry1.pack()
button = tkinter.Button(win, text="请输入id,输入后请点击我", command=go)
button.pack()

# Output directory. The backslash in the label is written as "\\" so that it
# is a literal backslash; a bare "\结" is an invalid escape sequence and
# triggers a warning on recent Python versions. The displayed text is the same.
button1 = tkinter.Button(win, text="请输入存储文件的路径(以\\结束,路径就可以,下一个输入框输入名称),输入后请点击我", command=go1, bg='yellow')
entry2 = tkinter.Entry(win, width=50, fg="black", bg='yellow')
entry2.pack()
button1.pack()

# Output file name (".txt" is appended later by the script).
entry3 = tkinter.Entry(win, width=50, fg="black", bg='green')
entry3.pack()
button2 = tkinter.Button(win, text="请输入存储文件的名称(名称就可以,我们自动为您生成txt文件),输入后请点击我", command=go2, bg='green')
button2.pack()

# Path of the file to clean.
entry4 = tkinter.Entry(win, width=50, fg="black", bg='gray')
entry4.pack()
button3 = tkinter.Button(win, text="请输入将要清洗文件的路径(具体到格式),输入后请点击我", command=go3, bg='gray')
button3.pack()

# Path where the cleaned file is written.
entry5 = tkinter.Entry(win, width=50, fg="black", bg='pink')
entry5.pack()
button4 = tkinter.Button(win, text="请输入清洗完毕文件的路径(具体到格式),输入后请点击我", command=go4, bg='pink')
button4.pack()

# Path of the file to run sentiment analysis on.
entry6 = tkinter.Entry(win, width=50, fg="black", bg='orange')
entry6.pack()
button5 = tkinter.Button(win, text="请输入进行情感分析文件的路径(具体到格式),输入后请点击我", command=go5, bg='orange')
button5.pack()


def _submit_all():
    """Capture every entry box, then close the window.

    The rest of the script reads the globals set by go()..go5(). Capturing
    them all here means a user who skipped one of the per-field buttons does
    not hit a NameError after the window closes.
    """
    for capture in (go, go1, go2, go3, go4, go5):
        capture()
    win.destroy()


# Keep the widget and the pack() call separate: pack() returns None, so
# chaining them would bind ``button6`` to None instead of the Button.
button6 = tkinter.Button(win, text='全部输入完毕请点击我', command=_submit_all)
button6.pack()
# Blocks until the window is destroyed; the script continues afterwards.
win.mainloop()

  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34

定义函数,获取歌曲评论的 JSON 数据

def get_comments_json(url, data):
    """POST the encrypted form ``data`` to ``url`` and return the decoded JSON.

    Args:
        url: Endpoint to post to.
        data: Form fields sent as the request body (the caller passes a dict
            with ``params`` and ``encSecKey``).

    Returns:
        The parsed JSON body when the server answers with HTTP 200;
        ``None`` on any other status, on a network error, or when the body
        is not valid JSON.
    """
    # Cookie captured from the original author's browser session.
    # NOTE: Province/City are region codes (the author's example: 0530/0531
    # for Jinan, Shandong); the author recommends substituting the values
    # from your own session.
    # NOTE(review): the WM_NI value was split across two lines in the
    # original listing and is rejoined here without a separator - confirm
    # against a real browser cookie.
    cookie = '; '.join([
        'Province=****',
        'City=****',
        '_ntes_nnid=861aff69ca0c1a71635a5ed2a0243acb,1544690244162',
        '_ntes_nuid=861aff69ca0c1a71635a5ed2a0243acb',
        'UM_distinctid=167a6b6a0a83f1-00127701049a17-335a497c-e1000-167a6b6a0a9517',
        'usertrack=ezq0o1wSGjl5etI0BD5JAg==',
        'vjuids=2a1d704ac.167a6b6cb26.0.b0103dac2188b',
        'vjlast=1544690257.1544690257.30',
        'nteslogger_exit_time=1544692086080',
        'vinfo_n_f_l_n3=8a31831dccfd73bf.1.0.1544690256693.0.1544692154822',
        'JSESSIONID-WYYY=HYgCofY5xb%5Cbn0UObOx4nvEqF1Akb3e%2Fh%2FzcPVbhWyj1KaJZnTusNDfyT5mWEBuSWSJ9uNs5G%2BTpVkenwYj1V7CpefhlP9FP6RtFWxFrbWIbsKPMFQo8lV58%2FrH%2BsHf42oU20b1lqMfoHApESJqjCDM9Mtgs2WRkXWs4Qbb4WTmcIipY%3A1545471673818',
        '_iuqxldmzr_=32',
        '__utma=94650624.195620955.1545469875.1545469875.1545469875.1',
        '__utmc=94650624',
        '__utmz=94650624.1545469875.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic',
        'WM_NI=HehEClh%2F%2FQUZj98wglZfRgNpbsu1q9m2HxBPcS9UkOXXysR7gOXojWNn82ueE5kAzm4tLz3eUvdfIZTqY5%2BVheKLttjo3RnK9Bho7dWiyA6FIqm7%2BVm5tA61RUEIYGa%2BQ3k%3D',
        'WM_NIKE=9ca17ae2e6ffcda170e2e6eeb4d7639bb9f7bad16ea7b88fa7c54f879f9a85bc5cb686a2b5d364bab287bac72af0fea7c3b92a8cbba8a6c268acf1e1b2d852aae7f898e6689aba9e88cd7ca2adbbd2f433afee8899c15ca18df7b6ea459c88b794d13da5919890e95d8eb2f8d3f86f87eba5a2f967f8ac849bb26f97aaac87cc5298af97d6aa5eacbb85adf780aab1fdb3ed41fb9ea890b67095b7b7a2b54e8bafa3d8aa5eafebbca2b53b928b8baadc4da3f59fd4ea37e2a3',
        'WM_TID=U0nsAG4m95hEAFVVFVZ8fqJzHf1jkqZC',
        '__utmb=94650624.7.10.1545469875',
    ])
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Cookie': cookie,
        'Host': 'music.163.com',
        'Referer': 'http://music.163.com/',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/66.0.3359.181 Safari/537.36',
    }
    try:
        # A timeout keeps an unresponsive server from hanging the crawl.
        response = requests.post(url, headers=headers, data=data, timeout=10)
        response.encoding = "utf-8"
        if response.status_code == 200:
            return response.json()
    except (requests.RequestException, ValueError):
        # RequestException: connection/timeout/HTTP-layer failures.
        # ValueError: the body could not be decoded as JSON.
        print("爬取失败!")
    return None

  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22

生成指定长度的随机字符串(本文取16位)

def generate_random_strs(length):
    """Return ``length`` characters drawn at random from ``[a-zA-Z0-9]``."""
    alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
    picked = []
    for _ in range(length):
        # random.random() lies in [0, 1), so the floored product is always a
        # valid index into the alphabet.
        position = math.floor(random.random() * len(alphabet))
        picked.append(alphabet[position])
    return "".join(picked)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13

AES加密

def AESencrypt(msg, key):
    """Encrypt ``msg`` with AES-CBC and return the ciphertext as Base64 text.

    Args:
        msg: Plaintext, as ``str`` (encoded to UTF-8) or ``bytes``.
        key: AES key, as ``str`` (encoded to UTF-8) or ``bytes``; the callers
            in this file pass 16-character strings.

    Returns:
        The Base64-encoded ciphertext as a ``str``.
    """
    # The cipher works on bytes; on Python 3 a str key/iv/plaintext is
    # rejected, so convert text arguments up front.
    if isinstance(msg, str):
        msg = msg.encode('utf-8')
    if isinstance(key, str):
        key = key.encode('utf-8')
    # Pad to a multiple of the 16-byte block size. Each pad byte holds the
    # pad length; a full extra block is added when msg is already aligned.
    padding = 16 - len(msg) % 16
    msg += bytes([padding]) * padding
    # Fixed 16-byte initialisation vector.
    iv = b'0102030405060708'
    cipher = AES.new(key, AES.MODE_CBC, iv)
    encrypted = cipher.encrypt(msg)
    return base64.b64encode(encrypted).decode('utf-8')
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17

RSA加密

def RSAencrypt(randomstrs, key, f):
    """Raise the reversed ``randomstrs`` to ``key`` modulo ``f`` and return hex.

    Args:
        randomstrs: Text to encrypt; it is reversed and its UTF-8 bytes are
            read as one big-endian integer.
        key: Exponent, as a hexadecimal string.
        f: Modulus, as a hexadecimal string.

    Returns:
        The result as lowercase hex, left-padded with zeros to 256 characters.
    """
    reversed_bytes = randomstrs[::-1].encode('utf-8')
    base = int.from_bytes(reversed_bytes, 'big')
    # Three-argument pow() reduces modulo f at every step, so it never builds
    # the enormous intermediate that ``base ** e % f`` would.
    seckey = pow(base, int(key, 16), int(f, 16))
    return format(seckey, 'x').zfill(256)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7

获取参数

def get_params(page):
    """Build the two encrypted form fields for one page of comments.

    Args:
        page: 1-based page number; each page covers 20 comments.

    Returns:
        A tuple ``(encText, encSecKey)``: the doubly AES-encrypted payload
        (sent as ``params``) and the RSA-encrypted random key.
    """
    # Notes carried over from the original author (not verified here):
    # ``offset`` and ``limit`` are mandatory, other fields are optional;
    # ``limit`` may go up to 100, in which case page 2 overlaps page 1.
    # Example of a fuller payload:
    # '{"rid":"R_SO_4_1302938992","offset":"0","total":"True","limit":"100","csrf_token":""}'
    fixed_key = '0CoJUm6Qyw8W8jud'
    modulus_hex = '00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7'
    exponent_hex = '010001'

    # Page offset: (page - 1) * page size.
    start = (page - 1) * 20
    payload = '{"offset":' + str(start) + ',"total":"True","limit":"20","csrf_token":""}'

    # First AES pass uses the fixed key ...
    inner = AESencrypt(payload, fixed_key)
    # ... the second pass uses a fresh 16-character random key.
    random_key = generate_random_strs(16)
    encText = AESencrypt(inner, random_key)
    # The random key itself is sent RSA-encrypted as encSecKey.
    encSecKey = RSAencrypt(random_key, exponent_hex, modulus_hex)
    return encText, encSecKey
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19

最主要的部分来了

def comments(html, songname, i, pages, total, filepath):
    """Append the text of every comment on one page to ``filepath``.

    Args:
        html: Decoded JSON for the page; its ``'comments'`` list holds dicts
            with a ``'content'`` string. ``None`` or a payload without
            comments is treated as an empty page.
        songname: Unused; kept so existing callers keep working.
        i: Current page number, used only for the progress line.
        pages: Total number of pages, used only for the progress line.
        total: Unused; kept so existing callers keep working.
        filepath: UTF-8 text file to append to, one comment per line.
    """
    print("{}/{}\n".format(i, pages))
    page_items = (html or {}).get('comments') or []
    if not page_items:
        # Nothing to write; do not create or touch the output file.
        return
    # Open the file once per page instead of once per comment; the ``with``
    # block closes it, so no explicit close() is needed.
    with open(filepath, 'a', encoding='utf-8') as f:
        f.writelines(item['content'] + '\n' for item in page_items)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16

歌曲id号

# Song id and output directory prefix, as captured by the input form.
songid, filepath = song_id, path2
  • 1
  • 2

歌曲名字

# Base name of the output file, as captured by the input form.
songname = file_name
# Echo the three inputs, one per line, as a sanity check.
print(songid, filepath, songname, sep='\n')
  • 1
  • 2
  • 3
  • 4

文件存储路径

# Full output path: directory prefix + base name + ".txt". The prefix is
# expected to already end with a path separator.
filepath += songname + ".txt"
# Start with page 1 and build its encrypted request fields.
page = 1
params, encSecKey = get_params(page)
  • 1
  • 2
  • 3

获取第一页评论

# Comment endpoint for the chosen song.
url = 'https://music.163.com/weapi/v1/resource/comments/R_SO_4_' + str(songid) + '?csrf_token='
data = {'params': params, 'encSecKey': encSecKey}
# Fetch the first page; it also reports the total number of comments.
html = get_comments_json(url, data)
if not html or 'total' not in html:
    # get_comments_json() returns None on failure, and an error payload has
    # no 'total'. Stop with a clear message rather than an opaque
    # TypeError/KeyError further down.
    raise SystemExit("获取第一页评论失败,无法继续!")
# Total number of comments.
total = html['total']
# Number of pages at 20 comments per page.
pages = math.ceil(total / 20)
comments(html, songname, page, pages, total, filepath)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11

获取全部评论

# Page 1 was fetched above; walk the remaining pages.
for page in range(2, pages + 1):
    params, encSecKey = get_params(page)
    data = {'params': params, 'encSecKey': encSecKey}
    html = get_comments_json(url, data)
    if not html or 'comments' not in html:
        # One failed or malformed page should not abort the whole crawl;
        # skip it and carry on with the next page.
        continue
    comments(html, songname, page, pages, total, filepath)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8

数据预处理

# Copy the raw comment file to the cleaned file, dropping blank lines.
filepath1 = cleaning_file
filepath2 = cleaned_file
print("转换中......")
# ``with`` closes both files even if reading or writing fails.
# NOTE: filepath1 and filepath2 must differ - opening the output in 'w' mode
# truncates it immediately.
with open(filepath1, 'r', encoding='utf-8') as file1, \
        open(filepath2, 'w', encoding='utf-8') as file2:
    for line in file1:
        # A line that is only a newline is blank; skip it.
        if line != '\n':
            file2.write(line)
# Reported only after a successful run.
print("执行完毕!")
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14

情感分析SnowNLP及可视化

# Load the comments to analyse, one per line, dropping exact duplicates while
# keeping first-seen order. The trailing newline is stripped *before* the
# duplicate check; comparing the raw line against already-stripped entries
# would never match.
with open(analysis_path, mode='r', encoding='utf-8') as f:
    # dict.fromkeys keeps insertion order and gives O(1) duplicate detection.
    comment = list(dict.fromkeys(row.strip('\n') for row in f))
def snowanalysis(self):
    """Score each comment with SnowNLP and plot the results.

    Shows a histogram of the raw sentiment scores, then a bar chart of how
    many comments are positive (score > 0.5) versus negative.

    Args:
        self: Iterable of comment strings. Despite the name this is a plain
            function argument, not an instance.
    """
    sentimentslist = []
    for li in self:
        print(li)
        # Read the score once and reuse it for both the log line and the list.
        score = SnowNLP(li).sentiments
        print(score)
        sentimentslist.append(score)

    # 101 edges give 100 equal bins over [0, 1]; the last bin is closed on the
    # right, so scores up to and including 1.0 are counted. Edges that stop at
    # 0.99 would drop every score above 0.99.
    plt.hist(sentimentslist, bins=np.linspace(0, 1, 101))
    plt.show()
    print(sentimentslist)

    # Binarise: 1 = positive (score > 0.5), -1 = negative.
    labels = [1 if value > 0.5 else -1 for value in sentimentslist]
    print(labels)

    positive = labels.count(1)
    negative = len(labels) - positive
    # Order matches the tick labels below: negative first, then positive.
    info = [negative, positive]
    print(info)
    info2 = ['negative', 'positive']
    plt.bar(info2, info, tick_label=info2, color='#2FC25B')
    plt.show()
# Score the comment list built above and display both charts.
snowanalysis(comment)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 注:本帖只用于学习交流,不得用于商业活动
声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/从前慢现在也慢/article/detail/409121
推荐阅读
相关标签
  

闽ICP备14008679号