情感分析(Sentiment Analysis)是自然语言处理(NLP)的一种常见应用,它是对带有情感色彩的主观性文本进行分析、处理、归纳和推理,并利用一些情感得分指标来量化定性数据的方法。在自然语言处理中,情感分析属于典型的文本分类问题,即把需要进行情感分析的文本划分到其所属的类别。文本情感分析的应用非常广泛,例如将用户在购物网站、旅游网站、电影评论网站上发表的评论分成正面评论和负面评论。
# coding=gbk
import pandas as pd
import jieba
import requests
import re
import time


# Crawl review comments
class Reptile:
    """Scrape Douban short comments and their rating labels into text files.

    Labels are appended to ``label.txt`` and comment texts to ``comment.txt``,
    one single-element Python list literal per line, UTF-8 encoded.
    """

    # Append y_train (rating labels) to label.txt
    def find1(self, data):
        """Append every scraped rating label in *data* to ``label.txt``.

        Each item is the raw text captured after ``title=`` in the page, i.e.
        the label still wrapped in its HTML attribute quotes.
        """
        with open("label.txt", "ab") as f:
            for raw in data:
                # The captured value is a quoted attribute; strip the quotes
                # directly instead of calling eval() on text that came from
                # the network.
                row = [raw.strip().strip("\"'")]
                print(row)
                f.write("{}\n".format(str(row)).encode('utf-8'))

    # Append X_train (comment texts) to comment.txt
    def find2(self, data):
        """Append every scraped comment text in *data* to ``comment.txt``."""
        with open("comment.txt", "ab") as f:
            for text in data:
                row = [text]
                print(row)
                f.write("{}\n".format(str(row)).encode('utf-8'))

    # Crawl y_train and X_train
    def create_website(self):
        """Fetch comment pages start=20..200 of ``url1`` and store the results.

        Sleeps 10 seconds after each page. Only ``url1`` is crawled; ``url3``
        and ``url5`` are defined at module level but not used here.
        """
        for start in range(20, 201, 20):
            # Replace only the paging offset. Replacing the bare substring
            # "20" would also rewrite "limit=20" in the query string.
            new_url = url1.replace("start=20", "start={}".format(start))
            # A timeout keeps a stalled connection from hanging the crawl.
            res = requests.get(new_url, headers=headers, timeout=30)
            # NOTE(review): the label pattern only matches the "allstar10"
            # class, while the comment pattern matches every short comment;
            # if a page contains other star classes the two files will not
            # line up one-to-one - confirm against the real page markup.
            label = re.findall('<span class="allstar10 rating" title=(.*?)></span>', res.text)
            comment = re.findall('<span class="short">(.*?)</span>', res.text)
            print(new_url)
            self.find1(label)
            self.find2(comment)
            time.sleep(10)


headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
}
url5 = "https://movie.douban.com/subject/1291546/comments?percent_type=h&start=20&limit=20&status=P&sort=new_score"
url3 = "https://movie.douban.com/subject/1291546/comments?percent_type=m&start=20&limit=20&status=P&sort=new_score"
url1 = "https://movie.douban.com/subject/1291546/comments?percent_type=l&start=20&limit=20&status=P&sort=new_score"

# Instantiate the crawler and run it
reptile = Reptile()
reptile.create_website()
# Data splitting
class Split_Data:
    """Tokenise the scraped comments with jieba and store the token lists."""

    def splitdata(self):
        """Tokenise every line of ``comment.txt`` and append it to ``lcut.txt``.

        Each input line is the text of a single-element list literal. The
        square brackets are removed, the remainder is cut with
        ``jieba.lcut``, and tokens that are a lone single quote (left over
        from the list literal) are dropped. The token list is appended to
        ``lcut.txt`` as one UTF-8 line.

        Returns:
            The token list of the last non-empty line, or an empty list when
            ``comment.txt`` contains no data.
        """
        lcut_list = []
        # Read the file line by line: pandas rejects sep='\n' in read_csv,
        # and a CSV parser is not needed for one-record-per-line text.
        # lcut.txt is opened in append mode, so repeated runs accumulate.
        with open("comment.txt", "r", encoding="utf-8") as src, \
                open("lcut.txt", "ab") as dst:
            for line in src:
                str_data = line.rstrip("\r\n")
                if not str_data:
                    continue
                # Remove "[" and "]"
                stripped = str_data.replace("[", "").replace("]", "")
                # Build a new list instead of removing items from the list
                # being iterated, which can skip neighbouring tokens.
                lcut_list = [tok for tok in jieba.lcut(stripped) if tok != "'"]
                dst.write("{}\n".format(str(lcut_list)).encode('utf-8'))
        return lcut_list


# Instantiate the splitter and run it
split_data = Split_Data()
split_data.splitdata()
# Naive Bayes prediction
class Predict:
    """Naive Bayes classifier over the tokenised comments and rating labels.

    Attributes set by ``__init__``:
        se: pandas Series holding one rating label per line of ``label.txt``.
        list_df: list of token lists, one per line of ``lcut.txt``.
    """

    # Load the data
    def __init__(self):
        """Read ``lcut.txt`` (token lists) and ``label.txt`` (labels)."""
        import ast

        with open("lcut.txt", "r", encoding='utf-8') as df_file:
            df = df_file.readlines()
        with open("label.txt", "r", encoding='utf-8') as train_file:
            train = train_file.readlines()

        # Every line holds a Python list literal. literal_eval parses it
        # without executing arbitrary code from the file, and strip() copes
        # with a final line that has no trailing newline.
        # y_train as a list
        labels = [ast.literal_eval(line.strip())[0] for line in train if line.strip()]
        self.se = pd.Series(labels)
        # X_train as a list of token lists
        # NOTE(review): nothing here checks that lcut.txt and label.txt have
        # the same number of rows; predict() pairs them by position.
        self.list_df = [ast.literal_eval(line.strip()) for line in df if line.strip()]

    # Build the vocabulary
    def creat_wordall(self, doucts):
        """Return a list of the distinct words found in all *doucts*."""
        word_all = set()
        for douct in doucts:
            word_all.update(douct)
        return list(word_all)

    # Build a word vector
    def create_wordVec(self, douct, all_words):
        """Return ``{word: 1 or 0}`` marking which *all_words* occur in *douct*."""
        present = set(douct)  # set lookup instead of scanning the list per word
        return {word: 1 if word in present else 0 for word in all_words}

    def _class_score(self, label, tokens):
        """Return prior(label) times the product of P(token | label)."""
        rows = self.df_data.loc[self.se[self.se == label].index, :]
        prior = len(rows) / len(self.df_data)
        word_counts = rows.sum()
        # Add-one (Laplace) smoothing: a word never seen with this label, or
        # a label with no documents, no longer yields a zero or NaN
        # likelihood.
        smoothed = (word_counts + 1) / (word_counts.sum() + len(word_counts))
        score = prior
        for token in tokens:
            score *= smoothed[token]
        return score

    # Naive Bayes prediction
    def predict(self, answer, tokenizer=None):
        """Classify *answer*, print the three class scores and the verdict.

        Args:
            answer: the review text to classify.
            tokenizer: optional callable mapping the text to a list of
                tokens; defaults to ``jieba.lcut``.

        Returns:
            "正面", "中性" or "负面" for the winning class, or ``None`` when no
            class has a strictly highest score.

        Raises:
            ValueError: if no training documents were loaded.
        """
        if tokenizer is None:
            tokenizer = jieba.lcut
        if not self.list_df:
            raise ValueError("no training documents were loaded from lcut.txt")

        docuts = self.creat_wordall(self.list_df)
        trainmatrix = [self.create_wordVec(doc, docuts) for doc in self.list_df]
        self.df_data = pd.DataFrame(trainmatrix)

        # Words outside the training vocabulary carry no information and
        # would raise KeyError when looked up, so they are skipped.
        known_words = set(docuts)
        test = [word for word in tokenizer(answer) if word in known_words]

        A_prod = self._class_score("力荐", test)
        print(A_prod)
        B_prod = self._class_score("还行", test)
        print(B_prod)
        C_prod = self._class_score("很差", test)
        print(C_prod)

        if A_prod > B_prod and A_prod > C_prod:
            print("你的评论属于:正面!")
            return "正面"
        if B_prod > A_prod and B_prod > C_prod:
            print("你的评论属于:中性!")
            return "中性"
        if C_prod > A_prod and C_prod > B_prod:
            print("你的评论属于:负面!")
            return "负面"
        # Tie between the best classes: say so instead of printing nothing.
        print("无法判断你的评论属于哪一类!")
        return None


# Program entry point
if __name__ == "__main__":
    answer = input("请输入你的影评:\n")
    # Instantiate the classifier
    pred = Predict()
    pred.predict(answer)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。