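# [web-page residue, not part of the script] 赞 ("like" button)
# [web-page residue, not part of the script] 踩 ("dislike" button)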
"""Dictionary-based sentiment scoring for Chinese text.

Each non-empty line of the input file is segmented with jieba, filtered
against a stop-word list (from which negation words and degree adverbs are
excluded so that they survive filtering), and scored with the BosonNLP
sentiment lexicon.  A positive score is labelled 积极, anything else 消极.
The text, score and label of every line are appended to the result file.
"""
import codecs
import os
import re
from collections import defaultdict
from functools import lru_cache

import jieba
import pandas as pd
from snownlp import SnowNLP

# NOTE: an earlier prototype (SnowNLP scoring exported to Excel through
# pandas, plus a simpler scorer that just summed BosonNLP word scores) used
# to live here as a disabled triple-quoted string.  It has been dropped; the
# imports it relied on are kept so nothing else depending on them breaks.

# Lexicon / resource files, resolved relative to the working directory.
STOPWORD_SOURCE_FILE = '停用词.txt'
NOT_WORD_FILE = '否定词.txt'
DEGREE_WORD_FILE = '程度副词.txt'
STOPWORD_FILE = 'stopwords.txt'
# os.path.join yields the same backslash-separated path on Windows as the
# original literal, without the invalid "\B" escape sequence it contained.
SENTIMENT_DICT_FILE = os.path.join('BosonNLP_sentiment_score',
                                   'BosonNLP_sentiment_score.txt')

# Input corpus (one text per line) and the report the driver appends to.
INPUT_FILE = 'C:/Users/24224/Desktop/1.txt'
RESULT_FILE = 'BosonNLP情感分析结果.txt'


def _read_lines(filename):
    """Return the lines of a UTF-8 text file with surrounding whitespace removed."""
    with open(filename, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f]


@lru_cache(maxsize=None)
def _load_not_words():
    """Load the negation words once; blank lines are ignored."""
    return frozenset(word for word in _read_lines(NOT_WORD_FILE) if word)


@lru_cache(maxsize=None)
def _load_degree_weights():
    """Load the degree adverbs once as ``{word: weight_string}``.

    Every usable line has the form ``word,weight``.  Lines without a comma
    carry no weight and are skipped.
    """
    weights = {}
    for line in _read_lines(DEGREE_WORD_FILE):
        fields = line.split(',')
        if len(fields) >= 2 and fields[0]:
            # The weight is the SECOND field; the original stored the word
            # itself here, which made float() fail during scoring.
            weights[fields[0].strip()] = fields[1].strip()
    return weights


@lru_cache(maxsize=None)
def _load_sentiment_dict():
    """Load the BosonNLP lexicon once as ``{word: score_string}``.

    Only lines made of exactly two space-separated fields are used.
    """
    scores = {}
    with open(SENTIMENT_DICT_FILE, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.split(' ')
            if len(fields) == 2:
                scores[fields[0]] = fields[1].strip()
    return scores


def build_stopword_file():
    """Write ``stopwords.txt``: the raw stop words minus negation/degree words.

    Negation words and degree adverbs must not be filtered out of a sentence,
    because they change the weight of the sentiment words that follow them.
    """
    not_words = _load_not_words()
    degree_words = _load_degree_weights()
    stopwords = set(_read_lines(STOPWORD_SOURCE_FILE))
    with open(STOPWORD_FILE, 'w', encoding='utf-8') as f:
        f.writelines(
            word + '\n'
            for word in stopwords
            if word not in not_words and word not in degree_words
        )
    # Drop any stop-word set cached from a previous version of the file.
    _load_stopwords.cache_clear()


@lru_cache(maxsize=None)
def _load_stopwords():
    """Load the generated stop-word list once, generating it if it is missing."""
    if not os.path.exists(STOPWORD_FILE):
        build_stopword_file()
    return frozenset(_read_lines(STOPWORD_FILE))


def seg_word(sentence: str) -> list:
    """Segment *sentence* with jieba and drop every stop word.

    Returns the remaining tokens in their original order.
    """
    stopwords = _load_stopwords()
    return [token for token in jieba.cut(sentence) if token not in stopwords]


def classify_words(word_list):
    """Split tokens into sentiment words, negation words and degree adverbs.

    Args:
        word_list: tokens as returned by :func:`seg_word`.

    Returns:
        A tuple ``(sen_word, not_word, degree_word)`` of dicts keyed by the
        token's position in *word_list*:

        * ``sen_word[i]``    -- lexicon score of the token (string),
        * ``not_word[i]``    -- always ``-1``,
        * ``degree_word[i]`` -- degree weight of the token (string).

        A token that appears in several lexicons is classified with the
        priority degree adverb > negation word > sentiment word.
    """
    sen_dict = _load_sentiment_dict()
    not_words = _load_not_words()
    degree_dict = _load_degree_weights()

    sen_word = {}
    not_word = {}
    degree_word = {}
    for position, word in enumerate(word_list):
        if word in degree_dict:
            degree_word[position] = degree_dict[word]
        elif word in not_words:
            not_word[position] = -1
        elif word in sen_dict:
            sen_word[position] = sen_dict[word]
    return sen_word, not_word, degree_word


def score_sentiment(sen_word, not_word, degree_word, seg_result):
    """Combine the classified tokens into a single sentiment score.

    Args:
        sen_word: ``{position: score}`` of sentiment words.
        not_word: ``{position: -1}`` of negation words.
        degree_word: ``{position: weight}`` of degree adverbs.
        seg_result: the token list the positions refer to.

    Returns:
        The sum of ``weight * score`` over all sentiment words (``0`` when
        there are none).  The weight starts at 1; after each sentiment word
        it is multiplied by -1 for every negation word and by the degree
        weight for every degree adverb found before the next sentiment word.

    Raises:
        ValueError: if a score or degree weight cannot be parsed as a float.
    """
    # NOTE(review): the weight is never reset, so a modifier keeps affecting
    # every later sentiment word, and modifiers located before the FIRST
    # sentiment word are ignored.  This mirrors the original algorithm and is
    # left unchanged on purpose -- confirm whether that is the intended model.
    weight = 1
    score = 0
    sentiment_positions = list(sen_word.keys())
    sentiment_index = -1
    last_index = len(sentiment_positions) - 1

    for i in range(len(seg_result)):
        if i not in sen_word:
            continue
        score += weight * float(sen_word[i])
        sentiment_index += 1
        if sentiment_index < last_index:
            # Scan the gap up to the next sentiment word for modifiers.
            gap = range(sentiment_positions[sentiment_index],
                        sentiment_positions[sentiment_index + 1])
            for j in gap:
                if j in not_word:
                    weight *= -1
                elif j in degree_word:
                    weight *= float(degree_word[j])
    return score


def sentiment_score(sentence: str):
    """Return the sentiment score of *sentence* (positive means 积极)."""
    seg_list = seg_word(sentence)
    sen_word, not_word, degree_word = classify_words(seg_list)
    return score_sentiment(sen_word, not_word, degree_word, seg_list)


def read_txt(filename):
    """Return the whole content of the UTF-8 text file *filename*."""
    with open(filename, 'r', encoding='utf-8') as f:
        return f.read()


def write_data(filename, data):
    """Append the string *data* to the UTF-8 text file *filename*."""
    with open(filename, 'a', encoding='utf-8') as f:
        f.write(data)


def main():
    """Score every non-empty line of the input file and append a report."""
    # Regenerate the filtered stop-word list on every run, as before.
    build_stopword_file()

    for line in read_txt(INPUT_FILE).split('\n'):
        if line == '':
            continue
        sentiments = sentiment_score(line)
        # A positive score means positive sentiment; anything else negative.
        print("情感值:", sentiments)
        print(line)
        if sentiments > 0:
            print("机器标注情感倾向:积极\n")
            label = "机器判断情感倾向:积极\n"
        else:
            print('机器标注情感倾向:消极\n')
            label = "机器判断情感倾向:消极" + '\n'

        # One append per record: header, text, score, label, blank line.
        write_data(
            RESULT_FILE,
            '情感分析文本:' + line + '\n'
            + '情感值:' + str(sentiments) + '\n'
            + label + '\n',
        )


if __name__ == '__main__':
    main()
# Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。