# Text-Based Sentiment Analysis (Part 2)

# LDA (gensim is used for the topic analysis at the end of this article)
from gensim import corpora, models, similarities
import re
import jieba
import codecs
from collections import defaultdict  # used to build the lexicon dictionaries
def seg_word(sentence):
    # Segment the sentence with jieba
    seg_list = jieba.cut(sentence)
    seg_result = []
    for word in seg_list:
        seg_result.append(word)
    # Load the stopword list
    stopwords = set()
    stopword = codecs.open('../data/stopwords.txt', 'r', encoding='utf-8')
    for word in stopword:
        stopwords.add(word.strip())
    stopword.close()
    # Keep only the words that are not stopwords
    return list(filter(lambda x: x not in stopwords, seg_result))
def sort_word(word_dict):
    # Load the BosonNLP sentiment lexicon
    sen_file = open('../data/BosonNLP_sentiment_score.txt', 'r+', encoding='utf-8')
    sen_list = sen_file.readlines()
    sen_dict = defaultdict()  # sentiment lexicon dictionary
    for s in sen_list:
        s = re.sub('\n', '', s)  # strip the trailing newline of each line
        if s:
            # key: sentiment word, value: its sentiment score
            sen_dict[s.split(' ')[0]] = s.split(' ')[1]
    # Load the negation-word list
    not_file = open('../data/否定词.txt', 'r+', encoding='utf-8')
    not_list = not_file.readlines()
    for i in range(len(not_list)):
        not_list[i] = re.sub('\n', '', not_list[i])
    # Load the degree-adverb lexicon
    degree_file = open('../data/程度副词(中文).txt', 'r+', encoding='utf-8')
    degree_list = degree_file.readlines()
    degree_dic = defaultdict()
    for d in degree_list:
        d = re.sub('\n', '', d)
        if d:
            degree_dic[d.split(' ')[0]] = d.split(' ')[1]
    sen_file.close()
    degree_file.close()
    not_file.close()
    sen_word = dict()
    not_word = dict()
    degree_word = dict()
    for word in word_dict.keys():
        if word in sen_dict.keys() and word not in not_list and word not in degree_dic.keys():
            sen_word[word_dict[word]] = sen_dict[word]  # segmented word found in the sentiment lexicon
        elif word in not_list and word not in degree_dic.keys():
            not_word[word_dict[word]] = -1  # segmented word found in the negation-word list
        elif word in degree_dic.keys():
            degree_word[word_dict[word]] = degree_dic[word]  # segmented word found in the degree-adverb lexicon
    return sen_word, not_word, degree_word  # return the three classified dictionaries
def list_to_dict(word_list):
    data = {}
    for x in range(0, len(word_list)):
        data[word_list[x]] = x  # map each word to its position in the segmented list
    return data
def socre_sentiment(sen_word, not_word, degree_word, seg_result):
    W = 1  # initial weight
    score = 0
    sentiment_index = -1  # index of the current sentiment word
    for i in range(0, len(seg_result)):
        if i in sen_word.keys():
            score += W * float(sen_word[i])
            sentiment_index += 1  # move on to the next sentiment word
    for j in range(len(seg_result)):
        if j in not_word.keys():
            score *= -1  # a negation word flips the sentiment
        elif j in degree_word.keys():
            score *= float(degree_word[j])  # a degree adverb scales the score
    return score
def setiment(sentence):
    # Segment the text and remove stopwords that carry no sentiment
    seg_list = seg_word(sentence)
    # Classify the segmented words into sentiment words, degree adverbs and negation words
    sen_word, not_word, degree_word = sort_word(list_to_dict(seg_list))
    # Compute and aggregate the sentiment score
    score = socre_sentiment(sen_word, not_word, degree_word, seg_list)
    return seg_list, sen_word, not_word, degree_word, score


if __name__ == '__main__':
    print(setiment('我今天特别开心'))
    print(setiment('我今天很开心、非常兴奋'))
    print(setiment('我昨天开心,今天不开心'))
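To make the data flow of the listing above concrete, here is a small hand-built trace of how the position-keyed dictionaries drive socre_sentiment. The segmentation result and the lexicon score for '开心' are hypothetical values chosen for illustration, not taken from the Boson file.

# Hand-built trace of the scoring step (all values below are illustrative)
seg_result = ['今天', '不', '开心']          # what seg_word might return
word_dict = {'今天': 0, '不': 1, '开心': 2}   # list_to_dict maps each word to its position
sen_word = {2: '1.5'}                        # position 2 ('开心') found in the sentiment lexicon (hypothetical score)
not_word = {1: -1}                           # position 1 ('不') found in the negation list
degree_word = {}                             # no degree adverbs in this sentence
print(socre_sentiment(sen_word, not_word, degree_word, seg_result))  # 1.5, flipped by the negation word to -1.5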
import nltk.classify as cf
import nltk.classify.util as cu
import jieba
def setiment(sentences):
    # Turn the review texts into feature dictionaries
    pos_data = []
    with open('../data/pos.txt', 'r+', encoding='utf-8') as pos:  # read the positive reviews
        while True:
            words = pos.readline()
            if words:
                positive = {}  # feature dict for one positive review
                words = jieba.cut(words)  # segment the review with jieba
                for word in words:
                    positive[word] = True
                pos_data.append((positive, 'POSITIVE'))  # label the review POSITIVE
            else:
                break
    neg_data = []
    with open('../data/neg.txt', 'r+', encoding='utf-8') as neg:  # read the negative reviews
        while True:
            words = neg.readline()
            if words:
                negative = {}  # feature dict for one negative review
                words = jieba.cut(words)  # segment the review with jieba
                for word in words:
                    negative[word] = True
                neg_data.append((negative, 'NEGATIVE'))  # label the review NEGATIVE
            else:
                break
    # Split into a training set (80%) and a test set (20%)
    pos_num, neg_num = int(len(pos_data) * 0.8), int(len(neg_data) * 0.8)
    train_data = pos_data[: pos_num] + neg_data[: neg_num]  # first 80% of the data
    test_data = pos_data[pos_num:] + neg_data[neg_num:]  # remaining 20% of the data
    # Build the classifier (naive Bayes)
    model = cf.NaiveBayesClassifier.train(train_data)
    ac = cu.accuracy(model, test_data)
    print('准确率为:' + str(ac))
    tops = model.most_informative_features()  # most informative features
    print('\n信息量较大的前10个特征为:')
    for top in tops[: 10]:
        print(top[0])
    for sentence in sentences:
        feature = {}
        words = jieba.cut(sentence)
        for word in words:
            feature[word] = True
        pcls = model.prob_classify(feature)
        sent = pcls.max()  # sentiment label (POSITIVE or NEGATIVE)
        prob = pcls.prob(sent)  # probability of that label
        print('\n', '‘', sentence, '’', '的情绪面标签为', sent, '概率为', '%.2f%%' % round(prob * 100, 2))
if __name__ == '__main__':
    # Test sentences
    sentences = ['破烂平板', '手感不错,推荐购买', '刚开始吧还不错,但是后面越来越卡,差评',
                 '哈哈哈哈,我很喜欢', '今天很开心']
    setiment(sentences)
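The listing retrains the naive Bayes model on every call. A common refinement, not part of the original code, is to persist the trained classifier with the standard pickle module so later runs can reuse it; the file name model.pkl below is an arbitrary choice.

import pickle

# Save the trained NLTK classifier (sketch only; 'model.pkl' is an arbitrary name)
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Load it back later instead of retraining
with open('model.pkl', 'rb') as f:
    model = pickle.load(f)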
# Code 9-3
from snownlp import SnowNLP  # SnowNLP provides a ready-made sentiment scorer
# Create SnowNLP objects for the sentences to be tested
s1 = SnowNLP('这东西真的挺不错的')
s2 = SnowNLP('垃圾东西')
print('调用sentiments方法获取s1的积极情感概率为:', s1.sentiments)
print('调用sentiments方法获取s2的积极情感概率为:', s2.sentiments)
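The sentiments attribute returns the probability that the text is positive, so turning scores into labels is just a matter of thresholding. A minimal sketch using the conventional 0.5 cut-off (the example sentences are reused from above):

# Threshold SnowNLP scores into POSITIVE/NEGATIVE labels (0.5 is the usual cut-off)
for text in ['这东西真的挺不错的', '垃圾东西']:
    p = SnowNLP(text).sentiments  # probability that the text is positive
    label = 'POSITIVE' if p >= 0.5 else 'NEGATIVE'
    print(text, round(p, 4), label)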
import pandas as pd
from snownlp import SnowNLP
import jieba

data = pd.read_csv('../data/comment.csv', sep=',', encoding='utf-8', header=0)
comment_data = data.loc[:, ['评论']]  # keep only the review column
# Remove duplicate reviews
comment_data = comment_data.drop_duplicates()
# Remove very short reviews
comments_data = comment_data.iloc[:, 0]
comments = comments_data[comments_data.apply(len) >= 4]  # drop reviews shorter than 4 characters
# Corpus compression: reviews often repeat the same phrase several times in a row,
# so repeated fragments are collapsed before sentiment scoring
def yasuo(string):
    for i in [1, 2]:
        j = 0
        while j < len(string) - 2 * i:
            # A fragment of length i is repeated at least three times in a row
            if string[j: j + i] == string[j + i: j + 2 * i] and (
                    string[j + i: j + 2 * i] == string[j + 2 * i: j + 3 * i]):
                k = j + 2 * i
                # Skip over every further repetition of the fragment
                while k + i < len(string) and string[j: j + i] == string[k: k + i]:
                    k += i
                string = string[: j + i] + string[k + i:]
            j += 1
    for i in [3, 4, 5]:
        j = 0
        while j < len(string) - 2 * i:
            # A fragment of length i is repeated at least twice in a row
            if string[j: j + i] == string[j + i: j + 2 * i]:
                k = j + 2 * i
                while k + i < len(string) and string[j: j + i] == string[k: k + i]:
                    k += i
                string = string[: j + i] + string[k + i:]
            j += 1
    # If the whole string is one phrase repeated twice, keep only the first half
    if string[: int(len(string) / 2)] == string[int(len(string) / 2):]:
        string = string[: int(len(string) / 2)]
    return string
comments = comments.astype('str').apply(lambda x: yasuo(x))
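A quick sanity check of the compression helper (this assumes the corrected repetition comparisons in yasuo above):

# Quick check of yasuo on obviously repetitive strings
print(yasuo('很好很好很好'))  # a fragment repeated three times collapses to '很好'
print(yasuo('不错不错'))      # a string that is one phrase doubled is halved to '不错'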
# Sentiment scoring with SnowNLP
coms = comments.apply(lambda x: SnowNLP(x).sentiments)
# Each score lies between 0 and 1; 0.5 is the usual positive/negative boundary
pos_data = comments[coms >= 0.6]  # positive subset (0.6 is used to keep only clearly positive reviews)
neg_data = comments[coms < 0.4]   # negative subset
# Word segmentation
mycut = lambda x: ' '.join(jieba.cut(x))  # simple segmentation helper
pos_data = pos_data.apply(mycut)
neg_data = neg_data.apply(mycut)
pos_data.head(5)
neg_data.tail(5)
print(len(pos_data))
print(len(neg_data))
# Stopword removal
# sep='bucunzai' is a string that never occurs in the file, so each line is read as a whole
stop = pd.read_csv('../data/stopwords.txt', sep='bucunzai', encoding='utf-8', header=None)
stop = ['', ' '] + list(stop[0])  # add the empty string and the space, which pandas strips while reading
pos = pd.DataFrame(pos_data)
neg = pd.DataFrame(neg_data)
pos[1] = pos['评论'].apply(lambda s: s.split(' '))  # split on the spaces inserted by jieba
pos[2] = pos[1].apply(lambda x: [i for i in x if i not in stop])  # drop stopwords
neg[1] = neg['评论'].apply(lambda s: s.split(' '))
neg[2] = neg[1].apply(lambda x: [i for i in x if i not in stop])
# Topic analysis of the positive reviews
pos_dict = corpora.Dictionary(pos[2])  # build the dictionary
pos_corpus = [pos_dict.doc2bow(i) for i in pos[2]]  # build the bag-of-words corpus
pos_lda = models.LdaModel(pos_corpus, num_topics=3, id2word=pos_dict)  # train the LDA model
for i in range(3):
    print('pos_topic' + str(i))
    print(pos_lda.print_topic(i))  # print the top words of each topic
# Topic analysis of the negative reviews
neg_dict = corpora.Dictionary(neg[2])  # build the dictionary
neg_corpus = [neg_dict.doc2bow(i) for i in neg[2]]  # build the bag-of-words corpus
neg_lda = models.LdaModel(neg_corpus, num_topics=3, id2word=neg_dict)  # train the LDA model
for i in range(3):
    print('neg_topic' + str(i))
    print(neg_lda.print_topic(i))  # print the top words of each topic
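Beyond printing the top words of each topic, gensim's LdaModel can also report the topic mixture of an individual review through get_document_topics; a brief sketch (the index 0 is an arbitrary choice):

# Sketch: topic distribution of a single review (index 0 chosen arbitrarily)
doc_bow = neg_corpus[0]
print(neg_lda.get_document_topics(doc_bow))  # list of (topic_id, probability) pairs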