赞
踩
# Read the sampled review data; 流浪地球_影评01.txt holds randomly
# sampled reviews, one review per line.
with open('流浪地球_影评01.txt', 'r',
          encoding='utf-8-sig') as f:
    # Strip the trailing newline / surrounding whitespace of each line.
    sens = [data.strip() for data in f.readlines()]
print(len(sens))
# Load the stopword list, one word per line.
# BUG FIX: the original kept raw readlines() output, so every entry
# ended with '\n' and the membership test in sent2word() could never
# match a segmented token — the stopword filter was a no-op.
with open('files/stopwords_1208.txt', 'r',
          encoding='utf-8-sig') as f:
    stopwords = [line.strip() for line in f.readlines()]
print(len(stopwords))
# Sentence segmentation helper.
def sent2word(sentence, stopwords):
    """Segment *sentence* with jieba and drop stopword tokens.

    sentence  -- one Chinese sentence (str)
    stopwords -- iterable of stopword strings

    Returns the remaining tokens as a list, in original order.
    """
    # Build a set once so each membership test is O(1); the original
    # scanned the stopword list for every token (O(n*m)).
    stopset = set(stopwords)
    return [word for word in jieba.cut(sentence) if word not in stopset]
# 导入中文分词相关工具包
from collections import defaultdict
import os, re, codecs
import jieba
import matplotlib.pyplot as mp
# Word-classification model (sentiment localisation).
# NOTE(review): the scraped source had this whole function collapsed onto
# one '#'-prefixed line, i.e. it was dead code; reconstructed here.
def classifyWords(words):
    """Classify each token of *words* by sentiment role.

    Returns three dicts keyed by token index:
      senWord    -- index -> sentiment score (str, from BosonNLP lexicon)
      negWord    -- index -> -1 for negation words
      degreeWord -- index -> degree-adverb weight (str)
    """
    # (1) Sentiment lexicon: each line is "word score", space separated.
    senDict = {}
    with open('files/BosonNLP_sentiment_score.txt', 'r',
              encoding='utf-8-sig') as f:
        for st in f:
            sen = st.strip().split(' ')
            if len(sen) < 2:  # skip malformed lines
                continue
            senDict[sen[0]] = sen[1]
    # (2) Negation words, one per line; blank lines are skipped.
    with open('files/negwords_70.txt', 'r', encoding='utf-8-sig') as f:
        negList = [line.strip() for line in f if line.strip()]
    # (3) Degree adverbs: each line is "word<TAB>weight".
    degreeDict = {}
    with open('files/degreeDict.txt', 'r', encoding='utf-8-sig') as f:
        for dg in f:
            degree = dg.strip().split('\t')
            if len(degree) < 2:  # guard malformed lines (original crashed)
                continue
            degreeDict[degree[0]] = degree[1]
    # Classify each token: sentiment beats negation beats degree,
    # mirroring the original elif chain.
    senWord, negWord, degreeWord = {}, {}, {}
    for idx, wd in enumerate(words):
        if wd in senDict and wd not in negList and wd not in degreeDict:
            senWord[idx] = senDict[wd]
        elif wd in negList and wd not in degreeDict:
            negWord[idx] = -1
        elif wd in degreeDict:
            degreeWord[idx] = degreeDict[wd]
    return senWord, negWord, degreeWord
# Sentence sentiment-score function.
# NOTE(review): the scraped source had this whole function collapsed onto
# one '#'-prefixed line, i.e. it was dead code; reconstructed here.
def scoreSent(senWord, negWord, degreeWord, words):
    """Accumulate a sentiment score over *words*.

    senWord    -- dict: token index -> sentiment score (str/float)
    negWord    -- dict: token index -> -1 (negation markers)
    degreeWord -- dict: token index -> degree weight (str/float)
    words      -- the segmented sentence (only its length is used)

    For each sentiment word, adds W * score; between consecutive
    sentiment words, negations flip W's sign and degree adverbs scale it.
    Returns the accumulated float score.
    """
    W = 1
    score = 0
    # Positions of each word class inside the sentence.
    senLoc = list(senWord.keys())
    negLoc = list(negWord.keys())
    degreeLoc = list(degreeWord.keys())
    senloc = -1  # index into senLoc of the sentiment word just seen
    # i walks absolute token positions.
    for i in range(len(words)):
        if i in senLoc:
            senloc += 1
            # Add this sentiment word's contribution with current weight.
            score += W * float(senWord[i])
            if senloc < len(senLoc) - 1:
                # Scan tokens between this sentiment word and the next
                # for negations / degree adverbs, updating the weight W.
                for j in range(senLoc[senloc], senLoc[senloc + 1]):
                    if j in negLoc:
                        W *= -1
                    elif j in degreeLoc:
                        W *= float(degreeWord[j])
    return score
# Segment and score every sampled sentence.
scores = []
for sentence in sens:
    words = sent2word(sentence, stopwords)
    # BUG FIX: a sentence whose tokens are all stopwords yields an empty
    # list, and the original `score /= len(words)` raised
    # ZeroDivisionError; record a neutral 0.0 instead.
    if not words:
        scores.append(0.0)
        continue
    senWord, negWord, degreeWord = \
        classifyWords(words)
    score = scoreSent(senWord,
                      negWord, degreeWord, words)
    # Normalise by token count so sentence length does not bias the score.
    score /= len(words)
    scores.append(score)
print(len(scores))
# Scatter-plot data: y = min-max normalised score, x = rank order.
import numpy as np
scores = sorted(scores)
min_score = min(scores)
max_score = max(scores)
# BUG FIX: when every score is identical the range is 0 and the original
# per-element division raised ZeroDivisionError; fall back to 1.0
# (all y become 0.0).
wid = (max_score - min_score) or 1.0
# Vectorised replacement for the original element-wise append loop.
x = np.arange(len(scores))
y = (np.array(scores, dtype=float) - min_score) / wid
# Visualisation: scatter plot of normalised sentiment scores.
# BUG FIX: the displayed title misspelled "Wandering" as "Wardering".
title = 'Wandering Earth Sentiment Score'
mp.figure(title, facecolor='lightgray')
mp.title(title, fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
ax = mp.gca()
# Minor ticks every 20 samples on x; default spacing (1.0) on y.
ax.xaxis.set_minor_locator(
    mp.MultipleLocator(20))
ax.yaxis.set_minor_locator(
    mp.MultipleLocator())
ax.grid(which='major', axis='both', linewidth=0.5,
        linestyle='--', color='orangered')
mp.scatter(x, y, c='dodgerblue')
mp.show()
(未完待续……)
GitHub 源代码地址:
https://github.com/Willsgao/Personal-projects
参考网址:
https://www.jianshu.com/p/4cfcf1610a73
https://blog.csdn.net/chenpe32cp/article/details/77801600
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。