def _update(self):
    """Walk the whole dataset once and update every counter."""
    for sample in self.dataSet:  # each sample is a tokenized sentence, i.e. a list of words
        n = len(sample)
        self.count += n * (n - 1) / 2
        self.num += n
        for word in sample:
            self.wordCount[word] += 1
        for i in range(n):   # looping over all (i, j) makes [x][y] and [y][x] come out the same
            for j in range(n):
                if sample[i] in self.pos_feature or sample[j] in self.pos_feature:
                    self.pos_pairsCount[sample[i]][sample[j]] += 1
                elif sample[i] in self.neg_feature or sample[j] in self.neg_feature:
                    self.neg_pairsCount[sample[i]][sample[j]] += 1
def calculate_sentiment_value(self, word):
    sum_pos = []  # mutual information between the word and every positive seed word
    for feature in self.pos_feature:
        single_sentiment_value = self.pos_query(word, feature)  # MI of the two words
        sum_pos.append(single_sentiment_value)
    print(sum_pos)
    sum_neg = []  # mutual information between the word and every negative seed word
    for feature in self.neg_feature:
        single_sentiment_value = self.neg_query(word, feature)  # MI of the two words
        sum_neg.append(single_sentiment_value)
    print(sum_neg)
    score = sum(sum_pos) - sum(sum_neg)
    return score
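Put differently, this is my reading of the code in the spirit of Turney's SO-PMI (the post itself does not name it): the orientation score of a word is

    SO(word) = sum over positive seeds p of I(word, p)  -  sum over negative seeds n of I(word, n)

so a word that co-occurs more with the positive seeds gets a positive score, and one closer to the negative seeds gets a negative score.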
That is the core of the approach; the complete listing follows.
from collections import defaultdict
from math import log2


class I():  # measure the similarity of two words with mutual information
    def __init__(self, dataSet):
        self.wordCount = defaultdict(lambda: 0)  # word frequencies
        self.pos_pairsCount = defaultdict(lambda: defaultdict(lambda: 0))  # co-occurrence counts involving a positive seed
        self.neg_pairsCount = defaultdict(lambda: defaultdict(lambda: 0))  # co-occurrence counts involving a negative seed
        self.count = 0.0  # number of unordered word pairs
        self.num = 0.0    # total number of words
        self.dataSet = dataSet
        self.sentiment_value = defaultdict(lambda: 0)
        self.pos_feature = ['r', 'z']  # positive seed words
        self.neg_feature = ['y', 't']  # negative seed words
        self._update()

    def _update(self):
        """Walk the whole dataset once and update every counter."""
        for sample in self.dataSet:
            n = len(sample)
            self.count += n * (n - 1) / 2
            self.num += n
            for word in sample:
                self.wordCount[word] += 1
            for i in range(n):
                for j in range(n):
                    if sample[i] in self.pos_feature or sample[j] in self.pos_feature:
                        self.pos_pairsCount[sample[i]][sample[j]] += 1
                    elif sample[i] in self.neg_feature or sample[j] in self.neg_feature:
                        self.neg_pairsCount[sample[i]][sample[j]] += 1

    def pos_query(self, x, y):
        """Mutual information of x and y: I(x,y) = p(x,y)log[p(x,y) / (p(x)p(y))]."""
        p_x = self.wordCount[x] / self.num
        p_y = self.wordCount[y] / self.num
        p_xy = self.pos_pairsCount[x][y] / self.count
        if p_x == 0 or p_y == 0:  # x or y never appears in the dataset, so no relation can be measured
            return -1
        if p_xy == 0:             # the pair never co-occurs; avoid log2(0)
            return 0
        return p_xy * log2(p_xy / p_x / p_y)

    def neg_query(self, x, y):
        """Mutual information of x and y: I(x,y) = p(x,y)log[p(x,y) / (p(x)p(y))]."""
        p_x = self.wordCount[x] / self.num
        p_y = self.wordCount[y] / self.num
        p_xy = self.neg_pairsCount[x][y] / self.count
        if p_x == 0 or p_y == 0:
            return -1
        if p_xy == 0:
            return 0
        return p_xy * log2(p_xy / p_x / p_y)

    def calculate_sentiment_value(self, word):
        sum_pos = []  # MI between the word and every positive seed
        for feature in self.pos_feature:
            sum_pos.append(self.pos_query(word, feature))
        print(sum_pos)
        sum_neg = []  # MI between the word and every negative seed
        for feature in self.neg_feature:
            sum_neg.append(self.neg_query(word, feature))
        print(sum_neg)
        return sum(sum_pos) - sum(sum_neg)


if __name__ == '__main__':
    dataSet = [['r', 'z'], ['x', 'y', 't', 's', 'z'], ['z'],
               ['s', 'x', 'r'], ['x', 't', 'r', 'z', 'y'],
               ['z', 'x', 't', 's', 'y']]
    test = I(dataSet)
    words = list(test.wordCount.keys())
    for i in words:
        print(i)
        temp = test.calculate_sentiment_value(i)
        print("sentiment orientation score of '{}' is {}".format(i, temp))
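To sanity-check the numbers, here is a minimal hand computation of one weighted-MI term on the toy dataset above; it is a standalone sketch, not part of the original code.

from math import log2

# Toy dataset from the listing above.
dataSet = [['r', 'z'], ['x', 'y', 't', 's', 'z'], ['z'],
           ['s', 'x', 'r'], ['x', 't', 'r', 'z', 'y'],
           ['z', 'x', 't', 's', 'y']]

num = sum(len(s) for s in dataSet)                        # 21 words in total
count = sum(len(s) * (len(s) - 1) / 2 for s in dataSet)   # 34 unordered pairs

# 'z' occurs 5 times, 'r' occurs 3 times, and _update counts the
# ordered pair ('z', 'r') twice (in the first and fifth samples).
p_x, p_y, p_xy = 5 / num, 3 / num, 2 / count
print(p_xy * log2(p_xy / (p_x * p_y)))  # ~0.046, matching pos_query('z', 'r')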
A couple of small issues surface when you run this: p_xy can be zero for a pair that never co-occurs, which would feed log2(0), and the original draft reset the negative-score list to [] before summing it. Both are handled in the listing above.
# Read the file sentence by sentence and process each line
# (this could also be rewritten with a with-statement; see the sketch below)
word_fd = FreqDist()                  # word frequencies over all text
cond_word_fd = ConditionalFreqDist()  # word frequencies per class (normal text vs. ad text)

def new_read_line(filename, content):
    f = open(filename, 'r', encoding='utf-8')
    line = f.readline()
    while line:
        s = re.sub('\n', '', line)
        content.append(re.split(' ', s))
        line = f.readline()
    f.close()

def new_read_dic(filename):
    f = open(filename, 'r', encoding='gbk')
    line = f.readline()
    words = []  # renamed from `str`, which shadowed the built-in
    while line:
        s = re.sub('\n', '', line)
        words.append(re.sub(' ', '', s))
        line = f.readline()
    f.close()
    return words
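As the comment suggests, here is a minimal with-based rewrite of the two readers, assuming the same file formats and encodings; the handles are closed automatically even if an exception is raised.

def new_read_line(filename, content):
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            content.append(line.rstrip('\n').split(' '))

def new_read_dic(filename):
    with open(filename, 'r', encoding='gbk') as f:
        return [line.rstrip('\n').replace(' ', '') for line in f]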
import pynlpir
import re
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC, LinearSVC  # libsvm/liblinear are internal modules, not importable in recent scikit-learn
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from random import shuffle
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.metrics import BigramAssocMeasures
from collections import defaultdict
from math import log2
# personally I find defaultdict very handy
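A quick toy illustration of why (mine, not from the original post): the nested defaultdicts used for the pair counters above need no key-existence checks at all.

from collections import defaultdict

pairs = defaultdict(lambda: defaultdict(int))  # missing keys spring into existence as 0
for x, y in [('z', 'r'), ('z', 'r'), ('y', 't')]:
    pairs[x][y] += 1    # no need to check whether x or y was seen before
print(pairs['z']['r'])  # 2
print(pairs['a']['b'])  # 0, an unseen pair defaults to zero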