赞
踩
使用pretreatment_txt
函数读取数据并进行预处理:去除噪声url,并将原文本按照位置、文本、用户ID、日期四种属性将原文本分开存入列表location, text, user_id, date
使用cut_word
函数对文本进行分词,具体函数如下:
def pretreatment_txt(file_path):
    """Read the raw weibo export and split each record into four parallel lists.

    Each data line is tab-separated: location, text, user_id, date.  URLs are
    stripped from the text, and so is the trailing token ("我在:" / "我在这里:"
    etc.) that accompanies them.

    Args:
        file_path: path to the UTF-8 weibo dump; the first line is a header.

    Returns:
        (location, text, user_id, date) — four parallel lists of strings.
    """
    url_pattern = re.compile(r'http://[a-zA-Z0-9.?/&=:]*')
    text, user_id, location, date = [], [], [], []
    with open(file_path, 'r', encoding='utf-8') as f:
        data = f.readlines()
    # BUG FIX: the original iterated range(1, 10000) and raised IndexError on
    # files with fewer lines; slicing caps at 9999 records but tolerates less.
    for line in data[1:10000]:
        # Remove URLs (re.sub('') is equivalent to the original split+join).
        cleaned = url_pattern.sub('', line)
        # Split the line into the four attributes: location, text, user_id, date.
        att_split = cleaned.split('\t')
        # Drop the trailing "我在:"-style location token that accompanied the URL.
        tokens = att_split[1].split()
        if tokens:  # BUG FIX: pop() on an empty token list raised IndexError
            tokens.pop()
        text.append(' '.join(tokens))
        location.append(att_split[0])
        user_id.append(att_split[2])
        date.append(att_split[3])
    return location, text, user_id, date


def cut_word(text):
    """Tokenise each sentence with jieba, dropping stopwords and blanks.

    The five emotion lexicons are loaded into jieba's user dictionary first so
    that emotion words survive segmentation as single tokens.

    Args:
        text: list of sentence strings.

    Returns:
        list of token lists, one per input sentence.
    """
    for lexicon in ('joy', 'anger', 'fear', 'sadness', 'disgust'):
        jieba.load_userdict('emotion_lexicon\\{}.txt'.format(lexicon))
    # BUG FIX: close the stopword file instead of leaking the handle; a set
    # makes the per-token membership test O(1) instead of O(n).
    with open('stopwords_list.txt', 'r', encoding='utf-8') as f:
        stopwords = {line.strip() for line in f}
    text_cut = []
    for sentence in text:
        words = [w for w in jieba.lcut(sentence)
                 if w not in stopwords and w != ' ']
        text_cut.append(words)
    return text_cut
对文件中的每一句话进行情感分析,计算混合情绪的特征向量,其中向量各个位置代表的元素分别是['anger', 'disgust', 'fear', 'joy', 'sadness']
。采用单一情绪进行分析,得到句子中出现最多的情绪(如果情绪出现次数相同,则取后出现的作为本句的情绪词)。
使用闭包结构,实现外函数outer_analysis
一次加载情绪词典,在子函数inner_analysis
中多次调用,该结构代码如下。
def outer_analysis():
    """Closure factory: load the five emotion lexicons exactly once.

    Returns:
        inner_analysis — a function that can be called repeatedly without
        re-reading the lexicon files from disk.
    """
    # Load each lexicon into a set for O(1) membership tests; 'with' closes
    # the files (the original leaked five file handles).
    lexicons = {}
    for emo in ('anger', 'disgust', 'fear', 'joy', 'sadness'):
        with open('emotion_lexicon\\{}.txt'.format(emo), 'r',
                  encoding='utf-8') as f:
            lexicons[emo] = {line.strip() for line in f}

    def inner_analysis(sentence_cut):
        """Analyse one tokenised sentence.

        Args:
            sentence_cut: list of tokens for one sentence.

        Returns:
            (mixture_vec, single_sen) where mixture_vec is a length-5 vector of
            emotion proportions in the order
            ['anger', 'disgust', 'fear', 'joy', 'sadness'] (all-zero list when
            no emotion word occurs), and single_sen is the dominant emotion
            label, or 'no-sentiment' when none occurs.
        """
        sentiment = ['anger', 'disgust', 'fear', 'joy', 'sadness',
                     'no-sentiment']
        vector = [0] * 5
        for word in sentence_cut:
            # First matching lexicon wins, preserving the original elif order.
            for idx, emo in enumerate(sentiment[:5]):
                if word in lexicons[emo]:
                    vector[idx] += 1
                    break
        total = sum(vector)
        if total == 0:
            # No emotion word at all: zero vector, 'no-sentiment' label.
            return [0] * len(vector), sentiment[5]
        maxiture_vec = np.array(vector) / total
        # BUG FIX: the spec says that on a tie the LATER-appearing emotion
        # wins; the original used '>' which kept the first.  '>=' makes the
        # last maximal index win.
        max_index = 0
        for i in range(5):
            if vector[i] >= vector[max_index]:
                max_index = i
        return maxiture_vec, sentiment[max_index]

    return inner_analysis


def text_sentiment_analysis(text_cut):
    """Run the closure-based analyser over every tokenised sentence.

    Args:
        text_cut: list of token lists (output of cut_word).

    Returns:
        (sing_senti, maxi_senti) — per-sentence single-emotion labels and
        mixture vectors, in input order.
    """
    analysis = outer_analysis()  # lexicons are loaded exactly once here
    maxi_senti = []
    sing_senti = []
    for sentence_cut in text_cut:
        maxi, sing = analysis(sentence_cut)
        maxi_senti.append(maxi)
        sing_senti.append(sing)
    return sing_senti, maxi_senti
在main函数中调用上述闭包,使用代码如下:
# Run both analyses (single-emotion label + mixture vector) on every sentence.
sing_senti, maxi_senti = text_sentiment_analysis(text_cut)
# BUG FIX: the header columns now match the printed order
# (single emotion, mixture vector, original text).
print('----------单一情绪 | 混合情绪 | 文本----------')
for i in range(length):
    print('{}\t{}\t{}'.format(sing_senti[i], maxi_senti[i], text[i]))
将结果按照:单一情绪词—混合情绪向量— 文本原文 的顺序输出,结果如下:
可以看到统计结果符合句子本身的情感。
使用定义类Time
来处理时间
其中方法week_pattern
是周模式(以一周七天为周期),day_pattern
是日模式(以一月31天为周期),hour_pattern
是时模式(以一天24小时为周期),minute_pattern
是分钟模式(以一小时60分钟为周期)
每种方法内的sentiment
参数为指定的情绪类型,可以根据不同的情绪类型和不同的周期统计数据,计算某种情绪的占比。
源代码如下:
class Time(object):
    """Per-cycle emotion statistics over weibo timestamps.

    Timestamps look like 'Mon May 02 10:30:15 +0800 2011' (weekday, month,
    day, clock, zone, year — TODO confirm against the data file).  Each
    *_pattern method returns, for every slot of its cycle, the share of posts
    labelled with the requested emotion among all emotion-labelled posts in
    that slot.
    """

    def __init__(self, date, sing_senti):
        """Parse the timestamp strings into per-record components.

        Args:
            date: list of raw timestamp strings.
            sing_senti: parallel list of single-emotion labels.
        """
        self.length = len(date)
        self.date = date
        self.sentiment = sing_senti
        self.sentiment_lis = ['anger', 'disgust', 'fear', 'joy', 'sadness',
                              'no-sentiment']
        self.week, self.month, self.year = [], [], []
        self.day, self.hour, self.minute, self.second = [], [], [], []
        self.time = []  # kept for interface compatibility; never populated
        for d in self.date:
            parts = d.split()
            clock = parts[3].split(':')  # 'HH:MM:SS'
            self.week.append(parts[0])
            self.month.append(parts[1])
            self.day.append(int(parts[2]))
            self.hour.append(int(clock[0]))
            self.minute.append(int(clock[1]))
            self.second.append(int(clock[2]))
            self.year.append(parts[5])
        # NOTE: removed the leftover debug print of self.minute.

    def _pattern(self, positions, size, sentiment):
        """Shared counting logic for all cycle patterns.

        Args:
            positions: per-record 0-based slot index (None = skip record).
            size: number of slots in the cycle.
            sentiment: emotion label whose share is requested.

        Returns:
            list of length `size`: share of `sentiment` per slot (0 when the
            slot holds no emotion-labelled post).
        """
        counts = [[0] * size for _ in range(5)]  # per-emotion slot counts
        totals = [0] * size                      # all emotion-labelled posts
        for i in range(self.length):
            slot = positions[i]
            if slot is None:
                continue
            for k in range(5):
                if self.sentiment[i] == self.sentiment_lis[k]:
                    counts[k][slot] += 1
                    totals[slot] += 1
        # Resolve the requested emotion (defaults to index 0 like the original).
        sen_index = 0
        for k in range(5):
            if sentiment == self.sentiment_lis[k]:
                sen_index = k
        chosen = counts[sen_index]
        self.proportion = [chosen[i] / totals[i] if totals[i] else 0
                           for i in range(size)]
        return self.proportion

    def week_pattern(self, sentiment):
        """Share of `sentiment` per weekday (Mon..Sun, 7 slots)."""
        weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
        # Records with an unrecognised weekday are skipped, as before.
        positions = [weekdays.index(w) if w in weekdays else None
                     for w in self.week]
        return self._pattern(positions, 7, sentiment)

    def day_pattern(self, sentiment):
        """Share of `sentiment` per day of month (days 1..31 → slots 0..30)."""
        positions = [d - 1 for d in self.day]
        return self._pattern(positions, 31, sentiment)

    def hour_pattern(self, sentiment):
        """Share of `sentiment` per hour of day (24 slots).

        BUG FIX: hours run 0..23, so they index slots directly; the original
        used `hour - 1`, which sent hour 0 to index -1 (the 23:00 slot).
        """
        return self._pattern(list(self.hour), 24, sentiment)

    def minute_pattern(self, sentiment):
        """Share of `sentiment` per minute of hour (60 slots).

        BUG FIX: minutes run 0..59, so they index slots directly; the original
        used `minute - 1`, which sent minute 0 to index -1.
        """
        return self._pattern(list(self.minute), 60, sentiment)
在main函数中使用Time类,并调用相应方法,主函数中的代码和得到结果如下:
# Build the cycle-statistics helper from the timestamps and emotion labels.
time = Time(date, sing_senti)
week_joy_stat = time.week_pattern('joy')
day_sadness_stat = time.day_pattern('sadness')
hour_joy_stat = time.hour_pattern('joy')
hour_anger_stat = time.hour_pattern('anger')
hour_sadness_stat = time.hour_pattern('sadness')
hour_disgust_stat = time.hour_pattern('disgust')
# Print the three requested reports in the original order.
for header, stat in (
    ('----------以周为周期,‘joy’情绪占比变化为----------', week_joy_stat),
    ('----------以月为周期,‘sadness’情绪占比变化为----------', day_sadness_stat),
    ('----------以日为周期,‘anger’情绪占比变化为----------', hour_anger_stat),
):
    print(header)
    print(stat)
根据上图结果可以看出,
对每天的五种情绪的占比变化进行可视化,在主函数中采用如下代码:
# Hourly share of each of the four emotions across a day.
hour_joy_stat = time.hour_pattern('joy')
hour_anger_stat = time.hour_pattern('anger')
hour_sadness_stat = time.hour_pattern('sadness')
hour_disgust_stat = time.hour_pattern('disgust')
# IDIOM FIX: the original rebuilt [i for i in range(1, 25)] for every series;
# build the x axis once and loop the series (original plotting order kept so
# colours and legend entries are unchanged).
hours = list(range(1, 25))
for label, stat in (('hour_disgust', hour_disgust_stat),
                    ('hour_sadness', hour_sadness_stat),
                    ('hour_anger', hour_anger_stat),
                    ('hour_joy', hour_joy_stat)):
    plt.plot(hours, stat, label=label)
plt.legend()
plt.show()
得到如下结果,从中可以观察到:
在这一部分选用北京西单作为中心centre = [39.911377, 116.374367]
,使用函数sentiment_around(location_str, sing_senti, r, sentiment)
来统计某指定情绪在中心点某一半径范围内的占比,其中参数r
是距离中心点的半径,sentiment
是指定的情绪类型,代码如下:
def sentiment_around(location_str, sing_senti, r, sentiment): location = [] proportion = 0 senti_stat = [0] * 6 sum = 0 for lo in location_str: location.append(eval(lo)) centre = [39.911377, 116.374367] sentiment_dic = {'anger': 0, 'disgust': 1, 'fear': 2, 'joy': 3, 'sadness': 4, 'no-sentiment': 5} radius = lambda a, b: math.sqrt((a[0] - b[0]) ** 2 + (a[1] - b[1]) ** 2) for i in range(len(location)): if radius(location[i], centre) <= r: senti_stat[sentiment_dic[sing_senti[i]]] += 1 # 计算各情绪占比 for i in range(5): sum += senti_stat[i] if sum == 0: proportion = 0 else: proportion = senti_stat[sentiment_dic[sentiment]] / sum return proportion
在main函数中分别取半径为r = [0.05,0.10,0.15,0.20,0.25,0.35,0.45],计算“joy”情绪所占比例,计算结果如下:
从结果中可见,随着半径范围的扩大,“joy”情绪的比例基本稳定,有着微小的增加,在r>0.1时情绪占比变化幅度并不明显。
采用Python的pyecharts
模块进行可视化,定义location_geo
函数处理经纬度位置location_str和情绪sing_senti,并根据不同的情绪对位置点上色,呈现在北京地图上,代码如下:
def location_geo(location_str, sing_senti):
    """Render every post on a Beijing Geo map, coloured by its emotion.

    Args:
        location_str: list of '[lat, lon]' strings.
        sing_senti: parallel list of emotion labels.

    Returns:
        The configured pyecharts Geo object (caller renders it to HTML).
    """
    import ast  # local import: keeps the file's header untouched
    city = '北京'
    sentiment_dic = {'anger': 0, 'disgust': 1, 'fear': 2, 'joy': 3,
                     'sadness': 4, 'no-sentiment': 5}
    g = Geo()
    g.add_schema(maptype=city)
    data_pair = []
    for i, lo in enumerate(location_str):
        # SECURITY FIX: literal_eval instead of eval on scraped strings.
        lat, lon = ast.literal_eval(lo)
        g.add_coordinate(i, lon, lat)  # pyecharts takes (longitude, latitude)
        data_pair.append([i, sentiment_dic[sing_senti[i]]])  # (point, emotion)
    # Add all points to the map as tiny effect-scatter markers.
    g.add('', data_pair, type_=GeoType.EFFECT_SCATTER, symbol_size=1)
    g.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    # Custom colour segments, one per emotion value.
    pieces = [
        {'max': 0, 'min': 0, 'label': 'anger', 'color': '#50A3BA'},
        # BUG FIX: label typo 'digust' -> 'disgust'
        {'min': 1, 'max': 1, 'label': 'disgust', 'color': '#3700A4'},
        {'min': 2, 'max': 2, 'label': 'fear', 'color': '#81AE9F'},
        {'min': 3, 'max': 3, 'label': 'joy', 'color': '#E2C568'},
        {'min': 4, 'max': 4, 'label': 'sadness', 'color': '#DD0200'},
        {'min': 5, 'max': 5, 'label': 'no-sentiment', 'color': '#FCF84D'},
    ]
    # is_piecewise must be True for the custom segments to take effect.
    g.set_global_opts(
        visualmap_opts=opts.VisualMapOpts(is_piecewise=True, pieces=pieces),
        title_opts=opts.TitleOpts(title="{}-微博情绪分布".format(city)),
    )
    return g
在main函数中调用上述函数,并以html格式存储点图,代码如下:
# Draw the per-post emotion map as a scatter plot.
g = location_geo(location, sing_senti)
# Render to HTML; the page can be opened directly in a browser.
g.render('weibo_sentiment.html')
# PORTABILITY FIX: os.system("weibo_sentiment.html") only works on Windows
# shells; webbrowser.open opens the file with the default browser anywhere.
import webbrowser
webbrowser.open('weibo_sentiment.html')
在浏览器中呈现可视化结果如下(比例尺由大到小):
使用字典方法有如下缺点:
扩充字典的思路如下(自动扩充):
微博情绪会随时间变化而变化,也会在空间分布上存在差异,根据这些特点,可以有针对地利用人们的情绪来管理公司员工和营销产品。
例如,早上八点到下午五点这段时间里,微博反映人们情绪普遍较为愉悦且稳定(第三题结果),波动不大,因此适合一定强度的工作,可能有着不错的效率。在下午五点以后,微博反映人们情绪有着两次波动,首先是五点左右“joy”心情占比逐渐下降,可能工作状态会下滑,不适合高强度工作,因此可能需要短暂休息;
其次是晚上八点到深夜十二点这段时间,“joy”心情占比又有着明显的提高,“sadness”,”anger”,”disgust”,”fear”等负面情绪显著下降。因此在这段时间加大广告投放力度更有可能会让人们有兴趣挑选,增强广告效益。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。