
Python Natural Language Processing in Action (8): Sentiment Analysis Techniques

Hands-On Movie Review Sentiment Analysis

        Sentiment analysis determines the emotional state expressed by a piece of text, where the text may be a single sentence, a paragraph, or an entire document. It mainly involves two problems: text representation and text classification. Before deep learning, the mainstream representations were BOW (bag-of-words) and topic models, and the main classification models were SVM and LR (logistic regression).
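        As a point of reference for that classical pipeline, a bag-of-words representation fed into a logistic regression classifier can be sketched in a few lines with scikit-learn. This is only an illustrative sketch and not part of the book's code; the toy texts and labels below are made up.

# Minimal BOW + logistic-regression baseline (illustrative sketch, not the book's code)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

texts = ["a wonderful, touching film", "dull plot and terrible acting"]  # toy data
y = [1, 0]                              # 1 = positive, 0 = negative

vectorizer = CountVectorizer()          # bag-of-words features
X = vectorizer.fit_transform(texts)     # sparse document-term matrix

clf = LogisticRegression()              # the "LR" classifier mentioned above
clf.fit(X, y)
print(clf.predict(vectorizer.transform(["touching and wonderful"])))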

         Loading the data: the IMDB sentiment analysis dataset. The training and test sets each contain 25,000 labeled movie reviews, rated on a 10-point scale; reviews with a score of 4 or below are treated as negative.
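         The code below assumes the reviews have already been unpacked into pos/ and neg/ directories, one plain-text review per file. As a rough illustration of the labelling rule just stated (a sketch only; the published dataset also leaves out neutral reviews), a score-to-label mapping might look like:

# Hypothetical helper illustrating the labelling rule above (not used by the code below)
def score_to_label(score):
    return 'neg' if score <= 4 else 'pos'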

# -*- coding: utf-8 -*-
import numpy as np

# Load the pretrained word-embedding model: a vocabulary of 400,000 words,
# each mapped to a 50-dimensional vector
words_list = np.load('wordsList.npy')
print('Loaded word list')
words_list = words_list.tolist()  # convert to a Python list
words_list = [word.decode('UTF-8') for word in words_list]
word_vectors = np.load('wordVectors.npy')
print('Loaded word vectors')
print(len(words_list))
print(word_vectors.shape)
Home_index = words_list.index("home")
print(word_vectors[Home_index])

# Load the movie-review data
import os
from os.path import isfile, join
pos_files = ['pos/' + f for f in os.listdir('pos/') if isfile(join('pos/', f))]
neg_files = ['neg/' + f for f in os.listdir('neg/') if isfile(join('neg/', f))]
num_words = []
for pf in pos_files:
    with open(pf, "r", encoding='utf-8') as f:
        line = f.readline()
        counter = len(line.split())
        num_words.append(counter)
print('Finished positive reviews')
for pf in neg_files:
    with open(pf, "r", encoding='utf-8') as f:
        line = f.readline()
        counter = len(line.split())
        num_words.append(counter)
print('Finished negative reviews')
num_files = len(num_words)
print('Total number of files:', num_files)
print('Total number of words:', sum(num_words))
print('Average words per file:', sum(num_words) / len(num_words))
'''
# Visualization: histogram of review lengths
import matplotlib
import matplotlib.pyplot as plt
matplotlib.use('qt4agg')
# Set a default font so that non-ASCII axis labels render correctly
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family'] = 'sans-serif'
# %matplotlib inline
plt.hist(num_words, 50, facecolor='g')
plt.xlabel('Review length')
plt.ylabel('Frequency')
plt.axis([0, 1200, 0, 8000])
plt.show()
'''
# Most reviews are shorter than about 230 words
max_seg_len = 300

# Turn each review into a row of word indices, producing a 25000 x 300 index matrix
import re
strip_special_chars = re.compile("[^A-Za-z0-9 ]+")

def cleanSentence(string):
    string = string.lower().replace("<br />", " ")
    return re.sub(strip_special_chars, "", string.lower())

print('Saving idsMatrix...')
max_seg_num = 300
ids = np.zeros((num_files, max_seg_num), dtype="int32")
file_count = 0
'''
for pf in pos_files:
    with open(pf, "r", encoding="utf-8") as f:
        indexCounter = 0
        line = f.readline()
        cleanedLine = cleanSentence(line)
        split = cleanedLine.split()
        for word in split:
            try:
                ids[file_count][indexCounter] = words_list.index(word)
            except ValueError:
                ids[file_count][indexCounter] = 399999  # unknown word
            indexCounter = indexCounter + 1
            if indexCounter >= max_seg_num:
                break
    file_count = file_count + 1
    print(file_count)
print('Finished saving part 1')
for nf in neg_files:
    with open(nf, "r", encoding="utf-8") as f:
        indexCounter = 0
        line = f.readline()
        cleanedLine = cleanSentence(line)
        split = cleanedLine.split()
        for word in split:
            try:
                ids[file_count][indexCounter] = words_list.index(word)
            except ValueError:
                ids[file_count][indexCounter] = 399999  # unknown word
            indexCounter = indexCounter + 1
            if indexCounter >= max_seg_num:
                break
    file_count = file_count + 1
# Save to disk (the file name must match the one loaded further below)
np.save('idsMatrix', ids)
print('Finished saving part 2')
'''
# Model hyperparameters
batch_size = 24
lstm_units = 64
num_labels = 2
iterations = 200000
max_seg_num = 250  # sequence length fed to the model; must match the width of the ids matrix
ids = np.load('idsMatrix.npy')

# Batch iterators: return one batch of training (or test) examples at a time
from random import randint

def get_train_batch():
    labels = []
    arr = np.zeros([batch_size, max_seg_num])
    for i in range(batch_size):
        if (i % 2 == 0):
            # rows 1-11499 of the ids matrix hold positive reviews
            num = randint(1, 11499)
            labels.append([1, 0])
        else:
            # rows 13499-24999 hold negative reviews
            num = randint(13499, 24999)
            labels.append([0, 1])
        arr[i] = ids[num-1:num]
    return arr, labels

def get_test_batch():
    labels = []
    arr = np.zeros([batch_size, max_seg_num])
    for i in range(batch_size):
        # rows 11499-13499 are held out for testing
        num = randint(11499, 13499)
        if (num <= 12499):
            labels.append([1, 0])
        else:
            labels.append([0, 1])
        arr[i] = ids[num-1:num]
    return arr, labels
num_dimensions = 300  # dimensions for each word vector
import tensorflow as tf

tf.reset_default_graph()
labels = tf.placeholder(tf.float32, [batch_size, num_labels])
input_data = tf.placeholder(tf.int32, [batch_size, max_seg_num])
data = tf.Variable(tf.zeros([batch_size, max_seg_num, num_dimensions]), dtype=tf.float32)
data = tf.nn.embedding_lookup(word_vectors, input_data)

# Configure the LSTM network
lstmCell = tf.contrib.rnn.BasicLSTMCell(lstm_units)
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=0.75)  # dropout to reduce overfitting
value, _ = tf.nn.dynamic_rnn(lstmCell, data, dtype=tf.float32)

# The output at the final time step can be taken as the last hidden state; it is
# reshaped, then multiplied by a weight matrix and added to a bias to obtain the logits
weight = tf.Variable(tf.truncated_normal([lstm_units, num_labels]))
bias = tf.Variable(tf.constant(0.1, shape=[num_labels]))
value = tf.transpose(value, [1, 0, 2])
last = tf.gather(value, int(value.get_shape()[0]) - 1)
prediction = (tf.matmul(last, weight) + bias)

# Prediction correctness and accuracy
correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(labels, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# Standard softmax cross-entropy as the loss, Adam as the optimizer
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels))
optimizer = tf.train.AdamOptimizer().minimize(loss)

#sess = tf.InteractiveSession(config=tf.ConfigProto(allow_soft_placement, log_device_placement))
sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False))
sess.run(tf.global_variables_initializer())  # needed so the evaluation below can run;
                                             # restore a trained checkpoint instead for meaningful accuracy
#saver = tf.train.Saver()
#saver.restore(sess, tf.train.latest_checkpoint('models'))

# Evaluate accuracy on a few random test batches
iterations = 10
for i in range(iterations):
    next_batch, next_batch_labels = get_test_batch()
    print("Accuracy:", (sess.run(
        accuracy, {input_data: next_batch, labels: next_batch_labels})) * 100)
'''
# Visualize the loss and accuracy with TensorBoard while training
import datetime
sess = tf.InteractiveSession()
#tf.device("/cpu:0")
saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())
tf.summary.scalar('Loss', loss)
tf.summary.scalar('Accuracy', accuracy)
merged = tf.summary.merge_all()
logdir = "tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
writer = tf.summary.FileWriter(logdir, sess.graph)
for i in range(iterations):
    # Fetch the next training batch
    nextBatch, nextBatchLabels = get_train_batch()
    sess.run(optimizer, {input_data: nextBatch, labels: nextBatchLabels})
    # Write summaries to TensorBoard every 50 steps
    if (i % 50 == 0):
        summary = sess.run(merged, {input_data: nextBatch, labels: nextBatchLabels})
        writer.add_summary(summary, i)
    # Save a checkpoint every 10,000 steps
    if (i % 10000 == 0 and i != 0):
        save_path = saver.save(sess, "models/pretrained_lstm.ckpt", global_step=i)
        print("saved to %s" % save_path)
writer.close()
'''
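
Once a trained checkpoint has been saved under models/, the same graph can be reused to score a new review. The following is only a minimal sketch under that assumption: it reuses cleanSentence, words_list and the placeholders defined above, and it tiles the single review into a full batch because input_data has a fixed number of batch_size rows.

# Illustrative sketch (assumes a checkpoint exists under models/; not part of the original listing)
def review_to_ids(text):
    # Convert one review into a (1, max_seg_num) row of word indices,
    # using the same cleaning and unknown-word index (399999) as above
    row = np.zeros((1, max_seg_num), dtype='int32')
    for j, word in enumerate(cleanSentence(text).split()[:max_seg_num]):
        try:
            row[0][j] = words_list.index(word)
        except ValueError:
            row[0][j] = 399999
    return row

saver = tf.train.Saver()
saver.restore(sess, tf.train.latest_checkpoint('models'))

sample_review = "this movie was surprisingly moving and well acted"
batch = np.tile(review_to_ids(sample_review), (batch_size, 1))  # repeat to fill the fixed-size batch
logits = sess.run(prediction, {input_data: batch})
print('positive' if np.argmax(logits[0]) == 0 else 'negative')  # [1, 0] encodes positive above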
