
Python NLP in Practice: Sentiment Analysis (Training a Sentiment Analysis Model in Python)

The basic approaches to sentiment analysis are lexicon-based (lexical) analysis, machine-learning-based analysis, and hybrid analysis.

Lexical analysis relies on a dictionary of pre-tagged words: a lexical analyzer converts the input text into a sequence of words, and each new word is matched against the dictionary entries.
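
As a minimal sketch of lexicon-based scoring (the two word sets below are illustrative only, not a real sentiment dictionary):

    # Minimal lexicon-scoring sketch; POSITIVE/NEGATIVE are toy lexicons.
    POSITIVE = {"good", "great", "excellent", "wonderful"}
    NEGATIVE = {"bad", "terrible", "awful", "boring"}

    def lexicon_score(text):
        tokens = text.lower().split()
        score = (sum(1 for t in tokens if t in POSITIVE)
                 - sum(1 for t in tokens if t in NEGATIVE))
        return "positive" if score > 0 else "negative" if score < 0 else "neutral"

    print(lexicon_score("a great and wonderful movie"))  # -> positive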

The key to the machine-learning approach is selecting suitable features; unigrams, bigrams, and trigrams are commonly chosen as feature vectors.
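
For example, scikit-learn's CountVectorizer can extract unigram, bigram, and trigram features in one pass (a hedged sketch; scikit-learn is not used elsewhere in this article):

    from sklearn.feature_extraction.text import CountVectorizer

    # ngram_range=(1, 3) extracts unigrams, bigrams, and trigrams as features
    vectorizer = CountVectorizer(ngram_range=(1, 3))
    X = vectorizer.fit_transform(["this movie is great", "this movie is terrible"])
    # get_feature_names_out() requires scikit-learn >= 1.0
    print(vectorizer.get_feature_names_out())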

Hands-on: sentiment analysis of movie reviews

The workflow has five parts:

1. Train or load a word-vector model

2. Build an ID matrix for the training set

3. Create the LSTM computation unit

4. Train

5. Test

Step 1: Load and inspect the data

    # encoding: utf-8
    import os
    from os.path import isfile, join

    import numpy as np

    wordsList = np.load('wordsList.npy')
    print('Word list loaded')
    wordsList = wordsList.tolist()
    wordsList = [word.decode('UTF-8') for word in wordsList]
    wordVectors = np.load('wordVectors.npy')
    print('Word vectors loaded')
    print(len(wordsList))
    print(wordVectors.shape)

    # Collect the positive and negative review files
    pos_files = ['pos/' + f for f in os.listdir('pos/') if isfile(join('pos/', f))]
    neg_files = ['neg/' + f for f in os.listdir('neg/') if isfile(join('neg/', f))]

    # Count the words in each review to get corpus statistics
    num_words = []
    for pf in pos_files:
        with open(pf, "r", encoding='utf-8') as f:
            line = f.readline()
            num_words.append(len(line.split()))
    print('Positive reviews done')

    for nf in neg_files:
        with open(nf, "r", encoding='utf-8') as f:
            line = f.readline()
            num_words.append(len(line.split()))
    print('Negative reviews done')

    num_files = len(num_words)
    print('Total number of files:', num_files)
    print('Total number of words:', sum(num_words))
    print('Average words per file:', sum(num_words) / len(num_words))
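
To confirm that the two files line up, you can look up a single word's vector; the row index of a word in wordsList is its row in wordVectors (assuming the word actually appears in the list):

    # Each row of wordVectors is the embedding for the same-index word in wordsList
    idx = wordsList.index('good')
    print(wordVectors[idx])  # the 300-dimensional vector for 'good'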

Step 2: Convert the text into an index matrix

    import re

    strip_special_chars = re.compile("[^A-Za-z0-9 ]+")
    num_dimensions = 300  # dimensionality of each word vector
    max_seq_num = 250     # truncate/pad every review to 250 words

    def cleanSentences(string):
        # Lowercase, drop HTML line breaks, strip punctuation
        string = string.lower().replace("<br />", " ")
        return re.sub(strip_special_chars, "", string)

    ids = np.zeros((num_files, max_seq_num), dtype='int32')
    file_count = 0
    for pf in pos_files:
        with open(pf, "r", encoding='utf-8') as f:
            indexCounter = 0
            line = f.readline()
            cleanedLine = cleanSentences(line)
            split = cleanedLine.split()
            for word in split:
                try:
                    ids[file_count][indexCounter] = wordsList.index(word)
                except ValueError:
                    ids[file_count][indexCounter] = 399999  # unknown word
                indexCounter = indexCounter + 1
                if indexCounter >= max_seq_num:
                    break
            file_count = file_count + 1

    for nf in neg_files:
        with open(nf, "r", encoding='utf-8') as f:
            indexCounter = 0
            line = f.readline()
            cleanedLine = cleanSentences(line)
            split = cleanedLine.split()
            for word in split:
                try:
                    ids[file_count][indexCounter] = wordsList.index(word)
                except ValueError:
                    ids[file_count][indexCounter] = 399999  # unknown word
                indexCounter = indexCounter + 1
                if indexCounter >= max_seq_num:
                    break
            file_count = file_count + 1

    np.save('idsMatrix', ids)
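
Because wordsList.index() makes this pass slow, the matrix is saved to disk; on later runs you can reload it instead of recomputing (np.save appends the .npy extension):

    # On subsequent runs, skip the indexing pass and reload the saved matrix
    ids = np.load('idsMatrix.npy')
    print(ids.shape)  # (num_files, max_seq_num)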

Step 3: Helper functions to generate batches of training and test data

The index ranges below assume the standard IMDB layout of 25,000 reviews: the first 12,500 rows of ids are positive and the last 12,500 are negative, with the rows around the boundary (num 11499 to 13499) held out for testing.

    from random import randint

    # Hyperparameter; assumed value, since the original excerpt uses the
    # name without defining it
    batch_size = 24

    def get_train_batch():
        labels = []
        arr = np.zeros([batch_size, max_seq_num])
        for i in range(batch_size):
            if i % 2 == 0:
                num = randint(1, 11499)      # positive sample
                labels.append([1, 0])
            else:
                num = randint(13499, 24999)  # negative sample
                labels.append([0, 1])
            arr[i] = ids[num - 1:num]
        return arr, labels

    def get_test_batch():
        labels = []
        arr = np.zeros([batch_size, max_seq_num])
        for i in range(batch_size):
            num = randint(11499, 13499)      # held-out rows
            if num <= 12499:
                labels.append([1, 0])        # positive
            else:
                labels.append([0, 1])        # negative
            arr[i] = ids[num - 1:num]
        return arr, labels
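
A quick sanity check of a batch (with the assumed batch_size of 24 and max_seq_num of 250 from the steps above):

    arr, batch_labels = get_train_batch()
    print(arr.shape)         # (24, 250): batch_size x max_seq_num
    print(batch_labels[:4])  # alternates [1, 0] (positive) and [0, 1] (negative)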

Step 4: Model setup, training, and saving

    import tensorflow as tf  # TensorFlow 1.x API (uses tf.contrib)

    tf.reset_default_graph()

    # Hyperparameters; assumed values, since the original excerpt uses
    # these names without defining them
    num_labels = 2
    lstm_units = 64
    lr = 0.001

    labels = tf.placeholder(tf.float32, [batch_size, num_labels])
    input_data = tf.placeholder(tf.int32, [batch_size, max_seq_num])

    # Look up the word vector for each token id
    data = tf.nn.embedding_lookup(wordVectors, input_data)

    # Configure the LSTM cell
    lstmCell = tf.contrib.rnn.BasicLSTMCell(lstm_units)
    # Dropout to reduce overfitting
    lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=0.5)
    # Feed the data in and unroll the network over the sequence
    value, _ = tf.nn.dynamic_rnn(lstmCell, data, dtype=tf.float32)

    weight = tf.Variable(tf.truncated_normal([lstm_units, num_labels]))
    bias = tf.Variable(tf.constant(0.1, shape=[num_labels]))
    value = tf.transpose(value, [1, 0, 2])
    last = tf.gather(value, int(value.get_shape()[0]) - 1)  # output of the final time step
    prediction = tf.matmul(last, weight) + bias

    # Accuracy metric, loss, and optimizer
    correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(labels, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
        logits=prediction, labels=labels))
    optimizer = tf.train.AdamOptimizer(lr).minimize(loss)

    saver = tf.train.Saver()
    with tf.Session() as sess:
        if os.path.exists("models") and os.path.exists("models/checkpoint"):
            saver.restore(sess, tf.train.latest_checkpoint('models'))
        else:
            if int(tf.__version__.split('.')[1]) < 12 and int(tf.__version__.split('.')[0]) < 1:
                init = tf.initialize_all_variables()   # pre-0.12 API
            else:
                init = tf.global_variables_initializer()
            sess.run(init)

        iterations = 100
        for step in range(iterations):
            # Draw a training batch and take one optimizer step
            next_batch, next_batch_labels = get_train_batch()
            sess.run(optimizer, {input_data: next_batch, labels: next_batch_labels})
            if step % 20 == 0:
                test_batch, test_batch_labels = get_test_batch()
                print("step:", step, " accuracy:", sess.run(
                    accuracy, {input_data: test_batch, labels: test_batch_labels}) * 100)

        if not os.path.exists("models"):
            os.mkdir("models")
        save_path = saver.save(sess, "models/model.ckpt")
        print("Model saved in path: %s" % save_path)
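
After training, you can score a new review with the saved model. A hedged sketch: sentence_to_ids is a hypothetical helper added here for illustration, reusing cleanSentences and the graph built above; per the batch helpers, [1, 0] means positive.

    def sentence_to_ids(sentence):
        # Convert one review into a (batch_size, max_seq_num) id matrix;
        # only row 0 is meaningful, the remaining rows just pad the batch.
        sentence_ids = np.zeros((batch_size, max_seq_num), dtype='int32')
        for i, word in enumerate(cleanSentences(sentence).split()[:max_seq_num]):
            try:
                sentence_ids[0][i] = wordsList.index(word)
            except ValueError:
                sentence_ids[0][i] = 399999  # unknown word
        return sentence_ids

    with tf.Session() as sess:
        saver.restore(sess, tf.train.latest_checkpoint('models'))
        logits = sess.run(prediction, {input_data: sentence_to_ids("a wonderful film")})
        print('positive' if np.argmax(logits[0]) == 0 else 'negative')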

 
