The basic approaches to sentiment analysis are lexical analysis, machine-learning-based analysis, and hybrid analysis.
Lexical analysis relies on a dictionary of pre-tagged words: a lexical analyzer converts the input text into a sequence of tokens, and each new token is matched against the dictionary entries to determine its sentiment polarity.
For machine-learning approaches, the key is choosing suitable features; unigrams, bigrams, and trigrams are commonly used as feature vectors.
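As an illustration of such n-gram features (my own sketch, not from the original text), the helper below turns a tokenized sentence into unigram and bigram counts, which could then serve as a feature vector for a classifier:

from collections import Counter


def ngram_features(tokens, n_max=2):
    # Count all n-grams up to length n_max (unigrams and bigrams by default)
    feats = Counter()
    for n in range(1, n_max + 1):
        for i in range(len(tokens) - n + 1):
            feats[' '.join(tokens[i:i + n])] += 1
    return feats


print(ngram_features("the movie was not bad".split()))
# e.g. Counter({'the': 1, 'movie': 1, ..., 'not bad': 1, ...})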
The LSTM-based implementation below is divided into five parts:
1. Train or load a word-vector (embedding) model
2. Build an ID matrix for the training set
3. Create the LSTM cell
4. Train
5. Test
step1: load and inspect the data
# encoding:utf-8

import numpy as np

wordsList = np.load('wordsList.npy')
print('Loaded the word list')
wordsList = wordsList.tolist()
wordsList = [word.decode('UTF-8') for word in wordsList]
wordVectors = np.load('wordVectors.npy')
print('Loaded the word vectors')

print(len(wordsList))
print(wordVectors.shape)

import os
from os.path import isfile, join

pos_files = ['pos/' + f for f in os.listdir('pos/') if isfile(join('pos/', f))]
neg_files = ['neg/' + f for f in os.listdir('neg/') if isfile(join('neg/', f))]
num_words = []
for pf in pos_files:
    with open(pf, "r", encoding='utf-8') as f:
        line = f.readline()
        counter = len(line.split())
        num_words.append(counter)
print('Finished with the positive reviews')

for nf in neg_files:
    with open(nf, "r", encoding='utf-8') as f:
        line = f.readline()
        counter = len(line.split())
        num_words.append(counter)
print('Finished with the negative reviews')

num_files = len(num_words)
print('Total number of files', num_files)
print('Total number of words', sum(num_words))
print('Average number of words per file', sum(num_words) / len(num_words))
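The two arrays loaded above go together: row i of wordVectors is the embedding of the word wordsList[i]. A quick illustrative lookup (assuming the word 'good' is present in the list, as it is in the usual GloVe-style word list) shows how to fetch a single word's vector:

# Illustrative lookup: row i of wordVectors is the embedding of wordsList[i]
good_index = wordsList.index('good')
good_vector = wordVectors[good_index]
print(good_vector.shape)  # the embedding dimensionality, e.g. (50,) or (300,)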
step2: convert the texts into an index matrix
import re

strip_special_chars = re.compile("[^A-Za-z0-9 ]+")
num_dimensions = 300  # dimensions of each word vector


def cleanSentences(string):
    string = string.lower().replace("<br />", " ")
    return re.sub(strip_special_chars, "", string)


max_seq_num = 250
ids = np.zeros((num_files, max_seq_num), dtype='int32')
file_count = 0
for pf in pos_files:
    with open(pf, "r", encoding='utf-8') as f:
        indexCounter = 0
        line = f.readline()
        cleanedLine = cleanSentences(line)
        split = cleanedLine.split()
        for word in split:
            try:
                ids[file_count][indexCounter] = wordsList.index(word)
            except ValueError:
                ids[file_count][indexCounter] = 399999  # unknown word
            indexCounter = indexCounter + 1
            if indexCounter >= max_seq_num:
                break
    file_count = file_count + 1

for nf in neg_files:
    with open(nf, "r", encoding='utf-8') as f:
        indexCounter = 0
        line = f.readline()
        cleanedLine = cleanSentences(line)
        split = cleanedLine.split()
        for word in split:
            try:
                ids[file_count][indexCounter] = wordsList.index(word)
            except ValueError:
                ids[file_count][indexCounter] = 399999  # unknown word
            indexCounter = indexCounter + 1
            if indexCounter >= max_seq_num:
                break
    file_count = file_count + 1

np.save('idsMatrix', ids)
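After the matrix has been saved, it is worth reloading it and spot-checking a row. The snippet below is an illustrative sanity check, not part of the original listing; it maps the first ten token indices of the first review back to words (index 399999 marks unknown words, and trailing zeros are padding):

# Illustrative sanity check of the saved index matrix
ids = np.load('idsMatrix.npy')
print(ids.shape)  # (num_files, max_seq_num)
print([wordsList[idx] for idx in ids[0][:10]])  # first ten tokens of the first review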
step3: helper functions that generate batches of training and test data
from random import randint  # needed for random batch sampling


def get_train_batch():
    labels = []
    arr = np.zeros([batch_size, max_seq_num])
    for i in range(batch_size):
        if (i % 2 == 0):
            num = randint(1, 11499)       # positive reviews
            labels.append([1, 0])
        else:
            num = randint(13499, 24999)   # negative reviews
            labels.append([0, 1])
        arr[i] = ids[num - 1:num]
    return arr, labels


def get_test_batch():
    labels = []
    arr = np.zeros([batch_size, max_seq_num])
    for i in range(batch_size):
        num = randint(11499, 13499)       # held-out reviews
        if (num <= 12499):
            labels.append([1, 0])
        else:
            labels.append([0, 1])
        arr[i] = ids[num - 1:num]
    return arr, labels
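As an illustrative usage note (these helpers rely on the globals batch_size, max_seq_num and ids, with batch_size defined in the next step), calling them returns an index batch and its one-hot labels:

# Illustrative usage, once batch_size has been defined in step 4
batch, batch_labels = get_train_batch()
print(batch.shape)        # (batch_size, max_seq_num)
print(batch_labels[:2])   # alternating one-hot labels, e.g. [[1, 0], [0, 1]]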
step4: model setup
import tensorflow as tf

# Hyperparameters (values not given in the original text; typical choices assumed)
batch_size = 24
lstm_units = 64
num_labels = 2
lr = 0.001

tf.reset_default_graph()

labels = tf.placeholder(tf.float32, [batch_size, num_labels])
input_data = tf.placeholder(tf.int32, [batch_size, max_seq_num])
data = tf.Variable(
    tf.zeros([batch_size, max_seq_num, num_dimensions]), dtype=tf.float32)
# Look up the word vectors for each token in the batch
data = tf.nn.embedding_lookup(wordVectors, input_data)

# Configure the LSTM cell
lstmCell = tf.contrib.rnn.BasicLSTMCell(lstm_units)
# Dropout to reduce overfitting
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=0.5)
# Feed the data into the cell and unroll the whole network
value, _ = tf.nn.dynamic_rnn(lstmCell, data, dtype=tf.float32)

weight = tf.Variable(tf.truncated_normal([lstm_units, num_labels]))
bias = tf.Variable(tf.constant(0.1, shape=[num_labels]))
value = tf.transpose(value, [1, 0, 2])
last = tf.gather(value, int(value.get_shape()[0]) - 1)
prediction = (tf.matmul(last, weight) + bias)

# Evaluation: prediction correctness and accuracy
correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(labels, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
    logits=prediction, labels=labels))
optimizer = tf.train.AdamOptimizer(lr).minimize(loss)

saver = tf.train.Saver()

with tf.Session() as sess:
    if os.path.exists("models") and os.path.exists("models/checkpoint"):
        saver.restore(sess, tf.train.latest_checkpoint('models'))
    else:
        if int((tf.__version__).split('.')[1]) < 12 and int((tf.__version__).split('.')[0]) < 1:
            init = tf.initialize_all_variables()
        else:
            init = tf.global_variables_initializer()
        sess.run(init)

    iterations = 100
    for step in range(iterations):
        next_batch, next_batch_labels = get_test_batch()
        if step % 20 == 0:
            print("step:", step, " accuracy:", (sess.run(
                accuracy, {input_data: next_batch, labels: next_batch_labels})) * 100)

    if not os.path.exists("models"):
        os.mkdir("models")
    save_path = saver.save(sess, "models/model.ckpt")
    print("Model saved in path: %s" % save_path)