#!/usr/bin/env python3
# encoding: utf-8
'''
@file: keras_emotional_analysis_mlp.py
@time: 2020/7/4 0004 12:06
@author: Jack
@contact: jack18588951684@163.com
'''
import string
import re
from os import listdir
from numpy import array
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense


## Load a document into memory
def load_doc(filename):
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text


## Clean a document: tokenize, strip punctuation, keep alphabetic words,
## drop English stop words and single-character tokens
def clean_doc(doc):
    tokens = doc.split()
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    tokens = [re_punc.sub('', w) for w in tokens]
    tokens = [w for w in tokens if w.isalpha()]
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    tokens = [w for w in tokens if len(w) > 1]
    return tokens


## Encode a document as a single space-separated string of in-vocabulary tokens
def doc_to_line(filename, vocab):
    doc = load_doc(filename)
    tokens = clean_doc(doc)
    tokens = [w for w in tokens if w in vocab]
    return ' '.join(tokens)


## Load all documents in a directory; reviews named cv9* are held out for testing
def process_docs(directory, vocab, is_train):
    lines = list()
    for filename in listdir(directory):
        if is_train and filename.startswith('cv9'):
            continue
        if not is_train and not filename.startswith('cv9'):
            continue
        path = directory + '/' + filename
        line = doc_to_line(path, vocab)
        lines.append(line)
    return lines


def load_clean_dataset(vocab, is_train):
    neg = process_docs('txt_sentoken/neg', vocab, is_train)
    pos = process_docs('txt_sentoken/pos', vocab, is_train)
    docs = neg + pos
    labels = array([0 for _ in range(len(neg))] + [1 for _ in range(len(pos))])
    return docs, labels


## Fit a tokenizer on the training documents
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer


## Define the MLP model
def define_model(n_words):
    model = Sequential()
    model.add(Dense(50, input_shape=(n_words,), activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    # plot_model(model, to_file='model.png', show_shapes=True)
    return model


if __name__ == "__main__":
    ## Load the vocabulary
    vocab_filename = 'vocab.txt'
    vocab = load_doc(vocab_filename)
    vocab = set(vocab.split())
    ## Load all review documents
    train_docs, ytrain = load_clean_dataset(vocab, True)
    test_docs, ytest = load_clean_dataset(vocab, False)
    ## Create the tokenizer
    tokenizer = create_tokenizer(train_docs)
    ## Encode the documents
    Xtrain = tokenizer.texts_to_matrix(train_docs, mode='freq')
    Xtest = tokenizer.texts_to_matrix(test_docs, mode='freq')
    ## Define the model
    n_words = Xtest.shape[1]
    model = define_model(n_words)
    ## Train the model
    model.fit(Xtrain, ytrain, epochs=10, verbose=2)
    ## Evaluate the model
    loss, acc = model.evaluate(Xtest, ytest, verbose=0)
    print('Test Accuracy:{}'.format(acc * 100))
The code uses Keras Dense (fully connected) layers to build a simple multilayer perceptron (MLP) that classifies an encoded document as positive or negative. The size of the model's input layer equals the number of words in the tokenizer's vocabulary, which is the length of each encoded document vector; this value is stored in a new variable named n_words. The model has one hidden layer with 50 neurons and the relu activation function, and the output layer is a single neuron with the sigmoid activation function that predicts the class of the review: 0 for a negative review, 1 for a positive one. Training uses gradient descent with the Adam optimizer and the binary cross-entropy loss function, and classification accuracy is tracked while training and evaluating the model.
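As a quick sanity check of the input size, the width of the bag-of-words matrix returned by texts_to_matrix (with the default num_words) is the tokenizer vocabulary size plus one, because column 0 is never used. The snippet below is a small sketch that assumes the script above has already been run, so Xtrain, tokenizer, and n_words are in scope:

# Width of the encoded matrix = len(word_index) + 1 = n_words
print(Xtrain.shape)                      # (number of training reviews, n_words)
print(len(tokenizer.word_index) + 1)     # same value as n_words
print(n_words)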
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #
=================================================================
dense (Dense)                (None, 50)                1181350
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 51
=================================================================
Total params: 1,181,401
Trainable params: 1,181,401
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
57/57 - 0s - loss: 0.6912 - accuracy: 0.6300
Epoch 2/10
57/57 - 0s - loss: 0.6809 - accuracy: 0.8617
Epoch 3/10
57/57 - 0s - loss: 0.6613 - accuracy: 0.9172
Epoch 4/10
57/57 - 0s - loss: 0.6317 - accuracy: 0.9128
Epoch 5/10
57/57 - 0s - loss: 0.5948 - accuracy: 0.9383
Epoch 6/10
57/57 - 0s - loss: 0.5519 - accuracy: 0.9456
Epoch 7/10
57/57 - 0s - loss: 0.5067 - accuracy: 0.9456
Epoch 8/10
57/57 - 0s - loss: 0.4610 - accuracy: 0.9589
Epoch 9/10
57/57 - 0s - loss: 0.4187 - accuracy: 0.9561
Epoch 10/10
57/57 - 0s - loss: 0.3782 - accuracy: 0.9667
Test Accuracy:88.99999856948853
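Once trained, the model can score a new review by pushing the text through the same cleaning, vocabulary filtering, and bag-of-words encoding used for the training documents. The sketch below is only an illustration: predict_sentiment is a helper added here (it is not part of the original script), and it assumes clean_doc, vocab, tokenizer, and model from the script above are still in scope.

## Hypothetical helper: classify a new review with the trained tokenizer and model
def predict_sentiment(review, vocab, tokenizer, model):
    # apply the same cleaning and vocabulary filtering as the training documents
    tokens = clean_doc(review)
    tokens = [w for w in tokens if w in vocab]
    line = ' '.join(tokens)
    # encode to the same bag-of-words representation (word frequencies)
    encoded = tokenizer.texts_to_matrix([line], mode='freq')
    # sigmoid output in [0, 1]; round to get the class (0 = negative, 1 = positive)
    yhat = model.predict(encoded, verbose=0)
    return round(float(yhat[0, 0]))

text = "Everyone will enjoy this film. I love it, highly recommended!"
print(predict_sentiment(text, vocab, tokenizer, model))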