For text sentiment analysis in NLP, the first step is usually to preprocess the dataset. This post describes the preprocessing of MR, a dataset commonly used in NLP: adding labels, slicing, tokenization, and initialization with pre-trained word vectors, followed by sentiment classification with keras.SimpleRNN.
Dataset: MR (download link)
Python library: tensorflow.keras
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import SimpleRNN
import numpy as np
from tensorflow.keras import Input
from tensorflow.keras.layers import Dense, Conv2D, MaxPool2D, Dropout, Embedding, concatenate, Flatten
from tensorflow.keras.constraints import max_norm
The dataset obtained from the link above consists of two files, .neg and .pos.
The sentiment label for neg is 0 and for pos is 1. Traverse both files and attach the label to each sample.
# %% Add labels to the data
with open('rt-polarity.neg', 'r', encoding='utf-8', errors='ignore') as file:
    lines = file.readlines()
data_set = []
for line in lines:
    line = line.strip(' ').strip('\n')
    data_set.append(line + '\t' + str(0) + '\n')
with open('neg_label.txt', 'w', encoding='utf-8') as file:
    file.writelines(data_set)

with open('rt-polarity.pos', 'r', encoding='utf-8', errors='ignore') as file:
    lines = file.readlines()
data_set = []
for line in lines:
    line = line.strip(' ').strip('\n')
    data_set.append(line + '\t' + str(1) + '\n')
with open('pos_label.txt', 'w', encoding='utf-8') as file:
    file.writelines(data_set)
This produces two new files. In neg_label.txt, each line is the original sentence followed by a tab and the label 0; pos_label.txt has the same format with label 1.
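For illustration (the sentence below is made up, not an actual line from the file), a line in neg_label.txt looks like this, with a tab separating the sentence from the label:

a dull and lifeless movie .	0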
The sentences and labels now need to be split apart, and each sentence broken into individual words (the print(x_pos) in the code shows the result of this slicing).
pos_label = []
x_pos = []
neg_label = []
x_neg = []
max_len = 64
# max_pos = max_neg = 0  # the maximum sentence lengths of the two classes are 55/57

with open('pos_label.txt', 'r') as file:
    lines = file.readlines()
for line in lines:
    line = line.split('\t')
    line0 = line[0].strip(' ').strip('.').strip(' ')  # strip the trailing ' . ' from each sentence
    line0 = line0.split(' ')  # split each sentence into words
    # max_pos = np.maximum(max_pos, len(line0))
    pos_label.append(int(line[1][0]))
    x_pos.append(line0)

with open('neg_label.txt', 'r') as file:
    lines = file.readlines()
for line in lines:
    line = line.split('\t')
    line0 = line[0].strip(' ').strip('.').strip(' ')  # strip the trailing ' . ' from each sentence
    line0 = line0.split(' ')  # split each sentence into words
    # max_neg = np.maximum(max_neg, len(line0))
    neg_label.append(int(line[1][0]))
    x_neg.append(line0)

print(x_pos)
This post uses GloVe pre-trained word vectors for the initialization.
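Each line of glove.6B.50d.txt is a word followed by its 50-dimensional vector, separated by spaces, which is what the parsing below relies on. A small sketch to check this (the path is the same local path used in the code below and needs to be adjusted to your setup):

# Peek at the GloVe file format: "<word> v1 v2 ... v50" per line.
with open(r'E:\sentiment_classification\dataset\pre_trained\glove.6B\glove.6B.50d.txt',
          'r', encoding='utf-8') as f:
    first = f.readline().strip().split(' ')
print(first[0], len(first) - 1)  # prints the first word and 50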
# %% Build three dictionaries: word -> vector, word -> index, index -> vector
word_to_vector = {}
word_to_index = {}
index_to_vector = {}
with open(r'E:\sentiment_classification\dataset\pre_trained\glove.6B\glove.6B.50d.txt',
          'r', encoding='utf-8') as file:
    lines = file.readlines()
i = 1
# lines[0] = lines[0].strip('\n').split(' ')  # split each part by ' '
# print(len(lines[0]))
# print(lines[0])
for line in lines:
    line = line.strip('\n').split(' ')
    word = line[0]
    word_to_vector[word] = np.array(line[1:])
    word_to_index[word] = i
    index_to_vector[str(i)] = np.array(line[1:])
    i += 1
# print(word_to_vector)
# print(word_to_index)
# print(index_to_vector)

# %% Build the pre-trained embedding matrix
embedding_dim = 50  # each sample becomes 64 * 50 after embedding
vocab_size = len(index_to_vector) + 1
embedding_matrix = np.zeros([vocab_size, embedding_dim])
for i in range(1, embedding_matrix.shape[0]):
    embedding_matrix[i] = index_to_vector[str(i)]
Before the word embedding step, every input sentence must be represented by numbers (word indices), and all sentences must have the same length.
# %% Convert the sliced words to indices (words missing from GloVe get index 0)
for i in range(len(x_pos)):
    for j in range(len(x_pos[i])):
        word = x_pos[i][j]
        if word not in word_to_index:
            word = 0
        else:
            word = word_to_index[word]
        x_pos[i][j] = word

for i in range(len(x_neg)):
    for j in range(len(x_neg[i])):
        word = x_neg[i][j]
        if word not in word_to_index:
            word = 0
        else:
            word = word_to_index[word]
        x_neg[i][j] = word

# %% Pad every sentence to length max_len (64); padding='post' pads at the end
x_pos = tf.keras.preprocessing.sequence.pad_sequences(x_pos, value=0, padding='post', maxlen=max_len)
x_neg = tf.keras.preprocessing.sequence.pad_sequences(x_neg, value=0, padding='post', maxlen=max_len)
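As a quick sanity check (not part of the original code), each class should now be an integer matrix with 5331 rows and max_len columns:

# Both classes should now have shape (5331, 64).
print(x_pos.shape, x_neg.shape)
print(x_pos[0])  # the first positive sentence as padded word indices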
With the steps above, both the sentences and the labels have been processed.
Normally the preprocessing would end here, but the original MR dataset does not come with a train/test split, so the next step is to split the data into training and test sets.
From the earlier inspection of the data we know that pos and neg each contain 5331 samples. This post simply takes the first 4000 samples of each class as the training set and the rest as the test set (here the test set and the validation set are the same).
# First 4000 samples of each class for training, the rest for testing
x_pos_train = x_pos[:4000]
x_pos_test = x_pos[4000:]
x_neg_train = x_neg[:4000]
x_neg_test = x_neg[4000:]
y_pos_train = pos_label[:4000]
y_pos_test = pos_label[4000:]
y_neg_train = neg_label[:4000]
y_neg_test = neg_label[4000:]
print(len(x_pos_train))
train_x = np.append(x_pos_train, x_neg_train, axis=0)
train_y = np.append(y_pos_train, y_neg_train)
test_x = np.append(x_pos_test, x_neg_test, axis=0)
test_y = np.append(y_pos_test, y_neg_test)
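A quick check of the assembled splits (not in the original code); with 4000 training samples per class, the expected shapes are:

# Expected: train (8000, 64) / (8000,), test (2662, 64) / (2662,)
print(train_x.shape, train_y.shape)
print(test_x.shape, test_y.shape)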
Since the focus of this post is the preprocessing and pre-trained embedding setup, the model construction is not discussed in detail; the code is given directly.
class MyRNN(tf.keras.Model):
    def __init__(self):
        super(MyRNN, self).__init__()
        # Embedding layer initialized with the pre-trained GloVe matrix
        self.embed = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim,
                                               input_length=max_len, weights=[embedding_matrix])
        self.rnn = tf.keras.layers.SimpleRNN(32, dropout=0.5)
        self.drop = tf.keras.layers.Dropout(rate=0.2)
        self.flatten = tf.keras.layers.Flatten()
        self.f1 = tf.keras.layers.Dense(16)
        self.f2 = tf.keras.layers.Dense(1)

    def call(self, inputs):
        x = inputs
        x = self.embed(x)    # (batch, max_len, embedding_dim)
        x = self.rnn(x)      # (batch, 32), last hidden state
        x = self.drop(x)
        x = self.flatten(x)
        x = self.f1(x)
        x = self.f2(x)
        y = tf.sigmoid(x)    # (batch, 1), probability of the positive class
        return y

model = MyRNN()
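Before training, a quick forward-pass check (not in the original code) confirms that the model builds and outputs one probability per sample:

# Run a small batch through the untrained model; expected output shape: (2, 1)
sample_out = model(train_x[:2])
print(sample_out.shape)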
Compile and train the model:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
history = model.fit(train_x, train_y, epochs=20, validation_data=(test_x, test_y),
                    shuffle=True, batch_size=64, verbose=2)
model.summary()
The results:
From the results, the model's accuracy is not high. My guess is that possible reasons include:
(1) the chosen model is not well suited to the MR dataset;
(2) the train/test split of the MR dataset is problematic.
Next, these two possibilities are checked one by one:
First, the model is swapped for TextCNN to verify whether the model itself is the problem.
The code is as follows:
def TextCNN(vocab_size, output_dim, embedding_dim, embedding_matrix=None):
    x_input = Input(shape=(max_len,))
    if embedding_matrix is None:
        x = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len)(x_input)
    else:
        x = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len,
                      weights=[embedding_matrix], trainable=True)(x_input)
    x = x[..., tf.newaxis]  # add a channel dimension for Conv2D
    filters = [100, 100, 100]
    output_pool = []
    kernel_sizes = [3, 4, 5]
    for i, kernel_size in enumerate(kernel_sizes):
        conv = Conv2D(filters=filters[i], kernel_size=(kernel_size, embedding_dim), padding='valid',
                      kernel_constraint=max_norm(3, [0, 1, 2]))(x)
        conv = tf.keras.layers.BatchNormalization()(conv)
        conv = tf.keras.layers.ReLU()(conv)
        pool = MaxPool2D(pool_size=(max_len - kernel_size + 1, 1))(conv)
        # pool = tf.keras.layers.GlobalAveragePooling2D()(conv)  # 1_max pooling
        output_pool.append(pool)
        # logging.info("kernel_size: {}, conv.shape: {}, pool.shape: {}".format(kernel_size, conv.shape, pool.shape))
        print("kernel_size: {}, conv.shape: {}, pool.shape: {}".format(kernel_size, conv.shape, pool.shape))
    output_pool = concatenate([p for p in output_pool])
    # logging.info("output_pool.shape: {}".format(output_pool.shape))
    print("output_pool.shape: {}".format(output_pool.shape))
    x = Dropout(rate=0.5)(output_pool)
    x = Flatten()(x)
    y = Dense(output_dim, activation='sigmoid')(x)
    model = tf.keras.Model([x_input], y)
    return model

output_dim = 1
batchsz = 64
model = TextCNN(vocab_size, output_dim, embedding_dim, embedding_matrix=embedding_matrix)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(train_x, train_y, epochs=20, validation_data=(test_x, test_y),
                    shuffle=True, batch_size=64, verbose=2)
model.summary()
The results:
The accuracy is indeed significantly higher than with the RNN, which suggests that the RNN is not a great fit for the MR dataset. Also, train_acc is close to 1 while val_acc is around 0.76, which indicates overfitting; of course, this could also be caused by reason (2).
The IMDB dataset already comes with a train/test split, so it can be used to check reason (2).
Code:
imdb = tf.keras.datasets.imdb
(train_x, train_y), (test_x, test_y) = imdb.load_data(num_words=5000)
train_x = tf.keras.preprocessing.sequence.pad_sequences(train_x, value=0, padding='post', maxlen=max_len)
test_x = tf.keras.preprocessing.sequence.pad_sequences(test_x, value=0, padding='post', maxlen=max_len)
train_dataset = tf.data.Dataset.from_tensor_slices((train_x, train_y)).shuffle(1000).batch(64, drop_remainder=True)
test_dataset = tf.data.Dataset.from_tensor_slices((test_x, test_y)).batch(64, drop_remainder=True)
# The model here is the SimpleRNN (MyRNN) defined above; see the note at the end
# about choosing only one of the three cases.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
history = model.fit(train_dataset, epochs=20, validation_data=test_dataset, verbose=2)
model.summary()
The results:
When the RNN is applied to IMDB, train_acc and val_acc are both close to 77.3%, i.e. there is no overfitting, which suggests that the training and test sets come from roughly the same distribution. This accuracy is also higher than on MR, so it may well be that the train/test assignment used for MR here is indeed not well chosen.
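This post checks reason (2) indirectly via IMDB; an alternative check on MR itself (not done here) would be to shuffle the pooled data before splitting instead of taking the first 4000 samples of each class. A minimal sketch, assuming scikit-learn is available (train_test_split and the variable names below are not part of the original code):

from sklearn.model_selection import train_test_split

# Pool both classes, then draw a stratified random 75/25 split.
all_x = np.append(x_pos, x_neg, axis=0)
all_y = np.append(pos_label, neg_label)
rand_train_x, rand_test_x, rand_train_y, rand_test_y = train_test_split(
    all_x, all_y, test_size=0.25, stratify=all_y, random_state=42)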
The code in this post is split into modules, one per step; to use it, simply stack the modules in order (but note that of the three cases, TextCNN, SimpleRNN, and the IMDB dataset, only one should be chosen).
This is an original blog post (https://editor.csdn.net/md?articleId=109670809); please credit the source when reposting. The post will be updated from time to time, and comments and corrections are welcome.