Inside each neg/pos directory is simply a pile of .txt files, one review per file.
Next, we read the train reviews into a list of strings, one string per review, and record the corresponding label (neg → 0, pos → 1) in a labels list.
```python
# Process the labels of the raw IMDB data

import os

imdb_dir = 'D:\\2022Thesis\\Deep Learning with Python\\Code\\aclImdb\\aclImdb'
train_dir = os.path.join(imdb_dir, 'train')

texts = []
labels = []

for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    for fname in os.listdir(dir_name):
        if fname.endswith('.txt'):
            with open(os.path.join(dir_name, fname), encoding='utf-8') as f:
                texts.append(f.read())
            labels.append(0 if label_type == 'neg' else 1)

print(labels[0], texts[0], sep=' --> ')
print(labels[-1], texts[-1], sep=' --> ')
print(len(texts), len(labels))
```
Now let's tokenize the text and, while we're at it, split off a validation set. To really feel the benefit of pretrained word embeddings, we also shrink the training set: only 200 samples are kept for training.
```python
# Tokenize the raw IMDB text

import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

maxlen = 100                # cap each review at 100 tokens (pad_sequences keeps the last 100 by default)
training_samples = 200      # train on only 200 samples
validation_samples = 10000  # validate on 10,000 samples
max_words = 10000           # consider only the top 10,000 words in the dataset

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print(f'Found {len(word_index)} unique tokens.')

data = pad_sequences(sequences, maxlen=maxlen)

labels = np.asarray(labels)

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# Shuffle the data (the raw files are ordered: all negative first, then all positive)
indices = np.arange(labels.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

# Split into training and validation sets
x_train = data[:training_samples]
y_train = labels[:training_samples]
x_val = data[training_samples: training_samples + validation_samples]
y_val = labels[training_samples: training_samples + validation_samples]
```
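As a quick sanity check, you can decode a padded review back into words by inverting word_index. This snippet is an addition, not part of the original listing; index 0 is the padding value and has no word:

```python
# Sanity-check sketch: decode the first padded review back into words.
reverse_word_index = {i: w for w, i in word_index.items()}
decoded = ' '.join(reverse_word_index.get(idx, '?') for idx in data[0] if idx != 0)
print(decoded)
```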

Download the pretrained GloVe word embeddings: http://nlp.stanford.edu/data/glove.6B.zip
Next, unzip it. The archive stores, as plain text, pretrained embedding vectors for 400,000 tokens; we will use the 100-dimensional version.
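Each line of the unzipped file is a token followed by its vector components, all space-separated. Here is a minimal sketch to peek at the format (the relative path is an assumption; point it at wherever you unzipped the archive):

```python
# Peek at one line of the file: "token v1 v2 ... v100"
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    parts = f.readline().split()
print(parts[0], len(parts) - 1)  # a token and its dimensionality (expect 100)
```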
```python
# Parse the GloVe word-embedding file
glove_dir = 'D:/2022Thesis/Deep Learning with Python/code/glove.6B'

embeddings_index = {}

with open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print(f'Found {len(embeddings_index)} word vectors.')
```
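As another sanity check (an addition of mine, assuming embeddings_index was built as above), the cosine similarity between related words should come out noticeably higher than between unrelated ones:

```python
# Compare cosine similarities of a few word vectors.
def cosine(u, v):
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

print(cosine(embeddings_index['good'], embeddings_index['great']))     # related pair
print(cosine(embeddings_index['good'], embeddings_index['keyboard']))  # unrelated pair
```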
Next, we build an embedding matrix that can be loaded into the Embedding layer. Its shape is (max_words, embedding_dim).
```python
embedding_dim = 100

embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i < max_words:
        embedding_vector = embeddings_index.get(word)  # use the GloVe vector if one exists
        if embedding_vector is not None:               # words not in GloVe keep an all-zero row
            embedding_matrix[i] = embedding_vector

print(embedding_matrix)
```
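Words missing from GloVe keep all-zero rows. A quick sketch (my addition) to measure coverage of our vocabulary; note that word indices start at 1, since 0 is reserved for padding:

```python
# How many of the top max_words tokens actually got a GloVe vector?
hits = sum(1 for word, i in word_index.items()
           if i < max_words and word in embeddings_index)
print(f'{hits} / {max_words - 1} of the top words have a GloVe vector')
```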
Now build the model:

```python
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense

model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.summary()
```
Load the prepared GloVe matrix into the Embedding layer and freeze it, so that training does not destroy the pretrained representations. Then compile, train, and save the weights:

```python
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])

history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_data=(x_val, y_val))
model.save_weights('pre_trained_glove_model.h5')
```
```python
# Plot the results

import matplotlib.pyplot as plt

acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()

plt.figure()

plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()
```

With only 200 training samples the task is still very hard, but the pretrained embeddings give decent results anyway. For comparison, here is what happens without pretraining:
```python
# Build the same model, but without GloVe embeddings

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense

model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.summary()

# Train (the Embedding layer is learned from scratch this time)

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])
history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_data=(x_val, y_val))

# Plot the results

import matplotlib.pyplot as plt

acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

plt.plot(epochs, acc, 'bo-', label='Training acc')
plt.plot(epochs, val_acc, 'rs-', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()

plt.figure()

plt.plot(epochs, loss, 'bo-', label='Training loss')
plt.plot(epochs, val_loss, 'rs-', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()
```

Finally, tokenize the test data with the same tokenizer:

```python
# Tokenize the test data

test_dir = os.path.join(imdb_dir, 'test')

texts = []
labels = []

for label_type in ['neg', 'pos']:
    dir_name = os.path.join(test_dir, label_type)
    for fname in sorted(os.listdir(dir_name)):
        if fname.endswith('.txt'):
            with open(os.path.join(dir_name, fname), encoding='utf-8') as f:
                texts.append(f.read())
            labels.append(0 if label_type == 'neg' else 1)

sequences = tokenizer.texts_to_sequences(texts)
x_test = pad_sequences(sequences, maxlen=maxlen)
y_test = np.asarray(labels)
```

Load the weights of the GloVe-based model saved earlier (the architectures match, so this works) and evaluate on the test set:

```python
model.load_weights('pre_trained_glove_model.h5')
model.evaluate(x_test, y_test)
```
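With the fitted tokenizer and trained model in hand, classifying a brand-new review takes only a few lines. A minimal sketch (the sample sentence is made up):

```python
# Classify a new raw review with the loaded model.
sample = ["This movie was a wonderful surprise, I loved every minute of it."]
seq = tokenizer.texts_to_sequences(sample)
padded = pad_sequences(seq, maxlen=maxlen)
prob = float(model.predict(padded)[0][0])
print('positive' if prob > 0.5 else 'negative', prob)
```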