
Deep Learning with Python 6.1.3 Putting It All Together: From Raw Text to Word Embeddings


1. Processing the labels of the raw IMDB data

Inside each neg/pos directory sits a pile of .txt files, one review per file.

Below, we read the train reviews into a list of strings, one string per review, and append the corresponding label to a labels list (0 for neg, 1 for pos).
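
For orientation, the unzipped aclImdb tree looks roughly like this (each train class directory holds 12,500 reviews):

```
aclImdb/
├── train/
│   ├── neg/    # 12,500 negative reviews, one .txt file each
│   └── pos/    # 12,500 positive reviews
└── test/
    ├── neg/
    └── pos/
```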

```python
# Process the labels of the raw IMDB data
import os

imdb_dir = 'D:\\2022Thesis\\Deep Learning with Python\\Code\\aclImdb\\aclImdb'
train_dir = os.path.join(imdb_dir, 'train')

texts = []
labels = []
for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    for fname in os.listdir(dir_name):
        if fname.endswith('.txt'):
            with open(os.path.join(dir_name, fname), encoding='utf-8') as f:
                texts.append(f.read())
            labels.append(0 if label_type == 'neg' else 1)
```

A quick check of what we loaded:

```python
print(labels[0], texts[0], sep=' --> ')
print(labels[-1], texts[-1], sep=' --> ')
print(len(texts), len(labels))
```

2. Tokenizing the data


Now we tokenize the text and split it into a training set and a validation set. To really feel the benefit of pretrained word embeddings, we also shrink the training set, keeping only 200 samples for training.

```python
# Tokenize the raw IMDB text
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

maxlen = 100                 # look at only the first 100 words of each review
training_samples = 200
validation_samples = 10000
max_words = 10000

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print(f'Found {len(word_index)} unique tokens.')

data = pad_sequences(sequences, maxlen=maxlen)
labels = np.asarray(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# Shuffle the data
indices = np.arange(labels.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

# Split into training and validation sets
x_train = data[:training_samples]
y_train = labels[:training_samples]
x_val = data[training_samples: training_samples + validation_samples]
y_val = labels[training_samples: training_samples + validation_samples]
```
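
If you have not used Tokenizer before, here is a minimal toy illustration (not from the book) of what fit_on_texts and texts_to_sequences produce; indices are frequency ranks, so the most common word maps to 1:

```python
from tensorflow.keras.preprocessing.text import Tokenizer

toy = Tokenizer(num_words=10)
toy.fit_on_texts(['the cat sat', 'the cat ate the fish'])
print(toy.word_index)   # {'the': 1, 'cat': 2, 'sat': 3, 'ate': 4, 'fish': 5}
print(toy.texts_to_sequences(['the fish sat']))  # [[1, 5, 3]]
```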

3. Downloading the GloVe word embeddings


Download the pretrained GloVe word embeddings from http://nlp.stanford.edu/data/glove.6B.zip.

Next, unzip it. Inside, saved as plain text, are pretrained 100-dimensional embedding vectors for 400,000 tokens.
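
Each line of glove.6B.100d.txt is a token followed by its 100 coordinates, separated by spaces. A quick peek confirms the format (a sketch; adjust the path to wherever you unzipped the archive):

```python
with open('glove.6B/glove.6B.100d.txt', encoding='utf-8') as f:
    first = f.readline().split()
print(first[0], len(first) - 1)  # e.g. 'the' 100
```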

4. Preprocessing the embeddings

```python
# Parse the GloVe word-embeddings file
glove_dir = 'D:/2022Thesis/Deep Learning with Python/code/glove.6B'

embeddings_index = {}
with open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print(f'Found {len(embeddings_index)} word vectors.')
```

Preparing the GloVe word-embedding matrix

Next, we build an embedding matrix that can be loaded into an Embedding layer. Its shape is (max_words, embedding_dim).

```python
embedding_dim = 100

embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i < max_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words absent from embeddings_index keep their all-zero rows
            embedding_matrix[i] = embedding_vector
print(embedding_matrix)
```
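
As a quick sanity check (not in the book), you can count how many of the top max_words tokens actually received a pretrained vector; the rest keep their all-zero rows:

```python
hits = sum(1 for word, i in word_index.items()
           if i < max_words and word in embeddings_index)
print(f'{hits} of {max_words} tokens covered by GloVe.')
```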

 

5. Defining the model

```python
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense

model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()
```
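
As a sanity check on model.summary(): the Embedding layer holds max_words × embedding_dim = 10,000 × 100 = 1,000,000 weights, Flatten emits a vector of length maxlen × embedding_dim = 100 × 100 = 10,000, the Dense(32) layer adds 10,000 × 32 + 32 = 320,032 parameters, and the final Dense(1) adds 33, for 1,320,065 in total.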

6. Loading the GloVe embeddings into the model

```python
model.layers[0].set_weights([embedding_matrix])  # load the pretrained GloVe matrix
model.layers[0].trainable = False                # freeze the Embedding layer
```
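
An equivalent construction (a sketch of an alternative, not what the book does) passes the matrix at layer-creation time via the Constant initializer, so the model never starts from random embedding weights:

```python
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Embedding

frozen_embedding = Embedding(max_words, embedding_dim,
                             embeddings_initializer=Constant(embedding_matrix),
                             trainable=False,  # keep the GloVe vectors frozen
                             input_length=maxlen)
```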

7. Training and evaluating the model

```python
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])
history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_data=(x_val, y_val))
model.save_weights('pre_trained_glove_model.h5')
```

8. Plotting the results

```python
# Plot the results
import matplotlib.pyplot as plt

acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)

plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()

plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()
```

With only 200 training samples the task is still very hard, but the pretrained word embeddings nevertheless give decent results. For comparison, let's see what happens without pretraining:

9. Training the same model without pretrained word embeddings

```python
# Build the same model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense

model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()

# Train without loading the GloVe embeddings
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])
history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_data=(x_val, y_val))

# Plot the results
import matplotlib.pyplot as plt

acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)

plt.plot(epochs, acc, 'bo-', label='Training acc')
plt.plot(epochs, val_acc, 'rs-', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()

plt.figure()
plt.plot(epochs, loss, 'bo-', label='Training loss')
plt.plot(epochs, val_loss, 'rs-', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()
```

 

 

10. Tokenizing the test-set data

```python
# Tokenize the test-set data
test_dir = os.path.join(imdb_dir, 'test')

texts = []
labels = []
for label_type in ['neg', 'pos']:
    dir_name = os.path.join(test_dir, label_type)
    for fname in sorted(os.listdir(dir_name)):
        if fname.endswith('.txt'):
            with open(os.path.join(dir_name, fname), encoding='utf-8') as f:
                texts.append(f.read())
            labels.append(0 if label_type == 'neg' else 1)

# Reuse the tokenizer that was fitted on the training data
sequences = tokenizer.texts_to_sequences(texts)
x_test = pad_sequences(sequences, maxlen=maxlen)
y_test = np.asarray(labels)
```

11. Evaluating the model on the test set

```python
model.load_weights('pre_trained_glove_model.h5')  # restore the GloVe-based weights
model.evaluate(x_test, y_test)
```
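
To score a brand-new review, push it through the same tokenizer and padding before calling the model; a minimal sketch (the review string is made up):

```python
review = ['This movie was a wonderful surprise, I loved every minute of it.']
seq = pad_sequences(tokenizer.texts_to_sequences(review), maxlen=maxlen)
prob = model.predict(seq)[0, 0]   # sigmoid output: estimated probability of 'pos'
print('positive' if prob > 0.5 else 'negative', prob)
```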

 

 
