【Reference: NLP 从入门到精通(全六讲)_哔哩哔哩_bilibili】
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer

sentences = [
    'I love my dog',
    'I love my cat'
]

# num_words: the maximum number of words to keep
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)  # update the internal vocabulary from the list of texts

word_index = tokenizer.word_index   # word -> index
print(word_index)
print(tokenizer.index_word)         # index -> word
{'i': 1, 'love': 2, 'my': 3, 'dog': 4, 'cat': 5}
{1: 'i', 2: 'love', 3: 'my', 4: 'dog', 5: 'cat'}
word_index maps each word to an integer index; it acts as the word dictionary.
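As a quick check of the two mappings, continuing from the tokenizer fitted above and using the indices shown in the output:

print(word_index['dog'])        # 4
print(tokenizer.index_word[4])  # 'dog'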
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer

sentences = [
    'I love my dog',
    'I love my cat',
    'You love my dog!',
    'Do you think my dog is amazing?',
]

# num_words: the maximum number of words to keep
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)  # update the internal vocabulary from the list of texts

word_index = tokenizer.word_index  # word -> index
print(word_index)

sequences = tokenizer.texts_to_sequences(sentences)  # convert each text to a sequence of integers
print(sequences)
{'my': 1, 'love': 2, 'dog': 3, 'i': 4, 'you': 5,
'cat': 6, 'do': 7, 'think': 8, 'is': 9, 'amazing': 10}
[[4, 2, 1, 3], [4, 2, 1, 6],
[5, 2, 1, 3], [7, 5, 8, 1, 3, 9, 10]]
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

sentences = [
    'I love my dog',
    'I love my cat',
    'You love my dog!',
    'Do you think my dog is amazing?',
]

# num_words: the maximum number of words to keep
tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')  # unknown words are replaced with <OOV>
tokenizer.fit_on_texts(sentences)  # update the internal vocabulary from the list of texts

word_index = tokenizer.word_index  # word -> index
print(word_index)

sequences = tokenizer.texts_to_sequences(sentences)  # convert each text to a sequence of integers
print(sequences)

pad_seq = pad_sequences(sequences)  # pad to the same length; by default pads at the front, up to the longest text
print(pad_seq)

test_data = [
    'i really love my dog',
    'my dog loves my manatee '
]
test_sequences = tokenizer.texts_to_sequences(test_data)  # unseen words map to the <OOV> index
print(test_sequences)
{'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6,
'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}
[[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]
[[ 0 0 0 5 3 2 4]
[ 0 0 0 5 3 2 7]
[ 0 0 0 6 3 2 4]
[ 8 6 9 2 4 10 11]]
[[5, 1, 3, 2, 4], [2, 4, 1, 2, 1]]
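pad_sequences also accepts padding, maxlen and truncating arguments, which the sarcasm example further down relies on. A minimal sketch continuing from the sequences above (the maxlen value of 5 is only for illustration):

padded_post  = pad_sequences(sequences, padding='post')                # zeros at the end instead of the front
padded_fixed = pad_sequences(sequences, maxlen=5)                      # force length 5; longer sequences are cut at the front by default
padded_trunc = pad_sequences(sequences, maxlen=5, truncating='post')   # cut longer sequences at the end instead
print(padded_post)
print(padded_fixed)
print(padded_trunc)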
JSON file: https://www.kaggle.com/rmisra/news-headlines-dataset-for-sarcasm-detection/version/2
Only the headline text is used here, not the full article content.
sarcastic: ironic or mocking; the is_sarcastic field flags such headlines.
{"article_link": "https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5",
"headline": "former versace store clerk sues over secret 'black code' for minority shoppers",
"is_sarcastic": 0}
import json

# the Kaggle file stores one JSON object per line, so parse it line by line
f = open('Sarcasm_Headlines_Dataset.json')
datastore = []
for line in f:
    datastore.append(json.loads(line))
f.close()
# print(datastore[0])
sentences = []
labels = []
urls = []
for item in datastore:
    sentences.append(item['headline'])
    labels.append(item['is_sarcastic'])
    urls.append(item['article_link'])

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, padding='post')  # pad to the same length, zeros at the end
print(padded[0])
print(padded.shape)
[ 308 15115 679 3337 2298 48 382 2576 15116 6 2577 8434
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0]
(26709, 40)
import json
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

vocab_size = 10000
embedding_dim = 16
max_length = 100
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<OOV>"
training_size = 20000

# download the dataset (shell command, run in a Colab/notebook cell)
!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json \
    -O /tmp/sarcasm.json

with open("/tmp/sarcasm.json", 'r') as f:
    datastore = json.load(f)

sentences = []
labels = []
for item in datastore:
    sentences.append(item['headline'])
    labels.append(item['is_sarcastic'])

# split into training and testing sets
training_sentences = sentences[0:training_size]
testing_sentences = sentences[training_size:]
training_labels = labels[0:training_size]
testing_labels = labels[training_size:]

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index  # word dictionary

training_sequences = tokenizer.texts_to_sequences(training_sentences)  # integer sequences
training_padded = pad_sequences(training_sequences, maxlen=max_length,
                                padding=padding_type, truncating=trunc_type)  # padding

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length,
                               padding=padding_type, truncating=trunc_type)

# Need this block to get it to work with TensorFlow 2.x
import numpy as np
training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
Model: "sequential" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding (Embedding) (None, 100, 16) 160000 global_average_pooling1d (G (None, 16) 0 lobalAveragePooling1D) dense (Dense) (None, 24) 408 dense_1 (Dense) (None, 1) 25 ================================================================= Total params: 160,433 Trainable params: 160,433 Non-trainable params: 0 _________________________________________________________________
num_epochs = 30
history = model.fit(training_padded, training_labels, epochs=num_epochs, validation_data=(testing_padded, testing_labels), verbose=2)
Epoch 1/30
625/625 - 7s - loss: 0.6809 - accuracy: 0.5590 - val_loss: 0.6565 - val_accuracy: 0.5725 - 7s/epoch - 11ms/step
Epoch 2/30
625/625 - 3s - loss: 0.5159 - accuracy: 0.7788 - val_loss: 0.4191 - val_accuracy: 0.8261 - 3s/epoch - 5ms/step
Epoch 3/30
625/625 - 3s - loss: 0.3471 - accuracy: 0.8623 - val_loss: 0.3661 - val_accuracy: 0.8468 - 3s/epoch - 5ms/step
Epoch 4/30
625/625 - 3s - loss: 0.2879 - accuracy: 0.8871 - val_loss: 0.3491 - val_accuracy: 0.8554 - 3s/epoch - 5ms/step
Epoch 5/30
625/625 - 3s - loss: 0.2500 - accuracy: 0.9036 - val_loss: 0.3450 - val_accuracy: 0.8562 - 3s/epoch - 5ms/step
...
Epoch 25/30
625/625 - 3s - loss: 0.0453 - accuracy: 0.9868 - val_loss: 0.7406 - val_accuracy: 0.8275 - 3s/epoch - 5ms/step
Epoch 26/30
625/625 - 3s - loss: 0.0421 - accuracy: 0.9882 - val_loss: 0.7737 - val_accuracy: 0.8214 - 3s/epoch - 5ms/step
Epoch 27/30
625/625 - 3s - loss: 0.0395 - accuracy: 0.9887 - val_loss: 0.7965 - val_accuracy: 0.8243 - 3s/epoch - 5ms/step
Epoch 28/30
625/625 - 3s - loss: 0.0366 - accuracy: 0.9898 - val_loss: 0.8316 - val_accuracy: 0.8246 - 3s/epoch - 5ms/step
Epoch 29/30
625/625 - 3s - loss: 0.0341 - accuracy: 0.9903 - val_loss: 0.8655 - val_accuracy: 0.8177 - 3s/epoch - 5ms/step
Epoch 30/30
625/625 - 3s - loss: 0.0317 - accuracy: 0.9913 - val_loss: 0.9000 - val_accuracy: 0.8162 - 3s/epoch - 5ms/step
import matplotlib.pyplot as plt
def plot_graphs(history, string):
    plt.plot(history.history[string])
    plt.plot(history.history['val_' + string])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, 'val_' + string])
    plt.show()
plot_graphs(history, "accuracy")
plot_graphs(history, "loss")
sentence = ["granny starting to fear spiders in the garden might be real", "game of thrones season finale showing this sunday night"]
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
print(model.predict(padded))
[[9.1123056e-01]
[1.8805741e-04]]
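The model outputs a sigmoid probability that a headline is sarcastic; one common way to turn that into a label, continuing with the sentence and padded variables above (the 0.5 threshold is a convention, not part of the model):

predictions = model.predict(padded)
for text, prob in zip(sentence, predictions):
    print(text, '->', 'sarcastic' if prob[0] > 0.5 else 'not sarcastic')  # here ~0.91 vs ~0.0002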
Save the vocabulary and embedding vectors to text files.
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
def decode_sentence(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])

print(decode_sentence(training_padded[0]))
print(training_sentences[0])
print(training_labels[0])
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)
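Each row of weights is the learned 16-dimensional vector for one word. A minimal sketch, looking up a word that the meta.tsv excerpt further down shows is in the vocabulary:

print(weights[word_index['trump']])  # 16-dimensional embedding vector for 'trump'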
import io
out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')
for word_num in range(1, vocab_size):
    word = reverse_word_index[word_num]
    embeddings = weights[word_num]
    out_m.write(word + "\n")
    out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")
out_v.close()
out_m.close()
try:
    from google.colab import files
except ImportError:
    pass
else:
    files.download('vecs.tsv')
    files.download('meta.tsv')
vecs.tsv
0.035059918 0.06969988 -0.046476096 0.033137582 0.06961141 0.05123605 -0.069506176 -0.12660429 0.041010037 0.0003397553 -0.039760135 0.049228124 -0.014847669 -0.022333661 -0.027154364 -0.025999745
0.13617258 0.07031224 -0.04279314 0.15561616 0.14426595 -0.022535648 -0.16350295 -0.21112327 0.018741624 -0.033655755 -0.020546112 0.17472167 -0.03205964 -0.16326663 0.07432485 -0.1281586
0.15122083 0.19668455
meta.tsv
<OOV> to of the in for a on and with is new trump man from at about you by this after be
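These two files match the input format of the TensorFlow Embedding Projector (https://projector.tensorflow.org): load vecs.tsv as the vector file and meta.tsv as the metadata file to browse the learned word embeddings visually.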