【Reference: NLP 从入门到精通(全六讲)_哔哩哔哩_bilibili】
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer

sentences = [
    'I love my dog',
    'I love my cat'
]

# num_words: the maximum number of words to keep
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)  # update the internal vocabulary from the list of texts

word_index = tokenizer.word_index   # word -> index
print(word_index)
print(tokenizer.index_word)         # index -> word
{'i': 1, 'love': 2, 'my': 3, 'dog': 4, 'cat': 5}
{1: 'i', 2: 'love', 3: 'my', 4: 'dog', 5: 'cat'}
word_index maps each word to an integer index; it acts as the word dictionary.
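As a quick check of the two mappings, continuing from the tokenizer fitted above and using the indices shown in the output:

print(word_index['dog'])        # 4
print(tokenizer.index_word[4])  # 'dog'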
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer

sentences = [
    'I love my dog',
    'I love my cat',
    'You love my dog!',
    'Do you think my dog is amazing?',
]

# num_words: the maximum number of words to keep
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)  # update the internal vocabulary from the list of texts

word_index = tokenizer.word_index  # word -> index
print(word_index)

sequences = tokenizer.texts_to_sequences(sentences)  # convert each text to a sequence of integers
print(sequences)
{'my': 1, 'love': 2, 'dog': 3, 'i': 4, 'you': 5,
'cat': 6, 'do': 7, 'think': 8, 'is': 9, 'amazing': 10}
[[4, 2, 1, 3], [4, 2, 1, 6],
[5, 2, 1, 3], [7, 5, 8, 1, 3, 9, 10]]
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

sentences = [
    'I love my dog',
    'I love my cat',
    'You love my dog!',
    'Do you think my dog is amazing?',
]

# num_words: the maximum number of words to keep
tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')  # unknown words are replaced with <OOV>
tokenizer.fit_on_texts(sentences)  # update the internal vocabulary from the list of texts

word_index = tokenizer.word_index  # word -> index
print(word_index)

sequences = tokenizer.texts_to_sequences(sentences)  # convert each text to a sequence of integers
print(sequences)

pad_seq = pad_sequences(sequences)  # pad to the same length; by default pads at the front, up to the longest text
print(pad_seq)

test_data = [
    'i really love my dog',
    'my dog loves my manatee '
]
test_sequences = tokenizer.texts_to_sequences(test_data)  # unseen words map to the <OOV> index
print(test_sequences)
{'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6,
'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}
[[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]
[[ 0 0 0 5 3 2 4]
[ 0 0 0 5 3 2 7]
[ 0 0 0 6 3 2 4]
[ 8 6 9 2 4 10 11]]
[[5, 1, 3, 2, 4], [2, 4, 1, 2, 1]]
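pad_sequences also accepts padding, maxlen and truncating arguments, which the sarcasm example further down relies on. A minimal sketch continuing from the sequences above (the maxlen value of 5 is only for illustration):

padded_post  = pad_sequences(sequences, padding='post')                # zeros at the end instead of the front
padded_fixed = pad_sequences(sequences, maxlen=5)                      # force length 5; longer sequences are cut at the front by default
padded_trunc = pad_sequences(sequences, maxlen=5, truncating='post')   # cut longer sequences at the end instead
print(padded_post)
print(padded_fixed)
print(padded_trunc)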
JSON file: https://www.kaggle.com/rmisra/news-headlines-dataset-for-sarcasm-detection/version/2
Only the headline text is used here, not the full article content.
sarcastic: ironic or mocking; the is_sarcastic field flags such headlines.
{"article_link": "https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5",
"headline": "former versace store clerk sues over secret 'black code' for minority shoppers",
"is_sarcastic": 0}
import json

# the Kaggle file stores one JSON object per line, so parse it line by line
f = open('Sarcasm_Headlines_Dataset.json')
datastore = []
for line in f:
    datastore.append(json.loads(line))
f.close()
# print(datastore[0])
sentences = []
labels = []
urls = []
for item in datastore:
    sentences.append(item['headline'])
    labels.append(item['is_sarcastic'])
    urls.append(item['article_link'])

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, padding='post')  # pad to the same length, zeros at the end
print(padded[0])
print(padded.shape)
[ 308 15115 679 3337 2298 48 382 2576 15116 6 2577 8434
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0]
(26709, 40)
import json
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

vocab_size = 10000
embedding_dim = 16
max_length = 100
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<OOV>"
training_size = 20000

# download the dataset (shell command, run in a Colab/notebook cell)
!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json \
    -O /tmp/sarcasm.json

with open("/tmp/sarcasm.json", 'r') as f:
    datastore = json.load(f)

sentences = []
labels = []
for item in datastore:
    sentences.append(item['headline'])
    labels.append(item['is_sarcastic'])

# split into training and testing sets
training_sentences = sentences[0:training_size]
testing_sentences = sentences[training_size:]
training_labels = labels[0:training_size]
testing_labels = labels[training_size:]

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index  # word dictionary

training_sequences = tokenizer.texts_to_sequences(training_sentences)  # integer sequences
training_padded = pad_sequences(training_sequences, maxlen=max_length,
                                padding=padding_type, truncating=trunc_type)  # padding

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length,
                               padding=padding_type, truncating=trunc_type)

# Need this block to get it to work with TensorFlow 2.x
import numpy as np
training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
Model: "sequential" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding (Embedding) (None, 100, 16) 160000 global_average_pooling1d (G (None, 16) 0 lobalAveragePooling1D) dense (Dense) (None, 24) 408 dense_1 (Dense) (None, 1) 25 ================================================================= Total params: 160,433 Trainable params: 160,433 Non-trainable params: 0 _________________________________________________________________
num_epochs = 30
history = model.fit(training_padded, training_labels, epochs=num_epochs, validation_data=(testing_padded, testing_labels), verbose=2)
Epoch 1/30
625/625 - 7s - loss: 0.6809 - accuracy: 0.5590 - val_loss: 0.6565 - val_accuracy: 0.5725 - 7s/epoch - 11ms/step
Epoch 2/30
625/625 - 3s - loss: 0.5159 - accuracy: 0.7788 - val_loss: 0.4191 - val_accuracy: 0.8261 - 3s/epoch - 5ms/step
Epoch 3/30
625/625 - 3s - loss: 0.3471 - accuracy: 0.8623 - val_loss: 0.3661 - val_accuracy: 0.8468 - 3s/epoch - 5ms/step
Epoch 4/30
625/625 - 3s - loss: 0.2879 - accuracy: 0.8871 - val_loss: 0.3491 - val_accuracy: 0.8554 - 3s/epoch - 5ms/step
Epoch 5/30
625/625 - 3s - loss: 0.2500 - accuracy: 0.9036 - val_loss: 0.3450 - val_accuracy: 0.8562 - 3s/epoch - 5ms/step
...
Epoch 25/30
625/625 - 3s - loss: 0.0453 - accuracy: 0.9868 - val_loss: 0.7406 - val_accuracy: 0.8275 - 3s/epoch - 5ms/step
Epoch 26/30
625/625 - 3s - loss: 0.0421 - accuracy: 0.9882 - val_loss: 0.7737 - val_accuracy: 0.8214 - 3s/epoch - 5ms/step
Epoch 27/30
625/625 - 3s - loss: 0.0395 - accuracy: 0.9887 - val_loss: 0.7965 - val_accuracy: 0.8243 - 3s/epoch - 5ms/step
Epoch 28/30
625/625 - 3s - loss: 0.0366 - accuracy: 0.9898 - val_loss: 0.8316 - val_accuracy: 0.8246 - 3s/epoch - 5ms/step
Epoch 29/30
625/625 - 3s - loss: 0.0341 - accuracy: 0.9903 - val_loss: 0.8655 - val_accuracy: 0.8177 - 3s/epoch - 5ms/step
Epoch 30/30
625/625 - 3s - loss: 0.0317 - accuracy: 0.9913 - val_loss: 0.9000 - val_accuracy: 0.8162 - 3s/epoch - 5ms/step
import matplotlib.pyplot as plt
def plot_graphs(history, string):
    plt.plot(history.history[string])
    plt.plot(history.history['val_' + string])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, 'val_' + string])
    plt.show()
plot_graphs(history, "accuracy")
plot_graphs(history, "loss")
sentence = ["granny starting to fear spiders in the garden might be real", "game of thrones season finale showing this sunday night"]
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
print(model.predict(padded))
[[9.1123056e-01]
[1.8805741e-04]]
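The model outputs a sigmoid probability that a headline is sarcastic; one common way to turn that into a label, continuing with the sentence and padded variables above (the 0.5 threshold is a convention, not part of the model):

predictions = model.predict(padded)
for text, prob in zip(sentence, predictions):
    print(text, '->', 'sarcastic' if prob[0] > 0.5 else 'not sarcastic')  # here ~0.91 vs ~0.0002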
Save the vocabulary and embedding vectors to text files.
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
def decode_sentence(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])

print(decode_sentence(training_padded[0]))
print(training_sentences[0])
print(training_labels[0])
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)
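Each row of weights is the learned 16-dimensional vector for one word. A minimal sketch, looking up a word that the meta.tsv excerpt further down shows is in the vocabulary:

print(weights[word_index['trump']])  # 16-dimensional embedding vector for 'trump'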
import io
out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')
for word_num in range(1, vocab_size):
    word = reverse_word_index[word_num]
    embeddings = weights[word_num]
    out_m.write(word + "\n")
    out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")
out_v.close()
out_m.close()
try:
    from google.colab import files
except ImportError:
    pass
else:
    files.download('vecs.tsv')
    files.download('meta.tsv')
vecs.tsv
0.035059918 0.06969988 -0.046476096 0.033137582 0.06961141 0.05123605 -0.069506176 -0.12660429 0.041010037 0.0003397553 -0.039760135 0.049228124 -0.014847669 -0.022333661 -0.027154364 -0.025999745
0.13617258 0.07031224 -0.04279314 0.15561616 0.14426595 -0.022535648 -0.16350295 -0.21112327 0.018741624 -0.033655755 -0.020546112 0.17472167 -0.03205964 -0.16326663 0.07432485 -0.1281586
0.15122083 0.19668455
meta.tsv
<OOV> to of the in for a on and with is new trump man from at about you by this after be
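These two files match the input format of the TensorFlow Embedding Projector (https://projector.tensorflow.org): load vecs.tsv as the vector file and meta.tsv as the metadata file to browse the learned word embeddings visually.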