
NLP - LSTM Text Prediction (Multi-class Classification)


Project Description

The code is adapted from 唐国梁Tommy's tutorial "12-01 轻松学 PyTorch: LSTM 文本生成_字符级" (character-level LSTM text generation).


Code Implementation

Key points:

1. LSTM layer input format --> (batch_size, sequence_length, number_features)
Parameters:

  • batch_size : number of sequences per batch
  • sequence_length : length of each sequence (number of time steps)
  • number_features : number of features per time step

2. LSTM layer output format --> (batch_size, sequence_length, hidden_size)
Parameters:

  • batch_size : number of sequences per batch
  • sequence_length : length of each sequence
  • hidden_size : number of nodes in the hidden layer

3. Linear layer input format --> (batches, n_hidden)
Parameters:

  • batches : number of rows after flattening, i.e. batch_size * sequence_length
  • n_hidden : number of nodes in the hidden layer

Walkthrough

  • input_size = 1 # number of features
  • hidden_size = 100 # 100 hidden nodes in the LSTM layer
  • n_layers = 2 # number of stacked LSTM layers
  • output_size = 1 # output size

  • lstm = nn.LSTM(input_size, hidden_size, n_layers, batch_first=True) # LSTM layer
  • linear = nn.Linear(hidden_size, 1) # fully connected layer

  • x = get_batches(data) # build the batched dataset, input format: (batch_size, seq_len, num_features)
  • x, h_s = lstm(x) # LSTM output : (batch_size, seq_len, hidden_size); the second return value is the hidden state
  • x = x.reshape(-1, hidden_size) # Linear in : (batch_size * seq_len, hidden_size)
  • x = linear(x) # linear out : (batch_size * seq_len, output_size)
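A minimal runnable sketch of this shape flow, using a random dummy tensor in place of real batches (the sizes here are illustrative only, not the ones used later in the article):

import torch
from torch import nn

input_size = 1      # number of features per time step
hidden_size = 100   # hidden units in the LSTM
n_layers = 2        # stacked LSTM layers

lstm = nn.LSTM(input_size, hidden_size, n_layers, batch_first=True)
linear = nn.Linear(hidden_size, 1)

x = torch.randn(32, 10, input_size)   # dummy batch: (batch_size=32, seq_len=10, num_features=1)
out, (h, c) = lstm(x)                 # out: (32, 10, 100), h/c: (n_layers, 32, 100)
out = out.reshape(-1, hidden_size)    # flatten to (32 * 10, 100) for the Linear layer
out = linear(out)                     # (32 * 10, 1)
print(out.shape)                      # torch.Size([320, 1])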

Text prediction (essentially a classification problem)

import torch
from torch import nn, optim
import torch.nn.functional as F

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns

from sklearn.preprocessing import OneHotEncoder, LabelEncoder

Define the network

class lstm_model(nn.Module):

    def __init__(self, vocab, hidden_size, num_layers, dropout=0.5):

        super(lstm_model, self).__init__()
        self.vocab = vocab # character vocabulary

        # index : character
        self.int_char = {i : char for i, char in enumerate(vocab)}
        # equivalent alternative: self.int_char = dict(enumerate(vocab))

        # character : index
        self.char_int = {char : i for i, char in self.int_char.items()}

        # One-hot encode the characters
        # vocab has to be reshaped to a column vector first
        # (on scikit-learn >= 1.2, use sparse_output=False instead of sparse=False)
        self.encoder = OneHotEncoder(sparse=False).fit(vocab.reshape(-1, 1))

        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # LSTM layer
        self.lstm = nn.LSTM(len(vocab), hidden_size, num_layers, batch_first=True, dropout=dropout)

        # Fully connected layer; the output is a score for every character in the vocabulary
        self.linear = nn.Linear(hidden_size, len(vocab))

    def forward(self, sequence, hs=None):
        # LSTM output format: (batch_size, sequence_length, hidden_size)
        out, hs = self.lstm(sequence, hs)

        # Reshape out to the Linear input format, i.e. (batch_size*sequence_length, hidden_size)
        out = out.reshape(-1, self.hidden_size)

        # Linear output format: (batch_size*sequence_length, vocab_size)
        output = self.linear(out)
        return output, hs

    def onehot_encode(self, data):
        return self.encoder.transform(data)

    def onehot_decode(self, data):
        return self.encoder.inverse_transform(data)

    def label_encode(self, data):
        return np.array([self.char_int[ch] for ch in data])

    def label_decode(self, data):
        return np.array([self.int_char[idx] for idx in data])
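As a quick sanity check of the encode/decode helpers, here is a minimal sketch on a toy vocabulary (the string "hello" and the small hidden size are made up for illustration, not the Anna Karenina vocabulary used below):

toy_vocab = np.array(sorted(set("hello")))   # ['e', 'h', 'l', 'o']
m = lstm_model(toy_vocab, hidden_size=8, num_layers=2)

onehot = m.onehot_encode(np.array(list("hell")).reshape(-1, 1))  # shape (4, 4)
print(m.onehot_decode(onehot).squeeze())     # ['h' 'e' 'l' 'l']

labels = m.label_encode("hell")              # [1 0 2 2]
print(m.label_decode(labels))                # ['h' 'e' 'l' 'l']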

Build the dataset

# Batch-generation helper that builds the new dataset
def get_batches(data, batch_size, seq_len):
    """
    Parameters
    -------------
    data : source data, input format (num_samples, num_features)
    batch_size : size of each batch
    seq_len : sequence length (number of time steps)
    
    return
    -------------
    the new dataset, format: (batch_size, seq_len, num_features)
    """
    # Number of columns, i.e. number of features (83 distinct characters in this example)
    num_features = data.shape[1] 
    #print("num_features : ", num_features)
    
    # Number of characters per batch (128 * 100 = 12800 characters in this example)
    num_chars = batch_size * seq_len 
    #print("num_chars : ", num_chars)
    
    # Number of batches (at most 124 batches for the text in this example)
    num_batches = int(np.floor(len(data) / num_chars)) 
    #print("num_batches : ", num_batches)
    
    # Total number of characters needed, given batch_size and num_batches (1587200 in this example)
    need_chars = num_batches * num_chars 
    #print("need_chars : ", need_chars)
    
    # Target data; note: the targets are the inputs shifted forward by one character
    targets = np.append(data[1:], data[0]).reshape(data.shape) 
    #print("targets shape ", targets.shape) # [1588179, 83]
    #print("data.shape : ", data.shape) # [1588179, 83]
    
    # Truncate the source data to the required number of characters (need_chars)
    inputs = data[:need_chars] 
    
    # Truncate the targets to the required number of characters (need_chars)
    targets = targets[:need_chars] 
    
    #print("inputs.shape : ", inputs.shape) # (1587200, 83)
    #print("targets.shape : ", targets.shape) # (1587200, 83)
    
    # Reshape
    inputs = inputs.reshape(batch_size, -1, num_features)
    targets = targets.reshape(batch_size, -1, num_features)
    #print("inputs reshape : ", inputs.shape) # (128, 12400, 83)
    #print("targets reshape : ", targets.shape)
    
    # Yield the new dataset batch by batch
    for i in range(0, inputs.shape[1], seq_len):
        x = inputs[:, i : i+seq_len]
        y = targets[:, i : i+seq_len]
        yield x, y

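To see what get_batches yields, a small sketch with random dummy one-hot rows standing in for the encoded text (the sizes are made up; only the shapes matter here):

dummy = np.eye(5)[np.random.randint(0, 5, size=1000)]   # (1000, 5) fake one-hot data
for x, y in get_batches(dummy, batch_size=4, seq_len=25):
    print(x.shape, y.shape)   # (4, 25, 5) (4, 25, 5)
    break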

Define the training function

def train(model, data, batch_size, seq_len, epochs, lr=0.01, valid=None):
    '''
    Parameters
    -----------
    model : the character-level network defined above
    data  : text data
    batch_size : number of sequences per batch
    seq_len : sequence length (number of time steps)
    epochs : number of training epochs
    lr : learning rate
    valid : validation data
    '''
    # Use cuda if available
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Move the model to the device
    model = model.to(device)
    # Optimizer
    optimizer = optim.Adam(model.parameters(), lr=lr)
    # Loss function
    criterion = nn.CrossEntropyLoss()
    
    # Check whether validation data was provided (i.e. validate while training)
    if valid is not None:
        data = model.onehot_encode(data.reshape(-1, 1))
        valid = model.onehot_encode(valid.reshape(-1, 1))
    else:
        data = model.onehot_encode(data.reshape(-1, 1))
        
    # Loss history
    train_loss = []
    val_loss = []
    
    # Training (and validation) loop
    for epoch in range(epochs):
        model.train()
        
        # hs is the LSTM hidden state (h, c); None lets PyTorch initialize it to zeros
        hs = None 
        train_ls = 0.0
        val_ls = 0.0
        
        for x, y in get_batches(data, batch_size, seq_len):
            #print("y.shape_1 : ", y.shape) # (128, 100, 83)
            # Each iteration yields one batch of data + targets
            optimizer.zero_grad() # reset gradients
            x = torch.tensor(x).float().to(device) # type conversion
            #print("x shape : ", x.shape) # torch.Size([128, 100, 83])
            
            # Forward pass; out shape: (batch_size * sequence_length, vocab_size)
            out, hs = model(x, hs)
            hs = ([h.data for h in hs]) # detach the hidden state so gradients do not flow across batches
            
            # Undo the one-hot encoding of the targets
            y = y.reshape(-1, len(model.vocab))
            #print("y.shape_2 : ", y.shape) # (12800, 83)
            
            y = model.onehot_decode(y)
            #print("y.shape_3 : ", y.shape) # (12800, 1)
            
            # Label-encode y into integer class indices
            y = model.label_encode(y.squeeze())
            #print("y.shape_4 : ", y.shape) # (12800,)
            
            # Type conversion
            y = torch.from_numpy(y).long().to(device)
            #print("y.shape_5 : ", y.shape) # torch.Size([12800])
            
            # Compute the loss
            loss = criterion(out, y.squeeze())
            
            # Backpropagation
            loss.backward()
            
            # Parameter update
            optimizer.step()
            
            # Accumulate the training loss
            train_ls += loss.item()
        
        
        if valid is not None:
            # Validation
            model.eval()
            hs = None
            with torch.no_grad():
                for x, y in get_batches(valid, batch_size, seq_len):
                    x = torch.tensor(x).float().to(device)
                    out, hs = model(x, hs) # forward pass
                    hs = ([h.data for h in hs])
                    
                    y = y.reshape(-1, len(model.vocab))
                    y = model.onehot_decode(y)
                    y = model.label_encode(y.squeeze())
                    y = torch.from_numpy(y).long().to(device)
                    
                    loss = criterion(out, y.squeeze())
                    val_ls += loss.item()
                    
                val_loss.append(np.mean(val_ls)) # record this epoch's accumulated validation loss
        
        # Record the training loss outside the valid branch, so it is tracked even without validation data
        train_loss.append(np.mean(train_ls))
            
        print(f'--------------Epochs{epochs} | {epoch}---------------')
        print(f'Train Loss : {train_loss[-1]}') # [-1] is the most recently appended value, i.e. this epoch's loss
        if val_loss:
            print(f'Val Loss : {val_loss[-1]}')
            
    # Plot the loss curves
    plt.plot(train_loss, label='Train Loss')
    plt.plot(val_loss, label='Val Loss')
    plt.title('Loss vs Epochs')
    plt.legend()
    plt.show()
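The target handling inside the loop may look roundabout: y arrives one-hot encoded, is decoded back to characters, and is then label-encoded into integer class indices, because nn.CrossEntropyLoss expects raw scores of shape (N, C) and integer class targets of shape (N,). A standalone sketch with dummy numbers (vocab size 5, N = 6), just to show the expected shapes:

criterion = nn.CrossEntropyLoss()
logits = torch.randn(6, 5)                   # model output: (batch_size * seq_len, vocab_size)
targets = torch.tensor([0, 3, 1, 4, 2, 2])   # integer class indices, shape (N,)
loss = criterion(logits, targets)
print(loss.item())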

Prepare the data

# Load the data
with open("anna.txt") as data:
    text = data.read()

# Collect the distinct characters in the text
vocab = np.array(sorted(set(text)))
# Number of distinct characters
vocab_size = len(vocab) 

Inspect the data

# First 100 characters
text[:100]
# 'Chapter 1\n\n\nHappy families are all alike; every unhappy family is unhappy in its own\nway.\n\nEverythin' 
# vocab
array(['\n', ' ', '!', '"', '$', '%', '&', "'", '(', ')', '*', ',', '-',
       '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':',
       ';', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
       'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W',
       'X', 'Y', 'Z', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
       'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u',
       'v', 'w', 'x', 'y', 'z'], dtype='<U1')

# Split text into train and val
# val takes the last 20%
val_len = int(np.floor(0.2 * len(text))) 

# train and val
trainset = np.array(list(text[:-val_len]))
validset = np.array(list(text[-val_len:]))

print(trainset.shape) # (1588179,)
print(validset.shape) # (397044,)

Training

# Hyperparameters
hidden_size = 512
num_layers = 2
batch_size = 128
seq_len = 100
epochs = 20
lr = 0.01

# Create the model
model = lstm_model(vocab, hidden_size, num_layers)

# model (printing it shows the structure below)
lstm_model(
  (lstm): LSTM(83, 512, num_layers=2, batch_first=True, dropout=0.5)
  (linear): Linear(in_features=512, out_features=83, bias=True)
)

train(model, trainset, batch_size, seq_len, epochs, lr=lr, valid=validset)
# Training log
--------------Epochs20 | 0---------------
Train Loss : 388.6049892902374
Val Loss : 96.39854288101196
--------------Epochs20 | 1---------------
Train Loss : 361.311678647995
Val Loss : 86.85394430160522
...
--------------Epochs20 | 19---------------
Train Loss : 266.43750071525574
Val Loss : 64.93880939483643

(Figure: training and validation loss curves over the 20 epochs)


Model Prediction

def predict(model, char, top_k = None, hidden_size = None):
    # Note: despite its name, hidden_size here carries the LSTM hidden state (h, c), not a size
    
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    model.eval()
    
    with torch.no_grad():
        char = np.array([char]) # convert to an array
        char = char.reshape(-1, 1) # reshape
        char_encoding = model.onehot_encode(char) # one-hot encoding
        char_encoding = char_encoding.reshape(1, 1, -1) # (batch_size, seq_len, num_features)
        char_tensor = torch.tensor(char_encoding, dtype=torch.float32) # type conversion
        char_tensor = char_tensor.to(device) # move to the device
        
        out, hidden_size = model(char_tensor, hidden_size) # forward pass
   
        probs = F.softmax(out, dim=1).squeeze() # torch.Size([1, 83]) --> torch.Size([83])
        #probs = F.softmax(out, dim=1).data # alternative, same result
        
        
        if top_k is None:
            indices = np.arange(vocab_size)
        else:
            probs, indices = probs.topk(top_k) # keep only the top_k most probable characters
            indices = indices.cpu().numpy()
        
        probs = probs.cpu().numpy()
        
        char_index = np.random.choice(indices, p = probs / probs.sum()) # sample one index
        char = model.int_char[char_index] # look up the character for that index
        
    return char, hidden_size

# Generate a text sample
def sample(model, length, top_k = None, sentence="every unhappy family "):
    hidden_size = None # initialize the hidden state
    new_sentence = [char for char in sentence] # initialize with the seed sentence
    for i in range(length):
        next_char, hidden_size = predict(model, new_sentence[-1], top_k = top_k, hidden_size = hidden_size) # predict the next character
        new_sentence.append(next_char)
        
    return ''.join(new_sentence)
new_text = sample(model, 2000, top_k=5)
# 'every unhappy family cheed ale tall.\nAnd he welle as than stitt hou the hart and sooker..".\n\n"Yons the with and ale\nfise ale,...'

Save the model

model_name = "lstm_model.net"

checkpoint = {
    'hidden_size' : model.hidden_size,
    'num_layers' : model.num_layers,
    'state_dict' : model.state_dict()
}

with open(model_name, 'wb') as f:
    torch.save(checkpoint, f)
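Loading the checkpoint back is not covered in the original; a minimal loading sketch, assuming the lstm_model class and the vocab array are still available in the session:

with open("lstm_model.net", 'rb') as f:
    checkpoint = torch.load(f)

loaded = lstm_model(vocab, checkpoint['hidden_size'], checkpoint['num_layers'])
loaded.load_state_dict(checkpoint['state_dict'])
loaded.eval()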

伊织 2023-02-22 (Wed)
