Next, import the required libraries:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils import clip_grad_norm_
from torchtext.data.metrics import bleu_score
from torch.utils.data import Dataset, DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from typing import List, Tuple
import jieba
import random
from torch.nn.utils.rnn import pad_sequence
import sacrebleu
import time
import math
Define the tokenizers:
en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')
zh_tokenizer = lambda x: list(jieba.cut(x))
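As a quick sanity check, the two tokenizers behave roughly as follows (a minimal sketch; the sample sentences are illustrative only, and the spaCy tokenizer requires the en_core_web_sm model to be installed):

# English tokenization with spaCy; input is lowercased first, as in preprocess_data below
print(en_tokenizer("the weather is nice today."))
# -> ['the', 'weather', 'is', 'nice', 'today', '.']
# Chinese tokenization with jieba word segmentation
print(zh_tokenizer("今天天气很好。"))
# -> e.g. ['今天', '天气', '很', '好', '。']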
Read the data:
def read_data(file_path: str) -> List[str]:
    with open(file_path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f]
Preprocess the data:
def preprocess_data(en_data: List[str], zh_data: List[str]) -> List[Tuple[List[str], List[str]]]:
    processed_data = []
    for en, zh in zip(en_data, zh_data):
        en_tokens = en_tokenizer(en.lower())[:MAX_LENGTH]
        zh_tokens = zh_tokenizer(zh)[:MAX_LENGTH]
        if en_tokens and zh_tokens:
            processed_data.append((en_tokens, zh_tokens))
    return processed_data
Build the vocabularies:
def build_vocab(data: List[Tuple[List[str], List[str]]]):
    en_vocab = build_vocab_from_iterator(
        (en for en, _ in data),
        specials=['<unk>', '<pad>', '<bos>', '<eos>']
    )
    zh_vocab = build_vocab_from_iterator(
        (zh for _, zh in data),
        specials=['<unk>', '<pad>', '<bos>', '<eos>']
    )
    en_vocab.set_default_index(en_vocab['<unk>'])
    zh_vocab.set_default_index(zh_vocab['<unk>'])
    return en_vocab, zh_vocab
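Once built, the vocabulary objects map tokens to integer indices, with unseen tokens falling back to <unk> via set_default_index. A minimal usage sketch, assuming train_processed is the output of preprocess_data on the training pairs:

en_vocab, zh_vocab = build_vocab(train_processed)
print(len(en_vocab))               # English vocabulary size
print(en_vocab['<pad>'])           # index of the padding token (1, since the specials come first)
print(en_vocab['some-rare-word'])  # any token not seen in training maps to en_vocab['<unk>']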
Define a custom dataset:
class TranslationDataset(Dataset):
    def __init__(self, data: List[Tuple[List[str], List[str]]], en_vocab, zh_vocab):
        self.data = data
        self.en_vocab = en_vocab
        self.zh_vocab = zh_vocab

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        en, zh = self.data[idx]
        en_indices = [self.en_vocab['<bos>']] + [self.en_vocab[token] for token in en] + [self.en_vocab['<eos>']]
        zh_indices = [self.zh_vocab['<bos>']] + [self.zh_vocab[token] for token in zh] + [self.zh_vocab['<eos>']]
        return en_indices, zh_indices
Batch collation function:
def collate_fn(batch):
    en_batch, zh_batch = [], []
    for en_item, zh_item in batch:
        if en_item and zh_item:
            en_batch.append(torch.tensor(en_item))
            zh_batch.append(torch.tensor(zh_item))
    if not en_batch or not zh_batch:
        return torch.tensor([]), torch.tensor([])
    en_batch = nn.utils.rnn.pad_sequence(en_batch, batch_first=True, padding_value=en_vocab['<pad>'])
    zh_batch = nn.utils.rnn.pad_sequence(zh_batch, batch_first=True, padding_value=zh_vocab['<pad>'])
    return en_batch, zh_batch
Load the data:
def load_data(train_path: str, dev_en_path: str, dev_zh_path: str, test_en_path: str):
    # Read the training data (tab-separated English/Chinese pairs)
    train_data = read_data(train_path)
    train_en, train_zh = zip(*(line.split('\t') for line in train_data))
    # Read the dev and test sets
    dev_en = read_data(dev_en_path)
    dev_zh = read_data(dev_zh_path)
    test_en = read_data(test_en_path)
    # Preprocess the data
    train_processed = preprocess_data(train_en, train_zh)
    dev_processed = preprocess_data(dev_en, dev_zh)
    test_processed = [(en_tokenizer(en.lower())[:MAX_LENGTH], []) for en in test_en if en.strip()]
    # Build the vocabularies from the training data
    global en_vocab, zh_vocab
    en_vocab, zh_vocab = build_vocab(train_processed)
    # Create the datasets
    train_dataset = TranslationDataset(train_processed, en_vocab, zh_vocab)
    dev_dataset = TranslationDataset(dev_processed, en_vocab, zh_vocab)
    test_dataset = TranslationDataset(test_processed, en_vocab, zh_vocab)
    # Keep only the first N training pairs (N is a hyperparameter defined elsewhere)
    from torch.utils.data import Subset
    indices = list(range(N))
    train_dataset = Subset(train_dataset, indices)
    # Create the data loaders
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn, drop_last=True)
    dev_loader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn, drop_last=True)
    test_loader = DataLoader(test_dataset, batch_size=1, collate_fn=collate_fn, drop_last=True)
    return train_loader, dev_loader, test_loader, en_vocab, zh_vocab
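The code above relies on the constants MAX_LENGTH, BATCH_SIZE and N, which are assumed to be defined as global hyperparameters earlier in the article. A plausible configuration block (the exact values and file names below are assumptions, not the author's settings) might look like:

# Hypothetical hyperparameters -- tune to your data and hardware
MAX_LENGTH = 100      # maximum number of tokens kept per sentence
BATCH_SIZE = 32       # sentence pairs per batch
N = 10000             # number of training pairs kept by the Subset above
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# File paths are placeholders for the actual dataset files
train_loader, dev_loader, test_loader, en_vocab, zh_vocab = load_data(
    'train.txt', 'dev_en.txt', 'dev_zh.txt', 'test_en.txt')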
Positional encoding:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)
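The buffer pe holds the sinusoidal encodings PE(pos, 2i) = sin(pos / 10000^(2i/d_model)) and PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model)) in shape (max_len, 1, d_model), so forward can add the first seq_len rows to a sequence-first input. A quick shape check (a sketch with made-up sizes):

pos_enc = PositionalEncoding(d_model=512, dropout=0.1)
x = torch.zeros(20, 4, 512)   # (seq_len, batch, d_model), sequence-first as the model uses it
print(pos_enc.pe.shape)       # torch.Size([5000, 1, 512])
print(pos_enc(x).shape)       # torch.Size([20, 4, 512]); each position receives its own encoding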
The Transformer model:
class TransformerModel(nn.Module):
    def __init__(self, src_vocab, tgt_vocab, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, dropout):
        super(TransformerModel, self).__init__()
        self.transformer = nn.Transformer(d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, dropout)
        self.src_embedding = nn.Embedding(len(src_vocab), d_model)
        self.tgt_embedding = nn.Embedding(len(tgt_vocab), d_model)
        self.positional_encoding = PositionalEncoding(d_model, dropout)
        self.fc_out = nn.Linear(d_model, len(tgt_vocab))
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
        self.d_model = d_model

    def forward(self, src, tgt):
        # Inputs arrive batch-first from the DataLoader; nn.Transformer expects (seq_len, batch)
        src = src.transpose(0, 1)
        tgt = tgt.transpose(0, 1)
        src_mask = self.transformer.generate_square_subsequent_mask(src.size(0)).to(src.device)
        tgt_mask = self.transformer.generate_square_subsequent_mask(tgt.size(0)).to(tgt.device)
        src_padding_mask = (src == self.src_vocab['<pad>']).transpose(0, 1)
        tgt_padding_mask = (tgt == self.tgt_vocab['<pad>']).transpose(0, 1)
        src_embedded = self.positional_encoding(self.src_embedding(src) * math.sqrt(self.d_model))
        tgt_embedded = self.positional_encoding(self.tgt_embedding(tgt) * math.sqrt(self.d_model))
        output = self.transformer(src_embedded, tgt_embedded, src_mask, tgt_mask, None,
                                  src_padding_mask, tgt_padding_mask, src_padding_mask)
        # Logits are returned sequence-first: (tgt_len, batch, vocab_size)
        return self.fc_out(output)

    def encode(self, src):
        src = src.transpose(0, 1)
        src_mask = self.transformer.generate_square_subsequent_mask(src.size(0)).to(src.device)
        src_padding_mask = (src == self.src_vocab['<pad>']).transpose(0, 1)
        src_embedded = self.positional_encoding(self.src_embedding(src) * math.sqrt(self.d_model))
        memory = self.transformer.encoder(src_embedded, src_mask, src_padding_mask)
        return memory

    def decode(self, tgt, memory):
        tgt = tgt.transpose(0, 1)
        tgt_mask = self.transformer.generate_square_subsequent_mask(tgt.size(0)).to(tgt.device)
        tgt_padding_mask = (tgt == self.tgt_vocab['<pad>']).transpose(0, 1)
        tgt_embedded = self.positional_encoding(self.tgt_embedding(tgt) * math.sqrt(self.d_model))
        # memory_key_padding_mask is left as None: the target padding mask has the wrong shape
        # for the encoder memory, and decoding is done one unpadded sentence at a time anyway
        output = self.transformer.decoder(tgt_embedded, memory, tgt_mask, None, tgt_padding_mask, None)
        return self.fc_out(output)
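Instantiating the model requires the two vocabularies plus the usual Transformer hyperparameters. The values below are illustrative assumptions rather than the article's prescribed settings:

model = TransformerModel(
    src_vocab=en_vocab, tgt_vocab=zh_vocab,
    d_model=256, nhead=8,
    num_encoder_layers=3, num_decoder_layers=3,
    dim_feedforward=512, dropout=0.1
).to(DEVICE)   # DEVICE as defined in the hypothetical configuration block above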
Training function:
def train_epoch(model, data_loader, criterion, optimizer, device):
    model.train()
    epoch_loss = 0
    for en, zh in data_loader:
        en, zh = en.to(device), zh.to(device)
        optimizer.zero_grad()
        # Teacher forcing: feed the target shifted right, predict the target shifted left
        output = model(en, zh[:, :-1])
        # The model returns (tgt_len, batch, vocab); move back to batch-first so the
        # flattened logits line up with the batch-first targets
        output = output.transpose(0, 1)
        loss = criterion(output.reshape(-1, output.shape[-1]), zh[:, 1:].reshape(-1))
        loss.backward()
        clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(data_loader)
Evaluation function:
def evaluate(model, data_loader, criterion, device):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for en, zh in data_loader:
            en, zh = en.to(device), zh.to(device)
            output = model(en, zh[:, :-1])
            # Align the sequence-first logits with the batch-first targets, as in train_epoch
            output = output.transpose(0, 1)
            loss = criterion(output.reshape(-1, output.shape[-1]), zh[:, 1:].reshape(-1))
            epoch_loss += loss.item()
    return epoch_loss / len(data_loader)
Training loop:
def train_model(train_loader, dev_loader, model, optimizer, criterion, device, num_epochs):
    for epoch in range(num_epochs):
        start_time = time.time()
        train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
        dev_loss = evaluate(model, dev_loader, criterion, device)
        end_time = time.time()
        epoch_mins, epoch_secs = divmod(int(end_time - start_time), 60)
        print(f'Epoch: {epoch + 1:02} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f}')
        print(f'\t Val. Loss: {dev_loss:.3f}')
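A typical way to wire everything together is to mask padding in the loss with ignore_index and optimize with Adam. This is a sketch with assumed settings (learning rate, number of epochs), not the article's exact values:

criterion = nn.CrossEntropyLoss(ignore_index=zh_vocab['<pad>'])   # don't penalize padded positions
optimizer = optim.Adam(model.parameters(), lr=1e-4)
train_model(train_loader, dev_loader, model, optimizer, criterion, DEVICE, num_epochs=10)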
Translation function:
def translate_sentence(model, sentence, en_vocab, zh_vocab, device, max_length=50):
    model.eval()
    tokens = [en_vocab['<bos>']] + [en_vocab[token] for token in en_tokenizer(sentence.lower())] + [en_vocab['<eos>']]
    # Add a batch dimension; the model expects batch-first input, as in training
    src_tensor = torch.LongTensor(tokens).unsqueeze(0).to(device)
    memory = model.encode(src_tensor)
    outputs = [zh_vocab['<bos>']]
    for _ in range(max_length):
        tgt_tensor = torch.LongTensor(outputs).unsqueeze(0).to(device)
        output = model.decode(tgt_tensor, memory)
        # Greedy decoding: take the most likely token at the last position
        pred_token = output.argmax(2)[-1].item()
        outputs.append(pred_token)
        if pred_token == zh_vocab['<eos>']:
            break
    # Map indices back to tokens and drop <bos>/<eos>
    translated_tokens = [zh_vocab.get_itos()[i] for i in outputs]
    return translated_tokens[1:-1]
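After training, a sentence can be translated greedily and the returned token list joined into a string (the sample sentence is only an illustration):

tokens = translate_sentence(model, "I like machine learning.", en_vocab, zh_vocab, DEVICE)
print("".join(tokens))   # Chinese word segments are concatenated without spaces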
Compute the BLEU score:
def calculate_bleu(data, model, en_vocab, zh_vocab, device):
    trgs = []
    pred_trgs = []
    for src, trg in data:
        # Turn the source indices back into a raw sentence before translating
        src = " ".join([en_vocab.get_itos()[token] for token in src])
        pred_trg = translate_sentence(model, src, en_vocab, zh_vocab, device)
        pred_trgs.append(pred_trg)
        trgs.append([trg])
    return bleu_score(pred_trgs, trgs)
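A hedged usage sketch: calculate_bleu expects each item to be a (source index list, reference token list) pair, so one way to score the model is to rebuild such pairs from the preprocessed dev data (here assumed to be available as dev_processed, as built inside load_data):

dev_pairs = [([en_vocab[token] for token in en], zh) for en, zh in dev_processed]
bleu = calculate_bleu(dev_pairs, model, en_vocab, zh_vocab, DEVICE)
print(f'BLEU score: {bleu * 100:.2f}')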
The code above implements a Transformer-based English-to-Chinese translation model, walking through data preprocessing, model construction, training, and evaluation to show end to end how to build a machine translation system.