import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, Subset
from torchtext.data.utils import get_tokenizer
from collections import Counter
import random
import time
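The training script below relies on several components defined in the earlier parts of this walkthrough and not repeated here: load_terminology_dictionary, TranslationDataset, collate_fn, the Encoder/Decoder/Seq2Seq classes, and the train helper. As a rough reference, here is a minimal sketch of what the GRU-based model classes could look like given the hyperparameters used below (batch-first tensors are assumed); the actual implementation in the series may differ.

# Hypothetical sketch of the GRU seq2seq components assumed by the script below.
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        # src: [batch_size, src_len]
        embedded = self.dropout(self.embedding(src))
        outputs, hidden = self.rnn(embedded)
        return outputs, hidden

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True)
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden):
        # input: [batch_size, 1] -- one target token per decoding step
        embedded = self.dropout(self.embedding(input))
        output, hidden = self.rnn(embedded, hidden)
        prediction = self.fc_out(output.squeeze(1))
        return prediction, hidden

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size, trg_len = trg.shape
        trg_vocab_size = self.decoder.output_dim
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)
        _, hidden = self.encoder(src)
        input = trg[:, 0].unsqueeze(1)  # start decoding from the <sos> token
        for t in range(1, trg_len):
            output, hidden = self.decoder(input, hidden)
            outputs[:, t, :] = output
            # teacher forcing: sometimes feed the ground-truth token instead of the prediction
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[:, t].unsqueeze(1) if teacher_force else top1.unsqueeze(1)
        return outputs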
# Main entry point: train the GRU seq2seq model
if __name__ == '__main__':
    start_time = time.time()  # start timing
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load the terminology dictionary
    # terminology = load_terminology_dictionary('../dataset/en-zh.dic')
    terminology = load_terminology_dictionary('./dataset/en-zh.dic')

    # Load the data
    dataset = TranslationDataset('./dataset/train.txt', terminology=terminology)

    # Train on the first N samples of the dataset
    N = 1000  # or use a fraction of the dataset, e.g. int(len(dataset) * 0.1)
    subset_indices = list(range(N))
    subset_dataset = Subset(dataset, subset_indices)
    train_loader = DataLoader(subset_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

    # Model hyperparameters
    INPUT_DIM = len(dataset.en_vocab)
    OUTPUT_DIM = len(dataset.zh_vocab)
    ENC_EMB_DIM = 256
    DEC_EMB_DIM = 256
    HID_DIM = 512
    N_LAYERS = 2
    ENC_DROPOUT = 0.5
    DEC_DROPOUT = 0.5

    # Initialize the model
    enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
    dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)
    model = Seq2Seq(enc, dec, device).to(device)

    # Optimizer and loss function (padding tokens are ignored in the loss)
    optimizer = optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss(ignore_index=dataset.zh_word2idx['<pad>'])

    # Train the model
    N_EPOCHS = 10
    CLIP = 1
    for epoch in range(N_EPOCHS):
        train_loss = train(model, train_loader, optimizer, criterion, CLIP)
        print(f'Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f}')

    # Save the model after the training loop finishes
    torch.save(model.state_dict(), 'translation_model_GRU.pth')

    end_time = time.time()  # stop timing

    # Compute and print the total running time
    elapsed_time_minute = (end_time - start_time) / 60
    print(f"Total running time: {elapsed_time_minute:.2f} minutes")
The output is as follows:
Epoch: 01 | Train Loss: 6.555
Epoch: 02 | Train Loss: 6.060
Epoch: 03 | Train Loss: 6.030
Epoch: 04 | Train Loss: 5.988
Epoch: 05 | Train Loss: 5.922
Epoch: 06 | Train Loss: 5.868
Epoch: 07 | Train Loss: 5.799
Epoch: 08 | Train Loss: 5.700
Epoch: 09 | Train Loss: 5.603
Epoch: 10 | Train Loss: 5.518
Total running time: 1.31 minutes
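The train(...) call in the script above refers to a one-epoch training helper defined earlier in this walkthrough. For completeness, a minimal sketch of such a helper, assuming collate_fn returns padded (src, trg) index tensors with the batch as the first dimension:

# Hypothetical sketch of the per-epoch training helper used above.
def train(model, loader, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    device = next(model.parameters()).device
    for src, trg in loader:
        src, trg = src.to(device), trg.to(device)
        optimizer.zero_grad()
        output = model(src, trg)                       # [batch, trg_len, vocab]
        output = output[:, 1:, :].reshape(-1, output.shape[-1])
        trg = trg[:, 1:].reshape(-1)                   # skip the <sos> position
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)  # gradient clipping with CLIP
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(loader)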
# Main entry point: run inference with the trained model
if __name__ == '__main__':
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load the terminology dictionary
    terminology = load_terminology_dictionary('./dataset/en-zh.dic')

    # Load the dataset (for the vocabularies) and rebuild the model
    dataset = TranslationDataset('./dataset/train.txt', terminology=terminology)

    # Model hyperparameters (must match the ones used for training)
    INPUT_DIM = len(dataset.en_vocab)
    OUTPUT_DIM = len(dataset.zh_vocab)
    ENC_EMB_DIM = 256
    DEC_EMB_DIM = 256
    HID_DIM = 512
    N_LAYERS = 2
    ENC_DROPOUT = 0.5
    DEC_DROPOUT = 0.5

    # Initialize the model
    enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
    dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)
    model = Seq2Seq(enc, dec, device).to(device)

    # Load the trained weights
    model.load_state_dict(torch.load('translation_model_GRU.pth'))

    # Translate the test set and write the results to disk
    save_dir = './dataset/submit.txt'
    inference(model, dataset, src_file="./dataset/test_en.txt", save_dir=save_dir,
              terminology=terminology, device=device)
    print(f"Translation complete! Results saved to {save_dir}")
The output is as follows:
Translation complete! Results saved to ./dataset/submit.txt
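The inference(...) helper used above is likewise defined earlier in the walkthrough; it translates each line of test_en.txt and writes the results to submit.txt. A rough greedy-decoding sketch is given below. The attribute names en_tokenizer and en_word2idx, and the assumption that zh_vocab maps indices back to tokens, are illustrative only, and the terminology post-processing step is omitted:

# Hypothetical sketch of a greedy-decoding inference helper.
def inference(model, dataset, src_file, save_dir, terminology, device, max_len=50):
    model.eval()
    translations = []
    with open(src_file, 'r', encoding='utf-8') as f, torch.no_grad():
        for line in f:
            # Encode the English source sentence (unknown words map to <unk>)
            tokens = dataset.en_tokenizer(line.strip())
            src_ids = [dataset.en_word2idx.get(tok, dataset.en_word2idx['<unk>'])
                       for tok in tokens]
            src = torch.tensor(src_ids, dtype=torch.long).unsqueeze(0).to(device)
            _, hidden = model.encoder(src)

            # Greedy decoding: feed the previous prediction back in until <eos> or max_len
            input = torch.tensor([[dataset.zh_word2idx['<sos>']]], device=device)
            out_tokens = []
            for _ in range(max_len):
                output, hidden = model.decoder(input, hidden)
                top1 = output.argmax(1).item()
                if top1 == dataset.zh_word2idx['<eos>']:
                    break
                out_tokens.append(dataset.zh_vocab[top1])  # assumes index-to-token lookup
                input = torch.tensor([[top1]], device=device)

            # Chinese output is joined without spaces; terminology replacement omitted here
            translations.append(''.join(out_tokens))

    with open(save_dir, 'w', encoding='utf-8') as f:
        f.write('\n'.join(translations))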