# Data preparation helpers for a character-level name classifier: load the
# per-language name lists, normalize them to ASCII and encode them as one-hot
# tensors.
from __future__ import unicode_literals, print_function, division

from io import open
import glob
import os
import random
import unicodedata
import string
import time
import math

import torch
import torch.nn as nn

# The alphabet known to the encoder: every ASCII letter (a-z and A-Z, taken
# from string.ascii_letters) plus a few punctuation marks.
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)


def unicodeToAscii(s):
    """Turn a Unicode string into plain ASCII restricted to ``all_letters``.

    The input is NFD-normalized so that accented characters decompose into a
    base character followed by combining marks. Combining marks (Unicode
    category ``Mn``) and every character that is not in ``all_letters`` are
    dropped; the remaining characters are joined and returned.
    """
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )


def findFiles(path):
    """Return the list of paths matching the glob pattern ``path``."""
    return glob.glob(path)


# category_lines maps a category name to its list of (ASCII-normalized) lines;
# all_categories lists the category names, and a category's position in this
# list is used as its class index.
category_lines = {}
all_categories = []


def readLines(filename):
    """Read ``filename`` (UTF-8) and return its lines converted to ASCII.

    The file content is stripped of leading/trailing whitespace, split on
    '\\n', and each line is passed through ``unicodeToAscii``.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(filename, encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]


# Load the data set: the file's base name (without extension) is the category
# label, stored in all_categories; the file's lines are stored under that
# label in category_lines.
# glob order depends on the file system, so iterate in sorted order to keep
# the category -> index mapping identical from run to run.
for filename in sorted(findFiles('/content/drive/My Drive/Colab Notebooks/data/names/*.txt')):
    category = os.path.splitext(os.path.basename(filename))[0]
    all_categories.append(category)
    lines = readLines(filename)
    category_lines[category] = lines

n_categories = len(all_categories)


def letterToIndex(letter):
    """Return the position of ``letter`` in ``all_letters``, or -1 if absent."""
    return all_letters.find(letter)


def letterToTensor(letter):
    """Encode a single letter as a one-hot tensor of shape (1, n_letters).

    A letter that is not in ``all_letters`` yields an all-zero tensor.
    """
    tensor = torch.zeros(1, n_letters)
    index = letterToIndex(letter)
    # letterToIndex returns -1 for unknown letters; indexing with -1 would
    # silently mark the LAST letter of the alphabet, so skip it instead.
    if index >= 0:
        tensor[0][index] = 1
    return tensor


def lineToTensor(line):
    """Encode a whole line as a tensor of shape (len(line), 1, n_letters).

    Row ``i`` is the one-hot encoding of ``line[i]``. Characters that are not
    in ``all_letters`` are left as all-zero rows.
    """
    tensor = torch.zeros(len(line), 1, n_letters)
    for li, letter in enumerate(line):
        index = letterToIndex(letter)
        # Same guard as letterToTensor: never index with the -1 sentinel.
        if index >= 0:
            tensor[li][0][index] = 1
    return tensor


def categoryFromOutput(output):
    """Return ``(category_name, category_index)`` for the top-scoring class.

    ``output`` is a tensor of per-category scores; the index of the largest
    value along the last dimension (taken from the first row) selects the
    entry of ``all_categories``.
    NOTE(review): presumably ``output`` has shape (1, n_categories) - confirm
    against the caller.
    """
    top_n, top_i = output.topk(1)
    category_i = top_i[0].item()
    return all_categories[category_i], category_i


def randomChoice(l):
    """Return a randomly chosen element of the sequence ``l``."""
    return l[random.randint(0, len(l) - 1)]


def randomTrainingExample():
    """Draw one random training example.

    Picks a random category, then a random line of that category, and returns
    ``(category, line, category_tensor, line_tensor)`` where
    ``category_tensor`` is a 1-element long tensor holding the category index
    and ``line_tensor`` is the one-hot encoding of the line.
    """
    category = randomChoice(all_categories)
    line = randomChoice(category_lines[category])
    category_tensor = torch.tensor([all_categories.index(category)], dtype=torch.long)
    line_tensor = lineToTensor(line)
    return category, line, category_tensor, line_tensor
class RNN(nn.Module):
def __init__(self, input_size, hidden_size, output_size):
super(RNN, self).__init__()
self.hidden_size = hidden_size
self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
self.i2o = nn.Linear(input_size + hidden_size, output_size)
self.softmax = nn.LogSoftmax(dim=1)
def forward(self, input, hidden):
combined = torch.cat((input, hidden), 1)
hidden = self.i2h(combined)
output = self.i2o(
