赞
踩
学习一下使用pytorch来进行文本的分类。记得tensorflow关于文本分类的教程,是使用不同作家的翻译结果来训练一个识别译者的模型,还有一个影评的情感分类。这次看一下pytorch的教程,是使用pytorch来训练识别一个名字来自哪个国家。
from __future__ import unicode_literals, print_function, division

from io import open
import glob
import os
import random
import unicodedata
import string
import time
import math

import torch
import torch.nn as nn

# Alphabet used for encoding: every ASCII letter (a-z, A-Z) plus a few
# punctuation characters.
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)


def unicodeToAscii(s):
    """Return ``s`` reduced to the characters contained in ``all_letters``.

    The string is NFD-normalized so that accented letters decompose into a
    base letter followed by combining marks. Combining marks (Unicode
    category ``Mn``) and every character not in ``all_letters`` are dropped.
    """
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn' and c in all_letters
    )


def findFiles(path):
    """Return the list of paths matching the glob pattern ``path``."""
    return glob.glob(path)


# category -> list of lines read from that category's file
category_lines = {}
# category names, in the order their files were read
all_categories = []


def readLines(filename):
    """Read a UTF-8 text file and return its lines converted to ASCII.

    The whole file is read, surrounding whitespace is stripped, the text is
    split on ``'\\n'`` and each line is passed through ``unicodeToAscii``.
    """
    # The context manager closes the handle deterministically; the previous
    # version left the file open until garbage collection.
    with open(filename, encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]


# Load the data: the file's base name (without extension) is the category
# label, stored in all_categories; its lines go into category_lines[label].
for filename in findFiles('/content/drive/My Drive/Colab Notebooks/data/names/*.txt'):
    category = os.path.splitext(os.path.basename(filename))[0]
    all_categories.append(category)
    lines = readLines(filename)
    category_lines[category] = lines

n_categories = len(all_categories)


def letterToIndex(letter):
    """Return the position of ``letter`` in ``all_letters`` (-1 if absent)."""
    return all_letters.find(letter)


def letterToTensor(letter):
    """Return a ``(1, n_letters)`` one-hot tensor for a single letter.

    NOTE(review): a letter that is not in ``all_letters`` yields index -1,
    which sets the LAST column instead of failing. Run input through
    ``unicodeToAscii`` first to avoid this.
    """
    tensor = torch.zeros(1, n_letters)
    tensor[0][letterToIndex(letter)] = 1
    return tensor


def lineToTensor(line):
    """Return a ``(len(line), 1, n_letters)`` tensor of one-hot rows.

    Row ``i`` encodes ``line[i]``. The same caveat as ``letterToTensor``
    applies to characters outside ``all_letters``.
    """
    tensor = torch.zeros(len(line), 1, n_letters)
    for li, letter in enumerate(line):
        tensor[li][0][letterToIndex(letter)] = 1
    return tensor


def categoryFromOutput(output):
    """Return ``(category_name, category_index)`` for the top-scoring entry.

    The index is taken from the first row of ``output.topk(1)`` and looked up
    in ``all_categories``.
    """
    _, top_i = output.topk(1)
    category_i = top_i[0].item()
    return all_categories[category_i], category_i


def randomChoice(l):
    """Return a uniformly random element of the sequence ``l``.

    Raises ``IndexError`` if ``l`` is empty.
    """
    return random.choice(l)


def randomTrainingExample():
    """Draw one random training sample.

    Returns:
        A tuple ``(category, line, category_tensor, line_tensor)`` where
        ``category_tensor`` is a 1-element ``torch.long`` tensor holding the
        category's index in ``all_categories`` and ``line_tensor`` is
        ``lineToTensor(line)``.
    """
    category = randomChoice(all_categories)
    line = randomChoice(category_lines[category])
    category_tensor = torch.tensor([all_categories.index(category)], dtype=torch.long)
    line_tensor = lineToTensor(line)
    return category, line, category_tensor, line_tensor
上面的步骤就是把不同文件名的数据按字典的结构(类别:数据)读入内存,以字母为单位对数据进行one-hot编码,并定义了一堆函数来实现这个功能。
#声明一个模型类
class RNN(nn.Module):
def __init__(self, input_size, hidden_size, output_size):
    """Create the layers of the recurrent cell.

    Args:
        input_size: width of the input vector for one step.
        hidden_size: width of the hidden state; stored on the instance.
        output_size: width of the output vector.
    """
    super(RNN, self).__init__()
    self.hidden_size = hidden_size

    # Both linear layers take the same input width: input plus hidden.
    combined_size = input_size + hidden_size
    self.i2h = nn.Linear(combined_size, hidden_size)
    self.i2o = nn.Linear(combined_size, output_size)
    self.softmax = nn.LogSoftmax(dim=1)
def forward(self, input, hidden):
combined = torch.cat((input, hidden), 1)
hidden = self.i2h(combined)
output = self.i2o(
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。