Tokenization means splitting text into words; each resulting unit is called a token.

N-grams, sequences of n consecutive tokens, are a common way to prepare word features:
```python
# bigrams: the case n = 2
import jieba

text = "深度学习是机器学习的分支,是一种以人工神经网络为架构,对数据进行表征学习的方法"
cuted = jieba.lcut(text)                                   # tokenize with jieba
n_gram2 = [cuted[i:i + 2] for i in range(len(cuted) - 1)]  # sliding window of size 2
print(n_gram2)
```

out:

```
[['深度', '学习'], ['学习', '是'], ['是', '机器'], ['机器', '学习'], ['学习', '的'], ['的', '分支'], ['分支', ','], [',', '是'], ['是', '一种'], ['一种', '以'], ['以', '人工'], ['人工', '神经'], ['神经', '网络'], ['网络', '为'], ['为', '架构'], ['架构', ','], [',', '对'], ['对', '数据'], ['数据', '进行'], ['进行', '表征'], ['表征', '学习'], ['学习', '的'], ['的', '方法']]
```
Text cannot be fed to a model directly; it must first be converted into vectors. There are two common ways to do this: 1. one-hot encoding; 2. word embeddings.
token | one-hot encoding
---|---
深 | [1, 0, 0, 0]
度 | [0, 1, 0, 0]
学 | [0, 0, 1, 0]
习 | [0, 0, 0, 1]
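For illustration, the same table can be produced with PyTorch's `one_hot` (a minimal sketch; the vocabulary mapping here is hypothetical):

```python
import torch
from torch.nn import functional

# hypothetical vocabulary: each token gets an integer index
vocab = {"深": 0, "度": 1, "学": 2, "习": 3}

indices = torch.LongTensor([vocab[t] for t in ["深", "度", "学", "习"]])
print(functional.one_hot(indices, num_classes=len(vocab)))
# tensor([[1, 0, 0, 0],
#         [0, 1, 0, 0],
#         [0, 0, 1, 0],
#         [0, 0, 0, 1]])
```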
Word embedding is the standard way to represent text in deep learning. Unlike one-hot encoding, a word embedding represents each token as a dense vector of floats. The vector dimension is a hyperparameter chosen according to the vocabulary size, typically 100, 256, or 300; the values inside each vector are parameters that are randomly initialized and then learned during training.
With a vocabulary of 20,000 words, one-hot encoding produces a 20000 × 20000 matrix, while word embedding produces a 20000 × dim matrix, e.g. 20000 × 200. At 4 bytes per float32 entry, that is roughly 1.6 GB versus 16 MB.
token | num | vector
---|---|---
深 | 1 | [w11, w12, …, w1N]
度 | 2 | [w21, w22, …, w2N]
学 | 3 | [w31, w32, …, w3N]
习 | 4 | [w41, w42, …, w4N]
That is: token → num → vector. In PyTorch this lookup table is provided by `torch.nn.Embedding(num_embeddings, embedding_dim)`, where `num_embeddings` is the vocabulary size and `embedding_dim` is the vector dimension.
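A minimal sketch of how the layer is used (the 20000 × 200 shape just mirrors the example above):

```python
import torch
from torch import nn

# lookup table: 20000 tokens, each mapped to a 200-dimensional dense vector
embedding = nn.Embedding(num_embeddings=20000, embedding_dim=200)

# a batch of 2 sentences, each given as 4 token indices
token_ids = torch.LongTensor([[1, 2, 3, 4], [5, 6, 7, 8]])
vectors = embedding(token_ids)
print(vectors.shape)  # torch.Size([2, 4, 200])
```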
```python
# dataset.py
import os
import re

import torch
from torch.utils.data import DataLoader, Dataset


def tokenize(content):
    content = re.sub("<.*?>", " ", content)  # strip HTML tags such as <br />
    # '\$' is escaped so the regex treats it as a literal '$', not an anchor
    filters = [':', '\.', '\t', '\n', '\x93', '\x97', '\x96', '#', '\$', '%', '&']
    content = re.sub("|".join(filters), " ", content)
    tokens = [i.strip().lower() for i in content.split()]
    return tokens


class ImdbDataset(Dataset):
    def __init__(self, train=True):
        self.train_data_path = r'C:\Users\Administrator\PycharmProjects\pythonProject\data\aclImdb\train'
        self.test_data_path = r'C:\Users\Administrator\PycharmProjects\pythonProject\data\aclImdb\test'
        data_path = self.train_data_path if train else self.test_data_path

        # collect every review file under pos/ and neg/
        temp_data_path = [os.path.join(data_path, "pos"), os.path.join(data_path, "neg")]
        self.total_file_path = []
        for path in temp_data_path:
            file_name_list = os.listdir(path)
            file_path_list = [os.path.join(path, i) for i in file_name_list if i.endswith(".txt")]
            self.total_file_path.extend(file_path_list)

    def __getitem__(self, index):
        file_path = self.total_file_path[index]
        # the parent directory name ("pos" or "neg") is the label
        label_str = file_path.split('\\')[-2]
        label = 0 if label_str == "neg" else 1
        tokens = tokenize(open(file_path, encoding='utf-8').read())
        return tokens, label

    def __len__(self):
        return len(self.total_file_path)


def collate_fn(batch):
    # imported here rather than at the top so that main.py can use
    # tokenize() before ws.pkl has been created
    from lib import ws, max_len
    content, label = list(zip(*batch))
    content = [ws.transform(i, max_len=max_len) for i in content]
    content = torch.LongTensor(content)
    label = torch.LongTensor(label)
    return content, label


def get_dataloader(train=True):
    from lib import BATCH_SIZE
    imdb_dataset = ImdbDataset(train)
    data_loader = DataLoader(imdb_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
    return data_loader


if __name__ == '__main__':
    for index, (input, target) in enumerate(get_dataloader()):
        print(index)
        print(input)
        print("-" * 10)
        print(target)
        break
```
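With BATCH_SIZE = 128 and max_len = 15 from lib.py below, a quick shape check on one batch looks like this (a sketch; the values depend on the shuffle):

```python
from dataset import get_dataloader

# fetch a single collated batch from the training loader
input, target = next(iter(get_dataloader(train=True)))
print(input.shape)   # torch.Size([128, 15]) -- BATCH_SIZE x max_len token indices
print(target.shape)  # torch.Size([128])     -- one 0/1 label per review
```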
```python
# word_sequence.py
class WordSequence:
    # fit: count the occurrences of every word
    # build_dict: assign each word a unique index, ordered by frequency,
    #             optionally dropping rare words; then invert the dict so
    #             each index maps back to its word

    def __init__(self):
        self.UNK = 0
        self.PAD = 1
        self.dict = {
            "UNK": self.UNK,
            "PAD": self.PAD
        }
        self.count = {}

    def fit(self, sentence):
        """Accumulate word counts from one tokenized sentence."""
        for word in sentence:
            self.count[word] = self.count.get(word, 0) + 1

    def build_dict(self, min=5, max=None, max_features=None):
        """
        Build the vocabulary.
        :param min: minimum number of occurrences to keep a word
        :param max: maximum number of occurrences to keep a word
        :param max_features: number of most frequent words to keep
        """
        if min is not None:
            self.count = {word: value for word, value in self.count.items() if value >= min}
        if max is not None:
            self.count = {word: value for word, value in self.count.items() if value <= max}
        if max_features is not None:
            temp = sorted(self.count.items(), key=lambda x: x[-1], reverse=True)[:max_features]
            self.count = dict(temp)
        for word in self.count:
            self.dict[word] = len(self.dict)
        # invert the dict: index -> word
        self.inverse_dict = dict(zip(self.dict.values(), self.dict.keys()))

    def transform(self, sentence, max_len=None):
        """Serialize a sentence: list of words -> list of indices."""
        if max_len is not None:
            if max_len > len(sentence):
                # pad with the "PAD" token so padding maps to index 1, not UNK
                sentence = sentence + ["PAD"] * (max_len - len(sentence))
            if max_len < len(sentence):
                sentence = sentence[:max_len]
        return [self.dict.get(word, self.UNK) for word in sentence]

    def inverse_transform(self, indices):
        """Deserialize: list of indices -> list of words."""
        return [self.inverse_dict.get(index, "UNK") for index in indices]

    def __len__(self):
        return len(self.dict)


if __name__ == '__main__':
    word_sequence = WordSequence()
    word_sequence.fit(["我", "是", "谁"])
    word_sequence.fit(["我", "是", "我"])
    word_sequence.build_dict(min=1)

    ret = word_sequence.transform(["我", "爱", "我", "的", "祖", "国"])
    ret2 = word_sequence.inverse_transform(ret)
    print(ret)
    print(ret2)
```
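For reference, with the PAD fix above the demo at the bottom of word_sequence.py should print:

```
[2, 0, 2, 0, 0, 0]
['我', 'UNK', '我', 'UNK', 'UNK', 'UNK']
```

Only 我 is in the vocabulary among the queried words; the unseen words 爱, 的, 祖, 国 all map to UNK (index 0).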
```python
# main.py
import os
import pickle

from tqdm import tqdm

from dataset import tokenize
from word_sequence import WordSequence

if __name__ == '__main__':
    ws = WordSequence()
    data_path = r"C:\Users\Administrator\PycharmProjects\pythonProject\data\aclImdb\train"
    temp_paths = [os.path.join(data_path, "neg"), os.path.join(data_path, "pos")]
    for temp_path in temp_paths:
        filepaths = [os.path.join(temp_path, filename)
                     for filename in os.listdir(temp_path)
                     if filename.endswith(".txt")]
        for filepath in tqdm(filepaths):
            sentence = tokenize(open(filepath, encoding='utf-8').read())
            ws.fit(sentence)
    # keep words occurring at least 10 times, capped at 10000 features
    ws.build_dict(min=10, max_features=10000)
    os.makedirs("./model", exist_ok=True)  # make sure the target directory exists
    pickle.dump(ws, open("./model/ws.pkl", "wb"))
    print(len(ws))
```
```python
# model.py
from torch import nn
from torch.nn import functional
from torch.optim import Adam

from dataset import get_dataloader
from lib import ws, max_len


class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        self.embedding = nn.Embedding(len(ws), 100)
        self.fc = nn.Linear(max_len * 100, 2)

    def forward(self, input):
        x = self.embedding(input)        # [batch_size, max_len, 100]
        x = x.view([-1, max_len * 100])  # flatten the sequence into one vector
        out = self.fc(x)
        return functional.log_softmax(out, dim=-1)


model = MyModel()
optimizer = Adam(model.parameters(), lr=0.001)


def train(epoch):
    for index, (input, target) in enumerate(get_dataloader(train=True)):
        optimizer.zero_grad()
        output = model(input)
        loss = functional.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        print(loss.item())


if __name__ == '__main__':
    for i in range(1):
        train(i)
```
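The post stops at the training loop; for completeness, here is a minimal evaluation sketch (not from the original) that reuses the pieces defined above on the test split:

```python
# evaluate.py -- hypothetical companion script, not part of the original post
import torch
from torch.nn import functional

from dataset import get_dataloader
from model import model


def evaluate():
    model.eval()
    total_loss, correct, total = 0.0, 0, 0
    with torch.no_grad():  # no gradients needed for evaluation
        for input, target in get_dataloader(train=False):
            output = model(input)
            total_loss += functional.nll_loss(output, target, reduction='sum').item()
            correct += (output.argmax(dim=-1) == target).sum().item()
            total += target.size(0)
    print("loss: %.4f, acc: %.4f" % (total_loss / total, correct / total))


if __name__ == '__main__':
    evaluate()
```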
```python
# lib.py
import pickle

# vocabulary built and pickled by main.py
ws = pickle.load(open("./model/ws.pkl", "rb"))
max_len = 15
BATCH_SIZE = 128
```

Because `lib.py` unpickles `ws.pkl` the moment it is imported, run `main.py` once to build the vocabulary before training with `model.py`.