We now turn to a classic dataset: the IMDB dataset (http://ai.stanford.edu/~amaas/data/sentiment/). It contains 50,000 movie reviews, split into 25,000 training and 25,000 test samples. The data is organized as follows: each sample is a text file whose name has two parts, an index and the sentiment score (reviews scored 1-4 are negative, 7-10 are positive), and whose content is the review text.
Given these samples, we want to build a PyTorch model that predicts the sentiment of a review.
We can first frame this as a classification problem: the sentiment score runs from 1 to 10, i.e. 10 classes (it could also be treated as a regression problem; here we stick with classification). Based on our earlier experience, the overall workflow is roughly: prepare the dataset and DataLoader, serialize the text into index sequences, build the model, then train and evaluate it.
With the approach settled, let's work through these steps one by one.
Preparing the data works the same way as before: implement a Dataset, wrap it in a DataLoader, and process the data into the format shown below.
Two points deserve attention here: how the raw text is tokenized, and how the label is derived from each file name; both are handled in the code below.
import numpy as np
from torch.utils.data import DataLoader, Dataset
import torch
import os
import re

BATCH_SIZE_TRAIN = 2
BATCH_SIZE_TEST = 2
MAX_LEN = 500


# ======================= Custom tokenization (a third-party tokenizer could be used instead): start =======================
def tokenlize(sentence):
    fileters = ['!', '"', '#', '$', '%', '&', '\(', '\)', '\*', '\+', ',', '-', '\.', '/', ':', ';', '<', '=', '>',
                '\?', '@', '\[', '\\', '\]', '^', '_', '`', '\{', '\|', '\}', '~', '\t', '\n', '\x97', '\x96', '”', '“', ]
    sentence = sentence.lower()                      # lowercase everything
    sentence = re.sub("<br />", " ", sentence)       # strip HTML line breaks
    # sentence = re.sub("I'm", "I am", sentence)     # with enough data, the model can learn "I'm" on its own
    # sentence = re.sub("isn't", "is not", sentence)
    sentence = re.sub("|".join(fileters), " ", sentence)
    result = [i for i in sentence.split(" ") if len(i) > 0]
    return result
# ======================= Custom tokenization: end =======================


# ======================= Dataset preparation: start =======================
class ImdbDataset(Dataset):
    def __init__(self, wordSequence=None, train=True):
        super(ImdbDataset, self).__init__()
        self.wordSequence = wordSequence
        data_path = r"./data/aclImdb"
        data_path += r"/train" if train else r"/test"   # path concatenation (equivalent to os.path.join())
        self.total_path = []                            # holds the paths of all review files
        for temp_path in [r"/pos", r"/neg"]:
            cur_path = data_path + temp_path
            self.total_path += [os.path.join(cur_path, i) for i in os.listdir(cur_path) if i.endswith(".txt")]  # collect every file path

    def __getitem__(self, idx):
        file = self.total_path[idx]
        review = tokenlize(open(file, encoding="utf-8").read())   # read and tokenize the review text
        label = int(file.split("_")[-1].split(".")[0])            # the rating is encoded in the file name
        label = 0 if label < 5 else 1
        if self.wordSequence is not None:
            review = self.wordSequence.transform(review, max_len=MAX_LEN)  # map words to indices with the saved word-to-index mapping
        return review, label

    def __len__(self):
        return len(self.total_path)
# ======================= Dataset preparation: end =======================


# ======================= DataLoader batching: start =======================
def get_dataloader(dataset, train=True):
    batch_size = BATCH_SIZE_TRAIN if train else BATCH_SIZE_TEST
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    return dataloader
# ======================= DataLoader batching: end =======================


if __name__ == '__main__':
    dataset = ImdbDataset()
    print("dataset[0] = {0}".format(dataset[0]))
    dataLoader = get_dataloader(dataset=dataset, train=True)
    for batch_index, (reviews, labels) in enumerate(dataLoader):
        print("batch_index = {0}".format(batch_index))
        print("reviews in this batch = {0}".format(reviews))
        print("labels in this batch = {0}".format(labels))
        break
The output is as follows:
batch_index = 0
reviews in this batch = [('I', 'Want'), ('thought', 'a'), ('this', 'great'), ('was', 'recipe'), ('a', 'for'), ('great', 'failure'), ('idea', 'Take'), ('but', 'a'), ('boy', 's'), ('was', 'y'), ('it', 'plot'), ('poorly', 'add'), ('executed', 'in'), ('We', 'some'), ('do', 'weak'), ('get', 'completely'), ('a', 'undeveloped'), ('broad', 'characters'), ('sense', 'and'), ('of', 'than'), ('how', 'throw'), ('complex', 'in'), ('and', 'the'), ('challenging', 'worst'), ('the', 'special'), ('backstage', 'effects'), ('operations', 'a'), ('of', 'horror'), ('a', 'movie'), ('show', 'has'), ('are', 'known'), ('but', 'Let'), ('virtually', 'stew'), ('no', 'for'), ...('show', 'somehow'), ('rather', 'destroy'), ('than', 'every'), ('anything', 'copy'), ('worth', 'of'), ('watching', 'this'), ('for', 'film'), ('its', 'so'), ('own', 'it'), ('merit', 'will')]
labels in this batch = tensor([3, 1])
Clearly the text has been paired up position by position across the two reviews, which is not what we expected. The problem lies in the DataLoader's collate_fn parameter: its default value is PyTorch's default_collate, and collate_fn is responsible for assembling each batch. The default default_collate mishandles our samples, whose reviews are variable-length lists of words.
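To see why the words get paired up like this, here is a rough, simplified sketch (a toy illustration of the behavior, not PyTorch's actual source): for list-valued fields, the default collation effectively regroups the samples position by position with zip, so the i-th words of different reviews end up in the same tuple.

# Simplified illustration with a hypothetical two-sample batch:
batch = [(["i", "loved", "this", "film"], 1),
         (["boring", "plot", "overall", "sadly"], 0)]

reviews, labels = zip(*batch)        # separating the fields is fine
transposed = list(zip(*reviews))     # roughly what happens to the token lists
print(transposed)
# [('i', 'boring'), ('loved', 'plot'), ('this', 'overall'), ('film', 'sadly')]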
Two ways to tackle the problem:

Approach 1: convert the text to index sequences first and check whether the result meets expectations; DataLoader showed no such error before, when every sample had the same size.

Approach 2: write a custom collate_fn and check the result.

Here we take approach 2 and define our own collate_fn:
import numpy as np from torch.utils.data import DataLoader, Dataset import torch import os import re BATCH_SIZE_TRAIN = 2 BATCH_SIZE_TEST = 2 MAX_LEN = 500 # =======================================进行自定义文本分词【可以用第三方分词工具】:开始======================================= def tokenlize(sentence): fileters = ['!', '"', '#', '$', '%', '&', '\(', '\)', '\*', '\+', ',', '-', '\.', '/', ':', ';', '<', '=', '>', '\?', '@', '\[', '\\', '\]', '^', '_', '`', '\{', '\|', '\}', '~', '\t', '\n', '\x97', '\x96', '”', '“', ] sentence = sentence.lower() # 把大写转化为小写 sentence = re.sub("<br />", " ", sentence) # sentence = re.sub("I'm","I am",sentence) # 当语料量足够多时,可以学习到I'm的含义。 # sentence = re.sub("isn't","is not",sentence) sentence = re.sub("|".join(fileters), " ", sentence) result = [i for i in sentence.split(" ") if len(i) > 0] return result # =======================================进行自定义文本分词【可以用第三方分词工具】:结束======================================= # =======================================Dataset数据准备:开始======================================= class ImdbDataset(Dataset): def __init__(self, wordSequence=None, train=True): super(ImdbDataset,self).__init__() self.wordSequence = wordSequence data_path = r"./data/aclImdb" data_path += r"/train" if train else r"/test" # 文件名拼接【等价于os.path.join()】 self.total_path = [] # 保存所有的文件路径 for temp_path in [r"/pos", r"/neg"]: cur_path = data_path + temp_path self.total_path += [os.path.join(cur_path, i) for i in os.listdir(cur_path) if i.endswith(".txt")] # 将所有文件路径加入到total_path列表中 def __getitem__(self, idx): file = self.total_path[idx] review = tokenlize(open(file, encoding="utf-8").read()) # 读取文件内容(评论) label = int(file.split("_")[-1].split(".")[0]) label = 0 if label < 5 else 1 if self.wordSequence is not None: review = self.wordSequence.transform(review, max_len=MAX_LEN) # 将字符串通过已经保存的“词语-数字”映射器转为数字 return review, label def __len__(self): return len(self.total_path) # =======================================Dataset数据准备:结束======================================= # =======================================DataLoader数据数据批次化:开始======================================= # 自定义collate_fn方式,对batch数据进行处理【batch是list,其中是一个一个元组,每个元组是dataset中__getitem__的结果】 def collate_fn(batch): reviews, labels = zip(*batch) lengths = [len(review) if len(review) < MAX_LEN else MAX_LEN for review in reviews] return reviews, labels, lengths def get_dataloader(dataset, train=True): batch_size = BATCH_SIZE_TRAIN if train else BATCH_SIZE_TEST dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn) return dataloader # =======================================DataLoader数据批次化:结束======================================= if __name__ == '__main__': dataset = ImdbDataset() print("dataset[0] = {0}".format(dataset[0])) dataLoader = get_dataloader(dataset=dataset, train=True) for batch_index,(reviews,labels,lenghts) in enumerate(dataLoader): print("batch_index = {0}".format(batch_index)) print("reviews in this batch = {0}".format(reviews)) print("labels in this batch = {0}".format(labels)) print("lenghts in this batch = {0}".format(lenghts)) break
The output is as follows:
dataset[0] = (['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', 'it', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life', 'such', 'as', 'teachers', 'my', '35', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me', 'to', 'believe', 'that', 'bromwell', "high's", 'satire', 'is', 'much', 'closer', 'to', 'reality', 'than', 'is', 'teachers', 'the', 'scramble', 'to', 'survive', 'financially', 'the', 'insightful', 'students', 'who', 'can', 'see', 'right', 'through', 'their', 'pathetic', "teachers'", 'pomp', 'the', 'pettiness', 'of', 'the', 'whole', 'situation', 'all', 'remind', 'me', 'of', 'the', 'schools', 'i', 'knew', 'and', 'their', 'students', 'when', 'i', 'saw', 'the', 'episode', 'in', 'which', 'a', 'student', 'repeatedly', 'tried', 'to', 'burn', 'down', 'the', 'school', 'i', 'immediately', 'recalled', 'at', 'high', 'a', 'classic', 'line', 'inspector', "i'm", 'here', 'to', 'sack', 'one', 'of', 'your', 'teachers', 'student', 'welcome', 'to', 'bromwell', 'high', 'i', 'expect', 'that', 'many', 'adults', 'of', 'my', 'age', 'think', 'that', 'bromwell', 'high', 'is', 'far', 'fetched', 'what', 'a', 'pity', 'that', 'it', "isn't"], 1)
batch_index = 0
reviews in this batch = (
['this', 'movie', 'starts', 'out', 'with', 'an', 'execution', 'of', 'a', 'practitioner', 'of', 'witchcraft', 'and', 'his', 'mistress', 'his', 'head', 'is', 'chopped', 'off', 'and', 'buried', 'separately', 'of', 'his', 'body', 'sounds', 'like', 'the', 'thing', 'that', "wouldn't", 'die', "doesn't", 'it', 'well', 'it', 'does', 'play', 'out', 'a', 'little', 'like', 'that', 'but', 'once', 'the', 'body', 'is', 'reunited', 'with', 'the', 'head', 'all', 'the', 'interesting', 'and', 'gruesome', 'deaths', 'are', 'done', 'and', 'the', 'movie', 'moves', 'very', 'slowly', 'i', 'mean', 'the', 'movie', 'is', 'only', '88', 'minutes', 'long', 'and', 'i', 'kept', 'thinking', 'when', 'is', 'it', 'going', 'to', 'end', 'the', 'characters', 'in', 'the', 'movie', 'are', 'idiots', 'for', 'the', 'most', 'part', 'and', 'they', 'pretty', 'much', 'deserve', 'to', 'die', 'for', 'being', 'really', 'stupid', 'the', 'villain', 'is', 'also', 'very', 'bad', 'as', 'he', 'is', 'slow', 'moving', 'and', 'really', 'you', 'wonder', 'how', 'he', 'manages', 'to', 'do', 'anything', 'considering', 'he', 'is', 'afraid', 'of', 'jewelery', 'the', 'only', 'thing', 'to', 'keep', 'you', 'watching', 'after', 'the', 'head', 'is', 'reattached', 'is', 'the', 'fact', 'that', 'there', 'are', 'so', 'many', 'boobs', 'being', 'flashed', 'that', 'you', 'really', 'begin', 'to', 'lose', 'track', 'still', 'i', 'want', 'to', 'see', 'a', 'horror', 'movie', 'not', 'a', 'soft', 'core', 'porn', 'flick', 'and', 'as', 'a', 'horror', 'movie', 'it', 'is', 'way', 'to', 'slow', 'moving', 'with', 'way', 'to', 'many', 'slow', 'stretches', 'to', 'be', 'even', 'somewhat', 'enjoyable', 'and', "don't", 'read', 'the', 'back', 'of', 'the', 'box', 'as', 'it', 'made', 'it', 'out', 'like', 'there', 'were', 'flesh', 'eating', 'zombies', 'attacking', 'the', 'town', 'there', "isn't", 'only', 'a', 'small', 'scene', 'where', 'three', 'or', 'four', 'zombies', 'attack', 'a', 'house', 'and', 'are', 'so', 'easily', 'repelled', 'they', 'are', 'not', 'a', 'factor', 'in', 'the', 'movie', 'at', 'all', 'and', 'their', 'scene', 'is', 'rather', 'pointless', 'so', 'for', 'the', 'most', 'part', 'i', 'say', 'you', 'should', 'avoid', 'this', 'movie', 'unless', 'you', 'come', 'across', 'it', 'for', 'really', 'cheap'],
['this', 'movie', 'is', 'a', 'great', 'attempt', 'towards', 'the', 'revival', 'of', 'traditional', 'indian', 'values', 'which', 'are', 'being', 'replaced', 'by', 'western', 'ones', 'its', 'a', 'joint', 'family', 'story', 'showing', 'all', 'the', 'ethics', 'every', 'person', 'should', 'follow', 'while', 'communicating', 'with', 'every', 'single', 'relative', 'around', 'shahid', 'kapoor', 'gives', 'a', 'gr88', 'performance', 'as', 'a', 'desi', 'about', 'to', 'tie', 'knot', 'with', 'amrita', 'rao', 'who', 'is', 'also', 'very', 'desi', 'and', 'she', 'also', 'acts', 'pretty', 'well', 'the', 'genre', 'of', 'the', 'movie', 'is', 'the', 'same', 'as', 'hahk', 'and', 'such', 'movies', 'deserve', 'to', 'be', 'made', 'in', 'india', 'for', 'the', 'revival', 'of', 'old', 'traditional', 'values', 'the', 'movies', "doesn't", 'get', '10', 'as', 'it', "isn't", 'very', 'good', 'at', 'music', 'which', 'counts', 'a', 'lot', 'in', 'every', 'movie', 'besides', 'this', 'it', 'is', 'flawless']
)
labels in this batch = (0, 1)
lenghts in this batch = [278, 117]
When introducing word embeddings we said that text is not turned into vectors directly: it is first mapped to numbers, and the numbers are then mapped to vectors. How do we implement that mapping?
We can store each word together with its index in a dictionary, and provide a method that maps a sentence to a list of indices through that dictionary.
Before implementing text serialization, consider the following points: how to handle words that are not in the vocabulary (an <UNK> token), how to make all sentences the same length (padding short ones with <PAD> and truncating long ones to max_len), and how to keep the vocabulary manageable (filtering words by frequency with min_count / max_count / max_features).

The approach, reflected in the code below: accumulate word frequencies over all sentences (fit), filter the counts by the conditions above, assign each remaining word an index to build the word-to-index dictionary, and implement transform (sentence to indices) and inverse_transform (indices back to words).
import numpy as np


# ======================= Text serialization: start =======================
class WordSequence:
    UNK_TAG = "<UNK>"  # unknown word, i.e. not in the vocabulary
    PAD_TAG = "<PAD>"  # padding symbol for sentences that are too short
    SOS_TAG = "<SOS>"  # start-of-sentence marker
    EOS_TAG = "<EOS>"  # end-of-sentence marker
    UNK = 0
    PAD = 1
    SOS = 2
    EOS = 3

    def __init__(self):
        self.word_index_dict = {self.UNK_TAG: self.UNK, self.PAD_TAG: self.PAD,
                                self.SOS_TAG: self.SOS, self.EOS_TAG: self.EOS}  # word -> index mapping
        self.index_word_dict = {}   # index -> word mapping
        self.word_count_dict = {}   # word -> frequency
        self.fited = False

    def __len__(self):
        return len(self.word_index_dict)

    def fit(self, sentence, min_count=1, max_count=None, max_features=None):
        """Accept one tokenized sentence and update the word statistics.

        :param sentence: [word1, word2, word3, ...]
        :param min_count: minimum word frequency to keep a word
        :param max_count: maximum word frequency to keep a word
        :param max_features: maximum vocabulary size
        """
        for word in sentence:
            self.word_count_dict[word] = self.word_count_dict.get(word, 0) + 1  # after fitting all sentences, word_count_dict holds every word's frequency
        if min_count is not None:   # filter by minimum frequency
            self.word_count_dict = {word: count for word, count in self.word_count_dict.items() if count >= min_count}
        if max_count is not None:   # filter by maximum frequency
            self.word_count_dict = {word: count for word, count in self.word_count_dict.items() if count <= max_count}
        if max_features is not None:  # keep only the most frequent words
            self.word_count_dict = dict(sorted(self.word_count_dict.items(), key=lambda x: x[-1], reverse=True)[:max_features])
        for word in self.word_count_dict:              # build the word -> index mapping from word_count_dict
            if word not in self.word_index_dict.keys():  # add the word if it is not in the mapping yet
                self.word_index_dict[word] = len(self.word_index_dict)  # the next free index is the current dictionary size
        self.fited = True
        self.index_word_dict = dict(zip(self.word_index_dict.values(), self.word_index_dict.keys()))  # invert word_index_dict to get the index -> word mapping

    # word -> index
    def to_index(self, word):
        assert self.fited == True, "fit must be called first"
        return self.word_index_dict.get(word, self.UNK)

    # convert a sentence into a list of indices: [str, str, str] -> [int, int, int]
    def transform(self, sentence, max_len=None, add_eos=False):
        if len(sentence) > max_len:  # sentence too long: truncate
            if add_eos:              # append the <EOS> marker if requested
                sentence = sentence[:max_len - 1] + [self.EOS_TAG]
            else:
                sentence = sentence[:max_len]
        else:                        # sentence too short: pad
            if add_eos:
                sentence = sentence + [self.EOS_TAG] + [self.PAD_TAG] * (max_len - len(sentence) - 1)
            else:
                sentence = sentence + [self.PAD_TAG] * (max_len - len(sentence))
        index_sequence = [self.to_index(word) for word in sentence]
        return index_sequence

    # index -> word
    def to_word(self, index):
        assert self.fited, "fit must be called first"
        return self.index_word_dict.get(index, self.UNK_TAG)

    # convert a list of indices back into a sentence: [int, int, int] -> [str, str, str]
    def inverse_transform(self, indexes):
        sentence = [self.index_word_dict.get(index, "<UNK>") for index in indexes]
        return sentence
# ======================= Text serialization: end =======================


if __name__ == '__main__':
    sentences = [["今天", "天气", "很", "好"], ["今天", "去", "吃", "什么"]]
    ws = WordSequence()
    for sentence in sentences:
        ws.fit(sentence)
    print("ws.word_index_dict = {0}".format(ws.word_index_dict))
    print("ws.fited = {0}".format(ws.fited))
    index_sequence = ws.transform(["今天", "很", "热"], max_len=10)
    print("index_sequence = {0}".format(index_sequence))
The output is:
ws.word_index_dict = {'<UNK>': 1, '<PAD>': 0, '今天': 2, '天气': 3, '很': 4, '好': 5, '去': 6, '吃': 7, '什么': 8}
ws.fited = True
index_sequence = [2, 4, 1, 0, 0, 0, 0, 0, 0, 0]
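As a quick sanity check (a small sketch continuing the demo above, assuming ws and index_sequence are the objects just created), the indices can be mapped back to words with inverse_transform: known words come back unchanged, the unseen word comes back as <UNK>, and padded positions come back as <PAD>.

# Continuing the __main__ demo above.
recovered = ws.inverse_transform(index_sequence)
print("recovered = {0}".format(recovered))
# Known words map back to themselves, the unseen word maps to "<UNK>",
# and every padded position maps back to "<PAD>".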
With WordSequence in place, the next step is to run it over the raw IMDB text to build and save the vocabulary (the word-to-index mapping) for later use.
Processing and saving the IMDB data:
import numpy as np import pickle from torch.utils.data import DataLoader, Dataset import torch import os import re import pickle from tqdm import tqdm BATCH_SIZE_TRAIN = 1000 BATCH_SIZE_TEST = 1000 MAX_LEN = 500 # =======================================进行自定义文本分词【可以用第三方分词工具】:开始======================================= def tokenlize(sentence): fileters = ['!', '"', '#', '$', '%', '&', '\(', '\)', '\*', '\+', ',', '-', '\.', '/', ':', ';', '<', '=', '>', '\?', '@', '\[', '\\', '\]', '^', '_', '`', '\{', '\|', '\}', '~', '\t', '\n', '\x97', '\x96', '”', '“', ] sentence = sentence.lower() # 把大写转化为小写 sentence = re.sub("<br />", " ", sentence) sentence = re.sub("|".join(fileters), " ", sentence) result = [i for i in sentence.split(" ") if len(i) > 0] return result # =======================================进行自定义文本分词【可以用第三方分词工具】:结束======================================= # =======================================Dataset数据准备:开始======================================= class ImdbDataset(Dataset): def __init__(self, wordSequence=None, train=True): super(ImdbDataset,self).__init__() self.wordSequence = wordSequence data_path = r"./data/aclImdb" data_path += r"/train" if train else r"/test" # 文件名拼接【等价于os.path.join()】 self.total_path = [] # 保存所有的文件路径 for temp_path in [r"/pos", r"/neg"]: cur_path = data_path + temp_path self.total_path += [os.path.join(cur_path, i) for i in os.listdir(cur_path) if i.endswith(".txt")] # 将所有文件路径加入到total_path列表中 def __getitem__(self, idx): file = self.total_path[idx] review = tokenlize(open(file, encoding="utf-8").read()) # 读取文件内容(评论) label = int(file.split("_")[-1].split(".")[0]) label = 0 if label < 5 else 1 if self.wordSequence is not None: review = self.wordSequence.transform(review, max_len=MAX_LEN) # 将字符串通过已经保存的“词语-数字”映射器转为数字 return review, label def __len__(self): return len(self.total_path) # =======================================Dataset数据准备:结束======================================= # =======================================DataLoader数据数据批次化:开始======================================= # 自定义collate_fn方式,对batch数据进行处理【batch是list,其中是一个一个元组,每个元组是dataset中__getitem__的结果】 def collate_fn(batch): reviews, labels = zip(*batch) lengths = [len(review) if len(review) < MAX_LEN else MAX_LEN for review in reviews] return reviews, labels, lengths def get_dataloader(dataset, train=True): batch_size = BATCH_SIZE_TRAIN if train else BATCH_SIZE_TEST dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn) return dataloader # =======================================DataLoader数据批次化:结束======================================= # =======================================文本序列化:开始======================================= class WordSequence: UNK_TAG = "<UNK>" # 表示未在词典库里出现的未知词汇 PAD_TAG = "<PAD>" # 句子长度不够时的填充符 SOS_TAG = "<SOS>" # 表示一句文本的开始 EOS_TAG = "<EOS>" # 表示一句文本的结束 UNK = 0 PAD = 1 SOS = 2 EOS = 3 def __init__(self): self.word_index_dict = { self.UNK_TAG: self.UNK, self.PAD_TAG: self.PAD, self.SOS_TAG: self.SOS, self.EOS_TAG: self.EOS} # 初始化词语-数字映射字典 self.index_word_dict = {} # 初始化数字-词语映射字典 self.word_count_dict = {} # 初始化词语-词频统计字典 self.fited = False def __len__(self): return len(self.word_index_dict) # 接受句子,统计词频得到 def fit(self,sentence,min_count=1,max_count=None,max_features=None): # 【min_count:最小词频; max_count: 最大词频; max_features: 最大词语数(词典容量大小)】 """ :param sentence:[word1,word2,word3] :param min_count: 最小出现的次数 :param max_count: 最大出现的次数 :param max_feature: 总词语的最大数量 :return: """ for word in sentence: self.word_count_dict[word] = self.word_count_dict.get(word,0) + 1 
#所有的句子fit之后,self.word_count_dict就有了所有词语的词频 if min_count is not None: # 根据条件统计词频 self.word_count_dict = {word:count for word,count in self.word_count_dict.items() if count >= min_count} if max_count is not None:# 根据条件统计词频 self.word_count_dict = {word:count for word,count in self.word_count_dict.items() if count <= max_count} # 根据条件构造词典 if max_features is not None: # 根据条件保留高词频词语 self.word_count_dict = dict(sorted(self.word_count_dict.items(),key=lambda x:x[-1],reverse=True)[:max_features]) # 保留词频排名靠前的词汇【self.word_count_dict.items()为待排序的对象,key表示排序指标,reverse=True表示降序排列】 for word in self.word_count_dict: # 根据word_count_dict字典构造词语-数字映射字典 if word not in self.word_index_dict.keys(): # 如果当前词语word还没有添加到word_index_dict字典,则添加 self.word_index_dict[word] = len(self.word_index_dict) # 每次word对应一个数字【使用self.word_index_dict添加当前word前已有词汇的数量作为其value】 self.fited = True self.index_word_dict = dict(zip(self.word_index_dict.values(),self.word_index_dict.keys())) #把word_index_dict进行翻转【准备一个index->word的字典】 # word -> index def to_index(self,word): assert self.fited == True,"必须先进行fit操作" return self.word_index_dict.get(word,self.UNK) # 把句子转化为数字数组(向量)【输入:[str,str,str];输出:[int,int,int]】 def transform(self,sentence,max_len=None,add_eos=False): if len(sentence) > max_len: # 句子过长,截取句子 if add_eos: # 如果每句文本需要添加<EOS>结束标记 sentence = sentence[:max_len-1] + [self.EOS] else: sentence = sentence[:max_len] else: # 句子过短,填充句子 if add_eos: # 如果每句文本需要添加<EOS>结束标记 sentence = sentence + [self.EOS] + [self.PAD_TAG] *(max_len - len(sentence) - 1) else: sentence = sentence + [self.PAD_TAG] *(max_len - len(sentence)) index_sequence = [self.to_index(word) for word in sentence] return index_sequence # index -> word def to_word(self,index): assert self.fited , "必须先进行fit操作" if index in self.inversed_dict: return self.inversed_dict[index] return self.UNK_TAG # 把数字数组(向量)转化为句子【输入:[int,int,int];输出:[str,str,str]】 def inverse_transform(self,indexes): sentence = [self.index_word_dict.get(index,"<UNK>") for index in indexes] return sentence # =======================================文本序列化:结束======================================= # =======================================对IMDB的数据集进行fit操作【将字符串转为序号,并保存词语-数字映射】:开始======================================= def fit_save_word_sequence(): dataset_train = ImdbDataset(train=True) dataset_test = ImdbDataset(train=False) dataloader_train = get_dataloader(dataset=dataset_train,train=True) # 训练集批次化数据【文本类型】 dataloader_test = get_dataloader(dataset=dataset_test,train=False) # 测试集批次化数据【文本类型】 ws = WordSequence() # 实例化文本序列化对象 for reviews, labels, lengths in tqdm(dataloader_train, total=len(dataloader_train)): # tqdm的作用是提供运行进度条提示 for review in reviews: ws.fit(review) for reviews, labels, lengths in tqdm(dataloader_test, total=len(dataloader_test)): for review in reviews: ws.fit(review) print("构造的词典的容量大小:len(ws) = {0}".format(len(ws))) pickle.dump(ws, open("./models/ws.pkl", "wb")) # 保存文本序列化对象 # =======================================对IMDB的数据集进行fit操作【将字符串转为序号,并保存词语-数字映射】:结束======================================= if __name__ == '__main__': fit_save_word_sequence() # 对IMDB数据的序列化处理,并保存构建的文本序列化对象【将字符串转为序号,并保存词语-数字映射】
Running fit_save_word_sequence() prints the vocabulary size and saves the fitted WordSequence to ./models/ws.pkl. We can then load the saved mapping and use it inside the Dataset:
import numpy as np import pickle from torch.utils.data import DataLoader, Dataset import torch import os import re import pickle from tqdm import tqdm BATCH_SIZE_TRAIN = 2 BATCH_SIZE_TEST = 2 MAX_LEN = 100 # =======================================进行自定义文本分词【可以用第三方分词工具】:开始======================================= def tokenlize(sentence): fileters = ['!', '"', '#', '$', '%', '&', '\(', '\)', '\*', '\+', ',', '-', '\.', '/', ':', ';', '<', '=', '>', '\?', '@', '\[', '\\', '\]', '^', '_', '`', '\{', '\|', '\}', '~', '\t', '\n', '\x97', '\x96', '”', '“', ] sentence = sentence.lower() # 把大写转化为小写 sentence = re.sub("<br />", " ", sentence) # sentence = re.sub("I'm","I am",sentence) # 当语料量足够多时,可以学习到I'm的含义。 # sentence = re.sub("isn't","is not",sentence) sentence = re.sub("|".join(fileters), " ", sentence) result = [i for i in sentence.split(" ") if len(i) > 0] return result # =======================================进行自定义文本分词【可以用第三方分词工具】:结束======================================= # =======================================Dataset数据准备:开始======================================= class ImdbDataset(Dataset): def __init__(self, wordSequence=None, train=True): super(ImdbDataset,self).__init__() self.wordSequence = wordSequence data_path = r"./data/aclImdb" data_path += r"/train" if train else r"/test" # 文件名拼接【等价于os.path.join()】 self.total_path = [] # 保存所有的文件路径 for temp_path in [r"/pos", r"/neg"]: cur_path = data_path + temp_path self.total_path += [os.path.join(cur_path, i) for i in os.listdir(cur_path) if i.endswith(".txt")] # 将所有文件路径加入到total_path列表中 def __getitem__(self, idx): file = self.total_path[idx] review = tokenlize(open(file, encoding="utf-8").read()) # 读取文件内容(评论) label = int(file.split("_")[-1].split(".")[0]) label = 0 if label < 5 else 1 if self.wordSequence is not None: review = self.wordSequence.transform(review, max_len=MAX_LEN) # 将字符串通过已经保存的“词语-数字”映射器转为数字 return review, label def __len__(self): return len(self.total_path) # =======================================Dataset数据准备:结束======================================= # =======================================DataLoader数据数据批次化:开始======================================= # 自定义collate_fn方式,对batch数据进行处理【batch是list,其中是一个一个元组,每个元组是dataset中__getitem__的结果】 def collate_fn(batch): reviews, labels = zip(*batch) lengths = [len(review) if len(review) < MAX_LEN else MAX_LEN for review in reviews] reviews, labels = torch.LongTensor(np.array(list(reviews))), torch.LongTensor(np.array(list(labels))) # 将tuple类型转为Tensor类型 return reviews, labels, lengths def get_dataloader(dataset, train=True): batch_size = BATCH_SIZE_TRAIN if train else BATCH_SIZE_TEST dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn) return dataloader # =======================================DataLoader数据批次化:结束======================================= # =======================================文本序列化:开始======================================= class WordSequence: UNK_TAG = "<UNK>" # 表示未在词典库里出现的未知词汇 PAD_TAG = "<PAD>" # 句子长度不够时的填充符 SOS_TAG = "<SOS>" # 表示一句文本的开始 EOS_TAG = "<EOS>" # 表示一句文本的结束 UNK = 0 PAD = 1 SOS = 2 EOS = 3 def __init__(self): self.word_index_dict = { self.UNK_TAG: self.UNK, self.PAD_TAG: self.PAD, self.SOS_TAG: self.SOS, self.EOS_TAG: self.EOS} # 初始化词语-数字映射字典 self.index_word_dict = {} # 初始化数字-词语映射字典 self.word_count_dict = {} # 初始化词语-词频统计字典 self.fited = False def __len__(self): return len(self.word_index_dict) # 接受句子,统计词频得到 def fit(self,sentence,min_count=1,max_count=None,max_features=None): # 【min_count:最小词频; max_count: 最大词频; max_features: 最大词语数(词典容量大小)】 
""" :param sentence:[word1,word2,word3] :param min_count: 最小出现的次数 :param max_count: 最大出现的次数 :param max_feature: 总词语的最大数量 :return: """ for word in sentence: self.word_count_dict[word] = self.word_count_dict.get(word,0) + 1 #所有的句子fit之后,self.word_count_dict就有了所有词语的词频 if min_count is not None: # 根据条件统计词频 self.word_count_dict = {word:count for word,count in self.word_count_dict.items() if count >= min_count} if max_count is not None:# 根据条件统计词频 self.word_count_dict = {word:count for word,count in self.word_count_dict.items() if count <= max_count} # 根据条件构造词典 if max_features is not None: # 根据条件保留高词频词语 self.word_count_dict = dict(sorted(self.word_count_dict.items(),key=lambda x:x[-1],reverse=True)[:max_features]) # 保留词频排名靠前的词汇【self.word_count_dict.items()为待排序的对象,key表示排序指标,reverse=True表示降序排列】 for word in self.word_count_dict: # 根据word_count_dict字典构造词语-数字映射字典 if word not in self.word_index_dict.keys(): # 如果当前词语word还没有添加到word_index_dict字典,则添加 self.word_index_dict[word] = len(self.word_index_dict) # 每次word对应一个数字【使用self.word_index_dict添加当前word前已有词汇的数量作为其value】 self.fited = True self.index_word_dict = dict(zip(self.word_index_dict.values(),self.word_index_dict.keys())) #把word_index_dict进行翻转【准备一个index->word的字典】 # word -> index def to_index(self,word): assert self.fited == True,"必须先进行fit操作" return self.word_index_dict.get(word,self.UNK) # 把句子转化为数字数组(向量)【输入:[str,str,str];输出:[int,int,int]】 def transform(self,sentence,max_len=None,add_eos=False): if len(sentence) > max_len: # 句子过长,截取句子 if add_eos: # 如果每句文本需要添加<EOS>结束标记 sentence = sentence[:max_len-1] + [self.EOS] else: sentence = sentence[:max_len] else: # 句子过短,填充句子 if add_eos: # 如果每句文本需要添加<EOS>结束标记 sentence = sentence + [self.EOS] + [self.PAD_TAG] *(max_len - len(sentence) - 1) else: sentence = sentence + [self.PAD_TAG] *(max_len - len(sentence)) index_sequence = [self.to_index(word) for word in sentence] return index_sequence # index -> word def to_word(self,index): assert self.fited , "必须先进行fit操作" if index in self.inversed_dict: return self.inversed_dict[index] return self.UNK_TAG # 把数字数组(向量)转化为句子【输入:[int,int,int];输出:[str,str,str]】 def inverse_transform(self,indexes): sentence = [self.index_word_dict.get(index,"<UNK>") for index in indexes] return sentence # =======================================文本序列化:结束======================================= # =======================================对IMDB的数据集进行fit操作【将字符串转为序号,并保存词语-数字映射】:开始======================================= def fit_save_word_sequence(): dataloader_train = get_dataloader(True) # 训练集批次化数据【文本类型】 dataloader_test = get_dataloader(False) # 测试集批次化数据【文本类型】 ws = WordSequence() # 实例化文本序列化对象 for reviews, label in tqdm(dataloader_train, total=len(dataloader_train)): # tqdm的作用是提供运行进度条提示 for review in reviews: ws.fit(review) for reviews, label in tqdm(dataloader_test, total=len(dataloader_test)): for review in reviews: ws.fit(review) print("构造的词典的容量大小:len(ws) = {0}".format(len(ws))) pickle.dump(ws, open("./models/ws.pkl", "wb")) # 保存文本序列化对象 # =======================================对IMDB的数据集进行fit操作【将字符串转为序号,并保存词语-数字映射】:结束======================================= if __name__ == '__main__': # 一、对IMDB的数据集进行fit操作【将字符串转为序号,并保存词语-数字映射】 # fit_save_word_sequence() # 二、在dataset中使用已保存的“词语-数字”映射器 wordSequence = pickle.load(open("./models/ws.pkl", "rb")) dataset = ImdbDataset(wordSequence=wordSequence) dataLoader = get_dataloader(dataset=dataset, train=True) for batch_index,(reviews,labels,lengths) in enumerate(dataLoader): print("batch_index = {0}".format(batch_index)) print("reviews in this batch = {0}".format(reviews)) 
print("labels in this batch = {0}".format(labels)) print("lengths in this batch = {0}".format(lengths)) break
The output is as follows:
batch_index = 0 reviews in this batch = ( tensor([ 58, 103, 98, 739, 607, 4, 456, 6, 2, 8, 2667, 58, 278, 259, 5242, 464, 2, 763, 290, 382, 459, 98, 215, 5808, 21, 7498, 100, 58, 175, 224, 2, 167, 15, 3353, 152, 2394, 9842, 58, 215, 7449, 14, 1456, 22, 27, 4335, 280, 182, 10739, 98, 16, 7419, 100, 21, 4, 804, 2772, 6, 82, 1609, 73, 2649, 208, 2, 2667, 21, 1643, 10740, 108, 896, 21, 3177, 3746, 8328, 13, 3845, 1238, 140, 2, 54, 55, 1862, 13, 791, 51, 202, 8995, 53, 13, 58, 2682, 3326, 3387, 248, 165, 2, 2305, 58, 909, 26, 64]), tensor([ 2, 5991, 671, 27, 4, 209, 106, 152, 6751, 4, 3762, 1626, 168, 208, 24, 19, 262, 12221, 6467, 21, 540, 53, 1026, 6, 4, 671, 230, 9, 445, 5991, 465, 3612, 87, 4, 1169, 572, 2, 168, 61, 338, 459, 4215, 15, 98, 9, 109, 165, 6751, 98, 9, 2, 133, 1312, 54, 55, 106, 959, 10310, 5693, 10606, 54, 55, 2347, 182, 7884, 651, 7838, 64, 176, 9010, 726, 491, 54, 55, 244, 316, 64, 1776, 214, 316, 64, 176, 896, 1108, 2, 54, 55, 165, 399, 671, 2, 2745, 6, 2, 11479, 11480, 290, 64, 229, 47])) labels in this batch = (tensor(1, dtype=torch.int32), tensor(0, dtype=torch.int32)) lengths in this batch = [100, 100]
A question to think about: earlier we fixed MAX_LEN as the maximum sentence length. What if we want each batch to use the length of its own longest sentence as that batch's maximum? One possible sketch follows.
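One possible answer (a minimal sketch, assuming the Dataset returns raw token lists and ws is an already fitted WordSequence): do the transform inside collate_fn and pad only up to the longest review in the current batch.

import torch

def collate_fn_dynamic(batch):
    # batch is a list of (token_list, label) tuples produced by __getitem__
    reviews, labels = zip(*batch)
    lengths = [len(review) for review in reviews]
    batch_max_len = max(lengths)  # use the longest sentence in this batch as its max length
    reviews = [ws.transform(review, max_len=batch_max_len) for review in reviews]
    return torch.LongTensor(reviews), torch.LongTensor(list(labels)), lengths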
Here we are only practicing the use of word embeddings, so the model has a single layer: an embedding followed by one fully connected layer with a log_softmax output.
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from build_dataset import get_dataloader, ws, MAX_LEN


class IMDBModel(nn.Module):
    def __init__(self, wordSequence, max_len=MAX_LEN):
        super(IMDBModel, self).__init__()
        # num_embeddings is the vocabulary size; embedding_dim is the word-vector dimension
        self.embedding = nn.Embedding(num_embeddings=len(wordSequence), embedding_dim=300, padding_idx=wordSequence.PAD)
        self.fc = nn.Linear(max_len * 300, 10)  # [max_len*300, 10]

    def forward(self, x):
        embeded = self.embedding(x)            # input: [batch_size, max_len]; output: [batch_size, max_len, 300]
        embeded = embeded.view(x.size(0), -1)  # flatten before feeding the fully connected layer
        out = self.fc(embeded)
        return F.log_softmax(out, dim=-1)
class IMDBLstmmodel(nn.Module):
    def __init__(self):
        super(IMDBLstmmodel, self).__init__()
        # Hyperparameters; feel free to change them.
        self.hidden_size = 64
        self.embedding_dim = 200
        self.num_layer = 2
        self.bidriectional = True
        self.bi_num = 2 if self.bidriectional else 1
        self.dropout = 0.5

        self.embedding = nn.Embedding(len(ws), self.embedding_dim, padding_idx=ws.PAD)  # [vocab_size, embedding_dim]
        self.lstm = nn.LSTM(self.embedding_dim, self.hidden_size, self.num_layer,
                            bidirectional=True, dropout=self.dropout)
        # Two fully connected layers with a ReLU in between.
        self.fc = nn.Linear(self.hidden_size * self.bi_num, 20)
        self.fc2 = nn.Linear(20, 2)

    def forward(self, x):
        x = self.embedding(x)
        x = x.permute(1, 0, 2)  # swap axes: [batch, seq, emb] -> [seq, batch, emb]
        h_0, c_0 = self.init_hidden_state(x.size(1))
        _, (h_n, c_n) = self.lstm(x, (h_0, c_0))  # we only need the hidden state of the last LSTM step
        out = torch.cat([h_n[-2, :, :], h_n[-1, :, :]], dim=-1)  # concatenate the last forward and backward hidden states
        out = self.fc(out)
        out = F.relu(out)
        out = self.fc2(out)
        return F.log_softmax(out, dim=-1)

    def init_hidden_state(self, batch_size):
        h_0 = torch.rand(self.num_layer * self.bi_num, batch_size, self.hidden_size).to(device)
        c_0 = torch.rand(self.num_layer * self.bi_num, batch_size, self.hidden_size).to(device)
        return h_0, c_0
The training procedure is the same as before:
train_batch_size = 128
test_batch_size = 1000

# Note: ImdbDataset, get_dataloader, ws and MAX_LEN are assumed to come from the
# data-preparation module shown earlier (e.g. build_dataset).
imdb_model = IMDBModel(wordSequence=ws, max_len=MAX_LEN)
optimizer = optim.Adam(imdb_model.parameters())
criterion = nn.NLLLoss()  # the model already outputs log_softmax, so use the negative log-likelihood loss


def train(epoch, wordSequence):
    imdb_model.train()
    dataset_train = ImdbDataset(wordSequence=wordSequence, train=True)
    dataloader_train = get_dataloader(dataset=dataset_train, train=True)  # batched training data
    for batch_index, (reviews, labels, lengths) in enumerate(dataloader_train):
        optimizer.zero_grad()
        output = imdb_model(reviews)
        loss = criterion(output, labels)  # targets must be in [0, 9], not [1, 10]
        loss.backward()
        optimizer.step()
        if batch_index % 10 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_index * len(reviews), len(dataloader_train.dataset),
                100. * batch_index / len(dataloader_train), loss.item()))
            torch.save(imdb_model.state_dict(), "./models/mnist_net{0}.pkl".format(epoch))
            torch.save(optimizer.state_dict(), './models/mnist_optimizer{0}.pkl'.format(epoch))


def test(wordSequence):
    test_loss = 0
    correct = 0
    imdb_model.eval()
    dataset_test = ImdbDataset(wordSequence=wordSequence, train=False)
    dataloader_test = get_dataloader(dataset=dataset_test, train=False)  # batched test data
    with torch.no_grad():
        for batch_index, (reviews, labels, lengths) in enumerate(dataloader_test):
            output = imdb_model(reviews)
            test_loss += F.nll_loss(output, labels, reduction="sum")
            pred = torch.max(output, dim=-1, keepdim=False)[-1]
            correct += pred.eq(labels.data).sum()
    test_loss = test_loss / len(dataloader_test.dataset)
    print('\nTest set: Avg. loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
        test_loss, correct, len(dataloader_test.dataset),
        100. * correct / len(dataloader_test.dataset)))


if __name__ == '__main__':
    test(wordSequence=ws)
    for i in range(3):
        train(epoch=i, wordSequence=ws)
        test(wordSequence=ws)
We only use a single fully connected layer here (flattening a 500x300 embedding into Linear(150000, 10) already costs about 1.5 million parameters), so the classification performance will not be great; the point is to understand the overall model pipeline and how word embeddings are used.
import numpy as np import pickle from torch.utils.data import DataLoader, Dataset import os import re from tqdm import tqdm import torch from torch import optim import torch.nn as nn import torch.nn.functional as F from torchsummary import summary BATCH_SIZE_TRAIN = 64 BATCH_SIZE_TEST = 64 MAX_LEN = 500 device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # =======================================进行自定义文本分词【可以用第三方分词工具】:开始======================================= def tokenlize(sentence): fileters = ['!', '"', '#', '$', '%', '&', '\(', '\)', '\*', '\+', ',', '-', '\.', '/', ':', ';', '<', '=', '>', '\?', '@', '\[', '\\', '\]', '^', '_', '`', '\{', '\|', '\}', '~', '\t', '\n', '\x97', '\x96', '”', '“', ] sentence = sentence.lower() # 把大写转化为小写 sentence = re.sub("<br />", " ", sentence) # sentence = re.sub("I'm","I am",sentence) # 当语料量足够多时,可以学习到I'm的含义。 # sentence = re.sub("isn't","is not",sentence) sentence = re.sub("|".join(fileters), " ", sentence) result = [i for i in sentence.split(" ") if len(i) > 0] return result # =======================================进行自定义文本分词【可以用第三方分词工具】:结束======================================= # =======================================Dataset数据准备:开始======================================= class ImdbDataset(Dataset): def __init__(self, wordSequence=None, train=True): super(ImdbDataset,self).__init__() self.wordSequence = wordSequence data_path = r"./data/aclImdb" data_path += r"/train" if train else r"/test" # 文件名拼接【等价于os.path.join()】 self.total_path = [] # 保存所有的文件路径 for temp_path in [r"/pos", r"/neg"]: cur_path = data_path + temp_path self.total_path += [os.path.join(cur_path, i) for i in os.listdir(cur_path) if i.endswith(".txt")] # 将所有文件路径加入到total_path列表中 def __getitem__(self, idx): file = self.total_path[idx] review = tokenlize(open(file, encoding="utf-8").read()) # 读取文件内容(评论) label = int(file.split("_")[-1].split(".")[0]) label = 0 if label < 5 else 1 if self.wordSequence is not None: review = self.wordSequence.transform(review, max_len=MAX_LEN) # 将字符串通过已经保存的“词语-数字”映射器转为数字 return review, label def __len__(self): return len(self.total_path) # =======================================Dataset数据准备:结束======================================= # =======================================DataLoader数据数据批次化:开始======================================= # 自定义collate_fn方式,对batch数据进行处理【batch是list,其中是一个一个元组,每个元组是dataset中__getitem__的结果】 def collate_fn(batch): reviews, labels = zip(*batch) lengths = [len(review) if len(review) < MAX_LEN else MAX_LEN for review in reviews] reviews, labels = torch.LongTensor(np.array(list(reviews))),torch.LongTensor(np.array(list(labels))) # 将tuple类型转为Tensor类型 return reviews, labels, lengths def get_dataloader(dataset, train=True): batch_size = BATCH_SIZE_TRAIN if train else BATCH_SIZE_TEST dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn) return dataloader # =======================================DataLoader数据批次化:结束======================================= # =======================================文本序列化:开始======================================= class WordSequence: PAD_TAG = "<PAD>" # 句子长度不够时的填充符 UNK_TAG = "<UNK>" # 表示未在词典库里出现的未知词汇 PAD = 0 UNK = 1 def __init__(self): self.word_index_dict = {self.UNK_TAG:self.UNK, self.PAD_TAG:self.PAD} # 初始化词语-数字映射字典 self.index_word_dict = {} # 初始化数字-词语映射字典 self.word_count_dict = {} # 初始化词语-词频统计字典 self.fited = False def __len__(self): return len(self.word_index_dict) # 接受句子,统计词频得到 def fit(self,sentence,min_count=5,max_count=None,max_features=None): # 【min_count:最小词频; 
max_count: 最大词频; max_features: 最大词语数(词典容量大小)】 for word in sentence: self.word_count_dict[word] = self.word_count_dict.get(word,0) + 1 #所有的句子fit之后,self.word_count_dict就有了所有词语的词频 if min_count is not None: # 根据条件统计词频 self.word_count_dict = {word:count for word,count in self.word_count_dict.items() if count >= min_count} if max_count is not None:# 根据条件统计词频 self.word_count_dict = {word:count for word,count in self.word_count_dict.items() if count <= max_count} # 根据条件构造词典 if max_features is not None: # 根据条件保留高词频词语 self.word_count_dict = dict(sorted(self.word_count_dict.items(),key=lambda x:x[-1],reverse=True)[:max_features]) # 保留词频排名靠前的词汇【self.word_count_dict.items()为待排序的对象,key表示排序指标,reverse=True表示降序排列】 for word in self.word_count_dict: # 根据word_count_dict字典构造词语-数字映射字典 if word not in self.word_index_dict.keys(): # 如果当前词语word还没有添加到word_index_dict字典,则添加 self.word_index_dict[word] = len(self.word_index_dict) # 每次word对应一个数字【使用self.word_index_dict添加当前word前已有词汇的数量作为其value】 self.fited = True self.index_word_dict = dict(zip(self.word_index_dict.values(),self.word_index_dict.keys())) #把word_index_dict进行翻转【准备一个index->word的字典】 # word -> index def to_index(self,word): assert self.fited == True,"必须先进行fit操作" return self.word_index_dict.get(word,self.UNK) # 把句子转化为数字数组(向量)【输入:[str,str,str];输出:[int,int,int]】 def transform(self,sentence,max_len=None): if len(sentence) > max_len: # 句子过长,截取句子 sentence = sentence[:max_len] else: # 句子过短,填充句子 sentence = sentence + [self.PAD_TAG] *(max_len- len(sentence)) index_sequence = [self.to_index(word) for word in sentence] return index_sequence # index -> word def to_word(self,index): assert self.fited , "必须先进行fit操作" if index in self.inversed_dict: return self.inversed_dict[index] return self.UNK_TAG # 把数字数组(向量)转化为句子【输入:[int,int,int];输出:[str,str,str]】 def inverse_transform(self,indexes): sentence = [self.index_word_dict.get(index,"<UNK>") for index in indexes] return sentence # =======================================文本序列化:结束======================================= # =======================================对IMDB的数据集进行fit操作【将字符串转为序号,并保存词语-数字映射】:开始======================================= def fit_save_word_sequence(): dataloader_train = get_dataloader(True) # 训练集批次化数据【文本类型】 dataloader_test = get_dataloader(False) # 测试集批次化数据【文本类型】 ws = WordSequence() # 实例化文本序列化对象 for reviews, label in tqdm(dataloader_train, total=len(dataloader_train)): # tqdm的作用是提供运行进度条提示 for review in reviews: ws.fit(review) for reviews, label in tqdm(dataloader_test, total=len(dataloader_test)): for review in reviews: ws.fit(review) print("构造的词典的容量大小:len(ws) = {0}".format(len(ws))) pickle.dump(ws, open("./models/ws.pkl", "wb")) # 保存文本序列化对象 # =======================================对IMDB的数据集进行fit操作【将字符串转为序号,并保存词语-数字映射】:结束======================================= # =======================================构建神经网络模型:开始======================================= class SimpleModel(nn.Module): def __init__(self,wordSequence, max_len=MAX_LEN): super(SimpleModel,self).__init__() self.embedding = nn.Embedding(num_embeddings=len(wordSequence), embedding_dim=300, padding_idx=wordSequence.PAD) # num_embeddings为词典总词汇数量; embedding_dim为词向量维度 self.fc = nn.Linear(max_len*300, 10) #[max_len*300,10] def forward(self, x): embeded = self.embedding(x) # 输入维度: [batch_size,max_len]; 输出维度: [batch_size,max_len,300] embeded = embeded.view(x.size(0), -1) # 扁平化数据之后作为全连接层的输入 out = self.fc(embeded) return F.log_softmax(out,dim=-1) # =======================================构建神经网络模型:结束======================================= def train(epoch, wordSequence): 
imdb_model.train() dataset_train = ImdbDataset(wordSequence=wordSequence, train=True) dataloader_train = get_dataloader(dataset=dataset_train, train=True) # 训练集批次化数据【文本类型】 for batch_index, (reviews, labels, lengths) in enumerate(dataloader_train): reviews = reviews.to(device) labels = labels.to(device) optimizer.zero_grad() output = imdb_model(reviews) loss = criterion(output, labels) # traget需要是[0,9],不能是[1-10] loss.backward() optimizer.step() if batch_index % 130 == 0: print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(epoch, batch_index * len(reviews), len(dataloader_train.dataset),100. * batch_index / len(dataloader_train), loss.item())) torch.save(imdb_model.state_dict(), "./models/mnist_net{0}.pkl".format(epoch)) torch.save(optimizer.state_dict(), './models/mnist_optimizer{0}.pkl'.format(epoch)) def test(wordSequence): test_loss = 0 correct = 0 imdb_model.eval() dataset_test = ImdbDataset(wordSequence=wordSequence, train=False) dataloader_test = get_dataloader(dataset=dataset_test, train=False) # 测试集批次化数据【文本类型】 with torch.no_grad(): for batch_index, (reviews, labels, lengths) in enumerate(dataloader_test): reviews = reviews.to(device) labels = labels.to(device) output = imdb_model(reviews) test_loss += F.nll_loss(output, labels, reduction="sum") pred = torch.max(output, dim=-1, keepdim=False)[-1] correct += pred.eq(labels.data).sum() test_loss = test_loss / len(dataloader_test.dataset) print('\nTest set: Avg. loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(test_loss, correct, len(dataloader_test.dataset), 100. * correct / len(dataloader_test.dataset))) if __name__ == '__main__': ws = WordSequence() ws = pickle.load(open("./models/ws.pkl", "rb")) # =======================================实例化神经网络各个组件:开始======================================= imdb_model = SimpleModel(wordSequence=ws, max_len=MAX_LEN).to(device) print(imdb_model) optimizer = optim.Adam(imdb_model.parameters()) criterion = nn.NLLLoss() # =======================================实例化神经网络各个组件:结束======================================= # test() for epoch in range(5): train(wordSequence=ws, epoch=epoch) test(wordSequence=ws)
The output is:
SimpleModel(
  (embedding): Embedding(6457, 300, padding_idx=0)
  (fc): Linear(in_features=150000, out_features=10, bias=True)
)
Train Epoch: 0 [0/25000 (0%)]      Loss: 2.318600
Train Epoch: 0 [8320/25000 (33%)]  Loss: 2.879479
Train Epoch: 0 [16640/25000 (66%)] Loss: 4.185676
Train Epoch: 0 [15600/25000 (100%)] Loss: 3.459218

Test set: Avg. loss: 3.0266, Accuracy: 15857/25000 (63.43%)

Train Epoch: 1 [0/25000 (0%)]      Loss: 1.198754
Train Epoch: 1 [8320/25000 (33%)]  Loss: 0.714398
Train Epoch: 1 [16640/25000 (66%)] Loss: 0.794013
Train Epoch: 1 [15600/25000 (100%)] Loss: 0.927658

Test set: Avg. loss: 2.9786, Accuracy: 16858/25000 (67.43%)

Train Epoch: 2 [0/25000 (0%)]      Loss: 0.224821
Train Epoch: 2 [8320/25000 (33%)]  Loss: 0.505306
Train Epoch: 2 [16640/25000 (66%)] Loss: 0.542476
Train Epoch: 2 [15600/25000 (100%)] Loss: 0.253546

Test set: Avg. loss: 3.0380, Accuracy: 17274/25000 (69.10%)

Train Epoch: 3 [0/25000 (0%)]      Loss: 0.008276
Train Epoch: 3 [8320/25000 (33%)]  Loss: 0.061753
Train Epoch: 3 [16640/25000 (66%)] Loss: 0.253349
Train Epoch: 3 [15600/25000 (100%)] Loss: 0.421935

Test set: Avg. loss: 3.2476, Accuracy: 17406/25000 (69.62%)

Train Epoch: 4 [0/25000 (0%)]      Loss: 0.001352
Train Epoch: 4 [8320/25000 (33%)]  Loss: 0.013228
Train Epoch: 4 [16640/25000 (66%)] Loss: 0.126471
Train Epoch: 4 [15600/25000 (100%)] Loss: 0.013214

Test set: Avg. loss: 3.3700, Accuracy: 17570/25000 (70.28%)

Process finished with exit code 0
import numpy as np import pickle from torch.utils.data import DataLoader, Dataset import os import re from tqdm import tqdm import torch from torch import optim import torch.nn as nn import torch.nn.functional as F BATCH_SIZE_TRAIN = 64 BATCH_SIZE_TEST = 64 MAX_LEN = 500 device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # =======================================进行自定义文本分词【可以用第三方分词工具】:开始======================================= def tokenlize(sentence): fileters = ['!', '"', '#', '$', '%', '&', '\(', '\)', '\*', '\+', ',', '-', '\.', '/', ':', ';', '<', '=', '>', '\?', '@', '\[', '\\', '\]', '^', '_', '`', '\{', '\|', '\}', '~', '\t', '\n', '\x97', '\x96', '”', '“', ] sentence = sentence.lower() # 把大写转化为小写 sentence = re.sub("<br />", " ", sentence) # sentence = re.sub("I'm","I am",sentence) # 当语料量足够多时,可以学习到I'm的含义。 # sentence = re.sub("isn't","is not",sentence) sentence = re.sub("|".join(fileters), " ", sentence) result = [i for i in sentence.split(" ") if len(i) > 0] return result # =======================================进行自定义文本分词【可以用第三方分词工具】:结束======================================= # =======================================Dataset数据准备:开始======================================= class ImdbDataset(Dataset): def __init__(self, wordSequence=None, train=True): super(ImdbDataset,self).__init__() self.wordSequence = wordSequence data_path = r"./data/aclImdb" data_path += r"/train" if train else r"/test" # 文件名拼接【等价于os.path.join()】 self.total_path = [] # 保存所有的文件路径 for temp_path in [r"/pos", r"/neg"]: cur_path = data_path + temp_path self.total_path += [os.path.join(cur_path, i) for i in os.listdir(cur_path) if i.endswith(".txt")] # 将所有文件路径加入到total_path列表中 def __getitem__(self, idx): file = self.total_path[idx] review = tokenlize(open(file, encoding="utf-8").read()) # 读取文件内容(评论) label = int(file.split("_")[-1].split(".")[0]) label = 0 if label < 5 else 1 if self.wordSequence is not None: review = self.wordSequence.transform(review, max_len=MAX_LEN) # 将字符串通过已经保存的“词语-数字”映射器转为数字 return review, label def __len__(self): return len(self.total_path) # =======================================Dataset数据准备:结束======================================= # =======================================DataLoader数据数据批次化:开始======================================= # 自定义collate_fn方式,对batch数据进行处理【batch是list,其中是一个一个元组,每个元组是dataset中__getitem__的结果】 def collate_fn(batch): reviews, labels = zip(*batch) lengths = [len(review) if len(review) < MAX_LEN else MAX_LEN for review in reviews] reviews, labels = torch.LongTensor(np.array(list(reviews))),torch.LongTensor(np.array(list(labels))) # 将tuple类型转为Tensor类型 return reviews, labels, lengths def get_dataloader(dataset, train=True): batch_size = BATCH_SIZE_TRAIN if train else BATCH_SIZE_TEST dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn) return dataloader # =======================================DataLoader数据批次化:结束======================================= # =======================================文本序列化:开始======================================= class WordSequence: PAD_TAG = "<PAD>" # 句子长度不够时的填充符 UNK_TAG = "<UNK>" # 表示未在词典库里出现的未知词汇 PAD = 0 UNK = 1 def __init__(self): self.word_index_dict = {self.UNK_TAG:self.UNK, self.PAD_TAG:self.PAD} # 初始化词语-数字映射字典 self.index_word_dict = {} # 初始化数字-词语映射字典 self.word_count_dict = {} # 初始化词语-词频统计字典 self.fited = False def __len__(self): return len(self.word_index_dict) # 接受句子,统计词频得到 def fit(self,sentence,min_count=5,max_count=None,max_features=None): # 【min_count:最小词频; max_count: 最大词频; max_features: 
最大词语数(词典容量大小)】 for word in sentence: self.word_count_dict[word] = self.word_count_dict.get(word,0) + 1 #所有的句子fit之后,self.word_count_dict就有了所有词语的词频 if min_count is not None: # 根据条件统计词频 self.word_count_dict = {word:count for word,count in self.word_count_dict.items() if count >= min_count} if max_count is not None:# 根据条件统计词频 self.word_count_dict = {word:count for word,count in self.word_count_dict.items() if count <= max_count} # 根据条件构造词典 if max_features is not None: # 根据条件保留高词频词语 self.word_count_dict = dict(sorted(self.word_count_dict.items(),key=lambda x:x[-1],reverse=True)[:max_features]) # 保留词频排名靠前的词汇【self.word_count_dict.items()为待排序的对象,key表示排序指标,reverse=True表示降序排列】 for word in self.word_count_dict: # 根据word_count_dict字典构造词语-数字映射字典 if word not in self.word_index_dict.keys(): # 如果当前词语word还没有添加到word_index_dict字典,则添加 self.word_index_dict[word] = len(self.word_index_dict) # 每次word对应一个数字【使用self.word_index_dict添加当前word前已有词汇的数量作为其value】 self.fited = True self.index_word_dict = dict(zip(self.word_index_dict.values(),self.word_index_dict.keys())) #把word_index_dict进行翻转【准备一个index->word的字典】 # word -> index def to_index(self,word): assert self.fited == True,"必须先进行fit操作" return self.word_index_dict.get(word,self.UNK) # 把句子转化为数字数组(向量)【输入:[str,str,str];输出:[int,int,int]】 def transform(self,sentence,max_len=None): if len(sentence) > max_len: # 句子过长,截取句子 sentence = sentence[:max_len] else: # 句子过短,填充句子 sentence = sentence + [self.PAD_TAG] *(max_len- len(sentence)) index_sequence = [self.to_index(word) for word in sentence] return index_sequence # index -> word def to_word(self,index): assert self.fited , "必须先进行fit操作" if index in self.inversed_dict: return self.inversed_dict[index] return self.UNK_TAG # 把数字数组(向量)转化为句子【输入:[int,int,int];输出:[str,str,str]】 def inverse_transform(self,indexes): sentence = [self.index_word_dict.get(index,"<UNK>") for index in indexes] return sentence # =======================================文本序列化:结束======================================= # =======================================对IMDB的数据集进行fit操作【将字符串转为序号,并保存词语-数字映射】:开始======================================= def fit_save_word_sequence(): dataloader_train = get_dataloader(True) # 训练集批次化数据【文本类型】 dataloader_test = get_dataloader(False) # 测试集批次化数据【文本类型】 ws = WordSequence() # 实例化文本序列化对象 for reviews, label in tqdm(dataloader_train, total=len(dataloader_train)): # tqdm的作用是提供运行进度条提示 for review in reviews: ws.fit(review) for reviews, label in tqdm(dataloader_test, total=len(dataloader_test)): for review in reviews: ws.fit(review) print("构造的词典的容量大小:len(ws) = {0}".format(len(ws))) pickle.dump(ws, open("./models/ws.pkl", "wb")) # 保存文本序列化对象 # =======================================对IMDB的数据集进行fit操作【将字符串转为序号,并保存词语-数字映射】:结束======================================= # =======================================构建神经网络模型:开始======================================= class LSTMModel(nn.Module): def __init__(self, wordSequence, max_len=MAX_LEN): super(LSTMModel,self).__init__() self.hidden_size = 64 self.embedding_dim = 200 self.num_layer = 2 self.bidriectional = True self.bi_num = 2 if self.bidriectional else 1 self.dropout = 0.5 #以上部分为超参数,可以自行修改 self.embedding = nn.Embedding(len(wordSequence),self.embedding_dim,padding_idx=wordSequence.PAD) #[N,300] self.lstm = nn.LSTM(self.embedding_dim,self.hidden_size,self.num_layer,bidirectional=True,dropout=self.dropout) #使用两个全连接层,中间使用relu激活函数 self.fc = nn.Linear(self.hidden_size*self.bi_num,20) self.fc2 = nn.Linear(20,2) def forward(self, x): x = self.embedding(x) x = x.permute(1,0,2) #进行轴交换 h_0,c_0 = 
self.init_hidden_state(x.size(1)) _,(h_n,c_n) = self.lstm(x,(h_0,c_0)) #只要最后一个lstm单元处理的结果,这里多去的hidden state out = torch.cat([h_n[-2, :, :], h_n[-1, :, :]], dim=-1) out = self.fc(out) out = F.relu(out) out = self.fc2(out) return F.log_softmax(out,dim=-1) def init_hidden_state(self,batch_size): h_0 = torch.rand(self.num_layer * self.bi_num, batch_size, self.hidden_size).to(device) c_0 = torch.rand(self.num_layer * self.bi_num, batch_size, self.hidden_size).to(device) return h_0,c_0 # =======================================构建神经网络模型:结束======================================= def train(epoch, wordSequence): lstm_model.train() dataset_train = ImdbDataset(wordSequence=wordSequence, train=True) dataloader_train = get_dataloader(dataset=dataset_train, train=True) # 训练集批次化数据【文本类型】 for batch_index, (reviews, labels, lengths) in enumerate(dataloader_train): reviews = reviews.to(device) labels = labels.to(device) optimizer.zero_grad() output = lstm_model(reviews) loss = criterion(output, labels) # traget需要是[0,9],不能是[1-10] loss.backward() optimizer.step() if batch_index % 130 == 0: print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(epoch, batch_index * len(reviews), len(dataloader_train.dataset),100. * batch_index / len(dataloader_train), loss.item())) torch.save(lstm_model.state_dict(), "./models/mnist_net{0}.pkl".format(epoch)) torch.save(optimizer.state_dict(), './models/mnist_optimizer{0}.pkl'.format(epoch)) def test(wordSequence): test_loss = 0 correct = 0 lstm_model.eval() dataset_test = ImdbDataset(wordSequence=wordSequence, train=False) dataloader_test = get_dataloader(dataset=dataset_test, train=False) # 测试集批次化数据【文本类型】 with torch.no_grad(): for batch_index, (reviews, labels, lengths) in enumerate(dataloader_test): reviews = reviews.to(device) labels = labels.to(device) output = lstm_model(reviews) test_loss += F.nll_loss(output, labels, reduction="sum") pred = torch.max(output, dim=-1, keepdim=False)[-1] correct += pred.eq(labels.data).sum() test_loss = test_loss / len(dataloader_test.dataset) print('\nTest set: Avg. loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(test_loss, correct, len(dataloader_test.dataset), 100. * correct / len(dataloader_test.dataset))) if __name__ == '__main__': ws = WordSequence() ws = pickle.load(open("./models/ws.pkl", "rb")) # =======================================实例化神经网络各个组件:开始======================================= lstm_model = LSTMModel(wordSequence=ws, max_len=MAX_LEN).to(device) #在gpu上运行,提高运行速度 print(lstm_model) optimizer = optim.Adam(lstm_model.parameters()) criterion = nn.NLLLoss() # =======================================实例化神经网络各个组件:结束======================================= # test() for epoch in range(5): train(wordSequence=ws, epoch=epoch) test(wordSequence=ws)
The output is:
LSTMModel(
  (embedding): Embedding(6457, 200, padding_idx=0)
  (lstm): LSTM(200, 64, num_layers=2, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=128, out_features=20, bias=True)
  (fc2): Linear(in_features=20, out_features=2, bias=True)
)
Train Epoch: 0 [0/25000 (0%)]      Loss: 0.715346
Train Epoch: 0 [8320/25000 (33%)]  Loss: 0.649131
Train Epoch: 0 [16640/25000 (66%)] Loss: 0.663399
Train Epoch: 0 [15600/25000 (100%)] Loss: 0.625576

Test set: Avg. loss: 0.5931, Accuracy: 17624/25000 (70.50%)

Train Epoch: 1 [0/25000 (0%)]      Loss: 0.650195
Train Epoch: 1 [8320/25000 (33%)]  Loss: 0.663408
Train Epoch: 1 [16640/25000 (66%)] Loss: 0.583336
Train Epoch: 1 [15600/25000 (100%)] Loss: 0.760975

Test set: Avg. loss: 0.5335, Accuracy: 18486/25000 (73.94%)

Train Epoch: 2 [0/25000 (0%)]      Loss: 0.624065
Train Epoch: 2 [8320/25000 (33%)]  Loss: 0.553468
Train Epoch: 2 [16640/25000 (66%)] Loss: 0.452606
Train Epoch: 2 [15600/25000 (100%)] Loss: 0.457217

Test set: Avg. loss: 0.5416, Accuracy: 17789/25000 (71.16%)

Train Epoch: 3 [0/25000 (0%)]      Loss: 0.585943
Train Epoch: 3 [8320/25000 (33%)]  Loss: 0.449566
Train Epoch: 3 [16640/25000 (66%)] Loss: 0.447479
Train Epoch: 3 [15600/25000 (100%)] Loss: 0.490319

Test set: Avg. loss: 0.5000, Accuracy: 19007/25000 (76.03%)

Train Epoch: 4 [0/25000 (0%)]      Loss: 0.437283
Train Epoch: 4 [8320/25000 (33%)]  Loss: 0.411202
Train Epoch: 4 [16640/25000 (66%)] Loss: 0.394551
Train Epoch: 4 [15600/25000 (100%)] Loss: 0.437027

Test set: Avg. loss: 0.4552, Accuracy: 20160/25000 (80.64%)
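Once training is done, the saved weights can be used for prediction. Below is a hedged inference sketch (assuming the definitions from the script above are importable, that the vocabulary was saved to ./models/ws.pkl, and that the last epoch's weights ended up in ./models/mnist_net4.pkl, following the file-name pattern used in train()):

import pickle
import torch

# Load the fitted vocabulary and the trained weights (paths follow the script above).
ws = pickle.load(open("./models/ws.pkl", "rb"))
model = LSTMModel(wordSequence=ws, max_len=MAX_LEN).to(device)
model.load_state_dict(torch.load("./models/mnist_net4.pkl", map_location=device))
model.eval()

review = "this movie was surprisingly good, i would happily watch it again"
tokens = tokenlize(review)                                                       # same tokenizer as training
indices = torch.LongTensor([ws.transform(tokens, max_len=MAX_LEN)]).to(device)   # shape [1, MAX_LEN]
with torch.no_grad():
    log_probs = model(indices)                 # [1, 2] log-probabilities
    pred = log_probs.argmax(dim=-1).item()     # 0 = negative, 1 = positive
print("positive" if pred == 1 else "negative")

For comparison, the final script below implements the same sentiment classifier with torchtext and pretrained GloVe vectors instead of the hand-written Dataset/WordSequence pipeline.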
# -*- coding: utf-8 -*- # pip install torch # pip install torchtext # python -m spacy download en_core_web_sm # python -m spacy download en_core_web_md # https://github.com/explosion/spacy-models # 安装spacy:pip --default-timeout=10000 install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz # 在torchtext中使用spacy时,由于field的默认属性是tokenizer_language=‘en’,所以需要安装en_core_web_md:pip --default-timeout=10000 install https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.3.1/en_core_web_md-2.3.0.tar.gz import numpy as np import torch from torch import nn, optim from torchtext import data, datasets print('GPU:', torch.cuda.is_available()) torch.manual_seed(123) # 一、获取情感分类数据集 TEXT = data.Field(tokenize='spacy') LABEL = data.LabelField(dtype=torch.float) train_data, val_data = datasets.IMDB.splits(TEXT, LABEL) print('len(train_data) = {0}'.format(len(train_data))) print('len(val_data) = {0}'.format(len(val_data))) print('train_data.examples[15].text = {0}'.format(train_data.examples[15].text)) print('train_data.examples[15].label = {0}'.format(train_data.examples[15].label)) # word2vec, glove TEXT.build_vocab(train_data, max_size=10000, vectors='glove.6B.100d') LABEL.build_vocab(train_data) batchsz = 30 device = torch.device('cuda') train_iterator, val_iterator = data.BucketIterator.splits( (train_data, val_data), batch_size=batchsz, device=device ) # 二、构建LSTM神经网络结构 class MyLSTM(nn.Module): def __init__(self, vocab_size, embedding_dim, hidden_dim): super(MyLSTM, self).__init__() self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim) # [b, 1] => [b, 100] 需要编码的单词数量为vocab_size,每个单词编码为一个维度为embedding_dim的vector self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=2, bidirectional=True, dropout=0.5) # [b, 100] => [b, 256] embedding_dim为输入的vector维度,hidden_dim为latent层的维度,num_layers表示神经网络的层数 self.fc = nn.Linear(in_features=hidden_dim * 2, out_features=1) # [b, 256*2] => [b, 1] self.dropout = nn.Dropout(0.5) def forward(self, X): X = self.embedding(X) # [seq, b, 1] => [seq, b, 100] embedding = self.dropout(X) output, (hidden, cell) = self.lstm(embedding) # output: [seq, b, hid_dim*2]; hidden/h&cell/c: [num_layers*2, b, hid_dim] hidden = torch.cat([hidden[-2], hidden[-1]], dim=1) # [num_layers*2, b, hid_dim] => 2 of [b, hid_dim] => [b, hid_dim*2] hidden = self.dropout(hidden) out = self.fc(hidden) # [b, hid_dim*2] => [b, 1] return out # 三、实例化LSTM lstm = MyLSTM(len(TEXT.vocab), 100, 256) # 四、初始化WordEmbedding pretrained_embedding = TEXT.vocab.vectors print('pretrained_embedding:', pretrained_embedding.shape) lstm.embedding.weight.data.copy_(pretrained_embedding) # 利用已经训练好的GloVede的embedding替代原来的embedding print('embedding layer inited.') optimizer = optim.Adam(lstm.parameters(), lr=1e-3) criteon = nn.BCEWithLogitsLoss().to(device) lstm.to(device) # 准确率 def binary_acc(preds, y): preds = torch.round(torch.sigmoid(preds)) correct = torch.eq(preds, y).float() acc = correct.sum() / len(correct) return acc # 八、训练 def train(lstm, iterator, optimizer, criteon): avg_acc = [] lstm.train() # 对data进行循环遍历,使用每个batch的数据进行参数更新 for batch_index, batch in enumerate(iterator): pred = lstm(batch.text).squeeze(1) # [seq, b] => [b, 1] => [b] loss = criteon(pred, batch.label) optimizer.zero_grad() # 设置各个批次优化器初始梯度为0 loss.backward() # 误差反向传播 optimizer.step() # 参数进行更新 acc = binary_acc(pred, batch.label).item() # Training过程中的准确度 avg_acc.append(acc) if batch_index % 10 == 0: print('batch_index = {0}, acc = 
{1}'.format(batch_index, acc)) avg_acc = np.array(avg_acc).mean() print('avg acc:', avg_acc) def eval(lstm, iterator, criteon): avg_acc = [] lstm.eval() with torch.no_grad(): # 不需要计算梯度 for batch in iterator: # [b, 1] => [b] pred = lstm(batch.text).squeeze(1) loss = criteon(pred, batch.label) acc = binary_acc(pred, batch.label).item() avg_acc.append(acc) avg_acc = np.array(avg_acc).mean() print('>>test--avg_acc = {0}'.format(avg_acc)) for epoch in range(10): eval(lstm, val_iterator, criteon) train(lstm, train_iterator, optimizer, criteon)