
A Simple NLP Project: ChatBOT (Part 2)

2. Seq2Seq

(1) How Seq2Seq works

        In a Seq2Seq model, the encoder reads an input sequence of length M and compresses it into a single context vector; the decoder then unrolls that context vector into an output sequence of length N. Together they form an M-to-N model that can handle many tasks with variable-length inputs and outputs.
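        As a rough sketch of the data flow (all sizes below are illustrative assumptions, not values from this project's config):

# minimal shape walk-through of the encoder -> context vector -> decoder idea
import torch
import torch.nn as nn

batch, M, N, emb, hidden = 2, 7, 5, 16, 32

encoder_rnn = nn.GRU(emb, hidden, batch_first=True)
decoder_rnn = nn.GRU(emb, hidden, batch_first=True)

x = torch.randn(batch, M, emb)       # embedded input sequence of length M
_, context = encoder_rnn(x)          # context vector: [1, batch, hidden]

y = torch.randn(batch, N, emb)       # embedded decoder inputs of length N
out, _ = decoder_rnn(y, context)     # the decoder starts from the context vector
print(out.shape)                     # torch.Size([2, 5, 32]) -> one output per target step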

(2) Implementing Seq2Seq

        1. Workflow:

        Convert the text into index sequences, use those sequences to build the encoder and decoder, assemble them into a Seq2Seq model, and finally write the training and evaluation logic.

        Keep a config.py file for the tunable parameters (use_word and so on).
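        The config.py file itself is not shown in the article; the sketch below is an assumption that simply collects the fields the later code reads (the names match the later references, the values are guesses):

# config.py -- minimal sketch; every value here is an assumption
import torch

use_word = False    # False: use input.txt/output.txt, True: use the *_word.txt files
max_len = 20        # maximum sequence length fed to the model
batch_size = 256
embedding_dim = 256
hidden_size = 128
dropout = 0
beam_width = 3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")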

        2. Preparing the training data:

        Single-turn chat data is not easy to come by; here I mainly use the Xiaohuangji (小黄鸡) corpus and a Weibo corpus.

        Download: GitHub - codemayq/chinese-chatbot-corpus: 中文公开聊天语料库

        Place the raw files under corpus_pre/ and write the processed files to corpus/ (the paths used in the script below).

        Build input and output files at both word level and character level:

from tqdm import tqdm
from jieba import lcut as jieba_cut
import re


def format_xiaohuangji_corpus(word=False):
    """Process the Xiaohuangji corpus into two files:
    input  -- the utterances that start an exchange
    output -- the replies"""
    if word:
        corpus_path = "corpus_pre/xiaohuangji50w_nofenci.conv"
        input_path = "corpus/input_word.txt"
        output_path = "corpus/output_word.txt"
    else:
        corpus_path = "corpus_pre/xiaohuangji50w_nofenci.conv"
        input_path = "corpus/input.txt"
        output_path = "corpus/output.txt"

    f_input = open(input_path, "a", encoding="utf-8")
    f_output = open(output_path, "a", encoding="utf-8")
    pair = []
    for line in tqdm(open(corpus_path, encoding="utf-8"), ascii=True):
        if line.strip() == "E":
            if not pair:
                continue
            else:
                assert len(pair) == 2, "each E block must contain exactly 2 M lines"
                if len(pair[0].strip()) >= 1 and len(pair[1].strip()) >= 1:
                    f_input.write(pair[0] + "\n")
                    f_output.write(pair[1] + "\n")
                pair = []
        elif line.startswith("M"):
            line = line[1:]
            if word:
                # split into single characters
                pair.append(" ".join(list(line.strip())))
            else:
                # segment into words with jieba
                pair.append(" ".join(jieba_cut(line.strip())))
    f_input.close()
    f_output.close()


def format_weibo(word=False):
    """The Weibo corpus is fairly noisy, so some cleaning is done here."""
    if word:
        origin_input = "corpus_pre/stc_weibo_train_post"
        input_path = "corpus/input_word.txt"
        origin_output = "corpus_pre/stc_weibo_train_response"
        output_path = "corpus/output_word.txt"
    else:
        origin_input = "corpus_pre/stc_weibo_train_post"
        input_path = "corpus/input.txt"
        origin_output = "corpus_pre/stc_weibo_train_response"
        output_path = "corpus/output.txt"

    f_input = open(input_path, "a", encoding="utf-8")
    f_output = open(output_path, "a", encoding="utf-8")
    with open(origin_input, encoding="utf-8") as in_o, open(origin_output, encoding="utf-8") as out_o:
        for _in, _out in tqdm(zip(in_o, out_o), ascii=True):
            _in = _in.strip()
            _out = _out.strip()
            if _in.endswith(")") or _in.endswith("」") or _in.endswith(")"):
                _in = re.sub("(.*)|「.*?」|\(.*?\)", " ", _in)
            # the Weibo corpus is full of emoticons and symbols, strip them with regexes
            _in = re.sub("我在.*?alink|alink|(.*?\d+x\d+.*?)|#|】|【|-+|_+|via.*?:*.*", " ", _in)
            _in = re.sub("\s+", " ", _in)
            if len(_in) < 1 or len(_out) < 1:
                continue
            if word:
                # collapse to a single line without spaces, then re-split into characters
                _in = re.sub("\s+", "", _in)
                _out = re.sub("\s+", "", _out)
                if len(_in) >= 1 and len(_out) >= 1:
                    f_input.write(" ".join(list(_in)) + "\n")
                    f_output.write(" ".join(list(_out)) + "\n")
            else:
                if len(_in) >= 1 and len(_out) >= 1:
                    f_input.write(_in.strip() + "\n")
                    f_output.write(_out.strip() + "\n")
    f_input.close()
    f_output.close()


if __name__ == '__main__':
    format_xiaohuangji_corpus(False)
    format_xiaohuangji_corpus(True)
    format_weibo(False)
    format_weibo(True)

        The first run failed with an encoding error when writing the files; adding encoding='utf-8' to the open() calls (already included in the code above) fixes it.

        Then simply run the script, which calls all four functions.

       3. Converting text to sequences:

        Map each token to an integer index so that sentences can later be turned into tensors:

import pickle

import config


class Word2Sequence():
    UNK_TAG = "UNK"
    PAD_TAG = "PAD"
    SOS_TAG = "SOS"
    EOS_TAG = "EOS"

    UNK = 0
    PAD = 1
    SOS = 2
    EOS = 3

    def __init__(self):
        self.dict = {
            self.UNK_TAG: self.UNK,
            self.PAD_TAG: self.PAD,
            self.SOS_TAG: self.SOS,
            self.EOS_TAG: self.EOS
        }
        self.count = {}
        self.fited = False

    def to_index(self, word):
        """word -> index"""
        assert self.fited == True, "fit must be called first"
        return self.dict.get(word, self.UNK)

    def to_word(self, index):
        """index -> word"""
        assert self.fited, "fit must be called first"
        if index in self.inversed_dict:
            return self.inversed_dict[index]
        return self.UNK_TAG

    def __len__(self):
        return len(self.dict)

    def fit(self, sentence):
        """
        :param sentence: [word1, word2, word3, ...]
        """
        for a in sentence:
            if a not in self.count:
                self.count[a] = 0
            self.count[a] += 1
        self.fited = True

    def build_vocab(self, min_count=1, max_count=None, max_feature=None):
        """
        :param min_count: keep words that appear at least this often
        :param max_count: keep words that appear at most this often
        :param max_feature: cap on the total vocabulary size
        """
        if min_count is not None:
            self.count = {k: v for k, v in self.count.items() if v >= min_count}
        if max_count is not None:
            self.count = {k: v for k, v in self.count.items() if v <= max_count}

        # cap the vocabulary size, keeping the most frequent words
        if isinstance(max_feature, int):
            count = sorted(list(self.count.items()), key=lambda x: x[1])
            if max_feature is not None and len(count) > max_feature:
                count = count[-int(max_feature):]
            for w, _ in count:
                self.dict[w] = len(self.dict)
        else:
            for w in sorted(self.count.keys()):
                self.dict[w] = len(self.dict)

        # build the reverse index -> word dictionary
        self.inversed_dict = dict(zip(self.dict.values(), self.dict.keys()))

    def transform(self, sentence, max_len=None, add_eos=False):
        """
        Turn a tokenized sentence into a list of indices, padded or truncated to max_len.
        """
        assert self.fited, "fit must be called first"
        r = [self.to_index(i) for i in sentence]
        if max_len is not None:
            if max_len > len(sentence):
                if add_eos:
                    r += [self.EOS] + [self.PAD for _ in range(max_len - len(sentence) - 1)]
                else:
                    r += [self.PAD for _ in range(max_len - len(sentence))]
            else:
                if add_eos:
                    r = r[:max_len - 1]
                    r += [self.EOS]
                else:
                    r = r[:max_len]
        else:
            if add_eos:
                r += [self.EOS]
        return r

    def inverse_transform(self, indices):
        """
        Turn a list of indices back into a list of words.
        :param indices: [1, 2, 3, ...]
        :return: [word1, word2, ...]
        """
        sentence = []
        for i in indices:
            word = self.to_word(i)
            sentence.append(word)
        return sentence


# later this module is imported to get word_sequence, e.g.:
# word_sequence = pickle.load(open("./ws", "rb")) if not config.use_word else \
#     pickle.load(open("./ws_word", "rb"))

if __name__ == '__main__':
    from tqdm import tqdm

    word_sequence = Word2Sequence()
    # word level
    input_path = "corpus/input.txt"
    target_path = "corpus/output.txt"
    for line in tqdm(open(input_path, encoding='utf-8').readlines()):
        word_sequence.fit(line.strip().split())
    for line in tqdm(open(target_path, encoding='utf-8').readlines()):
        word_sequence.fit(line.strip().split())

    # keep at most max_feature=5000 words
    word_sequence.build_vocab(min_count=5, max_count=None, max_feature=5000)
    print(len(word_sequence))
    pickle.dump(word_sequence, open("./ws", "wb"))

        Running the script prints the final vocabulary size and pickles the Word2Sequence object to ./ws.
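        A quick sanity check of the class (the sentences and printed indices below are illustrative, not from the article):

# assumes the Word2Sequence class above
ws = Word2Sequence()
ws.fit(["今天", "天气", "不错"])
ws.fit(["今天", "吃", "什么"])
ws.build_vocab(min_count=1)

idx = ws.transform(["今天", "天气", "好冷"], max_len=6, add_eos=True)
print(idx)                        # unknown words map to UNK, short sentences get EOS + PAD
print(ws.inverse_transform(idx))  # indices mapped back to tokens (including the EOS/PAD tags)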

       4. Preparing the DataLoader:

import torch
from torch.utils.data import Dataset, DataLoader

import config
from wordsequence import word_sequence


class ChatDataset(Dataset):
    def __init__(self):
        super(ChatDataset, self).__init__()
        input_path = "corpus/input.txt"
        target_path = "corpus/output.txt"
        if config.use_word:
            input_path = "corpus/input_word.txt"
            target_path = "corpus/output_word.txt"

        self.input_lines = open(input_path, encoding='utf-8').readlines()
        self.target_lines = open(target_path, encoding='utf-8').readlines()
        assert len(self.input_lines) == len(self.target_lines), "input and target must have the same number of lines"

    def __getitem__(self, index):
        input = self.input_lines[index].strip().split()
        target = self.target_lines[index].strip().split()
        if len(input) == 0 or len(target) == 0:
            # skip empty lines by falling back to the next pair
            input = self.input_lines[index + 1].strip().split()
            target = self.target_lines[index + 1].strip().split()
        # if a sentence is longer than max_len, report max_len as its length
        return input, target, min(len(input), config.max_len), min(len(target), config.max_len)

    def __len__(self):
        return len(self.input_lines)


def collate_fn(batch):
    # 1. sort by input length, longest first (required by pack_padded_sequence)
    batch = sorted(batch, key=lambda x: x[2], reverse=True)
    input, target, input_length, target_length = zip(*batch)

    # 2. pad every sentence to max_len
    input = torch.LongTensor([word_sequence.transform(i, max_len=config.max_len) for i in input])
    target = torch.LongTensor([word_sequence.transform(i, max_len=config.max_len, add_eos=True) for i in target])
    input_length = torch.LongTensor(input_length)
    target_length = torch.LongTensor(target_length)
    return input, target, input_length, target_length


data_loader = DataLoader(dataset=ChatDataset(), batch_size=config.batch_size, shuffle=True,
                         collate_fn=collate_fn, drop_last=True)
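        A quick look at one batch confirms the shapes (the comments assume the config values sketched earlier):

# peek at a single batch
input, target, input_length, target_length = next(iter(data_loader))
print(input.shape, target.shape)            # both [batch_size, max_len]
print(input_length[:5], target_length[:5])  # true (unpadded) lengths, sorted by input length, longest first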

        5. The encoder:

        The encoder is a small network consisting of an embedding layer followed by a GRU layer (a gated recurrent unit, closely related to the LSTM).

import torch.nn as nn

import config
from wordsequence import word_sequence


class Encoder(nn.Module):
    def __init__(self):
        super(Encoder, self).__init__()
        self.vocab_size = len(word_sequence)
        self.dropout = config.dropout
        self.embedding_dim = config.embedding_dim
        self.embedding = nn.Embedding(num_embeddings=self.vocab_size,
                                      embedding_dim=self.embedding_dim,
                                      padding_idx=word_sequence.PAD)
        self.gru = nn.GRU(input_size=self.embedding_dim,
                          hidden_size=config.hidden_size,
                          num_layers=1,
                          batch_first=True,
                          dropout=config.dropout)

    def forward(self, input, input_length):
        embeded = self.embedding(input)  # [batch_size, seq_len, embedding_dim]

        # pack the variable-length sequences so the GRU skips the padding
        # (recent PyTorch versions require the lengths to be a CPU tensor)
        embeded = nn.utils.rnn.pack_padded_sequence(embeded, lengths=input_length.cpu(), batch_first=True)
        out, hidden = self.gru(embeded)
        out, outputs_length = nn.utils.rnn.pad_packed_sequence(out, batch_first=True,
                                                               padding_value=word_sequence.PAD)
        # out: [batch_size, seq_len, hidden_size], hidden: [1, batch_size, hidden_size]
        return out, hidden
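        A minimal smoke test on a batch from the DataLoader above (the shapes in the comments are what the code returns; the run itself is just for illustration):

# shape check for the encoder on one batch
encoder = Encoder()
input, target, input_length, target_length = next(iter(data_loader))
out, hidden = encoder(input, input_length)
print(out.shape)     # [batch_size, longest_length_in_batch, hidden_size]
print(hidden.shape)  # [1, batch_size, hidden_size]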

        6. The decoder:

        The decoder has four parts: an embedding layer, a GRU layer, a fully connected layer, and a log-softmax over the vocabulary.

        forward runs the whole decoding loop, passing each step's output and hidden state on to the next step; forward_step computes a single decoding step.

import torch
import torch.nn as nn
import torch.nn.functional as F
import random

import config
from wordsequence import word_sequence


class Decoder(nn.Module):
    def __init__(self):
        super(Decoder, self).__init__()
        self.max_seq_len = config.max_len
        self.vocab_size = len(word_sequence)
        self.embedding_dim = config.embedding_dim
        self.dropout = config.dropout

        self.embedding = nn.Embedding(num_embeddings=self.vocab_size,
                                      embedding_dim=self.embedding_dim,
                                      padding_idx=word_sequence.PAD)
        self.gru = nn.GRU(input_size=self.embedding_dim,
                          hidden_size=config.hidden_size,
                          num_layers=1,
                          batch_first=True,
                          dropout=self.dropout)
        self.fc = nn.Linear(config.hidden_size, self.vocab_size)

    def forward(self, encoder_hidden, target, target_length):
        # encoder_hidden: [1, batch_size, hidden_size]
        # target: [batch_size, seq_len]
        decoder_input = torch.LongTensor([[word_sequence.SOS]] * config.batch_size).to(config.device)
        decoder_outputs = torch.zeros(config.batch_size, config.max_len, self.vocab_size).to(config.device)
        decoder_hidden = encoder_hidden

        for t in range(config.max_len):
            decoder_output_t, decoder_hidden = self.forward_step(decoder_input, decoder_hidden)
            decoder_outputs[:, t, :] = decoder_output_t
            # feed the most likely token back in as the next input
            value, index = torch.topk(decoder_output_t, 1)  # index: [batch_size, 1]
            decoder_input = index
        return decoder_outputs, decoder_hidden

    def forward_step(self, decoder_input, decoder_hidden):
        """
        :param decoder_input: [batch_size, 1]
        :param decoder_hidden: [1, batch_size, hidden_size]
        :return: out: [batch_size, vocab_size], decoder_hidden: [1, batch_size, hidden_size]
        """
        embeded = self.embedding(decoder_input)                   # [batch_size, 1, embedding_dim]
        out, decoder_hidden = self.gru(embeded, decoder_hidden)   # out: [batch_size, 1, hidden_size]
        out = out.squeeze(1)                                      # [batch_size, hidden_size]
        out = F.log_softmax(self.fc(out), dim=-1)                 # [batch_size, vocab_size]
        return out, decoder_hidden

        7. Assembling the model:

        The Seq2Seq wrapper simply feeds the encoder's final hidden state into the decoder:

import torch
import torch.nn as nn


class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input, target, input_length, target_length):
        encoder_outputs, encoder_hidden = self.encoder(input, input_length)
        decoder_outputs, decoder_hidden = self.decoder(encoder_hidden, target, target_length)
        return decoder_outputs, decoder_hidden

    def evaluation(self, inputs, input_length):
        encoder_outputs, encoder_hidden = self.encoder(inputs, input_length)
        decoded_sentence = self.decoder.evaluation(encoder_hidden)
        return decoded_sentence
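        Seq2Seq.evaluation calls a decoder.evaluation method that the article does not show before the beam-search section; a minimal greedy-decoding sketch of it, kept consistent with forward_step above, might look like this (an assumption, not the article's code):

# greedy evaluation method on the Decoder (sketch)
def evaluation(self, encoder_hidden):
    batch_size = encoder_hidden.size(1)
    decoder_input = torch.LongTensor([[word_sequence.SOS]] * batch_size).to(config.device)
    decoder_hidden = encoder_hidden
    decoded_indices = []

    for t in range(config.max_len):
        decoder_output_t, decoder_hidden = self.forward_step(decoder_input, decoder_hidden)
        value, index = torch.topk(decoder_output_t, 1)  # greedily keep the most probable token
        decoder_input = index                           # and feed it back in as the next input
        decoded_indices.append(index.squeeze(1))

    # [batch_size, max_len] of token ids, to be mapped back with word_sequence.inverse_transform
    return torch.stack(decoded_indices, dim=1)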

         8. Training:

import torch
import torch.nn as nn
from torch import optim

import config
from Encoder import Encoder
from Decoder import Decoder
from seq2seq import Seq2Seq
from Dataset_Dataloader import data_loader as train_dataloader
from wordsequence import word_sequence

encoder = Encoder()
decoder = Decoder()
model = Seq2Seq(encoder, decoder)
# device is defined in config.py
model.to(config.device)

# resume from a previous run (comment these load calls out for the very first run)
model.load_state_dict(torch.load("model/seq2seq_model"))
optimizer = optim.Adam(model.parameters())
optimizer.load_state_dict(torch.load("model/seq2seq_optimizer"))

criterion = nn.NLLLoss(ignore_index=word_sequence.PAD, reduction="mean")


def get_loss(decoder_outputs, target):
    target = target.view(-1)  # [batch_size * max_len]
    decoder_outputs = decoder_outputs.view(config.batch_size * config.max_len, -1)
    return criterion(decoder_outputs, target)


def train(epoch):
    for idx, (input, target, input_length, target_len) in enumerate(train_dataloader):
        input = input.to(config.device)
        target = target.to(config.device)
        input_length = input_length.to(config.device)
        target_len = target_len.to(config.device)

        optimizer.zero_grad()
        # decoder_outputs: [batch_size, max_len, vocab_size], target: [batch_size, max_len]
        decoder_outputs, decoder_hidden = model(input, target, input_length, target_len)
        loss = get_loss(decoder_outputs, target)
        loss.backward()
        optimizer.step()

        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, idx * len(input), len(train_dataloader.dataset),
            100. * idx / len(train_dataloader), loss.item()))

        torch.save(model.state_dict(), "model/seq2seq_model")
        torch.save(optimizer.state_dict(), 'model/seq2seq_optimizer')


if __name__ == '__main__':
    for i in range(10):
        train(i)
        My machine nearly caught fire: training took about 12 hours.

         Even so, the results are not great: the accuracy is only around 50% and the loss stays high. The reason is that in this model the encoder squeezes the whole input sentence into a single context vector which the decoder then has to rely on. When the input sentence is long, that one vector has to carry too much information, so the decoder effectively works from a noisy, lossy summary, which drags the accuracy down and keeps the loss high.

3. Attention

        To ease this bottleneck we add an attention mechanism, i.e. we let the decoder focus only on the relevant information: instead of handing the decoder nothing but the encoder's final state, it can look back at the encoder outputs at every decoding step.

(1) How attention works:

        Reference: Hung-yi Lee's lectures.

        Initialize the decoder state z0. Match z0 against the encoder output at every timestep; the match operation can be a small neural network, and in the simplest case is just a matrix multiplication. Run the resulting scores through a softmax so they sum to 1, and take the weighted sum of the encoder outputs with these weights to obtain the context c0. Feed c0 together with z0 into the decoder to get h1 (and the new state z1), and repeat the loop until the decoder outputs the end-of-sequence token.
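        In code, one attention step is just score -> softmax -> weighted sum. A standalone sketch with made-up shapes (not the article's class):

import torch
import torch.nn.functional as F

batch, seq_len, hidden = 2, 6, 32
encoder_outputs = torch.randn(batch, seq_len, hidden)   # one vector per input timestep
z = torch.randn(batch, hidden)                          # current decoder state

scores = torch.bmm(encoder_outputs, z.unsqueeze(-1)).squeeze(-1)  # dot-product match: [batch, seq_len]
weights = F.softmax(scores, dim=-1)                               # attention weights, sum to 1
context = torch.bmm(weights.unsqueeze(1), encoder_outputs)        # weighted sum: [batch, 1, hidden]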

        

 (2) Soft attention vs. hard attention

        1. Soft attention:

        Every input token on the encoder side receives an attention probability.

        2. Hard attention:

        Tries to pick out the single input token that corresponds to a given output token. But in NLP words are rarely independent of their neighbours, so attending to exactly one token throws information away; soft attention is therefore the usual choice in NLP.

(3) Global attention vs. local attention

        Global attention computes attention weights over all of the encoder's outputs.

        Local attention computes weights over only part of the encoder's outputs (a window around the current position), which reduces the amount of computation, especially for long sentences.

        Here we use global attention.

        Computing the weights for global attention: the decoder hidden state is scored against every encoder output. The class below supports the three Luong-style scoring functions: dot (a plain dot product), general (the hidden state is first transformed by a linear layer Wa, then dotted with each encoder output), and concat (hidden state and encoder output are concatenated, passed through Wa and a tanh, then projected with a vector Va). The scores are normalised with a softmax to give the attention weights.

(4) Implementation:

        In short, the attention module's job is just to compute these weights; the class below implements the global-attention scoring functions described above.

import torch
import torch.nn as nn
import torch.nn.functional as F

import config


class Attention(nn.Module):
    def __init__(self, method, batch_size, hidden_size):
        super(Attention, self).__init__()
        self.method = method
        self.hidden_size = hidden_size
        assert self.method in ["dot", "general", "concat"], \
            "method must be one of dot, general, concat, got {}".format(self.method)

        if self.method == "dot":
            pass
        elif self.method == "general":
            self.Wa = nn.Linear(hidden_size, hidden_size, bias=False)
        elif self.method == "concat":
            self.Wa = nn.Linear(hidden_size * 2, hidden_size, bias=False)
            self.Va = nn.Parameter(torch.FloatTensor(batch_size, hidden_size))

    def forward(self, hidden, encoder_outputs):
        """
        :param hidden: [1, batch_size, hidden_size]
        :param encoder_outputs: [batch_size, seq_len, hidden_size]
        :return: attention weights [batch_size, 1, seq_len]
        """
        batch_size, seq_len, hidden_size = encoder_outputs.size()
        hidden = hidden.squeeze(0)  # [batch_size, hidden_size]

        if self.method == "dot":
            return self.dot_score(hidden, encoder_outputs)
        elif self.method == "general":
            return self.general_score(hidden, encoder_outputs)
        elif self.method == "concat":
            return self.concat_score(hidden, encoder_outputs)

    def _score(self, batch_size, seq_len, hidden, encoder_outputs):
        # naive double-loop version of the dot score -- far too slow, kept only for reference
        attn_energies = torch.zeros(batch_size, seq_len).to(config.device)  # [batch_size, seq_len]
        for b in range(batch_size):
            for i in range(seq_len):
                # hidden: [batch_size, hidden_size], encoder_outputs: [batch_size, seq_len, hidden_size]
                attn_energies[b, i] = hidden[b, :].dot(encoder_outputs[b, i])  # dot score
        return F.softmax(attn_energies, dim=-1).unsqueeze(1)  # [batch_size, 1, seq_len]

    def dot_score(self, hidden, encoder_outputs):
        """
        dot attention
        :param hidden: [batch_size, hidden_size] ---> [batch_size, hidden_size, 1]
        :param encoder_outputs: [batch_size, seq_len, hidden_size]
        """
        hidden = hidden.unsqueeze(-1)
        attn_energies = torch.bmm(encoder_outputs, hidden)
        attn_energies = attn_energies.squeeze(-1)  # [batch_size, seq_len, 1] ==> [batch_size, seq_len]
        return F.softmax(attn_energies, dim=-1).unsqueeze(1)  # [batch_size, 1, seq_len]

    def general_score(self, hidden, encoder_outputs):
        """
        general attention
        :param hidden: [batch_size, hidden_size]
        :param encoder_outputs: [batch_size, seq_len, hidden_size]
        """
        x = self.Wa(hidden)  # [batch_size, hidden_size]
        x = x.unsqueeze(-1)  # [batch_size, hidden_size, 1]
        attn_energies = torch.bmm(encoder_outputs, x).squeeze(-1)  # [batch_size, seq_len]
        return F.softmax(attn_energies, dim=-1).unsqueeze(1)  # [batch_size, 1, seq_len]

    def concat_score(self, hidden, encoder_outputs):
        """
        concat attention
        :param hidden: [batch_size, hidden_size]
        :param encoder_outputs: [batch_size, seq_len, hidden_size]
        """
        # repeat the hidden state so every batch entry has seq_len copies of it
        x = hidden.unsqueeze(1).repeat(1, encoder_outputs.size(1), 1)  # [batch_size, seq_len, hidden_size]
        x = torch.tanh(self.Wa(torch.cat([x, encoder_outputs], dim=-1)))  # [batch_size, seq_len, hidden_size]
        # Va: [batch_size, hidden_size] ---> [batch_size, hidden_size, 1]
        attn_energies = torch.bmm(x, self.Va.unsqueeze(2))  # [batch_size, seq_len, 1]
        attn_energies = attn_energies.squeeze(-1)
        return F.softmax(attn_energies, dim=-1).unsqueeze(1)  # [batch_size, 1, seq_len]

        The decoder's forward_step function is then modified so that it also receives the encoder outputs and applies attention.
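        The article does not show the modified code at this point; the sketch below assumes the Decoder gains an attention module and a combination layer (the names self.attn and self.wa_concat are assumptions), in the style of Luong attention:

# assumed additions to Decoder.__init__:
#     self.attn = Attention("general", config.batch_size, config.hidden_size)
#     self.wa_concat = nn.Linear(config.hidden_size * 2, config.hidden_size, bias=False)

def forward_step(self, decoder_input, decoder_hidden, encoder_outputs):
    """
    :param decoder_input: [batch_size, 1]
    :param decoder_hidden: [1, batch_size, hidden_size]
    :param encoder_outputs: [batch_size, seq_len, hidden_size]
    :return: out [batch_size, vocab_size], decoder_hidden, attn_weights [batch_size, 1, seq_len]
    """
    embeded = self.embedding(decoder_input)                      # [batch_size, 1, embedding_dim]
    gru_out, decoder_hidden = self.gru(embeded, decoder_hidden)  # [batch_size, 1, hidden_size]

    attn_weights = self.attn(decoder_hidden, encoder_outputs)    # [batch_size, 1, seq_len]
    context = attn_weights.bmm(encoder_outputs)                  # [batch_size, 1, hidden_size]

    # combine the GRU output with the attention context (Luong-style)
    concat = torch.cat([gru_out, context], dim=-1).squeeze(1)    # [batch_size, hidden_size * 2]
    concat = torch.tanh(self.wa_concat(concat))                  # [batch_size, hidden_size]

    out = F.log_softmax(self.fc(concat), dim=-1)                 # [batch_size, vocab_size]
    return out, decoder_hidden, attn_weights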

4. Beam Search

        With the model in place, the question is how to decode at evaluation time. Always emitting the single most probable token at each step is greedy search, and the resulting sentences are often not fluent. The opposite extreme, searching for the output sequence with the highest overall probability, is intractable, because the number of candidate sequences grows exponentially with their length.

        Beam search is the compromise between the two. With a beam width of 2, we keep only the 2 most probable partial hypotheses at each timestep, expand them at the next timestep and again keep only the best 2, and so on. This bounds the size of the search space and keeps decoding efficient.
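        A toy illustration of the pruning with a min-heap (all numbers are made up):

import heapq

beam_width = 2
# two partial hypotheses and made-up probabilities for the next token
hyps = [(0.6, ["你"]), (0.3, ["我"])]
next_token_probs = {"好": 0.5, "们": 0.3, "<EOS>": 0.2}

candidates = []
for prob, seq in hyps:
    for tok, p in next_token_probs.items():
        heapq.heappush(candidates, (prob * p, seq + [tok]))
        if len(candidates) > beam_width:
            heapq.heappop(candidates)  # drop the lowest-probability hypothesis

print(sorted(candidates, reverse=True))  # only the 2 best hypotheses survive this step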

        The beam can be implemented with a small heap (Python's heapq min-heap) capped at beam_width entries: whenever it grows past the cap, the lowest-probability hypothesis is dropped.

import heapq

import config


class Beam:
    def __init__(self):
        self.heap = list()                    # stores the hypotheses
        self.beam_width = config.beam_width   # maximum number kept

    def add(self, probility, complete, seq, decoder_input, decoder_hidden):
        """
        Add a hypothesis; if the beam is over capacity, drop the worst one.
        :param probility: product of the token probabilities so far
        :param complete: whether the last token is EOS
        :param seq: list of all tokens so far
        :param decoder_input: input for the next decoding step, obtained from this step
        :param decoder_hidden: hidden state for the next decoding step
        """
        heapq.heappush(self.heap, [probility, complete, seq, decoder_input, decoder_hidden])
        # cap the beam size: heapq is a min-heap, so the lowest-probability entry is popped
        if len(self.heap) > self.beam_width:
            heapq.heappop(self.heap)

    def __iter__(self):  # make the beam iterable
        return iter(self.heap)

        Then add the following beam-search method to the decoder:

# new method on the Decoder
def evaluatoin_beamsearch_heapq(self, encoder_outputs, encoder_hidden):
    """Beam search implemented with a heap, i.e. a priority queue ordered by probability."""
    batch_size = encoder_hidden.size(1)

    # 1. build the input for the first step and store it in the beam
    decoder_input = torch.LongTensor([[word_sequence.SOS] * batch_size]).to(config.device)
    decoder_hidden = encoder_hidden
    prev_beam = Beam()
    prev_beam.add(1, False, [decoder_input], decoder_input, decoder_hidden)

    while True:
        cur_beam = Beam()
        # 2. take the hypotheses from the previous beam and run forward_step on each
        #    (the leading underscore marks values taken from the previous step)
        for _probility, _complete, _seq, _decoder_input, _decoder_hidden in prev_beam:
            # if the hypothesis already ended with EOS, carry it over unchanged
            # (it may be complete without being the most probable one yet)
            if _complete == True:
                cur_beam.add(_probility, _complete, _seq, _decoder_input, _decoder_hidden)
            else:
                decoder_output_t, decoder_hidden, _ = self.forward_step(_decoder_input, _decoder_hidden,
                                                                        encoder_outputs)
                value, index = torch.topk(decoder_output_t, config.beam_width)  # [batch_size=1, beam_width]
                # 3. take the top-k (k = beam width) tokens as candidates for the next input
                for m, n in zip(value[0], index[0]):
                    decoder_input = torch.LongTensor([[n]]).to(config.device)
                    seq = _seq + [n]
                    probility = _probility * m
                    if n.item() == word_sequence.EOS:
                        complete = True
                    else:
                        complete = False
                    # 4. store everything the next step needs in the new beam
                    cur_beam.add(probility, complete, seq,
                                 decoder_input, decoder_hidden)

        # 5. take the most probable hypothesis in the new beam; stop if it ends with EOS
        #    or has reached the maximum length
        best_prob, best_complete, best_seq, _, _ = max(cur_beam)
        if best_complete == True or len(best_seq) - 1 == config.max_len:  # minus the SOS token
            return self._prepar_seq(best_seq)
        else:
            # 6. otherwise iterate again over the new beam
            prev_beam = cur_beam

def _prepar_seq(self, seq):
    """Basic cleanup of the result so it can later be mapped back to text."""
    if seq[0].item() == word_sequence.SOS:
        seq = seq[1:]
    if seq[-1].item() == word_sequence.EOS:
        seq = seq[:-1]
    seq = [i.item() for i in seq]
    return seq

5. Optimization

(1) Teacher forcing

        When an RNN decoder is trained by feeding its own previous prediction into the next step, a single early mistake can snowball through the rest of the sequence. Teacher forcing instead feeds the ground-truth token as the next input during training; in practice the two strategies are alternated at random.

# inside Decoder.forward: randomly choose teacher forcing for this batch
use_teacher_forcing = random.random() > 0.5
if use_teacher_forcing:  # teacher forcing: feed the ground truth as the next input
    for t in range(config.max_len):
        decoder_output_t, decoder_hidden, decoder_attn_t = self.forward_step(decoder_input, decoder_hidden,
                                                                             encoder_outputs)
        decoder_outputs[:, t, :] = decoder_output_t
        # use the correct target token as the next input
        decoder_input = target[:, t].unsqueeze(1)  # [batch_size, 1]
else:  # no teacher forcing: feed the model's own prediction as the next input
    for t in range(config.max_len):
        decoder_output_t, decoder_hidden, decoder_attn_t = self.forward_step(decoder_input, decoder_hidden,
                                                                             encoder_outputs)
        decoder_outputs[:, t, :] = decoder_output_t
        value, index = torch.topk(decoder_output_t, 1)  # index: [batch_size, 1]
        decoder_input = index

(2) Dealing with vanishing and exploding gradients

        Gradient clipping: set a threshold and clip the gradient norm with nn.utils.clip_grad_norm_ between backward() and optimizer.step(), as sketched below.
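        A sketch of where the call goes in the training loop above (the threshold 5.0 is an arbitrary assumption):

loss.backward()
# clip the global gradient norm to a threshold before the parameter update
nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
optimizer.step()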
