BERT (CPU) Code Reproduction

A BERT pre-training reproduction, annotated and explained line by line. The script runs as-is on CPU.

Code from https://github.com/graykode/nlp-tutorial/tree/master/5-2.BERT

import math
import re
import time
from random import *
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# 10. MyDataset
class MyDataset(Dataset):
    # store the data
    def __init__(self, input_ids, segment_ids, masked_tokens, masked_pos, isNext):
        self.input_ids = input_ids
        self.segment_ids = segment_ids
        self.masked_tokens = masked_tokens
        self.masked_pos = masked_pos
        self.isNext = isNext

    # return the number of samples (how many rows of data there are)
    def __len__(self):
        return len(self.input_ids)
        # return self.input_ids.shape[0]

    # return the sample at the given index; each item has one dimension fewer than the tensors built from make_batch
    def __getitem__(self, idx):
        return self.input_ids[idx], self.segment_ids[idx], self.masked_tokens[idx], self.masked_pos[idx], self.isNext[idx]

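The comment about __getitem__ returning one dimension fewer is easy to verify with a toy dataset. A minimal standalone sketch (the ToyDataset class and its values are made up for illustration, not part of the tutorial script):

# --- standalone illustration, not part of the tutorial script ---
import torch
from torch.utils.data import Dataset, DataLoader

class ToyDataset(Dataset):
    def __init__(self, x):
        self.x = x
    def __len__(self):
        return len(self.x)
    def __getitem__(self, idx):
        return self.x[idx]

data = torch.arange(12).view(6, 2)   # 6 samples, each of shape [2]
ds = ToyDataset(data)
print(ds[0].shape)                   # torch.Size([2])  -- the batch dimension is gone
loader = DataLoader(ds, batch_size=3)
print(next(iter(loader)).shape)      # torch.Size([3, 2]) -- the DataLoader stacks items and adds it back
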
# 9. gelu activation function (GAUSSIAN ERROR LINEAR UNITS)
def gelu(x):
    """
    Implementation of the gelu activation function.
    For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
    0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
    Also see https://arxiv.org/abs/1606.08415
    """
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))  # x == [6, 30, 3072]

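The docstring notes that OpenAI GPT's tanh-based gelu differs only slightly from the exact erf form used here. A quick standalone comparison (the function names and test points below are my own, chosen only for illustration):

# --- standalone illustration, not part of the tutorial script ---
import math
import torch

def gelu_exact(x):
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

def gelu_tanh(x):
    return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))

x = torch.linspace(-4, 4, steps=9)
print(torch.max(torch.abs(gelu_exact(x) - gelu_tanh(x))))  # the gap stays well below 1e-2 on this range
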
# 8. PoswiseFeedForwardNet
class PoswiseFeedForwardNet(nn.Module):
    def __init__(self):
        super(PoswiseFeedForwardNet, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        # x == [6, 30, 768]
        return self.fc2(gelu(self.fc1(x)))  # 9.

# 7. ScaledDotProductAttention
class ScaledDotProductAttention(nn.Module):
    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()

    def forward(self, Q, K, V, attn_mask):
        # Q, K, V == [6, 12, 30, 64]   attn_mask == [6, 12, 30, 30]
        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k)  # [6, 12, 30, 64] * [6, 12, 64, 30] == [6, 12, 30, 30]
        scores.masked_fill_(attn_mask, -1e9)  # give PAD positions a huge negative score
        # softmax over the last dimension, i.e. across each row of the score matrix
        attn = nn.Softmax(dim=-1)(scores)  # [6, 12, 30, 30]
        context = torch.matmul(attn, V)  # [6, 12, 30, 30] * [6, 12, 30, 64] == [6, 12, 30, 64]
        return context, attn

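Setting the masked scores to -1e9 before the softmax is what pushes the attention weights of PAD positions to effectively zero. A minimal standalone sketch with made-up scores for a single query over four keys:

# --- standalone illustration, not part of the tutorial script ---
import torch
import torch.nn as nn

scores = torch.tensor([[2.0, 1.0, 0.5, 3.0]])            # one query attending over 4 keys
pad_mask = torch.tensor([[False, False, False, True]])   # the last key is padding

scores.masked_fill_(pad_mask, -1e9)                       # padded key gets a huge negative score
attn = nn.Softmax(dim=-1)(scores)
print(attn)   # the last weight is ~0, so the PAD token contributes nothing to the context
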
# 6. MultiHeadAttention
class MultiHeadAttention(nn.Module):
    def __init__(self):
        super(MultiHeadAttention, self).__init__()
        self.W_Q = nn.Linear(d_model, d_k * n_heads)
        self.W_K = nn.Linear(d_model, d_k * n_heads)
        self.W_V = nn.Linear(d_model, d_v * n_heads)
        self.linear = nn.Linear(n_heads * d_v, d_model)
        self.layer_norm = nn.LayerNorm(d_model)

    def forward(self, Q, K, V, attn_mask):
        # Q, K, V == [6, 30, 768]   attn_mask == [6, 30, 30]
        residual = Q  # residual connection
        batch_size = Q.size(0)
        q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1, 2)  # [6, 30, 12, 64] ---> [6, 12, 30, 64]
        k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1, 2)  # [6, 30, 12, 64] ---> [6, 12, 30, 64]
        v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1, 2)  # [6, 30, 12, 64] ---> [6, 12, 30, 64]
        attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1)  # [6, 1, 30, 30] ---> [6, 12, 30, 30]
        # context == [6, 12, 30, 64]   attn == [6, 12, 30, 30]
        context, attn = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask)  # 7.
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v)  # [6, 12, 30, 64] ---> [6, 30, 12, 64] ---> [6, 30, 768]
        output = self.linear(context)  # [6, 30, 768] ---> [6, 30, 768]
        return self.layer_norm(output + residual), attn

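The view/transpose pair above splits the 768-dimensional projection into 12 heads of 64, and the reverse pair merges them back. The split and the merge are exact inverses, which a small standalone sketch with toy sizes confirms:

# --- standalone illustration, not part of the tutorial script ---
import torch

batch, seq_len, heads, d_head = 2, 5, 4, 8
x = torch.randn(batch, seq_len, heads * d_head)                 # [2, 5, 32], like a W_Q output
split = x.view(batch, seq_len, heads, d_head).transpose(1, 2)   # [2, 4, 5, 8]: one slice per head
merged = split.transpose(1, 2).contiguous().view(batch, seq_len, heads * d_head)
print(torch.equal(x, merged))   # True: splitting into heads and merging back loses nothing
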
# 5. EncoderLayer: two sub-layers, multi-head self-attention and a position-wise feed-forward network
class EncoderLayer(nn.Module):
    def __init__(self):
        super(EncoderLayer, self).__init__()
        self.enc_self_attn = MultiHeadAttention()  # 6.
        self.pos_ffn = PoswiseFeedForwardNet()  # 8.

    def forward(self, enc_inputs, enc_self_attn_mask):
        # enc_outputs == [6, 30, 768]   attn == [6, 12, 30, 30]
        enc_outputs, attn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask)
        enc_outputs = self.pos_ffn(enc_outputs)  # [6, 30, 768]
        return enc_outputs, attn

# 4. get_attn_pad_mask
def get_attn_pad_mask(seq_q, seq_k):
    batch_size, len_q = seq_q.size()  # [6, 30]
    batch_size, len_k = seq_k.size()  # [6, 30]
    pad_attn_mask = seq_k.data.eq(0).unsqueeze(1)  # [6, 30] ---> [6, 1, 30]
    return pad_attn_mask.expand(batch_size, len_q, len_k)  # [6, 30, 30]

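The mask marks every key position whose token id is 0 (PAD), repeated once per query position. A standalone sketch with a made-up padded sequence:

# --- standalone illustration, not part of the tutorial script ---
import torch

ids = torch.tensor([[1, 7, 9, 2, 0, 0]])   # one sequence; the last two positions are PAD (id 0)
pad_mask = ids.eq(0).unsqueeze(1).expand(1, ids.size(1), ids.size(1))   # same recipe as get_attn_pad_mask
print(pad_mask.shape)   # torch.Size([1, 6, 6])
print(pad_mask[0, 0])   # tensor([False, False, False, False,  True,  True])
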
# 3. Embedding layer: builds the three embedding tables
class Embedding(nn.Module):
    def __init__(self):
        super(Embedding, self).__init__()
        self.tok_embed = nn.Embedding(vocab_size, d_model)  # Token Embeddings [29, 768]
        self.seg_embed = nn.Embedding(n_segments, d_model)  # Segment Embeddings [2, 768]
        self.pos_embed = nn.Embedding(max_len, d_model)  # Position Embeddings [30, 768]
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, seg):  # x == [6, 30]   seg == [6, 30]
        seq_len = x.size(1)  # 30
        pos = torch.arange(seq_len, dtype=torch.long)  # (30,) tensor([0, 1, ..., 29])
        pos = pos.unsqueeze(0).expand_as(x)  # [1, 30] ---> [6, 30]
        embedding = self.tok_embed(x) + self.seg_embed(seg) + self.pos_embed(pos)  # [6, 30, 768] + [6, 30, 768] + [6, 30, 768] == [6, 30, 768]
        return self.norm(embedding)

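The BERT input representation is simply the element-wise sum of the token, segment and position lookups. A standalone sketch with a tiny embedding size (all sizes and ids here are toy values, not the tutorial's):

# --- standalone illustration, not part of the tutorial script ---
import torch
import torch.nn as nn

toy_vocab, toy_segments, toy_maxlen, toy_dim = 29, 2, 30, 8
tok = nn.Embedding(toy_vocab, toy_dim)
seg = nn.Embedding(toy_segments, toy_dim)
pos = nn.Embedding(toy_maxlen, toy_dim)

x = torch.tensor([[1, 5, 23, 2, 0, 0]])                  # [1, 6] token ids (0 = PAD)
s = torch.zeros_like(x)                                  # segment ids, all first-sentence here
p = torch.arange(x.size(1)).unsqueeze(0).expand_as(x)    # position ids 0..5
print((tok(x) + seg(s) + pos(p)).shape)                  # torch.Size([1, 6, 8])
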
# 2. The overall BERT model
class BERT(nn.Module):
    def __init__(self):
        super(BERT, self).__init__()
        self.embedding = Embedding()  # 3.
        self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])  # 5.
        self.fc = nn.Linear(d_model, d_model)  # NSP task: linear layer on top of the [CLS] vector
        self.activ1 = nn.Tanh()  # NSP task: Tanh activation after that linear layer
        self.classifier = nn.Linear(d_model, 2)  # NSP task: final linear layer for the binary IsNext classification
        self.linear = nn.Linear(d_model, d_model)  # MLM task: linear layer on top of the masked positions
        self.activ2 = gelu  # MLM task: gelu activation after that linear layer
        self.norm = nn.LayerNorm(d_model)  # MLM task: LayerNorm after the linear layer and activation
        # decoder is shared with embedding layer
        embed_weight = self.embedding.tok_embed.weight  # Token Embeddings [29, 768]
        n_vocab, n_dim = embed_weight.size()  # n_vocab == 29, n_dim == 768
        self.decoder = nn.Linear(n_dim, n_vocab, bias=False)  # project 768 back to the vocabulary size (29)
        self.decoder.weight = embed_weight  # tie the weights
        self.decoder_bias = nn.Parameter(torch.zeros(n_vocab))  # bias b: a learnable parameter (29 zeros)

    def forward(self, input_ids, segment_ids, masked_pos):
        output = self.embedding(input_ids, segment_ids)  # [6, 30, 768]
        # get_attn_pad_mask records where the PAD tokens are, so that self-attention can ignore them  # 4.
        enc_self_attn_mask = get_attn_pad_mask(input_ids, input_ids)  # [6, 30, 30]
        for layer in self.layers:
            output, enc_self_attn = layer(output, enc_self_attn_mask)  # output == [6, 30, 768]   enc_self_attn == [6, 12, 30, 30]
        # NSP head: use the [CLS] position
        h_pooled = self.activ1(self.fc(output[:, 0]))  # [6, 30, 768] ---> [6, 768]
        logits_clsf = self.classifier(h_pooled)  # [6, 2]
        # MLM head: gather the masked positions
        masked_pos = masked_pos[:, :, None].expand(-1, -1, output.size(-1))  # [6, 5] ---> [6, 5, 1] ---> [6, 5, 768]
        h_masked = torch.gather(output, 1, masked_pos)  # pick out the masked positions: [6, 5, 768]
        h_masked = self.norm(self.activ2(self.linear(h_masked)))  # [6, 5, 768]
        logits_lm = self.decoder(h_masked) + self.decoder_bias  # [6, 5, 29] + [29] == [6, 5, 29], broadcast over the last dimension
        return logits_lm, logits_clsf

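The torch.gather call is what picks the hidden vectors of the masked positions out of the full [batch, seq_len, d_model] output. A standalone sketch with toy shapes showing that rows 2 and 5 are extracted:

# --- standalone illustration, not part of the tutorial script ---
import torch

output = torch.arange(24, dtype=torch.float).view(1, 6, 4)       # [1, 6, 4]: 6 positions, d_model = 4
masked_pos = torch.tensor([[2, 5]])                               # the two positions that were masked

idx = masked_pos[:, :, None].expand(-1, -1, output.size(-1))      # [1, 2] ---> [1, 2, 1] ---> [1, 2, 4]
h_masked = torch.gather(output, 1, idx)                           # [1, 2, 4]
print(torch.equal(h_masked[0, 0], output[0, 2]))                  # True: the hidden vector at position 2
print(torch.equal(h_masked[0, 1], output[0, 5]))                  # True: the hidden vector at position 5
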
# 1. Build the data for the two pre-training tasks (MLM and NSP)
def make_batch():
    batch = []  # the result to return
    # track the numbers of positive and negative NSP samples; within a batch the ratio should be close to 1:1
    positive = negative = 0
    while positive != batch_size / 2 or negative != batch_size / 2:
        # e.g. tokens_a_index = 3, tokens_b_index = 1: pick two sentences from the corpus
        tokens_a_index = randrange(len(sentences))
        tokens_b_index = randrange(len(sentences))
        # look the sentences up by index, e.g. tokens_a = [5, 23, 26, 20, 9, 13, 18], tokens_b = [27, 11, 23, 8, 17, 28, 12, 22, 16, 25]
        tokens_a = token_list[tokens_a_index]
        tokens_b = token_list[tokens_b_index]
        # add the special symbols, CLS is 1 and SEP is 2: [1, 5, 23, 26, 20, 9, 13, 18, 2, 27, 11, 23, 8, 17, 28, 12, 22, 16, 25, 2]
        input_ids = [word_dict['[CLS]']] + tokens_a + [word_dict['[SEP]']] + tokens_b + [word_dict['[SEP]']]
        # segment ids separating the two sentences: [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
        segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)

        # MASK LM
        # n_pred = 3: 15% of the tokens in the sequence can be masked; taking the min with max_pred acts as a cap,
        # so that not too many tokens are masked per loss computation and enough information remains
        n_pred = min(max_pred, max(1, int(round(len(input_ids) * 0.15))))
        # cand_maked_pos = [1, 2, 3, ..., 18]: only positions that are not CLS or SEP may be masked, otherwise masking is meaningless
        cand_maked_pos = [i for i, token in enumerate(input_ids) if token != word_dict['[CLS]'] and token != word_dict['[SEP]']]
        # shuffle the candidates, e.g. cand_maked_pos = [6, 5, 17, 3, 1, 13, 16, 10, 12, 2, 9, 7, 11, 18, 4, 14, 15];
        # shuffling is just one of many ways to pick random positions to mask
        shuffle(cand_maked_pos)
        masked_tokens = []  # the original (ground-truth) ids of the masked tokens
        masked_pos = []  # the positions that were masked
        # take the first n_pred shuffled candidates, e.g. masked_pos = [6, 5, 17] (position indices)
        # and masked_tokens = [13, 9, 16] (the original token ids at those positions)
        for pos in cand_maked_pos[:n_pred]:
            masked_pos.append(pos)
            masked_tokens.append(input_ids[pos])
            p = random()  # draw once so the 80/10/10 split below holds as described
            if p < 0.8:  # 80%: replace with [MASK]
                input_ids[pos] = word_dict['[MASK]']
            elif p > 0.9:  # 10%: replace with a random ordinary word
                index = randint(0, vocab_size - 1)
                while index < 4:  # ids 0-3 are [PAD] [CLS] [SEP] [MASK]; replacing with them is meaningless, so resample
                    index = randint(0, vocab_size - 1)
                input_ids[pos] = word_dict[number_dict[index]]
            # in the remaining 10% of cases (0.8 <= p <= 0.9) the token is left unchanged, so no branch is needed

        # Zero Paddings 1
        n_pad = max_len - len(input_ids)  # max_len = 30, e.g. n_pad = 10
        input_ids.extend([0] * n_pad)  # pad input_ids with zeros
        # segment_ids is padded with zeros as well; the 0 clashes with segment 0, but segment ids only distinguish
        # the two sentences (another kind of positional information), so this does not matter
        segment_ids.extend([0] * n_pad)

        # Zero Paddings 2 (mask 15%) tokens
        # every sample in a batch must have the same number of masked tokens, so that the MLM loss of the whole batch
        # can be computed on one well-formed matrix; otherwise one sentence predicting 5 tokens, another 7 and another 8
        # could not be stacked together
        # padding masked_tokens with 0 is the best choice because 0 is [PAD] and carries no meaning
        # (1, 2 or 3, i.e. [CLS] [SEP] [MASK], would also work, just less cleanly)
        if max_pred > n_pred:  # n_pred can never exceed max_pred because of the min() cap in the MASK LM step
            n_pad = max_pred - n_pred
            masked_tokens.extend([0] * n_pad)  # e.g. masked_tokens = [13, 9, 16, 0, 0]
            masked_pos.extend([0] * n_pad)  # e.g. masked_pos = [6, 5, 17, 0, 0]

        # decide whether the two sentences are adjacent
        if tokens_a_index + 1 == tokens_b_index and positive < batch_size / 2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, True])  # IsNext
            positive += 1
        elif tokens_a_index + 1 != tokens_b_index and negative < batch_size / 2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, False])  # NotNext
            negative += 1
    return batch

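Each element that make_batch appends is a plain Python list [input_ids, segment_ids, masked_tokens, masked_pos, isNext]; the main block below turns the whole list into five tensors with zip(*batch). A standalone sketch with two made-up samples shows the regrouping:

# --- standalone illustration, not part of the tutorial script ---
import torch

toy_batch = [
    [[1, 5, 2, 0], [0, 0, 1, 0], [5, 0], [1, 0], True],    # sample 1
    [[1, 7, 2, 0], [0, 0, 1, 0], [7, 0], [1, 0], False],   # sample 2
]
input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*toy_batch))
print(input_ids.shape)   # torch.Size([2, 4]) -- one row per sample
print(isNext)            # tensor([1, 0])     -- True/False become 1/0
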
if __name__ == '__main__':
    # BERT Parameters
    max_len = 30  # maximum sequence length, chosen to cover ~95% of sentences (every sequence in a batch is made up of 30 tokens; shorter ones are padded with PAD)
    batch_size = 6
    max_pred = 5  # upper bound on how many masked tokens are predicted per sequence (e.g. 15% of a 100-token sentence would be 15 masks, but at most max_pred are used)
    n_layers = 12  # number of Encoder Layers
    n_heads = 12  # number of heads in Multi-Head Attention
    d_model = 768  # Embedding Size (Token Embeddings, Segment Embeddings, Position Embeddings)
    d_ff = 3072  # 4 * d_model, FeedForward dimension
    d_k = d_v = 64  # dimension of K (= Q) and V
    n_segments = 2  # how many sentences make up one Encoder input
    text = (
        'Hello, how are you? I am Romeo.\n'
        'Hello, Romeo My name is Juliet. Nice to meet you.\n'
        'Nice meet you too. How are you today?\n'
        'Great. My baseball team won the competition.\n'
        'Oh Congratulations, Juliet\n'
        'Thanks you Romeo'
    )
    sentences = re.sub(r'[.,!?\-]', '', text.lower()).split('\n')  # filter '.', ',', '?', '!', '-' ---> a list of 6 lower-cased sentences
    # ' '.join(sentences): joins the sentences into one string, with ' ' between them
    # split(): splits on whitespace by default, returning a list of words
    # set(): de-duplicates (unordered)
    # list(): back to a list
    word_list = list(set(' '.join(sentences).split()))
    # word_to_id
    word_dict = {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3}
    for i, w in enumerate(word_list):
        word_dict[w] = i + 4
    # id_to_word
    number_dict = {i: w for i, w in enumerate(word_dict)}
    # 29
    vocab_size = len(word_dict)
    # convert the raw text into ids (indices) so it can be processed: [[15, 7, 27, 22, 8, 4, 16], ..., []]
    token_list = list()
    for sentence in sentences:
        arr = [word_dict[s] for s in sentence.split()]
        token_list.append(arr)
    # data
    # batch: 6 samples, each of the form [input_ids, segment_ids, masked_tokens, masked_pos, F/T]
    batch = make_batch()  # 1.
    # zip(*batch): regroups the per-sample lists into per-field tuples: [([], ..., []), ..., (F/T, ..., F/T)]
    input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*batch))  # convert to tensors
    # model
    model = BERT()  # 2.
    # loss function
    criterion = nn.CrossEntropyLoss(ignore_index=0)  # labels equal to 0 do not contribute to the loss
    # optimizer
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    # instantiate the dataset
    dataset = MyDataset(input_ids, segment_ids, masked_tokens, masked_pos, isNext)  # 10.
    # instantiate the dataloader
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)
    model.train()
    start = time.time()
    for epoch in range(10):
        # the DataLoader stacks the items returned by MyDataset.__getitem__, adding back the batch dimension
        for input_ids, segment_ids, masked_tokens, masked_pos, isNext in dataloader:
            logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos)
            loss_lm = criterion(logits_lm.transpose(1, 2), masked_tokens)  # MLM loss
            loss_lm = (loss_lm.float()).mean()
            loss_clsf = criterion(logits_clsf, isNext)  # NSP loss ([CLS] classification)
            loss = loss_lm + loss_clsf
            if (epoch + 1) % 2 == 0:
                print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    # save the model
    # torch.save(model.state_dict(), './wb_model/wb_bert(cpu).pt')
    end = time.time()
    print(f'Total run time on CPU ---> {end - start}s')
    print('-------------------- Predict mask tokens and isNext --------------------')
    # build the evaluation data
    batch = make_batch()
    # batch[0]: the first sample, [[], [], [], [], F/T]
    # zip(batch[0]): [([],), ([],), ([],), ([],), (F/T,)]
    # shapes: [1, 30] [1, 30] [1, 5] [1, 5] [1]
    input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(batch[0]))  # convert to tensors
    print(text)
    print([number_dict[w.item()] for w in input_ids[0] if number_dict[w.item()] != '[PAD]'])
    # load the trained model parameters
    model = BERT()
    # model.load_state_dict(torch.load('./wb_model/wb_bert(cpu).pt'))
    # prediction
    with torch.no_grad():
        logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos)  # logits_lm == [1, 5, 29]   logits_clsf == [1, 2]
    logits_lm = logits_lm.data.max(2)[1][0].data.numpy()
    print('masked tokens list: ', [pos.item() for pos in masked_tokens[0] if pos.item() != 0])
    print('predict masked tokens list: ', [pos for pos in logits_lm if pos != 0])
    logits_clsf = logits_clsf.data.max(1)[1].data.numpy()[0]
    print('isNext: ', True if isNext else False)
    print('predict isNext: ', True if logits_clsf else False)
