Transformer Model Code with Detailed Comments (Beginner-Friendly)

import math

import torch
import torch.nn as nn
import torch.nn.functional as F

# Hyperparameters
batch_size = 4  # How many batches per training step
context_length = 16  # Length of the token chunk each batch
d_model = 64  # The size of our model token embeddings
num_blocks = 8  # Number of transformer blocks
num_heads = 4  # Number of heads in Multi-head attention
learning_rate = 1e-3  # 0.001
dropout = 0.1  # Dropout rate
max_iters = 5000  # Total of training iterations <- Change this to smaller number for testing
eval_interval = 50  # How often to evaluate
eval_iters = 20  # Number of iterations to average for evaluation
max_token_value = 100  # Placeholder: largest token id in the vocabulary; set this from your tokenizer/dataset
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # Use GPU if it's available.
TORCH_SEED = 1337
torch.manual_seed(TORCH_SEED)
class FeedForward(nn.Module):
    def __init__(self):
        super().__init__()
        # Model dimension
        self.d_model = d_model
        # Dropout rate
        self.dropout = dropout
        # First linear layer
        self.ln1 = nn.Linear(in_features=self.d_model, out_features=self.d_model * 4)
        # ReLU activation
        self.relu = nn.ReLU()
        # Second linear layer
        self.ln2 = nn.Linear(in_features=self.d_model * 4, out_features=self.d_model)
        # Dropout layer
        self.dp = nn.Dropout(dropout)

    def forward(self, x):
        # Input shape is (batch_size, seq_len, d_model)
        x = self.ln1(x)
        x = self.relu(x)
        x = self.ln2(x)
        out = self.dp(x)
        return out
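A quick smoke test (my own addition, not from the original; it assumes the imports and hyperparameters above have already been run) shows that the block expands to 4 * d_model internally but hands back a tensor with the same (batch_size, seq_len, d_model) shape it received:

# Hypothetical shape check for the feed-forward block
ff = FeedForward()
dummy_input = torch.randn(batch_size, context_length, d_model)
print(ff(dummy_input).shape)  # torch.Size([4, 16, 64])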
class Attention(nn.Module):
    def __init__(self, head_size):
        """
        Args:
            head_size (int): Size of each attention head.
        Globals used:
            d_model (int): Feature dimension of the input tensor.
            context_length (int): Context length, i.e. the number of time steps.
            dropout (float): Dropout rate.
        """
        super().__init__()
        # Model parameters
        self.d_model = d_model
        self.head_size = head_size
        self.context_length = context_length
        self.dropout = dropout
        # Linear layers used to compute the attention weights
        self.key_layer = nn.Linear(in_features=self.d_model, out_features=self.head_size, bias=False)
        self.query_layer = nn.Linear(in_features=self.d_model, out_features=self.head_size, bias=False)
        self.value_layer = nn.Linear(in_features=self.d_model, out_features=self.head_size, bias=False)
        # Lower-triangular (causal) mask
        self.register_buffer('tril', torch.tril(torch.ones((self.context_length, self.context_length))))
        # Dropout layer to reduce overfitting
        self.dropout_layer = nn.Dropout(self.dropout)

    def forward(self, x):
        # Note: the batch and time dimensions of the output match the input;
        # only the channel dimension changes from d_model to head_size
        # Get the shape of the input tensor
        B, T, C = x.shape  # Batch size, Time steps (current context_length), Channels (dimensions)
        # Make sure the number of time steps does not exceed the context length
        # and the feature dimension matches the model parameter
        assert T <= self.context_length
        assert C == self.d_model
        # Compute query, key and value tensors through the linear layers
        q = self.query_layer(x)
        k = self.key_layer(x)
        v = self.value_layer(x)
        # Scaled dot-product attention: Q @ K^T / sqrt(d_k)
        weights = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        # Apply the causal mask
        weights = weights.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        weights = F.softmax(input=weights, dim=-1)
        weights = self.dropout_layer(weights)
        # Weighted sum: weights @ V
        out = weights @ v
        return out
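To see what the causal mask does, here is a small illustration (my own sketch, not part of the original code): each position may only attend to itself and earlier positions, so the upper triangle of the score matrix is set to -inf before softmax and ends up with zero weight:

# Hypothetical illustration of the causal (lower-triangular) mask
T = 4
tril = torch.tril(torch.ones(T, T))
scores = torch.randn(T, T)                        # stand-in for q @ k^T / sqrt(d_k)
masked = scores.masked_fill(tril == 0, float('-inf'))
print(F.softmax(masked, dim=-1))                  # row i has zero weight on columns > i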
class MultiHeadAttention(nn.Module):
    def __init__(self, head_size):
        """
        Multi-head attention module.
        Args:
            head_size (int): Size of each attention head.
        Globals used:
            num_heads (int): Number of attention heads.
            d_model (int): Feature dimension of the input tensor.
            context_length (int): Context length, i.e. the number of time steps.
            dropout (float): Dropout rate.
        """
        super().__init__()
        # Model parameters
        self.num_heads = num_heads
        self.head_size = head_size
        self.d_model = d_model
        self.context_length = context_length
        self.dropout = dropout
        # Create the individual attention heads
        self.heads = nn.ModuleList([Attention(head_size=self.head_size)
                                    for _ in range(self.num_heads)])
        # Linear projection layer that maps the concatenated head outputs back to the original feature dimension
        self.projection_layer = nn.Linear(in_features=self.num_heads * self.head_size, out_features=self.d_model)
        # Dropout layer to reduce overfitting
        self.dropout_layer = nn.Dropout(self.dropout)

    def forward(self, x):
        """
        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, seq_len, d_model).
        Returns:
            torch.Tensor: Output of the multi-head attention, same shape as the input.
        """
        # Run every attention head and concatenate their outputs
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        # Project the concatenated heads back to the original feature dimension
        out = self.projection_layer(out)
        # Apply dropout to reduce overfitting
        out = self.dropout_layer(out)
        return out
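The dimension bookkeeping is the key detail here: with the hyperparameters above, head_size = d_model // num_heads = 16, so concatenating the four head outputs restores the original 64 channels before the projection. A small sketch of that arithmetic (assumed, not from the article):

# Hypothetical check of the concatenation arithmetic
head_size = d_model // num_heads                  # 64 // 4 = 16
head_outputs = [torch.randn(batch_size, context_length, head_size) for _ in range(num_heads)]
concatenated = torch.cat(head_outputs, dim=-1)
print(concatenated.shape)                         # torch.Size([4, 16, 64]) == (B, T, d_model)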
class TransformerBlock(nn.Module):
    def __init__(self, num_heads=num_heads):  # accepts num_heads so the call in TransformerLanguageModel works
        super().__init__()
        # Model parameters
        self.d_model = d_model
        self.context_length = context_length
        self.head_size = d_model // num_heads  # Size of each attention head
        self.num_heads = num_heads
        self.dropout = dropout
        # Multi-head attention layer
        self.multi_head_attention_layer = MultiHeadAttention(self.head_size)
        # Feed-forward layer
        self.feed_forward_layer = FeedForward()
        # Layer normalization layers
        self.layer_norm_1 = nn.LayerNorm(normalized_shape=self.d_model)
        self.layer_norm_2 = nn.LayerNorm(normalized_shape=self.d_model)

    def forward(self, x):
        """
        Forward pass of the block.
        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, seq_len, d_model).
        Returns:
            torch.Tensor: Output of the Transformer block, same shape as the input.
        """
        # Note: the order of operations differs from the original Transformer paper
        # The order here is: LayerNorm -> Multi-head attention -> LayerNorm -> Feed forward (pre-LN)
        # Normalize the input with layer normalization
        x_normalized_1 = self.layer_norm_1(x)
        # Apply multi-head attention and add the result to the input (residual connection)
        x = x + self.multi_head_attention_layer(x_normalized_1)
        # Normalize the result with layer normalization
        x_normalized_2 = self.layer_norm_2(x)
        # Apply the feed-forward network and add its output to the previous result (residual connection)
        x = x + self.feed_forward_layer(x_normalized_2)
        return x
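Because the pre-LN block is shape-preserving, any number of blocks can be chained with nn.Sequential, which is exactly what the language model below does. A minimal check (again my own addition):

# Hypothetical check that one block preserves the (B, T, d_model) shape
block = TransformerBlock(num_heads=num_heads).eval()   # eval() disables dropout
with torch.no_grad():
    out = block(torch.randn(batch_size, context_length, d_model))
print(out.shape)                                        # torch.Size([4, 16, 64])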
class TransformerLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.d_model = d_model
        self.context_length = context_length
        self.num_heads = num_heads
        self.num_blocks = num_blocks
        self.dropout = dropout
        self.max_token_value = max_token_value
        # Token embedding lookup table
        self.token_embedding_lookup_table = nn.Embedding(num_embeddings=self.max_token_value + 1, embedding_dim=self.d_model)
        # Run all the Transformer blocks
        # Different from the original paper, a final layer norm is added after all the blocks
        self.transformer_blocks = nn.Sequential(*(
                [TransformerBlock(num_heads=self.num_heads) for _ in range(self.num_blocks)] +
                [nn.LayerNorm(self.d_model)]
        ))
        # Output layer: max_token_value + 1 classes so every token id from 0 to max_token_value gets a logit,
        # matching the size of the embedding table
        self.language_model_out_linear_layer = nn.Linear(in_features=self.d_model, out_features=self.max_token_value + 1)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        # Positional embedding lookup table,
        # following the same approach as the original Transformer paper (sine and cosine functions)
        position_encoding_lookup_table = torch.zeros(self.context_length, self.d_model)
        position = torch.arange(0, self.context_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, self.d_model, 2).float() * (-math.log(10000.0) / self.d_model))
        position_encoding_lookup_table[:, 0::2] = torch.sin(position * div_term)
        position_encoding_lookup_table[:, 1::2] = torch.cos(position * div_term)
        # Slice position_encoding_lookup_table from (context_length, d_model) down to (T, d_model)
        position_embedding = position_encoding_lookup_table[:T, :].to(device)
        x = self.token_embedding_lookup_table(idx) + position_embedding
        x = self.transformer_blocks(x)
        # The "logits" are the model's output values before softmax is applied
        logits = self.language_model_out_linear_layer(x)
        if targets is not None:
            B, T, C = logits.shape
            logits_reshaped = logits.view(B * T, C)
            targets_reshaped = targets.view(B * T)
            loss = F.cross_entropy(input=logits_reshaped, target=targets_reshaped)
        else:
            loss = None
        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is a (B, T) array of token indices in the current context
        for _ in range(max_new_tokens):
            # Crop idx to the maximum size of our position embedding table
            idx_crop = idx[:, -self.context_length:]
            # Get the predictions
            logits, loss = self(idx_crop)
            # Take the last time step from logits, where logits has shape (B, T, C)
            logits_last_timestep = logits[:, -1, :]
            # Apply softmax to get probabilities
            probs = F.softmax(input=logits_last_timestep, dim=-1)
            # Sample from the probability distribution
            idx_next = torch.multinomial(input=probs, num_samples=1)
            # Append the sampled index idx_next to idx
            idx = torch.cat((idx, idx_next), dim=1)
        return idx
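The code above does not include tokenization, data loading, or a training loop, so the following end-to-end sketch is entirely my own assumption: a character-level vocabulary over a toy string, a get_batch helper matching batch_size and context_length, a short AdamW training loop, and a call to generate. Swap in a real corpus and tokenizer for actual use.

# Hypothetical end-to-end usage: character-level toy corpus, training, generation
text = "hello transformer " * 1000                      # toy corpus (assumption)
chars = sorted(set(text))
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for ch, i in stoi.items()}
max_token_value = len(chars) - 1                         # largest token id actually used
data = torch.tensor([stoi[c] for c in text], dtype=torch.long)

def get_batch():
    # Sample batch_size random windows; targets are the inputs shifted by one token
    ix = torch.randint(0, len(data) - context_length - 1, (batch_size,))
    x = torch.stack([data[i:i + context_length] for i in ix])
    y = torch.stack([data[i + 1:i + context_length + 1] for i in ix])
    return x.to(device), y.to(device)

model = TransformerLanguageModel().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
for step in range(500):                                  # use max_iters for a full run
    xb, yb = get_batch()
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    if step % eval_interval == 0:
        print(f"step {step}: loss {loss.item():.4f}")

# Generate 100 new tokens starting from a single token (id 0)
start = torch.zeros((1, 1), dtype=torch.long, device=device)
generated = model.generate(start, max_new_tokens=100)
print(''.join(itos[int(i)] for i in generated[0]))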

This article provides Transformer code with detailed comments. Note that the Transformer here is not the traditional encoder-decoder architecture but the mainstream GPT-style (decoder-only) architecture. If you are not familiar with decoder-only models, see my other article, linked below. I will publish an article introducing the GPT model architecture in a few days; discussion is welcome. http://t.csdnimg.cn/IGCUL
