import pandas as pd
import matplotlib.pyplot as plt

loss = pd.read_pickle("loss_8.pkl")
plt.plot(loss)
loss = pd.read_pickle("loss_16.pkl")
plt.plot(loss)
loss = pd.read_pickle("loss_4_8.pkl")
plt.plot(loss)
loss = pd.read_pickle("loss_8_8.pkl")
plt.plot(loss)
loss = pd.read_pickle("loss_16_8.pkl")
plt.plot(loss)
loss = pd.read_pickle("loss_8_4.pkl")
plt.plot(loss)
loss = pd.read_pickle("loss_16_4.pkl")
plt.plot(loss)
loss = pd.read_pickle("loss_8_32.pkl")
plt.plot(loss)
plt.legend(["8_256", "16_256", "4_8_256", "8_8_512", "16_8_512", "8_4_512", "16_4_512", "256_8_32"])
plt.show()
# After comparing the curves: a larger parameter dimension is better, too many layers hurt, and with 8 layers the number of heads makes little difference.
In the design of deep learning models, the number of parameters and the number of layers are two important factors. The parameter dimension largely determines the model's learning capacity: more parameters let the model capture more complex patterns. However, more parameters are not always better, because an oversized model can overfit, performing well on the training data but poorly on new, unseen data.
The number of layers determines the model's depth. In theory a deeper model can capture more complex features, but it also carries a higher risk of overfitting. In addition, too many layers can cause vanishing or exploding gradients, which makes the model hard to train.
The remark about 8 layers and the number of heads refers to the structure of a Transformer model. In a Transformer, each layer's representation is split into several "heads", each attending to different information; this is the multi-head attention mechanism. An 8-layer Transformer is a reasonably deep model, and the number of heads determines how many different parts of the input the model can attend to at once. In general the number of heads does not affect performance much, but too many heads waste compute.
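As a concrete illustration of the head split, here is a minimal shape sketch of the standard reshape/transpose used in multi-head attention (the sizes are arbitrary, chosen to match the 256-dim, 8-head configuration discussed above):

import paddle

hidden_size, num_heads = 256, 8                       # head_size = 256 // 8 = 32
x = paddle.randn([4, 24, hidden_size])                # [batch, seq_len, hidden]
heads = x.reshape([4, 24, num_heads, hidden_size // num_heads]).transpose([0, 2, 1, 3])
print(heads.shape)                                    # [4, 8, 24, 32]: each head works on a 32-dim slice of every position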
In summary, a larger parameter dimension gives the model more learning capacity, but an excessively large one risks overfitting. More layers improve the ability to capture complex features, but also increase overfitting risk and training difficulty. For the Transformer models here, 8 layers are deep enough to capture fairly complex features, while the number of heads has a comparatively small effect. When designing a model, the parameter dimension, the number of layers, and the number of heads should be chosen to suit the task and the data set in order to reach the best performance.
import math

import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F


# Causal attention variant: |q| scores normalized by a running cumulative sum,
# carried across calls as a recurrent `state` (no growing KV cache needed).
class MaskMultiHeadAttention(nn.Layer):
    def __init__(self, hidden_size, num_heads):
        super(MaskMultiHeadAttention, self).__init__()
        assert hidden_size % num_heads == 0, "Hidden size must be divisible by the number of heads."
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.head_size = hidden_size // num_heads
        # Query, Key, Value matrices
        self.query_linear = nn.Linear(hidden_size, hidden_size, bias_attr=False)
        self.key_linear = nn.Linear(hidden_size, hidden_size, bias_attr=False)
        self.value_linear = nn.Linear(hidden_size, hidden_size, bias_attr=False)
        # Output matrix
        self.output_linear = nn.Linear(hidden_size, hidden_size, bias_attr=False)

    def forward(self, x, state=None, seq_len=None):
        batch_size = x.shape[0]
        # Compute Query, Key, Value for all heads in parallel: [batch, heads, seq, head_size]
        query = self.query_linear(x).reshape([batch_size, -1, self.num_heads, self.head_size]).transpose([0, 2, 1, 3])
        key = self.key_linear(x).reshape([batch_size, -1, self.num_heads, self.head_size]).transpose([0, 2, 1, 3])
        value = self.value_linear(x).reshape([batch_size, -1, self.num_heads, self.head_size]).transpose([0, 2, 1, 3])
        # Scores are the absolute values of the queries, divided by sqrt(head_size)
        scores = (F.relu(query) + F.relu(-query)) / (self.head_size ** 0.5 + 1e-12)
        # Keys get the same treatment and are normalized to sum to 1 along the feature axis
        key = (F.relu(key) + F.relu(-key)) / (self.head_size ** 0.5 + 1e-12) + 1e-11
        key = key / paddle.sum(key, axis=-1, keepdim=True)
        if state is None:
            state = 0
        # The running cumulative sum over the sequence acts as the causal normalizer / recurrent state
        state = paddle.cumsum(scores, -2) + 1e-16 + state
        scores = scores / state
        state = state[:, :, -1:]
        out = scores * value * key
        # Concatenate the heads and transform to get the final output
        out = out.transpose([0, 2, 1, 3]).reshape([batch_size, -1, self.hidden_size])
        out = self.output_linear(out)
        return out, state


class FeedForward(nn.Layer):
    def __init__(self, hidden_size):
        super(FeedForward, self).__init__()
        self.ffn1 = nn.Linear(hidden_size, hidden_size * 2)
        self.ffn2 = nn.Linear(hidden_size * 2, hidden_size)
        self.gate = nn.Linear(hidden_size, hidden_size * 2)
        self.relu = nn.Silu()

    def forward(self, x):
        # Gated (SwiGLU-style) feed-forward block
        x1 = self.ffn1(x)
        x2 = self.relu(self.gate(x))
        x = x1 * x2
        x = self.ffn2(x)
        return x


class RMSNorm(nn.Layer):
    def __init__(self, dim, eps: float = 1e-6):
        super(RMSNorm, self).__init__()
        self.eps = eps
        self.fc = paddle.create_parameter(shape=[dim], dtype='float32',
                                          default_initializer=nn.initializer.Constant(value=1.0))

    def norm(self, x):
        return x * paddle.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x):
        output = self.norm(x)
        return output * self.fc


class GPTDecoderLayer(nn.Layer):
    def __init__(self, hidden_size, num_heads):
        super(GPTDecoderLayer, self).__init__()
        self.self_attention = MaskMultiHeadAttention(hidden_size, num_heads)
        self.ffn = FeedForward(hidden_size)
        # self.norm1 = nn.LayerNorm(hidden_size)
        self.norm1 = RMSNorm(hidden_size)

    def forward(self, x, state=None, seq_len=None):
        # Self-attention with residual connection
        x1, state = self.self_attention(x, state, seq_len=None)
        x = x1 + x
        x = self.norm1(x)
        # Feed-forward with residual connection (the same RMSNorm instance is reused)
        x = self.ffn(x) + x
        x = self.norm1(x)
        return x, state


class PositionalEncoding(nn.Layer):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        # Create a long enough position index for the maximum sequence length
        position = paddle.arange(max_len).unsqueeze(1).astype("float32")
        # Create a constant 'pe' matrix with the same size as the embedding matrix
        div_term = paddle.exp(paddle.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = paddle.zeros([max_len, d_model])
        pe[:, 0::2] = paddle.sin(position * div_term)
        pe[:, 1::2] = paddle.cos(position * div_term)
        # 'pe' is kept as a plain (non-trainable) tensor attribute
        self.pe = pe.unsqueeze(0)  # Shape: [1, max_len, d_model]

    def forward(self, x, seq_len=None):
        # x is of shape [batch_size, seq_len, d_model]
        if seq_len is None:
            seq_len = x.shape[1]
            return x + self.pe[:, :seq_len, :]
        else:
            return x + self.pe[:, seq_len - 1:seq_len, :]


# %%
def sinusoidal_position_embedding(max_len, output_dim):
    # (max_len, 1)
    position = paddle.arange(0, max_len, dtype="float32").unsqueeze(-1)
    # (output_dim//2): the i in the formula, with i ranging over [0, d/2)
    ids = paddle.arange(0, output_dim // 2, dtype="float32")
    theta = 10000 ** (-2 * ids / output_dim)
    # (max_len, output_dim//2): pos / (10000^(2i/d)) in the formula
    embeddings = position * theta
    sin_embeddings = paddle.sin(embeddings)
    cos_embeddings = paddle.cos(embeddings)
    return sin_embeddings, cos_embeddings


def rope(q, sin_em, cos_em, seq_len=None):
    if seq_len is None:
        sin_em = sin_em[:q.shape[2]]
        cos_em = cos_em[:q.shape[2]]
    else:
        sin_em = sin_em[seq_len - 1:seq_len]
        cos_em = cos_em[seq_len - 1:seq_len]
    q1 = q.reshape([q.shape[0], q.shape[1], q.shape[2], -1, 2])[..., 1]
    q2 = q.reshape([q.shape[0], q.shape[1], q.shape[2], -1, 2])[..., 0]
    # Rotate each (even, odd) pair: (-odd*sin + even*cos, odd*cos + even*sin)
    q3 = paddle.stack([-q1 * sin_em + q2 * cos_em, q1 * cos_em + q2 * sin_em], -1)
    q = q3.reshape(q.shape)  # after the reshape the pairs are interleaved again
    return q


class GPT(nn.Layer):
    def __init__(self, vocab_size, hidden_size, num_heads, num_layers):
        super(GPT, self).__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.decoder_layers = nn.LayerList([GPTDecoderLayer(hidden_size, num_heads) for _ in range(num_layers)])
        self.fc = nn.Linear(hidden_size, vocab_size, bias_attr=False)
        self.sin_em, self.cos_em = sinusoidal_position_embedding(50000, hidden_size // num_heads // 2)

    def forward(self, x, state=None, seq_len=None):
        x = self.embedding(x)
        # x = self.position_embedding(x, seq_len)
        if state is None:
            state = [None] * len(self.decoder_layers)
        i = 0
        for decoder_layer in self.decoder_layers:
            # Apply RoPE as a residual, then run the decoder layer
            x = rope(x.reshape([x.shape[0], x.shape[1], -1, self.sin_em.shape[1] * 2]).transpose([0, 2, 1, 3]),
                     self.sin_em, self.cos_em, seq_len).transpose([0, 2, 1, 3]).reshape(x.shape) + x
            x, state[i] = decoder_layer(x, state[i])
            i += 1
        out = self.fc(x)
        return out, state


def check_rope():
    q = paddle.randn((8, 12, 10, 32))
    sin_em, cos_em = sinusoidal_position_embedding(50000, 32)
    rope(q, sin_em, cos_em)


def check_mask_multi_head_attention():
    x = paddle.randn([4, 10, 20])
    mha = MaskMultiHeadAttention(20, 5)
    out = mha(x)


def check_positional_encoding():
    x = paddle.randn([4, 10, 20])
    pe = PositionalEncoding(20)
    out = pe(x)


def check_gpt_decoder_layer():
    x = paddle.randn([4, 10, 20])
    dl = GPTDecoderLayer(20, 5)
    out = dl(x)


def check_gpt():
    x = paddle.randint(4, 10, [4, 10])
    gpt = GPT(10, 20, 5, 2)
    out = gpt(x)


def check_lisa_adam_w():
    # Initialize the model and hyperparameters
    gpt = GPT(10, 20, 5, 2)
    nl = len(gpt.parameters())
    T = 100
    K = 10
    gamma = 2
    eta0 = 0.001
    pnl = gamma / np.ones(len(gpt.parameters())) * gamma / nl
    pnl[0] = 1
    pnl[-1] = 1
    criterion = nn.CrossEntropyLoss()
    optimizer = paddle.optimizer.AdamW(parameters=gpt.parameters(), learning_rate=eta0)
    # Train the model
    for i in range(T // K):
        # Freeze all layers
        for l in gpt.parameters():
            l.trainable = False
        # Sample which layers to activate
        for l, p_l in enumerate(pnl):
            print(gpt.parameters()[l].shape)
            if paddle.rand([1]) < p_l and len(gpt.parameters()[l].shape) > 1:
                # LoRA weight norm
                w_lora_l = (paddle.norm(paddle.linalg.svd(gpt.parameters()[l])[0], p=2)
                            + paddle.norm(paddle.linalg.svd(gpt.parameters()[l])[2], p=2))
                # LISA weight norm
                w_lisa_l = paddle.norm(gpt.parameters()[l], p=2)
                p_l_1 = w_lora_l / w_lisa_l
                optimizer.set_lr(optimizer.get_lr() * p_l_1.item())
                # Activate this layer
                gpt.parameters()[l].trainable = True
        # Train the activated layers
        for _ in range(K):
            x = paddle.randint(4, 10, [4, 10])
            out, _ = gpt(x)
            loss = criterion(out, paddle.randint(0, 10, [4, 10]))
            optimizer.clear_grad()
            loss.backward()
            optimizer.step()
    # Print the model parameters
    # print(gpt.state_dict())


def check_all():
    # check_lisa_adam_w()
    check_rope()
    check_mask_multi_head_attention()
    check_positional_encoding()
    check_gpt_decoder_layer()
    check_gpt()


if __name__ == '__main__':
    # Run all checks
    check_all()
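Because the attention keeps only a running cumulative sum per head as its `state`, the model can decode incrementally: prefill a prompt once, then feed one new token at a time while reusing the per-layer state. The following is a minimal sketch in the style of the check_* helpers above (the sizes are arbitrary, not from the original post):

def check_incremental_decoding():
    # Prefill a short prompt, then decode one extra token by reusing the per-layer state.
    gpt = GPT(10, 20, 5, 2)                       # vocab=10, hidden=20, heads=5, layers=2
    prompt = paddle.randint(0, 10, [4, 10])       # [batch, seq]
    out, state = gpt(prompt)                      # out: [4, 10, 10]; state: one tensor per layer
    next_token = paddle.argmax(out[:, -1], axis=-1, keepdim=True)     # greedy pick, [4, 1]
    out, state = gpt(next_token, state, seq_len=prompt.shape[1] + 1)  # single-step decode at position 10
    print(out.shape)                              # [4, 1, 10]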
import json

import numpy as np
import paddle
import paddle.nn as nn
import pandas as pd
from tqdm import tqdm

from new_attention import GPT

# Load the Tang poetry corpus and keep only clean 24-character poems.
with open("唐诗.json", "r", encoding="utf-8") as f:
    data = f.read()
data = json.loads(data)
data = [i[4].split() for i in data if len(i[4].split()) > 3]
data = np.hstack(data)
data = [i for i in data if len("".join(i.split())) == 24 and "a" not in i]
data = [i for i in data if len("".join(i.split())) == 24 and "f" not in i]
data = [i for i in data if len("".join(i.split())) == 24 and "e" not in i]
data = [i for i in data if len("".join(i.split())) == 24 and "h" not in i]
data = [i for i in data if len("".join(i.split())) == 24 and "X" not in i]
data = [i for i in data if len("".join(i.split())) == 24 and "“" not in i]
data = [i for i in data if len("".join(i.split())) == 24 and '□' not in i]
data = [i for i in data if len("".join(i.split())) == 24 and '《' not in i]
data = [i for i in data if len("".join(i.split())) == 24 and '》' not in i]

# Build the character vocabulary and convert each poem to an index sequence.
voc = sorted(set(np.hstack([list(set(list(i))) for i in data])))
data_set = [[voc.index(j) for j in i] for i in data]

batch_size = 600
epochs = 10
model = GPT(len(voc), 256, 8, 32)
# model.load_dict(paddle.load("gpt.pdparams"))
loss_func = nn.CrossEntropyLoss()
opt = paddle.optimizer.Adam(parameters=model.parameters(), learning_rate=0.0003)

bar = tqdm(range(epochs))
epochs_loss = []
for epoch in bar:
    np.random.shuffle(data_set)
    loss_list = []
    for i in range(0, len(data_set), batch_size):
        j = batch_size + i
        batch_data = paddle.to_tensor(data_set[i:j]).astype('int64')
        out, _ = model(batch_data[:, :-1])   # predict the next character
        loss = loss_func(out, batch_data[:, 1:])
        loss_list.append(loss.item())
        bar.set_description("epoch:{}_____loss:{}".format(epoch, np.mean(loss_list)))
        opt.clear_grad()
        loss.backward()
        opt.step()
    epochs_loss.append(np.mean(loss_list))

paddle.save(model.state_dict(), "gpt.pdparams")
pd.to_pickle(epochs_loss, "loss_8_32.pkl")
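Once the weights have been saved, the trained model can be asked to continue a poem greedily. A minimal sketch, assuming it runs in the same session as the script above (so `model` and `voc` are still in scope) and that every character of the hypothetical prompt actually appears in `voc`:

prompt = "春眠不觉晓"                                   # hypothetical opening line; all characters must exist in voc
ids = paddle.to_tensor([[voc.index(c) for c in prompt]]).astype("int64")
out, state = model(ids)                                 # prefill the prompt
next_id = paddle.argmax(out[:, -1], axis=-1, keepdim=True)
text = prompt + voc[next_id.item()]
for step in range(20):                                  # greedily extend by 20 more characters
    out, state = model(next_id, state, seq_len=len(prompt) + 1 + step)
    next_id = paddle.argmax(out[:, -1], axis=-1, keepdim=True)
    text += voc[next_id.item()]
print(text)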