In recent years, with the rapid development of deep learning, the field of natural language processing (NLP) has seen breakthrough progress, the most notable of which is the emergence of large language models (LLMs). From early statistical language models, to neural-network language models, to today's pretrained language models built on the Transformer architecture, large language models have gone through several important stages of development.

The emergence of large language models has brought revolutionary changes to NLP. Their powerful text understanding and generation capabilities show enormous potential across a wide range of application scenarios.

Despite these impressive achievements, the development of large language models still faces a number of challenges.

A large language model is a deep-learning-based language model that typically contains hundreds of millions or even hundreds of billions of parameters and is pretrained on large-scale text corpora. Its goal is to learn the statistical regularities of natural language so that it can understand and generate human language.
An inference strategy specifies how the output is generated from a large language model for a given input. Common inference strategies include greedy decoding, beam search, and sampling-based methods such as top-k and nucleus (top-p) sampling.
Pretraining, fine-tuning, and inference strategies are the three key stages in applying large language models, and they are closely connected: pretraining equips the model with general language knowledge, fine-tuning adapts that knowledge to a specific task, and the inference strategy determines how the fine-tuned model produces output for a given input.
The figure below illustrates the relationship between pretraining, fine-tuning, and inference strategies:

In the chapters that follow, we will take a deep dive into fine-tuning and inference strategies for large language models, and demonstrate their use in practice with concrete code examples and application cases.
$$Attention(Q, K, V) = softmax\left(\frac{QK^T}{\sqrt{d_k}}\right)V$$
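To make the formula concrete, here is a minimal PyTorch sketch of scaled dot-product attention; the function name, tensor shapes, and the optional mask argument are illustrative assumptions rather than any library's API.

```python
import math
import torch

def scaled_dot_product_attention(Q, K, V, mask=None):
    """Compute softmax(Q K^T / sqrt(d_k)) V for batched inputs.

    Q, K, V: tensors of shape (batch, seq_len, d_k).
    mask:    optional boolean tensor, True where attention is NOT allowed.
    """
    d_k = Q.size(-1)
    scores = Q @ K.transpose(-2, -1) / math.sqrt(d_k)   # (batch, seq_len, seq_len)
    if mask is not None:
        scores = scores.masked_fill(mask, float("-inf"))
    weights = torch.softmax(scores, dim=-1)             # attention weights
    return weights @ V                                   # (batch, seq_len, d_k)

# Quick shape check with random tensors.
Q = K = V = torch.randn(2, 5, 64)
print(scaled_dot_product_attention(Q, K, V).shape)      # torch.Size([2, 5, 64])
```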
$$MultiHead(Q, K, V) = Concat(head_1, \ldots, head_h)W^O$$

$$head_i = Attention(QW_i^Q, KW_i^K, VW_i^V)$$
$$FFN(x) = \max(0, xW_1 + b_1)W_2 + b_2$$
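The two sublayers defined by the formulas above can be sketched in PyTorch as follows. This is a minimal illustration, not the author's implementation: the class names and the fused per-head projections are my assumptions, and a production model would more likely rely on nn.MultiheadAttention or an existing Transformer library.

```python
import torch
import torch.nn as nn

class MultiHeadAttention(nn.Module):
    """MultiHead(Q, K, V) = Concat(head_1, ..., head_h) W^O."""

    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        # The per-head projections W_i^Q, W_i^K, W_i^V are fused into one linear layer each.
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_o = nn.Linear(d_model, d_model)  # output projection W^O

    def forward(self, q, k, v, mask=None):
        batch = q.size(0)

        def split_heads(x):
            # (batch, seq_len, d_model) -> (batch, num_heads, seq_len, d_k)
            return x.view(batch, -1, self.num_heads, self.d_k).transpose(1, 2)

        q, k, v = split_heads(self.w_q(q)), split_heads(self.w_k(k)), split_heads(self.w_v(v))
        scores = q @ k.transpose(-2, -1) / self.d_k ** 0.5
        if mask is not None:
            scores = scores.masked_fill(mask, float("-inf"))
        heads = torch.softmax(scores, dim=-1) @ v
        # Concatenate the heads back to (batch, seq_len, d_model) and apply W^O.
        heads = heads.transpose(1, 2).contiguous().view(batch, -1, self.num_heads * self.d_k)
        return self.w_o(heads)

class FeedForward(nn.Module):
    """FFN(x) = max(0, x W_1 + b_1) W_2 + b_2, applied position-wise."""

    def __init__(self, d_model, dim_feedforward):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(d_model, dim_feedforward),
            nn.ReLU(),
            nn.Linear(dim_feedforward, d_model),
        )

    def forward(self, x):
        return self.net(x)
```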
$$P(w_1, \ldots, w_n) = \prod_{i=1}^{n} P(w_i \mid w_1, \ldots, w_{i-1})$$
$$L(D) = -\sum_{i=1}^{n} \log P(w_i \mid w_1, \ldots, w_{i-1}; \theta)$$
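As a small worked example of the two formulas above, consider a three-token sentence $w_1 w_2 w_3$. The chain-rule factorization and the corresponding negative log-likelihood are:

$$P(w_1, w_2, w_3) = P(w_1)\,P(w_2 \mid w_1)\,P(w_3 \mid w_1, w_2)$$

$$L = -\left[\log P(w_1; \theta) + \log P(w_2 \mid w_1; \theta) + \log P(w_3 \mid w_1, w_2; \theta)\right]$$

In practice this is exactly the token-level cross-entropy over next-token predictions, which is how the loss is computed in the training code later in this section.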
$$P(w_i \mid w_1, \ldots, w_{i-1}, w_{i+1}, \ldots, w_n)$$
$$P(IsNext \mid s_1, s_2)$$
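These two expressions correspond to the masked language modeling and next sentence prediction objectives used in BERT-style pretraining. As a brief, hedged illustration of the first one, the Hugging Face fill-mask pipeline exposes the masked-token distribution directly; the model name below is just one common choice, not something prescribed by the original text.

```python
from transformers import pipeline

# Score candidate tokens for the masked position, i.e. P(w_i | surrounding context).
fill_mask = pipeline("fill-mask", model="bert-base-uncased")

for prediction in fill_mask("The capital of France is [MASK]."):
    print(prediction["token_str"], round(prediction["score"], 4))
```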
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load a pretrained causal language model and its tokenizer from the Hugging Face Hub.
model_name = "gpt2"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
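As a quick, optional sanity check (not part of the original walkthrough), you can confirm that the model loaded correctly by counting its trainable parameters:

```python
# Total number of trainable parameters in the loaded model.
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"{model_name} has {num_params:,} trainable parameters")
```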
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

# Paths to plain-text training and validation corpora.
train_path = "path/to/train.txt"
valid_path = "path/to/valid.txt"

# Note: TextDataset is deprecated in recent versions of transformers
# (the datasets library is the recommended replacement), but it still works here.
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=train_path,
    block_size=128)
valid_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=valid_path,
    block_size=128)

# mlm=False: collate batches for causal (next-token) language modeling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
)

trainer.train()
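After fine-tuning, one simple way to gauge the result (a hedged suggestion, not part of the original walkthrough) is to turn the Trainer's validation loss into perplexity:

```python
import math

# trainer.evaluate() returns a metrics dict containing "eval_loss",
# the mean cross-entropy on the validation set; exp(loss) is the perplexity.
metrics = trainer.evaluate()
print(f"Validation perplexity: {math.exp(metrics['eval_loss']):.2f}")
```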
prompt = "The answer to the universe is"
input_ids = tokenizer.encode(prompt, return_tensors="pt")
output = model.generate(
input_ids,
max_length=50,
num_beams=5,
no_repeat_ngram_size=2,
early_stopping=True
)
print(tokenizer.decode(output[0], skip_special_tokens=True))
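The call above uses beam search. Sampling-based decoding is a common alternative inference strategy; the sketch below shows top-k / nucleus sampling with temperature (the parameter values are illustrative, not recommendations).

```python
# Sampling-based decoding: draw each token from a truncated, rescaled distribution
# instead of keeping only the highest-scoring beams.
sampled = model.generate(
    input_ids,
    max_length=50,
    do_sample=True,   # sample instead of using greedy/beam search
    top_k=50,         # keep only the 50 most likely next tokens
    top_p=0.95,       # nucleus sampling: smallest set with cumulative probability >= 0.95
    temperature=0.8,  # < 1.0 sharpens the distribution, > 1.0 flattens it
)
print(tokenizer.decode(sampled[0], skip_special_tokens=True))
```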
import torch
import torch.nn as nn

class GPT(nn.Module):
    """A minimal GPT-style decoder-only Transformer language model."""

    def __init__(self, vocab_size, d_model, nhead, num_layers, dim_feedforward,
                 max_seq_length, pos_dropout, embd_pdrop, attn_pdrop, resid_pdrop):
        super().__init__()
        # Learned token and position embeddings.
        self.pos_emb = nn.Embedding(max_seq_length, d_model)
        self.pos_drop = nn.Dropout(pos_dropout)
        self.tok_emb = nn.Embedding(vocab_size, d_model)
        self.drop = nn.Dropout(embd_pdrop)
        # Stack of Transformer blocks; Block is assumed to be a standard decoder block
        # (a sketch is given below this listing). A ModuleList is used instead of
        # nn.Sequential so that an attention mask can be passed to each block.
        self.blocks = nn.ModuleList([
            Block(d_model, nhead, dim_feedforward, attn_pdrop, resid_pdrop)
            for _ in range(num_layers)
        ])
        self.norm = nn.LayerNorm(d_model)
        self.head = nn.Linear(d_model, vocab_size, bias=False)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=0.02)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, x, mask=None):
        # x: (batch, seq_len) token ids.
        seq_len = x.size(1)
        pos = torch.arange(0, seq_len, dtype=torch.long, device=x.device)
        pos = pos.unsqueeze(0).expand_as(x)
        x = self.tok_emb(x) + self.pos_emb(pos)
        x = self.pos_drop(x)
        for block in self.blocks:
            x = block(x, mask)
        x = self.norm(x)
        return self.head(x)  # (batch, seq_len, vocab_size) logits
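The listing above assumes a Block class that is not shown in the original snippet. Below is one possible minimal sketch of such a block, built from PyTorch's nn.MultiheadAttention with a causal mask and a position-wise feed-forward sublayer; the exact layout (pre-norm, GELU, dropout placement) is an assumption, not the author's definition.

```python
import torch
import torch.nn as nn

class Block(nn.Module):
    """Illustrative pre-norm Transformer decoder block:
    causal self-attention + feed-forward, each wrapped in a residual connection."""

    def __init__(self, d_model, nhead, dim_feedforward, attn_pdrop, resid_pdrop):
        super().__init__()
        self.ln1 = nn.LayerNorm(d_model)
        self.attn = nn.MultiheadAttention(
            d_model, nhead, dropout=attn_pdrop, batch_first=True)
        self.ln2 = nn.LayerNorm(d_model)
        self.mlp = nn.Sequential(
            nn.Linear(d_model, dim_feedforward),
            nn.GELU(),
            nn.Linear(dim_feedforward, d_model),
            nn.Dropout(resid_pdrop),
        )

    def forward(self, x, mask=None):
        # Causal mask (True = blocked) so position i cannot attend to positions > i.
        seq_len = x.size(1)
        causal = torch.triu(
            torch.ones(seq_len, seq_len, dtype=torch.bool, device=x.device),
            diagonal=1)
        if mask is not None:
            # Optional extra (seq_len, seq_len) boolean mask supplied by the caller.
            causal = causal | mask
        h = self.ln1(x)
        attn_out, _ = self.attn(h, h, h, attn_mask=causal)
        x = x + attn_out
        x = x + self.mlp(self.ln2(x))
        return x
```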
import torch.optim as optim

def train(model, data, optimizer, epochs, device):
    """Train the GPT model with the standard next-token prediction loss."""
    criterion = nn.CrossEntropyLoss()
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in data:
            batch = batch.to(device)          # (batch, seq_len) token ids
            optimizer.zero_grad()
            output = model(batch[:, :-1])     # predict token i+1 from tokens <= i
            targets = batch[:, 1:]            # targets are the inputs shifted by one
            loss = criterion(output.reshape(-1, output.size(-1)),
                             targets.reshape(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch: {epoch}, Loss: {total_loss / len(data)}")

# Hyperparameters (illustrative values from the original snippet).
vocab_size = 10000
d_model = 768
nhead = 12
num_layers = 12
dim_feedforward = 3072
max_seq_length = 512
pos_dropout = 0.1
embd_pdrop = 0.1
attn_pdrop = 0.1
resid_pdrop = 0.1

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GPT(vocab_size, d_model, nhead, num_layers, dim_feedforward,
            max_seq_length, pos_dropout, embd_pdrop, attn_pdrop, resid_pdrop)
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# train_data is assumed to be a DataLoader yielding (batch, seq_len) LongTensors of token ids.
train(model, train_data, optimizer, epochs=10, device=device)
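To close the loop, here is a hedged sketch of how text could be sampled from this custom GPT once it has been trained. The greedy decoding loop below mirrors the inference-strategy discussion earlier in this chapter and is not part of the original code; the prompt token ids are purely hypothetical.

```python
@torch.no_grad()
def generate(model, prompt_ids, max_new_tokens, device):
    """Greedy decoding: repeatedly append the most probable next token."""
    model.eval()
    ids = prompt_ids.to(device)                     # (1, prompt_len) token ids
    for _ in range(max_new_tokens):
        logits = model(ids)                         # (1, seq_len, vocab_size)
        next_id = logits[:, -1, :].argmax(dim=-1, keepdim=True)
        ids = torch.cat([ids, next_id], dim=1)      # append the chosen token
    return ids

# Example: start from a (hypothetical) prompt of token ids 1, 2, 3.
prompt_ids = torch.tensor([[1, 2, 3]], dtype=torch.long)
print(generate(model, prompt_ids, max_new_tokens=20, device=device))
```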
Author: 禅与计算机程序设计艺术 / Zen and the Art of Computer Programming