
Fine-Tuning the Large Language Model BERT with LoRA


LoRA is a popular technique for fine-tuning large language models because it only adds low-rank bypass matrices at the places in the pretrained model that need to be adapted. The authors of LoRA also provide an easy-to-use library, loralib, which greatly simplifies fine-tuning with LoRA: it lets users attach LoRA layers to an existing model architecture without having to understand the underlying implementation details. This makes LoRA a very practical technique for both researchers and practitioners. Below is a concrete example of fine-tuning a BERT model with LoRA.
The figure below illustrates LoRA fine-tuning of the self-attention matrix W^Q in BERT. As shown, W^Q is frozen, and a pair of low-rank bypass matrices A and B are added and trained instead. Fine-tuning W^K with LoRA works in exactly the same way.
[Figure: LoRA fine-tuning of the self-attention matrix W^Q in BERT. The pretrained W^Q is frozen while the low-rank bypass matrices A and B are trained.]
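To make the bypass concrete, here is a minimal standalone sketch (illustrative only, not part of the article's code; the class name and initialization choices are ours) of the computation h = W_0 x + (alpha / r) * B A x, where the pretrained W_0 stays frozen and only A and B receive gradients.

import torch
import torch.nn as nn

class LoRABypassLinear(nn.Module):
    """Minimal LoRA bypass: frozen pretrained projection plus trainable low-rank update B @ A."""
    def __init__(self, base_linear: nn.Linear, r: int = 8, alpha: int = 16):
        super().__init__()
        self.base = base_linear                        # pretrained W_0 (and bias), kept frozen
        for p in self.base.parameters():
            p.requires_grad_(False)
        d_out, d_in = base_linear.weight.shape
        self.A = nn.Parameter(torch.randn(r, d_in) * 0.01)   # low-rank factor A
        self.B = nn.Parameter(torch.zeros(d_out, r))          # B starts at zero, so the bypass is a no-op at init
        self.scaling = alpha / r

    def forward(self, x):
        # W_0 x + (alpha / r) * B A x
        return self.base(x) + (x @ self.A.T @ self.B.T) * self.scaling

# Example: wrap a 768-dim projection; only A and B (2 * 768 * r parameters) are trainable
layer = LoRABypassLinear(nn.Linear(768, 768), r=8, alpha=16)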
Below is the concrete code for fine-tuning the self-attention matrices of a BERT model with LoRA. It is adapted from the open-source BERT implementation in Hugging Face transformers, available at:
https://huggingface.co/transformers/v4.3.3/_modules/transformers/models/bert/modeling_bert.html

The LoRA fine-tuning code is as follows:
# Environment setup
# pip install loralib
# or
# pip install git+https://github.com/microsoft/LoRA
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertTokenizer, BertForPreTraining
from transformers.models.bert.modeling_bert import BertSelfAttention

import loralib as lora

class LoraBertSelfAttention(BertSelfAttention):
    """
    Subclass of BertSelfAttention that fine-tunes the Query and Value
    projections with LoRA.

    Args:
    - r (int): rank of the LoRA decomposition
    - config: the BERT model configuration
    """
    def __init__(self, r=8, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Total hidden size of the attention block (num_heads * head_size)
        d = self.all_head_size
        # LoRA adapters from loralib; only their low-rank factors lora_A / lora_B are trained
        self.lora_query_adapter = lora.Linear(d, d, r=r)
        self.lora_value_adapter = lora.Linear(d, d, r=r)

    def lora_query(self, x):
        """
        Query projection with LoRA bypass: W_q x + B A x.
        self.query holds the frozen pretrained W_q; the low-rank update is
        built from the adapter's lora_A / lora_B factors and its scaling.
        """
        lora_update = F.linear(x, self.lora_query_adapter.lora_B @ self.lora_query_adapter.lora_A)
        return self.query(x) + lora_update * self.lora_query_adapter.scaling

    def lora_value(self, x):
        """
        Value projection with LoRA bypass: W_v x + B A x.
        """
        lora_update = F.linear(x, self.lora_value_adapter.lora_B @ self.lora_value_adapter.lora_A)
        return self.value(x) + lora_update * self.lora_value_adapter.scaling
    
    
    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
    ):
        """
        Same as BertSelfAttention.forward, except that the operations
        involving the Query and Value matrices go through the LoRA bypass.
        """
        # Query projection fine-tuned with LoRA
        mixed_query_layer = self.lora_query(hidden_states)
        is_cross_attention = encoder_hidden_states is not None
        if is_cross_attention and past_key_value is not None:
            # reuse k,v, cross_attentions
            key_layer = past_key_value[0]
            value_layer = past_key_value[1]
            attention_mask = encoder_attention_mask
        elif is_cross_attention:
            key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
            # Value projection fine-tuned with LoRA
            value_layer = self.transpose_for_scores(self.lora_value(hidden_states))
            attention_mask = encoder_attention_mask
        elif past_key_value is not None:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            # Value projection fine-tuned with LoRA
            value_layer = self.transpose_for_scores(self.lora_value(hidden_states))
            key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
            value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
        else:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            # Value projection fine-tuned with LoRA
            value_layer = self.transpose_for_scores(self.lora_value(hidden_states))
        query_layer = self.transpose_for_scores(mixed_query_layer)

        if self.is_decoder:
            past_key_value = (key_layer, value_layer)
        # Dot product between the Query and Key projections gives the raw attention scores
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            seq_length = hidden_states.size()[1]
            position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
            position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
            distance = position_ids_l - position_ids_r
            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility
            if self.position_embedding_type == "relative_key":
                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores
            elif self.position_embedding_type == "relative_key_query":
                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            attention_scores = attention_scores + attention_mask
        attention_probs = nn.Softmax(dim=-1)(attention_scores)
        attention_probs = self.dropout(attention_probs)
        if head_mask is not None:
            attention_probs = attention_probs * head_mask
        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)
        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        if self.is_decoder:
            outputs = outputs + (past_key_value,)
        return outputs

class LoraBert(nn.Module):
    def __init__(self, task_type, num_classes=None, dropout_rate=0.1, model_id="bert-base-cased",
                 lora_rank=8, train_biases=True, train_embeddings=False, train_layer_norms=True):
        """
        - task_type: type of downstream task, e.g. 'glue', 'squad_v1', 'squad_v2'.
        - num_classes: number of output classes.
        - model_id: ID of the pretrained BERT model, e.g. "bert-base-uncased", "bert-large-uncased".
        - lora_rank: rank of the LoRA decomposition.
        - train_biases, train_embeddings, train_layer_norms: whether these parameter groups stay trainable.
        """
        super().__init__()
        self.task_type = task_type
        self.lora_rank = lora_rank
        self.train_biases = train_biases
        self.train_embeddings = train_embeddings
        self.train_layer_norms = train_layer_norms
        # 1. Load the pretrained weights
        self.model_id = model_id
        self.tokenizer = BertTokenizer.from_pretrained(model_id)
        self.model = BertForPreTraining.from_pretrained(model_id)
        self.model_config = self.model.config
        # 2. Add the task-specific head
        d_model = self.model_config.hidden_size
        self.finetune_head_norm = nn.LayerNorm(d_model)
        self.finetune_head_dropout = nn.Dropout(dropout_rate)
        self.finetune_head_classifier = nn.Linear(d_model, num_classes)
        # 3. Prepare the model for LoRA fine-tuning
        self.replace_self_attention(self.model)
        self.freeze_parameters()
        
    def replace_self_attention(self, model):
        """
        Recursively replace every BertSelfAttention module in the pretrained
        model with our LoraBertSelfAttention.
        """
        for name, module in model.named_children():
            if isinstance(module, BertSelfAttention):
                layer = LoraBertSelfAttention(r=self.lora_rank, config=self.model_config)
                # Copy the pretrained attention weights; the LoRA factors keep their initialization
                layer.load_state_dict(module.state_dict(), strict=False)
                setattr(model, name, layer)
            else:
                self.replace_self_attention(module)
                
                
    def freeze_parameters(self):
        """
        Freeze every parameter except those involved in LoRA fine-tuning:
        the LoRA factors, the fine-tuning head, and (optionally) the bias,
        embedding, and layer-norm parameters.
        """
        for name, param in self.model.named_parameters():
            is_trainable = (
                "lora_" in name or
                "finetune_head_" in name or
                (self.train_biases and "bias" in name) or
                (self.train_embeddings and "embeddings" in name) or
                (self.train_layer_norms and "LayerNorm" in name)
            )
            param.requires_grad = is_trainable
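As a quick sanity check, the hypothetical snippet below instantiates the wrapper for a two-class GLUE-style task and reports the fraction of trainable parameters (the argument values are illustrative). Note that loralib also ships lora.mark_only_lora_as_trainable(model), which can replace the manual freeze_parameters loop when only the LoRA factors (and optionally biases) should stay trainable.

# Hypothetical usage: wrap a pretrained BERT and inspect how many parameters remain trainable
lora_bert = LoraBert(task_type="glue", num_classes=2, lora_rank=8)

trainable = sum(p.numel() for p in lora_bert.parameters() if p.requires_grad)
total = sum(p.numel() for p in lora_bert.parameters())
print(f"trainable params: {trainable} / {total} ({100 * trainable / total:.2f}%)")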
The peft library implements many parameter-efficient fine-tuning methods for large models, LoRA among them, and it is compatible with the transformers library. The following example uses peft to fine-tune the large model flan-T5-xxl with LoRA:


# Fine-tune flan-T5-xxl with LoRA
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType

# Model card: https://huggingface.co/google/flan-t5-xxl
model_name_or_path = "google/flan-t5-xxl"

model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path, load_in_8bit=True, device_map="auto")
# Prepare the 8-bit model for training (casts layer norms, enables gradient checkpointing, etc.)
model = prepare_model_for_int8_training(model)

peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q", "v"],  # only adapt the Query and Value projections
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,
)
model = get_peft_model(model, peft_config)
# Print the number of trainable parameters
model.print_trainable_parameters()
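After training (for example with transformers' Trainer or a plain PyTorch loop), only the small LoRA adapter needs to be stored rather than the full model. A brief sketch using peft's standard save/reload API follows; the output directory name is illustrative.

# Save only the LoRA adapter weights (a few MB) instead of the full model
model.save_pretrained("flan-t5-xxl-lora-adapter")  # directory name is illustrative

# Later: reload the base model and attach the trained adapter
from peft import PeftModel
base = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path, load_in_8bit=True, device_map="auto")
model = PeftModel.from_pretrained(base, "flan-t5-xxl-lora-adapter")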
