git clone --recursive https://github.com/QwenLM/qwen.cpp && cd qwen.cpp
git submodule update --init --recursive
python3 qwen_cpp/convert.py -i /mnt/workspace/qwen.cpp/Qianwen/qwen/Qwen-7B-Chat -t q4_0 -o qwen7b-ggml.bin

cmake -B build
cmake --build build -j --config Release
./build/bin/main -m ./qwen7b-ggml.bin --tiktoken /mnt/workspace/qwen.cpp/Qianwen/qwen/Qwen-7B-Chat/qwen.tiktoken -p 你好
2.1 Qwen的convert

Convert Hugging Face Qwen models to GGML format
import argparse
import platform
import struct
import sys
from enum import Enum
from pathlib import Path
from typing import BinaryIO

import torch
from tabulate import tabulate
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM

# Number of weights grouped into one quantization block, per GGML format.
GGML_QK8_0 = 32
GGML_QK4_0 = 32
GGML_QK4_1 = 32
GGML_QK5_0 = 32
GGML_QK5_1 = 32

# Byte alignment applied to the file offset of each tensor's data; dump_tensor
# reads this name when rounding f.tell() up before writing.
# NOTE(review): 16 is the value ggml uses for GGML_MEM_ALIGN — confirm against
# the ggml version the loader is built with.
GGML_MEM_ALIGN = 16


# transformers checks that every package a model asks for is importable, and
# cpm_kernels is not available on macOS. Registering a placeholder entry in
# sys.modules lets that check succeed there.
if platform.system() == "Darwin":
    sys.modules.update(cpm_kernels=object())

class GGMLType(Enum):
    """Tensor storage types that this converter can write.

    The integer value of a member is what gets serialized into the tensor
    header (see ``ggml_type.value`` in ``dump_tensor``).
    NOTE(review): the values presumably mirror ggml's own ``ggml_type`` enum —
    confirm against ggml.h before adding members.
    """

    # unquantized floating-point storage
    F32 = 0
    F16 = 1
    # 4-bit block quantization (Q4_1 additionally stores a per-block minimum)
    Q4_0 = 2
    Q4_1 = 3
    # ids 4 and 5 are intentionally not defined here
    # 5-bit block quantization (Q5_1 additionally stores a per-block minimum)
    Q5_0 = 6
    Q5_1 = 7
    # 8-bit block quantization
    Q8_0 = 8

def quantize_q8_0(tensor: torch.Tensor) -> torch.CharTensor:
    """Quantize a 2-D float tensor into packed Q8_0 blocks.

    Every run of ``GGML_QK8_0`` consecutive values becomes one block: a
    float16 scale followed by one signed 8-bit quant per value.

    Args:
        tensor: 2-D tensor whose second dimension is a multiple of
            ``GGML_QK8_0``.

    Returns:
        int8 tensor with one row per block, laid out as
        ``[scale (2 bytes) | quants (GGML_QK8_0 bytes)]``.
    """
    # intended to be equivalent to ggml_quantize_q8_0 in ggml.c
    assert tensor.shape[1] % GGML_QK8_0 == 0
    blocks = tensor.view(-1, GGML_QK8_0)
    scale = blocks.abs().max(dim=-1, keepdim=True).values / ((1 << 7) - 1)
    # An all-zero block has scale 0. Dividing by it yields NaN, and casting NaN
    # to int8 is undefined, so divide such blocks by 1 instead (quants become
    # 0). The stored scale is still the true (zero) scale.
    divisor = torch.where(scale == 0, torch.ones_like(scale), scale)
    quants = (blocks / divisor).round().clamp(min=-128, max=127).char()
    # prepend the scale bytes to each block
    return torch.cat((scale.half().view(torch.int8), quants), dim=-1)

def quantize_q4_0(tensor: torch.Tensor) -> torch.CharTensor:
    """Quantize a 2-D float tensor into packed Q4_0 blocks.

    Every run of ``GGML_QK4_0`` (32) consecutive values becomes one block: a
    float16 scale followed by 16 bytes, each holding two 4-bit quants.

    Args:
        tensor: 2-D tensor whose second dimension is a multiple of
            ``GGML_QK4_0``.

    Returns:
        int8 tensor with one row per block, laid out as
        ``[scale (2 bytes) | packed quants (16 bytes)]``.
    """
    # intended to be equivalent to ggml_quantize_q4_0 in ggml.c
    assert tensor.shape[1] % GGML_QK4_0 == 0
    blocks = tensor.view(-1, GGML_QK4_0)  # one row per block
    # index of the element with the largest magnitude in each block ...
    abs_max_indices = blocks.abs().max(dim=-1, keepdim=True).indices
    # ... and that element's signed value
    max_values = torch.take_along_dim(blocks, abs_max_indices, dim=-1)
    scale = max_values / -8
    # An all-zero block has scale 0. Dividing by it yields NaN, and casting NaN
    # to int8 is undefined, so divide such blocks by 1 instead (every quant
    # becomes the mid-point 8). The stored scale is still the true scale.
    divisor = torch.where(scale == 0, torch.ones_like(scale), scale)
    quants = (blocks / divisor + 8).round().clamp(min=0, max=15).char()
    # compress two int4 weights into an int8: element j goes to the low
    # nibble, element j + 16 to the high nibble
    packed = quants[:, :16] | (quants[:, 16:] << 4)
    # prepend the scale bytes to each block
    return torch.cat((scale.half().view(torch.int8), packed), dim=-1)

def quantize_q4_1(tensor: torch.Tensor) -> torch.CharTensor:
    """Quantize a 2-D float tensor into packed Q4_1 blocks.

    Every run of ``GGML_QK4_1`` (32) consecutive values becomes one block: a
    float16 scale, a float16 minimum, and 16 bytes each holding two 4-bit
    quants of ``(value - min) / scale``.

    Args:
        tensor: 2-D tensor whose second dimension is a multiple of
            ``GGML_QK4_1``.

    Returns:
        int8 tensor with one row per block, laid out as
        ``[scale (2 bytes) | min (2 bytes) | packed quants (16 bytes)]``.
    """
    # intended to be equivalent to ggml_quantize_q4_1 in ggml.c
    assert tensor.shape[1] % GGML_QK4_1 == 0
    blocks = tensor.view(-1, GGML_QK4_1)
    min_vals = blocks.min(dim=-1, keepdim=True).values
    max_vals = blocks.max(dim=-1, keepdim=True).values
    scale = (max_vals - min_vals) / ((1 << 4) - 1)
    # A constant block (max == min) has scale 0. Dividing by it yields NaN,
    # and casting NaN to int8 is undefined, so divide such blocks by 1 instead
    # (quants become 0, which decodes back to min).
    divisor = torch.where(scale == 0, torch.ones_like(scale), scale)
    quants = ((blocks - min_vals) / divisor).round().clamp(min=0, max=15).char()
    # compress two int4 weights into an int8: element j goes to the low
    # nibble, element j + 16 to the high nibble
    packed = quants[:, :16] | (quants[:, 16:] << 4)
    # prepend scale & min bytes to each block
    return torch.cat(
        (scale.half().view(torch.int8), min_vals.half().view(torch.int8), packed), dim=-1
    )

def quantize_q5_0(tensor: torch.Tensor) -> torch.CharTensor:
    """Quantize a 2-D float tensor into packed Q5_0 blocks.

    Every run of ``GGML_QK5_0`` (32) consecutive values becomes one block: a
    float16 scale, a 32-bit mask holding the fifth (high) bit of each quant,
    and 16 bytes each holding the low four bits of two quants.

    Args:
        tensor: 2-D tensor whose second dimension is a multiple of
            ``GGML_QK5_0``.

    Returns:
        int8 tensor with one row per block, laid out as
        ``[scale (2 bytes) | high bits (4 bytes) | low nibbles (16 bytes)]``.
    """
    # intended to be equivalent to ggml_quantize_q5_0 in ggml.c
    assert tensor.shape[1] % GGML_QK5_0 == 0
    blocks = tensor.view(-1, GGML_QK5_0)
    # signed value of the largest-magnitude element of every block
    abs_max_indices = blocks.abs().max(dim=-1, keepdim=True).indices
    max_values = torch.take_along_dim(blocks, abs_max_indices, dim=-1)
    scale = max_values / -16
    # An all-zero block has scale 0. Dividing by it yields NaN, and casting NaN
    # to int8 is undefined, so divide such blocks by 1 instead (every quant
    # becomes the mid-point 16). The stored scale is still the true scale.
    divisor = torch.where(scale == 0, torch.ones_like(scale), scale)
    quants = (blocks / divisor + 16).round().clamp(min=0, max=31).char()
    # low four bits: element j in the low nibble, element j + 16 in the high
    # nibble. The second operand must slice columns (``[:, 16:]``); slicing
    # rows here would OR together tensors of mismatched shape.
    qs = (quants[:, :16] & 0x0F) | (quants[:, 16:] << 4)
    # fifth bit of element i is stored in bit i of a per-block int32
    qh = torch.zeros(quants.shape[:-1], dtype=torch.int32)
    for i in range(32):
        qh |= ((quants[:, i] & 0x10) >> 4).int() << i

    # prepend scale and high-bit mask to each block
    return torch.cat((scale.half().view(torch.int8), qh[..., None].view(torch.int8), qs), dim=-1)

def quantize_q5_1(tensor: torch.Tensor) -> torch.CharTensor:
    """Quantize a 2-D float tensor into packed Q5_1 blocks.

    Every run of ``GGML_QK5_1`` (32) consecutive values becomes one block: a
    float16 scale, a float16 minimum, a 32-bit mask holding the fifth (high)
    bit of each quant, and 16 bytes each holding the low four bits of two
    quants of ``(value - min) / scale``.

    Args:
        tensor: 2-D tensor whose second dimension is a multiple of
            ``GGML_QK5_1``.

    Returns:
        int8 tensor with one row per block, laid out as
        ``[scale (2) | min (2) | high bits (4) | low nibbles (16)]`` bytes.
    """
    # intended to be equivalent to ggml_quantize_q5_1 in ggml.c
    assert tensor.shape[1] % GGML_QK5_1 == 0
    blocks = tensor.view(-1, GGML_QK5_1)
    min_vals = blocks.min(dim=-1, keepdim=True).values
    max_vals = blocks.max(dim=-1, keepdim=True).values
    scale = (max_vals - min_vals) / ((1 << 5) - 1)
    # A constant block (max == min) has scale 0. Dividing by it yields NaN,
    # and casting NaN to int8 is undefined, so divide such blocks by 1 instead
    # (quants become 0, which decodes back to min).
    divisor = torch.where(scale == 0, torch.ones_like(scale), scale)
    quants = ((blocks - min_vals) / divisor).round().clamp(min=0, max=31).char()
    # low four bits: element j in the low nibble, element j + 16 in the high nibble
    qs = (quants[:, :16] & 0x0F) | (quants[:, 16:] << 4)
    # fifth bit of element i is stored in bit i of a per-block int32
    qh = torch.zeros(quants.shape[:-1], dtype=torch.int32)
    for i in range(32):
        qh |= ((quants[:, i] & 0x10) >> 4).int() << i

    # prepend scale, min and high-bit mask to each block
    return torch.cat(
        (scale.half().view(torch.int8), min_vals.half().view(torch.int8), qh[..., None].view(torch.int8), qs), dim=-1
    )

def dump_tensor(f, name: str, tensor: torch.Tensor, ggml_type: GGMLType):
    """Serialize one named tensor to ``f`` in the requested GGML type.

    Layout written, in order: name length (int), name bytes, number of
    dimensions (int), each dimension (int), the GGML type id (int), padding
    up to the next ``GGML_MEM_ALIGN`` boundary, then the raw tensor bytes.

    Args:
        f: seekable binary file object opened for writing.
        name: tensor name; written as its encoded byte string.
        tensor: float32 tensor to store.
        ggml_type: target storage type; quantized types go through the
            matching ``quantize_*`` function.

    Raises:
        NotImplementedError: if ``ggml_type`` is not one of the handled types.
    """
    assert tensor.dtype == torch.float32

    # tensor name: length prefix followed by the bytes themselves, so that a
    # reader can recover the name
    encoded_name = name.encode()
    f.write(struct.pack("i", len(encoded_name)))
    f.write(encoded_name)

    # tensor shape & dtype
    f.write(struct.pack("i" * (2 + tensor.ndim), tensor.ndim, *tensor.shape, ggml_type.value))

    # tensor data
    if ggml_type == GGMLType.F32:
        tensor = tensor.float()
    elif ggml_type == GGMLType.F16:
        tensor = tensor.half()
    elif ggml_type == GGMLType.Q8_0:
        tensor = quantize_q8_0(tensor)
    elif ggml_type == GGMLType.Q4_0:
        tensor = quantize_q4_0(tensor)
    elif ggml_type == GGMLType.Q4_1:
        tensor = quantize_q4_1(tensor)
    elif ggml_type == GGMLType.Q5_0:
        tensor = quantize_q5_0(tensor)
    elif ggml_type == GGMLType.Q5_1:
        tensor = quantize_q5_1(tensor)
    else:
        # only unknown types are an error; Q5_1 itself is supported
        raise NotImplementedError(f"Cannot dump tensor of dtype {tensor.dtype}")

    # align address: round the current offset up to a multiple of
    # GGML_MEM_ALIGN, move there, and write the tensor bytes
    aligned_pos = (f.tell() + (GGML_MEM_ALIGN - 1)) // GGML_MEM_ALIGN * GGML_MEM_ALIGN
    f.seek(aligned_pos)
    tensor.numpy().tofile(f)
'''OrderedDict([('transformer.wte.weight', tensor([[-1.6846e-02, -9.5825e-03,  8.1787e-03,  ...,  1.6357e-02,
          1.3351e-03,  1.7578e-02],
        [ 2.9297e-03,  9.6436e-03, -6.0425e-03,  ...,  1.8799e-02,
         -8.4839e-03,  9.1553e-03],
        [ 1.6235e-02, -2.2461e-02, -1.0193e-02,  ...,  3.9307e-02,
          2.3071e-02, -1.2589e-03],
        [ 1.1921e-07, -8.4043e-06,  2.6822e-05,  ...,  2.5153e-05,
         -4.7684e-06,  4.4703e-06],
        [-8.4639e-06,  1.2040e-05,  2.1219e-05,  ...,  1.9431e-05,
          3.8147e-06,  3.0398e-06],
        [-2.5749e-05,  2.4080e-05, -1.0252e-05,  ...,  2.9802e-06,
         -1.3709e-05,  1.4067e-05]], dtype=torch.float16)), 
         ('transformer.h.0.ln_1.weight', tensor([0.0977, 0.0884, 0.1050,  ..., 0.0938, 0.0957, 0.0938],
       ('transformer.h.0.attn.c_attn.weight', tensor([[ 0.0295, -0.0085,  0.0361,  ...,  0.0059, -0.0104,  0.0110],
        [ 0.0023, -0.0036, -0.0481,  ...,  0.0031,  0.0276,  0.0084],
        [ 0.0303, -0.0239, -0.0075,  ...,  0.0315,  0.0053, -0.0413],
        [-0.0037,  0.0234, -0.0079,  ...,  0.0068,  0.0258, -0.0160],
        [ 0.0034,  0.0060, -0.0135,  ..., -0.0159, -0.0009,  0.0036],
        [-0.0008,  0.0073,  0.0067,  ..., -0.0405, -0.0018,  0.0080]],
       ('transformer.h.0.attn.c_attn.bias', tensor([-9.4531e-01,  1.8828e+00, -7.4609e-01,  ...,  1.7548e-04,
        -3.1128e-03, -1.0605e-03], dtype=torch.float16)), 
        ('transformer.h.0.attn.c_proj.weight', tensor([[-7.2327e-03, -2.6550e-03, -6.4373e-05,  ..., -2.7954e-02,
          5.1880e-03,  1.2146e-02],
        [ 7.3242e-03,  3.7384e-03, -1.1047e-02,  ..., -8.5449e-03,
         -7.9956e-03, -2.6978e-02],
        [-2.9907e-02,  8.3618e-03, -6.0425e-03,  ...,  4.0771e-02,
         -1.6403e-03, -2.9541e-02],
        [ 9.2773e-03, -9.5215e-03,  4.6997e-03,  ...,  6.3782e-03,
          3.7003e-04, -8.9111e-03],
        [-3.8574e-02, -6.2256e-03, -3.8574e-02,  ..., -2.8839e-03,
          1.2665e-03, -8.3008e-03],
        [-4.7913e-03,  1.7090e-02,  3.9795e-02,  ..., -1.1292e-02,
         -1.6602e-02, -1.1215e-03]], dtype=torch.float16)),
        ('transformer.h.0.ln_2.weight', tensor([0.1768, 0.1719, 0.1680,  ..., 0.1611, 0.1719, 0.1660],
       ('transformer.h.0.mlp.w1.weight', tensor([[-0.0276,  0.0123, -0.0299,  ...,  0.0215, -0.0173, -0.0293],
        [-0.0036, -0.0060,  0.0062,  ..., -0.0009,  0.0176,  0.0119],
        [-0.0013, -0.0004,  0.0214,  ...,  0.0002, -0.0216,  0.0304],
        [-0.0161,  0.0149, -0.0017,  ..., -0.0085, -0.0064,  0.0126],
        [ 0.0002,  0.0021,  0.0122,  ...,  0.0026,  0.0254,  0.0322],
        [ 0.0033,  0.0184, -0.0200,  ..., -0.0052, -0.0137,  0.0056]],
       ('transformer.h.0.mlp.w2.weight', tensor([[-0.0264, -0.0044,  0.0160,  ...,  0.0128, -0.0221, -0.0148],
        [ 0.0217,  0.0097, -0.0359,  ..., -0.0049,  0.0225,  0.0077],
        [ 0.0192,  0.0140, -0.0132,  ..., -0.0040,  0.0102, -0.0449],
        [ 0.0005, -0.0295, -0.0195,  ..., -0.0117,  0.0026, -0.0044],
        [ 0.0035, -0.0194, -0.0269,  ..., -0.0302, -0.0015, -0.0043],
        [-0.0215, -0.0140,  0.0017,  ...,  0.0043,  0.0157,  0.0112]],
       ('transformer.h.0.mlp.c_proj.weight', tensor([[-0.0042, -0.0111, -0.0013,  ...,  0.0018, -0.0116,  0.0288],
        [ 0.0192,  0.0131, -0.0081,  ...,  0.0120,  0.0176,  0.0120],
        [ 0.0030,  0.0109,  0.0067,  ...,  0.0003,  0.0166, -0.0005],
        [-0.0120,  0.0133,  0.0131,  ..., -0.0273,  0.0016, -0.0011],
        [ 0.0320,  0.0041, -0.0244,  ..., -0.0078,  0.0053,  0.0132],
        [-0.0337, -0.0137, -0.0013,  ..., -0.0088,  0.0315, -0.0094]],
       ('transformer.h.1.ln_1.weight', tensor([0.1348, 0.0913, 0.1045,  ..., 0.1289, 0.0806, 0.0938],
       ('transformer.h.1.attn.c_attn.weight', tensor([[ 4.4922e-02, -8.2397e-03,  2.1484e-02,  ..., -7.0496e-03,
         -1.6724e-02, -3.0670e-03],
        [-2.2583e-02,  2.9449e-03, -3.7994e-03,  ...,  9.8877e-03,
          7.4768e-03, -2.1210e-03],
        [-3.3447e-02,  7.6599e-03, -1.7822e-02,  ...,  1.5869e-02,
          1.9775e-02, -2.3193e-02],
        [-4.5166e-03,  3.6621e-03,  1.8799e-02,  ..., -9.1553e-03,
          4.2152e-04, -1.0803e-02],
        [-6.7139e-03,  8.2397e-03,  1.2756e-02,  ..., -1.7929e-03,
         -2.6733e-02, -5.9843e-05],
        [-1.1963e-02, -1.5259e-02, -9.7046e-03,  ...,  2.1210e-03,
         -9.5215e-03, -7.6294e-04]], dtype=torch.float16)), 
         ('transformer.h.1.attn.c_attn.bias', tensor([ 4.1797e-01, -1.1172e+00, -1.1094e+00,  ...,  4.0436e-04,
        -1.9455e-03,  1.8921e-03], dtype=torch.float16)), 
        ('transformer.h.1.attn.c_proj.weight', tensor([[ 0.0018,  0.0124,  0.0074,  ..., -0.0155, -0.0009,  0.0004],
        [-0.0031, -0.0286,  0.0116,  ...,  0.0066,  0.0021,  0.0006],
        [ 0.0028, -0.0011,  0.0085,  ..., -0.0044, -0.0176,  0.0090],
        [ 0.0216,  0.0042, -0.0164,  ...,  0.0072,  0.0076,  0.0093],
        [-0.0007,  0.0248, -0.0166,  ...,  0.0151,  0.0106,  0.0018],
        [ 0.0001, -0.0057, -0.0295,  ...,  0.0075, -0.0088, -0.0012]],
       ('transformer.h.1.ln_2.weight', tensor([0.2441, 0.2441, 0.2393,  ..., 0.2217, 0.2363, 0.2305],
       dtype=torch.float16)), ...])'''
       dtype=torch.float16)), ...])'''

def dump_state_dict(f, weight_names, state_dict, ggml_type):
    """Write the selected weights to ``f`` and print a summary table.

    2-D weights are stored in ``ggml_type``; 1-D weights are always stored as
    float32. Tensors of any other rank trip the assertion below.

    Args:
        f: binary file object passed through to ``dump_tensor``.
        weight_names: iterable of keys into ``state_dict``, in output order.
        state_dict: mapping from weight name to tensor.
        ggml_type: storage type applied to 2-D weights.
    """
    tensor_info = []
    for name in tqdm(weight_names, desc="Processing model states"):
        # both branches hand dump_tensor a float32 tensor
        tensor = state_dict[name].float()
        if tensor.ndim == 2:
            # 2d weight: quantize it into the requested ggml format
            tensor_ggml_type = ggml_type
        else:
            # 1d weight: keep it as float32
            assert tensor.ndim == 1
            tensor_ggml_type = GGMLType.F32

        dump_tensor(f, name, tensor, tensor_ggml_type)
        # e.g. ('transformer.wte.weight', torch.Size([151936, 4096]), 'Q4_0')
        tensor_info.append((name, tensor.shape, tensor_ggml_type.name))

    print(tabulate(tensor_info, headers=["name", "shape", "dtype"], tablefmt="psql"))

self.transformer = QWenModel(config)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

  (transformer): QWenModel(
    (wte): Embedding(151936, 4096)
    (drop): Dropout(p=0.0, inplace=False)
    (rotary_emb): RotaryEmbedding()
    (h): ModuleList(
      (0-31): 32 x QWenBlock(
        (ln_1): RMSNorm()
        (attn): QWenAttention(
          (c_attn): Linear(in_features=4096, out_features=12288, bias=True)
          (c_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (attn_dropout): Dropout(p=0.0, inplace=False)
        (ln_2): RMSNorm()
        (mlp): QWenMLP(
          (w1): Linear(in_features=4096, out_features=11008, bias=False)
          (w2): Linear(in_features=4096, out_features=11008, bias=False)
          (c_proj): Linear(in_features=11008, out_features=4096, bias=False)
    (ln_f): RMSNorm()
  (lm_head): Linear(in_features=4096, out_features=151936, bias=False)

 vocab_size=151851, model_max_length=8192, is_fast=False, padding_side='right', 
 truncation_side='right', special_tokens={}, clean_up_tokenization_spaces=True),

class QwenConverter:
    """Writes a Qwen model to a GGML file: magic, config header, weights."""

    @classmethod
    def convert(cls, f, model, tokenizer, ggml_type):
        """Write the complete GGML file for ``model`` to ``f``.

        Args:
            f: binary file object opened for writing.
            model: loaded model; ``config``, ``generation_config`` and
                ``state_dict()`` are read from it.
            tokenizer: tokenizer passed through to ``dump_config``.
            ggml_type: ``GGMLType`` used for the 2-D weights.
        """
        f.write(b"ggml")  # magic
        cls.dump_config(f, model.config, model.generation_config, tokenizer, ggml_type)
        cls.dump_model(f, model, ggml_type)

    @staticmethod
    def dump_config(f, config, generation_config, tokenizer, ggml_type):
        """Write the model configuration to ``f`` as packed native ints."""
        # NOTE(review): the list of configuration fields is empty in this
        # excerpt, so nothing is written here; restore the field list from the
        # upstream converter before relying on the generated file.
        config_values = [
        ]
        f.write(struct.pack("i" * len(config_values), *config_values))

    @staticmethod
    def dump_model(f, model, ggml_type):
        """Write the model weights to ``f`` in a fixed order.

        Order: token embedding, then the eight per-layer weights
        ``transformer.h.{i}.*`` for every hidden layer, then the final norm
        and the LM head.
        """
        weight_names = ["transformer.wte.weight"]
        for i in range(model.config.num_hidden_layers):
            weight_names += [
                f"transformer.h.{i}.ln_1.weight",
                f"transformer.h.{i}.attn.c_attn.weight",
                f"transformer.h.{i}.attn.c_attn.bias",
                f"transformer.h.{i}.attn.c_proj.weight",
                f"transformer.h.{i}.ln_2.weight",
                f"transformer.h.{i}.mlp.w1.weight",
                f"transformer.h.{i}.mlp.w2.weight",
                f"transformer.h.{i}.mlp.c_proj.weight",
            ]
        weight_names += [
            "transformer.ln_f.weight",
            "lm_head.weight",
        ]
        dump_state_dict(f, weight_names, model.state_dict(), ggml_type)

def convert(f: BinaryIO, model_name_or_path: str, dtype: str = "q4_0"):
    """Load a Qwen checkpoint and write it to ``f`` in GGML format.

    Args:
        f: binary file object opened for writing.
        model_name_or_path: identifier or directory handed to
            ``from_pretrained``.
        dtype: name of a ``GGMLType`` member, in any letter case.

    Raises:
        KeyError: if ``dtype`` does not name a ``GGMLType`` member.
    """
    # member lookup is by upper-cased name, e.g. "q4_0" -> GGMLType.Q4_0
    target_type = GGMLType[dtype.upper()]
    # NOTE: trust_remote_code=True runs Python code shipped with the
    # checkpoint; only use it with sources you trust.
    # The tokenizer is loaded first, then the model.
    tokenizer, model = (
        loader.from_pretrained(model_name_or_path, trust_remote_code=True)
        for loader in (AutoTokenizer, AutoModelForCausalLM)
    )

    QwenConverter.convert(f, model, tokenizer, target_type)

def main():
    """Command-line entry point: parse the options and run the conversion.

    Options:
        -i / --model_name_or_path: model to load (default "Qwen/Qwen-7B-Chat").
        -o / --save_path: output file (default "qwen7b-ggml.bin").
        -t / --type: quantization type (default "q4_0").
    """
    parser = argparse.ArgumentParser("qwen-convert")
    parser.add_argument(
        "-i",
        "--model_name_or_path",
        default="Qwen/Qwen-7B-Chat",
        type=str,
        help="Model name or path used in AutoModel.from_pretrained",
    )
    parser.add_argument(
        "-o", "--save_path", default="qwen7b-ggml.bin", type=Path, help="Path to save the generated GGML model"
    )
    parser.add_argument(
        "-t",
        "--type",
        default="q4_0",
        type=str,
        choices=["f32", "f16", "q8_0", "q4_0", "q4_1", "q5_0", "q5_1"],
        help="GGML model quantization type",
    )
    args = parser.parse_args()

    # f is the output GGML file (qwen7b-ggml.bin unless -o is given)
    with open(args.save_path, "wb") as f:
        convert(f, args.model_name_or_path, dtype=args.type)

    print(f"GGML model saved to {args.save_path}")

# Run the converter only when this file is executed as a script.
if __name__ == "__main__":
    main()
# Fetch the 'shakechen/Llama-2-7b' snapshot through ModelScope, using the
# llama2pth directory as the cache location.
# NOTE(review): model_dir is presumably the local path of the downloaded
# snapshot — confirm against the modelscope documentation.
from modelscope import snapshot_download
model_dir = snapshot_download('shakechen/Llama-2-7b',cache_dir='/mnt/workspace/llama2.c/llama2pth')
def concat_weights(models):
    """Merge per-shard checkpoint dicts into one state dict.

    Args:
        models: non-empty list of dicts mapping parameter name to tensor.
            Every shard is expected to contain the keys of ``models[0]``.

    Returns:
        dict mapping each parameter name to a single tensor. With one shard,
        or for 1-D parameters, the first shard's tensor is used as is;
        otherwise the shards are concatenated.

    Side effects:
        Entries that were concatenated are deleted from every shard dict so
        that their memory can be released.
    """
    # Parameters whose shards are joined along dim 1; all others use dim 0.
    # NOTE(review): the excerpt had lost the first operand of this condition;
    # 'tok_embeddings.' is restored on the assumption that the checkpoints use
    # Meta's model-parallel Llama layout — confirm.
    axis_1_prefix = 'tok_embeddings.'
    axis_1_suffixes = ('.attention.wo.weight', '.feed_forward.w2.weight')

    state_dict = {}
    for name in list(models[0]):
        tensors = [model[name] for model in models]
        if len(tensors) == 1 or len(tensors[0].shape) == 1:
            # nothing to concatenate: take the first shard's tensor and move on
            state_dict[name] = tensors[0]
            continue
        is_axis_1 = name.startswith(axis_1_prefix) or name.endswith(axis_1_suffixes)
        axis = 1 if is_axis_1 else 0
        state_dict[name] = torch.cat(tensors, dim=axis)
        for model in models:
            del model[name]
    return state_dict
def load_and_export(model_path, output_path):
    """Load a checkpoint directory and merge its shards into one state dict.

    Reads ``params.json`` and every ``consolidated.*.pth`` file found in
    ``model_path``, then merges the loaded dicts with ``concat_weights``.

    NOTE(review): as shown here the function ends after merging —
    ``output_path``, ``params`` and ``state_dict`` are not used afterwards and
    nothing is written. The export step appears to be missing from this
    excerpt.
    """
    # Read the config file: json.load(f) parses the JSON content and stores it
    # in params.
    params_path = os.path.join(model_path, 'params.json')
    with open(params_path) as f:
        params = json.load(f)
    # Load each .pth file onto the CPU and collect the loaded model dicts in
    # the list models. In the author's run there was only one shard to concat.
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    model_paths = sorted(list(Path(model_path).glob('consolidated.*.pth')))
    models = [torch.load(p, map_location='cpu') for p in model_paths]
    state_dict = concat_weights(models)
    # drop the per-shard dicts now that the merged state dict exists
    del models
[{'tok_embeddings.weight': tensor([[ 1.2293e-06, -1.8179e-06, -4.3511e-06,  ...,  8.7172e-07,
         -6.5267e-06,  8.9034e-07],
        [ 1.8616e-03, -3.3722e-03,  3.9864e-04,  ..., -8.3008e-03,
          2.5787e-03, -3.9368e-03],
        [ 1.0986e-02,  9.8877e-03, -5.0964e-03,  ...,  2.5177e-03,
          7.7057e-04, -5.0049e-03],
        [-1.3977e-02, -2.7313e-03, -1.9897e-02,  ..., -1.0437e-02,
          9.5825e-03, -1.8005e-03],
        [-1.0742e-02,  9.3384e-03,  1.2939e-02,  ..., -3.3203e-02,
         -1.6357e-02,  3.3875e-03],
        [-8.3008e-03, -4.0588e-03, -1.1063e-03,  ...,  3.4790e-03,
         -1.2939e-02,  3.1948e-05]], dtype=torch.bfloat16), 'norm.weight': tensor([1.8672, 1.8672, 1.8047,  ..., 1.7188, 1.8281, 1.6016],
       dtype=torch.bfloat16), 'output.weight': tensor([[-0.0039,  0.0032, -0.0071,  ...,  0.0053, -0.0082,  0.0070],
        [-0.0315,  0.0466, -0.0023,  ..., -0.0211,  0.0173,  0.0334],
        [-0.0125,  0.0036,  0.0195,  ..., -0.0271,  0.0143, -0.0082],
        [-0.0281, -0.0195, -0.0024,  ...,  0.0123, -0.0117, -0.0237],
        [ 0.0229,  0.0255,  0.0315,  ...,  0.0067, -0.0092, -0.0058],
        [ 0.0080, -0.0088,  0.0063,  ..., -0.0293, -0.0200,  0.0337]],
       dtype=torch.bfloat16), 'layers.0.attention.wq.weight': tensor([[-0.0062, -0.0148, -0.0022,  ...,  0.0045,  0.0017, -0.0036],
        [ 0.0089, -0.0136,  0.0003,  ..., -0.0100, -0.0198,  0.0078],
        [ 0.0142, -0.0043,  0.0028,  ..., -0.0093, -0.0114,  0.0076],
        [ 0.0256,  0.0102,  0.0032,  ..., -0.0334, -0.0156, -0.0123],
        [-0.0086, -0.0022, -0.0008,  ...,  0.0237, -0.0081,  0.0059],
        [-0.0134, -0.0066,  0.0018,  ...,  0.0181,  0.0166, -0.0082]],
       dtype=torch.bfloat16), 'layers.0.attention.wk.weight': tensor([[-0.0162,  0.0079, -0.0013,  ...,  0.0166, -0.0099, -0.0135],
        [ 0.0255,  0.0170,  0.0019,  ..., -0.0081,  0.0113,  0.0103],
        [ 0.0192,  0.0015,  0.0036,  ..., -0.0211,  0.0152,  0.0234],
        [-0.0056,  0.0173, -0.0032,  ..., -0.0032,  0.0115, -0.0110],
        [ 0.0178, -0.0038,  0.0003,  ...,  0.0053, -0.0109,  0.0104],
        [ 0.0037, -0.0021,  0.0013,  ...,  0.0070, -0.0115,  0.0095]],
       dtype=torch.bfloat16), 'layers.0.attention.wv.weight': tensor([[ 0.0008, -0.0006,  0.0019,  ...,  0.0059, -0.0006,  0.0103],
        [-0.0069, -0.0005, -0.0077,  ..., -0.0106,  0.0126,  0.0048],
        [ 0.0018,  0.0096,  0.0010,  ...,  0.0048, -0.0139, -0.0142],
        [-0.0063, -0.0057,  0.0103,  ...,  0.0031,  0.0040, -0.0022],
        [ 0.0031,  0.0048, -0.0010,  ...,  0.0054,  0.0156,  0.0007],
        [ 0.0001,  0.0025,  0.0056,  ..., -0.0007, -0.0007,  0.0015]],
       dtype=torch.bfloat16), 'layers.0.attention.wo.weight': tensor([[-1.6212e-05, -1.9226e-03,  4.8828e-03,  ...,  5.9204e-03,
          3.4485e-03, -9.5215e-03],
        [ 2.7618e-03,  1.8463e-03, -1.2970e-03,  ..., -1.0300e-03,
          1.8082e-03,  6.2561e-03],
        [ 2.3346e-03, -2.7275e-04,  9.2697e-04,  ..., -1.6556e-03,
         -5.7373e-03, -6.3705e-04],
        [ 4.1809e-03, -3.3264e-03,  5.8899e-03,  ...,  1.2131e-03,
          2.6093e-03,  4.3030e-03],
        [-3.3569e-03, -2.4872e-03, -2.5787e-03,  ...,  6.1951e-03,
         -3.4790e-03, -5.1117e-04],
        [ 6.1951e-03, -6.5613e-04,  2.6245e-03,  ...,  5.4932e-03,
         -7.5989e-03, -6.6833e-03]], dtype=torch.bfloat16), 'layers.0.feed_forward.w1.weight': tensor([[ 1.5747e-02,  1.7090e-02,  3.1494e-02,  ..., -1.5869e-02,
          6.5002e-03,  1.5869e-02],
        [-2.1667e-03, -6.0120e-03,  5.6458e-03,  ...,  1.6113e-02,
         -8.6670e-03,  9.8877e-03],
        [ 6.8359e-03, -2.1606e-02,  2.0508e-02,  ..., -1.3000e-02,
          1.8921e-02,  1.9409e-02],
        [ 1.4126e-05, -3.2227e-02,  5.7983e-03,  ..., -8.9111e-03,
         -1.3489e-02,  4.0283e-02],
        [ 2.6611e-02,  2.0142e-02, -1.7090e-02,  ..., -3.4332e-03,
         -6.4087e-03, -1.8921e-02],
        [-5.9891e-04, -1.1353e-02, -2.3682e-02,  ...,  1.1063e-03,
          5.9204e-03, -2.4780e-02]], dtype=torch.bfloat16), 'layers.0.feed_forward.w2.weight': tensor([[ 0.0027, -0.0145,  0.0083,  ..., -0.0175, -0.0054,  0.0014],
        [ 0.0046, -0.0042,  0.0090,  ...,  0.0160, -0.0138,  0.0334],
        [ 0.0020,  0.0339, -0.0044,  ..., -0.0146,  0.0220,  0.0167],
        [-0.0089, -0.0114,  0.0052,  ...,  0.0231, -0.0135,  0.0295],
        [-0.0177,  0.0374,  0.0090,  ..., -0.0069, -0.0122, -0.0219],
        [ 0.0120, -0.0013, -0.0079,  ..., -0.0003, -0.0030, -0.0302]],
       dtype=torch.bfloat16), ...}]
解释:该 export 函数的主要目的是将模型权重以 32 位浮点(fp32)格式导出到一个 .bin 二进制文件中,以便 C 语言程序能够读取这些权重。

{'tok_embeddings.weight': tensor([[ 1.2293e-06, -1.8179e-06, -4.3511e-06,  ...,  8.7172e-07,
         -6.5267e-06,  8.9034e-07],
        [ 1.8616e-03, -3.3722e-03,  3.9864e-04,  ..., -8.3008e-03,
          2.5787e-03, -3.9368e-03],
        [ 1.0986e-02,  9.8877e-03, -5.0964e-03,  ...,  2.5177e-03,
          7.7057e-04, -5.0049e-03],
        [-1.3977e-02, -2.7313e-03, -1.9897e-02,  ..., -1.0437e-02,
          9.5825e-03, -1.8005e-03],
        [-1.0742e-02,  9.3384e-03,  1.2939e-02,  ..., -3.3203e-02,
         -1.6357e-02,  3.3875e-03],
        [-8.3008e-03, -4.0588e-03, -1.1063e-03,  ...,  3.4790e-03,
         -1.2939e-02,  3.1948e-05]], dtype=torch.bfloat16), 'norm.weight': tensor([1.8672, 1.8672, 1.8047,  ..., 1.7188, 1.8281, 1.6016],
       dtype=torch.bfloat16), 'output.weight': tensor([[-0.0039,  0.0032, -0.0071,  ...,  0.0053, -0.0082,  0.0070],
        [-0.0315,  0.0466, -0.0023,  ..., -0.0211,  0.0173,  0.0334],
        [-0.0125,  0.0036,  0.0195,  ..., -0.0271,  0.0143, -0.0082],
        [-0.0281, -0.0195, -0.0024,  ...,  0.0123, -0.0117, -0.0237],
        [ 0.0229,  0.0255,  0.0315,  ...,  0.0067, -0.0092, -0.0058],
        [ 0.0080, -0.0088,  0.0063,  ..., -0.0293, -0.0200,  0.0337]],
       dtype=torch.bfloat16), 'layers.0.attention.wq.weight': tensor([[-0.0062, -0.0148, -0.0022,  ...,  0.0045,  0.0017, -0.0036],
        [ 0.0089, -0.0136,  0.0003,  ..., -0.0100, -0.0198,  0.0078],
        [ 0.0142, -0.0043,  0.0028,  ..., -0.0093, -0.0114,  0.0076],
        [ 0.0256,  0.0102,  0.0032,  ..., -0.0334, -0.0156, -0.0123],
        [-0.0086, -0.0022, -0.0008,  ...,  0.0237, -0.0081,  0.0059],
        [-0.0134, -0.0066,  0.0018,  ...,  0.0181,  0.0166, -0.0082]],
       dtype=torch.bfloat16), 'layers.0.attention.wk.weight': tensor([[-0.0162,  0.0079, -0.0013,  ...,  0.0166, -0.0099, -0.0135],
        [ 0.0255,  0.0170,  0.0019,  ..., -0.0081,  0.0113,  0.0103],
        [ 0.0192,  0.0015,  0.0036,  ..., -0.0211,  0.0152,  0.0234],
        [-0.0056,  0.0173, -0.0032,  ..., -0.0032,  0.0115, -0.0110],
        [ 0.0178, -0.0038,  0.0003,  ...,  0.0053, -0.0109,  0.0104],
        [ 0.0037, -0.0021,  0.0013,  ...,  0.0070, -0.0115,  0.0095]],
       dtype=torch.bfloat16), 'layers.0.attention.wv.weight': tensor([[ 0.0008, -0.0006,  0.0019,  ...,  0.0059, -0.0006,  0.0103],
        [-0.0069, -0.0005, -0.0077,  ..., -0.0106,  0.0126,  0.0048],
        [ 0.0018,  0.0096,  0.0010,  ...,  0.0048, -0.0139, -0.0142],
        [-0.0063, -0.0057,  0.0103,  ...,  0.0031,  0.0040, -0.0022],
        [ 0.0031,  0.0048, -0.0010,  ...,  0.0054,  0.0156,  0.0007],
        [ 0.0001,  0.0025,  0.0056,  ..., -0.0007, -0.0007,  0.0015]],
       dtype=torch.bfloat16), 'layers.0.attention.wo.weight': tensor([[-1.6212e-05, -1.9226e-03,  4.8828e-03,  ...,  5.9204e-03,
          3.4485e-03, -9.5215e-03],
        [ 2.7618e-03,  1.8463e-03, -1.2970e-03,  ..., -1.0300e-03,
          1.8082e-03,  6.2561e-03],
        [ 2.3346e-03, -2.7275e-04,  9.2697e-04,  ..., -1.6556e-03,
         -5.7373e-03, -6.3705e-04],
        [ 4.1809e-03, -3.3264e-03,  5.8899e-03,  ...,  1.2131e-03,
          2.6093e-03,  4.3030e-03],
        [-3.3569e-03, -2.4872e-03, -2.5787e-03,  ...,  6.1951e-03,
         -3.4790e-03, -5.1117e-04],
        [ 6.1951e-03, -6.5613e-04,  2.6245e-03,  ...,  5.4932e-03,
         -7.5989e-03, -6.6833e-03]], dtype=torch.bfloat16), 'layers.0.feed_forward.w1.weight': tensor([[ 1.5747e-02,  1.7090e-02,  3.1494e-02,  ..., -1.5869e-02,
          6.5002e-03,  1.5869e-02],
        [-2.1667e-03, -6.0120e-03,  5.6458e-03,  ...,  1.6113e-02,
         -8.6670e-03,  9.8877e-03],
        [ 6.8359e-03, -2.1606e-02,  2.0508e-02,  ..., -1.3000e-02,
          1.8921e-02,  1.9409e-02],
        [ 1.4126e-05, -3.2227e-02,  5.7983e-03,  ..., -8.9111e-03,
         -1.3489e-02,  4.0283e-02],
        [ 2.6611e-02,  2.0142e-02, -1.7090e-02,  ..., -3.4332e-03,
         -6.4087e-03, -1.8921e-02],
        [-5.9891e-04, -1.1353e-02, -2.3682e-02,  ...,  1.1063e-03,
          5.9204e-03, -2.4780e-02]], dtype=torch.bfloat16), 'layers.0.feed_forward.w2.weight': tensor([[ 0.0027, -0.0145,  0.0083,  ..., -0.0175, -0.0054,  0.0014],
        [ 0.0046, -0.0042,  0.0090,  ...,  0.0160, -0.0138,  0.0334],
        [ 0.0020,  0.0339, -0.0044,  ..., -0.0146,  0.0220,  0.0167],
        [-0.0089, -0.0114,  0.0052,  ...,  0.0231, -0.0135,  0.0295],
        [-0.0177,  0.0374,  0.0090,  ..., -0.0069, -0.0122, -0.0219],
        [ 0.0120, -0.0013, -0.0079,  ..., -0.0003, -0.0030, -0.0302]],
       dtype=torch.bfloat16), 'layers.0.feed_forward.w3.weight': tensor([[ 0.0003, -0.0292,  0.0148,  ..., -0.0210, -0.0270,  0.0065],
        [-0.0111, -0.0312,  0.0128,  ...,  0.0190,  0.0060,  0.0025],
        [-0.0059,  0.0149, -0.0084,  ..., -0.0227,  0.0075,  0.0017],
        [-0.0091, -0.0016, -0.0067,  ...,  0.0295, -0.0028,  0.0183],
        [-0.0166,  0.0073,  0.0189,  ...,  0.0014, -0.0166,  0.0031],
        [ 0.0190,  0.0197, -0.0004,  ...,  0.0118, -0.0143, -0.0388]],
       dtype=torch.bfloat16), 'layers.0.attention_norm.weight': tensor([0.0297, 0.0136, 0.0020,  ..., 0.0103, 0.0110, 0.0061],
       dtype=torch.bfloat16), 'layers.0.ffn_norm.weight': tensor([0.0503, 0.0525, 0.0500,  ..., 0.0525, 0.0535, 0.0491],
       dtype=torch.bfloat16), 'layers.1.attention.wq.weight': tensor([[-0.0125,  0.0073, -0.0381,  ..., -0.0024, -0.0588,  0.0356],
        [-0.0195,  0.0410,  0.0544,  ...,  0.0214, -0.0308,  0.0315],
        [-0.0006, -0.0082,  0.0079,  ..., -0.0083, -0.0488,  0.0277],
        [-0.0021, -0.0038, -0.0042,  ...,  0.0088,  0.0052,  0.0062],
        [ 0.0024, -0.0022,  0.0049,  ...,  0.0098,  0.0005,  0.0128],
        [ 0.0003,  0.0048,  0.0067,  ..., -0.0079, -0.0005, -0.0111]],
       dtype=torch.bfloat16), 'layers.1.attention.wk.weight': tensor([[-0.0248, -0.0025,  0.0383,  ...,  0.0179,  0.0208, -0.0096],
        [ 0.0147,  0.0030, -0.0276,  ..., -0.0227, -0.0044, -0.0032],
        [-0.0295,  0.0046, -0.0114,  ..., -0.0159,  0.0094, -0.0588],
        [ 0.0080, -0.0192,  0.0040,  ..., -0.0121, -0.0015, -0.0065],
        [ 0.0090, -0.0239,  0.0014,  ..., -0.0122,  0.0027, -0.0074],
        [-0.0080,  0.0147,  0.0007,  ...,  0.0040, -0.0016,  0.0060]],
       dtype=torch.bfloat16), ...}
