赞
踩
Bert-base-chinese模型是一个在简体和繁体中文文本上训练得到的预训练模型。
该模型的主要作用是获取每个汉字的向量表示,后续通过微调可应用于各种简体和繁体中文任务。
- import torch
- from transformers import BertTokenizer, BertModel
-
# Step 1: offline download
# from transformers import BertModel, BertTokenizer
# model_name = "bert-base-chinese"
# # Download the model and the tokenizer
# model = BertModel.from_pretrained(model_name)
# tokenizer = BertTokenizer.from_pretrained(model_name)
# # Save the model and the tokenizer to a local path
# model.save_pretrained("./bert-base-chinese")
# tokenizer.save_pretrained("./bert-base-chinese")

# Step 2: load the model and the tokenizer from the local directory.
# Both objects are module-level because encode_text_with_bert reads them.
model_path = "./bert-base-chinese"
model = BertModel.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)
-
-
def encode_text_with_bert(text):
    """Encode text with the bert-base-chinese model.

    Relies on the module-level ``tokenizer`` and ``model`` objects.

    :param text: the input text
    :return: the first element of the model output (the last hidden state),
        computed on the token ids without the start and end markers
    """
    # Tokenize, then drop the first and last ids, i.e. the start and end
    # markers the tokenizer wraps around the sequence.
    token_ids = tokenizer.encode(text)
    inner_ids = token_ids[1:-1]

    # The extra outer list turns the id list into a batch of one sequence.
    batch = torch.LongTensor([inner_ids])

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        hidden = model(batch)[0]

    return hidden
-
-
if __name__ == '__main__':
    # Demo: encode one sample sentence, then print the shape and the values
    # of the returned tensor.
    sample_text = "你好,美丽中国"
    encoded = encode_text_with_bert(sample_text)
    print('text1编码的形状:', encoded.size())
    print('text1编码:\n', encoded)
text1编码的形状: torch.Size([1, 7, 768])
text1编码:
tensor([[[ 0.0781, -0.7386, -0.5120, ..., 1.0695, -0.4252, -0.3970],
[ 0.3118, -0.2283, -0.2513, ..., -0.0618, 0.8715, -0.0833],
[ 0.0287, -0.4937, -0.5554, ..., 0.1643, 0.8771, 0.0019],
...,
[-0.3068, -0.3406, 0.0525, ..., 0.5506, 0.8915, -0.3713],
[-0.1079, -0.0951, -0.1549, ..., 0.8432, 0.7255, -0.5235],
[-0.0414, -0.3786, 0.1590, ..., 0.3844, 0.7464, -0.4266]]])
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。