This post introduces the basic usage of the main Transformers components; almost every NLP task can be built by combining these building blocks.
1. Basic usage of pipeline
# List the task types supported by pipeline
from transformers import pipelines
# Iterate over and print the supported tasks
for k, v in pipelines.SUPPORTED_TASKS.items():
    print(k, v)

# Creating and using a Pipeline
from transformers import pipeline
# Load from a local path, using the specified Chinese model
pipe = pipeline("text-classification", model="../models/bert-base-chinese")
pipe("我感觉还行")

# Load the model first, then create the pipeline; model and tokenizer must both be specified
from transformers import AutoTokenizer, AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("../models/bert-base-chinese")
tokenizer = AutoTokenizer.from_pretrained("../models/bert-base-chinese")
# Create the pipeline
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)
pipe("感觉很好")

# Other tasks
qa_pipe = pipeline("question-answering", model="../models/bert-base-chinese")
qa_pipe(question="中国的首都是哪里?", context="中国的首都是北京", max_answer_len=1)

# What the Pipeline does under the hood
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

tokenizer = AutoTokenizer.from_pretrained("../models/bert-base-chinese")
model = AutoModelForSequenceClassification.from_pretrained("../models/bert-base-chinese")

# Preprocess the input with the tokenizer
input_text = "我感觉很好"
inputs = tokenizer(input_text, return_tensors="pt")
# print(inputs)
# Feed the inputs to the model
res = model(**inputs)
# print(res)
# Convert the logits to probabilities
logits = res.logits
logits = torch.softmax(logits, dim=-1)
# print(logits)
# Get the predicted label id
pred = torch.argmax(logits).item()
# print(pred)
# Map the label id to a label name
print(model.config.id2label)
result = model.config.id2label.get(pred)
print(result)
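A pipeline can also be placed on GPU for faster inference. Below is a minimal sketch, assuming a CUDA device is available and the same local bert-base-chinese checkpoint as above; the example sentence is made up for illustration.

# Minimal sketch: running a pipeline on GPU (assumes a CUDA device; falls back to CPU otherwise)
import torch
from transformers import pipeline

device = 0 if torch.cuda.is_available() else -1   # -1 keeps the pipeline on CPU
pipe = pipeline("text-classification",
                model="../models/bert-base-chinese",
                device=device)
print(pipe("这家酒店环境不错"))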
2. Basic usage of tokenizer
from transformers import AutoTokenizer

# Load and save a tokenizer
tokenizer = AutoTokenizer.from_pretrained("../models/bert-base-chinese")
# print(tokenizer)
# Save it locally
tokenizer.save_pretrained("./save_tokenizer")

# Inspect the vocabulary
print(tokenizer.vocab)
# Vocabulary size
print(tokenizer.vocab_size)

# Tokenize a sentence
sen = "重庆是个好地方"
tokens = tokenizer.tokenize(sen)
print(tokens)
# Convert the token sequence to vocabulary ids
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)
# Convert ids back to tokens
new_tokens = tokenizer.convert_ids_to_tokens(ids)
print(new_tokens)
# Convert tokens back to a string
str_sen = tokenizer.convert_tokens_to_string(tokens)
print(str_sen)

# One-step conversion
# tokens -> ids: encode
ids = tokenizer.encode(sen, add_special_tokens=True)
print(ids)
# ids -> string: decode
tokens = tokenizer.decode(ids)
print(tokens)

# Sentences have different lengths, so padding and truncation are needed
# Padding: fill with 0 up to max_length
ids = tokenizer.encode(sen, padding="max_length", max_length=20)
print(ids)
# Truncation: cut to a unified length
tokens = tokenizer.encode(sen, max_length=5, truncation=True)
print(tokens)

# Build the attention mask and token type ids by hand
ids = tokenizer.encode(sen, padding="max_length", max_length=15)
attention_mask = [1 if idx != 0 else 0 for idx in ids]
token_type_ids = [0] * len(ids)
print(ids, "\n", attention_mask, "\n", token_type_ids)

# One-step call that returns all of the above
inputs = tokenizer.encode_plus(sen, padding="max_length", max_length=15)
print(inputs)
inputs = tokenizer(sen, padding="max_length", max_length=15)
print(inputs)

# Loading special tokenizers (unofficial or custom implementations)
# requires trust_remote_code=True
tokenizer = AutoTokenizer.from_pretrained("../chatglm3-6b", trust_remote_code=True)
tokenizer.decode(tokenizer.encode(sen))
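In practice the tokenizer is usually called on a batch of sentences at once. A minimal sketch, assuming the same local bert-base-chinese checkpoint; the sentences are made up for illustration.

# Minimal sketch: tokenizing a batch of sentences in one call
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("../models/bert-base-chinese")
sens = ["重庆是个好地方", "我感觉很好"]
batch = tokenizer(sens,
                  padding=True,            # pad to the longest sentence in the batch
                  truncation=True,
                  max_length=15,
                  return_tensors="pt")     # return PyTorch tensors
print(batch["input_ids"].shape)            # (batch_size, seq_len)
print(batch["attention_mask"])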
3. Basic usage of datasets
# Loading a dataset from local files
from datasets import load_dataset, Dataset

# Read a local csv file
dataset = load_dataset("csv", data_files="../transformers-code-master/01-Getting Started/04-model/ChnSentiCorp_htl_all.csv", split="train")
print(dataset)

# Convert a pandas DataFrame to a Dataset
import pandas as pd
pdf = pd.read_csv("../transformers-code-master/01-Getting Started/04-model/ChnSentiCorp_htl_all.csv")
dataset = Dataset.from_pandas(pdf)
print(dataset)

# Read the csv with Dataset directly
dataset = Dataset.from_csv("../transformers-code-master/01-Getting Started/04-model/ChnSentiCorp_htl_all.csv")
print(dataset)

# Dataset operations
print(dataset.column_names)
print(dataset.features)
print(dataset[:3])
print(dataset["label"][:3])

# Train/test split
print(dataset.train_test_split(test_size=0.2))

# Selection and filtering
print(dataset.select([0, 3]))
print(dataset.filter(lambda example: example["label"] == 1))
# Filter out empty reviews
dataset = dataset.filter(lambda x: x["review"] is not None)

# Mapping over the dataset
def add_prefix(example):
    if example["review"] is not None:
        example["review"] = "Prefix: " + example["review"]
    return example

map_dataset = dataset.map(add_prefix)
map_dataset["review"][:3]

# Combining map with a tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("../models/bert-base-chinese")

def process_fun(example, tokenizer=tokenizer):
    model_inputs = tokenizer(example["review"], max_length=128, truncation=True)
    model_inputs["labels"] = example["label"]
    return model_inputs

processed_dataset = dataset.map(process_fun, batched=True)
print(processed_dataset)
# Remove the original columns that should not be fed to the model
processed_dataset = dataset.map(process_fun, batched=True, remove_columns=dataset.column_names)
print(processed_dataset)

# Using a built-in DataCollator
from transformers import DataCollatorWithPadding
from datasets import load_dataset

dataset = load_dataset("csv", data_files="../transformers-code-master/01-Getting Started/04-model/ChnSentiCorp_htl_all.csv", split="train")
dataset = dataset.filter(lambda example: example["review"] is not None)
# print(dataset)

def process_fun(example, tokenizer=tokenizer):
    model_inputs = tokenizer(example["review"], max_length=128, truncation=True)
    model_inputs["labels"] = example["label"]
    return model_inputs

tokenized_dataset = dataset.map(process_fun, batched=True, remove_columns=dataset.column_names)
# print(tokenized_dataset)

# Combine with a DataLoader for batched iteration
from torch.utils.data import DataLoader
collator = DataCollatorWithPadding(tokenizer=tokenizer)
dl = DataLoader(tokenized_dataset, batch_size=4, collate_fn=collator, shuffle=True)
next(enumerate(dl))[1]
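The tokenized dataset can also be written to disk and reloaded later, so the mapping step does not have to be repeated. A minimal sketch, reusing the tokenized_dataset produced above; the ./processed_data path is only an example.

# Minimal sketch: persist the processed dataset and reload it
from datasets import load_from_disk

tokenized_dataset.save_to_disk("./processed_data")   # write Arrow files to the example path
reloaded = load_from_disk("./processed_data")        # reload without re-running map
print(reloaded)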
4. Basic usage of model
# Common model architectures fall into three categories: autoregressive models,
# autoencoding models, and sequence-to-sequence models
from transformers import AutoConfig, AutoModel, AutoTokenizer

# Loading a local model: with or without a model head
# Without a task head: returns only the backbone, rarely used in practice
model = AutoModel.from_pretrained("../models/bert-base-chinese")

# Inspect and modify the model configuration
config = AutoConfig.from_pretrained("../models/bert-base-chinese")
# print(config)
print(config.hidden_size)

# With a task head: the common case
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("../models/bert-base-chinese")

sen = "重庆是个很好玩的地方!"
tokenizer = AutoTokenizer.from_pretrained("../models/bert-base-chinese")
inputs = tokenizer(sen, return_tensors="pt")

# Run the model
output = model(**inputs)
print(output)
print(model.config.num_labels)
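The configuration can also be adjusted at load time, for example to change the number of output classes of the classification head. A minimal sketch; num_labels=10 and the variable name clf_model are only illustrative.

# Minimal sketch: customizing the task head when loading the model
from transformers import AutoModelForSequenceClassification

clf_model = AutoModelForSequenceClassification.from_pretrained(
    "../models/bert-base-chinese",
    num_labels=10)                      # head is (re)initialized with 10 output classes
print(clf_model.config.num_labels)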
5. Basic usage of evaluate
import evaluate

# List the available evaluation modules
# print(evaluate.list_evaluation_modules())

# Load an evaluation metric from a local path
accuracy = evaluate.load("../metrics/accuracy")
# View the metric description
print(accuracy.description)

# Global computation
results = accuracy.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0])
print(results)

# Incremental computation, one sample at a time
for ref, pred in zip([0, 1, 0, 1], [1, 0, 0, 1]):
    accuracy.add(references=ref, predictions=pred)
print(accuracy.compute())

# Incremental computation, one batch at a time
for refs, preds in zip([[0, 1], [0, 1]], [[1, 0], [0, 1]]):
    accuracy.add_batch(references=refs, predictions=preds)
print(accuracy.compute())

# Computing multiple metrics at once
# clf_metrics = evaluate.combine(["accuracy", "f1", "recall", "precision"])
# print(clf_metrics)
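evaluate.combine can merge several metrics so one compute call returns all of them. A minimal sketch, assuming the f1, recall and precision metric scripts are also available locally under ../metrics/ (mirroring the local accuracy load above); the labels are binary toy data.

# Minimal sketch: combining several metrics into a single compute call
clf_metrics = evaluate.combine(["../metrics/accuracy",
                                "../metrics/f1",
                                "../metrics/recall",
                                "../metrics/precision"])
results = clf_metrics.compute(references=[0, 1, 0, 1],
                              predictions=[1, 0, 0, 1])
print(results)   # accuracy / f1 / recall / precision in one dict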
6. Basic usage of Trainer and TrainingArguments
from transformers import Trainer, TrainingArguments

# Configure the training arguments
train_args = TrainingArguments(output_dir="./checkpoints",      # output directory
                               per_device_train_batch_size=64,  # training batch size
                               per_device_eval_batch_size=128,  # evaluation batch size
                               logging_steps=10,                # logging frequency
                               evaluation_strategy="epoch",     # evaluation strategy
                               save_strategy="epoch",           # checkpoint saving strategy
                               save_total_limit=3,              # max number of checkpoints to keep
                               learning_rate=2e-5,              # learning rate
                               weight_decay=0.01,               # weight decay
                               metric_for_best_model="f1",      # metric used to pick the best model
                               load_best_model_at_end=True)     # load the best model after training
print(train_args)

# Create the Trainer
from transformers import DataCollatorWithPadding
trainer = Trainer(model=model,
                  args=train_args,
                  train_dataset=tokenized_dataset["train"],
                  eval_dataset=tokenized_dataset["test"],
                  data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
                  compute_metrics=eval_metric)
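The eval_metric passed to compute_metrics above and the actual training call could look like the minimal sketch below. It assumes the accuracy and f1 metric scripts are available locally (mirroring section 5) and that tokenized_dataset from section 3 has been split into train/test with train_test_split before the Trainer was built.

# Minimal sketch: a compute_metrics function and kicking off training
import evaluate

acc_metric = evaluate.load("../metrics/accuracy")   # assumed local metric path
f1_metric = evaluate.load("../metrics/f1")           # assumed local metric path

def eval_metric(eval_predict):
    predictions, labels = eval_predict
    predictions = predictions.argmax(axis=-1)        # logits -> predicted class ids
    acc = acc_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels)
    acc.update(f1)                                    # merge both metrics into one dict
    return acc

# tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2)  # assumed split
trainer.train()       # start training
trainer.evaluate()    # evaluate on the eval_dataset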