First, prepare your own dataset. I had ChatGPT generate some dummy examples for me here:
```json
{
  "title": "尊嘟假嘟",
  "data": [
    {"text": "我爱黑丝美女", "labels": 2},
    {"text": "我爱白丝美女", "labels": 1},
    {"text": "黑丝美女真性感", "labels": 2},
    {"text": "白丝美女也很迷人", "labels": 1},
    {"text": "网袜让美腿更加迷人", "labels": 0},
    {"text": "黑丝和白丝都很好看", "labels": 3},
    {"text": "黑丝美女让我心动", "labels": 3},
    {"text": "白丝美女让我忍不住多看几眼", "labels": 1},
    {"text": "黑丝和白丝哪个更好看呢?", "labels": 3},
    {"text": "我喜欢穿黑丝的女孩", "labels": 2},
    {"text": "我觉得白丝更适合我", "labels": 1},
    {"text": "黑丝和白丝都有不同的魅力", "labels": 3}
  ]
}
```
The file is named dummydata.jsonl. The labels mean the following:

| label | meaning |
| --- | --- |
| 0 | 网袜 (fishnet stockings) |
| 1 | 白丝 (white stockings) |
| 2 | 黑丝 (black stockings) |
| 3 | 白丝 + 黑丝 (both) |
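It can help to keep this mapping in code for later inference. A minimal sketch (the `id2label` / `label2id` dicts are my own addition, following the naming convention `transformers` uses in model configs):

```python
# Label mapping for the dummy dataset (see the table above).
id2label = {0: "网袜", 1: "白丝", 2: "黑丝", 3: "白丝+黑丝"}
label2id = {v: k for k, v in id2label.items()}
```

If you pass these to `AutoModelForSequenceClassification.from_pretrained(..., id2label=id2label, label2id=label2id)`, the saved checkpoints carry human-readable label names instead of `LABEL_0`-style placeholders.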
Now for dataset.py, starting with the imports:

```python
import torch.utils.data
from datasets import load_dataset, DatasetDict, Dataset
from transformers import DataCollatorWithPadding, AutoTokenizer
from torch.utils.data import DataLoader
```
This is where my JSON file lives: F:\bert意图识别\data\dummydata.jsonl (the code below refers to it through the relative path ../data/dummydata.jsonl).
```python
def load_datasets(test_size: float = 0.2) -> DatasetDict:
    assert 0 < test_size < 1, 'test_size must be in the range (0, 1)'
    data = load_dataset('json', data_files='../data/dummydata.jsonl', field='data')
    train_test_valid = data['train'].train_test_split(test_size=test_size)
    dataset = DatasetDict({
        "train": train_test_valid["train"],
        "test": train_test_valid["test"],
        "valid": train_test_valid["train"]})  # valid reuses the train split, see the note below
    return dataset
```
This automatically splits the data into a training set and a test set according to `test_size`. The returned `dataset` is a `DatasetDict` holding a train, test, and validation split (the validation split simply reuses the training data). Note the column is already named `labels`, which is exactly what `AutoModelForSequenceClassification` expects.
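To sanity-check the splits, a quick sketch (run inside dataset.py; it just prints each split's size and columns):

```python
if __name__ == "__main__":
    ds = load_datasets(test_size=0.2)
    for name, split in ds.items():
        # e.g. "train 9 ['text', 'labels']" -- exact sizes depend on your data file
        print(name, len(split), split.column_names)
```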
```python
def get_dataloaders(tokenizer, batch_size) -> dict[str, DataLoader]:
    # Tokenize the text into input ids; truncation only, because
    # DataCollatorWithPadding below pads each batch dynamically.
    tokenize_func = lambda x: tokenizer(x["text"], truncation=True)
    # load_datasets is the function defined above
    dataset = load_datasets()
    # use the built-in map to apply the tokenizer over the whole DatasetDict
    tokenized_datasets = dataset.map(tokenize_func, batched=True)
    # drop the raw text column, keeping only the inputs a BERT-style model accepts
    tokenized_datasets = tokenized_datasets.remove_columns(["text"])
    # we are using PyTorch, so return torch tensors
    tokenized_datasets.set_format("torch")
    # build the DataLoaders; the collator pads every batch to its longest sequence
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    train_dataloader = DataLoader(
        tokenized_datasets["train"], shuffle=True, batch_size=batch_size, collate_fn=data_collator
    )
    eval_dataloader = DataLoader(
        tokenized_datasets["valid"], batch_size=batch_size, collate_fn=data_collator
    )
    test_dataloader = DataLoader(
        tokenized_datasets["test"], batch_size=batch_size, collate_fn=data_collator
    )
    return {
        "train": train_dataloader,
        "valid": eval_dataloader,
        "test": test_dataloader
    }
```
This function returns three iterable PyTorch DataLoaders that can be fed directly into training and evaluation.
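A quick way to see what one batch looks like (a sketch; it assumes the pretrained checkpoint already sits at the path used by the training script below):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("../model/embedding_model")
loaders = get_dataloaders(tokenizer, batch_size=4)
batch = next(iter(loaders["train"]))
# e.g. {'labels': (4,), 'input_ids': (4, seq_len), 'attention_mask': (4, seq_len), ...}
print({k: tuple(v.shape) for k, v in batch.items()})
```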
Next comes the training script. I'll go through it in pieces; the full listing appears further down.

```python
import os
import torch
import warnings
import evaluate
from tqdm.auto import tqdm
from progressbar import ProgressBar
from transformers import DataCollatorWithPadding, AutoTokenizer, AutoModelForSequenceClassification, get_scheduler
from dataset import load_datasets, get_dataloaders
from utils import save_model

# OS / GLOBAL HYPERPARAMETERS
warnings.filterwarnings("ignore")
torch.backends.cudnn.enabled = True
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'
os.environ["TORCH_USE_CUDA_DSA"] = "1"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # select the GPU to use; I only have one card
DEVICE = 'cuda'
# the remaining flags make CUDA debugging easier and enable cuDNN
# - - - - - - - - - - - - - - - - - - - - - - -
# SAVE MODEL SETTINGS
CHECK_POINT_PATH = "../model/embedding_model"  # pretrained ERNIE 3.0 zh-base checkpoint
BEST_MODEL_SAVE_DIR = "../output/model/best_model"  # best model saved during training
LAST_MODEL_SAVE_DIR = "../output/model/last_model"  # model saved after the final epoch
# - - - - - - - - - - - - - - - - - - - - - - -
# MODEL HYPERPARAMETERS
LEARN_RATE = 5e-5  # learning rate
NUM_EPOCHS = 12
BATCH_SIZE = 16
NUM_LABELS = 4  # four-way classification
# - - - - - - - - - - - - - - - - - - - - - - -

if __name__ == "__main__":
    tokenizer = AutoTokenizer.from_pretrained(CHECK_POINT_PATH)
    dataloaders = get_dataloaders(tokenizer, BATCH_SIZE)
    train_dataloader, eval_dataloader = dataloaders['train'], dataloaders['valid']
    # load the checkpoint
    model = AutoModelForSequenceClassification.from_pretrained(CHECK_POINT_PATH, num_labels=NUM_LABELS)
    # define an optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARN_RATE)
    num_training_steps = len(train_dataloader) * NUM_EPOCHS
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )

    # evaluation settings
    metric = evaluate.load("metric.py")
    best_accuracy = 0.0
    progress_bar = tqdm(range(num_training_steps))
    progress_bar.set_description('training')
    # move the model to the device
    model.to(DEVICE)
```
Note: if you want multi-process data loading via the DataLoader's `num_workers`, the DataLoader must be built under `if __name__ == "__main__":`!!
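A minimal, self-contained sketch of why the guard matters (the dataset here is random dummy data, just for illustration):

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

def main():
    dataset = TensorDataset(torch.randn(64, 8), torch.randint(0, 4, (64,)))
    # On Windows, worker processes are started by re-importing this module
    # ("spawn" start method), so the DataLoader must only be created under
    # the __main__ guard, or the script spawns itself endlessly.
    loader = DataLoader(dataset, batch_size=16, shuffle=True, num_workers=2)
    for xb, yb in loader:
        pass  # training step goes here

if __name__ == "__main__":
    main()
```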
Here we first load the tokenizer and the model, then fetch the training and validation sets from the get_dataloaders function defined above.
AdamW is used as the optimizer, together with a linear learning-rate scheduler.
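The script uses `num_warmup_steps=0`; a common variant (my own suggestion, not what the original uses) is to warm up over the first ~10% of steps:

```python
num_training_steps = len(train_dataloader) * NUM_EPOCHS
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=int(0.1 * num_training_steps),  # ramp the LR up over the first 10% of steps
    num_training_steps=num_training_steps,
)
```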
`metric = evaluate.load("metric.py")` loads a local evaluation script. You could instead pull the same metric straight from the Hugging Face Hub with `evaluate.load("accuracy")`, but my network connection is poor, so I simply copied the script locally.
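If you want the best of both, a small sketch that prefers the local copy and falls back to the Hub:

```python
import os
import evaluate

# Use the local accuracy script when present; otherwise fetch it from the Hub.
if os.path.exists("metric.py"):
    metric = evaluate.load("metric.py")
else:
    metric = evaluate.load("accuracy")
```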
Here is metric.py for anyone who needs it:
```python
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Accuracy metric."""

import datasets
from sklearn.metrics import accuracy_score

import evaluate

_DESCRIPTION = """
Accuracy is the proportion of correct predictions among the total number of cases processed. It can be computed with:
Accuracy = (TP + TN) / (TP + TN + FP + FN)
Where:
TP: True positive
TN: True negative
FP: False positive
FN: False negative
"""

_KWARGS_DESCRIPTION = """
Args:
    predictions (`list` of `int`): Predicted labels.
    references (`list` of `int`): Ground truth labels.
    normalize (`boolean`): If set to False, returns the number of correctly classified samples. Otherwise, returns the fraction of correctly classified samples. Defaults to True.
    sample_weight (`list` of `float`): Sample weights. Defaults to None.
Returns:
    accuracy (`float` or `int`): Accuracy score. Minimum possible value is 0. Maximum possible value is 1.0, or the number of examples input, if `normalize` is set to `False`. A higher score means higher accuracy.
Examples:
    Example 1-A simple example
        >>> accuracy_metric = evaluate.load("accuracy")
        >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0])
        >>> print(results)
        {'accuracy': 0.5}
    Example 2-The same as Example 1, except with `normalize` set to `False`.
        >>> accuracy_metric = evaluate.load("accuracy")
        >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0], normalize=False)
        >>> print(results)
        {'accuracy': 3.0}
    Example 3-The same as Example 1, except with `sample_weight` set.
        >>> accuracy_metric = evaluate.load("accuracy")
        >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0], sample_weight=[0.5, 2, 0.7, 0.5, 9, 0.4])
        >>> print(results)
        {'accuracy': 0.8778625954198473}
"""

_CITATION = """
@article{scikit-learn,
  title={Scikit-learn: Machine Learning in {P}ython},
  author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
         and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
         and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
         Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
  journal={Journal of Machine Learning Research},
  volume={12},
  pages={2825--2830},
  year={2011}
}
"""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Accuracy(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "predictions": datasets.Sequence(datasets.Value("int32")),
                    "references": datasets.Sequence(datasets.Value("int32")),
                }
                if self.config_name == "multilabel"
                else {
                    "predictions": datasets.Value("int32"),
                    "references": datasets.Value("int32"),
                }
            ),
            reference_urls=["https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html"],
        )

    def _compute(self, predictions, references, normalize=True, sample_weight=None):
        return {
            "accuracy": float(
                accuracy_score(references, predictions, normalize=normalize, sample_weight=sample_weight)
            )
        }
```
With the metric in place, back to the training loop:

```python
for epoch in range(NUM_EPOCHS):
    pbar = ProgressBar().start()  # start the per-epoch progress bar
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        total_loss += loss.item()
        progress_bar.update(1)  # one tqdm step per training batch
    average_loss = total_loss / len(train_dataloader)  # average loss over the epoch
    # noinspection DuplicatedCode
    model.eval()
    for batch in eval_dataloader:
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])
    result = metric.compute()
    accuracy = result['accuracy']
    print(f"epoch: {epoch}, average_loss: {average_loss:.4f}, accuracy: {accuracy:.4f}")
```
1. First, define a model-saving function (in utils.py):
```python
import logging

def save_model(tokenizer, model, save_dir):
    tokenizer.save_pretrained(save_dir)
    model.save_pretrained(save_dir)
    logging.info('save done')
```
2. The model-saving code, which sits at the end of each epoch:
```python
if accuracy > best_accuracy:
    best_accuracy = accuracy
    save_model(tokenizer, model, BEST_MODEL_SAVE_DIR)
save_model(tokenizer, model, LAST_MODEL_SAVE_DIR)
pbar.finish()
```
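To load a saved checkpoint back later, the standard `from_pretrained` call works on the save directory (a usage sketch):

```python
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Both the tokenizer and the model were saved into the same directory above.
tokenizer = AutoTokenizer.from_pretrained("../output/model/best_model")
model = AutoModelForSequenceClassification.from_pretrained("../output/model/best_model")
```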
Putting the pieces together, here is the complete training script:

```python
import os
import torch
import warnings
import evaluate
from tqdm.auto import tqdm
from progressbar import ProgressBar
from transformers import DataCollatorWithPadding, AutoTokenizer, AutoModelForSequenceClassification, get_scheduler
from dataset import load_datasets, get_dataloaders
from utils import save_model

# OS / GLOBAL HYPERPARAMETERS
warnings.filterwarnings("ignore")
torch.backends.cudnn.enabled = True
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'
os.environ["TORCH_USE_CUDA_DSA"] = "1"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
DEVICE = 'cuda'
# - - - - - - - - - - - - - - - - - - - - - - -
# SAVE MODEL SETTINGS
CHECK_POINT_PATH = "../model/embedding_model"
BEST_MODEL_SAVE_DIR = "../output/model/best_model"
LAST_MODEL_SAVE_DIR = "../output/model/last_model"
# - - - - - - - - - - - - - - - - - - - - - - -
# MODEL HYPERPARAMETERS
LEARN_RATE = 5e-5
NUM_EPOCHS = 12
BATCH_SIZE = 16
NUM_LABELS = 4
# - - - - - - - - - - - - - - - - - - - - - - -


if __name__ == "__main__":
    tokenizer = AutoTokenizer.from_pretrained(CHECK_POINT_PATH)
    dataloaders = get_dataloaders(tokenizer, BATCH_SIZE)
    train_dataloader, eval_dataloader = dataloaders['train'], dataloaders['valid']
    # load checkpoint
    model = AutoModelForSequenceClassification.from_pretrained(CHECK_POINT_PATH, num_labels=NUM_LABELS)
    # define an optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARN_RATE)
    num_training_steps = len(train_dataloader) * NUM_EPOCHS
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )

    # evaluation settings
    metric = evaluate.load("metric.py")
    best_accuracy = 0.0
    progress_bar = tqdm(range(num_training_steps))
    progress_bar.set_description('training')
    # model to device
    model.to(DEVICE)
    # train & eval & save
    for epoch in range(NUM_EPOCHS):
        pbar = ProgressBar().start()
        model.train()
        total_loss = 0
        for batch in train_dataloader:
            batch = {k: v.to(DEVICE) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            total_loss += loss.item()
            progress_bar.update(1)
        average_loss = total_loss / len(train_dataloader)

        # noinspection DuplicatedCode
        model.eval()
        for batch in eval_dataloader:
            batch = {k: v.to(DEVICE) for k, v in batch.items()}
            with torch.no_grad():
                outputs = model(**batch)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            metric.add_batch(predictions=predictions, references=batch["labels"])
        result = metric.compute()
        accuracy = result['accuracy']
        print(f"epoch: {epoch}, average_loss: {average_loss:.4f}, accuracy: {accuracy:.4f}")

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            save_model(tokenizer, model, BEST_MODEL_SAVE_DIR)
        save_model(tokenizer, model, LAST_MODEL_SAVE_DIR)
        pbar.finish()
```
The test script is largely the same as the training loop. I only compute the evaluation metric here and never print the actual predicted labels.
```python
import evaluate
import os
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from dataset import get_dataloaders
from progressbar import ProgressBar

# DEFINE MODEL PATH
# ------------------------------------------------------
CHECK_POINT_PATH = '../output/model/best_model'
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
DEVICE = 'cuda'
BATCH = 8
NUM_LABELS = 4
# ------------------------------------------------------
# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(CHECK_POINT_PATH)
# load test data
test_dataloader = get_dataloaders(tokenizer, batch_size=BATCH)['test']
# initialize the evaluation metric
accuracy = evaluate.load('metric.py')
model = AutoModelForSequenceClassification.from_pretrained(CHECK_POINT_PATH, num_labels=NUM_LABELS)

if __name__ == '__main__':
    pbar = ProgressBar().start()
    model.to(DEVICE)
    # noinspection DuplicatedCode
    model.eval()
    for batch in test_dataloader:
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        logits = outputs.logits
        print(outputs.logits)  # debug print: inspect the raw logits per batch
        predictions = torch.argmax(logits, dim=-1)
        accuracy.add_batch(predictions=predictions, references=batch["labels"])
    result = accuracy.compute()
    print(f"Accuracy: {result['accuracy']*100:.2f}%")
    pbar.finish()
```
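If you do want human-readable predictions rather than just an accuracy number, a minimal sketch that reuses the `tokenizer`, `model`, and `DEVICE` from the script above (the input sentence is made up, and `id2label` comes from the label table at the top):

```python
# Classify a single sentence and map the predicted id back to a label name.
id2label = {0: "网袜", 1: "白丝", 2: "黑丝", 3: "白丝+黑丝"}

text = "黑丝美女真好看"  # made-up example input
inputs = tokenizer(text, return_tensors="pt").to(DEVICE)
with torch.no_grad():
    logits = model(**inputs).logits
pred = logits.argmax(dim=-1).item()
print(text, "->", id2label[pred])
```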
That's all for this article. Remember to follow!