- """
- 使用IMDb评论进行序列分类
- """
- #先下载数据
- # wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
- # tar -xf aclImdb_v1.tar.gz
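
# If wget isn't available, a minimal Python equivalent (an illustrative sketch,
# using the same URL as above) downloads and extracts the archive:
import tarfile, urllib.request

url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
urllib.request.urlretrieve(url, "aclImdb_v1.tar.gz")
with tarfile.open("aclImdb_v1.tar.gz") as tar:
    tar.extractall()  # creates ./aclImdb with train/ and test/ subdirectories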

# Read the files into lists of texts and labels
from pathlib import Path

def read_imdb_split(split_dir):
    split_dir = Path(split_dir)
    texts = []
    labels = []
    for label_dir in ["pos", "neg"]:
        for text_file in (split_dir/label_dir).iterdir():
            texts.append(text_file.read_text())
            labels.append(0 if label_dir == "neg" else 1)

    return texts, labels

train_texts, train_labels = read_imdb_split('aclImdb/train')
test_texts, test_labels = read_imdb_split('aclImdb/test')
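
# Quick sanity check (not in the original walkthrough): each aclImdb split
# should hold 25,000 labeled reviews
print(len(train_texts), len(test_texts))  # expected: 25000 25000
print(train_labels[:5], test_labels[:5])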

# Carve a validation set out of the training data (the held-out test set
# stays untouched for final evaluation)
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2)

# We've read in the dataset; now let's tokenize. We'll eventually fine-tune a
# pretrained DistilBERT classifier, so we use the matching DistilBERT tokenizer.
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)
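
# Illustrative peek at the tokenizer output (not in the original): a dict-like
# BatchEncoding with 'input_ids' and 'attention_mask', padded to a common length
print(train_encodings.keys())
print(len(train_encodings['input_ids'][0]))  # padded length, capped at 512 by truncation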

import torch

class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # Turn each encoded field for this example into a tensor and attach
        # the label under the 'labels' key the model expects
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = IMDbDataset(train_encodings, train_labels)
val_dataset = IMDbDataset(val_encodings, val_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)
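
# A single item is a dict of tensors ready for the model (illustrative check,
# not part of the original post):
sample = train_dataset[0]
print({key: value.shape for key, value in sample.items()})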

# Fine-tune with the Trainer API
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

trainer = Trainer(
    model=model,                  # the instantiated Transformers model to be trained
    args=training_args,           # training arguments, defined above
    train_dataset=train_dataset,  # training dataset
    eval_dataset=val_dataset,     # evaluation dataset
)

trainer.train()
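
# The walkthrough ends at training. As a hedged sketch (the trainer.predict
# usage and accuracy computation below are additions, not the post's code),
# you could then score the held-out test set:
import numpy as np

predictions = trainer.predict(test_dataset)          # logits, label_ids, metrics
preds = np.argmax(predictions.predictions, axis=-1)  # higher-scoring class per example
accuracy = (preds == np.array(test_labels)).mean()
print(f"test accuracy: {accuracy:.4f}")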