run_boolq_roberta.py
CUDA_VISIBLE_DEVICES=2 python use_boolq_bert.py --model_type bert --model_name_or_path bert-base-cased --do_eval --do_lower_case --train_file train.jsonl --predict_file val.jsonl --test_file test3.jsonl --per_gpu_eval_batch_size=8 --output_dir /boolq_bert_output/checkpoint-75000
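This command evaluates an already fine-tuned checkpoint (--do_eval without --do_train). Note that the listing below registers only the roberta model type in MODEL_CLASSES and declares --tbname as a required argument, so invoking it with --model_type bert / bert-base-cased as shown would additionally need a matching bert entry (or the separate BERT variant of the script) plus a --tbname value.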
- import json
- import argparse
- import csv
- import glob
- import logging
- import os
- import random
-
- import numpy as np
- import torch
- from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
- from torch.utils.data.distributed import DistributedSampler
- from tqdm import tqdm, trange
-
- from transformers import (
- WEIGHTS_NAME,
- AdamW,
- RobertaConfig,
- RobertaForSequenceClassification,
- RobertaTokenizer,
- get_linear_schedule_with_warmup,
- )
-
-
- try:
- from torch.utils.tensorboard import SummaryWriter
- except ImportError:
- from tensorboardX import SummaryWriter
-
-
- logger = logging.getLogger(__name__)
-
- MODEL_CLASSES = {
- "roberta": (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
- }
-
-
- class SwagExample(object):
- """A single training/test example for the SWAG dataset."""
-
- def __init__(self, swag_id, context_sentence, start_ending, ending_0, ending_1, ending_2, ending_3, label=None):
- self.swag_id = swag_id
- self.context_sentence = context_sentence
- self.start_ending = start_ending
- self.endings = [
- ending_0,
- ending_1,
- ending_2,
- ending_3,
- ]
- self.label = label
-
- def __str__(self):
- return self.__repr__()
-
- def __repr__(self):
- attributes = [
- "swag_id: {}".format(self.swag_id),
- "context_sentence: {}".format(self.context_sentence),
- "start_ending: {}".format(self.start_ending),
- "ending_0: {}".format(self.endings[0]),
- "ending_1: {}".format(self.endings[1]),
- "ending_2: {}".format(self.endings[2]),
- "ending_3: {}".format(self.endings[3]),
- ]
-
- if self.label is not None:
- attributes.append("label: {}".format(self.label))
-
- return ", ".join(attributes)
-
-
- class BoolqExample(object):
- """A single training/test example for the Boolq dataset."""
-
- def __init__(self, swag_id, context_sentence, start_ending, ending_0, label=None):
- self.swag_id = swag_id
- self.context_sentence = context_sentence
- self.start_ending = start_ending
- self.endings = [
- ending_0
- # ending_1,
- ]
- self.label = label
-
- def __str__(self):
- return self.__repr__()
-
- def __repr__(self):
- attributes = [
- "swag_id: {}".format(self.swag_id),
- "context_sentence: {}".format(self.context_sentence),
- "start_ending: {}".format(self.start_ending),
- "ending_0: {}".format(self.endings[0]),
- #"ending_1: {}".format(self.endings[1]),
- ]
-
- if self.label is not None:
- attributes.append("label: {}".format(self.label))
-
- return ", ".join(attributes)
-
-
- class MultiRCExample(object):
- """A single training/test example for the Boolq dataset."""
-
- def __init__(self, swag_id, context_sentence, start_ending, ending_0, label=None):
- self.swag_id = swag_id
- self.context_sentence = context_sentence
- self.start_ending = start_ending
- self.endings = [
- ending_0
- # ending_1,
- ]
- self.label = label
-
- def __str__(self):
- return self.__repr__()
-
- def __repr__(self):
- attributes = [
- "swag_id: {}".format(self.swag_id),
- "context_sentence: {}".format(self.context_sentence),
- "start_ending: {}".format(self.start_ending),
- "ending_0: {}".format(self.endings[0]),
- #"ending_1: {}".format(self.endings[1]),
- ]
-
- if self.label is not None:
- attributes.append("label: {}".format(self.label))
-
- return ", ".join(attributes)
-
-
- '''class InputFeatures(object):
- def __init__(self, example_id, choices_features, label):
- self.example_id = example_id
- self.choices_features = [
- {"input_ids": input_ids, "input_mask": input_mask, "segment_ids": segment_ids}
- for _, input_ids, input_mask, segment_ids in choices_features
- ]
- self.label = label'''
-
-
- class InputFeatures(object):
- """
- A single set of features of data.
- Args:
- input_ids: Indices of input sequence tokens in the vocabulary.
- attention_mask: Mask to avoid performing attention on padding token indices.
- Mask values selected in ``[0, 1]``:
- Usually ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens.
- token_type_ids: Segment token indices to indicate first and second portions of the inputs.
- label: Label corresponding to the input
- """
-
- def __init__(self, input_ids, attention_mask=None, token_type_ids=None, label=None):
- self.input_ids = input_ids
- self.attention_mask = attention_mask
- self.token_type_ids = token_type_ids
- self.label = label
-
-
- def read_swag_examples(input_file, is_training=True):
- with open(input_file, "r", encoding="utf-8") as f:
- lines = list(csv.reader(f))
-
- if is_training and lines[0][-1] != "label":
- raise ValueError("For training, the input file must contain a label column.")
-
- examples = [
- SwagExample(
- swag_id=line[2],
- context_sentence=line[4],
- start_ending=line[5], # in the swag dataset, the
- # common beginning of each
- # choice is stored in "sent2".
- ending_0=line[7],
- ending_1=line[8],
- ending_2=line[9],
- ending_3=line[10],
- label=int(line[11]) if is_training else None,
- )
- for line in lines[1:] # we skip the line with the column names
- ]
-
- return examples
-
-
- def read_boolq_examples(input_file, is_training=True):
- with open(input_file, "r", encoding="utf-8") as f:
- lines = f.readlines()
-
- examples = []
- for line in lines:
- data_raw = json.loads(line.strip("\n"))
- if data_raw["label"]:
- label_input = 1
- else:
- label_input = 0
- examples.append(
- BoolqExample(
- swag_id=data_raw["idx"],
- context_sentence=data_raw["passage"],
- start_ending=data_raw["question"],
- ending_0="",
- label=label_input if is_training else None,
- )
- )
-
- return examples
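- # A sketch of the jsonl line format read_boolq_examples() expects, inferred from the
- # fields accessed above (SuperGLUE-style BoolQ); the values below are illustrative only:
- #   {"question": "is the sky blue", "passage": "The sky usually appears blue ...", "idx": 0, "label": true}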
-
- def convert_examples_to_features(examples, tokenizer, max_seq_length, is_training):
- """Loads a data file into a list of `InputBatch`s."""
-
- # BoolQ is a yes/no question-answering task. Each example becomes a single
- # sequence-pair input:
- #   [CLS] passage [SEP] question [SEP]
- # The model produces one score per example, which is later thresholded at 0.5
- # (see accuracy()) to obtain the yes/no prediction.
- features = []
- for example_index, example in tqdm(enumerate(examples)):
- context_tokens = tokenizer.tokenize(example.context_sentence)
- start_ending_tokens = tokenizer.tokenize(example.start_ending)
-
- context_tokens_choice = context_tokens[:]
- ending_tokens = start_ending_tokens
- _truncate_seq_pair(context_tokens_choice, ending_tokens, max_seq_length - 3)
-
- tokens = ["[CLS]"] + context_tokens_choice + ["[SEP]"] + ending_tokens + ["[SEP]"]
- segment_ids = [0] * (len(context_tokens_choice) + 2) + [1] * (len(ending_tokens) + 1)
-
- input_ids = tokenizer.convert_tokens_to_ids(tokens)
- input_mask = [1] * len(input_ids)
-
- # Zero-pad up to the sequence length.
- padding = [0] * (max_seq_length - len(input_ids))
- input_ids += padding
- input_mask += padding
- segment_ids += padding
-
- assert len(input_ids) == max_seq_length
- assert len(input_mask) == max_seq_length
- assert len(segment_ids) == max_seq_length
-
- label = example.label
- if example_index < 5:
- logger.info("*** Example ***")
- logger.info("swag_id: {}".format(example.swag_id))
- logger.info("tokens: {}".format(" ".join(tokens)))
- logger.info("input_ids: {}".format(" ".join(map(str, input_ids))))
- logger.info("attention_mask: {}".format(" ".join(map(str, input_mask))))
- logger.info("token_type_ids: {}".format(" ".join(map(str, segment_ids))))
- if is_training:
- logger.info("label: {}".format(label))
-
- features.append(InputFeatures(input_ids = input_ids, attention_mask = input_mask, token_type_ids = segment_ids, label = label ))
-
- return features
-
-
- def _truncate_seq_pair(tokens_a, tokens_b, max_length):
- """Truncates a sequence pair in place to the maximum length."""
-
- # This is a simple heuristic which will always truncate the longer sequence
- # one token at a time. This makes more sense than truncating an equal percent
- # of tokens from each, since if one sequence is very short then each token
- # that's truncated likely contains more information than a longer sequence.
- while True:
- total_length = len(tokens_a) + len(tokens_b)
- if total_length <= max_length:
- break
- if len(tokens_a) > len(tokens_b):
- tokens_a.pop()
- else:
- tokens_b.pop()
-
-
- '''def accuracy(out, labels):
- outputs = np.argmax(out, axis=1)
- return np.sum(outputs == labels)'''
-
-
- '''def accuracy(out, labels):
- outputs = (out >= 0.5).astype(np.int)
- return np.sum(outputs == labels)'''
-
-
- def accuracy(out, labels):
- outputs = (out.squeeze(1) >= 0.5).astype(int)  # np.int was removed in NumPy 1.24; use the builtin int
- return np.sum(outputs == labels)
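- # Note: main() sets config.num_labels = 1, so the sequence-classification head emits a
- # single score per example and (in the transformers versions this script targets) falls
- # back to an MSE/regression loss against the 0/1 float labels; accuracy() therefore
- # thresholds the raw score at 0.5 instead of taking an argmax over class logits.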
-
-
- def select_field(features, field):
- return [[choice[field] for choice in feature.choices_features] for feature in features]
-
-
- def set_seed(args):
- random.seed(args.seed)
- np.random.seed(args.seed)
- torch.manual_seed(args.seed)
- if args.n_gpu > 0:
- torch.cuda.manual_seed_all(args.seed)
-
-
- def load_and_cache_examples(args, tokenizer, train = False, evaluate=False, test = False, output_examples=False):
- if args.local_rank not in [-1, 0]:
- torch.distributed.barrier() # Make sure only the first process in distributed training processes the dataset; the others will use the cache
-
- # Load data features from cache or dataset file
- # input_file = args.predict_file if evaluate else args.train_file
- if train:
- input_file = args.train_file
- if evaluate:
- input_file = args.predict_file
- if test:
- input_file = args.test_file
-
- cached_features_file = os.path.join(
- os.path.dirname(input_file),
- "cached_{}_{}_{}".format(
- "dev" if evaluate else "train",
- list(filter(None, args.model_name_or_path.split("/"))).pop(),
- str(args.max_seq_length),
- ),
- )
- if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
- logger.info("Loading features from cached file %s", cached_features_file)
- features = torch.load(cached_features_file)
- else:
- logger.info("Creating features from dataset file at %s", input_file)
- examples = read_boolq_examples(input_file)
- features = convert_examples_to_features(examples, tokenizer, args.max_seq_length, not evaluate)
-
- if args.local_rank in [-1, 0]:
- logger.info("Saving features into cached file %s", cached_features_file)
- torch.save(features, cached_features_file)
-
- if args.local_rank == 0:
- torch.distributed.barrier() # Make sure only the first process in distributed training processes the dataset; the others will use the cache
-
- # Convert to Tensors and build dataset
- # all_input_ids = torch.tensor(select_field(features, "input_ids"), dtype=torch.long)
- # all_input_mask = torch.tensor(select_field(features, "input_mask"), dtype=torch.long)
- # all_segment_ids = torch.tensor(select_field(features, "segment_ids"), dtype=torch.long)
- # all_label = torch.tensor([f.label for f in features], dtype=torch.long)
- all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
- all_input_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
- all_segment_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
- all_label = torch.tensor([f.label for f in features], dtype=torch.long)
-
- dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
-
- if output_examples:
- return dataset, examples, features
- return dataset
-
-
- def train(args, train_dataset, model, tokenizer):
- """ Train the model """
- '''if args.local_rank in [-1, 0]:
- tb_writer = SummaryWriter()'''
- if args.local_rank in [-1, 0]:
- tb_writer = SummaryWriter(args.tbname)
-
- args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
- train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
- train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
- print("len(train_dataset): ",len(train_dataset))
-
- if args.max_steps > 0:
- t_total = args.max_steps
- args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
- else:
- t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
-
- # Prepare optimizer and schedule (linear warmup and decay)
- no_decay = ["bias", "LayerNorm.weight"]
- optimizer_grouped_parameters = [
- {
- "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
- "weight_decay": args.weight_decay,
- },
- {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
- ]
- optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
- scheduler = get_linear_schedule_with_warmup(
- optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
- )
- if args.fp16:
- try:
- from apex import amp
- except ImportError:
- raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
- model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-
- # multi-gpu training (should be after apex fp16 initialization)
- if args.n_gpu > 1:
- model = torch.nn.DataParallel(model)
-
- # Distributed training (should be after apex fp16 initialization)
- if args.local_rank != -1:
- model = torch.nn.parallel.DistributedDataParallel(
- model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
- )
-
- # Train!
- logger.info("***** Running training *****")
- logger.info(" Num examples = %d", len(train_dataset))
- logger.info(" Num Epochs = %d", args.num_train_epochs)
- logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
- logger.info(
- " Total train batch size (w. parallel, distributed & accumulation) = %d",
- args.train_batch_size
- * args.gradient_accumulation_steps
- * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
- )
- logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
- logger.info(" Total optimization steps = %d", t_total)
-
- global_step = 0
- tr_loss, logging_loss = 0.0, 0.0
- model.zero_grad()
- train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
- set_seed(args) # Added here for reproducibility
- for _ in train_iterator:
- epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
- print("len(train_dataloader): ",len(train_dataloader))
- for step, batch in enumerate(epoch_iterator):
- model.train()
- batch = tuple(t.to(args.device) for t in batch)
- inputs = {
- "input_ids": batch[0],
- "attention_mask": batch[1],
- # 'token_type_ids': None if args.model_type == 'xlm' else batch[2],
- #"token_type_ids": batch[2],
- "labels": batch[3].float(),
- }
- # if args.model_type in ['xlnet', 'xlm']:
- # inputs.update({'cls_index': batch[5],
- # 'p_mask': batch[6]})
- outputs = model(**inputs)
-
- '''logit = outputs[1]
- print(logit)
- print(logit.shape)
- print(batch[3])
- print(batch[3].shape)'''
- '''logit = outputs[1]
- print("logit", logit)
- print("logit.shape", logit.shape)
- print("batch[3]", batch[3])
- print("batch[3].shape", batch[3].shape)
- print("batch[3].type", type(batch[3]))'''
-
- loss = outputs[0] # model outputs are always tuple in transformers (see doc)
-
- if args.n_gpu > 1:
- loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training
- if args.gradient_accumulation_steps > 1:
- loss = loss / args.gradient_accumulation_steps
-
- if args.fp16:
- with amp.scale_loss(loss, optimizer) as scaled_loss:
- scaled_loss.backward()
- torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
- else:
- loss.backward()
- torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
-
- tr_loss += loss.item()
- if (step + 1) % args.gradient_accumulation_steps == 0:
- optimizer.step()
- scheduler.step() # Update learning rate schedule
- model.zero_grad()
- global_step += 1
-
- if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
- # Log metrics
- #if (args.local_rank == -1 and args.evaluate_during_training):
- if args.evaluate_during_training:
- eval = True
- test = False
- results = evaluate(eval, test, args, model, tokenizer)
- for key, value in results.items():
- tb_writer.add_scalar("eval_{}".format(key), value, global_step)
- # Save model checkpoint
- output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
- if not os.path.exists(output_dir):
- os.makedirs(output_dir)
- model_to_save = (
- model.module if hasattr(model, "module") else model
- ) # Take care of distributed/parallel training
- model_to_save.save_pretrained(output_dir)
- tokenizer.save_vocabulary(output_dir)
- torch.save(args, os.path.join(output_dir, "training_args.bin"))
- logger.info("Saving model checkpoint to %s", output_dir)
- tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
- tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
- logging_loss = tr_loss
-
- '''if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
- # Save model checkpoint
- output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
- if not os.path.exists(output_dir):
- os.makedirs(output_dir)
- model_to_save = (
- model.module if hasattr(model, "module") else model
- ) # Take care of distributed/parallel training
- model_to_save.save_pretrained(output_dir)
- tokenizer.save_vocabulary(output_dir)
- torch.save(args, os.path.join(output_dir, "training_args.bin"))
- logger.info("Saving model checkpoint to %s", output_dir)'''
-
- if args.max_steps > 0 and global_step > args.max_steps:
- epoch_iterator.close()
- break
- if args.max_steps > 0 and global_step > args.max_steps:
- train_iterator.close()
- break
-
- if args.local_rank in [-1, 0]:
- tb_writer.close()
-
- return global_step, tr_loss / global_step
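- # Note: the save_steps branch in the loop above is commented out, so checkpoints are
- # written from the logging block instead, i.e. every logging_steps optimizer updates
- # (and only on the process with local_rank in [-1, 0]).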
-
-
- # train = False, evaluate=False, test = False,
- def evaluate(eval, test, args, model, tokenizer, prefix=""):
- if eval :
- dataset, examples, features = load_and_cache_examples(args, tokenizer, train=False, evaluate=True, test=False,
- output_examples=True)
- if test :
- dataset, examples, features = load_and_cache_examples(args, tokenizer, train=False, evaluate=False, test=True,
- output_examples=True)
-
- # train = False, evaluate=False, test = False,
- if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
- os.makedirs(args.output_dir)
-
- args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
- # Note that DistributedSampler samples randomly
- eval_sampler = SequentialSampler(dataset) if args.local_rank == -1 else DistributedSampler(dataset)
- eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
-
- # Eval!
- logger.info("***** Running evaluation {} *****".format(prefix))
- logger.info(" Num examples = %d", len(dataset))
- logger.info(" Batch size = %d", args.eval_batch_size)
-
- eval_loss, eval_accuracy = 0, 0
- nb_eval_steps, nb_eval_examples = 0, 0
-
- for batch in tqdm(eval_dataloader, desc="Evaluating"):
- model.eval()
- batch = tuple(t.to(args.device) for t in batch)
- with torch.no_grad():
- inputs = {
- "input_ids": batch[0],
- "attention_mask": batch[1],
- # 'token_type_ids': None if args.model_type == 'xlm' else batch[2] # XLM don't use segment_ids
- #"token_type_ids": batch[2],
- "labels": batch[3].float(),
- }
-
- # if args.model_type in ['xlnet', 'xlm']:
- # inputs.update({'cls_index': batch[4],
- # 'p_mask': batch[5]})
- outputs = model(**inputs)
- tmp_eval_loss, logits = outputs[:2]
- eval_loss += tmp_eval_loss.mean().item()
-
- logits = logits.detach().cpu().numpy()
- label_ids = inputs["labels"].to("cpu").numpy()
- tmp_eval_accuracy = accuracy(logits, label_ids)
- eval_accuracy += tmp_eval_accuracy
-
- nb_eval_steps += 1
- nb_eval_examples += inputs["input_ids"].size(0)
- '''print("inputs[\"input_ids\"]: ", inputs["input_ids"])
- print("inputs[\"input_ids\"].shape: ", inputs["input_ids"].shape)'''
-
- eval_loss = eval_loss / nb_eval_steps
- eval_accuracy = eval_accuracy / nb_eval_examples
- result = {"eval_loss": eval_loss, "eval_accuracy": eval_accuracy}
-
- output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
- with open(output_eval_file, "w") as writer:
- logger.info("***** Eval results *****")
- for key in sorted(result.keys()):
- logger.info("%s = %s", key, str(result[key]))
- writer.write("%s = %s\n" % (key, str(result[key])))
-
- return result
-
-
- def main():
- parser = argparse.ArgumentParser()
-
- # Required parameters
- parser.add_argument(
- "--train_file",
- default=None,
- type=str,
- required=True,
- help="SWAG csv for training. E.g., train.csv"
- )
- parser.add_argument(
- "--predict_file",
- default=None,
- type=str,
- required=True,
- help="SWAG csv for predictions. E.g., val.csv",
- )
- parser.add_argument(
- "--test_file",
- default=None,
- type=str,
- required=True,
- help="SWAG csv for test. E.g., test.csv",
- )
- parser.add_argument(
- "--tbname",
- default=None,
- type=str,
- required=True,
- help="tbname"
- )
- parser.add_argument(
- "--model_type",
- default=None,
- type=str,
- required=True,
- help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
- )
- parser.add_argument(
- "--model_name_or_path",
- default=None,
- type=str,
- required=True,
- help="Path to pre-trained model or shortcut name selected in the list: " + ", ",
- )
- parser.add_argument(
- "--output_dir",
- default=None,
- type=str,
- required=True,
- help="The output directory where the model checkpoints and predictions will be written.",
- )
-
- # Other parameters
- parser.add_argument(
- "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
- )
- parser.add_argument(
- "--tokenizer_name",
- default="",
- type=str,
- help="Pretrained tokenizer name or path if not the same as model_name",
- )
- parser.add_argument(
- "--max_seq_length",
- default=384,
- type=int,
- help="The maximum total input sequence length after tokenization. Sequences "
- "longer than this will be truncated, and sequences shorter than this will be padded.",
- )
- parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
- parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
- parser.add_argument(
- "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step."
- )
- parser.add_argument(
- "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
- )
-
- parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
- parser.add_argument(
- "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
- )
- parser.add_argument("--learning_rate", default=1e-5, type=float, help="The initial learning rate for Adam.")
- parser.add_argument(
- "--gradient_accumulation_steps",
- type=int,
- default=1,
- help="Number of updates steps to accumulate before performing a backward/update pass.",
- )
- parser.add_argument("--weight_decay", default=0.1, type=float, help="Weight deay if we apply some.")
- parser.add_argument("--adam_epsilon", default=1e-6, type=float, help="Epsilon for Adam optimizer.")
- parser.add_argument("--max_grad_norm", default=5.0, type=float, help="Max gradient norm.")
- parser.add_argument(
- "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
- )
- parser.add_argument(
- "--max_steps",
- default=-1,
- type=int,
- help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
- )
- parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
-
- parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
- parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
- parser.add_argument(
- "--eval_all_checkpoints",
- action="store_true",
- help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
- )
-
- parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
- parser.add_argument(
- "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
- )
- parser.add_argument(
- "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
- )
- parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
-
- parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
- parser.add_argument(
- "--fp16",
- action="store_true",
- help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
- )
- parser.add_argument(
- "--fp16_opt_level",
- type=str,
- default="O1",
- help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
- "See details at https://nvidia.github.io/apex/amp.html",
- )
- parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
- parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
- args = parser.parse_args()
-
- if (
- os.path.exists(args.output_dir)
- and os.listdir(args.output_dir)
- and args.do_train
- and not args.overwrite_output_dir
- ):
- raise ValueError(
- "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
- args.output_dir
- )
- )
-
- # Setup distant debugging if needed
- if args.server_ip and args.server_port:
- # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
- import ptvsd
-
- print("Waiting for debugger attach")
- ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
- ptvsd.wait_for_attach()
-
- # Setup CUDA, GPU & distributed training
- if args.local_rank == -1 or args.no_cuda:
- device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
- args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
- else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
- torch.cuda.set_device(args.local_rank)
- device = torch.device("cuda", args.local_rank)
- torch.distributed.init_process_group(backend="nccl")
- args.n_gpu = 1
- args.device = device
-
- # Setup logging
- logging.basicConfig(
- format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
- datefmt="%m/%d/%Y %H:%M:%S",
- level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
- )
- logger.warning(
- "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
- args.local_rank,
- device,
- args.n_gpu,
- bool(args.local_rank != -1),
- args.fp16,
- )
-
- # Set seed
- set_seed(args)
-
- # Load pretrained model and tokenizer
- if args.local_rank not in [-1, 0]:
- torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
-
- args.model_type = args.model_type.lower()
- config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
- config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
- tokenizer = tokenizer_class.from_pretrained(
- args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case
- )
- config.num_labels = 1
- model = model_class.from_pretrained(
- args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config
- )
-
- if args.local_rank == 0:
- torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
-
- model.to(args.device)
-
- logger.info("Training/evaluation parameters %s", args)
-
- # Training
- if args.do_train:
- train_dataset = load_and_cache_examples(args, tokenizer, train = True, evaluate=False, test = False, output_examples=False)
- # train = False, evaluate=False, test = False,
- global_step, tr_loss = train(args, train_dataset, model, tokenizer)
- logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
-
- # Save the trained model and the tokenizer
- if args.local_rank == -1 or torch.distributed.get_rank() == 0:
- # Create output directory if needed
- if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
- os.makedirs(args.output_dir)
-
- logger.info("Saving model checkpoint to %s", args.output_dir)
- # Save a trained model, configuration and tokenizer using `save_pretrained()`.
- # They can then be reloaded using `from_pretrained()`
- model_to_save = (
- model.module if hasattr(model, "module") else model
- ) # Take care of distributed/parallel training
- model_to_save.save_pretrained(args.output_dir)
- tokenizer.save_pretrained(args.output_dir)
-
- # Good practice: save your training arguments together with the trained model
- torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
-
- # Load a trained model and vocabulary that you have fine-tuned
- model = model_class.from_pretrained(args.output_dir)
- tokenizer = tokenizer_class.from_pretrained(args.output_dir)
- model.to(args.device)
-
- # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
- results = {}
- if args.do_eval and args.local_rank in [-1, 0]:
- if args.do_train:
- checkpoints = [args.output_dir]
- else:
- # if do_train is False and do_eval is true, load model directly from pretrained.
- checkpoints = [args.model_name_or_path]
-
- if args.eval_all_checkpoints:
- checkpoints = list(
- os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
- )
- logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce model loading logs
-
- logger.info("Evaluate the following checkpoints: %s", checkpoints)
-
- for checkpoint in checkpoints:
- # Reload the model
- global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
- model = model_class.from_pretrained(checkpoint)
- tokenizer = tokenizer_class.from_pretrained(checkpoint)
- model.to(args.device)
-
- # Evaluate
- eval = False
- test = True
- result = evaluate(eval, test, args, model, tokenizer, prefix=global_step,)
- # train = False, evaluate=False, test = False,
- result = dict((k + ("_{}".format(global_step) if global_step else ""), v) for k, v in result.items())
- results.update(result)
-
- logger.info("Results: {}".format(results))
-
- return results
-
-
- if __name__ == "__main__":
- main()

use_boolq_roberta.py
CUDA_VISIBLE_DEVICES=2 python use_boolq_roberta.py --model_type roberta --model_name_or_path roberta-large --do_eval --do_lower_case --train_file train.jsonl --predict_file val.jsonl --test_file test3.jsonl --per_gpu_eval_batch_size=8 --output_dir boolq_roberta_output/checkpoint-3500
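Unlike the first listing, this script contains no training loop: it reloads the checkpoint given by --output_dir, evaluates it on --test_file, and its accuracy() additionally writes one {"idx": ..., "label": ...} line per example to out2.txt.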
- import json
- import argparse
- import csv
- import glob
- import logging
- import os
- import random
-
- import numpy as np
- import torch
- from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
- from torch.utils.data.distributed import DistributedSampler
- from tqdm import tqdm, trange
-
- from transformers import (
- WEIGHTS_NAME,
- AdamW,
- RobertaConfig,
- RobertaForSequenceClassification,
- RobertaTokenizer,
- get_linear_schedule_with_warmup,
- )
-
-
- try:
- from torch.utils.tensorboard import SummaryWriter
- except ImportError:
- from tensorboardX import SummaryWriter
-
-
- logger = logging.getLogger(__name__)
-
- MODEL_CLASSES = {
- "roberta": (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
- }
-
- change = [1301, 1313, 1319, 1321, 1326, 1338, 1350, 1355, 1373, 1376, 1382, 1387, 1389, 1390, 1404, 1415, 1418, 1420, 1421, 1422, 1424, 1425, 1428, 1430, 1431, 1438, 1440, 1450, 1453, 1465, 1466, 1489, 1491, 1492, 1514, 1531, 1533, 1535, 1550, 1554, 1582, 1593, 1597, 1607, 1611, 1615, 1622, 1626, 1631, 1637, 1638, 1641, 1666, 1685, 1687, 1696, 1698, 1702, 1713, 1717, 1732, 1733, 1739, 1743, 1749, 1757, 1762, 1767, 1770, 1778, 1780, 1789, 1794, 1809, 1811, 1815, 1816, 1826, 1831, 1833, 1835, 1838, 1849, 1855, 1858, 1862, 1879, 1883, 1884, 1897, 1926, 1941, 1953, 1954, 1956, 1961, 1990, 1991, 1997, 1999, 2007, 2008, 2012, 2035, 2038, 2045, 2051, 2052, 2067, 2074, 2079, 2083, 2086, 2109, 2116, 2119, 2141, 2142, 2158, 2162, 2168, 2194, 2199, 2202, 2204, 2206, 2212, 2215, 2219, 2226, 2227, 2231, 2237, 2239, 2245, 2249, 2271, 2272, 2275, 2295, 2300, 2307, 2317, 2321, 2336, 2342, 2343, 2350, 2354, 2367, 2369, 2371, 2379, 2382, 2391, 2396, 2397, 2406, 2407, 2417, 2421, 2431, 2432, 2436, 2443, 2447, 2449, 2452, 2454, 2471, 2474, 2480, 2483, 2499, 2501, 2510, 2519, 2530, 2533, 2547, 2555, 2565]
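- # Note: `change` is not referenced anywhere else in this listing; it appears to be a
- # hand-kept list of example indices, presumably used when post-processing the
- # predictions that accuracy() below writes to out2.txt.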
-
- class BoolqExample(object):
- """A single training/test example for the Dream dataset."""
-
- def __init__(self, swag_id, context_sentence, start_ending, ending_0, label=None):
- self.swag_id = swag_id
- self.context_sentence = context_sentence
- self.start_ending = start_ending
- self.endings = [
- ending_0
- # ending_1,
- ]
- self.label = label
-
- def __str__(self):
- return self.__repr__()
-
- def __repr__(self):
- attributes = [
- "swag_id: {}".format(self.swag_id),
- "context_sentence: {}".format(self.context_sentence),
- "start_ending: {}".format(self.start_ending),
- "ending_0: {}".format(self.endings[0]),
- #"ending_1: {}".format(self.endings[1]),
- ]
-
- if self.label is not None:
- attributes.append("label: {}".format(self.label))
-
- return ", ".join(attributes)
-
-
- '''class InputFeatures(object):
- def __init__(self, example_id, choices_features, label):
- self.example_id = example_id
- self.choices_features = [
- {"input_ids": input_ids, "input_mask": input_mask, "segment_ids": segment_ids}
- for _, input_ids, input_mask, segment_ids in choices_features
- ]
- self.label = label'''
-
-
- class InputFeatures(object):
- """
- A single set of features of data.
- Args:
- input_ids: Indices of input sequence tokens in the vocabulary.
- attention_mask: Mask to avoid performing attention on padding token indices.
- Mask values selected in ``[0, 1]``:
- Usually ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens.
- token_type_ids: Segment token indices to indicate first and second portions of the inputs.
- label: Label corresponding to the input
- """
-
- def __init__(self, input_ids, attention_mask=None, token_type_ids=None, label=None):
- self.input_ids = input_ids
- self.attention_mask = attention_mask
- self.token_type_ids = token_type_ids
- self.label = label
-
-
- def read_boolq_examples(input_file, is_training=True):
- with open(input_file, "r", encoding="utf-8") as f:
- lines = f.readlines()
-
- examples = []
- for line in lines:
- data_raw = json.loads(line.strip("\n"))
- if data_raw["label"]:
- label_input = 1
- else:
- label_input = 0
- examples.append(
- BoolqExample(
- swag_id=data_raw["idx"],
- context_sentence=data_raw["passage"],
- start_ending=data_raw["question"],
- ending_0="",
- # ending_1="",
- label=label_input if is_training else None,
- )
- )
-
- return examples
-
- def convert_examples_to_features(examples, tokenizer, max_seq_length, is_training):
- """Loads a data file into a list of `InputBatch`s."""
-
- # BoolQ is a yes/no question-answering task. Each example becomes a single
- # sequence-pair input:
- #   [CLS] passage [SEP] question [SEP]
- # The model produces one score per example, which is later thresholded at 0.5
- # (see accuracy()) to obtain the yes/no prediction.
- features = []
- for example_index, example in tqdm(enumerate(examples)):
- context_tokens = tokenizer.tokenize(example.context_sentence)
- start_ending_tokens = tokenizer.tokenize(example.start_ending)
-
- context_tokens_choice = context_tokens[:]
- ending_tokens = start_ending_tokens
- _truncate_seq_pair(context_tokens_choice, ending_tokens, max_seq_length - 3)
-
- tokens = ["[CLS]"] + context_tokens_choice + ["[SEP]"] + ending_tokens + ["[SEP]"]
- segment_ids = [0] * (len(context_tokens_choice) + 2) + [1] * (len(ending_tokens) + 1)
-
- input_ids = tokenizer.convert_tokens_to_ids(tokens)
- input_mask = [1] * len(input_ids)
-
- # Zero-pad up to the sequence length.
- padding = [0] * (max_seq_length - len(input_ids))
- input_ids += padding
- input_mask += padding
- segment_ids += padding
-
- assert len(input_ids) == max_seq_length
- assert len(input_mask) == max_seq_length
- assert len(segment_ids) == max_seq_length
-
- label = example.label
- if example_index < 5:
- logger.info("*** Example ***")
- logger.info("swag_id: {}".format(example.swag_id))
- logger.info("tokens: {}".format(" ".join(tokens)))
- logger.info("input_ids: {}".format(" ".join(map(str, input_ids))))
- logger.info("attention_mask: {}".format(" ".join(map(str, input_mask))))
- logger.info("token_type_ids: {}".format(" ".join(map(str, segment_ids))))
- if is_training:
- logger.info("label: {}".format(label))
-
- features.append(InputFeatures(input_ids = input_ids, attention_mask = input_mask, token_type_ids = segment_ids, label = label ))
-
- return features
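- # Caveat (an assumption, not verified here): the literal "[CLS]"/"[SEP]" strings used
- # above are BERT vocabulary items; a RobertaTokenizer has no such tokens, so
- # convert_tokens_to_ids() would map them to the unknown token. Using
- # tokenizer.cls_token / tokenizer.sep_token (or build_inputs_with_special_tokens)
- # would keep the formatting model-agnostic.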
-
-
- def _truncate_seq_pair(tokens_a, tokens_b, max_length):
- """Truncates a sequence pair in place to the maximum length."""
-
- # This is a simple heuristic which will always truncate the longer sequence
- # one token at a time. This makes more sense than truncating an equal percent
- # of tokens from each, since if one sequence is very short then each token
- # that's truncated likely contains more information than a longer sequence.
- while True:
- total_length = len(tokens_a) + len(tokens_b)
- if total_length <= max_length:
- break
- if len(tokens_a) > len(tokens_b):
- tokens_a.pop()
- else:
- tokens_b.pop()
-
-
- '''def accuracy(out, labels):
- outputs = np.argmax(out, axis=1)
- return np.sum(outputs == labels)'''
-
-
- def accuracy(out, labels,n,data):
- outputs = (out.squeeze(1) >= 0.5).astype(int)  # np.int was removed in NumPy 1.24; use the builtin int
- for i in outputs:
- n[0] = n[0] + 1
- print(n[0], ": ", i)
- #data = open("out2.txt", 'a', encoding='utf-8')
- print("{\"idx\":", n[0], ", \"label\":", i, "}", file=data)
- #data.close()
- return np.sum(outputs == labels)
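- # The loop above emits one prediction per example to the file handle opened in main()
- # (out2.txt), as lines of the form {"idx": N, "label": 0/1}. Since evaluate() below
- # initialises the counter as n = [1299], the written idx values start at 1300,
- # presumably matching the offset of the examples in test3.jsonl.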
-
-
- def select_field(features, field):
- return [[choice[field] for choice in feature.choices_features] for feature in features]
-
-
- def set_seed(args):
- random.seed(args.seed)
- np.random.seed(args.seed)
- torch.manual_seed(args.seed)
- if args.n_gpu > 0:
- torch.cuda.manual_seed_all(args.seed)
-
-
- def load_and_cache_examples(args, tokenizer, train = False, evaluate=False, test = False, output_examples=False):
- if args.local_rank not in [-1, 0]:
- torch.distributed.barrier() # Make sure only the first process in distributed training processes the dataset; the others will use the cache
-
- # Load data features from cache or dataset file
- # input_file = args.predict_file if evaluate else args.train_file
- if train:
- input_file = args.train_file
- if evaluate:
- input_file = args.predict_file
- if test:
- input_file = args.test_file
-
- cached_features_file = os.path.join(
- os.path.dirname(input_file),
- "cached_{}_{}_{}".format(
- "dev" if evaluate else "train",
- list(filter(None, args.model_name_or_path.split("/"))).pop(),
- str(args.max_seq_length),
- ),
- )
- if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
- logger.info("Loading features from cached file %s", cached_features_file)
- features = torch.load(cached_features_file)
- else:
- logger.info("Creating features from dataset file at %s", input_file)
- examples = read_boolq_examples(input_file)
- features = convert_examples_to_features(examples, tokenizer, args.max_seq_length, not evaluate)
-
- if args.local_rank in [-1, 0]:
- logger.info("Saving features into cached file %s", cached_features_file)
- torch.save(features, cached_features_file)
-
- if args.local_rank == 0:
- torch.distributed.barrier() # Make sure only the first process in distributed training processes the dataset; the others will use the cache
-
- # Convert to Tensors and build dataset
- # all_input_ids = torch.tensor(select_field(features, "input_ids"), dtype=torch.long)
- # all_input_mask = torch.tensor(select_field(features, "input_mask"), dtype=torch.long)
- # all_segment_ids = torch.tensor(select_field(features, "segment_ids"), dtype=torch.long)
- # all_label = torch.tensor([f.label for f in features], dtype=torch.long)
- all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
- all_input_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
- all_segment_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
- all_label = torch.tensor([f.label for f in features], dtype=torch.long)
-
- dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
-
- if output_examples:
- return dataset, examples, features
- return dataset
-
-
- # train = False, evaluate=False, test = False,
- def evaluate(data, eval, test, args, model, tokenizer, prefix=""):
- if eval :
- dataset, examples, features = load_and_cache_examples(args, tokenizer, train=False, evaluate=True, test=False,
- output_examples=True)
- if test :
- dataset, examples, features = load_and_cache_examples(args, tokenizer, train=False, evaluate=False, test=True,
- output_examples=True)
-
- # train = False, evaluate=False, test = False,
- if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
- os.makedirs(args.output_dir)
-
- args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
- # Note that DistributedSampler samples randomly
- eval_sampler = SequentialSampler(dataset) if args.local_rank == -1 else DistributedSampler(dataset)
- eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
-
- # Eval!
- logger.info("***** Running evaluation {} *****".format(prefix))
- logger.info(" Num examples = %d", len(dataset))
- logger.info(" Batch size = %d", args.eval_batch_size)
-
- eval_loss, eval_accuracy = 0, 0
- nb_eval_steps, nb_eval_examples = 0, 0
-
- n = [1299]
- for batch in tqdm(eval_dataloader, desc="Evaluating"):
- model.eval()
- batch = tuple(t.to(args.device) for t in batch)
- with torch.no_grad():
- inputs = {
- "input_ids": batch[0],
- "attention_mask": batch[1],
- # 'token_type_ids': None if args.model_type == 'xlm' else batch[2] # XLM don't use segment_ids
- # "token_type_ids": batch[2],
- "labels": batch[3].float(),
- }
-
- # if args.model_type in ['xlnet', 'xlm']:
- # inputs.update({'cls_index': batch[4],
- # 'p_mask': batch[5]})
- outputs = model(**inputs)
- tmp_eval_loss, logits = outputs[:2]
- eval_loss += tmp_eval_loss.mean().item()
-
- logits = logits.detach().cpu().numpy()
- label_ids = inputs["labels"].to("cpu").numpy()
- #print(logits,label_ids)
- tmp_eval_accuracy = accuracy(logits, label_ids,n, data)
- eval_accuracy += tmp_eval_accuracy
-
- nb_eval_steps += 1
- nb_eval_examples += inputs["input_ids"].size(0)
-
- eval_loss = eval_loss / nb_eval_steps
- eval_accuracy = eval_accuracy / nb_eval_examples
- result = {"eval_loss": eval_loss, "eval_accuracy": eval_accuracy}
-
- output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
- with open(output_eval_file, "w") as writer:
- logger.info("***** Eval results *****")
- for key in sorted(result.keys()):
- logger.info("%s = %s", key, str(result[key]))
- writer.write("%s = %s\n" % (key, str(result[key])))
-
- return result
-
-
- def main():
- parser = argparse.ArgumentParser()
- data = open("out2.txt", 'w', encoding='utf-8')
-
- # Required parameters
- parser.add_argument(
- "--train_file",
- default=None,
- type=str,
- required=True,
- help="SWAG csv for training. E.g., train.csv"
- )
- parser.add_argument(
- "--predict_file",
- default=None,
- type=str,
- required=True,
- help="SWAG csv for predictions. E.g., val.csv",
- )
- parser.add_argument(
- "--test_file",
- default=None,
- type=str,
- required=True,
- help="SWAG csv for test. E.g., test.csv",
- )
- parser.add_argument(
- "--model_type",
- default=None,
- type=str,
- required=True,
- help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
- )
- parser.add_argument(
- "--model_name_or_path",
- default=None,
- type=str,
- required=True,
- help="Path to pre-trained model or shortcut name selected in the list: " + ", ",
- )
- parser.add_argument(
- "--output_dir",
- default=None,
- type=str,
- required=True,
- help="The output directory where the model checkpoints and predictions will be written.",
- )
-
- # Other parameters
- parser.add_argument(
- "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
- )
- parser.add_argument(
- "--tokenizer_name",
- default="",
- type=str,
- help="Pretrained tokenizer name or path if not the same as model_name",
- )
- parser.add_argument(
- "--max_seq_length",
- default=384,
- type=int,
- help="The maximum total input sequence length after tokenization. Sequences "
- "longer than this will be truncated, and sequences shorter than this will be padded.",
- )
- parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
- parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
- parser.add_argument(
- "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step."
- )
- parser.add_argument(
- "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
- )
-
- parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
- parser.add_argument(
- "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
- )
- parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
- parser.add_argument(
- "--gradient_accumulation_steps",
- type=int,
- default=1,
- help="Number of updates steps to accumulate before performing a backward/update pass.",
- )
- parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
- parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
- parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
- parser.add_argument(
- "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
- )
- parser.add_argument(
- "--max_steps",
- default=-1,
- type=int,
- help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
- )
- parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
-
- parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
- parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
- parser.add_argument(
- "--eval_all_checkpoints",
- action="store_true",
- help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
- )
- parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
- parser.add_argument(
- "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
- )
- parser.add_argument(
- "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
- )
- parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
-
- parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
- parser.add_argument(
- "--fp16",
- action="store_true",
- help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
- )
- parser.add_argument(
- "--fp16_opt_level",
- type=str,
- default="O1",
- help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
- "See details at https://nvidia.github.io/apex/amp.html",
- )
- parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
- parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
- args = parser.parse_args()
-
- if (
- os.path.exists(args.output_dir)
- and os.listdir(args.output_dir)
- and args.do_train
- and not args.overwrite_output_dir
- ):
- raise ValueError(
- "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
- args.output_dir
- )
- )
-
- # Setup distant debugging if needed
- if args.server_ip and args.server_port:
- # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
- import ptvsd
-
- print("Waiting for debugger attach")
- ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
- ptvsd.wait_for_attach()
-
- # Setup CUDA, GPU & distributed training
- if args.local_rank == -1 or args.no_cuda:
- device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
- args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
- else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
- torch.cuda.set_device(args.local_rank)
- device = torch.device("cuda", args.local_rank)
- torch.distributed.init_process_group(backend="nccl")
- args.n_gpu = 1
- args.device = device
-
- # Setup logging
- logging.basicConfig(
- format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
- datefmt="%m/%d/%Y %H:%M:%S",
- level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
- )
- logger.warning(
- "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
- args.local_rank,
- device,
- args.n_gpu,
- bool(args.local_rank != -1),
- args.fp16,
- )
-
- # Set seed
- set_seed(args)
-
- # Load pretrained model and tokenizer
- if args.local_rank not in [-1, 0]:
- torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
-
- args.model_type = args.model_type.lower()
- config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
- config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
- tokenizer = tokenizer_class.from_pretrained(
- args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case
- )
- config.num_labels = 1
- model = model_class.from_pretrained(
- args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config
- )
-
- if args.local_rank == 0:
- torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
-
- model.to(args.device)
-
- logger.info("Training/evaluation parameters %s", args)
-
- # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
- results = {}
- if args.do_eval and args.local_rank in [-1, 0]:
-
- checkpoints = [args.output_dir]
- logger.info("Evaluate the following checkpoints: %s", checkpoints)
-
- for checkpoint in checkpoints:
- # Reload the model
- global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
- model = model_class.from_pretrained(checkpoint)
- tokenizer = tokenizer_class.from_pretrained(checkpoint)
- model.to(args.device)
-
- # Evaluate
- eval = False
- test = True
- result = evaluate(data, eval, test, args, model, tokenizer, prefix=global_step,)
- # train = False, evaluate=False, test = False,
- result = dict((k + ("_{}".format(global_step) if global_step else ""), v) for k, v in result.items())
- results.update(result)
-
- logger.info("Results: {}".format(results))
-
- data.close()
-
- return results
-
-
- if __name__ == "__main__":
- main()
