diff --git a/DynaBERT/data/results-dynabert-multiemo_en_all_sentence.csv b/DynaBERT/data/results-dynabert-multiemo_en_all_sentence.csv new file mode 100644 index 00000000..da4102b1 --- /dev/null +++ b/DynaBERT/data/results-dynabert-multiemo_en_all_sentence.csv @@ -0,0 +1,2 @@ +name,model_name,data_dir,model_dir,output_dir,model_type,task_name,max_seq_length,do_train,evaluate_during_training,do_lower_case,per_gpu_train_batch_size,per_gpu_eval_batch_size,gradient_accumulation_steps,learning_rate,weight_decay,num_train_epochs,warmup_steps,seed,hidden_dropout_prob,attention_probs_dropout_prob,data_aug,depth_mult_list,depth_lambda1,depth_lambda2,width_mult_list,width_lambda1,width_lambda2,training_phase,n_gpu,output_mode,train_batch_size,eval_batch_size,training_time,accuracy,eval_time,z_amb_precision,z_amb_recall,z_amb_f1-score,z_amb_support,z_minus_m_precision,z_minus_m_recall,z_minus_m_f1-score,z_minus_m_support,z_plus_m_precision,z_plus_m_recall,z_plus_m_f1-score,z_plus_m_support,z_zero_precision,z_zero_recall,z_zero_f1-score,z_zero_support,macro avg_precision,macro avg_recall,macro avg_f1-score,macro avg_support,weighted avg_precision,weighted avg_recall,weighted avg_f1-score,weighted avg_support,model_size,memory,parameters +multiemo_en_all_sentence,dynabert_d_0.5_w_1.,data/multiemo2,data/models/dynabertw/multiemo_en_all_sentence,data/models/dynabert-finetuned/multiemo_en_all_sentence,bert,multiemo_en_all_sentence,128,True,True,True,16,8,1,5e-05,0.01,3.0,0,42,0.1,0.1,False,"[0.5, 0.75, 1.0]",1.0,1.0,"[0.25, 0.5, 0.75, 
"""Download the pretrained ``bert-base-uncased`` weights and vocabulary.

The archive is unpacked into ``data/models/bert-base-uncased``, its
``bert_config.json`` is renamed to ``config.json`` and ``vocab.txt`` is stored
next to it.
"""
import os
import shutil
import tarfile

import requests

url = 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz'

output_path = os.path.join('data', 'models')
os.makedirs(output_path, exist_ok=True)

output_tar = os.path.join(output_path, 'bert-base-uncased.tar.gz')
model_folder = os.path.join(output_path, 'bert-base-uncased')

# Stream the archive straight to disk instead of buffering it in memory, and
# fail immediately on an HTTP error: previously a non-200 answer was skipped
# silently and the script crashed later while opening a tar file that had
# never been written.
with requests.get(url, stream=True) as response:
    response.raise_for_status()
    with open(output_tar, 'wb') as f:
        shutil.copyfileobj(response.raw, f)

# NOTE(review): extractall() trusts the member paths stored in the archive;
# acceptable only as long as the download source itself is trusted.
with tarfile.open(name=output_tar, mode="r|gz") as tar_ref:
    tar_ref.extractall(model_folder)

os.rename(os.path.join(model_folder, 'bert_config.json'), os.path.join(model_folder, 'config.json'))

os.remove(output_tar)

url_vocab = 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt'
r = requests.get(url_vocab)
# Do not write an HTML/XML error body into vocab.txt.
r.raise_for_status()

with open(os.path.join(model_folder, 'vocab.txt'), 'wb') as f:
    f.write(r.content)

print('Completed!')
Co., Ltd. All rights reserved. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Finetuning the library models for sequence classification on GLUE (Bert, XLM, XLNet, RoBERTa).""" + +from __future__ import absolute_import, division, print_function + +import argparse +import logging +import os +import random +import time +from datetime import timedelta + +import numpy as np +import torch +from sklearn.metrics import classification_report +from torch.utils.data import (DataLoader, SequentialSampler, TensorDataset) +from tqdm import tqdm + +from transformers import (BertConfig, + BertForSequenceClassification, BertTokenizer, + RobertaConfig, + RobertaForSequenceClassification, + RobertaTokenizer, + ) + +from transformers.data.metrics import multiemo_compute_metrics as compute_metrics +from transformers.data.processors.multiemo import multiemo_convert_examples_to_features as convert_examples_to_features, \ + MultiemoProcessor, multiemo_output_modes +from utils import dictionary_to_json + +logger = logging.getLogger(__name__) + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, input_ids, input_mask, segment_ids, label_id, seq_length=None): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.seq_length = seq_length + self.label_id = 
def convert_examples_to_features_test(examples, label_list, max_seq_length,
                                      tokenizer, output_mode):
    """Convert test examples into fixed-length ``InputFeatures``.

    Every example is tokenized, wrapped in ``[CLS]``/``[SEP]`` markers,
    truncated so that it fits into ``max_seq_length`` and right-padded with
    zeros. Examples whose label cannot be resolved get ``label_id = 0``.

    Args:
        examples: objects exposing ``guid``, ``text_a``, ``text_b`` and ``label``.
        label_list: known labels; the position in the list becomes the class id.
        max_seq_length: length of every produced id/mask/segment list.
        tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``.
        output_mode: ``"classification"`` or ``"regression"``.

    Returns:
        A list with one ``InputFeatures`` per example.

    Raises:
        KeyError: if ``output_mode`` is neither supported value.
    """
    if output_mode not in ("classification", "regression"):
        # Checked up front: this error used to be raised inside the label
        # fallback below, where a bare ``except`` swallowed it and silently
        # assigned label 0 to every example.
        raise KeyError(output_mode)

    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for ex_index, example in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d of %d", ex_index, len(examples))

        tokens_a = tokenizer.tokenize(example.text_a)

        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)
            # Reserve room for [CLS], [SEP], [SEP].
            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
        elif len(tokens_a) > max_seq_length - 2:
            # Reserve room for [CLS] and [SEP].
            tokens_a = tokens_a[:(max_seq_length - 2)]

        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
        segment_ids = [0] * len(tokens)

        if tokens_b:
            tokens += tokens_b + ["[SEP]"]
            segment_ids += [1] * (len(tokens_b) + 1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)
        # Number of real (non-padding) positions.
        seq_length = len(input_ids)

        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        # Best effort: a missing or unknown label falls back to class 0.
        # Only the errors a bad label can cause are caught, so interrupts and
        # genuine programming errors are no longer hidden.
        try:
            if output_mode == "classification":
                label_id = label_map[example.label]
            else:
                label_id = float(example.label)
        except (KeyError, TypeError, ValueError):
            label_id = 0

        if ex_index < 1:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("tokens: %s" % " ".join(
                [str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            logger.info(
                "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            logger.info("label: {}".format(example.label))
            logger.info("label_id: {}".format(label_id))

        features.append(
            InputFeatures(input_ids=input_ids,
                          input_mask=input_mask,
                          segment_ids=segment_ids,
                          label_id=label_id,
                          seq_length=seq_length))
    return features
def get_tensor_data(output_mode, features):
    """Pack a list of feature objects into a ``TensorDataset``.

    Args:
        output_mode: ``"classification"`` (labels stored as ``torch.long``) or
            ``"regression"`` (labels stored as ``torch.float``).
        features: objects exposing ``input_ids``, ``input_mask``,
            ``segment_ids``, ``label_id`` and ``seq_length``.

    Returns:
        ``(dataset, labels)`` where ``dataset`` yields
        ``(input_ids, input_mask, segment_ids, label_id, seq_length)`` tuples
        and ``labels`` is the label tensor on its own.

    Raises:
        KeyError: if ``output_mode`` is not supported. Previously this case
            left the label tensor unbound and failed later with an
            ``UnboundLocalError``.
    """
    if output_mode == "classification":
        label_dtype = torch.long
    elif output_mode == "regression":
        label_dtype = torch.float
    else:
        raise KeyError(output_mode)

    all_label_ids = torch.tensor([f.label_id for f in features], dtype=label_dtype)
    all_seq_lengths = torch.tensor([f.seq_length for f in features], dtype=torch.long)
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    tensor_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_label_ids, all_seq_lengths)
    return tensor_data, all_label_ids
def _str_to_bool(value):
    """Parse a command-line boolean such as ``true``/``false``/``1``/``0``."""
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ('true', 't', 'yes', 'y', '1'):
        return True
    if lowered in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError("Boolean value expected, got %r" % value)


def main():
    """Evaluate one width/depth configuration of a model on the MultiEmo test split.

    Parses the command line, loads config, tokenizer and model from
    ``--model_dir``, sets ``depth_mult``/``width_mult`` on every module, runs
    ``evaluate`` and writes the classification report (plus the measured
    evaluation time) to ``test_results.json`` in the evaluation directory.

    Raises:
        ValueError: if the task name does not contain ``multiemo`` or the
            model type is unknown.
    """
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--model_type", default=None, type=str, required=True,
                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument("--task_name", default=None, type=str, required=True,
                        help="The name of the task to train selected")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions will be written.")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after tokenization. Sequences longer "
                             "than this will be truncated, sequences shorter will be padded.")
    # Without a type every supplied value (including "False") was a non-empty,
    # hence truthy, string. Both a bare flag and an explicit value are accepted.
    parser.add_argument("--do_lower_case", default=True, type=_str_to_bool, nargs='?', const=True,
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--per_gpu_eval_batch_size", default=128, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    # The directory is passed unconditionally to from_pretrained() below, so
    # demand it at parse time instead of failing later on ``None``.
    parser.add_argument("--model_dir", type=str, required=True,
                        help="The teacher model dir.")
    parser.add_argument('--depth_mult', type=str, default='1.',
                        help="the possible depths used for training, e.g., '1.' is for default")
    parser.add_argument('--width_mult', type=str, default='1.',
                        help="the possible widths used for training, e.g., '1.' is for default")

    args = parser.parse_args()
    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    # Honour --no_cuda: otherwise visible GPUs still scale the CPU batch size.
    args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    args.device = device

    # Setup logging
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO)
    logger.warning("device: %s, n_gpu: %s", device, args.n_gpu, )

    # Set seed
    set_seed(args)

    # Prepare MULTIEMO task: provide num_labels here
    args.task_name = args.task_name.lower()
    if 'multiemo' not in args.task_name:
        raise ValueError("Task not found: %s" % args.task_name)

    _, lang, domain, kind = args.task_name.split('_')
    processor = MultiemoProcessor(lang, domain, kind)
    args.output_mode = multiemo_output_modes['multiemo']
    label_list = processor.get_labels()
    num_labels = len(label_list)

    args.model_type = args.model_type.lower()
    if args.model_type not in MODEL_CLASSES:
        raise ValueError("Model type not found: %s" % args.model_type)
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    config = config_class.from_pretrained(args.model_dir, num_labels=num_labels, finetuning_task=args.task_name)
    tokenizer = tokenizer_class.from_pretrained(args.model_dir, do_lower_case=args.do_lower_case)
    model = model_class.from_pretrained(args.model_dir, config=config)
    model.to(args.device)
    # Set the requested multipliers as attributes on every sub-module.
    model.apply(lambda m: setattr(m, 'depth_mult', float(args.depth_mult)))
    model.apply(lambda m: setattr(m, 'width_mult', float(args.width_mult)))

    eval_start_time = time.monotonic()

    results, y_logits, y_true = evaluate(args, model, tokenizer)
    print(results)

    eval_end_time = time.monotonic()

    diff = timedelta(seconds=eval_end_time - eval_start_time)
    diff_seconds = diff.total_seconds()

    y_pred = np.argmax(y_logits, axis=1)
    print('\n\t**** Classification report ****\n')
    print(classification_report(y_true, y_pred, target_names=label_list))

    report = classification_report(y_true, y_pred, target_names=label_list, output_dict=True)
    report['eval_time'] = diff_seconds

    # Same directory name that evaluate() builds; the multipliers are used
    # verbatim as strings, so '1.' and '1.0' name different directories.
    eval_output_dir = os.path.join(
        args.output_dir, args.model_type + '_' + args.width_mult + '_' + args.depth_mult + '_eval')
    dictionary_to_json(report, os.path.join(eval_output_dir, "test_results.json"))


if __name__ == "__main__":
    main()
def gather_results(model_dir: str, task_name: str, depth_mult: float, width_mult: float) -> Dict[str, Any]:
    """Collect parameters, test metrics and size statistics of one fine-tuned model.

    The returned dictionary starts from the model's ``training_params.json``,
    is updated with the flattened ``test_results.json`` of the matching
    evaluation folder, and then receives the accumulated training time, the
    checkpoint size on disk, the in-memory size, the parameter count and two
    naming columns.

    Args:
        model_dir: directory of the fine-tuned model; its base name also
            selects the ``dynabertw`` and ``dynabert`` stage folders.
        task_name: MultiEmo task name of the form ``multiemo_<lang>_<domain>_<kind>``.
        depth_mult: depth multiplier; rendered with ``str()`` into the
            evaluation folder name and converted with ``float()`` for the model.
        width_mult: width multiplier, handled like ``depth_mult``.
            NOTE(review): both are annotated ``float`` but the command line
            passes strings such as ``'1.'`` - confirm the intended type.

    Returns:
        The merged result dictionary (also printed).

    Raises:
        ValueError: if ``task_name`` does not contain ``multiemo``.
    """

    def read_json(path):
        with open(path) as handle:
            return json.load(handle)

    model_folder_name = os.path.basename(model_dir)
    eval_folder_name = 'bert_' + str(width_mult) + '_' + str(depth_mult) + '_eval'

    training_params = read_json(os.path.join(model_dir, 'training_params.json'))
    raw_test_results = read_json(os.path.join(model_dir, eval_folder_name, 'test_results.json'))
    # json_normalize flattens the nested report into exactly one record.
    (test_metrics,) = pd.json_normalize(raw_test_results, sep='_').to_dict(orient='records')

    data = dict(training_params)
    data.update(test_metrics)

    # Add the training time of the two earlier stages to the fine-tuning time.
    for stage_folder in ('dynabertw', 'dynabert'):
        stage_params = read_json(
            os.path.join(MODELS_FOLDER, stage_folder, model_folder_name, 'training_params.json'))
        data['training_time'] += stage_params['training_time']

    data['model_size'] = os.path.getsize(os.path.join(model_dir, 'pytorch_model.bin'))

    if 'multiemo' not in task_name:
        raise ValueError("Task not found: %s" % task_name)

    _, lang, domain, kind = task_name.split('_')
    labels = MultiemoProcessor(lang, domain, kind).get_labels()

    # LOADING THE BEST MODEL
    config = BertConfig.from_pretrained(model_dir, num_labels=len(labels), finetuning_task=task_name)
    model = BertForSequenceClassification.from_pretrained(model_dir, config=config)
    model.apply(lambda module: setattr(module, 'depth_mult', float(depth_mult)))
    model.apply(lambda module: setattr(module, 'width_mult', float(width_mult)))

    # Size in bytes of all parameters plus all buffers.
    parameter_bytes = sum(p.nelement() * p.element_size() for p in model.parameters())
    buffer_bytes = sum(b.nelement() * b.element_size() for b in model.buffers())
    data['memory'] = parameter_bytes + buffer_bytes

    data['parameters'] = sum(p.nelement() for _, p in model.named_parameters())
    data['name'] = model_folder_name
    data['model_name'] = 'Dynabert_d_' + str(depth_mult) + '_w_' + str(width_mult)
    print(data)
    return data


if __name__ == '__main__':
    main()
def get_tensor_data(output_mode, features):
    """Pack a list of feature objects into a ``TensorDataset``.

    Each field may be spelled either way on the feature objects:
    ``label_id``/``label``, ``input_mask``/``attention_mask`` and
    ``segment_ids``/``token_type_ids``. The first spelling of each pair is
    preferred when present.
    NOTE(review): the class produced by the MultiEmo feature converter is not
    visible here; confirm which spelling it emits.

    Args:
        output_mode: ``"classification"`` (labels stored as ``torch.long``) or
            ``"regression"`` (labels stored as ``torch.float``).
        features: feature objects as described above, each also exposing
            ``input_ids``.

    Returns:
        ``(dataset, labels)`` where ``dataset`` yields
        ``(input_ids, input_mask, segment_ids, label)`` tuples and ``labels``
        is the label tensor on its own.

    Raises:
        KeyError: if ``output_mode`` is not supported (previously an
            ``UnboundLocalError`` further down).
        AttributeError: if a feature offers neither spelling of a field.
    """
    if output_mode == "classification":
        label_dtype = torch.long
    elif output_mode == "regression":
        label_dtype = torch.float
    else:
        raise KeyError(output_mode)

    def column(primary, fallback, dtype):
        # Read ``primary`` from every feature, falling back to ``fallback``.
        values = [getattr(f, primary) if hasattr(f, primary) else getattr(f, fallback)
                  for f in features]
        return torch.tensor(values, dtype=dtype)

    all_label_ids = column('label_id', 'label', label_dtype)
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = column('input_mask', 'attention_mask', torch.long)
    all_segment_ids = column('segment_ids', 'token_type_ids', torch.long)
    tensor_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return tensor_data, all_label_ids
Should contain the .tsv files (or other data files) for the task.") + parser.add_argument("--pretrained_model", + default=None, + type=str, + help="The pretrained model dir.") + parser.add_argument("--task_name", + default=None, + type=str, + required=True, + help="The name of the task to train.") + parser.add_argument("--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model predictions and checkpoints will be written.") + parser.add_argument("--max_seq_length", + default=128, + type=int, + help="The maximum total input sequence length after WordPiece tokenization. \n" + "Sequences longer than this will be truncated, and sequences shorter \n" + "than this will be padded.") + parser.add_argument("--do_eval", + action='store_true', + help="Whether to run eval on the dev set.") + parser.add_argument("--do_lower_case", + action='store_true', + help="Set this flag if you are using an uncased model.") + parser.add_argument("--train_batch_size", + default=16, + type=int, + help="Total batch size for training.") + parser.add_argument("--eval_batch_size", + default=16, + type=int, + help="Total batch size for eval.") + parser.add_argument("--learning_rate", + default=5e-5, + type=float, + help="The initial learning rate for Adam.") + parser.add_argument('--weight_decay', '--wd', + default=0.01, + type=float, + metavar='W', + help='weight decay') + parser.add_argument("--num_train_epochs", + default=3.0, + type=float, + help="Total number of training epochs to perform.") + parser.add_argument("--warmup_proportion", + default=0.1, + type=float, + help="Proportion of training to perform linear learning rate warmup for. 
" + "E.g., 0.1 = 10%% of training.") + parser.add_argument("--no_cuda", + action='store_true', + help="Whether not to use CUDA when available") + parser.add_argument('--seed', + type=int, + default=42, + help="random seed for initialization") + + # added arguments + parser.add_argument('--aug_train', + action='store_true') + parser.add_argument('--eval_step', + type=int, + default=50) + + args = parser.parse_args() + logger.info('The args: {}'.format(args)) + + # intermediate distillation default parameters + default_params = { + "multiemo": {"num_train_epochs": 3, "max_seq_length": 128}, + } + acc_tasks = ["multiemo"] + + # Prepare devices + device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") + n_gpu = torch.cuda.device_count() + + logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt='%m/%d/%Y %H:%M:%S', + level=logging.INFO) + logger.info("device: {} n_gpu: {}".format(device, n_gpu)) + + # Prepare seed + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + if n_gpu > 0: + torch.cuda.manual_seed_all(args.seed) + + # Prepare task settings + if os.path.exists(args.output_dir) and os.listdir(args.output_dir): + raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) + + os.makedirs(args.output_dir, exist_ok=True) + + task_name = args.task_name.lower() + if task_name in default_params: + args.max_seq_len = default_params[task_name]["max_seq_length"] + + if not args.do_eval: + if task_name in default_params: + args.num_train_epoch = default_params[task_name]["num_train_epochs"] + + if 'multiemo' in task_name: + _, lang, domain, kind = task_name.split('_') + processor = MultiemoProcessor(lang, domain, kind) + else: + raise ValueError("Task not found: %s" % task_name) + + if 'multiemo' in task_name: + output_mode = 'classification' + else: + raise ValueError("Task not found: %s" % task_name) + + label_list = 
processor.get_labels() + num_labels = len(label_list) + + tokenizer = BertTokenizer.from_pretrained(args.pretrained_model, do_lower_case=args.do_lower_case) + + if not args.do_eval: + if not args.aug_train: + train_examples = processor.get_train_examples(args.data_dir) + else: + train_examples = processor.get_aug_examples(args.data_dir) + + t_total = len(train_examples) // args.gradient_accumulation_steps * args.num_train_epochs + + train_features = convert_examples_to_features( + train_examples, + tokenizer, + label_list=label_list, + max_length=args.max_seq_length, + output_mode=output_mode, + pad_on_left=False, + pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], + pad_token_segment_id=0, + ) + train_data, _ = get_tensor_data(output_mode, train_features) + train_sampler = RandomSampler(train_data) + train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) + + eval_examples = processor.get_dev_examples(args.data_dir) + eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) + eval_data, eval_labels = get_tensor_data(output_mode, eval_features) + eval_sampler = SequentialSampler(eval_data) + eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) + + model = BertForSequenceClassification.from_pretrained(args.pretrained_model, num_labels=num_labels) + model.to(device) + if args.do_eval: + logger.info("***** Running evaluation *****") + logger.info(" Num examples = %d", len(eval_examples)) + logger.info(" Batch size = %d", args.eval_batch_size) + + model.eval() + result, _ = do_eval(model, task_name, eval_dataloader, + device, output_mode, eval_labels, num_labels) + logger.info("***** Eval results *****") + for key in sorted(result.keys()): + logger.info(" %s = %s", key, str(result[key])) + else: + training_start_time = time.monotonic() + + logger.info("***** Running training *****") + logger.info(" Num 
examples = %d", len(train_examples)) + logger.info(" Batch size = %d", args.train_batch_size) + logger.info(" Num steps = %d", t_total) + if n_gpu > 1: + model = torch.nn.DataParallel(model) + + optimizer, scheduler = get_optimizer_and_scheduler(args, model, t_total) + + # Train and evaluate + global_step = 0 + best_dev_acc = 0.0 + output_eval_file = os.path.join(args.output_dir, "eval_results.txt") + + for epoch_ in range(int(args.num_train_epochs)): + tr_loss = 0. + tr_cls_loss = 0. + + model.train() + nb_tr_steps = 0 + + for step, batch in enumerate(tqdm(train_dataloader, f"Epoch {epoch_ + 1}: ", ascii=True)): + batch = tuple(t.to(device) for t in batch) + batch = tuple(t.to(args.device) for t in batch) + inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[3], + 'token_type_ids': batch[2] if args.model_type in ['bert'] else None} + + cls_loss = model(**inputs)[0] + + loss = cls_loss + tr_cls_loss += cls_loss.item() + + if n_gpu > 1: + loss = loss.mean() # mean() to average on multi-gpu. 
+ + loss.backward() + tr_loss += loss.item() + nb_tr_steps += 1 + + optimizer.step() + scheduler.step() + model.zero_grad() + global_step += 1 + + logger.info("***** Running evaluation *****") + logger.info(" Epoch = {} iter {} step".format(epoch_, global_step)) + logger.info(" Num examples = %d", len(eval_examples)) + logger.info(" Batch size = %d", args.eval_batch_size) + + model.eval() + + loss = tr_loss / nb_tr_steps + cls_loss = tr_cls_loss / nb_tr_steps + + result, _ = do_eval(model, task_name, eval_dataloader, + device, output_mode, eval_labels, num_labels) + result['epoch'] = epoch_ + 1 + result['global_step'] = global_step + result['cls_loss'] = cls_loss + result['loss'] = loss + result_to_text_file(result, output_eval_file) + + save_model = False + + if result['acc'] > best_dev_acc: + best_dev_acc = result['acc'] + save_model = True + + if save_model: + logger.info("***** Save model *****") + model_to_save = model.module if hasattr(model, 'module') else model + + model_name = WEIGHTS_NAME + output_model_file = os.path.join(args.output_dir, model_name) + output_config_file = os.path.join(args.output_dir, CONFIG_NAME) + + torch.save(model_to_save.state_dict(), output_model_file) + model_to_save.config.to_json_file(output_config_file) + tokenizer.save_vocabulary(args.output_dir) + + model.train() + + # Measure End Time + training_end_time = time.monotonic() + + diff = timedelta(seconds=training_end_time - training_start_time) + diff_seconds = diff.total_seconds() + + training_parameters = vars(args) + training_parameters['training_time'] = diff_seconds + + output_training_params_file = os.path.join(args.output_dir, "training_params.json") + dictionary_to_json(training_parameters, output_training_params_file) + + ######################### + # Test model # + ######################### + test_examples = processor.get_test_examples(args.data_dir) + test_features = convert_examples_to_features( + test_examples, + tokenizer, + label_list=label_list, + 
max_length=args.max_seq_length, + output_mode=output_mode, + pad_on_left=False, + pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], + pad_token_segment_id=0 + ) + + test_data, test_labels = get_tensor_data(output_mode, test_features) + test_sampler = SequentialSampler(test_data) + test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.eval_batch_size) + + logger.info("\n***** Running evaluation on test dataset *****") + logger.info(" Num examples = %d", len(test_features)) + logger.info(" Batch size = %d", args.eval_batch_size) + + eval_start_time = time.monotonic() + model.eval() + result, y_logits = do_eval(model, task_name, test_dataloader, + device, output_mode, test_labels, num_labels) + eval_end_time = time.monotonic() + + diff = timedelta(seconds=eval_end_time - eval_start_time) + diff_seconds = diff.total_seconds() + result['eval_time'] = diff_seconds + result_to_text_file(result, os.path.join(args.output_dir, "test_results.txt")) + + y_pred = np.argmax(y_logits, axis=1) + print('\n\t**** Classification report ****\n') + print(classification_report(test_labels.numpy(), y_pred, target_names=label_list)) + + report = classification_report(test_labels.numpy(), y_pred, target_names=label_list, output_dict=True) + report['eval_time'] = diff_seconds + dictionary_to_json(report, os.path.join(args.output_dir, "test_results.json")) + + +def get_optimizer_and_scheduler(args, model, t_total): + # Prepare optimizer + no_decay = ['bias', 'LayerNorm.weight'] + optimizer_grouped_parameters = [ + {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + 'weight_decay': args.weight_decay}, + {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], + 'weight_decay': 0.0} + ] + optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=1e-8) + scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) + return 
def _run_stage(script, options, description):
    """Log ``description`` and launch ``python3 <script> <options...>`` through ``run_process``."""
    logger.info(description)
    run_process('python3 {} {}'.format(script, ' '.join(options)))


def main():
    """Run the MultiEmo DynaBERT pipeline end to end, skipping finished stages.

    Stages, in order: dataset download, bert-base-uncased download, BERT
    fine-tuning, width-adaptive training (``dynabertw``), width- and
    depth-adaptive training (``dynabert``), final fine-tuning and, always,
    evaluation of the fine-tuned model.  A stage is skipped when its output
    folder under ``DATA_FOLDER`` already exists.
    """
    print(PROJECT_FOLDER)
    # Every path handed to the child scripts is relative to the project folder.
    os.chdir(PROJECT_FOLDER)

    task = 'multiemo_en_all_sentence'
    dataset_dir = 'data/multiemo2'
    models_folder = os.path.join(DATA_FOLDER, 'models')

    bert_dir = f'data/models/bert-base-uncased/{task}'
    dynabertw_dir = f'data/models/dynabertw/{task}'
    dynabert_dir = f'data/models/dynabert/{task}'
    finetuned_dir = f'data/models/dynabert-finetuned/{task}'

    def dynabert_options(model_dir, output_dir, phase_options):
        """Return the run_multiemo.py options shared by all three DynaBERT stages."""
        return [
            '--model_type', 'bert',
            '--task_name', task,
            '--do_train',
            '--data_dir', dataset_dir,
            '--model_dir', model_dir,
            '--output_dir', output_dir,
            '--max_seq_length', '128',
            '--learning_rate', str(learning_rate),
            '--num_train_epochs', str(num_train_epochs),
            '--per_gpu_train_batch_size', str(batch_size),
            '--weight_decay', str(weight_decay),
            '--width_mult_list', '0.25,0.5,0.75,1.0',
        ] + phase_options

    if not os.path.exists(os.path.join(DATA_FOLDER, 'multiemo2')):
        logger.info("Downloading Multiemo data")
        run_process('python3 scripts/download_dataset.py --data_dir data/multiemo2')
        logger.info("Downloading finished")

    if not os.path.exists(os.path.join(models_folder, 'bert-base-uncased')):
        logger.info("Downloading bert-base-uncased model")
        run_process('python3 download_bert_base.py')
        logger.info("Downloading finished")

    if not os.path.exists(os.path.join(models_folder, 'bert-base-uncased', task)):
        _run_stage(
            'multiemo_fine_tune_bert.py',
            [
                '--pretrained_model', 'data/models/bert-base-uncased',
                '--data_dir', dataset_dir,
                '--task_name', task,
                '--output_dir', bert_dir,
                '--learning_rate', str(learning_rate),
                '--num_train_epochs', str(num_train_epochs),
                '--weight_decay', str(weight_decay),
                '--train_batch_size', str(batch_size),
                '--do_lower_case',
            ],
            f"Training bert-base-uncased for {task}",
        )

    if not os.path.exists(os.path.join(models_folder, 'dynabertw', task)):
        _run_stage(
            'run_multiemo.py',
            dynabert_options(bert_dir, dynabertw_dir, [
                '--width_lambda1', '1.0',
                '--width_lambda2', '0.1',
                '--training_phase', 'dynabertw',
            ]),
            f"Training DynaBERT_W for {task}",
        )

    if not os.path.exists(os.path.join(models_folder, 'dynabert', task)):
        _run_stage(
            'run_multiemo.py',
            dynabert_options(dynabertw_dir, dynabert_dir, [
                '--depth_mult_list', '0.5,0.75,1.0',
                # The 'dynabert' phase weights its losses with the depth_* coefficients;
                # the width_* ones passed here before are only read in the 'dynabertw' phase.
                '--depth_lambda1', '1.0',
                '--depth_lambda2', '1.0',
                '--training_phase', 'dynabert',
            ]),
            f"Training DynaBERT for {task}",
        )

    if not os.path.exists(os.path.join(models_folder, 'dynabert-finetuned', task)):
        _run_stage(
            'run_multiemo.py',
            # Start from the width- and depth-adaptive checkpoint produced by the previous
            # stage; starting from the 'dynabertw' one left that stage's output unused.
            # NOTE(review): confirm this matches the intended experiment setup.
            dynabert_options(dynabert_dir, finetuned_dir, [
                '--depth_mult_list', '0.5,0.75,1.0',
                '--training_phase', 'final_finetuning',
            ]),
            f"Finetuning DynaBERT for {task}",
        )

    # Evaluation is run on every invocation, also when all training stages were skipped.
    _run_stage(
        'eval_multiemo.py',
        [
            '--model_type', 'bert',
            '--task_name', task,
            '--data_dir', dataset_dir,
            '--model_dir', finetuned_dir,
            '--output_dir', finetuned_dir,
            '--max_seq_length', '128',
            '--depth_mult', '0.5',
        ],
        f"Evaluating DynaBERT for {task}",
    )

    # TODO: gathering the results to csv is currently disabled:
    # cmd = f'python3 -m gather_results --task_name multiemo_en_all_sentence'
    # logger.info(f"Gathering results to csv for multiemo_en_all_sentence")
    # run_process(cmd)
RobertaTokenizer) from transformers import AdamW, WarmupLinearSchedule @@ -43,11 +43,11 @@ from transformers import glue_processors as processors from transformers import glue_convert_examples_to_features as convert_examples_to_features - logger = logging.getLogger(__name__) CONFIG_NAME = "config.json" WEIGHTS_NAME = "pytorch_model.bin" + def soft_cross_entropy(predicts, targets): student_likelihood = torch.nn.functional.log_softmax(predicts, dim=-1) targets_prob = torch.nn.functional.softmax(targets, dim=-1) @@ -82,7 +82,7 @@ def train(args, train_dataset, model, tokenizer, teacher_model=None): # Prepare optimizer and schedule (linear warmup and decay) if args.model_type == 'roberta': - args.warmup_steps = int(t_total*0.06) + args.warmup_steps = int(t_total * 0.06) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ @@ -90,7 +90,7 @@ def train(args, train_dataset, model, tokenizer, teacher_model=None): 'weight_decay': args.weight_decay}, {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] + ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=1e-8) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) @@ -210,7 +210,7 @@ def train(args, train_dataset, model, tokenizer, teacher_model=None): if global_step > 0 and args.logging_steps > 0 and global_step % args.logging_steps == 0: if args.evaluate_during_training: acc = [] - if args.task_name == "mnli": # for both MNLI-m and MNLI-mm + if args.task_name == "mnli": # for both MNLI-m and MNLI-mm acc_both = [] # collect performance of all sub-networks @@ -240,7 +240,7 @@ def train(args, train_dataset, model, tokenizer, teacher_model=None): else: print("***best***{}\n".format(acc)) with open(output_eval_file, "a") as writer: - writer.write("{}\n" .format(acc)) + writer.write("{}\n".format(acc)) logger.info("Saving model checkpoint to %s", args.output_dir) model_to_save = 
model.module if hasattr(model, 'module') else model @@ -307,11 +307,11 @@ def evaluate(args, model, tokenizer, prefix=""): preds = np.squeeze(preds) result = compute_metrics(eval_task, preds, out_label_ids) if eval_task == 'mnli-mm': - results.update({'acc_mm':result['acc']}) + results.update({'acc_mm': result['acc']}) else: results.update(result) - output_eval_file = os.path.join(eval_output_dir, "eval_results.txt") # wirte all the results to the same file + output_eval_file = os.path.join(eval_output_dir, "eval_results.txt") # wirte all the results to the same file with open(output_eval_file, "a") as writer: logger.info("***** Eval results {} *****".format(prefix)) for key in sorted(result.keys()): @@ -322,7 +322,6 @@ def evaluate(args, model, tokenizer, prefix=""): def load_and_cache_examples(args, task, tokenizer, evaluate=False): - processor = processors[task]() output_mode = output_modes[task] logger.info("Creating features from dataset file at %s", args.data_dir) @@ -338,10 +337,10 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False): label_list=label_list, max_length=args.max_seq_length, output_mode=output_mode, - pad_on_left=bool(args.model_type in ['xlnet']), # pad on the left for xlnet + pad_on_left=bool(args.model_type in ['xlnet']), # pad on the left for xlnet pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0, - ) + ) # Convert to Tensors and build dataset all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) @@ -405,7 +404,7 @@ def compute_neuron_head_importance(args, model, tokenizer): for batch in tqdm(eval_dataloader, desc="Evaluating"): batch = tuple(t.to(args.device) for t in batch) input_ids, input_mask, _, label_ids = batch - segment_ids = batch[2] if args.model_type=='bert' else None # RoBERTa does't use segment_ids + segment_ids = batch[2] if args.model_type == 'bert' else None # RoBERTa does't use segment_ids # 
calculate head importance outputs = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids, @@ -415,7 +414,8 @@ def compute_neuron_head_importance(args, model, tokenizer): head_importance += head_mask.grad.abs().detach() # calculate neuron importance - for w1, b1, w2, current_importance in zip(intermediate_weight, intermediate_bias, output_weight, neuron_importance): + for w1, b1, w2, current_importance in zip(intermediate_weight, intermediate_bias, output_weight, + neuron_importance): current_importance += ((w1 * w1.grad).sum(dim=1) + b1 * b1.grad).abs().detach() current_importance += ((w2 * w2.grad).sum(dim=0)).abs().detach() @@ -515,7 +515,7 @@ def main(): args.depth_mult_list = [float(depth) for depth in args.depth_mult_list.split(',')] # Setup CUDA, GPU & distributed training - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") args.n_gpu = torch.cuda.device_count() args.device = device @@ -535,7 +535,7 @@ def main(): args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] config = config_class.from_pretrained(args.model_dir, num_labels=num_labels, finetuning_task=args.task_name) - config.output_attentions, config.output_hidden_states, config.output_intermediate = True,True,True + config.output_attentions, config.output_hidden_states, config.output_intermediate = True, True, True tokenizer = tokenizer_class.from_pretrained(args.model_dir, do_lower_case=args.do_lower_case) # load teacher model if necessary diff --git a/DynaBERT/run_multiemo.py b/DynaBERT/run_multiemo.py new file mode 100644 index 00000000..bf5d7d08 --- /dev/null +++ b/DynaBERT/run_multiemo.py @@ -0,0 +1,574 @@ +# coding=utf-8 +# 2020.08.28 - Changed regular fine-tuning to fine-tuning with adaptive width and depth +# Huawei Technologies Co., Ltd +# Copyright (c) 2020, Huawei Technologies Co., Ltd. 
All rights reserved. +# Copyright 2018 The Google AI Language Team Authors, the HuggingFace Inc. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Finetuning the library models for sequence classification on GLUE (Bert, XLM, XLNet, RoBERTa).""" + +from __future__ import absolute_import, division, print_function + +import argparse +import logging +import os +import random +import math +import time +from datetime import timedelta + +import numpy as np +import torch +from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset) +from tqdm import tqdm, trange +from torch.nn import MSELoss + +from transformers import (BertConfig, BertForSequenceClassification, BertTokenizer, + RobertaConfig, + RobertaForSequenceClassification, + RobertaTokenizer) + +from transformers import AdamW, WarmupLinearSchedule + +from transformers.data.metrics import multiemo_compute_metrics as compute_metrics +from transformers.data.processors.multiemo import multiemo_convert_examples_to_features as convert_examples_to_features, \ + MultiemoProcessor, multiemo_output_modes +from utils import result_to_text_file, dictionary_to_json + +logger = logging.getLogger(__name__) +CONFIG_NAME = "config.json" +WEIGHTS_NAME = "pytorch_model.bin" + + +def soft_cross_entropy(predicts, targets): + student_likelihood = torch.nn.functional.log_softmax(predicts, dim=-1) + targets_prob = 
def train(args, train_dataset, model, tokenizer, teacher_model=None):
    """Train ``model`` for one DynaBERT phase, evaluating after every epoch.

    For every batch, the gradients of all (depth, width) sub-networks from
    ``args.depth_mult_list`` x ``args.width_mult_list`` are accumulated before
    the optimizer step.  The loss depends on ``args.training_phase``:

    * ``'dynabertw'``: distillation from ``teacher_model`` (logits and hidden
      states) weighted by ``args.width_lambda1`` / ``args.width_lambda2``;
    * ``'dynabert'``: distillation weighted by ``args.depth_lambda1`` /
      ``args.depth_lambda2``, hidden states matched on the kept teacher layers;
    * any other value: the loss returned by the model itself.

    Args:
        args: parsed command-line namespace.  ``args.train_batch_size`` and,
            for RoBERTa, ``args.warmup_steps`` are overwritten here.
        train_dataset: dataset whose items are
            ``(input_ids, attention_mask, token_type_ids, labels)``.
        model: the student model to train.
        tokenizer: tokenizer; its vocabulary is saved with each checkpoint.
        teacher_model: model providing the distillation targets; required for
            the ``'dynabertw'`` and ``'dynabert'`` phases.

    Returns:
        ``(global_step, mean_loss)`` where ``mean_loss`` is the accumulated
        loss divided by the number of optimizer steps (``0.0`` if none ran).
    """
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    if args.model_type == 'roberta':
        args.warmup_steps = int(t_total * 0.06)

    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=1e-8)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)

    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    global_step = 0
    tr_loss = 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch")
    set_seed(args)

    current_best = 0
    output_eval_file = os.path.join(args.output_dir, 'eval_results.txt')

    # Sub-networks are always visited from the largest to the smallest.
    depth_mults = sorted(args.depth_mult_list, reverse=True)
    width_mults = sorted(args.width_mult_list, reverse=True)

    for epoch in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[3],
                      'token_type_ids': batch[2] if args.model_type in ['bert'] else None}

            # prepare the hidden states and logits of the teacher model
            if args.training_phase == 'dynabertw' and teacher_model:
                with torch.no_grad():
                    _, teacher_logit, teacher_reps, _, _ = teacher_model(**inputs)
            elif args.training_phase == 'dynabert' and teacher_model:
                hidden_max_all, logits_max_all = [], []
                # NOTE(review): the teacher is called once per width but nothing visible here
                # changes its width between calls - confirm whether that is intended.
                for _ in width_mults:
                    with torch.no_grad():
                        _, teacher_logit, teacher_reps, _, _ = teacher_model(**inputs)
                        hidden_max_all.append(teacher_reps)
                        logits_max_all.append(teacher_logit)

            # accumulate grads for all sub-networks
            for depth_mult in depth_mults:
                model.apply(lambda m: setattr(m, 'depth_mult', depth_mult))
                # select teacher model layers for matching
                # (was "== 'dynabert' or 'final_finetuning'", which is always true)
                if args.training_phase in ('dynabert', 'final_finetuning'):
                    # NOTE(review): this rebinding drops a DataParallel wrapper for the rest of
                    # training in these phases; kept as in the original code.
                    model = model.module if hasattr(model, 'module') else model
                    base_model = getattr(model, model.base_model_prefix, model)
                    n_layers = base_model.config.num_hidden_layers
                    depth = round(depth_mult * n_layers)
                    kept_layers_index = [math.floor(i / depth_mult) for i in range(depth)]
                    kept_layers_index.append(n_layers)

                # adjust width
                width_idx = 0
                for width_mult in width_mults:
                    model.apply(lambda m: setattr(m, 'width_mult', width_mult))
                    # stage 1: width-adaptive
                    if args.training_phase == 'dynabertw':
                        loss, student_logit, student_reps, _, _ = model(**inputs)

                        # distillation loss of logits
                        if args.output_mode == "classification":
                            logit_loss = soft_cross_entropy(student_logit, teacher_logit.detach())
                        elif args.output_mode == "regression":
                            logit_loss = 0

                        # distillation loss of hidden states
                        rep_loss = 0
                        for student_rep, teacher_rep in zip(student_reps, teacher_reps):
                            rep_loss += loss_mse(student_rep, teacher_rep.detach())

                        loss = args.width_lambda1 * logit_loss + args.width_lambda2 * rep_loss

                    # stage 2: width- and depth- adaptive
                    elif args.training_phase == 'dynabert':
                        loss, student_logit, student_reps, _, _ = model(**inputs)

                        # distillation loss of logits
                        if args.output_mode == "classification":
                            logit_loss = soft_cross_entropy(student_logit, logits_max_all[width_idx].detach())
                        elif args.output_mode == "regression":
                            logit_loss = 0

                        # distillation loss of hidden states, only on the kept teacher layers
                        kept_teacher_reps = [hidden_max_all[width_idx][i] for i in kept_layers_index]
                        rep_loss = 0
                        for student_rep, teacher_rep in zip(student_reps, kept_teacher_reps):
                            rep_loss += loss_mse(student_rep, teacher_rep.detach())

                        loss = args.depth_lambda1 * logit_loss + args.depth_lambda2 * rep_loss
                        width_idx += 1  # move to the next width

                    # stage 3: final finetuning
                    else:
                        loss = model(**inputs)[0]

                    if args.n_gpu > 1:
                        loss = loss.mean()
                    if args.gradient_accumulation_steps > 1:
                        loss = loss / args.gradient_accumulation_steps

                    loss.backward()

            # clip the accumulated grad from all widths
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

            if 0 < t_total < global_step:
                epoch_iterator.close()
                break

        # evaluate every sub-network once per epoch
        if args.evaluate_during_training:
            acc = []

            # collect performance of all sub-networks
            for depth_mult in depth_mults:
                model.apply(lambda m: setattr(m, 'depth_mult', depth_mult))
                for width_mult in width_mults:
                    model.apply(lambda m: setattr(m, 'width_mult', width_mult))
                    results = evaluate(args, model, tokenizer)

                    logger.info("********** start evaluate results *********")
                    logger.info("depth_mult: %s ", depth_mult)
                    logger.info("width_mult: %s ", width_mult)
                    logger.info("results: %s ", results)
                    logger.info("********** end evaluate results *********")

                    acc.append(list(results.values())[0])

            result_to_save = dict()
            result_to_save['epoch'] = epoch + 1
            result_to_save['global_step'] = global_step
            # Store a plain float (loss of the last processed sub-network), not a tensor.
            result_to_save['loss'] = loss.item()
            result_to_save['acc'] = acc

            result_to_text_file(result_to_save, output_eval_file)

            # save model when the summed score over all sub-networks improves
            if sum(acc) > current_best:
                current_best = sum(acc)

                print("***best***{}\n".format(acc))

                logger.info("Saving model checkpoint to %s", args.output_dir)
                model_to_save = model.module if hasattr(model, 'module') else model
                model_to_save.save_pretrained(args.output_dir)
                torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
                model_to_save.config.to_json_file(os.path.join(args.output_dir, CONFIG_NAME))
                tokenizer.save_vocabulary(args.output_dir)

        if 0 < t_total < global_step:
            train_iterator.close()
            break

    # Guard against a run in which no optimizer step was performed.
    mean_loss = tr_loss / global_step if global_step else 0.0
    return global_step, mean_loss
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    """Build a ``TensorDataset`` of MultiEmo features for training or evaluation.

    Despite the name, nothing is cached: the examples are converted to
    features on every call.

    Args:
        args: namespace providing ``data_dir``, ``data_aug``,
            ``max_seq_length`` and ``model_type``.
        task: task name made of four ``_``-separated parts, e.g.
            ``multiemo_en_all_sentence``; the last three are passed to
            ``MultiemoProcessor``.
        tokenizer: tokenizer handed to the feature converter.
        evaluate: when true, load the dev examples; otherwise the train
            examples, extended with the augmented ones if ``args.data_aug``.

    Returns:
        A ``TensorDataset`` of
        ``(input_ids, attention_mask, token_type_ids, labels)``.

    Raises:
        ValueError: if ``task`` does not split into four parts, or the
            output mode is neither ``classification`` nor ``regression``.
    """
    _, lang, domain, kind = task.split('_')
    processor = MultiemoProcessor(lang, domain, kind)
    output_mode = multiemo_output_modes['multiemo']
    logger.info("Creating features from dataset file at %s", args.data_dir)
    label_list = processor.get_labels()
    # The MNLI/RoBERTa label swap inherited from the GLUE script was removed: a task name
    # that passes the four-part split above can never be 'mnli' or 'mnli-mm'.

    if evaluate:
        examples = processor.get_dev_examples(args.data_dir)
    else:
        examples = processor.get_train_examples(args.data_dir)
        if args.data_aug:
            examples = examples + processor.get_train_examples_aug(args.data_dir)

    features = convert_examples_to_features(
        examples,
        tokenizer,
        label_list=label_list,
        max_length=args.max_seq_length,
        output_mode=output_mode,
        pad_on_left=bool(args.model_type in ['xlnet']),  # pad on the left for xlnet
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
    )

    # Labels are class indices for classification and real values for regression.
    if output_mode == "classification":
        label_dtype = torch.long
    elif output_mode == "regression":
        label_dtype = torch.float
    else:
        raise ValueError("Unsupported output mode: %s" % output_mode)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    all_labels = torch.tensor([f.label for f in features], dtype=label_dtype)

    return TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
def reorder_neuron_head(model, head_importance, neuron_importance):
    """Reorder attention heads and feed-forward neurons by descending importance.

    Arguments:
        model: model (possibly wrapped, exposing the real model as ``.module``)
            whose base model has ``encoder.layer``
        head_importance: per-layer head importance scores, indexed by layer
        neuron_importance: one tensor of neuron importance scores per layer;
            its length decides how many layers are reordered
    """
    unwrapped = model
    if hasattr(model, 'module'):
        unwrapped = model.module
    backbone = getattr(unwrapped, unwrapped.base_model_prefix, unwrapped)

    for layer_idx, neuron_scores in enumerate(neuron_importance):
        # heads first: most important head comes first
        head_order = torch.sort(head_importance[layer_idx], descending=True).indices
        backbone.encoder.layer[layer_idx].attention.reorder_heads(head_order)

        # the same neuron permutation is applied to both halves of the feed-forward block
        neuron_order = torch.sort(neuron_scores, descending=True).indices
        backbone.encoder.layer[layer_idx].intermediate.reorder_neurons(neuron_order)
        backbone.encoder.layer[layer_idx].output.reorder_neurons(neuron_order)
Sequences longer " + "than this will be truncated, sequences shorter will be padded.") + parser.add_argument("--do_train", action='store_true', + help="Whether to run training.") + parser.add_argument("--evaluate_during_training", default=True, + help="Rul evaluation during training at each logging step.") + parser.add_argument("--do_lower_case", default=True, + help="Set this flag if you are using an uncased model.") + parser.add_argument("--per_gpu_train_batch_size", default=32, type=int, + help="Batch size per GPU/CPU for training.") + parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, + help="Batch size per GPU/CPU for evaluation.") + parser.add_argument('--gradient_accumulation_steps', type=int, default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.") + parser.add_argument("--learning_rate", default=2e-5, type=float, + help="The initial learning rate for Adam.") + parser.add_argument("--weight_decay", default=0.0, type=float, + help="Weight deay if we apply some.") + parser.add_argument("--num_train_epochs", default=3.0, type=float, + help="Total number of training epochs to perform.") + parser.add_argument("--warmup_steps", default=0, type=int, + help="Linear warmup over warmup_steps.") + parser.add_argument('--seed', type=int, default=42, + help="random seed for initialization") + parser.add_argument("--hidden_dropout_prob", default=0.1, type=float, + help="dropout rate on hidden states.") + parser.add_argument("--attention_probs_dropout_prob", default=0.1, type=float, + help="dropout rate on attention probs.") + + parser.add_argument('--data_aug', action='store_true', help="whether using data augmentation") + # for depth direction + parser.add_argument('--depth_mult_list', type=str, default='1.', + help="the possible depths used for training, e.g., '1.' 
is for default") + parser.add_argument("--depth_lambda1", default=1.0, type=float, + help="logit matching coef.") + parser.add_argument("--depth_lambda2", default=1.0, type=float, + help="hidden states matching coef.") + # for width direction + parser.add_argument('--width_mult_list', type=str, default='1.', + help="the possible widths used for training, e.g., '1.' is for separate training " + "while '0.25,0.5,0.75,1.0' is for vanilla slimmable training") + parser.add_argument("--width_lambda1", default=1.0, type=float, + help="logit matching coef.") + parser.add_argument("--width_lambda2", default=0.1, type=float, + help="hidden states matching coef.") + + parser.add_argument("--training_phase", default="dynabertw", type=str, + help="can be finetuning, dynabertw, dynabert, final_finetuning") + + args = parser.parse_args() + + args.width_mult_list = [float(width) for width in args.width_mult_list.split(',')] + args.depth_mult_list = [float(depth) for depth in args.depth_mult_list.split(',')] + + # Setup CUDA, GPU & distributed training + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + args.n_gpu = torch.cuda.device_count() + args.device = device + + # Set seed + set_seed(args) + + # Prepare MULTIEMO task: provide num_labels here + args.task_name = args.task_name.lower() + if 'multiemo' not in args.task_name: + raise ValueError("Task not found: %s" % args.task_name) + _, lang, domain, kind = args.task_name.split('_') + processor = MultiemoProcessor(lang, domain, kind) + args.output_mode = multiemo_output_modes['multiemo'] + label_list = processor.get_labels() + num_labels = len(label_list) + + # prepare model, tokernizer and config + args.model_type = args.model_type.lower() + config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] + config = config_class.from_pretrained(args.model_dir, num_labels=num_labels, finetuning_task=args.task_name) + config.output_attentions, config.output_hidden_states, config.output_intermediate 
= True, True, True + tokenizer = tokenizer_class.from_pretrained(args.model_dir, do_lower_case=args.do_lower_case) + + # load teacher model if necessary + if args.training_phase == 'dynabertw' or args.training_phase == 'dynabert': + teacher_model = model_class.from_pretrained(args.model_dir, config=config) + teacher_model.to(args.device) + else: + teacher_model = None + + # load student model if necessary + model = model_class.from_pretrained(args.model_dir, config=config) + + if args.training_phase == 'dynabertw': + # rewire the network according to the importance of attention heads and neurons + head_importance, neuron_importance = compute_neuron_head_importance(args, model, tokenizer) + reorder_neuron_head(model, head_importance, neuron_importance) + + if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir) + + model.to(args.device) + + logger.info("Training/evaluation parameters %s", args) + + # Training + if args.do_train: + training_start_time = time.monotonic() + + train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False) + if teacher_model: + global_step, tr_loss = train(args, train_dataset, model, tokenizer, teacher_model) + else: + global_step, tr_loss = train(args, train_dataset, model, tokenizer) + + logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) + + # Measure End Time + training_end_time = time.monotonic() + + diff = timedelta(seconds=training_end_time - training_start_time) + diff_seconds = diff.total_seconds() + + training_parameters = vars(args) + training_parameters['training_time'] = diff_seconds + + output_training_params_file = os.path.join(args.output_dir, "training_params.json") + + training_parameters.pop('device') + dictionary_to_json(training_parameters, output_training_params_file) + + +if __name__ == "__main__": + main() diff --git a/DynaBERT/scripts/download_dataset.py b/DynaBERT/scripts/download_dataset.py new file mode 100644 index 00000000..701ffd3a --- /dev/null 
+++ b/DynaBERT/scripts/download_dataset.py @@ -0,0 +1,55 @@ +import argparse +import os +import zipfile + +import requests +from tqdm.auto import tqdm + +# url = 'https://clarin-pl.eu/dspace/bitstream/handle/11321/798/multiemo.zip?sequence=2&isAllowed=y' +url = 'https://clarin-pl.eu/dspace/handle/11321/798/allzip' + + +def main(data_dir): + output_zip = os.path.join( + data_dir, + 'MultiEmo_ Multilingual, Multilevel, Multidomain Sentiment Analysis Corpus of Consumer Reviews.zip') + + response = requests.get(url, stream=True) + + if response.status_code == 200: + total_size_in_bytes = int(response.headers.get('content-length', 0)) + block_size = 1024 + progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True) + with open(output_zip, 'wb') as f: + for chunk in response.iter_content(chunk_size=block_size): + if chunk: + progress_bar.update(len(chunk)) + f.write(chunk) + + progress_bar.close() + if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes: + print("ERROR, something went wrong") + + with zipfile.ZipFile(output_zip, "r") as zip_ref: + zip_ref.extractall(data_dir) + + os.remove(output_zip) + os.remove(os.path.join(data_dir, 'multiemo.7z')) + + data_output_zip = os.path.join(data_dir, 'multiemo.zip') + with zipfile.ZipFile(data_output_zip, "r") as zip_ref: + zip_ref.extractall(data_dir) + + os.remove(data_output_zip) + os.remove(os.path.join(data_dir, 'README.txt')) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--data_dir', help='directory to save data to', type=str, default='multiemo2') + args = parser.parse_args() + + if not os.path.isdir(args.data_dir): + os.mkdir(args.data_dir) + + main(data_dir=args.data_dir) diff --git a/DynaBERT/transformers/data/metrics/__init__.py b/DynaBERT/transformers/data/metrics/__init__.py index c9ebaac3..8893d540 100644 --- a/DynaBERT/transformers/data/metrics/__init__.py +++ b/DynaBERT/transformers/data/metrics/__init__.py @@ -18,11 +18,14 @@ import 
sys import logging +import numpy as np + logger = logging.getLogger(__name__) try: from scipy.stats import pearsonr, spearmanr - from sklearn.metrics import matthews_corrcoef, f1_score + from sklearn.metrics import matthews_corrcoef, f1_score, accuracy_score + _has_sklearn = True except (AttributeError, ImportError) as e: logger.warning("To use data.metrics please install scikit-learn. See https://scikit-learn.org/stable/index.html") @@ -47,6 +50,16 @@ def acc_and_f1(preds, labels): } + def multiclass_acc_and_f1(preds, labels): + acc = accuracy_score(y_true=labels, y_pred=preds) + f1 = f1_score(y_true=labels, y_pred=preds, average='macro') + return { + "acc": acc, + "f1": f1, + "acc_and_f1": (acc + f1) / 2, + } + + def pearson_and_spearman(preds, labels): pearson_corr = pearsonr(preds, labels)[0] spearman_corr = spearmanr(preds, labels)[0] @@ -81,3 +94,12 @@ def glue_compute_metrics(task_name, preds, labels): return {"acc": simple_accuracy(preds, labels)} else: raise KeyError(task_name) + + + def multiemo_compute_metrics(task_name, logits, labels): + preds = np.argmax(logits, axis=1) + assert len(preds) == len(labels) + if 'multiemo' in task_name: + return multiclass_acc_and_f1(preds, labels) + else: + raise KeyError(task_name) diff --git a/DynaBERT/transformers/data/processors/multiemo.py b/DynaBERT/transformers/data/processors/multiemo.py new file mode 100644 index 00000000..e7cb3142 --- /dev/null +++ b/DynaBERT/transformers/data/processors/multiemo.py @@ -0,0 +1,189 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" GLUE processors and helpers """ + +import logging +import os +from typing import List + +import numpy as np +from .utils import DataProcessor, InputExample, InputFeatures +from ...file_utils import is_tf_available + +if is_tf_available(): + import tensorflow as tf + +logger = logging.getLogger(__name__) + + +def multiemo_convert_examples_to_features( + examples, tokenizer, + max_length=512, + task=None, + label_list=None, + output_mode=None, + pad_on_left=False, + pad_token=0, + pad_token_segment_id=0, + mask_padding_with_zero=True): + """ + Loads a data file into a list of ``InputFeatures`` + + Args: + examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples. + tokenizer: Instance of a tokenizer that will tokenize the examples + max_length: Maximum example length + task: GLUE task + label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method + output_mode: String indicating the output mode. Either ``regression`` or ``classification`` + pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default) + pad_token: Padding token + pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4) + mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values + and by ``0`` for padded values. 
If set to ``False``, inverts it (``1`` for padded values, ``0`` for + actual values) + + Returns: + If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset`` + containing the task-specific features. If the input is a list of ``InputExamples``, will return + a list of task-specific ``InputFeatures`` which can be fed to the model. + + """ + + if task is not None: + _, lang, domain, kind = task.split('_') + processor = MultiemoProcessor(lang, domain, kind) + if label_list is None: + label_list = processor.get_labels() + logger.info("Using label list %s for task %s" % (label_list, task)) + if output_mode is None: + output_mode = multiemo_output_modes[task] + logger.info("Using output mode %s for task %s" % (output_mode, task)) + + label_map = {label: i for i, label in enumerate(label_list)} + + features = [] + for (ex_index, example) in enumerate(examples): + if ex_index % 10000 == 0: + logger.info("Writing example %d" % (ex_index)) + + inputs = tokenizer.encode_plus( + example.text_a, + example.text_b, + add_special_tokens=True, + max_length=max_length, + ) + input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"] + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) + + # Zero-pad up to the sequence length. 
+ padding_length = max_length - len(input_ids) + if pad_on_left: + input_ids = ([pad_token] * padding_length) + input_ids + attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask + token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids + else: + input_ids = input_ids + ([pad_token] * padding_length) + attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length) + token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length) + + assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length) + assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), + max_length) + assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), + max_length) + if output_mode == "classification": + label = label_map[example.label] + elif output_mode == "regression": + label = float(example.label) + else: + raise KeyError(output_mode) + + if ex_index < 1: + logger.info("*** Example ***") + logger.info("guid: %s" % (example.guid)) + logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask])) + logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids])) + logger.info("label: %s (id = %d)" % (example.label, label)) + + features.append( + InputFeatures(input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + label=label)) + + return features + + +class MultiemoProcessor(DataProcessor): + """Processor for the Multiemo data2 set""" + + def __init__(self, lang: str, domain: str, kind: str): + super(MultiemoProcessor, self).__init__() + self.lang = lang.lower() + self.domain = domain.lower() + self.kind = kind.lower() + + def get_train_examples(self, data_dir: str) -> List[InputExample]: + """See base class.""" + file_path = 
self.get_set_type_path(data_dir, 'train') + logger.info(f"LOOKING AT {file_path}") + return self._create_examples(self._read_txt(file_path), "train") + + def get_dev_examples(self, data_dir: str) -> List[InputExample]: + """See base class.""" + file_path = self.get_set_type_path(data_dir, 'dev') + return self._create_examples(self._read_txt(file_path), "dev") + + def get_test_examples(self, data_dir: str) -> List[InputExample]: + """See base class.""" + file_path = self.get_set_type_path(data_dir, 'test') + return self._create_examples(self._read_txt(file_path), "test") + + def get_set_type_path(self, data_dir: str, set_type: str) -> str: + return os.path.join(data_dir, self.domain + '.' + self.kind + '.' + set_type + '.' + self.lang + '.txt') + + def get_labels(self) -> List[str]: + """See base class.""" + if self.kind == 'text': + return ["meta_amb", "meta_minus_m", "meta_plus_m", "meta_zero"] + else: + return ["z_amb", "z_minus_m", "z_plus_m", "z_zero"] + + @staticmethod + def _create_examples(lines: List[str], set_type: str) -> List[InputExample]: + examples = [] + for (i, line) in enumerate(lines): + guid = "%s-%s" % (set_type, i) + split_line = line.split('__label__') + text_a = split_line[0] + label = split_line[1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples + + +multiemo_tasks_num_labels = { + "multiemo": 4, +} + +multiemo_output_modes = { + "multiemo": "classification" +} diff --git a/DynaBERT/transformers/data/processors/utils.py b/DynaBERT/transformers/data/processors/utils.py index 2d7628f9..d1178364 100644 --- a/DynaBERT/transformers/data/processors/utils.py +++ b/DynaBERT/transformers/data/processors/utils.py @@ -18,6 +18,8 @@ import sys import copy import json +from typing import List + class InputExample(object): """ @@ -32,6 +34,7 @@ class InputExample(object): label: (Optional) string. The label of the example. 
This should be specified for train and dev examples, but not for test examples. """ + def __init__(self, guid, text_a, text_b=None, label=None): self.guid = guid self.text_a = text_a @@ -123,3 +126,10 @@ def _read_tsv(cls, input_file, quotechar=None): line = list(unicode(cell, 'utf-8') for cell in line) lines.append(line) return lines + + @classmethod + def _read_txt(cls, input_file: str) -> List[str]: + """Reads a tab separated value file.""" + with open(input_file, "r", encoding='UTF-8') as f: + lines = f.read().splitlines() + return lines diff --git a/DynaBERT/utils.py b/DynaBERT/utils.py new file mode 100644 index 00000000..5decae1e --- /dev/null +++ b/DynaBERT/utils.py @@ -0,0 +1,33 @@ +import json +import logging +import os +import sys + +log_format = '%(asctime)s %(message)s' +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt='%d/%m/%Y %H:%M:%S') +logger = logging.getLogger(__name__) + + +def result_to_text_file(result: dict, file_name: str, verbose: bool = True) -> None: + with open(file_name, "a") as writer: + for key in sorted(result.keys()): + writer.write("%s = %s\n" % (key, str(result[key]))) + writer.write("") + + +def dictionary_to_json(dictionary: dict, file_name: str): + with open(file_name, "w") as f: + json.dump(dictionary, f, indent=2) + + +def is_folder_empty(folder_name: str): + if len([f for f in os.listdir(folder_name) if not f.startswith('.')]) == 0: + return True + else: + return False + + +def get_immediate_subdirectories(directory: str): + return [os.path.join(directory, name) for name in os.listdir(directory) + if os.path.isdir(os.path.join(directory, name))] \ No newline at end of file diff --git a/TernaryBERT/Dockerfile b/TernaryBERT/Dockerfile new file mode 100644 index 00000000..68c2c107 --- /dev/null +++ b/TernaryBERT/Dockerfile @@ -0,0 +1,25 @@ +FROM nvidia/cuda:11.2.0-cudnn8-runtime-ubuntu20.04 + +ENV TZ=Europe/Minsk +RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > 
/etc/timezone +RUN apt-get update && \ + apt-get install --no-install-recommends -y build-essential software-properties-common && \ + apt-get install --no-install-recommends -y python3.6 python3-pip python3-dev python3-setuptools python3-distutils && \ + apt-get clean && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +RUN python3 -m pip install --upgrade pip && \ + python3 -m pip install --no-cache-dir torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio==0.9.1 \ + -f https://download.pytorch.org/whl/torch_stable.html + +COPY ./requirements.txt . +RUN python3 -m pip install --no-cache-dir -r requirements.txt +RUN rm requirements.txt + +ARG USER_ID +ARG GROUP_ID + +RUN addgroup --gid $GROUP_ID user +RUN adduser --disabled-password --gecos '' --uid $USER_ID --gid $GROUP_ID user +USER user diff --git a/TernaryBERT/data/results-ternarybert-multiemo_en_all_sentence.csv b/TernaryBERT/data/results-ternarybert-multiemo_en_all_sentence.csv new file mode 100644 index 00000000..c0be3ad7 --- /dev/null +++ b/TernaryBERT/data/results-ternarybert-multiemo_en_all_sentence.csv @@ -0,0 +1,2 @@ +name,model_name,data_dir,model_dir,teacher_model,student_model,task_name,output_dir,do_lower_case,learning_rate,num_train_epochs,weight_decay,seed,aug_train,pred_distill,intermediate_distill,save_fp_model,save_quantized_model,weight_bits,input_bits,clip_val,batch_size,max_seq_length,training_time,accuracy,eval_time,z_amb_precision,z_amb_recall,z_amb_f1-score,z_amb_support,z_minus_m_precision,z_minus_m_recall,z_minus_m_f1-score,z_minus_m_support,z_plus_m_precision,z_plus_m_recall,z_plus_m_f1-score,z_plus_m_support,z_zero_precision,z_zero_recall,z_zero_f1-score,z_zero_support,macro avg_precision,macro avg_recall,macro avg_f1-score,macro avg_support,weighted avg_precision,weighted avg_recall,weighted avg_f1-score,weighted avg_support,model_size,memory,parameters 
+multiemo_en_all_sentence,TernaryBERT,data/multiemo2,data/models/bert-base-uncased,data/models/bert-base-uncased/multiemo_en_all_sentence,data/models/bert-base-uncased/multiemo_en_all_sentence,multiemo_en_all_sentence,data/models/ternarybert,True,5e-05,3.0,0.01,42,False,True,True,True,True,2,8,2.5,16,128,5409.666556,0.785378590078329,73.592085,0.6198347107438017,0.44052863436123346,0.5150214592274678,681,0.7760372565622354,0.8634008478568064,0.817391304347826,2123,0.8234536082474226,0.8396846254927727,0.8314899154196487,1522,0.8173719376391982,0.7758985200845666,0.7960954446854664,1419,0.7591743782981645,0.7298781569488448,0.7399995309201022,5745,0.7802927249714108,0.785378590078329,0.7800240975007349,5745,438087017,437942824,109485316 diff --git a/TernaryBERT/download_bert_base.py b/TernaryBERT/download_bert_base.py new file mode 100644 index 00000000..fa99e41a --- /dev/null +++ b/TernaryBERT/download_bert_base.py @@ -0,0 +1,31 @@ +import os +import requests +import tarfile + +url = 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz' + +output_path = os.path.join('data', 'models') +os.makedirs(output_path, exist_ok=True) + +output_tar = os.path.join(output_path, 'bert-base-uncased.tar.gz') +model_folder = os.path.join(output_path, 'bert-base-uncased') + +response = requests.get(url, stream=True) +if response.status_code == 200: + with open(output_tar, 'wb') as f: + f.write(response.raw.read()) + +with tarfile.open(name=output_tar, mode="r|gz") as tar_ref: + tar_ref.extractall(model_folder) + +os.rename(os.path.join(model_folder, 'bert_config.json'), os.path.join(model_folder, 'config.json')) + +os.remove(output_tar) + +url_vocab = 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt' +r = requests.get(url_vocab) + +with open(os.path.join(model_folder, 'vocab.txt'), 'wb') as f: + f.write(r.content) + +print('Completed!') diff --git a/TernaryBERT/eval_quant_multiemo.py b/TernaryBERT/eval_quant_multiemo.py new 
file mode 100644 index 00000000..0312ecee --- /dev/null +++ b/TernaryBERT/eval_quant_multiemo.py @@ -0,0 +1,230 @@ +from __future__ import absolute_import, division, print_function + +import argparse +import random +import time +from datetime import timedelta + +import torch +from torch.utils.data import DataLoader, SequentialSampler, TensorDataset +from torch.nn import CrossEntropyLoss, MSELoss +from sklearn.metrics import classification_report +from tqdm import tqdm + +from transformer.modeling_quant import BertForSequenceClassification as QuantBertForSequenceClassification +from transformer import BertTokenizer +from transformer import BertConfig +from utils_multiemo import * +from utils import dictionary_to_json, result_to_text_file + +log_format = '%(asctime)s %(message)s' +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt='%m/%d %I:%M:%S %p') +logger = logging.getLogger() + + +def get_tensor_data(output_mode, features): + if output_mode == "classification": + all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long) + elif output_mode == "regression": + all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float) + + all_seq_lengths = torch.tensor([f.seq_length for f in features], dtype=torch.long) + all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) + all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) + all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long) + tensor_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_seq_lengths) + return tensor_data, all_label_ids + + +def do_eval(model, task_name, eval_dataloader, + device, output_mode, eval_labels, num_labels): + eval_loss = 0 + nb_eval_steps = 0 + all_logits = None + + for batch_ in tqdm(eval_dataloader): + batch_ = tuple(t.to(device) for t in batch_) + with torch.no_grad(): + input_ids, input_mask, 
segment_ids, label_ids, seq_lengths = batch_ + logits, _, _ = model(input_ids, segment_ids, input_mask) + + # create eval loss and other metric required by the task + if output_mode == "classification": + loss_fct = CrossEntropyLoss() + tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) + elif output_mode == "regression": + loss_fct = MSELoss() + tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1)) + + eval_loss += tmp_eval_loss.mean().item() + nb_eval_steps += 1 + + if all_logits is None: + all_logits = logits.detach().cpu().numpy() + else: + all_logits = np.append(all_logits, logits.detach().cpu().numpy(), axis=0) + + eval_loss = eval_loss / nb_eval_steps + + if output_mode == "regression": + all_logits = np.squeeze(all_logits) + result = compute_metrics(task_name, all_logits, eval_labels.numpy()) + result['eval_loss'] = eval_loss + return result, all_logits + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--data_dir", + default='data', + type=str, + help="The input data dir. 
Should contain the .tsv files (or other data files) for the task.") + parser.add_argument("--model_dir", + default='models/tinybert', + type=str, + help="The model dir.") + parser.add_argument("--task_name", + type=str, + help="The name of the task to train.") + parser.add_argument("--output_dir", + default='output', + type=str, + help="The output directory where the model predictions and checkpoints will be written.") + parser.add_argument("--do_lower_case", + action='store_true', + help="Set this flag if you are using an uncased model.") + parser.add_argument('--seed', + type=int, + default=42, + help="random seed for initialization") + + parser.add_argument("--weight_bits", + default=2, + type=int, + choices=[2, 8], + help="Quantization bits for weight.") + parser.add_argument("--input_bits", + default=8, + type=int, + help="Quantization bits for activation.") + parser.add_argument("--clip_val", + default=2.5, + type=float, + help="Initial clip value.") + + args = parser.parse_args() + task_name = args.task_name.lower() + data_dir = args.data_dir + + model_dir = os.path.join(args.model_dir, task_name) + output_dir = os.path.join(args.output_dir, task_name) + os.makedirs(output_dir, exist_ok=True) + + output_modes = { + "multiemo": "classification" + } + + default_params = { + "multiemo": {"max_seq_length": 128, "batch_size": 16} + } + + # Prepare devices + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + n_gpu = torch.cuda.device_count() + + # Prepare seed + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + if n_gpu > 0: + torch.cuda.manual_seed_all(args.seed) + + if task_name in default_params: + args.batch_size = default_params[task_name]["batch_size"] + if n_gpu > 0: + args.batch_size = int(args.batch_size * n_gpu) + args.max_seq_length = default_params[task_name]["max_seq_length"] + elif 'multiemo' in task_name: + args.batch_size = default_params['multiemo']["batch_size"] + if n_gpu > 0: + 
args.batch_size = int(args.batch_size * n_gpu) + args.max_seq_length = default_params['multiemo']["max_seq_length"] + + if 'multiemo' in task_name: + _, lang, domain, kind = task_name.split('_') + processor = MultiemoProcessor(lang, domain, kind) + else: + raise ValueError("Task not found: %s" % task_name) + + if 'multiemo' in task_name: + output_mode = output_modes['multiemo'] + else: + raise ValueError("Task not found: %s" % task_name) + + label_list = processor.get_labels() + num_labels = len(label_list) + + tokenizer = BertTokenizer.from_pretrained(model_dir, do_lower_case=args.do_lower_case) + + ######################### + # Test model # + ######################### + test_examples = processor.get_test_examples(data_dir) + test_features = convert_examples_to_features(test_examples, label_list, args.max_seq_length, tokenizer, + output_mode) + + test_data, test_labels = get_tensor_data(output_mode, test_features) + test_sampler = SequentialSampler(test_data) + test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.batch_size) + + config = BertConfig.from_pretrained( + model_dir, + quantize_act=True, + weight_bits=args.weight_bits, + input_bits=args.input_bits, + clip_val=args.clip_val + ) + model = QuantBertForSequenceClassification.from_pretrained(model_dir, config=config, num_labels=num_labels) + model.to(device) + + model_quant_dir = os.path.join(model_dir, 'quant') + qunat_config = BertConfig.from_pretrained( + model_quant_dir, + quantize_act=True, + weight_bits=args.weight_bits, + input_bits=args.input_bits, + clip_val=args.clip_val + ) + quant_model = QuantBertForSequenceClassification.from_pretrained(model_quant_dir, config=qunat_config, + num_labels=num_labels) + quant_model.to(device) + + output_quant_dir = os.path.join(output_dir, 'quant') + for m, out_dir in zip([model, quant_model], [output_dir, output_quant_dir]): + logger.info("\n***** Running evaluation on test dataset *****") + logger.info(" Num examples = %d", 
PROJECT_FOLDER = os.path.dirname(os.path.abspath(__file__))
DATA_FOLDER = os.path.join(PROJECT_FOLDER, 'data')
MODELS_FOLDER = os.path.join(DATA_FOLDER, 'models', 'ternarybert')

# Identification columns that are placed first in the results CSV.
_LEADING_COLUMNS = ('name', 'model_name')


def main():
    """Collect the results of every model under MODELS_FOLDER into one CSV.

    Reads ``--task_name`` from the command line, calls ``gather_results`` for
    each immediate sub-directory of ``MODELS_FOLDER`` and writes the rows to
    ``DATA_FOLDER/results-ternarybert-<task_name>.csv``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")

    args = parser.parse_args()
    task_name = args.task_name

    models_subdirectories = get_immediate_subdirectories(MODELS_FOLDER)
    print(MODELS_FOLDER)

    print(models_subdirectories)
    data = [gather_results(subdirectory, task_name) for subdirectory in models_subdirectories]

    df = _leading_columns_first(pd.DataFrame(data))
    df.to_csv(os.path.join(DATA_FOLDER, 'results-ternarybert-' + task_name + '.csv'), index=False)


def _leading_columns_first(df):
    """Return ``df`` with the identification columns moved to the front.

    The columns are selected by name rather than by position: when the rows
    do not all share the same keys, pandas appends the extra keys after the
    columns of the first row, so the identification columns are not
    guaranteed to be the last two.
    """
    front = [col for col in _LEADING_COLUMNS if col in df.columns]
    rest = [col for col in df.columns if col not in front]
    return df[front + rest]


def get_immediate_subdirectories(a_dir):
    """Return the paths of the direct sub-directories of ``a_dir``, sorted.

    ``os.listdir`` returns entries in arbitrary order; sorting keeps the row
    order of the generated CSV stable between runs.
    """
    candidates = (os.path.join(a_dir, name) for name in sorted(os.listdir(a_dir)))
    return [path for path in candidates if os.path.isdir(path)]


def gather_results(model_dir: str, task_name: str) -> Dict[str, Any]:
    """Build one result row for the model stored in ``model_dir``.

    The row merges ``training_params.json`` with the flattened
    ``quant/test_results.json`` and adds the size of the saved weights file,
    the in-memory size of parameters and buffers, the parameter count and
    the identification columns ``name`` / ``model_name``.

    Raises:
        ValueError: if ``task_name`` is not a multiemo task name made of
            four underscore-separated parts.
    """
    # Validate the task before touching the disk or loading a model.
    if 'multiemo' not in task_name:
        raise ValueError("Task not found: %s" % task_name)
    name_parts = task_name.split('_')
    if len(name_parts) != 4:
        raise ValueError("Task name must have four '_'-separated parts, got: %s" % task_name)
    _, lang, domain, kind = name_parts

    quant_model_dir = os.path.join(model_dir, 'quant')

    with open(os.path.join(model_dir, 'training_params.json')) as json_file:
        training_data_dict = json.load(json_file)

    with open(os.path.join(quant_model_dir, 'test_results.json')) as json_file:
        test_data = json.load(json_file)
        # Flatten the nested report into a single record with '_'-joined keys.
        [test_data_dict] = pd.json_normalize(test_data, sep='_').to_dict(orient='records')

    data = training_data_dict.copy()
    data.update(test_data_dict)

    data['model_size'] = os.path.getsize(os.path.join(quant_model_dir, 'pytorch_model.bin'))

    processor = MultiemoProcessor(lang, domain, kind)
    num_labels = len(processor.get_labels())

    # Load the saved quantized model with the quantization settings recorded
    # in the training parameters.
    student_config = BertConfig.from_pretrained(
        quant_model_dir,
        quantize_act=True,
        weight_bits=data['weight_bits'],
        input_bits=data['input_bits'],
        clip_val=data['clip_val']
    )
    model = QuantBertForSequenceClassification.from_pretrained(quant_model_dir, config=student_config,
                                                               num_labels=num_labels)

    memory_params = sum(param.nelement() * param.element_size() for param in model.parameters())
    memory_buffers = sum(buf.nelement() * buf.element_size() for buf in model.buffers())
    data['memory'] = memory_params + memory_buffers  # in bytes

    data['parameters'] = sum(p.nelement() for _, p in model.named_parameters())
    data['name'] = os.path.basename(model_dir)
    data['model_name'] = 'TernaryBERT'
    print(data)

    return data


if __name__ == '__main__':
    main()
def get_tensor_data(output_mode, features):
    """Pack a list of feature objects into a ``TensorDataset``.

    Each feature must expose ``label_id``, ``seq_length``, ``input_ids``,
    ``input_mask`` and ``segment_ids``.

    Returns:
        ``(tensor_data, all_label_ids)`` where ``tensor_data`` yields
        ``(input_ids, input_mask, segment_ids, label_ids, seq_lengths)``.

    Raises:
        ValueError: if ``output_mode`` is neither ``"classification"`` nor
            ``"regression"``.
    """
    if output_mode == "classification":
        label_dtype = torch.long
    elif output_mode == "regression":
        label_dtype = torch.float
    else:
        raise ValueError("Unsupported output mode: %s" % output_mode)

    all_label_ids = torch.tensor([f.label_id for f in features], dtype=label_dtype)
    all_seq_lengths = torch.tensor([f.seq_length for f in features], dtype=torch.long)
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    tensor_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_seq_lengths)
    return tensor_data, all_label_ids


def do_eval(model, task_name, eval_dataloader,
            device, output_mode, eval_labels, num_labels):
    """Run ``model`` over ``eval_dataloader`` and compute the task metrics.

    Returns:
        ``(result, all_logits)``: the metrics dictionary produced by
        ``compute_metrics`` with the mean batch loss added under
        ``'eval_loss'``, and the logits of all batches stacked along axis 0
        (squeezed for regression).

    Raises:
        ValueError: if ``output_mode`` is unsupported or the dataloader
            yields no batches.
    """
    if output_mode == "classification":
        loss_fct = CrossEntropyLoss()
    elif output_mode == "regression":
        loss_fct = MSELoss()
    else:
        raise ValueError("Unsupported output mode: %s" % output_mode)

    eval_loss = 0
    nb_eval_steps = 0
    logits_chunks = []

    for batch_ in tqdm(eval_dataloader):
        batch_ = tuple(t.to(device) for t in batch_)
        with torch.no_grad():
            input_ids, input_mask, segment_ids, label_ids, _ = batch_
            logits, _, _ = model(input_ids, segment_ids, input_mask)

            # create eval loss and other metric required by the task
            if output_mode == "classification":
                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
            else:
                tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))

        eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1
        # Collect per-batch logits and join them once after the loop; growing
        # an array with np.append copies everything on every batch.
        logits_chunks.append(logits.detach().cpu().numpy())

    if nb_eval_steps == 0:
        raise ValueError("eval_dataloader yielded no batches; nothing to evaluate.")

    eval_loss = eval_loss / nb_eval_steps
    all_logits = np.concatenate(logits_chunks, axis=0)

    if output_mode == "regression":
        all_logits = np.squeeze(all_logits)
    result = compute_metrics(task_name, all_logits, eval_labels.numpy())
    result['eval_loss'] = eval_loss
    return result, all_logits


def main():
    """Fine-tune (or only evaluate) a BERT classifier on a multiemo task.

    With ``--do_eval`` the pretrained model is evaluated on the dev set and
    the metrics are logged. Otherwise the model is trained, evaluated on the
    dev set after every epoch (the weights are saved whenever dev accuracy
    improves), the training parameters are written to
    ``training_params.json`` and the model is finally evaluated on the test
    set, writing ``test_results.txt`` and ``test_results.json``.

    Raises:
        ValueError: if the output directory exists and is not empty, or if
            the task name is not a multiemo task.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--pretrained_model",
                        default=None,
                        type=str,
                        help="The pretrained model dir.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case",
                        action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=16,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=16,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument('--weight_decay', '--wd',
                        default=0.01,
                        type=float,
                        metavar='W',
                        help='weight decay')
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")

    # added arguments
    parser.add_argument('--aug_train',
                        action='store_true')
    parser.add_argument('--eval_step',
                        type=int,
                        default=50)

    args = parser.parse_args()
    logger.info('The args: {}'.format(args))

    # intermediate distillation default parameters
    default_params = {
        "multiemo": {"num_train_epochs": 3, "max_seq_length": 128},
    }

    # Prepare devices
    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = torch.cuda.device_count()

    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO)
    logger.info("device: {} n_gpu: {}".format(device, n_gpu))

    # Prepare seed
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    # Prepare task settings
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))

    os.makedirs(args.output_dir, exist_ok=True)

    task_name = args.task_name.lower()
    # NOTE(review): task names are split into four '_'-separated parts below,
    # so a full task name does not equal the "multiemo" key and these
    # overrides do not fire for such names. The attributes written here
    # (`max_seq_len`, `num_train_epoch`) also differ from the ones read later
    # (`max_seq_length`, `num_train_epochs`). Left unchanged so that CLI
    # values are not silently overridden -- confirm the intent.
    if task_name in default_params:
        args.max_seq_len = default_params[task_name]["max_seq_length"]

    if not args.do_eval:
        if task_name in default_params:
            args.num_train_epoch = default_params[task_name]["num_train_epochs"]

    if 'multiemo' in task_name:
        _, lang, domain, kind = task_name.split('_')
        processor = MultiemoProcessor(lang, domain, kind)
    else:
        raise ValueError("Task not found: %s" % task_name)

    if 'multiemo' in task_name:
        output_mode = 'classification'
    else:
        raise ValueError("Task not found: %s" % task_name)

    label_list = processor.get_labels()
    num_labels = len(label_list)

    tokenizer = BertTokenizer.from_pretrained(args.pretrained_model, do_lower_case=args.do_lower_case)

    if not args.do_eval:
        if not args.aug_train:
            train_examples = processor.get_train_examples(args.data_dir)
        else:
            train_examples = processor.get_aug_examples(args.data_dir)

        # Only full batches are counted; the training loop skips a short
        # final batch.
        num_train_optimization_steps = int(len(train_examples) / args.train_batch_size) * args.num_train_epochs

        train_features = convert_examples_to_features(train_examples, label_list,
                                                      args.max_seq_length, tokenizer, output_mode)
        train_data, _ = get_tensor_data(output_mode, train_features)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

    eval_examples = processor.get_dev_examples(args.data_dir)
    eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
    eval_data, eval_labels = get_tensor_data(output_mode, eval_features)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

    model = BertForSequenceClassification.from_pretrained(args.pretrained_model, num_labels=num_labels)
    model.to(device)
    if args.do_eval:
        logger.info("***** Running evaluation *****")
        logger.info(" Num examples = %d", len(eval_examples))
        logger.info(" Batch size = %d", args.eval_batch_size)

        model.eval()
        result, _ = do_eval(model, task_name, eval_dataloader,
                            device, output_mode, eval_labels, num_labels)
        logger.info("***** Eval results *****")
        for key in sorted(result.keys()):
            logger.info(" %s = %s", key, str(result[key]))
    else:
        training_start_time = time.monotonic()

        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_examples))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_optimization_steps)
        if n_gpu > 1:
            model = torch.nn.DataParallel(model)

        optimizer = get_optimizer(args, model, num_train_optimization_steps)

        # Train and evaluate
        global_step = 0
        best_dev_acc = 0.0
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")

        for epoch_ in range(int(args.num_train_epochs)):
            tr_loss = 0.
            tr_cls_loss = 0.

            model.train()
            nb_tr_steps = 0

            for batch in tqdm(train_dataloader, f"Epoch {epoch_ + 1}: ", ascii=True):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids, _ = batch
                # Skip the short final batch.
                if input_ids.size()[0] != args.train_batch_size:
                    continue

                cls_loss = 0.
                logits, _, _ = model(input_ids, segment_ids, input_mask)

                if output_mode == "classification":
                    loss_fct = CrossEntropyLoss()
                    cls_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
                elif output_mode == "regression":
                    loss_mse = MSELoss()
                    cls_loss = loss_mse(logits.view(-1), label_ids.view(-1))

                loss = cls_loss
                tr_cls_loss += cls_loss.item()

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.

                loss.backward()
                tr_loss += loss.item()
                nb_tr_steps += 1

                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

            logger.info("***** Running evaluation *****")
            logger.info(" Epoch = {} iter {} step".format(epoch_, global_step))
            logger.info(" Num examples = %d", len(eval_examples))
            logger.info(" Batch size = %d", args.eval_batch_size)

            model.eval()

            # Average training losses over the steps of this epoch.
            loss = tr_loss / nb_tr_steps
            cls_loss = tr_cls_loss / nb_tr_steps

            result, _ = do_eval(model, task_name, eval_dataloader,
                                device, output_mode, eval_labels, num_labels)
            result['epoch'] = epoch_ + 1
            result['global_step'] = global_step
            result['cls_loss'] = cls_loss
            result['loss'] = loss
            result_to_text_file(result, output_eval_file)

            save_model = False

            if result['acc'] > best_dev_acc:
                best_dev_acc = result['acc']
                save_model = True

            if save_model:
                logger.info("***** Save model *****")
                # Unwrap DataParallel so the saved keys carry no 'module.' prefix.
                model_to_save = model.module if hasattr(model, 'module') else model

                model_name = WEIGHTS_NAME
                output_model_file = os.path.join(args.output_dir, model_name)
                output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

                torch.save(model_to_save.state_dict(), output_model_file)
                model_to_save.config.to_json_file(output_config_file)
                tokenizer.save_vocabulary(args.output_dir)

            model.train()

        # Measure End Time
        training_end_time = time.monotonic()

        diff = timedelta(seconds=training_end_time - training_start_time)
        diff_seconds = diff.total_seconds()

        # vars() returns the namespace's own dict, so the timing is stored on
        # args as well.
        training_parameters = vars(args)
        training_parameters['training_time'] = diff_seconds

        output_training_params_file = os.path.join(args.output_dir, "training_params.json")
        dictionary_to_json(training_parameters, output_training_params_file)

        #########################
        #       Test model      #
        #########################
        # NOTE(review): the in-memory model (weights after the last epoch) is
        # tested here, while the checkpoint on disk is the one with the best
        # dev accuracy -- confirm whether the best checkpoint should be
        # reloaded before testing.
        test_examples = processor.get_test_examples(args.data_dir)
        test_features = convert_examples_to_features(test_examples, label_list, args.max_seq_length, tokenizer,
                                                     output_mode)

        test_data, test_labels = get_tensor_data(output_mode, test_features)
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.eval_batch_size)

        logger.info("\n***** Running evaluation on test dataset *****")
        logger.info(" Num examples = %d", len(test_features))
        logger.info(" Batch size = %d", args.eval_batch_size)

        eval_start_time = time.monotonic()
        model.eval()
        result, y_logits = do_eval(model, task_name, test_dataloader,
                                   device, output_mode, test_labels, num_labels)
        eval_end_time = time.monotonic()

        diff = timedelta(seconds=eval_end_time - eval_start_time)
        diff_seconds = diff.total_seconds()
        result['eval_time'] = diff_seconds
        result_to_text_file(result, os.path.join(args.output_dir, "test_results.txt"))

        y_pred = np.argmax(y_logits, axis=1)
        print('\n\t**** Classification report ****\n')
        print(classification_report(test_labels.numpy(), y_pred, target_names=label_list))

        report = classification_report(test_labels.numpy(), y_pred, target_names=label_list, output_dict=True)
        report['eval_time'] = diff_seconds
        dictionary_to_json(report, os.path.join(args.output_dir, "test_results.json"))


def get_optimizer(args, model, num_train_optimization_steps):
    """Create the ``BertAdam`` optimizer used for fine-tuning.

    Logs every parameter name and the total parameter count, then builds two
    parameter groups: parameters whose name contains one of the ``no_decay``
    markers get no weight decay, all others use ``args.weight_decay``.
    """
    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    size = 0
    for n, p in param_optimizer:
        logger.info('n: {}'.format(n))
        size += p.nelement()
    logger.info('Total parameters: {}'.format(size))
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    schedule = 'warmup_linear'
    optimizer = BertAdam(
        optimizer_grouped_parameters,
        schedule=schedule,
        lr=args.learning_rate,
        warmup=args.warmup_proportion,
        t_total=num_train_optimization_steps
    )
    return optimizer


if __name__ == "__main__":
    main()
batch_ in enumerate(eval_dataloader): batch_ = tuple(t.to(device) for t in batch_) with torch.no_grad(): input_ids, input_mask, segment_ids, label_ids, seq_lengths = batch_ @@ -78,11 +80,13 @@ def do_eval(model, task_name, eval_dataloader, result['eval_loss'] = eval_loss return result + def soft_cross_entropy(predicts, targets): student_likelihood = torch.nn.functional.log_softmax(predicts, dim=-1) targets_prob = torch.nn.functional.softmax(targets, dim=-1) return (- targets_prob * student_likelihood).mean() + def main(): parser = argparse.ArgumentParser() parser.add_argument("--data_dir", @@ -109,7 +113,7 @@ def main(): default='output', type=str, help="The output directory where the model predictions and checkpoints will be written.") - + parser.add_argument("--learning_rate", default=2e-5, type=float, @@ -122,7 +126,7 @@ def main(): type=int, default=42, help="random seed for initialization") - + parser.add_argument('--aug_train', action='store_false', help="Whether to use augmented data or not") @@ -142,7 +146,7 @@ def main(): parser.add_argument("--weight_bits", default=2, type=int, - choices=[2,8], + choices=[2, 8], help="Quantization bits for weight.") parser.add_argument("--input_bits", default=8, @@ -158,17 +162,17 @@ def main(): summaryWriter = SummaryWriter(args.output_dir) logger.info('The args: {}'.format(args)) task_name = args.task_name.lower() - data_dir = os.path.join(args.data_dir,task_name) - output_dir = os.path.join(args.output_dir,task_name) + data_dir = os.path.join(args.data_dir, task_name) + output_dir = os.path.join(args.output_dir, task_name) # processed_data_dir = os.path.join(args.data_dir,'preprocessed',task_name) if not os.path.exists(output_dir): os.mkdir(output_dir) - + if args.student_model is None: - args.student_model = os.path.join(args.model_dir,task_name) + args.student_model = os.path.join(args.model_dir, task_name) if args.teacher_model is None: - args.teacher_model = os.path.join(args.model_dir,task_name) + 
args.teacher_model = os.path.join(args.model_dir, task_name) processors = { "cola": ColaProcessor, @@ -194,14 +198,14 @@ def main(): } default_params = { - "cola": {"max_seq_length": 64,"batch_size":16,"eval_step":50}, - "mnli": {"max_seq_length": 128,"batch_size":32,"eval_step":1000}, - "mrpc": {"max_seq_length": 128,"batch_size":32,"eval_step":200}, - "sst-2": {"max_seq_length": 64,"batch_size":32,"eval_step":200}, - "sts-b": {"max_seq_length": 128,"batch_size":32,"eval_step":50}, - "qqp": {"max_seq_length": 128,"batch_size":32,"eval_step":1000}, - "qnli": {"max_seq_length": 128,"batch_size":32,"eval_step":1000}, - "rte": {"max_seq_length": 128,"batch_size":32,"eval_step":100} + "cola": {"max_seq_length": 64, "batch_size": 16, "eval_step": 50}, + "mnli": {"max_seq_length": 128, "batch_size": 32, "eval_step": 1000}, + "mrpc": {"max_seq_length": 128, "batch_size": 32, "eval_step": 200}, + "sst-2": {"max_seq_length": 64, "batch_size": 32, "eval_step": 200}, + "sts-b": {"max_seq_length": 128, "batch_size": 32, "eval_step": 50}, + "qqp": {"max_seq_length": 128, "batch_size": 32, "eval_step": 1000}, + "qnli": {"max_seq_length": 128, "batch_size": 32, "eval_step": 1000}, + "rte": {"max_seq_length": 128, "batch_size": 32, "eval_step": 100} } acc_tasks = ["mnli", "mrpc", "sst-2", "qqp", "qnli", "rte"] @@ -218,11 +222,11 @@ def main(): torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) - + if task_name in default_params: args.batch_size = default_params[task_name]["batch_size"] if n_gpu > 0: - args.batch_size = int(args.batch_size*n_gpu) + args.batch_size = int(args.batch_size * n_gpu) args.max_seq_length = default_params[task_name]["max_seq_length"] args.eval_step = default_params[task_name]["eval_step"] @@ -232,35 +236,36 @@ def main(): num_labels = len(label_list) tokenizer = BertTokenizer.from_pretrained(args.student_model, do_lower_case=True) - + if args.aug_train: try: - train_file = os.path.join(processed_data_dir,'aug_data') - 
train_features = pickle.load(open(train_file,'rb')) + train_file = os.path.join(processed_data_dir, 'aug_data') + train_features = pickle.load(open(train_file, 'rb')) except: train_examples = processor.get_aug_examples(data_dir) train_features = convert_examples_to_features(train_examples, label_list, - args.max_seq_length, tokenizer, output_mode) + args.max_seq_length, tokenizer, output_mode) else: try: - train_file = os.path.join(processed_data_dir,'train_data') - train_features = pickle.load(open(train_file,'rb')) + train_file = os.path.join(processed_data_dir, 'train_data') + train_features = pickle.load(open(train_file, 'rb')) except: train_examples = processor.get_train_examples(data_dir) train_features = convert_examples_to_features(train_examples, label_list, - args.max_seq_length, tokenizer, output_mode) + args.max_seq_length, tokenizer, output_mode) num_train_optimization_steps = int(len(train_features) / args.batch_size) * args.num_train_epochs train_data, _ = get_tensor_data(output_mode, train_features) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.batch_size) - + try: - dev_file = train_file = os.path.join(processed_data_dir,'dev_data') - eval_features = pickle.load(open(dev_file,'rb')) + dev_file = train_file = os.path.join(processed_data_dir, 'dev_data') + eval_features = pickle.load(open(dev_file, 'rb')) except: eval_examples = processor.get_dev_examples(data_dir) - eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) + eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, + output_mode) eval_data, eval_labels = get_tensor_data(output_mode, eval_features) eval_sampler = SequentialSampler(eval_data) @@ -268,13 +273,13 @@ def main(): if task_name == "mnli": processor = processors["mnli-mm"]() try: - dev_mm_file = train_file = 
os.path.join(processed_data_dir,'dev-mm_data') - mm_eval_features = pickle.load(open(dev_mm_file,'rb')) + dev_mm_file = train_file = os.path.join(processed_data_dir, 'dev-mm_data') + mm_eval_features = pickle.load(open(dev_mm_file, 'rb')) except: mm_eval_examples = processor.get_dev_examples(data_dir) mm_eval_features = convert_examples_to_features( mm_eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) - + mm_eval_data, mm_eval_labels = get_tensor_data(output_mode, mm_eval_features) logger.info(" Num examples = %d", len(mm_eval_features)) @@ -289,11 +294,11 @@ def main(): teacher_model = torch.nn.DataParallel(teacher_model) result = do_eval(teacher_model, task_name, eval_dataloader, - device, output_mode, eval_labels, num_labels) + device, output_mode, eval_labels, num_labels) if task_name in acc_tasks: - if task_name in ['sst-2','mnli','qnli','rte']: + if task_name in ['sst-2', 'mnli', 'qnli', 'rte']: fp32_performance = f"acc:{result['acc']}" - elif task_name in ['mrpc','qqp']: + elif task_name in ['mrpc', 'qqp']: fp32_performance = f"f1/acc:{result['f1']}/{result['acc']}" if task_name in corr_tasks: fp32_performance = f"pearson/spearmanr:{result['pearson']}/{result['spearmanr']}" @@ -303,15 +308,16 @@ def main(): if task_name == "mnli": result = do_eval(teacher_model, 'mnli-mm', mm_eval_dataloader, - device, output_mode, mm_eval_labels, num_labels) + device, output_mode, mm_eval_labels, num_labels) fp32_performance += f" mm-acc:{result['acc']}" - fp32_performance = task_name +' fp32 ' + fp32_performance - student_config = BertConfig.from_pretrained(args.teacher_model, + fp32_performance = task_name + ' fp32 ' + fp32_performance + student_config = BertConfig.from_pretrained(args.teacher_model, quantize_act=True, - weight_bits = args.weight_bits, - input_bits = args.input_bits, - clip_val = args.clip_val) - student_model = QuantBertForSequenceClassification.from_pretrained(args.student_model, config = student_config, num_labels=num_labels) + 
weight_bits=args.weight_bits, + input_bits=args.input_bits, + clip_val=args.clip_val) + student_model = QuantBertForSequenceClassification.from_pretrained(args.student_model, config=student_config, + num_labels=num_labels) student_model.to(device) logger.info("***** Running training *****") @@ -320,7 +326,7 @@ def main(): logger.info(" Num steps = %d", num_train_optimization_steps) if n_gpu > 1: student_model = torch.nn.DataParallel(student_model) - + # Prepare optimizer param_optimizer = list(student_model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] @@ -330,15 +336,15 @@ def main(): ] schedule = 'warmup_linear' optimizer = BertAdam(optimizer_grouped_parameters, - schedule=schedule, - lr=args.learning_rate, - warmup=0.1, - t_total=num_train_optimization_steps) + schedule=schedule, + lr=args.learning_rate, + warmup=0.1, + t_total=num_train_optimization_steps) loss_mse = MSELoss() global_step = 0 best_dev_acc = 0.0 previous_best = None - + tr_loss = 0. tr_att_loss = 0. tr_rep_loss = 0. 
@@ -359,10 +365,10 @@ def main(): with torch.no_grad(): teacher_logits, teacher_atts, teacher_reps = teacher_model(input_ids, segment_ids, input_mask) - + if args.pred_distill: if output_mode == "classification": - cls_loss = soft_cross_entropy(student_logits,teacher_logits) + cls_loss = soft_cross_entropy(student_logits, teacher_logits) elif output_mode == "regression": cls_loss = loss_mse(student_logits, teacher_logits) @@ -372,9 +378,9 @@ def main(): if args.intermediate_distill: for student_att, teacher_att in zip(student_atts, teacher_atts): student_att = torch.where(student_att <= -1e2, torch.zeros_like(student_att).to(device), - student_att) + student_att) teacher_att = torch.where(teacher_att <= -1e2, torch.zeros_like(teacher_att).to(device), - teacher_att) + teacher_att) tmp_loss = loss_mse(student_att, teacher_att) att_loss += tmp_loss @@ -397,7 +403,7 @@ def main(): tr_loss += loss.item() nb_tr_examples += label_ids.size(0) nb_tr_steps += 1 - if global_step % args.eval_step == 0 or global_step == num_train_optimization_steps-1: + if global_step % args.eval_step == 0 or global_step == num_train_optimization_steps - 1: logger.info("***** Running evaluation *****") logger.info(" {} step of {} steps".format(global_step, num_train_optimization_steps)) if previous_best is not None: @@ -411,34 +417,34 @@ def main(): rep_loss = tr_rep_loss / (step + 1) result = do_eval(student_model, task_name, eval_dataloader, - device, output_mode, eval_labels, num_labels) + device, output_mode, eval_labels, num_labels) result['global_step'] = global_step result['cls_loss'] = cls_loss result['att_loss'] = att_loss result['rep_loss'] = rep_loss result['loss'] = loss - summaryWriter.add_scalar('total_loss',loss,global_step) - summaryWriter.add_scalars('distill_loss',{'att_loss':att_loss, - 'rep_loss':rep_loss, - 'cls_loss':cls_loss},global_step) - - if task_name=='cola': - summaryWriter.add_scalar('mcc',result['mcc'],global_step) - elif task_name in 
['sst-2','mnli','mnli-mm','qnli','rte','wnli']: - summaryWriter.add_scalar('acc',result['acc'],global_step) - elif task_name in ['mrpc','qqp']: - summaryWriter.add_scalars('performance',{'acc':result['acc'], - 'f1':result['f1'], - 'acc_and_f1':result['acc_and_f1']},global_step) + summaryWriter.add_scalar('total_loss', loss, global_step) + summaryWriter.add_scalars('distill_loss', {'att_loss': att_loss, + 'rep_loss': rep_loss, + 'cls_loss': cls_loss}, global_step) + + if task_name == 'cola': + summaryWriter.add_scalar('mcc', result['mcc'], global_step) + elif task_name in ['sst-2', 'mnli', 'mnli-mm', 'qnli', 'rte', 'wnli']: + summaryWriter.add_scalar('acc', result['acc'], global_step) + elif task_name in ['mrpc', 'qqp']: + summaryWriter.add_scalars('performance', {'acc': result['acc'], + 'f1': result['f1'], + 'acc_and_f1': result['acc_and_f1']}, global_step) else: - summaryWriter.add_scalar('corr',result['corr'],global_step) + summaryWriter.add_scalar('corr', result['corr'], global_step) save_model = False if task_name in acc_tasks and result['acc'] > best_dev_acc: - if task_name in ['sst-2','mnli','qnli','rte']: + if task_name in ['sst-2', 'mnli', 'qnli', 'rte']: previous_best = f"acc:{result['acc']}" - elif task_name in ['mrpc','qqp']: + elif task_name in ['mrpc', 'qqp']: previous_best = f"f1/acc:{result['f1']}/{result['acc']}" best_dev_acc = result['acc'] save_model = True @@ -457,8 +463,8 @@ def main(): # Test mnli-mm if task_name == "mnli": result = do_eval(student_model, 'mnli-mm', mm_eval_dataloader, - device, output_mode, mm_eval_labels, num_labels) - previous_best+= f"mm-acc:{result['acc']}" + device, output_mode, mm_eval_labels, num_labels) + previous_best += f"mm-acc:{result['acc']}" logger.info(fp32_performance) logger.info(previous_best) if args.save_fp_model: @@ -478,10 +484,11 @@ def main(): model_to_save = student_model.module if hasattr(student_model, 'module') else student_model quant_model = copy.deepcopy(model_to_save) for name, module in 
quant_model.named_modules(): - if hasattr(module,'weight_quantizer'): - module.weight.data = module.weight_quantizer.apply(module.weight,module.weight_clip_val, - module.weight_bits,True) - + if hasattr(module, 'weight_quantizer'): + module.weight.data = module.weight_quantizer.apply(module.weight, + module.weight_clip_val, + module.weight_bits, True) + output_model_file = os.path.join(output_quant_dir, WEIGHTS_NAME) output_config_file = os.path.join(output_quant_dir, CONFIG_NAME) diff --git a/TernaryBERT/quant_task_multiemo.py b/TernaryBERT/quant_task_multiemo.py new file mode 100644 index 00000000..1726b13f --- /dev/null +++ b/TernaryBERT/quant_task_multiemo.py @@ -0,0 +1,504 @@ +from __future__ import absolute_import, division, print_function + +import argparse +import random +import copy +import time +from datetime import timedelta + +import torch +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset +from torch.nn import CrossEntropyLoss, MSELoss +from sklearn.metrics import classification_report +from tqdm import trange, tqdm + +from transformer import BertForSequenceClassification, WEIGHTS_NAME, CONFIG_NAME +from transformer.modeling_quant import BertForSequenceClassification as QuantBertForSequenceClassification +from transformer import BertTokenizer +from transformer import BertAdam +from transformer import BertConfig +from utils_multiemo import * +from utils import dictionary_to_json, result_to_text_file + +log_format = '%(asctime)s %(message)s' +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt='%m/%d %I:%M:%S %p') +logger = logging.getLogger() + + +def get_tensor_data(output_mode, features): + if output_mode == "classification": + all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long) + elif output_mode == "regression": + all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float) + + all_seq_lengths = torch.tensor([f.seq_length for f in 
def do_eval(model, task_name, eval_dataloader,
            device, output_mode, eval_labels, num_labels):
    """Run ``model`` over ``eval_dataloader`` and compute the task metrics.

    Args:
        model: callable invoked as ``model(input_ids, segment_ids, input_mask)``;
            the first element of its 3-tuple result is used as the logits.
        task_name: forwarded unchanged to ``compute_metrics``.
        eval_dataloader: iterable of 5-tuples
            ``(input_ids, input_mask, segment_ids, label_ids, seq_lengths)``.
        device: device every batch tensor is moved to before the forward pass.
        output_mode: ``"classification"`` or ``"regression"``.
        eval_labels: tensor of gold labels, converted with ``.numpy()``.
        num_labels: number of classes (used to reshape classification logits).

    Returns:
        ``(result, all_logits)`` where ``result`` is the dict returned by
        ``compute_metrics`` extended with ``'eval_loss'`` (mean batch loss) and
        ``all_logits`` is the numpy array of logits for the whole set.

    Raises:
        ValueError: for an unsupported ``output_mode`` or when the dataloader
            yields no batches.
    """
    # Validate once up front and build the loss a single time instead of
    # re-instantiating it for every batch.
    if output_mode == "classification":
        loss_fct = CrossEntropyLoss()
    elif output_mode == "regression":
        loss_fct = MSELoss()
    else:
        raise ValueError("Unsupported output mode: %s" % output_mode)

    eval_loss = 0.0
    nb_eval_steps = 0
    logit_chunks = []

    for batch_ in tqdm(eval_dataloader):
        batch_ = tuple(t.to(device) for t in batch_)
        input_ids, input_mask, segment_ids, label_ids, seq_lengths = batch_
        with torch.no_grad():
            logits, _, _ = model(input_ids, segment_ids, input_mask)

        if output_mode == "classification":
            tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
        else:
            tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))

        eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1
        logit_chunks.append(logits.detach().cpu().numpy())

    if not logit_chunks:
        raise ValueError("eval_dataloader yielded no batches")

    # One concatenation at the end instead of np.append per batch, which
    # re-copies the accumulated array every time.
    all_logits = np.concatenate(logit_chunks, axis=0)
    eval_loss = eval_loss / nb_eval_steps

    if output_mode == "regression":
        all_logits = np.squeeze(all_logits)
    result = compute_metrics(task_name, all_logits, eval_labels.numpy())
    result['eval_loss'] = eval_loss
    return result, all_logits


def soft_cross_entropy(predicts, targets):
    """Return the soft-label cross entropy between student and teacher logits.

    ``targets`` is turned into probabilities with a softmax over the last
    dimension, ``predicts`` into log-probabilities; the element-wise product is
    negated and averaged over *all* elements (not summed over classes).
    """
    student_likelihood = torch.nn.functional.log_softmax(predicts, dim=-1)
    targets_prob = torch.nn.functional.softmax(targets, dim=-1)
    return (- targets_prob * student_likelihood).mean()


def _build_arg_parser():
    """Create the command-line parser used by ``main``."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir",
                        default='data',
                        type=str,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--model_dir",
                        default='models/tinybert',
                        type=str,
                        help="The model dir.")
    parser.add_argument("--teacher_model",
                        default=None,
                        type=str,
                        help="The models directory.")
    parser.add_argument("--student_model",
                        default=None,
                        type=str,
                        help="The models directory.")
    parser.add_argument("--task_name",
                        type=str,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir",
                        default='output',
                        type=str,
                        help="The output directory where the model predictions and checkpoints will be written.")

    parser.add_argument("--do_lower_case",
                        action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--learning_rate",
                        default=2e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument('--weight_decay', '--wd',
                        default=0.01,
                        type=float,
                        metavar='W',
                        help='weight decay')

    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")

    parser.add_argument('--aug_train',
                        action='store_true',
                        help="Whether to use augmented data or not")
    parser.add_argument('--pred_distill',
                        action='store_true',
                        help="Whether to distil with task layer")
    parser.add_argument('--intermediate_distill',
                        action='store_true',
                        help="Whether to distil with intermediate layers")
    parser.add_argument('--save_fp_model',
                        action='store_true',
                        help="Whether to save fp32 model")
    parser.add_argument('--save_quantized_model',
                        action='store_true',
                        help="Whether to save quantized model")

    parser.add_argument("--weight_bits",
                        default=2,
                        type=int,
                        choices=[2, 8],
                        help="Quantization bits for weight.")
    parser.add_argument("--input_bits",
                        default=8,
                        type=int,
                        help="Quantization bits for activation.")
    parser.add_argument("--clip_val",
                        default=2.5,
                        type=float,
                        help="Initial clip value.")
    return parser


def _parse_multiemo_task(task_name):
    """Split ``multiemo_<lang>_<domain>_<kind>`` into ``(lang, domain, kind)``.

    Raises:
        ValueError: if the name does not contain ``multiemo`` or does not have
            exactly four ``_``-separated parts.
    """
    parts = task_name.split('_')
    if 'multiemo' not in task_name or len(parts) != 4:
        raise ValueError("Task not found: %s" % task_name)
    return parts[1], parts[2], parts[3]


def _save_student(student_model, tokenizer, args, output_dir):
    """Write the student checkpoint(s) selected by the ``--save_*`` flags.

    The full-precision weights go to ``output_dir``; the weight-quantized copy
    goes to ``output_dir/quant``. Config and vocabulary are saved next to each.
    """
    # Unwrap DataParallel so the saved state dict has no 'module.' prefix.
    model_to_save = student_model.module if hasattr(student_model, 'module') else student_model

    if args.save_fp_model:
        logger.info("******************** Save full precision model ********************")
        output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(output_dir)

    if args.save_quantized_model:
        logger.info("******************** Save quantized model ********************")
        output_quant_dir = os.path.join(output_dir, 'quant')
        os.makedirs(output_quant_dir, exist_ok=True)

        # Quantize a copy so training continues on the latent fp weights.
        quant_model = copy.deepcopy(model_to_save)
        for name, module in quant_model.named_modules():
            if hasattr(module, 'weight_quantizer'):
                module.weight.data = module.weight_quantizer.apply(
                    module.weight,
                    module.weight_clip_val,
                    module.weight_bits, True
                )

        output_model_file = os.path.join(output_quant_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(output_quant_dir, CONFIG_NAME)

        torch.save(quant_model.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(output_quant_dir)


def main():
    """Distil a fine-tuned BERT teacher into a quantized student on MultiEmo.

    Steps: parse arguments, evaluate the teacher on the dev set, train the
    quantized student with prediction and/or intermediate-layer distillation,
    evaluate on the dev set after every epoch (saving on accuracy improvement),
    write the training parameters, and finally evaluate every saved checkpoint
    on the test set.
    """
    parser = _build_arg_parser()
    args = parser.parse_args()
    # parser.error instead of assert: asserts disappear under ``python -O``.
    if not (args.pred_distill or args.intermediate_distill):
        parser.error("'pred_distill' and 'intermediate_distill', at least one must be True")
    if not args.task_name:
        parser.error("--task_name is required")
    logger.info('The args: {}'.format(args))

    task_name = args.task_name.lower()
    # Validate the task name before creating directories or loading models.
    lang, domain, kind = _parse_multiemo_task(task_name)
    data_dir = args.data_dir
    output_dir = os.path.join(args.output_dir, task_name)

    os.makedirs(output_dir, exist_ok=True)

    if args.student_model is None:
        args.student_model = os.path.join(args.model_dir, task_name)
    if args.teacher_model is None:
        args.teacher_model = os.path.join(args.model_dir, task_name)

    default_params = {"max_seq_length": 128, "batch_size": 16}
    output_mode = "classification"

    # Prepare devices
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()

    # Prepare seed
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    # The batch size is the per-device default scaled by the number of GPUs.
    args.batch_size = default_params["batch_size"]
    if n_gpu > 0:
        args.batch_size = int(args.batch_size * n_gpu)
    args.max_seq_length = default_params["max_seq_length"]

    processor = MultiemoProcessor(lang, domain, kind)
    label_list = processor.get_labels()
    num_labels = len(label_list)

    tokenizer = BertTokenizer.from_pretrained(args.student_model, do_lower_case=args.do_lower_case)

    if args.aug_train:
        train_examples = processor.get_aug_examples(data_dir)
    else:
        train_examples = processor.get_train_examples(data_dir)
    train_features = convert_examples_to_features(train_examples, label_list,
                                                  args.max_seq_length, tokenizer, output_mode)

    num_train_optimization_steps = int(len(train_features) / args.batch_size) * args.num_train_epochs
    train_data, _ = get_tensor_data(output_mode, train_features)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.batch_size)

    eval_examples = processor.get_dev_examples(data_dir)
    eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer,
                                                 output_mode)
    eval_data, eval_labels = get_tensor_data(output_mode, eval_features)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)

    teacher_model = BertForSequenceClassification.from_pretrained(args.teacher_model, num_labels=num_labels)
    teacher_model.to(device)
    teacher_model.eval()
    if n_gpu > 1:
        teacher_model = torch.nn.DataParallel(teacher_model)

    result, _ = do_eval(teacher_model, task_name, eval_dataloader,
                        device, output_mode, eval_labels, num_labels)

    fp32_performance = f"f1/acc:{result['f1']}/{result['acc']}"
    fp32_performance = task_name + ' fp32 ' + fp32_performance

    student_config = BertConfig.from_pretrained(
        args.teacher_model,
        quantize_act=True,
        weight_bits=args.weight_bits,
        input_bits=args.input_bits,
        clip_val=args.clip_val
    )
    student_model = QuantBertForSequenceClassification.from_pretrained(args.student_model,
                                                                       config=student_config,
                                                                       num_labels=num_labels)
    student_model.to(device)

    training_start_time = time.monotonic()

    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_features))
    logger.info(" Batch size = %d", args.batch_size)
    logger.info(" Num steps = %d", num_train_optimization_steps)
    if n_gpu > 1:
        student_model = torch.nn.DataParallel(student_model)

    optimizer = get_optimizer(args, num_train_optimization_steps, student_model)
    loss_mse = MSELoss()
    global_step = 0
    best_dev_acc = 0.0
    previous_best = None
    output_eval_file = os.path.join(output_dir, "eval_results.txt")

    for epoch_ in trange(int(args.num_train_epochs)):
        # Reset the accumulators together with the step counter so the values
        # reported below are true per-epoch means. Previously only the step
        # counter was reset, inflating the averages from the second epoch on.
        tr_loss = 0.
        tr_att_loss = 0.
        tr_rep_loss = 0.
        tr_cls_loss = 0.
        nb_tr_steps = 0

        student_model.train()
        for batch in tqdm(train_dataloader, f"Epoch {epoch_ + 1}: ", ascii=True):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids, seq_lengths = batch
            att_loss = 0.
            rep_loss = 0.
            cls_loss = 0.
            loss = 0.

            student_logits, student_atts, student_reps = student_model(input_ids, segment_ids, input_mask)

            with torch.no_grad():
                teacher_logits, teacher_atts, teacher_reps = teacher_model(input_ids, segment_ids, input_mask)

            if args.pred_distill:
                if output_mode == "classification":
                    cls_loss = soft_cross_entropy(student_logits, teacher_logits)
                elif output_mode == "regression":
                    cls_loss = loss_mse(student_logits, teacher_logits)

                loss = cls_loss
                tr_cls_loss += cls_loss.item()

            if args.intermediate_distill:
                for student_att, teacher_att in zip(student_atts, teacher_atts):
                    # Zero out entries <= -1e2 on both sides before comparing
                    # (presumably masked attention scores — TODO confirm).
                    student_att = torch.where(student_att <= -1e2, torch.zeros_like(student_att).to(device),
                                              student_att)
                    teacher_att = torch.where(teacher_att <= -1e2, torch.zeros_like(teacher_att).to(device),
                                              teacher_att)
                    att_loss += loss_mse(student_att, teacher_att)

                for student_rep, teacher_rep in zip(student_reps, teacher_reps):
                    rep_loss += loss_mse(student_rep, teacher_rep)

                loss += rep_loss + att_loss
                tr_att_loss += att_loss.item()
                tr_rep_loss += rep_loss.item()

            if n_gpu > 1:
                loss = loss.mean()

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1

            tr_loss += loss.item()
            nb_tr_steps += 1

        logger.info("***** Running evaluation *****")
        logger.info(" {} step of {} steps".format(global_step, num_train_optimization_steps))
        if previous_best is not None:
            logger.info(f"{fp32_performance}\nPrevious best = {previous_best}")

        student_model.eval()

        # Guard against an empty training loader (would divide by zero).
        steps_this_epoch = max(nb_tr_steps, 1)

        result, _ = do_eval(student_model, task_name, eval_dataloader,
                            device, output_mode, eval_labels, num_labels)

        result['epoch'] = epoch_ + 1
        result['global_step'] = global_step
        result['cls_loss'] = tr_cls_loss / steps_this_epoch
        result['att_loss'] = tr_att_loss / steps_this_epoch
        result['rep_loss'] = tr_rep_loss / steps_this_epoch
        result['loss'] = tr_loss / steps_this_epoch

        result_to_text_file(result, output_eval_file)

        if result['acc'] > best_dev_acc:
            previous_best = f"f1/acc:{result['f1']}/{result['acc']}"
            best_dev_acc = result['acc']

            logger.info(fp32_performance)
            logger.info(previous_best)
            _save_student(student_model, tokenizer, args, output_dir)

    # Measure End Time
    training_end_time = time.monotonic()

    diff = timedelta(seconds=training_end_time - training_start_time)
    diff_seconds = diff.total_seconds()

    # Copy so the extra key does not leak back into ``args``.
    training_parameters = dict(vars(args))
    training_parameters['training_time'] = diff_seconds

    output_training_params_file = os.path.join(output_dir, "training_params.json")
    dictionary_to_json(training_parameters, output_training_params_file)

    #########################
    #       Test model      #
    #########################
    test_examples = processor.get_test_examples(data_dir)
    test_features = convert_examples_to_features(test_examples, label_list, args.max_seq_length, tokenizer,
                                                 output_mode)

    test_data, test_labels = get_tensor_data(output_mode, test_features)
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.batch_size)

    output_quant_dir = os.path.join(output_dir, 'quant')
    for out_dir in (output_dir, output_quant_dir):
        # Only checkpoints that were written (see _save_student) can be tested;
        # without the matching --save_* flag the directory holds no weights.
        if not os.path.isfile(os.path.join(out_dir, WEIGHTS_NAME)):
            logger.warning("No saved model found in %s; skipping its test evaluation", out_dir)
            continue

        config = BertConfig.from_pretrained(
            out_dir,
            quantize_act=True,
            weight_bits=args.weight_bits,
            input_bits=args.input_bits,
            clip_val=args.clip_val
        )
        model = QuantBertForSequenceClassification.from_pretrained(out_dir, config=config,
                                                                   num_labels=num_labels)
        # do_eval moves every batch to ``device``; the model must live there too.
        model.to(device)
        model.eval()

        logger.info("\n***** Running evaluation on test dataset *****")
        logger.info(" Num examples = %d", len(test_features))
        logger.info(" Batch size = %d", args.batch_size)

        eval_start_time = time.monotonic()
        result, y_logits = do_eval(model, task_name, test_dataloader,
                                   device, output_mode, test_labels, num_labels)
        eval_end_time = time.monotonic()

        diff = timedelta(seconds=eval_end_time - eval_start_time)
        diff_seconds = diff.total_seconds()
        result['eval_time'] = diff_seconds
        result_to_text_file(result, os.path.join(out_dir, "test_results.txt"))

        y_pred = np.argmax(y_logits, axis=1)
        print('\n\t**** Classification report ****\n')
        print(classification_report(test_labels.numpy(), y_pred, target_names=label_list))

        report = classification_report(test_labels.numpy(), y_pred, target_names=label_list, output_dict=True)
        report['eval_time'] = diff_seconds
        dictionary_to_json(report, os.path.join(out_dir, "test_results.json"))


def get_optimizer(args, num_train_optimization_steps, student_model):
    """Build the ``BertAdam`` optimizer for the student model.

    Parameters whose name contains ``bias``, ``LayerNorm.bias`` or
    ``LayerNorm.weight`` get no weight decay; all others use
    ``args.weight_decay``. The optimizer is configured with the
    ``'warmup_linear'`` schedule, ``warmup=0.1`` and
    ``t_total=num_train_optimization_steps``.
    """
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    decayed, not_decayed = [], []
    for param_name, param in student_model.named_parameters():
        if any(nd in param_name for nd in no_decay):
            not_decayed.append(param)
        else:
            decayed.append(param)

    optimizer_grouped_parameters = [
        {'params': decayed, 'weight_decay': args.weight_decay},
        {'params': not_decayed, 'weight_decay': 0.0}
    ]
    return BertAdam(
        optimizer_grouped_parameters,
        schedule='warmup_linear',
        lr=args.learning_rate,
        warmup=0.1,
        t_total=num_train_optimization_steps
    )
def main():
    """Run the TernaryBERT MultiEmo pipeline end to end.

    Changes into ``PROJECT_FOLDER`` and then, each as a ``python3`` subprocess:
    downloads the MultiEmo data and the bert-base-uncased model when their
    folders are missing, fine-tunes BERT when no fine-tuned model folder
    exists, trains the ternary student, and (when the module-level
    ``evaluate`` flag is set) evaluates it.
    """
    print(PROJECT_FOLDER)
    os.chdir(PROJECT_FOLDER)

    if not os.path.exists(os.path.join(DATA_FOLDER, 'multiemo2')):
        logger.info("Downloading Multiemo data")
        cmd = 'python3 scripts/download_dataset.py --data_dir data/multiemo2'
        run_process(cmd)
        logger.info("Downloading finished")

    if not os.path.exists(os.path.join(DATA_FOLDER, 'models', 'bert-base-uncased')):
        logger.info("Downloading bert-base-uncased model")
        cmd = 'python3 download_bert_base.py'
        run_process(cmd)
        logger.info("Downloading finished")

    if not os.path.exists(os.path.join(DATA_FOLDER, 'models', 'bert-base-uncased', 'multiemo_en_all_sentence')):
        cmd = 'python3 multiemo_fine_tune_bert.py '
        options = [
            '--pretrained_model', 'data/models/bert-base-uncased',
            '--data_dir', 'data/multiemo2',
            '--task_name', 'multiemo_en_all_sentence',
            '--output_dir', 'data/models/bert-base-uncased/multiemo_en_all_sentence',
            '--learning_rate', str(learning_rate),
            '--num_train_epochs', str(num_train_epochs),
            '--weight_decay', str(weight_decay),
            '--train_batch_size', str(batch_size),
            '--do_lower_case'
        ]
        cmd += ' '.join(options)
        logger.info("Training bert-base-uncased for multiemo_en_all_sentence")
        run_process(cmd)

    cmd = 'python3 quant_task_multiemo.py '
    options = [
        '--data_dir', 'data/multiemo2',
        '--model_dir', 'data/models/bert-base-uncased',
        '--task_name', 'multiemo_en_all_sentence',
        '--output_dir', 'data/models/ternarybert',
        '--learning_rate', str(learning_rate),
        '--num_train_epochs', str(num_train_epochs),
        '--weight_decay', str(weight_decay),
        '--weight_bits', str(2),
        '--input_bits', str(8),
        '--pred_distill',
        '--intermediate_distill',
        '--save_fp_model',
        '--save_quantized_model',
        '--do_lower_case'
    ]
    cmd += ' '.join(options)
    logger.info("Training ternarybert for multiemo_en_all_sentence")
    run_process(cmd)

    if evaluate:
        cmd = 'python3 eval_quant_multiemo.py '
        options = [
            '--data_dir', 'data/multiemo2',
            '--model_dir', 'data/models/ternarybert',
            '--task_name', 'multiemo_en_all_sentence',
            # The comma after the output directory matters: without it Python
            # concatenates the adjacent literals into
            # 'data/models/ternarybert--weight_bits'.
            '--output_dir', 'data/models/ternarybert',
            '--weight_bits', str(2),
            '--input_bits', str(8),
            '--do_lower_case'
        ]
        cmd += ' '.join(options)
        logger.info("Evaluating ternarybert for multiemo_en_all_sentence")
        run_process(cmd)

    # cmd = f'python3 -m gather_results --task_name multiemo_en_all_sentence'
    # logger.info(f"Gathering results to csv for multiemo_en_all_sentence")
    # run_process(cmd)


def run_process(proc):
    """Run the command line ``proc`` through the system shell.

    Returns the value of ``os.system``; a non-zero status is logged as a
    warning so a failed pipeline stage is visible. Later stages still run,
    as before.
    """
    status = os.system(proc)
    if status != 0:
        logger.warning("Command exited with status %s: %s", status, proc)
    return status
def result_to_text_file(result: dict, file_name: str, verbose: bool = True) -> None:
    """Append the entries of ``result`` to ``file_name`` as ``key = value`` lines.

    Keys are written in sorted order and values are converted with ``str``.
    The file is opened in append mode, so repeated calls accumulate.
    When ``verbose`` is true every line is also logged at INFO level.
    """
    with open(file_name, "a") as writer:
        if verbose:
            logger.info("***** Eval results *****")

        for key in sorted(result):
            value = str(result[key])
            if verbose:
                logger.info(" %s = %s", key, value)
            writer.write("%s = %s\n" % (key, value))
        # The former trailing ``writer.write("")`` wrote nothing and was dropped.


def dictionary_to_json(dictionary: dict, file_name: str):
    """Write ``dictionary`` to ``file_name`` as JSON indented by two spaces.

    An existing file is overwritten.
    """
    with open(file_name, "w") as f:
        json.dump(dictionary, f, indent=2)


def is_folder_empty(folder_name: str):
    """Return True when ``folder_name`` has no entries other than dot-files.

    Entries whose name starts with ``'.'`` are ignored.
    """
    # any() stops at the first visible entry instead of building a full list.
    return not any(not entry.startswith('.') for entry in os.listdir(folder_name))


def get_immediate_subdirectories(directory: str):
    """Return the paths (``directory`` joined with the name) of its direct sub-directories."""
    candidates = (os.path.join(directory, name) for name in os.listdir(directory))
    return [path for path in candidates if os.path.isdir(path)]
b/TernaryBERT/utils_glue.py @@ -8,6 +8,7 @@ logger = logging.getLogger() + class InputExample(object): """A single training/test example for simple sequence classification.""" @@ -50,7 +51,7 @@ def get_train_examples(self, data_dir): def get_dev_examples(self, data_dir): """Gets a collection of `InputExample`s for the dev set.""" raise NotImplementedError() - + def get_test_examples(self, data_dir): """Gets a collection of `InputExample`s for the test set.""" raise NotImplementedError() @@ -84,7 +85,7 @@ def get_dev_examples(self, data_dir): """See base class.""" return self._create_examples( self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - + def get_test_examples(self, data_dir): return self._create_examples( self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") @@ -128,7 +129,7 @@ def get_dev_examples(self, data_dir): return self._create_examples( self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), "dev_matched") - + def get_test_examples(self, data_dir): return self._create_examples( self._read_tsv(os.path.join(data_dir, "test_matched.tsv")), "test") @@ -167,6 +168,7 @@ def get_dev_examples(self, data_dir): return self._create_examples( self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), "dev_matched") + def get_test_examples(self, data_dir): """See base class.""" return self._create_examples( @@ -186,7 +188,7 @@ def get_dev_examples(self, data_dir): """See base class.""" return self._create_examples( self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - + def get_test_examples(self, data_dir): return self._create_examples( self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") @@ -233,7 +235,7 @@ def get_dev_examples(self, data_dir): """See base class.""" return self._create_examples( self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - + def get_test_examples(self, data_dir): return self._create_examples( self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") @@ -276,7 +278,7 @@ def 
get_dev_examples(self, data_dir): """See base class.""" return self._create_examples( self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - + def get_test_examples(self, data_dir): return self._create_examples( self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") @@ -298,7 +300,7 @@ def _create_examples(self, lines, set_type): guid = "%s-%s" % (set_type, line[0]) text_a = line[7] text_b = line[8] - if set_type== 'test': + if set_type == 'test': label = None else: label = line[-1] @@ -319,7 +321,7 @@ def get_dev_examples(self, data_dir): """See base class.""" return self._create_examples( self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - + def get_test_examples(self, data_dir): return self._create_examples( self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") @@ -340,7 +342,7 @@ def _create_examples(self, lines, set_type): continue guid = "%s-%s" % (set_type, line[0]) try: - if set_type=='test': + if set_type == 'test': text_a = line[1] text_b = line[2] label = None @@ -368,7 +370,7 @@ def get_dev_examples(self, data_dir): return self._create_examples( self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev_matched") - + def get_test_examples(self, data_dir): return self._create_examples( self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") @@ -388,7 +390,7 @@ def _create_examples(self, lines, set_type): if i == 0: continue guid = "%s-%s" % (set_type, line[0]) - if set_type=='test': + if set_type == 'test': text_a = line[1] text_b = line[2] label = None @@ -413,7 +415,7 @@ def get_dev_examples(self, data_dir): """See base class.""" return self._create_examples( self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - + def get_test_examples(self, data_dir): return self._create_examples( self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") @@ -433,7 +435,7 @@ def _create_examples(self, lines, set_type): if i == 0: continue guid = "%s-%s" % (set_type, line[0]) - if set_type=='test': + if set_type == 'test': text_a = 
line[1] text_b = line[2] label = None diff --git a/TernaryBERT/utils_multiemo.py b/TernaryBERT/utils_multiemo.py new file mode 100644 index 00000000..e7ba2a24 --- /dev/null +++ b/TernaryBERT/utils_multiemo.py @@ -0,0 +1,236 @@ +import os +import logging +import sys +import csv +from typing import List + +import numpy as np +from scipy.stats import pearsonr, spearmanr +from sklearn.metrics import matthews_corrcoef, f1_score, accuracy_score + +logger = logging.getLogger() + + +class InputExample(object): + """A single training/test example for simple sequence classification.""" + + def __init__(self, guid, text_a, text_b=None, label=None): + """Constructs a InputExample. + + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. For single + sequence tasks, only this sequence must be specified. + text_b: (Optional) string. The untokenized text of the second sequence. + Only must be specified for sequence pair tasks. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. 
+ """ + self.guid = guid + self.text_a = text_a + self.text_b = text_b + self.label = label + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, input_ids, input_mask, segment_ids, label_id, seq_length=None): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.seq_length = seq_length + self.label_id = label_id + + +class DataProcessor(object): + """Base class for data converters for sequence classification data sets.""" + + def get_train_examples(self, data_dir): + """Gets a collection of `InputExample`s for the train set.""" + raise NotImplementedError() + + def get_dev_examples(self, data_dir): + """Gets a collection of `InputExample`s for the dev set.""" + raise NotImplementedError() + + def get_test_examples(self, data_dir): + """Gets a collection of `InputExample`s for the test set.""" + raise NotImplementedError() + + def get_labels(self): + """Gets the list of labels for this data set.""" + raise NotImplementedError() + + @classmethod + def _read_tsv(cls, input_file, quotechar=None): + """Reads a tab separated value file.""" + with open(input_file, "r", encoding="utf-8") as f: + reader = csv.reader(f, delimiter="\t", quotechar=quotechar) + lines = [] + for line in reader: + if sys.version_info[0] == 2: + line = list(unicode(cell, 'utf-8') for cell in line) + lines.append(line) + return lines + + @classmethod + def _read_txt(cls, input_file: str) -> List[str]: + """Reads a tab separated value file.""" + with open(input_file, "r", encoding='UTF-8') as f: + lines = f.read().splitlines() + return lines + + +class MultiemoProcessor(DataProcessor): + """Processor for the Multiemo data2 set""" + + def __init__(self, lang: str, domain: str, kind: str): + super(MultiemoProcessor, self).__init__() + self.lang = lang.lower() + self.domain = domain.lower() + self.kind = kind.lower() + + def get_train_examples(self, data_dir: str) -> List[InputExample]: + """See base class.""" + 
file_path = self.get_set_type_path(data_dir, 'train') + logger.info(f"LOOKING AT {file_path}") + return self._create_examples(self._read_txt(file_path), "train") + + def get_dev_examples(self, data_dir: str) -> List[InputExample]: + """See base class.""" + file_path = self.get_set_type_path(data_dir, 'dev') + return self._create_examples(self._read_txt(file_path), "dev") + + def get_test_examples(self, data_dir: str) -> List[InputExample]: + """See base class.""" + file_path = self.get_set_type_path(data_dir, 'test') + return self._create_examples(self._read_txt(file_path), "test") + + def get_set_type_path(self, data_dir: str, set_type: str) -> str: + return os.path.join(data_dir, self.domain + '.' + self.kind + '.' + set_type + '.' + self.lang + '.txt') + + def get_labels(self) -> List[str]: + """See base class.""" + if self.kind == 'text': + return ["meta_amb", "meta_minus_m", "meta_plus_m", "meta_zero"] + else: + return ["z_amb", "z_minus_m", "z_plus_m", "z_zero"] + + @staticmethod + def _create_examples(lines: List[str], set_type: str) -> List[InputExample]: + examples = [] + for (i, line) in enumerate(lines): + guid = "%s-%s" % (set_type, i) + split_line = line.split('__label__') + text_a = split_line[0] + label = split_line[1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples + + +def convert_examples_to_features(examples, label_list, max_seq_length, + tokenizer, output_mode): + """Loads a data file into a list of `InputBatch`s.""" + + label_map = {label: i for i, label in enumerate(label_list)} + + features = [] + for (ex_index, example) in enumerate(examples): + if ex_index % 10000 == 0: + logger.info("Writing example %d of %d" % (ex_index, len(examples))) + + tokens_a = tokenizer.tokenize(example.text_a) + + tokens_b = None + if example.text_b: + tokens_b = tokenizer.tokenize(example.text_b) + _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) + else: + if len(tokens_a) > max_seq_length - 2: + 
tokens_a = tokens_a[:(max_seq_length - 2)] + + tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + segment_ids = [0] * len(tokens) + + if tokens_b: + tokens += tokens_b + ["[SEP]"] + segment_ids += [1] * (len(tokens_b) + 1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + input_mask = [1] * len(input_ids) + seq_length = len(input_ids) + + padding = [0] * (max_seq_length - len(input_ids)) + input_ids += padding + input_mask += padding + segment_ids += padding + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + try: + if output_mode == "classification": + label_id = label_map[example.label] + elif output_mode == "regression": + label_id = float(example.label) + else: + raise KeyError(output_mode) + except: + label_id = 0 + + if ex_index < 1: + logger.info("*** Example ***") + logger.info("guid: %s" % (example.guid)) + logger.info("tokens: %s" % " ".join( + [str(x) for x in tokens])) + logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + logger.info( + "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + logger.info("label: {}".format(example.label)) + logger.info("label_id: {}".format(label_id)) + + features.append( + InputFeatures(input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + label_id=label_id, + seq_length=seq_length)) + return features + + +def _truncate_seq_pair(tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length.""" + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + +def simple_accuracy(preds, labels): + return accuracy_score(y_true=labels, y_pred=preds) + + +def acc_and_f1(preds, labels): + acc = accuracy_score(y_true=labels, y_pred=preds) + f1 = f1_score(y_true=labels, 
y_pred=preds, average='macro') + return { + "acc": acc, + "f1": f1, + "acc_and_f1": (acc + f1) / 2, + } + + +def compute_metrics(task_name, logits, labels): + preds = np.argmax(logits, axis=1) + assert len(preds) == len(labels) + if 'multiemo' in task_name: + return acc_and_f1(preds, labels) + else: + raise KeyError(task_name) diff --git a/TinyBERT/data/.gitkeep b/TinyBERT/data/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/TinyBERT/data_augmentation.py b/TinyBERT/data_augmentation.py index 6081cf18..de2c8c8b 100644 --- a/TinyBERT/data_augmentation.py +++ b/TinyBERT/data_augmentation.py @@ -90,6 +90,7 @@ def _read_tsv(input_file, quotechar=None): def prepare_embedding_retrieval(glove_file, vocab_size=100000): + logger.info('Preparing GloVe embedding started') cnt = 0 words = [] embeddings = {} @@ -118,6 +119,7 @@ def prepare_embedding_retrieval(glove_file, vocab_size=100000): # normalize each word vector d = (np.sum(emb_matrix ** 2, 1) ** 0.5) emb_norm = (emb_matrix.T / d).T + logger.info('Preparing GloVe embedding finished') return emb_norm, vocab, ids_to_tokens @@ -219,7 +221,7 @@ def augment(self, sent): for (idx, word) in enumerate(tokens): if _is_valid(word) and word.lower() not in StopWordsList: candidate_words[idx] = self._word_augment(sent, idx, word) - logger.info(candidate_words) + # logger.info(candidate_words) cnt = 0 while cnt < self.N: new_sent = list(tokens) @@ -272,7 +274,7 @@ def read_augment_write(self): line[augment_id] = augment_sent writer.writerow(line) - if (i+1) % 1000 == 0: + if (i+1) % 50 == 0: logger.info("Having been processing {} examples".format(str(i+1))) diff --git a/TinyBERT/data_processing.py b/TinyBERT/data_processing.py new file mode 100644 index 00000000..5ab7c242 --- /dev/null +++ b/TinyBERT/data_processing.py @@ -0,0 +1,577 @@ +import csv +import logging +import os +import sys + +import torch +from scipy.stats import pearsonr, spearmanr +from sklearn.metrics import f1_score, matthews_corrcoef +from 
torch.utils.data import TensorDataset + +logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt='%m/%d/%Y %H:%M:%S', + level=logging.INFO) +logger = logging.getLogger(__name__) + + +class InputExample(object): + """A single training/test example for simple sequence classification.""" + + def __init__(self, guid, text_a, text_b=None, label=None): + """Constructs a InputExample. + + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. For single + sequence tasks, only this sequence must be specified. + text_b: (Optional) string. The untokenized text of the second sequence. + Only must be specified for sequence pair tasks. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. + """ + self.guid = guid + self.text_a = text_a + self.text_b = text_b + self.label = label + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, input_ids, input_mask, segment_ids, label_id, seq_length=None): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.seq_length = seq_length + self.label_id = label_id + + +class DataProcessor(object): + """Base class for data converters for sequence classification data sets.""" + + def get_train_examples(self, data_dir): + """Gets a collection of `InputExample`s for the train set.""" + raise NotImplementedError() + + def get_dev_examples(self, data_dir): + """Gets a collection of `InputExample`s for the dev set.""" + raise NotImplementedError() + + def get_labels(self): + """Gets the list of labels for this data set.""" + raise NotImplementedError() + + @classmethod + def _read_tsv(cls, input_file, quotechar=None): + """Reads a tab separated value file.""" + with open(input_file, "r", encoding="utf-8") as f: + reader = csv.reader(f, delimiter="\t", quotechar=quotechar) + lines = [] + for line in 
reader: + if sys.version_info[0] == 2: + line = list(unicode(cell, 'utf-8') for cell in line) + lines.append(line) + return lines + + +class MrpcProcessor(DataProcessor): + """Processor for the MRPC data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_aug_examples(self, data_dir): + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = line[3] + text_b = line[4] + label = line[0] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class MnliProcessor(DataProcessor): + """Processor for the MultiNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), + "dev_matched") + + def get_aug_examples(self, data_dir): + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug") + + def get_labels(self): + """See base class.""" + return ["contradiction", "entailment", "neutral"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + 
continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[8] + text_b = line[9] + label = line[-1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class MnliMismatchedProcessor(MnliProcessor): + """Processor for the MultiNLI Mismatched data set (GLUE version).""" + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), + "dev_matched") + + +class ColaProcessor(DataProcessor): + """Processor for the CoLA data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_aug_examples(self, data_dir): + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + guid = "%s-%s" % (set_type, i) + text_a = line[3] + label = line[1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples + + +class Sst2Processor(DataProcessor): + """Processor for the SST-2 data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_aug_examples(self, data_dir): + return self._create_examples( + 
self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = line[0] + label = line[1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples + + +class StsbProcessor(DataProcessor): + """Processor for the STS-B data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_aug_examples(self, data_dir): + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug") + + def get_labels(self): + """See base class.""" + return [None] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[7] + text_b = line[8] + label = line[-1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class QqpProcessor(DataProcessor): + """Processor for the STS-B data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_aug_examples(self, data_dir): + return self._create_examples( + 
self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + try: + text_a = line[3] + text_b = line[4] + label = line[5] + except IndexError: + continue + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class QnliProcessor(DataProcessor): + """Processor for the STS-B data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), + "dev_matched") + + def get_aug_examples(self, data_dir): + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug") + + def get_labels(self): + """See base class.""" + return ["entailment", "not_entailment"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[1] + text_b = line[2] + label = line[-1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class RteProcessor(DataProcessor): + """Processor for the RTE data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + 
self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_aug_examples(self, data_dir): + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug") + + def get_labels(self): + """See base class.""" + return ["entailment", "not_entailment"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[1] + text_b = line[2] + label = line[-1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class WnliProcessor(DataProcessor): + """Processor for the WNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[1] + text_b = line[2] + label = line[-1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +def convert_examples_to_features(examples, label_list, max_seq_length, + tokenizer, output_mode): + """Loads a data file into a list of `InputBatch`s.""" + + label_map = {label: i for i, label in enumerate(label_list)} + + features = [] + for (ex_index, example) in enumerate(examples): + if ex_index % 10000 == 0: + logger.info("Writing example %d of %d" % (ex_index, len(examples))) + + tokens_a = tokenizer.tokenize(example.text_a) + + 
tokens_b = None + if example.text_b: + tokens_b = tokenizer.tokenize(example.text_b) + _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) + else: + if len(tokens_a) > max_seq_length - 2: + tokens_a = tokens_a[:(max_seq_length - 2)] + + tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + segment_ids = [0] * len(tokens) + + if tokens_b: + tokens += tokens_b + ["[SEP]"] + segment_ids += [1] * (len(tokens_b) + 1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + input_mask = [1] * len(input_ids) + seq_length = len(input_ids) + + padding = [0] * (max_seq_length - len(input_ids)) + input_ids += padding + input_mask += padding + segment_ids += padding + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + if output_mode == "classification": + label_id = label_map[example.label] + elif output_mode == "regression": + label_id = float(example.label) + else: + raise KeyError(output_mode) + + if ex_index < 1: + logger.info("*** Example ***") + logger.info("guid: %s" % (example.guid)) + logger.info("tokens: %s" % " ".join( + [str(x) for x in tokens])) + logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + logger.info( + "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + logger.info("label: {}".format(example.label)) + logger.info("label_id: {}".format(label_id)) + + features.append( + InputFeatures(input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + label_id=label_id, + seq_length=seq_length)) + return features + + +def _truncate_seq_pair(tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length.""" + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + +def simple_accuracy(preds, labels): + return (preds 
== labels).mean() + + +def acc_and_f1(preds, labels): + acc = simple_accuracy(preds, labels) + f1 = f1_score(y_true=labels, y_pred=preds) + return { + "acc": acc, + "f1": f1, + "acc_and_f1": (acc + f1) / 2, + } + + +def pearson_and_spearman(preds, labels): + pearson_corr = pearsonr(preds, labels)[0] + spearman_corr = spearmanr(preds, labels)[0] + return { + "pearson": pearson_corr, + "spearmanr": spearman_corr, + "corr": (pearson_corr + spearman_corr) / 2, + } + + +def compute_metrics(task_name, preds, labels): + assert len(preds) == len(labels) + if task_name == "cola": + return {"mcc": matthews_corrcoef(labels, preds)} + elif task_name == "sst-2": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "mrpc": + return acc_and_f1(preds, labels) + elif task_name == "sts-b": + return pearson_and_spearman(preds, labels) + elif task_name == "qqp": + return acc_and_f1(preds, labels) + elif task_name == "mnli": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "mnli-mm": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "qnli": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "rte": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "wnli": + return {"acc": simple_accuracy(preds, labels)} + else: + raise KeyError(task_name) + + +def get_tensor_data(output_mode, features): + if output_mode == "classification": + all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long) + elif output_mode == "regression": + all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float) + + all_seq_lengths = torch.tensor([f.seq_length for f in features], dtype=torch.long) + all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) + all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) + all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long) + tensor_data = TensorDataset(all_input_ids, 
all_input_mask, all_segment_ids, + all_label_ids, all_seq_lengths) + return tensor_data, all_label_ids + + +processors = { + "cola": ColaProcessor, + "mnli": MnliProcessor, + "mnli-mm": MnliMismatchedProcessor, + "mrpc": MrpcProcessor, + "sst-2": Sst2Processor, + "sts-b": StsbProcessor, + "qqp": QqpProcessor, + "qnli": QnliProcessor, + "rte": RteProcessor, + "wnli": WnliProcessor +} + +output_modes = { + "cola": "classification", + "mnli": "classification", + "mrpc": "classification", + "sst-2": "classification", + "sts-b": "regression", + "qqp": "classification", + "qnli": "classification", + "rte": "classification", + "wnli": "classification" +} diff --git a/TinyBERT/fine_tune_bert.py b/TinyBERT/fine_tune_bert.py new file mode 100644 index 00000000..4a848775 --- /dev/null +++ b/TinyBERT/fine_tune_bert.py @@ -0,0 +1,460 @@ +# coding=utf-8 +# 2019.12.2-Changed for TinyBERT task-specific distillation +# Huawei Technologies Co., Ltd. +# Copyright 2020 Huawei Technologies Co., Ltd. +# Copyright 2018 The Google AI Language Team Authors, The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""BERT finetuning runner.""" + +from __future__ import absolute_import, division, print_function + +import argparse +import csv +import logging +import os +import random +import sys + +import numpy as np +import torch +from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler) +from tqdm import tqdm, trange + +from torch.nn import CrossEntropyLoss, MSELoss + +from data_processing import convert_examples_to_features, \ + compute_metrics, get_tensor_data, processors, output_modes +from transformer.modeling import TinyBertForSequenceClassification +from transformer.tokenization import BertTokenizer +from transformer.optimization import BertAdam +from transformer.file_utils import WEIGHTS_NAME, CONFIG_NAME + +csv.field_size_limit(sys.maxsize) + +log_format = '%(asctime)s %(message)s' +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt='%m/%d %I:%M:%S %p') +fh = logging.FileHandler('debug_layer_loss.log') +fh.setFormatter(logging.Formatter(log_format)) +logging.getLogger().addHandler(fh) +logger = logging.getLogger() + +oncloud = True +try: + import moxing as mox +except: + oncloud = False + + +def result_to_file(result, file_name): + with open(file_name, "a") as writer: + writer.write("") + logger.info("***** Eval results *****") + for key in sorted(result.keys()): + logger.info("%s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) + + +def do_eval(model, task_name, eval_dataloader, + device, output_mode, eval_labels, num_labels): + eval_loss = 0 + nb_eval_steps = 0 + preds = [] + + for batch_ in tqdm(eval_dataloader, desc="Evaluating"): + batch_ = tuple(t.to(device) for t in batch_) + with torch.no_grad(): + input_ids, input_mask, segment_ids, label_ids, seq_lengths = batch_ + + logits, _, _ = model(input_ids, segment_ids, input_mask) + + # create eval loss and other metric required by the task + if output_mode == "classification": + loss_fct = CrossEntropyLoss() + tmp_eval_loss 
= loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) + elif output_mode == "regression": + loss_fct = MSELoss() + tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1)) + + eval_loss += tmp_eval_loss.mean().item() + nb_eval_steps += 1 + if len(preds) == 0: + preds.append(logits.detach().cpu().numpy()) + else: + preds[0] = np.append( + preds[0], logits.detach().cpu().numpy(), axis=0) + + eval_loss = eval_loss / nb_eval_steps + + preds = preds[0] + if output_mode == "classification": + preds = np.argmax(preds, axis=1) + elif output_mode == "regression": + preds = np.squeeze(preds) + result = compute_metrics(task_name, preds, eval_labels.numpy()) + result['eval_loss'] = eval_loss + + return result + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--data_dir", + default=None, + type=str, + required=True, + help="The input data dir. Should contain the .tsv files (or other data files) for the task.") + parser.add_argument("--pretrained_model", + default=None, + type=str, + help="The pretrained model dir.") + parser.add_argument("--task_name", + default=None, + type=str, + required=True, + help="The name of the task to train.") + parser.add_argument("--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model predictions and checkpoints will be written.") + parser.add_argument("--max_seq_length", + default=128, + type=int, + help="The maximum total input sequence length after WordPiece tokenization. 
\n" + "Sequences longer than this will be truncated, and sequences shorter \n" + "than this will be padded.") + parser.add_argument("--do_eval", + action='store_true', + help="Whether to run eval on the dev set.") + parser.add_argument("--do_lower_case", + action='store_true', + help="Set this flag if you are using an uncased model.") + parser.add_argument("--train_batch_size", + default=32, + type=int, + help="Total batch size for training.") + parser.add_argument("--eval_batch_size", + default=32, + type=int, + help="Total batch size for eval.") + parser.add_argument("--learning_rate", + default=5e-5, + type=float, + help="The initial learning rate for Adam.") + parser.add_argument('--weight_decay', '--wd', + default=1e-4, + type=float, + metavar='W', + help='weight decay') + parser.add_argument("--num_train_epochs", + default=3.0, + type=float, + help="Total number of training epochs to perform.") + parser.add_argument("--warmup_proportion", + default=0.1, + type=float, + help="Proportion of training to perform linear learning rate warmup for. 
" + "E.g., 0.1 = 10%% of training.") + parser.add_argument("--no_cuda", + action='store_true', + help="Whether not to use CUDA when available") + parser.add_argument('--seed', + type=int, + default=42, + help="random seed for initialization") + parser.add_argument('--gradient_accumulation_steps', + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.") + + # added arguments + parser.add_argument('--aug_train', + action='store_true') + parser.add_argument('--eval_step', + type=int, + default=50) + parser.add_argument('--data_url', + type=str, + default="") + + args = parser.parse_args() + logger.info('The args: {}'.format(args)) + + # intermediate distillation default parameters + default_params = { + "cola": {"num_train_epochs": 3, "max_seq_length": 64}, + "mnli": {"num_train_epochs": 3, "max_seq_length": 128}, + "mrpc": {"num_train_epochs": 3, "max_seq_length": 128}, + "sst-2": {"num_train_epochs":3, "max_seq_length": 64}, + "sts-b": {"num_train_epochs": 3, "max_seq_length": 128}, + "qqp": {"num_train_epochs": 3, "max_seq_length": 128}, + "qnli": {"num_train_epochs": 3, "max_seq_length": 128}, + "rte": {"num_train_epochs": 5, "max_seq_length": 128} + } + + acc_tasks = ["mnli", "mrpc", "sst-2", "qqp", "qnli", "rte"] + corr_tasks = ["sts-b"] + mcc_tasks = ["cola"] + + # Prepare devices + device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") + n_gpu = torch.cuda.device_count() + + logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt='%m/%d/%Y %H:%M:%S', + level=logging.INFO) + + logger.info("device: {} n_gpu: {}".format(device, n_gpu)) + + # Prepare seed + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + if n_gpu > 0: + torch.cuda.manual_seed_all(args.seed) + + # Prepare task settings + if os.path.exists(args.output_dir) and os.listdir(args.output_dir): + raise ValueError("Output directory ({}) 
already exists and is not empty.".format(args.output_dir)) + if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir) + + task_name = args.task_name.lower() + + if task_name in default_params: + args.max_seq_len = default_params[task_name]["max_seq_length"] + + if not args.do_eval: + if task_name in default_params: + args.num_train_epoch = default_params[task_name]["num_train_epochs"] + + if task_name not in processors: + raise ValueError("Task not found: %s" % task_name) + + processor = processors[task_name]() + output_mode = output_modes[task_name] + label_list = processor.get_labels() + num_labels = len(label_list) + + tokenizer = BertTokenizer.from_pretrained(args.pretrained_model, do_lower_case=args.do_lower_case) + + if not args.do_eval: + if not args.aug_train: + train_examples = processor.get_train_examples(args.data_dir) + else: + train_examples = processor.get_aug_examples(args.data_dir) + if args.gradient_accumulation_steps < 1: + raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( + args.gradient_accumulation_steps)) + + args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps + + num_train_optimization_steps = int( + len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs + + train_features = convert_examples_to_features(train_examples, label_list, + args.max_seq_length, tokenizer, output_mode) + train_data, _ = get_tensor_data(output_mode, train_features) + train_sampler = RandomSampler(train_data) + train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) + + eval_examples = processor.get_dev_examples(args.data_dir) + eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) + eval_data, eval_labels = get_tensor_data(output_mode, eval_features) + eval_sampler = SequentialSampler(eval_data) + eval_dataloader = 
DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) + + model = TinyBertForSequenceClassification.from_pretrained(args.pretrained_model, num_labels=num_labels) + model.to(device) + + if args.do_eval: + logger.info("***** Running evaluation *****") + logger.info(" Num examples = %d", len(eval_examples)) + logger.info(" Batch size = %d", args.eval_batch_size) + + model.eval() + result = do_eval(model, task_name, eval_dataloader, + device, output_mode, eval_labels, num_labels) + logger.info("***** Eval results *****") + for key in sorted(result.keys()): + logger.info(" %s = %s", key, str(result[key])) + else: + logger.info("***** Running training *****") + logger.info(" Num examples = %d", len(train_examples)) + logger.info(" Batch size = %d", args.train_batch_size) + logger.info(" Num steps = %d", num_train_optimization_steps) + if n_gpu > 1: + model = torch.nn.DataParallel(model) + # Prepare optimizer + param_optimizer = list(model.named_parameters()) + size = 0 + for n, p in model.named_parameters(): + logger.info('n: {}'.format(n)) + size += p.nelement() + + logger.info('Total parameters: {}'.format(size)) + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + optimizer_grouped_parameters = [ + {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, + {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} + ] + schedule = 'warmup_linear' + + optimizer = BertAdam(optimizer_grouped_parameters, + schedule=schedule, + lr=args.learning_rate, + warmup=args.warmup_proportion, + t_total=num_train_optimization_steps) + + # Train and evaluate + global_step = 0 + best_dev_acc = 0.0 + output_eval_file = os.path.join(args.output_dir, "eval_results.txt") + + for epoch_ in trange(int(args.num_train_epochs), desc="Epoch"): + tr_loss = 0. + tr_cls_loss = 0. 
+ + model.train() + nb_tr_examples, nb_tr_steps = 0, 0 + + for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", ascii=True)): + batch = tuple(t.to(device) for t in batch) + + input_ids, input_mask, segment_ids, label_ids, seq_lengths = batch + if input_ids.size()[0] != args.train_batch_size: + continue + + cls_loss = 0. + + logits, _, _ = model(input_ids, segment_ids, input_mask) + + if output_mode == "classification": + loss_fct = CrossEntropyLoss() + cls_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) + elif output_mode == "regression": + loss_mse = MSELoss() + cls_loss = loss_mse(logits.view(-1), label_ids.view(-1)) + + loss = cls_loss + tr_cls_loss += cls_loss.item() + + if n_gpu > 1: + loss = loss.mean() # mean() to average on multi-gpu. + if args.gradient_accumulation_steps > 1: + loss = loss / args.gradient_accumulation_steps + + loss.backward() + + tr_loss += loss.item() + nb_tr_examples += label_ids.size(0) + nb_tr_steps += 1 + + if (step + 1) % args.gradient_accumulation_steps == 0: + optimizer.step() + optimizer.zero_grad() + global_step += 1 + + if (global_step + 1) % args.eval_step == 0 or (global_step + 1) == 2 or \ + (global_step + 1) == num_train_optimization_steps: + logger.info("***** Running evaluation *****") + logger.info(" Epoch = {} iter {} step".format(epoch_, global_step)) + logger.info(" Num examples = %d", len(eval_examples)) + logger.info(" Batch size = %d", args.eval_batch_size) + + model.eval() + + loss = tr_loss / (step + 1) + cls_loss = tr_cls_loss / (step + 1) + + result = do_eval(model, task_name, eval_dataloader, + device, output_mode, eval_labels, num_labels) + result['global_step'] = global_step + result['cls_loss'] = cls_loss + result['loss'] = loss + + result_to_file(result, output_eval_file) + + save_model = False + + if task_name in acc_tasks and result['acc'] > best_dev_acc: + best_dev_acc = result['acc'] + save_model = True + + if task_name in corr_tasks and result['corr'] > best_dev_acc: 
+ best_dev_acc = result['corr'] + save_model = True + + if task_name in mcc_tasks and result['mcc'] > best_dev_acc: + best_dev_acc = result['mcc'] + save_model = True + + if save_model: + logger.info("***** Save model *****") + + model_to_save = model.module if hasattr(model, 'module') else model + + model_name = WEIGHTS_NAME + + output_model_file = os.path.join(args.output_dir, model_name) + output_config_file = os.path.join(args.output_dir, CONFIG_NAME) + + torch.save(model_to_save.state_dict(), output_model_file) + model_to_save.config.to_json_file(output_config_file) + tokenizer.save_vocabulary(args.output_dir) + + # Test mnli-mm + if task_name == "mnli": + task_name = "mnli-mm" + processor = processors[task_name]() + if not os.path.exists(args.output_dir + '-MM'): + os.makedirs(args.output_dir + '-MM') + + eval_examples = processor.get_dev_examples(args.data_dir) + + eval_features = convert_examples_to_features( + eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) + eval_data, eval_labels = get_tensor_data(output_mode, eval_features) + + logger.info("***** Running mm evaluation *****") + logger.info(" Num examples = %d", len(eval_examples)) + logger.info(" Batch size = %d", args.eval_batch_size) + + eval_sampler = SequentialSampler(eval_data) + eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, + batch_size=args.eval_batch_size) + + result = do_eval(model, task_name, eval_dataloader, + device, output_mode, eval_labels, num_labels) + + result['global_step'] = global_step + + tmp_output_eval_file = os.path.join(args.output_dir + '-MM', "eval_results.txt") + result_to_file(result, tmp_output_eval_file) + + task_name = 'mnli' + + if oncloud: + logging.info(mox.file.list_directory(args.output_dir, recursive=True)) + logging.info(mox.file.list_directory('.', recursive=True)) + mox.file.copy_parallel(args.output_dir, args.data_url) + mox.file.copy_parallel('.', args.data_url) + + model.train() + + +if __name__ == "__main__": + main() 
diff --git a/TinyBERT/model_statistics.py b/TinyBERT/model_statistics.py new file mode 100644 index 00000000..a26b6252 --- /dev/null +++ b/TinyBERT/model_statistics.py @@ -0,0 +1,93 @@ +from __future__ import absolute_import, division, print_function + +import argparse +import logging +import math +import sys + +import torch +from thop import profile + +from data_processing import processors +from transformer.modeling import TinyBertForSequenceClassification +from transformer.tokenization import BertTokenizer + +log_format = '%(asctime)s %(message)s' +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt='%m/%d %I:%M:%S %p') +fh = logging.FileHandler('debug_layer_loss.log') +fh.setFormatter(logging.Formatter(log_format)) +logging.getLogger().addHandler(fh) +logger = logging.getLogger() + + +def print_results(macs, params, title=''): + if len(title) != 0: + print("- " + title) + print(f"\tmacs [G]: {macs / math.pow(10, 9):.2f}, params [M]: {params / math.pow(10, 6):.2f}") + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--model", + default=None, + type=str, + help="The anlised model dir.") + parser.add_argument("--task_name", + default=None, + type=str, + required=True, + help="The name of the task to train.") + parser.add_argument("--max_seq_length", + default=128, + type=int, + help="The maximum total input sequence length after WordPiece tokenization. 
\n" + "Sequences longer than this will be truncated, and sequences shorter \n" + "than this will be padded.") + parser.add_argument("--do_lower_case", + action='store_true', + help="Set this flag if you are using an uncased model.") + parser.add_argument("--no_cuda", + action='store_true', + help="Whether not to use CUDA when available") + + args = parser.parse_args() + logger.info('The args: {}'.format(args)) + + # Prepare devices + device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") + n_gpu = torch.cuda.device_count() + + logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt='%m/%d/%Y %H:%M:%S', + level=logging.INFO) + + logger.info("device: {} n_gpu: {}".format(device, n_gpu)) + + task_name = args.task_name.lower() + + if task_name not in processors: + raise ValueError("Task not found: %s" % task_name) + + processor = processors[task_name]() + label_list = processor.get_labels() + num_labels = len(label_list) + + tokenizer = BertTokenizer.from_pretrained(args.model, do_lower_case=args.do_lower_case) + + model = TinyBertForSequenceClassification.from_pretrained(args.model, num_labels=num_labels) + model.to(device) + + model_input = tuple([torch.randint(high=len(tokenizer.vocab), + size=(1, args.max_seq_length), dtype=torch.int64, device=device), + torch.randint(high=1, size=(1, args.max_seq_length), dtype=torch.int64, device=device), + torch.randint(high=1, size=(1, args.max_seq_length), dtype=torch.int64, device=device)]) + + macs, params = profile(model, inputs=model_input) + + print("Results") + print_results(macs, params) + + +if __name__ == "__main__": + main() diff --git a/TinyBERT/requirements.txt b/TinyBERT/requirements.txt index 5f5389e8..2a155cdd 100644 --- a/TinyBERT/requirements.txt +++ b/TinyBERT/requirements.txt @@ -7,4 +7,6 @@ requests torch>=1.0.1 scipy>=0.14.0 -seaborn \ No newline at end of file +seaborn + +thop diff --git a/TinyBERT/scripts/download_glue_data.py 
b/TinyBERT/scripts/download_glue_data.py new file mode 100644 index 00000000..17c1a1f8 --- /dev/null +++ b/TinyBERT/scripts/download_glue_data.py @@ -0,0 +1,154 @@ +''' Script for downloading all GLUE data. + +Note: for legal reasons, we are unable to host MRPC. +You can either use the version hosted by the SentEval team, which is already tokenized, +or you can download the original data from (https://download.microsoft.com/download/D/4/6/D46FF87A-F6B9-4252-AA8B-3604ED519838/MSRParaphraseCorpus.msi) and extract the data from it manually. +For Windows users, you can run the .msi file. For Mac and Linux users, consider an external library such as 'cabextract' (see below for an example). +You should then rename and place specific files in a folder (see below for an example). + +mkdir MRPC +cabextract MSRParaphraseCorpus.msi -d MRPC +cat MRPC/_2DEC3DBE877E4DB192D17C0256E90F1D | tr -d $'\r' > MRPC/msr_paraphrase_train.txt +cat MRPC/_D7B391F9EAFF4B1B8BCE8F21B20B1B61 | tr -d $'\r' > MRPC/msr_paraphrase_test.txt +rm MRPC/_* +rm MSRParaphraseCorpus.msi + +1/30/19: It looks like SentEval is no longer hosting their extracted and tokenized MRPC data, so you'll need to download the data from the original source for now. +2/11/19: It looks like SentEval actually *is* hosting the extracted data. Hooray! 
+''' + +import argparse +import io +import os +import sys +import shutil +import tempfile +import urllib.request +import zipfile + +URLLIB = urllib.request + +TASKS = ["CoLA", "SST", "MRPC", "QQP", "STS", "MNLI", "QNLI", "RTE", "WNLI", "diagnostic"] +TASK2PATH = {"CoLA":'https://dl.fbaipublicfiles.com/glue/data/CoLA.zip', + "SST":'https://dl.fbaipublicfiles.com/glue/data/SST-2.zip', + "QQP":'https://dl.fbaipublicfiles.com/glue/data/STS-B.zip', + "STS":'https://dl.fbaipublicfiles.com/glue/data/QQP-clean.zip', + "MNLI":'https://dl.fbaipublicfiles.com/glue/data/MNLI.zip', + "QNLI":'https://dl.fbaipublicfiles.com/glue/data/QNLIv2.zip', + "RTE":'https://dl.fbaipublicfiles.com/glue/data/RTE.zip', + "WNLI":'https://dl.fbaipublicfiles.com/glue/data/WNLI.zip', + "diagnostic":'https://dl.fbaipublicfiles.com/glue/data/AX.tsv', + 'MRPC':'https://raw.githubusercontent.com/MegEngine/Models/master/official/nlp/bert/glue_data/MRPC/dev_ids.tsv'} + +MRPC_TRAIN = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt' +MRPC_TEST = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt' + +def download_and_extract(task, data_dir): + print("Downloading and extracting %s..." % task) + if task == "MNLI": + print("\tNote (12/10/20): This script no longer downloads SNLI. 
You will need to manually download and format the data to use SNLI.") + data_file = "%s.zip" % task + urllib.request.urlretrieve(TASK2PATH[task], data_file) + with zipfile.ZipFile(data_file) as zip_ref: + zip_ref.extractall(data_dir) + os.remove(data_file) + print("\tCompleted!") + +def format_mrpc(data_dir, path_to_data): + print("Processing MRPC...") + mrpc_dir = os.path.join(data_dir, "MRPC") + if not os.path.isdir(mrpc_dir): + os.mkdir(mrpc_dir) + if path_to_data: + mrpc_train_file = os.path.join(path_to_data, "msr_paraphrase_train.txt") + mrpc_test_file = os.path.join(path_to_data, "msr_paraphrase_test.txt") + else: + try: + mrpc_train_file = os.path.join(mrpc_dir, "msr_paraphrase_train.txt") + mrpc_test_file = os.path.join(mrpc_dir, "msr_paraphrase_test.txt") + URLLIB.urlretrieve(MRPC_TRAIN, mrpc_train_file) + URLLIB.urlretrieve(MRPC_TEST, mrpc_test_file) + except urllib.error.HTTPError: + print("Error downloading MRPC") + return + assert os.path.isfile(mrpc_train_file), "Train data not found at %s" % mrpc_train_file + assert os.path.isfile(mrpc_test_file), "Test data not found at %s" % mrpc_test_file + + with io.open(mrpc_test_file, encoding='utf-8') as data_fh, \ + io.open(os.path.join(mrpc_dir, "test.tsv"), 'w', encoding='utf-8') as test_fh: + header = data_fh.readline() + test_fh.write("index\t#1 ID\t#2 ID\t#1 String\t#2 String\n") + for idx, row in enumerate(data_fh): + label, id1, id2, s1, s2 = row.strip().split('\t') + test_fh.write("%d\t%s\t%s\t%s\t%s\n" % (idx, id1, id2, s1, s2)) + + try: + URLLIB.urlretrieve(TASK2PATH["MRPC"], os.path.join(mrpc_dir, "dev_ids.tsv")) + except KeyError or urllib.error.HTTPError: + print("\tError downloading standard development IDs for MRPC. 
You will need to manually split your data.") + return + + dev_ids = [] + with io.open(os.path.join(mrpc_dir, "dev_ids.tsv"), encoding='utf-8') as ids_fh: + for row in ids_fh: + dev_ids.append(row.strip().split('\t')) + + with io.open(mrpc_train_file, encoding='utf-8') as data_fh, \ + io.open(os.path.join(mrpc_dir, "train.tsv"), 'w', encoding='utf-8') as train_fh, \ + io.open(os.path.join(mrpc_dir, "dev.tsv"), 'w', encoding='utf-8') as dev_fh: + header = data_fh.readline() + train_fh.write(header) + dev_fh.write(header) + for row in data_fh: + label, id1, id2, s1, s2 = row.strip().split('\t') + if [id1, id2] in dev_ids: + dev_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2)) + else: + train_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2)) + + print("\tCompleted!") + +def download_diagnostic(data_dir): + print("Downloading and extracting diagnostic...") + if not os.path.isdir(os.path.join(data_dir, "diagnostic")): + os.mkdir(os.path.join(data_dir, "diagnostic")) + data_file = os.path.join(data_dir, "diagnostic", "diagnostic.tsv") + urllib.request.urlretrieve(TASK2PATH["diagnostic"], data_file) + print("\tCompleted!") + return + +def get_tasks(task_names): + task_names = task_names.split(',') + if "all" in task_names: + tasks = TASKS + else: + tasks = [] + for task_name in task_names: + assert task_name in TASKS, "Task %s not found!" 
% task_name + tasks.append(task_name) + return tasks + +def main(arguments): + parser = argparse.ArgumentParser() + parser.add_argument('--data_dir', help='directory to save data to', type=str, default='glue_data') + parser.add_argument('--tasks', help='tasks to download data for as a comma separated string', + type=str, default='all') + parser.add_argument('--path_to_mrpc', help='path to directory containing extracted MRPC data, msr_paraphrase_train.txt and msr_paraphrase_text.txt', + type=str, default='') + args = parser.parse_args(arguments) + + if not os.path.isdir(args.data_dir): + os.mkdir(args.data_dir) + tasks = get_tasks(args.tasks) + + for task in tasks: + if task == 'MRPC': + format_mrpc(args.data_dir, args.path_to_mrpc) + elif task == 'diagnostic': + download_diagnostic(args.data_dir) + else: + download_and_extract(task, args.data_dir) + + +if __name__ == '__main__': + sys.exit(main(sys.argv[1:])) \ No newline at end of file diff --git a/TinyBERT/task_distill.py b/TinyBERT/task_distill.py index 16905a31..2e6b9b6b 100644 --- a/TinyBERT/task_distill.py +++ b/TinyBERT/task_distill.py @@ -29,14 +29,13 @@ import numpy as np import torch -from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, - TensorDataset) +from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler) from tqdm import tqdm, trange from torch.nn import CrossEntropyLoss, MSELoss -from scipy.stats import pearsonr, spearmanr -from sklearn.metrics import matthews_corrcoef, f1_score +from data_processing import convert_examples_to_features, \ + compute_metrics, get_tensor_data, processors, output_modes from transformer.modeling import TinyBertForSequenceClassification from transformer.tokenization import BertTokenizer from transformer.optimization import BertAdam @@ -59,546 +58,10 @@ oncloud = False -class InputExample(object): - """A single training/test example for simple sequence classification.""" - - def __init__(self, guid, text_a, text_b=None, 
label=None): - """Constructs a InputExample. - - Args: - guid: Unique id for the example. - text_a: string. The untokenized text of the first sequence. For single - sequence tasks, only this sequence must be specified. - text_b: (Optional) string. The untokenized text of the second sequence. - Only must be specified for sequence pair tasks. - label: (Optional) string. The label of the example. This should be - specified for train and dev examples, but not for test examples. - """ - self.guid = guid - self.text_a = text_a - self.text_b = text_b - self.label = label - - -class InputFeatures(object): - """A single set of features of data.""" - - def __init__(self, input_ids, input_mask, segment_ids, label_id, seq_length=None): - self.input_ids = input_ids - self.input_mask = input_mask - self.segment_ids = segment_ids - self.seq_length = seq_length - self.label_id = label_id - - -class DataProcessor(object): - """Base class for data converters for sequence classification data sets.""" - - def get_train_examples(self, data_dir): - """Gets a collection of `InputExample`s for the train set.""" - raise NotImplementedError() - - def get_dev_examples(self, data_dir): - """Gets a collection of `InputExample`s for the dev set.""" - raise NotImplementedError() - - def get_labels(self): - """Gets the list of labels for this data set.""" - raise NotImplementedError() - - @classmethod - def _read_tsv(cls, input_file, quotechar=None): - """Reads a tab separated value file.""" - with open(input_file, "r", encoding="utf-8") as f: - reader = csv.reader(f, delimiter="\t", quotechar=quotechar) - lines = [] - for line in reader: - if sys.version_info[0] == 2: - line = list(unicode(cell, 'utf-8') for cell in line) - lines.append(line) - return lines - - -class MrpcProcessor(DataProcessor): - """Processor for the MRPC data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, 
"train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_aug_examples(self, data_dir): - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, i) - text_a = line[3] - text_b = line[4] - label = line[0] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class MnliProcessor(DataProcessor): - """Processor for the MultiNLI data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), - "dev_matched") - - def get_aug_examples(self, data_dir): - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug") - - def get_labels(self): - """See base class.""" - return ["contradiction", "entailment", "neutral"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[8] - text_b = line[9] - label = line[-1] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class MnliMismatchedProcessor(MnliProcessor): - """Processor for the MultiNLI Mismatched data set (GLUE version).""" - - def get_dev_examples(self, 
data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), - "dev_matched") - - -class ColaProcessor(DataProcessor): - """Processor for the CoLA data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_aug_examples(self, data_dir): - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - guid = "%s-%s" % (set_type, i) - text_a = line[3] - label = line[1] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) - return examples - - -class Sst2Processor(DataProcessor): - """Processor for the SST-2 data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_aug_examples(self, data_dir): - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, i) - text_a = line[0] - label = line[1] - 
examples.append( - InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) - return examples - - -class StsbProcessor(DataProcessor): - """Processor for the STS-B data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_aug_examples(self, data_dir): - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug") - - def get_labels(self): - """See base class.""" - return [None] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[7] - text_b = line[8] - label = line[-1] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class QqpProcessor(DataProcessor): - """Processor for the STS-B data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_aug_examples(self, data_dir): - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - try: - text_a = line[3] - text_b 
= line[4] - label = line[5] - except IndexError: - continue - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class QnliProcessor(DataProcessor): - """Processor for the STS-B data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), - "dev_matched") - - def get_aug_examples(self, data_dir): - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug") - - def get_labels(self): - """See base class.""" - return ["entailment", "not_entailment"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[1] - text_b = line[2] - label = line[-1] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class RteProcessor(DataProcessor): - """Processor for the RTE data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_aug_examples(self, data_dir): - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug") - - def get_labels(self): - """See base class.""" - return ["entailment", "not_entailment"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) 
in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[1] - text_b = line[2] - label = line[-1] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class WnliProcessor(DataProcessor): - """Processor for the WNLI data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[1] - text_b = line[2] - label = line[-1] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -def convert_examples_to_features(examples, label_list, max_seq_length, - tokenizer, output_mode): - """Loads a data file into a list of `InputBatch`s.""" - - label_map = {label: i for i, label in enumerate(label_list)} - - features = [] - for (ex_index, example) in enumerate(examples): - if ex_index % 10000 == 0: - logger.info("Writing example %d of %d" % (ex_index, len(examples))) - - tokens_a = tokenizer.tokenize(example.text_a) - - tokens_b = None - if example.text_b: - tokens_b = tokenizer.tokenize(example.text_b) - _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) - else: - if len(tokens_a) > max_seq_length - 2: - tokens_a = tokens_a[:(max_seq_length - 2)] - - tokens = ["[CLS]"] + tokens_a + ["[SEP]"] - segment_ids = [0] * len(tokens) - - if tokens_b: - tokens += tokens_b + ["[SEP]"] - segment_ids += [1] * (len(tokens_b) + 1) - - input_ids = 
tokenizer.convert_tokens_to_ids(tokens) - input_mask = [1] * len(input_ids) - seq_length = len(input_ids) - - padding = [0] * (max_seq_length - len(input_ids)) - input_ids += padding - input_mask += padding - segment_ids += padding - - assert len(input_ids) == max_seq_length - assert len(input_mask) == max_seq_length - assert len(segment_ids) == max_seq_length - - if output_mode == "classification": - label_id = label_map[example.label] - elif output_mode == "regression": - label_id = float(example.label) - else: - raise KeyError(output_mode) - - if ex_index < 1: - logger.info("*** Example ***") - logger.info("guid: %s" % (example.guid)) - logger.info("tokens: %s" % " ".join( - [str(x) for x in tokens])) - logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) - logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) - logger.info( - "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) - logger.info("label: {}".format(example.label)) - logger.info("label_id: {}".format(label_id)) - - features.append( - InputFeatures(input_ids=input_ids, - input_mask=input_mask, - segment_ids=segment_ids, - label_id=label_id, - seq_length=seq_length)) - return features - - -def _truncate_seq_pair(tokens_a, tokens_b, max_length): - """Truncates a sequence pair in place to the maximum length.""" - while True: - total_length = len(tokens_a) + len(tokens_b) - if total_length <= max_length: - break - if len(tokens_a) > len(tokens_b): - tokens_a.pop() - else: - tokens_b.pop() - - -def simple_accuracy(preds, labels): - return (preds == labels).mean() - - -def acc_and_f1(preds, labels): - acc = simple_accuracy(preds, labels) - f1 = f1_score(y_true=labels, y_pred=preds) - return { - "acc": acc, - "f1": f1, - "acc_and_f1": (acc + f1) / 2, - } - - -def pearson_and_spearman(preds, labels): - pearson_corr = pearsonr(preds, labels)[0] - spearman_corr = spearmanr(preds, labels)[0] - return { - "pearson": pearson_corr, - "spearmanr": spearman_corr, - "corr": 
(pearson_corr + spearman_corr) / 2, - } - - -def compute_metrics(task_name, preds, labels): - assert len(preds) == len(labels) - if task_name == "cola": - return {"mcc": matthews_corrcoef(labels, preds)} - elif task_name == "sst-2": - return {"acc": simple_accuracy(preds, labels)} - elif task_name == "mrpc": - return acc_and_f1(preds, labels) - elif task_name == "sts-b": - return pearson_and_spearman(preds, labels) - elif task_name == "qqp": - return acc_and_f1(preds, labels) - elif task_name == "mnli": - return {"acc": simple_accuracy(preds, labels)} - elif task_name == "mnli-mm": - return {"acc": simple_accuracy(preds, labels)} - elif task_name == "qnli": - return {"acc": simple_accuracy(preds, labels)} - elif task_name == "rte": - return {"acc": simple_accuracy(preds, labels)} - elif task_name == "wnli": - return {"acc": simple_accuracy(preds, labels)} - else: - raise KeyError(task_name) - - -def get_tensor_data(output_mode, features): - if output_mode == "classification": - all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long) - elif output_mode == "regression": - all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float) - - all_seq_lengths = torch.tensor([f.seq_length for f in features], dtype=torch.long) - all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) - all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) - all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long) - tensor_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, - all_label_ids, all_seq_lengths) - return tensor_data, all_label_ids - - def result_to_file(result, file_name): with open(file_name, "a") as writer: logger.info("***** Eval results *****") + writer.write("") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) @@ -744,31 +207,6 @@ def main(): args = 
parser.parse_args() logger.info('The args: {}'.format(args)) - processors = { - "cola": ColaProcessor, - "mnli": MnliProcessor, - "mnli-mm": MnliMismatchedProcessor, - "mrpc": MrpcProcessor, - "sst-2": Sst2Processor, - "sts-b": StsbProcessor, - "qqp": QqpProcessor, - "qnli": QnliProcessor, - "rte": RteProcessor, - "wnli": WnliProcessor - } - - output_modes = { - "cola": "classification", - "mnli": "classification", - "mrpc": "classification", - "sst-2": "classification", - "sts-b": "regression", - "qqp": "classification", - "qnli": "classification", - "rte": "classification", - "wnli": "classification" - } - # intermediate distillation default parameters default_params = { "cola": {"num_train_epochs": 50, "max_seq_length": 64}, @@ -991,7 +429,8 @@ def soft_cross_entropy(predicts, targets): optimizer.zero_grad() global_step += 1 - if (global_step + 1) % args.eval_step == 0: + if (global_step + 1) % args.eval_step == 0 or (global_step + 1) == 2 or \ + (global_step + 1) == num_train_optimization_steps: logger.info("***** Running evaluation *****") logger.info(" Epoch = {} iter {} step".format(epoch_, global_step)) logger.info(" Num examples = %d", len(eval_examples))