import os
GPT2 model (Multi Head)
This notebook contains some examples of how to use the GPT2-based models in this NLP library.
In this series, we walk through some of the capabilities of this library: single-head classification, multi-head classification, multi-label classification, and regression. If you want a more detailed tutorial, check this out
# This will specify a GPU (or a list of GPUs) for training
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
from that_nlp_library.text_transformation import *
from that_nlp_library.text_augmentation import *
from that_nlp_library.text_main import *
from that_nlp_library.utils import seed_everything
from underthesea import text_normalize
from functools import partial
from pathlib import Path
import pandas as pd
import numpy as np
import nlpaug.augmenter.char as nac
from datasets import load_dataset
import random
from transformers import AutoTokenizer
from datasets import Dataset
Define the custom augmentation function
def nlp_aug_stochastic(x,aug=None,p=0.5):
    if not isinstance(x,list):
        if random.random()<p: return aug.augment(x)[0]
        return x
    news=[]
    originals=[]
    for _x in x:
        if random.random()<p: news.append(_x)
        else: originals.append(_x)
    # only perform augmentation when needed
    if len(news): news = aug.augment(news)
    return news+originals
aug = nac.KeyboardAug(aug_char_max=3,aug_char_p=0.1,aug_word_p=0.07)
nearby_aug_func = partial(nlp_aug_stochastic,aug=aug,p=0.3)
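Since the augmentation is stochastic (p=0.3), repeated calls on the same input can differ. A quick sanity check on a made-up sentence (illustration only, not part of the tutorial's data):

sample = 'this dress fits perfectly and the fabric is soft'
print(nearby_aug_func(sample))  # with probability 0.3, contains keyboard typos
print(nearby_aug_func(sample))  # may differ from the previous call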
Create a TextDataController object
We will reuse the data and the preprocessing steps from this tutorial
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names=['Division Name','Department Name'],
                         sup_types=['classification','classification'],
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations=[nearby_aug_func,str.lower],
                         val_ratio=0.2,
                         batch_size=1000,
                         seed=42,
                         num_proc=20,
                         verbose=False
                        )
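The content_transformations are applied to the main text in the order listed. You can preview what that pipeline does by applying it by hand to a made-up string (illustration only; the controller applies this for you during processing):

raw = 'Sooo   comfortable , I LOVE it !'
out = raw
for f in [text_normalize,str.lower]:
    out = f(out)
print(out)  # normalized, lower-cased text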
Define our tokenizer for GPT2
_tokenizer = AutoTokenizer.from_pretrained('gpt2')
_tokenizer.pad_token = _tokenizer.eos_token
_tokenizer.padding_side = 'left'
print(_tokenizer)
print(len(_tokenizer))
GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True), added_tokens_decoder={
50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
50257
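GPT-2 has no dedicated padding token, which is why we reuse the eos token (id 50256) above, and decoder-only models are usually padded on the left so the last position holds real text. A quick check on a made-up batch (illustration only):

enc = _tokenizer(['a short review','a slightly longer review text'],padding=True)
print(enc['input_ids'][0])       # pad ids (50256) should appear at the front
print(enc['attention_mask'][0])  # leading zeros mark the left padding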
Process and tokenize our dataset
tdc.process_and_tokenize(_tokenizer,max_length=100,shuffle_trn=True)
tdc.main_ddict
DatasetDict({
train: Dataset({
features: ['Title', 'Review Text', 'Division Name', 'Department Name', 'label', 'input_ids', 'attention_mask'],
num_rows: 18101
})
validation: Dataset({
features: ['Title', 'Review Text', 'Division Name', 'Department Name', 'label', 'input_ids', 'attention_mask'],
num_rows: 4526
})
})
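To see the processed text that actually feeds the model (the Title metadata prepended to the review, then tokenized), you can decode one row; the snippet below is just an inspection aid:

print(_tokenizer.decode(tdc.main_ddict['validation'][0]['input_ids']))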
Model Experiment: GPT2 Vanilla Multihead classification
Define and train a vanilla GPT2Base model
from transformers.models.gpt2.modeling_gpt2 import GPT2Model
from that_nlp_library.models.roberta.classifiers import ConcatHeadSimple
from that_nlp_library.model_main import *
from that_nlp_library.models.gpt2.classifiers import *
from sklearn.metrics import f1_score, accuracy_score
gpt2body = GPT2Model.from_pretrained('gpt2')
num_classes = [len(tdc.label_lists[0]),len(tdc.label_lists[1])]
num_classes
[3, 6]
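These sizes come from the class names stored for each head; printing them shows what each output index means (label_lists is the same attribute used in the cell above):

print(tdc.label_lists[0])  # the 3 Division Name classes
print(tdc.label_lists[1])  # the 6 Department Name classes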
# our model is more complex, so it's best to define some of its arguments
_model_kwargs={
    # overall model hyperparams
    'head_class_sizes':num_classes,
    'is_multilabel':tdc.is_multilabel, # False
    'is_multihead':tdc.is_multihead, # True
    'head_weights':[1,1],
    # classification head hyperparams
    'classifier_dropout':0.1
}
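In a multi-head setup like this, each head is a separate classifier on the shared GPT-2 representation, and the total loss is the sum of per-head cross-entropies weighted by head_weights. The plain-PyTorch sketch below only illustrates that idea; every name in it is hypothetical and it is not the library's implementation:

import torch
import torch.nn as nn

# Hypothetical sketch of multi-head classification (not the library's code)
hidden = torch.randn(4,768)                       # pretend pooled GPT-2 outputs, batch of 4
heads = [nn.Linear(768,n) for n in [3,6]]         # one linear head per label set
labels = torch.tensor([[1,2],[0,4],[2,5],[1,1]])  # one target column per head
head_weights = [1,1]

loss = sum(w*nn.functional.cross_entropy(h(hidden),labels[:,i])
           for i,(h,w) in enumerate(zip(heads,head_weights)))
print(loss)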
model = model_init_classification(model_class = GPT2BaseForSequenceClassification,
                                  cpoint_path = 'gpt2',
                                  output_hidden_states = False, # since we are not using the 'hidden layer concatenation' technique
                                  seed = 42,
                                  body_model = gpt2body,
                                  model_kwargs = _model_kwargs)
Loading body weights. This assumes the body is the very first block of your custom architecture
Total parameters: 124446720
Total trainable parameters: 124446720
# resize token embedding
model.body_model.resize_token_embeddings(len(_tokenizer))
Embedding(50257, 768)
metric_funcs = [partial(f1_score,average='macro'),accuracy_score]
controller = ModelController(model,tdc,seed=42)
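Each metric function is called as f(y_true,y_pred), which is why f1_score needs partial to pin average='macro'. A toy check of both metrics on made-up labels:

y_true,y_pred = [0,1,2,2],[0,1,1,2]
print(partial(f1_score,average='macro')(y_true,y_pred))
print(accuracy_score(y_true,y_pred))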
And we can start training our model
lr = 8e-5
bs = 32
wd = 0.01
epochs = 3

controller.fit(epochs,lr,
               metric_funcs=metric_funcs,
               batch_size=bs,
               weight_decay=wd,
               save_checkpoint=False,
               compute_metrics=compute_metrics,
              )
[849/849 06:52, Epoch 3/3]
Epoch | Training Loss | Validation Loss | F1 Score Division name | Accuracy Score Division name | F1 Score Department name | Accuracy Score Department name |
---|---|---|---|---|---|---|
1 | No log | 1.232870 | 0.408721 | 0.610252 | 0.619346 | 0.854839 |
2 | 1.711600 | 1.140891 | 0.453457 | 0.612240 | 0.676615 | 0.879585 |
3 | 1.711600 | 1.139938 | 0.465679 | 0.612019 | 0.680045 | 0.881794 |
controller.trainer.model.save_pretrained('./sample_weights/my_model1')
Make predictions
trained_model = model_init_classification(model_class = GPT2BaseForSequenceClassification,
                                          cpoint_path = Path('./sample_weights/my_model1'),
                                          output_hidden_states = True,
                                          seed = 42,
                                          model_kwargs = _model_kwargs)

controller = ModelController(trained_model,tdc,seed=42)
Total parameters: 124446720
Total trainable parameters: 124446720
df_val = controller.predict_ddict(ds_type='validation')
-------------------- Start making predictions --------------------
df_val = df_val.to_pandas()
df_val.head()
Title | Review Text | Division Name | Department Name | label | input_ids | attention_mask | pred_Division Name | pred_prob_Division Name | pred_Department Name | pred_prob_Department Name | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | . such a fun jacket ! great to wear in the spr... | General Petite | Intimate | [1, 2] | [50256, 50256, 50256, 50256, 50256, 50256, 502... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | General | 0.529604 | Jackets | 0.734662 | |
1 | simple and elegant | simple and elegant . i thought this shirt was ... | General Petite | Tops | [1, 4] | [36439, 290, 19992, 764, 1312, 1807, 428, 1014... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | General | 0.551158 | Tops | 0.982016 |
2 | retro and pretty | retro and pretty . this top has a bit of a ret... | General | Tops | [0, 4] | [50256, 50256, 50256, 50256, 50256, 50256, 502... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | General | 0.656122 | Tops | 0.990658 |
3 | summer/fall wear | summer / fall wear . i first spotted this on a... | General Petite | Dresses | [1, 1] | [50256, 50256, 50256, 50256, 50256, 50256, 502... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, ... | General | 0.550641 | Dresses | 0.932515 |
4 | perfect except slip | perfect except slip . this is my new favorite ... | General Petite | Dresses | [1, 1] | [50256, 50256, 50256, 50256, 50256, 50256, 502... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | General | 0.512942 | Dresses | 0.988941 |
You can recompute the metrics to see whether they match those from your last training epoch above
f1_score(df_val['Division Name'],df_val['pred_Division Name'],average='macro')
# 0.45506833397695967
0.4656789717699823
f1_score(df_val['Department Name'],df_val['pred_Department Name'],average='macro')
# 0.6795641996672526
0.6800451854614634
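The same spot-check works for the accuracy columns; the values should be close to the last epoch's numbers in the training table above:

print(accuracy_score(df_val['Division Name'],df_val['pred_Division Name']))
print(accuracy_score(df_val['Department Name'],df_val['pred_Department Name']))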