GPT2 model (Custom Single Head)
This notebook contains some examples of how to use the GPT2-based models in this NLP library.
In this series, we walk through some of the capabilities of the library: single-head classification, multi-head classification, multi-label classification, and regression. If you want a more detailed tutorial, check this out.
import os
# This will specify a GPU (or a list of GPUs) for training
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

from that_nlp_library.text_transformation import *
from that_nlp_library.text_augmentation import *
from that_nlp_library.text_main import *
from that_nlp_library.utils import seed_everything
from underthesea import text_normalize
from functools import partial
from pathlib import Path
import pandas as pd
import numpy as np
import nlpaug.augmenter.char as nac
from datasets import load_dataset
import random
from transformers import AutoTokenizer
from datasets import Dataset

Define the custom augmentation function
def nlp_aug_stochastic(x,aug=None,p=0.5):
    # for a single string: with probability p, apply the augmenter
    if not isinstance(x,list):
        if random.random()<p: return aug.augment(x)[0]
        return x
    # for a batch: randomly split items into "to augment" and "keep as-is"
    news=[]
    originals=[]
    for _x in x:
        if random.random()<p: news.append(_x)
        else: originals.append(_x)
    # only perform augmentation when needed
    if len(news): news = aug.augment(news)
    return news+originals

aug = nac.KeyboardAug(aug_char_max=3,aug_char_p=0.1,aug_word_p=0.07)
nearby_aug_func = partial(nlp_aug_stochastic,aug=aug,p=0.3)
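A quick sanity check of the augmenter (a sketch, not part of the original notebook; exact outputs vary from run to run because the function is stochastic, and augmented items are returned before unaugmented ones):

```python
# Illustrative only: with p=0.3, roughly 30% of inputs receive keyboard-typo noise.
random.seed(42)
sample_texts = ['this dress fits perfectly', 'the fabric feels a bit cheap']
print(nearby_aug_func(sample_texts))         # batched input: augmented items come first
print(nearby_aug_func('such a fun jacket'))  # single string: returned augmented or unchanged
```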
Create a TextDataController object
We will reuse the data and the preprocessing steps from this tutorial.
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')

tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         sup_types='classification',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations=[nearby_aug_func,str.lower],
                         # add "str.lower" here because nearby_aug might return uppercase characters
                         val_ratio=0.2,
                         batch_size=1000,
                         seed=42,
                         num_proc=20,
                         verbose=False
                        )

Define our tokenizer for GPT2
_tokenizer = AutoTokenizer.from_pretrained('gpt2')
_tokenizer.pad_token = _tokenizer.eos_token
_tokenizer.padding_side = 'left'

print(_tokenizer)
print(len(_tokenizer))
GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True), added_tokens_decoder={
50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
50257
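As a side note (not in the original notebook), here is a minimal sketch of why we set `pad_token` and `padding_side='left'`: GPT2 ships without a pad token, so we reuse eos, and left padding keeps the final position a real token, which is what GPT2-style sequence classifiers pool for their prediction.

```python
# Left padding puts pad/eos tokens (id 50256) at the start of the shorter sequence,
# so the last position always holds a real token.
enc = _tokenizer(['short review', 'a noticeably longer review text'],
                 padding=True, return_tensors='pt')
print(enc['input_ids'][0])       # begins with 50256 padding
print(enc['attention_mask'][0])  # zeros over the left padding, ones elsewhere
```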
Process and tokenize our dataset
tdc.process_and_tokenize(_tokenizer,max_length=100,shuffle_trn=True)

tdc.main_ddict
DatasetDict({
train: Dataset({
features: ['Title', 'Review Text', 'Division Name', 'Department Name', 'label', 'input_ids', 'attention_mask'],
num_rows: 18102
})
validation: Dataset({
features: ['Title', 'Review Text', 'Division Name', 'Department Name', 'label', 'input_ids', 'attention_mask'],
num_rows: 4526
})
})
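To verify what the controller produced, you can decode one processed row; a minimal sketch, assuming the standard `datasets` row access and the `label_lists` attribute used later in this notebook:

```python
# Illustrative peek at one processed example: the decoded text should be
# normalized, lowercased, and prefixed with its metadata.
row = tdc.main_ddict['validation'][0]
print(row['label'], tdc.label_lists[0][row['label']])  # integer label and its class name
print(_tokenizer.decode(row['input_ids'], skip_special_tokens=True))
```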
Model Experiment: GPT2 Single-Head Classification
Define and train a vanilla GPT2 model
from transformers.models.gpt2.modeling_gpt2 import GPT2Model
from that_nlp_library.models.roberta.classifiers import ConcatHeadSimple
from that_nlp_library.model_main import *
from that_nlp_library.models.gpt2.classifiers import *
from sklearn.metrics import f1_score, accuracy_score

Using HuggingFace model initialization
from transformers.models.gpt2.modeling_gpt2 import GPT2ForSequenceClassification

num_classes = len(tdc.label_lists[0])
num_classes
6
seed_everything(42)
model = GPT2ForSequenceClassification.from_pretrained('gpt2',num_labels=num_classes)
model = model.to('cuda:0')
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
model.config.pad_token_id = model.config.eos_token_id
model.resize_token_embeddings(len(_tokenizer))
Embedding(50257, 768)
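Before training, a quick shape check (a sketch, not from the original notebook) confirms the classification head emits one logit per department:

```python
import torch

# Smoke test: a single raw sentence through the untrained model.
# Expect logits of shape (1, num_classes), i.e. torch.Size([1, 6]).
with torch.no_grad():
    batch = _tokenizer('a lovely summer dress', return_tensors='pt').to('cuda:0')
    print(model(**batch).logits.shape)
```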
metric_funcs = [partial(f1_score,average='macro'),accuracy_score]
controller = ModelController(model,tdc,seed=42)

And we can start training our model
lr = 8e-5
bs = 32
wd = 0.01
epochs = 3

controller.fit(epochs,lr,
               metric_funcs=metric_funcs,
               batch_size=bs,
               weight_decay=wd,
               save_checkpoint=False,
               compute_metrics=compute_metrics,
              )
[849/849 02:55, Epoch 3/3]
| Epoch | Training Loss | Validation Loss | F1 Score Department name | Accuracy Score Department name |
|---|---|---|---|---|
| 1 | No log | 0.283675 | 0.739092 | 0.910075 |
| 2 | 0.656600 | 0.261791 | 0.749196 | 0.920901 |
| 3 | 0.656600 | 0.263783 | 0.751478 | 0.922448 |
Using the GPT2Base model (designed not only for single-head, but also for multi-head, multi-label, …)
gpt2body = GPT2Model.from_pretrained('gpt2')
# our model is more complex, so it's best to define some of its arguments
_model_kwargs={
# overall model hyperparams
'head_class_sizes':num_classes,
# classification head hyperparams
'classifier_dropout':0.1
}
model = model_init_classification(model_class = GPT2BaseForSequenceClassification,
                                  cpoint_path = 'gpt2',
                                  output_hidden_states=False, # since we are not using the 'hidden layer concatenation' technique
                                  seed=42,
                                  body_model=gpt2body,
                                  model_kwargs = _model_kwargs)
Loading body weights. This assumes the body is the very first block of your custom architecture
Total parameters: 124444416
Total trainable parameters: 124444416
# resize token embedding
model.body_model.resize_token_embeddings(len(_tokenizer))
Embedding(50257, 768)
Create ModelController and start training
metric_funcs = [partial(f1_score,average='macro'),accuracy_score]
controller = ModelController(model,tdc,seed=42)

And we can start training our model
lr = 8e-5
bs = 32
wd = 0.01
epochs = 3

controller.fit(epochs,lr,
               metric_funcs=metric_funcs,
               batch_size=bs,
               weight_decay=wd,
               save_checkpoint=False,
               compute_metrics=compute_metrics,
              )
[849/849 03:08, Epoch 3/3]
| Epoch | Training Loss | Validation Loss | F1 Score Department name | Accuracy Score Department name |
|---|---|---|---|---|
| 1 | No log | 0.293438 | 0.736128 | 0.910296 |
| 2 | 0.743200 | 0.263558 | 0.748740 | 0.918692 |
| 3 | 0.743200 | 0.264788 | 0.746244 | 0.917587 |
Make predictions
df_val = controller.predict_ddict(ds_type='validation')
-------------------- Start making predictions --------------------
df_val = df_val.to_pandas()
df_val.head()

| | Title | Review Text | Division Name | Department Name | label | input_ids | attention_mask | pred_Department Name | pred_prob_Department Name |
|---|---|---|---|---|---|---|---|---|---|
| 0 | | general petite . . such a fun jacket ! great t... | general petite | Intimate | 2 | [50256, 50256, 50256, 50256, 50256, 50256, 502... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | Jackets | 0.879402 |
| 1 | simple and elegant | general petite . simple and elegant . i though... | general petite | Tops | 4 | [24622, 4273, 578, 764, 2829, 290, 19992, 764,... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | Tops | 0.998374 |
| 2 | retro and pretty | general . retro and pretty . this top has a bi... | general | Tops | 4 | [50256, 50256, 50256, 50256, 50256, 50256, 502... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | Tops | 0.999834 |
| 3 | summer/fall wear | general petite . summer / fall wear . i first ... | general petite | Dresses | 1 | [50256, 50256, 50256, 50256, 50256, 50256, 502... | [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, ... | Dresses | 0.949195 |
| 4 | perfect except slip | general petite . perfect except slip . this is... | general petite | Dresses | 1 | [50256, 50256, 50256, 50256, 50256, 50256, 502... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | Dresses | 0.993209 |
You can recompute the metric yourself to check that it matches the last training epoch above:
f1_score(df_val['Department Name'],df_val['pred_Department Name'],average='macro')
0.7462441580902758
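Beyond the single macro score, a per-class breakdown (plain pandas, a hypothetical follow-up rather than part of the library) shows which departments drive the remaining errors:

```python
# Per-department accuracy (recall per true class): macro F1 averages over classes,
# so low values here identify the weak classes.
per_class = ((df_val['Department Name'] == df_val['pred_Department Name'])
             .groupby(df_val['Department Name']).mean())
print(per_class.sort_values())
```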