GPT2 model (Custom Single Head)

This notebook contains some examples of how to use the GPT2-based models in this NLP library

In this series, we walk through some of the capabilities of this library: single-head classification, multi-head classification, multi-label classification, and regression. If you want a more detailed tutorial, check this out

import os
# Specify a GPU (or a comma-separated list of GPUs) to use for training
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
from that_nlp_library.text_transformation import *
from that_nlp_library.text_augmentation import *
from that_nlp_library.text_main import *
from that_nlp_library.utils import seed_everything
from underthesea import text_normalize
from functools import partial
from pathlib import Path
import pandas as pd
import numpy as np
import nlpaug.augmenter.char as nac
from datasets import load_dataset
import random
from transformers import AutoTokenizer
from datasets import Dataset

Define the custom augmentation function

def nlp_aug_stochastic(x,aug=None,p=0.5):
    # augment a single string with probability p
    if not isinstance(x,list):
        if random.random()<p: return aug.augment(x)[0]
        return x
    # for a list, pick each item for augmentation with probability p
    news=[]
    originals=[]
    for _x in x:
        if random.random()<p: news.append(_x)
        else: originals.append(_x)
    # only perform augmentation when needed
    if len(news): news = aug.augment(news)
    return news+originals
aug = nac.KeyboardAug(aug_char_max=3,aug_char_p=0.1,aug_word_p=0.07)
nearby_aug_func = partial(nlp_aug_stochastic,aug=aug,p=0.3)
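
To sanity-check the augmenter before handing it to the data controller, you can run it on a few strings. This is a quick sketch; the augmenter is stochastic, so your outputs will differ:

# illustrative only: roughly 30% of the strings come back with keyboard typos
# (possibly containing uppercase characters, which is why str.lower is applied afterwards)
random.seed(42)
print(nearby_aug_func(['this dress fits perfectly','lovely fabric and color']))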

Create a TextDataController object

We will reuse the data and the preprocessing steps from this tutorial

dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         sup_types='classification',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations= [nearby_aug_func,str.lower], 
                         # add "str.lower" here because nearby_aug might return uppercase character
                         val_ratio=0.2,
                         batch_size=1000,
                         seed=42,
                         num_proc=20,
                         verbose=False
                        )
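
As a quick illustration of what content_transformations does, here is the chain applied to a raw string. This is a sketch of the idea only; the controller applies these functions internally (together with prepending the metadata, e.g. 'Division Name . Title . Review Text'):

# illustrative only: each transformation is applied in order to the main text
s = 'Retro and Pretty. This top has a bit of a retro feel!'
for f in [text_normalize, str.lower]:
    s = f(s)
print(s) # e.g. 'retro and pretty . this top has a bit of a retro feel !'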

Define our tokenizer for GPT2

_tokenizer = AutoTokenizer.from_pretrained('gpt2')
_tokenizer.pad_token = _tokenizer.eos_token # GPT2 has no pad token by default, so reuse eos
_tokenizer.padding_side = 'left' # left-pad so the last position holds a real token, which GPT2 classifiers pool from
print(_tokenizer)
print(len(_tokenizer))
GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
    50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
50257

Process and tokenize our dataset

tdc.process_and_tokenize(_tokenizer,max_length=100,shuffle_trn=True)
tdc.main_ddict
DatasetDict({
    train: Dataset({
        features: ['Title', 'Review Text', 'Division Name', 'Department Name', 'label', 'input_ids', 'attention_mask'],
        num_rows: 18102
    })
    validation: Dataset({
        features: ['Title', 'Review Text', 'Division Name', 'Department Name', 'label', 'input_ids', 'attention_mask'],
        num_rows: 4526
    })
})
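
You can peek at one processed example to verify the metadata concatenation and the tokenization. A small sketch; exact values depend on the shuffle seed:

# inspect one processed training example (values vary with shuffling)
sample = tdc.main_ddict['train'][0]
print(sample['Review Text'][:80])
print(sample['label'])
print(_tokenizer.decode(sample['input_ids']).replace('<|endoftext|>','')[:80])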

Model Experiment: GPT2 Single-Head Classification

Define and train a vanilla GPT2 model

from transformers.models.gpt2.modeling_gpt2 import GPT2Model
from that_nlp_library.models.roberta.classifiers import ConcatHeadSimple
from that_nlp_library.model_main import *
from that_nlp_library.models.gpt2.classifiers import *
from sklearn.metrics import f1_score, accuracy_score

Using HuggingFace model initialization

from transformers.models.gpt2.modeling_gpt2 import GPT2ForSequenceClassification
num_classes = len(tdc.label_lists[0])
num_classes
6
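
The six classes live in tdc.label_lists. Judging from the predictions later in this notebook (label 1 = Dresses, 2 = Intimate, 4 = Tops), the ordering is likely alphabetical:

print(tdc.label_lists[0])
# likely ['Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend']
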
seed_everything(42)
model = GPT2ForSequenceClassification.from_pretrained('gpt2',num_labels=num_classes)
model = model.to('cuda:0')
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
model.config.pad_token_id = model.config.eos_token_id
model.resize_token_embeddings(len(_tokenizer))
Embedding(50257, 768)
metric_funcs = [partial(f1_score,average='macro'),accuracy_score]
controller = ModelController(model,tdc,seed=42)

And we can start training our model

lr = 8e-5
bs=32
wd=0.01
epochs= 3

controller.fit(epochs,lr,
               metric_funcs=metric_funcs,
               batch_size=bs,
               weight_decay=wd,
               save_checkpoint=False,
               compute_metrics=compute_metrics,
              )
[849/849 02:55, Epoch 3/3]
Epoch | Training Loss | Validation Loss | F1 Score (Department Name) | Accuracy Score (Department Name)
1     | No log        | 0.283675        | 0.739092                   | 0.910075
2     | 0.656600      | 0.261791        | 0.749196                   | 0.920901
3     | 0.656600      | 0.263783        | 0.751478                   | 0.922448

Using the GPT2Base model (designed not only for single-head classification, but also for multi-head, multi-label, …)

gpt2body = GPT2Model.from_pretrained('gpt2')
# our model is more complex, so it's best to define some of its arguments
_model_kwargs={
    # overall model hyperparams
    'head_class_sizes':num_classes,
    # classification head hyperparams
    'classifier_dropout':0.1 
}

model = model_init_classification(model_class = GPT2BaseForSequenceClassification,
                                  cpoint_path = 'gpt2', 
                                  output_hidden_states=False, # since we are not using the 'hidden layer concatenation' technique
                                  seed=42,
                                  body_model=gpt2body,
                                  model_kwargs = _model_kwargs)
Loading body weights. This assumes the body is the very first block of your custom architecture
Total parameters: 124444416
Total trainable parameters: 124444416
# resize token embeddings (a no-op here, since no new tokens were added, but required whenever the tokenizer grows)
model.body_model.resize_token_embeddings(len(_tokenizer))
Embedding(50257, 768)

Create ModelController and start training

metric_funcs = [partial(f1_score,average='macro'),accuracy_score]
controller = ModelController(model,tdc,seed=42)

And we can start training our model

lr = 8e-5
bs=32
wd=0.01
epochs= 3

controller.fit(epochs,lr,
               metric_funcs=metric_funcs,
               batch_size=bs,
               weight_decay=wd,
               save_checkpoint=False,
               compute_metrics=compute_metrics,
              )
[849/849 03:08, Epoch 3/3]
Epoch | Training Loss | Validation Loss | F1 Score (Department Name) | Accuracy Score (Department Name)
1     | No log        | 0.293438        | 0.736128                   | 0.910296
2     | 0.743200      | 0.263558        | 0.748740                   | 0.918692
3     | 0.743200      | 0.264788        | 0.746244                   | 0.917587

Make predictions

df_val = controller.predict_ddict(ds_type='validation')
-------------------- Start making predictions --------------------
df_val = df_val.to_pandas()
df_val.head()
  | Title | Review Text | Division Name | Department Name | label | input_ids | attention_mask | pred_Department Name | pred_prob_Department Name
0 |  | general petite . . such a fun jacket ! great t... | general petite | Intimate | 2 | [50256, 50256, 50256, 50256, 50256, 50256, 502... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | Jackets | 0.879402
1 | simple and elegant | general petite . simple and elegant . i though... | general petite | Tops | 4 | [24622, 4273, 578, 764, 2829, 290, 19992, 764,... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | Tops | 0.998374
2 | retro and pretty | general . retro and pretty . this top has a bi... | general | Tops | 4 | [50256, 50256, 50256, 50256, 50256, 50256, 502... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | Tops | 0.999834
3 | summer/fall wear | general petite . summer / fall wear . i first ... | general petite | Dresses | 1 | [50256, 50256, 50256, 50256, 50256, 50256, 502... | [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, ... | Dresses | 0.949195
4 | perfect except slip | general petite . perfect except slip . this is... | general petite | Dresses | 1 | [50256, 50256, 50256, 50256, 50256, 50256, 502... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | Dresses | 0.993209

You can recompute the metric to check that it matches the last training epoch's result above

f1_score(df_val['Department Name'],df_val['pred_Department Name'],average='macro')
0.7462441580902758

Model Experiment: GPT2 Single-Head Classification (with hidden layer concatenation)

Define and train a custom GPT2 model

from transformers.models.gpt2.modeling_gpt2 import GPT2Model
from that_nlp_library.models.roberta.classifiers import ConcatHeadSimple
from that_nlp_library.model_main import *
from that_nlp_library.models.gpt2.classifiers import *
from sklearn.metrics import f1_score, accuracy_score
num_classes = len(tdc.label_lists[0])
num_classes
6
gpt2body = GPT2Model.from_pretrained('gpt2')

Then we can define a classification head. One trick we can use to boost the performance of the entire model is to concatenate the last-token outputs from the last four layers of the pretrained GPT2 model (an approach adapted from this source: https://ieeexplore.ieee.org/document/9335912). We have already defined such a custom head (ConcatHeadSimple), along with the architecture needed to make it work (GPT2HiddenStateConcatForSequenceClassification)
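
Conceptually, the concatenation works as in the following sketch. This is a minimal illustration, not the library's actual implementation; it assumes left padding, so the final position always holds a real (non-pad) token:

import torch

def concat_last_token_states(hidden_states, layer2concat=4):
    # hidden_states: tuple of (batch, seq_len, hidden_size) tensors produced by the
    # body model when output_hidden_states=True (embedding output plus one per layer)
    last_layers = hidden_states[-layer2concat:]
    # take the final token's vector from each of the last `layer2concat` layers
    pooled = [h[:, -1, :] for h in last_layers]
    # result: (batch, layer2concat * hidden_size); a linear head maps this to class logits
    return torch.cat(pooled, dim=-1)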

# our model is more complex, so it's best to define some of its arguments
_model_kwargs={
    # overall model hyperparams
    'head_class_sizes':num_classes,
    'head_class': ConcatHeadSimple,
    # classification head hyperparams
    'layer2concat':2, # you can change the number of layers to concat (default is 4, based on the paper)
    'classifier_dropout':0.1 
}

model = model_init_classification(model_class = GPT2HiddenStateConcatForSequenceClassification,
                                  cpoint_path = 'gpt2', 
                                  output_hidden_states=True, # since we are using the 'hidden layer concatenation' technique
                                  seed=42,
                                  body_model=gpt2body,
                                  model_kwargs = _model_kwargs)
Loading body weights. This assumes the body is the very first block of your custom architecture
Total parameters: 124449030
Total trainable parameters: 124449030
# resize token embeddings (no-op here, since no new tokens were added)
model.body_model.resize_token_embeddings(len(_tokenizer))
Embedding(50257, 768)
metric_funcs = [partial(f1_score,average='macro'),accuracy_score]
controller = ModelController(model,tdc,seed=42)

And we can start training our model

seed_everything(42)
lr = 8e-5
bs=32
wd=0.01
epochs= 3

controller.fit(epochs,lr,
               metric_funcs=metric_funcs,
               batch_size=bs,
               weight_decay=wd,
               save_checkpoint=False,
               compute_metrics=compute_metrics,
              )

[849/849 03:15, Epoch 3/3]
Epoch | Training Loss | Validation Loss | F1 Score (Department Name) | Accuracy Score (Department Name)
1     | No log        | 0.316337        | 0.733385                   | 0.907866
2     | 0.719600      | 0.271895        | 0.746649                   | 0.915820
3     | 0.719600      | 0.269325        | 0.745756                   | 0.916924

controller.trainer.model.save_pretrained('./sample_weights/my_model')

Make predictions

Load trained model

_model_kwargs
{'head_class_sizes': 6,
 'head_class': that_nlp_library.models.roberta.classifiers.ConcatHeadSimple,
 'layer2concat': 2,
 'classifier_dropout': 0.1}
trained_model = model_init_classification(model_class = GPT2HiddenStateConcatForSequenceClassification,
                                          cpoint_path = Path('./sample_weights/my_model'), 
                                          output_hidden_states=True,
                                          seed=42,
                                          model_kwargs = _model_kwargs)

controller = ModelController(trained_model,tdc,seed=42)
Total parameters: 124449030
Total trainable parameters: 124449030

Predict Train/Validation set

df_val = controller.predict_ddict(ds_type='validation')
-------------------- Start making predictions --------------------
df_val = df_val.to_pandas()
df_val.head()
  | Title | Review Text | Division Name | Department Name | label | input_ids | attention_mask | pred_Department Name | pred_prob_Department Name
0 |  | general petite . . such a fun jacket ! great t... | general petite | Intimate | 2 | [50256, 50256, 50256, 50256, 50256, 50256, 502... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | Jackets | 0.937680
1 | simple and elegant | general petite . simple and elegant . i though... | general petite | Tops | 4 | [24622, 4273, 578, 764, 2829, 290, 19992, 764,... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | Tops | 0.997647
2 | retro and pretty | general . retro and pretty . this top has a bi... | general | Tops | 4 | [50256, 50256, 50256, 50256, 50256, 50256, 502... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | Tops | 0.998581
3 | summer/fall wear | general petite . summer / fall wear . i first ... | general petite | Dresses | 1 | [50256, 50256, 50256, 50256, 50256, 50256, 502... | [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, ... | Dresses | 0.973116
4 | perfect except slip | general petite . perfect except slip . this is... | general petite | Dresses | 1 | [50256, 50256, 50256, 50256, 50256, 50256, 502... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | Dresses | 0.996431

You can recompute the metric to check that it matches the last training epoch's result above

f1_score(df_val['Department Name'],df_val['pred_Department Name'],average='macro')
0.746104914178913

Predict Test set

We will go through the details of how to make predictions on a completely new, raw dataset using our trained model. For now, let's reuse the sample csv and pretend it's our test set

df_test = pd.read_csv('sample_data/Womens_Clothing_Reviews.csv',encoding='utf-8-sig').sample(frac=0.2,random_state=1)
# drop NaN values in the label column
df_test = df_test[~df_test['Department Name'].isna()].reset_index(drop=True)

# save the labels, as we will calculate some metrics later. We also exclude labels whose
# Review Text is NaN, since the same filtering will be applied to the test set
true_labels = df_test.loc[~df_test['Review Text'].isna(),'Department Name'].values

# drop the label column (not strictly necessary, but it simulates an actual unlabeled test set)
df_test.drop('Department Name',axis=1,inplace=True)
_test_dset = Dataset.from_pandas(df_test)
_test_dset_predicted = controller.predict_raw_dset(_test_dset,
                                                   do_filtering=True, # since we have some text filtering in the processing
                                                  )
-------------------- Start making predictions --------------------
df_test_predicted = _test_dset_predicted.to_pandas()
df_test_predicted.head()
  | Title | Review Text | Division Name | input_ids | attention_mask | pred_Department Name | pred_prob_Department Name
0 | perfect for work and play | general . perfect for work and play . this shi... | general | [50256, 50256, 50256, 50256, 50256, 50256, 502... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | Tops | 0.999322
1 |  | general petite . . i don't know why i had the ... | general petite | [24622, 4273, 578, 764, 764, 1312, 836, 470, 7... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | Bottoms | 0.988174
2 | great pants | general petite . great pants . thes e cords ar... | general petite | [50256, 50256, 50256, 50256, 50256, 50256, 502... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | Bottoms | 0.995248
3 | surprisingly comfy for a button down | general petite . surprisingly comfy for a butt... | general petite | [24622, 4273, 578, 764, 12362, 401, 24928, 329... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | Tops | 0.914309
4 | short and small | general petite . short and small . the shirt i... | general petite | [50256, 50256, 50256, 50256, 50256, 50256, 502... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | Tops | 0.997992

Let’s quickly check the f1 score to make sure everything works correctly

f1_score(true_labels,df_test_predicted['pred_Department Name'],average='macro')
0.757361738460204
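
As an extra sanity check, you can compute accuracy the same way, reusing the accuracy_score imported earlier:

accuracy_score(true_labels,df_test_predicted['pred_Department Name'])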

Predict top k results

_test_dset = Dataset.from_pandas(df_test)
_test_dset_predicted = controller.predict_raw_dset(_test_dset,
                                                   do_filtering=True,
                                                   topk=3
                                                  )
-------------------- Start making predictions --------------------
df_test_predicted = _test_dset_predicted.to_pandas()

df_test_predicted.head()
  | Title | Review Text | Division Name | input_ids | attention_mask | pred_Department Name | pred_prob_Department Name
0 | perfect for work and play | general . perfect for work and play . this shi... | general | [50256, 50256, 50256, 50256, 50256, 50256, 502... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | [Tops, Dresses, Jackets] | [0.9993216, 0.00019925594, 0.00019223447]
1 |  | general petite . . i don't know why i had the ... | general petite | [24622, 4273, 578, 764, 764, 1312, 836, 470, 7... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | [Bottoms, Intimate, Jackets] | [0.98817396, 0.0063540265, 0.004216876]
2 | great pants | general petite . great pants . thes e cords ar... | general petite | [50256, 50256, 50256, 50256, 50256, 50256, 502... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | [Bottoms, Intimate, Trend] | [0.995248, 0.004601938, 0.00012541868]
3 | surprisingly comfy for a button down | general petite . surprisingly comfy for a butt... | general petite | [24622, 4273, 578, 764, 12362, 401, 24928, 329... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | [Tops, Dresses, Bottoms] | [0.914309, 0.06820013, 0.00940009]
4 | short and small | general petite . short and small . the shirt i... | general petite | [50256, 50256, 50256, 50256, 50256, 50256, 502... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | [Tops, Intimate, Jackets] | [0.9979918, 0.00080933404, 0.0005731517]
# Since we have some metadata fields (Title and Division Name), we need to define a dictionary containing those values
raw_content={'Review Text': 'This shirt is so comfortable I love it!',
             'Title': 'Great shirt',
             'Division Name': 'general'}
controller.data_store.num_proc=1 # use a single process when predicting a single raw text
df_result = controller.predict_raw_text(raw_content,topk=3)
-------------------- Start making predictions --------------------
df_result
{'Review Text': ['general . great shirt . this shirt is so comfortable i love it !'],
 'Title': ['great shirt'],
 'Division Name': ['general'],
 'input_ids': [[24622,
   764,
   1049,
   10147,
   764,
   428,
   10147,
   318,
   523,
   6792,
   1312,
   1842,
   340,
   5145]],
 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
 'pred_Department Name': [['Tops', 'Trend', 'Dresses']],
 'pred_prob_Department Name': [[0.9987654685974121,
   0.0003842698351945728,
   0.0003074762353207916]]}
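
Since predict_raw_text returns lists (one entry per input text, each holding topk values), you can extract the top prediction like this:

# grab the top-1 class and its probability for the single input text
top_class = df_result['pred_Department Name'][0][0]
top_prob = df_result['pred_prob_Department Name'][0][0]
print(top_class, round(top_prob,4)) # Tops 0.9988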