GPT2 model (Multi Head)

This notebook contains some examples of how to use the GPT2-based models in this NLP library

In this series, we walk through some of the capabilities of this library: single-head classification, multi-head classification, multi-label classification, and regression. If you want a more detailed tutorial, check this out

import os

# Specify the GPU (or a comma-separated list of GPUs) to use for training
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

from that_nlp_library.text_transformation import *
from that_nlp_library.text_augmentation import *
from that_nlp_library.text_main import *
from that_nlp_library.utils import seed_everything

from underthesea import text_normalize
from functools import partial
from pathlib import Path
import pandas as pd
import numpy as np
import nlpaug.augmenter.char as nac
from datasets import load_dataset
import random
from transformers import AutoTokenizer
from datasets import Dataset

Define the custom augmentation function

def nlp_aug_stochastic(x,aug=None,p=0.5):
    """Randomly augment a text, or each text in a list, with probability `p`.

    Args:
        x: a single text, or a list of texts (e.g. one batch).
        aug: augmenter exposing `augment(data)`, which is called with either a
            single text or a list of texts and returns a list of augmented texts.
            If None, nothing is augmented.
        p: probability that any given text is augmented.

    Returns:
        For a non-list `x`: the augmented text, or `x` itself when it was not picked.
        For a list: a new list of the same length in which every output sits at the
        position of its input, so the result stays aligned with per-row data
        such as labels.
    """
    # No augmenter supplied: return the input untouched (a copy for lists)
    if aug is None:
        return list(x) if isinstance(x,list) else x
    if not isinstance(x,list):
        if random.random()<p: return aug.augment(x)[0]
        return x
    # One random draw per text, in order, to decide which positions get augmented
    picked = [i for i in range(len(x)) if random.random()<p]
    results = list(x)
    # only perform augmentation when needed, in a single batched call
    if picked:
        augmented = aug.augment([x[i] for i in picked])
        # write each augmented text back to the slot of the text it came from
        for i,new_text in zip(picked,augmented):
            results[i] = new_text
    return results

# Character-level keyboard augmenter from nlpaug.
# NOTE(review): the aug_* arguments presumably bound how many characters/words are altered - confirm against the nlpaug docs
aug = nac.KeyboardAug(aug_char_max=3,aug_char_p=0.1,aug_word_p=0.07)
# Bind the augmenter and p=0.3, so a text is only augmented when random.random() < 0.3
nearby_aug_func = partial(nlp_aug_stochastic,aug=aug,p=0.3)

Create a TextDataController object

We will reuse the data and the preprocessing steps in this tutorial

dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')

# Build the data controller for a two-head task: both label columns are declared as 'classification'.
# NOTE(review): the argument semantics noted below are inferred from their names - confirm against the TextDataController docs
tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names=['Division Name','Department Name'],
                         sup_types=['classification','classification'],
                         # presumably keeps only the rows for which every predicate returns True (non-None text and department)
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title'],
                         content_transformations=[text_normalize,str.lower],
                         # presumably applied in list order: stochastic keyboard augmentation, then lowercasing
                         content_augmentations= [nearby_aug_func,str.lower], 
                         val_ratio=0.2,
                         batch_size=1000,
                         seed=42,
                         num_proc=20,
                         verbose=False
                        )

Define our tokenizer for GPT2

_tokenizer = AutoTokenizer.from_pretrained('gpt2')
# The pretrained GPT2 tokenizer has no padding token, so reuse EOS for padding
# (the printed tokenizer below shows pad_token == eos_token == '<|endoftext|>')
_tokenizer.pad_token = _tokenizer.eos_token
# Pad on the left, so the padding ids come before the text in input_ids
_tokenizer.padding_side = 'left'

/home/quan/anaconda3/envs/nlp_dev/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
  warnings.warn(

print(_tokenizer)
print(len(_tokenizer))

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
    50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
50257

Process and tokenize our dataset

tdc.process_and_tokenize(_tokenizer,max_length=100,shuffle_trn=True)

tdc.main_ddict

DatasetDict({
    train: Dataset({
        features: ['Title', 'Review Text', 'Division Name', 'Department Name', 'label', 'input_ids', 'attention_mask'],
        num_rows: 18101
    })
    validation: Dataset({
        features: ['Title', 'Review Text', 'Division Name', 'Department Name', 'label', 'input_ids', 'attention_mask'],
        num_rows: 4526
    })
})

Model Experiment: GPT2 Vanilla Multihead classification

Define and train a vanilla GPT2Base model

from transformers.models.gpt2.modeling_gpt2 import GPT2Model

from that_nlp_library.models.roberta.classifiers import ConcatHeadSimple
from that_nlp_library.model_main import *
from that_nlp_library.models.gpt2.classifiers import *
from sklearn.metrics import f1_score, accuracy_score

gpt2body = GPT2Model.from_pretrained('gpt2')

/home/quan/anaconda3/envs/nlp_dev/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
  warnings.warn(

num_classes = [len(tdc.label_lists[0]),len(tdc.label_lists[1])] 
num_classes

[3, 6]

# our model is more complex, so it's best to define some of its arguments
_model_kwargs={
    # overall model hyperparams
    'head_class_sizes':num_classes, # [3, 6]: number of classes of each label column
    'is_multilabel':tdc.is_multilabel, # False
    'is_multihead':tdc.is_multihead, # True
    'head_weights':[1,1], # one weight per head; [1,1] weights both heads equally
    # classification head hyperparams
    'classifier_dropout':0.1 
}

model = model_init_classification(model_class = GPT2BaseForSequenceClassification,
                                  cpoint_path = 'gpt2', 
                                  output_hidden_states=False, # this vanilla experiment does not use the 'hidden layer concatenation' technique
                                  seed=42,
                                  body_model=gpt2body,
                                  model_kwargs = _model_kwargs)

Loading body weights. This assumes the body is the very first block of your custom architecture
Total parameters: 124446720
Total trainable parameters: 124446720

# resize token embedding
model.body_model.resize_token_embeddings(len(_tokenizer))

Embedding(50257, 768)

metric_funcs = [partial(f1_score,average='macro'),accuracy_score]
controller = ModelController(model,tdc,seed=42)

And we can start training our model

lr = 8e-5
bs=32
wd=0.01
epochs= 3

controller.fit(epochs,lr,
               metric_funcs=metric_funcs,
               batch_size=bs,
               weight_decay=wd,
               save_checkpoint=False,
               compute_metrics=compute_metrics,
              )

[849/849 06:52, Epoch 3/3]

Epoch	Training Loss	Validation Loss	F1 Score Division name	Accuracy Score Division name	F1 Score Department name	Accuracy Score Department name
1	No log	1.232870	0.408721	0.610252	0.619346	0.854839
2	1.711600	1.140891	0.453457	0.612240	0.676615	0.879585
3	1.711600	1.139938	0.465679	0.612019	0.680045	0.881794

controller.trainer.model.save_pretrained('./sample_weights/my_model1')

Make predictions

# Rebuild the same model class from the directory written by save_pretrained above
trained_model = model_init_classification(model_class = GPT2BaseForSequenceClassification,
                                          cpoint_path = Path('./sample_weights/my_model1'), 
                                          # NOTE(review): the training cell above used output_hidden_states=False - confirm True is intended here
                                          output_hidden_states=True,
                                          seed=42,
                                          model_kwargs = _model_kwargs)

# Rebind `controller` to a new ModelController wrapping the reloaded model
controller = ModelController(trained_model,tdc,seed=42)

Total parameters: 124446720
Total trainable parameters: 124446720

df_val = controller.predict_ddict(ds_type='validation')

-------------------- Start making predictions --------------------

df_val = df_val.to_pandas()
df_val.head()

	Title	Review Text	Division Name	Department Name	label	input_ids	attention_mask	pred_Division Name	pred_prob_Division Name	pred_Department Name	pred_prob_Department Name
0		. such a fun jacket ! great to wear in the spr...	General Petite	Intimate	[1, 2]	[50256, 50256, 50256, 50256, 50256, 50256, 502...	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...	General	0.529604	Jackets	0.734662
1	simple and elegant	simple and elegant . i thought this shirt was ...	General Petite	Tops	[1, 4]	[36439, 290, 19992, 764, 1312, 1807, 428, 1014...	[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...	General	0.551158	Tops	0.982016
2	retro and pretty	retro and pretty . this top has a bit of a ret...	General	Tops	[0, 4]	[50256, 50256, 50256, 50256, 50256, 50256, 502...	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...	General	0.656122	Tops	0.990658
3	summer/fall wear	summer / fall wear . i first spotted this on a...	General Petite	Dresses	[1, 1]	[50256, 50256, 50256, 50256, 50256, 50256, 502...	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, ...	General	0.550641	Dresses	0.932515
4	perfect except slip	perfect except slip . this is my new favorite ...	General Petite	Dresses	[1, 1]	[50256, 50256, 50256, 50256, 50256, 50256, 502...	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...	General	0.512942	Dresses	0.988941

You can compute the metrics yourself to check that they match the last training epoch above

f1_score(df_val['Division Name'],df_val['pred_Division Name'],average='macro')
# should match the epoch-3 "F1 Score Division name" in the training log above (0.465679)

0.4656789717699823

f1_score(df_val['Department Name'],df_val['pred_Department Name'],average='macro')
# should match the epoch-3 "F1 Score Department name" in the training log above (0.680045)

0.6800451854614634

Model Experiment: GPT2 Multi-Head Classification (with Hidden Layer Concatenation)

from transformers.models.gpt2.modeling_gpt2 import GPT2Model

from that_nlp_library.models.roberta.classifiers import ConcatHeadSimple
from that_nlp_library.model_main import *
from that_nlp_library.models.gpt2.classifiers import *
from sklearn.metrics import f1_score, accuracy_score

Define and train a custom GPT2 model

num_classes = [len(tdc.label_lists[0]),len(tdc.label_lists[1])] 
num_classes

[3, 6]

gpt2body = GPT2Model.from_pretrained('gpt2')

/home/quan/anaconda3/envs/nlp_dev/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
  warnings.warn(

# our model is more complex, so it's best to define some of its arguments
_model_kwargs={
    # overall model hyperparams
    'head_class_sizes':num_classes, # [3, 6]: number of classes of each label column
    'head_class': ConcatHeadSimple,
    'is_multilabel':tdc.is_multilabel, # False
    'is_multihead':tdc.is_multihead, # True
    'head_weights':[1,1], # weights for label 1 and label 2; [1,1] weights both equally (e.g. [1,2] would make L2's weight twice L1's)
    # classification head hyperparams
    'layer2concat':2, # you can change the number of layers to concat (default is 4, based on the paper)
    'classifier_dropout':0.1 
}

model = model_init_classification(model_class = GPT2HiddenStateConcatForSequenceClassification,
                                  cpoint_path = 'gpt2', 
                                  output_hidden_states=True, # since we are using 'hidden layer concatenation' technique
                                  seed=42,
                                  body_model=gpt2body,
                                  model_kwargs = _model_kwargs)

Loading body weights. This assumes the body is the very first block of your custom architecture
Total parameters: 124453641
Total trainable parameters: 124453641

metric_funcs = [partial(f1_score,average='macro'),accuracy_score]
controller = ModelController(model,tdc,seed=42)

And we can start training our model

seed_everything(42)

lr = 8e-5
bs=32
wd=0.01
epochs= 3

controller.fit(epochs,lr,
               metric_funcs=metric_funcs,
               batch_size=bs,
               weight_decay=wd,
               save_checkpoint=False,
               compute_metrics=compute_metrics,
              )

[849/849 06:50, Epoch 3/3]

Epoch	Training Loss	Validation Loss	F1 Score Division name	Accuracy Score Division name	F1 Score Department name	Accuracy Score Department name
1	No log	1.443740	0.355963	0.602298	0.597171	0.846001
2	2.155500	1.200898	0.446875	0.610252	0.657054	0.867212
3	2.155500	1.193677	0.459581	0.604507	0.659238	0.869863

# NOTE: this is the same directory the vanilla experiment saved to earlier in this notebook
controller.trainer.model.save_pretrained('./sample_weights/my_model1')

Make predictions

Load trained model

_model_kwargs

{'head_class_sizes': [3, 6],
 'head_class': that_nlp_library.models.roberta.classifiers.ConcatHeadSimple,
 'is_multilabel': False,
 'is_multihead': True,
 'head_weights': [1, 1],
 'layer2concat': 2,
 'classifier_dropout': 0.1}

trained_model = model_init_classification(model_class = GPT2HiddenStateConcatForSequenceClassification,
                                          cpoint_path = Path('./sample_weights/my_model1'), 
                                          output_hidden_states=True,
                                          seed=42,
                                          model_kwargs = _model_kwargs)

controller = ModelController(trained_model,tdc,seed=42)

Total parameters: 124453641
Total trainable parameters: 124453641

Predict Train/Validation set

df_val = controller.predict_ddict(ds_type='validation')

-------------------- Start making predictions --------------------

df_val = df_val.to_pandas()
df_val.head()

	Title	Review Text	Division Name	Department Name	label	input_ids	attention_mask	pred_Division Name	pred_prob_Division Name	pred_Department Name	pred_prob_Department Name
0		. such a fun jacket ! great to wear in the spr...	General Petite	Intimate	[1, 2]	[50256, 50256, 50256, 50256, 50256, 50256, 502...	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...	General	0.481536	Jackets	0.862960
1	simple and elegant	simple and elegant . i thought this shirt was ...	General Petite	Tops	[1, 4]	[36439, 290, 19992, 764, 1312, 1807, 428, 1014...	[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...	General	0.632173	Tops	0.988462
2	retro and pretty	retro and pretty . this top has a bit of a ret...	General	Tops	[0, 4]	[50256, 50256, 50256, 50256, 50256, 50256, 502...	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...	General	0.620987	Tops	0.988526
3	summer/fall wear	summer / fall wear . i first spotted this on a...	General Petite	Dresses	[1, 1]	[50256, 50256, 50256, 50256, 50256, 50256, 502...	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, ...	General	0.670211	Dresses	0.925290
4	perfect except slip	perfect except slip . this is my new favorite ...	General Petite	Dresses	[1, 1]	[50256, 50256, 50256, 50256, 50256, 50256, 502...	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...	General	0.562400	Dresses	0.983849

You can compute the metrics yourself to check that they match the last training epoch above

f1_score(df_val['Division Name'],df_val['pred_Division Name'],average='macro')

0.45352717018986105

f1_score(df_val['Department Name'],df_val['pred_Department Name'],average='macro')

0.6628016843655465

Predict Test set

We will go through details on how to make a prediction on a completely new and raw dataset using our trained model. For now, let’s reuse the sample csv and pretend it’s our test set

# Pretend a 20% sample of the csv is an unseen test set (random_state=1 fixes the sample)
df_test = pd.read_csv('sample_data/Womens_Clothing_Reviews.csv',encoding='utf-8-sig').sample(frac=0.2,random_state=1)
# drop rows whose 'Department Name' label is NaN
df_test = df_test[~df_test['Department Name'].isna()].reset_index(drop=True)

# Save the 'Department Name' labels, as we will calculate some metrics later.
# Rows with NaN 'Review Text' are excluded here because the prediction step below filters the test set
# (do_filtering=True), and the labels must line up with the rows that remain
true_labels = df_test.loc[~df_test['Review Text'].isna(),'Department Name'].values 

# drop both label columns (you don't need to, but this is necessary to simulate an actual test set)
df_test.drop(['Division Name','Department Name'],axis=1,inplace=True)

_test_dset = Dataset.from_pandas(df_test)
_test_dset_predicted = controller.predict_raw_dset(_test_dset,
                                                   do_filtering=True, # since we have some text filtering in the processing
                                                  )

-------------------- Start making predictions --------------------

df_test_predicted = _test_dset_predicted.to_pandas()

df_test_predicted.head()

	Title	Review Text	input_ids	attention_mask	pred_Division Name	pred_prob_Division Name	pred_Department Name	pred_prob_Department Name
0	perfect for work and play	perfect for work and play . this shirt works f...	[50256, 50256, 50256, 50256, 50256, 50256, 502...	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...	General	0.609993	Tops	0.990637
1		. i don't know why i had the opposite problem ...	[13, 1312, 836, 470, 760, 1521, 1312, 550, 262...	[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...	General	0.814815	Bottoms	0.995138
2	great pants	great pants . thes e cords are great--lightwei...	[50256, 50256, 50256, 50256, 50256, 50256, 502...	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...	General	0.658985	Bottoms	0.994834
3	surprisingly comfy for a button down	surprisingly comfy for a button down . i am a ...	[41199, 401, 24928, 329, 257, 4936, 866, 764, ...	[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...	General	0.692814	Tops	0.975600
4	short and small	short and small . the shirt is mostly a thick ...	[50256, 50256, 50256, 50256, 50256, 50256, 502...	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...	General	0.621851	Tops	0.947042

Let’s quickly check the f1 score to make sure everything works correctly

f1_score(true_labels,df_test_predicted['pred_Department Name'],average='macro')

0.6731872399763867

Predict top k results

_test_dset = Dataset.from_pandas(df_test)
_test_dset_predicted = controller.predict_raw_dset(_test_dset,
                                                   do_filtering=True,
                                                   topk=3
                                                  )

-------------------- Start making predictions --------------------

df_test_predicted = _test_dset_predicted.to_pandas()

df_test_predicted.head()

	Title	Review Text	input_ids	attention_mask	pred_Division Name	pred_prob_Division Name	pred_Department Name	pred_prob_Department Name
0	perfect for work and play	perfect for work and play . this shirt works f...	[50256, 50256, 50256, 50256, 50256, 50256, 502...	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...	[General, General Petite, Initmates]	[0.6099926, 0.38029346, 0.009713977]	[Tops, Intimate, Jackets]	[0.99063677, 0.0068505756, 0.0011757102]
1		. i don't know why i had the opposite problem ...	[13, 1312, 836, 470, 760, 1521, 1312, 550, 262...	[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...	[General, General Petite, Initmates]	[0.81481504, 0.17968053, 0.005504485]	[Bottoms, Intimate, Trend]	[0.9951379, 0.0033080056, 0.00085609016]
2	great pants	great pants . thes e cords are great--lightwei...	[50256, 50256, 50256, 50256, 50256, 50256, 502...	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...	[General, General Petite, Initmates]	[0.65898484, 0.31146494, 0.029550197]	[Bottoms, Intimate, Trend]	[0.99483407, 0.0045331665, 0.00034278215]
3	surprisingly comfy for a button down	surprisingly comfy for a button down . i am a ...	[41199, 401, 24928, 329, 257, 4936, 866, 764, ...	[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...	[General, General Petite, Initmates]	[0.6928137, 0.29272404, 0.014462292]	[Tops, Jackets, Intimate]	[0.9755996, 0.011239706, 0.0076327748]
4	short and small	short and small . the shirt is mostly a thick ...	[50256, 50256, 50256, 50256, 50256, 50256, 502...	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...	[General, General Petite, Initmates]	[0.6218508, 0.32296118, 0.055188037]	[Tops, Intimate, Jackets]	[0.94704163, 0.0429352, 0.0054707266]

# Since we have some metadata (Title), the raw input is a dictionary that contains those values as well as the main text
raw_content={'Review Text': 'This shirt is so comfortable I love it!',
             'Title': 'Great shirt'}

controller.data_store.num_proc=1

df_result = controller.predict_raw_text(raw_content,topk=3)

-------------------- Start making predictions --------------------

df_result

{'Review Text': ['great shirt . this shirt is so comfortable i love it !'],
 'Title': ['great shirt'],
 'input_ids': [[18223,
   10147,
   764,
   428,
   10147,
   318,
   523,
   6792,
   1312,
   1842,
   340,
   5145]],
 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
 'pred_Division Name': [['General', 'General Petite', 'Initmates']],
 'pred_prob_Division Name': [[0.6347099542617798,
   0.347072035074234,
   0.018217984586954117]],
 'pred_Department Name': [['Tops', 'Intimate', 'Jackets']],
 'pred_prob_Department Name': [[0.9934250116348267,
   0.005981422495096922,
   0.00047667595208622515]]}