Roberta model with Deep Hierarchical Classification

This notebook contains an example of how to use the Roberta-based models in this NLP library.

In this tutorial, we walk through another special case of classification with multiple heads, which is based on this paper: https://arxiv.org/ftp/arxiv/papers/2005/2005.06692.pdf

import os

# Select the GPU (or a comma-separated list of GPUs) to use for training
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

from that_nlp_library.text_transformation import *
from that_nlp_library.text_augmentation import *
from that_nlp_library.text_main import *
from that_nlp_library.utils import seed_everything

from underthesea import text_normalize
from functools import partial
from pathlib import Path
import pandas as pd
import numpy as np
import nlpaug.augmenter.char as nac
from datasets import load_dataset
import random
from transformers import RobertaTokenizer
from datasets import Dataset

Define the custom augmentation function

def nlp_aug_stochastic(x,aug=None,p=0.5):
    """Randomly augment a text, or a batch of texts, with probability `p` per text.

    Args:
        x: a single text, or a list of texts. Anything that is not a `list`
            is treated as a single text.
        aug: augmenter object exposing `augment(data)`; it is expected to
            return a list of augmented texts (one per input text).
        p: probability that any individual text gets augmented.

    Returns:
        For a single text: the augmented text, or `x` itself when it was not
        selected. For a list: a new list with the same length and the same
        order as `x`, in which the selected texts are replaced by their
        augmented versions.
    """
    if not isinstance(x,list):
        if random.random()<p: return aug.augment(x)[0]
        return x

    # One random draw per text, taken in input order.
    picked = [i for i in range(len(x)) if random.random()<p]
    result = list(x)
    # only perform augmentation when needed
    if picked:
        augmented = aug.augment([x[i] for i in picked])
        # Put every augmented text back at its original position so the output
        # stays aligned with anything indexed alongside `x` (e.g. labels).
        for i,new_text in zip(picked,augmented):
            result[i] = new_text
    return result

# Character-level keyboard augmenter from nlpaug (simulates typos).
# The partial below fixes the augmenter and p=0.3, so each text has a 30% chance of being augmented.
aug = nac.KeyboardAug(aug_char_max=3,aug_char_p=0.1,aug_word_p=0.07)
nearby_aug_func = partial(nlp_aug_stochastic,aug=aug,p=0.3)

Create a TextDataController object

In this tutorial, we will reuse the same data and preprocessing steps as before.

# Load the sample reviews CSV as a Hugging Face dataset (the 'train' split holds all rows).
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')

# Two label columns ('Division Name', 'Department Name'), both declared as classification targets.
# NOTE(review): filter_dict presumably keeps only rows for which every lambda returns True,
#   i.e. rows whose 'Review Text' and 'Department Name' are not missing — confirm against TextDataController.
# NOTE(review): str.lower is listed again after the augmentation, presumably so that augmented
#   text stays lowercase — confirm.
tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names=['Division Name','Department Name'],
                         sup_types=['classification','classification'],
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations= [nearby_aug_func,str.lower], 
                         val_ratio=0.2,
                         batch_size=1000,
                         seed=42,
                         num_proc=20,
                         verbose=False
                        )

Define our tokenizer for Roberta

_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

/home/quan/anaconda3/envs/nlp_dev/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
  warnings.warn(

Process and tokenize our dataset

tdc.process_and_tokenize(_tokenizer,max_length=100,shuffle_trn=True)

tdc.main_ddict

DatasetDict({
    train: Dataset({
        features: ['Title', 'Review Text', 'Division Name', 'Department Name', 'label', 'input_ids', 'attention_mask'],
        num_rows: 18101
    })
    validation: Dataset({
        features: ['Title', 'Review Text', 'Division Name', 'Department Name', 'label', 'input_ids', 'attention_mask'],
        num_rows: 4526
    })
})

Model Experiment: Roberta Multi-Head Classification (with Hidden Layer Concatenation)

from that_nlp_library.models.roberta.deep_hierarchical_classifiers import *
from that_nlp_library.models.roberta.classifiers import *
from that_nlp_library.model_main import *
from sklearn.metrics import f1_score, accuracy_score
from transformers.models.roberta.modeling_roberta import RobertaModel
import torch

Build DHC Conditional Mask

tdc.label_names

['Division Name', 'Department Name']

tdc.label_lists

[['General', 'General Petite', 'Initmates'],
 ['Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend']]

# Pull the training split into pandas, then expand each row's label list
# into one column per label name.
df_trn = tdc.main_ddict['train'].to_pandas()

df_labels = pd.DataFrame(df_trn['label'].tolist(),columns=tdc.label_names)

df_labels.head()

	Division Name	Department Name
0	0	4
1	1	1
2	1	1
3	1	3
4	0	1

dhc_mask = build_DHC_conditional_mask(df_labels,*tdc.label_names)

dhc_mask.shape

torch.Size([3, 6])

dhc_mask

tensor([[1., 1., 0., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.],
        [0., 0., 1., 0., 0., 0.]])

tdc.label_lists

[['General', 'General Petite', 'Initmates'],
 ['Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend']]

Explain the first row of the mask (for the first label of Division Name: General)

dhc_mask[0]

tensor([1., 1., 0., 1., 1., 1.])

Take the positions where this mask row is 1 and look up the corresponding Department Name labels. The results are the sub-categories (departments) that belong to the Division Name 'General'.

# Positions where the first mask row is 1; `.tolist()` yields plain Python ints,
# so they can index the Department Name label list directly.
for i in torch.where(dhc_mask[0]==1)[0].tolist():
    print(tdc.label_lists[1][i])

Bottoms
Dresses
Jackets
Tops
Trend

# let's double check with the original data
np.sort(df_trn[df_trn['Division Name']=='General']['Department Name'].unique())

array(['Bottoms', 'Dresses', 'Jackets', 'Tops', 'Trend'], dtype=object)

Define and train a custom Roberta model

Full model

roberta_body = RobertaModel.from_pretrained('roberta-base')

/home/quan/anaconda3/envs/nlp_dev/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
  warnings.warn(
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

# Keyword arguments handed to model_init_classification as `model_kwargs`.
_model_kwargs={
    'dhc_mask':dhc_mask,  # the conditional mask built from the training labels
    'classifier_dropout':0.1,
    'last_hidden_size':768,  # NOTE(review): presumably the hidden size of roberta-base — confirm if the body model changes
    'linear_l1_size':389,
    'linear_l2_size':417,
    'lloss_weight':1.0,  # NOTE(review): presumably the weight of the DHC layer loss — confirm
    'dloss_weight':0.8,  # NOTE(review): presumably the weight of the DHC dependence loss — confirm
    'layer2concat':4,  # NOTE(review): presumably the number of hidden layers to concatenate — confirm
}

# Build the custom classifier around the pretrained Roberta body loaded earlier.
# NOTE(review): output_hidden_states=True is presumably needed so the head can use
#   several hidden layers ('layer2concat') — confirm.
model = model_init_classification(model_class = RobertaHSCDHCSequenceClassification,
                                  cpoint_path = 'roberta-base', 
                                  output_hidden_states=True,
                                  seed=42,
                                  body_model=roberta_body,
                                  model_kwargs = _model_kwargs)

# Metrics to report: macro-averaged F1 and accuracy.
metric_funcs = [partial(f1_score,average='macro'),accuracy_score]
controller = ModelController(model,tdc,seed=42)

Loading body weights. This assumes the body is the very first block of your custom architecture
Total parameters: 127631522
Total trainable parameters: 127631522

And we can start training our model

seed_everything(42)

lr = 1e-4
bs=32
wd=0.01
epochs= 3

controller.fit(epochs,lr,
               metric_funcs=metric_funcs,
               batch_size=bs,
               weight_decay=wd,
               save_checkpoint=False,
               compute_metrics=compute_metrics_separate_heads,
              )

[849/849 06:06, Epoch 3/3]

Epoch	Training Loss	Validation Loss	F1 Score Division name	Accuracy Score Division name	F1 Score Department name	Accuracy Score Department name
1	No log	3.563722	0.409837	0.612903	0.649168	0.867433
2	3.664900	3.445055	0.439022	0.616880	0.679065	0.881573
3	3.664900	3.447504	0.450244	0.620194	0.681202	0.883120

Simpler model

seed_everything(42)

roberta_body = RobertaModel.from_pretrained('roberta-base')

/home/quan/anaconda3/envs/nlp_dev/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
  warnings.warn(
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

_model_kwargs={
    'dhc_mask':dhc_mask,
    'lloss_weight':1.0,
    'dloss_weight':0.8,
    'layer2concat':4,
}

model = model_init_classification(model_class = RobertaSimpleHSCDHCSequenceClassification,
                                  cpoint_path = 'roberta-base', 
                                  output_hidden_states=True,
                                  seed=42,
                                  body_model=roberta_body,
                                  model_kwargs = _model_kwargs)

metric_funcs = [partial(f1_score,average='macro'),accuracy_score]
controller = ModelController(model,tdc,seed=42)

Loading body weights. This assumes the body is the very first block of your custom architecture
Total parameters: 124664112
Total trainable parameters: 124664112

And we can start training our model

lr = 1e-4
bs=32
wd=0.01
epochs= 3

controller.fit(epochs,lr,
               metric_funcs=metric_funcs,
               batch_size=bs,
               weight_decay=wd,
               save_checkpoint=False,
               compute_metrics=compute_metrics_separate_heads,
              )

[849/849 04:37, Epoch 3/3]

Epoch	Training Loss	Validation Loss	F1 Score Division name	Accuracy Score Division name	F1 Score Department name	Accuracy Score Department name
1	No log	3.566824	0.430449	0.610915	0.627627	0.859258
2	3.663500	3.459148	0.477776	0.609147	0.673201	0.880247
3	3.663500	3.456095	0.468276	0.617985	0.686397	0.886655

controller.trainer.model.save_pretrained('./sample_weights/my_model')

Make predictions

Load trained model

_model_kwargs

{'dhc_mask': tensor([[1., 1., 0., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1.],
         [0., 0., 1., 0., 0., 0.]]),
 'lloss_weight': 1.0,
 'dloss_weight': 0.8,
 'layer2concat': 4}

# Rebuild the same architecture with the same model kwargs, this time pointing
# cpoint_path at the directory written by save_pretrained; no body_model is passed here.
trained_model = model_init_classification(model_class = RobertaSimpleHSCDHCSequenceClassification,
                                          cpoint_path = Path('./sample_weights/my_model'), 
                                          output_hidden_states=True,
                                          seed=42,
                                          model_kwargs = _model_kwargs)

# Fresh controller wrapping the reloaded model and the same data controller.
controller = ModelController(trained_model,tdc,seed=42)

Some weights of the model checkpoint at sample_weights/my_model were not used when initializing RobertaSimpleHSCDHCSequenceClassification: ['body_model.pooler.dense.bias', 'body_model.pooler.dense.weight']
- This IS expected if you are initializing RobertaSimpleHSCDHCSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaSimpleHSCDHCSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).

Total parameters: 124073520
Total trainable parameters: 124073520

Predict Train/Validation set

df_val = controller.predict_ddict(ds_type='validation',are_heads_separated=True)

-------------------- Start making predictions --------------------

df_val = df_val.to_pandas()
df_val.head()

	Title	Review Text	Division Name	Department Name	label	input_ids	attention_mask	pred_Division Name	pred_prob_Division Name	pred_Department Name	pred_prob_Department Name
0		. such a fun jacket ! great to wear in the spr...	General Petite	Intimate	[1, 2]	[0, 4, 215, 10, 1531, 8443, 27785, 372, 7, 356...	[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...	General	0.570708	Jackets	0.732757
1	simple and elegant	simple and elegant . i thought this shirt was ...	General Petite	Tops	[1, 4]	[0, 41918, 8, 14878, 479, 939, 802, 42, 6399, ...	[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...	General	0.658240	Tops	0.987063
2	retro and pretty	retro and pretty . this top has a bit of a ret...	General	Tops	[0, 4]	[0, 4903, 1001, 8, 1256, 479, 42, 299, 34, 10,...	[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...	General	0.639935	Tops	0.982640
3	summer/fall wear	summer / fall wear . i first spotted this on a...	General Petite	Dresses	[1, 1]	[0, 18581, 2089, 1589, 1136, 3568, 479, 939, 7...	[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...	General	0.526535	Dresses	0.963429
4	perfect except slip	perfect except slip . this is my new favorite ...	General Petite	Dresses	[1, 1]	[0, 20473, 4682, 9215, 479, 42, 16, 127, 92, 2...	[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...	General	0.574504	Dresses	0.980230

You can compute the metrics yourself to check that they match the values from the last training epoch above.

f1_score(df_val['Division Name'],df_val['pred_Division Name'],average='macro')
# expected to be close to the epoch-3 'F1 Score Division name' in the training log (0.468276)

0.46820287596441723

f1_score(df_val['Department Name'],df_val['pred_Department Name'],average='macro')
# expected to match the epoch-3 'F1 Score Department name' in the training log (0.686397)

0.6863969355585954

Predict Test set

We will go through details on how to make a prediction on a completely new and raw dataset using our trained model. For now, let’s reuse the sample csv and pretend it’s our test set

# Read the sample CSV and take a reproducible 20% sample to act as the test set.
df_test = pd.read_csv('sample_data/Womens_Clothing_Reviews.csv',encoding='utf-8-sig')
df_test = df_test.sample(frac=0.2,random_state=1)
# Keep only rows that have a Department Name label.
has_department = df_test['Department Name'].notna()
df_test = df_test[has_department].reset_index(drop=True)

# Save the labels for the metric computed later. Only rows with a Review Text are kept,
# because rows with a missing Review Text will be filtered out of the test set during prediction.
has_review = df_test['Review Text'].notna()
true_labels = df_test.loc[has_review,'Department Name'].values

# Remove the label columns (not required, but it simulates a real, unlabeled test set).
df_test = df_test.drop(['Division Name','Department Name'],axis=1)

# Wrap the label-free dataframe as a Hugging Face Dataset and run the trained model on it.
# NOTE(review): do_filtering=True presumably applies the controller's filters to the raw data
#   (dropping rows without Review Text), which is why true_labels was restricted to those rows — confirm.
_test_dset = Dataset.from_pandas(df_test)
_test_dset_predicted = controller.predict_raw_dset(_test_dset,
                                                   do_filtering=True,
                                                   are_heads_separated=True,
                                                  )

-------------------- Start making predictions --------------------

df_test_predicted = _test_dset_predicted.to_pandas()

df_test_predicted.head()

	Title	Review Text	input_ids	attention_mask	pred_Division Name	pred_prob_Division Name	pred_Department Name	pred_prob_Department Name
0	perfect for work and play	perfect for work and play . this shirt works f...	[0, 20473, 13, 173, 8, 310, 479, 42, 6399, 136...	[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...	General	0.650245	Tops	0.986601
1		. i don't know why i had the opposite problem ...	[0, 4, 939, 218, 75, 216, 596, 939, 56, 5, 548...	[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...	General	0.750323	Bottoms	0.985996
2	great pants	great pants . thes e cords are great--lightwei...	[0, 12338, 9304, 479, 5, 29, 364, 37687, 32, 3...	[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...	General	0.641696	Bottoms	0.979315
3	surprisingly comfy for a button down	surprisingly comfy for a button down . i am a ...	[0, 33258, 3137, 24382, 13, 10, 6148, 159, 479...	[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...	General	0.644696	Tops	0.967189
4	short and small	short and small . the shirt is mostly a thick ...	[0, 20263, 8, 650, 479, 5, 6399, 16, 2260, 10,...	[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...	General	0.566876	Tops	0.929597

Let’s quickly check the f1 score to make sure everything works correctly

f1_score(true_labels,df_test_predicted['pred_Department Name'],average='macro')

0.7083328159471795

Predict top k results

_test_dset = Dataset.from_pandas(df_test)
_test_dset_predicted = controller.predict_raw_dset(_test_dset,
                                                   do_filtering=True,
                                                   topk=3,
                                                   are_heads_separated=True
                                                  )

-------------------- Start making predictions --------------------

df_test_predicted = _test_dset_predicted.to_pandas()

df_test_predicted.head()

	Title	Review Text	input_ids	attention_mask	pred_Division Name	pred_prob_Division Name	pred_Department Name	pred_prob_Department Name
0	perfect for work and play	perfect for work and play . this shirt works f...	[0, 20473, 13, 173, 8, 310, 479, 42, 6399, 136...	[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...	[General, General Petite, Initmates]	[0.65024525, 0.34086585, 0.008888904]	[Tops, Intimate, Jackets]	[0.98660094, 0.010788872, 0.0011576503]
1		. i don't know why i had the opposite problem ...	[0, 4, 939, 218, 75, 216, 596, 939, 56, 5, 548...	[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...	[General, General Petite, Initmates]	[0.7503231, 0.24517073, 0.0045061694]	[Bottoms, Intimate, Dresses]	[0.9859962, 0.007048236, 0.0063092075]
2	great pants	great pants . thes e cords are great--lightwei...	[0, 12338, 9304, 479, 5, 29, 364, 37687, 32, 3...	[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...	[General, General Petite, Initmates]	[0.64169586, 0.3475547, 0.010749483]	[Bottoms, Intimate, Dresses]	[0.97931457, 0.016918503, 0.002942416]
3	surprisingly comfy for a button down	surprisingly comfy for a button down . i am a ...	[0, 33258, 3137, 24382, 13, 10, 6148, 159, 479...	[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...	[General, General Petite, Initmates]	[0.6446963, 0.33823726, 0.017066495]	[Tops, Intimate, Dresses]	[0.96718866, 0.018242536, 0.0062063746]
4	short and small	short and small . the shirt is mostly a thick ...	[0, 20263, 8, 650, 479, 5, 6399, 16, 2260, 10,...	[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...	[General, General Petite, Initmates]	[0.5668759, 0.363859, 0.06926514]	[Tops, Intimate, Jackets]	[0.92959666, 0.06315183, 0.0059652175]

# 'Title' was registered as metadata (metadatas=['Title']), so a single raw input is given
# as a dictionary holding both the main text and the metadata value.
raw_content={'Review Text': 'This shirt is so comfortable I love it!',
             'Title': 'Great shirt'}

# NOTE(review): presumably a single process is enough when predicting one sample — confirm.
controller.data_store.num_proc=1

# Despite the `df_` prefix, the displayed result is a plain dict of lists, not a DataFrame.
df_result = controller.predict_raw_text(raw_content,are_heads_separated=True,topk=3)

-------------------- Start making predictions --------------------

df_result

{'Review Text': ['great shirt . this shirt is so comfortable i love it !'],
 'Title': ['great shirt'],
 'input_ids': [[0,
   12338,
   6399,
   479,
   42,
   6399,
   16,
   98,
   3473,
   939,
   657,
   24,
   27785,
   2]],
 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
 'pred_Division Name': [['General', 'General Petite', 'Initmates']],
 'pred_prob_Division Name': [[0.6600767374038696,
   0.33104702830314636,
   0.008876222185790539]],
 'pred_Department Name': [['Tops', 'Intimate', 'Jackets']],
 'pred_prob_Department Name': [[0.9869695901870728,
   0.010812523774802685,
   0.0010536855552345514]]}