Roberta model (Multi-Label)

This notebook contains some examples of how to use the Roberta-based models in this NLP library

In this series, we walk through some of the capabilities of this library: single-head classification, multi-head classification, multi-label classification, and regression. If you want a more detailed tutorial, check this out

import os
# Specify the GPU (or a comma-separated list of GPUs) visible for training
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
from that_nlp_library.text_transformation import *
from that_nlp_library.text_augmentation import *
from that_nlp_library.text_main import *
from that_nlp_library.utils import seed_everything
from underthesea import text_normalize
from functools import partial
from pathlib import Path
import pandas as pd
import numpy as np
import nlpaug.augmenter.char as nac
from datasets import load_dataset
import random
from transformers import RobertaTokenizer
from datasets import Dataset

Define the custom augmentation function

def nlp_aug_stochastic(x,aug=None,p=0.5):
    # augment a single string with probability p
    if not isinstance(x,list): 
        if random.random()<p: return aug.augment(x)[0]
        return x
    # for a list of strings, decide per string whether it will be augmented
    news=[]
    originals=[]
    for _x in x:
        if random.random()<p: news.append(_x)
        else: originals.append(_x)
    # only perform augmentation when needed; augmented strings come first in the result
    if len(news): news = aug.augment(news)
    return news+originals
aug = nac.KeyboardAug(aug_char_max=3,aug_char_p=0.1,aug_word_p=0.07)
nearby_aug_func = partial(nlp_aug_stochastic,aug=aug,p=0.3)
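
As a quick sanity check, we can run the augmenter on a couple of strings (illustrative; nlpaug's output varies between runs, and note that augmented strings come before unaugmented ones in the result):

# illustrative sanity check of the stochastic augmenter
random.seed(42)
print(nearby_aug_func(['this dress fits perfectly','lovely fabric and color']))
# with p=0.3, each string has a ~30% chance of receiving keyboard-typo noise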

Create a TextDataController object

We will reuse the data and the preprocessing steps from this tutorial

Construct a ‘multi-label’ column by using both Department Name and Division Name

df = pd.read_csv('sample_data/Womens_Clothing_Reviews.csv',encoding='utf-8-sig')
df = df[~df['Department Name'].isna()].reset_index(drop=True)
df['Multi_Label'] =  df[['Department Name','Division Name']].values.tolist()
# df['Fake Label'] = [np.random.choice(df['Department Name'].unique()[:-1],size=np.random.randint(2,6),replace=False) for _ in range(len(df))]
df.head()
  | Clothing ID | Age | Title | Review Text | Rating | Recommended IND | Positive Feedback Count | Division Name | Department Name | Class Name | Multi_Label
0 | 767 | 33 | NaN | Absolutely wonderful - silky and sexy and comf... | 4 | 1 | 0 | Initmates | Intimate | Intimates | [Intimate, Initmates]
1 | 1080 | 34 | NaN | Love this dress! it's sooo pretty. i happene... | 5 | 1 | 4 | General | Dresses | Dresses | [Dresses, General]
2 | 1077 | 60 | Some major design flaws | I had such high hopes for this dress and reall... | 3 | 0 | 0 | General | Dresses | Dresses | [Dresses, General]
3 | 1049 | 50 | My favorite buy! | I love, love, love this jumpsuit. it's fun, fl... | 5 | 1 | 0 | General Petite | Bottoms | Pants | [Bottoms, General Petite]
4 | 847 | 47 | Flattering shirt | This shirt is very flattering to all due to th... | 5 | 1 | 6 | General | Tops | Blouses | [Tops, General]
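
Before building the controller, it can be useful to eyeball how often each individual label occurs in the new column (an optional check, not part of the original pipeline):

# optional: count per-label frequencies across the multi-label column
from collections import Counter
Counter(l for labels in df['Multi_Label'] for l in labels).most_common()
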
tdc = TextDataController.from_df(df,
                                 main_text='Review Text',
                                 label_names='Multi_Label',
                                 sup_types='classification',
                                 filter_dict={'Review Text': lambda x: x is not None},
                                 metadatas='Title',
                                 content_transformations=[text_normalize,str.lower],
                                 content_augmentations= [nearby_aug_func,str.lower], 
                                 val_ratio=0.2,
                                 batch_size=1000,
                                 seed=42,
                                 num_proc=20,
                                 verbose=False
                                )
- Input Validation Precheck -
Data contains missing values!
-----> List of columns and the number of missing values for each
Title          3809
Review Text     844
dtype: int64
Data contains duplicated values!
-----> Number of duplications: 21 rows
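
For reference, the precheck above can be reproduced directly with pandas (an illustrative equivalent, not the library's internal code):

# illustrative: reproduce the missing-value and duplication checks manually
print(df[['Title','Review Text']].isna().sum())
print('Duplicated rows:',df.duplicated().sum())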

Define our tokenizer for Roberta

_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
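
As a quick look at what the tokenizer produces on a single string (illustrative):

_tokenizer('This shirt is so comfortable!')
# -> a dict with 'input_ids' and 'attention_mask', the same fields that
#    process_and_tokenize will add to every split below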

Process and tokenize our dataset

tdc.process_and_tokenize(_tokenizer,max_length=100,shuffle_trn=True)
tdc.main_ddict
DatasetDict({
    train: Dataset({
        features: ['Title', 'Review Text', 'Multi_Label', 'label', 'input_ids', 'attention_mask'],
        num_rows: 18101
    })
    validation: Dataset({
        features: ['Title', 'Review Text', 'Multi_Label', 'label', 'input_ids', 'attention_mask'],
        num_rows: 4526
    })
})
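
Each example now carries a multi-hot 'label' vector (one slot per class) alongside the token ids. A quick peek at one training row (illustrative):

# illustrative: inspect one processed training example
sample = tdc.main_ddict['train'][0]
print(sample['label'])           # multi-hot vector, aligned with tdc.label_lists[0]
print(sample['input_ids'][:10])  # first few token ids (sequences are truncated at max_length=100)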

Model Experiment: Roberta Multi-Label Classification (with Hidden Layer Concatenation)

from that_nlp_library.models.roberta.classifiers import *
from that_nlp_library.model_main import *
from sklearn.metrics import f1_score, accuracy_score

Define and train a custom Roberta model

from transformers.models.roberta.modeling_roberta import RobertaModel
tdc.label_lists[0]
['Bottoms',
 'Dresses',
 'General',
 'General Petite',
 'Initmates',
 'Intimate',
 'Jackets',
 'Tops',
 'Trend']
num_classes = len(tdc.label_lists[0])
roberta_body = RobertaModel.from_pretrained('roberta-base')
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
# our model is more complex, so it's best to define some of its arguments
_model_kwargs={
    # overall model hyperparams
    'head_class_sizes':num_classes,
    'head_class': ConcatHeadSimple,
    'is_multilabel':tdc.is_multilabel, # True
    'is_multihead':tdc.is_multihead, # False
    # classification head hyperparams
    'layer2concat':2, # you can change the number of layers to concat (default is 4, based on the paper)
    'classifier_dropout':0.1 
}
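
For intuition, layer2concat=2 means the head classifies on the concatenation of the [CLS] hidden states from the last two transformer layers. A minimal sketch of that idea (an illustration of the technique, not the library's exact code):

import torch

# sketch: concatenate the [CLS] vector from the last `layer2concat` hidden layers,
# where hidden_states is a tuple of (num_layers + 1) tensors of shape
# (batch, seq_len, hidden_size), as returned when output_hidden_states=True
def concat_cls_states(hidden_states,layer2concat=2):
    cls_vectors = [h[:,0,:] for h in hidden_states[-layer2concat:]]
    return torch.cat(cls_vectors,dim=-1) # (batch, layer2concat * hidden_size)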

model = model_init_classification(model_class = RobertaHiddenStateConcatForSequenceClassification,
                                  cpoint_path = 'roberta-base', 
                                  output_hidden_states=True, # since we are using the 'hidden layer concatenation' technique
                                  seed=42,
                                  body_model=roberta_body,
                                  model_kwargs = _model_kwargs)

metric_funcs = [partial(f1_score,average='macro'),accuracy_score]
controller = ModelController(model,tdc,seed=42)
Loading body weights. This assumes the body is the very first block of your custom architecture
Total parameters: 124659465
Total trainable parameters: 124659465

And we can start training our model

seed_everything(42)
# you can adjust the `compute_metrics` to perform multi-label with a threshold
_cmc = partial(compute_metrics,
               is_multilabel=tdc.is_multilabel,
               multilabel_threshold=0.55)
lr = 1e-4
bs=32
wd=0.01
epochs= 3

controller.fit(epochs,lr,
               metric_funcs=metric_funcs,
               batch_size=bs,
               weight_decay=wd,
               save_checkpoint=False,
               compute_metrics=_cmc,
              )
[849/849 06:44, Epoch 3/3]
Epoch | Training Loss | Validation Loss | F1 Score Multi Label | Accuracy Score Multi Label
1     | No log        | 0.242643        | 0.574833             | 0.402121
2     | 0.268500      | 0.230503        | 0.594430             | 0.509280
3     | 0.268500      | 0.230640        | 0.598910             | 0.518118
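
The two multi-label metrics above are computed after thresholding each class probability independently at the multilabel_threshold we passed (0.55). A minimal sketch of that thresholding step (illustrative, not the library's exact compute_metrics):

# sketch: turn per-class probabilities into a multi-hot prediction
def threshold_predictions(probs,threshold=0.55):
    return (np.asarray(probs)>threshold).astype(int)
threshold_predictions([0.1,0.97,0.6]) # -> array([0, 1, 1])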

controller.trainer.model.save_pretrained('./sample_weights/my_model')

Make predictions

Load trained model

_model_kwargs
{'head_class_sizes': 9,
 'head_class': that_nlp_library.models.roberta.classifiers.ConcatHeadSimple,
 'is_multilabel': True,
 'is_multihead': False,
 'layer2concat': 2,
 'classifier_dropout': 0.1}
trained_model = model_init_classification(model_class = RobertaHiddenStateConcatForSequenceClassification,
                                          cpoint_path = Path('./sample_weights/my_model'), 
                                          output_hidden_states=True,
                                          seed=42,
                                          model_kwargs = _model_kwargs)

controller = ModelController(trained_model,tdc,seed=42)
Some weights of the model checkpoint at sample_weights/my_model were not used when initializing RobertaHiddenStateConcatForSequenceClassification: ['body_model.pooler.dense.bias', 'body_model.pooler.dense.weight']
- This IS expected if you are initializing RobertaHiddenStateConcatForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaHiddenStateConcatForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Total parameters: 124068873
Total trainable parameters: 124068873

Predict Train/Validation set

df_val = controller.predict_ddict(ds_type='validation')
-------------------- Start making predictions --------------------
df_val = df_val.to_pandas()
df_val.head()
  | Title | Review Text | Multi_Label | label | input_ids | attention_mask | pred_Multi_Label | pred_prob_Multi_Label
0 |  | . such a fun jacket ! great to wear in the spr... | [Intimate, General Petite] | [0, 0, 0, 1, 0, 1, 0, 0, 0] | [0, 4, 215, 10, 1531, 8443, 27785, 372, 7, 356... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | [General, Jackets] | [0.0016225622, 0.0026141205, 0.52690035, 0.390...
1 | simple and elegant | simple and elegant . i thought this shirt was ... | [Tops, General Petite] | [0, 0, 0, 1, 0, 0, 0, 1, 0] | [0, 41918, 8, 14878, 479, 939, 802, 42, 6399, ... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | [General, Tops] | [0.00071811187, 0.0011346012, 0.63563234, 0.36...
2 | retro and pretty | retro and pretty . this top has a bit of a ret... | [Tops, General] | [0, 0, 1, 0, 0, 0, 0, 1, 0] | [0, 4903, 1001, 8, 1256, 479, 42, 299, 34, 10,... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | [General, Tops] | [0.0008183706, 0.00083204935, 0.6578452, 0.357...
3 | summer/fall wear | summer / fall wear . i first spotted this on a... | [Dresses, General Petite] | [0, 1, 0, 1, 0, 0, 0, 0, 0] | [0, 18581, 2089, 1589, 1136, 3568, 479, 939, 7... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | [Dresses, General] | [0.001032995, 0.97645515, 0.55536234, 0.428432...
4 | perfect except slip | perfect except slip . this is my new favorite ... | [Dresses, General Petite] | [0, 1, 0, 1, 0, 0, 0, 0, 0] | [0, 20473, 4682, 9215, 479, 42, 16, 127, 92, 2... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | [Dresses, General] | [0.00093759556, 0.9759572, 0.54164094, 0.42817...
controller.data_store.label_lists
[['Bottoms',
  'Dresses',
  'General',
  'General Petite',
  'Initmates',
  'Intimate',
  'Jackets',
  'Tops',
  'Trend']]
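
Each entry of pred_prob_Multi_Label is ordered according to this label list, so you can decode the probabilities by hand (illustrative):

# illustrative: map the first validation row's probabilities to label names
labels = np.array(controller.data_store.label_lists[0])
probs = np.array(df_val['pred_prob_Multi_Label'].iloc[0])
dict(zip(labels,probs.round(3)))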

You can recompute the metric to check that it matches the result from the last training epoch above

df_val['pred_prob_Multi_Label'].apply(lambda x: (np.array(x)>0.55).astype(int))
0       [0, 0, 0, 0, 0, 0, 1, 0, 0]
1       [0, 0, 1, 0, 0, 0, 0, 1, 0]
2       [0, 0, 1, 0, 0, 0, 0, 1, 0]
3       [0, 1, 1, 0, 0, 0, 0, 0, 0]
4       [0, 1, 0, 0, 0, 0, 0, 0, 0]
                   ...             
4521    [0, 0, 1, 0, 0, 0, 0, 1, 0]
4522    [0, 0, 1, 0, 0, 0, 0, 1, 0]
4523    [0, 1, 1, 0, 0, 0, 0, 0, 0]
4524    [1, 0, 1, 0, 0, 0, 0, 0, 0]
4525    [0, 0, 1, 0, 0, 0, 0, 1, 0]
Name: pred_prob_Multi_Label, Length: 4526, dtype: object
f1_score(df_val['label'].tolist(),
         df_val['pred_prob_Multi_Label'].apply(lambda x: (np.array(x)>0.55).astype(int)).tolist(),
         average='macro')
0.5989434710627796
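
The accuracy can be recomputed the same way. Note that sklearn's accuracy_score on multi-label indicator arrays is subset accuracy: a row only counts as correct when the entire predicted label set matches the true one, which is why it sits well below the F1 score:

accuracy_score(df_val['label'].tolist(),
               df_val['pred_prob_Multi_Label'].apply(lambda x: (np.array(x)>0.55).astype(int)).tolist())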

Predict Test set

We will go through the details of how to make predictions on a completely new, raw dataset using our trained model. For now, let’s reuse the sample csv and pretend it’s our test set

df_test = pd.read_csv('sample_data/Womens_Clothing_Reviews.csv',encoding='utf-8-sig').sample(frac=0.2,random_state=1)
# drop NaN values in the label column
df_test = df_test[~df_test['Department Name'].isna()].reset_index(drop=True)
 
# drop the label column (you don't have to, but it simulates an actual unlabeled test set)
df_test.drop('Department Name',axis=1,inplace=True)
_test_dset = Dataset.from_pandas(df_test)
_test_dset_predicted = controller.predict_raw_dset(_test_dset,
                                                   do_filtering=True, # since we have some text filtering in the processing
                                                   multilabel_threshold=0.55
                                                  )
-------------------- Start making predictions --------------------
df_test_predicted = _test_dset_predicted.to_pandas()
df_test_predicted.head()
  | Title | Review Text | input_ids | attention_mask | pred_Multi_Label | pred_prob_Multi_Label
0 | perfect for work and play | perfect for work and play . this shirt works f... | [0, 20473, 13, 173, 8, 310, 479, 42, 6399, 136... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | [General, Tops] | [0.0008787949, 0.0011617275, 0.6554981, 0.3466...
1 |  | . i don't know why i had the opposite problem ... | [0, 4, 939, 218, 75, 216, 596, 939, 56, 5, 548... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | [Bottoms, General] | [0.981327, 0.0017199836, 0.72738814, 0.2887234...
2 | great pants | great pants . thes e cords are great--lightwei... | [0, 12338, 9304, 479, 5, 29, 364, 37687, 32, 3... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | [Bottoms, General] | [0.98825514, 0.0013365657, 0.72356224, 0.28617...
3 | surprisingly comfy for a button down | surprisingly comfy for a button down . i am a ... | [0, 33258, 3137, 24382, 13, 10, 6148, 159, 479... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | [General, Tops] | [0.00110166, 0.0023981999, 0.6438682, 0.361307...
4 | short and small | short and small . the shirt is mostly a thick ... | [0, 20263, 8, 650, 479, 5, 6399, 16, 2260, 10,... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | [General, Tops] | [0.00052336225, 0.0006901586, 0.5660622, 0.351...
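
A quick way to eyeball the test-set output is to count the predicted label combinations (an optional check):

# optional: summarize how often each predicted label combination occurs
df_test_predicted['pred_Multi_Label'].apply(lambda x: ' & '.join(sorted(x))).value_counts().head()
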
# Since we have metadata (Title), we need to define a dictionary containing those values
raw_content={'Review Text': 'This shirt is so comfortable I love it!',
             'Title': 'Great shirt'}
controller.data_store.num_proc=1
df_result = controller.predict_raw_text(raw_content,multilabel_threshold=0.55)
-------------------- Start making predictions --------------------
df_result
{'Review Text': ['great shirt . this shirt is so comfortable i love it !'],
 'Title': ['great shirt'],
 'input_ids': [[0,
   12338,
   6399,
   479,
   42,
   6399,
   16,
   98,
   3473,
   939,
   657,
   24,
   27785,
   2]],
 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
 'pred_Multi_Label': [['General', 'Tops']],
 'pred_prob_Multi_Label': [[0.0008708593086339533,
   0.0008345667738467455,
   0.63884437084198,
   0.36591553688049316,
   0.011160656809806824,
   0.012419871985912323,
   0.0008258746238425374,
   0.9864105582237244,
   0.002458298346027732]]}
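
As before, the probabilities line up with controller.data_store.label_lists[0], so we can rank the labels for this single review by hand (illustrative):

# illustrative: rank labels by predicted probability for the raw-text prediction
labels = controller.data_store.label_lists[0]
probs = df_result['pred_prob_Multi_Label'][0]
for l,p in sorted(zip(labels,probs),key=lambda t: -t[1]):
    print(f'{l:15s} {p:.4f}')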