Roberta model (Regression)

This notebook contains some examples of how to use the Roberta-based models in this NLP library.

In this series, we walk through some of the capabilities of this library: single-head classification, multi-head classification, multi-label classification, and regression. If you want a more detailed tutorial, check this out

import os

# Select which GPU (or comma-separated list of GPUs) is visible for training.
# This is set before torch is imported further down.
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

from that_nlp_library.text_transformation import *
from that_nlp_library.text_augmentation import *
from that_nlp_library.text_main import *
from that_nlp_library.utils import seed_everything

from underthesea import text_normalize
from functools import partial
from pathlib import Path
import pandas as pd
import numpy as np
import nlpaug.augmenter.char as nac
from datasets import load_dataset
import random
from transformers import RobertaTokenizer
from datasets import Dataset

from transformers.models.roberta.modeling_roberta import RobertaModel
import torch

Define the custom augmentation function

def nlp_aug_stochastic(x,aug=None,p=0.5):
    """Apply the augmenter `aug` to each text with probability `p`.

    Parameters
    ----------
    x : str or list of str
        A single text, or a batch of texts.
    aug : object with an ``augment`` method
        ``aug.augment`` is called with either one text or a list of texts and
        is expected to return a list of augmented texts (one per input text).
    p : float
        Probability that any given text gets augmented.

    Returns
    -------
    str or list of str
        Same kind as `x`. For a list, the result has the same length and the
        same order as the input: element ``i`` of the result is either
        ``x[i]`` or its augmented version, so the texts stay aligned with
        labels or any other columns of the same batch.
    """
    if not isinstance(x,list):
        if random.random()<p: return aug.augment(x)[0]
        return x
    # One random draw per text; remember the positions selected for augmentation
    chosen = [i for i in range(len(x)) if random.random()<p]
    results = list(x)
    # only perform augmentation when needed
    if chosen:
        augmented = aug.augment([x[i] for i in chosen])
        # Write each augmented text back to the position it came from
        for i,new_text in zip(chosen,augmented):
            results[i] = new_text
    return results

# Character-level KeyboardAug augmenter from nlpaug.
# NOTE(review): aug_char_max / aug_char_p / aug_word_p presumably bound how many characters and words get perturbed - confirm against the nlpaug docs
aug = nac.KeyboardAug(aug_char_max=3,aug_char_p=0.1,aug_word_p=0.07)
# Bind the augmenter and p=0.3, so that each text is augmented with probability 0.3
nearby_aug_func = partial(nlp_aug_stochastic,aug=aug,p=0.3)

Create a TextDataController object

We will reuse the data and the preprocessings in this tutorial

# Load the sample CSV with HuggingFace datasets (requesting the 'train' split)
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')

# Single-target regression setup: 'Review Text' is the input text and 'Rating' is the label.
# filter_dict drops rows whose 'Review Text' is None.
# NOTE(review): num_proc=20 presumably sets the number of worker processes - lower it on machines with fewer cores
tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Rating',
                         sup_types='regression',
                         filter_dict={'Review Text': lambda x: x is not None},
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations= [nearby_aug_func,str.lower], 
                         # add "str.lower" here because nearby_aug might return uppercase character
                         val_ratio=0.2,
                         batch_size=1000,
                         seed=42,
                         num_proc=20,
                         verbose=False
                        )

Define our tokenizer for Roberta

# Pretrained 'roberta-base' tokenizer, passed to tdc.process_and_tokenize below
_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

Process and tokenize our dataset

# Run the processing and tokenization defined on tdc; the result is inspected via tdc.main_ddict below.
# NOTE(review): max_length=100 is presumably the maximum number of tokens kept per text - confirm
tdc.process_and_tokenize(_tokenizer,max_length=100,shuffle_trn=True)

tdc.main_ddict

DatasetDict({
    train: Dataset({
        features: ['Title', 'Review Text', 'Rating', 'Division Name', 'label', 'input_ids', 'attention_mask'],
        num_rows: 18112
    })
    validation: Dataset({
        features: ['Title', 'Review Text', 'Rating', 'Division Name', 'label', 'input_ids', 'attention_mask'],
        num_rows: 4529
    })
})

Model Experiment: Roberta Single-Head Regression

from that_nlp_library.models.roberta.classifiers import *
from that_nlp_library.model_main import *
from sklearn.metrics import mean_absolute_error,mean_squared_log_error,f1_score, accuracy_score

Using HuggingFace model initialization

from transformers.models.roberta.modeling_roberta import RobertaForSequenceClassification

# A single output value for the single regression target
# NOTE(review): with num_labels=1 the HuggingFace sequence-classification models use a regression loss - confirm in the transformers docs
num_classes=1

# Seed before building the model: the classifier head is newly initialized (see the warning printed below)
seed_everything(42)
model = RobertaForSequenceClassification.from_pretrained('roberta-base',num_labels=num_classes)
# 'cuda:0' is the first visible device; CUDA_VISIBLE_DEVICES is set to "0" at the top of the notebook
model = model.to('cuda:0')

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

# sklearn regression metrics; they appear as columns of the training log printed below
metric_funcs = [mean_absolute_error,mean_squared_log_error]
controller = ModelController(model,tdc,seed=42)

# Training hyperparameters: learning rate, batch size, weight decay, number of epochs
lr = 1e-4
bs=32
wd=0.01
epochs= 3

# NOTE(review): compute_metrics is not defined in this notebook; presumably it comes from the
# star import of that_nlp_library.model_main - confirm
controller.fit(epochs,lr,
               metric_funcs=metric_funcs,
               batch_size=bs,
               weight_decay=wd,
               save_checkpoint=False,
               compute_metrics=compute_metrics,
              )

/home/quan/anaconda3/envs/fastai_v2/lib/python3.10/site-packages/transformers/optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
  warnings.warn(

[849/849 02:04, Epoch 3/3]

Epoch	Training Loss	Validation Loss	Mean Absolute Error Rating	Mean Squared Log Error Rating
1	No log	0.390571	0.395417	0.024300
2	1.053300	0.416924	0.460436	0.023507
3	1.053300	0.342913	0.397407	0.020786

Using Roberta-base model (no concatenation), but with a custom head to limit the output range

# A single output value for the single regression target
num_classes=1

# Pretrained Roberta body (RobertaModel), passed as body_model to model_init_classification below
roberta_body = RobertaModel.from_pretrained('roberta-base')

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

class RobertaSigmoidRange(torch.nn.Module):
    """Regression head whose output is squashed into the range between `low` and `high`.

    A bias-free linear layer maps the incoming features (last dimension of
    size ``config.hidden_size``) to ``config.num_labels`` values; a sigmoid
    then rescales each value from the unit interval to the [low, high] range.
    Extra keyword arguments to ``__init__`` and ``forward`` are accepted and
    ignored.
    """
    def __init__(self, config, high, low, **kwargs):
        super().__init__()
        self.high, self.low = high, low
        self.score = torch.nn.Linear(
            in_features=config.hidden_size,
            out_features=config.num_labels,
            bias=False,
        )

    def forward(self, inp, **kwargs):
        # sigmoid yields values in the unit interval; stretch by the range width, then shift to start at `low`
        unit = self.score(inp).sigmoid()
        width = self.high - self.low
        return unit * width + self.low

_model_kwargs={
    # overall model hyperparams
    'head_class_sizes':num_classes,
    'head_class': RobertaSigmoidRange,
    # classification head hyperparams; 'high' and 'low' match the keyword arguments of
    # RobertaSigmoidRange.__init__ (presumably forwarded to the head by the model class - confirm)
    'high':5, # the maximum rating
    'low': 1, # the minimum rating
}

# Build the custom model on top of the pretrained body loaded above.
# output_hidden_states=False: this variant does not use hidden-state concatenation.
model = model_init_classification(model_class = RobertaBaseForSequenceClassification,
                                  cpoint_path = 'roberta-base', 
                                  output_hidden_states=False,
                                  seed=42,
                                  body_model=roberta_body,
                                  model_kwargs = _model_kwargs)

Loading body weights. This assumes the body is the very first block of your custom architecture

# sklearn regression metrics; they appear as columns of the training log printed below
metric_funcs = [mean_absolute_error,mean_squared_log_error]
controller = ModelController(model,tdc,seed=42)

# Re-seed right before training
seed_everything(42)

# Training hyperparameters: learning rate, batch size, weight decay, number of epochs
lr = 1e-4
bs=32
wd=0.01
epochs= 3

# NOTE(review): compute_metrics is not defined in this notebook; presumably it comes from the
# star import of that_nlp_library.model_main - confirm
controller.fit(epochs,lr,
               metric_funcs=metric_funcs,
               batch_size=bs,
               weight_decay=wd,
               save_checkpoint=False,
               compute_metrics=compute_metrics,
              )

/home/quan/anaconda3/envs/fastai_v2/lib/python3.10/site-packages/transformers/optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
  warnings.warn(

[849/849 02:11, Epoch 3/3]

Epoch	Training Loss	Validation Loss	Mean Absolute Error Rating	Mean Squared Log Error Rating
1	No log	0.366913	0.412213	0.023336
2	0.494600	0.320408	0.386471	0.020400
3	0.494600	0.313202	0.371211	0.019664

Using the Roberta custom model (concatenation)

from transformers.models.roberta.modeling_roberta import RobertaModel

# A single output value for the single regression target
num_classes=1

# Load the pretrained Roberta body again for this experiment (rebinds roberta_body)
roberta_body = RobertaModel.from_pretrained('roberta-base')

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

_model_kwargs={
    # overall model hyperparams
    'head_class_sizes':num_classes,
    'head_class': ConcatHeadSimple,
    # classification head hyperparams
    # NOTE(review): 'layer2concat' is presumably the number of hidden layers whose states are concatenated - confirm
    'layer2concat':4,
    'classifier_dropout':0.1 
}

# Build the hidden-state-concatenation model on top of the pretrained body loaded above
model = model_init_classification(model_class = RobertaHiddenStateConcatForSequenceClassification,
                                  cpoint_path = 'roberta-base', 
                                  output_hidden_states=True, # since we are using the 'hidden layer concatenation' technique
                                  seed=42,
                                  body_model=roberta_body,
                                  model_kwargs = _model_kwargs)

Loading body weights. This assumes the body is the very first block of your custom architecture
Total parameters: 124648705
Total trainable parameters: 124648705

# sklearn regression metrics; they appear as columns of the training log printed below
metric_funcs = [mean_absolute_error,mean_squared_log_error]
controller = ModelController(model,tdc,seed=42)

# Re-seed right before training
seed_everything(42)

# Training hyperparameters: learning rate, batch size, weight decay, number of epochs
lr = 1e-4
bs=32
wd=0.01
epochs= 3

# NOTE(review): compute_metrics is not defined in this notebook; presumably it comes from the
# star import of that_nlp_library.model_main - confirm
controller.fit(epochs,lr,
               metric_funcs=metric_funcs,
               batch_size=bs,
               weight_decay=wd,
               save_checkpoint=False,
               compute_metrics=compute_metrics,
              )

[849/849 02:29, Epoch 3/3]

Epoch	Training Loss	Validation Loss	Mean Absolute Error Rating	Mean Squared Log Error Rating
1	No log	0.526983	0.560607	0.029044
2	1.226800	0.338014	0.397377	0.021910
3	1.226800	0.332928	0.389946	0.020401

# Save the trained model; it is reloaded from this same path in the prediction section below
controller.trainer.model.save_pretrained('./sample_weights/my_model')

Predict Validation

_model_kwargs

{'head_class_sizes': 1,
 'head_class': that_nlp_library.models.roberta.classifiers.ConcatHeadSimple,
 'layer2concat': 4,
 'classifier_dropout': 0.1}

# Rebuild the same architecture (same model_class and _model_kwargs as in training) and load the
# weights saved above from ./sample_weights/my_model; no body_model is passed this time
trained_model = model_init_classification(model_class = RobertaHiddenStateConcatForSequenceClassification,
                                          cpoint_path = Path('./sample_weights/my_model'), 
                                          output_hidden_states=True,
                                          seed=42,
                                          model_kwargs = _model_kwargs)

# New controller wrapping the reloaded model, used for all predictions below
controller = ModelController(trained_model,tdc,seed=42)

Some weights of the model checkpoint at sample_weights/my_model were not used when initializing RobertaHiddenStateConcatForSequenceClassification: ['body_model.pooler.dense.weight', 'body_model.pooler.dense.bias']
- This IS expected if you are initializing RobertaHiddenStateConcatForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaHiddenStateConcatForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).

Total parameters: 124058113
Total trainable parameters: 124058113

# Predict on the 'validation' split; the result is converted with .to_pandas() in the next cell
df_val = controller.predict_ddict(ds_type='validation')

-------------------- Start making predictions --------------------

# Convert to a pandas DataFrame; predictions are in the 'pred_Rating' column (see the output below)
df_val = df_val.to_pandas()
df_val.head()

	Title	Review Text	Rating	Division Name	label	input_ids	attention_mask	pred_Rating
0		general . . this picture doesn't do the skirt ...	5.0	general	5.0	[0, 15841, 479, 479, 42, 2170, 630, 75, 109, 5...	[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...	5.009285
1		general . . easy to wear ! cute , comfy ... wi...	4.0	general	4.0	[0, 15841, 479, 479, 1365, 7, 3568, 27785, 119...	[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...	4.959775
2		general . . nice sweater , just did not look g...	3.0	general	3.0	[0, 15841, 479, 479, 2579, 23204, 2156, 95, 22...	[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...	2.813448
3	nice cropped jacket	general . nice cropped jacket . this jacket wa...	5.0	general	5.0	[0, 15841, 479, 2579, 30197, 8443, 479, 42, 84...	[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...	4.425314
4	great dress!	general petite . great dress ! . i wasn't plan...	5.0	general petite	5.0	[0, 15841, 4716, 1459, 479, 372, 3588, 27785, ...	[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...	5.152634

You can compute the metrics yourself to check that they match those of the last training epoch above

mean_absolute_error(df_val['label'],df_val['pred_Rating'])
# NOTE(review): 0.3844665757181577 was recorded here, but it does not match the output printed
# below for this run - presumably left over from an earlier run

0.38998359958971274

mean_squared_log_error(df_val['label'],df_val['pred_Rating'])
# NOTE(review): 0.020327507632071154 was recorded here, but it does not match the output printed
# below for this run - presumably left over from an earlier run

0.020402761232161202

Predict Test set

We will go through details on how to make a prediction on a completely new and raw dataset using our trained model. For now, let’s reuse the sample csv and pretend it’s our test set

# Pretend test set: a reproducible (random_state=1) 20% sample of the same CSV
df_test = pd.read_csv('sample_data/Womens_Clothing_Reviews.csv',encoding='utf-8-sig').sample(frac=0.2,random_state=1)


# save the label, as we will calculate some metrics later
# Only rows with a non-missing 'Review Text' are kept, mirroring the filter_dict given to TextDataController
true_labels = df_test[~df_test['Review Text'].isna()].Rating.values 

# drop the label (you don't need to, but this is necessary to simulate an actual test set)
df_test.drop('Rating',axis=1,inplace=True)

_test_dset = Dataset.from_pandas(df_test)
# NOTE(review): do_filtering=True presumably applies the same row filter, so predictions line up with true_labels - confirm
_test_dset_predicted = controller.predict_raw_dset(_test_dset,
                                                   do_filtering=True, # since we have some text filtering in the processing
                                                  )

-------------------- Start making predictions --------------------

df_test_predicted = _test_dset_predicted.to_pandas()

df_test_predicted.head()

	Title	Review Text	Division Name	input_ids	attention_mask	pred_Rating
0	perfect for work and play	general . perfect for work and play . this shi...	general	[0, 15841, 479, 1969, 13, 173, 8, 310, 479, 42...	[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...	5.153669
1		general petite . . i don't know why i had the ...	general petite	[0, 15841, 4716, 1459, 479, 479, 939, 218, 75,...	[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...	4.160178
2	great pants	general petite . great pants . thes e cords ar...	general petite	[0, 15841, 4716, 1459, 479, 372, 9304, 479, 5,...	[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...	5.120962
3	surprisingly comfy for a button down	general petite . surprisingly comfy for a butt...	general petite	[0, 15841, 4716, 1459, 479, 10262, 3137, 24382...	[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...	4.653287
4	short and small	general petite . short and small . the shirt i...	general petite	[0, 15841, 4716, 1459, 479, 765, 8, 650, 479, ...	[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...	2.688593

Let’s quickly check the score to make sure everything works correctly

mean_absolute_error(true_labels,df_test_predicted['pred_Rating'])

0.3419881553779289

mean_squared_log_error(true_labels,df_test_predicted['pred_Rating'])

0.016165478401855043

# A single raw example: the keys are the main text column ('Review Text') and the metadata
# columns ('Title', 'Division Name') that were given to the TextDataController
raw_content={'Review Text': 'This shirt is so comfortable I love it!',
             'Title': 'Great shirt',
             'Division Name': 'general'}
raw_content

{'Review Text': 'This shirt is so comfortable I love it!',
 'Title': 'Great shirt',
 'Division Name': 'general'}

# Despite its name, df_result is displayed below as a plain dict of lists, not a DataFrame
df_result = controller.predict_raw_text(raw_content)

-------------------- Start making predictions --------------------

df_result

{'Review Text': ['general . great shirt . this shirt is so comfortable i love it !'],
 'Title': ['great shirt'],
 'Division Name': ['general'],
 'input_ids': [[0,
   15841,
   479,
   372,
   6399,
   479,
   42,
   6399,
   16,
   98,
   3473,
   939,
   657,
   24,
   27785,
   2]],
 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
 'pred_Rating': [5.104530334472656]}

Model Experiment: Roberta Multi-Head Regression

from that_nlp_library.models.roberta.classifiers import *
from that_nlp_library.model_main import *
from sklearn.metrics import mean_absolute_error,mean_squared_log_error,f1_score, accuracy_score

Re-define the TextDataController

# Load the sample CSV with HuggingFace datasets (requesting the 'train' split)
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')

# Two targets: 'Rating' (regression) and 'Department Name' (classification);
# label_names and sup_types are given as parallel lists.
# filter_dict drops rows where either 'Review Text' or 'Department Name' is None.
# NOTE(review): num_proc=20 presumably sets the number of worker processes - lower it on machines with fewer cores
tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names=['Rating','Department Name'],
                         sup_types=['regression','classification'],
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None},
                         metadatas=['Title'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations= [nearby_aug_func,str.lower], 
                         # add "str.lower" here because nearby_aug might return uppercase character
                         val_ratio=0.2,
                         batch_size=1000,
                         seed=42,
                         num_proc=20,
                         verbose=False
                        )