Roberta model with a streamed dataset (Custom Single Head)

This notebook contains an example of how to train a Roberta-based model with a streamed dataset

In this series, we walk through some of the capabilities of this library: single-head classification, multi-head classification, multi-label classification, and regression. If you want a more detailed tutorial, check this out

import os
# Specify a GPU (or a comma-separated list of GPUs) to use for training
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
from that_nlp_library.text_transformation import *
from that_nlp_library.text_augmentation import *
from that_nlp_library.text_main_streaming import *
from that_nlp_library.utils import seed_everything
from underthesea import text_normalize
from functools import partial
from pathlib import Path
import pandas as pd
import numpy as np
import nlpaug.augmenter.char as nac
from datasets import load_dataset
import random
from transformers import RobertaTokenizer
from datasets import Dataset

Define the custom augmentation function

def nlp_aug_stochastic(x,aug=None,p=0.5):
    # single string: augment it with probability p
    if not isinstance(x,list):
        if random.random()<p: return aug.augment(x)[0]
        return x
    # list of strings: select each item for augmentation with probability p
    news=[]
    originals=[]
    for _x in x:
        if random.random()<p: news.append(_x)
        else: originals.append(_x)
    # only perform augmentation when needed
    if len(news): news = aug.augment(news)
    # augmented items come first, followed by the untouched originals
    return news+originals
aug = nac.KeyboardAug(aug_char_max=3,aug_char_p=0.1,aug_word_p=0.07)
nearby_aug_func = partial(nlp_aug_stochastic,aug=aug,p=0.3)
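As a quick sanity check, you can call the augmenter on a single string and on a small batch (illustrative only; the exact output varies with the random state, and a batch may come back reordered since augmented items are returned first):

# single string: gets keyboard-typo augmentation with probability p=0.3
print(nearby_aug_func('this dress fits perfectly and looks great'))
# batch: roughly 30% of the items are augmented
print(nearby_aug_func(['love the fabric','runs a bit small','great for summer']))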

Create a TextDataControllerStreaming object

We will reuse the data and the preprocessing steps from this tutorial

dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val = dset.train_test_split(test_size=0.1,seed=42)
ddict_with_val['validation'] = ddict_with_val['test']
ddict_with_val['train'] = ddict_with_val['train'].to_iterable_dataset()
del ddict_with_val['test']
ddict_with_val
DatasetDict({
    train: IterableDataset({
        features: ['Clothing ID', 'Age', 'Title', 'Review Text', 'Rating', 'Recommended IND', 'Positive Feedback Count', 'Division Name', 'Department Name', 'Class Name'],
        n_shards: 1
    })
    validation: Dataset({
        features: ['Clothing ID', 'Age', 'Title', 'Review Text', 'Rating', 'Recommended IND', 'Positive Feedback Count', 'Division Name', 'Department Name', 'Class Name'],
        num_rows: 2349
    })
})
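Since the train split is now an IterableDataset, its rows are produced on the fly. You can peek at the first raw example before processing (a quick inspection, not part of the original pipeline):

next(iter(ddict_with_val['train']))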
tdc = TextDataControllerStreaming(ddict_with_val,
                                  main_text='Review Text',
                                  label_names='Department Name',
                                  sup_types='classification',
                                  class_names_predefined=['Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trending'],
                                  filter_dict={'Review Text': lambda x: x is not None,
                                              'Department Name': lambda x: x is not None,
                                              },
                                  label_tfm_dict={'Department Name': lambda x: x if x!='Trend' else 'Trending'},
                                  metadatas=['Title','Division Name'],
                                  content_transformations=[text_normalize,str.lower],
                                  content_augmentations=[nearby_aug_func,str.lower], 
                                  process_metas=True,
                                  batch_size=1000,
                                  num_proc=4,
                                  seed=42,
                                  verbose=False
                                 )
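The content_transformations are applied in order to the text inputs. As a minimal illustration of the chain on a single string:

s = 'This Shirt Is SO Comfortable!'
for tfm in [text_normalize, str.lower]:
    s = tfm(s)
s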

Define our tokenizer for Roberta

_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

Process and tokenize our dataset

tdc.process_and_tokenize(_tokenizer,max_length=150)
tdc.main_ddict
DatasetDict({
    train: IterableDataset({
        features: Unknown,
        n_shards: 4
    })
    validation: Dataset({
        features: ['Title', 'Review Text', 'Division Name', 'Department Name', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2253
    })
})
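To verify that the metadata concatenation and tokenization worked as expected, you can decode one tokenized validation example back to text (a quick check using the standard tokenizer API):

sample = tdc.main_ddict['validation'][0]
_tokenizer.decode(sample['input_ids'])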

Model Experiment: Roberta Single-Head Classification (with hidden layer concatenation)

Define and train a custom Roberta model

from transformers.models.roberta.modeling_roberta import RobertaModel
from that_nlp_library.models.roberta.classifiers import *
from that_nlp_library.model_main import *
from sklearn.metrics import f1_score, accuracy_score
num_classes = len(tdc.label_lists[0])
roberta_body = RobertaModel.from_pretrained('roberta-base')
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

Then we can define a classification head. One trick to boost the performance of the overall model is to concatenate the [CLS] outputs from the last four layers of the pre-trained Roberta model (source: https://ieeexplore.ieee.org/document/9335912). This library already defines such a custom head (ConcatHeadSimple) and the architecture needed to make it work (RobertaHiddenStateConcatForSequenceClassification)
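For intuition, here is a minimal sketch of the concatenation trick on its own, assuming the standard Hugging Face output format when output_hidden_states=True (the real logic lives inside RobertaHiddenStateConcatForSequenceClassification):

import torch

_inputs = _tokenizer('a quick example', return_tensors='pt')
with torch.no_grad():
    _out = roberta_body(**_inputs, output_hidden_states=True)
# _out.hidden_states is a tuple of 13 tensors (embeddings + 12 layers), each (batch, seq_len, 768)
layer2concat = 4
# take the <s> token (Roberta's [CLS] equivalent) from each of the last 4 layers and concatenate
cls_concat = torch.cat([h[:, 0, :] for h in _out.hidden_states[-layer2concat:]], dim=-1)
cls_concat.shape  # (1, 4*768) = (1, 3072), the input size of the classification head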

# our model is more complex, so it's best to define some of its arguments
_model_kwargs={
    # overall model hyperparams
    'head_class_sizes':num_classes,
    'head_class': ConcatHeadSimple,
    # classification head hyperparams
    'layer2concat':2, # you can change the number of layers to concat (default is 4, based on the paper)
    'classifier_dropout':0.1 
}
model = model_init_classification(model_class = RobertaHiddenStateConcatForSequenceClassification,
                                  cpoint_path = 'roberta-base', 
                                  output_hidden_states=True, # since we are using the 'hidden layer concatenation' technique
                                  seed=42,
                                  body_model=roberta_body,
                                  model_kwargs = _model_kwargs)

metric_funcs = [partial(f1_score,average='macro'),accuracy_score]
controller = ModelController(model,tdc,seed=42)
Loading body weights. This assumes the body is the very first block of your custom architecture
Total parameters: 124654854
Total trainable parameters: 124654854

And we can start training our model

seed_everything(42)
lr = 1e-4
bs=32
wd=0.01
epochs= 3

controller.fit(epochs,lr,
               metric_funcs=metric_funcs,
               batch_size=bs,
               weight_decay=wd,
               save_checkpoint=False,
               compute_metrics=compute_metrics,
               len_train=20000 # estimate of the number of samples in the train set (used to derive max_steps, since the streamed train set has no known length)
              )
max_steps is given, it will override any value given in num_train_epochs
[936/936 05:59, Epoch 2/9223372036854775807]
Epoch | Training Loss | Validation Loss | F1 Score (Department Name) | Accuracy Score (Department Name)
0     | No log        | 0.312615        | 0.750225                   | 0.921438
1     | 0.413400      | 0.274220        | 0.754716                   | 0.923657
2     | 0.413400      | 0.253840        | 0.762595                   | 0.932534

controller.trainer.model.save_pretrained('./sample_weights/my_model')

Make predictions

Load trained model

_model_kwargs
{'head_class_sizes': 6,
 'head_class': that_nlp_library.models.roberta.classifiers.ConcatHeadSimple,
 'layer2concat': 2,
 'classifier_dropout': 0.1}
trained_model = model_init_classification(model_class = RobertaHiddenStateConcatForSequenceClassification,
                                          cpoint_path = Path('./sample_weights/my_model'), 
                                          output_hidden_states=True,
                                          seed=42,
                                          model_kwargs = _model_kwargs)

controller = ModelController(trained_model,tdc,seed=42)
Some weights of the model checkpoint at sample_weights/my_model were not used when initializing RobertaHiddenStateConcatForSequenceClassification: ['body_model.pooler.dense.bias', 'body_model.pooler.dense.weight']
- This IS expected if you are initializing RobertaHiddenStateConcatForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaHiddenStateConcatForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Total parameters: 124064262
Total trainable parameters: 124064262
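The parameter count is slightly smaller than before (by 768×768 + 768 = 590,592) because the unused pooler weights were dropped, exactly as the warning above describes.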

Predict Train/Validation set

df_val = controller.predict_ddict(ds_type='validation')
-------------------- Start making predictions --------------------
df_val = df_val.to_pandas()
df_val.head()
Title Review Text Division Name Department Name label input_ids attention_mask pred_Department Name pred_prob_Department Name
0 soft, feminine and fun pockets! general . soft , feminine and fun pockets ! . ... general Tops 4 [0, 15841, 479, 3793, 2156, 27360, 8, 1531, 12... [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... Tops 0.996728
1 a new staple! general petite . a new staple ! . tried these ... general petite Bottoms 0 [0, 15841, 4716, 1459, 479, 10, 92, 17771, 277... [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... Bottoms 0.960517
2 maybe swing is for me! general . maybe swing is for me ! . i love swi... general Tops 4 [0, 15841, 479, 2085, 7021, 16, 13, 162, 27785... [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... Tops 0.983545
3 too flare general . too flare . too small ... too flare ... general Bottoms 0 [0, 15841, 479, 350, 24186, 479, 350, 650, 166... [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... Bottoms 0.986469
4 love general . love . i love this top it is easy to... general Tops 4 [0, 15841, 479, 657, 479, 939, 657, 42, 299, 2... [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... Tops 0.997049

You can recompute the metric to check that it matches the result from your last training epoch above

f1_score(df_val['Department Name'],df_val['pred_Department Name'],average='macro')
0.7625951075732446
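Accuracy can be checked the same way (accuracy_score was imported earlier), and should match the last epoch's accuracy above:

accuracy_score(df_val['Department Name'],df_val['pred_Department Name'])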

Predict Test set

We will walk through the details of how to make predictions on a completely new, raw dataset using our trained model. For now, let's reuse the sample csv and pretend it's our test set

df_test = pd.read_csv('sample_data/Womens_Clothing_Reviews.csv',encoding='utf-8-sig').sample(frac=0.2,random_state=1)
# drop NaN values in the label column
df_test = df_test[~df_test['Department Name'].isna()].reset_index(drop=True)

# save the labels, as we will calculate some metrics later. We also drop labels whose Review Text is NaN,
# since the same filtering will be applied to the test set during processing
true_labels = df_test.loc[~df_test['Review Text'].isna(),'Department Name'].values 

# drop the label (you don't need to, but this is necessary to simulate an actual test set)
df_test.drop('Department Name',axis=1,inplace=True)
_test_dset = Dataset.from_pandas(df_test)
_test_dset_predicted = controller.predict_raw_dset(_test_dset,
                                                   do_filtering=True, # since we have some text filtering in the processing
                                                  )
-------------------- Start making predictions --------------------
df_test_predicted = _test_dset_predicted.to_pandas()
df_test_predicted.head()
Title Review Text Division Name input_ids attention_mask pred_Department Name pred_prob_Department Name
0 perfect for work and play general . perfect for work and play . this shi... general [0, 15841, 479, 1969, 13, 173, 8, 310, 479, 42... [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... Tops 0.997284
1 general petite . . i don't know why i had the ... general petite [0, 15841, 4716, 1459, 479, 479, 939, 218, 75,... [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... Bottoms 0.989114
2 great pants general petite . great pants . thes e cords ar... general petite [0, 15841, 4716, 1459, 479, 372, 9304, 479, 5,... [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... Bottoms 0.986304
3 surprisingly comfy for a button down general petite . surprisingly comfy for a butt... general petite [0, 15841, 4716, 1459, 479, 10262, 3137, 24382... [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... Tops 0.990987
4 short and small general petite . short and small . the shirt i... general petite [0, 15841, 4716, 1459, 479, 765, 8, 650, 479, ... [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... Tops 0.996322

Let’s quickly check the f1 score to make sure everything works correctly

f1_score(true_labels,df_test_predicted['pred_Department Name'],average='macro')
0.7615999294223502

Predict top k results

_test_dset = Dataset.from_pandas(df_test)
_test_dset_predicted = controller.predict_raw_dset(_test_dset,
                                                   do_filtering=True,
                                                   topk=3
                                                  )
-------------------- Start making predictions --------------------
df_test_predicted = _test_dset_predicted.to_pandas()

df_test_predicted.head()
Title Review Text Division Name input_ids attention_mask pred_Department Name pred_prob_Department Name
0 perfect for work and play general . perfect for work and play . this shi... general [0, 15841, 479, 1969, 13, 173, 8, 310, 479, 42... [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... [Tops, Intimate, Trending] [0.9972837, 0.0011419549, 0.0010542183]
1 general petite . . i don't know why i had the ... general petite [0, 15841, 4716, 1459, 479, 479, 939, 218, 75,... [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... [Bottoms, Intimate, Trending] [0.9891139, 0.006692194, 0.0033946035]
2 great pants general petite . great pants . thes e cords ar... general petite [0, 15841, 4716, 1459, 479, 372, 9304, 479, 5,... [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... [Bottoms, Intimate, Trending] [0.98630387, 0.009437396, 0.0035212967]
3 surprisingly comfy for a button down general petite . surprisingly comfy for a butt... general petite [0, 15841, 4716, 1459, 479, 10262, 3137, 24382... [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... [Tops, Intimate, Jackets] [0.9909869, 0.003928944, 0.0020938655]
4 short and small general petite . short and small . the shirt i... general petite [0, 15841, 4716, 1459, 479, 765, 8, 650, 479, ... [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... [Tops, Intimate, Trending] [0.99632156, 0.0016891895, 0.0012567489]
# Since we have metadata fields (Title and Division Name), we need to define a dictionary containing those values
raw_content={'Review Text': 'This shirt is so comfortable I love it!',
             'Title': 'Great shirt',
             'Division Name': 'general'}
df_result = controller.predict_raw_text(raw_content,topk=3)
num_proc must be <= 1. Reducing num_proc to 1 for dataset of size 1.
-------------------- Start making predictions --------------------
df_result
{'Review Text': ['general . great shirt . this shirt is so comfortable i love it !'],
 'Title': ['great shirt'],
 'Division Name': ['general'],
 'input_ids': [[0,
   15841,
   479,
   372,
   6399,
   479,
   42,
   6399,
   16,
   98,
   3473,
   939,
   657,
   24,
   27785,
   2]],
 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
 'pred_Department Name': [['Tops', 'Intimate', 'Trending']],
 'pred_prob_Department Name': [[0.9973528385162354,
   0.0012270379811525345,
   0.0009467267664149404]]}
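Despite the df_ prefix, the single-text prediction comes back as a plain dict of lists. If you prefer a one-row DataFrame view, you can convert it yourself (a convenience sketch, assuming the dict structure shown above):

# drop the tokenizer columns and view the rest as a one-row DataFrame
pd.DataFrame({k: v for k, v in df_result.items()
              if k not in ('input_ids','attention_mask')})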

Predict a Streamed Test set

Let’s try to make predictions on a streamed dataset

df_test = pd.read_csv('sample_data/Womens_Clothing_Reviews.csv',encoding='utf-8-sig').sample(frac=0.2,random_state=1)
# drop NaN values in the label column
df_test = df_test[~df_test['Department Name'].isna()].reset_index(drop=True)

# In this example we will keep all rows of the test set
true_labels = df_test['Department Name'].values 

df_test.drop('Department Name',axis=1,inplace=True)
df_test.shape
(4692, 9)
_test_dset_stream = Dataset.from_pandas(df_test).to_iterable_dataset()

Currently, ModelController does not support prediction on a streamed dataset, so we have to form and predict each batch manually

We will make predictions in batches of 1000 items

from collections import defaultdict

We only keep these attributes in the results; with a streamed dataset, storing all attributes can be costly

cols_to_keep = ['Title'] + [f'{i}_Department Name' for i in ['pred','pred_prob']]
cols_to_keep
['Title', 'pred_Department Name', 'pred_prob_Department Name']
pred_bs = 1000
results=[]
batch_dic=defaultdict(list)
count=0
batch_count=0
for d in _test_dset_stream:
    # forming a batch
    for k,v in d.items():
        batch_dic[k].append(v)
    count+=1
    
    if count==pred_bs:
        # make predictions on complete batch
        # you can increase gpu batch size here, since inference is less costly than training
        _pred_dset = controller.predict_raw_dset(Dataset.from_dict(batch_dic),do_filtering=False,batch_size=64)
        _pred_dset = _pred_dset.remove_columns([c for c in _pred_dset.column_names if c not in cols_to_keep])
        results.append(_pred_dset)
        print(f'Finish prediction for batch {batch_count+1}')
        
        batch_count+=1
        count=0
        batch_dic=defaultdict(list)

# last batch of <1000 values
if count!=0:
    _pred_dset = controller.predict_raw_dset(Dataset.from_dict(batch_dic),do_filtering=False,batch_size=64)
    _pred_dset = _pred_dset.remove_columns([c for c in _pred_dset.column_names if c not in cols_to_keep])
    results.append(_pred_dset)
    print(f'Finish prediction for batch {batch_count+1}')
-------------------- Start making predictions --------------------
Finish prediction for batch 1
-------------------- Start making predictions --------------------
Finish prediction for batch 2
-------------------- Start making predictions --------------------
Finish prediction for batch 3
-------------------- Start making predictions --------------------
Finish prediction for batch 4
-------------------- Start making predictions --------------------
Finish prediction for batch 5
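Instead of keeping a Python list of per-batch Datasets, you can also merge them with datasets.concatenate_datasets (standard datasets API), which simplifies downstream handling:

from datasets import concatenate_datasets
all_results = concatenate_datasets(results)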

Example of first batch’s prediction

results[0].to_pandas()
Title pred_Department Name pred_prob_Department Name
0 perfect for work and play Tops 0.997284
1 Bottoms 0.989114
2 great pants Bottoms 0.986304
3 surprisingly comfy for a button down Tops 0.990987
4 short and small Tops 0.996322
... ... ... ...
995 great design Bottoms 0.976045
996 i'm wearing the hadley tunic for my birthday!!!! Tops 0.996853
997 tunic has a beautiful print, sparkle detail Dresses 0.489722
998 love the ruffle detail Tops 0.996255
999 simply amazing Tops 0.893297

1000 rows × 3 columns

Last batch

results[-1].to_pandas()
Title pred_Department Name pred_prob_Department Name
0 comfortable meets cute Dresses 0.988495
1 gorgeous!! Tops 0.994889
2 Tops 0.996483
3 Bottoms 0.985292
4 nice summer dress Dresses 0.988174
... ... ... ...
687 cute but... Tops 0.989280
688 size down one or two sizes Tops 0.829144
689 not worth it for the price Bottoms 0.986075
690 beautiful maxi! Dresses 0.988027
691 basic with a twist Tops 0.996946

692 rows × 3 columns

Checking f1 score for all 5 batches

all_preds=[]
for r in results:
    all_preds += r['pred_Department Name'].tolist()
f1_score(true_labels,all_preds,average='macro')
0.7541583296621743