import os
Model Controller Tutorial: Classification
# This will specify a GPU (or a list of GPUs) for training
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
from that_nlp_library.text_transformation import *
from that_nlp_library.text_augmentation import *
from that_nlp_library.text_main import *
from that_nlp_library.utils import seed_everything
from underthesea import text_normalize
from functools import partial
from pathlib import Path
from transformers import DataCollatorWithPadding,RobertaTokenizer
from transformers.models.roberta.modeling_roberta import RobertaForSequenceClassification
import nlpaug.augmenter.char as nac
from datasets import load_dataset
import random
import pandas as pd
import numpy as np
1. Define the custom augmentation function
def nlp_aug_stochastic(x,aug=None,p=0.5):
    if not isinstance(x,list):
        if random.random()<p: return aug.augment(x)[0]
        return x
    news=[]
    originals=[]
    for _x in x:
        if random.random()<p: news.append(_x)
        else: originals.append(_x)
    # only perform augmentation when needed
    if len(news): news = aug.augment(news)
    return news+originals
aug = nac.KeyboardAug(aug_char_max=3,aug_char_p=0.1,aug_word_p=0.07)
nearby_aug_func = partial(nlp_aug_stochastic,aug=aug,p=0.3)
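Before wiring this into the data pipeline, we can sanity-check the wrapper on a couple of made-up sentences. A minimal sketch (the exact output will vary, since KeyboardAug is random):

random.seed(42)
_texts = ['i love this dress','the fabric feels cheap']
print(nlp_aug_stochastic(_texts,aug=aug,p=1)) # every sentence goes through KeyboardAug
print(nlp_aug_stochastic(_texts,aug=aug,p=0)) # returned unchanged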
2. Create a TextDataController object
We will reuse the data and the preprocessing steps from the previous tutorial
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
pd.Series(list(map(lambda x: len(x.split()),[text for text in dset['Review Text'] if text is not None]))).describe()
count 22641.000000
mean 60.196679
std 28.534612
min 2.000000
25% 36.000000
50% 59.000000
75% 88.000000
max 115.000000
dtype: float64
tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         sup_types='classification',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations=[nearby_aug_func,str.lower],
                         # add "str.lower" here because nearby_aug might return uppercase character
                         val_ratio=0.2,
                         batch_size=1000,
                         seed=42,
                         num_proc=20,
                         verbose=True
                        )
Define our tokenizer for Roberta
_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
/home/quan/anaconda3/envs/nlp_dev/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
warnings.warn(
Process and tokenize our dataset
tdc.process_and_tokenize(_tokenizer,max_length=100,shuffle_trn=True)
-------------------- Start Main Text Processing --------------------
-------------------- Data Filtering --------------------
----- Do <lambda> on Review Text -----
----- Do <lambda> on Department Name -----
Done
----- Metadata Simple Processing & Concatenating to Main Content -----
Done
----- Label Encoding -----
Done
-------------------- Text Transformation --------------------
----- text_normalize -----
----- lower -----
Done
-------------------- Train Test Split --------------------
Validation split based on val_ratio
Done
-------------------- Dropping unused features --------------------
Done
- Number of rows leaked: 0, which is 0.00% of training set
-------------------- Text Augmentation --------------------
----- nlp_aug_stochastic -----
----- lower -----
Done
-------------------- Shuffling and flattening train set --------------------
Done
-------------------- Tokenization --------------------
Done
tdc.main_ddict
DatasetDict({
train: Dataset({
features: ['Title', 'Review Text', 'Division Name', 'Department Name', 'label', 'input_ids', 'attention_mask'],
num_rows: 18102
})
validation: Dataset({
features: ['Title', 'Review Text', 'Division Name', 'Department Name', 'label', 'input_ids', 'attention_mask'],
num_rows: 4526
})
})
Let’s see one example of how those content transformations and augmentations affect our input
sample_txt = 'This is not what I expected 🤬. I gulped when I put this in my bag during retailer days because the price was still too much ... but thought this has to be wonderful to charge so much,right??'
print(sample_txt)
This is not what I expected 🤬. I gulped when I put this in my bag during retailer days because the price was still too much ... but thought this has to be wonderful to charge so much,right??
two_steps_tokenization_explain(sample_txt,_tokenizer,
                               content_tfms=[text_normalize,str.lower],
                               aug_tfms=[partial(nlp_aug_stochastic,aug=aug,p=1),str.lower]
                              )
------- Text Transformation Explained -------
----- Raw sentence -----
This is not what I expected 🤬. I gulped when I put this in my bag during retailer days because the price was still too much ... but thought this has to be wonderful to charge so much,right??
----- Content Transformations (on both train and test) -----
--- text_normalize ---
This is not what I expected 🤬 . I gulped when I put this in my bag during retailer days because the price was still too much ... but thought this has to be wonderful to charge so much , right ? ?
--- lower ---
this is not what i expected 🤬 . i gulped when i put this in my bag during retailer days because the price was still too much ... but thought this has to be wonderful to charge so much , right ? ?
----- Augmentations (on train only) -----
--- nlp_aug_stochastic ---
tMis is not what i expected 🤬. i gulped when i put this in my bag during rrtailer Cays because the price was still too much. .. but thought this has to be wonderful to Vharge so much, right??
--- lower ---
tmis is not what i expected 🤬. i gulped when i put this in my bag during rrtailer cays because the price was still too much. .. but thought this has to be wonderful to vharge so much, right??
------- Tokenizer Explained -------
----- Input -----
tmis is not what i expected 🤬. i gulped when i put this in my bag during rrtailer cays because the price was still too much. .. but thought this has to be wonderful to vharge so much, right??
----- Tokenized results -----
{'input_ids': [0, 26989, 354, 16, 45, 99, 939, 421, 8103, 10470, 11582, 4, 939, 42445, 9700, 77, 939, 342, 42, 11, 127, 3298, 148, 910, 338, 17624, 254, 740, 4113, 142, 5, 425, 21, 202, 350, 203, 4, 29942, 53, 802, 42, 34, 7, 28, 4613, 7, 748, 298, 16347, 98, 203, 6, 235, 28749, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
----- Results from tokenizer.convert_ids_to_tokens -----
['<s>', 'tm', 'is', 'Ġis', 'Ġnot', 'Ġwhat', 'Ġi', 'Ġexpected', 'ĠðŁ', '¤', '¬', '.', 'Ġi', 'Ġgul', 'ped', 'Ġwhen', 'Ġi', 'Ġput', 'Ġthis', 'Ġin', 'Ġmy', 'Ġbag', 'Ġduring', 'Ġr', 'r', 'tail', 'er', 'Ġc', 'ays', 'Ġbecause', 'Ġthe', 'Ġprice', 'Ġwas', 'Ġstill', 'Ġtoo', 'Ġmuch', '.', 'Ġ..', 'Ġbut', 'Ġthought', 'Ġthis', 'Ġhas', 'Ġto', 'Ġbe', 'Ġwonderful', 'Ġto', 'Ġv', 'h', 'arge', 'Ġso', 'Ġmuch', ',', 'Ġright', '??', '</s>']
----- Results from tokenizer.decode -----
<s>tmis is not what i expected 🤬. i gulped when i put this in my bag during rrtailer cays because the price was still too much... but thought this has to be wonderful to vharge so much, right??</s>
3. Model Experiment: Roberta Vanilla Single-Head Classification
from that_nlp_library.models.roberta.classifiers import *
from that_nlp_library.model_main import *
from sklearn.metrics import f1_score, accuracy_score
a) Train Roberta model using the Model Controller
Here are the unique values in our label
tdc.label_lists[0]
['Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend']
num_classes = len(tdc.label_lists[0])
Let’s define our model
seed_everything(42)
model_name = 'roberta-base'
_model = RobertaForSequenceClassification.from_pretrained(model_name,num_labels=num_classes)
_model = _model.to('cuda:0')
/home/quan/anaconda3/envs/nlp_dev/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
warnings.warn(
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Then we can define the metrics to use, and the Model Controller object
metric_funcs = [partial(f1_score,average='macro'),accuracy_score]
# we will use both f1_macro and accuracy score as metrics
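Judging by how f1_score and accuracy_score are passed in, any sklearn-style callable with the signature metric(y_true, y_pred) should work in this list, so adding another metric is straightforward (extended_metric_funcs is just an illustrative name):

from sklearn.metrics import balanced_accuracy_score
extended_metric_funcs = [partial(f1_score,average='macro'),accuracy_score,balanced_accuracy_score]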
controller = ModelController(_model,
                             data_store=tdc,
                             seed=42)
And we can start training our model
lr = 1e-4
bs = 32
wd = 0.01
epochs = 3
controller.fit(epochs,lr,
               metric_funcs=metric_funcs,
               batch_size=bs,
               weight_decay=wd,
               save_checkpoint=False,
               compute_metrics=compute_metrics,
              )
Epoch | Training Loss | Validation Loss | F1 Score Department name | Accuracy Score Department name |
---|---|---|---|---|
1 | No log | 0.304115 | 0.743430 | 0.914494 |
2 | 0.417500 | 0.264885 | 0.749442 | 0.919797 |
3 | 0.417500 | 0.281572 | 0.747713 | 0.918471 |
Logging your training
You can log your training using HuggingFace:
Supported platforms are “azure_ml”, “comet_ml”, “mlflow”, “neptune”, “tensorboard”, “clearml” and “wandb”
References:
https://huggingface.co/docs/transformers/v4.40.2/en/main_classes/trainer#transformers.TrainingArguments
https://docs.wandb.ai/guides/integrations/huggingface
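For example, to report to wandb you would typically set the project name via an environment variable before calling fit (per the wandb documentation linked above; 'my_project' is just a placeholder name):

import os
os.environ['WANDB_PROJECT'] = 'my_project'  # placeholder project name
# os.environ['WANDB_LOG_MODEL'] = 'end'     # optionally upload the final checkpoint to wandb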
controller.fit(epochs,lr,
               metric_funcs=metric_funcs,
               batch_size=bs,
               weight_decay=wd,
               save_checkpoint=False,
               compute_metrics=compute_metrics,
               hf_report_to='wandb'
              )
You can save your model weights at the end of your training
controller.trainer.model.save_pretrained('./sample_weights/model_progress')
Or you can save your weights at every epoch during your training
controller.fit(epochs,lr,
               metric_funcs=metric_funcs,
               batch_size=bs,
               weight_decay=wd,
               save_checkpoint=True,
               o_dir='my_saved_weights',
               compute_metrics=compute_metrics,
              )
b) Train model with only a Tokenized DatasetDict (no TextDataController)
This part assumes you already have your tokenized DatasetDict (you don't even need to pad your tokens, as demonstrated below). We will 'borrow' TextDataController to create such a DatasetDict for us
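The reason unpadded tokens are fine is that padding can be deferred to batch time with a padding collator, such as the DataCollatorWithPadding imported at the top. A small sketch of the idea (not necessarily how ModelController wires it internally):

collator = DataCollatorWithPadding(tokenizer=_tokenizer)
# given variable-length input_ids, the collator pads each batch to its longest member
batch = collator([{'input_ids': [0, 100, 2]}, {'input_ids': [0, 100, 200, 300, 2]}])
print(batch['input_ids'].shape)  # torch.Size([2, 5])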
# import copy
# main_ddict = copy.deepcopy(tdc.main_ddict)
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')

tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         sup_types='classification',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations=[nearby_aug_func,str.lower],
                         # add "str.lower" here because nearby_aug might return uppercase character
                         val_ratio=0.2,
                         batch_size=1000,
                         seed=42,
                         num_proc=20,
                         verbose=True
                        )

_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# set max_length to -1 to skip the padding
tdc.process_and_tokenize(_tokenizer,max_length=-1,shuffle_trn=True)
-------------------- Start Main Text Processing --------------------
-------------------- Data Filtering --------------------
----- Do <lambda> on Review Text -----
----- Do <lambda> on Department Name -----
Done
----- Metadata Simple Processing & Concatenating to Main Content -----
Done
----- Label Encoding -----
Done
-------------------- Text Transformation --------------------
----- text_normalize -----
----- lower -----
Done
-------------------- Train Test Split --------------------
Validation split based on val_ratio
Done
-------------------- Dropping unused features --------------------
Done
- Number of rows leaked: 0, which is 0.00% of training set
-------------------- Text Augmentation --------------------
----- nlp_aug_stochastic -----
----- lower -----
Done
-------------------- Shuffling and flattening train set --------------------
Done
-------------------- Tokenization --------------------
Done
main_ddict = tdc.main_ddict
main_ddict
DatasetDict({
train: Dataset({
features: ['Title', 'Review Text', 'Division Name', 'Department Name', 'label', 'input_ids', 'attention_mask'],
num_rows: 18102
})
validation: Dataset({
features: ['Title', 'Review Text', 'Division Name', 'Department Name', 'label', 'input_ids', 'attention_mask'],
num_rows: 4526
})
})
Note that your DatasetDict must contain the tokenized fields in addition to the raw text (typically 'input_ids', 'token_type_ids', 'attention_mask')
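If you build such a DatasetDict yourself instead of borrowing TextDataController, the minimal recipe looks roughly like this (a sketch with made-up column names and data):

from datasets import Dataset, DatasetDict

raw = DatasetDict({
    'train': Dataset.from_dict({'text': ['first review','second review'], 'label': [0, 1]}),
    'validation': Dataset.from_dict({'text': ['third review'], 'label': [2]}),
})
# tokenize without padding; padding is handled per batch at training time
tokenized = raw.map(lambda b: _tokenizer(b['text']), batched=True)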
num_classes = 6 # the number of classes
seed_everything(42)
model_name = 'roberta-base'
_model = RobertaForSequenceClassification.from_pretrained(model_name,num_labels=num_classes)
_model = _model.to('cuda:0')
/home/quan/anaconda3/envs/nlp_dev/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
warnings.warn(
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
metric_funcs = [partial(f1_score,average='macro'),accuracy_score]

# note that you omit the `data_store` argument
controller = ModelController(_model,seed=42)
lr = 1e-4
bs = 32
wd = 0.01
epochs = 3
controller.fit(epochs,lr,
               ddict=main_ddict, # Put in your tokenized DatasetDict here
               metric_funcs=metric_funcs,
               label_names='Department Name',
               head_sizes=num_classes,
               batch_size=bs,
               weight_decay=wd,
               save_checkpoint=False,
               compute_metrics=compute_metrics,
               tokenizer=_tokenizer,
              )
Epoch | Training Loss | Validation Loss | F1 Score Department name | Accuracy Score Department name |
---|---|---|---|---|
1 | No log | 0.299611 | 0.733846 | 0.910738 |
2 | 0.414500 | 0.257584 | 0.748776 | 0.920018 |
3 | 0.414500 | 0.263301 | 0.747707 | 0.921564 |
controller.trainer.model.save_pretrained('./sample_weights/model_progress')
c) Make predictions, using TextDataController
Load trained model
trained_model = RobertaForSequenceClassification.from_pretrained('./sample_weights/model_progress',num_labels=6).to('cuda:0')
controller = ModelController(trained_model,tdc,seed=42)
Predict Train/Validation set
Make prediction on all validation set
df_val = controller.predict_ddict(ds_type='validation')
-------------------- Start making predictions --------------------
df_val = df_val.to_pandas()
df_val.head()
Title | Review Text | Division Name | Department Name | label | input_ids | attention_mask | pred_Department Name | pred_prob_Department Name | |
---|---|---|---|---|---|---|---|---|---|
0 | general petite . . such a fun jacket ! great t... | general petite | Intimate | 2 | [0, 15841, 4716, 1459, 479, 479, 215, 10, 1531... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | Jackets | 0.823920 | |
1 | simple and elegant | general petite . simple and elegant . i though... | general petite | Tops | 4 | [0, 15841, 4716, 1459, 479, 2007, 8, 14878, 47... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | Tops | 0.995652 |
2 | retro and pretty | general . retro and pretty . this top has a bi... | general | Tops | 4 | [0, 15841, 479, 11299, 8, 1256, 479, 42, 299, ... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | Tops | 0.995805 |
3 | summer/fall wear | general petite . summer / fall wear . i first ... | general petite | Dresses | 1 | [0, 15841, 4716, 1459, 479, 1035, 1589, 1136, ... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | Dresses | 0.985551 |
4 | perfect except slip | general petite . perfect except slip . this is... | general petite | Dresses | 1 | [0, 15841, 4716, 1459, 479, 1969, 4682, 9215, ... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | Dresses | 0.985531 |
You can compute the metric yourself to see if it matches the last training epoch's result above
f1_score(df_val['Department Name'],df_val['pred_Department Name'],average='macro')
0.7485565717033943
You can also make predictions on the entire training set by changing the argument ds_type to "train"
Predict Test set
We will go through the details of how to make predictions on a completely new, raw dataset using our trained model. For now, let's reuse the sample csv and pretend it's our test set
df_test = pd.read_csv('sample_data/Womens_Clothing_Reviews.csv',encoding='utf-8-sig').sample(frac=0.2,random_state=1)
# drop NaN values in the label column
df_test = df_test[~df_test['Department Name'].isna()].reset_index(drop=True)

# save the labels, as we will calculate some metrics later. We also filter out labels whose Review Text is NaN,
# as the test set will go through the same filtering process
true_labels = df_test.loc[~df_test['Review Text'].isna(),'Department Name'].values

# drop the label (you don't need to, but this is necessary to simulate an actual test set)
df_test.drop('Department Name',axis=1,inplace=True)
df_test.shape
(4692, 9)
df_test.head(5)
Clothing ID | Age | Title | Review Text | Rating | Recommended IND | Positive Feedback Count | Division Name | Class Name | |
---|---|---|---|---|---|---|---|---|---|
0 | 872 | 42 | Perfect for work and play | This shirt works for both going out and going ... | 5 | 1 | 0 | General | Knits |
1 | 1033 | 40 | NaN | I don't know why i had the opposite problem mo... | 4 | 1 | 0 | General Petite | Jeans |
2 | 1037 | 45 | Great pants | These cords are great--lightweight for fl wint... | 5 | 1 | 1 | General Petite | Jeans |
3 | 829 | 35 | Surprisingly comfy for a button down | I am a 10 m and got the 10. it fits perfectly ... | 5 | 1 | 1 | General Petite | Blouses |
4 | 872 | 29 | Short and small | The shirt is mostly a thick sweatshirt materia... | 3 | 0 | 15 | General Petite | Knits |
From here, you have two options:

- Use TextDataController to process your data, then let ModelController perform the prediction
- Convert your dataframe to a HuggingFace Dataset, and let the ModelController take care of both the preprocessing and the prediction
Option 1:
_test_dset_processed = tdc.prepare_test_dataset_from_df(df_test,validate=True,do_filtering=True)
- Input Validation Precheck -
Data contains missing values!
-----> List of columns and the number of missing values for each
Title 758
Review Text 164
dtype: int64
Data contains duplicated values!
-----> Number of duplications: 2 rows
-------------------- Start Test Set Transformation --------------------
-------------------- Data Filtering --------------------
----- Do <lambda> on Review Text -----
Done
----- Metadata Simple Processing & Concatenating to Main Content -----
Done
-------------------- Text Transformation --------------------
----- text_normalize -----
----- lower -----
Done
-------------------- Tokenization --------------------
Done
_test_dset_processed
Dataset({
features: ['Title', 'Review Text', 'Division Name', 'input_ids', 'attention_mask'],
num_rows: 4528
})
_test_dset_predicted = controller.predict_ddict(_test_dset_processed)
-------------------- Start making predictions --------------------
df_test_predicted = _test_dset_predicted.to_pandas()
df_test_predicted.head()
Title | Review Text | Division Name | input_ids | attention_mask | pred_Department Name | pred_prob_Department Name | |
---|---|---|---|---|---|---|---|
0 | perfect for work and play | general . perfect for work and play . this shi... | general | [0, 15841, 479, 1969, 13, 173, 8, 310, 479, 42... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | Tops | 0.996438 |
1 | general petite . . i don't know why i had the ... | general petite | [0, 15841, 4716, 1459, 479, 479, 939, 218, 75,... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | Bottoms | 0.976738 | |
2 | great pants | general petite . great pants . thes e cords ar... | general petite | [0, 15841, 4716, 1459, 479, 372, 9304, 479, 5,... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | Bottoms | 0.958788 |
3 | surprisingly comfy for a button down | general petite . surprisingly comfy for a butt... | general petite | [0, 15841, 4716, 1459, 479, 10262, 3137, 24382... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | Tops | 0.994487 |
4 | short and small | general petite . short and small . the shirt i... | general petite | [0, 15841, 4716, 1459, 479, 765, 8, 650, 479, ... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | Tops | 0.995179 |
Option 2:
from datasets import Dataset
If you want to turn off the info printing, you can do so via the TextDataController (stored as data_store) in the ModelController object
controller.data_store.set_verbose(False)
_test_dset = Dataset.from_pandas(df_test)
_test_dset_predicted = controller.predict_raw_dset(_test_dset,
                                                   do_filtering=True, # since we have some text filtering in the processing
                                                  )
-------------------- Start making predictions --------------------
df_test_predicted = _test_dset_predicted.to_pandas()
df_test_predicted.head()
Title | Review Text | Division Name | input_ids | attention_mask | pred_Department Name | pred_prob_Department Name | |
---|---|---|---|---|---|---|---|
0 | perfect for work and play | general . perfect for work and play . this shi... | general | [0, 15841, 479, 1969, 13, 173, 8, 310, 479, 42... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | Tops | 0.996438 |
1 | general petite . . i don't know why i had the ... | general petite | [0, 15841, 4716, 1459, 479, 479, 939, 218, 75,... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | Bottoms | 0.976738 | |
2 | great pants | general petite . great pants . thes e cords ar... | general petite | [0, 15841, 4716, 1459, 479, 372, 9304, 479, 5,... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | Bottoms | 0.958788 |
3 | surprisingly comfy for a button down | general petite . surprisingly comfy for a butt... | general petite | [0, 15841, 4716, 1459, 479, 10262, 3137, 24382... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | Tops | 0.994487 |
4 | short and small | general petite . short and small . the shirt i... | general petite | [0, 15841, 4716, 1459, 479, 765, 8, 650, 479, ... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | Tops | 0.995179 |
Let’s quickly check the f1 score to make sure everything works correctly
f1_score(true_labels,df_test_predicted['pred_Department Name'],average='macro')
0.759160993145196
This is not too far off from the validation F1 score. Note that this 'test set' is just a sample from the original dataset, not an entirely new dataset
We can even predict the top-k results
_test_dset = Dataset.from_pandas(df_test)
_test_dset_predicted = controller.predict_raw_dset(_test_dset,
                                                   do_filtering=True,
                                                   topk=3
                                                  )
-------------------- Start making predictions --------------------
df_test_predicted = _test_dset_predicted.to_pandas()

df_test_predicted.head()
Title | Review Text | Division Name | input_ids | attention_mask | pred_Department Name | pred_prob_Department Name | |
---|---|---|---|---|---|---|---|
0 | perfect for work and play | general . perfect for work and play . this shi... | general | [0, 15841, 479, 1969, 13, 173, 8, 310, 479, 42... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | [Tops, Intimate, Trend] | [0.9964378, 0.0014704004, 0.00085006363] |
1 | general petite . . i don't know why i had the ... | general petite | [0, 15841, 4716, 1459, 479, 479, 939, 218, 75,... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | [Bottoms, Intimate, Trend] | [0.97673845, 0.017872315, 0.0033529706] | |
2 | great pants | general petite . great pants . thes e cords ar... | general petite | [0, 15841, 4716, 1459, 479, 372, 9304, 479, 5,... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | [Bottoms, Intimate, Trend] | [0.95878834, 0.033563487, 0.004869911] |
3 | surprisingly comfy for a button down | general petite . surprisingly comfy for a butt... | general petite | [0, 15841, 4716, 1459, 479, 10262, 3137, 24382... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | [Tops, Intimate, Jackets] | [0.994487, 0.0027335314, 0.0009791912] |
4 | short and small | general petite . short and small . the shirt i... | general petite | [0, 15841, 4716, 1459, 479, 765, 8, 650, 479, ... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | [Tops, Intimate, Trend] | [0.9951786, 0.002501535, 0.00096233515] |
If we just want to make a prediction on a small amount of data (a single sentence, or a few sentences), we can use ModelController.predict_raw_text
# Since we have some metadatas (Title and Division Name), we need to define a dictionary containing those values
raw_content = {'Review Text': 'This shirt is so comfortable I love it!',
               'Title': 'Great shirt',
               'Division Name': 'general'}
If you don’t use metadata, just create a string instead, e.g.
raw_content='This shirt is so comfortable I love it!'
df_result = controller.predict_raw_text(raw_content)
-------------------- Start making predictions --------------------
df_result
{'Review Text': ['general . great shirt . this shirt is so comfortable i love it !'],
'Title': ['great shirt'],
'Division Name': ['general'],
'input_ids': [[0,
15841,
479,
372,
6399,
479,
42,
6399,
16,
98,
3473,
939,
657,
24,
27785,
2]],
'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
'pred_Department Name': ['Tops'],
'pred_prob_Department Name': [0.996221661567688]}
df_result = controller.predict_raw_text(raw_content,topk=3)
-------------------- Start making predictions --------------------
df_result
{'Review Text': ['general . great shirt . this shirt is so comfortable i love it !'],
'Title': ['great shirt'],
'Division Name': ['general'],
'input_ids': [[0,
15841,
479,
372,
6399,
479,
42,
6399,
16,
98,
3473,
939,
657,
24,
27785,
2]],
'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
'pred_Department Name': [['Tops', 'Intimate', 'Trend']],
'pred_prob_Department Name': [[0.996221661567688,
0.0016704618465155363,
0.0008719302131794393]]}
d) Make predictions, using only Tokenized DatasetDict
Load trained model
# Load the trained model saved in section 3b
trained_model = RobertaForSequenceClassification.from_pretrained('./sample_weights/model_progress',num_labels=6).to('cuda:0')
controller = ModelController(trained_model,seed=42)
Predict Train/Validation set
main_ddict
DatasetDict({
train: Dataset({
features: ['Title', 'Review Text', 'Division Name', 'Department Name', 'label', 'input_ids', 'attention_mask'],
num_rows: 18102
})
validation: Dataset({
features: ['Title', 'Review Text', 'Division Name', 'Department Name', 'label', 'input_ids', 'attention_mask'],
num_rows: 4526
})
})
Since we don’t use a TextDataController
, we have to define a few arguments to make it work
label_names = 'Department Name'
num_classes = 6
class_predefined = ['Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend']
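The order of class_predefined must match the label encoding used when the data was processed; here it is simply tdc.label_lists[0] from earlier. A quick guard, assuming the earlier tdc is still in scope:

assert class_predefined == tdc.label_lists[0]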
df_val = controller.predict_ddict(main_ddict,
                                  ds_type='validation',
                                  is_multilabel=False,
                                  tokenizer=_tokenizer,
                                  label_names=label_names,
                                  class_names_predefined=class_predefined
                                 )
-------------------- Start making predictions --------------------
df_val = df_val.to_pandas()
df_val.head()
Title | Review Text | Division Name | Department Name | label | input_ids | attention_mask | pred_Department Name | pred_prob_Department Name | |
---|---|---|---|---|---|---|---|---|---|
0 | general petite . . such a fun jacket ! great t... | general petite | Intimate | 2 | [0, 15841, 4716, 1459, 479, 479, 215, 10, 1531... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | Jackets | 0.823920 | |
1 | simple and elegant | general petite . simple and elegant . i though... | general petite | Tops | 4 | [0, 15841, 4716, 1459, 479, 2007, 8, 14878, 47... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | Tops | 0.995665 |
2 | retro and pretty | general . retro and pretty . this top has a bi... | general | Tops | 4 | [0, 15841, 479, 11299, 8, 1256, 479, 42, 299, ... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | Tops | 0.995805 |
3 | summer/fall wear | general petite . summer / fall wear . i first ... | general petite | Dresses | 1 | [0, 15841, 4716, 1459, 479, 1035, 1589, 1136, ... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | Dresses | 0.985551 |
4 | perfect except slip | general petite . perfect except slip . this is... | general petite | Dresses | 1 | [0, 15841, 4716, 1459, 479, 1969, 4682, 9215, ... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | Dresses | 0.985531 |
Predict Test set
df_test = pd.read_csv('sample_data/Womens_Clothing_Reviews.csv',encoding='utf-8-sig').sample(frac=0.2,random_state=1)
# drop NaN values in the label column
df_test = df_test[~df_test['Department Name'].isna()].reset_index(drop=True)

# save the labels, as we will calculate some metrics later. We also filter out labels whose Review Text is NaN,
# as the test set will go through the same filtering process
true_labels = df_test.loc[~df_test['Review Text'].isna(),'Department Name'].values

# drop the label (you don't need to, but this is necessary to simulate an actual test set)
df_test.drop('Department Name',axis=1,inplace=True)
Similarly, your test dataset must already be preprocessed and tokenized, so that the final dataset has some or all of these fields: input_ids, token_type_ids, attention_mask. For now we will borrow the previous tdc to do the preprocessing for us.
_test_dset_processed = tdc.prepare_test_dataset_from_df(df_test,validate=True,do_filtering=True)
- Input Validation Precheck -
Data contains missing values!
-----> List of columns and the number of missing values for each
Title 758
Review Text 164
dtype: int64
Data contains duplicated values!
-----> Number of duplications: 2 rows
_test_dset_processed
Dataset({
features: ['Title', 'Review Text', 'Division Name', 'input_ids', 'attention_mask'],
num_rows: 4528
})
Again, we are using TextDataController to make this process easier to handle. If you have your own pipeline, feel free to use it to produce the processed test dataset. Also, at this point, all you need in your dataset is some (or all) of these features: input_ids, token_type_ids, attention_mask. You can drop other features if you want, though it's not required
_test_dset_processed = _test_dset_processed.remove_columns(['Title','Review Text','Division Name'])
df_test_predicted = controller.predict_ddict(_test_dset_processed,
                                             is_multilabel=False,
                                             tokenizer=_tokenizer,
                                             label_names=label_names,
                                             class_names_predefined=class_predefined
                                            )
-------------------- Start making predictions --------------------
df_test_predicted = df_test_predicted.to_pandas()
df_test_predicted.head()
input_ids | attention_mask | pred_Department Name | pred_prob_Department Name | |
---|---|---|---|---|
0 | [0, 15841, 479, 1969, 13, 173, 8, 310, 479, 42... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | Tops | 0.996438 |
1 | [0, 15841, 4716, 1459, 479, 479, 939, 218, 75,... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | Bottoms | 0.976738 |
2 | [0, 15841, 4716, 1459, 479, 372, 9304, 479, 5,... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | Bottoms | 0.958788 |
3 | [0, 15841, 4716, 1459, 479, 10262, 3137, 24382... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | Tops | 0.994487 |
4 | [0, 15841, 4716, 1459, 479, 765, 8, 650, 479, ... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | Tops | 0.995179 |
4. Model Experiment: Roberta Custom Classification
from that_nlp_library.models.roberta.classifiers import *
from that_nlp_library.model_main import *
from sklearn.metrics import f1_score, accuracy_score
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')

tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         sup_types='classification',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations=[nearby_aug_func,str.lower],
                         # add "str.lower" here because nearby_aug might return uppercase character
                         val_ratio=0.2,
                         batch_size=1000,
                         seed=42,
                         num_proc=20,
                         verbose=False
                        )

_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

tdc.process_and_tokenize(_tokenizer,max_length=100,shuffle_trn=True)
/home/quan/anaconda3/envs/nlp_dev/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
warnings.warn(
a) Define and train a custom Roberta model
num_classes = len(tdc.label_lists[0])
num_classes
6
Let’s define a Roberta model (without a head), because we will create our custom classification head
from transformers.models.roberta.modeling_roberta import RobertaModel
roberta_body = RobertaModel.from_pretrained('roberta-base')
/home/quan/anaconda3/envs/nlp_dev/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
warnings.warn(
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Then we can define a classification head. One trick we can use to boost the performance of the entire model is to concatenate the [CLS] outputs from the last four layers of the pre-trained Roberta model (source: https://ieeexplore.ieee.org/document/9335912). We have already defined such a custom head (ConcatHeadSimple), along with the architecture needed to make it work (RobertaHiddenStateConcatForSequenceClassification).
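To make the trick concrete, here is roughly what the concatenated feature looks like in plain transformers code (a sketch only; the library's actual head may differ in details):

import torch

enc = _tokenizer('a sample sentence', return_tensors='pt')
with torch.no_grad():
    outputs = roberta_body(**enc, output_hidden_states=True)

# outputs.hidden_states is a tuple of (num_layers + 1) tensors of shape (batch, seq_len, hidden_size)
layer2concat = 2
# take the [CLS] vector (position 0) from the last `layer2concat` layers and concatenate them
cls_concat = torch.cat([h[:, 0, :] for h in outputs.hidden_states[-layer2concat:]], dim=-1)
print(cls_concat.shape)  # torch.Size([1, 1536]), i.e. layer2concat * 768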
# our model is more complex, so it's best to define some of its arguments
_model_kwargs = {
    # overall model hyperparams
    'head_class_sizes': num_classes,
    'head_class': ConcatHeadSimple,
    # classification head hyperparams
    'layer2concat': 2, # you can change the number of layers to concat (default is 4, based on the paper)
    'classifier_dropout': 0.1
}
model = model_init_classification(model_class = RobertaHiddenStateConcatForSequenceClassification,
                                  cpoint_path = 'roberta-base',
                                  output_hidden_states=True, # since we are using the 'hidden layer concatenation' technique
                                  seed=42,
                                  body_model=roberta_body,
                                  model_kwargs = _model_kwargs)
metric_funcs = [partial(f1_score,average='macro'),accuracy_score]
controller = ModelController(model,tdc,seed=42)
Loading body weights. This assumes the body is the very first block of your custom architecture
Total parameters: 124654854
Total trainable parameters: 124654854
And we can start training our model
seed_everything(42)
lr = 1e-4
bs = 32
wd = 0.01
epochs = 3
controller.fit(epochs,lr,
               metric_funcs=metric_funcs,
               batch_size=bs,
               weight_decay=wd,
               save_checkpoint=False,
               compute_metrics=compute_metrics,
              )
Epoch | Training Loss | Validation Loss | F1 Score Department name | Accuracy Score Department name |
---|---|---|---|---|
1 | No log | 0.296447 | 0.744318 | 0.914936 |
2 | 0.428200 | 0.258439 | 0.752792 | 0.922669 |
3 | 0.428200 | 0.272308 | 0.747529 | 0.920018 |
b) Make predictions
Make prediction on all validation set
df_val = controller.predict_ddict(ds_type='validation')
-------------------- Start making predictions --------------------
df_val = df_val.to_pandas()
df_val.head()
Title | Review Text | Division Name | Department Name | label | input_ids | attention_mask | pred_Department Name | pred_prob_Department Name | |
---|---|---|---|---|---|---|---|---|---|
0 | general petite . . such a fun jacket ! great t... | general petite | Intimate | 2 | [0, 15841, 4716, 1459, 479, 479, 215, 10, 1531... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | Jackets | 0.818531 | |
1 | simple and elegant | general petite . simple and elegant . i though... | general petite | Tops | 4 | [0, 15841, 4716, 1459, 479, 2007, 8, 14878, 47... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | Tops | 0.997387 |
2 | retro and pretty | general . retro and pretty . this top has a bi... | general | Tops | 4 | [0, 15841, 479, 11299, 8, 1256, 479, 42, 299, ... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | Tops | 0.997603 |
3 | summer/fall wear | general petite . summer / fall wear . i first ... | general petite | Dresses | 1 | [0, 15841, 4716, 1459, 479, 1035, 1589, 1136, ... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | Dresses | 0.988438 |
4 | perfect except slip | general petite . perfect except slip . this is... | general petite | Dresses | 1 | [0, 15841, 4716, 1459, 479, 1969, 4682, 9215, ... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | Dresses | 0.989150 |
You can compute the metric yourself to see if it matches the last training epoch's result above
f1_score(df_val['Department Name'],df_val['pred_Department Name'],average='macro')
0.7475294947215362
df_val = controller.predict_ddict(ds_type='validation',topk=2)
df_val = df_val.to_pandas()
df_val.head()
-------------------- Start making predictions --------------------
Title | Review Text | Division Name | Department Name | label | input_ids | attention_mask | pred_Department Name | pred_prob_Department Name | |
---|---|---|---|---|---|---|---|---|---|
0 | general petite . . such a fun jacket ! great t... | general petite | Intimate | 2 | [0, 15841, 4716, 1459, 479, 479, 215, 10, 1531... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | [Jackets, Tops] | [0.8185308, 0.14819466] | |
1 | simple and elegant | general petite . simple and elegant . i though... | general petite | Tops | 4 | [0, 15841, 4716, 1459, 479, 2007, 8, 14878, 47... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | [Tops, Intimate] | [0.9973871, 0.0010666925] |
2 | retro and pretty | general . retro and pretty . this top has a bi... | general | Tops | 4 | [0, 15841, 479, 11299, 8, 1256, 479, 42, 299, ... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | [Tops, Intimate] | [0.997603, 0.001102674] |
3 | summer/fall wear | general petite . summer / fall wear . i first ... | general petite | Dresses | 1 | [0, 15841, 4716, 1459, 479, 1035, 1589, 1136, ... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | [Dresses, Trend] | [0.98843807, 0.005955467] |
4 | perfect except slip | general petite . perfect except slip . this is... | general petite | Dresses | 1 | [0, 15841, 4716, 1459, 479, 1969, 4682, 9215, ... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | [Dresses, Trend] | [0.9891495, 0.0058816983] |