Model Controller Tutorial: Training a Roberta Language Model
import os
# Specify which GPU (or list of GPUs) to use for training
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
from that_nlp_library.text_transformation import *
from that_nlp_library.text_augmentation import *
from that_nlp_library.text_main_lm import *
from that_nlp_library.utils import seed_everything
from that_nlp_library.model_lm_main import *
from underthesea import text_normalize
from functools import partial
from pathlib import Path
from transformers import AutoTokenizer, AutoConfig, AutoModelForMaskedLM
from datasets import load_dataset
import pandas as pd
import numpy as np
from transformers import DataCollatorForLanguageModeling
1. Train a Roberta Language Model From Scratch (with line-by-line tokenization)
a) Create a TextDataLMController object
We will reuse the data and the preprocessing steps from this tutorial
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
tdc = TextDataLMController(dset,
main_text='Review Text',
filter_dict={'Review Text': lambda x: x is not None},
metadatas='Title',
content_transformations=[text_normalize,str.lower],
cols_to_keep=['Clothing ID','Review Text'],
seed=42,
verbose=False
)
Define our tokenizer for Roberta
_tokenizer = AutoTokenizer.from_pretrained('roberta-base')
Process and tokenize our dataset (using line-by-line tokenization)
block_size=112
tdc.process_and_tokenize(_tokenizer,line_by_line=True,max_length=block_size)
# set max_length=-1 if you want the data collator (instead of the tokenizer) to pad
tdc.main_ddict
DatasetDict({
train: Dataset({
features: ['Clothing ID', 'Review Text', 'input_ids', 'attention_mask', 'special_tokens_mask'],
num_rows: 18112
})
validation: Dataset({
features: ['Clothing ID', 'Review Text', 'input_ids', 'attention_mask', 'special_tokens_mask'],
num_rows: 4529
})
})
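With line_by_line=True, each review is tokenized independently and truncated to block_size tokens, which is why every row above carries its own input_ids and special_tokens_mask. As a rough sketch of what this corresponds to in plain transformers (an assumption about the library's internals, not its exact code):

def tokenize_line_by_line(batch, tokenizer, max_length=112):
    # each review becomes its own example, truncated to the block size
    return tokenizer(batch['Review Text'],
                     truncation=True,
                     max_length=max_length,
                     return_special_tokens_mask=True)  # needed later by the MLM data collator

# e.g. dset.map(partial(tokenize_line_by_line, tokenizer=_tokenizer), batched=True)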
And set the data collator
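tdc.set_data_collator(is_mlm=True, mlm_prob=0.15) presumably wraps transformers' DataCollatorForLanguageModeling (imported above); a minimal sketch of the equivalent manual setup, under that assumption:

# Sketch: an MLM collator that masks 15% of (non-special) tokens at batch time
data_collator = DataCollatorForLanguageModeling(tokenizer=_tokenizer,
                                                mlm=True,
                                                mlm_probability=0.15)

If you tokenized with max_length=-1, this collator is also what pads each batch dynamically to its longest sequence.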
tdc.set_data_collator(is_mlm=True,mlm_prob=0.15)
b) Initialize and train Roberta Language Model from scratch
_config = AutoConfig.from_pretrained('roberta-base',
# just in case...
vocab_size=len(_tokenizer),
bos_token_id=_tokenizer.bos_token_id,
eos_token_id=_tokenizer.eos_token_id,
)
_config
RobertaConfig {
"_name_or_path": "roberta-base",
"architectures": [
"RobertaForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"bos_token_id": 0,
"classifier_dropout": null,
"eos_token_id": 2,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-05,
"max_position_embeddings": 514,
"model_type": "roberta",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"pad_token_id": 1,
"position_embedding_type": "absolute",
"transformers_version": "4.40.1",
"type_vocab_size": 1,
"use_cache": true,
"vocab_size": 50265
}
_model = language_model_init(AutoModelForMaskedLM,
config=_config,
cpoint_path=None, # leave this as None to get a non-pretrained model
seed=42
)
Initiate a new language model from scratch
Total parameters: 124697433
Total trainable parameters: 124697433
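Passing cpoint_path=None builds the model from the config alone, i.e. with randomly initialized weights, while a checkpoint path (used in the finetuning section below) loads pretrained weights. In plain transformers terms this roughly corresponds to (a sketch, assuming that mapping):

# random weights, built from the config only (cpoint_path=None)
scratch_model = AutoModelForMaskedLM.from_config(_config)
# pretrained weights (cpoint_path='roberta-base')
pretrained_model = AutoModelForMaskedLM.from_pretrained('roberta-base', config=_config)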
Create a model controller
controller = ModelLMController(_model,data_store=tdc,seed=42)
And we can start training our model
lr = 1e-4
bs=32
wd=0.01
epochs= 4
warmup_ratio=0.25
controller.fit(epochs,lr,
batch_size=bs,
weight_decay=wd,
warmup_ratio=warmup_ratio,
save_checkpoint=False,
)
| Epoch | Training Loss | Validation Loss | Accuracy |
|---|---|---|---|
| 1 | No log | 5.603508 | 0.132908 |
| 2 | 6.342900 | 5.404186 | 0.153836 |
| 3 | 6.342900 | 5.259489 | 0.169787 |
| 4 | 5.286800 | 5.232296 | 0.178328 |
Perplexity on validation set: 187.860
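The reported perplexity is just the exponential of the validation cross-entropy loss, so it can be sanity-checked from the table above:

import math
print(math.exp(5.232296))  # ~187, close to the reported 187.860 (small differences come from the final evaluation pass)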
controller.trainer.model.save_pretrained('./sample_weights/lm_model')
c) Fill mask using model
trained_model = language_model_init(AutoModelForMaskedLM,
cpoint_path='./sample_weights/lm_model',
)
Total parameters: 124697433
Total trainable parameters: 124697433
controller2 = ModelLMController(trained_model,data_store=tdc,seed=42)
controller2.data_store.tokenizer.mask_token
'<mask>'
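Instead of hard-coding '<mask>', you can build masked inputs from the tokenizer attribute shown above, which stays valid if you switch tokenizers; a small illustrative sketch:

mask = controller2.data_store.tokenizer.mask_token  # '<mask>' for Roberta
masked_review = f"Love this {mask}. The detail is amazing. Runs small I ordered a 12 I'm usually a 10, but still a little snug"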
inp1 = {'Clothing ID':1,
'Title':'Flattering',
'Review Text': "Love this <mask>. The detail is amazing. Runs small I ordered a 12 I'm usually a 10, but still a little snug"
}
controller2.predict_raw_text(inp1,print_result=True)
Score: 0.160 >>> flattering. love this top. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.113 >>> flattering. love this dress. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.050 >>> flattering. love this!. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.040 >>> flattering. love this is. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.027 >>> flattering. love this sweater. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
--------------------
You can input several raw texts
inp2 = {'Clothing ID':[1,2],
'Title':['Flattering','Lovely, but small'],
'Review Text': ["Love this <mask>. The detail is amazing. Runs small I ordered a 12 I'm usually a 10, but still a little snug",
"Love this skirt. The detail is amazing. Runs <mask>, I ordered a 12 I'm usually a 10, but still a little snug"]
}
controller2.predict_raw_text(inp2,print_result=True)
Score: 0.160 >>> flattering. love this top. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.113 >>> flattering. love this dress. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.050 >>> flattering. love this!. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.040 >>> flattering. love this is. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.027 >>> flattering. love this sweater. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
--------------------
Score: 0.071 >>> lovely, but small. love this skirt. the detail is amazing. runs it, i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.052 >>> lovely, but small. love this skirt. the detail is amazing. runs the, i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.050 >>> lovely, but small. love this skirt. the detail is amazing. runs and, i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.048 >>> lovely, but small. love this skirt. the detail is amazing. runs., i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.046 >>> lovely, but small. love this skirt. the detail is amazing. runs,, i ordered a 12 i'm usually a 10, but still a little snug
--------------------
2. Finetune a Roberta Language Model (with line-by-line tokenization)
a) Create a TextDataLMController object
We will reuse the data and the preprocessing steps from this tutorial
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
tdc = TextDataLMController(dset,
main_text='Review Text',
filter_dict={'Review Text': lambda x: x is not None},
metadatas='Title',
content_transformations=[text_normalize,str.lower],
cols_to_keep=['Clothing ID','Review Text'],
seed=42,
verbose=False
)
Define our tokenizer for Roberta
_tokenizer = AutoTokenizer.from_pretrained('roberta-base')
Process and tokenize our dataset (using line-by-line tokenization)
block_size=112
tdc.process_and_tokenize(_tokenizer,line_by_line=True,max_length=block_size)
# set max_length=-1 if you want the data collator to pad
tdc.main_ddict
DatasetDict({
train: Dataset({
features: ['Clothing ID', 'Review Text', 'input_ids', 'attention_mask', 'special_tokens_mask'],
num_rows: 18112
})
validation: Dataset({
features: ['Clothing ID', 'Review Text', 'input_ids', 'attention_mask', 'special_tokens_mask'],
num_rows: 4529
})
})
And set the data collator
tdc.set_data_collator(is_mlm=True,mlm_prob=0.15)
b) Initialize and train Roberta Language Model
_config = AutoConfig.from_pretrained('roberta-base',
vocab_size=len(_tokenizer))
_config
RobertaConfig {
"_name_or_path": "roberta-base",
"architectures": [
"RobertaForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"bos_token_id": 0,
"classifier_dropout": null,
"eos_token_id": 2,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-05,
"max_position_embeddings": 514,
"model_type": "roberta",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"pad_token_id": 1,
"position_embedding_type": "absolute",
"transformers_version": "4.40.1",
"type_vocab_size": 1,
"use_cache": true,
"vocab_size": 50265
}
_model = language_model_init(AutoModelForMaskedLM,
config=_config,
cpoint_path='roberta-base',
seed=42
)
Total parameters: 124697433
Total trainable parameters: 124697433
Create a model controller
controller = ModelLMController(_model,data_store=tdc,seed=42)
And we can start training our model
lr = 1e-4
bs=32
wd=0.01
epochs= 4
warmup_ratio=0.25
controller.fit(epochs,lr,
batch_size=bs,
weight_decay=wd,
warmup_ratio=warmup_ratio,
save_checkpoint=False,
)
| Epoch | Training Loss | Validation Loss | Accuracy |
|---|---|---|---|
| 1 | No log | 1.559132 | 0.650172 |
| 2 | 1.682400 | 1.451852 | 0.667497 |
| 3 | 1.682400 | 1.360187 | 0.684915 |
| 4 | 1.405600 | 1.331839 | 0.688361 |
Perplexity on validation set: 3.779
Finetuning from a pretrained model yields a massive improvement in metrics: validation perplexity drops from 187.9 (from scratch) to 3.8.
controller.trainer.model.save_pretrained('./sample_weights/lm_model')
c) Fill mask using model
trained_model = language_model_init(AutoModelForMaskedLM,
cpoint_path='./sample_weights/lm_model',
)
Total parameters: 124697433
Total trainable parameters: 124697433
controller2 = ModelLMController(trained_model,data_store=tdc,seed=42)
controller2.data_store.tokenizer.mask_token
'<mask>'
inp1 = {'Clothing ID':1,
'Title':'Flattering',
'Review Text': "Love this <mask>. The detail is amazing. Runs small I ordered a 12 I'm usually a 10, but still a little snug"
}
controller2.predict_raw_text(inp1,print_result=True)
Score: 0.285 >>> flattering. love this dress. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.244 >>> flattering. love this top. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.187 >>> flattering. love this shirt. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.070 >>> flattering. love this sweater. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.068 >>> flattering. love this skirt. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
--------------------
You can input several raw texts
inp2 = {'Clothing ID':[1,2],
'Title':['Flattering','Lovely, but small'],
'Review Text': ["Love this <mask>. The detail is amazing. Runs small I ordered a 12 I'm usually a 10, but still a little snug",
"Love this skirt. The detail is amazing. Runs <mask>, I ordered a 12 I'm usually a 10, but still a little snug"]
}
controller2.predict_raw_text(inp2,print_result=True)
Score: 0.285 >>> flattering. love this dress. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.244 >>> flattering. love this top. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.187 >>> flattering. love this shirt. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.070 >>> flattering. love this sweater. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.068 >>> flattering. love this skirt. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
--------------------
Score: 0.893 >>> lovely, but small. love this skirt. the detail is amazing. runs small, i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.051 >>> lovely, but small. love this skirt. the detail is amazing. runs large, i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.022 >>> lovely, but small. love this skirt. the detail is amazing. runs big, i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.006 >>> lovely, but small. love this skirt. the detail is amazing. runs short, i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.005 >>> lovely, but small. love this skirt. the detail is amazing. runs tiny, i ordered a 12 i'm usually a 10, but still a little snug
--------------------
controller2.predict_raw_text(inp2,print_result=False)
[[{'score': 0.28502416610717773,
'token': 3588,
'token_str': ' dress',
'sequence': "flattering. love this dress. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug"},
{'score': 0.24447907507419586,
'token': 299,
'token_str': ' top',
'sequence': "flattering. love this top. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug"},
{'score': 0.18709176778793335,
'token': 6399,
'token_str': ' shirt',
'sequence': "flattering. love this shirt. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug"},
{'score': 0.06980422139167786,
'token': 23204,
'token_str': ' sweater',
'sequence': "flattering. love this sweater. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug"},
{'score': 0.06781341880559921,
'token': 16576,
'token_str': ' skirt',
'sequence': "flattering. love this skirt. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug"}],
[{'score': 0.8933400511741638,
'token': 650,
'token_str': ' small',
'sequence': "lovely, but small. love this skirt. the detail is amazing. runs small, i ordered a 12 i'm usually a 10, but still a little snug"},
{'score': 0.05062047392129898,
'token': 739,
'token_str': ' large',
'sequence': "lovely, but small. love this skirt. the detail is amazing. runs large, i ordered a 12 i'm usually a 10, but still a little snug"},
{'score': 0.0221096184104681,
'token': 380,
'token_str': ' big',
'sequence': "lovely, but small. love this skirt. the detail is amazing. runs big, i ordered a 12 i'm usually a 10, but still a little snug"},
{'score': 0.006218481808900833,
'token': 765,
'token_str': ' short',
'sequence': "lovely, but small. love this skirt. the detail is amazing. runs short, i ordered a 12 i'm usually a 10, but still a little snug"},
{'score': 0.0046571786515414715,
'token': 5262,
'token_str': ' tiny',
'sequence': "lovely, but small. love this skirt. the detail is amazing. runs tiny, i ordered a 12 i'm usually a 10, but still a little snug"}]]
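With print_result=False the method returns one list of candidate dicts (score, token, token_str, sequence) per input text, as shown above. A small sketch of keeping only the top prediction for each input:

results = controller2.predict_raw_text(inp2, print_result=False)
for candidates in results:
    best = max(candidates, key=lambda c: c['score'])  # candidates already come sorted by score
    print(best['token_str'].strip(), round(best['score'], 3))
# -> dress 0.285
# -> small 0.893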
3. Finetune a Roberta Language Model (with token concatenation)
Since our data contains only short texts (the longest reviews are around 120 words), the token concatenation technique might not be ideal here; it is better suited to long documents. One perk is that it reduces the number of training rows, because short reviews are packed together into fixed-length blocks. With that in mind, we will still run some experiments using this technique; the grouping step is sketched below.
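Token concatenation (line_by_line=False) typically works by tokenizing every review, concatenating all token ids into one long stream, and cutting that stream into blocks of exactly block_size tokens, which explains the smaller row counts further down (12901 train rows vs 18112 with line-by-line tokenization). A rough sketch of the standard grouping step (an illustration of the general technique, not necessarily this library's exact implementation):

def group_texts(examples, block_size=112):
    # concatenate every tokenized field (input_ids, attention_mask, special_tokens_mask, ...)
    concatenated = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated['input_ids'])
    # drop the last partial block so every example is exactly block_size tokens
    total_length = (total_length // block_size) * block_size
    return {k: [v[i:i + block_size] for i in range(0, total_length, block_size)]
            for k, v in concatenated.items()}

# e.g. tokenized_dset.map(group_texts, batched=True), where tokenized_dset is the already-tokenized dataset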
a) Create a TextDataLMController object
We will reuse the data and the preprocessing steps from this tutorial
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
tdc = TextDataLMController(dset,
main_text='Review Text',
filter_dict={'Review Text': lambda x: x is not None},
metadatas='Title',
content_transformations=[text_normalize,str.lower],
seed=42,
verbose=False
)
Define our tokenizer for Roberta
_tokenizer = AutoTokenizer.from_pretrained('roberta-base')
Process and tokenize our dataset (using token concatenation technique)
block_size=112
tdc.process_and_tokenize(_tokenizer,line_by_line=False,max_length=block_size)
tdc.main_ddict
DatasetDict({
train: Dataset({
features: ['input_ids', 'attention_mask', 'special_tokens_mask'],
num_rows: 12901
})
validation: Dataset({
features: ['input_ids', 'attention_mask', 'special_tokens_mask'],
num_rows: 3276
})
})
And set the data collator
tdc.set_data_collator(is_mlm=True,mlm_prob=0.15)
b) Initialize and train Roberta Language Model
_config = AutoConfig.from_pretrained('roberta-base',
vocab_size=len(_tokenizer))
_config
RobertaConfig {
"_name_or_path": "roberta-base",
"architectures": [
"RobertaForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"bos_token_id": 0,
"classifier_dropout": null,
"eos_token_id": 2,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-05,
"max_position_embeddings": 514,
"model_type": "roberta",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"pad_token_id": 1,
"position_embedding_type": "absolute",
"transformers_version": "4.40.1",
"type_vocab_size": 1,
"use_cache": true,
"vocab_size": 50265
}
_model = language_model_init(AutoModelForMaskedLM,
config=_config,
cpoint_path='roberta-base',
seed=42
)
Total parameters: 124697433
Total trainable parameters: 124697433
Create a model controller
controller = ModelLMController(_model,data_store=tdc,seed=42)
And we can start training our model
lr = 1e-4
bs=32
wd=0.01
epochs= 4
warmup_ratio=0.25
controller.fit(epochs,lr,
batch_size=bs,
weight_decay=wd,
warmup_ratio=warmup_ratio,
save_checkpoint=False,
)
| Epoch | Training Loss | Validation Loss | Accuracy |
|---|---|---|---|
| 1 | No log | 1.694216 | 0.628713 |
| 2 | 1.860100 | 1.601513 | 0.642077 |
| 3 | 1.860100 | 1.515734 | 0.656354 |
| 4 | 1.561200 | 1.477700 | 0.662074 |
Perplexity on validation set: 4.413
Slightly higher perplexity (4.413 vs 3.779) than the previous line-by-line model
controller.trainer.model.save_pretrained('./sample_weights/lm_model')
c) Fill mask using model
trained_model = language_model_init(AutoModelForMaskedLM,
cpoint_path='./sample_weights/lm_model',
)
Total parameters: 124697433
Total trainable parameters: 124697433
controller2 = ModelLMController(trained_model,data_store=tdc,seed=42)
inp1 = {'Title':'Flattering',
'Review Text': "Love this <mask>. The detail is amazing. Runs small I ordered a 12 I'm usually a 10, but still a little snug"
}
controller2.predict_raw_text(inp1,print_result=True)
Score: 0.328 >>> flattering. love this top. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.304 >>> flattering. love this dress. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.128 >>> flattering. love this shirt. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.076 >>> flattering. love this sweater. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.045 >>> flattering. love this skirt. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
--------------------
You can input several raw texts
inp2 = {'Title':['Flattering','Lovely, but small'],
'Review Text': ["Love this <mask>. The detail is amazing. Runs small I ordered a 12 I'm usually a 10, but still a little snug",
"Love this skirt. The detail is amazing. Runs <mask>, I ordered a 12 I'm usually a 10, but still a little snug"]
}
controller2.predict_raw_text(inp2,print_result=True)
Score: 0.328 >>> flattering. love this top. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.304 >>> flattering. love this dress. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.128 >>> flattering. love this shirt. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.076 >>> flattering. love this sweater. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.045 >>> flattering. love this skirt. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
--------------------
Score: 0.893 >>> lovely, but small. love this skirt. the detail is amazing. runs small, i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.062 >>> lovely, but small. love this skirt. the detail is amazing. runs large, i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.020 >>> lovely, but small. love this skirt. the detail is amazing. runs big, i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.004 >>> lovely, but small. love this skirt. the detail is amazing. runs short, i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.003 >>> lovely, but small. love this skirt. the detail is amazing. runs tiny, i ordered a 12 i'm usually a 10, but still a little snug
--------------------