Model Controller Tutorial: Training a Roberta Language Model

import os
# This will specify a GPU (or a list of GPUs) for training
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
from that_nlp_library.text_transformation import *
from that_nlp_library.text_augmentation import *
from that_nlp_library.text_main_lm import *
from that_nlp_library.utils import seed_everything
from that_nlp_library.model_lm_main import *
from underthesea import text_normalize
from functools import partial
from pathlib import Path
from transformers import AutoTokenizer, AutoConfig, AutoModelForMaskedLM
from datasets import load_dataset
import pandas as pd
import numpy as np
from transformers import DataCollatorForLanguageModeling
1. Train a Roberta Language Model From Scratch (with line-by-line tokenization)
a) Create a TextDataLMController object
We will reuse the data and the preprocessing steps from this tutorial
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
tdc = TextDataLMController(dset,
                           main_text='Review Text',
                           filter_dict={'Review Text': lambda x: x is not None},
                           metadatas='Title',
                           content_transformations=[text_normalize,str.lower],
                           cols_to_keep=['Clothing ID','Review Text'],
                           seed=42,
                           verbose=False
                          )
Define our tokenizer for Roberta
_tokenizer = AutoTokenizer.from_pretrained('roberta-base')
Process and tokenize our dataset (using line-by-line tokenization)
block_size = 112
# set max_length=-1 if you want the data collator (instead of the tokenizer) to pad
tdc.process_and_tokenize(_tokenizer,line_by_line=True,max_length=block_size)
tdc.main_ddict
DatasetDict({
train: Dataset({
features: ['Clothing ID', 'Review Text', 'input_ids', 'attention_mask', 'special_tokens_mask'],
num_rows: 18112
})
validation: Dataset({
features: ['Clothing ID', 'Review Text', 'input_ids', 'attention_mask', 'special_tokens_mask'],
num_rows: 4529
})
})
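With line-by-line tokenization, each review becomes one training example. The tokenized columns above are roughly what you would get from calling the Hugging Face tokenizer directly on a single processed review (a sketch of the idea, not the library's internal code):

# Illustration only: approximate per-review tokenization in line-by-line mode
enc = _tokenizer("flattering. love this dress. the detail is amazing.",
                 truncation=True,
                 max_length=block_size,
                 return_special_tokens_mask=True)
print(list(enc.keys()))  # ['input_ids', 'attention_mask', 'special_tokens_mask']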
And set the data collator
tdc.set_data_collator(is_mlm=True,mlm_prob=0.15)
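The masked-language-modeling collator randomly replaces a fraction of the (non-special) tokens with the mask token at batch time and keeps the original tokens as labels. Assuming set_data_collator wraps the standard Hugging Face collator, a minimal equivalent sketch looks like this:

# Sketch: roughly what an is_mlm=True, mlm_prob=0.15 collator does (assumption)
from transformers import DataCollatorForLanguageModeling  # already imported above

collator = DataCollatorForLanguageModeling(tokenizer=_tokenizer,
                                           mlm=True,
                                           mlm_probability=0.15)
cols = ['input_ids','attention_mask','special_tokens_mask']
examples = [{k: tdc.main_ddict['train'][i][k] for k in cols} for i in range(2)]
batch = collator(examples)
print(batch['input_ids'].shape, batch['labels'].shape)  # masked inputs and MLM labels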
b) Initialize and train Roberta Language Model from scratch
_config = AutoConfig.from_pretrained('roberta-base',
                                     # just in case...
                                     vocab_size=len(_tokenizer),
                                     bos_token_id=_tokenizer.bos_token_id,
                                     eos_token_id=_tokenizer.eos_token_id,
                                    )
_config
RobertaConfig {
"_name_or_path": "roberta-base",
"architectures": [
"RobertaForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"bos_token_id": 0,
"classifier_dropout": null,
"eos_token_id": 2,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-05,
"max_position_embeddings": 514,
"model_type": "roberta",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"pad_token_id": 1,
"position_embedding_type": "absolute",
"transformers_version": "4.40.1",
"type_vocab_size": 1,
"use_cache": true,
"vocab_size": 50265
}
_model = language_model_init(AutoModelForMaskedLM,
                             config=_config,
                             cpoint_path=None, # leave this as None to get a non-pretrained model
                             seed=42
                            )
Initiate a new language model from scratch
Total parameters: 124697433
Total trainable parameters: 124697433
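If you want to double-check the reported counts, they can be reproduced with plain PyTorch (this assumes, as is typical, that language_model_init returns a standard torch module):

# Sanity check, independent of the library
total = sum(p.numel() for p in _model.parameters())
trainable = sum(p.numel() for p in _model.parameters() if p.requires_grad)
print(total, trainable)  # both should print 124697433 here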
Create a model controller
controller = ModelLMController(_model,data_store=tdc,seed=42)
And we can start training our model
lr = 1e-4
bs = 32
wd = 0.01
epochs = 4
warmup_ratio = 0.25

controller.fit(epochs,lr,
               batch_size=bs,
               weight_decay=wd,
               warmup_ratio=warmup_ratio,
               save_checkpoint=False,
              )
Epoch | Training Loss | Validation Loss | Accuracy |
---|---|---|---|
1 | No log | 5.603508 | 0.132908 |
2 | 6.342900 | 5.404186 | 0.153836 |
3 | 6.342900 | 5.259489 | 0.169787 |
4 | 5.286800 | 5.232296 | 0.178328 |
Perplexity on validation set: 187.860
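The reported perplexity is simply the exponential of the validation cross-entropy loss, so you can sanity-check it against the table above:

import math
math.exp(5.232296)  # ~187; small differences come from which eval pass the loss is taken from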
controller.trainer.model.save_pretrained('./sample_weights/lm_model')
c) Fill mask using model
trained_model = language_model_init(AutoModelForMaskedLM,
                                    cpoint_path='./sample_weights/lm_model',
                                   )
Total parameters: 124697433
Total trainable parameters: 124697433
controller2 = ModelLMController(trained_model,data_store=tdc,seed=42)
controller2.data_store.tokenizer.mask_token
'<mask>'
inp1 = {'Clothing ID':1,
        'Title':'Flattering',
        'Review Text': "Love this <mask>. The detail is amazing. Runs small I ordered a 12 I'm usually a 10, but still a little snug"
       }
controller2.predict_raw_text(inp1,print_result=True)
Score: 0.160 >>> flattering. love this top. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.113 >>> flattering. love this dress. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.050 >>> flattering. love this!. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.040 >>> flattering. love this is. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.027 >>> flattering. love this sweater. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
--------------------
You can input several raw texts
inp2 = {'Clothing ID':[1,2],
        'Title':['Flattering','Lovely, but small'],
        'Review Text': ["Love this <mask>. The detail is amazing. Runs small I ordered a 12 I'm usually a 10, but still a little snug",
                        "Love this skirt. The detail is amazing. Runs <mask>, I ordered a 12 I'm usually a 10, but still a little snug"]
       }
controller2.predict_raw_text(inp2,print_result=True)
Score: 0.160 >>> flattering. love this top. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.113 >>> flattering. love this dress. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.050 >>> flattering. love this!. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.040 >>> flattering. love this is. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.027 >>> flattering. love this sweater. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
--------------------
Score: 0.071 >>> lovely, but small. love this skirt. the detail is amazing. runs it, i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.052 >>> lovely, but small. love this skirt. the detail is amazing. runs the, i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.050 >>> lovely, but small. love this skirt. the detail is amazing. runs and, i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.048 >>> lovely, but small. love this skirt. the detail is amazing. runs., i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.046 >>> lovely, but small. love this skirt. the detail is amazing. runs,, i ordered a 12 i'm usually a 10, but still a little snug
--------------------
2. Finetune a Roberta Language Model (with line-by-line tokenization)
a) Create a TextDataLMController object
We will reuse the data and the preprocessing steps from this tutorial
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
tdc = TextDataLMController(dset,
                           main_text='Review Text',
                           filter_dict={'Review Text': lambda x: x is not None},
                           metadatas='Title',
                           content_transformations=[text_normalize,str.lower],
                           cols_to_keep=['Clothing ID','Review Text'],
                           seed=42,
                           verbose=False
                          )
Define our tokenizer for Roberta
_tokenizer = AutoTokenizer.from_pretrained('roberta-base')
Process and tokenize our dataset (using line-by-line tokenization)
block_size = 112
# set max_length=-1 if you want the data collator to pad
tdc.process_and_tokenize(_tokenizer,line_by_line=True,max_length=block_size)
tdc.main_ddict
DatasetDict({
train: Dataset({
features: ['Clothing ID', 'Review Text', 'input_ids', 'attention_mask', 'special_tokens_mask'],
num_rows: 18112
})
validation: Dataset({
features: ['Clothing ID', 'Review Text', 'input_ids', 'attention_mask', 'special_tokens_mask'],
num_rows: 4529
})
})
And set the data collator
tdc.set_data_collator(is_mlm=True,mlm_prob=0.15)
b) Initialize and train Roberta Language Model
_config = AutoConfig.from_pretrained('roberta-base',
                                     vocab_size=len(_tokenizer))
_config
RobertaConfig {
"_name_or_path": "roberta-base",
"architectures": [
"RobertaForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"bos_token_id": 0,
"classifier_dropout": null,
"eos_token_id": 2,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-05,
"max_position_embeddings": 514,
"model_type": "roberta",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"pad_token_id": 1,
"position_embedding_type": "absolute",
"transformers_version": "4.40.1",
"type_vocab_size": 1,
"use_cache": true,
"vocab_size": 50265
}
_model = language_model_init(AutoModelForMaskedLM,
                             config=_config,
                             cpoint_path='roberta-base',
                             seed=42
                            )
Total parameters: 124697433
Total trainable parameters: 124697433
Create a model controller
controller = ModelLMController(_model,data_store=tdc,seed=42)
And we can start training our model
lr = 1e-4
bs = 32
wd = 0.01
epochs = 4
warmup_ratio = 0.25

controller.fit(epochs,lr,
               batch_size=bs,
               weight_decay=wd,
               warmup_ratio=warmup_ratio,
               save_checkpoint=False,
              )
Epoch | Training Loss | Validation Loss | Accuracy |
---|---|---|---|
1 | No log | 1.559132 | 0.650172 |
2 | 1.682400 | 1.451852 | 0.667497 |
3 | 1.682400 | 1.360187 | 0.684915 |
4 | 1.405600 | 1.331839 | 0.688361 |
Perplexity on validation set: 3.779
Finetuning from a pretrained checkpoint gives a massive improvement over training from scratch: validation perplexity drops from roughly 188 to 3.8
controller.trainer.model.save_pretrained('./sample_weights/lm_model')
c) Fill mask using model
trained_model = language_model_init(AutoModelForMaskedLM,
                                    cpoint_path='./sample_weights/lm_model',
                                   )
Total parameters: 124697433
Total trainable parameters: 124697433
controller2 = ModelLMController(trained_model,data_store=tdc,seed=42)
controller2.data_store.tokenizer.mask_token
'<mask>'
inp1 = {'Clothing ID':1,
        'Title':'Flattering',
        'Review Text': "Love this <mask>. The detail is amazing. Runs small I ordered a 12 I'm usually a 10, but still a little snug"
       }
controller2.predict_raw_text(inp1,print_result=True)
Score: 0.285 >>> flattering. love this dress. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.244 >>> flattering. love this top. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.187 >>> flattering. love this shirt. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.070 >>> flattering. love this sweater. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.068 >>> flattering. love this skirt. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
--------------------
You can input several raw texts
inp2 = {'Clothing ID':[1,2],
        'Title':['Flattering','Lovely, but small'],
        'Review Text': ["Love this <mask>. The detail is amazing. Runs small I ordered a 12 I'm usually a 10, but still a little snug",
                        "Love this skirt. The detail is amazing. Runs <mask>, I ordered a 12 I'm usually a 10, but still a little snug"]
       }
controller2.predict_raw_text(inp2,print_result=True)
Score: 0.285 >>> flattering. love this dress. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.244 >>> flattering. love this top. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.187 >>> flattering. love this shirt. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.070 >>> flattering. love this sweater. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.068 >>> flattering. love this skirt. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
--------------------
Score: 0.893 >>> lovely, but small. love this skirt. the detail is amazing. runs small, i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.051 >>> lovely, but small. love this skirt. the detail is amazing. runs large, i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.022 >>> lovely, but small. love this skirt. the detail is amazing. runs big, i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.006 >>> lovely, but small. love this skirt. the detail is amazing. runs short, i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.005 >>> lovely, but small. love this skirt. the detail is amazing. runs tiny, i ordered a 12 i'm usually a 10, but still a little snug
--------------------
controller2.predict_raw_text(inp2,print_result=False)
[[{'score': 0.28502416610717773,
'token': 3588,
'token_str': ' dress',
'sequence': "flattering. love this dress. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug"},
{'score': 0.24447907507419586,
'token': 299,
'token_str': ' top',
'sequence': "flattering. love this top. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug"},
{'score': 0.18709176778793335,
'token': 6399,
'token_str': ' shirt',
'sequence': "flattering. love this shirt. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug"},
{'score': 0.06980422139167786,
'token': 23204,
'token_str': ' sweater',
'sequence': "flattering. love this sweater. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug"},
{'score': 0.06781341880559921,
'token': 16576,
'token_str': ' skirt',
'sequence': "flattering. love this skirt. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug"}],
[{'score': 0.8933400511741638,
'token': 650,
'token_str': ' small',
'sequence': "lovely, but small. love this skirt. the detail is amazing. runs small, i ordered a 12 i'm usually a 10, but still a little snug"},
{'score': 0.05062047392129898,
'token': 739,
'token_str': ' large',
'sequence': "lovely, but small. love this skirt. the detail is amazing. runs large, i ordered a 12 i'm usually a 10, but still a little snug"},
{'score': 0.0221096184104681,
'token': 380,
'token_str': ' big',
'sequence': "lovely, but small. love this skirt. the detail is amazing. runs big, i ordered a 12 i'm usually a 10, but still a little snug"},
{'score': 0.006218481808900833,
'token': 765,
'token_str': ' short',
'sequence': "lovely, but small. love this skirt. the detail is amazing. runs short, i ordered a 12 i'm usually a 10, but still a little snug"},
{'score': 0.0046571786515414715,
'token': 5262,
'token_str': ' tiny',
'sequence': "lovely, but small. love this skirt. the detail is amazing. runs tiny, i ordered a 12 i'm usually a 10, but still a little snug"}]]
3. Finetune a Roberta Language Model (with token concatenation)
Since our data contains only short texts (the longest review is around 120 words), the token concatenation technique might not be ideal, as it is better suited to long documents. One perk is that it reduces the number of training examples, since several short reviews are packed into each fixed-size block (see the sketch below). With that said, we will still run an experiment with this technique.
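For reference, token concatenation follows the standard Hugging Face "group_texts" recipe: all tokenized reviews are concatenated into one long token stream, which is then chopped into fixed-size blocks of block_size tokens. This sketch illustrates the idea and is an assumption about the internals, not the library's actual code:

# Illustration of token concatenation / grouping into fixed-size blocks
def group_texts(examples, block_size=112):
    # concatenate every tokenized field across the batch
    concatenated = {k: sum(examples[k], []) for k in examples.keys()}
    # drop the remainder so every block has exactly block_size tokens
    total_length = (len(concatenated['input_ids']) // block_size) * block_size
    return {k: [t[i:i + block_size] for i in range(0, total_length, block_size)]
            for k, t in concatenated.items()}

This also explains why the resulting dataset below has fewer rows than the line-by-line version: each row is now a 112-token block rather than a single review.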
a) Create a TextDataLMController object
We will reuse the data and the preprocessing steps from this tutorial
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
tdc = TextDataLMController(dset,
                           main_text='Review Text',
                           filter_dict={'Review Text': lambda x: x is not None},
                           metadatas='Title',
                           content_transformations=[text_normalize,str.lower],
                           seed=42,
                           verbose=False
                          )
Define our tokenizer for Roberta
_tokenizer = AutoTokenizer.from_pretrained('roberta-base')
Process and tokenize our dataset (using token concatenation technique)
block_size = 112
tdc.process_and_tokenize(_tokenizer,line_by_line=False,max_length=block_size)
tdc.main_ddict
DatasetDict({
train: Dataset({
features: ['input_ids', 'attention_mask', 'special_tokens_mask'],
num_rows: 12901
})
validation: Dataset({
features: ['input_ids', 'attention_mask', 'special_tokens_mask'],
num_rows: 3276
})
})
And set the data collator
tdc.set_data_collator(is_mlm=True,mlm_prob=0.15)
b) Initialize and train Roberta Language Model
_config = AutoConfig.from_pretrained('roberta-base',
                                     vocab_size=len(_tokenizer))
_config
RobertaConfig {
"_name_or_path": "roberta-base",
"architectures": [
"RobertaForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"bos_token_id": 0,
"classifier_dropout": null,
"eos_token_id": 2,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-05,
"max_position_embeddings": 514,
"model_type": "roberta",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"pad_token_id": 1,
"position_embedding_type": "absolute",
"transformers_version": "4.40.1",
"type_vocab_size": 1,
"use_cache": true,
"vocab_size": 50265
}
_model = language_model_init(AutoModelForMaskedLM,
                             config=_config,
                             cpoint_path='roberta-base',
                             seed=42
                            )
Total parameters: 124697433
Total trainable parameters: 124697433
Create a model controller
controller = ModelLMController(_model,data_store=tdc,seed=42)
And we can start training our model
lr = 1e-4
bs = 32
wd = 0.01
epochs = 4
warmup_ratio = 0.25

controller.fit(epochs,lr,
               batch_size=bs,
               weight_decay=wd,
               warmup_ratio=warmup_ratio,
               save_checkpoint=False,
              )
Epoch | Training Loss | Validation Loss | Accuracy |
---|---|---|---|
1 | No log | 1.694216 | 0.628713 |
2 | 1.860100 | 1.601513 | 0.642077 |
3 | 1.860100 | 1.515734 | 0.656354 |
4 | 1.561200 | 1.477700 | 0.662074 |
Perplexity on validation set: 4.413
Slightly higher perplexity than the line-by-line model (4.413 vs 3.779), which is consistent with the short reviews and the smaller number of training blocks
controller.trainer.model.save_pretrained('./sample_weights/lm_model')
c) Fill mask using model
trained_model = language_model_init(AutoModelForMaskedLM,
                                    cpoint_path='./sample_weights/lm_model',
                                   )
Total parameters: 124697433
Total trainable parameters: 124697433
controller2 = ModelLMController(trained_model,data_store=tdc,seed=42)
inp1 = {'Title':'Flattering',
        'Review Text': "Love this <mask>. The detail is amazing. Runs small I ordered a 12 I'm usually a 10, but still a little snug"
       }
controller2.predict_raw_text(inp1,print_result=True)
Score: 0.328 >>> flattering. love this top. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.304 >>> flattering. love this dress. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.128 >>> flattering. love this shirt. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.076 >>> flattering. love this sweater. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.045 >>> flattering. love this skirt. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
--------------------
You can input several raw texts
inp2 = {'Title':['Flattering','Lovely, but small'],
        'Review Text': ["Love this <mask>. The detail is amazing. Runs small I ordered a 12 I'm usually a 10, but still a little snug",
                        "Love this skirt. The detail is amazing. Runs <mask>, I ordered a 12 I'm usually a 10, but still a little snug"]
       }
controller2.predict_raw_text(inp2,print_result=True)
Score: 0.328 >>> flattering. love this top. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.304 >>> flattering. love this dress. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.128 >>> flattering. love this shirt. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.076 >>> flattering. love this sweater. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.045 >>> flattering. love this skirt. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug
--------------------
Score: 0.893 >>> lovely, but small. love this skirt. the detail is amazing. runs small, i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.062 >>> lovely, but small. love this skirt. the detail is amazing. runs large, i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.020 >>> lovely, but small. love this skirt. the detail is amazing. runs big, i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.004 >>> lovely, but small. love this skirt. the detail is amazing. runs short, i ordered a 12 i'm usually a 10, but still a little snug
Score: 0.003 >>> lovely, but small. love this skirt. the detail is amazing. runs tiny, i ordered a 12 i'm usually a 10, but still a little snug
--------------------