from transformers import AutoModelForCausalLM, AutoModelForMaskedLM
language_model_init
language_model_init (model_class, cpoint_path=None, config=None,
device=None, seed=None)
Initialize a language model, either masked or causal.
| | Type | Default | Details |
| --- | --- | --- | --- |
| model_class | | | Model's class object, e.g. AutoModelForMaskedLM |
| cpoint_path | NoneType | None | Either a model name on the HuggingFace Hub or the path to a model checkpoint. Pass None to train from scratch |
| config | NoneType | None | Model config. If not provided, AutoConfig is used to load the config from cpoint_path |
| device | NoneType | None | Device to train on |
| seed | NoneType | None | Random seed |
_model1 = language_model_init(AutoModelForMaskedLM, 'roberta-base')
_model1
Total parameters: 124697433
Total trainable parameters: 124697433
RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (intermediate): RobertaIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (intermediate_act_fn): GELUActivation()
          )
          (output): RobertaOutput(
            (dense): Linear(in_features=3072, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
    )
  )
  (lm_head): RobertaLMHead(
    (dense): Linear(in_features=768, out_features=768, bias=True)
    (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (decoder): Linear(in_features=768, out_features=50265, bias=True)
  )
)
_model1 = language_model_init(AutoModelForMaskedLM, 'nguyenvulebinh/envibert')
_model1
Total parameters: 70764377
Total trainable parameters: 70764377
RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(59993, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (intermediate): RobertaIntermediate(
            (dense): Linear(in_features=768, out_features=1024, bias=True)
            (intermediate_act_fn): GELUActivation()
          )
          (output): RobertaOutput(
            (dense): Linear(in_features=1024, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
    )
  )
  (lm_head): RobertaLMHead(
    (dense): Linear(in_features=768, out_features=768, bias=True)
    (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (decoder): Linear(in_features=768, out_features=59993, bias=True)
  )
)
_model2 = language_model_init(AutoModelForCausalLM, 'gpt2')
_model2
Total parameters: 124439808
Total trainable parameters: 124439808
GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)
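The three examples above all load pretrained checkpoints. Per the cpoint_path and config descriptions, passing cpoint_path=None together with an explicit config initializes an untrained model instead. A minimal sketch of that path (the choice of the roberta-base config and the seed value are illustrative, not from the examples above):

```python
from transformers import AutoConfig, AutoModelForMaskedLM

# Borrow roberta-base's architecture but start from randomly initialized weights
_config = AutoConfig.from_pretrained('roberta-base')
_scratch_model = language_model_init(AutoModelForMaskedLM,
                                     cpoint_path=None,  # None => train from scratch
                                     config=_config,
                                     seed=42)
```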
finetune_lm
finetune_lm (lr, bs, wd, epochs, ddict, tokenizer, o_dir='./tmp_weights',
save_checkpoint=False, model=None, model_init=None,
data_collator=None, compute_metrics=None,
grad_accum_steps=2, lr_scheduler_type='cosine',
warmup_ratio=0.1, no_valid=False, val_bs=None, seed=None,
report_to='none', trainer_class=None, len_train=None)
The main model training/finetuning function
| | Type | Default | Details |
| --- | --- | --- | --- |
| lr | | | Learning rate |
| bs | | | Batch size |
| wd | | | Weight decay |
| epochs | | | Number of epochs |
| ddict | | | The HuggingFace DatasetDict |
| tokenizer | | | HuggingFace tokenizer |
| o_dir | str | ./tmp_weights | Directory to save weights |
| save_checkpoint | bool | False | Whether to save weights (checkpoints) to o_dir |
| model | NoneType | None | NLP model |
| model_init | NoneType | None | A function to initialize the model |
| data_collator | NoneType | None | HuggingFace data collator |
| compute_metrics | NoneType | None | A function to compute metrics; defaults to compute_lm_accuracy |
| grad_accum_steps | int | 2 | The batch at each step will be divided by this integer, and gradients will be accumulated over grad_accum_steps steps |
| lr_scheduler_type | str | cosine | The scheduler type to use: linear, cosine, cosine_with_restarts, polynomial, constant, or constant_with_warmup |
| warmup_ratio | float | 0.1 | The warmup ratio for some lr schedulers |
| no_valid | bool | False | Whether there is a validation set or not |
| val_bs | NoneType | None | Validation batch size |
| seed | NoneType | None | Random seed |
| report_to | str | none | The list of integrations to report the results and logs to. Supported platforms are "azure_ml", "comet_ml", "mlflow", "neptune", "tensorboard", "clearml" and "wandb". Use "all" to report to all installed integrations, or "none" for no integration |
| trainer_class | NoneType | None | The class name of your custom trainer, if any |
| len_train | NoneType | None | Estimated number of samples in the whole training set (for streaming datasets only) |
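A minimal sketch of calling finetune_lm directly for masked-LM finetuning. The toy in-memory corpus, the split names ('train'/'validation'), and the hyperparameter values are assumptions for illustration only; the same functionality can also be reached through ModelLMController.fit, documented below.

```python
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, DataCollatorForLanguageModeling

tokenizer = AutoTokenizer.from_pretrained('roberta-base')

def tokenize(batch):
    return tokenizer(batch['text'], truncation=True, max_length=128)

# Toy corpus; the split names are an assumption about what the function expects
texts = ['the movie was surprisingly good', 'the plot made little sense',
         'a solid cast carries a thin script', 'I would watch it again']
raw = DatasetDict({'train': Dataset.from_dict({'text': texts}),
                   'validation': Dataset.from_dict({'text': texts[:2]})})
ddict = raw.map(tokenize, batched=True, remove_columns=['text'])

collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True)
model = language_model_init(AutoModelForMaskedLM, 'roberta-base')

finetune_lm(lr=1e-4, bs=2, wd=0.01, epochs=1,
            ddict=ddict, tokenizer=tokenizer,
            model=model, data_collator=collator,
            o_dir='./tmp_weights', save_checkpoint=False, seed=42)
```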
ModelLMController
ModelLMController (model, data_store=None, seed=None)
Controller class that wraps an NLP language model (and, optionally, a TextDataLMController/TextDataLMControllerStreaming data store) for training and inference.
| | Type | Default | Details |
| --- | --- | --- | --- |
| model | | | NLP language model |
| data_store | NoneType | None | A TextDataLMController/TextDataLMControllerStreaming object |
| seed | NoneType | None | Random seed |
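A short sketch of constructing the controller. Here dlc stands for a TextDataLMController (or TextDataLMControllerStreaming) that has already been built; its construction is covered in the data-controller docs and is assumed here.

```python
# `dlc` is an already-built TextDataLMController / TextDataLMControllerStreaming (assumed)
model = language_model_init(AutoModelForMaskedLM, 'roberta-base', seed=42)
controller = ModelLMController(model, data_store=dlc, seed=42)
```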
ModelLMController.fit
ModelLMController.fit (epochs, learning_rate, ddict=None,
compute_metrics=None, batch_size=16,
val_batch_size=None, weight_decay=0.01,
lr_scheduler_type='cosine', warmup_ratio=0.1,
o_dir='./tmp_weights', save_checkpoint=False,
hf_report_to='none', grad_accum_steps=2,
tokenizer=None, data_collator=None, is_mlm=None,
trainer_class=None, len_train=None)
| | Type | Default | Details |
| --- | --- | --- | --- |
| epochs | | | Number of epochs |
| learning_rate | | | Learning rate |
| ddict | NoneType | None | DatasetDict to fit (will override data_store) |
| compute_metrics | NoneType | None | A function to compute metrics; defaults to compute_lm_accuracy |
| batch_size | int | 16 | Batch size |
| val_batch_size | NoneType | None | Validation batch size. Set to batch_size if None |
| weight_decay | float | 0.01 | Weight decay |
| lr_scheduler_type | str | cosine | The scheduler type to use: linear, cosine, cosine_with_restarts, polynomial, constant, or constant_with_warmup |
| warmup_ratio | float | 0.1 | The warmup ratio for some lr schedulers |
| o_dir | str | ./tmp_weights | Directory to save weights |
| save_checkpoint | bool | False | Whether to save weights (checkpoints) to o_dir |
| hf_report_to | str | none | The list of HuggingFace-allowed integrations to report the results and logs to |
| grad_accum_steps | int | 2 | Gradients will be accumulated over grad_accum_steps steps |
| tokenizer | NoneType | None | Tokenizer (to override the one in data_store) |
| data_collator | NoneType | None | Data collator (to override the one in data_store) |
| is_mlm | NoneType | None | Whether this is a masked LM or a causal LM |
| trainer_class | NoneType | None | The class name of your custom trainer, if any |
| len_train | NoneType | None | Estimated number of samples in the whole training set (for streaming datasets only) |
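Continuing the sketch above (hyperparameter values are illustrative): because a data_store was supplied at construction, ddict, tokenizer and data_collator can be left out and taken from it, or passed here explicitly to override it.

```python
controller.fit(epochs=3,
               learning_rate=1e-4,
               batch_size=32,
               weight_decay=0.01,
               lr_scheduler_type='cosine',
               warmup_ratio=0.1,
               is_mlm=True,            # masked LM; set False for a causal LM
               o_dir='./tmp_weights',
               save_checkpoint=False)
```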
ModelLMController.predict_raw_text
ModelLMController.predict_raw_text (content:dict|list|str,
print_result=True, **kwargs)
| | Type | Default | Details |
| --- | --- | --- | --- |
| content | dict \| list \| str | | Either a single sentence, a list of sentences, or a dictionary whose keys are metadata and whose values are lists |
| print_result | bool | True | Whether to print the result in a readable format, or to return it |
| kwargs | | | |
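A hedged sketch of raw-text prediction with the fitted controller from the sketches above. The `<mask>` placeholder follows RoBERTa's mask-token convention and is an assumption about the expected input format for a masked LM.

```python
# Single sentence, results printed in readable form
controller.predict_raw_text('The weather today is <mask>.', print_result=True)

# A list of sentences, with the results returned instead of printed
preds = controller.predict_raw_text(['I really <mask> this movie.',
                                     'The ending was completely <mask>.'],
                                    print_result=False)
```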