import os
GPT2 model (Regression)
In this series, we walk through some of the capabilities of this library: single-head classification, multi-head classification, multi-label classification, and regression. If you want a more detailed tutorial, see the library's documentation.
# Specify the GPU (or list of GPUs) to use for training
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
from that_nlp_library.text_transformation import *
from that_nlp_library.text_augmentation import *
from that_nlp_library.text_main import *
from that_nlp_library.utils import seed_everything
from underthesea import text_normalize
from functools import partial
from pathlib import Path
import pandas as pd
import numpy as np
import nlpaug.augmenter.char as nac
from datasets import load_dataset
import random
from transformers import AutoTokenizer
from datasets import Dataset
import torch
from transformers.models.gpt2.modeling_gpt2 import GPT2Model
Define the custom augmentation function
def nlp_aug_stochastic(x, aug=None, p=0.5):
    """Stochastically augment ``x``, touching each item with probability ``p``.

    Args:
        x: A single item, or a ``list`` of items, to (possibly) augment.
        aug: Augmenter object exposing ``augment(data)``. It is called with a
            single item or with a list of items and is expected to return a
            list with one output per input (assumption based on how the
            result is indexed here -- confirm against the augmenter used).
        p: Probability that any given item is augmented.

    Returns:
        For a non-list ``x``: ``aug.augment(x)[0]`` when selected, otherwise
        ``x`` unchanged. For a list: a new list of the same length in which
        the selected items are replaced by their augmented versions.

    Note:
        Items keep their original positions. Returning the augmented items
        first and the untouched ones last would reorder the batch and break
        the alignment with any parallel column (e.g. labels).
    """
    if not isinstance(x, list):
        if random.random() < p:
            return aug.augment(x)[0]
        return x

    # One random draw per item, in input order, to decide which to augment.
    picked = [idx for idx in range(len(x)) if random.random() < p]

    result = list(x)
    # Only call the augmenter when at least one item was selected.
    if picked:
        augmented = aug.augment([x[idx] for idx in picked])
        for idx, new_item in zip(picked, augmented):
            result[idx] = new_item
    return result
# Character-level keyboard augmenter from nlpaug
aug = nac.KeyboardAug(aug_char_max=3,aug_char_p=0.1,aug_word_p=0.07)
# Bind the augmenter and the probability so the result takes only the text(s)
nearby_aug_func = partial(nlp_aug_stochastic,aug=aug,p=0.3)
Create a TextDataController object
We will reuse the data and the preprocessings in this tutorial
# Load the sample reviews CSV as a HuggingFace dataset
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
# Data controller for single-head regression on the 'Rating' column
tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Rating',
                         sup_types='regression',
                         filter_dict={'Review Text': lambda x: x is not None},
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations= [nearby_aug_func,str.lower],
                         # add "str.lower" here because nearby_aug might return uppercase character
                         val_ratio=0.2,
                         batch_size=1000,
                         seed=42,
                         num_proc=20,
                         verbose=False
                        )
Define our tokenizer for GPT2
# Load the pretrained GPT2 tokenizer
_tokenizer = AutoTokenizer.from_pretrained('gpt2')
/home/quan/anaconda3/envs/nlp_dev/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
warnings.warn(
# Reuse the EOS token as the padding token, and pad on the left
_tokenizer.pad_token = _tokenizer.eos_token
_tokenizer.padding_side = 'left'
print(_tokenizer)
print(len(_tokenizer))
GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True), added_tokens_decoder={
50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
50257
Process and tokenize our dataset
# Run the controller's processing and tokenize with the GPT2 tokenizer
tdc.process_and_tokenize(_tokenizer,max_length=100,shuffle_trn=True)
# Display the resulting train/validation DatasetDict
tdc.main_ddict
DatasetDict({
train: Dataset({
features: ['Title', 'Review Text', 'Rating', 'Division Name', 'label', 'input_ids', 'attention_mask'],
num_rows: 18112
})
validation: Dataset({
features: ['Title', 'Review Text', 'Rating', 'Division Name', 'label', 'input_ids', 'attention_mask'],
num_rows: 4529
})
})
Model Experiment: GPT2 Single-Head Regression
from that_nlp_library.models.roberta.classifiers import ConcatHeadSimple
from that_nlp_library.models.gpt2.classifiers import *
from that_nlp_library.model_main import *
from sklearn.metrics import mean_absolute_error,mean_squared_log_error,f1_score, accuracy_score
Using HuggingFace model initialization
from transformers.models.gpt2.modeling_gpt2 import GPT2ForSequenceClassification
# A single output value: one regression head
num_classes=1
seed_everything(42)
model = GPT2ForSequenceClassification.from_pretrained('gpt2',num_labels=num_classes)
model = model.to('cuda:0')
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
# Use the EOS token id as the model's padding id, matching the tokenizer setup
model.config.pad_token_id = model.config.eos_token_id
model.resize_token_embeddings(len(_tokenizer))
Embedding(50257, 768)
# Metrics reported at each evaluation
metric_funcs = [mean_absolute_error,mean_squared_log_error]
controller = ModelController(model,tdc,seed=42)

# Training hyperparameters
lr = 1e-4
bs = 32
wd = 0.01
epochs = 3

controller.fit(epochs,lr,
               metric_funcs=metric_funcs,
               batch_size=bs,
               weight_decay=wd,
               save_checkpoint=False,
               compute_metrics=compute_metrics,
              )
Epoch | Training Loss | Validation Loss | Mean Absolute Error Rating | Mean Squared Log Error Rating |
---|---|---|---|---|
1 | No log | 0.401165 | 0.452420 | 0.026643 |
2 | 0.837500 | 0.346818 | 0.407272 | 0.022015 |
3 | 0.837500 | 0.335389 | 0.400436 | 0.021186 |
Using GPT2-based model (no concatenation), but with a custom head to limit the output range
# A single output value: one regression head
num_classes=1
# Pretrained GPT2 body, passed to the custom model as `body_model`
gpt2_body = GPT2Model.from_pretrained('gpt2')
/home/quan/anaconda3/envs/nlp_dev/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
warnings.warn(
class GPT2SigmoidRange(torch.nn.Module):
    """Head that squashes a linear projection into the range between ``low`` and ``high``.

    The input is projected by a bias-free linear layer, passed through a
    sigmoid, and then rescaled as ``sigmoid(logits) * (high - low) + low``.
    """

    def __init__(self,
                 config,
                 high,
                 low,
                 **kwargs
                ):
        """Build the head.

        Args:
            config: Object providing ``n_embd`` (input feature size) and
                ``num_labels`` (output size) for the linear layer.
            high: Upper end of the output range.
            low: Lower end of the output range.
            **kwargs: Accepted and ignored.
        """
        super().__init__()
        self.high=high
        self.low=low
        self.score = torch.nn.Linear(config.n_embd, config.num_labels, bias=False)

    def forward(self, inp, **kwargs):
        """Project ``inp`` and rescale the sigmoid output to [low, high].

        Args:
            inp: Input tensor whose last dimension is ``config.n_embd``.
            **kwargs: Accepted and ignored.

        Returns:
            Tensor of rescaled sigmoid activations.
        """
        logits = self.score(inp)
        return torch.sigmoid(logits)*(self.high-self.low)+self.low
_model_kwargs={
    # overall model hyperparams
    'head_class_sizes':num_classes,
    'head_class': GPT2SigmoidRange,
    # classification head hyperparams
    'high':5, # the maximum rating
    'low': 1, # the minimum rating
}

# Build the custom model: the pretrained GPT2 body plus the head configured above
model = model_init_classification(model_class = GPT2BaseForSequenceClassification,
                                  cpoint_path = 'gpt2',
                                  output_hidden_states=False,
                                  seed=42,
                                  body_model=gpt2_body,
                                  model_kwargs = _model_kwargs)
Loading body weights. This assumes the body is the very first block of your custom architecture
Total parameters: 124440576
Total trainable parameters: 124440576
# Resize the body's token embeddings to the tokenizer's length
model.body_model.resize_token_embeddings(len(_tokenizer))
Embedding(50257, 768)
# Metrics reported at each evaluation
metric_funcs = [mean_absolute_error,mean_squared_log_error]
controller = ModelController(model,tdc,seed=42)

seed_everything(42)

# Training hyperparameters
lr = 1e-4
bs = 32
wd = 0.01
epochs = 3

controller.fit(epochs,lr,
               metric_funcs=metric_funcs,
               batch_size=bs,
               weight_decay=wd,
               save_checkpoint=False,
               compute_metrics=compute_metrics,
              )
Epoch | Training Loss | Validation Loss | Mean Absolute Error Rating | Mean Squared Log Error Rating |
---|---|---|---|---|
1 | No log | 0.392251 | 0.473784 | 0.024639 |
2 | 0.591400 | 0.348463 | 0.412898 | 0.021849 |
3 | 0.591400 | 0.326785 | 0.391602 | 0.020824 |
Using the GPT2 custom model (concatenation)
# A single output value: one regression head
num_classes=1
# Pretrained GPT2 body, passed to the custom model as `body_model`
gpt2_body = GPT2Model.from_pretrained('gpt2')
/home/quan/anaconda3/envs/nlp_dev/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
warnings.warn(
_model_kwargs={
    # overall model hyperparams
    'head_class_sizes':num_classes,
    'head_class': ConcatHeadSimple,
    # classification head hyperparams
    'layer2concat':4,
    'classifier_dropout':0.1
}

model = model_init_classification(model_class = GPT2HiddenStateConcatForSequenceClassification,
                                  cpoint_path = 'gpt2',
                                  output_hidden_states=True, # since we are using 'hidden layer concatenation' technique
                                  seed=42,
                                  body_model=gpt2_body,
                                  model_kwargs = _model_kwargs)
# Resize the body's token embeddings to the tokenizer's length
model.body_model.resize_token_embeddings(len(_tokenizer))
Loading body weights. This assumes the body is the very first block of your custom architecture
Total parameters: 124442881
Total trainable parameters: 124442881
Embedding(50257, 768)
# Metrics reported at each evaluation
metric_funcs = [mean_absolute_error,mean_squared_log_error]
controller = ModelController(model,tdc,seed=42)

seed_everything(42)

# Training hyperparameters
lr = 1e-4
bs = 32
wd = 0.01
epochs = 3

controller.fit(epochs,lr,
               metric_funcs=metric_funcs,
               batch_size=bs,
               weight_decay=wd,
               save_checkpoint=False,
               compute_metrics=compute_metrics,
              )
Epoch | Training Loss | Validation Loss | Mean Absolute Error Rating | Mean Squared Log Error Rating |
---|---|---|---|---|
1 | No log | 0.590046 | 0.550116 | 0.037543 |
2 | 2.443300 | 0.409290 | 0.443936 | 0.025740 |
3 | 2.443300 | 0.379117 | 0.441241 | 0.023792 |
# Save the trained model's weights so they can be reloaded for prediction
controller.trainer.model.save_pretrained('./sample_weights/my_model')
Predict Validation
# Display the current model keyword arguments
_model_kwargs
{'head_class_sizes': 1,
'head_class': that_nlp_library.models.roberta.classifiers.ConcatHeadSimple,
'layer2concat': 4,
'classifier_dropout': 0.1}
# Rebuild the model from the saved checkpoint directory, using the same head configuration
trained_model = model_init_classification(model_class = GPT2HiddenStateConcatForSequenceClassification,
                                          cpoint_path = Path('./sample_weights/my_model'),
                                          output_hidden_states=True,
                                          seed=42,
                                          model_kwargs = _model_kwargs)

controller = ModelController(trained_model,tdc,seed=42)
Total parameters: 124442881
Total trainable parameters: 124442881
# Predict on the validation split held by the data controller
df_val = controller.predict_ddict(ds_type='validation')
-------------------- Start making predictions --------------------
# Convert the predictions to a pandas DataFrame and preview the first rows
df_val = df_val.to_pandas()
df_val.head()
Title | Review Text | Rating | Division Name | label | input_ids | attention_mask | pred_Rating | |
---|---|---|---|---|---|---|---|---|
0 | general . . this picture doesn't do the skirt ... | 5.0 | general | 5.0 | [50256, 50256, 50256, 50256, 50256, 50256, 502... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | 4.741378 | |
1 | general . . easy to wear ! cute , comfy ... wi... | 4.0 | general | 4.0 | [50256, 50256, 50256, 50256, 50256, 50256, 502... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | 4.796369 | |
2 | general . . nice sweater , just did not look g... | 3.0 | general | 3.0 | [50256, 50256, 50256, 50256, 50256, 50256, 502... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | 2.616789 | |
3 | nice cropped jacket | general . nice cropped jacket . this jacket wa... | 5.0 | general | 5.0 | [50256, 50256, 50256, 50256, 50256, 50256, 502... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | 4.419146 |
4 | great dress! | general petite . great dress ! . i wasn't plan... | 5.0 | general petite | 5.0 | [24622, 4273, 578, 764, 1049, 6576, 5145, 764,... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | 5.079652 |
You can compute the metrics yourself to check that they match the results of the last training epoch above.
# Mean absolute error between the true labels and the predicted ratings
mean_absolute_error(df_val['label'],df_val['pred_Rating'])
0.441279638942753
# Mean squared log error between the true labels and the predicted ratings
mean_squared_log_error(df_val['label'],df_val['pred_Rating'])
0.023789442640954332
Predict Test set
We will go through details on how to make a prediction on a completely new and raw dataset using our trained model. For now, let’s reuse the sample csv and pretend it’s our test set
# Sample 20% of the CSV to act as a stand-in test set
df_test = pd.read_csv('sample_data/Womens_Clothing_Reviews.csv',encoding='utf-8-sig').sample(frac=0.2,random_state=1)

# save the label, as we will calculate some metrics later
true_labels = df_test[~df_test['Review Text'].isna()].Rating.values

# drop the label (you don't need to, but this is necessary to simulate an actual test set)
df_test.drop('Rating',axis=1,inplace=True)
_test_dset = Dataset.from_pandas(df_test)
_test_dset_predicted = controller.predict_raw_dset(_test_dset,
                                                   do_filtering=True, # since we have some text filtering in the processing
                                                  )
-------------------- Start making predictions --------------------
# Convert the test predictions to a pandas DataFrame and preview the first rows
df_test_predicted = _test_dset_predicted.to_pandas()
df_test_predicted.head()
Title | Review Text | Division Name | input_ids | attention_mask | pred_Rating | |
---|---|---|---|---|---|---|
0 | perfect for work and play | general . perfect for work and play . this shi... | general | [50256, 50256, 50256, 50256, 50256, 50256, 502... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | 4.962859 |
1 | general petite . . i don't know why i had the ... | general petite | [24622, 4273, 578, 764, 764, 1312, 836, 470, 7... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | 4.025272 | |
2 | great pants | general petite . great pants . thes e cords ar... | general petite | [50256, 50256, 50256, 50256, 50256, 50256, 502... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | 4.888588 |
3 | surprisingly comfy for a button down | general petite . surprisingly comfy for a butt... | general petite | [24622, 4273, 578, 764, 12362, 401, 24928, 329... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | 4.510598 |
4 | short and small | general petite . short and small . the shirt i... | general petite | [50256, 50256, 50256, 50256, 50256, 50256, 502... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | 2.427316 |
Let’s quickly check the score to make sure everything works correctly
# Mean absolute error between the saved labels and the predicted ratings
mean_absolute_error(true_labels,df_test_predicted['pred_Rating'])
0.41902880738461595
# Mean squared log error between the saved labels and the predicted ratings
mean_squared_log_error(true_labels,df_test_predicted['pred_Rating'])
0.022995927657672072
# A single raw example: the main text plus the two metadata fields
raw_content={'Review Text': 'This shirt is so comfortable I love it!',
             'Title': 'Great shirt',
             'Division Name': 'general'}
raw_content
{'Review Text': 'This shirt is so comfortable I love it!',
'Title': 'Great shirt',
'Division Name': 'general'}
# Predict directly from the raw dictionary
df_result = controller.predict_raw_text(raw_content)
-------------------- Start making predictions --------------------
# Display the prediction result
df_result
{'Review Text': ['general . great shirt . this shirt is so comfortable i love it !'],
'Title': ['great shirt'],
'Division Name': ['general'],
'input_ids': [[24622,
764,
1049,
10147,
764,
428,
10147,
318,
523,
6792,
1312,
1842,
340,
5145]],
'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
'pred_Rating': [4.925951957702637]}
Model Experiment: GPT2 Multi-Head Regression
from that_nlp_library.models.roberta.classifiers import ConcatHeadSimple
from that_nlp_library.models.gpt2.classifiers import *
from that_nlp_library.model_main import *
from sklearn.metrics import mean_absolute_error,mean_squared_log_error,f1_score, accuracy_score
Re-define the TextDataController
# Load the sample reviews CSV as a HuggingFace dataset
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
# Data controller with two labels: 'Rating' (regression) and 'Department Name' (classification)
tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names=['Rating','Department Name'],
                         sup_types=['regression','classification'],
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None},
                         metadatas=['Title'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations= [nearby_aug_func,str.lower],
                         val_ratio=0.2,
                         batch_size=1000,
                         seed=42,
                         num_proc=20,
                         verbose=False
                        )
_tokenizer = AutoTokenizer.from_pretrained('gpt2')
# Reuse the EOS token as the padding token, and pad on the left
_tokenizer.pad_token = _tokenizer.eos_token
_tokenizer.padding_side = 'left'
Process and tokenize our dataset
# Run the controller's processing and tokenize with the GPT2 tokenizer
tdc.process_and_tokenize(_tokenizer,max_length=100,shuffle_trn=True)
# Display the resulting train/validation DatasetDict
tdc.main_ddict
DatasetDict({
train: Dataset({
features: ['Title', 'Review Text', 'Rating', 'Department Name', 'label', 'input_ids', 'attention_mask'],
num_rows: 18101
})
validation: Dataset({
features: ['Title', 'Review Text', 'Rating', 'Department Name', 'label', 'input_ids', 'attention_mask'],
num_rows: 4526
})
})
# Display the label lists held by the data controller, one per label column
tdc.label_lists
[[], ['Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend']]
Using the GPT2 custom model (concatenation)
num_classes=[1,len(tdc.label_lists[1])] # 1 head size 1 for regression, 1 head size 6 for classification
num_classes
[1, 6]
# Pretrained GPT2 body, passed to the custom model as `body_model`
gpt2_body = GPT2Model.from_pretrained('gpt2')
/home/quan/anaconda3/envs/nlp_dev/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
warnings.warn(
_model_kwargs={
    # overall model hyperparams
    'head_class_sizes':num_classes,
    'head_class': ConcatHeadSimple,
    # classification head hyperparams
    'layer2concat':3,
    'classifier_dropout':0.1
}

model = model_init_classification(model_class = GPT2HiddenStateConcatForSequenceClassification,
                                  cpoint_path = 'gpt2',
                                  output_hidden_states=True,
                                  seed=42,
                                  body_model=gpt2_body,
                                  model_kwargs = _model_kwargs)
Loading body weights. This assumes the body is the very first block of your custom architecture
Total parameters: 124455943
Total trainable parameters: 124455943
# Resize the body's token embeddings to the tokenizer's length
model.body_model.resize_token_embeddings(len(_tokenizer))
Embedding(50257, 768)
If you use multiple heads and each head handles a different supervised learning type (as in this case, where you have both a classification head and a regression head), and you have separate metrics for each type, you must define `metric_types` to let the controller know which metric functions to apply to each head.
# One metric for the regression head, two for the classification head
metric_funcs = [mean_absolute_error,partial(f1_score,average='macro'),accuracy_score]
# Supervised-learning type of each metric above, in the same order
metric_types = ['regression','classification','classification']
controller = ModelController(model,tdc,seed=42)

seed_everything(42)

# Training hyperparameters
lr = 1e-4
bs = 32
wd = 0.01
epochs = 3

controller.fit(epochs,lr,
               metric_funcs=metric_funcs,
               metric_types=metric_types,
               batch_size=bs,
               weight_decay=wd,
               save_checkpoint=False,
               compute_metrics=compute_metrics,
              )
Epoch | Training Loss | Validation Loss | Mean Absolute Error Rating | F1 Score Department name | Accuracy Score Department name |
---|---|---|---|---|---|
1 | No log | 1.206080 | 0.571639 | 0.544659 | 0.821034 |
2 | 3.874900 | 1.046785 | 0.537101 | 0.618333 | 0.849757 |
3 | 3.874900 | 0.865819 | 0.460715 | 0.639875 | 0.865665 |
# Save the trained model's weights so they can be reloaded for prediction
controller.trainer.model.save_pretrained('./sample_weights/my_model')
Predict Validation
# Display the current model keyword arguments
_model_kwargs
{'head_class_sizes': [1, 6],
'head_class': that_nlp_library.models.roberta.classifiers.ConcatHeadSimple,
'layer2concat': 3,
'classifier_dropout': 0.1}
# Rebuild the model from the saved checkpoint directory, using the same head configuration
trained_model = model_init_classification(model_class = GPT2HiddenStateConcatForSequenceClassification,
                                          cpoint_path = Path('./sample_weights/my_model'),
                                          output_hidden_states=True,
                                          seed=42,
                                          model_kwargs = _model_kwargs)

controller = ModelController(trained_model,tdc,seed=42)
Total parameters: 124455943
Total trainable parameters: 124455943
# Predict on the validation split held by the data controller
df_val = controller.predict_ddict(ds_type='validation')
-------------------- Start making predictions --------------------
# Convert the predictions to a pandas DataFrame and preview the first rows
df_val = df_val.to_pandas()
df_val.head()
Title | Review Text | Rating | Department Name | label | input_ids | attention_mask | pred_Rating | pred_Department Name | pred_prob_Department Name | |
---|---|---|---|---|---|---|---|---|---|---|
0 | . such a fun jacket ! great to wear in the spr... | 5.0 | Intimate | [5.0, 2.0] | [50256, 50256, 50256, 50256, 50256, 50256, 502... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | 5.052881 | Jackets | 0.806607 | |
1 | simple and elegant | simple and elegant . i thought this shirt was ... | 5.0 | Tops | [5.0, 4.0] | [36439, 290, 19992, 764, 1312, 1807, 428, 1014... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | 3.821376 | Tops | 0.996092 |
2 | retro and pretty | retro and pretty . this top has a bit of a ret... | 5.0 | Tops | [5.0, 4.0] | [50256, 50256, 50256, 50256, 50256, 50256, 502... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | 4.635709 | Tops | 0.988367 |
3 | summer/fall wear | summer / fall wear . i first spotted this on a... | 5.0 | Dresses | [5.0, 1.0] | [50256, 50256, 50256, 50256, 50256, 50256, 502... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, ... | 4.859474 | Dresses | 0.931358 |
4 | perfect except slip | perfect except slip . this is my new favorite ... | 4.0 | Dresses | [4.0, 1.0] | [50256, 50256, 50256, 50256, 50256, 50256, 502... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | 4.317022 | Dresses | 0.987220 |
You can compute the metrics yourself to check that they match the results of the last training epoch above.
# Mean absolute error between the true and predicted ratings
mean_absolute_error(df_val['Rating'],df_val['pred_Rating'])
0.46070288352823024
# Macro-averaged F1 between the true and predicted department names
f1_score(df_val['Department Name'],df_val['pred_Department Name'],average='macro')
0.6396845952183047
Predict Test set
We will go through details on how to make a prediction on a completely new and raw dataset using our trained model. For now, let’s reuse the sample csv and pretend it’s our test set
# Sample 20% of the CSV to act as a stand-in test set
df_test = pd.read_csv('sample_data/Womens_Clothing_Reviews.csv',encoding='utf-8-sig').sample(frac=0.2,random_state=1)

# drop the label (you don't need to, but this is necessary to simulate an actual test set)
df_test.drop(['Rating','Department Name'],axis=1,inplace=True)
_test_dset = Dataset.from_pandas(df_test)
_test_dset_predicted = controller.predict_raw_dset(_test_dset,
                                                   do_filtering=True, # since we have some text filtering in the processing
                                                   topk=3
                                                  )
-------------------- Start making predictions --------------------
# Convert the test predictions to a pandas DataFrame and preview the first rows
df_test_predicted = _test_dset_predicted.to_pandas()
df_test_predicted.head()
Title | Review Text | input_ids | attention_mask | pred_Rating | pred_Department Name | pred_prob_Department Name | |
---|---|---|---|---|---|---|---|
0 | perfect for work and play | perfect for work and play . this shirt works f... | [50256, 50256, 50256, 50256, 50256, 50256, 502... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | 5.155851 | [Tops, Intimate, Jackets] | [0.9964365, 0.0024720305, 0.0007216654] |
1 | . i don't know why i had the opposite problem ... | [13, 1312, 836, 470, 760, 1521, 1312, 550, 262... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | 4.188741 | [Bottoms, Intimate, Dresses] | [0.97982305, 0.01562881, 0.001401943] | |
2 | great pants | great pants . thes e cords are great--lightwei... | [50256, 50256, 50256, 50256, 50256, 50256, 502... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | 4.878179 | [Bottoms, Intimate, Jackets] | [0.97345406, 0.022959404, 0.0018552544] |
3 | surprisingly comfy for a button down | surprisingly comfy for a button down . i am a ... | [41199, 401, 24928, 329, 257, 4936, 866, 764, ... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | 4.531747 | [Tops, Intimate, Jackets] | [0.952523, 0.022002747, 0.015237918] |
4 | short and small | short and small . the shirt is mostly a thick ... | [50256, 50256, 50256, 50256, 50256, 50256, 502... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | 2.355370 | [Tops, Intimate, Jackets] | [0.98523074, 0.010804329, 0.0018044778] |
# A single raw example: the main text plus the 'Title' metadata field
raw_content={'Review Text': 'This shirt is so comfortable I love it!',
             'Title': 'Great shirt'}
raw_content
{'Review Text': 'This shirt is so comfortable I love it!',
'Title': 'Great shirt'}
# Predict directly from the raw dictionary
df_result = controller.predict_raw_text(raw_content)
-------------------- Start making predictions --------------------
# Display the prediction result
df_result
{'Review Text': ['great shirt . this shirt is so comfortable i love it !'],
'Title': ['great shirt'],
'input_ids': [[18223,
10147,
764,
428,
10147,
318,
523,
6792,
1312,
1842,
340,
5145]],
'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
'pred_Rating': [5.187490940093994],
'pred_Department Name': ['Tops'],
'pred_prob_Department Name': [0.9844864010810852]}