import os
Roberta model (Multi-Label)
This notebook contains some examples of how to use the Roberta-based models in this NLP library.
In this series, we walk through some of the capabilities of this library: single-head classification, multi-head classification, multi-label classification, and regression. If you want a more detailed tutorial, check this out
# Select the GPU (or a comma-separated list of GPUs, e.g. "0,1") used for training.
# Set here, ahead of the library imports below, so it is in place before CUDA is initialized.
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
from that_nlp_library.text_transformation import *
from that_nlp_library.text_augmentation import *
from that_nlp_library.text_main import *
from that_nlp_library.utils import seed_everything
from underthesea import text_normalize
from functools import partial
from pathlib import Path
import pandas as pd
import numpy as np
import nlpaug.augmenter.char as nac
from datasets import load_dataset
import random
from transformers import RobertaTokenizer
from datasets import Dataset
Define the custom augmentation function
def nlp_aug_stochastic(x,aug=None,p=0.5):
    """Augment text(s) with probability ``p`` per item.

    Args:
        x: Either a single text or a list of texts (a batch).
        aug: Augmenter object exposing ``augment``. ``augment`` is called with
            a single text (its first result is used) or with a list of texts.
            It must be provided; the ``None`` default only exists so the
            function can be configured through ``functools.partial``.
        p: Probability that any individual text is augmented.

    Returns:
        For a single text: the augmented text, or ``x`` unchanged.
        For a list: a new list of the same length in which the selected
        positions hold augmented texts and all other positions hold the
        original texts. Input order is preserved so that each text stays
        aligned with the rest of its row (labels, metadata) when this
        function is applied to a batch.
    """
    if not isinstance(x, list):
        if random.random() < p:
            return aug.augment(x)[0]
        return x

    # One random draw per item, in input order.
    selected = [idx for idx, _ in enumerate(x) if random.random() < p]
    result = list(x)

    # Only call the augmenter when at least one item was selected.
    if selected:
        # Assumes ``augment`` returns one output per input text.
        augmented = aug.augment([x[idx] for idx in selected])
        for idx, new_text in zip(selected, augmented):
            result[idx] = new_text
    return result
# Character-level keyboard augmenter from nlpaug (introduces keyboard-style typos).
aug = nac.KeyboardAug(aug_char_max=3,aug_char_p=0.1,aug_word_p=0.07)
# Wrap it so that each text is augmented with probability 0.3.
nearby_aug_func = partial(nlp_aug_stochastic,aug=aug,p=0.3)
Create a TextDataController object
We will reuse the data and the preprocessings in this tutorial
Construct a ‘multi-label’ column by using both Department Name
and Division Name
# Load the reviews and drop rows that have no department.
df = pd.read_csv('sample_data/Womens_Clothing_Reviews.csv',encoding='utf-8-sig')
df = df[~df['Department Name'].isna()].reset_index(drop=True)
# Each row's multi-label target is the pair [Department Name, Division Name].
df['Multi_Label'] = df[['Department Name','Division Name']].values.tolist()
# df['Fake Label'] = [np.random.choice(df['Department Name'].unique()[:-1],size=np.random.randint(2,6),replace=False) for _ in range(len(df))]
df.head()
Clothing ID | Age | Title | Review Text | Rating | Recommended IND | Positive Feedback Count | Division Name | Department Name | Class Name | Multi_Label | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 767 | 33 | NaN | Absolutely wonderful - silky and sexy and comf... | 4 | 1 | 0 | Initmates | Intimate | Intimates | [Intimate, Initmates] |
1 | 1080 | 34 | NaN | Love this dress! it's sooo pretty. i happene... | 5 | 1 | 4 | General | Dresses | Dresses | [Dresses, General] |
2 | 1077 | 60 | Some major design flaws | I had such high hopes for this dress and reall... | 3 | 0 | 0 | General | Dresses | Dresses | [Dresses, General] |
3 | 1049 | 50 | My favorite buy! | I love, love, love this jumpsuit. it's fun, fl... | 5 | 1 | 0 | General Petite | Bottoms | Pants | [Bottoms, General Petite] |
4 | 847 | 47 | Flattering shirt | This shirt is very flattering to all due to th... | 5 | 1 | 6 | General | Tops | Blouses | [Tops, General] |
# Build the data controller from the dataframe.
# 'Review Text' is the model input and 'Multi_Label' the target; rows whose
# review text is missing are filtered out, and 'Title' is passed as metadata.
# Transformations: text_normalize then lower-casing.
# Augmentations: the stochastic keyboard augmentation then lower-casing.
tdc = TextDataController.from_df(df,
                                 main_text='Review Text',
                                 label_names='Multi_Label',
                                 sup_types='classification',
                                 filter_dict={'Review Text': lambda x: x is not None},
                                 metadatas='Title',
                                 content_transformations=[text_normalize,str.lower],
                                 content_augmentations= [nearby_aug_func,str.lower],
                                 val_ratio=0.2,
                                 batch_size=1000,
                                 seed=42,
                                 num_proc=20,
                                 verbose=False
                                )
- Input Validation Precheck -
Data contains missing values!
-----> List of columns and the number of missing values for each
Title 3809
Review Text 844
dtype: int64
Data contains duplicated values!
-----> Number of duplications: 21 rows
Define our tokenizer for Roberta
# Load the pretrained 'roberta-base' tokenizer (downloaded on first use).
_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
/home/quan/anaconda3/envs/nlp_dev/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
warnings.warn(
Process and tokenize our dataset
# Run the controller's processing pipeline and tokenize with the Roberta tokenizer.
tdc.process_and_tokenize(_tokenizer,max_length=100,shuffle_trn=True)
# Inspect the resulting train/validation DatasetDict.
tdc.main_ddict
DatasetDict({
train: Dataset({
features: ['Title', 'Review Text', 'Multi_Label', 'label', 'input_ids', 'attention_mask'],
num_rows: 18101
})
validation: Dataset({
features: ['Title', 'Review Text', 'Multi_Label', 'label', 'input_ids', 'attention_mask'],
num_rows: 4526
})
})