import os
# Roberta model with Conditional Probability
# This notebook contains some examples of how to use the Roberta-based models in this NLP library.
# In this tutorial, we walk through a special case of classification with multiple heads. This is inspired by this paper: https://arxiv.org/pdf/1911.06475.pdf
# Specify a GPU (or a list of GPUs) to use for training.
# NOTE(review): this is set before the library imports below, presumably so it
# takes effect before any CUDA initialisation -- keep this ordering.
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
from that_nlp_library.text_transformation import *
from that_nlp_library.text_augmentation import *
from that_nlp_library.text_main import *
from that_nlp_library.utils import seed_everything
from underthesea import text_normalize
from functools import partial
from pathlib import Path
import pandas as pd
import numpy as np
import nlpaug.augmenter.char as nac
from datasets import load_dataset
import random
from transformers import RobertaTokenizer
from datasets import Dataset
# Define the custom augmentation function
def nlp_aug_stochastic(x,aug=None,p=0.5):
    """Randomly apply the augmenter `aug` to one item or to a list of items.

    Args:
        x: a single item, or a list of items, to (possibly) augment.
        aug: augmenter object exposing `augment(...)`; its result is indexed
            and iterated here, so it is expected to return a list.
        p: probability, evaluated independently per item, of augmenting it.

    Returns:
        For a non-list `x`: the first element of `aug.augment(x)` when the
        item is selected, otherwise `x` unchanged.
        For a list `x`: a new list in the SAME ORDER as `x`, where each
        selected item is replaced by its augmented version (assuming
        `aug.augment` returns one output per input, in input order).
    """
    if not isinstance(x, list):
        if random.random() < p:
            return aug.augment(x)[0]
        return x

    # One random draw per item, exactly as many draws as there are items.
    selected = [idx for idx in range(len(x)) if random.random() < p]

    result = list(x)
    # Only perform augmentation when needed.
    if selected:
        augmented = aug.augment([x[idx] for idx in selected])
        # Write augmented items back at their original positions so the
        # output stays aligned with the input rows (and hence their labels).
        for idx, new_item in zip(selected, augmented):
            result[idx] = new_item
    return result


# Keyboard-based character augmenter from nlpaug (see the nlpaug docs for the
# exact meaning of aug_char_max / aug_char_p / aug_word_p).
aug = nac.KeyboardAug(aug_char_max=3,aug_char_p=0.1,aug_word_p=0.07)
# Augmentation callable that selects each item for augmentation with probability 0.3.
nearby_aug_func = partial(nlp_aug_stochastic,aug=aug,p=0.3)
# Create a TextDataController object
# We will reuse the data and the preprocessing steps in this tutorial
# Load the 'train' split of the CSV file from the local 'sample_data' directory.
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')

# Build the data controller: 'Review Text' is the main text column and the two
# label columns are both declared as classification targets.
tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names=['Division Name','Department Name'],
                         sup_types=['classification','classification'],
                         # Predicates on 'Review Text' and 'Department Name'
                         # that are true for non-None values.
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title'],
                         content_transformations=[text_normalize,str.lower],
                         # Lower-case again after the keyboard augmentation.
                         content_augmentations= [nearby_aug_func,str.lower],
                         val_ratio=0.2,
                         batch_size=1000,
                         seed=42,
                         num_proc=20,
                         verbose=False
                        )
# Define our tokenizer for Roberta
# Load the pretrained 'roberta-base' tokenizer.
_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# Output (warning captured from the original notebook run):
# /home/quan/anaconda3/envs/nlp_dev/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
#   warnings.warn(
# Process and tokenize our dataset
# Run the controller's processing and tokenization with the Roberta tokenizer.
tdc.process_and_tokenize(_tokenizer,max_length=100,shuffle_trn=True)

# Inspect the resulting dataset dictionary (displayed as cell output in a notebook).
tdc.main_ddict
# Output:
# DatasetDict({
#     train: Dataset({
#         features: ['Title', 'Review Text', 'Division Name', 'Department Name', 'label', 'input_ids', 'attention_mask'],
#         num_rows: 18101
#     })
#     validation: Dataset({
#         features: ['Title', 'Review Text', 'Division Name', 'Department Name', 'label', 'input_ids', 'attention_mask'],
#         num_rows: 4526
#     })
# })
# Peek at the first five validation labels; each entry holds two values,
# matching the two label columns passed as label_names.
tdc.main_ddict['validation']['label'][:5]
# Output:
# [[1, 2], [1, 4], [0, 4], [1, 1], [1, 1]]