import os
Roberta model with a streamed dataset (Custom Single Head)
This notebook contains an example of how to train a Roberta-based model with a streamed dataset.
In this series, we walk through some of the capabilities of this library: single-head classification, multi-head classification, multi-label classification, and regression. If you want a more detailed tutorial, check this out
# Select the GPU (or list of GPUs) used for training.
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
from that_nlp_library.text_transformation import *
from that_nlp_library.text_augmentation import *
from that_nlp_library.text_main_streaming import *
from that_nlp_library.utils import seed_everything
from underthesea import text_normalize
from functools import partial
from pathlib import Path
import pandas as pd
import numpy as np
import nlpaug.augmenter.char as nac
from datasets import load_dataset
import random
from transformers import RobertaTokenizer
from datasets import Dataset
Define the custom augmentation function
def nlp_aug_stochastic(x,aug=None,p=0.5):
    """Randomly augment a text, or a random subset of a list of texts.

    Args:
        x: A single item, or a ``list`` of items, to (possibly) augment.
        aug: Object exposing ``augment(data)``; it is called with a single
            item or a list of items and must return a list of results.
        p: Probability that any given item is augmented.

    Returns:
        For a non-list ``x``: the first augmented result, or ``x`` unchanged.
        For a list ``x``: a new list of the same items in the same order,
        with the randomly selected items replaced by their augmented
        versions.
    """
    if not isinstance(x, list):
        if random.random() < p:
            return aug.augment(x)[0]
        return x

    # One random draw per element, in input order.
    chosen = [idx for idx, _ in enumerate(x) if random.random() < p]

    result = list(x)
    # Only perform augmentation when needed, and do it in one batched call.
    if chosen:
        # Assumes augment() returns one output per input item - TODO confirm.
        augmented = aug.augment([x[idx] for idx in chosen])
        # Write each augmented item back to its original position so the
        # output stays aligned with the input (and with any parallel columns
        # such as labels).
        for idx, new_item in zip(chosen, augmented):
            result[idx] = new_item
    return result
# Character-level keyboard augmenter from nlpaug.
aug = nac.KeyboardAug(aug_char_max=3,aug_char_p=0.1,aug_word_p=0.07)
# Bind the augmenter and a 0.3 per-item probability into a one-argument callable.
nearby_aug_func = partial(nlp_aug_stochastic,aug=aug,p=0.3)
Create a TextDataController object
We will reuse the data and the preprocessings in this tutorial
# Load the reviews CSV as a single 'train' split.
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
# Hold out 10% of the rows (fixed seed) to use as the validation set.
ddict_with_val = dset.train_test_split(test_size=0.1,seed=42)
ddict_with_val['validation'] = ddict_with_val['test']
# Only the training split is converted to an iterable (streamed) dataset.
ddict_with_val['train'] = ddict_with_val['train'].to_iterable_dataset()
# The held-out rows now live under 'validation'; drop the duplicate 'test' key.
del ddict_with_val['test']
ddict_with_val
DatasetDict({
train: IterableDataset({
features: ['Clothing ID', 'Age', 'Title', 'Review Text', 'Rating', 'Recommended IND', 'Positive Feedback Count', 'Division Name', 'Department Name', 'Class Name'],
n_shards: 1
})
validation: Dataset({
features: ['Clothing ID', 'Age', 'Title', 'Review Text', 'Rating', 'Recommended IND', 'Positive Feedback Count', 'Division Name', 'Department Name', 'Class Name'],
num_rows: 2349
})
})
# Build the streaming data controller on top of the train/validation dict.
tdc = TextDataControllerStreaming(ddict_with_val,
                                  main_text='Review Text',
                                  label_names='Department Name',
                                  sup_types='classification',
                                  class_names_predefined=['Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trending'],
                                  # Predicates per column; presumably rows failing them are dropped - verify against the library.
                                  filter_dict={'Review Text': lambda x: x is not None,
                                               'Department Name': lambda x: x is not None,
                                               },
                                  # Rename the 'Trend' label to 'Trending', matching class_names_predefined.
                                  label_tfm_dict={'Department Name': lambda x: x if x!='Trend' else 'Trending'},
                                  metadatas=['Title','Division Name'],
                                  content_transformations=[text_normalize,str.lower],
                                  content_augmentations=[nearby_aug_func,str.lower],
                                  process_metas=True,
                                  batch_size=1000,
                                  num_proc=4,
                                  seed=42,
                                  verbose=False
                                  )
Define our tokenizer for Roberta
# Load the pretrained 'roberta-base' tokenizer.
_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
/home/quan/anaconda3/envs/nlp_dev/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
warnings.warn(
Process and tokenize our dataset
# Run the controller's processing and tokenization with the Roberta tokenizer.
# max_length=150 presumably caps the tokenized sequence length - verify against the library.
tdc.process_and_tokenize(_tokenizer,max_length=150)
tdc.main_ddict
DatasetDict({
train: IterableDataset({
features: Unknown,
n_shards: 4
})
validation: Dataset({
features: ['Title', 'Review Text', 'Division Name', 'Department Name', 'label', 'input_ids', 'attention_mask'],
num_rows: 2253
})
})