# !conda list | grep 'datasets\|transformers\|torch\|accelerate'
# accelerate 0.29.3 pypi_0 pypi
# datasets 2.19.0 pypi_0 pypi
# torch 2.3.0 pypi_0 pypi
# transformers 4.40.1 pypi_0 pypi
Text Processing Benchmark
TextDataController
from that_nlp_library.text_transformation import *
from that_nlp_library.text_augmentation import *
from that_nlp_library.text_main import *
from that_nlp_library.text_main_streaming import *
from datasets import load_dataset,enable_caching,disable_caching
from transformers import RobertaTokenizer
import os
import time
from underthesea import text_normalize
import nlpaug.augmenter.char as nac
from functools import partial
import random
from memory_profiler import memory_usage
# disable huggingface caching to get a fair benchmark
disable_caching()
1. Benchmark on medium-size dataset (~117k rows)
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False)
len(dset)
117430
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
/home/quan/anaconda3/envs/nlp_dev/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
warnings.warn(
bs = len(dset)//100
bs
1174
a) Non-streaming dataset
def benchmarking(tdc,tokenizer,n=10,shuffle_trn=True,time_list=[]):
    time1 = time.time()
    tdc.process_and_tokenize(tokenizer,max_length=512,shuffle_trn=shuffle_trn)
    time2 = time.time()
    process_time = round(time2-time1,2)
    print(f'Time it takes to process + tokenize training texts: {process_time} s')
    for i,v in enumerate(tdc.main_ddict['train']):
        if n is not None and i==tdc.batch_size*n: break
    time3 = time.time()
    iteration_time = round(time3-time2,2)
    if n is not None:
        print(f'Time it takes to go through {n*tdc.batch_size} items: {iteration_time} s')
    else:
        print(f'Time it takes to go through all items: {iteration_time} s')
    total_time = round(time3-time1,2)
    print(f'Total time: {total_time} s')
    time_list += process_time,iteration_time,total_time
def benchmarking_and_memory_usage(tdc,tokenizer,n=10,shuffle_trn=True,time_list=[]):
    mem_usage = memory_usage((benchmarking,[tdc,tokenizer,n,shuffle_trn,time_list]))
    total_usage = round(max(mem_usage),1)
    print(f'Maximum memory usage: {total_usage} MiB')
    time_list.append(total_usage)
def nlp_aug_stochastic(x,aug=None,p=0.5):
    results = aug.augment(x)
    if not isinstance(x,list): return results[0] if random.random()<p else x
    return [a if random.random()<p else b for a,b in zip(results,x)]

aug = nac.KeyboardAug(aug_char_max=3,aug_char_p=0.1,aug_word_p=0.07)
nearby_aug_func = partial(nlp_aug_stochastic,aug=aug,p=0.5)
With filter
timelist1=[]
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False)

tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         sup_types='classification',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         val_ratio=None,
                         batch_size=bs,
                         seed=42,
                         verbose=False
                        )
benchmarking_and_memory_usage(tdc,tokenizer,time_list=timelist1)
Time it takes to process + tokenize training texts: 14.37 s
Time it takes to go through 11740 items: 1.27 s
Total time: 15.64 s
Maximum memory usage: 734.9 MiB
With filter + metadatas concatenation
timelist2=[]
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False)

tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         sup_types='classification',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title','Division Name'],
                         val_ratio=None,
                         batch_size=bs,
                         seed=42,
                         verbose=False
                        )
benchmarking_and_memory_usage(tdc,tokenizer,time_list=timelist2)
Time it takes to process + tokenize training texts: 15.26 s
Time it takes to go through 11740 items: 1.46 s
Total time: 16.72 s
Maximum memory usage: 748.6 MiB
With filter + metadatas concatenation + content transformation + content augmentation
timelist3=[]
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False)

tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         sup_types='classification',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations=[nearby_aug_func,str.lower],
                         val_ratio=None,
                         batch_size=bs,
                         seed=42,
                         verbose=False
                        )
benchmarking_and_memory_usage(tdc,tokenizer,time_list=timelist3)
Time it takes to process + tokenize training texts: 35.09 s
Time it takes to go through 11740 items: 1.52 s
Total time: 36.61 s
Maximum memory usage: 754.7 MiB
With filter + metadatas concatenation + content transformation + content augmentation + no shuffling
timelist4=[]
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False)

tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         sup_types='classification',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations=[nearby_aug_func,str.lower],
                         val_ratio=None,
                         batch_size=bs,
                         seed=42,
                         verbose=False
                        )
benchmarking_and_memory_usage(tdc,tokenizer,shuffle_trn=False,time_list=timelist4)
Time it takes to process + tokenize training texts: 34.36 s
Time it takes to go through 11740 items: 1.47 s
Total time: 35.83 s
Maximum memory usage: 777.3 MiB
With filter + metadatas concatenation + content transformation + content augmentation + higher batch size
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False)

tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         sup_types='classification',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations=[nearby_aug_func,str.lower],
                         val_ratio=None,
                         batch_size=bs*3,
                         seed=42,
                         verbose=False
                        )
benchmarking_and_memory_usage(tdc,tokenizer)
Time it takes to process + tokenize training texts: 35.7 s
Time it takes to go through 35220 items: 4.47 s
Total time: 40.17 s
Maximum memory usage: 761.9 MiB
With filter + metadatas concatenation + content transformation + content augmentation + higher num proc
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False)

tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         sup_types='classification',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations=[nearby_aug_func,str.lower],
                         val_ratio=None,
                         batch_size=bs,
                         seed=42,
                         num_proc=8,
                         verbose=False
                        )
benchmarking_and_memory_usage(tdc,tokenizer)
Time it takes to process + tokenize training texts: 24.7 s
Time it takes to go through 11740 items: 1.46 s
Total time: 26.16 s
Maximum memory usage: 754.2 MiB
With filter + metadatas concatenation + content transformation + content augmentation + iterate the whole dataset (1 epoch)
timelist5=[]
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False)

tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         sup_types='classification',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations=[nearby_aug_func,str.lower],
                         val_ratio=None,
                         batch_size=bs,
                         seed=42,
                         verbose=False
                        )
benchmarking_and_memory_usage(tdc,tokenizer,n=None,time_list=timelist5)
Time it takes to process + tokenize training texts: 35.34 s
Time it takes to go through all items: 14.32 s
Total time: 49.66 s
Maximum memory usage: 869.6 MiB
b) With streaming
def benchmarking_streaming(tdc,tokenizer,n=10,time_list=[]):
    time1 = time.time()
    tdc.process_and_tokenize(tokenizer,max_length=512,line_by_line=True)
    time2 = time.time()
    process_time = round(time2-time1,2)
    print(f'Time it takes to process + tokenize training texts: {process_time} s')
    for i,v in enumerate(tdc.main_ddict['train']):
        if n is not None and i==tdc.batch_size*n: break
    time3 = time.time()
    iteration_time = round(time3-time2,2)
    if n is not None:
        print(f'Time it takes to go through {n*tdc.batch_size} items: {iteration_time} s')
    else:
        print(f'Time it takes to go through all items: {iteration_time} s')
    total_time = round(time3-time1,2)
    print(f'Total time: {total_time} s')
    time_list += process_time,iteration_time,total_time

def benchmarking_and_memory_usage_streaming(tdc,tokenizer,n=10,time_list=[]):
    mem_usage = memory_usage((benchmarking_streaming,[tdc,tokenizer,n,time_list]))
    total_usage = round(max(mem_usage),1)
    print(f'Maximum memory usage: {total_usage} MiB')
    time_list.append(total_usage)
def nlp_aug_stochastic(x,aug=None,p=0.5):
    results = aug.augment(x)
    if not isinstance(x,list): return results[0] if random.random()<p else x
    return [a if random.random()<p else b for a,b in zip(results,x)]

aug = nac.KeyboardAug(aug_char_max=3,aug_char_p=0.1,aug_word_p=0.07)
nearby_aug_func = partial(nlp_aug_stochastic,aug=aug,p=0.5)
With filter
ns_timelist1=[]
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=True)

tdc = TextDataControllerStreaming(dset,
                                  main_text='Review Text',
                                  label_names='Department Name',
                                  sup_types='classification',
                                  filter_dict={'Review Text': lambda x: x is not None,
                                               'Department Name': lambda x: x is not None,
                                              },
                                  class_names_predefined=['Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend'],
                                  batch_size=bs,
                                  num_proc=4,
                                  seed=42,
                                  verbose=False
                                 )
benchmarking_and_memory_usage_streaming(tdc,tokenizer,time_list=ns_timelist1)
Time it takes to process + tokenize training texts: 0.8 s
Time it takes to go through 11740 items: 4.03 s
Total time: 4.82 s
Maximum memory usage: 743.0 MiB
With filter + metadatas concatenation
ns_timelist2=[]
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=True)

tdc = TextDataControllerStreaming(dset,
                                  main_text='Review Text',
                                  label_names='Department Name',
                                  sup_types='classification',
                                  filter_dict={'Review Text': lambda x: x is not None,
                                               'Department Name': lambda x: x is not None,
                                              },
                                  class_names_predefined=['Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend'],
                                  metadatas=['Title','Division Name'],
                                  batch_size=bs,
                                  num_proc=4,
                                  seed=42,
                                  verbose=False
                                 )
benchmarking_and_memory_usage_streaming(tdc,tokenizer,time_list=ns_timelist2)
Time it takes to process + tokenize training texts: 0.79 s
Time it takes to go through 11740 items: 4.43 s
Total time: 5.22 s
Maximum memory usage: 745.9 MiB
With filter + metadatas concatenation + content transformation + content augmentation
ns_timelist3=[]
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=True)

tdc = TextDataControllerStreaming(dset,
                                  main_text='Review Text',
                                  label_names='Department Name',
                                  sup_types='classification',
                                  filter_dict={'Review Text': lambda x: x is not None,
                                               'Department Name': lambda x: x is not None,
                                              },
                                  class_names_predefined=['Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend'],
                                  metadatas=['Title','Division Name'],
                                  content_transformations=[text_normalize,str.lower],
                                  content_augmentations=[nearby_aug_func,str.lower],
                                  batch_size=bs,
                                  num_proc=4,
                                  seed=42,
                                  verbose=False
                                 )
benchmarking_and_memory_usage_streaming(tdc,tokenizer,time_list=ns_timelist3)
Time it takes to process + tokenize training texts: 0.78 s
Time it takes to go through 11740 items: 12.23 s
Total time: 13.01 s
Maximum memory usage: 743.0 MiB
With filter + metadatas concatenation + content transformation + content augmentation + higher batch size (not recorded)
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=True)

tdc = TextDataControllerStreaming(dset,
                                  main_text='Review Text',
                                  label_names='Department Name',
                                  sup_types='classification',
                                  filter_dict={'Review Text': lambda x: x is not None,
                                               'Department Name': lambda x: x is not None,
                                              },
                                  class_names_predefined=['Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend'],
                                  metadatas=['Title','Division Name'],
                                  content_transformations=[text_normalize,str.lower],
                                  content_augmentations=[nearby_aug_func,str.lower],
                                  batch_size=bs*3,
                                  num_proc=4,
                                  seed=42,
                                  verbose=False
                                 )
benchmarking_and_memory_usage_streaming(tdc,tokenizer)
Time it takes to process + tokenize training texts: 0.79 s
Time it takes to go through 35220 items: 36.66 s
Total time: 37.45 s
Maximum memory usage: 887.4 MiB
With filter + metadatas concatenation + content transformation + content augmentation + iterate the whole dataset (1 epoch)
ns_timelist4=[]
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=True)

tdc = TextDataControllerStreaming(dset,
                                  main_text='Review Text',
                                  label_names='Department Name',
                                  sup_types='classification',
                                  filter_dict={'Review Text': lambda x: x is not None,
                                               'Department Name': lambda x: x is not None,
                                              },
                                  class_names_predefined=['Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend'],
                                  metadatas=['Title','Division Name'],
                                  content_transformations=[text_normalize,str.lower],
                                  content_augmentations=[nearby_aug_func,str.lower],
                                  batch_size=bs,
                                  num_proc=4,
                                  seed=42,
                                  verbose=False
                                 )
benchmarking_and_memory_usage_streaming(tdc,tokenizer,n=None,time_list=ns_timelist4)
Time it takes to process + tokenize training texts: 0.8 s
Time it takes to go through all items: 111.93 s
Total time: 112.73 s
Maximum memory usage: 762.8 MiB
2. Test the effect of batch size and num_proc (parallel processing) on a non-streaming dataset
def benchmarking(tdc,tokenizer,n=10,shuffle_trn=True):
    time1 = time.time()
    tdc.process_and_tokenize(tokenizer,max_length=512,shuffle_trn=shuffle_trn)
    time2 = time.time()
    print(f'Time it takes to process + tokenize training texts: {(time2-time1):.3f} s')
    for i,v in enumerate(tdc.main_ddict['train']):
        if n is not None and i==tdc.batch_size*n: break
    time3 = time.time()
    if n is not None:
        print(f'Time it takes to go through {n*tdc.batch_size} items: {(time3-time2):.3f} s')
    else:
        print(f'Time it takes to go through all items: {(time3-time2):.3f} s')
    print(f'Total time: {(time3-time1):.3f} s')

def benchmarking_and_memory_usage(tdc,tokenizer,n=10,shuffle_trn=True):
    mem_usage = memory_usage((benchmarking,[tdc,tokenizer,n,shuffle_trn]))
    print(f'Maximum memory usage: {max(mem_usage):.3f} MiB')
def nlp_aug_stochastic(x,aug=None,p=0.5):
    results = aug.augment(x)
    if not isinstance(x,list): return results[0] if random.random()<p else x
    return [a if random.random()<p else b for a,b in zip(results,x)]

aug = nac.KeyboardAug(aug_char_max=3,aug_char_p=0.1,aug_word_p=0.07)
nearby_aug_func = partial(nlp_aug_stochastic,aug=aug,p=0.5)
For a non-streaming dataset, text processing + tokenization are the most time-consuming steps, so we will check how different batch_size and num_proc values affect their running time.
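As a point of reference (this is an assumption about the internals, suggested by the cache logs in section 3, not something this notebook demonstrates directly): batch_size and num_proc appear to be forwarded to Hugging Face datasets' map(), where they control how many rows each call receives and how many worker processes run in parallel. A minimal standalone sketch of those two arguments on a toy dataset:

from datasets import Dataset

toy = Dataset.from_dict({'text': ['An example review']*10_000})   # toy data, not the benchmark set

def lowercase_batch(batch):
    # with batched=True, `batch` is a dict mapping column names to lists of values
    return {'text': [t.lower() for t in batch['text']]}

processed = toy.map(lowercase_batch,
                    batched=True,
                    batch_size=1000,  # rows handed to each call of lowercase_batch
                    num_proc=4)       # worker processes running the map in parallel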
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False)

tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         sup_types='classification',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations=[nearby_aug_func,str.lower],
                         val_ratio=None,
                         batch_size=100,
                         num_proc=2,
                         seed=42,
                         verbose=False
                        )
benchmarking_and_memory_usage(tdc,tokenizer,n=None)
Time it takes to process + tokenize training texts: 64.098 s
Time it takes to go through all items: 13.400 s
Total time: 77.499 s
Maximum memory usage: 925.188 MiB
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False)

tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         sup_types='classification',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations=[nearby_aug_func,str.lower],
                         val_ratio=None,
                         batch_size=1000,
                         num_proc=2,
                         seed=42,
                         verbose=False
                        )
benchmarking_and_memory_usage(tdc,tokenizer,n=None)
Time it takes to process + tokenize training texts: 61.297 s
Time it takes to go through all items: 14.427 s
Total time: 75.724 s
Maximum memory usage: 912.223 MiB
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False)

tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         sup_types='classification',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations=[nearby_aug_func,str.lower],
                         val_ratio=None,
                         batch_size=100,
                         num_proc=8,
                         seed=42,
                         verbose=False
                        )
benchmarking_and_memory_usage(tdc,tokenizer,n=None)
Time it takes to process + tokenize training texts: 25.857 s
Time it takes to go through all items: 13.776 s
Total time: 39.634 s
Maximum memory usage: 928.574 MiB
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False)

tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         sup_types='classification',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations=[nearby_aug_func,str.lower],
                         val_ratio=None,
                         batch_size=1000,
                         num_proc=8,
                         seed=42,
                         verbose=False
                        )
benchmarking_and_memory_usage(tdc,tokenizer,n=None)
Time it takes to process + tokenize training texts: 24.933 s
Time it takes to go through all items: 14.271 s
Total time: 39.204 s
Maximum memory usage: 913.266 MiB
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False)

tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         sup_types='classification',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations=[nearby_aug_func,str.lower],
                         val_ratio=None,
                         batch_size=2000,
                         num_proc=8,
                         seed=42,
                         verbose=False
                        )
benchmarking_and_memory_usage(tdc,tokenizer,n=None)
Time it takes to process + tokenize training texts: 25.600 s
Time it takes to go through all items: 14.465 s
Total time: 40.064 s
Maximum memory usage: 934.883 MiB
Increasing num_proc is more beneficial than increasing the processing batch size.
3. Improving processing time with caching
The worst processing time is recorded with the non-streaming training set using the heaviest preprocessing: 2-column filtering, 2 metadata columns, 2 content transformations, and 2 content augmentations.
With caching, we can significantly reduce this preprocessing time: you only need to run the preprocessing once, and every subsequent call takes advantage of the cached result.
enable_caching()
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False)

tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         sup_types='classification',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations=[nearby_aug_func,str.lower],
                         val_ratio=None,
                         batch_size=1000,
                         num_proc=4,
                         seed=42,
                         verbose=False
                        )
tdc.process_and_tokenize(tokenizer,max_length=512)
Found cached dataset csv (/home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-0aed8574c094e4fd_*_of_00004.arrow
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-b781a4a73d06caf5_*_of_00004.arrow
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-0f85d6db4165d6ef_*_of_00004.arrow
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-420893192d8b876f_*_of_00004.arrow
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-ee3f2ca19acd2369_*_of_00004.arrow
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-27d1b7f9046ec1b4_*_of_00004.arrow
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False)

tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         sup_types='classification',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations=[nearby_aug_func,str.lower],
                         val_ratio=None,
                         batch_size=1000,
                         num_proc=4,
                         seed=42,
                         verbose=False
                        )
benchmarking_and_memory_usage(tdc,tokenizer,n=None)
Found cached dataset csv (/home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-0aed8574c094e4fd_*_of_00004.arrow
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-b781a4a73d06caf5_*_of_00004.arrow
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-0f85d6db4165d6ef_*_of_00004.arrow
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-420893192d8b876f_*_of_00004.arrow
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-ee3f2ca19acd2369_*_of_00004.arrow
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-27d1b7f9046ec1b4_*_of_00004.arrow
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-57d938bbd364f406_*_of_00004.arrow
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-10afb2ec3cb12852_*_of_00004.arrow
Loading cached shuffled indices for dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-5d8840c40fe75896.arrow
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-030424c28049222f_*_of_00004.arrow
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-fe4290971e8d1087_*_of_00004.arrow
Time it takes to process + tokenize training texts: 0.979 s
Time it takes to go through all items: 16.824 s
Maximum memory usage: 823.531 MiB
With caching enabled, loading the processed data back from the cache takes only 0.979 s, instead of waiting ~35 s to redo all the processing.
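If you later want to force a full re-run, the standard datasets API (not specific to that_nlp_library) lets you turn caching back off or remove the cached files; a minimal sketch:

disable_caching()                   # new map()/filter() calls will neither write nor reuse the cache
# dset.cleanup_cache_files()        # optionally delete the cached arrow files associated with `dset`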
4. Time and Space Complexity Comparison (as of 5/3/2024)
import pandas as pd
import numpy as np
exp1 = [timelist1,ns_timelist1]
exp2 = [timelist2,ns_timelist2]
exp3 = [timelist3,ns_timelist3]
exp4 = [timelist4,[None,None,None,None]] # no shuffling when streaming
exp5 = [timelist5,ns_timelist4]

col_names=['Filter + Shuffling Train','And 2 metadatas',
           'And 2 tfms + 2 augs','Same, but without train shuffling',
           'Time to process 1 epoch']
idxs=['Non-Streaming','Streaming']

_tmp=[]
for i in range(2):
    _tmp.append([l[i][0] for l in [exp1,exp2,exp3,exp4,exp5]])
df = pd.DataFrame(np.array(_tmp),columns=col_names)
df.index = idxs
df.index.name = 'Time (s) to process and tokenize 117k records with batch size 1174'

_tmp=[]
for i in range(2):
    _tmp.append([l[i][1] for l in [exp1,exp2,exp3,exp4,exp5]])
df2 = pd.DataFrame(np.array(_tmp),columns=col_names[:-1]+['Time to iterate 1 epoch'])
df2.index = idxs
df2.index.name = 'Time (s) to iterate 10 batches (11740 items)'

_tmp=[]
for i in range(2):
    _tmp.append([l[i][2] for l in [exp1,exp2,exp3,exp4,exp5]])
df3 = pd.DataFrame(np.array(_tmp),columns=col_names[:-1]+['Total time to process + tokenize + iterate 1 epoch'])
df3.index = idxs
df3.index.name = 'Total time (s) to process + tokenize + iterate 10 batches'

_tmp=[]
for i in range(2):
    _tmp.append([l[i][3] for l in [exp1,exp2,exp3,exp4,exp5]])
df4 = pd.DataFrame(np.array(_tmp),columns=col_names[:-1]+['Total memory to process + tokenize + iterate 1 epoch'])
df4.index = idxs
df4.index.name = 'Total memory (MiB) to process + tokenize + iterate 10 batches'
df
| Time (s) to process and tokenize 117k records with batch size 1174 | Filter + Shuffling Train | And 2 metadatas | And 2 tfms + 2 augs | Same, but without train shuffling | Time to process 1 epoch |
|---|---|---|---|---|---|
| Non-Streaming | 14.37 | 15.26 | 35.09 | 34.36 | 35.34 |
| Streaming | 0.8 | 0.79 | 0.78 | None | 0.8 |
df2
| Time (s) to iterate 10 batches (11740 items) | Filter + Shuffling Train | And 2 metadatas | And 2 tfms + 2 augs | Same, but without train shuffling | Time to iterate 1 epoch |
|---|---|---|---|---|---|
| Non-Streaming | 1.27 | 1.46 | 1.52 | 1.47 | 14.32 |
| Streaming | 4.03 | 4.43 | 12.23 | None | 111.93 |
df3
| Total time (s) to process + tokenize + iterate 10 batches | Filter + Shuffling Train | And 2 metadatas | And 2 tfms + 2 augs | Same, but without train shuffling | Total time to process + tokenize + iterate 1 epoch |
|---|---|---|---|---|---|
| Non-Streaming | 15.64 | 16.72 | 36.61 | 35.83 | 49.66 |
| Streaming | 4.82 | 5.22 | 13.01 | None | 112.73 |
df4
| Total memory (MiB) to process + tokenize + iterate 10 batches | Filter + Shuffling Train | And 2 metadatas | And 2 tfms + 2 augs | Same, but without train shuffling | Total memory to process + tokenize + iterate 1 epoch |
|---|---|---|---|---|---|
| Non-Streaming | 734.9 | 748.6 | 754.7 | 777.3 | 869.6 |
| Streaming | 743.0 | 745.9 | 743.0 | None | 762.8 |
5. Tips and tricks
- For non-streaming data, the best way to minimize processing and iteration time is:
    - Turn on dataset caching and run the processing step once so the result is cached (see the sketch below)
- The more content transformations and augmentations you add, the slower the processing and iteration. This is especially true for streaming data
- For streaming, be aware of the pros and cons of batch processing versus line-by-line processing (read more here)
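A minimal sketch of the caching tip above, reusing the tdc and tokenizer objects defined earlier (treat it as a template under those assumptions, not a prescribed recipe):

enable_caching()                                      # dataset caching is on by default; re-enable it if it was turned off
tdc.process_and_tokenize(tokenizer,max_length=512)    # the first run pays the full preprocessing cost
# any later run with the same raw data and preprocessing settings reloads from the cached arrow files,
# as in the ~1 s reload shown in section 3
# for streaming, process_and_tokenize also accepts line_by_line (used in the streaming benchmark above),
# which is the batch-process vs. line-by-line trade-off the last tip refers to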