Text Processing Benchmark

This module contains benchmarks for TextDataController and its streaming counterpart, TextDataControllerStreaming
# !conda list | grep 'datasets\|transformers\|torch\|accelerate'
# accelerate                0.29.3                   pypi_0    pypi
# datasets                  2.19.0                   pypi_0    pypi
# torch                     2.3.0                    pypi_0    pypi
# transformers              4.40.1                   pypi_0    pypi
from that_nlp_library.text_transformation import *
from that_nlp_library.text_augmentation import *
from that_nlp_library.text_main import *
from that_nlp_library.text_main_streaming import *
from datasets import load_dataset,enable_caching,disable_caching
from transformers import RobertaTokenizer
import os
import time
from underthesea import text_normalize
import nlpaug.augmenter.char as nac
from functools import partial
import random
from memory_profiler import memory_usage
disable_caching() # disable huggingface caching to get a fair benchmark

1. Benchmark on medium-size dataset (~117k rows)

dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False)
len(dset)
117430
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
bs=len(dset)//100
bs
1174

a) Non-streaming dataset

def benchmarking(tdc,tokenizer,n=10,shuffle_trn=True,time_list=None):
    # Records (process_time, iteration_time, total_time) into time_list.
    # Use None instead of a mutable default argument so separate calls never share the same list.
    if time_list is None: time_list=[]
    time1 = time.time()
    tdc.process_and_tokenize(tokenizer,max_length=512,shuffle_trn=shuffle_trn)
    time2 = time.time()
    process_time = round(time2-time1,2)
    print(f'Time it takes to process + tokenize training texts: {process_time} s')
    # iterate through n batches (or the whole training set when n is None)
    for i,v in enumerate(tdc.main_ddict['train']):
        if n is not None and i==tdc.batch_size*n: break
    time3 = time.time()

    iteration_time = round(time3-time2,2)
    if n is not None:
        print(f'Time it takes to go through {n*tdc.batch_size} items: {iteration_time} s')
    else:
        print(f'Time it takes to go through all items: {iteration_time} s')

    total_time = round(time3-time1,2)
    print(f'Total time: {total_time} s')

    time_list+=process_time,iteration_time,total_time

def benchmarking_and_memory_usage(tdc,tokenizer,n=10,shuffle_trn=True,time_list=None):
    if time_list is None: time_list=[]
    # run the benchmark under memory_profiler to capture peak memory usage
    mem_usage = memory_usage((benchmarking,[tdc,tokenizer,n,shuffle_trn,time_list]))
    total_usage = round(max(mem_usage),1)
    print(f'Maximum memory usage: {total_usage} MiB')
    time_list.append(total_usage)
def nlp_aug_stochastic(x,aug=None,p=0.5):
    results = aug.augment(x)
    if not isinstance(x,list): return results[0] if random.random()<p else x
    return [a if random.random()<p else b for a,b in zip(results,x)]

aug = nac.KeyboardAug(aug_char_max=3,aug_char_p=0.1,aug_word_p=0.07)
nearby_aug_func = partial(nlp_aug_stochastic,aug=aug,p=0.5)
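
As a quick, optional sanity check of the stochastic augmenter (the sample sentence below is made up, and the output changes from run to run since the augmentation is applied with probability p=0.5):

sample_review = 'this dress runs small but the fabric is lovely' # hypothetical review text
print(nearby_aug_func(sample_review))       # either the original string or a keyboard-typo version
print(nearby_aug_func([sample_review]*2))   # list input: each element is augmented independently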

With filter

timelist1=[]
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False)

tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         sup_types='classification',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         val_ratio=None,
                         batch_size=bs,
                         seed=42,
                         verbose=False
                        )
benchmarking_and_memory_usage(tdc,tokenizer,time_list=timelist1)
Time it takes to process + tokenize training texts: 14.37 s
Time it takes to go through 11740 items: 1.27 s
Total time: 15.64 s
Maximum memory usage: 734.9 MiB

With filter + metadatas concatenation

timelist2=[]
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False)

tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         sup_types='classification',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title','Division Name'],
                         val_ratio=None,
                         batch_size=bs,
                         seed=42,
                         verbose=False
                        )
benchmarking_and_memory_usage(tdc,tokenizer,time_list=timelist2)
Time it takes to process + tokenize training texts: 15.26 s
Time it takes to go through 11740 items: 1.46 s
Total time: 16.72 s
Maximum memory usage: 748.6 MiB

With filter + metadatas concatenation + content transformation + content augmentation

timelist3=[]
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False)

tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         sup_types='classification',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations= [nearby_aug_func,str.lower], 
                         val_ratio=None,
                         batch_size=bs,
                         seed=42,
                         verbose=False
                        )
benchmarking_and_memory_usage(tdc,tokenizer,time_list=timelist3)
Time it takes to process + tokenize training texts: 35.09 s
Time it takes to go through 11740 items: 1.52 s
Total time: 36.61 s
Maximum memory usage: 754.7 MiB

With filter + metadatas concatenation + content transformation + content augmentation + no shuffling

timelist4=[]
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False)

tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         sup_types='classification',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations= [nearby_aug_func,str.lower], 
                         val_ratio=None,
                         batch_size=bs,
                         seed=42,
                         verbose=False
                        )
benchmarking_and_memory_usage(tdc,tokenizer,shuffle_trn=False,time_list=timelist4)
Time it takes to process + tokenize training texts: 34.36 s
Time it takes to go through 11740 items: 1.47 s
Total time: 35.83 s
Maximum memory usage: 777.3 MiB

With filter + metadatas concatenation + content transformation + content augmentation + higher batch size

dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False)

tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         sup_types='classification',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations= [nearby_aug_func,str.lower], 
                         val_ratio=None,
                         batch_size=bs*3,
                         seed=42,
                         verbose=False
                        )
benchmarking_and_memory_usage(tdc,tokenizer)
Time it takes to process + tokenize training texts: 35.7 s
Time it takes to go through 35220 items: 4.47 s
Total time: 40.17 s
Maximum memory usage: 761.9 MiB

With filter + metadatas concatenation + content transformation + content augmentation + higher num_proc

dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False)

tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         sup_types='classification',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations= [nearby_aug_func,str.lower], 
                         val_ratio=None,
                         batch_size=bs,
                         seed=42,
                         num_proc=8,
                         verbose=False
                        )
benchmarking_and_memory_usage(tdc,tokenizer)
Time it takes to process + tokenize training texts: 24.7 s
Time it takes to go through 11740 items: 1.46 s
Total time: 26.16 s
Maximum memory usage: 754.2 MiB

With filter + metadatas concatenation + content transformation + content augmentation + iterate the whole dataset (1 epoch)

timelist5=[]
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False)

tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         sup_types='classification',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations= [nearby_aug_func,str.lower], 
                         val_ratio=None,
                         batch_size=bs,
                         seed=42,
                         verbose=False
                        )
benchmarking_and_memory_usage(tdc,tokenizer,n=None,time_list=timelist5)
Time it takes to process + tokenize training texts: 35.34 s
Time it takes to go through all items: 14.32 s
Total time: 49.66 s
Maximum memory usage: 869.6 MiB

b) Streaming dataset

def benchmarking_streaming(tdc,tokenizer,n=10,time_list=None):
    # Same timing logic as benchmarking(), but tokenizes line by line as used for streaming data.
    # Note: for a streaming dataset the filter/map/tokenize pipeline is lazy, so most of the
    # actual work happens during iteration rather than inside process_and_tokenize().
    if time_list is None: time_list=[]
    time1 = time.time()
    tdc.process_and_tokenize(tokenizer,max_length=512,line_by_line=True)
    time2 = time.time()
    process_time = round(time2-time1,2)
    print(f'Time it takes to process + tokenize training texts: {process_time} s')
    for i,v in enumerate(tdc.main_ddict['train']):
        if n is not None and i==tdc.batch_size*n: break
    time3 = time.time()
    iteration_time = round(time3-time2,2)
    if n is not None:
        print(f'Time it takes to go through {n*tdc.batch_size} items: {iteration_time} s')
    else:
        print(f'Time it takes to go through all items: {iteration_time} s')

    total_time = round(time3-time1,2)
    print(f'Total time: {total_time} s')
    time_list+=process_time,iteration_time,total_time

def benchmarking_and_memory_usage_streaming(tdc,tokenizer,n=10,time_list=None):
    if time_list is None: time_list=[]
    # run the streaming benchmark under memory_profiler to capture peak memory usage
    mem_usage = memory_usage((benchmarking_streaming,[tdc,tokenizer,n,time_list]))
    total_usage = round(max(mem_usage),1)
    print(f'Maximum memory usage: {total_usage} MiB')
    time_list.append(total_usage)
def nlp_aug_stochastic(x,aug=None,p=0.5):
    results = aug.augment(x)
    if not isinstance(x,list): return results[0] if random.random()<p else x
    return [a if random.random()<p else b for a,b in zip(results,x)]

aug = nac.KeyboardAug(aug_char_max=3,aug_char_p=0.1,aug_word_p=0.07)
nearby_aug_func = partial(nlp_aug_stochastic,aug=aug,p=0.5)

With filter

ns_timelist1=[]
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=True)

tdc = TextDataControllerStreaming(dset,
                                 main_text='Review Text',
                                 label_names='Department Name',
                                 sup_types='classification',
                                 filter_dict={'Review Text': lambda x: x is not None,
                                              'Department Name': lambda x: x is not None,
                                             },
                                 class_names_predefined=['Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend'],
                                 batch_size=bs,
                                 num_proc=4,
                                 seed=42,
                                 verbose=False
                                )
benchmarking_and_memory_usage_streaming(tdc,tokenizer,time_list=ns_timelist1)
Time it takes to process + tokenize training texts: 0.8 s
Time it takes to go through 11740 items: 4.03 s
Total time: 4.82 s
Maximum memory usage: 743.0 MiB

With filter + metadatas concatenation

ns_timelist2=[]
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=True)

tdc = TextDataControllerStreaming(dset,
                                 main_text='Review Text',
                                 label_names='Department Name',
                                 sup_types='classification',
                                 filter_dict={'Review Text': lambda x: x is not None,
                                              'Department Name': lambda x: x is not None,
                                             },
                                 class_names_predefined=['Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend'],
                                 metadatas=['Title','Division Name'],
                                 batch_size=bs,
                                 num_proc=4,
                                 seed=42,
                                 verbose=False
                                )
benchmarking_and_memory_usage_streaming(tdc,tokenizer,time_list=ns_timelist2)
Time it takes to process + tokenize training texts: 0.79 s
Time it takes to go through 11740 items: 4.43 s
Total time: 5.22 s
Maximum memory usage: 745.9 MiB

With filter + metadatas concatenation + content transformation + content augmentation

ns_timelist3=[]
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=True)

tdc = TextDataControllerStreaming(dset,
                                 main_text='Review Text',
                                 label_names='Department Name',
                                 sup_types='classification',
                                 filter_dict={'Review Text': lambda x: x is not None,
                                              'Department Name': lambda x: x is not None,
                                             },
                                 class_names_predefined=['Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend'],
                                 metadatas=['Title','Division Name'],
                                 content_transformations=[text_normalize,str.lower],
                                 content_augmentations= [nearby_aug_func,str.lower],
                                 batch_size=bs,
                                 num_proc=4,
                                 seed=42,
                                 verbose=False
                                )
benchmarking_and_memory_usage_streaming(tdc,tokenizer,time_list=ns_timelist3)
Time it takes to process + tokenize training texts: 0.78 s
Time it takes to go through 11740 items: 12.23 s
Total time: 13.01 s
Maximum memory usage: 743.0 MiB

With filter + metadatas concatenation + content transformation + content augmentation + higher batch size (not recorded)

dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=True)

tdc = TextDataControllerStreaming(dset,
                                 main_text='Review Text',
                                 label_names='Department Name',
                                 sup_types='classification',
                                 filter_dict={'Review Text': lambda x: x is not None,
                                              'Department Name': lambda x: x is not None,
                                             },
                                 class_names_predefined=['Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend'],
                                 metadatas=['Title','Division Name'],
                                 content_transformations=[text_normalize,str.lower],
                                 content_augmentations= [nearby_aug_func,str.lower],
                                 batch_size=bs*3,
                                 num_proc=4,
                                 seed=42,
                                 verbose=False
                                )
benchmarking_and_memory_usage_streaming(tdc,tokenizer)
Time it takes to process + tokenize training texts: 0.79 s
Time it takes to go through 35220 items: 36.66 s
Total time: 37.45 s
Maximum memory usage: 887.4 MiB

With filter + metadatas concatenation + content transformation + content augmentation + iterate the whole dataset (1 epoch)

ns_timelist4=[]
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=True)

tdc = TextDataControllerStreaming(dset,
                                 main_text='Review Text',
                                 label_names='Department Name',
                                 sup_types='classification',
                                 filter_dict={'Review Text': lambda x: x is not None,
                                              'Department Name': lambda x: x is not None,
                                             },
                                 class_names_predefined=['Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend'],
                                 metadatas=['Title','Division Name'],
                                 content_transformations=[text_normalize,str.lower],
                                 content_augmentations= [nearby_aug_func,str.lower],
                                 batch_size=bs,
                                 num_proc=4,
                                 seed=42,
                                 verbose=False
                                )
benchmarking_and_memory_usage_streaming(tdc,tokenizer,n=None,time_list=ns_timelist4)
Time it takes to process + tokenize training texts: 0.8 s
Time it takes to go through all items: 111.93 s
Total time: 112.73 s
Maximum memory usage: 762.8 MiB

2. Test the effect of batch size and num_proc (number of parallel processes) on the non-streaming dataset

def benchmarking(tdc,tokenizer,n=10,shuffle_trn=True):
    time1 = time.time()
    tdc.process_and_tokenize(tokenizer,max_length=512,shuffle_trn=shuffle_trn)
    time2 = time.time() 
    print(f'Time it takes to process + tokenize training texts: {(time2-time1):.3f} s')
    for i,v in enumerate(tdc.main_ddict['train']):
        if n is not None and i==tdc.batch_size*n: break
    time3 = time.time()
    if n is not None:
        print(f'Time it takes to go through {n*tdc.batch_size} items: {(time3-time2):.3f} s')
    else:
        print(f'Time it takes to go through all items: {(time3-time2):.3f} s')

    print(f'Total time: {(time3-time1):.3f} s')
def benchmarking_and_memory_usage(tdc,tokenizer,n=10,shuffle_trn=True):
    mem_usage = memory_usage((benchmarking,[tdc,tokenizer,n,shuffle_trn]))
    print(f'Maximum memory usage: {max(mem_usage):.3f} MiB')


def nlp_aug_stochastic(x,aug=None,p=0.5):
    results = aug.augment(x)
    if not isinstance(x,list): return results[0] if random.random()<p else x
    return [a if random.random()<p else b for a,b in zip(results,x)]

aug = nac.KeyboardAug(aug_char_max=3,aug_char_p=0.1,aug_word_p=0.07)
nearby_aug_func = partial(nlp_aug_stochastic,aug=aug,p=0.5)

For the non-streaming dataset, text processing and tokenization are the most time-consuming steps, so we check how different values of batch_size and num_proc affect their running time.
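
The cells below vary batch_size and num_proc one run at a time. The same sweep can also be written as a single loop; this is only a convenience sketch built from the objects already defined above (the dataset is reloaded on every iteration because caching is disabled):

for bs_,np_ in [(100,2),(1000,2),(100,8),(1000,8),(2000,8)]:
    dset = load_dataset('sample_data',
                        data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                        split='train',
                        streaming=False)
    tdc = TextDataController(dset,
                             main_text='Review Text',
                             label_names='Department Name',
                             sup_types='classification',
                             filter_dict={'Review Text': lambda x: x is not None,
                                          'Department Name': lambda x: x is not None,
                                         },
                             metadatas=['Title','Division Name'],
                             content_transformations=[text_normalize,str.lower],
                             content_augmentations=[nearby_aug_func,str.lower],
                             val_ratio=None,
                             batch_size=bs_,
                             num_proc=np_,
                             seed=42,
                             verbose=False)
    print(f'--- batch_size={bs_}, num_proc={np_} ---')
    benchmarking_and_memory_usage(tdc,tokenizer,n=None)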

dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False)

tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         sup_types='classification',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations= [nearby_aug_func,str.lower], 
                         val_ratio=None,
                         batch_size=100,
                         num_proc=2,
                         seed=42,
                         verbose=False
                        )
benchmarking_and_memory_usage(tdc,tokenizer,n=None)
Time it takes to process + tokenize training texts: 64.098 s
Time it takes to go through all items: 13.400 s
Total time: 77.499 s
Maximum memory usage: 925.188 MiB
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False)

tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         sup_types='classification',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations= [nearby_aug_func,str.lower], 
                         val_ratio=None,
                         batch_size=1000,
                         num_proc=2,
                         seed=42,
                         verbose=False
                        )
benchmarking_and_memory_usage(tdc,tokenizer,n=None)
Time it takes to process + tokenize training texts: 61.297 s
Time it takes to go through all items: 14.427 s
Total time: 75.724 s
Maximum memory usage: 912.223 MiB
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False)

tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         sup_types='classification',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations= [nearby_aug_func,str.lower], 
                         val_ratio=None,
                         batch_size=100,
                         num_proc=8,
                         seed=42,
                         verbose=False
                        )
benchmarking_and_memory_usage(tdc,tokenizer,n=None)
Time it takes to process + tokenize training texts: 25.857 s
Time it takes to go through all items: 13.776 s
Total time: 39.634 s
Maximum memory usage: 928.574 MiB
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False)

tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         sup_types='classification',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations= [nearby_aug_func,str.lower], 
                         val_ratio=None,
                         batch_size=1000,
                         num_proc=8,
                         seed=42,
                         verbose=False
                        )
benchmarking_and_memory_usage(tdc,tokenizer,n=None)
Time it takes to process + tokenize training texts: 24.933 s
Time it takes to go through all items: 14.271 s
Total time: 39.204 s
Maximum memory usage: 913.266 MiB
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False)

tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         sup_types='classification',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations= [nearby_aug_func,str.lower], 
                         val_ratio=None,
                         batch_size=2000,
                         num_proc=8,
                         seed=42,
                         verbose=False
                        )
benchmarking_and_memory_usage(tdc,tokenizer,n=None)
Time it takes to process + tokenize training texts: 25.600 s
Time it takes to go through all items: 14.465 s
Total time: 40.064 s
Maximum memory usage: 934.883 MiB

Increasing num_proc is far more beneficial than increasing the processing batch size: going from 2 to 8 processes cuts the processing time from roughly 64 s to 26 s, while changing the batch size barely moves it.
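
For easier scanning, here is an optional summary of the process + tokenize times from the five runs above (numbers copied verbatim from their outputs):

import pandas as pd
summary = pd.DataFrame({'batch_size': [100,1000,100,1000,2000],
                        'num_proc': [2,2,8,8,8],
                        'process + tokenize (s)': [64.098,61.297,25.857,24.933,25.600]})
print(summary)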

3. Improving processing time with caching

The worst processing time is recorded for the non-streaming training set with the full preprocessing pipeline: filtering on 2 columns, concatenating 2 metadata columns, 2 content transformations, and 2 content augmentations.

With caching, we can significantly reduce this preprocessing time: you only need to run the preprocessing once, and every subsequent call takes advantage of the cached result.

enable_caching()
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False)

tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         sup_types='classification',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations= [nearby_aug_func,str.lower], 
                         val_ratio=None,
                         batch_size=1000,
                         num_proc=4,
                         seed=42,
                         verbose=False
                        )
tdc.process_and_tokenize(tokenizer,max_length=512)
Found cached dataset csv (/home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-0aed8574c094e4fd_*_of_00004.arrow
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-b781a4a73d06caf5_*_of_00004.arrow
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-0f85d6db4165d6ef_*_of_00004.arrow
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-420893192d8b876f_*_of_00004.arrow
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-ee3f2ca19acd2369_*_of_00004.arrow
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-27d1b7f9046ec1b4_*_of_00004.arrow
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False)

tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         sup_types='classification',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations= [nearby_aug_func,str.lower], 
                         val_ratio=None,
                         batch_size=1000,
                         num_proc=4,
                         seed=42,
                         verbose=False
                        )
benchmarking_and_memory_usage(tdc,tokenizer,n=None)
Found cached dataset csv (/home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-0aed8574c094e4fd_*_of_00004.arrow
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-b781a4a73d06caf5_*_of_00004.arrow
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-0f85d6db4165d6ef_*_of_00004.arrow
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-420893192d8b876f_*_of_00004.arrow
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-ee3f2ca19acd2369_*_of_00004.arrow
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-27d1b7f9046ec1b4_*_of_00004.arrow
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-57d938bbd364f406_*_of_00004.arrow
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-10afb2ec3cb12852_*_of_00004.arrow
Loading cached shuffled indices for dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-5d8840c40fe75896.arrow
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-030424c28049222f_*_of_00004.arrow
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-fe4290971e8d1087_*_of_00004.arrow
Time it takes to process + tokenize training texts: 0.979 s
Time it takes to go through all items: 16.824 s
Maximum memory usage: 823.531 MiB

Once the results are cached, loading the processed data back takes only 0.979 s, instead of waiting ~35 s to redo the processing from scratch.
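
The cached Arrow files land in the default datasets cache directory (visible in the paths of the log above). If you want them stored elsewhere, load_dataset accepts a cache_dir argument; the directory below is purely hypothetical:

dset = load_dataset('sample_data',
                    data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False,
                    cache_dir='/path/to/your/cache') # hypothetical location for the cache files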

4. Time and Memory Usage Comparison (as of 5/3/2024)

import pandas as pd
import numpy as np
exp1 = [timelist1,ns_timelist1]
exp2 = [timelist2,ns_timelist2]
exp3 = [timelist3,ns_timelist3]
exp4 = [timelist4,[None,None,None,None]] # no shuffling when streaming
exp5 = [timelist5,ns_timelist4]
col_names=['Filter + Shuffling Train','And 2 metadatas',
           'And 2 tfms + 2 augs','Same, but without train shuffling',
           'Time to process 1 epoch']
idxs=['Non-Streaming','Streaming']
_tmp=[]
for i in range(2):
    _tmp.append([l[i][0] for l in [exp1,exp2,exp3,exp4,exp5]])
df = pd.DataFrame(np.array(_tmp),columns=col_names)
df.index = idxs
df.index.name= 'Time (s) to process and tokenize 117k records with batch size 1174'

_tmp=[]
for i in range(2):
    _tmp.append([l[i][1] for l in [exp1,exp2,exp3,exp4,exp5]])
df2 = pd.DataFrame(np.array(_tmp),columns=col_names[:-1]+['Time to iterate 1 epoch'])
df2.index = idxs
df2.index.name= 'Time (s) to iterate 10 batches (11740 items)'

_tmp=[]
for i in range(2):
    _tmp.append([l[i][2] for l in [exp1,exp2,exp3,exp4,exp5]])
df3 = pd.DataFrame(np.array(_tmp),columns=col_names[:-1]+['Total time to process + tokenize + iterate 1 epoch'])
df3.index = idxs
df3.index.name= 'Total time (s) to process + tokenize + iterate 10 batches'

_tmp=[]
for i in range(2):
    _tmp.append([l[i][3] for l in [exp1,exp2,exp3,exp4,exp5]])
df4 = pd.DataFrame(np.array(_tmp),columns=col_names[:-1]+['Total memory to process + tokenize + iterate 1 epoch'])
df4.index = idxs
df4.index.name= 'Total memory (MiB) to process + tokenize + iterate 10 batches'
df
               Filter + Shuffling Train  And 2 metadatas  And 2 tfms + 2 augs  Same, but without train shuffling  Time to process 1 epoch
Time (s) to process and tokenize 117k records with batch size 1174
Non-Streaming  14.37                     15.26            35.09                34.36                              35.34
Streaming      0.8                       0.79             0.78                 None                               0.8
df2
               Filter + Shuffling Train  And 2 metadatas  And 2 tfms + 2 augs  Same, but without train shuffling  Time to iterate 1 epoch
Time (s) to iterate 10 batches (11740 items)
Non-Streaming  1.27                      1.46             1.52                 1.47                               14.32
Streaming      4.03                      4.43             12.23                None                               111.93
df3
               Filter + Shuffling Train  And 2 metadatas  And 2 tfms + 2 augs  Same, but without train shuffling  Total time to process + tokenize + iterate 1 epoch
Total time (s) to process + tokenize + iterate 10 batches
Non-Streaming  15.64                     16.72            36.61                35.83                              49.66
Streaming      4.82                      5.22             13.01                None                               112.73
df4
               Filter + Shuffling Train  And 2 metadatas  And 2 tfms + 2 augs  Same, but without train shuffling  Total memory to process + tokenize + iterate 1 epoch
Total memory (MiB) to process + tokenize + iterate 10 batches
Non-Streaming  734.9                     748.6            754.7                777.3                              869.6
Streaming      743.0                     745.9            743.0                None                               762.8

5. Tips and tricks

  • For non-streaming data, the best way to minimize processing and iteration time is:
    • Turn on dataset caching and run the processing step once so the result is cached for subsequent runs
  • The more content transformations and augmentations you add, the slower processing and iteration become. This is especially true for streaming data, where all of that work happens on the fly while you iterate
  • For streaming, be aware of the pros and cons of batch processing versus line-by-line processing (read more here); see the sketch below
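
A minimal sketch of the two streaming tokenization modes mentioned in the last bullet. Only line_by_line=True is exercised in this notebook; that line_by_line=False switches to batch tokenization is an assumption, so treat it as illustrative:

# line-by-line: each streamed record is tokenized on its own (as in the streaming benchmarks above)
tdc.process_and_tokenize(tokenizer,max_length=512,line_by_line=True)
# batch mode (assumed): tokenize whole batches of streamed records at once
tdc.process_and_tokenize(tokenizer,max_length=512,line_by_line=False)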