Text Main For Language Model - Streaming

This module contains the main Python class for the streaming version of TextDataLMController
import pandas as pd
import numpy as np
from that_nlp_library.text_transformation import *
from that_nlp_library.text_augmentation import *
from importlib.machinery import SourceFileLoader
from datasets import load_dataset
import os

Class TextDataLMControllerStreaming


source

TextDataLMControllerStreaming

 TextDataLMControllerStreaming (inp, main_text:str, filter_dict={},
                                metadatas=[], process_metas=True,
                                metas_sep='.', content_transformations=[],
                                seed=None, batch_size=1024, num_proc=1,
                                cols_to_keep=None, verbose=True)

Initialize self. See help(type(self)) for accurate signature.

Type Default Details
inp HuggingFace Dataset or DatasetDict
main_text str Name of the main text column
filter_dict dict {} A dictionary: {feature: filtering_function_for_that_feature}
metadatas list [] Names of the metadata columns
process_metas bool True Whether to do simple text processing on the chosen metadatas
metas_sep str . Separator for multiple metadata concatenation
content_transformations list [] A list of text transformations
seed NoneType None Random seed
batch_size int 1024 Transformation + Tokenization batch size
num_proc int 1 Number of processes for multiprocessing
cols_to_keep NoneType None Columns to keep after all processing
verbose bool True Whether to print processing information

1. Load data + Basic use case

Dataset source: https://www.kaggle.com/datasets/kavita5/review_ecommerce

With line-by-line tokenization

dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val = dset.train_test_split(test_size=0.1)
ddict_with_val['validation'] = ddict_with_val['test']
ddict_with_val['train'] = ddict_with_val['train'].to_iterable_dataset()
del ddict_with_val['test']
ddict_with_val
DatasetDict({
    train: IterableDataset({
        features: ['Clothing ID', 'Age', 'Title', 'Review Text', 'Rating', 'Recommended IND', 'Positive Feedback Count', 'Division Name', 'Department Name', 'Class Name'],
        n_shards: 1
    })
    validation: Dataset({
        features: ['Clothing ID', 'Age', 'Title', 'Review Text', 'Rating', 'Recommended IND', 'Positive Feedback Count', 'Division Name', 'Department Name', 'Class Name'],
        num_rows: 2349
    })
})
tdc = TextDataLMControllerStreaming(ddict_with_val,
                                    filter_dict={'Review Text': lambda x: x is not None},
                                    main_text='Review Text',
                                   )
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

source

TextDataLMControllerStreaming.process_and_tokenize

 TextDataLMControllerStreaming.process_and_tokenize (tokenizer,
                                                     max_length=None,
                                                     tok_num_proc=None,
                                                     line_by_line=True,
                                                     stride=None)
Type Default Details
tokenizer Tokenizer (preferably from HuggingFace)
max_length NoneType None pad to model’s allowed max length (default is max_sequence_length). Use -1 for no padding at all
tok_num_proc NoneType None Number of processes for tokenization
line_by_line bool True Whether to tokenize each sentence separately, or concatenate them before tokenizing
stride NoneType None Option to do striding when line_by_line is False
tdc.process_and_tokenize(tokenizer,line_by_line=True)
-------------------- Data Filtering --------------------
Done
----- Metadata Simple Processing & Concatenating to Main Content -----
Done
-------------------- Dropping unused features --------------------
Done
----- Performing Content Transformation and Tokenization on Validation Set -----
Done
----- Creating a generator for content transformation and tokenization on Train set -----
Done
tdc.main_ddict
DatasetDict({
    train: IterableDataset({
        features: Unknown,
        n_shards: 1
    })
    validation: Dataset({
        features: ['Review Text', 'input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 2260
    })
})
for i,v in enumerate(tdc.main_ddict['validation']):
    if i==1:break
    print(f"Input ids: {v['input_ids']}\nDecoded: {tokenizer.decode(v['input_ids'])}\nAttention Mask: {v['attention_mask']}")
Input ids: [0, 17781, 129, 1381, 15, 53, 24, 16, 10, 182, 11962, 16576, 8, 939, 206, 40, 28, 3473, 7, 3568, 4, 939, 657, 5, 18632, 9, 5, 1468, 4, 939, 109, 2813, 24, 376, 11, 97, 8089, 4, 24, 16, 10, 410, 19351, 1468, 53, 45, 350, 203, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Decoded: <s>Have only tried on but it is a very cute skirt and i think will be comfortable to wear. i love the texture of the material. i do wish it came in other colors. it is a little heavier material but not too much.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
Attention Mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
for i,v in enumerate(tdc.main_ddict['train']):
    if i==1:break
    print(f"Input ids: {v['input_ids']}\n\nDecoded: {tokenizer.decode(v['input_ids'])}\n\nAttention Mask: {v['attention_mask']}")
Input ids: [0, 713, 23204, 21, 11, 12, 8005, 11, 10, 55, 7974, 3195, 24943, 8, 21, 98, 1256, 4, 77, 939, 1381, 24, 15, 939, 5324, 10, 367, 383, 35, 5, 23193, 8, 1318, 32, 2299, 89, 111, 42, 16, 10, 2579, 2125, 190, 23, 455, 12, 17212, 4, 5, 13422, 16, 14, 24, 1237, 182, 739, 4, 939, 437, 3700, 10, 501, 73, 1549, 8, 10, 1836, 3023, 462, 11, 144, 6215, 1964, 6, 8, 190, 5, 739, 21, 10, 828, 929, 219, 4, 5, 97, 631, 21, 14, 5, 21764, 9, 42, 23204, 32, 1256, 251, 36, 463, 939, 33, 251, 3701, 322, 528, 7, 5, 10342, 847, 9, 5, 24150, 47, 1705, 75, 190, 269, 740, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

Decoded: <s>This sweater was in-store in a more neutral color palette and was so pretty. when i tried it on i noticed a few things: the versatility and quality are definitely there - this is a nice piece even at full-price. the downside is that it runs very large. i'm typically a 14/16 and a size xl in most retailer items, and even the large was a bit roomy. the other thing was that the sleeves of this sweater are pretty long (and i have long arms). due to the bell cut of the sleeve you couldn't even really c</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>

Attention Mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

2. Filtering + Metadatas + Content Transformation + Tokenization

Define our tokenizer

from transformers import RobertaTokenizer
from underthesea import text_normalize
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
from that_nlp_library.text_main_lm import TextDataLMController

a) Option 1: Tokenize our corpus line-by-line

With no padding

dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val = dset.train_test_split(test_size=0.1,seed=42)
ddict_with_val['validation'] = ddict_with_val['test']
ddict_with_val['train'] = ddict_with_val['train'].to_iterable_dataset()
del ddict_with_val['test']
tdc = TextDataLMControllerStreaming(ddict_with_val,
                                    main_text='Review Text',
                                    filter_dict={'Review Text': lambda x: x is not None},
                                    metadatas=['Title','Division Name'],
                                    content_transformations=[text_normalize,str.lower],
                                    cols_to_keep=['Clothing ID','Review Text'],
                                    seed=42,
                                    batch_size=1024,
                                    verbose=False
                                    )
tdc.process_and_tokenize(tokenizer,line_by_line=True,max_length=-1)
print(tokenizer.decode(next(iter(tdc.main_ddict['train']))['input_ids']))
print()
print(tokenizer.decode(tdc.main_ddict['validation']['input_ids'][0]))
<s>general petite. beautiful top, worth the necessary tailoring. the beautiful bold print drew me to this top and it did not disappoint upon receipt. however, the bottom ruffle belled so far out on each side that it was laughable! the actual fit is nothing like the picture ; clearly the model's arms are placed in front of all the extra fabric to hold the ruffle back. however, the fabric is beautiful, the fit was perfect ( size 2, 5'4 ", 106 lbs. ), the quality is great and i love the print so i decided to take it to my tailor to " sew away " the " wings " on both si</s>

<s>general. soft, feminine and fun pockets!. i love this tunic. purchased the dark orange in medium ( i am 5'9 and 140 lbs ). tried the small and almost kept it but i felt seams around my arm pits a tad, so went with the medium and glad i did - this top should be comfortable. feels very fall and perfect for casual get-togethers and running around town. only comment is that it is rayon... and for me anyway rayon doesn't wash too well - so we shall see how this one fairs.</s>
%%time
for i,v in enumerate(tdc.main_ddict['train']):
    if i%100==0:
        print(i)
    if i==1024-1:
        break
    pass
0
100
200
300
400
500
600
700
800
900
1000
CPU times: user 884 ms, sys: 1.07 ms, total: 885 ms
Wall time: 877 ms

Compare to non-streamed version

dset2 = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val2 = dset2.train_test_split(test_size=0.1,seed=42)
ddict_with_val2['validation'] = ddict_with_val2['test']
del ddict_with_val2['test']

tdc2 = TextDataLMController(ddict_with_val2,
                            main_text='Review Text',
                            filter_dict={'Review Text': lambda x: x is not None},
                            metadatas=['Title','Division Name'],
                            content_transformations=[text_normalize,str.lower],
                            cols_to_keep=['Clothing ID','Review Text'],
                            seed=42,
                            batch_size=1024,
                            verbose=False
                            )
tdc2.process_and_tokenize(tokenizer,line_by_line=True,max_length=-1,shuffle_trn=False)
# check whether train sets are the same
assert len(list(tdc.main_ddict['train']))==len(tdc2.main_ddict['train'])
iter1 = iter(tdc.main_ddict['train'])
iter2 = iter(tdc2.main_ddict['train'])
for a,b in zip(iter1,iter2):
    assert a['input_ids']==b['input_ids']
print(a)
print('-'*20)
print(b)
{'Clothing ID': 1056, 'Review Text': 'general . perfect pant . I picked these up the other day looking for a good jeans alternative. i love them. they are the perfect fit of slim but not skinny. i went with my normal size (26) and so far after one wear, they are still in good shape. a little bit of stretch, but not too much. the moss color is so crisp and goes with a lot. they will be perfect for transitioning into fall.', 'input_ids': [0, 15841, 479, 1969, 16259, 479, 939, 2738, 209, 62, 5, 97, 183, 546, 13, 10, 205, 10844, 3626, 479, 939, 657, 106, 479, 51, 32, 5, 1969, 2564, 9, 11875, 53, 45, 22877, 479, 939, 439, 19, 127, 2340, 1836, 36, 973, 4839, 8, 98, 444, 71, 65, 3568, 2156, 51, 32, 202, 11, 205, 3989, 479, 10, 410, 828, 9, 4140, 2156, 53, 45, 350, 203, 479, 5, 40711, 3195, 16, 98, 17766, 8, 1411, 19, 10, 319, 479, 51, 40, 28, 1969, 13, 26135, 88, 1136, 479, 2], 'special_tokens_mask': [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
--------------------
{'Clothing ID': 1056, 'Review Text': 'general . perfect pant . i picked these up the other day looking for a good jeans alternative . i love them . they are the perfect fit of slim but not skinny . i went with my normal size ( 26 ) and so far after one wear , they are still in good shape . a little bit of stretch , but not too much . the moss color is so crisp and goes with a lot . they will be perfect for transitioning into fall .', 'input_ids': [0, 15841, 479, 1969, 16259, 479, 939, 2738, 209, 62, 5, 97, 183, 546, 13, 10, 205, 10844, 3626, 479, 939, 657, 106, 479, 51, 32, 5, 1969, 2564, 9, 11875, 53, 45, 22877, 479, 939, 439, 19, 127, 2340, 1836, 36, 973, 4839, 8, 98, 444, 71, 65, 3568, 2156, 51, 32, 202, 11, 205, 3989, 479, 10, 410, 828, 9, 4140, 2156, 53, 45, 350, 203, 479, 5, 40711, 3195, 16, 98, 17766, 8, 1411, 19, 10, 319, 479, 51, 40, 28, 1969, 13, 26135, 88, 1136, 479, 2], 'special_tokens_mask': [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
# check whether validation set is the same
assert len(list(tdc.main_ddict['validation']))==len(tdc2.main_ddict['validation'])

iter1 = iter(tdc.main_ddict['validation'])
iter2 = iter(tdc2.main_ddict['validation'])
for a,b in zip(iter1,iter2):
    assert a==b

With padding

(set max_length to None if you want to pad to model’s maximum sequence length)

dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val = dset.train_test_split(test_size=0.1,seed=42)
ddict_with_val['validation'] = ddict_with_val['test']
ddict_with_val['train'] = ddict_with_val['train'].to_iterable_dataset()
del ddict_with_val['test']

tdc = TextDataLMControllerStreaming(ddict_with_val,
                                    main_text='Review Text',
                                    filter_dict={'Review Text': lambda x: x is not None},
                                    metadatas=['Title','Division Name'],
                                    content_transformations=[text_normalize,str.lower],
                                    cols_to_keep=['Clothing ID','Review Text'],
                                    seed=42,
                                    batch_size=1024,
                                    verbose=True
                                    )
tdc.process_and_tokenize(tokenizer,line_by_line=True,max_length=256,tok_num_proc=1)
-------------------- Data Filtering --------------------
Done
----- Metadata Simple Processing & Concatenating to Main Content -----
Done
-------------------- Dropping unused features --------------------
Done
----- Performing Content Transformation and Tokenization on Validation Set -----
Done
----- Creating a generator for content transformation and tokenization on Train set -----
Done
print(tokenizer.decode(next(iter(tdc.main_ddict['train']))['input_ids']))
print()
print(tokenizer.decode(tdc.main_ddict['validation']['input_ids'][0]))
<s>general petite. beautiful top, worth the necessary tailoring. the beautiful bold print drew me to this top and it did not disappoint upon receipt. however, the bottom ruffle belled so far out on each side that it was laughable! the actual fit is nothing like the picture ; clearly the model's arms are placed in front of all the extra fabric to hold the ruffle back. however, the fabric is beautiful, the fit was perfect ( size 2, 5'4 ", 106 lbs. ), the quality is great and i love the print so i decided to take it to my tailor to " sew away " the " wings " on both si</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>

<s>general. soft, feminine and fun pockets!. i love this tunic. purchased the dark orange in medium ( i am 5'9 and 140 lbs ). tried the small and almost kept it but i felt seams around my arm pits a tad, so went with the medium and glad i did - this top should be comfortable. feels very fall and perfect for casual get-togethers and running around town. only comment is that it is rayon... and for me anyway rayon doesn't wash too well - so we shall see how this one fairs.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>

Compare to non-streamed version

dset2 = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val2 = dset2.train_test_split(test_size=0.1,seed=42)
ddict_with_val2['validation'] = ddict_with_val2['test']
del ddict_with_val2['test']

tdc2 = TextDataLMController(ddict_with_val2,
                            main_text='Review Text',
                            filter_dict={'Review Text': lambda x: x is not None},
                            metadatas=['Title','Division Name'],
                            content_transformations=[text_normalize,str.lower],
                            cols_to_keep=['Clothing ID','Review Text'],
                            seed=42,
                            batch_size=1024,
                            verbose=False
                            )
tdc2.process_and_tokenize(tokenizer,line_by_line=True,max_length=256,shuffle_trn=False,tok_num_proc=1)
# check whether train sets are the same
assert len(list(tdc.main_ddict['train']))==len(tdc2.main_ddict['train'])
iter1 = iter(tdc.main_ddict['train'])
iter2 = iter(tdc2.main_ddict['train'])
for a,b in zip(iter1,iter2):
    assert a==b
# check whether validation set is the same
assert len(list(tdc.main_ddict['validation']))==len(tdc2.main_ddict['validation'])

iter1 = iter(tdc.main_ddict['validation'])
iter2 = iter(tdc2.main_ddict['validation'])
for a,b in zip(iter1,iter2):
    assert a==b

b) Option 2: Tokenize every text, then concatenate them together before splitting them into smaller parts.
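
Conceptually, this is the standard "concatenate then chunk" approach used for language-model pretraining data. Below is a minimal sketch of the idea; the group_texts helper is hypothetical and is not the library's actual implementation.

# Hypothetical sketch of the concatenate-then-chunk idea, meant to be applied
# with Dataset.map(..., batched=True) on already-tokenized examples
def group_texts(examples, max_length=256):
    # concatenate every tokenized field (input_ids, attention_mask, ...) across the batch
    concatenated = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated['input_ids'])
    # drop the last partial chunk so every output example is exactly max_length tokens
    total_length = (total_length // max_length) * max_length
    return {
        k: [v[i:i + max_length] for i in range(0, total_length, max_length)]
        for k, v in concatenated.items()
    }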

dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val = dset.train_test_split(test_size=0.1,seed=42)
ddict_with_val['validation'] = ddict_with_val['test']
ddict_with_val['train'] = ddict_with_val['train'].to_iterable_dataset()
del ddict_with_val['test']

tdc = TextDataLMControllerStreaming(ddict_with_val,
                                    main_text='Review Text',
                                    filter_dict={'Review Text': lambda x: x is not None},
                                    metadatas=['Title','Division Name'],
                                    content_transformations=[text_normalize,str.lower],
                                    seed=42,
                                    batch_size=1024,
                                    verbose=False
                                    )
tdc.process_and_tokenize(tokenizer,line_by_line=False,max_length=256,tok_num_proc=1)
print(tokenizer.decode(next(iter(tdc.main_ddict['train']))['input_ids']))
print()
print(tokenizer.decode(tdc.main_ddict['validation']['input_ids'][0]))
<s>general petite. beautiful top, worth the necessary tailoring. the beautiful bold print drew me to this top and it did not disappoint upon receipt. however, the bottom ruffle belled so far out on each side that it was laughable! the actual fit is nothing like the picture ; clearly the model's arms are placed in front of all the extra fabric to hold the ruffle back. however, the fabric is beautiful, the fit was perfect ( size 2, 5'4 ", 106 lbs. ), the quality is great and i love the print so i decided to take it to my tailor to " sew away " the " wings " on both si</s><s>general. not as short on me ( petite ). i ordered the xxs p as this dress is not a fitted dress, and that was the right size for me. only thing is the length is a bit linger still 9 lower on calf for me ), the straps are almost tight, so i would say the dress is a reversed taper shape. color is beautiful, i ordered green as the other color ( plum ) doesn't have petite available. green is rich, and classy, the fabric is surprisingly soft. i love the little details in the velvet. definitely need a

<s>general. soft, feminine and fun pockets!. i love this tunic. purchased the dark orange in medium ( i am 5'9 and 140 lbs ). tried the small and almost kept it but i felt seams around my arm pits a tad, so went with the medium and glad i did - this top should be comfortable. feels very fall and perfect for casual get-togethers and running around town. only comment is that it is rayon... and for me anyway rayon doesn't wash too well - so we shall see how this one fairs.</s><s>general petite. a new staple!. tried these on out of sheer curiosity -- i've got a long torso & was pleasantly surprised how flattering they are! they manage to look flowing & sleek without shortening the legs. took a size 6 with my 27 " waist, 37 " hips. it's a bit of a generous fit, especially around the waist, but they're extremely comfortable & have room to tuck tops into. i have the cowled sweater tank in gray & it looks fantastic over these! couldn't resist getting both the rust and black. perfect for a dressy casual look</s><s>general. maybe swing is for me!. i love swing dresses but they never seem
%%time
for i,v in enumerate(tdc.main_ddict['train']):
    if i%100==0:
        print(i)
    if i==1024-1:
        break
    pass
0
100
200
300
400
500
600
700
800
900
1000
CPU times: user 10.5 s, sys: 28.4 ms, total: 10.5 s
Wall time: 10.5 s

Compare to non-streamed version

dset2 = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val2 = dset2.train_test_split(test_size=0.1,seed=42)
ddict_with_val2['validation'] = ddict_with_val2['test']
del ddict_with_val2['test']

tdc2 = TextDataLMController(ddict_with_val2,
                            main_text='Review Text',
                            filter_dict={'Review Text': lambda x: x is not None},
                            metadatas=['Title','Division Name'],
                            content_transformations=[text_normalize,str.lower],
                            seed=42,
                            batch_size=1024,
                            verbose=False
                            )
tdc2.process_and_tokenize(tokenizer,line_by_line=False,max_length=256,shuffle_trn=False,tok_num_proc=1)
# check whether train sets are the same
assert len(list(tdc.main_ddict['train']))==len(tdc2.main_ddict['train'])

iter1 = iter(tdc.main_ddict['train'])
iter2 = iter(tdc2.main_ddict['train'])
for a,b in zip(iter1,iter2):
    assert a==b
# check whether validation set is the same
assert len(list(tdc.main_ddict['validation']))==len(tdc2.main_ddict['validation'])

iter1 = iter(tdc.main_ddict['validation'])
iter2 = iter(tdc2.main_ddict['validation'])
for a,b in zip(iter1,iter2):
    assert a==b

c) Striding (For Concatenation of tokens)

If your sentences (or paragraphs) are longer than max_length, they will be broken apart after concatenation, so a long paragraph ends up incomplete in meaning. Striding is a way to somewhat preserve the sentence’s meaning, by repeating part of the previous chunk at the start of the next one. We will demonstrate it with an example, and you can compare it with the previous one (without striding) to see the differences
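
Conceptually, striding only changes how far apart consecutive chunks start: instead of advancing by max_length tokens, we advance by max_length - stride, so the last stride tokens of one chunk reappear at the start of the next. A hypothetical illustration (chunk_with_stride is not part of the library):

# Hypothetical illustration: chunk a long token list with overlap so that
# consecutive chunks share `stride` tokens
def chunk_with_stride(token_ids, max_length=100, stride=20):
    chunks = []
    step = max_length - stride
    for start in range(0, len(token_ids) - max_length + 1, step):
        chunks.append(token_ids[start:start + max_length])
    return chunks

chunks = chunk_with_stride(list(range(300)), max_length=100, stride=20)
assert chunks[1][:20] == chunks[0][-20:]  # overlap of `stride` tokens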

dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val = dset.train_test_split(test_size=0.1,seed=42)
ddict_with_val['validation'] = ddict_with_val['test']
ddict_with_val['train'] = ddict_with_val['train'].to_iterable_dataset()
del ddict_with_val['test']

tdc = TextDataLMControllerStreaming(ddict_with_val,
                                    main_text='Review Text',
                                    filter_dict={'Review Text': lambda x: x is not None},
                                    metadatas=['Title','Division Name'],
                                    content_transformations=[text_normalize,str.lower],
                                    seed=42,
                                    batch_size=1024,
                                    verbose=False
                                    )
tdc.process_and_tokenize(tokenizer,line_by_line=False,max_length=100,stride=20,tok_num_proc=1)
# Stride is 20, meaning for the next entry, we go back 20 tokens
for i,v in enumerate(tdc.main_ddict['train']):
    if i==2: break
    print(tokenizer.decode(v['input_ids']))
    print('-'*20)
<s>general petite. beautiful top, worth the necessary tailoring. the beautiful bold print drew me to this top and it did not disappoint upon receipt. however, the bottom ruffle belled so far out on each side that it was laughable! the actual fit is nothing like the picture ; clearly the model's arms are placed in front of all the extra fabric to hold the ruffle back. however, the fabric is beautiful, the fit was perfect ( size 2, 5'4 ",
--------------------
 however, the fabric is beautiful, the fit was perfect ( size 2, 5'4 ", 106 lbs. ), the quality is great and i love the print so i decided to take it to my tailor to " sew away " the " wings " on both si</s><s>general. not as short on me ( petite ). i ordered the xxs p as this dress is not a fitted dress, and that was the right size for me. only thing is the length is a
--------------------
print(tokenizer.decode(tdc.main_ddict['validation']['input_ids'][0]))
print(tokenizer.decode(tdc.main_ddict['validation']['input_ids'][1]))
<s>general. soft, feminine and fun pockets!. i love this tunic. purchased the dark orange in medium ( i am 5'9 and 140 lbs ). tried the small and almost kept it but i felt seams around my arm pits a tad, so went with the medium and glad i did - this top should be comfortable. feels very fall and perfect for casual get-togethers and running around town. only comment is that it is rayon... and for me anyway rayon doesn
 running around town. only comment is that it is rayon... and for me anyway rayon doesn't wash too well - so we shall see how this one fairs.</s><s>general petite. a new staple!. tried these on out of sheer curiosity -- i've got a long torso & was pleasantly surprised how flattering they are! they manage to look flowing & sleek without shortening the legs. took a size 6 with my 27 " waist, 37 " hips. it's a bit

For the second entry, we can see it starts with the last 20 tokens of the previous entry
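
We can verify the overlap programmatically (a quick sanity check, not part of the library API):

first_two = []
for i,v in enumerate(tdc.main_ddict['train']):
    if i==2: break
    first_two.append(v['input_ids'])
# the second entry should begin with the last `stride` (20) tokens of the first
assert first_two[1][:20] == first_two[0][-20:]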

Compare to non-streamed version

dset2 = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val2 = dset2.train_test_split(test_size=0.1,seed=42)
ddict_with_val2['validation'] = ddict_with_val2['test']
del ddict_with_val2['test']

tdc2 = TextDataLMController(ddict_with_val2,
                            main_text='Review Text',
                            filter_dict={'Review Text': lambda x: x is not None},
                            metadatas=['Title','Division Name'],
                            content_transformations=[text_normalize,str.lower],
                            seed=42,
                            batch_size=1024,
                            verbose=False
                            )
tdc2.process_and_tokenize(tokenizer,line_by_line=False,max_length=100,shuffle_trn=False,
                          stride=20,tok_num_proc=1)
# check whether train sets are the same
assert len(list(tdc.main_ddict['train']))==len(tdc2.main_ddict['train'])

iter1 = iter(tdc.main_ddict['train'])
iter2 = iter(tdc2.main_ddict['train'])
for a,b in zip(iter1,iter2):
    assert a==b
# check whether validation set is the same
assert len(list(tdc.main_ddict['validation']))==len(tdc2.main_ddict['validation'])

iter1 = iter(tdc.main_ddict['validation'])
iter2 = iter(tdc2.main_ddict['validation'])
for a,b in zip(iter1,iter2):
    assert a==b

3. Data Collator

from underthesea import text_normalize
from transformers import AutoTokenizer

For masked language model

tokenizer = AutoTokenizer.from_pretrained('roberta-base')

Let’s define our text controller first

dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val = dset.train_test_split(test_size=0.1,seed=42)
ddict_with_val['validation'] = ddict_with_val['test']
ddict_with_val['train'] = ddict_with_val['train'].to_iterable_dataset()
del ddict_with_val['test']

tdc = TextDataLMControllerStreaming(ddict_with_val,
                                    main_text='Review Text',
                                    filter_dict={'Review Text': lambda x: x is not None},
                                    metadatas=['Title','Division Name'],
                                    content_transformations=[text_normalize,str.lower],
                                    cols_to_keep=['Clothing ID','Review Text'],
                                    seed=42,
                                    batch_size=1024,
                                    verbose=False
                                    )

We will tokenize our corpus line-by-line

tdc.process_and_tokenize(tokenizer,line_by_line=True,max_length=-1)
tdc.set_data_collator()
tdc.set_data_collator(is_mlm=True,mlm_prob=0.15)
tdc.data_collator
DataCollatorForLanguageModeling(tokenizer=RobertaTokenizerFast(name_or_path='roberta-base', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
    0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
    1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
    2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
    3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
    50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
}, mlm=True, mlm_probability=0.15, pad_to_multiple_of=8, tf_experimental_compile=False, return_tensors='pt')

Before applying the collator…

for i,v in enumerate(tdc.main_ddict['train']):
    if i==2: break
    print(v)
    print(f"Length of input_ids: {len(v['input_ids'])}")
    print('-'*20)
{'Clothing ID': 841, 'Review Text': 'general petite . beautiful top, worth the necessary tailoring . The beautiful bold print drew me to this top and it did not disappoint upon receipt. however, the bottom ruffle belled so far out on each side that it was laughable! the actual fit is nothing like the picture; clearly the model\'s arms are placed in front of all the extra fabric to hold the ruffle back.\r\nhowever, the fabric is beautiful, the fit was perfect (size 2, 5\'4", 106 lbs.), the quality is great and i love the print so i decided to take it to my tailor to "sew away" the "wings" on both si', 'input_ids': [0, 15841, 4716, 1459, 479, 2721, 299, 2156, 966, 5, 2139, 7886, 5137, 479, 5, 2721, 7457, 5780, 4855, 162, 7, 42, 299, 8, 24, 222, 45, 17534, 2115, 18245, 479, 959, 2156, 5, 2576, 910, 15315, 28, 9970, 98, 444, 66, 15, 349, 526, 14, 24, 21, 38677, 27785, 5, 3031, 2564, 16, 1085, 101, 5, 2170, 25606, 2563, 5, 1421, 18, 3701, 32, 2325, 11, 760, 9, 70, 5, 1823, 10199, 7, 946, 5, 910, 15315, 124, 479, 959, 2156, 5, 10199, 16, 2721, 2156, 5, 2564, 21, 1969, 36, 1836, 132, 2156, 195, 128, 204, 22, 2156, 13442, 23246, 479, 4839, 2156, 5, 1318, 16, 372, 8, 939, 657, 5, 5780, 98, 939, 1276, 7, 185, 24, 7, 127, 26090, 7, 22, 35043, 409, 22, 5, 22, 11954, 22, 15, 258, 3391, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'special_tokens_mask': [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]}
Length of input_ids: 136
--------------------
{'Clothing ID': 1110, 'Review Text': "general . not as short on me (petite) . I ordered the xxs p as this dress is not a fitted dress, and that was the right size for me. only thing is the length is a bit linger still 9lower on calf for me), the straps are almost tight, so i would say the dress is a reversed taper shape. color is beautiful, i ordered green as the other color (plum) doesn't have petite available. green is rich, and classy, the fabric is surprisingly soft. i love the little details in the velvet. definitely need a strapless bra for this one.\r\n\r\n115 lbsm 30d", 'input_ids': [0, 15841, 479, 45, 25, 765, 15, 162, 36, 4716, 1459, 4839, 479, 939, 2740, 5, 37863, 29, 181, 25, 42, 3588, 16, 45, 10, 15898, 3588, 2156, 8, 14, 21, 5, 235, 1836, 13, 162, 479, 129, 631, 16, 5, 5933, 16, 10, 828, 18277, 202, 361, 795, 15, 16701, 13, 162, 4839, 2156, 5, 31622, 32, 818, 3229, 2156, 98, 939, 74, 224, 5, 3588, 16, 10, 13173, 326, 15888, 3989, 479, 3195, 16, 2721, 2156, 939, 2740, 2272, 25, 5, 97, 3195, 36, 36838, 4839, 630, 75, 33, 4716, 1459, 577, 479, 2272, 16, 4066, 2156, 8, 30228, 2156, 5, 10199, 16, 10262, 3793, 479, 939, 657, 5, 410, 1254, 11, 5, 29986, 479, 2299, 240, 10, 18052, 16979, 11689, 13, 42, 65, 479, 12312, 23246, 119, 389, 385, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'special_tokens_mask': [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]}
Length of input_ids: 133
--------------------

We can see that the token lists have different lengths

Let’s apply the collator

# extract only the required keys
inp_keys = tokenizer.model_input_names
iter1 = iter(tdc.main_ddict['train'])
_result=[]
for i in range(5):
    _inp = next(iter1)
    _result.append({k:_inp[k] for k in inp_keys})
out = tdc.data_collator(_result)
out.keys()
dict_keys(['input_ids', 'attention_mask', 'labels'])

Now all token lists have the same length, which is a multiple of 8

out['input_ids'].shape
torch.Size([5, 136])
out['input_ids'][:3,:]
tensor([[    0, 15841,  4716,  1459,   479,  2721,   299,  2156,   966, 50264,
          2139,  7886,  5137,   479,     5,  2721,  7457,  5780,  4855,   162,
             7,    42,   299,     8, 50264,   222,    45, 17534,  2115, 50264,
           479, 50264,  2156,     5,  2576,   910, 15315,    28,  9970,    98,
           444,    66,    15,   349,   526,    14,    24,    21, 38677, 27785,
         17138,  3031, 50264,    16,  1085,   101,     5,  2170, 25606,  2563,
             5,  1421,    18,  3701,    32,  2325,    11,   760,     9,    70,
             5,  1823, 10199, 50264, 29261, 50264,   910, 15315,   124,   479,
           959,  2156,     5, 10199,    16,  2721,  2156, 50264,  2564,    21,
          1969,    36,  1836,   132, 50264,   195,   128,   204,    22, 50264,
         13442, 23246,   479, 50264,  2156,     5, 50264,    16, 50264,     8,
           939,   657,     5, 50264,    98,   939,  1276, 50264,   185,    24,
             7,   127, 26090,     7,    22, 35043,   409, 50264,     5,    22,
         11954,    22, 50264,   258,  3391,     2],
        [    0, 15841,   479,    45,    25, 13055,    15, 50264,    36,  4716,
          1459,  4839,   479, 50264,  2740,     5, 37863, 50264,   181, 50264,
            42,  3588,    16,    45,    10, 15898,  3588, 50264,     8,    14,
            21,     5,   235,  1836,    13,   162,   479,   129,   631,    16,
             5,  5933,    16, 38152,   828, 18277,   202,   361,   795,    15,
         16701,    13,   162,  4839,  2156,     5, 31622,    32,   818,  3229,
          2156,    98,   939,    74, 50264,     5,  3588,    16,    10, 13173,
           326, 50264,  3989,   479,  3195,    16, 50264,  2156,   939,  2740,
          2272,    25,     5,    97,  3195,    36, 36838,  4839, 50264,    75,
            33,  4716,  1459, 50264, 50264,  2272,    16,  4066,  2156,     8,
         30228, 50264, 50264, 50264,    16, 10262,  3793,   479,   939, 50264,
             5,   410,  1254,    11,     5, 50264,   479,  2299,   240, 50264,
         18052, 16979, 11689,    13,    42, 50264,   479, 12312, 23246,   119,
           389,   385,     2,     1,     1,     1],
        [    0, 15841,   479,  1969, 50264,    13,    80, 50264,   479, 15983,
         10717, 39574,    47,   240, 50264,   216,    14,    42,  1907,     9,
         50264, 50264,     5,    65,    14,    40,   120,  6538,    36,   939,
          2162,     5, 50264,    65,  4839,   479,    24,    16,  2422,  7174,
         50264,  9869,  2156,    53,   939,    21,   129,   441,     7,   120,
            80, 15033,    66, 50264,    24, 50264, 50264,   222, 10397,    24,
         50264, 50264,  4925, 50264,    18,  1836,   142,   939,  1079, 47904,
            24, 50264, 50264,   172, 10601,     7,  3841,   479,   939,    21,
          2422,  5779, 50264,     5,  3568,    53, 10874,   145,   441,     7,
         50264,    24,   396,   864,    23, 50264, 50264,  6215,   479,     2,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1]])

The labels have also been constructed: positions whose label is not -100 are the masked tokens the model has to predict. To increase the number of masked tokens, increase mlm_prob

out['labels'][:3,:]
tensor([[ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,     5,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,    24,  -100,  -100,  -100,  -100, 18245,
          -100,   959,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
             5,  -100,  2564,  -100,  -100,  -100,  -100,  -100,  -100,  2563,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,     7,   946,     5,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,     5,  -100,  -100,
          -100,  -100,  -100,  -100,  2156,  -100,  -100,  -100,  -100,  2156,
          -100,  -100,  -100,  4839,  -100,  -100,  1318,  -100,   372,  -100,
          -100,  -100,  -100,  5780,  -100,  -100,  -100,     7,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,    22,  -100,  -100,
          -100,  -100,    15,  -100,  -100,  -100],
        [ -100, 15841,  -100,  -100,  -100,   765,  -100,   162,  -100,  -100,
          -100,  -100,  -100,   939,  -100,  -100,  -100,    29,  -100,    25,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  2156,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
             5,  -100,  -100,    10,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,   224,  -100,  -100,  -100,  -100,  -100,
          -100, 15888,  -100,  -100,  -100,  -100,  2721,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,   630,  -100,
          -100,  -100,  -100,   577,   479,  -100,  -100,  -100,  -100,  -100,
          -100,  2156,     5, 10199,  -100,  -100,  -100,  -100,  -100,   657,
          -100,  -100,  -100,  -100,  -100, 29986,  -100,  -100,  -100,    10,
          -100,  -100,  -100,  -100,  -100,    65,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100],
        [ -100,  -100,  -100,  -100, 39574,  -100,  -100, 15033,  -100,  -100,
          -100,  -100,  -100,  -100,     7,  -100,  -100,  -100,  -100,  -100,
         10199,    16,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  1104,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
             8,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,     9,  -100,   479,   939,  -100,  -100,  -100,
             8,    24,  -100,    24,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,   150,  7727,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,    59,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
           671,  -100,  -100,  -100,  -100,   127,   400,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100]])
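
As a rough check (not part of the library API), the fraction of positions that received a label should be close to mlm_prob; it is slightly lower here because special and padding tokens are never selected for masking:

masked = (out['labels'] != -100).sum().item()
total = out['labels'].numel()
print(f'{masked}/{total} positions carry a label ({masked/total:.1%})')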

If you apply padding in the tokenization step (by adjusting the max_length argument), no matter whether it’s line-by-line tokenization or not, the data collator will skip the padding step

dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val = dset.train_test_split(test_size=0.1,seed=42)
ddict_with_val['validation'] = ddict_with_val['test']
ddict_with_val['train'] = ddict_with_val['train'].to_iterable_dataset()
del ddict_with_val['test']

tdc = TextDataLMControllerStreaming(ddict_with_val,
                                    main_text='Review Text',
                                    filter_dict={'Review Text': lambda x: x is not None},
                                    metadatas=['Title','Division Name'],
                                    content_transformations=[text_normalize,str.lower],
                                    cols_to_keep=['Clothing ID','Review Text'],
                                    seed=42,
                                    batch_size=1024,
                                    verbose=False
                                    )
tdc.process_and_tokenize(tokenizer,line_by_line=False,max_length=100)
tdc.set_data_collator(is_mlm=True,mlm_prob=0.15)

Let’s apply the collator

# extract only the required keys
inp_keys = tokenizer.model_input_names
iter1 = iter(tdc.main_ddict['train'])
_result=[]
for i in range(5):
    _inp = next(iter1)
    _result.append({k:_inp[k] for k in inp_keys})
        

out = tdc.data_collator(_result)
out['input_ids'].shape
torch.Size([5, 100])
out['input_ids'][:2,:]
tensor([[    0, 15841,  4716,  1459,   479,  2721,   299,  2156,   966, 50264,
          2139,  7886,  5137,   479,     5,  2721,  7457,  5780,  4855,   162,
             7,    42,   299,     8,    24,   222,    45, 17534,  2115, 50264,
           479, 50264,  2156,     5,  2576,   910, 15315,    28,  9970,    98,
           444,    66,    15,   349,   526,    14,    24,    21, 38677, 27785,
         50264,  3031, 50264,    16,  1085,   101,     5,  2170, 25606, 41316,
             5,  1421,    18,  3701,    32,  2325,    11,   760,     9,    70,
             5,  1823, 10199, 50264, 17204, 50264,   910, 15315,   124,   479,
           959,  2156,     5, 10199,    16,  2721,  2156, 50264,  2564,    21,
          1969,    36,  1836,   132, 50264,   195,   128,   204,    22, 50264],
        [13442, 23246,   479, 50264,  2156,     5, 50264,    16, 23781,     8,
           939,   657,     5,  5780,    98,   939,  1276, 50264,   185,    24,
             7,   127, 26090,     7,    22, 35043,   409, 50264,     5,    22,
         11954,    22, 50264,   258,  3391,     2,     0, 50264,   479,    45,
            25, 50264,    15, 44224,    36,  4716,  1459,  4839,   479, 50264,
          2740,     5, 37863, 50264,   181,    25,    42,  3588,    16,    45,
            10, 15898,  3588, 50264,     8,    14,    21,     5,   235,  1836,
            13,   162,   479,   129,   631,    16, 50264,  5933,    16, 50264,
           828, 18277,   202,   361,   795,    15, 16701,    13,   162,  4839,
          2156,     5, 31622,    32,   818,  3229,  2156,    98,   939,    74]])
out['labels'][:2,:]
tensor([[ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,     5,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,    24,  -100,  -100,  -100,  -100, 18245,
          -100,   959,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
             5,  -100,  2564,  -100,  -100,  -100,  -100,  -100,  -100,  2563,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,     7,   946,     5,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,     5,  -100,  -100,
          -100,  -100,  -100,  -100,  2156,  -100,  -100,  -100,  -100,  2156],
        [ -100,  -100,  -100,  4839,  -100,  -100,  1318,  -100,   372,  -100,
          -100,  -100,  -100,  5780,  -100,  -100,  -100,     7,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,    22,  -100,  -100,
          -100,  -100,    15,  -100,  -100,  -100,  -100, 15841,  -100,  -100,
          -100,   765,  -100,   162,  -100,  -100,  -100,  -100,  -100,   939,
          -100,  -100,  -100,    29,  -100,    25,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  2156,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,     5,  -100,  -100,    10,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100]])

Since we are using the concatenation-of-tokenization technique, one smart thing that HuggingFace’s DataCollatorForLanguageModeling (the data collator we use) does is allow masking at every position. This is in contrast to the previous cases (with line-by-line tokenization), where there is no masking near the end of each sequence, because those trailing tokens are padding tokens.
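As a quick sanity check on the batch above (a minimal sketch, not part of the library), you can confirm that the concatenated chunks contain no padding and that roughly mlm_prob of the positions were selected as prediction targets:

# with concatenation-of-tokenization every chunk has the full max_length, so there is no padding,
# and every non-special position is a candidate for masking
print((out['attention_mask'] == 1).all())      # True => no padding in the batch
print((out['labels'] != -100).float().mean())  # roughly mlm_prob (0.15), excluding special tokens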

For causal language modeling

from transformers import AutoTokenizer
from tokenizers import processors

Let’s define our GPT2 tokenizer

tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer
GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
    50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}

GPT2 does not add start/end-of-sentence tokens by default:

print(tokenizer.convert_ids_to_tokens(tokenizer("this is a text. That is a second text.But there's a third one")['input_ids']))
['this', 'Ġis', 'Ġa', 'Ġtext', '.', 'ĠThat', 'Ġis', 'Ġa', 'Ġsecond', 'Ġtext', '.', 'But', 'Ġthere', "'s", 'Ġa', 'Ġthird', 'Ġone']

If you want to perform concatenation-of-tokenization and you want your causal LM to differentiate between sentences, you can add a special token to separate them, as follows:

tokenizer._tokenizer.post_processor = processors.TemplateProcessing(
    single="$A " + tokenizer.eos_token,
    special_tokens=[(tokenizer.eos_token, tokenizer.eos_token_id)],
)
tokenizer.pad_token = tokenizer.eos_token
print(tokenizer.convert_ids_to_tokens(tokenizer("this is a text. That is a second text.But there's a third one")['input_ids']))
['this', 'Ġis', 'Ġa', 'Ġtext', '.', 'ĠThat', 'Ġis', 'Ġa', 'Ġsecond', 'Ġtext', '.', 'But', 'Ġthere', "'s", 'Ġa', 'Ġthird', 'Ġone', '<|endoftext|>']

With this modified tokenizer, let’s perform concatenation-of-tokenization using GPT2

dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val = dset.train_test_split(test_size=0.1,seed=42)
ddict_with_val['validation'] = ddict_with_val['test']
ddict_with_val['train'] = ddict_with_val['train'].to_iterable_dataset()
del ddict_with_val['test']

tdc = TextDataLMControllerStreaming(ddict_with_val,
                                    main_text='Review Text',
                                    filter_dict={'Review Text': lambda x: x is not None},
                                    metadatas=['Title','Division Name'],
                                    content_transformations=[text_normalize,str.lower],
                                    seed=42,
                                    batch_size=1024,
                                    verbose=False
                                    )
tdc.process_and_tokenize(tokenizer,line_by_line=False,max_length=100)

Since it’s causal language modeling, let’s turn off is_mlm

tdc.set_data_collator(is_mlm=False)

Let’s apply the collator

iter1 = iter(tdc.main_ddict['train'])
out = tdc.data_collator([next(iter1) for i in range(5)]) # simulation with batch size 5
out['input_ids'].shape
torch.Size([5, 100])
out['input_ids'][:2,:]
tensor([[24622,  4273,   578,   764,  4950,  1353,   837,  2861,   262,  3306,
          7894,  3255,   764,   262,  4950, 10758,  3601,  9859,   502,   284,
           428,  1353,   290,   340,   750,   407,  6703,  2402, 14507,   764,
          2158,   837,   262,  4220,   374, 18137,   307,  3353,   523,  1290,
           503,   319,  1123,  1735,   326,   340,   373, 47623,  5145,   262,
          4036,  4197,   318,  2147,   588,   262,  4286,  2162,  4084,   262,
          2746,   338,  5101,   389,  4624,   287,  2166,   286,   477,   262,
          3131,  9664,   284,  1745,   262,   374, 18137,   736,   764,  2158,
           837,   262,  9664,   318,  4950,   837,   262,  4197,   373,  2818,
           357,  2546,   362,   837,   642,   705,   604,   366,   837, 15696],
        [15785,   764,  1267,   837,   262,  3081,   318,  1049,   290,  1312,
          1842,   262,  3601,   523,  1312,  3066,   284,  1011,   340,   284,
           616, 35280,   284,   366, 34249,  1497,   366,   262,   366, 12098,
           366,   319,  1111, 33721, 50256, 24622,   764,   407,   355,  1790,
           319,   502,   357,  4273,   578,  1267,   764,  1312,  6149,   262,
         31383,    82,   279,   355,   428,  6576,   318,   407,   257, 18235,
          6576,   837,   290,   326,   373,   262,   826,  2546,   329,   502,
           764,   691,  1517,   318,   262,  4129,   318,   257,  1643, 31402,
           991,   860,  2793,   319, 31134,   329,   502,  1267,   837,   262,
         29552,   389,  2048,  5381,   837,   523,  1312,   561,   910,   262]])
out['labels'][:2,:]
tensor([[24622,  4273,   578,   764,  4950,  1353,   837,  2861,   262,  3306,
          7894,  3255,   764,   262,  4950, 10758,  3601,  9859,   502,   284,
           428,  1353,   290,   340,   750,   407,  6703,  2402, 14507,   764,
          2158,   837,   262,  4220,   374, 18137,   307,  3353,   523,  1290,
           503,   319,  1123,  1735,   326,   340,   373, 47623,  5145,   262,
          4036,  4197,   318,  2147,   588,   262,  4286,  2162,  4084,   262,
          2746,   338,  5101,   389,  4624,   287,  2166,   286,   477,   262,
          3131,  9664,   284,  1745,   262,   374, 18137,   736,   764,  2158,
           837,   262,  9664,   318,  4950,   837,   262,  4197,   373,  2818,
           357,  2546,   362,   837,   642,   705,   604,   366,   837, 15696],
        [15785,   764,  1267,   837,   262,  3081,   318,  1049,   290,  1312,
          1842,   262,  3601,   523,  1312,  3066,   284,  1011,   340,   284,
           616, 35280,   284,   366, 34249,  1497,   366,   262,   366, 12098,
           366,   319,  1111, 33721,  -100, 24622,   764,   407,   355,  1790,
           319,   502,   357,  4273,   578,  1267,   764,  1312,  6149,   262,
         31383,    82,   279,   355,   428,  6576,   318,   407,   257, 18235,
          6576,   837,   290,   326,   373,   262,   826,  2546,   329,   502,
           764,   691,  1517,   318,   262,  4129,   318,   257,  1643, 31402,
           991,   860,  2793,   319, 31134,   329,   502,  1267,   837,   262,
         29552,   389,  2048,  5381,   837,   523,  1312,   561,   910,   262]])

For CLM, the labels are essentially the same as input_ids; the only difference is that padding tokens (here the eos token, since we set pad_token = eos_token) are replaced with -100 so the loss ignores them. From the HuggingFace documentation:

`DataCollatorForLanguageModeling` will take care of creating the language model labels — in causal language modeling the inputs serve as labels too (just shifted by one element), and this data collator creates them on the fly during training.
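To make the “shifted by one element” point concrete, here is a minimal sketch (an assumption for illustration, not part of the library or of the collator) of how a causal LM typically computes its loss from these labels; the shift happens inside the model’s forward pass:

import torch.nn.functional as F

def causal_lm_loss(logits, labels):
    # logits: (batch, seq_len, vocab_size); labels: (batch, seq_len), identical to input_ids
    # predict token t+1 from positions <= t: drop the last logit and the first label
    shift_logits = logits[:, :-1, :].contiguous()
    shift_labels = labels[:, 1:].contiguous()
    return F.cross_entropy(shift_logits.view(-1, shift_logits.size(-1)),
                           shift_labels.view(-1),
                           ignore_index=-100)  # positions labeled -100 (e.g. padding) are ignored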

4. Save and Load TextDataLMControllerStreaming


source

TextDataLMControllerStreaming.save_as_pickles

 TextDataLMControllerStreaming.save_as_pickles (fname,
                                                parent='pickle_files')
Type Default Details
fname Name of the pickle file
parent str pickle_files Parent folder

source

TextDataControllerStreaming.from_pickle

 TextDataControllerStreaming.from_pickle (fname, parent='pickle_files')
Type Default Details
fname Name of the pickle file
parent str pickle_files Parent folder

A TextDataLMControllerStreaming object can be saved and loaded with ease. This is especially useful after text processing and/or tokenization have been done.

from datasets import disable_caching
disable_caching()
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val = dset.train_test_split(test_size=0.1,seed=42)
ddict_with_val['validation'] = ddict_with_val['test']
ddict_with_val['train'] = ddict_with_val['train'].to_iterable_dataset()
del ddict_with_val['test']

tdc = TextDataLMControllerStreaming(ddict_with_val,
                                    main_text='Review Text',
                                    filter_dict={'Review Text': lambda x: x is not None},
                                    metadatas=['Title','Division Name'],
                                    content_transformations=[text_normalize,str.lower],
                                    seed=42,
                                    batch_size=1024,
                                    verbose=False
                                    )
tdc.process_and_tokenize(tokenizer,line_by_line=True,max_length=-1)

tdc.set_data_collator(is_mlm=True,mlm_prob=0.15)
tdc.save_as_pickles('my_lm_tdc')

Load back our object

tdc2 = TextDataLMControllerStreaming.from_pickle('my_lm_tdc')

You can still access all of its attributes: data, preprocessing functions, transformations, and so on.

tdc2.main_ddict
DatasetDict({
    train: IterableDataset({
        features: Unknown,
        n_shards: 1
    })
    validation: Dataset({
        features: ['Title', 'Review Text', 'Division Name', 'input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 2253
    })
})
tdc2.filter_dict,tdc2.content_tfms
({'Review Text': <function __main__.<lambda>(x)>},
 [<function underthesea.pipeline.text_normalize.text_normalize(text, tokenizer='underthesea')>,
  <method 'lower' of 'str' objects>])