import pandas as pd
import numpy as np
from that_nlp_library.text_transformation import *
from that_nlp_library.text_augmentation import *
from importlib.machinery import SourceFileLoader
from datasets import load_dataset
import os
Text Main For Language Model - Streaming
TextDataLMController
Class TextDataLMControllerStreaming
TextDataLMControllerStreaming
TextDataLMControllerStreaming (inp, main_text:str, filter_dict={}, metadatas=[], process_metas=True, metas_sep='.', content_transformations=[], seed=None, batch_size=1024, num_proc=1, cols_to_keep=None, verbose=True)
Initialize self. See help(type(self)) for accurate signature.
| | Type | Default | Details |
|---|---|---|---|
| inp | HuggingFace Dataset or DatasetDict | | |
| main_text | str | | Name of the main text column |
| filter_dict | dict | {} | A dictionary: {feature: filtering_function_for_that_feature} |
| metadatas | list | [] | Names of the metadata columns |
| process_metas | bool | True | Whether to do simple text processing on the chosen metadatas |
| metas_sep | str | . | Separator for concatenating multiple metadatas |
| content_transformations | list | [] | A list of text transformations |
| seed | NoneType | None | Random seed |
| batch_size | int | 1024 | Transformation + tokenization batch size |
| num_proc | int | 1 | Number of processes for multiprocessing |
| cols_to_keep | NoneType | None | Columns to keep after all processing |
| verbose | bool | True | Whether to print processing information |
1. Load data + Basic use case
Dataset source: https://www.kaggle.com/datasets/kavita5/review_ecommerce
With line-by-line tokenization
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val = dset.train_test_split(test_size=0.1)
ddict_with_val['validation'] = ddict_with_val['test']
ddict_with_val['train'] = ddict_with_val['train'].to_iterable_dataset()
del ddict_with_val['test']
ddict_with_val
DatasetDict({
train: IterableDataset({
features: ['Clothing ID', 'Age', 'Title', 'Review Text', 'Rating', 'Recommended IND', 'Positive Feedback Count', 'Division Name', 'Department Name', 'Class Name'],
n_shards: 1
})
validation: Dataset({
features: ['Clothing ID', 'Age', 'Title', 'Review Text', 'Rating', 'Recommended IND', 'Positive Feedback Count', 'Division Name', 'Department Name', 'Class Name'],
num_rows: 2349
})
})
tdc = TextDataLMControllerStreaming(ddict_with_val,
filter_dict={'Review Text': lambda x: x is not None},
main_text='Review Text',
                     )
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('roberta-base')
TextDataLMControllerStreaming.process_and_tokenize
TextDataLMControllerStreaming.process_and_tokenize (tokenizer, max_length=None, tok_num_proc=None, line_by_line=True, stride=None)
| | Type | Default | Details |
|---|---|---|---|
| tokenizer | Tokenizer (preferably from HuggingFace) | | |
| max_length | NoneType | None | pad to model’s allowed max length (default is max_sequence_length). Use -1 for no padding at all |
| tok_num_proc | NoneType | None | Number of processes for tokenization |
| line_by_line | bool | True | Whether to tokenize each sentence separately, or concatenate them |
| stride | NoneType | None | Option to do striding when line_by_line is False |
tdc.process_and_tokenize(tokenizer,line_by_line=True)
-------------------- Data Filtering --------------------
Done
----- Metadata Simple Processing & Concatenating to Main Content -----
Done
-------------------- Dropping unused features --------------------
Done
----- Performing Content Transformation and Tokenization on Validation Set -----
Done
----- Creating a generator for content transformation and tokenization on Train set -----
Done
tdc.main_ddict
DatasetDict({
train: IterableDataset({
features: Unknown,
n_shards: 1
})
validation: Dataset({
features: ['Review Text', 'input_ids', 'attention_mask', 'special_tokens_mask'],
num_rows: 2260
})
})
for i,v in enumerate(tdc.main_ddict['validation']):
if i==1:break
print(f"Input ids: {v['input_ids']}\nDecoded: {tokenizer.decode(v['input_ids'])}\nAttention Mask: {v['attention_mask']}")Input ids: [0, 17781, 129, 1381, 15, 53, 24, 16, 10, 182, 11962, 16576, 8, 939, 206, 40, 28, 3473, 7, 3568, 4, 939, 657, 5, 18632, 9, 5, 1468, 4, 939, 109, 2813, 24, 376, 11, 97, 8089, 4, 24, 16, 10, 410, 19351, 1468, 53, 45, 350, 203, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Decoded: <s>Have only tried on but it is a very cute skirt and i think will be comfortable to wear. i love the texture of the material. i do wish it came in other colors. it is a little heavier material but not too much.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
Attention Mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
for i,v in enumerate(tdc.main_ddict['train']):
if i==1:break
print(f"Input ids: {v['input_ids']}\n\nDecoded: {tokenizer.decode(v['input_ids'])}\n\nAttention Mask: {v['attention_mask']}")Input ids: [0, 713, 23204, 21, 11, 12, 8005, 11, 10, 55, 7974, 3195, 24943, 8, 21, 98, 1256, 4, 77, 939, 1381, 24, 15, 939, 5324, 10, 367, 383, 35, 5, 23193, 8, 1318, 32, 2299, 89, 111, 42, 16, 10, 2579, 2125, 190, 23, 455, 12, 17212, 4, 5, 13422, 16, 14, 24, 1237, 182, 739, 4, 939, 437, 3700, 10, 501, 73, 1549, 8, 10, 1836, 3023, 462, 11, 144, 6215, 1964, 6, 8, 190, 5, 739, 21, 10, 828, 929, 219, 4, 5, 97, 631, 21, 14, 5, 21764, 9, 42, 23204, 32, 1256, 251, 36, 463, 939, 33, 251, 3701, 322, 528, 7, 5, 10342, 847, 9, 5, 24150, 47, 1705, 75, 190, 269, 740, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Decoded: <s>This sweater was in-store in a more neutral color palette and was so pretty. when i tried it on i noticed a few things: the versatility and quality are definitely there - this is a nice piece even at full-price. the downside is that it runs very large. i'm typically a 14/16 and a size xl in most retailer items, and even the large was a bit roomy. the other thing was that the sleeves of this sweater are pretty long (and i have long arms). due to the bell cut of the sleeve you couldn't even really c</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
Attention Mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
2. Filtering + Metadatas + Content Transformation + Tokenization
Define our tokenization
from transformers import RobertaTokenizer
from underthesea import text_normalize
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
from that_nlp_library.text_main_lm import TextDataLMController
a) Option 1: Tokenize our corpus line-by-line
With no padding
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val = dset.train_test_split(test_size=0.1,seed=42)
ddict_with_val['validation'] = ddict_with_val['test']
ddict_with_val['train'] = ddict_with_val['train'].to_iterable_dataset()
del ddict_with_val['test']
tdc = TextDataLMControllerStreaming(ddict_with_val,
main_text='Review Text',
filter_dict={'Review Text': lambda x: x is not None},
metadatas=['Title','Division Name'],
content_transformations=[text_normalize,str.lower],
cols_to_keep=['Clothing ID','Review Text'],
seed=42,
batch_size=1024,
verbose=False
                                   )
tdc.process_and_tokenize(tokenizer,line_by_line=True,max_length=-1)
print(tokenizer.decode(next(iter(tdc.main_ddict['train']))['input_ids']))
print()
print(tokenizer.decode(tdc.main_ddict['validation']['input_ids'][0]))<s>general petite. beautiful top, worth the necessary tailoring. the beautiful bold print drew me to this top and it did not disappoint upon receipt. however, the bottom ruffle belled so far out on each side that it was laughable! the actual fit is nothing like the picture ; clearly the model's arms are placed in front of all the extra fabric to hold the ruffle back. however, the fabric is beautiful, the fit was perfect ( size 2, 5'4 ", 106 lbs. ), the quality is great and i love the print so i decided to take it to my tailor to " sew away " the " wings " on both si</s>
<s>general. soft, feminine and fun pockets!. i love this tunic. purchased the dark orange in medium ( i am 5'9 and 140 lbs ). tried the small and almost kept it but i felt seams around my arm pits a tad, so went with the medium and glad i did - this top should be comfortable. feels very fall and perfect for casual get-togethers and running around town. only comment is that it is rayon... and for me anyway rayon doesn't wash too well - so we shall see how this one fairs.</s>
for i,v in enumerate(tdc.main_ddict['train']):
if i%100==0:
print(i)
if i==1024-1:
break
    pass
0
100
200
300
400
500
600
700
800
900
1000
CPU times: user 884 ms, sys: 1.07 ms, total: 885 ms
Wall time: 877 ms
Compare to non-streamed version
dset2 = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val2 = dset2.train_test_split(test_size=0.1,seed=42)
ddict_with_val2['validation'] = ddict_with_val2['test']
del ddict_with_val2['test']
tdc2 = TextDataLMController(ddict_with_val2,
main_text='Review Text',
filter_dict={'Review Text': lambda x: x is not None},
metadatas=['Title','Division Name'],
content_transformations=[text_normalize,str.lower],
cols_to_keep=['Clothing ID','Review Text'],
seed=42,
batch_size=1024,
verbose=False
)
tdc2.process_and_tokenize(tokenizer,line_by_line=True,max_length=-1,shuffle_trn=False)
# check whether train sets are the same
assert len(list(tdc.main_ddict['train']))==len(tdc2.main_ddict['train'])
iter1 = iter(tdc.main_ddict['train'])
iter2 = iter(tdc2.main_ddict['train'])
for a,b in zip(iter1,iter2):
    assert a['input_ids']==b['input_ids']
print(a)
print('-'*20)
print(b){'Clothing ID': 1056, 'Review Text': 'general . perfect pant . I picked these up the other day looking for a good jeans alternative. i love them. they are the perfect fit of slim but not skinny. i went with my normal size (26) and so far after one wear, they are still in good shape. a little bit of stretch, but not too much. the moss color is so crisp and goes with a lot. they will be perfect for transitioning into fall.', 'input_ids': [0, 15841, 479, 1969, 16259, 479, 939, 2738, 209, 62, 5, 97, 183, 546, 13, 10, 205, 10844, 3626, 479, 939, 657, 106, 479, 51, 32, 5, 1969, 2564, 9, 11875, 53, 45, 22877, 479, 939, 439, 19, 127, 2340, 1836, 36, 973, 4839, 8, 98, 444, 71, 65, 3568, 2156, 51, 32, 202, 11, 205, 3989, 479, 10, 410, 828, 9, 4140, 2156, 53, 45, 350, 203, 479, 5, 40711, 3195, 16, 98, 17766, 8, 1411, 19, 10, 319, 479, 51, 40, 28, 1969, 13, 26135, 88, 1136, 479, 2], 'special_tokens_mask': [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
--------------------
{'Clothing ID': 1056, 'Review Text': 'general . perfect pant . i picked these up the other day looking for a good jeans alternative . i love them . they are the perfect fit of slim but not skinny . i went with my normal size ( 26 ) and so far after one wear , they are still in good shape . a little bit of stretch , but not too much . the moss color is so crisp and goes with a lot . they will be perfect for transitioning into fall .', 'input_ids': [0, 15841, 479, 1969, 16259, 479, 939, 2738, 209, 62, 5, 97, 183, 546, 13, 10, 205, 10844, 3626, 479, 939, 657, 106, 479, 51, 32, 5, 1969, 2564, 9, 11875, 53, 45, 22877, 479, 939, 439, 19, 127, 2340, 1836, 36, 973, 4839, 8, 98, 444, 71, 65, 3568, 2156, 51, 32, 202, 11, 205, 3989, 479, 10, 410, 828, 9, 4140, 2156, 53, 45, 350, 203, 479, 5, 40711, 3195, 16, 98, 17766, 8, 1411, 19, 10, 319, 479, 51, 40, 28, 1969, 13, 26135, 88, 1136, 479, 2], 'special_tokens_mask': [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
# check whether validation set is the same
assert len(list(tdc.main_ddict['validation']))==len(tdc2.main_ddict['validation'])
iter1 = iter(tdc.main_ddict['validation'])
iter2 = iter(tdc2.main_ddict['validation'])
for a,b in zip(iter1,iter2):
    assert a==b
With padding
(set max_length to None if you want to pad to model’s maximum sequence length)
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val = dset.train_test_split(test_size=0.1,seed=42)
ddict_with_val['validation'] = ddict_with_val['test']
ddict_with_val['train'] = ddict_with_val['train'].to_iterable_dataset()
del ddict_with_val['test']
tdc = TextDataLMControllerStreaming(ddict_with_val,
main_text='Review Text',
filter_dict={'Review Text': lambda x: x is not None},
metadatas=['Title','Division Name'],
content_transformations=[text_normalize,str.lower],
cols_to_keep=['Clothing ID','Review Text'],
seed=42,
batch_size=1024,
verbose=True
)
tdc.process_and_tokenize(tokenizer,line_by_line=True,max_length=256,tok_num_proc=1)
-------------------- Data Filtering --------------------
Done
----- Metadata Simple Processing & Concatenating to Main Content -----
Done
-------------------- Dropping unused features --------------------
Done
----- Performing Content Transformation and Tokenization on Validation Set -----
Done
----- Creating a generator for content transformation and tokenization on Train set -----
Done
print(tokenizer.decode(next(iter(tdc.main_ddict['train']))['input_ids']))
print()
print(tokenizer.decode(tdc.main_ddict['validation']['input_ids'][0]))<s>general petite. beautiful top, worth the necessary tailoring. the beautiful bold print drew me to this top and it did not disappoint upon receipt. however, the bottom ruffle belled so far out on each side that it was laughable! the actual fit is nothing like the picture ; clearly the model's arms are placed in front of all the extra fabric to hold the ruffle back. however, the fabric is beautiful, the fit was perfect ( size 2, 5'4 ", 106 lbs. ), the quality is great and i love the print so i decided to take it to my tailor to " sew away " the " wings " on both si</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
<s>general. soft, feminine and fun pockets!. i love this tunic. purchased the dark orange in medium ( i am 5'9 and 140 lbs ). tried the small and almost kept it but i felt seams around my arm pits a tad, so went with the medium and glad i did - this top should be comfortable. feels very fall and perfect for casual get-togethers and running around town. only comment is that it is rayon... and for me anyway rayon doesn't wash too well - so we shall see how this one fairs.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
Compare to non-streamed version
dset2 = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val2 = dset2.train_test_split(test_size=0.1,seed=42)
ddict_with_val2['validation'] = ddict_with_val2['test']
del ddict_with_val2['test']
tdc2 = TextDataLMController(ddict_with_val2,
main_text='Review Text',
filter_dict={'Review Text': lambda x: x is not None},
metadatas=['Title','Division Name'],
content_transformations=[text_normalize,str.lower],
cols_to_keep=['Clothing ID','Review Text'],
seed=42,
batch_size=1024,
verbose=False
)
tdc2.process_and_tokenize(tokenizer,line_by_line=True,max_length=256,shuffle_trn=False,tok_num_proc=1)
# check whether train sets are the same
assert len(list(tdc.main_ddict['train']))==len(tdc2.main_ddict['train'])
iter1 = iter(tdc.main_ddict['train'])
iter2 = iter(tdc2.main_ddict['train'])
for a,b in zip(iter1,iter2):
    assert a==b
# check whether validation set is the same
assert len(list(tdc.main_ddict['validation']))==len(tdc2.main_ddict['validation'])
iter1 = iter(tdc.main_ddict['validation'])
iter2 = iter(tdc2.main_ddict['validation'])
for a,b in zip(iter1,iter2):
    assert a==b
b) Option 2: Tokenize every text, then concatenate them together before splitting them into smaller parts.
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val = dset.train_test_split(test_size=0.1,seed=42)
ddict_with_val['validation'] = ddict_with_val['test']
ddict_with_val['train'] = ddict_with_val['train'].to_iterable_dataset()
del ddict_with_val['test']
tdc = TextDataLMControllerStreaming(ddict_with_val,
main_text='Review Text',
filter_dict={'Review Text': lambda x: x is not None},
metadatas=['Title','Division Name'],
content_transformations=[text_normalize,str.lower],
seed=42,
batch_size=1024,
verbose=False
)
tdc.process_and_tokenize(tokenizer,line_by_line=False,max_length=256,tok_num_proc=1)
print(tokenizer.decode(next(iter(tdc.main_ddict['train']))['input_ids']))
print()
print(tokenizer.decode(tdc.main_ddict['validation']['input_ids'][0]))<s>general petite. beautiful top, worth the necessary tailoring. the beautiful bold print drew me to this top and it did not disappoint upon receipt. however, the bottom ruffle belled so far out on each side that it was laughable! the actual fit is nothing like the picture ; clearly the model's arms are placed in front of all the extra fabric to hold the ruffle back. however, the fabric is beautiful, the fit was perfect ( size 2, 5'4 ", 106 lbs. ), the quality is great and i love the print so i decided to take it to my tailor to " sew away " the " wings " on both si</s><s>general. not as short on me ( petite ). i ordered the xxs p as this dress is not a fitted dress, and that was the right size for me. only thing is the length is a bit linger still 9 lower on calf for me ), the straps are almost tight, so i would say the dress is a reversed taper shape. color is beautiful, i ordered green as the other color ( plum ) doesn't have petite available. green is rich, and classy, the fabric is surprisingly soft. i love the little details in the velvet. definitely need a
<s>general. soft, feminine and fun pockets!. i love this tunic. purchased the dark orange in medium ( i am 5'9 and 140 lbs ). tried the small and almost kept it but i felt seams around my arm pits a tad, so went with the medium and glad i did - this top should be comfortable. feels very fall and perfect for casual get-togethers and running around town. only comment is that it is rayon... and for me anyway rayon doesn't wash too well - so we shall see how this one fairs.</s><s>general petite. a new staple!. tried these on out of sheer curiosity -- i've got a long torso & was pleasantly surprised how flattering they are! they manage to look flowing & sleek without shortening the legs. took a size 6 with my 27 " waist, 37 " hips. it's a bit of a generous fit, especially around the waist, but they're extremely comfortable & have room to tuck tops into. i have the cowled sweater tank in gray & it looks fantastic over these! couldn't resist getting both the rust and black. perfect for a dressy casual look</s><s>general. maybe swing is for me!. i love swing dresses but they never seem
for i,v in enumerate(tdc.main_ddict['train']):
if i%100==0:
print(i)
if i==1024-1:
break
    pass
0
100
200
300
400
500
600
700
800
900
1000
CPU times: user 10.5 s, sys: 28.4 ms, total: 10.5 s
Wall time: 10.5 s
Compare to non-streamed version
dset2 = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val2 = dset2.train_test_split(test_size=0.1,seed=42)
ddict_with_val2['validation'] = ddict_with_val2['test']
del ddict_with_val2['test']
tdc2 = TextDataLMController(ddict_with_val2,
main_text='Review Text',
filter_dict={'Review Text': lambda x: x is not None},
metadatas=['Title','Division Name'],
content_transformations=[text_normalize,str.lower],
seed=42,
batch_size=1024,
verbose=False
)
tdc2.process_and_tokenize(tokenizer,line_by_line=False,max_length=256,shuffle_trn=False,tok_num_proc=1)
# check whether train sets are the same
assert len(list(tdc.main_ddict['train']))==len(tdc2.main_ddict['train'])
iter1 = iter(tdc.main_ddict['train'])
iter2 = iter(tdc2.main_ddict['train'])
for a,b in zip(iter1,iter2):
    assert a==b
# check whether validation set is the same
assert len(list(tdc.main_ddict['validation']))==len(tdc2.main_ddict['validation'])
iter1 = iter(tdc.main_ddict['validation'])
iter2 = iter(tdc2.main_ddict['validation'])
for a,b in zip(iter1,iter2):
    assert a==b
c) Striding (For Concatenation of tokens)
If your sentences (or paragraphs) are longer than max_length, they will be broken apart after concatenation, so a long paragraph can end up incomplete in meaning. Striding is a way to partially preserve the sentence’s meaning by carrying part of the previous chunk over into the next one. We will demonstrate it with an example, which you can compare with the previous one (without striding) to see the differences.
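Before the library-level demo below, here is a minimal sketch of the striding idea using a plain (fast) HuggingFace tokenizer on a toy string. It only illustrates the overlap mechanism via return_overflowing_tokens and stride; it is not the controller’s internal implementation, and the max_length/stride values are arbitrary.
# Minimal striding illustration with a plain fast tokenizer (assumption: roberta-base, toy values).
# Consecutive chunks overlap by `stride` tokens.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained('roberta-base')
text = ("striding keeps a bit of the previous chunk at the start of the next one, "
        "so a long review is not cut off without any context")

enc = tok(text,
          max_length=16,                   # each chunk holds at most 16 tokens
          truncation=True,
          stride=5,                        # overlap between consecutive chunks
          return_overflowing_tokens=True)  # return every chunk, not just the first

for chunk in enc['input_ids']:
    print(tok.decode(chunk))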
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val = dset.train_test_split(test_size=0.1,seed=42)
ddict_with_val['validation'] = ddict_with_val['test']
ddict_with_val['train'] = ddict_with_val['train'].to_iterable_dataset()
del ddict_with_val['test']
tdc = TextDataLMControllerStreaming(ddict_with_val,
main_text='Review Text',
filter_dict={'Review Text': lambda x: x is not None},
metadatas=['Title','Division Name'],
content_transformations=[text_normalize,str.lower],
seed=42,
batch_size=1024,
verbose=False
                                   )
tdc.process_and_tokenize(tokenizer,line_by_line=False,max_length=100,stride=20,tok_num_proc=1)
# Stride is 20, meaning for the next entry, we go back 20 tokens
for i,v in enumerate(tdc.main_ddict['train']):
if i==2: break
print(tokenizer.decode(v['input_ids']))
print('-'*20)<s>general petite. beautiful top, worth the necessary tailoring. the beautiful bold print drew me to this top and it did not disappoint upon receipt. however, the bottom ruffle belled so far out on each side that it was laughable! the actual fit is nothing like the picture ; clearly the model's arms are placed in front of all the extra fabric to hold the ruffle back. however, the fabric is beautiful, the fit was perfect ( size 2, 5'4 ",
--------------------
however, the fabric is beautiful, the fit was perfect ( size 2, 5'4 ", 106 lbs. ), the quality is great and i love the print so i decided to take it to my tailor to " sew away " the " wings " on both si</s><s>general. not as short on me ( petite ). i ordered the xxs p as this dress is not a fitted dress, and that was the right size for me. only thing is the length is a
--------------------
print(tokenizer.decode(tdc.main_ddict['validation']['input_ids'][0]))
print(tokenizer.decode(tdc.main_ddict['validation']['input_ids'][1]))<s>general. soft, feminine and fun pockets!. i love this tunic. purchased the dark orange in medium ( i am 5'9 and 140 lbs ). tried the small and almost kept it but i felt seams around my arm pits a tad, so went with the medium and glad i did - this top should be comfortable. feels very fall and perfect for casual get-togethers and running around town. only comment is that it is rayon... and for me anyway rayon doesn
running around town. only comment is that it is rayon... and for me anyway rayon doesn't wash too well - so we shall see how this one fairs.</s><s>general petite. a new staple!. tried these on out of sheer curiosity -- i've got a long torso & was pleasantly surprised how flattering they are! they manage to look flowing & sleek without shortening the legs. took a size 6 with my 27 " waist, 37 " hips. it's a bit
For the second entry, we can see it starts with the last 20 tokens of the previous entry
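We can also verify this overlap programmatically. The quick check below reuses the streaming tdc and tokenizer defined above, and assumes the first chunk is a full max_length window (which it is here, since the concatenated corpus is much longer than 100 tokens).
# Check that the second train chunk starts with the last `stride` (20) tokens of the first chunk
it = iter(tdc.main_ddict['train'])
first, second = next(it), next(it)

assert first['input_ids'][-20:] == second['input_ids'][:20]
print(tokenizer.decode(second['input_ids'][:20]))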
Compare to non-streamed version
dset2 = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val2 = dset2.train_test_split(test_size=0.1,seed=42)
ddict_with_val2['validation'] = ddict_with_val2['test']
del ddict_with_val2['test']
tdc2 = TextDataLMController(ddict_with_val2,
main_text='Review Text',
filter_dict={'Review Text': lambda x: x is not None},
metadatas=['Title','Division Name'],
content_transformations=[text_normalize,str.lower],
seed=42,
batch_size=1024,
verbose=False
)
tdc2.process_and_tokenize(tokenizer,line_by_line=False,max_length=100,shuffle_trn=False,
                             stride=20,tok_num_proc=1)
# check whether train sets are the same
assert len(list(tdc.main_ddict['train']))==len(tdc2.main_ddict['train'])
iter1 = iter(tdc.main_ddict['train'])
iter2 = iter(tdc2.main_ddict['train'])
for a,b in zip(iter1,iter2):
    assert a==b
# check whether validation set is the same
assert len(list(tdc.main_ddict['validation']))==len(tdc2.main_ddict['validation'])
iter1 = iter(tdc.main_ddict['validation'])
iter2 = iter(tdc2.main_ddict['validation'])
for a,b in zip(iter1,iter2):
    assert a==b
3. Data Collator
from underthesea import text_normalize
from transformers import AutoTokenizer
For masked language model
tokenizer = AutoTokenizer.from_pretrained('roberta-base')
Let’s define our text controller first
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val = dset.train_test_split(test_size=0.1,seed=42)
ddict_with_val['validation'] = ddict_with_val['test']
ddict_with_val['train'] = ddict_with_val['train'].to_iterable_dataset()
del ddict_with_val['test']
tdc = TextDataLMControllerStreaming(ddict_with_val,
main_text='Review Text',
filter_dict={'Review Text': lambda x: x is not None},
metadatas=['Title','Division Name'],
content_transformations=[text_normalize,str.lower],
cols_to_keep=['Clothing ID','Review Text'],
seed=42,
batch_size=1024,
verbose=False
                                   )
We will tokenize our corpus line-by-line
tdc.process_and_tokenize(tokenizer,line_by_line=True,max_length=-1)
tdc.set_data_collator()
tdc.set_data_collator(is_mlm=True,mlm_prob=0.15)
tdc.data_collator
DataCollatorForLanguageModeling(tokenizer=RobertaTokenizerFast(name_or_path='roberta-base', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True), added_tokens_decoder={
0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
}, mlm=True, mlm_probability=0.15, pad_to_multiple_of=8, tf_experimental_compile=False, return_tensors='pt')
Before applying the collator…
for i,v in enumerate(tdc.main_ddict['train']):
if i==2: break
print(v)
print(f"Length of input_ids: {len(v['input_ids'])}")
print('-'*20){'Clothing ID': 841, 'Review Text': 'general petite . beautiful top, worth the necessary tailoring . The beautiful bold print drew me to this top and it did not disappoint upon receipt. however, the bottom ruffle belled so far out on each side that it was laughable! the actual fit is nothing like the picture; clearly the model\'s arms are placed in front of all the extra fabric to hold the ruffle back.\r\nhowever, the fabric is beautiful, the fit was perfect (size 2, 5\'4", 106 lbs.), the quality is great and i love the print so i decided to take it to my tailor to "sew away" the "wings" on both si', 'input_ids': [0, 15841, 4716, 1459, 479, 2721, 299, 2156, 966, 5, 2139, 7886, 5137, 479, 5, 2721, 7457, 5780, 4855, 162, 7, 42, 299, 8, 24, 222, 45, 17534, 2115, 18245, 479, 959, 2156, 5, 2576, 910, 15315, 28, 9970, 98, 444, 66, 15, 349, 526, 14, 24, 21, 38677, 27785, 5, 3031, 2564, 16, 1085, 101, 5, 2170, 25606, 2563, 5, 1421, 18, 3701, 32, 2325, 11, 760, 9, 70, 5, 1823, 10199, 7, 946, 5, 910, 15315, 124, 479, 959, 2156, 5, 10199, 16, 2721, 2156, 5, 2564, 21, 1969, 36, 1836, 132, 2156, 195, 128, 204, 22, 2156, 13442, 23246, 479, 4839, 2156, 5, 1318, 16, 372, 8, 939, 657, 5, 5780, 98, 939, 1276, 7, 185, 24, 7, 127, 26090, 7, 22, 35043, 409, 22, 5, 22, 11954, 22, 15, 258, 3391, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'special_tokens_mask': [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]}
Length of input_ids: 136
--------------------
{'Clothing ID': 1110, 'Review Text': "general . not as short on me (petite) . I ordered the xxs p as this dress is not a fitted dress, and that was the right size for me. only thing is the length is a bit linger still 9lower on calf for me), the straps are almost tight, so i would say the dress is a reversed taper shape. color is beautiful, i ordered green as the other color (plum) doesn't have petite available. green is rich, and classy, the fabric is surprisingly soft. i love the little details in the velvet. definitely need a strapless bra for this one.\r\n\r\n115 lbsm 30d", 'input_ids': [0, 15841, 479, 45, 25, 765, 15, 162, 36, 4716, 1459, 4839, 479, 939, 2740, 5, 37863, 29, 181, 25, 42, 3588, 16, 45, 10, 15898, 3588, 2156, 8, 14, 21, 5, 235, 1836, 13, 162, 479, 129, 631, 16, 5, 5933, 16, 10, 828, 18277, 202, 361, 795, 15, 16701, 13, 162, 4839, 2156, 5, 31622, 32, 818, 3229, 2156, 98, 939, 74, 224, 5, 3588, 16, 10, 13173, 326, 15888, 3989, 479, 3195, 16, 2721, 2156, 939, 2740, 2272, 25, 5, 97, 3195, 36, 36838, 4839, 630, 75, 33, 4716, 1459, 577, 479, 2272, 16, 4066, 2156, 8, 30228, 2156, 5, 10199, 16, 10262, 3793, 479, 939, 657, 5, 410, 1254, 11, 5, 29986, 479, 2299, 240, 10, 18052, 16979, 11689, 13, 42, 65, 479, 12312, 23246, 119, 389, 385, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'special_tokens_mask': [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]}
Length of input_ids: 133
--------------------
We can see that the token lists have different lengths
Let’s apply the collator
# extract only the required keys
inp_keys = tokenizer.model_input_names
iter1 = iter(tdc.main_ddict['train'])
_result=[]
for i in range(5):
_inp = next(iter1)
    _result.append({k:_inp[k] for k in inp_keys})
out = tdc.data_collator(_result)
out.keys()
dict_keys(['input_ids', 'attention_mask', 'labels'])
Now all token lists have the same length, which is a multiple of 8
out['input_ids'].shape
torch.Size([5, 136])
out['input_ids'][:3,:]
tensor([[ 0, 15841, 4716, 1459, 479, 2721, 299, 2156, 966, 50264,
2139, 7886, 5137, 479, 5, 2721, 7457, 5780, 4855, 162,
7, 42, 299, 8, 50264, 222, 45, 17534, 2115, 50264,
479, 50264, 2156, 5, 2576, 910, 15315, 28, 9970, 98,
444, 66, 15, 349, 526, 14, 24, 21, 38677, 27785,
17138, 3031, 50264, 16, 1085, 101, 5, 2170, 25606, 2563,
5, 1421, 18, 3701, 32, 2325, 11, 760, 9, 70,
5, 1823, 10199, 50264, 29261, 50264, 910, 15315, 124, 479,
959, 2156, 5, 10199, 16, 2721, 2156, 50264, 2564, 21,
1969, 36, 1836, 132, 50264, 195, 128, 204, 22, 50264,
13442, 23246, 479, 50264, 2156, 5, 50264, 16, 50264, 8,
939, 657, 5, 50264, 98, 939, 1276, 50264, 185, 24,
7, 127, 26090, 7, 22, 35043, 409, 50264, 5, 22,
11954, 22, 50264, 258, 3391, 2],
[ 0, 15841, 479, 45, 25, 13055, 15, 50264, 36, 4716,
1459, 4839, 479, 50264, 2740, 5, 37863, 50264, 181, 50264,
42, 3588, 16, 45, 10, 15898, 3588, 50264, 8, 14,
21, 5, 235, 1836, 13, 162, 479, 129, 631, 16,
5, 5933, 16, 38152, 828, 18277, 202, 361, 795, 15,
16701, 13, 162, 4839, 2156, 5, 31622, 32, 818, 3229,
2156, 98, 939, 74, 50264, 5, 3588, 16, 10, 13173,
326, 50264, 3989, 479, 3195, 16, 50264, 2156, 939, 2740,
2272, 25, 5, 97, 3195, 36, 36838, 4839, 50264, 75,
33, 4716, 1459, 50264, 50264, 2272, 16, 4066, 2156, 8,
30228, 50264, 50264, 50264, 16, 10262, 3793, 479, 939, 50264,
5, 410, 1254, 11, 5, 50264, 479, 2299, 240, 50264,
18052, 16979, 11689, 13, 42, 50264, 479, 12312, 23246, 119,
389, 385, 2, 1, 1, 1],
[ 0, 15841, 479, 1969, 50264, 13, 80, 50264, 479, 15983,
10717, 39574, 47, 240, 50264, 216, 14, 42, 1907, 9,
50264, 50264, 5, 65, 14, 40, 120, 6538, 36, 939,
2162, 5, 50264, 65, 4839, 479, 24, 16, 2422, 7174,
50264, 9869, 2156, 53, 939, 21, 129, 441, 7, 120,
80, 15033, 66, 50264, 24, 50264, 50264, 222, 10397, 24,
50264, 50264, 4925, 50264, 18, 1836, 142, 939, 1079, 47904,
24, 50264, 50264, 172, 10601, 7, 3841, 479, 939, 21,
2422, 5779, 50264, 5, 3568, 53, 10874, 145, 441, 7,
50264, 24, 396, 864, 23, 50264, 50264, 6215, 479, 2,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1]])
The labels have also been constructed: the masked positions (values other than -100) are the tokens the model has to predict. To increase the number of masked tokens, increase mlm_prob
out['labels'][:3,:]
tensor([[ -100, -100, -100, -100, -100, -100, -100, -100, -100, 5,
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
-100, -100, -100, -100, 24, -100, -100, -100, -100, 18245,
-100, 959, -100, -100, -100, -100, -100, -100, -100, -100,
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
5, -100, 2564, -100, -100, -100, -100, -100, -100, 2563,
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
-100, -100, -100, 7, 946, 5, -100, -100, -100, -100,
-100, -100, -100, -100, -100, -100, -100, 5, -100, -100,
-100, -100, -100, -100, 2156, -100, -100, -100, -100, 2156,
-100, -100, -100, 4839, -100, -100, 1318, -100, 372, -100,
-100, -100, -100, 5780, -100, -100, -100, 7, -100, -100,
-100, -100, -100, -100, -100, -100, -100, 22, -100, -100,
-100, -100, 15, -100, -100, -100],
[ -100, 15841, -100, -100, -100, 765, -100, 162, -100, -100,
-100, -100, -100, 939, -100, -100, -100, 29, -100, 25,
-100, -100, -100, -100, -100, -100, -100, 2156, -100, -100,
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
5, -100, -100, 10, -100, -100, -100, -100, -100, -100,
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
-100, -100, -100, -100, 224, -100, -100, -100, -100, -100,
-100, 15888, -100, -100, -100, -100, 2721, -100, -100, -100,
-100, -100, -100, -100, -100, -100, -100, -100, 630, -100,
-100, -100, -100, 577, 479, -100, -100, -100, -100, -100,
-100, 2156, 5, 10199, -100, -100, -100, -100, -100, 657,
-100, -100, -100, -100, -100, 29986, -100, -100, -100, 10,
-100, -100, -100, -100, -100, 65, -100, -100, -100, -100,
-100, -100, -100, -100, -100, -100],
[ -100, -100, -100, -100, 39574, -100, -100, 15033, -100, -100,
-100, -100, -100, -100, 7, -100, -100, -100, -100, -100,
10199, 16, -100, -100, -100, -100, -100, -100, -100, -100,
-100, -100, 1104, -100, -100, -100, -100, -100, -100, -100,
8, -100, -100, -100, -100, -100, -100, -100, -100, -100,
-100, -100, -100, 9, -100, 479, 939, -100, -100, -100,
8, 24, -100, 24, -100, -100, -100, -100, -100, -100,
-100, 150, 7727, -100, -100, -100, -100, -100, -100, -100,
-100, -100, 59, -100, -100, -100, -100, -100, -100, -100,
671, -100, -100, -100, -100, 127, 400, -100, -100, -100,
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
-100, -100, -100, -100, -100, -100]])
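As a rough sanity check, you can rebuild the collator with a larger mlm_prob and compare the fraction of positions selected for masking. The sketch below reuses tdc and the _result batch from above; exact numbers vary between runs because masking is random, and padding positions are never selected.
# Compare the fraction of masked positions (labels != -100) for two mlm_prob values
def masked_fraction(collator, batch):
    out = collator(batch)
    return (out['labels'] != -100).float().mean().item()

print('mlm_prob=0.15:', masked_fraction(tdc.data_collator, _result))

tdc.set_data_collator(is_mlm=True, mlm_prob=0.3)   # double the masking probability
print('mlm_prob=0.30:', masked_fraction(tdc.data_collator, _result))

tdc.set_data_collator(is_mlm=True, mlm_prob=0.15)  # restore the original setting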
If you apply padding in the tokenization step (by adjusting the max_length argument), the data collator will skip its own padding step, regardless of whether the tokenization is line-by-line or not
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val = dset.train_test_split(test_size=0.1,seed=42)
ddict_with_val['validation'] = ddict_with_val['test']
ddict_with_val['train'] = ddict_with_val['train'].to_iterable_dataset()
del ddict_with_val['test']
tdc = TextDataLMControllerStreaming(ddict_with_val,
main_text='Review Text',
filter_dict={'Review Text': lambda x: x is not None},
metadatas=['Title','Division Name'],
content_transformations=[text_normalize,str.lower],
cols_to_keep=['Clothing ID','Review Text'],
seed=42,
batch_size=1024,
verbose=False
                                   )
tdc.process_and_tokenize(tokenizer,line_by_line=False,max_length=100)
tdc.set_data_collator(is_mlm=True,mlm_prob=0.15)
Let’s apply the collator
# extract only the required keys
inp_keys = tokenizer.model_input_names
iter1 = iter(tdc.main_ddict['train'])
_result=[]
for i in range(5):
_inp = next(iter1)
_result.append({k:_inp[k] for k in inp_keys})
out = tdc.data_collator(_result)
out['input_ids'].shape
torch.Size([5, 100])
out['input_ids'][:2,:]
tensor([[ 0, 15841, 4716, 1459, 479, 2721, 299, 2156, 966, 50264,
2139, 7886, 5137, 479, 5, 2721, 7457, 5780, 4855, 162,
7, 42, 299, 8, 24, 222, 45, 17534, 2115, 50264,
479, 50264, 2156, 5, 2576, 910, 15315, 28, 9970, 98,
444, 66, 15, 349, 526, 14, 24, 21, 38677, 27785,
50264, 3031, 50264, 16, 1085, 101, 5, 2170, 25606, 41316,
5, 1421, 18, 3701, 32, 2325, 11, 760, 9, 70,
5, 1823, 10199, 50264, 17204, 50264, 910, 15315, 124, 479,
959, 2156, 5, 10199, 16, 2721, 2156, 50264, 2564, 21,
1969, 36, 1836, 132, 50264, 195, 128, 204, 22, 50264],
[13442, 23246, 479, 50264, 2156, 5, 50264, 16, 23781, 8,
939, 657, 5, 5780, 98, 939, 1276, 50264, 185, 24,
7, 127, 26090, 7, 22, 35043, 409, 50264, 5, 22,
11954, 22, 50264, 258, 3391, 2, 0, 50264, 479, 45,
25, 50264, 15, 44224, 36, 4716, 1459, 4839, 479, 50264,
2740, 5, 37863, 50264, 181, 25, 42, 3588, 16, 45,
10, 15898, 3588, 50264, 8, 14, 21, 5, 235, 1836,
13, 162, 479, 129, 631, 16, 50264, 5933, 16, 50264,
828, 18277, 202, 361, 795, 15, 16701, 13, 162, 4839,
2156, 5, 31622, 32, 818, 3229, 2156, 98, 939, 74]])
out['labels'][:2,:]
tensor([[ -100, -100, -100, -100, -100, -100, -100, -100, -100, 5,
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
-100, -100, -100, -100, 24, -100, -100, -100, -100, 18245,
-100, 959, -100, -100, -100, -100, -100, -100, -100, -100,
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
5, -100, 2564, -100, -100, -100, -100, -100, -100, 2563,
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
-100, -100, -100, 7, 946, 5, -100, -100, -100, -100,
-100, -100, -100, -100, -100, -100, -100, 5, -100, -100,
-100, -100, -100, -100, 2156, -100, -100, -100, -100, 2156],
[ -100, -100, -100, 4839, -100, -100, 1318, -100, 372, -100,
-100, -100, -100, 5780, -100, -100, -100, 7, -100, -100,
-100, -100, -100, -100, -100, -100, -100, 22, -100, -100,
-100, -100, 15, -100, -100, -100, -100, 15841, -100, -100,
-100, 765, -100, 162, -100, -100, -100, -100, -100, 939,
-100, -100, -100, 29, -100, 25, -100, -100, -100, -100,
-100, -100, -100, 2156, -100, -100, -100, -100, -100, -100,
-100, -100, -100, -100, -100, -100, 5, -100, -100, 10,
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100]])
Since we are using the concatenation-of-tokenization technique, one smart thing that HuggingFace’s DataCollatorForLanguageModeling (the data collator we use) does is to allow masking at every position. In the previous cases (with line-by-line tokenization), there is no masking near the end of each token list, because those end tokens are padding tokens.
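A quick way to confirm this on the batch built above (a minimal check reusing out and tokenizer from the cells above): with concatenation, the collated batch contains no padding tokens at all, so every position holds a real token and is a candidate for masking.
# With concatenation-of-tokenization, chunks are exactly max_length long,
# so the collated batch holds no <pad> tokens and masking can land anywhere.
assert (out['input_ids'] != tokenizer.pad_token_id).all()
print('number of <pad> tokens in batch:', (out['input_ids'] == tokenizer.pad_token_id).sum().item())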
For causal language model
from transformers import AutoTokenizer
from tokenizers import processors
Let’s define our GPT2 tokenizer
tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer
GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True), added_tokens_decoder={
50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
GPT2 does not add start/end-of-sentence tokens:
print(tokenizer.convert_ids_to_tokens(tokenizer("this is a text. That is a second text.But there's a third one")['input_ids']))
['this', 'Ġis', 'Ġa', 'Ġtext', '.', 'ĠThat', 'Ġis', 'Ġa', 'Ġsecond', 'Ġtext', '.', 'But', 'Ġthere', "'s", 'Ġa', 'Ġthird', 'Ġone']
If you want to perform concatenation-of-tokenization and you want your causal LM to differentiate between sentences, you can add a special token to separate them, as follows:
tokenizer._tokenizer.post_processor = processors.TemplateProcessing(
single="$A " + tokenizer.eos_token,
special_tokens=[(tokenizer.eos_token, tokenizer.eos_token_id)],
)
tokenizer.pad_token = tokenizer.eos_token
print(tokenizer.convert_ids_to_tokens(tokenizer("this is a text. That is a second text.But there's a third one")['input_ids']))
['this', 'Ġis', 'Ġa', 'Ġtext', '.', 'ĠThat', 'Ġis', 'Ġa', 'Ġsecond', 'Ġtext', '.', 'But', 'Ġthere', "'s", 'Ġa', 'Ġthird', 'Ġone', '<|endoftext|>']
With this modified tokenizer, let’s perform concatenation-of-tokenization using GPT2
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val = dset.train_test_split(test_size=0.1,seed=42)
ddict_with_val['validation'] = ddict_with_val['test']
ddict_with_val['train'] = ddict_with_val['train'].to_iterable_dataset()
del ddict_with_val['test']
tdc = TextDataLMControllerStreaming(ddict_with_val,
main_text='Review Text',
filter_dict={'Review Text': lambda x: x is not None},
metadatas=['Title','Division Name'],
content_transformations=[text_normalize,str.lower],
seed=42,
batch_size=1024,
verbose=False
                                   )
tdc.process_and_tokenize(tokenizer,line_by_line=False,max_length=100)
Since it’s causal language modeling, let’s turn off is_mlm
tdc.set_data_collator(is_mlm=False)
Let’s apply the collator
iter1 = iter(tdc.main_ddict['train'])
out = tdc.data_collator([next(iter1) for i in range(5)]) # simulation with batch size 5
out['input_ids'].shape
torch.Size([5, 100])
out['input_ids'][:2,:]
tensor([[24622, 4273, 578, 764, 4950, 1353, 837, 2861, 262, 3306,
7894, 3255, 764, 262, 4950, 10758, 3601, 9859, 502, 284,
428, 1353, 290, 340, 750, 407, 6703, 2402, 14507, 764,
2158, 837, 262, 4220, 374, 18137, 307, 3353, 523, 1290,
503, 319, 1123, 1735, 326, 340, 373, 47623, 5145, 262,
4036, 4197, 318, 2147, 588, 262, 4286, 2162, 4084, 262,
2746, 338, 5101, 389, 4624, 287, 2166, 286, 477, 262,
3131, 9664, 284, 1745, 262, 374, 18137, 736, 764, 2158,
837, 262, 9664, 318, 4950, 837, 262, 4197, 373, 2818,
357, 2546, 362, 837, 642, 705, 604, 366, 837, 15696],
[15785, 764, 1267, 837, 262, 3081, 318, 1049, 290, 1312,
1842, 262, 3601, 523, 1312, 3066, 284, 1011, 340, 284,
616, 35280, 284, 366, 34249, 1497, 366, 262, 366, 12098,
366, 319, 1111, 33721, 50256, 24622, 764, 407, 355, 1790,
319, 502, 357, 4273, 578, 1267, 764, 1312, 6149, 262,
31383, 82, 279, 355, 428, 6576, 318, 407, 257, 18235,
6576, 837, 290, 326, 373, 262, 826, 2546, 329, 502,
764, 691, 1517, 318, 262, 4129, 318, 257, 1643, 31402,
991, 860, 2793, 319, 31134, 329, 502, 1267, 837, 262,
29552, 389, 2048, 5381, 837, 523, 1312, 561, 910, 262]])
out['labels'][:2,:]
tensor([[24622, 4273, 578, 764, 4950, 1353, 837, 2861, 262, 3306,
7894, 3255, 764, 262, 4950, 10758, 3601, 9859, 502, 284,
428, 1353, 290, 340, 750, 407, 6703, 2402, 14507, 764,
2158, 837, 262, 4220, 374, 18137, 307, 3353, 523, 1290,
503, 319, 1123, 1735, 326, 340, 373, 47623, 5145, 262,
4036, 4197, 318, 2147, 588, 262, 4286, 2162, 4084, 262,
2746, 338, 5101, 389, 4624, 287, 2166, 286, 477, 262,
3131, 9664, 284, 1745, 262, 374, 18137, 736, 764, 2158,
837, 262, 9664, 318, 4950, 837, 262, 4197, 373, 2818,
357, 2546, 362, 837, 642, 705, 604, 366, 837, 15696],
[15785, 764, 1267, 837, 262, 3081, 318, 1049, 290, 1312,
1842, 262, 3601, 523, 1312, 3066, 284, 1011, 340, 284,
616, 35280, 284, 366, 34249, 1497, 366, 262, 366, 12098,
366, 319, 1111, 33721, -100, 24622, 764, 407, 355, 1790,
319, 502, 357, 4273, 578, 1267, 764, 1312, 6149, 262,
31383, 82, 279, 355, 428, 6576, 318, 407, 257, 18235,
6576, 837, 290, 326, 373, 262, 826, 2546, 329, 502,
764, 691, 1517, 318, 262, 4129, 318, 257, 1643, 31402,
991, 860, 2793, 319, 31134, 329, 502, 1267, 837, 262,
29552, 389, 2048, 5381, 837, 523, 1312, 561, 910, 262]])
For CLM, the labels are essentially the same as input_ids. From HuggingFace documentation:
`DataCollatorForLanguageModeling` will take care of creating the language model labels — in causal language modeling the inputs serve as labels too (just shifted by one element), and this data collator creates them on the fly during training.
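To see this concretely on the batch above (a small check reusing out and the modified GPT2 tokenizer): every label either equals its input id, or is -100 where the input id is the pad token (which, in this setup, is the same id as the <|endoftext|> separator). The shift-by-one alignment itself happens inside the model’s loss computation.
import torch

labels, input_ids = out['labels'], out['input_ids']

# Labels copy the input ids, except that pad-token positions are ignored with -100
assert torch.all((labels == input_ids) | (labels == -100))
assert torch.all((labels == -100) == (input_ids == tokenizer.pad_token_id))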
4. Save and Load TextDataController
TextDataLMControllerStreaming.save_as_pickles
TextDataLMControllerStreaming.save_as_pickles (fname, parent='pickle_files')
| | Type | Default | Details |
|---|---|---|---|
| fname | | | Name of the pickle file |
| parent | str | pickle_files | Parent folder |
TextDataLMControllerStreaming.from_pickle
TextDataLMControllerStreaming.from_pickle (fname, parent='pickle_files')
| | Type | Default | Details |
|---|---|---|---|
| fname | | | Name of the pickle file |
| parent | str | pickle_files | Parent folder |
A TextDataLMControllerStreaming object can be saved and loaded with ease. This is especially useful after text processing and/or tokenization have been done.
from datasets import disable_caching
disable_caching()
tokenizer = AutoTokenizer.from_pretrained('roberta-base')
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val = dset.train_test_split(test_size=0.1,seed=42)
ddict_with_val['validation'] = ddict_with_val['test']
ddict_with_val['train'] = ddict_with_val['train'].to_iterable_dataset()
del ddict_with_val['test']
tdc = TextDataLMControllerStreaming(ddict_with_val,
main_text='Review Text',
filter_dict={'Review Text': lambda x: x is not None},
metadatas=['Title','Division Name'],
content_transformations=[text_normalize,str.lower],
seed=42,
batch_size=1024,
verbose=False
                                   )
tdc.process_and_tokenize(tokenizer,line_by_line=True,max_length=-1)
tdc.set_data_collator(is_mlm=True,mlm_prob=0.15)
tdc.save_as_pickles('my_lm_tdc')
Load back our object
tdc2 = TextDataLMController.from_pickle('my_lm_tdc')
You can still access all its attributes, data, preprocessing steps, transformations …
tdc2.main_ddict
DatasetDict({
train: IterableDataset({
features: Unknown,
n_shards: 1
})
validation: Dataset({
features: ['Title', 'Review Text', 'Division Name', 'input_ids', 'attention_mask', 'special_tokens_mask'],
num_rows: 2253
})
})
tdc2.filter_dict,tdc2.content_tfms
({'Review Text': <function __main__.<lambda>(x)>},
[<function underthesea.pipeline.text_normalize.text_normalize(text, tokenizer='underthesea')>,
<method 'lower' of 'str' objects>])