Verifying that tokenization results match after switching to the Tokenizer (the fast, tokenizers-based implementation)
They match as long as the post-processor has been added.
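For reference, a minimal sketch of attaching the post-processor, assuming kantai-tokenizer.json was saved by the tokenizers library without one, and that <s>/</s> are its CLS/SEP tokens as in RoBERTa (the filename add_post_processor.py is made up for illustration):
code:add_post_processor.py
from tokenizers import Tokenizer
from tokenizers.processors import RobertaProcessing
# Assumption: the serialized fast tokenizer has no post-processor yet
tokenizer = Tokenizer.from_file("kantai-tokenizer.json")
# RobertaProcessing wraps each sequence as <s> ... </s>, the same special
# tokens the slow RobertaTokenizer inserts when building model inputs
tokenizer.post_processor = RobertaProcessing(
    sep=("</s>", tokenizer.token_to_id("</s>")),
    cls=("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.save("kantai-tokenizer.json")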
code:verification_script.py
from datasets import load_dataset
from transformers import RobertaTokenizer, RobertaTokenizerFast
text_column_name = "text"
old_tokenizer = RobertaTokenizer.from_pretrained("KantaiBERT")
new_tokenizer = RobertaTokenizerFast(tokenizer_file="kantai-tokenizer.json")
assert len(old_tokenizer.get_vocab()) == len(new_tokenizer.get_vocab())
assert old_tokenizer.get_vocab() == new_tokenizer.get_vocab()
def filter_empty_line(examples):
    # Drop empty and whitespace-only lines so both tokenizers see the same input
    return [
        line
        for line in examples[text_column_name]
        if len(line) > 0 and not line.isspace()
    ]
def new_tokenize_function(examples):
    return new_tokenizer(
        filter_empty_line(examples),
        padding=False,
        truncation=True,
        max_length=512,
        return_special_tokens_mask=True,
    )
def old_tokenize_function(examples):
    return old_tokenizer(
        filter_empty_line(examples),
        padding=False,
        truncation=True,
        max_length=512,
        return_special_tokens_mask=True,
    )
old_tokenizer_raw_datasets = load_dataset("text", data_files="kant.txt")
old_tokenizer_processed_datasets = old_tokenizer_raw_datasets.map(
    old_tokenize_function,
    batched=True,
    num_proc=None,
    remove_columns=[text_column_name],  # filtering can change the row count per batch
    load_from_cache_file=False,
    desc="Running old tokenizer line_by_line",
)
old_tokenizer_train_dataset = old_tokenizer_processed_datasets["train"]
new_tokenizer_raw_datasets = load_dataset("text", data_files="kant.txt")
new_tokenizer_processed_datasets = new_tokenizer_raw_datasets.map(
    new_tokenize_function,
    batched=True,
    num_proc=None,
    remove_columns=[text_column_name],  # same as above
    load_from_cache_file=False,
    desc="Running new tokenizer line_by_line",
)
new_tokenizer_train_dataset = new_tokenizer_processed_datasets["train"]
assert len(old_tokenizer_train_dataset) == len(new_tokenizer_train_dataset)
for old, new in zip(old_tokenizer_train_dataset, new_tokenizer_train_dataset):
assert old == new
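As a quick spot check, a single line can also be compared directly; without the post-processor, the fast tokenizer's ids lack the leading <s> and trailing </s>, which is exactly where the two outputs diverge (the sample sentence below is arbitrary):
code:spot_check.py
from transformers import RobertaTokenizer, RobertaTokenizerFast
old_tokenizer = RobertaTokenizer.from_pretrained("KantaiBERT")
new_tokenizer = RobertaTokenizerFast(tokenizer_file="kantai-tokenizer.json")
# Arbitrary sample sentence for illustration
sample = "Human reason is called upon to consider questions."
# With the post-processor in place, both print identical ids,
# including the <s>/</s> special tokens
print(old_tokenizer(sample)["input_ids"])
print(new_tokenizer(sample)["input_ids"])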