Retrain tokenizer for case sensitivity
- tokenizer.json +0 -0
- train_tokenizer.py +1 -3
tokenizer.json CHANGED
The diff for this file is too large to render. See the raw diff.
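The retrained tokenizer.json is the artifact that actually carries the case sensitivity: with the `tokenizers` library, casing is decided by the normalizer chain used when the tokenizer is built, not by anything visible in the train_tokenizer.py diff below. As a minimal sketch only, assuming a Unigram/SentencePiece-style setup like the Flax T5 example's t5_tokenizer_model.py (the exact normalizer components here are an assumption, not taken from this commit), a case-sensitive chain simply omits the Lowercase step:

from tokenizers import Regex, normalizers

# Hypothetical normalizer chain for a case-sensitive tokenizer:
# unicode normalization plus whitespace collapsing, but deliberately
# no normalizers.Lowercase() step, so casing is preserved.
case_sensitive_normalizer = normalizers.Sequence(
    [
        normalizers.Nmt(),
        normalizers.NFKC(),
        normalizers.Replace(Regex(" {2,}"), " "),
        # normalizers.Lowercase(),  # omitted on purpose
    ]
)

print(case_sensitive_normalizer.normalize_str("Dit  Is  Een  Test"))
# -> "Dit Is Een Test"  (double spaces collapsed, casing kept)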
train_tokenizer.py CHANGED

@@ -18,7 +18,7 @@ def train_val_files():
         print(f"Number of files {len(data_files)} after adding {path}")
 
     # add_jsonlines_dir(f"{data_dir}/oscar_nl_cleaned")
-    add_jsonlines_dir(f"{data_dir}/
+    add_jsonlines_dir(f"{data_dir}/c4_cleaned2", "*47*.gz")
     add_jsonlines_dir(f"{data_dir}/nrc_uniq_cleaned_20210223", "*.gz")
     add_jsonlines_dir(f"{data_dir}/nu_uniq_cleaned_20210225", "*.gz")
     random.Random(SEED).shuffle(data_files)
@@ -42,8 +42,6 @@ train, val = train_val_files()
 
 dataset = load_dataset('json', data_files={'train': train, 'validation': val}, split='train')
 
-model_dir = "/t5-small-dutch" # ${MODEL_DIR}
-
 vocab_size = 32000
 input_sentence_size = None
 tokenizer = SentencePieceUnigramTokenizer(unk_token="<unk>", eos_token="</s>", pad_token="<pad>")
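The diff stops right after the tokenizer is constructed; the training and serialization step that regenerates tokenizer.json is not shown in this commit. A minimal sketch of how that tail typically looks, assuming the SentencePieceUnigramTokenizer helper from the Hugging Face Flax T5 example (t5_tokenizer_model.py), a "text" column in the dataset, and a batch_iterator helper that are all assumptions rather than part of this diff:

# Sketch only: batch_iterator, the "text" column name, and the output path
# are assumptions; input_sentence_size handling is omitted here.
def batch_iterator(batch_size=1000):
    for i in range(0, len(dataset), batch_size):
        yield dataset[i : i + batch_size]["text"]

tokenizer.train_from_iterator(
    iterator=batch_iterator(),
    vocab_size=vocab_size,
    show_progress=True,
)

# Serialize the trained tokenizer; this is the regenerated tokenizer.json above.
tokenizer.save("./tokenizer.json")

Because retraining rebuilds the whole vocabulary, tokenizer.json changes wholesale even though only one data-selection line changed in train_tokenizer.py.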