Commit f2449c8 by Younes Belkada
Parent(s): f342985

add python file

tokenizer.py +1 -7

tokenizer.py CHANGED
@@ -1,8 +1,5 @@
 from datasets import load_dataset
 from transformers import AutoTokenizer
-from huggingface_hub import Repository
-
-repo = Repository(".", clone_from="ybelkada/japanese-dummy-tokenizer")
 
 def get_training_corpus(dataset):
     """
@@ -22,7 +19,4 @@ print("Old Tokenizer:", old_tokenizer.tokenize("誰が一番に着くか私に
 new_tokenizer = old_tokenizer.train_new_from_iterator(get_training_corpus(train_dataset), 52000)
 
 print("New Tokenizer:",new_tokenizer.tokenize("誰が一番に着くか私には分かりません。"))
-new_tokenizer.save_pretrained("japanese-dummy-tokenizer")
-repo.git_add()
-repo.git_commit("Add tokenizer file")
-repo.git_push()
+new_tokenizer.save_pretrained("japanese-dummy-tokenizer")
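For context, a minimal sketch of what the full tokenizer.py plausibly looks like after this commit. The diff only shows fragments of the file, so the dataset id, the text column, and the base checkpoint below are placeholders, not taken from the repository:

from datasets import load_dataset
from transformers import AutoTokenizer

def get_training_corpus(dataset):
    """Yield the corpus in batches of 1,000 examples so that
    train_new_from_iterator can stream it instead of loading
    every sentence into memory at once."""
    for start in range(0, len(dataset), 1000):
        yield dataset[start : start + 1000]["text"]  # "text" column is an assumption

# Placeholder dataset and checkpoint -- neither is visible in this diff.
train_dataset = load_dataset("username/some-japanese-corpus", split="train")
old_tokenizer = AutoTokenizer.from_pretrained("gpt2")

print("Old Tokenizer:", old_tokenizer.tokenize("誰が一番に着くか私には分かりません。"))

# Retrain the tokenizer on the new corpus with a 52,000-token vocabulary.
new_tokenizer = old_tokenizer.train_new_from_iterator(get_training_corpus(train_dataset), 52000)

print("New Tokenizer:", new_tokenizer.tokenize("誰が一番に着くか私には分かりません。"))
new_tokenizer.save_pretrained("japanese-dummy-tokenizer")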
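The deleted lines published the tokenizer by cloning the Hub repo with huggingface_hub.Repository and running git add/commit/push by hand; Repository has since been deprecated in huggingface_hub in favor of HTTP-based uploads. A sketch of the same publish step using the push_to_hub method that transformers tokenizers provide (the repo id is taken from the deleted clone_from argument; a configured Hub token, e.g. via huggingface-cli login, is assumed):

# One call replaces clone + git_add + git_commit + git_push.
# Assumes a Hub token is already configured (e.g. via `huggingface-cli login`).
new_tokenizer.push_to_hub(
    "ybelkada/japanese-dummy-tokenizer",
    commit_message="Add tokenizer file",
)

# Consumers can then load the tokenizer straight from the Hub:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("ybelkada/japanese-dummy-tokenizer")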