default unidic_lite
Browse files
distilbert_japanese_tokenizer.py
CHANGED
@@ -440,7 +440,7 @@ class MecabTokenizer:
|
|
440 |
do_lower_case=False,
|
441 |
never_split=None,
|
442 |
normalize_text=True,
|
443 |
-
mecab_dic: Optional[str] = "ipadic",
|
444 |
mecab_option: Optional[str] = None,
|
445 |
):
|
446 |
"""
|
@@ -454,7 +454,7 @@ class MecabTokenizer:
|
|
454 |
[`PreTrainedTokenizer.tokenize`]) List of tokens not to split.
|
455 |
**normalize_text**: (*optional*) boolean (default True)
|
456 |
Whether to apply unicode normalization to text before tokenization.
|
457 |
-
**mecab_dic**: (*optional*) string (default "ipadic")
|
458 |
Name of dictionary to be used for MeCab initialization. If you are using a system-installed dictionary,
|
459 |
set this option to `None` and modify *mecab_option*.
|
460 |
**mecab_option**: (*optional*) string
|
|
|
440 |
do_lower_case=False,
|
441 |
never_split=None,
|
442 |
normalize_text=True,
|
443 |
+
mecab_dic: Optional[str] = "unidic_lite",
|
444 |
mecab_option: Optional[str] = None,
|
445 |
):
|
446 |
"""
|
|
|
454 |
[`PreTrainedTokenizer.tokenize`]) List of tokens not to split.
|
455 |
**normalize_text**: (*optional*) boolean (default True)
|
456 |
Whether to apply unicode normalization to text before tokenization.
|
457 |
+
**mecab_dic**: (*optional*) string (default "unidic_lite")
|
458 |
Name of dictionary to be used for MeCab initialization. If you are using a system-installed dictionary,
|
459 |
set this option to `None` and modify *mecab_option*.
|
460 |
**mecab_option**: (*optional*) string
|