Commit
·
876f9a6
1
Parent(s):
37ce1b9
Training in progress, step 500
Browse files- .gitignore +1 -0
- added_tokens.json +1 -0
- config.json +107 -0
- preprocessor_config.json +9 -0
- pytorch_model.bin +3 -0
- run.sh +35 -0
- run_speech_recognition_ctc.py +737 -0
- special_tokens_map.json +1 -0
- tokenizer_config.json +1 -0
- training_args.bin +3 -0
- vocab.json +1 -0
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
checkpoint-*/
|
added_tokens.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"<s>": 3029, "</s>": 3030}
|
config.json
ADDED
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "facebook/wav2vec2-xls-r-300m",
|
3 |
+
"activation_dropout": 0.1,
|
4 |
+
"adapter_kernel_size": 3,
|
5 |
+
"adapter_stride": 2,
|
6 |
+
"add_adapter": false,
|
7 |
+
"apply_spec_augment": true,
|
8 |
+
"architectures": [
|
9 |
+
"Wav2Vec2ForCTC"
|
10 |
+
],
|
11 |
+
"attention_dropout": 0.0,
|
12 |
+
"bos_token_id": 1,
|
13 |
+
"classifier_proj_size": 256,
|
14 |
+
"codevector_dim": 768,
|
15 |
+
"contrastive_logits_temperature": 0.1,
|
16 |
+
"conv_bias": true,
|
17 |
+
"conv_dim": [
|
18 |
+
512,
|
19 |
+
512,
|
20 |
+
512,
|
21 |
+
512,
|
22 |
+
512,
|
23 |
+
512,
|
24 |
+
512
|
25 |
+
],
|
26 |
+
"conv_kernel": [
|
27 |
+
10,
|
28 |
+
3,
|
29 |
+
3,
|
30 |
+
3,
|
31 |
+
3,
|
32 |
+
2,
|
33 |
+
2
|
34 |
+
],
|
35 |
+
"conv_stride": [
|
36 |
+
5,
|
37 |
+
2,
|
38 |
+
2,
|
39 |
+
2,
|
40 |
+
2,
|
41 |
+
2,
|
42 |
+
2
|
43 |
+
],
|
44 |
+
"ctc_loss_reduction": "mean",
|
45 |
+
"ctc_zero_infinity": false,
|
46 |
+
"diversity_loss_weight": 0.1,
|
47 |
+
"do_stable_layer_norm": true,
|
48 |
+
"eos_token_id": 2,
|
49 |
+
"feat_extract_activation": "gelu",
|
50 |
+
"feat_extract_dropout": 0.0,
|
51 |
+
"feat_extract_norm": "layer",
|
52 |
+
"feat_proj_dropout": 0.0,
|
53 |
+
"feat_quantizer_dropout": 0.0,
|
54 |
+
"final_dropout": 0.0,
|
55 |
+
"hidden_act": "gelu",
|
56 |
+
"hidden_dropout": 0.0,
|
57 |
+
"hidden_size": 1024,
|
58 |
+
"initializer_range": 0.02,
|
59 |
+
"intermediate_size": 4096,
|
60 |
+
"layer_norm_eps": 1e-05,
|
61 |
+
"layerdrop": 0.0,
|
62 |
+
"mask_feature_length": 64,
|
63 |
+
"mask_feature_min_masks": 0,
|
64 |
+
"mask_feature_prob": 0.25,
|
65 |
+
"mask_time_length": 10,
|
66 |
+
"mask_time_min_masks": 2,
|
67 |
+
"mask_time_prob": 0.75,
|
68 |
+
"model_type": "wav2vec2",
|
69 |
+
"num_adapter_layers": 3,
|
70 |
+
"num_attention_heads": 16,
|
71 |
+
"num_codevector_groups": 2,
|
72 |
+
"num_codevectors_per_group": 320,
|
73 |
+
"num_conv_pos_embedding_groups": 16,
|
74 |
+
"num_conv_pos_embeddings": 128,
|
75 |
+
"num_feat_extract_layers": 7,
|
76 |
+
"num_hidden_layers": 24,
|
77 |
+
"num_negatives": 100,
|
78 |
+
"output_hidden_size": 1024,
|
79 |
+
"pad_token_id": 3028,
|
80 |
+
"proj_codevector_dim": 768,
|
81 |
+
"tdnn_dilation": [
|
82 |
+
1,
|
83 |
+
2,
|
84 |
+
3,
|
85 |
+
1,
|
86 |
+
1
|
87 |
+
],
|
88 |
+
"tdnn_dim": [
|
89 |
+
512,
|
90 |
+
512,
|
91 |
+
512,
|
92 |
+
512,
|
93 |
+
1500
|
94 |
+
],
|
95 |
+
"tdnn_kernel": [
|
96 |
+
5,
|
97 |
+
3,
|
98 |
+
3,
|
99 |
+
1,
|
100 |
+
1
|
101 |
+
],
|
102 |
+
"torch_dtype": "float32",
|
103 |
+
"transformers_version": "4.17.0.dev0",
|
104 |
+
"use_weighted_layer_sum": false,
|
105 |
+
"vocab_size": 3031,
|
106 |
+
"xvector_output_dim": 512
|
107 |
+
}
|
preprocessor_config.json
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"do_normalize": true,
|
3 |
+
"feature_extractor_type": "Wav2Vec2FeatureExtractor",
|
4 |
+
"feature_size": 1,
|
5 |
+
"padding_side": "right",
|
6 |
+
"padding_value": 0,
|
7 |
+
"return_attention_mask": true,
|
8 |
+
"sampling_rate": 16000
|
9 |
+
}
|
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:88b176f57552ec96e597aba50551da55604b80f1a0fd5bbc54f03565bb9239af
|
3 |
+
size 1274350833
|
run.sh
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
|
2 |
+
python run_speech_recognition_ctc.py \
|
3 |
+
--dataset_name="mozilla-foundation/common_voice_7_0" \
|
4 |
+
--model_name_or_path="facebook/wav2vec2-xls-r-300m" \
|
5 |
+
--dataset_config_name="zh-TW" \
|
6 |
+
--output_dir="./" \
|
7 |
+
--overwrite_output_dir \
|
8 |
+
--num_train_epochs="50" \
|
9 |
+
--per_device_train_batch_size="8" \
|
10 |
+
--per_device_eval_batch_size="8" \
|
11 |
+
--gradient_accumulation_steps="4" \
|
12 |
+
--learning_rate="7.5e-5" \
|
13 |
+
--warmup_steps="2000" \
|
14 |
+
--length_column_name="input_length" \
|
15 |
+
--evaluation_strategy="steps" \
|
16 |
+
--text_column_name="sentence" \
|
17 |
+
--chars_to_ignore , ? . ! \- \; \: \" “ % ‘ ” � — ’ … – \
|
18 |
+
--save_steps="500" \
|
19 |
+
--eval_steps="500" \
|
20 |
+
--logging_steps="100" \
|
21 |
+
--layerdrop="0.0" \
|
22 |
+
--activation_dropout="0.1" \
|
23 |
+
--save_total_limit="3" \
|
24 |
+
--freeze_feature_encoder \
|
25 |
+
--feat_proj_dropout="0.0" \
|
26 |
+
--mask_time_prob="0.75" \
|
27 |
+
--mask_time_length="10" \
|
28 |
+
--mask_feature_prob="0.25" \
|
29 |
+
--mask_feature_length="64" \
|
30 |
+
--gradient_checkpointing \
|
31 |
+
--use_auth_token \
|
32 |
+
--fp16 \
|
33 |
+
--group_by_length \
|
34 |
+
--do_train --do_eval \
|
35 |
+
--push_to_hub
|
run_speech_recognition_ctc.py
ADDED
@@ -0,0 +1,737 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# coding=utf-8
|
3 |
+
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
|
4 |
+
#
|
5 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
6 |
+
# you may not use this file except in compliance with the License.
|
7 |
+
# You may obtain a copy of the License at
|
8 |
+
#
|
9 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10 |
+
#
|
11 |
+
# Unless required by applicable law or agreed to in writing, software
|
12 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14 |
+
# See the License for the specific language governing permissions and
|
15 |
+
|
16 |
+
""" Fine-tuning a 🤗 Transformers CTC model for automatic speech recognition"""
|
17 |
+
|
18 |
+
import functools
|
19 |
+
import json
|
20 |
+
import logging
|
21 |
+
import os
|
22 |
+
import re
|
23 |
+
import sys
|
24 |
+
import warnings
|
25 |
+
from dataclasses import dataclass, field
|
26 |
+
from typing import Dict, List, Optional, Union
|
27 |
+
|
28 |
+
import datasets
|
29 |
+
import numpy as np
|
30 |
+
import torch
|
31 |
+
from datasets import DatasetDict, load_dataset, load_metric
|
32 |
+
|
33 |
+
import transformers
|
34 |
+
from transformers import (
|
35 |
+
AutoConfig,
|
36 |
+
AutoFeatureExtractor,
|
37 |
+
AutoModelForCTC,
|
38 |
+
AutoProcessor,
|
39 |
+
AutoTokenizer,
|
40 |
+
HfArgumentParser,
|
41 |
+
Trainer,
|
42 |
+
TrainingArguments,
|
43 |
+
Wav2Vec2Processor,
|
44 |
+
set_seed,
|
45 |
+
)
|
46 |
+
from transformers.trainer_utils import get_last_checkpoint, is_main_process
|
47 |
+
from transformers.utils import check_min_version
|
48 |
+
from transformers.utils.versions import require_version
|
49 |
+
|
50 |
+
|
51 |
+
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
52 |
+
check_min_version("4.17.0.dev0")
|
53 |
+
|
54 |
+
require_version("datasets>=1.13.3", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")
|
55 |
+
|
56 |
+
|
57 |
+
logger = logging.getLogger(__name__)
|
58 |
+
|
59 |
+
|
60 |
+
def list_field(default=None, metadata=None):
|
61 |
+
return field(default_factory=lambda: default, metadata=metadata)
|
62 |
+
|
63 |
+
|
64 |
+
@dataclass
|
65 |
+
class ModelArguments:
|
66 |
+
"""
|
67 |
+
Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
|
68 |
+
"""
|
69 |
+
|
70 |
+
model_name_or_path: str = field(
|
71 |
+
metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
|
72 |
+
)
|
73 |
+
tokenizer_name_or_path: Optional[str] = field(
|
74 |
+
default=None,
|
75 |
+
metadata={"help": "Path to pretrained tokenizer or tokenizer identifier from huggingface.co/models"},
|
76 |
+
)
|
77 |
+
cache_dir: Optional[str] = field(
|
78 |
+
default=None,
|
79 |
+
metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
|
80 |
+
)
|
81 |
+
freeze_feature_encoder: bool = field(
|
82 |
+
default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."}
|
83 |
+
)
|
84 |
+
attention_dropout: float = field(
|
85 |
+
default=0.0, metadata={"help": "The dropout ratio for the attention probabilities."}
|
86 |
+
)
|
87 |
+
activation_dropout: float = field(
|
88 |
+
default=0.0, metadata={"help": "The dropout ratio for activations inside the fully connected layer."}
|
89 |
+
)
|
90 |
+
feat_proj_dropout: float = field(default=0.0, metadata={"help": "The dropout ratio for the projected features."})
|
91 |
+
hidden_dropout: float = field(
|
92 |
+
default=0.0,
|
93 |
+
metadata={
|
94 |
+
"help": "The dropout probability for all fully connected layers in the embeddings, encoder, and pooler."
|
95 |
+
},
|
96 |
+
)
|
97 |
+
final_dropout: float = field(
|
98 |
+
default=0.0,
|
99 |
+
metadata={"help": "The dropout probability for the final projection layer."},
|
100 |
+
)
|
101 |
+
mask_time_prob: float = field(
|
102 |
+
default=0.05,
|
103 |
+
metadata={
|
104 |
+
"help": "Probability of each feature vector along the time axis to be chosen as the start of the vector"
|
105 |
+
"span to be masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature"
|
106 |
+
"vectors will be masked along the time axis."
|
107 |
+
},
|
108 |
+
)
|
109 |
+
mask_time_length: int = field(
|
110 |
+
default=10,
|
111 |
+
metadata={"help": "Length of vector span to mask along the time axis."},
|
112 |
+
)
|
113 |
+
mask_feature_prob: float = field(
|
114 |
+
default=0.0,
|
115 |
+
metadata={
|
116 |
+
"help": "Probability of each feature vector along the feature axis to be chosen as the start of the vector"
|
117 |
+
"span to be masked. Approximately ``mask_feature_prob * sequence_length // mask_feature_length`` feature bins will be masked along the time axis."
|
118 |
+
},
|
119 |
+
)
|
120 |
+
mask_feature_length: int = field(
|
121 |
+
default=10,
|
122 |
+
metadata={"help": "Length of vector span to mask along the feature axis."},
|
123 |
+
)
|
124 |
+
layerdrop: float = field(default=0.0, metadata={"help": "The LayerDrop probability."})
|
125 |
+
ctc_loss_reduction: Optional[str] = field(
|
126 |
+
default="mean", metadata={"help": "The way the ctc loss should be reduced. Should be one of 'mean' or 'sum'."}
|
127 |
+
)
|
128 |
+
|
129 |
+
|
130 |
+
@dataclass
|
131 |
+
class DataTrainingArguments:
|
132 |
+
"""
|
133 |
+
Arguments pertaining to what data we are going to input our model for training and eval.
|
134 |
+
|
135 |
+
Using `HfArgumentParser` we can turn this class
|
136 |
+
into argparse arguments to be able to specify them on
|
137 |
+
the command line.
|
138 |
+
"""
|
139 |
+
|
140 |
+
dataset_name: str = field(
|
141 |
+
metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
|
142 |
+
)
|
143 |
+
dataset_config_name: str = field(
|
144 |
+
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
|
145 |
+
)
|
146 |
+
train_split_name: str = field(
|
147 |
+
default="train+validation",
|
148 |
+
metadata={
|
149 |
+
"help": "The name of the training data set split to use (via the datasets library). Defaults to 'train+validation'"
|
150 |
+
},
|
151 |
+
)
|
152 |
+
eval_split_name: str = field(
|
153 |
+
default="test",
|
154 |
+
metadata={
|
155 |
+
"help": "The name of the evaluation data set split to use (via the datasets library). Defaults to 'test'"
|
156 |
+
},
|
157 |
+
)
|
158 |
+
audio_column_name: str = field(
|
159 |
+
default="audio",
|
160 |
+
metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
|
161 |
+
)
|
162 |
+
text_column_name: str = field(
|
163 |
+
default="text",
|
164 |
+
metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"},
|
165 |
+
)
|
166 |
+
overwrite_cache: bool = field(
|
167 |
+
default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
|
168 |
+
)
|
169 |
+
preprocessing_num_workers: Optional[int] = field(
|
170 |
+
default=None,
|
171 |
+
metadata={"help": "The number of processes to use for the preprocessing."},
|
172 |
+
)
|
173 |
+
max_train_samples: Optional[int] = field(
|
174 |
+
default=None,
|
175 |
+
metadata={
|
176 |
+
"help": "For debugging purposes or quicker training, truncate the number of training examples to this "
|
177 |
+
"value if set."
|
178 |
+
},
|
179 |
+
)
|
180 |
+
max_eval_samples: Optional[int] = field(
|
181 |
+
default=None,
|
182 |
+
metadata={
|
183 |
+
"help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
|
184 |
+
"value if set."
|
185 |
+
},
|
186 |
+
)
|
187 |
+
chars_to_ignore: Optional[List[str]] = list_field(
|
188 |
+
default=None,
|
189 |
+
metadata={"help": "A list of characters to remove from the transcripts."},
|
190 |
+
)
|
191 |
+
eval_metrics: List[str] = list_field(
|
192 |
+
default=["wer"],
|
193 |
+
metadata={"help": "A list of metrics the model should be evaluated on. E.g. `'wer cer'`"},
|
194 |
+
)
|
195 |
+
max_duration_in_seconds: float = field(
|
196 |
+
default=20.0,
|
197 |
+
metadata={
|
198 |
+
"help": "Filter audio files that are longer than `max_duration_in_seconds` seconds to 'max_duration_in_seconds`"
|
199 |
+
},
|
200 |
+
)
|
201 |
+
min_duration_in_seconds: float = field(
|
202 |
+
default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}
|
203 |
+
)
|
204 |
+
preprocessing_only: bool = field(
|
205 |
+
default=False,
|
206 |
+
metadata={
|
207 |
+
"help": "Whether to only do data preprocessing and skip training. "
|
208 |
+
"This is especially useful when data preprocessing errors out in distributed training due to timeout. "
|
209 |
+
"In this case, one should run the preprocessing in a non-distributed setup with `preprocessing_only=True` "
|
210 |
+
"so that the cached datasets can consequently be loaded in distributed training"
|
211 |
+
},
|
212 |
+
)
|
213 |
+
use_auth_token: bool = field(
|
214 |
+
default=False,
|
215 |
+
metadata={
|
216 |
+
"help": "If :obj:`True`, will use the token generated when running"
|
217 |
+
":obj:`transformers-cli login` as HTTP bearer authorization for remote files."
|
218 |
+
},
|
219 |
+
)
|
220 |
+
unk_token: str = field(
|
221 |
+
default="[UNK]",
|
222 |
+
metadata={"help": "The unk token for the tokenizer"},
|
223 |
+
)
|
224 |
+
pad_token: str = field(
|
225 |
+
default="[PAD]",
|
226 |
+
metadata={"help": "The padding token for the tokenizer"},
|
227 |
+
)
|
228 |
+
word_delimiter_token: str = field(
|
229 |
+
default="|",
|
230 |
+
metadata={"help": "The word delimiter token for the tokenizer"},
|
231 |
+
)
|
232 |
+
phoneme_language: Optional[str] = field(
|
233 |
+
default=None,
|
234 |
+
metadata={
|
235 |
+
"help": "The target language that should be used be"
|
236 |
+
" passed to the tokenizer for tokenization. Note that"
|
237 |
+
" this is only relevant if the model classifies the"
|
238 |
+
" input audio to a sequence of phoneme sequences."
|
239 |
+
},
|
240 |
+
)
|
241 |
+
|
242 |
+
|
243 |
+
@dataclass
|
244 |
+
class DataCollatorCTCWithPadding:
|
245 |
+
"""
|
246 |
+
Data collator that will dynamically pad the inputs received.
|
247 |
+
Args:
|
248 |
+
processor (:class:`~transformers.AutoProcessor`)
|
249 |
+
The processor used for proccessing the data.
|
250 |
+
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
|
251 |
+
Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
|
252 |
+
among:
|
253 |
+
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
|
254 |
+
sequence if provided).
|
255 |
+
* :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
|
256 |
+
maximum acceptable input length for the model if that argument is not provided.
|
257 |
+
* :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
|
258 |
+
different lengths).
|
259 |
+
max_length (:obj:`int`, `optional`):
|
260 |
+
Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
|
261 |
+
max_length_labels (:obj:`int`, `optional`):
|
262 |
+
Maximum length of the ``labels`` returned list and optionally padding length (see above).
|
263 |
+
pad_to_multiple_of (:obj:`int`, `optional`):
|
264 |
+
If set will pad the sequence to a multiple of the provided value.
|
265 |
+
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
|
266 |
+
7.5 (Volta).
|
267 |
+
"""
|
268 |
+
|
269 |
+
processor: AutoProcessor
|
270 |
+
padding: Union[bool, str] = "longest"
|
271 |
+
pad_to_multiple_of: Optional[int] = None
|
272 |
+
pad_to_multiple_of_labels: Optional[int] = None
|
273 |
+
|
274 |
+
def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
|
275 |
+
# split inputs and labels since they have to be of different lenghts and need
|
276 |
+
# different padding methods
|
277 |
+
input_features = [{"input_values": feature["input_values"]} for feature in features]
|
278 |
+
label_features = [{"input_ids": feature["labels"]} for feature in features]
|
279 |
+
|
280 |
+
batch = self.processor.pad(
|
281 |
+
input_features,
|
282 |
+
padding=self.padding,
|
283 |
+
pad_to_multiple_of=self.pad_to_multiple_of,
|
284 |
+
return_tensors="pt",
|
285 |
+
)
|
286 |
+
|
287 |
+
with self.processor.as_target_processor():
|
288 |
+
labels_batch = self.processor.pad(
|
289 |
+
label_features,
|
290 |
+
padding=self.padding,
|
291 |
+
pad_to_multiple_of=self.pad_to_multiple_of_labels,
|
292 |
+
return_tensors="pt",
|
293 |
+
)
|
294 |
+
|
295 |
+
# replace padding with -100 to ignore loss correctly
|
296 |
+
labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
|
297 |
+
|
298 |
+
batch["labels"] = labels
|
299 |
+
|
300 |
+
return batch
|
301 |
+
|
302 |
+
|
303 |
+
def create_vocabulary_from_data(
|
304 |
+
datasets: DatasetDict,
|
305 |
+
word_delimiter_token: Optional[str] = None,
|
306 |
+
unk_token: Optional[str] = None,
|
307 |
+
pad_token: Optional[str] = None,
|
308 |
+
):
|
309 |
+
# Given training and test labels create vocabulary
|
310 |
+
def extract_all_chars(batch):
|
311 |
+
all_text = " ".join(batch["target_text"])
|
312 |
+
vocab = list(set(all_text))
|
313 |
+
return {"vocab": [vocab], "all_text": [all_text]}
|
314 |
+
|
315 |
+
vocabs = datasets.map(
|
316 |
+
extract_all_chars,
|
317 |
+
batched=True,
|
318 |
+
batch_size=-1,
|
319 |
+
keep_in_memory=True,
|
320 |
+
remove_columns=datasets["train"].column_names,
|
321 |
+
)
|
322 |
+
|
323 |
+
# take union of all unique characters in each dataset
|
324 |
+
vocab_set = functools.reduce(
|
325 |
+
lambda vocab_1, vocab_2: set(vocab_1["vocab"][0]) | set(vocab_2["vocab"][0]), vocabs.values()
|
326 |
+
)
|
327 |
+
|
328 |
+
vocab_dict = {v: k for k, v in enumerate(sorted(list(vocab_set)))}
|
329 |
+
|
330 |
+
# replace white space with delimiter token
|
331 |
+
if word_delimiter_token is not None:
|
332 |
+
vocab_dict[word_delimiter_token] = vocab_dict[" "]
|
333 |
+
del vocab_dict[" "]
|
334 |
+
|
335 |
+
# add unk and pad token
|
336 |
+
if unk_token is not None:
|
337 |
+
vocab_dict[unk_token] = len(vocab_dict)
|
338 |
+
|
339 |
+
if pad_token is not None:
|
340 |
+
vocab_dict[pad_token] = len(vocab_dict)
|
341 |
+
|
342 |
+
return vocab_dict
|
343 |
+
|
344 |
+
|
345 |
+
def main():
|
346 |
+
# See all possible arguments in src/transformers/training_args.py
|
347 |
+
# or by passing the --help flag to this script.
|
348 |
+
# We now keep distinct sets of args, for a cleaner separation of concerns.
|
349 |
+
|
350 |
+
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
|
351 |
+
if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
|
352 |
+
# If we pass only one argument to the script and it's the path to a json file,
|
353 |
+
# let's parse it to get our arguments.
|
354 |
+
model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
|
355 |
+
else:
|
356 |
+
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
|
357 |
+
|
358 |
+
# Detecting last checkpoint.
|
359 |
+
last_checkpoint = None
|
360 |
+
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
|
361 |
+
last_checkpoint = get_last_checkpoint(training_args.output_dir)
|
362 |
+
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
|
363 |
+
raise ValueError(
|
364 |
+
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
|
365 |
+
"Use --overwrite_output_dir to overcome."
|
366 |
+
)
|
367 |
+
elif last_checkpoint is not None:
|
368 |
+
logger.info(
|
369 |
+
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
|
370 |
+
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
|
371 |
+
)
|
372 |
+
|
373 |
+
# Setup logging
|
374 |
+
logging.basicConfig(
|
375 |
+
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
376 |
+
datefmt="%m/%d/%Y %H:%M:%S",
|
377 |
+
handlers=[logging.StreamHandler(sys.stdout)],
|
378 |
+
)
|
379 |
+
logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)
|
380 |
+
|
381 |
+
# Log on each process the small summary:
|
382 |
+
logger.warning(
|
383 |
+
f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
|
384 |
+
f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
|
385 |
+
)
|
386 |
+
# Set the verbosity to info of the Transformers logger (on main process only):
|
387 |
+
if is_main_process(training_args.local_rank):
|
388 |
+
transformers.utils.logging.set_verbosity_info()
|
389 |
+
logger.info("Training/evaluation parameters %s", training_args)
|
390 |
+
|
391 |
+
# Set seed before initializing model.
|
392 |
+
set_seed(training_args.seed)
|
393 |
+
|
394 |
+
# 1. First, let's load the dataset
|
395 |
+
raw_datasets = DatasetDict()
|
396 |
+
|
397 |
+
if training_args.do_train:
|
398 |
+
raw_datasets["train"] = load_dataset(
|
399 |
+
data_args.dataset_name,
|
400 |
+
data_args.dataset_config_name,
|
401 |
+
split=data_args.train_split_name,
|
402 |
+
use_auth_token=data_args.use_auth_token,
|
403 |
+
)
|
404 |
+
|
405 |
+
if data_args.audio_column_name not in raw_datasets["train"].column_names:
|
406 |
+
raise ValueError(
|
407 |
+
f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'. "
|
408 |
+
"Make sure to set `--audio_column_name` to the correct audio column - one of "
|
409 |
+
f"{', '.join(raw_datasets['train'].column_names)}."
|
410 |
+
)
|
411 |
+
|
412 |
+
if data_args.text_column_name not in raw_datasets["train"].column_names:
|
413 |
+
raise ValueError(
|
414 |
+
f"--text_column_name {data_args.text_column_name} not found in dataset '{data_args.dataset_name}'. "
|
415 |
+
"Make sure to set `--text_column_name` to the correct text column - one of "
|
416 |
+
f"{', '.join(raw_datasets['train'].column_names)}."
|
417 |
+
)
|
418 |
+
|
419 |
+
if data_args.max_train_samples is not None:
|
420 |
+
raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples))
|
421 |
+
|
422 |
+
if training_args.do_eval:
|
423 |
+
raw_datasets["eval"] = load_dataset(
|
424 |
+
data_args.dataset_name,
|
425 |
+
data_args.dataset_config_name,
|
426 |
+
split=data_args.eval_split_name,
|
427 |
+
use_auth_token=data_args.use_auth_token,
|
428 |
+
)
|
429 |
+
|
430 |
+
if data_args.max_eval_samples is not None:
|
431 |
+
raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples))
|
432 |
+
|
433 |
+
# 2. We remove some special characters from the datasets
|
434 |
+
# that make training complicated and do not help in transcribing the speech
|
435 |
+
# E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
|
436 |
+
# that could be easily picked up by the model
|
437 |
+
chars_to_ignore_regex = (
|
438 |
+
f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None
|
439 |
+
)
|
440 |
+
text_column_name = data_args.text_column_name
|
441 |
+
|
442 |
+
def remove_special_characters(batch):
|
443 |
+
if chars_to_ignore_regex is not None:
|
444 |
+
batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name]).lower() + " "
|
445 |
+
else:
|
446 |
+
batch["target_text"] = batch[text_column_name].lower() + " "
|
447 |
+
return batch
|
448 |
+
|
449 |
+
with training_args.main_process_first(desc="dataset map special characters removal"):
|
450 |
+
raw_datasets = raw_datasets.map(
|
451 |
+
remove_special_characters,
|
452 |
+
remove_columns=[text_column_name],
|
453 |
+
desc="remove special characters from datasets",
|
454 |
+
)
|
455 |
+
|
456 |
+
# save special tokens for tokenizer
|
457 |
+
word_delimiter_token = data_args.word_delimiter_token
|
458 |
+
unk_token = data_args.unk_token
|
459 |
+
pad_token = data_args.pad_token
|
460 |
+
|
461 |
+
# 3. Next, let's load the config as we might need it to create
|
462 |
+
# the tokenizer
|
463 |
+
# load config
|
464 |
+
config = AutoConfig.from_pretrained(
|
465 |
+
model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token
|
466 |
+
)
|
467 |
+
|
468 |
+
# 4. Next, if no tokenizer file is defined,
|
469 |
+
# we create the vocabulary of the model by extracting all unique characters from
|
470 |
+
# the training and evaluation datasets
|
471 |
+
# We need to make sure that only first rank saves vocabulary
|
472 |
+
# make sure all processes wait until vocab is created
|
473 |
+
tokenizer_name_or_path = model_args.tokenizer_name_or_path
|
474 |
+
tokenizer_kwargs = {}
|
475 |
+
if tokenizer_name_or_path is None:
|
476 |
+
# save vocab in training output dir
|
477 |
+
tokenizer_name_or_path = training_args.output_dir
|
478 |
+
|
479 |
+
vocab_file = os.path.join(tokenizer_name_or_path, "vocab.json")
|
480 |
+
|
481 |
+
with training_args.main_process_first():
|
482 |
+
if training_args.overwrite_output_dir and os.path.isfile(vocab_file):
|
483 |
+
os.remove(vocab_file)
|
484 |
+
|
485 |
+
with training_args.main_process_first(desc="dataset map vocabulary creation"):
|
486 |
+
if not os.path.isfile(vocab_file):
|
487 |
+
os.makedirs(tokenizer_name_or_path, exist_ok=True)
|
488 |
+
vocab_dict = create_vocabulary_from_data(
|
489 |
+
raw_datasets,
|
490 |
+
word_delimiter_token=word_delimiter_token,
|
491 |
+
unk_token=unk_token,
|
492 |
+
pad_token=pad_token,
|
493 |
+
)
|
494 |
+
|
495 |
+
# save vocab dict to be loaded into tokenizer
|
496 |
+
with open(vocab_file, "w") as file:
|
497 |
+
json.dump(vocab_dict, file)
|
498 |
+
|
499 |
+
# if tokenizer has just been created
|
500 |
+
# it is defined by `tokenizer_class` if present in config else by `model_type`
|
501 |
+
tokenizer_kwargs = {
|
502 |
+
"config": config if config.tokenizer_class is not None else None,
|
503 |
+
"tokenizer_type": config.model_type if config.tokenizer_class is None else None,
|
504 |
+
"unk_token": unk_token,
|
505 |
+
"pad_token": pad_token,
|
506 |
+
"word_delimiter_token": word_delimiter_token,
|
507 |
+
}
|
508 |
+
|
509 |
+
# 5. Now we can instantiate the feature extractor, tokenizer and model
|
510 |
+
# Note for distributed training, the .from_pretrained methods guarantee that only
|
511 |
+
# one local process can concurrently download model & vocab.
|
512 |
+
|
513 |
+
# load feature_extractor and tokenizer
|
514 |
+
tokenizer = AutoTokenizer.from_pretrained(
|
515 |
+
tokenizer_name_or_path,
|
516 |
+
use_auth_token=data_args.use_auth_token,
|
517 |
+
**tokenizer_kwargs,
|
518 |
+
)
|
519 |
+
feature_extractor = AutoFeatureExtractor.from_pretrained(
|
520 |
+
model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token
|
521 |
+
)
|
522 |
+
|
523 |
+
# adapt config
|
524 |
+
config.update(
|
525 |
+
{
|
526 |
+
"feat_proj_dropout": model_args.feat_proj_dropout,
|
527 |
+
"attention_dropout": model_args.attention_dropout,
|
528 |
+
"hidden_dropout": model_args.hidden_dropout,
|
529 |
+
"final_dropout": model_args.final_dropout,
|
530 |
+
"mask_time_prob": model_args.mask_time_prob,
|
531 |
+
"mask_time_length": model_args.mask_time_length,
|
532 |
+
"mask_feature_prob": model_args.mask_feature_prob,
|
533 |
+
"mask_feature_length": model_args.mask_feature_length,
|
534 |
+
"gradient_checkpointing": training_args.gradient_checkpointing,
|
535 |
+
"layerdrop": model_args.layerdrop,
|
536 |
+
"ctc_loss_reduction": model_args.ctc_loss_reduction,
|
537 |
+
"pad_token_id": tokenizer.pad_token_id,
|
538 |
+
"vocab_size": len(tokenizer),
|
539 |
+
"activation_dropout": model_args.activation_dropout,
|
540 |
+
}
|
541 |
+
)
|
542 |
+
|
543 |
+
# create model
|
544 |
+
model = AutoModelForCTC.from_pretrained(
|
545 |
+
model_args.model_name_or_path,
|
546 |
+
cache_dir=model_args.cache_dir,
|
547 |
+
config=config,
|
548 |
+
use_auth_token=data_args.use_auth_token,
|
549 |
+
)
|
550 |
+
|
551 |
+
# freeze encoder
|
552 |
+
if model_args.freeze_feature_encoder:
|
553 |
+
model.freeze_feature_encoder()
|
554 |
+
|
555 |
+
# 6. Now we preprocess the datasets including loading the audio, resampling and normalization
|
556 |
+
# Thankfully, `datasets` takes care of automatically loading and resampling the audio,
|
557 |
+
# so that we just need to set the correct target sampling rate and normalize the input
|
558 |
+
# via the `feature_extractor`
|
559 |
+
|
560 |
+
# make sure that dataset decodes audio with correct sampling rate
|
561 |
+
dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate
|
562 |
+
if dataset_sampling_rate != feature_extractor.sampling_rate:
|
563 |
+
raw_datasets = raw_datasets.cast_column(
|
564 |
+
data_args.audio_column_name, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate)
|
565 |
+
)
|
566 |
+
|
567 |
+
# derive max & min input length for sample rate & max duration
|
568 |
+
max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate
|
569 |
+
min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate
|
570 |
+
audio_column_name = data_args.audio_column_name
|
571 |
+
num_workers = data_args.preprocessing_num_workers
|
572 |
+
|
573 |
+
# `phoneme_language` is only relevant if the model is fine-tuned on phoneme classification
|
574 |
+
phoneme_language = data_args.phoneme_language
|
575 |
+
|
576 |
+
# Preprocessing the datasets.
|
577 |
+
# We need to read the audio files as arrays and tokenize the targets.
|
578 |
+
def prepare_dataset(batch):
|
579 |
+
# load audio
|
580 |
+
sample = batch[audio_column_name]
|
581 |
+
|
582 |
+
inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])
|
583 |
+
batch["input_values"] = inputs.input_values[0]
|
584 |
+
batch["input_length"] = len(batch["input_values"])
|
585 |
+
|
586 |
+
# encode targets
|
587 |
+
additional_kwargs = {}
|
588 |
+
if phoneme_language is not None:
|
589 |
+
additional_kwargs["phonemizer_lang"] = phoneme_language
|
590 |
+
|
591 |
+
batch["labels"] = tokenizer(batch["target_text"], **additional_kwargs).input_ids
|
592 |
+
return batch
|
593 |
+
|
594 |
+
with training_args.main_process_first(desc="dataset map preprocessing"):
|
595 |
+
vectorized_datasets = raw_datasets.map(
|
596 |
+
prepare_dataset,
|
597 |
+
remove_columns=next(iter(raw_datasets.values())).column_names,
|
598 |
+
num_proc=num_workers,
|
599 |
+
desc="preprocess datasets",
|
600 |
+
)
|
601 |
+
|
602 |
+
def is_audio_in_length_range(length):
|
603 |
+
return length > min_input_length and length < max_input_length
|
604 |
+
|
605 |
+
# filter data that is shorter than min_input_length
|
606 |
+
vectorized_datasets = vectorized_datasets.filter(
|
607 |
+
is_audio_in_length_range,
|
608 |
+
num_proc=num_workers,
|
609 |
+
input_columns=["input_length"],
|
610 |
+
)
|
611 |
+
|
612 |
+
# 7. Next, we can prepare the training.
|
613 |
+
# Let's use word error rate (WER) as our evaluation metric,
|
614 |
+
# instantiate a data collator and the trainer
|
615 |
+
|
616 |
+
# Define evaluation metrics during training, *i.e.* word error rate, character error rate
|
617 |
+
eval_metrics = {metric: load_metric(metric) for metric in data_args.eval_metrics}
|
618 |
+
|
619 |
+
# for large datasets it is advised to run the preprocessing on a
|
620 |
+
# single machine first with ``args.preprocessing_only`` since there will mostly likely
|
621 |
+
# be a timeout when running the script in distributed mode.
|
622 |
+
# In a second step ``args.preprocessing_only`` can then be set to `False` to load the
|
623 |
+
# cached dataset
|
624 |
+
if data_args.preprocessing_only:
|
625 |
+
logger.info(f"Data preprocessing finished. Files cached at {vectorized_datasets.cache_files}")
|
626 |
+
return
|
627 |
+
|
628 |
+
def compute_metrics(pred):
|
629 |
+
pred_logits = pred.predictions
|
630 |
+
pred_ids = np.argmax(pred_logits, axis=-1)
|
631 |
+
|
632 |
+
pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id
|
633 |
+
|
634 |
+
pred_str = tokenizer.batch_decode(pred_ids)
|
635 |
+
# we do not want to group tokens when computing the metrics
|
636 |
+
label_str = tokenizer.batch_decode(pred.label_ids, group_tokens=False)
|
637 |
+
|
638 |
+
metrics = {k: v.compute(predictions=pred_str, references=label_str) for k, v in eval_metrics.items()}
|
639 |
+
|
640 |
+
return metrics
|
641 |
+
|
642 |
+
# Now save everything to be able to create a single processor later
|
643 |
+
if is_main_process(training_args.local_rank):
|
644 |
+
# save feature extractor, tokenizer and config
|
645 |
+
feature_extractor.save_pretrained(training_args.output_dir)
|
646 |
+
tokenizer.save_pretrained(training_args.output_dir)
|
647 |
+
config.save_pretrained(training_args.output_dir)
|
648 |
+
|
649 |
+
try:
|
650 |
+
processor = AutoProcessor.from_pretrained(training_args.output_dir)
|
651 |
+
except (OSError, KeyError):
|
652 |
+
warnings.warn(
|
653 |
+
"Loading a processor from a feature extractor config that does not"
|
654 |
+
" include a `processor_class` attribute is deprecated and will be removed in v5. Please add the following "
|
655 |
+
" attribute to your `preprocessor_config.json` file to suppress this warning: "
|
656 |
+
" `'processor_class': 'Wav2Vec2Processor'`",
|
657 |
+
FutureWarning,
|
658 |
+
)
|
659 |
+
processor = Wav2Vec2Processor.from_pretrained(training_args.output_dir)
|
660 |
+
|
661 |
+
# Instantiate custom data collator
|
662 |
+
data_collator = DataCollatorCTCWithPadding(processor=processor)
|
663 |
+
|
664 |
+
# Initialize Trainer
|
665 |
+
trainer = Trainer(
|
666 |
+
model=model,
|
667 |
+
data_collator=data_collator,
|
668 |
+
args=training_args,
|
669 |
+
compute_metrics=compute_metrics,
|
670 |
+
train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
|
671 |
+
eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
|
672 |
+
tokenizer=feature_extractor,
|
673 |
+
)
|
674 |
+
|
675 |
+
# 8. Finally, we can start training
|
676 |
+
|
677 |
+
# Training
|
678 |
+
if training_args.do_train:
|
679 |
+
|
680 |
+
# use last checkpoint if exist
|
681 |
+
if last_checkpoint is not None:
|
682 |
+
checkpoint = last_checkpoint
|
683 |
+
elif os.path.isdir(model_args.model_name_or_path):
|
684 |
+
checkpoint = model_args.model_name_or_path
|
685 |
+
else:
|
686 |
+
checkpoint = None
|
687 |
+
|
688 |
+
train_result = trainer.train(resume_from_checkpoint=checkpoint)
|
689 |
+
trainer.save_model()
|
690 |
+
|
691 |
+
metrics = train_result.metrics
|
692 |
+
max_train_samples = (
|
693 |
+
data_args.max_train_samples
|
694 |
+
if data_args.max_train_samples is not None
|
695 |
+
else len(vectorized_datasets["train"])
|
696 |
+
)
|
697 |
+
metrics["train_samples"] = min(max_train_samples, len(vectorized_datasets["train"]))
|
698 |
+
|
699 |
+
trainer.log_metrics("train", metrics)
|
700 |
+
trainer.save_metrics("train", metrics)
|
701 |
+
trainer.save_state()
|
702 |
+
|
703 |
+
# Evaluation
|
704 |
+
results = {}
|
705 |
+
if training_args.do_eval:
|
706 |
+
logger.info("*** Evaluate ***")
|
707 |
+
metrics = trainer.evaluate()
|
708 |
+
max_eval_samples = (
|
709 |
+
data_args.max_eval_samples if data_args.max_eval_samples is not None else len(vectorized_datasets["eval"])
|
710 |
+
)
|
711 |
+
metrics["eval_samples"] = min(max_eval_samples, len(vectorized_datasets["eval"]))
|
712 |
+
|
713 |
+
trainer.log_metrics("eval", metrics)
|
714 |
+
trainer.save_metrics("eval", metrics)
|
715 |
+
|
716 |
+
# Write model card and (optionally) push to hub
|
717 |
+
config_name = data_args.dataset_config_name if data_args.dataset_config_name is not None else "na"
|
718 |
+
kwargs = {
|
719 |
+
"finetuned_from": model_args.model_name_or_path,
|
720 |
+
"tasks": "speech-recognition",
|
721 |
+
"tags": ["automatic-speech-recognition", data_args.dataset_name],
|
722 |
+
"dataset_args": f"Config: {config_name}, Training split: {data_args.train_split_name}, Eval split: {data_args.eval_split_name}",
|
723 |
+
"dataset": f"{data_args.dataset_name.upper()} - {config_name.upper()}",
|
724 |
+
}
|
725 |
+
if "common_voice" in data_args.dataset_name:
|
726 |
+
kwargs["language"] = config_name
|
727 |
+
|
728 |
+
if training_args.push_to_hub:
|
729 |
+
trainer.push_to_hub(**kwargs)
|
730 |
+
else:
|
731 |
+
trainer.create_model_card(**kwargs)
|
732 |
+
|
733 |
+
return results
|
734 |
+
|
735 |
+
|
736 |
+
if __name__ == "__main__":
|
737 |
+
main()
|
special_tokens_map.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
|
tokenizer_config.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "./", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
|
training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:36a9163393fb6b85ab9754dc500a8171106095c789cc5d76758339e8671f5b54
|
3 |
+
size 2991
|
vocab.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"a": 1, "d": 2, "e": 3, "g": 4, "i": 5, "l": 6, "o": 7, "p": 8, "q": 9, "⋯": 10, "⼤": 11, "⽣": 12, "、": 13, "。": 14, "《": 15, "》": 16, "「": 17, "」": 18, "ㄟ": 19, "ㄧ": 20, "一": 21, "丁": 22, "七": 23, "丈": 24, "三": 25, "上": 26, "下": 27, "不": 28, "且": 29, "世": 30, "丘": 31, "丙": 32, "丟": 33, "並": 34, "中": 35, "串": 36, "丶": 37, "丸": 38, "丹": 39, "主": 40, "乃": 41, "久": 42, "之": 43, "乍": 44, "乎": 45, "乏": 46, "乖": 47, "乘": 48, "乙": 49, "九": 50, "乞": 51, "也": 52, "乳": 53, "乾": 54, "亂": 55, "了": 56, "予": 57, "事": 58, "二": 59, "互": 60, "五": 61, "井": 62, "些": 63, "亞": 64, "亡": 65, "交": 66, "亥": 67, "亦": 68, "享": 69, "京": 70, "亭": 71, "亮": 72, "人": 73, "什": 74, "仁": 75, "仆": 76, "仇": 77, "今": 78, "介": 79, "仍": 80, "仔": 81, "他": 82, "付": 83, "仙": 84, "代": 85, "令": 86, "以": 87, "仰": 88, "仲": 89, "件": 90, "任": 91, "份": 92, "仿": 93, "企": 94, "伊": 95, "伍": 96, "伏": 97, "伐": 98, "休": 99, "伙": 100, "伯": 101, "估": 102, "伴": 103, "伸": 104, "伺": 105, "似": 106, "佃": 107, "但": 108, "佈": 109, "位": 110, "低": 111, "住": 112, "佑": 113, "佔": 114, "何": 115, "佛": 116, "作": 117, "你": 118, "佩": 119, "佰": 120, "佳": 121, "併": 122, "使": 123, "來": 124, "侈": 125, "例": 126, "供": 127, "依": 128, "侮": 129, "侯": 130, "侵": 131, "侶": 132, "便": 133, "係": 134, "促": 135, "俄": 136, "俊": 137, "俏": 138, "俗": 139, "保": 140, "俠": 141, "信": 142, "修": 143, "俱": 144, "俾": 145, "倉": 146, "個": 147, "倍": 148, "們": 149, "倒": 150, "倖": 151, "倘": 152, "候": 153, "倚": 154, "借": 155, "倡": 156, "値": 157, "倦": 158, "倫": 159, "值": 160, "假": 161, "偉": 162, "偏": 163, "做": 164, "停": 165, "健": 166, "側": 167, "偵": 168, "偶": 169, "偷": 170, "偽": 171, "傅": 172, "傍": 173, "傑": 174, "傘": 175, "備": 176, "傢": 177, "催": 178, "傲": 179, "傳": 180, "債": 181, "傷": 182, "傻": 183, "傾": 184, "僅": 185, "像": 186, "僕": 187, "僚": 188, "僱": 189, "僵": 190, "價": 191, "僻": 192, "儀": 193, "儂": 194, "億": 195, "儘": 196, "償": 197, "優": 198, "儲": 199, "兀": 200, "允": 201, "元": 202, "兄": 203, "充": 204, "兇": 205, "先": 206, "光": 207, "克": 208, "兌": 209, "免": 210, "兒": 211, "兔": 212, "入": 213, "內": 214, "全": 215, "兩": 216, "八": 217, "公": 218, "六": 219, "兮": 220, "共": 221, "兵": 222, "其": 223, "具": 224, "典": 225, "兼": 226, "内": 227, "冊": 228, "再": 229, "冒": 230, "冗": 231, "冠": 232, "冤": 233, "冬": 234, "冰": 235, "冶": 236, "冷": 237, "准": 238, "凋": 239, "凌": 240, "凍": 241, "凝": 242, "凡": 243, "凱": 244, "凳": 245, "凹": 246, "出": 247, "函": 248, "刀": 249, "刁": 250, "分": 251, "切": 252, "刊": 253, "刑": 254, "划": 255, "列": 256, "初": 257, "判": 258, "別": 259, "利": 260, "刪": 261, "到": 262, "制": 263, "刷": 264, "券": 265, "刺": 266, "刻": 267, "則": 268, "前": 269, "剖": 270, "剛": 271, "剝": 272, "剩": 273, "剪": 274, "副": 275, "割": 276, "創": 277, "剷": 278, "劃": 279, "劇": 280, "劈": 281, "劉": 282, "劍": 283, "劑": 284, "力": 285, "功": 286, "加": 287, "劣": 288, "助": 289, "努": 290, "勁": 291, "勇": 292, "勉": 293, "勒": 294, "動": 295, "勘": 296, "務": 297, "勝": 298, "勞": 299, "募": 300, "勢": 301, "勤": 302, "勳": 303, "勵": 304, "勸": 305, "勻": 306, "勾": 307, "勿": 308, "包": 309, "匆": 310, "匈": 311, "化": 312, "北": 313, "匙": 314, "匪": 315, "匯": 316, "匱": 317, "匹": 318, "匿": 319, "區": 320, "十": 321, "千": 322, "升": 323, "午": 324, "半": 325, "卑": 326, "卓": 327, "協": 328, "南": 329, "博": 330, "占": 331, "卡": 332, "卦": 333, "印": 334, "危": 335, "即": 336, "卵": 337, "卷": 338, "卸": 339, "卻": 340, "厄": 341, "厚": 342, "原": 343, "厭": 344, "厲": 345, "去": 346, "參": 347, "又": 348, "叉": 349, "及": 350, "友": 351, "反": 352, "叔": 353, "取": 354, "受": 355, "叛": 356, "叢": 357, "口": 358, "古": 359, "句": 360, "另": 361, "叨": 362, "只": 363, "叫": 364, "召": 365, "叮": 366, 
"可": 367, "台": 368, "史": 369, "右": 370, "司": 371, "叼": 372, "吃": 373, "各": 374, "合": 375, "吉": 376, "吊": 377, "同": 378, "名": 379, "吐": 380, "向": 381, "君": 382, "吝": 383, "吞": 384, "吠": 385, "否": 386, "吧": 387, "含": 388, "吭": 389, "吳": 390, "吵": 391, "吸": 392, "吹": 393, "吻": 394, "吼": 395, "呀": 396, "呂": 397, "呆": 398, "呈": 399, "告": 400, "呢": 401, "周": 402, "呱": 403, "味": 404, "呵": 405, "呼": 406, "命": 407, "咆": 408, "和": 409, "咒": 410, "咕": 411, "咖": 412, "咚": 413, "咪": 414, "咬": 415, "咳": 416, "哀": 417, "品": 418, "哄": 419, "哇": 420, "哈": 421, "哉": 422, "哎": 423, "員": 424, "哥": 425, "哦": 426, "哨": 427, "哩": 428, "哪": 429, "哭": 430, "哲": 431, "哺": 432, "哼": 433, "唆": 434, "唐": 435, "唬": 436, "售": 437, "唯": 438, "唱": 439, "唷": 440, "唸": 441, "商": 442, "啊": 443, "問": 444, "啖": 445, "啟": 446, "啡": 447, "啤": 448, "啥": 449, "啦": 450, "啼": 451, "啾": 452, "喀": 453, "善": 454, "喉": 455, "喊": 456, "喔": 457, "喘": 458, "喜": 459, "喝": 460, "喧": 461, "喪": 462, "喬": 463, "單": 464, "喵": 465, "喻": 466, "嗆": 467, "嗎": 468, "嗚": 469, "嗜": 470, "嗡": 471, "嗨": 472, "嗯": 473, "嗽": 474, "嘆": 475, "嘉": 476, "嘍": 477, "嘗": 478, "嘛": 479, "嘯": 480, "嘰": 481, "嘲": 482, "嘴": 483, "嘻": 484, "噌": 485, "噓": 486, "噗": 487, "器": 488, "噬": 489, "噴": 490, "噶": 491, "噸": 492, "噹": 493, "嚀": 494, "嚇": 495, "嚕": 496, "嚨": 497, "嚮": 498, "嚴": 499, "嚼": 500, "囂": 501, "囉": 502, "囊": 503, "囚": 504, "四": 505, "回": 506, "因": 507, "囤": 508, "困": 509, "固": 510, "圈": 511, "國": 512, "圍": 513, "園": 514, "圓": 515, "圖": 516, "團": 517, "土": 518, "在": 519, "地": 520, "圾": 521, "址": 522, "均": 523, "坊": 524, "坎": 525, "坐": 526, "坑": 527, "坡": 528, "坦": 529, "坪": 530, "垂": 531, "垃": 532, "型": 533, "垢": 534, "埋": 535, "城": 536, "域": 537, "執": 538, "培": 539, "基": 540, "堂": 541, "堃": 542, "堅": 543, "堆": 544, "堡": 545, "堪": 546, "報": 547, "場": 548, "堵": 549, "塊": 550, "塑": 551, "塔": 552, "塗": 553, "塞": 554, "填": 555, "塭": 556, "塵": 557, "境": 558, "墓": 559, "增": 560, "墨": 561, "墮": 562, "墾": 563, "壁": 564, "壅": 565, "壇": 566, "壓": 567, "壘": 568, "壞": 569, "壟": 570, "壤": 571, "士": 572, "壯": 573, "壺": 574, "壽": 575, "夏": 576, "夕": 577, "外": 578, "多": 579, "夜": 580, "夠": 581, "夢": 582, "夥": 583, "大": 584, "天": 585, "太": 586, "夫": 587, "夭": 588, "央": 589, "失": 590, "夷": 591, "夾": 592, "奇": 593, "奈": 594, "奉": 595, "奏": 596, "契": 597, "奔": 598, "套": 599, "奠": 600, "奢": 601, "奧": 602, "奪": 603, "奮": 604, "女": 605, "奴": 606, "奶": 607, "奸": 608, "她": 609, "好": 610, "如": 611, "妃": 612, "妄": 613, "妊": 614, "妓": 615, "妖": 616, "妙": 617, "妝": 618, "妥": 619, "妨": 620, "妳": 621, "妹": 622, "妻": 623, "妾": 624, "姆": 625, "姊": 626, "始": 627, "姍": 628, "姐": 629, "姑": 630, "姓": 631, "委": 632, "姚": 633, "姦": 634, "姨": 635, "姬": 636, "姻": 637, "威": 638, "娑": 639, "娘": 640, "娛": 641, "娠": 642, "娥": 643, "娶": 644, "娼": 645, "婆": 646, "婉": 647, "婊": 648, "婚": 649, "婦": 650, "婪": 651, "媒": 652, "媳": 653, "媽": 654, "嫁": 655, "嫂": 656, "嫌": 657, "嫖": 658, "嫩": 659, "嬤": 660, "嬰": 661, "嬸": 662, "子": 663, "孔": 664, "孕": 665, "字": 666, "存": 667, "孝": 668, "季": 669, "孤": 670, "孩": 671, "孫": 672, "孵": 673, "學": 674, "它": 675, "宅": 676, "宇": 677, "守": 678, "安": 679, "宋": 680, "完": 681, "宗": 682, "官": 683, "宙": 684, "定": 685, "宛": 686, "宜": 687, "客": 688, "宣": 689, "室": 690, "宮": 691, "宰": 692, "害": 693, "宴": 694, "宵": 695, "家": 696, "容": 697, "宿": 698, "寂": 699, "寄": 700, "密": 701, "富": 702, "寒": 703, "寓": 704, "寞": 705, "察": 706, "寡": 707, "實": 708, "寧": 709, "寨": 710, "審": 711, "寫": 712, "寬": 713, "寮": 714, "寵": 715, "寶": 716, "寸": 717, "寺": 718, "封": 719, "射": 720, "將": 721, "專": 
722, "尊": 723, "尋": 724, "對": 725, "導": 726, "小": 727, "少": 728, "尖": 729, "尚": 730, "尤": 731, "尬": 732, "就": 733, "尷": 734, "尺": 735, "尼": 736, "尾": 737, "尿": 738, "局": 739, "屁": 740, "居": 741, "屆": 742, "屈": 743, "屋": 744, "屌": 745, "屍": 746, "屎": 747, "屏": 748, "屑": 749, "展": 750, "屜": 751, "屠": 752, "屢": 753, "層": 754, "履": 755, "屬": 756, "屯": 757, "山": 758, "岩": 759, "岸": 760, "峰": 761, "島": 762, "峻": 763, "峽": 764, "崇": 765, "崛": 766, "崩": 767, "崴": 768, "嵌": 769, "嵐": 770, "嶄": 771, "嶼": 772, "巔": 773, "川": 774, "州": 775, "巡": 776, "巢": 777, "工": 778, "左": 779, "巧": 780, "巨": 781, "差": 782, "己": 783, "已": 784, "巴": 785, "巷": 786, "巾": 787, "市": 788, "布": 789, "帆": 790, "希": 791, "帕": 792, "帖": 793, "帝": 794, "帥": 795, "師": 796, "席": 797, "帳": 798, "帶": 799, "常": 800, "帽": 801, "幅": 802, "幌": 803, "幕": 804, "幣": 805, "幫": 806, "干": 807, "平": 808, "年": 809, "幸": 810, "幹": 811, "幻": 812, "幼": 813, "幾": 814, "庄": 815, "床": 816, "序": 817, "底": 818, "店": 819, "府": 820, "度": 821, "座": 822, "庫": 823, "庭": 824, "康": 825, "庸": 826, "廁": 827, "廂": 828, "廉": 829, "廊": 830, "廖": 831, "廚": 832, "廟": 833, "廠": 834, "廢": 835, "廣": 836, "廬": 837, "廳": 838, "延": 839, "廷": 840, "建": 841, "弄": 842, "弊": 843, "式": 844, "弓": 845, "弔": 846, "引": 847, "弘": 848, "弟": 849, "弦": 850, "弱": 851, "張": 852, "強": 853, "彈": 854, "彊": 855, "彌": 856, "彎": 857, "彙": 858, "形": 859, "彩": 860, "彬": 861, "彭": 862, "影": 863, "彷": 864, "役": 865, "彼": 866, "彿": 867, "往": 868, "征": 869, "待": 870, "很": 871, "徊": 872, "律": 873, "後": 874, "徑": 875, "徒": 876, "得": 877, "徘": 878, "徙": 879, "從": 880, "復": 881, "循": 882, "徬": 883, "微": 884, "徵": 885, "德": 886, "徹": 887, "心": 888, "必": 889, "忌": 890, "忍": 891, "志": 892, "忘": 893, "忙": 894, "快": 895, "念": 896, "忽": 897, "怎": 898, "怒": 899, "怕": 900, "怖": 901, "思": 902, "怠": 903, "怡": 904, "急": 905, "怦": 906, "性": 907, "怨": 908, "怪": 909, "怵": 910, "恃": 911, "恆": 912, "恍": 913, "恐": 914, "恕": 915, "恢": 916, "恤": 917, "恨": 918, "恩": 919, "恭": 920, "息": 921, "恰": 922, "悄": 923, "悅": 924, "悉": 925, "悔": 926, "悟": 927, "悠": 928, "患": 929, "您": 930, "悲": 931, "情": 932, "惇": 933, "惋": 934, "惑": 935, "惕": 936, "惘": 937, "惜": 938, "惠": 939, "惡": 940, "惰": 941, "惱": 942, "想": 943, "惶": 944, "惹": 945, "愁": 946, "愈": 947, "愉": 948, "意": 949, "愕": 950, "愚": 951, "愛": 952, "愜": 953, "感": 954, "愣": 955, "愧": 956, "慈": 957, "態": 958, "慌": 959, "慎": 960, "慕": 961, "慘": 962, "慚": 963, "慢": 964, "慣": 965, "慧": 966, "慮": 967, "慰": 968, "慶": 969, "慾": 970, "憂": 971, "憊": 972, "憐": 973, "憑": 974, "憤": 975, "憩": 976, "憲": 977, "憶": 978, "憾": 979, "懂": 980, "懇": 981, "應": 982, "懲": 983, "懶": 984, "懷": 985, "懸": 986, "懼": 987, "戀": 988, "戊": 989, "成": 990, "我": 991, "戒": 992, "或": 993, "戚": 994, "截": 995, "戰": 996, "戲": 997, "戴": 998, "戶": 999, "房": 1000, "所": 1001, "扇": 1002, "扉": 1003, "手": 1004, "才": 1005, "扎": 1006, "打": 1007, "托": 1008, "扛": 1009, "扣": 1010, "扭": 1011, "扮": 1012, "扯": 1013, "扶": 1014, "批": 1015, "扼": 1016, "找": 1017, "承": 1018, "技": 1019, "抄": 1020, "把": 1021, "抑": 1022, "抒": 1023, "抓": 1024, "投": 1025, "抖": 1026, "抗": 1027, "折": 1028, "披": 1029, "抬": 1030, "抱": 1031, "抵": 1032, "抹": 1033, "抽": 1034, "拆": 1035, "拉": 1036, "拋": 1037, "拍": 1038, "拒": 1039, "拓": 1040, "拔": 1041, "拖": 1042, "拘": 1043, "拚": 1044, "招": 1045, "拜": 1046, "括": 1047, "拯": 1048, "拱": 1049, "拳": 1050, "拼": 1051, "拾": 1052, "拿": 1053, "持": 1054, "指": 1055, "按": 1056, "挑": 1057, "挖": 1058, "挫": 1059, "振": 1060, "挺": 1061, "捍": 1062, "捐": 1063, "捕": 1064, "捨": 1065, "捲": 1066, "捷": 1067, "掃": 1068, "授": 1069, "掉": 
1070, "掌": 1071, "掏": 1072, "排": 1073, "掘": 1074, "掙": 1075, "掛": 1076, "掠": 1077, "採": 1078, "探": 1079, "接": 1080, "控": 1081, "推": 1082, "掩": 1083, "措": 1084, "掰": 1085, "揀": 1086, "揉": 1087, "描": 1088, "提": 1089, "插": 1090, "揚": 1091, "換": 1092, "握": 1093, "揣": 1094, "揪": 1095, "揭": 1096, "揮": 1097, "援": 1098, "揹": 1099, "損": 1100, "搏": 1101, "搔": 1102, "搖": 1103, "搗": 1104, "搜": 1105, "搞": 1106, "搬": 1107, "搭": 1108, "搶": 1109, "摔": 1110, "摘": 1111, "摟": 1112, "摧": 1113, "摩": 1114, "摯": 1115, "摸": 1116, "撇": 1117, "撐": 1118, "撒": 1119, "撕": 1120, "撞": 1121, "撤": 1122, "撥": 1123, "撫": 1124, "播": 1125, "撰": 1126, "撲": 1127, "撿": 1128, "擁": 1129, "擅": 1130, "擇": 1131, "擊": 1132, "擋": 1133, "操": 1134, "擎": 1135, "擔": 1136, "據": 1137, "擠": 1138, "擦": 1139, "擬": 1140, "擱": 1141, "擲": 1142, "擴": 1143, "擷": 1144, "擺": 1145, "擾": 1146, "攀": 1147, "攔": 1148, "攜": 1149, "攝": 1150, "攤": 1151, "攪": 1152, "支": 1153, "收": 1154, "攸": 1155, "改": 1156, "攻": 1157, "放": 1158, "政": 1159, "故": 1160, "效": 1161, "敏": 1162, "救": 1163, "敗": 1164, "敘": 1165, "教": 1166, "敝": 1167, "敢": 1168, "散": 1169, "敦": 1170, "敬": 1171, "敲": 1172, "整": 1173, "敵": 1174, "數": 1175, "斂": 1176, "文": 1177, "斌": 1178, "斑": 1179, "斗": 1180, "料": 1181, "斜": 1182, "斤": 1183, "斥": 1184, "斧": 1185, "斯": 1186, "新": 1187, "斷": 1188, "方": 1189, "於": 1190, "施": 1191, "旁": 1192, "旅": 1193, "旋": 1194, "族": 1195, "旗": 1196, "既": 1197, "日": 1198, "旦": 1199, "旨": 1200, "早": 1201, "旬": 1202, "旭": 1203, "旺": 1204, "昂": 1205, "昆": 1206, "昇": 1207, "昌": 1208, "明": 1209, "昏": 1210, "易": 1211, "昔": 1212, "星": 1213, "映": 1214, "春": 1215, "昧": 1216, "昨": 1217, "是": 1218, "昴": 1219, "時": 1220, "晃": 1221, "晉": 1222, "晚": 1223, "晨": 1224, "普": 1225, "景": 1226, "晰": 1227, "晴": 1228, "晶": 1229, "智": 1230, "晾": 1231, "暄": 1232, "暇": 1233, "暈": 1234, "暑": 1235, "暖": 1236, "暗": 1237, "暫": 1238, "暱": 1239, "暴": 1240, "曆": 1241, "曉": 1242, "曙": 1243, "曝": 1244, "曬": 1245, "曲": 1246, "更": 1247, "書": 1248, "曹": 1249, "曼": 1250, "曾": 1251, "替": 1252, "最": 1253, "會": 1254, "月": 1255, "有": 1256, "朋": 1257, "服": 1258, "朕": 1259, "朗": 1260, "望": 1261, "朝": 1262, "期": 1263, "木": 1264, "未": 1265, "末": 1266, "本": 1267, "札": 1268, "朱": 1269, "朵": 1270, "朽": 1271, "杉": 1272, "李": 1273, "材": 1274, "村": 1275, "杖": 1276, "杜": 1277, "束": 1278, "杭": 1279, "杯": 1280, "杰": 1281, "東": 1282, "松": 1283, "板": 1284, "枉": 1285, "析": 1286, "枕": 1287, "林": 1288, "枚": 1289, "果": 1290, "枝": 1291, "枯": 1292, "架": 1293, "柏": 1294, "某": 1295, "染": 1296, "柔": 1297, "柚": 1298, "查": 1299, "柯": 1300, "柱": 1301, "柳": 1302, "柴": 1303, "柵": 1304, "柺": 1305, "柿": 1306, "栗": 1307, "校": 1308, "栩": 1309, "株": 1310, "核": 1311, "根": 1312, "格": 1313, "栽": 1314, "桂": 1315, "桃": 1316, "框": 1317, "案": 1318, "桌": 1319, "桶": 1320, "桿": 1321, "梁": 1322, "梅": 1323, "梗": 1324, "條": 1325, "梨": 1326, "梯": 1327, "梳": 1328, "棄": 1329, "棉": 1330, "棋": 1331, "棍": 1332, "棒": 1333, "棚": 1334, "棟": 1335, "森": 1336, "棲": 1337, "棵": 1338, "棺": 1339, "椅": 1340, "植": 1341, "椎": 1342, "椒": 1343, "椰": 1344, "楊": 1345, "楚": 1346, "楞": 1347, "業": 1348, "極": 1349, "概": 1350, "榔": 1351, "榕": 1352, "榜": 1353, "榨": 1354, "榮": 1355, "榻": 1356, "構": 1357, "槍": 1358, "槓": 1359, "樁": 1360, "樂": 1361, "樑": 1362, "樓": 1363, "標": 1364, "樞": 1365, "模": 1366, "樣": 1367, "樵": 1368, "樸": 1369, "樹": 1370, "樺": 1371, "樽": 1372, "橋": 1373, "橘": 1374, "橙": 1375, "機": 1376, "橡": 1377, "橫": 1378, "檔": 1379, "檢": 1380, "檬": 1381, "檯": 1382, "檳": 1383, "檸": 1384, "檻": 1385, "櫃": 1386, "櫻": 1387, "欄": 1388, "欉": 1389, "權": 1390, "欠": 1391, "次": 1392, "欣": 
1393, "欲": 1394, "欸": 1395, "欺": 1396, "欽": 1397, "款": 1398, "歉": 1399, "歌": 1400, "歐": 1401, "歡": 1402, "止": 1403, "正": 1404, "此": 1405, "步": 1406, "武": 1407, "歧": 1408, "歲": 1409, "歷": 1410, "歸": 1411, "死": 1412, "殊": 1413, "殖": 1414, "殘": 1415, "殭": 1416, "段": 1417, "殷": 1418, "殺": 1419, "殼": 1420, "殿": 1421, "毀": 1422, "毋": 1423, "母": 1424, "每": 1425, "毒": 1426, "比": 1427, "毛": 1428, "毫": 1429, "氏": 1430, "民": 1431, "氘": 1432, "氚": 1433, "氛": 1434, "氣": 1435, "氧": 1436, "氨": 1437, "氫": 1438, "氮": 1439, "氰": 1440, "水": 1441, "永": 1442, "氾": 1443, "汀": 1444, "汁": 1445, "求": 1446, "汐": 1447, "汗": 1448, "汙": 1449, "汛": 1450, "汝": 1451, "江": 1452, "池": 1453, "污": 1454, "汪": 1455, "汰": 1456, "決": 1457, "汽": 1458, "汾": 1459, "沃": 1460, "沈": 1461, "沉": 1462, "沐": 1463, "沒": 1464, "沖": 1465, "沙": 1466, "沛": 1467, "没": 1468, "沮": 1469, "沱": 1470, "河": 1471, "沸": 1472, "油": 1473, "治": 1474, "沼": 1475, "沾": 1476, "沿": 1477, "況": 1478, "泉": 1479, "泊": 1480, "法": 1481, "泛": 1482, "泡": 1483, "波": 1484, "泣": 1485, "泥": 1486, "注": 1487, "泰": 1488, "泳": 1489, "洋": 1490, "洗": 1491, "洛": 1492, "洞": 1493, "津": 1494, "洪": 1495, "洱": 1496, "洲": 1497, "活": 1498, "洽": 1499, "派": 1500, "流": 1501, "浩": 1502, "浪": 1503, "浮": 1504, "浴": 1505, "海": 1506, "消": 1507, "涉": 1508, "涯": 1509, "液": 1510, "涵": 1511, "涸": 1512, "涼": 1513, "淇": 1514, "淋": 1515, "淑": 1516, "淒": 1517, "淘": 1518, "淚": 1519, "淡": 1520, "淤": 1521, "淨": 1522, "淪": 1523, "深": 1524, "淵": 1525, "混": 1526, "淹": 1527, "淺": 1528, "添": 1529, "清": 1530, "減": 1531, "渡": 1532, "渣": 1533, "渥": 1534, "渦": 1535, "測": 1536, "渭": 1537, "港": 1538, "渲": 1539, "渴": 1540, "游": 1541, "渾": 1542, "湊": 1543, "湍": 1544, "湖": 1545, "湧": 1546, "湯": 1547, "溉": 1548, "源": 1549, "準": 1550, "溝": 1551, "溪": 1552, "溫": 1553, "溶": 1554, "溺": 1555, "溼": 1556, "滂": 1557, "滅": 1558, "滋": 1559, "滌": 1560, "滑": 1561, "滯": 1562, "滲": 1563, "滴": 1564, "滷": 1565, "滾": 1566, "滿": 1567, "漁": 1568, "漂": 1569, "漆": 1570, "漏": 1571, "演": 1572, "漠": 1573, "漢": 1574, "漫": 1575, "漲": 1576, "漸": 1577, "潑": 1578, "潔": 1579, "潛": 1580, "潤": 1581, "潦": 1582, "潮": 1583, "潰": 1584, "澄": 1585, "澎": 1586, "澡": 1587, "澤": 1588, "澳": 1589, "激": 1590, "濃": 1591, "濕": 1592, "濟": 1593, "濫": 1594, "濱": 1595, "濾": 1596, "瀏": 1597, "瀑": 1598, "瀕": 1599, "灌": 1600, "灑": 1601, "灘": 1602, "灣": 1603, "火": 1604, "灰": 1605, "災": 1606, "炎": 1607, "炒": 1608, "炫": 1609, "炮": 1610, "炸": 1611, "為": 1612, "烈": 1613, "烊": 1614, "烏": 1615, "烤": 1616, "烹": 1617, "焉": 1618, "焚": 1619, "無": 1620, "焦": 1621, "焰": 1622, "然": 1623, "煎": 1624, "煙": 1625, "煤": 1626, "煦": 1627, "照": 1628, "煩": 1629, "煮": 1630, "熊": 1631, "熔": 1632, "熟": 1633, "熬": 1634, "熱": 1635, "燃": 1636, "燈": 1637, "燒": 1638, "燕": 1639, "燙": 1640, "營": 1641, "燥": 1642, "燦": 1643, "燭": 1644, "爆": 1645, "爍": 1646, "爐": 1647, "爛": 1648, "爪": 1649, "爬": 1650, "爭": 1651, "爲": 1652, "父": 1653, "爸": 1654, "爹": 1655, "爺": 1656, "爽": 1657, "爾": 1658, "牆": 1659, "片": 1660, "版": 1661, "牌": 1662, "牙": 1663, "牛": 1664, "牠": 1665, "牡": 1666, "牧": 1667, "物": 1668, "牲": 1669, "牴": 1670, "特": 1671, "牽": 1672, "犬": 1673, "犯": 1674, "狀": 1675, "狂": 1676, "狐": 1677, "狗": 1678, "狠": 1679, "狸": 1680, "狹": 1681, "狼": 1682, "猛": 1683, "猜": 1684, "猩": 1685, "猴": 1686, "猶": 1687, "猿": 1688, "獄": 1689, "獅": 1690, "獎": 1691, "獨": 1692, "獲": 1693, "獵": 1694, "獸": 1695, "獻": 1696, "玄": 1697, "率": 1698, "玉": 1699, "王": 1700, "玩": 1701, "玫": 1702, "玻": 1703, "珀": 1704, "珈": 1705, "珍": 1706, "珠": 1707, "班": 1708, "現": 1709, "球": 1710, "理": 1711, "琥": 1712, "琪": 1713, "琴": 1714, "瑋": 1715, "瑕": 
1716, "瑜": 1717, "瑟": 1718, "瑩": 1719, "瑪": 1720, "瑰": 1721, "璃": 1722, "璧": 1723, "環": 1724, "瓜": 1725, "瓦": 1726, "瓶": 1727, "瓷": 1728, "甄": 1729, "甘": 1730, "甚": 1731, "甜": 1732, "生": 1733, "產": 1734, "甦": 1735, "用": 1736, "甩": 1737, "甫": 1738, "田": 1739, "由": 1740, "甲": 1741, "申": 1742, "男": 1743, "町": 1744, "界": 1745, "畏": 1746, "畔": 1747, "留": 1748, "畜": 1749, "畢": 1750, "略": 1751, "番": 1752, "畫": 1753, "異": 1754, "當": 1755, "畸": 1756, "疆": 1757, "疊": 1758, "疏": 1759, "疑": 1760, "疙": 1761, "疫": 1762, "疲": 1763, "疼": 1764, "疾": 1765, "病": 1766, "症": 1767, "痕": 1768, "痛": 1769, "痠": 1770, "痰": 1771, "痴": 1772, "瘋": 1773, "瘟": 1774, "瘦": 1775, "瘩": 1776, "療": 1777, "癌": 1778, "癒": 1779, "癡": 1780, "癢": 1781, "癮": 1782, "癱": 1783, "登": 1784, "發": 1785, "白": 1786, "百": 1787, "皂": 1788, "的": 1789, "皆": 1790, "皇": 1791, "皮": 1792, "皺": 1793, "盃": 1794, "盆": 1795, "益": 1796, "盎": 1797, "盒": 1798, "盔": 1799, "盛": 1800, "盜": 1801, "盞": 1802, "盟": 1803, "盡": 1804, "監": 1805, "盤": 1806, "盪": 1807, "目": 1808, "盲": 1809, "直": 1810, "相": 1811, "盼": 1812, "盾": 1813, "省": 1814, "眉": 1815, "看": 1816, "真": 1817, "眠": 1818, "眨": 1819, "眷": 1820, "眺": 1821, "眼": 1822, "眾": 1823, "睛": 1824, "睞": 1825, "睡": 1826, "督": 1827, "睦": 1828, "瞌": 1829, "瞧": 1830, "瞪": 1831, "瞬": 1832, "瞭": 1833, "瞻": 1834, "矛": 1835, "知": 1836, "矩": 1837, "短": 1838, "矮": 1839, "矲": 1840, "石": 1841, "砂": 1842, "研": 1843, "砲": 1844, "破": 1845, "砷": 1846, "硃": 1847, "硫": 1848, "硬": 1849, "碌": 1850, "碎": 1851, "碑": 1852, "碗": 1853, "碟": 1854, "碧": 1855, "碩": 1856, "碰": 1857, "碳": 1858, "確": 1859, "碼": 1860, "碾": 1861, "磁": 1862, "磐": 1863, "磚": 1864, "磨": 1865, "礁": 1866, "礎": 1867, "礙": 1868, "示": 1869, "社": 1870, "祈": 1871, "祉": 1872, "祕": 1873, "祖": 1874, "祝": 1875, "神": 1876, "祟": 1877, "祥": 1878, "票": 1879, "祭": 1880, "祿": 1881, "禁": 1882, "禍": 1883, "福": 1884, "禦": 1885, "禧": 1886, "禮": 1887, "禱": 1888, "禿": 1889, "秀": 1890, "私": 1891, "秉": 1892, "秋": 1893, "科": 1894, "秒": 1895, "秘": 1896, "租": 1897, "秦": 1898, "秧": 1899, "秩": 1900, "移": 1901, "稀": 1902, "稅": 1903, "程": 1904, "稍": 1905, "稚": 1906, "稜": 1907, "稠": 1908, "種": 1909, "稱": 1910, "稻": 1911, "稽": 1912, "稿": 1913, "穀": 1914, "積": 1915, "穗": 1916, "穩": 1917, "穴": 1918, "究": 1919, "空": 1920, "穿": 1921, "突": 1922, "窄": 1923, "窗": 1924, "窘": 1925, "窮": 1926, "窯": 1927, "竄": 1928, "竅": 1929, "竇": 1930, "立": 1931, "站": 1932, "竟": 1933, "章": 1934, "童": 1935, "竭": 1936, "端": 1937, "競": 1938, "竹": 1939, "竿": 1940, "笑": 1941, "符": 1942, "笨": 1943, "第": 1944, "筆": 1945, "等": 1946, "筋": 1947, "筍": 1948, "筐": 1949, "筒": 1950, "答": 1951, "策": 1952, "箕": 1953, "算": 1954, "管": 1955, "箭": 1956, "箱": 1957, "節": 1958, "範": 1959, "篇": 1960, "築": 1961, "篡": 1962, "篩": 1963, "篷": 1964, "簡": 1965, "簷": 1966, "簽": 1967, "簿": 1968, "籃": 1969, "籌": 1970, "籍": 1971, "籠": 1972, "籤": 1973, "籮": 1974, "籲": 1975, "米": 1976, "籽": 1977, "粉": 1978, "粒": 1979, "粗": 1980, "粥": 1981, "粵": 1982, "粹": 1983, "粽": 1984, "精": 1985, "粿": 1986, "糊": 1987, "糕": 1988, "糖": 1989, "糞": 1990, "糟": 1991, "糧": 1992, "糬": 1993, "糰": 1994, "系": 1995, "糾": 1996, "紀": 1997, "約": 1998, "紅": 1999, "紋": 2000, "納": 2001, "紐": 2002, "紓": 2003, "純": 2004, "紗": 2005, "紙": 2006, "級": 2007, "紛": 2008, "素": 2009, "紡": 2010, "索": 2011, "紫": 2012, "累": 2013, "細": 2014, "紹": 2015, "終": 2016, "組": 2017, "絆": 2018, "結": 2019, "絕": 2020, "絡": 2021, "給": 2022, "絨": 2023, "統": 2024, "絲": 2025, "綁": 2026, "經": 2027, "綜": 2028, "綠": 2029, "綢": 2030, "維": 2031, "綱": 2032, "網": 2033, "綴": 2034, "綿": 2035, "緊": 2036, "緒": 2037, "線": 2038, "締": 
2039, "緣": 2040, "編": 2041, "緩": 2042, "緯": 2043, "練": 2044, "緻": 2045, "縈": 2046, "縝": 2047, "縣": 2048, "縫": 2049, "縮": 2050, "縱": 2051, "總": 2052, "績": 2053, "繁": 2054, "繆": 2055, "織": 2056, "繞": 2057, "繩": 2058, "繪": 2059, "繫": 2060, "繳": 2061, "繹": 2062, "繼": 2063, "續": 2064, "纏": 2065, "纜": 2066, "缸": 2067, "缺": 2068, "罄": 2069, "罐": 2070, "罕": 2071, "罩": 2072, "罪": 2073, "置": 2074, "罰": 2075, "署": 2076, "罵": 2077, "罷": 2078, "罹": 2079, "羅": 2080, "羈": 2081, "羊": 2082, "美": 2083, "羞": 2084, "群": 2085, "羨": 2086, "義": 2087, "羽": 2088, "翁": 2089, "翅": 2090, "習": 2091, "翹": 2092, "翻": 2093, "翼": 2094, "耀": 2095, "老": 2096, "考": 2097, "者": 2098, "而": 2099, "耍": 2100, "耐": 2101, "耕": 2102, "耗": 2103, "耘": 2104, "耳": 2105, "耶": 2106, "耽": 2107, "聆": 2108, "聊": 2109, "聖": 2110, "聘": 2111, "聚": 2112, "聞": 2113, "聯": 2114, "聰": 2115, "聲": 2116, "聳": 2117, "職": 2118, "聽": 2119, "肅": 2120, "肉": 2121, "肋": 2122, "肌": 2123, "肖": 2124, "肘": 2125, "肚": 2126, "肝": 2127, "股": 2128, "肢": 2129, "肥": 2130, "肩": 2131, "肯": 2132, "育": 2133, "肺": 2134, "肽": 2135, "胃": 2136, "背": 2137, "胎": 2138, "胖": 2139, "胚": 2140, "胜": 2141, "胞": 2142, "胡": 2143, "胥": 2144, "胯": 2145, "胸": 2146, "能": 2147, "脂": 2148, "脅": 2149, "脆": 2150, "脈": 2151, "脊": 2152, "脖": 2153, "脫": 2154, "脹": 2155, "脾": 2156, "腐": 2157, "腔": 2158, "腕": 2159, "腦": 2160, "腫": 2161, "腮": 2162, "腰": 2163, "腱": 2164, "腳": 2165, "腸": 2166, "腹": 2167, "腿": 2168, "膀": 2169, "膏": 2170, "膚": 2171, "膠": 2172, "膨": 2173, "膩": 2174, "膽": 2175, "臂": 2176, "臃": 2177, "臉": 2178, "臘": 2179, "臟": 2180, "臣": 2181, "臥": 2182, "臨": 2183, "自": 2184, "臭": 2185, "至": 2186, "致": 2187, "臺": 2188, "臼": 2189, "舅": 2190, "與": 2191, "興": 2192, "舉": 2193, "舊": 2194, "舌": 2195, "舍": 2196, "舒": 2197, "舔": 2198, "舞": 2199, "舟": 2200, "航": 2201, "般": 2202, "舵": 2203, "船": 2204, "舺": 2205, "艇": 2206, "艋": 2207, "艘": 2208, "艦": 2209, "良": 2210, "艱": 2211, "色": 2212, "艷": 2213, "艾": 2214, "芋": 2215, "芒": 2216, "芙": 2217, "芝": 2218, "芥": 2219, "芬": 2220, "芭": 2221, "花": 2222, "芳": 2223, "芹": 2224, "芽": 2225, "苓": 2226, "苗": 2227, "苟": 2228, "若": 2229, "苦": 2230, "苪": 2231, "英": 2232, "茂": 2233, "范": 2234, "茄": 2235, "茅": 2236, "茫": 2237, "茱": 2238, "茲": 2239, "茶": 2240, "草": 2241, "荒": 2242, "荔": 2243, "荷": 2244, "莊": 2245, "莎": 2246, "莓": 2247, "莫": 2248, "莽": 2249, "菁": 2250, "菅": 2251, "菇": 2252, "菊": 2253, "菌": 2254, "菜": 2255, "華": 2256, "菲": 2257, "菸": 2258, "萄": 2259, "萊": 2260, "萌": 2261, "萬": 2262, "萱": 2263, "落": 2264, "葉": 2265, "著": 2266, "葛": 2267, "葡": 2268, "董": 2269, "葩": 2270, "葬": 2271, "蒂": 2272, "蒐": 2273, "蒙": 2274, "蒜": 2275, "蒨": 2276, "蒸": 2277, "蒼": 2278, "蓉": 2279, "蓋": 2280, "蓮": 2281, "蔔": 2282, "蔗": 2283, "蔚": 2284, "蔡": 2285, "蔣": 2286, "蔥": 2287, "蔭": 2288, "蕉": 2289, "蕩": 2290, "蕾": 2291, "薄": 2292, "薏": 2293, "薑": 2294, "薛": 2295, "薦": 2296, "薩": 2297, "薪": 2298, "薯": 2299, "藉": 2300, "藍": 2301, "藏": 2302, "藝": 2303, "藤": 2304, "藥": 2305, "藩": 2306, "蘆": 2307, "蘇": 2308, "蘊": 2309, "蘋": 2310, "蘑": 2311, "蘭": 2312, "蘿": 2313, "虎": 2314, "虐": 2315, "處": 2316, "虛": 2317, "虞": 2318, "號": 2319, "虧": 2320, "蚊": 2321, "蚤": 2322, "蚵": 2323, "蛇": 2324, "蛋": 2325, "蛙": 2326, "蛛": 2327, "蛤": 2328, "蛻": 2329, "蜂": 2330, "蜘": 2331, "蜜": 2332, "蝦": 2333, "蝴": 2334, "蝶": 2335, "蝸": 2336, "螂": 2337, "螃": 2338, "融": 2339, "螞": 2340, "螢": 2341, "螺": 2342, "蟑": 2343, "蟬": 2344, "蟲": 2345, "蟹": 2346, "蟻": 2347, "蠅": 2348, "蠟": 2349, "蠢": 2350, "蠣": 2351, "蠱": 2352, "蠻": 2353, "血": 2354, "衆": 2355, "行": 2356, "衍": 2357, "術": 2358, "街": 2359, "衛": 2360, "衝": 2361, "衡": 
2362, "衣": 2363, "表": 2364, "衫": 2365, "衰": 2366, "袁": 2367, "袋": 2368, "袍": 2369, "袖": 2370, "被": 2371, "袱": 2372, "裁": 2373, "裂": 2374, "裎": 2375, "裏": 2376, "裕": 2377, "裙": 2378, "補": 2379, "裝": 2380, "裡": 2381, "裸": 2382, "製": 2383, "複": 2384, "褐": 2385, "褲": 2386, "襄": 2387, "襪": 2388, "襲": 2389, "西": 2390, "要": 2391, "覆": 2392, "見": 2393, "規": 2394, "覓": 2395, "視": 2396, "親": 2397, "覺": 2398, "覽": 2399, "觀": 2400, "角": 2401, "解": 2402, "觸": 2403, "言": 2404, "訂": 2405, "計": 2406, "訊": 2407, "討": 2408, "訐": 2409, "訓": 2410, "訕": 2411, "託": 2412, "記": 2413, "訝": 2414, "訟": 2415, "訣": 2416, "訪": 2417, "設": 2418, "許": 2419, "訴": 2420, "診": 2421, "註": 2422, "証": 2423, "詐": 2424, "評": 2425, "詛": 2426, "詞": 2427, "詡": 2428, "詢": 2429, "試": 2430, "詩": 2431, "詬": 2432, "詭": 2433, "話": 2434, "該": 2435, "詳": 2436, "詹": 2437, "誇": 2438, "誌": 2439, "認": 2440, "誓": 2441, "誕": 2442, "誘": 2443, "語": 2444, "誠": 2445, "誡": 2446, "誣": 2447, "誤": 2448, "說": 2449, "誰": 2450, "課": 2451, "誼": 2452, "調": 2453, "談": 2454, "請": 2455, "諒": 2456, "論": 2457, "諜": 2458, "諧": 2459, "諮": 2460, "諸": 2461, "諾": 2462, "謀": 2463, "謂": 2464, "謄": 2465, "謊": 2466, "謎": 2467, "講": 2468, "謝": 2469, "謠": 2470, "謹": 2471, "證": 2472, "識": 2473, "譚": 2474, "譜": 2475, "警": 2476, "譯": 2477, "議": 2478, "護": 2479, "譽": 2480, "讀": 2481, "變": 2482, "讓": 2483, "讚": 2484, "谷": 2485, "豆": 2486, "豈": 2487, "豐": 2488, "豚": 2489, "象": 2490, "豪": 2491, "豫": 2492, "豬": 2493, "貂": 2494, "貌": 2495, "貓": 2496, "貝": 2497, "貞": 2498, "負": 2499, "財": 2500, "貢": 2501, "貧": 2502, "貨": 2503, "販": 2504, "貪": 2505, "貫": 2506, "責": 2507, "貯": 2508, "貲": 2509, "貴": 2510, "買": 2511, "貸": 2512, "費": 2513, "貼": 2514, "貿": 2515, "賀": 2516, "賄": 2517, "賅": 2518, "資": 2519, "賈": 2520, "賊": 2521, "賓": 2522, "賜": 2523, "賞": 2524, "賠": 2525, "賢": 2526, "賣": 2527, "賤": 2528, "賦": 2529, "質": 2530, "賭": 2531, "賴": 2532, "賺": 2533, "購": 2534, "賽": 2535, "贈": 2536, "贊": 2537, "贏": 2538, "赤": 2539, "赫": 2540, "走": 2541, "赴": 2542, "起": 2543, "趁": 2544, "超": 2545, "越": 2546, "趕": 2547, "趙": 2548, "趟": 2549, "趣": 2550, "趨": 2551, "足": 2552, "趴": 2553, "跆": 2554, "跋": 2555, "跌": 2556, "跑": 2557, "跚": 2558, "距": 2559, "跟": 2560, "跡": 2561, "跨": 2562, "路": 2563, "跳": 2564, "踉": 2565, "踏": 2566, "踐": 2567, "踢": 2568, "踩": 2569, "踴": 2570, "踹": 2571, "蹄": 2572, "蹈": 2573, "蹌": 2574, "蹟": 2575, "蹣": 2576, "蹤": 2577, "蹦": 2578, "蹲": 2579, "蹺": 2580, "躁": 2581, "躍": 2582, "身": 2583, "躲": 2584, "躺": 2585, "軀": 2586, "車": 2587, "軌": 2588, "軍": 2589, "軟": 2590, "軸": 2591, "軾": 2592, "較": 2593, "載": 2594, "輒": 2595, "輔": 2596, "輕": 2597, "輝": 2598, "輩": 2599, "輪": 2600, "輯": 2601, "輸": 2602, "輻": 2603, "輾": 2604, "輿": 2605, "轄": 2606, "轉": 2607, "轍": 2608, "轎": 2609, "辛": 2610, "辜": 2611, "辟": 2612, "辣": 2613, "辦": 2614, "辨": 2615, "辭": 2616, "辯": 2617, "辱": 2618, "農": 2619, "迅": 2620, "迎": 2621, "近": 2622, "返": 2623, "迪": 2624, "迫": 2625, "述": 2626, "迴": 2627, "迷": 2628, "追": 2629, "退": 2630, "送": 2631, "逃": 2632, "逆": 2633, "透": 2634, "逐": 2635, "途": 2636, "逕": 2637, "逗": 2638, "這": 2639, "通": 2640, "逛": 2641, "逝": 2642, "逞": 2643, "速": 2644, "造": 2645, "逢": 2646, "連": 2647, "逮": 2648, "週": 2649, "進": 2650, "逼": 2651, "逾": 2652, "遁": 2653, "遇": 2654, "遊": 2655, "運": 2656, "遍": 2657, "過": 2658, "道": 2659, "達": 2660, "違": 2661, "遙": 2662, "遜": 2663, "遞": 2664, "遠": 2665, "適": 2666, "遭": 2667, "遮": 2668, "遲": 2669, "遴": 2670, "遵": 2671, "遷": 2672, "選": 2673, "遺": 2674, "遼": 2675, "遽": 2676, "避": 2677, "邀": 2678, "邁": 2679, "還": 2680, "邊": 2681, "邏": 2682, "那": 2683, "邦": 2684, "邪": 
2685, "邱": 2686, "邵": 2687, "郁": 2688, "郊": 2689, "郎": 2690, "郝": 2691, "部": 2692, "郭": 2693, "郵": 2694, "都": 2695, "鄉": 2696, "鄧": 2697, "鄭": 2698, "鄰": 2699, "酋": 2700, "配": 2701, "酒": 2702, "酗": 2703, "酪": 2704, "酬": 2705, "酮": 2706, "酵": 2707, "酶": 2708, "酷": 2709, "酸": 2710, "醉": 2711, "醋": 2712, "醒": 2713, "醜": 2714, "醫": 2715, "醬": 2716, "釁": 2717, "采": 2718, "釋": 2719, "里": 2720, "重": 2721, "野": 2722, "量": 2723, "釐": 2724, "金": 2725, "釘": 2726, "針": 2727, "釣": 2728, "鈉": 2729, "鈔": 2730, "鈕": 2731, "鈣": 2732, "鈴": 2733, "鉅": 2734, "鉛": 2735, "鉤": 2736, "銀": 2737, "銘": 2738, "銜": 2739, "銷": 2740, "鋒": 2741, "鋪": 2742, "鋸": 2743, "鋼": 2744, "錄": 2745, "錐": 2746, "錢": 2747, "錦": 2748, "錨": 2749, "錫": 2750, "錯": 2751, "錶": 2752, "鍊": 2753, "鍋": 2754, "鍛": 2755, "鍵": 2756, "鍾": 2757, "鎂": 2758, "鎖": 2759, "鎮": 2760, "鏈": 2761, "鏡": 2762, "鐘": 2763, "鐵": 2764, "鑑": 2765, "鑰": 2766, "鑼": 2767, "長": 2768, "門": 2769, "閃": 2770, "閉": 2771, "開": 2772, "閒": 2773, "間": 2774, "閱": 2775, "闆": 2776, "闊": 2777, "闖": 2778, "關": 2779, "闢": 2780, "阱": 2781, "防": 2782, "阻": 2783, "阿": 2784, "陀": 2785, "附": 2786, "陌": 2787, "降": 2788, "限": 2789, "陡": 2790, "院": 2791, "陣": 2792, "除": 2793, "陪": 2794, "陰": 2795, "陳": 2796, "陵": 2797, "陶": 2798, "陷": 2799, "陸": 2800, "陽": 2801, "隆": 2802, "隊": 2803, "階": 2804, "隔": 2805, "際": 2806, "障": 2807, "隧": 2808, "隨": 2809, "險": 2810, "隱": 2811, "隻": 2812, "雀": 2813, "雄": 2814, "雅": 2815, "集": 2816, "雇": 2817, "雌": 2818, "雕": 2819, "雖": 2820, "雙": 2821, "雛": 2822, "雜": 2823, "雞": 2824, "離": 2825, "難": 2826, "雨": 2827, "雪": 2828, "雲": 2829, "零": 2830, "雷": 2831, "電": 2832, "需": 2833, "霄": 2834, "震": 2835, "霉": 2836, "霍": 2837, "霖": 2838, "霧": 2839, "露": 2840, "霸": 2841, "霹": 2842, "��": 2843, "靈": 2844, "青": 2845, "靜": 2846, "非": 2847, "靠": 2848, "靡": 2849, "面": 2850, "革": 2851, "靭": 2852, "靴": 2853, "鞋": 2854, "鞏": 2855, "鞭": 2856, "韋": 2857, "韓": 2858, "音": 2859, "韻": 2860, "響": 2861, "頁": 2862, "頂": 2863, "頃": 2864, "項": 2865, "順": 2866, "須": 2867, "預": 2868, "頓": 2869, "頗": 2870, "領": 2871, "頭": 2872, "頰": 2873, "頸": 2874, "頻": 2875, "顆": 2876, "題": 2877, "額": 2878, "顏": 2879, "願": 2880, "類": 2881, "顧": 2882, "顯": 2883, "顰": 2884, "風": 2885, "颱": 2886, "飄": 2887, "飆": 2888, "飛": 2889, "食": 2890, "飩": 2891, "飪": 2892, "飯": 2893, "飲": 2894, "飽": 2895, "飾": 2896, "餃": 2897, "餅": 2898, "養": 2899, "餐": 2900, "餓": 2901, "餘": 2902, "餚": 2903, "餛": 2904, "餡": 2905, "館": 2906, "餮": 2907, "餵": 2908, "饋": 2909, "饒": 2910, "饕": 2911, "饗": 2912, "首": 2913, "香": 2914, "馥": 2915, "馨": 2916, "馬": 2917, "馭": 2918, "馳": 2919, "馴": 2920, "駁": 2921, "駐": 2922, "駕": 2923, "駛": 2924, "駭": 2925, "騎": 2926, "騙": 2927, "騰": 2928, "騷": 2929, "騾": 2930, "驅": 2931, "驕": 2932, "驗": 2933, "驚": 2934, "驟": 2935, "骨": 2936, "骼": 2937, "髒": 2938, "髓": 2939, "體": 2940, "高": 2941, "髮": 2942, "鬆": 2943, "鬍": 2944, "鬚": 2945, "鬥": 2946, "鬧": 2947, "鬱": 2948, "鬼": 2949, "魂": 2950, "魄": 2951, "魅": 2952, "魍": 2953, "魎": 2954, "魏": 2955, "魑": 2956, "魔": 2957, "魚": 2958, "魯": 2959, "魷": 2960, "鮪": 2961, "鮭": 2962, "鮮": 2963, "鯊": 2964, "鯛": 2965, "鯨": 2966, "鰈": 2967, "鰜": 2968, "鰭": 2969, "鰻": 2970, "鱔": 2971, "鱷": 2972, "鳥": 2973, "鳩": 2974, "鳳": 2975, "鳴": 2976, "鴉": 2977, "鴨": 2978, "鴻": 2979, "鵝": 2980, "鵲": 2981, "鶯": 2982, "鶴": 2983, "鷹": 2984, "鹹": 2985, "鹽": 2986, "鹿": 2987, "麗": 2988, "麥": 2989, "麵": 2990, "麻": 2991, "麼": 2992, "麽": 2993, "黃": 2994, "黎": 2995, "黑": 2996, "默": 2997, "點": 2998, "黨": 2999, "黯": 3000, "鼎": 3001, "鼓": 3002, "鼠": 3003, "鼻": 3004, "齁": 3005, "齊": 3006, "齋": 3007, "齒": 
3008, "齡": 3009, "龍": 3010, "龐": 3011, "龜": 3012, "!": 3013, ",": 3014, ":": 3015, ";": 3016, "?": 3017, "a": 3018, "b": 3019, "f": 3020, "g": 3021, "i": 3022, "n": 3023, "p": 3024, "t": 3025, "~": 3026, "|": 0, "[UNK]": 3027, "[PAD]": 3028}
|