# Dataset configuration.
# NOTE(review): the source had lost its indentation and carried stray "|"
# markers; the nesting below is reconstructed -- confirm against the
# consuming schema.
dataset:
  # Data source lists; all three are empty lists here.
  training: []
  validation: []
  noise: []

  # Python lambda source kept as a string: it builds a name from the
  # third-to-last and second-to-last entries of `p.parts`, joined by "_".
  # NOTE(review): presumably evaluated by the loader, which would make this
  # file trusted code -- confirm.
  speaker_name_getter: "lambda p: f'{p.parts[-3]}_{p.parts[-2]}'"

  use_hdf5: true
  # NOTE(review): presumably the HDF5 file open mode ("r" = read-only) -- confirm.
  hdf5_flag: r
  validate: true

  workers: 4
  cache: true

  # NOTE(review): presumably [min, max] bounds -- confirm units with the loader.
  phones_range: [4, 512]
  duration_range: [1.0, 24.0]

  random_utterance: 1.0
  max_prompts: 3
  prompt_duration: 3.0

  sample_type: speaker

  tasks_list: ["tts"]

# Model definitions.
# NOTE(review): indentation was lost in the source; `_max_levels` and
# `_models` are placed under `models` by reconstruction -- confirm.
models:
  _max_levels: 8
  # One list entry per model; both entries carry the same set of keys.
  _models:
    - name: "ar"
      size: "full"
      resp_levels: 1
      prom_levels: 2
      tasks: 8
      arch_type: "retnet"

    - name: "nar"
      size: "full"
      resp_levels: 3
      prom_levels: 4
      tasks: 8
      arch_type: "retnet"

# Optimisation hyperparameters.
hyperparameters:
  batch_size: 32
  gradient_accumulation_steps: 4
  gradient_clipping: 100

  optimizer: AdamW
  # Written with a decimal point and a signed exponent so that YAML 1.1
  # resolvers (e.g. PyYAML) also read it as a float rather than a string.
  learning_rate: 1.0e-6

  # Explicit empty string (not null).
  scheduler_type: ""

# Evaluation settings.
evaluation:
  batch_size: 64
  frequency: 500
  size: 64

  steps: 300
  # NOTE(review): the ar_/nar_ prefixes match the model names "ar" and "nar";
  # presumably per-model sampling temperatures -- confirm.
  ar_temperature: 0.95
  nar_temperature: 0.25

# Training-loop settings.
trainer:
  # One million, written without digit separators: `1_000_000` is an integer
  # only under YAML 1.1 resolvers; YAML 1.2 core-schema loaders read it as a
  # string.
  iterations: 1000000

  save_tag: step
  save_on_oom: true
  save_on_quit: true
  save_frequency: 25

  keep_last_checkpoints: 2

  aggressive_optimizations: false

  load_state_dict: true
  strict_loading: false

  # NOTE(review): YAML has no `None` literal -- this loads as the string
  # "None", not null. Left unchanged; switch to `null` only if the consumer
  # expects a real null.
  gc_mode: None

  weight_dtype: bfloat16

  backend: deepspeed
  # NOTE(review): indentation was lost in the source; this mapping is nested
  # under `trainer` because it directly follows `backend: deepspeed` -- confirm.
  deepspeed:
    zero_optimization_level: 2
    use_compression_training: true

# Inference settings.
inference:
  use_vocos: true
  normalize: false

  # NOTE(review): indentation was lost in the source; this key is placed under
  # `inference` because it follows the keys above and precedes the next
  # section -- confirm.
  weight_dtype: float32

# bitsandbytes settings; `enabled` is false in this configuration.
# NOTE(review): the remaining flags presumably only take effect once
# `enabled` is true -- confirm against the consumer.
bitsandbytes:
  enabled: false
  injects: true
  linear: true
  embedding: true