# Dataset configuration.
# NOTE(review): every key below is assumed to be a direct child of `dataset`;
# confirm against the consumer's schema.
dataset:
  # All three lists are empty in this file.
  training: []
  validation: []
  noise: []

  # Python lambda source stored as a string.
  # NOTE(review): presumably evaluated by the loader; if so, only load this
  # file from a trusted location -- confirm.
  speaker_name_getter: "lambda p: f'{p.parts[-3]}_{p.parts[-2]}'"

  use_hdf5: True
  use_metadata: True
  hdf5_flag: r
  validate: True

  workers: 2
  cache: True

  # Two-element ranges; presumably [min, max] -- units not shown here, confirm.
  phones_range: [4, 256]
  duration_range: [1.0, 16.0]

  random_utterance: 1.0
  max_prompts: 3
  prompt_duration: 6.0

  sample_type: speaker

  # Only "tts" is enabled.
  # alternative: ["tts", "tts-c", "ns", "sr", "tse", "cse", "nse", "tts"]
  tasks_list: ["tts"]
# Model definitions.
# NOTE(review): the keys following `- name:` are assumed to belong to that
# single `_models` entry; confirm against the consumer's schema.
models:
  _prom_levels: 8
  _max_levels: 8

  _models:
    - name: "ar+nar"
      size: "full"
      resp_levels: 8
      prom_levels: 8
      tasks: 8
      langs: 2
      arch_type: "retnet"
      training: True
      version: 3
# Optimizer, batching and scheduler hyperparameters.
hyperparameters:
  batch_size: 8
  gradient_accumulation_steps: 32
  gradient_clipping: 100

  optimizer: AdamW  # alternative: Prodigy
  torch_optimizer: True
  learning_rate: 1.0e-4

  # Empty string: no scheduler type is named. The commented-out block below
  # is an alternative OneCycle setup kept for reference.
  scheduler_type: ""
  # scheduler_type: OneCycle
  # scheduler_params:
  #   cycle_first_step_size: 10_000
  #   cycle_first_stair_count: 10_000
  #   cycle_second_step_size: 15_000
  #   cycle_second_stair_count: 15_000
  #   decay_step_size: 5_000
  #   cycle_min_lr: 2.5e-4  # 1.0e-5
  #   cycle_max_lr: 2.5e-4  # 1.0e-4
  #   decay_lr_rate: 0.0
  #   cycle_min_mom: 0.90
  #   cycle_max_mom: 0.99
  #   decay_mom_rate: 0.0
# Evaluation settings.
evaluation:
  batch_size: 16
  frequency: 250
  size: 16

  steps: 450
  ar_temperature: 0.95
  nar_temperature: 0.25
  # Distinct from `trainer.load_disabled_engines`, which is set separately.
  load_disabled_engines: True
# Training-loop settings: iteration budget, checkpointing, loading, backend.
trainer:
  # NOTE(review): `1_000_000` is an integer only under YAML 1.1 parsers
  # (e.g. PyYAML); a strict YAML 1.2 parser reads it as a string -- confirm
  # which loader consumes this file.
  iterations: 1_000_000

  save_tag: step
  save_on_oom: True
  save_on_quit: True
  save_frequency: 100
  export_on_save: True

  keep_last_checkpoints: 4

  aggressive_optimizations: False
  load_disabled_engines: False

  load_state_dict: True
  strict_loading: False
  # load_tag: "9500"
  # load_states: False
  # restart_step_count: True

  # NOTE(review): a bare `None` is the *string* "None" in YAML, not null
  # (null is spelled `null` or `~`) -- confirm the consumer expects this.
  gc_mode: None  # alternative: "global_step"

  weight_dtype: bfloat16
  amp: False

  backend: deepspeed
  deepspeed:
    inferencing: False
    zero_optimization_level: 0
    use_compression_training: False

  # NOTE(review): placed under `trainer` rather than `trainer.deepspeed`;
  # the intended nesting is not recoverable from this file -- confirm.
  activation_checkpointing: True
# Inference settings.
inference:
  backend: deepspeed
  use_vocos: True
  normalize: False

  weight_dtype: bfloat16
  amp: False
# bitsandbytes settings. `enabled` is False here while the remaining
# switches are True.
# NOTE(review): presumably the other switches only take effect when
# `enabled` is True -- confirm.
bitsandbytes:
  enabled: False

  injects: True
  linear: True
  embedding: True