# Top-level audio settings.
# NOTE(review): `24_000` uses an underscore digit separator, which YAML 1.1
# loaders read as the integer 24000; a strict YAML 1.2 loader would read it
# as a string — confirm against the loader in use.
sample_rate: 24_000
audio_backend: "vocos"
# Model definitions: a sequence of mappings (a single entry here).
models:
  - name: "ar+nar"
    size: "full"
    resp_levels: 8
    prom_levels: 8
    tasks: 8
    langs: 2
    tones: 1
    arch_type: retnet
    training: False
    version: 2
    dropout: 0.1
    capabilities: ["ar", "nar"]
    # NOTE(review): `experimental` is placed under this model entry based on
    # key order; confirm it is not meant to be a top-level section.
    experimental:
      audio_embedding_sums: True
#loras: | |
#- name : "lora" | |
# rank: 128 | |
# alpha: 128 | |
# training: True | |
# rvq_levels: [] | |
# Batch, optimizer and scheduler settings.
hyperparameters:
  batch_size: 32
  gradient_accumulation_steps: 8
  gradient_clipping: 1.0
  warmup_steps: 10

  optimizer: Prodigy
  learning_rate: 1.0
  torch_optimizer: True

  # Scheduler name is left as an empty string; "ScheduleFree" is noted as an
  # alternative value.
  scheduler: ""  # ScheduleFree
  torch_scheduler: True
# Evaluation settings: batch size, frequency, and sampling temperatures.
evaluation:
  batch_size: 4
  frequency: 250
  size: 4

  steps: 500
  ar_temperature: 1.0
  nar_temperature: 0.0
# Training-loop settings: iteration budget, checkpointing, precision, backend.
trainer:
  iterations: 1_000_000
  save_frequency: 250
  keep_last_checkpoints: 4

  resize_modules: True
  gradient_checkpointing: True

  weight_dtype: bfloat16
  amp: True

  backend: deepspeed
  # Nested mapping: its `amp` key is distinct from `trainer.amp` above, which
  # is why these two keys must live one level deeper than the rest.
  deepspeed:
    inferencing: False
    amp: False
# Inference settings: backend and numeric precision.
inference:
  backend: local
  weight_dtype: bfloat16
  amp: True
# Boolean switches for optional optimizations.
optimizations:
  injects: False
  replace: True

  linear: False
  embedding: False
  optimizers: True

  bitsandbytes: False
  dadaptation: False
  bitnet: False
  fp8: False
# Dataset loading and sampling settings.
dataset:
  use_hdf5: True
  hdf5_flag: r

  use_metadata: True
  validate: True

  workers: 1
  cache: True

  duration_range: [3.0, 12.0]

  prompt_max_samples: 1
  prompt_duration_range: [3.0, 3.0]

  resps_max_samples: 1

  sample_type: path  # path # speaker
  sample_order: duration
  sample_max_duration_batch: 300
  sample_shuffle: False

  tasks_list: ["tts", "stt"]

  # Data source lists are empty in this file.
  training: []
  validation: []
  noise: []