| sample_rate: 24_000 | |
| audio_backend: "vocos" | |
| models: | |
| - name: "ar+nar" | |
| size: "full" | |
| resp_levels: 8 | |
| prom_levels: 8 | |
| tasks: 8 | |
| langs: 2 | |
| tones: 1 | |
| arch_type: retnet | |
| training: False | |
| version: 2 | |
| dropout: 0.1 | |
| capabilities: ["ar", "nar"] | |
| experimental: | |
| audio_embedding_sums: True | |
| #loras: | |
| #- name: "lora" | |
| # rank: 128 | |
| # alpha: 128 | |
| # training: True | |
| # rvq_levels: [] | |
| hyperparameters: | |
| batch_size: 32 | |
| gradient_accumulation_steps: 8 | |
| gradient_clipping: 1.0 | |
| warmup_steps: 10 | |
| optimizer: Prodigy | |
| learning_rate: 1.0 | |
| torch_optimizer: True | |
| scheduler: "" # alternative value: ScheduleFree | |
| torch_scheduler: True | |
| evaluation: | |
| batch_size: 4 | |
| frequency: 250 | |
| size: 4 | |
| steps: 500 | |
| ar_temperature: 1.0 | |
| nar_temperature: 0.0 | |
| trainer: | |
| iterations: 1_000_000 | |
| save_frequency: 250 | |
| keep_last_checkpoints: 4 | |
| resize_modules: True | |
| gradient_checkpointing: True | |
| weight_dtype: bfloat16 | |
| amp: True | |
| backend: deepspeed | |
| deepspeed: | |
| inferencing: False | |
| amp: False | |
| inference: | |
| backend: local | |
| weight_dtype: bfloat16 | |
| amp: True | |
| optimizations: | |
| injects: False | |
| replace: True | |
| linear: False | |
| embedding: False | |
| optimizers: True | |
| bitsandbytes: False | |
| dadaptation: False | |
| bitnet: False | |
| fp8: False | |
| dataset: | |
| use_hdf5: True | |
| hdf5_flag: r | |
| use_metadata: True | |
| validate: True | |
| workers: 1 | |
| cache: True | |
| duration_range: [3.0, 12.0] | |
| prompt_max_samples: 1 | |
| prompt_duration_range: [3.0, 3.0] | |
| resps_max_samples: 1 | |
| sample_type: path # options: path, speaker | |
| sample_order: duration | |
| sample_max_duration_batch: 300 | |
| sample_shuffle: False | |
| tasks_list: [ "tts", "stt" ] | |
| training: [] | |
| validation: [] | |
| noise: [] |