File size: 1,656 Bytes
2de4670 494a301 2de4670 0c4f028 2de4670 0c4f028 2de4670 03504d1 2de4670 494a301 2de4670 494a301 2de4670 494a301 2de4670 494a301 2de4670 494a301 2de4670 494a301 2de4670 494a301 2de4670 494a301 2de4670 494a301 2de4670 494a301 2de4670 494a301 2de4670 494a301 2de4670 494a301 2de4670 494a301 2de4670 494a301 2de4670 494a301 2de4670 0c4f028 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
# Global audio settings.
# The rate is written without the `_` digit separator: YAML 1.1 loaders read
# `24_000` as the integer 24000, but YAML 1.2 core-schema loaders read it as
# the string "24_000". NOTE(review): unit is presumably Hz — confirm.
sample_rate: 24000
# NOTE(review): presumably selects the audio codec/vocoder implementation —
# confirm the accepted values against the consumer.
audio_backend: "vocos"
# Model definitions: a sequence of mappings, one entry per model (one entry here).
# Every key below belongs to the list item, so it is indented under the `- `.
models:
- name: "ar+nar"
  size: "full"

  # NOTE(review): the *_levels / tasks / langs / tones values look like counts
  # that size the model's inputs and outputs — confirm against the consumer.
  resp_levels: 8
  prom_levels: 8
  tasks: 8
  langs: 2
  tones: 1

  arch_type: retnet
  # Per-model flag; distinct from the `training` list in the dataset stanza.
  training: False
  version: 2
  dropout: 0.1

  capabilities: ["ar", "nar"]

  # Nested mapping of opt-in switches for this model entry.
  experimental:
    audio_embedding_sums: True
#loras:
#- name : "lora"
# rank: 128
# alpha: 128
# training: True
# rvq_levels: []
# Optimisation hyperparameters. All keys are children of `hyperparameters`;
# `batch_size` here is separate from `evaluation.batch_size`.
hyperparameters:
  batch_size: 32
  gradient_accumulation_steps: 8
  gradient_clipping: 1.0
  warmup_steps: 10

  optimizer: Prodigy
  learning_rate: 1.0
  torch_optimizer: True

  # Empty string: no scheduler name is set. The author's note names
  # ScheduleFree, presumably as an alternative value — confirm.
  scheduler: ""  # ScheduleFree
  torch_scheduler: True
# Evaluation settings. All keys are children of `evaluation`; `batch_size` and
# `size` here are separate from the same-named keys in other stanzas.
evaluation:
  batch_size: 4
  # NOTE(review): presumably counted in training iterations — confirm.
  frequency: 250
  size: 4

  steps: 500
  # Temperatures for the two capabilities listed on the model ("ar" and "nar").
  ar_temperature: 1.0
  nar_temperature: 0.0
# Training-loop settings. All keys are children of `trainer`.
trainer:
  # Written without `_` separators so YAML 1.2 loaders also read an integer
  # (`1_000_000` is an int only under YAML 1.1).
  iterations: 1000000
  save_frequency: 250
  keep_last_checkpoints: 4

  resize_modules: True

  gradient_checkpointing: True

  weight_dtype: bfloat16
  amp: True

  backend: deepspeed
  # Options for the backend named above. `deepspeed.amp` is a different key
  # from `trainer.amp`; the nesting keeps the two from colliding.
  deepspeed:
    inferencing: False
    amp: False
# Inference settings. These keys share names with keys under `trainer`
# (`backend`, `weight_dtype`, `amp`) and must stay nested here to remain distinct.
inference:
  backend: local
  weight_dtype: bfloat16
  amp: True
# Optimisation toggles; every key is a boolean child of `optimizations`.
# NOTE(review): the grouping below follows the key names only — confirm what
# each switch controls against the consumer.
optimizations:
  injects: False
  replace: True

  linear: False
  embedding: False
  optimizers: True

  bitsandbytes: False
  dadaptation: False
  bitnet: False
  fp8: False
# Dataset settings. All keys are children of `dataset`.
dataset:
  use_hdf5: True
  # NOTE(review): presumably the open mode for the HDF5 file (`r` = read) — confirm.
  hdf5_flag: r

  use_metadata: True
  validate: True

  workers: 1
  cache: True

  # Two-element [min, max] ranges. NOTE(review): unit is presumably seconds — confirm.
  duration_range: [3.0, 12.0]

  prompt_max_samples: 1
  prompt_duration_range: [3.0, 3.0]

  resps_max_samples: 1

  sample_type: path  # values noted by the author: path, speaker
  sample_order: duration
  sample_max_duration_batch: 300
  sample_shuffle: False

  tasks_list: ["tts", "stt"]

  # Explicit empty lists: no dataset entries are configured in this file.
  # `training` here is a list and is unrelated to the per-model `training` flag.
  training: []
  validation: []
  noise: []