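# Global audio settings: sample_rate is in Hz, and audio_backend selects the
# codec/vocoder used to decode audio tokens back into waveforms (Vocos here).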
sample_rate: 24_000
audio_backend: "vocos"
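
# Model definitions (one entry per model). As a hedged reading of the fields:
# resp_levels / prom_levels are the number of residual-quantizer code levels for
# the response and prompt audio, and capabilities lists the supported decoding
# modes ("ar" = autoregressive, "nar" = non-autoregressive).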
models:
- name: "ar+nar-tts+stt"
  size: "full"
  resp_levels: 8
  prom_levels: 8
  tasks: 9
  langs: 4
  tones: 1
  arch_type: llama
  training: False
  version: 5
  attention: auto
  dropout: 0.1
  #loss_factors:
  #  text: 0.01
  #  prom: 0.5
  #  resp: 1.0
  capabilities: ["ar", "nar"]
  experimental:
    # modifies model arch
    audio_embedding_sums: True
    unified_position_ids: False
    split_classifiers: True

#loras:
#- name : "lora"
#  rank: 128
#  alpha: 128
#  training: True
#  rvq_levels: []
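
# Training hyperparameters. Prodigy is an adaptive optimizer, so a nominal
# learning_rate of 1.0 is the conventional setting; the empty scheduler string
# leaves LR scheduling off (ScheduleFree is noted as an alternative).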
hyperparameters:
  batch_size: 32
  gradient_accumulation_steps: 8
  gradient_clipping: 1.0
  warmup_steps: 10

  optimizer: Prodigy
  learning_rate: 1.0
  torch_optimizer: True

  scheduler: "" # ScheduleFree
  torch_scheduler: True
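
# Periodic evaluation during training: how often to evaluate (frequency, in
# iterations), how many samples to draw, and the sampling temperatures used for
# AR and NAR decoding during eval.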
evaluation:
  batch_size: 4
  frequency: 250
  size: 4

  steps: 500
  ar_temperature: 1.0
  nar_temperature: 0.0
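
# Training-loop settings: total iterations, checkpoint cadence and retention,
# gradient checkpointing to save VRAM, bfloat16 weights with AMP, and the
# DeepSpeed backend.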
trainer:
  iterations: 1_000_000

  save_frequency: 250
  keep_last_checkpoints: 4

  resize_modules: True
  gradient_checkpointing: True

  weight_dtype: bfloat16
  amp: True

  backend: deepspeed
  deepspeed:
    inferencing: False
    amp: False
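
# Inference-time settings, separate from the trainer: local backend with
# bfloat16 weights and AMP enabled.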
inference:
  backend: local
  weight_dtype: bfloat16
  amp: True
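
# Optional low-level optimizations and quantization backends; in this config the
# optional backends (bitsandbytes, DAdaptation, BitNet, fp8) are all disabled.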
optimizations:
  injects: False
  replace: True

  linear: False
  embedding: False
  optimizers: True

  bitsandbytes: False
  dadaptation: False
  bitnet: False
  fp8: False
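
# Dataset and sampling settings. Roughly: data is read from an HDF5 store with
# metadata, utterances are limited to 3-12 seconds, input prompts are fixed at
# 3 seconds, batches are ordered by duration (sample_max_duration_batch appears
# to cap the total audio per batch, in seconds), and both "tts" and "stt" tasks
# are trained.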
dataset:
  use_hdf5: True
  hdf5_flag: r
  use_metadata: True
  validate: True

  workers: 1
  cache: True

  duration_range: [3.0, 12.0]

  prompt_max_samples: 1
  prompt_duration_range: [3.0, 3.0]

  resps_max_samples: 1

  sample_type: path # path # speaker
  sample_order: duration
  sample_max_duration_batch: 300
  sample_shuffle: False

  tasks_list: [ "tts", "stt" ]
  training: []
  validation: []
  noise: []