# Scraped page header (not part of the configuration):
# vall-e / models / experiments / config.dac-nar-len.yaml
# mrq · stuff · db6b323 · raw · history blame · 2.19 kB
# Global audio settings. The sample rate is written without the `_` digit
# separator, which only YAML 1.1 parsers resolve to an integer.
sample_rate: 44000  # presumably Hz; confirm against the consumer
audio_backend: "dac"
# Model definitions: a list with a single entry, named "nar-len".
models:
  - name: "nar-len"
    # Model size given as an explicit mapping.
    size:
      audio_tokens: 1024
      text_tokens: 256
      dim: 1024
      heads: 16
      layers: 16
    resp_levels: 9
    prom_levels: 9
    tasks: 8
    langs: 2
    tones: 1
    arch_type: llama
    training: true
    version: 5
    attention: flash_attention_2
    dropout: 0.1
    # Commented-out loss_factors override, kept for reference.
    # loss_factors:
    #   text: 0.01
    #   prom: 0.5
    #   resp: 1.0
    #   len: 1.0
    capabilities: ["nar", "len"]
    # Experimental switches for this model entry.
    experimental:
      audio_embedding_sums: false
      interleave: false
      unified_position_ids: true
      rvq_level_range: []
      split_classifiers: true
      tie_classifier_to_embedding: false
# Commented-out LoRA example (inactive unless uncommented):
# loras:
#   - name: "lora-test"
#     rank: 128
#     alpha: 128
#     training: True
#     rvq_levels: []
# Optimisation hyperparameters: batching, gradient handling, optimizer and
# scheduler selection.
hyperparameters:
  batch_size: 16
  gradient_accumulation_steps: 4
  gradient_clipping: 1.0
  warmup_steps: 10

  optimizer: Prodigy
  learning_rate: 1.0
  torch_optimizer: true

  scheduler: ""  # left empty; the author's note mentions "ScheduleFree"
  torch_scheduler: true
# Evaluation settings. `batch_size` and `size` here are separate from the
# same-named keys under `hyperparameters` and the model entry.
evaluation:
  batch_size: 4
  frequency: 250
  size: 4

  steps: 500
  ar_temperature: 1.0
  nar_temperature: 0.0
# Trainer settings: run length, checkpointing, precision and backend.
trainer:
  # Written without `_` separators so YAML 1.2 parsers also read an integer.
  iterations: 1000000
  save_frequency: 250
  keep_last_checkpoints: 4

  check_for_oom: false
  gradient_checkpointing: false

  weight_dtype: bfloat16
  amp: false

  backend: deepspeed
  # Options specific to the deepspeed backend; `amp` here is distinct from
  # the trainer-level `amp` above.
  deepspeed:
    inferencing: false
    amp: false

  load_webui: false
# Inference settings; `backend`, `weight_dtype` and `amp` are set
# independently of the same-named trainer keys.
inference:
  backend: local
  normalize: false

  weight_dtype: bfloat16
  amp: false
# Optional optimisation toggles; only `replace` and `optimizers` are enabled.
optimizations:
  injects: false
  replace: true

  linear: false
  embedding: false
  optimizers: true

  bitsandbytes: false
  dadaptation: false
  bitnet: false
  fp8: false
# Dataset settings: speaker naming, storage, filtering and sampling.
dataset:
  # Python lambda source strings operating on a path-like `p` (they index
  # `p.parts`); kept as quoted strings because they contain ": ".
  speaker_name_getter: "lambda p: f'{p.parts[-3]}_{p.parts[-2]}'"
  speaker_group_getter: "lambda p: f'{p.parts[-3]}'"

  use_hdf5: true
  hdf5_flag: r
  use_metadata: true
  validate: true

  workers: 1
  cache: false

  duration_range: [3.0, 24.0]

  random_utterance: 1.0
  max_prompts: 1
  prompt_duration_range: [3.0, 3.0]

  max_resps: 1
  p_resp_append: 0.25

  sample_type: path  # values noted by the author: path, speaker
  sample_order: duration
  sample_max_duration_batch: 100

  tasks_list: ["tts"]  # left out by the author: "tts-c", "ns", "sr"

  # Data source lists are empty in this file.
  training: []
  validation: []
  noise: []