mrq
promoted ar+nar-tts+stt-llama-8 to replace ar+nar-llama-8 as the base model, since it pretty much surpasses it now
1ea585e
sample_rate: 24_000
audio_backend: "vocos"

models:
- name: "ar+nar-tts+stt"
  size: "full"
  resp_levels: 8
  prom_levels: 8
  tasks: 9
  langs: 4
  tones: 1
  arch_type: llama
  training: False
  version: 5
  attention: auto
  dropout: 0.1
  #loss_factors:
  #  text: 0.01
  #  prom: 0.5
  #  resp: 1.0
  capabilities: ["ar", "nar"]
  experimental:
    p_rvq_levels: "auto"
    audio_embedding_sums: True
    unified_position_ids: False
    split_classifiers: True
    #
    causal_size: 1
    interleave: False
    rvq_level_range: []
    tie_classifier_to_embedding: False
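# Note on the model block above (reading the settings as given, not an
# authoritative description of the internals): a single full-size llama-arch
# AR+NAR model with 8 RVQ levels for both prompt and response codes, 9 task
# types, 4 languages, and one tone. loss_factors stays commented out, so the
# default loss weighting presumably applies.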
#loras:
#- name : "lora"
#  rank: 128
#  alpha: 128
#  training: True
#  rvq_levels: []
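# The commented-out loras block above is a template for attaching a LoRA
# (rank/alpha 128) instead of finetuning the full weights; it is disabled in
# this base config.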
hyperparameters:
  batch_size: 32
  gradient_accumulation_steps: 8
  gradient_clipping: 1.0
  warmup_steps: 10

  optimizer: Prodigy
  learning_rate: 1.0
  torch_optimizer: True

  scheduler: "" # ScheduleFree
  torch_scheduler: True
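# Hyperparameter notes: the effective batch size is 32 * 8 = 256 samples per
# optimizer step. Prodigy adapts its own step size, so learning_rate is left
# at the conventional 1.0; the empty scheduler string presumably disables any
# LR schedule (the ScheduleFree comment marks the alternative).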
evaluation:
  batch_size: 4
  frequency: 250
  size: 4

  steps: 500
  ar_temperature: 1.0
  nar_temperature: 0.0
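# Evaluation notes: runs every 250 training steps on 4 samples; the AR pass
# samples at temperature 1.0 while the NAR pass uses temperature 0.0, which
# presumably amounts to greedy (argmax) decoding.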
trainer:
  iterations: 1_000_000

  save_frequency: 250
  keep_last_checkpoints: 4

  resize_modules: True

  check_for_oom: False
  gradient_checkpointing: True

  weight_dtype: bfloat16
  amp: True

  backend: deepspeed
  deepspeed:
    inferencing: False
    amp: False

  load_webui: False
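# Trainer notes: checkpoints every 250 steps keeping the last 4, trains in
# bfloat16 with AMP and gradient checkpointing, and delegates the training
# loop to the DeepSpeed backend.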
inference:
  backend: local
  normalize: False

  weight_dtype: bfloat16
  amp: True
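# Inference mirrors training precision: local backend, bfloat16 weights with AMP.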
optimizations:
  injects: False
  replace: True

  linear: False
  embedding: False
  optimizers: True

  bitsandbytes: False
  dadaptation: False
  bitnet: False
  fp8: False
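# Optimization notes (best-effort reading of the flags): these appear to
# toggle optional module/optimizer replacements; the bitsandbytes,
# dadaptation, bitnet, and fp8 paths are all disabled here.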
dataset:
  use_hdf5: True
  hdf5_flag: r

  use_metadata: True
  validate: True

  workers: 1
  cache: True

  duration_range: [3.0, 12.0]

  random_utterance: 1.0
  max_prompts: 1
  prompt_duration_range: [3.0, 3.0]

  max_resps: 1
  p_resp_append: 0.25

  sample_type: path # path # speaker
  sample_order: duration
  sample_max_duration_batch: 300
  sample_shuffle: False

  tasks_list: [ "tts", "stt" ]

  training: []
  validation: []
  noise: []
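As a quick sanity check, the YAML above can be loaded with plain PyYAML to confirm the derived numbers (effective batch size, model shape) before launching a run. This is only an illustrative sketch, not the project's own config loader: the ./training/config.yaml path and the standalone script are assumptions.

# sanity_check_config.py -- illustrative sketch, not the project's loader.
# Assumes the YAML above was saved to ./training/config.yaml (hypothetical path).
import yaml  # pip install pyyaml

with open("./training/config.yaml", "r") as f:
    cfg = yaml.safe_load(f)

model = cfg["models"][0]
hp = cfg["hyperparameters"]

# PyYAML (YAML 1.1) parses underscored integers such as 24_000 and 1_000_000.
print(f"model:             {model['name']} ({model['arch_type']}, {model['size']})")
print(f"RVQ levels:        {model['resp_levels']} resp / {model['prom_levels']} prom")
print(f"tasks/langs/tones: {model['tasks']}/{model['langs']}/{model['tones']}")

# Effective batch size per optimizer step: 32 * 8 = 256 samples.
effective_batch = hp["batch_size"] * hp["gradient_accumulation_steps"]
print(f"effective batch:   {effective_batch} samples/step")

# Prodigy is normally run with learning_rate = 1.0; flag anything else.
if hp["optimizer"].lower() == "prodigy" and hp["learning_rate"] != 1.0:
    print("warning: Prodigy normally expects learning_rate = 1.0")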