sample_rate: 24_000
audio_backend: "vocos"

models:
- name: "ar+nar"
  size: "full"
  resp_levels: 8
  prom_levels: 8
  tasks: 9
  langs: 4
  tones: 1
  arch_type: llama
  training: True
  version: 5
  attention: auto
  dropout: 0.1
  #loss_factors:
  #  text: 0.01
  #  prom: 0.5
  #  resp: 1.0
  capabilities: ["ar", "nar"]
  experimental:
    p_rvq_levels: "auto"
    audio_embedding_sums: True
    unified_position_ids: False
    split_classifiers: True
    # causal_size: 1
    interleave: False
    rvq_level_range: []
    tie_classifier_to_embedding: False

loras:
- name: "lora-glados"
  rank: 128
  alpha: 128
  training: True
  rvq_levels: []

hyperparameters:
  batch_size: 32
  gradient_accumulation_steps: 8
  gradient_clipping: 1.0
  warmup_steps: 10

  optimizer: Prodigy
  learning_rate: 1.0
  torch_optimizer: True

  scheduler: "" # ScheduleFree
  torch_scheduler: True

evaluation:
  batch_size: 4
  frequency: 250
  size: 4

  steps: 500
  ar_temperature: 1.0
  nar_temperature: 0.0

trainer:
  iterations: 1_000_000
  save_frequency: 250
  keep_last_checkpoints: 4

  resize_modules: True

  check_for_oom: False
  gradient_checkpointing: True

  weight_dtype: bfloat16
  amp: True

  backend: deepspeed
  deepspeed:
    inferencing: False
    amp: False

  load_webui: False

inference:
  backend: local
  normalize: False

  weight_dtype: bfloat16
  amp: True

optimizations:
  injects: False
  replace: True

  linear: False
  embedding: False
  optimizers: True

  bitsandbytes: False
  dadaptation: False

  bitnet: False
  fp8: False

dataset:
  use_hdf5: True
  hdf5_flag: r
  use_metadata: True
  validate: True

  workers: 1
  cache: True

  duration_range: [3.0, 12.0]

  random_utterance: 1.0
  max_prompts: 1
  prompt_duration_range: [3.0, 3.0]

  max_resps: 1
  p_resp_append: 0.25

  sample_type: path # path # speaker
  sample_order: duration
  sample_max_duration_batch: 300
  sample_shuffle: False

  tasks_list: [ "tts", "stt" ]

  training: []
  validation: []
  noise: []
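
# Usage sketch (assumption, not part of the config itself): with the mrq/vall-e
# trainer this file is typically passed in through its --yaml argument, e.g.
#   python3 -m vall_e.train --yaml="./training/config.yaml"
# (see the project README for the exact invocation). The values above can also be
# sanity-checked with a plain YAML 1.1 loader before a long run; PyYAML resolves
# underscore-separated integers such as 1_000_000 to 1000000. The path below is
# hypothetical and stands in for wherever this file is saved:
#
#   import yaml                                   # pip install pyyaml
#   with open("./training/config.yaml") as f:
#       cfg = yaml.safe_load(f)
#   assert cfg["trainer"]["iterations"] == 1_000_000
#   assert cfg["hyperparameters"]["batch_size"] == 32
#   print(cfg["models"][0]["name"], cfg["loras"][0]["name"])  # -> ar+nar lora-glados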