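# Global audio settings: sample_rate is in Hz, and audio_backend selects the
# codec/vocoder used to decode audio tokens back into waveforms (Vocos here).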
sample_rate: 24_000
audio_backend: "vocos"
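
# Model definitions (one entry per model). As a hedged reading of the fields:
# resp_levels / prom_levels are the number of residual-quantizer code levels for
# the response and prompt audio, and capabilities lists the supported decoding
# modes ("ar" = autoregressive, "nar" = non-autoregressive).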
models:
- name: "ar+nar-tts+stt"
  size: "full"
  resp_levels: 8
  prom_levels: 8
  tasks: 9
  langs: 4
  tones: 1
  arch_type: llama
  training: False
  version: 5
  attention: auto
  dropout: 0.1
  #loss_factors:
  #  text: 0.01
  #  prom: 0.5
  #  resp: 1.0
  capabilities: ["ar", "nar"]
  experimental:
    # modifies model arch
    audio_embedding_sums: True
    unified_position_ids: False
    split_classifiers: True

#loras:
#- name : "lora"
#  rank: 128
#  alpha: 128
#  training: True
#  rvq_levels: []
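
# Training hyperparameters. Prodigy is an adaptive optimizer, so a nominal
# learning_rate of 1.0 is the conventional setting; the empty scheduler string
# leaves LR scheduling off (ScheduleFree is noted as an alternative).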
hyperparameters:
  batch_size: 32
  gradient_accumulation_steps: 8
  gradient_clipping: 1.0
  warmup_steps: 10

  optimizer: Prodigy
  learning_rate: 1.0
  torch_optimizer: True

  scheduler: "" # ScheduleFree
  torch_scheduler: True
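
# Periodic evaluation during training: how often to evaluate (frequency, in
# iterations), how many samples to draw, and the sampling temperatures used for
# AR and NAR decoding during eval.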
evaluation:
  batch_size: 4
  frequency: 250
  size: 4

  steps: 500
  ar_temperature: 1.0
  nar_temperature: 0.0
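
# Training-loop settings: total iterations, checkpoint cadence and retention,
# gradient checkpointing to save VRAM, bfloat16 weights with AMP, and the
# DeepSpeed backend.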
trainer:
  iterations: 1_000_000

  save_frequency: 250
  keep_last_checkpoints: 4

  resize_modules: True
  gradient_checkpointing: True

  weight_dtype: bfloat16
  amp: True

  backend: deepspeed
  deepspeed:
    inferencing: False
    amp: False
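
# Inference-time settings, separate from the trainer: local backend with
# bfloat16 weights and AMP enabled.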
inference:
  backend: local
  weight_dtype: bfloat16
  amp: True
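
# Optional low-level optimizations and quantization backends; in this config the
# optional backends (bitsandbytes, DAdaptation, BitNet, fp8) are all disabled.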
optimizations:
  injects: False
  replace: True

  linear: False
  embedding: False
  optimizers: True

  bitsandbytes: False
  dadaptation: False
  bitnet: False
  fp8: False
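
# Dataset and sampling settings. Roughly: data is read from an HDF5 store with
# metadata, utterances are limited to 3-12 seconds, input prompts are fixed at
# 3 seconds, batches are ordered by duration (sample_max_duration_batch appears
# to cap the total audio per batch, in seconds), and both "tts" and "stt" tasks
# are trained.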
dataset:
  use_hdf5: True
  hdf5_flag: r
  use_metadata: True
  validate: True

  workers: 1
  cache: True

  duration_range: [3.0, 12.0]

  prompt_max_samples: 1
  prompt_duration_range: [3.0, 3.0]

  resps_max_samples: 1

  sample_type: path # path # speaker
  sample_order: duration
  sample_max_duration_batch: 300
  sample_shuffle: False

  tasks_list: [ "tts", "stt" ]
  training: []
  validation: []
  noise: []