ecker committed
Commit 494a301
1 Parent(s): 09f258a

Update models/config.retnet.yaml

Files changed (1)
  1. models/config.retnet.yaml +32 -61
models/config.retnet.yaml CHANGED
@@ -1,5 +1,5 @@
  sample_rate: 24_000
- audio_backend: vocos
+ audio_backend: "vocos"

  models:
  - name: "ar+nar"
@@ -17,17 +17,18 @@ models:
  experimental:
  audio_embedding_sums: True

- hyperparameters:
- autotune: False
- autotune_params:
- start_profile_step: 1
- end_profile_step: 50
- num_tuning_micro_batch_sizes: 8
+ #loras:
+ #- name : "lora"
+ # rank: 128
+ # alpha: 128
+ # training: True
+ # rvq_levels: []

- batch_size: 16
+ hyperparameters:
+ batch_size: 32
  gradient_accumulation_steps: 8
  gradient_clipping: 1.0
- warmup_steps: 250
+ warmup_steps: 10

  optimizer: Prodigy
  learning_rate: 1.0
@@ -37,59 +38,32 @@ hyperparameters:
  torch_scheduler: True

  evaluation:
- batch_size: 16
- frequency: 1000
- size: 16
+ batch_size: 4
+ frequency: 250
+ size: 4

  steps: 500
- ar_temperature: 0.95
- nar_temperature: 0.25
- load_disabled_engines: True
+ ar_temperature: 1.0
+ nar_temperature: 0.0

  trainer:
- #no_logger: True
- ddp: False
- check_for_oom: False
- iterations: 1_000_000
-
- save_tag: step
- save_on_oom: True
- save_on_quit: True
- save_frequency: 500
- export_on_save: True
-
- keep_last_checkpoints: 8
+ iterations: 1_000_000
+ save_frequency: 250
+ keep_last_checkpoints: 4

- aggressive_optimizations: False
- load_disabled_engines: False
+ resize_modules: True
  gradient_checkpointing: True

- #load_state_dict: True
- strict_loading: False
- #load_tag: "9500"
- #load_states: False
- #restart_step_count: True
-
- gc_mode: None # "global_step"
-
  weight_dtype: bfloat16
  amp: True

  backend: deepspeed
  deepspeed:
- inferencing: True
- zero_optimization_level: 0
- use_compression_training: False
-
+ inferencing: False
  amp: False

- load_webui: False
-
  inference:
- backend: deepspeed
- audio_backend: "vocos"
- normalize: False
-
+ backend: local
  weight_dtype: bfloat16
  amp: True

@@ -107,31 +81,28 @@ optimizations:
  fp8: False

  dataset:
- speaker_name_getter: "lambda p: f'{p.parts[-3]}_{p.parts[-2]}'"
- speaker_group_getter: "lambda p: f'{p.parts[-3]}'"
- speaker_languages:
- ja: []
-
  use_hdf5: True
- use_metadata: True
  hdf5_flag: r
+
+ use_metadata: True
  validate: True

- workers: 6
+ workers: 1
  cache: True

- duration_range: [3.0, 16.0]
+ duration_range: [3.0, 12.0]

- random_utterance: 1.0
- max_prompts: 1
- prompt_duration_range: [3.0, 9.0]
+ prompt_max_samples: 1
+ prompt_duration_range: [3.0, 3.0]

- max_resps: 1
- p_resp_append: 0.25
+ resps_max_samples: 1

  sample_type: path # path # speaker
+ sample_order: duration
+ sample_max_duration_batch: 300
+ sample_shuffle: False

- tasks_list: [ "tts" ] # , [ "tts", "tts-c", "ns", "sr", "tse", "cse", "nse", "tts"]
+ tasks_list: [ "tts", "stt" ]

  training: []
  validation: []
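
The changed values above are plain YAML, so they can be spot-checked with a generic loader before launching a run. Below is a minimal sketch using PyYAML rather than the repository's own config loader; the key paths are inferred from the nesting shown in this diff, so adjust them if the actual file layout differs.

# Minimal sketch (assumption): verify the updated values with PyYAML.
# This is a generic check, not the repository's own config loader, and the
# key paths below are inferred from the nesting shown in this diff.
import yaml

with open("models/config.retnet.yaml", "r") as f:
    cfg = yaml.safe_load(f)

print("batch_size:     ", cfg["hyperparameters"]["batch_size"])   # expect 32
print("warmup_steps:   ", cfg["hyperparameters"]["warmup_steps"]) # expect 10
print("eval frequency: ", cfg["evaluation"]["frequency"])         # expect 250
print("save_frequency: ", cfg["trainer"]["save_frequency"])       # expect 250
print("tasks_list:     ", cfg["dataset"]["tasks_list"])           # expect ['tts', 'stt']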