ecker
/

vall-e

Model card Files Files and versions Community

ecker committed on Aug 22, 2023

Commit

f0fb314

·

1 Parent(s): 2b739a9

So far so good

Files changed (4) hide show

README.md +6 -0
ckpt/ar-retnet-4/fp32.pth +3 -0
ckpt/nar-retnet-4/fp32.pth +3 -0
config.yaml +121 -0

README.md CHANGED Viewed

@@ -1,3 +1,9 @@
 ---
 license: agpl-3.0
 ---

 ---
 license: agpl-3.0
 ---
+This repo contains the necessary weights and configuration file for use with my VALL-E implementation: [mrq/vall-e](https://git.ecker.tech/mrq/vall-e)
+The model is currently in a *semi-usable* state, and I'm only releasing the weights now in the hope that they help jumpstart anyone else who wants to use them.
+In the future, I'll release my dataset as well, so anyone can also grab the dataset and train from scratch or continue off from this repo.

ckpt/ar-retnet-4/fp32.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ddbb2dc8049ccfc5547d8dcfb5c6c47dc82b7bcdb3014a3bcf193e21588f254a
+size 418040447

ckpt/nar-retnet-4/fp32.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0170d5e6862cfb5871de952e93ff848e457f631a2cddd2975407c9d4031d2f46
+size 422230591

config.yaml ADDED Viewed

	@@ -0,0 +1,121 @@

+dataset:  # dataset options; the training/validation/noise lists below are empty in this config
+  training: [
+  ]
+  validation: [
+  ]
+  noise: [
+  ]
+  speaker_name_getter: "lambda p: f'{p.parts[-3]}_{p.parts[-2]}'"  # Python lambda source stored as a string: joins p.parts[-3] and p.parts[-2] with "_"; presumably evaluated by the consumer - confirm
+  use_hdf5: True
+  hdf5_flag: r  # plain scalar, loads as the string "r"; presumably an HDF5 open mode - confirm
+  validate: True
+  workers: 4
+  cache: True
+  phones_range: [4, 512]  # two-element [low, high] list; units not shown here - presumably a phoneme count, confirm
+  duration_range: [1.0, 24.0]  # two-element [low, high] list; presumably seconds - confirm
+  random_utterance: 1.0
+  max_prompts: 3
+  prompt_duration: 3.0
+  sample_type: speaker
+  tasks_list: ["tts"] # ["tts", "ns", "sr", "tse", "cse", "nse", "tts"]
+models:  # two model entries, "ar" and "nar"; both use size "full" and arch_type "retnet"
+  _max_levels: 8
+  _models:
+  - name: "ar"  # NOTE(review): presumably pairs with the ckpt/ar-retnet-4 checkpoint added in this commit - confirm
+    size: "full"
+    resp_levels: 1
+    prom_levels: 2
+    tasks: 8
+    arch_type: "retnet"
+  - name: "nar"  # NOTE(review): presumably pairs with the ckpt/nar-retnet-4 checkpoint added in this commit - confirm
+    size: "full"
+    resp_levels: 3
+    prom_levels: 4
+    tasks: 8
+    arch_type: "retnet"
+hyperparameters:  # optimizer settings; no scheduler is active (see scheduler_type)
+  batch_size: 32
+  gradient_accumulation_steps: 4
+  gradient_clipping: 100
+  optimizer: AdamW
+  learning_rate: 1.0e-6  # written with a decimal point and a signed exponent, so it loads as a float
+  scheduler_type: ""  # empty string rather than null; the OneCycle alternative below is commented out
+  #scheduler_type: OneCycle
+  #scheduler_params:
+  #  cycle_first_step_size: 10_000
+  #  cycle_first_stair_count: 10_000
+  #  cycle_second_step_size: 15_000
+  #  cycle_second_stair_count: 15_000
+  #  decay_step_size: 5_000
+  #  cycle_min_lr: 2.5e-4 # 1.0e-5
+  #  cycle_max_lr: 2.5e-4 # 1.0e-4
+  #  decay_lr_rate: 0.0
+  #  cycle_min_mom: 0.90
+  #  cycle_max_mom: 0.99
+  #  decay_mom_rate: 0.0
+evaluation:  # evaluation settings, with separate temperature values for "ar" and "nar"
+  batch_size: 64  # differs from hyperparameters.batch_size (32)
+  frequency: 500  # unit not shown here; presumably training steps between evaluations - confirm
+  size: 64
+  steps: 300
+  ar_temperature: 0.95
+  nar_temperature: 0.25
+trainer:  # training-loop, checkpointing and backend settings
+  iterations: 1_000_000  # underscore separators load as an int under YAML 1.1 loaders such as PyYAML; a strict YAML 1.2 loader would read a string
+  save_tag: step
+  save_on_oom: True
+  save_on_quit: True
+  save_frequency: 25
+  keep_last_checkpoints: 2
+  aggressive_optimizations: False
+  load_state_dict: True
+  strict_loading: False
+  #load_tag: "9500"
+  #load_states: False
+  #restart_step_count: True
+  gc_mode: null  # was the plain scalar None, which YAML loads as the string "None" rather than null; alternative value: "global_step". Assumes the consumer treats null as "no GC mode" - confirm
+  weight_dtype: bfloat16  # differs from inference.weight_dtype (float32)
+  backend: deepspeed
+  deepspeed:  # options nested under the backend name chosen above
+    zero_optimization_level: 2
+    use_compression_training: True
+inference:  # inference-time settings
+  use_vocos: True
+  normalize: False
+  weight_dtype: float32  # differs from trainer.weight_dtype (bfloat16)
+bitsandbytes:  # NOTE(review): enabled is False; presumably the flags below only take effect when it is True - confirm
+  enabled: False
+  injects: True
+  linear: True
+  embedding: True