File size: 1,656 Bytes
2de4670
494a301
2de4670
 
 
 
 
 
 
 
 
0c4f028
2de4670
0c4f028
2de4670
 
03504d1
 
2de4670
494a301
 
 
 
 
 
2de4670
494a301
 
2de4670
 
494a301
2de4670
 
 
 
 
 
 
 
 
494a301
 
 
2de4670
 
494a301
 
2de4670
 
494a301
 
 
2de4670
494a301
2de4670
 
 
 
 
 
 
494a301
2de4670
 
 
494a301
2de4670
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
494a301
 
2de4670
 
494a301
2de4670
 
494a301
2de4670
494a301
 
2de4670
494a301
2de4670
 
494a301
 
 
2de4670
494a301
2de4670
 
 
0c4f028
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# Top-level audio settings.
# The rate is written in plain digits: `24_000` resolves to an integer only
# under YAML 1.1 rules, while YAML 1.2 core-schema loaders read it as the
# string "24_000". `24000` is an integer under both.
sample_rate: 24000
audio_backend: "vocos"

# Model definitions; this config declares a single entry.
models:
  - name: 'ar+nar'
    size: 'full'
    # Per-model counts (meaning is defined by the consuming code).
    resp_levels: 8
    prom_levels: 8
    tasks: 8
    langs: 2
    tones: 1
    arch_type: retnet
    training: false
    version: 2
    dropout: 0.1
    # Both capabilities are listed for this one entry.
    capabilities:
      - 'ar'
      - 'nar'
    experimental:
      audio_embedding_sums: true

#loras:
#- name : "lora"
#  rank: 128
#  alpha: 128
#  training: True
#  rvq_levels: []

# Optimisation hyperparameters.
hyperparameters:
  batch_size: 32
  gradient_accumulation_steps: 8
  gradient_clipping: 1.0
  warmup_steps: 10

  # Optimizer selection and its learning rate.
  optimizer: Prodigy
  learning_rate: 1.0
  torch_optimizer: true

  # No scheduler is named (empty string).
  scheduler: ''  # original note: ScheduleFree
  torch_scheduler: true

# Evaluation settings.
evaluation:
  batch_size: 4
  frequency: 250
  size: 4

  # Step budget and the two temperature values (one for "ar", one for "nar").
  steps: 500
  ar_temperature: 1.0
  nar_temperature: 0.0

# Training-loop settings.
trainer:
  # Plain digits: `1_000_000` is an integer only under YAML 1.1 rules; YAML 1.2
  # core-schema loaders read it as a string. `1000000` is an integer under both.
  iterations: 1000000
  save_frequency: 250
  keep_last_checkpoints: 4

  resize_modules: true
  gradient_checkpointing: true

  weight_dtype: bfloat16
  amp: true

  # Backend choice plus its backend-specific sub-section.
  # Note that `amp` is true at the trainer level but false inside `deepspeed`.
  backend: deepspeed
  deepspeed:
    inferencing: false
    amp: false

# Inference settings; uses the same weight dtype and amp flag as `trainer`.
inference:
  backend: local
  weight_dtype: bfloat16
  amp: true

# Optimization toggles; only `replace` and `optimizers` are switched on.
optimizations:
  injects: false
  replace: true

  # Per-component switches.
  linear: false
  embedding: false
  optimizers: true

  # Optional features, all disabled in this config.
  bitsandbytes: false
  dadaptation: false
  bitnet: false
  fp8: false

# Dataset loading and sampling settings.
dataset:
  use_hdf5: true
  hdf5_flag: 'r'

  use_metadata: true
  validate: true

  workers: 1
  cache: true

  # Two-element [min, max] style ranges; units are not stated in this file.
  duration_range: [3.0, 12.0]

  prompt_max_samples: 1
  prompt_duration_range: [3.0, 3.0]

  resps_max_samples: 1

  # Sampling behaviour.
  sample_type: path  # values noted by the author: path, speaker
  sample_order: duration
  sample_max_duration_batch: 300
  sample_shuffle: false

  tasks_list: ['tts', 'stt']

  # Dataset lists are left empty in this config.
  training: []
  validation: []
  noise: []