ecker committed
Commit 494a301
1 Parent(s): 09f258a

Update models/config.retnet.yaml

Files changed (1)
  1. models/config.retnet.yaml +32 -61
models/config.retnet.yaml CHANGED
@@ -1,5 +1,5 @@
  sample_rate: 24_000
- audio_backend: vocos
+ audio_backend: "vocos"

  models:
  - name: "ar+nar"
@@ -17,17 +17,18 @@ models:
  experimental:
  audio_embedding_sums: True

- hyperparameters:
- autotune: False
- autotune_params:
- start_profile_step: 1
- end_profile_step: 50
- num_tuning_micro_batch_sizes: 8
+ #loras:
+ #- name : "lora"
+ # rank: 128
+ # alpha: 128
+ # training: True
+ # rvq_levels: []

- batch_size: 16
+ hyperparameters:
+ batch_size: 32
  gradient_accumulation_steps: 8
  gradient_clipping: 1.0
- warmup_steps: 250
+ warmup_steps: 10

  optimizer: Prodigy
  learning_rate: 1.0
@@ -37,59 +38,32 @@ hyperparameters:
  torch_scheduler: True

  evaluation:
- batch_size: 16
- frequency: 1000
- size: 16
+ batch_size: 4
+ frequency: 250
+ size: 4

  steps: 500
- ar_temperature: 0.95
- nar_temperature: 0.25
- load_disabled_engines: True
+ ar_temperature: 1.0
+ nar_temperature: 0.0

  trainer:
- #no_logger: True
- ddp: False
- check_for_oom: False
- iterations: 1_000_000
-
- save_tag: step
- save_on_oom: True
- save_on_quit: True
- save_frequency: 500
- export_on_save: True
-
- keep_last_checkpoints: 8
+ iterations: 1_000_000
+ save_frequency: 250
+ keep_last_checkpoints: 4

- aggressive_optimizations: False
- load_disabled_engines: False
+ resize_modules: True
  gradient_checkpointing: True

- #load_state_dict: True
- strict_loading: False
- #load_tag: "9500"
- #load_states: False
- #restart_step_count: True
-
- gc_mode: None # "global_step"
-
  weight_dtype: bfloat16
  amp: True

  backend: deepspeed
  deepspeed:
- inferencing: True
- zero_optimization_level: 0
- use_compression_training: False
-
+ inferencing: False
  amp: False

- load_webui: False
-
  inference:
- backend: deepspeed
- audio_backend: "vocos"
- normalize: False
-
+ backend: local
  weight_dtype: bfloat16
  amp: True

@@ -107,31 +81,28 @@ optimizations:
  fp8: False

  dataset:
- speaker_name_getter: "lambda p: f'{p.parts[-3]}_{p.parts[-2]}'"
- speaker_group_getter: "lambda p: f'{p.parts[-3]}'"
- speaker_languages:
- ja: []
-
  use_hdf5: True
- use_metadata: True
  hdf5_flag: r
+
+ use_metadata: True
  validate: True

- workers: 6
+ workers: 1
  cache: True

- duration_range: [3.0, 16.0]
+ duration_range: [3.0, 12.0]

- random_utterance: 1.0
- max_prompts: 1
- prompt_duration_range: [3.0, 9.0]
+ prompt_max_samples: 1
+ prompt_duration_range: [3.0, 3.0]

- max_resps: 1
- p_resp_append: 0.25
+ resps_max_samples: 1

  sample_type: path # path # speaker
+ sample_order: duration
+ sample_max_duration_batch: 300
+ sample_shuffle: False

- tasks_list: [ "tts" ] # , [ "tts", "tts-c", "ns", "sr", "tse", "cse", "nse", "tts"]
+ tasks_list: [ "tts", "stt" ]

  training: []
  validation: []
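
The changed values above are plain YAML, so they can be spot-checked with a generic loader before launching a run. Below is a minimal sketch using PyYAML rather than the repository's own config loader; the key paths are inferred from the nesting shown in this diff, so adjust them if the actual file layout differs.

# Minimal sketch (assumption): verify the updated values with PyYAML.
# This is a generic check, not the repository's own config loader, and the
# key paths below are inferred from the nesting shown in this diff.
import yaml

with open("models/config.retnet.yaml", "r") as f:
    cfg = yaml.safe_load(f)

print("batch_size:     ", cfg["hyperparameters"]["batch_size"])   # expect 32
print("warmup_steps:   ", cfg["hyperparameters"]["warmup_steps"]) # expect 10
print("eval frequency: ", cfg["evaluation"]["frequency"])         # expect 250
print("save_frequency: ", cfg["trainer"]["save_frequency"])       # expect 250
print("tasks_list:     ", cfg["dataset"]["tasks_list"])           # expect ['tts', 'stt']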