ecker committed on
Commit
2de4670
1 Parent(s): 646f05b

Upload 5 files

Browse files

Uploaded adequate weights

.gitattributes ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ model/ckpt/ar-llama-1/fp32.pth filter=lfs diff=lfs merge=lfs -text
2
+ model/ckpt/ar+nar-llama-8/fp32.pth filter=lfs diff=lfs merge=lfs -text
3
+ model/ckpt/nar-llama-8/fp32.pth filter=lfs diff=lfs merge=lfs -text
model/ckpt/ar+nar-llama-8/fp32.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5575331ee4da43781e3e6dba2f48de5f3f7d811f6584330a56c5a767994e1b2f
3
+ size 441075976
model/ckpt/ar-llama-1/fp32.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de64a12741767b4e3e013f779cdf3ff5d638c1db8635553f63156c321865d668
3
+ size 411697230
model/ckpt/nar-llama-8/fp32.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fed74021797c5eafee3f88e3d79b16e51eb086f47ca2700f56f993eb90f7d3a6
3
+ size 441075976
model/config.split.yaml ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ sample_rate: 24_000
2
+ audio_backend: "vocos"
3
+ experimental: True
4
+
5
+ models:
6
+ - name: "ar"
7
+ size: "full"
8
+ resp_levels: 1
9
+ prom_levels: 1
10
+ tasks: 8
11
+ langs: 2
12
+ tones: 1
13
+ arch_type: llama
14
+ training: False
15
+ version: 5
16
+ attention: auto
17
+ dropout: 0.1
18
+ audio_embedding_sums: False
19
+ experimental: False
20
+ interleave: False
21
+ loss_factors:
22
+ text: 0.01
23
+ prom: 0.5
24
+ resp: 1.0
25
+ capabilities: ["ar"]
26
+ - name: "nar"
27
+ size: "full"
28
+ resp_levels: 8
29
+ prom_levels: 8
30
+ tasks: 8
31
+ langs: 2
32
+ tones: 1
33
+ arch_type: llama
34
+ training: False
35
+ version: 5
36
+ attention: auto
37
+ dropout: 0.1
38
+ audio_embedding_sums: False
39
+ experimental: False
40
+ interleave: False
41
+ loss_factors:
42
+ text: 0.01
43
+ prom: 0.5
44
+ resp: 1.0
45
+ capabilities: ["nar"]
46
+
47
+ hyperparameters:
48
+ autotune: False
49
+ autotune_params:
50
+ start_profile_step: 1
51
+ end_profile_step: 50
52
+ num_tuning_micro_batch_sizes: 8
53
+
54
+ batch_size: 16
55
+ gradient_accumulation_steps: 8
56
+ gradient_clipping: 1.0
57
+ warmup_steps: 250
58
+
59
+ optimizer: Prodigy
60
+ learning_rate: 1.0
61
+ torch_optimizer: True
62
+
63
+ scheduler: "" # ScheduleFree
64
+ torch_scheduler: True
65
+
66
+ evaluation:
67
+ batch_size: 16
68
+ frequency: 1000
69
+ size: 16
70
+
71
+ steps: 500
72
+ ar_temperature: 0.95
73
+ nar_temperature: 0.25
74
+ load_disabled_engines: True
75
+
76
+ trainer:
77
+ #no_logger: True
78
+ ddp: False
79
+ check_for_oom: False
80
+ iterations: 1_000_000
81
+
82
+ save_tag: step
83
+ save_on_oom: True
84
+ save_on_quit: True
85
+ save_frequency: 500
86
+ export_on_save: True
87
+
88
+ keep_last_checkpoints: 8
89
+
90
+ aggressive_optimizations: False
91
+ load_disabled_engines: False
92
+ gradient_checkpointing: True
93
+
94
+ #load_state_dict: True
95
+ strict_loading: False
96
+ #load_tag: "9500"
97
+ #load_states: False
98
+ #restart_step_count: True
99
+
100
+ gc_mode: None # "global_step"
101
+
102
+ weight_dtype: bfloat16
103
+ amp: True
104
+
105
+ backend: deepspeed
106
+ deepspeed:
107
+ inferencing: True
108
+ zero_optimization_level: 0
109
+ use_compression_training: False
110
+
111
+ amp: False
112
+
113
+ load_webui: False
114
+
115
+ inference:
116
+ backend: deepspeed
117
+ audio_backend: "vocos"
118
+ normalize: False
119
+
120
+ weight_dtype: bfloat16
121
+ amp: True
122
+
123
+ optimizations:
124
+ injects: False
125
+ replace: True
126
+
127
+ linear: False
128
+ embedding: False
129
+ optimizers: True
130
+
131
+ bitsandbytes: False
132
+ dadaptation: False
133
+ bitnet: False
134
+ fp8: False
135
+
136
+ dataset:
137
+ speaker_name_getter: "lambda p: f'{p.parts[-3]}_{p.parts[-2]}'"
138
+ speaker_group_getter: "lambda p: f'{p.parts[-3]}'"
139
+ speaker_languages:
140
+ ja: []
141
+
142
+ use_hdf5: True
143
+ use_metadata: True
144
+ hdf5_flag: r
145
+ validate: True
146
+
147
+ workers: 6
148
+ cache: True
149
+
150
+ duration_range: [24.0, 32.0]
151
+
152
+ random_utterance: 1.0
153
+ max_prompts: 1
154
+ prompt_duration_range: [3.0, 9.0]
155
+
156
+ max_resps: 1
157
+ p_resp_append: 0.25
158
+
159
+ sample_type: path # path # speaker
160
+
161
+ tasks_list: [ "tts" ] # , [ "tts", "tts-c", "ns", "sr", "tse", "cse", "nse", "tts"]
162
+
163
+ training: []
164
+ validation: []
165
+ noise: []
model/config.yaml ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ sample_rate: 24_000
2
+ audio_backend: "vocos"
3
+ experimental: True
4
+
5
+ models:
6
+ - name: "ar+nar"
7
+ size: "full"
8
+ resp_levels: 8
9
+ prom_levels: 8
10
+ tasks: 8
11
+ langs: 2
12
+ tones: 1
13
+ arch_type: llama
14
+ training: False
15
+ version: 5
16
+ attention: auto
17
+ dropout: 0.1
18
+ audio_embedding_sums: False
19
+ experimental: False
20
+ interleave: False
21
+ loss_factors:
22
+ text: 0.01
23
+ prom: 0.5
24
+ resp: 1.0
25
+ capabilities: ["ar", "nar"]
26
+
27
+ hyperparameters:
28
+ autotune: False
29
+ autotune_params:
30
+ start_profile_step: 1
31
+ end_profile_step: 50
32
+ num_tuning_micro_batch_sizes: 8
33
+
34
+ batch_size: 16
35
+ gradient_accumulation_steps: 8
36
+ gradient_clipping: 1.0
37
+ warmup_steps: 250
38
+
39
+ optimizer: Prodigy
40
+ learning_rate: 1.0
41
+ torch_optimizer: True
42
+
43
+ scheduler: "" # ScheduleFree
44
+ torch_scheduler: True
45
+
46
+ evaluation:
47
+ batch_size: 16
48
+ frequency: 1000
49
+ size: 16
50
+
51
+ steps: 500
52
+ ar_temperature: 0.95
53
+ nar_temperature: 0.25
54
+ load_disabled_engines: True
55
+
56
+ trainer:
57
+ #no_logger: True
58
+ ddp: False
59
+ check_for_oom: False
60
+ iterations: 1_000_000
61
+
62
+ save_tag: step
63
+ save_on_oom: True
64
+ save_on_quit: True
65
+ save_frequency: 500
66
+ export_on_save: True
67
+
68
+ keep_last_checkpoints: 8
69
+
70
+ aggressive_optimizations: False
71
+ load_disabled_engines: False
72
+ gradient_checkpointing: True
73
+
74
+ #load_state_dict: True
75
+ strict_loading: False
76
+ #load_tag: "9500"
77
+ #load_states: False
78
+ #restart_step_count: True
79
+
80
+ gc_mode: None # "global_step"
81
+
82
+ weight_dtype: bfloat16
83
+ amp: True
84
+
85
+ backend: deepspeed
86
+ deepspeed:
87
+ inferencing: True
88
+ zero_optimization_level: 0
89
+ use_compression_training: False
90
+
91
+ amp: False
92
+
93
+ load_webui: False
94
+
95
+ inference:
96
+ backend: deepspeed
97
+ audio_backend: "vocos"
98
+ normalize: False
99
+
100
+ weight_dtype: bfloat16
101
+ amp: True
102
+
103
+ optimizations:
104
+ injects: False
105
+ replace: True
106
+
107
+ linear: False
108
+ embedding: False
109
+ optimizers: True
110
+
111
+ bitsandbytes: False
112
+ dadaptation: False
113
+ bitnet: False
114
+ fp8: False
115
+
116
+ dataset:
117
+ speaker_name_getter: "lambda p: f'{p.parts[-3]}_{p.parts[-2]}'"
118
+ speaker_group_getter: "lambda p: f'{p.parts[-3]}'"
119
+ speaker_languages:
120
+ ja: []
121
+
122
+ use_hdf5: True
123
+ use_metadata: True
124
+ hdf5_flag: r
125
+ validate: True
126
+
127
+ workers: 6
128
+ cache: True
129
+
130
+ duration_range: [24.0, 32.0]
131
+
132
+ random_utterance: 1.0
133
+ max_prompts: 1
134
+ prompt_duration_range: [3.0, 9.0]
135
+
136
+ max_resps: 1
137
+ p_resp_append: 0.25
138
+
139
+ sample_type: path # path # speaker
140
+
141
+ tasks_list: [ "tts" ] # , [ "tts", "tts-c", "ns", "sr", "tse", "cse", "nse", "tts"]
142
+
143
+ training: []
144
+ validation: []
145
+ noise: []