Kit-Lemonfoot committed
Commit e50b99e
Parent: 2e6d693

Added first version of HoloJPTest3.


- Added Fubuki, Subaru, Rushia, Kanata, Watame, Nene, Polka, and Raden.
- Adjusted style weight range and default value.
- It's the start of the end...

app.py CHANGED
@@ -160,22 +160,23 @@
         if not info['enable']:
             continue
         model_path = info['model_path']
-        model_path_full = f"model_assets/{model_path}/{model_path}.safetensors"
+        model_path_full = f"{model_dir}/{model_path}/{model_path}.safetensors"
         if not os.path.exists(model_path_full):
-            model_path_full = f"model_assets\\{model_path}\\{model_path}.safetensors"
+            model_path_full = f"{model_dir}\\{model_path}\\{model_path}.safetensors"
         voice_name = info['title']
         speakerid = info['speakerid']
         datasetauthor = info['datasetauthor']
         image = info['cover']
         if not os.path.exists(f"images/{image}"):
             image="none.png"
         if not model_path in styledict.keys():
-            conf=f"model_assets/{model_path}/config.json"
+            conf=f"{model_dir}/{model_path}/config.json"
             hps = utils.get_hparams_from_file(conf)
             s2id = hps.data.style2id
             styledict[model_path] = s2id.keys()
             print(f"Set up hyperparameters for model {model_path}")
+        #print(f"Indexed voice {voice_name}")
         if(info['primarylang']=="JP"):
             jpvoices.append((name, model_path, model_path_full, voice_name, speakerid, datasetauthor, image))
         else:
             envoices.append((name, model_path, model_path_full, voice_name, speakerid, datasetauthor, image))
@@ -222,7 +223,8 @@
         help="Do not launch app automatically",
     )
     args = parser.parse_args()
     model_dir = args.dir
+    print(model_dir)
 
     if args.cpu:
         device = "cpu"
@@ -316,11 +318,11 @@
                                         value="Neutral",
                                     )
                                 with gr.TabItem("Style using existing audio"):
-                                    ref_audio_path = gr.Audio(label="Reference Audio", type="filepath")
+                                    ref_audio_path = gr.Audio(label="Reference Audio", type="filepath", sources=["upload"])
                                     style_weight = gr.Slider(
                                         minimum=0,
-                                        maximum=50,
-                                        value=5,
+                                        maximum=20,
+                                        value=3,
                                         step=0.1,
                                         label="Style strength",
                                     )
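The load_voicedata hunk above keeps two hard-coded path variants (forward slashes, then a backslash fallback for Windows). For illustration only, a minimal sketch of the same lookup done portably with os.path.join; resolve_model_file is a hypothetical helper, not part of this commit:

import os

def resolve_model_file(model_dir: str, model_path: str) -> str:
    # os.path.join uses the platform's separator, so no hand-written
    # backslash fallback is needed.
    candidate = os.path.join(model_dir, model_path, f"{model_path}.safetensors")
    if not os.path.exists(candidate):
        raise FileNotFoundError(candidate)
    return candidate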
images/fubuki.png ADDED
images/kanata.png ADDED
images/nene.png ADDED
images/polka.png ADDED
images/raden.png ADDED
images/rushia.png ADDED
images/subaru.png ADDED
images/watame.png ADDED
model_assets/SBV2_HoloJPTest3/SBV2_HoloJPTest3.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:435a6d300788d821738d8f1d4b91c96f4ce888086875fa51b85339c777e5a170
+size 198775356
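The .safetensors weights themselves live in Git LFS; the file committed here is just the three-line pointer shown above, following the spec URL it quotes. A minimal sketch of reading such a pointer (parse_lfs_pointer is a hypothetical helper, not part of this repo):

def parse_lfs_pointer(text: str) -> dict:
    # Each pointer line is "key value"; the oid is "sha256:<hex digest>".
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    algo, digest = fields["oid"].split(":", 1)
    return {"version": fields["version"], "algo": algo, "digest": digest,
            "size": int(fields["size"])}

pointer = (
    "version https://git-lfs.github.com/spec/v1\n"
    "oid sha256:435a6d300788d821738d8f1d4b91c96f4ce888086875fa51b85339c777e5a170\n"
    "size 198775356\n"
)
assert parse_lfs_pointer(pointer)["size"] == 198775356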
model_assets/SBV2_HoloJPTest3/config.json ADDED
@@ -0,0 +1,124 @@
+{
+  "model_name": "SBV2_HoloJPTest3",
+  "train": {
+    "log_interval": 200,
+    "eval_interval": 1000,
+    "seed": 42,
+    "epochs": 100,
+    "learning_rate": 0.0002,
+    "betas": [
+      0.8,
+      0.99
+    ],
+    "eps": 1e-09,
+    "batch_size": 4,
+    "bf16_run": false,
+    "lr_decay": 0.99995,
+    "segment_size": 16384,
+    "init_lr_ratio": 1,
+    "warmup_epochs": 0,
+    "c_mel": 45,
+    "c_kl": 1.0,
+    "skip_optimizer": false,
+    "freeze_ZH_bert": false,
+    "freeze_JP_bert": false,
+    "freeze_EN_bert": false,
+    "freeze_style": false,
+    "freeze_encoder": false,
+    "freeze_decoder": false
+  },
+  "data": {
+    "use_jp_extra": false,
+    "training_files": "Data\\SBV2_HoloJPTest3\\train.list",
+    "validation_files": "Data\\SBV2_HoloJPTest3\\val.list",
+    "max_wav_value": 32768.0,
+    "sampling_rate": 44100,
+    "filter_length": 2048,
+    "hop_length": 512,
+    "win_length": 2048,
+    "n_mel_channels": 128,
+    "mel_fmin": 0.0,
+    "mel_fmax": null,
+    "add_blank": true,
+    "n_speakers": 8,
+    "cleaned_text": true,
+    "num_styles": 10,
+    "style2id": {
+      "Neutral": 0,
+      "Subaru": 1,
+      "Watame": 2,
+      "Polka": 3,
+      "Surprise": 4,
+      "Rushia": 5,
+      "Fubuki": 6,
+      "Kanata": 7,
+      "Raden": 8,
+      "Nene": 9
+    },
+    "spk2id": {
+      "ShirakamiFubuki": 0,
+      "OozoraSubaru": 1,
+      "UruhaRushia": 2,
+      "AmaneKanata": 3,
+      "TsunomakiWatame": 4,
+      "MomosuzuNene": 5,
+      "OmaruPolka": 6,
+      "JuufuuteiRaden": 7
+    }
+  },
+  "model": {
+    "use_spk_conditioned_encoder": true,
+    "use_noise_scaled_mas": true,
+    "use_mel_posterior_encoder": false,
+    "use_duration_discriminator": true,
+    "inter_channels": 192,
+    "hidden_channels": 192,
+    "filter_channels": 768,
+    "n_heads": 2,
+    "n_layers": 6,
+    "kernel_size": 3,
+    "p_dropout": 0.1,
+    "resblock": "1",
+    "resblock_kernel_sizes": [
+      3,
+      7,
+      11
+    ],
+    "resblock_dilation_sizes": [
+      [
+        1,
+        3,
+        5
+      ],
+      [
+        1,
+        3,
+        5
+      ],
+      [
+        1,
+        3,
+        5
+      ]
+    ],
+    "upsample_rates": [
+      8,
+      8,
+      2,
+      2,
+      2
+    ],
+    "upsample_initial_channel": 512,
+    "upsample_kernel_sizes": [
+      16,
+      16,
+      8,
+      2,
+      2
+    ],
+    "n_layers_q": 3,
+    "use_spectral_norm": false,
+    "gin_channels": 256
+  },
+  "version": "2.5.0"
+}
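The app reads this config through utils.get_hparams_from_file (see the load_voicedata hunk above). A minimal sketch of the same style2id/spk2id lookup using only the standard json module, assuming the repository's file layout:

import json

with open("model_assets/SBV2_HoloJPTest3/config.json", encoding="utf-8") as f:
    cfg = json.load(f)

# Style presets shown in the UI dropdown: Neutral, Subaru, Watame, ...
styles = list(cfg["data"]["style2id"].keys())
# Speaker name -> sid handed to infer(), e.g. "ShirakamiFubuki" -> 0
speaker_ids = cfg["data"]["spk2id"]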
model_assets/SBV2_HoloJPTest3/style_vectors.npy ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8516f4541abf025c8f22b383c4366ba28704a0c3e210ef8672f821729e5caa22
+size 10368
voicelist.json CHANGED
@@ -224,6 +224,15 @@
     "primarylang": "JP",
     "cover": "mel.png"
   },
+  "Fubuki": {
+    "enable": true,
+    "model_path": "SBV2_HoloJPTest3",
+    "title": "Shirakami Fubuki",
+    "speakerid": "ShirakamiFubuki",
+    "datasetauthor": "Kit Lemonfoot",
+    "primarylang": "JP",
+    "cover": "fubuki.png"
+  },
   "Matsuri": {
     "enable": true,
     "model_path": "SBV2_HoloJPTest2.5",
@@ -269,6 +278,15 @@
     "primarylang": "JP",
     "cover": "ayame.png"
   },
+  "Subaru": {
+    "enable": true,
+    "model_path": "SBV2_HoloJPTest3",
+    "title": "Oozora Subaru",
+    "speakerid": "OozoraSubaru",
+    "datasetauthor": "Kit Lemonfoot",
+    "primarylang": "JP",
+    "cover": "subaru.png"
+  },
   "Okayu": {
     "enable": true,
     "model_path": "SBV2_HoloJPTest",
@@ -278,6 +296,15 @@
     "primarylang": "JP",
     "cover": "okayu.png"
   },
+  "Rushia": {
+    "enable": true,
+    "model_path": "SBV2_HoloJPTest3",
+    "title": "Uruha Rushia",
+    "speakerid": "UruhaRushia",
+    "datasetauthor": "Kit Lemonfoot",
+    "primarylang": "JP",
+    "cover": "rushia.png"
+  },
   "Flare": {
     "enable": true,
     "model_path": "SBV2_HoloJPTest",
@@ -305,6 +332,24 @@
     "primarylang": "JP",
     "cover": "marine.png"
   },
+  "Kanata": {
+    "enable": true,
+    "model_path": "SBV2_HoloJPTest3",
+    "title": "Amane Kanata",
+    "speakerid": "AmaneKanata",
+    "datasetauthor": "Kit Lemonfoot",
+    "primarylang": "JP",
+    "cover": "kanata.png"
+  },
+  "Watame": {
+    "enable": true,
+    "model_path": "SBV2_HoloJPTest3",
+    "title": "Tsunomaki Watame",
+    "speakerid": "TsunomakiWatame",
+    "datasetauthor": "Kit Lemonfoot",
+    "primarylang": "JP",
+    "cover": "watame.png"
+  },
   "Towa": {
     "enable": true,
     "model_path": "SBV2_HoloJPTest2",
@@ -332,6 +377,24 @@
     "primarylang": "JP",
     "cover": "lamy.png"
   },
+  "Nene": {
+    "enable": true,
+    "model_path": "SBV2_HoloJPTest3",
+    "title": "Momosuzu Nene",
+    "speakerid": "MomosuzuNene",
+    "datasetauthor": "Kit Lemonfoot",
+    "primarylang": "JP",
+    "cover": "nene.png"
+  },
+  "Polka": {
+    "enable": true,
+    "model_path": "SBV2_HoloJPTest3",
+    "title": "Omaru Polka",
+    "speakerid": "OmaruPolka",
+    "datasetauthor": "Kit Lemonfoot",
+    "primarylang": "JP",
+    "cover": "polka.png"
+  },
   "Laplus": {
     "enable": true,
     "model_path": "SBV2_HoloJPTest",
@@ -376,5 +439,14 @@
     "datasetauthor": "Kit Lemonfoot",
     "primarylang": "JP",
     "cover": "ririka.png"
+  },
+  "Raden": {
+    "enable": true,
+    "model_path": "SBV2_HoloJPTest3",
+    "title": "Juufuutei Raden",
+    "speakerid": "JuufuuteiRaden",
+    "datasetauthor": "Kit Lemonfoot",
+    "primarylang": "JP",
+    "cover": "raden.png"
   }
 }
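Each of the eight new entries follows the same schema that load_voicedata expects. A minimal sketch of how such entries are filtered and grouped, mirroring the app's logic under the same file layout:

import json

with open("voicelist.json", encoding="utf-8") as f:
    voices = json.load(f)

jp_names, other_names = [], []
for name, info in voices.items():
    if not info["enable"]:
        continue
    # Keys the app reads: model_path, title, speakerid, datasetauthor,
    # primarylang, cover.
    (jp_names if info["primarylang"] == "JP" else other_names).append(name)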