Plachta committed on
Commit
18dbf2d
·
1 Parent(s): eac9bb8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +183 -278
app.py CHANGED
@@ -5,7 +5,6 @@ import re
5
  import tempfile
6
  import logging
7
  logging.getLogger('numba').setLevel(logging.WARNING)
8
- import ONNXVITS_infer
9
  import librosa
10
  import numpy as np
11
  import torch
@@ -15,14 +14,12 @@ import utils
15
  import gradio as gr
16
  import gradio.utils as gr_utils
17
  import gradio.processing_utils as gr_processing_utils
18
- from models import SynthesizerTrn
19
  from text import text_to_sequence, _clean_text
20
  from text.symbols import symbols
21
  from mel_processing import spectrogram_torch
22
- import translators.server as tss
23
  import psutil
24
  from datetime import datetime
25
- from text.cleaners import japanese_cleaners
26
 
27
  def audio_postprocess(self, y):
28
  if y is None:
@@ -42,53 +39,71 @@ def audio_postprocess(self, y):
42
  return gr_processing_utils.encode_url_or_file_to_base64(file.name)
43
 
44
 
 
 
 
 
 
 
 
45
  gr.Audio.postprocess = audio_postprocess
46
 
47
  limitation = os.getenv("SYSTEM") == "spaces" # limit text and audio length in huggingface spaces
48
- languages = ['日本語', '简体中文', 'English']
49
- characters = ['0:特别周', '1:无声铃鹿', '2:东海帝王', '3:丸善斯基',
50
- '4:富士奇迹', '5:小栗帽', '6:黄金船', '7:伏特加',
51
- '8:大和赤骥', '9:大树快车', '10:草上飞', '11:菱亚马逊',
52
- '12:目白麦昆', '13:神鹰', '14:好歌剧', '15:成田白仁',
53
- '16:鲁道夫象征', '17:气槽', '18:爱丽数码', '19:青云天空',
54
- '20:玉藻十字', '21:美妙姿势', '22:琵琶晨光', '23:重炮',
55
- '24:曼城茶座', '25:美普波旁', '26:目白雷恩', '27:菱曙',
56
- '28:雪之美人', '29:米浴', '30:艾尼斯风神', '31:爱丽速子',
57
- '32:爱慕织姬', '33:稻荷一', '34:胜利奖券', '35:空中神宫',
58
- '36:荣进闪耀', '37:真机伶', '38:川上公主', '39:黄金城市',
59
- '40:樱花进王', '41:采珠', '42:新光风', '43:东商变革',
60
- '44:超级小溪', '45:醒目飞鹰', '46:荒漠英雄', '47:东瀛佐敦',
61
- '48:中山庆典', '49:成田大进', '50:西野花', '51:春乌拉拉',
62
- '52:青竹回忆', '53:微光飞驹', '54:美丽周日', '55:待兼福来',
63
- '56:Mr.C.B', '57:名将怒涛', '58:目白多伯', '59:优秀素质',
64
- '60:帝王光环', '61:待兼诗歌剧', '62:生野狄杜斯', '63:目白善信',
65
- '64:大拓太阳神', '65:双涡轮', '66:里见光钻', '67:北部玄驹',
66
- '68:樱花千代王', '69:天狼星象征', '70:目白阿尔丹', '71:八重无敌',
67
- '72:鹤丸刚志', '73:目白光明', '74:樱花桂冠', '75:成田路',
68
- '76:也文摄辉', '77:吉兆', '78:谷野美酒', '79:第一红宝石',
69
- '80:真弓快车', '81:骏川手纲', '82:凯斯奇迹', '83:小林历奇',
70
- '84:北港火山', '85:奇锐骏', '86:秋川理事长']
71
- def show_memory_info(hint):
72
- pid = os.getpid()
73
- p = psutil.Process(pid)
74
- info = p.memory_info()
75
- memory = info.rss / 1024.0 / 1024
76
- print("{} 内存占用: {} MB".format(hint, memory))
77
 
78
- def text_to_phoneme(text, symbols, is_symbol):
79
- _symbol_to_id = {s: i for i, s in enumerate(symbols)}
 
 
 
 
 
 
 
 
80
 
81
- sequence = ""
82
- if not is_symbol:
83
- clean_text = japanese_cleaners(text)
84
- else:
85
- clean_text = text
86
- for symbol in clean_text:
87
- if symbol not in _symbol_to_id.keys():
88
- continue
89
- symbol_id = _symbol_to_id[symbol]
90
- sequence += symbol
91
- return sequence
 
 
 
 
 
 
 
 
 
92
 
93
  def get_text(text, hps, is_symbol):
94
  text_norm = text_to_sequence(text, hps.symbols, [] if is_symbol else hps.data.text_cleaners)
@@ -97,120 +112,12 @@ def get_text(text, hps, is_symbol):
97
  text_norm = LongTensor(text_norm)
98
  return text_norm
99
 
100
- hps = utils.get_hparams_from_file("./configs/uma87.json")
101
- symbols = hps.symbols
102
- net_g = ONNXVITS_infer.SynthesizerTrn(
103
- len(hps.symbols),
104
- hps.data.filter_length // 2 + 1,
105
- hps.train.segment_size // hps.data.hop_length,
106
- n_speakers=hps.data.n_speakers,
107
- **hps.model)
108
- _ = net_g.eval()
109
-
110
- _ = utils.load_checkpoint("pretrained_models/G_1153000.pth", net_g)
111
 
112
- def to_symbol_fn(is_symbol_input, input_text, temp_text):
113
- return (_clean_text(input_text, hps.data.text_cleaners), input_text) if is_symbol_input \
114
- else (temp_text, temp_text)
115
-
116
- def infer(text_raw, character, language, duration, noise_scale, noise_scale_w, is_symbol):
117
- # check character & duraction parameter
118
- if language not in languages:
119
- print("Error: No such language\n")
120
- return "Error: No such language", None, None, None
121
- if character not in characters:
122
- print("Error: No such character\n")
123
- return "Error: No such character", None, None, None
124
- # check text length
125
- if limitation:
126
- text_len = len(text_raw) if is_symbol else len(re.sub("\[([A-Z]{2})\]", "", text_raw))
127
- max_len = 150
128
- if is_symbol:
129
- max_len *= 3
130
- if text_len > max_len:
131
- print(f"Refused: Text too long ({text_len}).")
132
- return "Error: Text is too long", None, None, None
133
- if text_len == 0:
134
- print("Refused: Text length is zero.")
135
- return "Error: Please input text!", None, None, None
136
- if is_symbol:
137
- text = text_raw
138
- elif language == '日本語':
139
- text = text_raw
140
- elif language == '简体中文':
141
- text = tss.google(text_raw, from_language='zh', to_language='ja')
142
- elif language == 'English':
143
- text = tss.google(text_raw, from_language='en', to_language='ja')
144
- char_id = int(character.split(':')[0])
145
- stn_tst = get_text(text, hps, is_symbol)
146
- with torch.no_grad():
147
- x_tst = stn_tst.unsqueeze(0)
148
- x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
149
- sid = torch.LongTensor([char_id])
150
- try:
151
- jp2phoneme = text_to_phoneme(text, hps.symbols, is_symbol)
152
- durations = net_g.predict_duration(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale,
153
- noise_scale_w=noise_scale_w, length_scale=duration)
154
- char_dur_list = []
155
- for i, char in enumerate(jp2phoneme):
156
- char_pos = i * 2 + 1
157
- char_dur = durations[char_pos]
158
- char_dur_list.append(char_dur)
159
- except IndexError:
160
- print("Refused: Phoneme input contains non-phoneme character.")
161
- return "Error: You can only input phoneme under phoneme input model", None, None, None
162
- char_spacing_dur_list = []
163
- char_spacings = []
164
- for i in range(len(durations)):
165
- if i % 2 == 0: # spacing
166
- char_spacings.append("spacing")
167
- elif i % 2 == 1: # char
168
- char_spacings.append(jp2phoneme[int((i - 1) / 2)])
169
- char_spacing_dur_list.append(int(durations[i]))
170
- # convert duration information to string
171
- duration_info_str = ""
172
- for i in range(len(char_spacings)):
173
- if i == len(char_spacings) - 1:
174
- duration_info_str += "(" + str(char_spacing_dur_list[i]) + ")"
175
- elif char_spacings[i] == "spacing":
176
- duration_info_str += "(" + str(char_spacing_dur_list[i]) + ")" + ", "
177
- else:
178
- duration_info_str += char_spacings[i] + ":" + str(char_spacing_dur_list[i])
179
- audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w, length_scale=duration)[0][0,0].data.float().numpy()
180
- currentDateAndTime = datetime.now()
181
- print(f"\nCharacter {character} inference successful: {text}")
182
- if language != '日本語':
183
- print(f"translate from {language}: {text_raw}")
184
- show_memory_info(str(currentDateAndTime) + " infer调用后")
185
- return (text,(22050, audio), jp2phoneme, duration_info_str)
186
-
187
- def infer_from_phoneme_dur(duration_info_str, character, duration, noise_scale, noise_scale_w):
188
- try:
189
- phonemes = duration_info_str.split(", ")
190
- recons_durs = []
191
- recons_phonemes = ""
192
- for i, item in enumerate(phonemes):
193
- if i == 0:
194
- recons_durs.append(int(item.strip("()")))
195
- else:
196
- phoneme_n_dur, spacing_dur = item.split("(")
197
- recons_phonemes += phoneme_n_dur.split(":")[0]
198
- recons_durs.append(int(phoneme_n_dur.split(":")[1]))
199
- recons_durs.append(int(spacing_dur.strip(")")))
200
- except ValueError:
201
- return ("Error: Format must not be changed!", None)
202
- except AssertionError:
203
- return ("Error: Format must not be changed!", None)
204
- char_id = int(character.split(':')[0])
205
- stn_tst = get_text(recons_phonemes, hps, is_symbol=True)
206
- with torch.no_grad():
207
- x_tst = stn_tst.unsqueeze(0)
208
- x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
209
- sid = torch.LongTensor([char_id])
210
- audio = net_g.infer_with_duration(x_tst, x_tst_lengths, w_ceil=recons_durs, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w,
211
- length_scale=duration)[0][0, 0].data.cpu().float().numpy()
212
- print(f"\nCharacter {character} inference successful: {recons_phonemes}, from {duration_info_str}")
213
- return (recons_phonemes, (22050, audio))
214
 
215
  download_audio_js = """
216
  () =>{{
@@ -230,134 +137,132 @@ download_audio_js = """
230
  }}
231
  """
232
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
  if __name__ == "__main__":
234
  parser = argparse.ArgumentParser()
235
  parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
236
  args = parser.parse_args()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
  app = gr.Blocks()
238
  with app:
239
- gr.Markdown("# Umamusume voice synthesizer 赛马娘语音合成器\n\n"
240
  "![visitor badge](https://visitor-badge.glitch.me/badge?page_id=Plachta.VITS-Umamusume-voice-synthesizer)\n\n"
241
- "This synthesizer is created based on [VITS](https://arxiv.org/abs/2106.06103) model, trained on voice data extracted from mobile game Umamusume Pretty Derby \n\n"
242
- "这个合成器是基于VITS文本到语音模型,在从手游《賽馬娘:Pretty Derby》解包的语音数据上训练得到。[Dataset Link](https://huggingface.co/datasets/Plachta/Umamusume-voice-text-pairs/tree/main)\n\n"
243
- "[introduction video / 模型介绍视频](https://www.bilibili.com/video/BV1T84y1e7p5/?vd_source=6d5c00c796eff1cbbe25f1ae722c2f9f#reply607277701)\n\n"
244
- "You may duplicate this space or [open in Colab](https://colab.research.google.com/drive/1J2Vm5dczTF99ckyNLXV0K-hQTxLwEaj5?usp=sharing) to run it privately and without any queue.\n\n"
245
- "您可以复制该空间至私人空间运行或打开[Google Colab](https://colab.research.google.com/drive/1J2Vm5dczTF99ckyNLXV0K-hQTxLwEaj5?usp=sharing)在线运行。\n\n"
246
  "If you have any suggestions or bug reports, feel free to open discussion in [Community](https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/discussions).\n\n"
247
  "若有bug反馈或建议,请在[Community](https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/discussions)下开启一个新的Discussion。 \n\n"
248
- "If your input language is not Japanese, it will be translated to Japanese by Google translator, but accuracy is not guaranteed.\n\n"
249
- "如果您的输入语言不是日语,则会由谷歌翻译自动翻译为日语,但是准确性不能保证。\n\n"
250
  )
251
- with gr.Row():
252
- with gr.Column():
253
- # We instantiate the Textbox class
254
- textbox = gr.TextArea(label="Text", placeholder="Type your sentence here (Maximum 150 words)", value="こんにちわ。", elem_id=f"tts-input")
255
- with gr.Accordion(label="Phoneme Input", open=False):
256
- temp_text_var = gr.Variable()
257
- symbol_input = gr.Checkbox(value=False, label="Symbol input")
258
- symbol_list = gr.Dataset(label="Symbol list", components=[textbox],
259
- samples=[[x] for x in symbols],
260
- elem_id=f"symbol-list")
261
- symbol_list_json = gr.Json(value=symbols, visible=False)
262
- symbol_input.change(to_symbol_fn,
263
- [symbol_input, textbox, temp_text_var],
264
- [textbox, temp_text_var])
265
- symbol_list.click(None, [symbol_list, symbol_list_json], textbox,
266
- _js=f"""
267
- (i, symbols, text) => {{
268
- let root = document.querySelector("body > gradio-app");
269
- if (root.shadowRoot != null)
270
- root = root.shadowRoot;
271
- let text_input = root.querySelector("#tts-input").querySelector("textarea");
272
- let startPos = text_input.selectionStart;
273
- let endPos = text_input.selectionEnd;
274
- let oldTxt = text_input.value;
275
- let result = oldTxt.substring(0, startPos) + symbols[i] + oldTxt.substring(endPos);
276
- text_input.value = result;
277
- let x = window.scrollX, y = window.scrollY;
278
- text_input.focus();
279
- text_input.selectionStart = startPos + symbols[i].length;
280
- text_input.selectionEnd = startPos + symbols[i].length;
281
- text_input.blur();
282
- window.scrollTo(x, y);
283
-
284
- text = text_input.value;
285
-
286
- return text;
287
- }}""")
288
- # select character
289
- char_dropdown = gr.Dropdown(choices=characters, value = "0:特别周", label='character')
290
- language_dropdown = gr.Dropdown(choices=languages, value = "日本語", label='language')
291
-
292
-
293
- duration_slider = gr.Slider(minimum=0.1, maximum=5, value=1, step=0.1, label='时长 Duration')
294
- noise_scale_slider = gr.Slider(minimum=0.1, maximum=5, value=0.667, step=0.001, label='噪声比例 noise_scale')
295
- noise_scale_w_slider = gr.Slider(minimum=0.1, maximum=5, value=0.8, step=0.1, label='噪声偏差 noise_scale_w')
296
-
297
-
298
-
299
- with gr.Column():
300
- text_output = gr.Textbox(label="Output Text")
301
- phoneme_output = gr.Textbox(label="Output Phonemes", interactive=False)
302
- audio_output = gr.Audio(label="Output Audio", elem_id="tts-audio")
303
- btn = gr.Button("Generate!")
304
- cus_dur_gn_btn = gr.Button("Regenerate with custom phoneme durations")
305
-
306
- download = gr.Button("Download Audio")
307
- download.click(None, [], [], _js=download_audio_js.format(audio_id="tts-audio"))
308
- with gr.Accordion(label="Speaking Pace Control", open=True):
309
 
310
- duration_output = gr.Textbox(label="Duration of each phoneme", placeholder="After you generate a sentence, the detailed information of each phoneme's duration will be presented here.",
311
- interactive = True)
312
- gr.Markdown(
313
- "The number after the : mark represents the length of each phoneme in the generated audio, while the number inside ( ) represents the lenght of spacing between each phoneme and its next phoneme. "
314
- "You can manually change the numbers to adjust the length of each phoneme, so that speaking pace can be completely controlled. "
315
- "Note that these numbers should be integers only. \n\n(1 represents a length of 0.01161 seconds)\n\n"
316
- "音素冒号后的数字代表音素在生成音频中的长度,( )内的数字代表每个音素与下一个音素之间间隔的长度。"
317
- "您可以手动修改这些数字来控制每个音素以及间隔的长度,从而完全控制合成音频的说话节奏。"
318
- "注意这些数字只能是整数。 \n\n(1 代表 0.01161 秒的长度)\n\n"
319
- )
320
- btn.click(infer, inputs=[textbox, char_dropdown, language_dropdown, duration_slider, noise_scale_slider, noise_scale_w_slider, symbol_input],
321
- outputs=[text_output, audio_output, phoneme_output, duration_output])
322
- cus_dur_gn_btn.click(infer_from_phoneme_dur, inputs=[duration_output, char_dropdown, duration_slider, noise_scale_slider, noise_scale_w_slider],
323
- outputs=[phoneme_output, audio_output])
324
-
325
- examples = [['haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......', '29:米浴', '日本語', 1, 0.667, 0.8, True],
326
- ['お疲れ様です,トレーナーさん。', '1:无声铃鹿', '日本語', 1, 0.667, 0.8, False],
327
- ['張り切っていこう!', '67:北部玄驹', '日本語', 1, 0.667, 0.8, False],
328
- ['何でこんなに慣れでんのよ,私のほが先に好きだっだのに。', '10:草上飞', '日本語', 1, 0.667, 0.8, False],
329
- ['授業中に出しだら,学校生活終わるですわ。', '12:目白麦昆', '日本語', 1, 0.667, 0.8, False],
330
- ['お帰りなさい,お兄様!', '29:米浴', '日本語', 1, 0.667, 0.8, False],
331
- ['私の処女をもらっでください!', '29:米浴', '日本語', 1, 0.667, 0.8, False]]
332
- gr.Examples(
333
- examples=examples,
334
- inputs=[textbox, char_dropdown, language_dropdown,
335
- duration_slider, noise_scale_slider,noise_scale_w_slider, symbol_input],
336
- outputs=[text_output, audio_output],
337
- fn=infer
338
- )
339
- gr.Markdown("# Updates Logs 更新日志:\n\n"
340
- "2023/1/24:\n\n"
341
- "Improved the format of phoneme length control.\n\n"
342
- "改善了音素控制的格式。\n\n"
343
- "2023/1/24:\n\n"
344
- "Added more precise control on pace of speaking by modifying the duration of each phoneme.\n\n"
345
- "增加了对说话节奏的音素级控制。\n\n"
346
- "2023/1/13:\n\n"
347
- "Added one example of phoneme input.\n\n"
348
- "增加了音素输入的example(米浴喘气)\n\n"
349
- "2023/1/12:\n\n"
350
- "Added phoneme input, which enables more precise control on output audio.\n\n"
351
- "增加了音素输入的功能,可以对语气和语调做到一定程度的精细控制。\n\n"
352
- "Adjusted UI arrangements.\n\n"
353
- "调整了UI的布局。\n\n"
354
- "2023/1/10:\n\n"
355
- "Dataset used for training is now uploaded to [here](https://huggingface.co/datasets/Plachta/Umamusume-voice-text-pairs/tree/main)\n\n"
356
- "数据集已上传,您可以在[这里](https://huggingface.co/datasets/Plachta/Umamusume-voice-text-pairs/tree/main)下载。\n\n"
357
- "2023/1/9:\n\n"
358
- "Model inference has been fully converted to onnxruntime. There will be no more Runtime Error: Memory Limit Exceeded\n\n"
359
- "模型推理已全面转为onnxruntime,现在不会出现Runtime Error: Memory Limit Exceeded了。\n\n"
360
- "Now integrated to [Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts) collection.\n\n"
361
- "现已加入[Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts)模型大全。\n\n"
362
- )
363
  app.queue(concurrency_count=3).launch(show_api=False, share=args.share)
 
5
  import tempfile
6
  import logging
7
  logging.getLogger('numba').setLevel(logging.WARNING)
 
8
  import librosa
9
  import numpy as np
10
  import torch
 
14
  import gradio as gr
15
  import gradio.utils as gr_utils
16
  import gradio.processing_utils as gr_processing_utils
17
+ from ONNXVITS_infer import SynthesizerTrn
18
  from text import text_to_sequence, _clean_text
19
  from text.symbols import symbols
20
  from mel_processing import spectrogram_torch
 
21
  import psutil
22
  from datetime import datetime
 
23
 
24
  def audio_postprocess(self, y):
25
  if y is None:
 
39
  return gr_processing_utils.encode_url_or_file_to_base64(file.name)
40
 
41
 
42
# Tag placed before and after the input text for each selectable language.
# "Mix" maps to an empty tag, so the text is passed through unchanged
# (presumably the user writes the [JA]/[ZH]/[EN] tags inline -- TODO confirm).
language_marks = dict([
    ("日本語", "[JA]"),
    ("简体中文", "[ZH]"),
    ("English", "[EN]"),
    ("Mix", ""),
])

# Replace the gradio Audio component's postprocess step with the local
# audio_postprocess function defined in this file.
gr.Audio.postprocess = audio_postprocess

# limit text and audio length in huggingface spaces
limitation = os.environ.get("SYSTEM") == "spaces"
52
def create_tts_fn(model, hps, speaker_ids):
    """Build the text-to-speech callback for one loaded model.

    Args:
        model: Synthesizer exposing ``infer(x, x_lengths, sid=..., ...)``.
        hps: Hyper-parameters of ``model``; ``hps.data.sampling_rate`` is
            returned together with the audio.
        speaker_ids: Mapping from speaker display name to numeric speaker id.

    Returns:
        ``tts_fn(text, speaker, language, speed, is_symbol)``, which returns a
        ``(message, audio)`` pair. ``audio`` is ``(sampling_rate, samples)`` on
        success and ``None`` when the request is refused.
    """
    def tts_fn(text, speaker, language, speed, is_symbol):
        if limitation:
            # Two-letter tags such as "[JA]" are stripped before counting,
            # so they do not count against the length limit.
            text_len = len(re.sub(r"\[([A-Z]{2})\]", "", text))
            max_len = 150
            if is_symbol:
                max_len *= 3
            if text_len > max_len:
                return "Error: Text is too long", None
        if language is not None:
            # Report an unknown language to the UI instead of raising KeyError.
            if language not in language_marks:
                return "Error: No such language", None
            mark = language_marks[language]
            text = mark + text + mark
        # Same for an unknown (or unset) speaker.
        if speaker not in speaker_ids:
            return "Error: No such character", None
        speaker_id = speaker_ids[speaker]
        stn_tst = get_text(text, hps, is_symbol)
        # NOTE(review): `device` is not defined in the visible part of this
        # file -- confirm it exists at module level before this runs.
        with no_grad():
            x_tst = stn_tst.unsqueeze(0).to(device)
            x_tst_lengths = LongTensor([stn_tst.size(0)]).to(device)
            sid = LongTensor([speaker_id]).to(device)
            # length_scale is the reciprocal of the requested speed.
            audio = model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8,
                                length_scale=1.0 / speed)[0][0, 0].data.cpu().float().numpy()
        del stn_tst, x_tst, x_tst_lengths, sid
        return "Success", (hps.data.sampling_rate, audio)

    return tts_fn
 
 
 
 
 
 
75
 
76
def create_vc_fn(model, hps, speaker_ids):
    """Build the voice-conversion callback for one loaded model.

    Args:
        model: Synthesizer exposing ``voice_conversion(spec, spec_lengths,
            sid_src=..., sid_tgt=...)``.
        hps: Hyper-parameters of ``model``; the ``hps.data`` fields drive
            resampling and spectrogram extraction.
        speaker_ids: Mapping from speaker display name to numeric speaker id.

    Returns:
        ``vc_fn(original_speaker, target_speaker, input_audio)``, which returns
        a ``(message, audio)`` pair. ``audio`` is ``(sampling_rate, samples)``
        on success and ``None`` when the request is refused.
    """
    def vc_fn(original_speaker, target_speaker, input_audio):
        if input_audio is None:
            return "You need to upload an audio", None
        sampling_rate, audio = input_audio
        duration = audio.shape[0] / sampling_rate
        if limitation and duration > 30:
            return "Error: Audio is too long", None
        # Report unknown speakers to the UI instead of raising.
        if original_speaker not in speaker_ids or target_speaker not in speaker_ids:
            return "Error: No such character", None
        original_speaker_id = speaker_ids[original_speaker]
        target_speaker_id = speaker_ids[target_speaker]

        if np.issubdtype(audio.dtype, np.integer):
            # Integer PCM: divide by the dtype's maximum value.
            audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
        else:
            # np.iinfo rejects floating-point dtypes, so float input is only cast.
            audio = audio.astype(np.float32)
        if len(audio.shape) > 1:
            audio = librosa.to_mono(audio.transpose(1, 0))
        if sampling_rate != hps.data.sampling_rate:
            audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=hps.data.sampling_rate)
        # NOTE(review): `device` is not defined in the visible part of this
        # file -- confirm it exists at module level before this runs.
        with no_grad():
            y = torch.FloatTensor(audio)
            y = y.unsqueeze(0)
            spec = spectrogram_torch(y, hps.data.filter_length,
                                     hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
                                     center=False).to(device)
            spec_lengths = LongTensor([spec.size(-1)]).to(device)
            sid_src = LongTensor([original_speaker_id]).to(device)
            sid_tgt = LongTensor([target_speaker_id]).to(device)
            audio = model.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
                0, 0].data.cpu().float().numpy()
        del y, spec, spec_lengths, sid_src, sid_tgt
        return "Success", (hps.data.sampling_rate, audio)

    return vc_fn
107
 
108
  def get_text(text, hps, is_symbol):
109
  text_norm = text_to_sequence(text, hps.symbols, [] if is_symbol else hps.data.text_cleaners)
 
112
  text_norm = LongTensor(text_norm)
113
  return text_norm
114
 
115
def create_to_symbol_fn(hps):
    """Build the callback behind the "Symbol input" checkbox for one model.

    The returned function yields the new ``(textbox, stashed_text)`` pair.
    When the box is ticked, the textbox gets ``input_text`` run through
    ``_clean_text`` with ``hps.data.text_cleaners`` and the raw text is
    stashed. When it is unticked, the stashed text is returned for both.
    """
    def to_symbol_fn(is_symbol_input, input_text, temp_text):
        if not is_symbol_input:
            return temp_text, temp_text
        cleaned_text = _clean_text(input_text, hps.data.text_cleaners)
        return cleaned_text, input_text

    return to_symbol_fn
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
 
122
  download_audio_js = """
123
  () =>{{
 
137
  }}
138
  """
139
 
140
# Filled by the __main__ block: one tuple per loaded model.
models_tts = []
models_vc = []

# Static description of every model offered by the app.
#   title       - tab label
#   languages   - choices offered in the language dropdown
#   description - free text (currently empty)
#   model_path  - checkpoint file
#   config_path - hyper-parameter JSON file
#   examples    - rows of [text, speaker, language, speed, is_symbol]
models_info = [
    {
        "title": "Japanese",
        "languages": ["日本語"],
        "description": "",
        "model_path": "./pretrained_models/G_1153000.pth",
        "config_path": "./configs/uma87.json",
        "examples": [['お疲れ様です,トレーナーさん。', 'Silence Suzuka', '日本語', 1, False],
                     ['張り切っていこう!', 'Kitasan Black', '日本語', 1, False],
                     ['何でこんなに慣れでんのよ,私のほが先に好きだっだのに。', 'Grass Wonder', '日本語', 1, False],
                     ['授業中に出しだら,学校生活終わるですわ。', 'Mejiro Mcqueen', '日本語', 1, False],
                     ['お帰りなさい,お兄様!', 'Rice Shower', '日本語', 1, False],
                     ['私の処女をもらっでください!', 'Rice Shower', '日本語', 1, False]],
    },
    {
        # Titled separately from the first entry so the two tabs can be told apart.
        "title": "Trilingual",
        "languages": ['日本語', '简体中文', 'English', 'Mix'],
        "description": "",
        "model_path": "./pretrained_models/G_1396000.pth",
        "config_path": "./configs/uma_trilingual.json",
        "examples": [['你好,训练员先生,很高兴见到你。', '草上飞 Grass Wonder (Umamusume Pretty Derby)', '简体中文', 1, False],
                     ['To be honest, I have no idea what to say as examples.', '派蒙 Paimon (Genshin Impact)', 'English', 1, False],
                     ['授業中に出しだら,学校生活終わるですわ。', '綾地 寧々 Ayachi Nene (Sanoba Witch)', '日本語', 1, False]],
    },
]
167
+
168
+
169
+
170
if __name__ == "__main__":
    # Entry point: parse CLI flags, load every model listed in models_info,
    # build one TTS tab per model and launch the gradio app.
    parser = argparse.ArgumentParser()
    parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
    args = parser.parse_args()

    def without_language_input(fn):
        """Adapt a tts_fn to the four-input form used by single-language tabs.

        Gradio inputs must be components, so the language cannot be passed as
        a literal None in the inputs list; this wrapper supplies it instead.
        """
        def single_language_tts_fn(text, speaker, speed, is_symbol):
            return fn(text, speaker, None, speed, is_symbol)

        return single_language_tts_fn

    for info in models_info:
        name = info['title']
        lang = info['languages']
        examples = info['examples']
        config_path = info['config_path']
        model_path = info['model_path']
        hps = utils.get_hparams_from_file(config_path)
        model = SynthesizerTrn(
            len(hps.symbols),
            hps.data.filter_length // 2 + 1,
            hps.train.segment_size // hps.data.hop_length,
            n_speakers=hps.data.n_speakers,
            **hps.model)
        utils.load_checkpoint(model_path, model, None)
        model.eval()
        speaker_ids = hps.speakers
        speakers = list(hps.speakers.keys())
        models_tts.append((name, speakers, lang, examples,
                           hps.symbols, create_tts_fn(model, hps, speaker_ids),
                           create_to_symbol_fn(hps)))
        models_vc.append((name, speakers, create_vc_fn(model, hps, speaker_ids)))
    app = gr.Blocks()
    with app:
        gr.Markdown("# English & Chinese & Japanese Anime TTS\n\n"
                    "![visitor badge](https://visitor-badge.glitch.me/badge?page_id=Plachta.VITS-Umamusume-voice-synthesizer)\n\n"
                    "Including Japanese TTS & Trilingual TTS, speakers are all anime characters. 包含一个纯日语TTS和一个中日英三语TTS模型,主要为二次元角色。\n\n"
                    "If you have any suggestions or bug reports, feel free to open discussion in [Community](https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/discussions).\n\n"
                    "若有bug反馈或建议,请在[Community](https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/discussions)下开启一个新的Discussion。 \n\n"
                    )
        with gr.Tabs():
            with gr.TabItem("TTS"):
                with gr.Tabs():
                    for i, (name, speakers, lang, example, symbols, tts_fn, to_symbol_fn) in enumerate(models_tts):
                        with gr.TabItem(name):
                            with gr.Row():
                                with gr.Column():
                                    # The element id carries the tab index so the JS below
                                    # finds this tab's textbox, not the first tab's.
                                    textbox = gr.TextArea(label="Text", placeholder="Type your sentence here (Maximum 150 words)", value="こんにちわ。", elem_id=f"tts-input{i}")
                                    with gr.Accordion(label="Phoneme Input", open=False):
                                        temp_text_var = gr.Variable()
                                        symbol_input = gr.Checkbox(value=False, label="Symbol input")
                                        symbol_list = gr.Dataset(label="Symbol list", components=[textbox],
                                                                 samples=[[x] for x in symbols],
                                                                 elem_id=f"symbol-list")
                                        symbol_list_json = gr.Json(value=symbols, visible=False)
                                        symbol_input.change(to_symbol_fn,
                                                            [symbol_input, textbox, temp_text_var],
                                                            [textbox, temp_text_var])
                                        # Client-side only: insert the clicked symbol at the caret.
                                        symbol_list.click(None, [symbol_list, symbol_list_json], textbox,
                                                          _js=f"""
(i, symbols, text) => {{
    let root = document.querySelector("body > gradio-app");
    if (root.shadowRoot != null)
        root = root.shadowRoot;
    let text_input = root.querySelector("#tts-input{i}").querySelector("textarea");
    let startPos = text_input.selectionStart;
    let endPos = text_input.selectionEnd;
    let oldTxt = text_input.value;
    let result = oldTxt.substring(0, startPos) + symbols[i] + oldTxt.substring(endPos);
    text_input.value = result;
    let x = window.scrollX, y = window.scrollY;
    text_input.focus();
    text_input.selectionStart = startPos + symbols[i].length;
    text_input.selectionEnd = startPos + symbols[i].length;
    text_input.blur();
    window.scrollTo(x, y);

    text = text_input.value;

    return text;
}}""")
                                    # select character
                                    char_dropdown = gr.Dropdown(choices=speakers, value=speakers[0], label='character')
                                    language_dropdown = gr.Dropdown(choices=lang, value=lang[0], label='language')
                                    # NOTE(review): this value reaches tts_fn as `speed`
                                    # (length_scale = 1 / speed), so the "Duration" label may
                                    # read inverted to users -- confirm the intended wording.
                                    duration_slider = gr.Slider(minimum=0.1, maximum=5, value=1, step=0.1, label='时长 Duration')
                                with gr.Column():
                                    text_output = gr.Textbox(label="Message")
                                    audio_output = gr.Audio(label="Output Audio", elem_id=f"tts-audio{i}")
                                    btn = gr.Button("Generate!")

                                    download = gr.Button("Download Audio")
                                    download.click(None, [], [], _js=download_audio_js.format(audio_id=f"tts-audio{i}"))
                                    if len(lang) == 1:
                                        # Single-language model: call tts_fn with language=None so
                                        # no language tag is wrapped around the text.
                                        btn.click(without_language_input(tts_fn),
                                                  inputs=[textbox, char_dropdown, duration_slider, symbol_input],
                                                  outputs=[text_output, audio_output])
                                    else:
                                        btn.click(tts_fn, inputs=[textbox, char_dropdown, language_dropdown, duration_slider, symbol_input],
                                                  outputs=[text_output, audio_output])
                            gr.Examples(
                                examples=example,
                                inputs=[textbox, char_dropdown, language_dropdown,
                                        duration_slider, symbol_input],
                                outputs=[text_output, audio_output],
                                fn=tts_fn
                            )
    app.queue(concurrency_count=3).launch(show_api=False, share=args.share)