Plachta committed on
Commit
b89ec7e
·
1 Parent(s): 33fa82e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -23
app.py CHANGED
@@ -159,12 +159,12 @@ def infer(text_raw, character, language, duration, noise_scale, noise_scale_w, i
159
  # convert duration information to string
160
  duration_info_str = ""
161
  for i in range(len(char_spacings)):
162
- if char_spacings[i] == "spacing":
163
- duration_info_str += str(char_spacing_dur_list[i])
 
 
164
  else:
165
- duration_info_str += "{" + char_spacings[i] + ":" + str(char_spacing_dur_list[i]) + "}"
166
- if i != len(char_spacings)-1:
167
- duration_info_str += ", "
168
  audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w, length_scale=duration)[0][0,0].data.float().numpy()
169
  currentDateAndTime = datetime.now()
170
  print(f"\nCharacter {character} inference successful: {text}")
@@ -178,12 +178,14 @@ def infer_from_phoneme_dur(duration_info_str, character, duration, noise_scale,
178
  phonemes = duration_info_str.split(", ")
179
  recons_durs = []
180
  recons_phonemes = ""
181
- for item in phonemes:
182
- if "{" not in item: # spacing
183
- recons_durs.append(int(item))
184
  else:
185
- recons_phonemes += item.strip("{}").split(":")[0]
186
- recons_durs.append(int(item.strip("{}").split(":")[1]))
 
 
187
  except ValueError:
188
  return ("Error: Format must not be changed!", None)
189
  except AssertionError:
@@ -232,8 +234,8 @@ if __name__ == "__main__":
232
  "您可以复制该空间至私人空间运行或打开[Google Colab](https://colab.research.google.com/drive/1J2Vm5dczTF99ckyNLXV0K-hQTxLwEaj5?usp=sharing)在线运行。\n\n"
233
  "This model has been integrated to the model collections of [Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts).\n\n"
234
  "现已加入[Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts)模型大全。\n\n"
235
- "If you have any suggestions or bug reports, feel free to open discussion in Community.\n\n"
236
- "若有bug反馈或建议,请在Community下开启一个新的Discussion。 \n\n"
237
  "If your input language is not Japanese, it will be translated to Japanese by Google translator, but accuracy is not guaranteed.\n\n"
238
  "如果您的输入语言不是日语,则会由谷歌翻译自动翻译为日语,但是准确性不能保证。\n\n"
239
  )
@@ -296,12 +298,12 @@ if __name__ == "__main__":
296
  duration_output = gr.Textbox(label="Duration of each phoneme", placeholder="After you generate a sentence, the detailed information of each phoneme's duration will be presented here.",
297
  interactive = True)
298
  gr.Markdown(
299
- "\{ \}内的数字代表每个音素在生成的音频中的长度,\{ \}外的数字代表音素之间间隔的长度。"
300
- "您可以手动修改这些数字来控制每个音素以及间隔的长度,从而完全控制合成音频的说话节奏。"
301
- "注意这些数字只能是整数。 \n\n(1 代表 0.01161 秒的长度)\n\n"
302
- "The numbers inside \{ \} represent the length for each phoneme in the generated audio, while the numbers out of \{ \} represent the length of spacings between phonemes."
303
  "You can manually change the numbers to adjust the length of each phoneme, so that speaking pace can be completely controlled."
304
  "Note that these numbers should be integers only. \n\n(1 represents a length of 0.01161 seconds)\n\n"
 
 
 
305
  )
306
  btn.click(infer, inputs=[textbox, char_dropdown, language_dropdown, duration_slider, noise_scale_slider, noise_scale_w_slider, symbol_input],
307
  outputs=[text_output, audio_output, phoneme_output, duration_output])
@@ -324,23 +326,26 @@ if __name__ == "__main__":
324
  )
325
  gr.Markdown("# Updates Logs 更新日志:\n\n"
326
  "2023/1/24:\n\n"
327
- "增加了对说话节奏的音素级控制。\n\n"
 
 
328
  "Added more precise control on pace of speaking by modifying the duration of each phoneme.\n\n"
 
329
  "2023/1/13:\n\n"
330
- "增加了音素输入的example(米浴喘气)\n\n"
331
  "Added one example of phoneme input.\n\n"
 
332
  "2023/1/12:\n\n"
333
- "增加了音素输入的功能,可以对语气和语调做到一定程度的精细控制。\n\n"
334
  "Added phoneme input, which enables more precise control on output audio.\n\n"
335
- "调整了UI的布局。\n\n"
336
  "Adjusted UI arrangements.\n\n"
 
337
  "2023/1/10:\n\n"
338
- "数据集已上传,您可以在[这里](https://huggingface.co/datasets/Plachta/Umamusume-voice-text-pairs/tree/main)下载。\n\n"
339
  "Dataset used for training is now uploaded to [here](https://huggingface.co/datasets/Plachta/Umamusume-voice-text-pairs/tree/main)\n\n"
 
340
  "2023/1/9:\n\n"
341
- "模型推理已全面转为onnxruntime,现在不会出现Runtime Error: Memory Limit Exceeded了。\n\n"
342
  "Model inference has been fully converted to onnxruntime. There will be no more Runtime Error: Memory Limit Exceeded\n\n"
343
- "现已加入[Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts)模型大全。\n\n"
344
  "Now integrated to [Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts) collection.\n\n"
 
345
  )
346
  app.queue(concurrency_count=3).launch(show_api=False, share=args.share)
 
159
  # convert duration information to string
160
  duration_info_str = ""
161
  for i in range(len(char_spacings)):
162
+ if i == len(char_spacings) - 1:
163
+ duration_info_str += "(" + str(char_spacing_dur_list[i]) + ")"
164
+ elif char_spacings[i] == "spacing":
165
+ duration_info_str += "(" + str(char_spacing_dur_list[i]) + ")" + ", "
166
  else:
167
+ duration_info_str += char_spacings[i] + ":" + str(char_spacing_dur_list[i])
 
 
168
  audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w, length_scale=duration)[0][0,0].data.float().numpy()
169
  currentDateAndTime = datetime.now()
170
  print(f"\nCharacter {character} inference successful: {text}")
 
178
  phonemes = duration_info_str.split(", ")
179
  recons_durs = []
180
  recons_phonemes = ""
181
+ for i, item in enumerate(phonemes):
182
+ if i == 0:
183
+ recons_durs.append(int(item.strip("()")))
184
  else:
185
+ phoneme_n_dur, spacing_dur = item.split("(")
186
+ recons_phonemes += phoneme_n_dur.split(":")[0]
187
+ recons_durs.append(int(phoneme_n_dur.split(":")[1]))
188
+ recons_durs.append(int(spacing_dur.strip(")")))
189
  except ValueError:
190
  return ("Error: Format must not be changed!", None)
191
  except AssertionError:
 
234
  "您可以复制该空间至私人空间运行或打开[Google Colab](https://colab.research.google.com/drive/1J2Vm5dczTF99ckyNLXV0K-hQTxLwEaj5?usp=sharing)在线运行。\n\n"
235
  "This model has been integrated to the model collections of [Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts).\n\n"
236
  "现已加入[Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts)模型大全。\n\n"
237
+ "If you have any suggestions or bug reports, feel free to open discussion in [Community](https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/discussions).\n\n"
238
+ "若有bug反馈或建议,请在[Community](https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/discussions)下开启一个新的Discussion。 \n\n"
239
  "If your input language is not Japanese, it will be translated to Japanese by Google translator, but accuracy is not guaranteed.\n\n"
240
  "如果您的输入语言不是日语,则会由谷歌翻译自动翻译为日语,但是准确性不能保证。\n\n"
241
  )
 
298
  duration_output = gr.Textbox(label="Duration of each phoneme", placeholder="After you generate a sentence, the detailed information of each phoneme's duration will be presented here.",
299
  interactive = True)
300
  gr.Markdown(
301
+ "The number after the : mark represents the length of each phoneme in the generated audio, while the number inside ( ) represents the length of spacing between each phoneme and its next phoneme."
 
 
 
302
  "You can manually change the numbers to adjust the length of each phoneme, so that speaking pace can be completely controlled."
303
  "Note that these numbers should be integers only. \n\n(1 represents a length of 0.01161 seconds)\n\n"
304
+ "音素冒号后的数字代表音素在生成音频中的长度,( )内的数字代表每个音素与下一个音素之间间隔的长度。"
305
+ "您可以手动修改这些数字来控制每个音素以及间隔的长度,从而完全控制合成音频的说话节奏。"
306
+ "注意这些数字只能是整数。 \n\n(1 代表 0.01161 秒的长度)\n\n"
307
  )
308
  btn.click(infer, inputs=[textbox, char_dropdown, language_dropdown, duration_slider, noise_scale_slider, noise_scale_w_slider, symbol_input],
309
  outputs=[text_output, audio_output, phoneme_output, duration_output])
 
326
  )
327
  gr.Markdown("# Updates Logs 更新日志:\n\n"
328
  "2023/1/24:\n\n"
329
+ "Improved the format of phoneme length control.\n\n"
330
+ "改善了音素控制的格式。\n\n"
331
+ "2023/1/24:\n\n"
332
  "Added more precise control on pace of speaking by modifying the duration of each phoneme.\n\n"
333
+ "增加了对说话节奏的音素级控制。\n\n"
334
  "2023/1/13:\n\n"
 
335
  "Added one example of phoneme input.\n\n"
336
+ "增加了音素输入的example(米浴喘气)\n\n"
337
  "2023/1/12:\n\n"
 
338
  "Added phoneme input, which enables more precise control on output audio.\n\n"
339
+ "增加了音素输入的功能,可以对语气和语调做到一定程度的精细控制。\n\n"
340
  "Adjusted UI arrangements.\n\n"
341
+ "调整了UI的布局。\n\n"
342
  "2023/1/10:\n\n"
 
343
  "Dataset used for training is now uploaded to [here](https://huggingface.co/datasets/Plachta/Umamusume-voice-text-pairs/tree/main)\n\n"
344
+ "数据集已上传,您可以在[这里](https://huggingface.co/datasets/Plachta/Umamusume-voice-text-pairs/tree/main)下载。\n\n"
345
  "2023/1/9:\n\n"
 
346
  "Model inference has been fully converted to onnxruntime. There will be no more Runtime Error: Memory Limit Exceeded\n\n"
347
+ "模型推理已全面转为onnxruntime,现在不会出现Runtime Error: Memory Limit Exceeded了。\n\n"
348
  "Now integrated to [Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts) collection.\n\n"
349
+ "现已加入[Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts)模型大全。\n\n"
350
  )
351
  app.queue(concurrency_count=3).launch(show_api=False, share=args.share)