Spaces:
Running
on
L4
Running
on
L4
Replaced Encodec with Vocos
Browse files
app.py
CHANGED
|
@@ -323,7 +323,7 @@ def infer_from_prompt(text, language, accent, preset_prompt, prompt_file):
|
|
| 323 |
return message, (24000, samples.squeeze(0).cpu().numpy())
|
| 324 |
|
| 325 |
|
| 326 |
-
|
| 327 |
@torch.no_grad()
|
| 328 |
def infer_long_text(text, preset_prompt, prompt=None, language='auto', accent='no-accent'):
|
| 329 |
"""
|
|
@@ -331,11 +331,9 @@ def infer_long_text(text, preset_prompt, prompt=None, language='auto', accent='n
|
|
| 331 |
fixed-prompt: This mode will keep using the same prompt the user has provided, and generate audio sentence by sentence.
|
| 332 |
sliding-window: This mode uses the previous sentence as the prompt for the next sentence, but the speaker's voice may not be maintained consistently.
|
| 333 |
"""
|
| 334 |
-
from utils.sentence_cutter import split_text_into_sentences
|
| 335 |
if len(text) > 1000:
|
| 336 |
return "Rejected, Text too long (should be less than 1000 characters)", None
|
| 337 |
mode = 'fixed-prompt'
|
| 338 |
-
global model, audio_tokenizer, text_tokenizer, text_collater
|
| 339 |
if (prompt is None or prompt == "") and preset_prompt == "":
|
| 340 |
mode = 'sliding-window' # If no prompt is given, use sliding-window mode
|
| 341 |
sentences = split_text_into_sentences(text)
|
|
@@ -463,122 +461,113 @@ def infer_long_text(text, preset_prompt, prompt=None, language='auto', accent='n
|
|
| 463 |
else:
|
| 464 |
raise ValueError(f"No such mode {mode}")
|
| 465 |
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
with
|
| 470 |
-
gr.Markdown(
|
| 471 |
-
with gr.
|
| 472 |
-
gr.
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
|
| 492 |
-
|
| 493 |
-
|
| 494 |
-
|
| 495 |
-
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
|
| 499 |
-
|
| 500 |
-
|
| 501 |
-
|
| 502 |
-
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
|
| 506 |
-
with gr.
|
| 507 |
-
gr.
|
| 508 |
-
|
| 509 |
-
|
| 510 |
-
|
| 511 |
-
|
| 512 |
-
|
| 513 |
-
|
| 514 |
-
|
| 515 |
-
|
| 516 |
-
|
| 517 |
-
|
| 518 |
-
|
| 519 |
-
|
| 520 |
-
|
| 521 |
-
|
| 522 |
-
|
| 523 |
-
|
| 524 |
-
|
| 525 |
-
|
| 526 |
-
|
| 527 |
-
|
| 528 |
-
|
| 529 |
-
|
| 530 |
-
|
| 531 |
-
with gr.
|
| 532 |
-
gr.
|
| 533 |
-
|
| 534 |
-
|
| 535 |
-
|
| 536 |
-
|
| 537 |
-
|
| 538 |
-
|
| 539 |
-
|
| 540 |
-
|
| 541 |
-
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
|
| 545 |
-
|
| 546 |
-
|
| 547 |
-
|
| 548 |
-
|
| 549 |
-
|
| 550 |
-
|
| 551 |
-
|
| 552 |
-
|
| 553 |
-
|
| 554 |
-
|
| 555 |
-
|
| 556 |
-
with gr.
|
| 557 |
-
gr.
|
| 558 |
-
|
| 559 |
-
|
| 560 |
-
|
| 561 |
-
|
| 562 |
-
|
| 563 |
-
|
| 564 |
-
|
| 565 |
-
|
| 566 |
-
|
| 567 |
-
|
| 568 |
-
|
| 569 |
-
|
| 570 |
-
|
| 571 |
-
|
| 572 |
-
|
| 573 |
-
|
| 574 |
-
|
| 575 |
-
|
| 576 |
-
|
| 577 |
-
app.launch()
|
| 578 |
-
|
| 579 |
-
if __name__ == "__main__":
|
| 580 |
-
formatter = (
|
| 581 |
-
"%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
|
| 582 |
-
)
|
| 583 |
-
logging.basicConfig(format=formatter, level=logging.INFO)
|
| 584 |
-
main()
|
|
|
|
| 323 |
return message, (24000, samples.squeeze(0).cpu().numpy())
|
| 324 |
|
| 325 |
|
| 326 |
+
from utils.sentence_cutter import split_text_into_sentences
|
| 327 |
@torch.no_grad()
|
| 328 |
def infer_long_text(text, preset_prompt, prompt=None, language='auto', accent='no-accent'):
|
| 329 |
"""
|
|
|
|
| 331 |
fixed-prompt: This mode will keep using the same prompt the user has provided, and generate audio sentence by sentence.
|
| 332 |
sliding-window: This mode uses the previous sentence as the prompt for the next sentence, but the speaker's voice may not be maintained consistently.
|
| 333 |
"""
|
|
|
|
| 334 |
if len(text) > 1000:
|
| 335 |
return "Rejected, Text too long (should be less than 1000 characters)", None
|
| 336 |
mode = 'fixed-prompt'
|
|
|
|
| 337 |
if (prompt is None or prompt == "") and preset_prompt == "":
|
| 338 |
mode = 'sliding-window' # If no prompt is given, use sliding-window mode
|
| 339 |
sentences = split_text_into_sentences(text)
|
|
|
|
| 461 |
else:
|
| 462 |
raise ValueError(f"No such mode {mode}")
|
| 463 |
|
| 464 |
+
app = gr.Blocks()
|
| 465 |
+
with app:
|
| 466 |
+
gr.Markdown(top_md)
|
| 467 |
+
with gr.Tab("Infer from audio"):
|
| 468 |
+
gr.Markdown(infer_from_audio_md)
|
| 469 |
+
with gr.Row():
|
| 470 |
+
with gr.Column():
|
| 471 |
+
|
| 472 |
+
textbox = gr.TextArea(label="Text",
|
| 473 |
+
placeholder="Type your sentence here",
|
| 474 |
+
value="Welcome back, Master. What can I do for you today?", elem_id=f"tts-input")
|
| 475 |
+
language_dropdown = gr.Dropdown(choices=['auto-detect', 'English', '中文', '日本語'], value='auto-detect', label='language')
|
| 476 |
+
accent_dropdown = gr.Dropdown(choices=['no-accent', 'English', '中文', '日本語'], value='no-accent', label='accent')
|
| 477 |
+
textbox_transcript = gr.TextArea(label="Transcript",
|
| 478 |
+
placeholder="Write transcript here. (leave empty to use whisper)",
|
| 479 |
+
value="", elem_id=f"prompt-name")
|
| 480 |
+
upload_audio_prompt = gr.Audio(label='uploaded audio prompt', source='upload', interactive=True)
|
| 481 |
+
record_audio_prompt = gr.Audio(label='recorded audio prompt', source='microphone', interactive=True)
|
| 482 |
+
with gr.Column():
|
| 483 |
+
text_output = gr.Textbox(label="Message")
|
| 484 |
+
audio_output = gr.Audio(label="Output Audio", elem_id="tts-audio")
|
| 485 |
+
btn = gr.Button("Generate!")
|
| 486 |
+
btn.click(infer_from_audio,
|
| 487 |
+
inputs=[textbox, language_dropdown, accent_dropdown, upload_audio_prompt, record_audio_prompt, textbox_transcript],
|
| 488 |
+
outputs=[text_output, audio_output])
|
| 489 |
+
textbox_mp = gr.TextArea(label="Prompt name",
|
| 490 |
+
placeholder="Name your prompt here",
|
| 491 |
+
value="prompt_1", elem_id=f"prompt-name")
|
| 492 |
+
btn_mp = gr.Button("Make prompt!")
|
| 493 |
+
prompt_output = gr.File(interactive=False)
|
| 494 |
+
btn_mp.click(make_npz_prompt,
|
| 495 |
+
inputs=[textbox_mp, upload_audio_prompt, record_audio_prompt, textbox_transcript],
|
| 496 |
+
outputs=[text_output, prompt_output])
|
| 497 |
+
gr.Examples(examples=infer_from_audio_examples,
|
| 498 |
+
inputs=[textbox, language_dropdown, accent_dropdown, upload_audio_prompt, record_audio_prompt, textbox_transcript],
|
| 499 |
+
outputs=[text_output, audio_output],
|
| 500 |
+
fn=infer_from_audio,
|
| 501 |
+
cache_examples=False,)
|
| 502 |
+
with gr.Tab("Make prompt"):
|
| 503 |
+
gr.Markdown(make_prompt_md)
|
| 504 |
+
with gr.Row():
|
| 505 |
+
with gr.Column():
|
| 506 |
+
textbox2 = gr.TextArea(label="Prompt name",
|
| 507 |
+
placeholder="Name your prompt here",
|
| 508 |
+
value="prompt_1", elem_id=f"prompt-name")
|
| 509 |
+
# Place to add language selection and transcript input
|
| 510 |
+
textbox_transcript2 = gr.TextArea(label="Transcript",
|
| 511 |
+
placeholder="Write transcript here. (leave empty to use whisper)",
|
| 512 |
+
value="", elem_id=f"prompt-name")
|
| 513 |
+
upload_audio_prompt_2 = gr.Audio(label='uploaded audio prompt', source='upload', interactive=True)
|
| 514 |
+
record_audio_prompt_2 = gr.Audio(label='recorded audio prompt', source='microphone', interactive=True)
|
| 515 |
+
with gr.Column():
|
| 516 |
+
text_output_2 = gr.Textbox(label="Message")
|
| 517 |
+
prompt_output_2 = gr.File(interactive=False)
|
| 518 |
+
btn_2 = gr.Button("Make!")
|
| 519 |
+
btn_2.click(make_npz_prompt,
|
| 520 |
+
inputs=[textbox2, upload_audio_prompt_2, record_audio_prompt_2, textbox_transcript2],
|
| 521 |
+
outputs=[text_output_2, prompt_output_2])
|
| 522 |
+
gr.Examples(examples=make_npz_prompt_examples,
|
| 523 |
+
inputs=[textbox2, upload_audio_prompt_2, record_audio_prompt_2, textbox_transcript2],
|
| 524 |
+
outputs=[text_output_2, prompt_output_2],
|
| 525 |
+
fn=make_npz_prompt,
|
| 526 |
+
cache_examples=False,)
|
| 527 |
+
with gr.Tab("Infer from prompt"):
|
| 528 |
+
gr.Markdown(infer_from_prompt_md)
|
| 529 |
+
with gr.Row():
|
| 530 |
+
with gr.Column():
|
| 531 |
+
textbox_3 = gr.TextArea(label="Text",
|
| 532 |
+
placeholder="Type your sentence here",
|
| 533 |
+
value="Welcome back, Master. What can I do for you today?", elem_id=f"tts-input")
|
| 534 |
+
language_dropdown_3 = gr.Dropdown(choices=['auto-detect', 'English', '中文', '日本語', 'Mix'], value='auto-detect',
|
| 535 |
+
label='language')
|
| 536 |
+
accent_dropdown_3 = gr.Dropdown(choices=['no-accent', 'English', '中文', '日本語'], value='no-accent',
|
| 537 |
+
label='accent')
|
| 538 |
+
preset_dropdown_3 = gr.Dropdown(choices=preset_list, value=None, label='Voice preset')
|
| 539 |
+
prompt_file = gr.File(file_count='single', file_types=['.npz'], interactive=True)
|
| 540 |
+
with gr.Column():
|
| 541 |
+
text_output_3 = gr.Textbox(label="Message")
|
| 542 |
+
audio_output_3 = gr.Audio(label="Output Audio", elem_id="tts-audio")
|
| 543 |
+
btn_3 = gr.Button("Generate!")
|
| 544 |
+
btn_3.click(infer_from_prompt,
|
| 545 |
+
inputs=[textbox_3, language_dropdown_3, accent_dropdown_3, preset_dropdown_3, prompt_file],
|
| 546 |
+
outputs=[text_output_3, audio_output_3])
|
| 547 |
+
gr.Examples(examples=infer_from_prompt_examples,
|
| 548 |
+
inputs=[textbox_3, language_dropdown_3, accent_dropdown_3, preset_dropdown_3, prompt_file],
|
| 549 |
+
outputs=[text_output_3, audio_output_3],
|
| 550 |
+
fn=infer_from_prompt,
|
| 551 |
+
cache_examples=False,)
|
| 552 |
+
with gr.Tab("Infer long text"):
|
| 553 |
+
gr.Markdown(long_text_md)
|
| 554 |
+
with gr.Row():
|
| 555 |
+
with gr.Column():
|
| 556 |
+
textbox_4 = gr.TextArea(label="Text",
|
| 557 |
+
placeholder="Type your sentence here",
|
| 558 |
+
value=long_text_example, elem_id=f"tts-input")
|
| 559 |
+
language_dropdown_4 = gr.Dropdown(choices=['auto-detect', 'English', '中文', '日本語'], value='auto-detect',
|
| 560 |
+
label='language')
|
| 561 |
+
accent_dropdown_4 = gr.Dropdown(choices=['no-accent', 'English', '中文', '日本語'], value='no-accent',
|
| 562 |
+
label='accent')
|
| 563 |
+
preset_dropdown_4 = gr.Dropdown(choices=preset_list, value=None, label='Voice preset')
|
| 564 |
+
prompt_file_4 = gr.File(file_count='single', file_types=['.npz'], interactive=True)
|
| 565 |
+
with gr.Column():
|
| 566 |
+
text_output_4 = gr.TextArea(label="Message")
|
| 567 |
+
audio_output_4 = gr.Audio(label="Output Audio", elem_id="tts-audio")
|
| 568 |
+
btn_4 = gr.Button("Generate!")
|
| 569 |
+
btn_4.click(infer_long_text,
|
| 570 |
+
inputs=[textbox_4, preset_dropdown_4, prompt_file_4, language_dropdown_4, accent_dropdown_4],
|
| 571 |
+
outputs=[text_output_4, audio_output_4])
|
| 572 |
+
|
| 573 |
+
app.launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|