File size: 3,757 Bytes
f654d12
12bfd03
 
 
 
 
 
 
 
 
 
 
 
f654d12
12bfd03
 
73c1b13
 
12bfd03
 
 
 
 
 
 
 
 
 
73c1b13
12bfd03
 
 
 
73c1b13
 
12bfd03
73c1b13
 
 
 
12bfd03
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import spaces
import random

import gradio as gr
from css.utils import *


# Custom voice generation
def custom():
    """Build the custom voice-generation UI and wire up its event handlers.

    Lays out a microphone recorder with its prompt-text box, the synthesis
    text box with a same/cross-language selector, a seed control, a generate
    button and an output audio player, then connects the two buttons to
    their callbacks.

    The model objects, audio helpers and example lists used here
    (``cosyvoice``, ``load_wav``, ``example_tts_text``, ...) are not defined
    in this file; presumably they come from ``from css.utils import *`` --
    confirm against that module.
    """

    def random_seed():
        """Return a random integer seed in the inclusive range [1, 100000000]."""
        return random.randint(1, 100000000)

    @spaces.GPU
    def generate_audio(_recorded_audio, _prompt_input_textbox, _language_radio,
                       _synthetic_input_textbox, _seed):
        """Synthesize speech for the given text using the recorded voice prompt.

        Args:
            _recorded_audio: File path of the microphone recording, or None
                when nothing has been recorded yet.
            _prompt_input_textbox: Text that was read aloud in the recording.
                When blank, cross-lingual inference is used instead of
                zero-shot inference.
            _language_radio: 'same' or 'cross'; 'cross' forces cross-lingual
                inference.
            _synthetic_input_textbox: Text to synthesize.
            _seed: Value of the seed field; may be a float, or None if the
                field was cleared.

        Returns:
            A ``(target_sr, samples)`` tuple for the output audio component.
            ``(target_sr, default_data)`` is returned, after a UI warning,
            when the synthesis text or the recording is missing.
        """
        import time
        # perf_counter is monotonic, so the stage durations below cannot be
        # skewed by wall-clock adjustments.
        started = time.perf_counter()
        print(_recorded_audio, _prompt_input_textbox, _language_radio, _synthetic_input_textbox, _seed)

        # Treat None and whitespace-only text the same as an empty string.
        if not (_synthetic_input_textbox or '').strip():
            gr.Warning('合成文本为空,您是否忘记输入合成文本?')
            return (target_sr, default_data)

        # The audio component yields no file path until something has been
        # recorded; warn instead of failing while loading the prompt audio.
        if not _recorded_audio:
            gr.Warning('录制音频为空,您是否忘记录制音频?')
            return (target_sr, default_data)

        # The number field can deliver a float, or None when cleared; hand the
        # seeding helper a plain int, falling back to the field's default of 0.
        seed_value = int(_seed) if _seed is not None else 0
        set_all_random_seed(seed_value)

        model = cosyvoice_instruct if use_instruct(_synthetic_input_textbox) else cosyvoice
        prompt_speech_16k = postprocess(load_wav(_recorded_audio, prompt_sr))
        loaded = time.perf_counter()

        # Without a usable transcript of the recording, zero-shot cloning has
        # no prompt text to work with, so fall back to cross-lingual inference.
        has_prompt_text = bool((_prompt_input_textbox or '').strip())
        if _language_radio == 'cross' or not has_prompt_text:
            output = model.inference_cross_lingual(_synthetic_input_textbox, prompt_speech_16k)
        else:
            output = model.inference_zero_shot(_synthetic_input_textbox, _prompt_input_textbox, prompt_speech_16k)
        inferred = time.perf_counter()

        audio_data = postprocess(output['tts_speech']).numpy().flatten()
        finished = time.perf_counter()

        print(f'load and preprocess time: {loaded-started}s')
        print(f'inference time: {inferred-loaded}s')
        print(f'postprocess time: {finished-inferred}s')
        return (target_sr, audio_data)

    # Top area: microphone recorder on the left, text to read aloud on the right.
    with gr.Column():
        with gr.Row():
            with gr.Column(scale=1, min_width=400):
                with gr.Group():
                    recorded_audio = gr.Audio(sources=['microphone'],
                                              label="录制音频文件",
                                              type='filepath')
                    gr.Text("请点击录制,并朗读右方文字(中文或英文)完成录入",
                            max_lines=1,
                            container=False,
                            interactive=False)
            with gr.Column(scale=10):
                prompt_input_textbox = gr.Textbox(label="输入待录制文本")
                gr.Examples(
                    label="示例待录制文本",
                    examples=example_prompt_text,
                    inputs=[prompt_input_textbox])

    # Synthesis text and the same-language / cross-language selector.
    with gr.Column():
        language_radio = gr.Radio(choices=[('同语种', 'same'), ('跨语种', 'cross')],
                                  value='same',
                                  label="输入合成文本")
        synthetic_input_textbox = gr.Textbox(show_label=False)
        gr.Examples(
            label="示例文本",
            examples=example_tts_text,
            inputs=[synthetic_input_textbox])

    # Seed controls: a button that draws a new random seed, and the seed field.
    with gr.Accordion(label="随机种子"):
        with gr.Row():
            with gr.Column(scale=1, min_width=180):
                seed_button = gr.Button(value="\U0001F3B2 随机换一换",
                                        elem_classes="full-height")
            with gr.Column(scale=10):
                seed = gr.Number(show_label=False,
                                 value=0,
                                 container=False,
                                 elem_classes="full-height")
    with gr.Column():
        generate_button = gr.Button("生成音频", variant="primary", size="lg")

    with gr.Column():
        output_audio = gr.Audio(label="合成音频")

    # Event wiring.
    seed_button.click(fn=random_seed, outputs=[seed])
    generate_button.click(
        fn=generate_audio,
        inputs=[recorded_audio, prompt_input_textbox, language_radio, synthetic_input_textbox, seed],
        outputs=[output_audio])