File size: 11,135 Bytes
4042a65
076829a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d0f730b
076829a
 
 
 
f388150
 
90433f5
 
076829a
90433f5
49015f6
90433f5
 
 
 
 
 
 
 
 
f388150
076829a
 
 
 
 
 
 
f388150
076829a
 
 
 
 
 
 
 
 
 
 
f388150
076829a
 
 
f388150
90433f5
e19e80f
076829a
 
 
 
 
 
39f02fa
076829a
92e6db2
39f02fa
90433f5
076829a
 
90433f5
076829a
 
39f02fa
076829a
92e6db2
39f02fa
90433f5
076829a
 
 
 
90433f5
076829a
 
 
 
 
90433f5
076829a
 
90433f5
076829a
 
 
 
 
 
 
 
90433f5
076829a
 
 
 
 
 
92e6db2
90433f5
076829a
 
 
 
92e6db2
90433f5
076829a
 
 
 
92e6db2
90433f5
076829a
 
 
92e6db2
90433f5
 
1383b23
2749090
 
 
6a9220e
 
 
aa6336e
c1bc199
1383b23
 
 
 
 
 
 
 
 
 
 
 
11f92b1
 
92e6db2
11f92b1
92e6db2
 
11f92b1
 
 
 
 
 
 
 
92e6db2
 
11f92b1
1383b23
 
5b76a16
 
 
 
 
 
 
 
 
 
 
 
11f92b1
 
 
5b76a16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2749090
5b76a16
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Liu Yue)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import argparse
import gradio as gr
import numpy as np
os.system('pip install torchaudio==2.0.2')
import torch
import torchaudio
import random
import librosa
import spaces

ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append('{}/third_party/Matcha-TTS'.format(ROOT_DIR))
from cosyvoice.cli.cosyvoice import CosyVoice
from cosyvoice.utils.file_utils import load_wav, logging
from cosyvoice.utils.common import set_all_random_seed

# Inference modes offered by the mode radio, in display order. generate_audio()
# dispatches on these exact strings: pretrained voice (sft), 3s fast cloning
# (zero_shot), cross-lingual cloning (cross_lingual) and natural-language
# control (instruct).
inference_mode_list = ['预训练音色', '3s极速复刻', '跨语种复刻', '自然语言控制']
# Step-by-step usage instructions shown in the UI, keyed by inference mode.
instruct_dict = {'预训练音色': '1. 选择预训练音色\n2. 点击生成音频按钮',
                 '3s极速复刻': '1. 选择prompt音频文件,或录入prompt音频,注意不超过30s,若同时提供,优先选择prompt音频文件\n2. 输入prompt文本\n3. 点击生成音频按钮',
                 '跨语种复刻': '1. 选择prompt音频文件,或录入prompt音频,注意不超过30s,若同时提供,优先选择prompt音频文件\n2. 点击生成音频按钮',
                 '自然语言控制': '1. 选择预训练音色\n2. 输入instruct文本\n3. 点击生成音频按钮'}
# (label, value) choices for the streaming-inference radio: '否' -> False, '是' -> True.
stream_mode_list = [('否', False), ('是', True)]
# Peak-amplitude ceiling that postprocess() scales prompt audio down to.
max_val = 0.8

def generate_seed():
    """Draw a fresh random inference seed for the seed number box.

    This handler is intentionally not wrapped in ``@spaces.GPU``: it only
    draws a random integer, so requesting a GPU slot for it would add latency
    and consume GPU quota for no benefit.

    Returns:
        dict: A Gradio update payload (``"__type__": "update"``) whose
        ``"value"`` is a random integer in ``[1, 100000000]``.
    """
    return {
        "__type__": "update",
        "value": random.randint(1, 100000000),
    }

@spaces.GPU
def postprocess(speech, top_db=60, hop_length=220, win_length=440):
    """Trim silence from prompt audio, cap its peak, and pad the tail.

    Args:
        speech: Prompt waveform. Assumed to be a 2-D torch tensor shaped
            (1, samples), since it is concatenated with a (1, k) zero tensor
            along dim 1 -- confirm against ``load_wav``.
        top_db: Silence threshold forwarded to ``librosa.effects.trim``.
        hop_length: Hop length forwarded to ``librosa.effects.trim``.
        win_length: Forwarded to ``librosa.effects.trim`` as ``frame_length``.

    Returns:
        The trimmed waveform, scaled down so its absolute peak does not exceed
        ``max_val``, followed by ``int(target_sr * 0.2)`` zero samples.
    """
    trimmed, _ = librosa.effects.trim(
        speech,
        top_db=top_db,
        frame_length=win_length,
        hop_length=hop_length,
    )
    # Only attenuate: audio already below the ceiling is left untouched.
    peak = trimmed.abs().max()
    if peak > max_val:
        trimmed = trimmed / peak * max_val
    tail_silence = torch.zeros(1, int(target_sr * 0.2))
    return torch.concat([trimmed, tail_silence], dim=1)

def change_instruction(mode_checkbox_group):
    """Return the usage instructions for the selected inference mode.

    This handler is intentionally not wrapped in ``@spaces.GPU``: it is a
    plain dictionary lookup, so requesting a GPU slot on every mode change
    would only add latency and consume GPU quota.

    Args:
        mode_checkbox_group: The selected inference mode; must be a key of
            ``instruct_dict``.

    Returns:
        str: The instruction text for that mode.

    Raises:
        KeyError: If the mode is not a key of ``instruct_dict``.
    """
    return instruct_dict[mode_checkbox_group]

@spaces.GPU
def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text,
                   seed, stream, speed):
    """Validate a synthesis request and stream the generated audio.

    Generator used as the Gradio handler of the generate button. When a
    validation check fails, a warning is shown, one placeholder chunk
    (``default_data``) is yielded, and the generator stops. It no longer falls
    through into later checks or inference with invalid inputs (for example
    calling ``torchaudio.info`` on a missing prompt file).

    Args:
        tts_text: Text to synthesize.
        mode_checkbox_group: Selected inference mode (one of
            ``inference_mode_list``); any value other than the first three
            modes is handled as instruct mode.
        sft_dropdown: Pretrained speaker name (sft and instruct modes).
        prompt_text: Transcript of the prompt audio (zero-shot mode).
        prompt_wav_upload: Path of an uploaded prompt audio file, or None.
        prompt_wav_record: Path of a recorded prompt audio file, or None.
        instruct_text: Natural-language instruction (instruct mode).
        seed: Random seed passed to ``set_all_random_seed`` before inference.
        stream: Forwarded to the model as ``stream``.
        speed: Forwarded to the model as ``speed``.

    Yields:
        tuple: ``(target_sr, samples)`` where ``samples`` is a flattened numpy
        array of generated audio, or ``default_data`` on a validation failure.
    """
    # An uploaded file takes priority over a microphone recording.
    prompt_wav = prompt_wav_upload if prompt_wav_upload is not None else prompt_wav_record

    # Resolve the model at most once per request and reuse it for both the
    # capability checks and the inference call.
    cosyvoice = None

    # if instruct mode, please make sure that model is iic/CosyVoice-300M-Instruct and not cross_lingual mode
    if mode_checkbox_group == '自然语言控制':
        cosyvoice = get_cosyvoice()
        if cosyvoice.frontend.instruct is False:
            gr.Warning('您正在使用自然语言控制模式, {}模型不支持此模式, 请使用iic/CosyVoice-300M-Instruct模型'.format(args.model_dir))
            yield (target_sr, default_data)
            return
        if instruct_text == '':
            gr.Warning('您正在使用自然语言控制模式, 请输入instruct文本')
            yield (target_sr, default_data)
            return
        if prompt_wav is not None or prompt_text != '':
            gr.Info('您正在使用自然语言控制模式, prompt音频/prompt文本会被忽略')

    # if cross_lingual mode, please make sure that model is iic/CosyVoice-300M and tts_text prompt_text are different language
    if mode_checkbox_group == '跨语种复刻':
        cosyvoice = get_cosyvoice()
        if cosyvoice.frontend.instruct is True:
            gr.Warning('您正在使用跨语种复刻模式, {}模型不支持此模式, 请使用iic/CosyVoice-300M模型'.format(args.model_dir))
            yield (target_sr, default_data)
            return
        if instruct_text != '':
            gr.Info('您正在使用跨语种复刻模式, instruct文本会被忽略')
        if prompt_wav is None:
            gr.Warning('您正在使用跨语种复刻模式, 请提供prompt音频')
            yield (target_sr, default_data)
            return
        gr.Info('您正在使用跨语种复刻模式, 请确保合成文本和prompt文本为不同语言')

    # if in zero_shot cross_lingual, please make sure that prompt_text and prompt_wav meets requirements
    if mode_checkbox_group in ('3s极速复刻', '跨语种复刻'):
        if prompt_wav is None:
            gr.Warning('prompt音频为空,您是否忘记输入prompt音频?')
            yield (target_sr, default_data)
            return
        # Probe the file once and reuse the result for both the check and the message.
        prompt_sample_rate = torchaudio.info(prompt_wav).sample_rate
        if prompt_sample_rate < prompt_sr:
            gr.Warning('prompt音频采样率{}低于{}'.format(prompt_sample_rate, prompt_sr))
            yield (target_sr, default_data)
            return

    # sft mode only use sft_dropdown
    if mode_checkbox_group == '预训练音色':
        if instruct_text != '' or prompt_wav is not None or prompt_text != '':
            gr.Info('您正在使用预训练音色模式,prompt文本/prompt音频/instruct文本会被忽略!')

    # zero_shot mode only use prompt_wav prompt text
    if mode_checkbox_group == '3s极速复刻':
        if prompt_text == '':
            gr.Warning('prompt文本为空,您是否忘记输入prompt文本?')
            yield (target_sr, default_data)
            return
        if instruct_text != '':
            gr.Info('您正在使用3s极速复刻模式,预训练音色/instruct文本会被忽略!')

    if cosyvoice is None:
        cosyvoice = get_cosyvoice()

    # Pick the inference call for the selected mode. The seed is always set
    # immediately before the model call so results are reproducible per seed.
    if mode_checkbox_group == '预训练音色':
        logging.info('get sft inference request')
        set_all_random_seed(seed)
        chunks = cosyvoice.inference_sft(tts_text, sft_dropdown, stream=stream, speed=speed)
    elif mode_checkbox_group == '3s极速复刻':
        logging.info('get zero_shot inference request')
        prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
        set_all_random_seed(seed)
        chunks = cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=stream, speed=speed)
    elif mode_checkbox_group == '跨语种复刻':
        logging.info('get cross_lingual inference request')
        prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
        set_all_random_seed(seed)
        chunks = cosyvoice.inference_cross_lingual(tts_text, prompt_speech_16k, stream=stream, speed=speed)
    else:
        logging.info('get instruct inference request')
        set_all_random_seed(seed)
        chunks = cosyvoice.inference_instruct(tts_text, sft_dropdown, instruct_text, stream=stream, speed=speed)

    for chunk in chunks:
        yield (target_sr, chunk['tts_speech'].numpy().flatten())

# Model download via the Hugging Face Hub SDK, plus runtime environment setup.
# Everything below runs at import time, in this order.
import platform
# Print the interpreter version for debugging.
python_version = platform.python_version()
print("Python version:", python_version)
from huggingface_hub import dump_environment_info

# Print huggingface_hub's environment report.
dump_environment_info()
from huggingface_hub import snapshot_download
# Fetch the CosyVoice-300M model repo into the directory that --model_dir defaults to.
snapshot_download(repo_id="FunAudioLLM/CosyVoice-300M", local_dir='pretrained_models/CosyVoice-300M',repo_type='model')

# Install the sox system packages; the exit status of os.system is not checked.
os.system('apt-get -y update && apt-get -y install sox libsox-dev')

# Command-line options, parsed from sys.argv at import time.
parser = argparse.ArgumentParser()
# NOTE(review): --port is parsed but never passed to demo.launch() in this file.
parser.add_argument('--port',
                    type=int,
                    default=8000)
# Model location handed to CosyVoice(); the default matches the snapshot_download target.
parser.add_argument('--model_dir',
                    type=str,
                    default='pretrained_models/CosyVoice-300M',
                    help='local path or modelscope repo id')
args = parser.parse_args()

# Model cache: populated by get_cosyvoice() on first use.
cosyvoice_instance = None
model_dir = args.model_dir


@spaces.GPU
def get_cosyvoice():
    """Return the shared CosyVoice model, constructing it on first use.

    The model is built from ``model_dir`` once and stored in the module-level
    ``cosyvoice_instance`` so that later calls reuse it. Previously the cache
    variable was never assigned, so every call constructed a new model.

    NOTE(review): if the hosting runtime executes ``@spaces.GPU`` functions in
    separate worker processes, this cache is per process -- confirm.

    Returns:
        CosyVoice: The cached model instance.
    """
    global cosyvoice_instance
    if cosyvoice_instance is None:
        cosyvoice_instance = CosyVoice(model_dir)
    return cosyvoice_instance

@spaces.GPU
def load_sft_options():
    """Populate the pretrained-speaker dropdown from the loaded model.

    Returning a bare list to a ``Dropdown`` output sets the component's
    *value*, leaving its choices empty. This handler therefore returns an
    update payload (same dict form as ``generate_seed``) that replaces the
    choices and preselects the first speaker when one exists.

    Returns:
        dict: A Gradio update payload with ``"choices"`` set to the model's
        available speakers and ``"value"`` set to the first speaker, or
        ``None`` when the model exposes no speakers.
    """
    speakers = list(get_cosyvoice().list_avaliable_spks())
    return {
        "__type__": "update",
        "choices": speakers,
        "value": speakers[0] if speakers else None,
    }


# Sample rates in Hz: prompt audio is loaded at prompt_sr (see load_wav calls),
# and generated audio is yielded to the UI tagged with target_sr.
prompt_sr, target_sr = 16000, 22050
# target_sr zero samples (one second of silence), yielded when validation fails.
default_data = np.zeros(target_sr)

# Build the Gradio UI and wire its event handlers, then start the app.
with gr.Blocks() as demo:
    # Header: links to the code repository and the pretrained model pages.
    gr.Markdown("### 代码库 [CosyVoice](https://github.com/FunAudioLLM/CosyVoice) \
                预训练模型 [CosyVoice-300M](https://www.modelscope.cn/models/iic/CosyVoice-300M) \
                [CosyVoice-300M-Instruct](https://www.modelscope.cn/models/iic/CosyVoice-300M-Instruct) \
                [CosyVoice-300M-SFT](https://www.modelscope.cn/models/iic/CosyVoice-300M-SFT)")
    gr.Markdown("#### 请输入需要合成的文本,选择推理模式,并按照提示步骤进行操作")

    # Text to synthesize, pre-filled with a sample sentence.
    tts_text = gr.Textbox(label="输入合成文本", lines=1, value="我是通义实验室语音团队全新推出的生成式语音大模型,提供舒适自然的语音合成能力。")
    with gr.Row():
        # Inference mode selector; the first mode is selected by default.
        mode_checkbox_group = gr.Radio(choices=inference_mode_list, label='选择推理模式', value=inference_mode_list[0])
        # Usage steps for the selected mode (kept in sync by change_instruction below).
        # NOTE(review): fractional `scale` values are used here and below -- confirm
        # the installed Gradio version accepts non-integer scale.
        instruction_text = gr.Text(label="操作步骤", value=instruct_dict[inference_mode_list[0]], scale=0.5)
        # Speaker dropdown starts with no choices; the button below triggers load_sft_options.
        sft_dropdown = gr.Dropdown(choices=[], label='选择预训练音色',  scale=0.25)
        load_sft_button = gr.Button("加载预训练音色")
        load_sft_button.click(load_sft_options, outputs=sft_dropdown)
        # Streaming toggle; defaults to the value of the first entry (False).
        stream = gr.Radio(choices=stream_mode_list, label='是否流式推理', value=stream_mode_list[0][1])
        speed = gr.Number(value=1, label="速度调节(仅支持非流式推理)", minimum=0.5, maximum=2.0, step=0.1)
        with gr.Column(scale=0.25):
            # Dice button that fills the seed box with a random value.
            seed_button = gr.Button(value="\U0001F3B2")
            seed = gr.Number(value=0, label="随机推理种子")

    with gr.Row():
        # Two ways to supply prompt audio; both hand a file path to generate_audio,
        # which prefers the uploaded file when both are present.
        prompt_wav_upload = gr.Audio(sources='upload', type='filepath', label='选择prompt音频文件,注意采样率不低于16khz')
        prompt_wav_record = gr.Audio(sources='microphone', type='filepath', label='录制prompt音频文件')
    prompt_text = gr.Textbox(label="输入prompt文本", lines=1, placeholder="请输入prompt文本,需与prompt音频内容一致,暂时不支持自动识别...", value='')
    instruct_text = gr.Textbox(label="输入instruct文本", lines=1, placeholder="请输入instruct文本.", value='')

    generate_button = gr.Button("生成音频")

    # Streaming audio player that receives the chunks yielded by generate_audio.
    audio_output = gr.Audio(label="合成音频", autoplay=True, streaming=True)

    # Event wiring.
    seed_button.click(generate_seed, inputs=[], outputs=seed)
    # The order of `inputs` must match generate_audio's positional parameters.
    generate_button.click(generate_audio,
                            inputs=[tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text,
                                    seed, stream, speed],
                            outputs=[audio_output])
    mode_checkbox_group.change(fn=change_instruction, inputs=[mode_checkbox_group], outputs=[instruction_text])
# Enable request queuing with the given queue size and default concurrency limit.
demo.queue(max_size=4, default_concurrency_limit=2)
# NOTE(review): launched with Gradio defaults; args.port is not forwarded here.
demo.launch()