|
import re |
|
import os |
|
import gc |
|
import tempfile |
|
from uuid import uuid4 |
|
|
|
import spaces |
|
import gradio as gr |
|
import torchaudio |
|
import numpy as np |
|
from df.enhance import enhance, load_audio, save_audio |
|
|
|
from config import Config |
|
from .load_models import * |
|
from .modules.CosyVoice.cosyvoice.utils.file_utils import load_wav |
|
|
|
|
|
|
|
def create_temp_file(suffix: str = ''):
    """Create a named temporary file that survives being closed.

    The file is created with ``delete=False`` so its path (``.name``) can be
    handed to libraries (numpy, torchaudio, DeepFilterNet) that reopen the
    path themselves. Callers are responsible for eventually removing it.

    Args:
        suffix: Optional filename suffix such as ``'.wav'``. Defaults to ``''``
            (the original behavior).

    Returns:
        An open ``tempfile.NamedTemporaryFile`` object.
    """
    return tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
|
|
|
|
|
|
|
def assign_language_tags(text):
    """Return ``text`` unchanged.

    Placeholder hook for annotating per-language spans in mixed-language
    input before synthesis; currently a pure pass-through.
    """
    return text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def update_mode(mode, sft_speaker, speaker_audio, voice_instructions):
    """Toggle visibility of the three mode-dependent inputs.

    Returns one ``gr.update`` per input component, in the order
    (sft_speaker, speaker_audio, voice_instructions).

    Args:
        mode: One of 'SFT', 'VC', 'VC-CrossLingual', 'Instruct'.
        sft_speaker, speaker_audio, voice_instructions: Current component
            values; accepted for the Gradio event signature but unused here.

    Raises:
        gr.Error: If ``mode`` is not one of the four known modes.
    """
    # Visibility of (sft_speaker, speaker_audio, voice_instructions) per mode.
    # BUG FIX: the SFT branch previously returned a bare gr.update() (a
    # no-op) for the speaker dropdown, so it stayed hidden after switching
    # away from SFT and back; it is now shown explicitly, matching the
    # Instruct branch.
    visibility = {
        'SFT': (True, False, False),
        'VC': (False, True, True),
        'VC-CrossLingual': (False, True, False),
        'Instruct': (True, False, True),
    }
    if mode not in visibility:
        raise gr.Error('Invalid mode')
    return tuple(gr.update(visible=v) for v in visibility[mode])
|
|
|
|
|
@spaces.GPU(duration=10)
def clear_audio(audio: np.ndarray):
    """Denoise ``audio`` with DeepFilterNet and return an update for the player.

    Args:
        audio: Raw audio samples from the Gradio audio component.

    Returns:
        ``gr.update`` whose ``value`` is the path of the enhanced audio file.
    """
    audio_file = create_temp_file()
    try:
        # BUG FIX: np.save(path, ...) appends '.npy' when the path lacks that
        # extension, writing the data to a *different* file than the one
        # reloaded below. Passing the open file object keeps the filename
        # unchanged.
        np.save(audio_file, audio)
    finally:
        audio_file.close()

    # NOTE(review): the file now holds numpy .npy bytes, but load_audio
    # expects an audio container (e.g. wav) — confirm this round-trip works
    # against the df.enhance implementation.
    audio, _ = load_audio(audio_file.name, sr=df_state.sr())
    enhanced = enhance(df_model, df_state, audio)

    # Overwrite the temp file in place with the denoised audio.
    save_audio(audio_file.name, enhanced, df_state.sr())

    return gr.update(
        value=audio_file.name,
    )
|
|
|
|
|
@spaces.GPU(duration=20)
def gen_audio(text, mode, sft_speaker = None, speaker_audio = None, voice_instructions = None):
    """Synthesize speech from ``text`` with the CosyVoice model for ``mode``.

    Args:
        text: Text to synthesize.
        mode: One of 'SFT', 'VC', 'VC-CrossLingual', 'Instruct'.
        sft_speaker: Speaker id (required for SFT; also passed for Instruct).
        speaker_audio: Uploaded reference audio (VC modes).
        voice_instructions: Prompt text (VC) / instruction text (Instruct).

    Returns:
        ``gr.update`` whose ``value`` is the path of the generated audio file.

    Raises:
        gr.Error: When a required input for the selected mode is missing.
    """
    # BUG FIX: original tested `mode == any(['VC', 'VC-CrossLingual'])`;
    # any() of non-empty strings is True, so this compared the mode string
    # against the boolean True and was always False — the prompt audio was
    # never prepared and both VC modes always raised "Please upload an audio".
    if mode in ('VC', 'VC-CrossLingual'):
        speaker_audio_file = create_temp_file()
        np.save(speaker_audio_file.name, speaker_audio)
        # NOTE(review): this loads a fixed bundled prompt rather than the
        # uploaded speaker audio saved just above — confirm intent.
        prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
    else:
        speaker_audio_file = None
        prompt_speech_16k = None

    text = assign_language_tags(text)

    out_file = create_temp_file()

    def _save_chunks(chunks):
        # Write each generated speech chunk to disk.
        # NOTE(review): out_file.name contains no '{}' placeholder, so
        # .format(i) yields the same path for every chunk and later chunks
        # overwrite earlier ones — confirm whether multi-chunk output is
        # expected.
        for i, chunk in enumerate(chunks):
            torchaudio.save(
                out_file.name.format(i),
                chunk['tts_speech'],
                22050,  # CosyVoice output sample rate
            )

    if mode == 'SFT':
        if not sft_speaker:
            raise gr.Error('Please select a speaker')

        _save_chunks(cosyvoice_sft.inference_sft(
            tts_text=text,
            spk_id=sft_speaker,
        ))
    elif mode == 'VC':
        if not speaker_audio_file:
            raise gr.Error('Please upload an audio')

        _save_chunks(cosyvoice.inference_zero_shot(
            tts_text=text,
            prompt_text=voice_instructions,
            prompt_speech_16k=prompt_speech_16k,
        ))
    elif mode == 'VC-CrossLingual':
        if not speaker_audio_file:
            raise gr.Error('Please upload an audio')

        _save_chunks(cosyvoice.inference_cross_lingual(
            tts_text=text,
            prompt_speech_16k=prompt_speech_16k,
        ))
    elif mode == 'Instruct':
        if not voice_instructions:
            raise gr.Error('Please enter voice instructions')

        _save_chunks(cosyvoice_instruct.inference_instruct(
            tts_text=text,
            spk_id=sft_speaker,
            instruct_text=voice_instructions,
        ))

    return gr.update(
        value=out_file.name,
    )
|
|