import random
import codecs
import tempfile
from textwrap import dedent

import gradio as gr
import edge_tts
import torch
import librosa
from scipy.io.wavfile import write
from transformers import WavLMModel

from tts_voice import tts_order_voice
from english.translate import Translate
from english.split_text import sentence_split
from english.generator import generatorArticle

# Local modules from the FreeVC repo.
import utils
from models import SynthesizerTrn
from mel_processing import mel_spectrogram_torch
from speaker_encoder.voice_encoder import SpeakerEncoder

language_dict = tts_order_voice


def parse_text(input):
    """Generate an article from the given words and HTML-escape any code blocks."""
    text = generatorArticle(input).strip()
    lines = text.split("\n")
    lines = [line for line in lines if line != ""]
    count = 0
    for i, line in enumerate(lines):
        if "```" in line:
            count += 1
            items = line.split("`")
            if count % 2 == 1:
                lines[i] = f'<pre><code class="language-{items[-1]}">'
            else:
                lines[i] = "<br></code></pre>"
        else:
            if i > 0:
                if count % 2 == 1:
                    line = line.replace("`", r"\`")
                    line = line.replace("<", "&lt;")
                    line = line.replace(">", "&gt;")
                    line = line.replace(" ", "&nbsp;")
                    line = line.replace("*", "&ast;")
                    line = line.replace("_", "&lowbar;")
                    line = line.replace("-", "&#45;")
                    line = line.replace(".", "&#46;")
                    line = line.replace("!", "&#33;")
                    line = line.replace("(", "&#40;")
                    line = line.replace(")", "&#41;")
                    line = line.replace("$", "&#36;")
                lines[i] = "<br>" + line
    # The escaped `lines` are never joined: the outputs are plain Textboxes,
    # so the raw article text is returned instead.
    return text
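

# --- Article generation, translation and text-to-speech helpers --------------
# `predict` streams the generated article into both output boxes at once.
# `text_to_speech_edge` is an async handler (Gradio awaits it natively) and
# needs network access to reach the Microsoft Edge TTS service.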
def predict(input):
    article = parse_text(input)
    yield article, article


async def text_to_speech_edge(text, language_code):
    voice = language_dict[language_code]
    communicate = edge_tts.Communicate(text, voice)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    await communicate.save(tmp_path)
    return tmp_path


def tran_2_chinese(text):
    translate = Translate()
    sentences = sentence_split(text)
    result = ""
    for sentence in sentences:
        print("\n" + sentence)
        # Interleave each English sentence with its Chinese translation.
        result += sentence + "\n" + translate.translateToZh(sentence) + "\n"
    return result
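

# --- Word-file helpers --------------------------------------------------------
# The word file is GB2312-encoded with one entry per line, the word and its
# definition separated by "|". `generatorWords` samples 15 entries at random
# (repeats are possible) to seed the article generator.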
def readWordsFile(file_path):
    words, paraphrase = [], []
    with codecs.open(file_path, "r", encoding="gb2312") as fp:
        for line in fp:
            tmp = line.split("|")
            words.append(tmp[0].strip())
            paraphrase.append(tmp[1].strip())
    return words, paraphrase


def generatorWords(file_path):
    words, paraphrase = readWordsFile(file_path)
    length = len(words)
    words_text = ""
    for _ in range(15):
        # randint is inclusive on both ends, so cap the index at length - 1.
        num = random.randint(0, length - 1)
        words_text += f"{words[num]},【{paraphrase[num]}】\n"
    print("\n" + words_text)
    return words_text


def choose_word_from_file(input):
    # `orig_name` is an attribute of the Gradio file object; depending on the
    # Gradio version, gr.File may pass a plain filepath string instead.
    result = generatorWords(input.orig_name)
    return result
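

# --- Voice cloning (FreeVC) ---------------------------------------------------
# WavLM extracts content features from the source speech, the speaker encoder
# embeds the target voice, and SynthesizerTrn decodes content + speaker
# embedding back into audio. The paths below follow the FreeVC repo layout:
# the pretrained speaker encoder, the 24 kHz config, and the freevc-24
# checkpoint must be present locally.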
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

smodel = SpeakerEncoder("speaker_encoder/ckpt/pretrained_bak_5805000.pt")

print("Loading FreeVC(24k)...")
hps = utils.get_hparams_from_file("configs/freevc-24.json")
freevc_24 = SynthesizerTrn(
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model).to(device)
_ = freevc_24.eval()
_ = utils.load_checkpoint("checkpoints/freevc-24.pth", freevc_24, None)

print("Loading WavLM for content...")
cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device)


def convert(model, src, tgt):
    with torch.no_grad():
        # Target voice: trim silence, then build either a speaker embedding
        # (FreeVC / FreeVC 24kHz) or a mel spectrogram (FreeVC-s).
        wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate)
        wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
        if model == "FreeVC" or model == "FreeVC (24kHz)":
            g_tgt = smodel.embed_utterance(wav_tgt)
            g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device)
        else:
            wav_tgt = torch.from_numpy(wav_tgt).unsqueeze(0).to(device)
            mel_tgt = mel_spectrogram_torch(
                wav_tgt,
                hps.data.filter_length,
                hps.data.n_mel_channels,
                hps.data.sampling_rate,
                hps.data.hop_length,
                hps.data.win_length,
                hps.data.mel_fmin,
                hps.data.mel_fmax,
            )
        # Source speech: extract content features with WavLM.
        wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate)
        wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device)
        c = cmodel(wav_src).last_hidden_state.transpose(1, 2).to(device)
        # Inference. Only the 24 kHz checkpoint is loaded above; the "FreeVC"
        # and "FreeVC-s" branches would need `freevc`/`freevc_s` models too,
        # and the hidden model dropdown keeps the app on "FreeVC (24kHz)".
        if model == "FreeVC":
            audio = freevc.infer(c, g=g_tgt)
        elif model == "FreeVC-s":
            audio = freevc_s.infer(c, mel=mel_tgt)
        else:
            audio = freevc_24.infer(c, g=g_tgt)
        audio = audio[0][0].data.cpu().float().numpy()
        if model == "FreeVC" or model == "FreeVC-s":
            write("out.wav", hps.data.sampling_rate, audio)
        else:
            write("out.wav", 24000, audio)
    out = "out.wav"
    return out
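

# --- Gradio UI ------------------------------------------------------------------
# Flow: pick or type words -> generate an English passage -> translate it ->
# synthesize speech with Edge TTS -> clone that speech into an uploaded voice.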
with gr.Blocks(title="Learn English By AI", theme=gr.themes.Soft(text_size="sm")) as demo:
    gr.HTML(
        "<center><h1>OpenAI + voice cloning: turn words into a short passage "
        "that shows how they are used in context!</h1></center>"
    )
    with gr.Accordion("📒 Notes", open=True):
        _ = """\
        Notes on the OpenAI prompt:

        * 10-15 words per request works best.
        * prompt = "You are a very capable English assistant. Please weave '{words}' into an English article of at most 100 words."
        * The OpenAI account used here is rate-limited to 3 requests per minute.
        * Word file: one entry per line, the word and its definition on the same line, separated by "|".
        """
        gr.Markdown(dedent(_))
    with gr.Row():
        file = gr.File()
        chooseBtn = gr.Button("Pick words from the file ->", variant="secondary")
    user_input = gr.Textbox(
        max_lines=5,
        lines=3,
        label="Words, separated by commas:",
        placeholder="10-15 words will be better",
    )
    with gr.Column(scale=1):
        submitBtn = gr.Button("Generate an English passage", variant="primary")
    chatbot = gr.Textbox(label="English passage:", lines=5, max_lines=8)
    chooseBtn.click(
        choose_word_from_file,
        inputs=[file],
        outputs=[user_input],
        show_progress="full",
        api_name="choose_word_from_file",
    )
    with gr.Column(scale=3):
        with gr.Row():
            tran_result = gr.Textbox(label="Translation", lines=5, max_lines=8, scale=2)
            tran_btn = gr.Button("Translate", variant="primary")
        tran_btn.click(
            tran_2_chinese,
            inputs=[chatbot],
            outputs=[tran_result],
            show_progress="full",
            api_name="tran_2_chinese",
        )
    with gr.Column(min_width=32, scale=2):
        with gr.Row():
            with gr.Column():
                # The default value must match a key of tts_order_voice, so it
                # is kept in its original form.
                language = gr.Dropdown(
                    choices=list(language_dict.keys()),
                    value="普通话 (中国大陆)-Xiaoxiao-女",
                    label="Language of the text and your preferred speaker",
                )
                tts_btn = gr.Button("Generate the audio", variant="primary")
                output_audio = gr.Audio(type="filepath", label="Generated audio", interactive=False)
                tts_btn.click(text_to_speech_edge, inputs=[chatbot, language], outputs=[output_audio])
        with gr.Row():
            model_choice = gr.Dropdown(
                choices=["FreeVC", "FreeVC-s", "FreeVC (24kHz)"],
                value="FreeVC (24kHz)",
                label="Model",
                visible=False,
            )
            audio1 = output_audio
            audio2 = gr.Audio(label="Upload a voice you like for cloning", type="filepath")
            clone_btn = gr.Button("Start AI voice cloning", variant="primary")
            audio_cloned = gr.Audio(label="Your cloned-voice audio", type="filepath")
            clone_btn.click(convert, inputs=[model_choice, audio1, audio2], outputs=[audio_cloned])

    user_input.submit(
        predict,
        [user_input],
        [chatbot, tran_result],
        show_progress="full",
    )
    submitBtn.click(
        predict,
        [user_input],
        [chatbot, tran_result],
        show_progress="full",
        api_name="predict",
    )
    # submitBtn.click(reset_user_input, [], [user_input])

demo.queue().launch(show_error=True, debug=True)