import asyncio
import datetime
import logging
import os
import time
import traceback

import edge_tts
import gradio as gr
import librosa
import numpy as np
from pydub import AudioSegment
from scipy.io import wavfile

from src.rmvpe import RMVPE
from model_loader import ModelLoader

logging.getLogger("fairseq").setLevel(logging.WARNING)
logging.getLogger("numba").setLevel(logging.WARNING)
logging.getLogger("markdown_it").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)
logging.getLogger("matplotlib").setLevel(logging.WARNING)

limitation = os.getenv("SYSTEM") == "spaces"

edge_output_filename = "edge_output.mp3"
tts_voice_list = asyncio.run(edge_tts.list_voices())
tts_voices = [f"{v['ShortName']}-{v['Gender']}" for v in tts_voice_list]

model_root = "weights"

print("Loading...")
model_loader = ModelLoader()
gpu_config = model_loader.config
hubert_model = model_loader.load_hubert()
rmvpe_model = RMVPE(
    os.path.join(os.getcwd(), "weights", "rmvpe.pt"),
    gpu_config.is_half,
    gpu_config.device,
)
model_loader.load("char2")


def add_robotic_effect(mp3_path):
    audio = AudioSegment.from_mp3(mp3_path)

    # Convert to a numpy array. Work in float64 so the mix below cannot
    # overflow int16 before clipping.
    data = np.array(audio.get_array_of_samples()).astype(np.float64)
    sample_rate = audio.frame_rate

    # If stereo, average the channels to mono
    if audio.channels == 2:
        data = data.reshape((-1, 2)).mean(axis=1)

    # Apply a single-tap delay (echo) effect: y[n] = x[n] + alpha * x[n - d]
    delay = 0.05  # delay tap, in seconds
    alpha = 0.55  # echo gain
    delay_samples = int(delay * sample_rate)
    delayed_data = np.zeros_like(data)
    delayed_data[delay_samples:] = data[:-delay_samples] * alpha
    delayed_data += data

    # Clip the values to the int16 range before writing
    delayed_data = np.clip(delayed_data, -32768, 32767)
    wavfile.write("processed.wav", sample_rate, delayed_data.astype(np.int16))
    return "processed.wav"
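# A minimal sketch of the same single-tap echo, y[n] = x[n] + alpha * x[n - d],
# applied directly to a WAV file without pydub. "input.wav" is a placeholder
# path assumed for illustration, not a file shipped with this project:
#
#     sr, x = wavfile.read("input.wav")
#     x = x.astype(np.float64)
#     y = x.copy()
#     d = int(0.05 * sr)                     # 50 ms delay tap
#     y[d:] += 0.55 * x[:-d]                 # add the attenuated echo
#     wavfile.write("echo.wav", sr, np.clip(y, -32768, 32767).astype(np.int16))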
def tts(
    rvc,
    effect,
    speed,
    pitch,
    tts_text,
    tts_voice,
    f0_up_key,
    f0_method="rmvpe",
    index_rate=1,
    protect=0.2,
    filter_radius=3,
    resample_sr=0,
    rms_mix_rate=0.25,
):
    print("------------------")
    print(datetime.datetime.now())
    print("tts_text:")
    print(tts_text)
    print(f"tts_voice: {tts_voice}")
    print(f"F0: {f0_method}, Key: {f0_up_key}, Index: {index_rate}, Protect: {protect}")
    edge_output_filename = "edge_output.mp3"
    try:
        if limitation and len(tts_text) > 280:
            print("Error: Text too long")
            return (
                f"Text should be at most 280 characters in this Hugging Face Space, but got {len(tts_text)} characters.",
                None,
            )
        t0 = time.time()
        speed_str = f"+{speed}%" if speed >= 0 else f"{speed}%"
        pitch_str = f"+{pitch}Hz" if pitch >= 0 else f"{pitch}Hz"
        asyncio.run(
            edge_tts.Communicate(
                tts_text,
                "-".join(tts_voice.split("-")[:-1]),
                rate=speed_str,
                pitch=pitch_str,
            ).save(edge_output_filename)
        )
        t1 = time.time()
        edge_time = t1 - t0

        if not rvc:
            if effect:
                edge_output_filename = add_robotic_effect(edge_output_filename)
            info = f"Success. Time: edge-tts: {edge_time}s"
            print(info)
            return (
                info,
                edge_output_filename,
            )

        tgt_sr, net_g, vc, version, index_file, if_f0 = (
            model_loader.tgt_sr,
            model_loader.net_g,
            model_loader.vc,
            model_loader.version,
            model_loader.index_file,
            model_loader.if_f0,
        )
        audio, sr = librosa.load(edge_output_filename, sr=16000, mono=True)
        duration = len(audio) / sr
        print(f"Audio duration: {duration}s")
        if limitation and duration >= 20:
            print("Error: Audio too long")
            return (
                f"Audio should be shorter than 20 seconds in this Hugging Face Space, but got {duration}s.",
                None,
            )

        f0_up_key = int(f0_up_key)
        if f0_method == "rmvpe":
            vc.model_rmvpe = rmvpe_model
        times = [0, 0, 0]
        audio_opt = vc.pipeline(
            hubert_model,
            net_g,
            0,
            audio,
            edge_output_filename,
            times,
            f0_up_key,
            f0_method,
            index_file,
            # file_big_npy,
            index_rate,
            if_f0,
            filter_radius,
            tgt_sr,
            resample_sr,
            rms_mix_rate,
            version,
            protect,
            None,
        )
        # Only resample if a valid target rate was requested and it differs
        # from the model's native rate.
        if resample_sr >= 16000 and tgt_sr != resample_sr:
            tgt_sr = resample_sr
        info = f"Success. Time: edge-tts: {edge_time}s, npy: {times[0]}s, f0: {times[1]}s, infer: {times[2]}s"
        print(info)
        return (
            info,
            (tgt_sr, audio_opt),
        )
    except EOFError:
        info = (
            "It seems that the edge-tts output is not valid. "
            "This may occur when the input text and the speaker do not match. "
            "For example, you may have entered Japanese (non-alphabetic) text but chosen a non-Japanese speaker."
        )
        print(info)
        return info, None
    except Exception:
        info = traceback.format_exc()
        print(info)
        return info, None
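# A standalone sketch of the edge-tts synthesis step above, useful for
# debugging text/voice mismatches outside the Gradio app. The text, voice,
# and output path here are illustrative values, not project defaults:
#
#     asyncio.run(
#         edge_tts.Communicate(
#             "Hello world", "en-US-JennyNeural", rate="+10%", pitch="+20Hz"
#         ).save("debug.mp3")
#     )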
initial_md = """
# Text-to-speech webui

This is a text-to-speech web UI for RVC models.
"""

app = gr.Blocks()
with app:
    gr.Markdown(initial_md)
    with gr.Row():
        with gr.Column():
            f0_key_up = gr.Number(
                label="Transpose (the best value depends on the model and speaker)",
                value=4,
            )
    with gr.Row():
        with gr.Column():
            tts_voice = gr.Dropdown(
                label="Speaker (format: language-Country-Name-Gender)",
                choices=tts_voices,
                allow_custom_value=False,
                value="en-US-JennyNeural-Female",
            )
            speed = gr.Slider(
                minimum=-100,
                maximum=100,
                label="Speech speed (%)",
                value=10,
                step=10,
                interactive=True,
            )
            pitch = gr.Slider(
                minimum=-100,
                maximum=100,
                label="Speech pitch (Hz)",
                value=20,
                step=5,
                interactive=True,
            )
            tts_text = gr.Textbox(
                label="Input Text",
                value="I'm Never Gonna Give You Up",
            )
            rvc = gr.Checkbox(
                label="Transform Voice",
                info="Would you like to apply voice transformation? Checked means yes.",
                value=False,
            )
            effect = gr.Checkbox(
                label="Add Effect",
                info="Would you like to apply the robotic effect?",
                value=True,
            )
        with gr.Column():
            but0 = gr.Button("Convert", variant="primary")
            info_text = gr.Textbox(label="Output info")
        with gr.Column():
            tts_output = gr.Audio(label="Result")
    but0.click(
        tts,
        [
            rvc,
            effect,
            speed,
            pitch,
            tts_text,
            tts_voice,
            f0_key_up,
        ],
        [info_text, tts_output],
    )
    with gr.Row():
        examples = gr.Examples(
            examples_per_page=10,
            examples=[
                [
                    "これは日本語テキストから音声への変換デモです。",
                    "ja-JP-NanamiNeural-Female",
                ],
                [
                    "This is an English text to speech conversion demo.",
                    "en-US-AriaNeural-Female",
                ],
                ["這是用來測試的demo啦", "zh-TW-HsiaoChenNeural-Female"],
                ["这是一个中文文本到语音的转换演示。", "zh-CN-XiaoxiaoNeural-Female"],
                [
                    "한국어 텍스트에서 음성으로 변환하는 데모입니다.",
                    "ko-KR-SunHiNeural-Female",
                ],
                [
                    "Il s'agit d'une démo de conversion du texte français à la parole.",
                    "fr-FR-DeniseNeural-Female",
                ],
                [
                    "Dies ist eine Demo zur Umwandlung von Deutsch in Sprache.",
                    "de-DE-AmalaNeural-Female",
                ],
                [
                    "Tämä on suomenkielinen tekstistä puheeksi -esittely.",
                    "fi-FI-NooraNeural-Female",
                ],
                [
                    "Это демонстрационный пример преобразования русского текста в речь.",
                    "ru-RU-SvetlanaNeural-Female",
                ],
                [
                    "Αυτή είναι μια επίδειξη μετατροπής ελληνικού κειμένου σε ομιλία.",
                    "el-GR-AthinaNeural-Female",
                ],
                [
                    "Esta es una demostración de conversión de texto a voz en español.",
                    "es-ES-ElviraNeural-Female",
                ],
                [
                    "Questa è una dimostrazione di sintesi vocale in italiano.",
                    "it-IT-ElsaNeural-Female",
                ],
                [
                    "Esta é uma demonstração de conversão de texto em fala em português.",
                    "pt-PT-RaquelNeural-Female",
                ],
                [
                    "Це демонстрація тексту до мовлення українською мовою.",
                    "uk-UA-PolinaNeural-Female",
                ],
                [
                    "هذا عرض توضيحي عربي لتحويل النص إلى كلام.",
                    "ar-EG-SalmaNeural-Female",
                ],
                [
                    "இது தமிழ் உரையிலிருந்து பேச்சு மாற்ற டெமோ.",
                    "ta-IN-PallaviNeural-Female",
                ],
            ],
            inputs=[tts_text, tts_voice],
        )

app.launch(inbrowser=True)
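# Calling the pipeline without the UI (a sketch; the argument values are
# examples only). With rvc=False the second return value is an audio file
# path; with rvc=True it is a (sample_rate, samples) tuple:
#
#     info, audio = tts(
#         rvc=False,
#         effect=True,
#         speed=10,
#         pitch=20,
#         tts_text="Hello there",
#         tts_voice="en-US-JennyNeural-Female",
#         f0_up_key=4,
#     )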