import spaces
import torch
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from pathlib import Path
import gradio as gr
# Model assets from the EGTTS-V0.1 repository on the Hugging Face Hub
CONFIG_URL = 'https://huggingface.co/OmarSamir/EGTTS-V0.1/resolve/main/config.json'
VOCAB_URL = 'https://huggingface.co/OmarSamir/EGTTS-V0.1/resolve/main/vocab.json'
MODEL_URL = 'https://huggingface.co/OmarSamir/EGTTS-V0.1/resolve/main/model.pth'
SPEAKER_AUDIO_URL = 'https://huggingface.co/OmarSamir/EGTTS-V0.1/resolve/main/speaker_reference.wav'
base_path = Path(__file__).parent

# Download the files into the base_path if they are not already present
config_path = base_path / 'config.json'
if not config_path.exists():
    torch.hub.download_url_to_file(CONFIG_URL, config_path)

vocab_path = base_path / 'vocab.json'
if not vocab_path.exists():
    torch.hub.download_url_to_file(VOCAB_URL, vocab_path)

model_path = base_path / 'model.pth'
if not model_path.exists():
    torch.hub.download_url_to_file(MODEL_URL, model_path)

speaker_audio_path = base_path / 'speaker_reference.wav'
if not speaker_audio_path.exists():
    torch.hub.download_url_to_file(SPEAKER_AUDIO_URL, speaker_audio_path)

# The TTS APIs expect plain string paths
config_path = str(config_path)
vocab_path = str(vocab_path)
model_path = str(model_path)
speaker_audio_path = str(speaker_audio_path)
# Load the XTTS config and initialise the model from the downloaded checkpoint
config = XttsConfig()
config.load_json(config_path)

print("Loading model...")
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_path=model_path, use_deepspeed=False, vocab_path=vocab_path, eval=True)
model.to(device)
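
# NOTE (assumption): this Space declares "Running on Zero", so the GPU-bound entry
# point is wrapped with ZeroGPU's `spaces.GPU` decorator to have a GPU attached for
# each call; remove the decorator on a regular GPU/CPU runtime.
@spaces.GPU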
def infer_EGTTS(text: str, speaker_audio_path: str, temperature: float = 0.75):
    print("Computing speaker latents...")
    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=[speaker_audio_path])

    print("Inference...")
    out = model.inference(
        text,
        "ar",
        gpt_cond_latent,
        speaker_embedding,
        temperature=temperature,
    )
    # XTTS generates audio at a 24 kHz sample rate
    return 24000, out["wav"]
markdown_description = """## Instructions:
1. Enter the text you want to synthesize.
2. Upload a 4-5 second audio clip of the speaker you want to clone.
3. Click the "Generate" button.

**This Space was only possible because of the amazing work done by [OmarSamir](https://huggingface.co/OmarSamir) on the [EGTTS](https://huggingface.co/OmarSamir/EGTTS-V0.1) model.**
"""
with gr.Blocks(title="EGTTS") as app:
    gr.HTML("<center><h1>Egyptian-Arabic-TTS (EGTTS)</h1></center>")
    gr.Markdown(markdown_description)

    with gr.Row():
        with gr.Column():
            text = gr.Textbox(label="Text to synthesize", value="السلام عليكم ورحمة الله", rtl=True, text_align="right", lines=3)
            speaker_reference = gr.Audio(label="Speaker reference", value=speaker_audio_path, type="filepath")
            temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=1.0, value=0.75, step=0.05)
            generate_btn = gr.Button(value="Generate", variant="primary")
            output = gr.Audio(label="Synthesized audio")

    generate_btn.click(infer_EGTTS, inputs=[text, speaker_reference, temperature], outputs=output)

app.launch()