import spaces  # Hugging Face ZeroGPU support (kept as the first import)

from pathlib import Path

import gradio as gr
import torch
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
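
# EGTTS-V0.1 model assets hosted on the Hugging Face Hub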
CONFIG_URL = 'https://huggingface.co/OmarSamir/EGTTS-V0.1/resolve/main/config.json'
VOCAB_URL = 'https://huggingface.co/OmarSamir/EGTTS-V0.1/resolve/main/vocab.json'
MODEL_URL = 'https://huggingface.co/OmarSamir/EGTTS-V0.1/resolve/main/model.pth'
SPEAKER_AUDIO_URL = 'https://huggingface.co/OmarSamir/EGTTS-V0.1/resolve/main/speaker_reference.wav'

base_path = Path(__file__).parent

# Download the model assets next to this script if they are not already present
config_path = base_path / 'config.json'
if not config_path.exists():
    torch.hub.download_url_to_file(CONFIG_URL, config_path)

vocab_path = base_path / 'vocab.json'
if not vocab_path.exists():
    torch.hub.download_url_to_file(VOCAB_URL, vocab_path)

model_path = base_path / 'model.pth'
if not model_path.exists():
    torch.hub.download_url_to_file(MODEL_URL, model_path)

speaker_audio_path = base_path / 'speaker_reference.wav'
if not speaker_audio_path.exists():
    torch.hub.download_url_to_file(SPEAKER_AUDIO_URL, speaker_audio_path)
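
# Work with plain string paths from here on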
config_path = str(config_path)
vocab_path = str(vocab_path)
model_path = str(model_path)
speaker_audio_path = str(speaker_audio_path)
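
# Load the XTTS configuration and model checkpoint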
config = XttsConfig()
config.load_json(config_path)

print("Loading model...")
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_path=model_path, use_deepspeed=False, vocab_path=vocab_path, eval=True)
model.to(device)
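
# ZeroGPU-backed inference: condition on the reference speaker, then synthesize Egyptian Arabic speech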
@spaces.GPU
def infer_EGTTS(text: str, speaker_audio_path: str, temperature: float = 0.75):
    print("Computing speaker latents...")
    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=[speaker_audio_path])

    print("Inference...")
    out = model.inference(
        text,
        "ar",
        gpt_cond_latent,
        speaker_embedding,
        temperature=temperature,
    )

    # XTTS outputs audio at a 24 kHz sample rate
    return 24000, out["wav"]

markdown_description = """## Instructions:
1. Enter the text you want to synthesize.
2. Upload a 4-5 second audio clip of the speaker you want to clone.
3. Click the "Generate" button.

**This space is only possible thanks to the amazing work by [OmarSamir](https://huggingface.co/OmarSamir) on the [EGTTS](https://huggingface.co/OmarSamir/EGTTS-V0.1) model.**
"""
with gr.Blocks(title="EGTTS") as app:
    gr.HTML("<center><h1>Egyptian-Arabic-TTS (EGTTS)</h1></center>")
    gr.Markdown(markdown_description)
    with gr.Row():
        with gr.Column():
            text = gr.Textbox(label="Text to synthesize", value="ุงู„ุณู„ุงู… ุนู„ูŠูƒู… ูˆุฑุญู…ุฉ ุงู„ู„ู‡", rtl=True, text_align="right", lines=3)
            speaker_reference = gr.Audio(label="Speaker reference", value=speaker_audio_path, type="filepath")
            temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=1.0, value=0.75, step=0.05)
            generate_btn = gr.Button(value="Generate", variant="primary")
        output = gr.Audio(label="Synthesized audio")

    generate_btn.click(infer_EGTTS, inputs=[text, speaker_reference, temperature], outputs=output)

app.launch()