|
import spaces |
|
import torch |
|
from TTS.tts.configs.xtts_config import XttsConfig |
|
from TTS.tts.models.xtts import Xtts |
|
from pathlib import Path |
|
import gradio as gr |
|
|
|
# Hugging Face Hub URLs for the EGTTS v0.1 release artifacts:
# model config, tokenizer vocabulary, model weights, and a default
# reference clip used as the speaker-cloning prompt.
CONFIG_URL = 'https://huggingface.co/OmarSamir/EGTTS-V0.1/resolve/main/config.json'

VOCAB_URL = 'https://huggingface.co/OmarSamir/EGTTS-V0.1/resolve/main/vocab.json'

MODEL_URL = 'https://huggingface.co/OmarSamir/EGTTS-V0.1/resolve/main/model.pth'

SPEAKER_AUDIO_URL = 'https://huggingface.co/OmarSamir/EGTTS-V0.1/resolve/main/speaker_reference.wav'
|
|
|
# Directory this script lives in; all assets are cached next to it.
base_path = Path(__file__).parent

# Local filename -> remote URL for every asset the app needs.
# Replaces four copy-pasted "if not exists: download" stanzas with one loop.
_ASSETS = {
    'config.json': CONFIG_URL,
    'vocab.json': VOCAB_URL,
    'model.pth': MODEL_URL,
    'speaker_reference.wav': SPEAKER_AUDIO_URL,
}

# Download each asset only if it is not already present on disk.
for _filename, _url in _ASSETS.items():
    _target = base_path / _filename
    if not _target.exists():
        torch.hub.download_url_to_file(_url, _target)

# Plain-string paths, as expected by the TTS loading APIs used below.
config_path = str(base_path / 'config.json')
vocab_path = str(base_path / 'vocab.json')
model_path = str(base_path / 'model.pth')
speaker_audio_path = str(base_path / 'speaker_reference.wav')
|
|
|
# Load the XTTS configuration that describes the EGTTS checkpoint.
config = XttsConfig()
config.load_json(config_path)

print("Loading model...")
# Prefer the GPU when one is available; fall back to CPU otherwise.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

# Build the model from its config, then restore the downloaded weights
# in eval mode (DeepSpeed inference is disabled for this Space).
model = Xtts.init_from_config(config)
model.load_checkpoint(
    config,
    checkpoint_path=model_path,
    vocab_path=vocab_path,
    use_deepspeed=False,
    eval=True,
)
model.to(device)
|
|
|
@spaces.GPU
def infer_EGTTS(text: str, speaker_audio_path: str, temperature: float = 0.75):
    """Synthesize Egyptian-Arabic speech for ``text``, cloning the voice
    heard in ``speaker_audio_path``.

    Returns a ``(sample_rate, waveform)`` tuple suitable for ``gr.Audio``;
    the XTTS vocoder output is at 24 kHz.
    """
    print("Computing speaker latents...")
    # Condition the model on the reference clip (API takes a list of paths).
    cond_latent, spk_embedding = model.get_conditioning_latents(
        audio_path=[speaker_audio_path]
    )

    print("Inference...")
    result = model.inference(
        text,
        "ar",  # language code: Arabic
        cond_latent,
        spk_embedding,
        temperature=temperature,
    )

    return 24000, result["wav"]
|
|
|
markdown_description = """## Instructions:

1. Enter the text you want to synthesize.
2. Upload a 4-5 seconds audio file of the speaker you want to clone.
3. Click on the "Generate" button.

**This space was only possible because of the amazing work done by [OmarSamir](https://huggingface.co/OmarSamir) on the [EGTTS](https://huggingface.co/OmarSamir/EGTTS-V0.1) model.**
"""

# Gradio UI: text + reference audio + temperature in, synthesized audio out.
with gr.Blocks(title="EGTTS") as app:
    gr.HTML("<center><h1>Egyptian-Arabic-TTS (EGTTS)</h1></center>")
    gr.Markdown(markdown_description)
    with gr.Row():
        with gr.Column():
            # Default text restored from mojibake (Arabic UTF-8 previously
            # mis-decoded): "As-salamu alaykum wa rahmatu Allah".
            text = gr.Textbox(
                label="Text to synthesize",
                value="السلام عليكم ورحمة الله",
                rtl=True,
                text_align="right",
                lines=3,
            )
            # Typo fix: was `speaker_refrence` (local name only).
            speaker_reference = gr.Audio(
                label="Speaker reference",
                value=speaker_audio_path,
                type="filepath",
            )
            temperature = gr.Slider(
                label="Temperature", minimum=0.1, maximum=1.0, value=0.75, step=0.05
            )
            generate_btn = gr.Button(value="Generate", variant="primary")
        output = gr.Audio(label="Synthesized audio")

    generate_btn.click(
        infer_EGTTS,
        inputs=[text, speaker_reference, temperature],
        outputs=output,
    )

app.launch()