|
import spaces |
|
import torch |
|
from TTS.tts.configs.xtts_config import XttsConfig |
|
from TTS.tts.models.xtts import Xtts |
|
from pathlib import Path |
|
import gradio as gr |
|
|
|
# Hugging Face Hub URLs for the EGTTS v0.1 release artifacts:
# model config, tokenizer vocabulary, model weights, and a default
# reference clip used as the speaker-cloning prompt.
CONFIG_URL = 'https://huggingface.co/OmarSamir/EGTTS-V0.1/resolve/main/config.json'

VOCAB_URL = 'https://huggingface.co/OmarSamir/EGTTS-V0.1/resolve/main/vocab.json'

MODEL_URL = 'https://huggingface.co/OmarSamir/EGTTS-V0.1/resolve/main/model.pth'

SPEAKER_AUDIO_URL = 'https://huggingface.co/OmarSamir/EGTTS-V0.1/resolve/main/speaker_reference.wav'
|
|
|
# Directory this script lives in; all assets are cached next to it.
base_path = Path(__file__).parent

# Local filename -> remote URL for every asset the app needs.
# Replaces four copy-pasted "if not exists: download" stanzas with one loop.
_ASSETS = {
    'config.json': CONFIG_URL,
    'vocab.json': VOCAB_URL,
    'model.pth': MODEL_URL,
    'speaker_reference.wav': SPEAKER_AUDIO_URL,
}

# Download each asset only if it is not already present on disk.
for _filename, _url in _ASSETS.items():
    _target = base_path / _filename
    if not _target.exists():
        torch.hub.download_url_to_file(_url, _target)

# Plain-string paths, as expected by the TTS loading APIs used below.
config_path = str(base_path / 'config.json')
vocab_path = str(base_path / 'vocab.json')
model_path = str(base_path / 'model.pth')
speaker_audio_path = str(base_path / 'speaker_reference.wav')
|
|
|
# Load the XTTS configuration that describes the EGTTS checkpoint.
config = XttsConfig()
config.load_json(config_path)

print("Loading model...")
# Prefer the GPU when one is available; fall back to CPU otherwise.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

# Build the model from its config, then restore the downloaded weights
# in eval mode (DeepSpeed inference is disabled for this Space).
model = Xtts.init_from_config(config)
model.load_checkpoint(
    config,
    checkpoint_path=model_path,
    vocab_path=vocab_path,
    use_deepspeed=False,
    eval=True,
)
model.to(device)
|
|
|
@spaces.GPU
def infer_EGTTS(text: str, speaker_audio_path: str, temperature: float = 0.75):
    """Synthesize Egyptian-Arabic speech for ``text``, cloning the voice
    heard in ``speaker_audio_path``.

    Returns a ``(sample_rate, waveform)`` tuple suitable for ``gr.Audio``;
    the XTTS vocoder output is at 24 kHz.
    """
    print("Computing speaker latents...")
    # Condition the model on the reference clip (API takes a list of paths).
    cond_latent, spk_embedding = model.get_conditioning_latents(
        audio_path=[speaker_audio_path]
    )

    print("Inference...")
    result = model.inference(
        text,
        "ar",  # language code: Arabic
        cond_latent,
        spk_embedding,
        temperature=temperature,
    )

    return 24000, result["wav"]
|
|
|
markdown_description = """## Instructions:

1. Enter the text you want to synthesize.
2. Upload a 4-5 seconds audio file of the speaker you want to clone.
3. Click on the "Generate" button.

**This space was only possible because of the amazing work done by [OmarSamir](https://huggingface.co/OmarSamir) on the [EGTTS](https://huggingface.co/OmarSamir/EGTTS-V0.1) model.**
"""

# Gradio UI: text + reference audio + temperature in, synthesized audio out.
with gr.Blocks(title="EGTTS") as app:
    gr.HTML("<center><h1>Egyptian-Arabic-TTS (EGTTS)</h1></center>")
    gr.Markdown(markdown_description)
    with gr.Row():
        with gr.Column():
            # Default text restored from mojibake (Arabic UTF-8 previously
            # mis-decoded): "As-salamu alaykum wa rahmatu Allah".
            text = gr.Textbox(
                label="Text to synthesize",
                value="السلام عليكم ورحمة الله",
                rtl=True,
                text_align="right",
                lines=3,
            )
            # Typo fix: was `speaker_refrence` (local name only).
            speaker_reference = gr.Audio(
                label="Speaker reference",
                value=speaker_audio_path,
                type="filepath",
            )
            temperature = gr.Slider(
                label="Temperature", minimum=0.1, maximum=1.0, value=0.75, step=0.05
            )
            generate_btn = gr.Button(value="Generate", variant="primary")
        output = gr.Audio(label="Synthesized audio")

    generate_btn.click(
        infer_EGTTS,
        inputs=[text, speaker_reference, temperature],
        outputs=output,
    )

app.launch()