import numpy as np
import torch
import gradio as gr
import pyarabic.araby as araby

from artst.tasks.artst import ArTSTTask
from artst.models.artst import ArTSTTransformerModel
from transformers import SpeechT5HifiGan
from fairseq.tasks.hubert_pretraining import LabelEncoder
from fairseq.data.audio.speech_to_text_dataset import get_features_or_waveform

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the fine-tuned checkpoint and point its task config at the local
# tokenizer model and data directory before rebuilding the task.
checkpoint = torch.load('ckpts/clartts_tts.pt', map_location=device)
checkpoint['cfg']['task'].t5_task = 't2s'
checkpoint['cfg']['task'].bpe_tokenizer = "utils/arabic.model"
checkpoint['cfg']['task'].data = "utils/"
checkpoint['cfg']['model'].mask_prob = 0.5
checkpoint['cfg']['task'].mask_prob = 0.5

task = ArTSTTask.setup_task(checkpoint['cfg']['task'])

emb_path = 'embs/clartts.npy'

model = ArTSTTransformerModel.build_model(checkpoint['cfg']['model'], task)
model.load_state_dict(checkpoint['model'])
model.eval()

tokenizer = task.build_bpe(checkpoint['cfg']['model'])
processor = LabelEncoder(task.dicts['text'])
vocoder = SpeechT5HifiGan.from_pretrained('microsoft/speecht5_hifigan').to(device)


def get_embs(emb_path):
    """Load a pre-computed speaker embedding and add a batch dimension."""
    spkembs = get_features_or_waveform(emb_path)
    return torch.from_numpy(spkembs).float().unsqueeze(0)


def process_text(text):
    """Strip diacritics, BPE-encode, and map the tokens to dictionary indices."""
    text = araby.strip_diacritics(text)
    return processor(tokenizer.encode(text)).reshape(1, -1)


def inference(text, spkr=emb_path):
    # Return empty 16 kHz audio for empty input.
    if len(text.strip()) == 0:
        return (16000, np.zeros(0).astype(np.int16))
    net_input = {
        'src_tokens': process_text(text),
        'spkembs': get_embs(spkr),
    }
    outs, _, attn = task.generate_speech([model], net_input)
    with torch.no_grad():
        gen_audio = vocoder(outs.to(device))
    # Scale the float waveform to 16-bit PCM for Gradio.
    speech = (gen_audio.cpu().numpy() * 32767).astype(np.int16)
    return (16000, speech)


text_box = gr.Textbox(max_lines=2, label="Arabic Text", rtl=True)
out = gr.Audio(label="Synthesized Audio", type="numpy")

title = "ArTST: Arabic Speech Synthesis"
description = (
    "ArTST: Arabic text and speech transformer based on the T5 transformer. "
    "This space demonstrates the TTS checkpoint fine-tuned on the Classical "
    "Arabic Text-To-Speech (CLARTTS) dataset. The model is pre-trained on the "
    "MGB-2 dataset."
)
examples = [
    "لأن فراق المألوف في العادة ومجانبة ما صار متفقا عليه بالمواضعة",
    "ومن لطيف حكمته أن جعل لكل عبادة حالتين",
    "فمن لهم عدل الإنسان مع من فوقه",
]
article = """
References: ArTST paper | GitHub | Weights and Tokenizer
@inproceedings{toyin-etal-2023-artst,
    title = "{A}r{TST}: {A}rabic Text and Speech Transformer",
    author = "Toyin, Hawau and Djanibekov, Amirbek and Kulkarni, Ajinkya and Aldarmaki, Hanan",
    editor = "Sawaf, Hassan and El-Beltagy, Samhaa and Zaghouani, Wajdi and Magdy, Walid and Abdelali, Ahmed and Tomeh, Nadi and Abu Farha, Ibrahim and Habash, Nizar and Khalifa, Salam and Keleg, Amr and Haddad, Hatem and Zitouni, Imed and Mrini, Khalil and Almatham, Rawan",
    booktitle = "Proceedings of ArabicNLP 2023",
    month = dec,
    year = "2023",
    address = "Singapore (Hybrid)",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2023.arabicnlp-1.5",
    pages = "41--51"
}
Speaker embeddings were generated from the CMU ARCTIC dataset.
ArTST is based on the SpeechT5 architecture.
"""
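
# --- Hedged sketch: producing a speaker-embedding .npy like embs/clartts.npy ---
# The article above notes that speaker embeddings were generated from CMU ARCTIC.
# One common recipe for SpeechT5-style x-vectors uses SpeechBrain's pretrained
# spkrec-xvect-voxceleb model; this is an assumption, not the authors' confirmed
# pipeline. The function name, wav path, and output path below are hypothetical,
# and the recording is expected to be 16 kHz mono.
def make_speaker_embedding(wav_path="reference_speaker.wav",      # hypothetical path
                           out_path="embs/example_speaker.npy"):  # hypothetical path
    import torchaudio
    import torch.nn.functional as F
    from speechbrain.pretrained import EncoderClassifier

    classifier = EncoderClassifier.from_hparams(source="speechbrain/spkrec-xvect-voxceleb")
    signal, _ = torchaudio.load(wav_path)
    with torch.no_grad():
        emb = classifier.encode_batch(signal)       # x-vector, shape (1, 1, 512)
        emb = F.normalize(emb, dim=2).squeeze()     # L2-normalize -> shape (512,)
    np.save(out_path, emb.cpu().numpy())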
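
# The section above defines the UI components but ends before the interface is
# wired up. A minimal sketch of the likely wiring, using only names defined
# above and the standard Gradio Interface API; `inference` can also be called
# directly, e.g. `sr, wav = inference("...")`.
demo = gr.Interface(
    fn=inference,
    inputs=text_box,
    outputs=out,
    title=title,
    description=description,
    article=article,
    examples=examples,
)

if __name__ == "__main__":
    demo.launch()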