import os
import glob
import logging
from typing import cast
from threading import Lock

import gradio as gr
from balacoon_tts import TTS
from huggingface_hub import hf_hub_download, list_repo_files

# lock that prevents more than one thread from touching the tts object at once
locker = Lock()
# global tts engine, created from whichever model was requested last
tts = None
# path of the model currently loaded into `tts` (None when nothing is loaded)
cur_model_path = None
# cache of speakers, maps model file name to the list of its speakers
model_to_speakers = dict()
# directory the model files are downloaded to and loaded from
model_repo_dir = "/data"
# longest text (in characters) passed to the synthesizer; the rest is cut off
_MAX_TEXT_LENGTH = 1024

# fetch every file of the model repo that is not available locally yet
for name in list_repo_files(repo_id="balacoon/tts"):
    if not os.path.isfile(os.path.join(model_repo_dir, name)):
        hf_hub_download(
            repo_id="balacoon/tts",
            filename=name,
            local_dir=model_repo_dir,
        )


def _ensure_model_loaded(model_path: str) -> None:
    """
    Make the global `tts` hold the model stored at `model_path`.

    Does nothing when that model is already loaded. The caller MUST hold
    `locker`; the lock is not re-entrant, so it is not acquired here.

    If constructing `TTS` raises, the exception propagates and the globals
    are left as `tts = None`, `cur_model_path = None`, so a later call
    simply tries to load again.
    """
    global tts, cur_model_path
    if tts is not None and model_path == cur_model_path:
        return
    # Drop the reference to the previous engine before building the new one,
    # so both are not kept alive at once. The names are rebound to None rather
    # than `del`-eted: deleting a global leaves it undefined if TTS() fails,
    # and every later `tts is not None` check would raise NameError.
    tts = None
    cur_model_path = None
    tts = TTS(model_path)
    cur_model_path = model_path


def main():
    """
    Build the Gradio demo (text box, model and speaker dropdowns, "Generate"
    button, audio output) and launch it with a single-worker queue.
    """
    logging.basicConfig(level=logging.INFO)

    with gr.Blocks() as demo:
        gr.Markdown(
            """

            Balacoon🦝 Text-to-Speech

            1. Write an utterance to generate,
            2. Select the model to synthesize with
            3. Select speaker
            4. Hit "Generate" and listen to the result!

            You can learn more about models available
            [here](https://huggingface.co/balacoon/tts).
            Visit [Balacoon website](https://balacoon.com/) for more info.
            """
        )
        with gr.Row(variant="panel"):
            text = gr.Textbox(label="Text", placeholder="Type something here...")

        with gr.Row():
            with gr.Column(variant="panel"):
                # only the CPU addons found in the model directory are offered
                repo_files = os.listdir(model_repo_dir)
                model_files = [x for x in repo_files if x.endswith("_cpu.addon")]
                model_name = gr.Dropdown(
                    label="Model",
                    choices=model_files,
                )
            with gr.Column(variant="panel"):
                speaker = gr.Dropdown(label="Speaker", choices=[])

            def set_model(model_name_str: str):
                """
                Gets the value of the `model_name` dropdown and returns an
                update for the `speaker` dropdown.

                Uses the cached speaker list for the given model name when
                there is one; otherwise loads the addon, asks it for its
                speakers and caches the answer. The last speaker of the list
                is pre-selected. With no model selected, or a model that
                reports no speakers, nothing is pre-selected.
                """
                if not model_name_str:
                    # dropdown was cleared: there is no model to query
                    return gr.Dropdown.update(choices=[], value=None, visible=True)

                speakers = model_to_speakers.get(model_name_str)
                if speakers is None:
                    with locker:
                        # need to load this model to learn the list of speakers
                        _ensure_model_loaded(
                            os.path.join(model_repo_dir, model_name_str)
                        )
                        speakers = tts.get_speakers()
                        model_to_speakers[model_name_str] = speakers

                # guard against an empty list instead of failing on speakers[-1]
                value = speakers[-1] if speakers else None
                return gr.Dropdown.update(
                    choices=speakers, value=value, visible=True
                )

            model_name.change(set_model, inputs=model_name, outputs=speaker)

        with gr.Row(variant="panel"):
            generate = gr.Button("Generate")

        with gr.Row(variant="panel"):
            audio = gr.Audio()

        def synthesize_audio(text_str: str, model_name_str: str, speaker_str: str):
            """
            Gets the utterance from the `text` Textbox, the model name from
            the `model_name` dropdown and the speaker name from the `speaker`
            dropdown, synthesizes the waveform and updates `audio` with it.

            When the text, the model name or the speaker is missing, the
            request is logged and None is returned without synthesizing.
            Text longer than `_MAX_TEXT_LENGTH` characters is truncated.
            """
            if not (text_str and model_name_str and speaker_str):
                logging.info("text, model name or speaker are not provided")
                return None

            expected_model_path = os.path.join(model_repo_dir, model_name_str)
            # truncate the text (a no-op for short inputs)
            text_str = text_str[:_MAX_TEXT_LENGTH]

            with locker:
                # reloads only if another model (or none) is currently loaded
                _ensure_model_loaded(expected_model_path)
                samples = tts.synthesize(text_str, speaker_str)
                # read the rate under the same lock, so it belongs to the
                # engine that produced `samples`
                sampling_rate = tts.get_sampling_rate()
            return gr.Audio.update(value=(sampling_rate, samples))

        generate.click(
            synthesize_audio, inputs=[text, model_name, speaker], outputs=audio
        )

    demo.queue(concurrency_count=1).launch()


if __name__ == "__main__":
    main()