"""Gradio demo for a multilingual Parler-TTS checkpoint.

Loads the model, tokenizer and feature extractor from ``repo_id`` at import
time, then builds and launches a Gradio Blocks UI that turns an input text
plus a free-form voice description into audio.
"""

import re
from string import punctuation

import gradio as gr
import spaces
import torch
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoFeatureExtractor, AutoTokenizer, set_seed
from transformers.models.speecht5.number_normalizer import EnglishNumberNormalizer

device = "cuda:0" if torch.cuda.is_available() else "cpu"

repo_id = "ylacombe/p-m-e"

model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
tokenizer = AutoTokenizer.from_pretrained(repo_id)
feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)

SAMPLE_RATE = feature_extractor.sampling_rate
SEED = 42

default_text = "La voix humaine est un instrument de musique au-dessus de tous les autres."
default_description = "A male voice speaks slowly with a very noisy background, displaying a touch of expressiveness and animation. The sound is very distant, adding an air of intrigue."

# One row per supported language: [input text, voice description].
# Each row has exactly as many entries as the UI has input components.
examples = [
    # French (same content as the default textbox values)
    [
        default_text,
        default_description,
    ],
    # Spanish
    [
        "La voz es el reflejo del alma en el espejo del tiempo.",
        "A female voice speaks with moderate speed, showing warmth and clarity. The recording is clean with minimal background noise and has natural resonance.",
    ],
    # Italian
    [
        "La voce umana è la più bella musica che esista al mondo.",
        "A male voice delivers the message with passion and depth. The recording has good clarity with slight room acoustics and a medium-distance perspective.",
    ],
    # Portuguese
    [
        "A voz é o espelho da alma e o som do coração.",
        "A young female voice speaks with enthusiasm and energy. The recording is close-miked with crisp audio quality and subtle room ambiance.",
    ],
    # Polish
    [
        "Głos ludzki jest najpiękniejszym instrumentem świata.",
        "An elderly male voice speaks with wisdom and gravitas. The recording has a vintage quality with some characteristic analog warmth.",
    ],
    # German
    [
        "Die menschliche Stimme ist das schönste Instrument der Welt.",
        "A mature female voice speaks with authority and precision. The recording is studio-quality with perfect clarity and no background noise.",
    ],
    # Dutch
    [
        "De menselijke stem is het mooiste instrument dat er bestaat.",
        "A middle-aged male voice speaks with gentle inflection and warmth. The recording has natural room acoustics and balanced frequency response.",
    ],
    # English
    [
        "The human voice is nature's most perfect instrument.",
        "A young male voice speaks with dynamic expression and energy. The recording is professional quality with subtle environmental ambiance.",
    ],
]

number_normalizer = EnglishNumberNormalizer()

# An upper-case letter followed by one or more upper-case letters or dots,
# delimited by word boundaries (e.g. "USA", "U.S.A").
_ABBREVIATION_RE = re.compile(r"\b[A-Z][A-Z\.]+\b")


def _spell_out_abbreviation(match):
    """Return the matched abbreviation with dots removed and letters space-separated."""
    return " ".join(match.group(0).replace(".", ""))


def preprocess(text: str) -> str:
    """Normalize ``text`` before it is tokenized as the TTS prompt.

    Steps, in order:
      1. run the text through ``number_normalizer`` and strip outer whitespace;
      2. replace every hyphen with a space;
      3. append a full stop when the text does not end in ASCII punctuation;
      4. spell out upper-case abbreviations letter by letter
         ("USA" -> "U S A").

    Args:
        text: Raw user text.

    Returns:
        The normalized text. Empty or whitespace-only input yields ``""``.
    """
    # NOTE(review): the normalizer is English-specific while the demo accepts
    # several languages -- confirm this is intended for non-English input.
    text = number_normalizer(text).strip()
    if not text:
        # Nothing to normalize; also avoids indexing text[-1] on "".
        return text

    text = text.replace("-", " ")
    if text[-1] not in punctuation:
        text = f"{text}."

    # A single regex pass rewrites only the spans the pattern matched, so an
    # abbreviation that is a prefix of another ("US" vs "USA") cannot corrupt
    # the longer one, and text outside word boundaries is left alone.
    return _ABBREVIATION_RE.sub(_spell_out_abbreviation, text)


@spaces.GPU
def gen_tts(text, description):
    """Generate speech for ``text`` spoken in the voice given by ``description``.

    Args:
        text: The sentence to synthesize; passed through ``preprocess`` first.
        description: Free-form description of the voice; tokenized as the
            conditioning input of the model.

    Returns:
        A ``(SAMPLE_RATE, audio_arr)`` tuple, where ``audio_arr`` is the
        generated output moved to CPU, converted to NumPy and squeezed.

    Raises:
        gr.Error: If ``text`` is empty or contains only whitespace.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    description_tokens = tokenizer(description.strip(), return_tensors="pt").to(device)
    prompt_tokens = tokenizer(preprocess(text), return_tensors="pt").to(device)

    # Re-seed on every call, right before sampling.
    set_seed(SEED)
    generation = model.generate(
        input_ids=description_tokens.input_ids,
        prompt_input_ids=prompt_tokens.input_ids,
        attention_mask=description_tokens.attention_mask,
        prompt_attention_mask=prompt_tokens.attention_mask,
        do_sample=True,
        temperature=1.0,
    )
    audio_arr = generation.cpu().numpy().squeeze()

    return SAMPLE_RATE, audio_arr


css = """
#share-btn-container {
    display: flex;
    padding-left: 0.5rem !important;
    padding-right: 0.5rem !important;
    background-color: #000000;
    justify-content: center;
    align-items: center;
    border-radius: 9999px !important;
    width: 13rem;
    margin-top: 10px;
    margin-left: auto;
    flex: unset !important;
}
#share-btn {
    all: initial;
    color: #ffffff;
    font-weight: 600;
    cursor: pointer;
    font-family: 'IBM Plex Sans', sans-serif;
    margin-left: 0.5rem !important;
    padding-top: 0.25rem !important;
    padding-bottom: 0.25rem !important;
    right:0;
}
#share-btn * {
    all: unset !important;
}
#share-btn-container div:nth-child(-n+2){
    width: auto !important;
    min-height: 0px !important;
}
#share-btn-container .wrap {
    display: none !important;
}
"""

with gr.Blocks(css=css) as block:
    gr.HTML(
        """

Multi Parler-TTS 🗣️

"""
    )
    gr.HTML(
        """

Parler-TTS is a training and inference library for high-fidelity text-to-speech (TTS) models.

This multilingual model supports French, Spanish, Italian, Portuguese, Polish, German, Dutch, and English. It generates high-quality speech with features that can be controlled using a simple text prompt (e.g. gender, background noise, speaking rate, pitch and reverberation).

By default, Parler-TTS generates 🎲 random voice characteristics. To ensure 🎯 speaker consistency across generations, try to use consistent descriptions in your prompts.

Note: you do not need to specify the nationality of the speaker in the description (do: "a male speaker", don't: "a french male speaker")

"""
    )
    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(label="Input Text", lines=2, value=default_text, elem_id="input_text")
            description = gr.Textbox(label="Description", lines=2, value=default_description, elem_id="input_description")
            run_button = gr.Button("Generate Audio", variant="primary")
        with gr.Column():
            audio_out = gr.Audio(label="Parler-TTS generation", type="numpy", elem_id="audio_out")

    inputs = [input_text, description]
    outputs = [audio_out]
    run_button.click(fn=gen_tts, inputs=inputs, outputs=outputs, queue=True)
    gr.Examples(examples=examples, fn=gen_tts, inputs=inputs, outputs=outputs, cache_examples=True)

    gr.HTML(
        """

Tips for ensuring good generation:

"""
    )

block.queue()
block.launch(share=True)