multi_parler_tts

Running on Zero

App Files Files Community

PHBJT committed on Oct 30, 2024

Commit

d842ce0

verified ·

1 Parent(s): a2e3b0a

Update app.py

Browse files

Files changed (1) hide show

app.py +95 -80

app.py CHANGED Viewed

@@ -1,82 +1,78 @@
 import spaces
 import gradio as gr
 import torch
 from transformers.models.speecht5.number_normalizer import EnglishNumberNormalizer
 from string import punctuation
 import re
 from parler_tts import ParlerTTSForConditionalGeneration
 from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
-repo_id =  "ylacombe/p-m-e"
 model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
 text_tokenizer = AutoTokenizer.from_pretrained(repo_id)
 description_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
 feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
 SAMPLE_RATE = feature_extractor.sampling_rate
 SEED = 42
 default_text = "La voix humaine est un instrument de musique au-dessus de tous les autres."
-default_description = "a woman with a slightly low- pitched voice speaks slowly in a clear and close- sounding environment, but her delivery is quite monotone."
 examples = [
-    # French
     [
         "La voix humaine est un instrument de musique au-dessus de tous les autres.",
-        "a woman with a slightly low- pitched voice speaks slowly in a clear and close- sounding environment, but her delivery is quite monotone.",
-        None,
-    ],
-    # Spanish
-    [
-        "La voz es el reflejo del alma en el espejo del tiempo.",
-        "a man with a moderate pitch voice speaks slowly with a slightly animated delivery in a very close- sounding environment with minimal background noise.",
-        None,
-    ],
-    # Italian
-    [
-        "La voce umana è la più bella musica che esista al mondo.",
-        "a man with a moderate pitch speaks slowly in a very noisy environment that sounds very distant, delivering his words in a monotone manner.",
-        None,
-    ],
-    # Portuguese
-    [
-        "A voz é o espelho da alma e o som do coração.",
-        "a man speaks slowly in a distant- sounding environment with a clean audio quality, delivering his message in a monotone voice at a moderate pitch. ",
-        None,
-    ],
-    # Polish
-    [
-        "Głos ludzki jest najpiękniejszym instrumentem świata.",
-        "a man with a moderate pitch speaks in a monotone manner at a slightly slow pace, but the recording is quite noisy and sounds very distant.",
-        None,
-    ],
-    # German
-    [
-        "Die menschliche Stimme ist das schönste Instrument der Welt.",
-        "a man with a moderate pitch speaks slowly in a noisy environment with a flat tone of voice, creating a slightly close- sounding effect.",
-        None,
-    ],
-    # Dutch
-    [
-        "De menselijke stem is het mooiste instrument dat er bestaat.",
-        "a man with a moderate pitch speaks slightly slowly with an expressive and animated delivery in a very close- sounding environment with a bit of background noise.",
         None,
     ],
-    # English
     [
         "The human voice is nature's most perfect instrument.",
-        "Aa woman with a slightly low- pitched voice speaks slowly in a very distant- sounding environment with a clean audio quality, delivering her message in a very monotone manner.",
         None,
     ],
 ]
 number_normalizer = EnglishNumberNormalizer()
 def preprocess(text):
     text = number_normalizer(text).strip()
     text = text.replace("-", " ")
@@ -87,7 +83,6 @@ def preprocess(text):
     def separate_abb(chunk):
         chunk = chunk.replace(".","")
-        print(chunk)
         return " ".join(chunk)
     abbreviations = re.findall(abbreviations_pattern, text)
@@ -97,18 +92,22 @@ def preprocess(text):
     return text
 @spaces.GPU
-def gen_tts(text, description):
-    inputs = description_tokenizer(description.strip(), return_tensors="pt").to(device)
     prompt = text_tokenizer(preprocess(text), return_tensors="pt").to(device)
     set_seed(SEED)
     generation = model.generate(
-        input_ids=inputs.input_ids, prompt_input_ids=prompt.input_ids, attention_mask=inputs.attention_mask, prompt_attention_mask=prompt.attention_mask, do_sample=True, temperature=1.0
     )
     audio_arr = generation.cpu().numpy().squeeze()
-    return SAMPLE_RATE, audio_arr
 css = """
         #share-btn-container {
@@ -146,15 +145,12 @@ css = """
             display: none !important;
         }
 """
 with gr.Blocks(css=css) as block:
     gr.HTML(
         """
             <div style="text-align: center; max-width: 700px; margin: 0 auto;">
-              <div
-                style="
-                  display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;
-                "
-              >
                 <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
                   Multi Parler-TTS 🗣️
                 </h1>
@@ -163,40 +159,59 @@ with gr.Blocks(css=css) as block:
         """
     )
     gr.HTML(
-f"""
-       <p><a href="https://github.com/huggingface/parler-tts">Parler-TTS</a> is a training and inference library for
 high-fidelity text-to-speech (TTS) models.</p>
-<p>This multilingual model supports French, Spanish, Italian, Portuguese, Polish, German, Dutch, and English. It generates high-quality speech with features that can be controlled using a simple text prompt (e.g. gender, background noise, speaking rate, pitch and reverberation). </p>
-<p>By default, Parler-TTS generates 🎲 random voice characteristics. To ensure 🎯 <b>speaker consistency</b> across generations, try to use consistent descriptions in your prompts.</p>
-<p><b>Note:</b> you do not need to specify the nationality of the speaker in the description (do: "a male speaker", don't: "a french male speaker") </p>
-        """
     )
     with gr.Row():
         with gr.Column():
-            input_text = gr.Textbox(label="Input Text", lines=2, value=default_text, elem_id="input_text")
-            description = gr.Textbox(label="Description", lines=2, value=default_description, elem_id="input_description")
-            run_button = gr.Button("Generate Audio", variant="primary")
         with gr.Column():
-            audio_out = gr.Audio(label="Parler-TTS generation", type="numpy", elem_id="audio_out")
-    inputs = [input_text, description]
-    outputs = [audio_out]
-    run_button.click(fn=gen_tts, inputs=inputs, outputs=outputs, queue=True)
-    gr.Examples(examples=examples, fn=gen_tts, inputs=inputs, outputs=outputs, cache_examples=True)
     gr.HTML(
-        """
-        <p>Tips for ensuring good generation:
         <ul>
             <li>Include the term "very clear audio" to generate the highest quality audio, and "very noisy audio" for high levels of background noise</li>
-            <li>Punctuation can be used to control the prosody of the generations, e.g. use commas to add small breaks in speech</li>
             <li>The remaining speech features (gender, speaking rate, pitch and reverberation) can be controlled directly through the prompt</li>
         </ul>
-        </p>
-        """
     )
 block.queue()
 block.launch(share=True)

 import spaces
 import gradio as gr
 import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
 from transformers.models.speecht5.number_normalizer import EnglishNumberNormalizer
 from string import punctuation
 import re
 from parler_tts import ParlerTTSForConditionalGeneration
 from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
+# Device setup: one device string shared by the Parler-TTS model and every tokenized input
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
+# SmolLM setup: causal LM used by format_description to rewrite voice descriptions
+checkpoint = "HuggingFaceTB/SmolLM-360M"
+smol_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+smol_model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto", torch_dtype=torch.bfloat16)  # bfloat16 weights; placement delegated to device_map="auto"
+# Parler-TTS setup: model, prompt tokenizer and feature extractor all come from repo_id
+repo_id = "ylacombe/p-m-e"
 model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
 text_tokenizer = AutoTokenizer.from_pretrained(repo_id)  # tokenizes the text to be spoken
 description_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")  # tokenizes the voice description (separate checkpoint from repo_id)
 feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
 SAMPLE_RATE = feature_extractor.sampling_rate  # returned next to the waveform by gen_tts
 SEED = 42  # passed to set_seed() before every TTS generation
 default_text = "La voix humaine est un instrument de musique au-dessus de tous les autres."
+default_description = "a woman with a slightly low-pitched voice speaks slowly in a clear and close-sounding environment, but her delivery is quite monotone."
 examples = [  # rows for gr.Examples; NOTE(review): each row holds 4 values but the widget is wired to 3 inputs - confirm the trailing None is intentional
     [
         "La voix humaine est un instrument de musique au-dessus de tous les autres.",
+        "a woman with a slightly low-pitched voice speaks slowly in a clear and close-sounding environment, but her delivery is quite monotone.",
+        True,
         None,
     ],
     [
         "The human voice is nature's most perfect instrument.",
+        "A woman with a slightly low-pitched voice speaks slowly in a very distant-sounding environment with a clean audio quality, delivering her message in a very monotone manner.",
+        True,
         None,
     ],
 ]
 number_normalizer = EnglishNumberNormalizer()  # applied to the input text at the start of preprocess
+def format_description(raw_description, do_format=True):
+    if not do_format:
+        return raw_description
+    prompt = f"""Format this voice description to match exactly:
+"a [gender] with a [pitch] voice speaks [speed] in a [environment], [delivery style]"
+Where:
+- gender: man/woman
+- pitch: slightly low-pitched/moderate pitch/high-pitched
+- speed: slowly/moderately/quickly
+- environment: close-sounding and clear/distant-sounding and noisy
+- delivery style: with monotone delivery/with animated delivery
+Description to format: {raw_description}
+Formatted description:"""
+    inputs = smol_tokenizer.encode(prompt, return_tensors="pt").to(device)
+    outputs = smol_model.generate(
+        inputs,
+        max_length=200,
+        num_return_sequences=1,
+        temperature=0.7,
+        do_sample=True,
+        pad_token_id=smol_tokenizer.eos_token_id
+    )
+    formatted = smol_tokenizer.decode(outputs[0], skip_special_tokens=True)
+    return formatted.split("Formatted description:")[-1].strip()
 def preprocess(text):
     text = number_normalizer(text).strip()
     text = text.replace("-", " ")
     def separate_abb(chunk):
         chunk = chunk.replace(".","")
         return " ".join(chunk)
     abbreviations = re.findall(abbreviations_pattern, text)
     return text
 @spaces.GPU
+def gen_tts(text, description, do_format=True):
+    formatted_desc = format_description(description, do_format)
+    inputs = description_tokenizer(formatted_desc.strip(), return_tensors="pt").to(device)
     prompt = text_tokenizer(preprocess(text), return_tensors="pt").to(device)
     set_seed(SEED)
     generation = model.generate(
+        input_ids=inputs.input_ids,
+        prompt_input_ids=prompt.input_ids,
+        attention_mask=inputs.attention_mask,
+        prompt_attention_mask=prompt.attention_mask,
+        do_sample=True,
+        temperature=1.0
     )
     audio_arr = generation.cpu().numpy().squeeze()
+    return formatted_desc, (SAMPLE_RATE, audio_arr)
 css = """
         #share-btn-container {
             display: none !important;
         }
 """
 with gr.Blocks(css=css) as block:
     gr.HTML(
         """
             <div style="text-align: center; max-width: 700px; margin: 0 auto;">
+              <div style="display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;">
                 <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
                   Multi Parler-TTS 🗣️
                 </h1>
         """
     )
     gr.HTML(
+        """<p><a href="https://github.com/huggingface/parler-tts">Parler-TTS</a> is a training and inference library for
 high-fidelity text-to-speech (TTS) models.</p>
+<p>This multilingual model supports French, Spanish, Italian, Portuguese, Polish, German, Dutch, and English. It generates high-quality speech with features that can be controlled using a simple text prompt.</p>
+<p>By default, Parler-TTS generates 🎲 random voice characteristics. To ensure 🎯 <b>speaker consistency</b> across generations, try to use consistent descriptions in your prompts.</p>"""
     )
     with gr.Row():
         with gr.Column():
+            input_text = gr.Textbox(
+                label="Input Text",
+                lines=2,
+                value=default_text
+            )
+            raw_description = gr.Textbox(
+                label="Voice Description",
+                lines=2,
+                value=default_description
+            )
+            do_format = gr.Checkbox(
+                label="Reformat description using SmolLM",
+                value=True
+            )
+            formatted_description = gr.Textbox(
+                label="Used Description",
+                lines=2
+            )
+            generate_button = gr.Button("Generate Audio", variant="primary")
         with gr.Column():
+            audio_out = gr.Audio(label="Parler-TTS generation", type="numpy")
+    generate_button.click(
+        fn=gen_tts,
+        inputs=[input_text, raw_description, do_format],
+        outputs=[formatted_description, audio_out]
+    )
+    gr.Examples(
+        examples=examples,
+        fn=gen_tts,
+        inputs=[input_text, raw_description, do_format],
+        outputs=[formatted_description, audio_out],
+        cache_examples=True
+    )
     gr.HTML(
+        """<p>Tips for ensuring good generation:
         <ul>
             <li>Include the term "very clear audio" to generate the highest quality audio, and "very noisy audio" for high levels of background noise</li>
+            <li>Punctuation can be used to control the prosody of the generations</li>
             <li>The remaining speech features (gender, speaking rate, pitch and reverberation) can be controlled directly through the prompt</li>
         </ul>
+        </p>"""
     )
 block.queue()
 block.launch(share=True)