Spaces:

PuristanLabs1
/

Indic_ParlerTTS_Urdu

Running on Zero

App Files Files Community

PuristanLabs1 committed on Dec 8, 2024

Commit

1e8fb53

verified ·

1 Parent(s): 2166b1a

Update app.py

Browse files

Files changed (1) hide show

app.py +60 -43

app.py CHANGED Viewed

@@ -6,31 +6,6 @@ from transformers import AutoTokenizer
 import soundfile as sf
 import tempfile
-# Title and description for the Space
-title = "Indic Parler-TTS: Multilingual Speech Synthesis"
-description = """
-Indic Parler-TTS lets you generate high-quality, natural-sounding speech in Urdu, Punjabi, and Sindhi.
-<br/>
-### How to Use:
-1. **Select the Language**: Choose Urdu, Punjabi, or Sindhi.
-2. **Enter Text**: Type your text in the input box or select from pre-defined samples.
-3. **Customize Speech**: Adjust settings like speaker gender, emotion, pitch, and speaking rate.
-4. **Generate Caption**: Click the 'Generate Caption' button to see a detailed description of the speech.
-5. **Generate Speech**: Click the 'Generate Speech' button to produce the audio output.
-<br/>
-Enjoy generating expressive, multilingual speech!
-"""
-article = """
-<div style='margin:20px auto;'>
-<p>This demo uses the Indic Parler-TTS model, developed by AI4Bharat in collaboration with Hugging Face. See the
-<a href="https://huggingface.co/ai4bharat/indic-parler-tts">model card</a> for more details.</p>
-<p>For faster inference, duplicate this space and upgrade to GPU via the settings:</p>
-<a href="https://huggingface.co/spaces/your-space-name?duplicate=true">
-<img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
-</div>
-"""
 # Load model and tokenizers at startup (on CPU initially)
 print("Loading model and tokenizers...")
 model = ParlerTTSForConditionalGeneration.from_pretrained("ai4bharat/indic-parler-tts").to("cpu")
@@ -38,6 +13,20 @@ tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-parler-tts")
 description_tokenizer = AutoTokenizer.from_pretrained(model.config.text_encoder._name_or_path)
 print("Model and tokenizers loaded.")
 # Pre-defined sample inputs
 sample_inputs = [
     " آسٹریلوی قانون سازوں نے فیس بک، انسٹاگرام اور ایکس جیسی مشہور سماجی ویب سائٹس کے خلاف دنیا کے مشکل ترین کریک ڈاؤن کی منظوری دیتے ہوئے 16 سال سے کم عمر افراد کے لیے سوشل میڈیا پر پابندی کا تاریخی قانون منظور کرلیا۔ ",
@@ -86,25 +75,57 @@ def generate_audio(text, description):
 # Gradio Interface
 def app():
-    with gr.Blocks(title=title, description=description, article=article) as demo:
-        # Add your interface components
-        gr.Markdown(description)
         with gr.Row():
             lang_dropdown = gr.Dropdown(
-                choices=["Urdu", "Punjabi", "Sindhi"],
-                value="Urdu",
                 label="Select Language"
             )
             gender_dropdown = gr.Dropdown(
                 choices=["Male", "Female"],
-                value="Female",
                 label="Speaker Gender"
             )
             emotion_dropdown = gr.Dropdown(
-                choices=["Neutral", "Happy", "Sad", "Anger", "Narration", "Command"],
-                value="Neutral",
-                label="Emotion"
             )
         # Textbox for text input
@@ -118,9 +139,9 @@ def app():
         with gr.Row():
             for sample in sample_inputs:
                 gr.Button(value=f"Use Sample: {sample}").click(
-                    fn=lambda x: x,
-                    inputs=[gr.Textbox(value=sample, visible=False)],
-                    outputs=text_input
                 )
         with gr.Row():
@@ -140,12 +161,8 @@ def app():
             fn=generate_description,
             inputs=[
                 lang_dropdown, gender_dropdown, emotion_dropdown,
-                gr.Dropdown(choices=["Clear", "Slightly Noisy"], value="Clear", label="Background Noise"),
-                gr.Dropdown(choices=["Close-Sounding", "Distant-Sounding"], value="Close-Sounding", label="Reverberation"),
-                gr.Dropdown(choices=["Expressive", "Slightly Expressive", "Monotone"], value="Expressive", label="Expressivity"),
-                gr.Dropdown(choices=["High", "Low", "Balanced"], value="Balanced", label="Pitch"),
-                gr.Dropdown(choices=["Slow", "Moderate", "Fast"], value="Moderate", label="Speaking Rate"),
-                gr.Dropdown(choices=["Basic", "Refined"], value="Refined", label="Voice Quality")
             ],
             outputs=caption_output
         )

 import soundfile as sf
 import tempfile
 # Load model and tokenizers at startup (on CPU initially)
 print("Loading model and tokenizers...")
 model = ParlerTTSForConditionalGeneration.from_pretrained("ai4bharat/indic-parler-tts").to("cpu")
 description_tokenizer = AutoTokenizer.from_pretrained(model.config.text_encoder._name_or_path)
 print("Model and tokenizers loaded.")
+# Supported languages and default settings
+languages = {
+    "Urdu": "A female speaker delivers a clear and expressive speech in Urdu.",
+    "Punjabi": "A female speaker delivers a clear and expressive speech in Punjabi.",
+    "Sindhi": "A female speaker delivers a clear and expressive speech in Sindhi.",
+}
+emotions = [
+    "Neutral", "Happy", "Sad", "Anger", "Command", "Narration", "Conversation",
+    "Disgust", "Fear", "News", "Proper Noun", "Surprise"
+]
+default_language = "Urdu"
+default_gender = "Female"
+default_emotion = "Neutral"
 # Pre-defined sample inputs
 sample_inputs = [
     " آسٹریلوی قانون سازوں نے فیس بک، انسٹاگرام اور ایکس جیسی مشہور سماجی ویب سائٹس کے خلاف دنیا کے مشکل ترین کریک ڈاؤن کی منظوری دیتے ہوئے 16 سال سے کم عمر افراد کے لیے سوشل میڈیا پر پابندی کا تاریخی قانون منظور کرلیا۔ ",
 # Gradio Interface
 def app():
+    with gr.Blocks() as demo:
+        gr.Markdown("# Indic Parler-TTS for Urdu, Punjabi, and Sindhi")
+        gr.Markdown("Select language, speaker gender, emotion, and customize speech characteristics.")
         with gr.Row():
             lang_dropdown = gr.Dropdown(
+                choices=list(languages.keys()),
+                value=default_language,
                 label="Select Language"
             )
             gender_dropdown = gr.Dropdown(
                 choices=["Male", "Female"],
+                value=default_gender,
                 label="Speaker Gender"
             )
             emotion_dropdown = gr.Dropdown(
+                choices=emotions,
+                value=default_emotion,
+                label="Select Emotion"
+            )
+        with gr.Row():
+            noise_dropdown = gr.Dropdown(
+                choices=["Clear", "Slightly Noisy"],
+                value="Clear",
+                label="Background Noise"
+            )
+            reverb_dropdown = gr.Dropdown(
+                choices=["Close-Sounding", "Distant-Sounding"],
+                value="Close-Sounding",
+                label="Reverberation"
+            )
+            expressivity_dropdown = gr.Dropdown(
+                choices=["Expressive", "Slightly Expressive", "Monotone"],
+                value="Expressive",
+                label="Expressivity"
+            )
+            pitch_dropdown = gr.Dropdown(
+                choices=["High", "Low", "Balanced"],
+                value="Balanced",
+                label="Pitch"
+            )
+            rate_dropdown = gr.Dropdown(
+                choices=["Slow", "Moderate", "Fast"],
+                value="Moderate",
+                label="Speaking Rate"
+            )
+            quality_dropdown = gr.Dropdown(
+                choices=["Basic", "Refined"],
+                value="Refined",
+                label="Voice Quality"
             )
         # Textbox for text input
         with gr.Row():
             for sample in sample_inputs:
                 gr.Button(value=f"Use Sample: {sample}").click(
+                    fn=lambda x: x,  # Return the sample text
+                    inputs=[gr.Textbox(value=sample, visible=False)],  # Pass sample as input
+                    outputs=text_input  # Update the text input
                 )
         with gr.Row():
             fn=generate_description,
             inputs=[
                 lang_dropdown, gender_dropdown, emotion_dropdown,
+                noise_dropdown, reverb_dropdown, expressivity_dropdown,
+                pitch_dropdown, rate_dropdown, quality_dropdown
             ],
             outputs=caption_output
         )