PuristanLabs1 committed
Commit 1e8fb53 (verified) · Parent(s): 2166b1a

Update app.py

Files changed (1)
  1. app.py +60 -43
app.py CHANGED
@@ -6,31 +6,6 @@ from transformers import AutoTokenizer
 import soundfile as sf
 import tempfile
 
-# Title and description for the Space
-title = "Indic Parler-TTS: Multilingual Speech Synthesis"
-description = """
-Indic Parler-TTS lets you generate high-quality, natural-sounding speech in Urdu, Punjabi, and Sindhi.
-<br/>
-### How to Use:
-1. **Select the Language**: Choose Urdu, Punjabi, or Sindhi.
-2. **Enter Text**: Type your text in the input box or select from pre-defined samples.
-3. **Customize Speech**: Adjust settings like speaker gender, emotion, pitch, and speaking rate.
-4. **Generate Caption**: Click the 'Generate Caption' button to see a detailed description of the speech.
-5. **Generate Speech**: Click the 'Generate Speech' button to produce the audio output.
-<br/>
-Enjoy generating expressive, multilingual speech!
-"""
-
-article = """
-<div style='margin:20px auto;'>
-<p>This demo uses the Indic Parler-TTS model, developed by AI4Bharat in collaboration with Hugging Face. See the
-<a href="https://huggingface.co/ai4bharat/indic-parler-tts">model card</a> for more details.</p>
-<p>For faster inference, duplicate this space and upgrade to GPU via the settings:</p>
-<a href="https://huggingface.co/spaces/your-space-name?duplicate=true">
-<img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
-</div>
-"""
-
 # Load model and tokenizers at startup (on CPU initially)
 print("Loading model and tokenizers...")
 model = ParlerTTSForConditionalGeneration.from_pretrained("ai4bharat/indic-parler-tts").to("cpu")
@@ -38,6 +13,20 @@ tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-parler-tts")
 description_tokenizer = AutoTokenizer.from_pretrained(model.config.text_encoder._name_or_path)
 print("Model and tokenizers loaded.")
 
+# Supported languages and default settings
+languages = {
+    "Urdu": "A female speaker delivers a clear and expressive speech in Urdu.",
+    "Punjabi": "A female speaker delivers a clear and expressive speech in Punjabi.",
+    "Sindhi": "A female speaker delivers a clear and expressive speech in Sindhi.",
+}
+emotions = [
+    "Neutral", "Happy", "Sad", "Anger", "Command", "Narration", "Conversation",
+    "Disgust", "Fear", "News", "Proper Noun", "Surprise"
+]
+default_language = "Urdu"
+default_gender = "Female"
+default_emotion = "Neutral"
+
 # Pre-defined sample inputs
 sample_inputs = [
     " آسٹریلوی قانون سازوں نے فیس بک، انسٹاگرام اور ایکس جیسی مشہور سماجی ویب سائٹس کے خلاف دنیا کے مشکل ترین کریک ڈاؤن کی منظوری دیتے ہوئے 16 سال سے کم عمر افراد کے لیے سوشل میڈیا پر پابندی کا تاریخی قانون منظور کرلیا۔ ",
@@ -86,25 +75,57 @@ def generate_audio(text, description):
 
 # Gradio Interface
 def app():
-    with gr.Blocks(title=title, description=description, article=article) as demo:
-        # Add your interface components
-        gr.Markdown(description)
+    with gr.Blocks() as demo:
+        gr.Markdown("# Indic Parler-TTS for Urdu, Punjabi, and Sindhi")
+        gr.Markdown("Select language, speaker gender, emotion, and customize speech characteristics.")
 
         with gr.Row():
             lang_dropdown = gr.Dropdown(
-                choices=["Urdu", "Punjabi", "Sindhi"],
-                value="Urdu",
+                choices=list(languages.keys()),
+                value=default_language,
                 label="Select Language"
             )
             gender_dropdown = gr.Dropdown(
                 choices=["Male", "Female"],
-                value="Female",
+                value=default_gender,
                 label="Speaker Gender"
            )
            emotion_dropdown = gr.Dropdown(
-                choices=["Neutral", "Happy", "Sad", "Anger", "Narration", "Command"],
-                value="Neutral",
-                label="Emotion"
+                choices=emotions,
+                value=default_emotion,
+                label="Select Emotion"
+            )
+
+        with gr.Row():
+            noise_dropdown = gr.Dropdown(
+                choices=["Clear", "Slightly Noisy"],
+                value="Clear",
+                label="Background Noise"
+            )
+            reverb_dropdown = gr.Dropdown(
+                choices=["Close-Sounding", "Distant-Sounding"],
+                value="Close-Sounding",
+                label="Reverberation"
+            )
+            expressivity_dropdown = gr.Dropdown(
+                choices=["Expressive", "Slightly Expressive", "Monotone"],
+                value="Expressive",
+                label="Expressivity"
+            )
+            pitch_dropdown = gr.Dropdown(
+                choices=["High", "Low", "Balanced"],
+                value="Balanced",
+                label="Pitch"
+            )
+            rate_dropdown = gr.Dropdown(
+                choices=["Slow", "Moderate", "Fast"],
+                value="Moderate",
+                label="Speaking Rate"
+            )
+            quality_dropdown = gr.Dropdown(
+                choices=["Basic", "Refined"],
+                value="Refined",
+                label="Voice Quality"
            )
 
         # Textbox for text input
@@ -118,9 +139,9 @@ def app():
         with gr.Row():
             for sample in sample_inputs:
                 gr.Button(value=f"Use Sample: {sample}").click(
-                    fn=lambda x: x,
-                    inputs=[gr.Textbox(value=sample, visible=False)],
-                    outputs=text_input
+                    fn=lambda x: x,  # Return the sample text
+                    inputs=[gr.Textbox(value=sample, visible=False)],  # Pass sample as input
+                    outputs=text_input  # Update the text input
                 )
 
         with gr.Row():
@@ -140,12 +161,8 @@ def app():
             fn=generate_description,
             inputs=[
                 lang_dropdown, gender_dropdown, emotion_dropdown,
-                gr.Dropdown(choices=["Clear", "Slightly Noisy"], value="Clear", label="Background Noise"),
-                gr.Dropdown(choices=["Close-Sounding", "Distant-Sounding"], value="Close-Sounding", label="Reverberation"),
-                gr.Dropdown(choices=["Expressive", "Slightly Expressive", "Monotone"], value="Expressive", label="Expressivity"),
-                gr.Dropdown(choices=["High", "Low", "Balanced"], value="Balanced", label="Pitch"),
-                gr.Dropdown(choices=["Slow", "Moderate", "Fast"], value="Moderate", label="Speaking Rate"),
-                gr.Dropdown(choices=["Basic", "Refined"], value="Refined", label="Voice Quality")
+                noise_dropdown, reverb_dropdown, expressivity_dropdown,
+                pitch_dropdown, rate_dropdown, quality_dropdown
             ],
             outputs=caption_output
         )
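Note (not part of this commit): the last hunk wires nine dropdown values into generate_description, but that function's body lies outside these hunks. A minimal sketch of how such a caption could be assembled from the selections, with the wording, parameter names, and order assumed rather than taken from app.py:

# Hypothetical sketch only; the real generate_description in app.py is not shown in this diff.
def generate_description(language, gender, emotion, noise, reverb,
                         expressivity, pitch, rate, quality):
    # Compose a Parler-TTS style caption from the UI selections.
    return (
        f"A {gender.lower()} speaker delivers a {expressivity.lower()}, "
        f"{emotion.lower()} speech in {language} with {pitch.lower()} pitch "
        f"and a {rate.lower()} speaking rate. The recording is {noise.lower()} "
        f"and {reverb.lower()}, with {quality.lower()} voice quality."
    )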
 
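Likewise, generate_audio(text, description) appears here only in a hunk header. A rough sketch of the generation step, following the usage shown on the ai4bharat/indic-parler-tts model card and reusing the module-level model, tokenizer, description_tokenizer, soundfile, and tempfile already imported in app.py; treat it as an assumption about the app, not a quote of it:

# Hypothetical sketch of the inference step, not taken from this commit.
def generate_audio(text, description):
    desc = description_tokenizer(description, return_tensors="pt").to(model.device)
    prompt = tokenizer(text, return_tensors="pt").to(model.device)
    generation = model.generate(
        input_ids=desc.input_ids,
        attention_mask=desc.attention_mask,
        prompt_input_ids=prompt.input_ids,
        prompt_attention_mask=prompt.attention_mask,
    )
    audio = generation.cpu().numpy().squeeze()
    # Write a temporary WAV file that Gradio's Audio component can serve.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        sf.write(f.name, audio, model.config.sampling_rate)
        return f.name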