Spaces:
Running
on
Zero
Running
on
Zero
PuristanLabs1
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -6,31 +6,6 @@ from transformers import AutoTokenizer
|
|
6 |
import soundfile as sf
|
7 |
import tempfile
|
8 |
|
9 |
-
# Title and description for the Space
|
10 |
-
title = "Indic Parler-TTS: Multilingual Speech Synthesis"
|
11 |
-
description = """
|
12 |
-
Indic Parler-TTS lets you generate high-quality, natural-sounding speech in Urdu, Punjabi, and Sindhi.
|
13 |
-
<br/>
|
14 |
-
### How to Use:
|
15 |
-
1. **Select the Language**: Choose Urdu, Punjabi, or Sindhi.
|
16 |
-
2. **Enter Text**: Type your text in the input box or select from pre-defined samples.
|
17 |
-
3. **Customize Speech**: Adjust settings like speaker gender, emotion, pitch, and speaking rate.
|
18 |
-
4. **Generate Caption**: Click the 'Generate Caption' button to see a detailed description of the speech.
|
19 |
-
5. **Generate Speech**: Click the 'Generate Speech' button to produce the audio output.
|
20 |
-
<br/>
|
21 |
-
Enjoy generating expressive, multilingual speech!
|
22 |
-
"""
|
23 |
-
|
24 |
-
article = """
|
25 |
-
<div style='margin:20px auto;'>
|
26 |
-
<p>This demo uses the Indic Parler-TTS model, developed by AI4Bharat in collaboration with Hugging Face. See the
|
27 |
-
<a href="https://huggingface.co/ai4bharat/indic-parler-tts">model card</a> for more details.</p>
|
28 |
-
<p>For faster inference, duplicate this space and upgrade to GPU via the settings:</p>
|
29 |
-
<a href="https://huggingface.co/spaces/your-space-name?duplicate=true">
|
30 |
-
<img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
|
31 |
-
</div>
|
32 |
-
"""
|
33 |
-
|
34 |
# Load model and tokenizers at startup (on CPU initially)
|
35 |
print("Loading model and tokenizers...")
|
36 |
model = ParlerTTSForConditionalGeneration.from_pretrained("ai4bharat/indic-parler-tts").to("cpu")
|
@@ -38,6 +13,20 @@ tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-parler-tts")
|
|
38 |
description_tokenizer = AutoTokenizer.from_pretrained(model.config.text_encoder._name_or_path)
|
39 |
print("Model and tokenizers loaded.")
|
40 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
# Pre-defined sample inputs
|
42 |
sample_inputs = [
|
43 |
" آسٹریلوی قانون سازوں نے فیس بک، انسٹاگرام اور ایکس جیسی مشہور سماجی ویب سائٹس کے خلاف دنیا کے مشکل ترین کریک ڈاؤن کی منظوری دیتے ہوئے 16 سال سے کم عمر افراد کے لیے سوشل میڈیا پر پابندی کا تاریخی قانون منظور کرلیا۔ ",
|
@@ -86,25 +75,57 @@ def generate_audio(text, description):
|
|
86 |
|
87 |
# Gradio Interface
|
88 |
def app():
|
89 |
-
with gr.Blocks(
|
90 |
-
#
|
91 |
-
gr.Markdown(
|
92 |
|
93 |
with gr.Row():
|
94 |
lang_dropdown = gr.Dropdown(
|
95 |
-
choices=
|
96 |
-
value=
|
97 |
label="Select Language"
|
98 |
)
|
99 |
gender_dropdown = gr.Dropdown(
|
100 |
choices=["Male", "Female"],
|
101 |
-
value=
|
102 |
label="Speaker Gender"
|
103 |
)
|
104 |
emotion_dropdown = gr.Dropdown(
|
105 |
-
choices=
|
106 |
-
value=
|
107 |
-
label="Emotion"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
108 |
)
|
109 |
|
110 |
# Textbox for text input
|
@@ -118,9 +139,9 @@ def app():
|
|
118 |
with gr.Row():
|
119 |
for sample in sample_inputs:
|
120 |
gr.Button(value=f"Use Sample: {sample}").click(
|
121 |
-
fn=lambda x: x,
|
122 |
-
inputs=[gr.Textbox(value=sample, visible=False)],
|
123 |
-
outputs=text_input
|
124 |
)
|
125 |
|
126 |
with gr.Row():
|
@@ -140,12 +161,8 @@ def app():
|
|
140 |
fn=generate_description,
|
141 |
inputs=[
|
142 |
lang_dropdown, gender_dropdown, emotion_dropdown,
|
143 |
-
|
144 |
-
|
145 |
-
gr.Dropdown(choices=["Expressive", "Slightly Expressive", "Monotone"], value="Expressive", label="Expressivity"),
|
146 |
-
gr.Dropdown(choices=["High", "Low", "Balanced"], value="Balanced", label="Pitch"),
|
147 |
-
gr.Dropdown(choices=["Slow", "Moderate", "Fast"], value="Moderate", label="Speaking Rate"),
|
148 |
-
gr.Dropdown(choices=["Basic", "Refined"], value="Refined", label="Voice Quality")
|
149 |
],
|
150 |
outputs=caption_output
|
151 |
)
|
|
|
6 |
import soundfile as sf
|
7 |
import tempfile
|
8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
# Load model and tokenizers at startup (on CPU initially)
|
10 |
print("Loading model and tokenizers...")
|
11 |
model = ParlerTTSForConditionalGeneration.from_pretrained("ai4bharat/indic-parler-tts").to("cpu")
|
|
|
13 |
description_tokenizer = AutoTokenizer.from_pretrained(model.config.text_encoder._name_or_path)
|
14 |
print("Model and tokenizers loaded.")
|
15 |
|
16 |
+
# Supported languages and default settings
|
17 |
+
languages = {
|
18 |
+
"Urdu": "A female speaker delivers a clear and expressive speech in Urdu.",
|
19 |
+
"Punjabi": "A female speaker delivers a clear and expressive speech in Punjabi.",
|
20 |
+
"Sindhi": "A female speaker delivers a clear and expressive speech in Sindhi.",
|
21 |
+
}
|
22 |
+
emotions = [
|
23 |
+
"Neutral", "Happy", "Sad", "Anger", "Command", "Narration", "Conversation",
|
24 |
+
"Disgust", "Fear", "News", "Proper Noun", "Surprise"
|
25 |
+
]
|
26 |
+
default_language = "Urdu"
|
27 |
+
default_gender = "Female"
|
28 |
+
default_emotion = "Neutral"
|
29 |
+
|
30 |
# Pre-defined sample inputs
|
31 |
sample_inputs = [
|
32 |
" آسٹریلوی قانون سازوں نے فیس بک، انسٹاگرام اور ایکس جیسی مشہور سماجی ویب سائٹس کے خلاف دنیا کے مشکل ترین کریک ڈاؤن کی منظوری دیتے ہوئے 16 سال سے کم عمر افراد کے لیے سوشل میڈیا پر پابندی کا تاریخی قانون منظور کرلیا۔ ",
|
|
|
75 |
|
76 |
# Gradio Interface
|
77 |
def app():
|
78 |
+
with gr.Blocks() as demo:
|
79 |
+
gr.Markdown("# Indic Parler-TTS for Urdu, Punjabi, and Sindhi")
|
80 |
+
gr.Markdown("Select language, speaker gender, emotion, and customize speech characteristics.")
|
81 |
|
82 |
with gr.Row():
|
83 |
lang_dropdown = gr.Dropdown(
|
84 |
+
choices=list(languages.keys()),
|
85 |
+
value=default_language,
|
86 |
label="Select Language"
|
87 |
)
|
88 |
gender_dropdown = gr.Dropdown(
|
89 |
choices=["Male", "Female"],
|
90 |
+
value=default_gender,
|
91 |
label="Speaker Gender"
|
92 |
)
|
93 |
emotion_dropdown = gr.Dropdown(
|
94 |
+
choices=emotions,
|
95 |
+
value=default_emotion,
|
96 |
+
label="Select Emotion"
|
97 |
+
)
|
98 |
+
|
99 |
+
with gr.Row():
|
100 |
+
noise_dropdown = gr.Dropdown(
|
101 |
+
choices=["Clear", "Slightly Noisy"],
|
102 |
+
value="Clear",
|
103 |
+
label="Background Noise"
|
104 |
+
)
|
105 |
+
reverb_dropdown = gr.Dropdown(
|
106 |
+
choices=["Close-Sounding", "Distant-Sounding"],
|
107 |
+
value="Close-Sounding",
|
108 |
+
label="Reverberation"
|
109 |
+
)
|
110 |
+
expressivity_dropdown = gr.Dropdown(
|
111 |
+
choices=["Expressive", "Slightly Expressive", "Monotone"],
|
112 |
+
value="Expressive",
|
113 |
+
label="Expressivity"
|
114 |
+
)
|
115 |
+
pitch_dropdown = gr.Dropdown(
|
116 |
+
choices=["High", "Low", "Balanced"],
|
117 |
+
value="Balanced",
|
118 |
+
label="Pitch"
|
119 |
+
)
|
120 |
+
rate_dropdown = gr.Dropdown(
|
121 |
+
choices=["Slow", "Moderate", "Fast"],
|
122 |
+
value="Moderate",
|
123 |
+
label="Speaking Rate"
|
124 |
+
)
|
125 |
+
quality_dropdown = gr.Dropdown(
|
126 |
+
choices=["Basic", "Refined"],
|
127 |
+
value="Refined",
|
128 |
+
label="Voice Quality"
|
129 |
)
|
130 |
|
131 |
# Textbox for text input
|
|
|
139 |
with gr.Row():
|
140 |
for sample in sample_inputs:
|
141 |
gr.Button(value=f"Use Sample: {sample}").click(
|
142 |
+
fn=lambda x: x, # Return the sample text
|
143 |
+
inputs=[gr.Textbox(value=sample, visible=False)], # Pass sample as input
|
144 |
+
outputs=text_input # Update the text input
|
145 |
)
|
146 |
|
147 |
with gr.Row():
|
|
|
161 |
fn=generate_description,
|
162 |
inputs=[
|
163 |
lang_dropdown, gender_dropdown, emotion_dropdown,
|
164 |
+
noise_dropdown, reverb_dropdown, expressivity_dropdown,
|
165 |
+
pitch_dropdown, rate_dropdown, quality_dropdown
|
|
|
|
|
|
|
|
|
166 |
],
|
167 |
outputs=caption_output
|
168 |
)
|