PuristanLabs1 commited on
Commit
c611518
·
verified ·
1 Parent(s): 994ee4a

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +149 -0
app.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from parler_tts import ParlerTTSForConditionalGeneration
4
+ from transformers import AutoTokenizer
5
+ import soundfile as sf
6
+ import tempfile
7
+
8
# Load the model and tokenizers
# Pick the first CUDA GPU when available; otherwise fall back to CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Indic Parler-TTS: TTS model conditioned on a free-text style description.
# NOTE(review): loading happens at import time and downloads weights on first
# run — expect a slow cold start.
model = ParlerTTSForConditionalGeneration.from_pretrained("ai4bharat/indic-parler-tts").to(device)
# Tokenizer for the text that will be spoken.
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-parler-tts")
# Separate tokenizer for the style description; it must match the vocabulary
# of the model's internal text encoder, hence the lookup via model.config.
description_tokenizer = AutoTokenizer.from_pretrained(model.config.text_encoder._name_or_path)

# Supported languages and default settings
# Keys populate the language dropdown; the sample-description values are not
# read anywhere visible in this file (descriptions are built dynamically by
# generate_description) — presumably kept as documentation/examples.
languages = {
    "Urdu": "A female speaker delivers a clear and expressive speech in Urdu.",
    "Punjabi": "A female speaker delivers a clear and expressive speech in Punjabi.",
    "Sindhi": "A female speaker delivers a clear and expressive speech in Sindhi.",
}
# Emotion choices offered in the UI; lower-cased verbatim into the description.
emotions = [
    "Neutral", "Happy", "Sad", "Anger", "Command", "Narration", "Conversation",
    "Disgust", "Fear", "News", "Proper Noun", "Surprise"
]
# Initial dropdown selections.
default_language = "Urdu"
default_gender = "Female"
default_emotion = "Neutral"
27
+
28
# Generate description function
def generate_description(language, gender, emotion, noise, reverb, expressivity, pitch, rate, quality):
    """Compose the natural-language style prompt consumed by Parler-TTS.

    All choice values except ``language`` are lower-cased so the sentence
    reads naturally; ``language`` keeps its capitalized display form.
    Returns the assembled description string.
    """
    sentences = (
        f"A {gender.lower()} speaker delivers a {emotion.lower()} and {expressivity.lower()} speech ",
        f"with a {pitch.lower()} pitch and a {rate.lower()} speaking rate. ",
        f"The audio has {noise.lower()} background noise, {reverb.lower()} reverberation, ",
        f"and {quality.lower()} voice quality. The text is in {language}.",
    )
    return "".join(sentences)
37
+
38
# Generate audio function
def generate_audio(text, description):
    """Synthesize speech for ``text`` styled by ``description``.

    Both strings are tokenized (the description with the text-encoder's
    tokenizer, the prompt with the model's own tokenizer), fed to
    ``model.generate``, and the resulting waveform is written to a temporary
    WAV file whose path is returned for Gradio's Audio component.
    """
    # Prepare model inputs on the same device as the model.
    input_ids = description_tokenizer(description, return_tensors="pt").input_ids.to(device)
    prompt_input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)

    # Inference only: disable autograd so generation does not build a graph,
    # cutting memory use on long utterances.
    with torch.no_grad():
        generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
    audio_arr = generation.cpu().numpy().squeeze()

    # delete=False is deliberate: Gradio reads the file after this function
    # returns. NOTE(review): files are never cleaned up — a long-running
    # deployment should periodically purge its temp directory.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        sf.write(f.name, audio_arr, model.config.sampling_rate)
        audio_path = f.name

    return audio_path
54
+
55
# Gradio Interface
def app():
    """Build and return the Gradio Blocks UI.

    Layout: style dropdowns (two rows), a text box, a caption generator
    (description preview/edit), and a speech generator wired to
    generate_description and generate_audio respectively.
    """
    with gr.Blocks() as demo:
        gr.Markdown("# Indic Parler-TTS for Urdu, Punjabi, and Sindhi")
        gr.Markdown("Select language, speaker gender, emotion, and customize speech characteristics.")

        # Row 1: core speaker identity — language, gender, emotion.
        with gr.Row():
            lang_dropdown = gr.Dropdown(
                choices=list(languages.keys()),
                value=default_language,
                label="Select Language"
            )
            gender_dropdown = gr.Dropdown(
                choices=["Male", "Female"],
                value=default_gender,
                label="Speaker Gender"
            )
            emotion_dropdown = gr.Dropdown(
                choices=emotions,
                value=default_emotion,
                label="Select Emotion"
            )

        # Row 2: acoustic/style attributes folded into the description text.
        with gr.Row():
            noise_dropdown = gr.Dropdown(
                choices=["Clear", "Slightly Noisy"],
                value="Clear",
                label="Background Noise"
            )
            reverb_dropdown = gr.Dropdown(
                choices=["Close-Sounding", "Distant-Sounding"],
                value="Close-Sounding",
                label="Reverberation"
            )
            expressivity_dropdown = gr.Dropdown(
                choices=["Expressive", "Slightly Expressive", "Monotone"],
                value="Expressive",
                label="Expressivity"
            )
            pitch_dropdown = gr.Dropdown(
                choices=["High", "Low", "Balanced"],
                value="Balanced",
                label="Pitch"
            )
            rate_dropdown = gr.Dropdown(
                choices=["Slow", "Moderate", "Fast"],
                value="Moderate",
                label="Speaking Rate"
            )
            quality_dropdown = gr.Dropdown(
                choices=["Basic", "Refined"],
                value="Refined",
                label="Voice Quality"
            )

        # Text to synthesize.
        with gr.Row():
            text_input = gr.Textbox(
                label="Enter Text",
                placeholder="Type your text here...",
                lines=5
            )

        # Caption step: the generated description is shown in an editable
        # textbox, so the user can tweak it before synthesis.
        with gr.Row():
            generate_caption_button = gr.Button("Generate Caption/Description")
            caption_output = gr.Textbox(
                label="Generated Caption/Description",
                placeholder="The generated caption will appear here...",
                lines=5
            )

        # Synthesis step.
        with gr.Row():
            generate_audio_button = gr.Button("Generate Speech")
            audio_output = gr.Audio(label="Generated Audio")

        # Link actions to buttons
        generate_caption_button.click(
            fn=generate_description,
            inputs=[
                lang_dropdown, gender_dropdown, emotion_dropdown,
                noise_dropdown, reverb_dropdown, expressivity_dropdown,
                pitch_dropdown, rate_dropdown, quality_dropdown
            ],
            outputs=caption_output
        )

        # Audio generation reads caption_output (possibly user-edited), not
        # the dropdowns directly.
        generate_audio_button.click(
            fn=generate_audio,
            inputs=[text_input, caption_output],
            outputs=audio_output
        )

    return demo
147
+
148
# Run the app
if __name__ == "__main__":
    # Guard so importing this module (e.g. from tests or another entry point)
    # does not start the web server as a side effect.
    app().launch()