Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -1,82 +1,78 @@
|
|
1 |
import spaces
|
2 |
import gradio as gr
|
3 |
import torch
|
|
|
4 |
from transformers.models.speecht5.number_normalizer import EnglishNumberNormalizer
|
5 |
from string import punctuation
|
6 |
import re
|
7 |
-
|
8 |
-
|
9 |
from parler_tts import ParlerTTSForConditionalGeneration
|
10 |
from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
|
11 |
|
|
|
12 |
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
13 |
|
|
|
|
|
|
|
|
|
14 |
|
15 |
-
|
16 |
-
|
17 |
model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
|
18 |
text_tokenizer = AutoTokenizer.from_pretrained(repo_id)
|
19 |
description_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
|
20 |
feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
|
21 |
|
22 |
-
|
23 |
SAMPLE_RATE = feature_extractor.sampling_rate
|
24 |
SEED = 42
|
25 |
|
26 |
default_text = "La voix humaine est un instrument de musique au-dessus de tous les autres."
|
27 |
-
default_description = "a woman with a slightly low-
|
28 |
examples = [
|
29 |
-
# French
|
30 |
[
|
31 |
"La voix humaine est un instrument de musique au-dessus de tous les autres.",
|
32 |
-
"a woman with a slightly low-
|
33 |
-
|
34 |
-
],
|
35 |
-
# Spanish
|
36 |
-
[
|
37 |
-
"La voz es el reflejo del alma en el espejo del tiempo.",
|
38 |
-
"a man with a moderate pitch voice speaks slowly with a slightly animated delivery in a very close- sounding environment with minimal background noise.",
|
39 |
-
None,
|
40 |
-
],
|
41 |
-
# Italian
|
42 |
-
[
|
43 |
-
"La voce umana è la più bella musica che esista al mondo.",
|
44 |
-
"a man with a moderate pitch speaks slowly in a very noisy environment that sounds very distant, delivering his words in a monotone manner.",
|
45 |
-
None,
|
46 |
-
],
|
47 |
-
# Portuguese
|
48 |
-
[
|
49 |
-
"A voz é o espelho da alma e o som do coração.",
|
50 |
-
"a man speaks slowly in a distant- sounding environment with a clean audio quality, delivering his message in a monotone voice at a moderate pitch. ",
|
51 |
-
None,
|
52 |
-
],
|
53 |
-
# Polish
|
54 |
-
[
|
55 |
-
"Głos ludzki jest najpiękniejszym instrumentem świata.",
|
56 |
-
"a man with a moderate pitch speaks in a monotone manner at a slightly slow pace, but the recording is quite noisy and sounds very distant.",
|
57 |
-
None,
|
58 |
-
],
|
59 |
-
# German
|
60 |
-
[
|
61 |
-
"Die menschliche Stimme ist das schönste Instrument der Welt.",
|
62 |
-
"a man with a moderate pitch speaks slowly in a noisy environment with a flat tone of voice, creating a slightly close- sounding effect.",
|
63 |
-
None,
|
64 |
-
],
|
65 |
-
# Dutch
|
66 |
-
[
|
67 |
-
"De menselijke stem is het mooiste instrument dat er bestaat.",
|
68 |
-
"a man with a moderate pitch speaks slightly slowly with an expressive and animated delivery in a very close- sounding environment with a bit of background noise.",
|
69 |
None,
|
70 |
],
|
71 |
-
# English
|
72 |
[
|
73 |
"The human voice is nature's most perfect instrument.",
|
74 |
-
"
|
|
|
75 |
None,
|
76 |
],
|
77 |
]
|
|
|
78 |
number_normalizer = EnglishNumberNormalizer()
|
79 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
80 |
def preprocess(text):
|
81 |
text = number_normalizer(text).strip()
|
82 |
text = text.replace("-", " ")
|
@@ -87,7 +83,6 @@ def preprocess(text):
|
|
87 |
|
88 |
def separate_abb(chunk):
|
89 |
chunk = chunk.replace(".","")
|
90 |
-
print(chunk)
|
91 |
return " ".join(chunk)
|
92 |
|
93 |
abbreviations = re.findall(abbreviations_pattern, text)
|
@@ -97,18 +92,22 @@ def preprocess(text):
|
|
97 |
return text
|
98 |
|
99 |
@spaces.GPU
|
100 |
-
def gen_tts(text, description):
|
101 |
-
|
|
|
102 |
prompt = text_tokenizer(preprocess(text), return_tensors="pt").to(device)
|
103 |
|
104 |
set_seed(SEED)
|
105 |
generation = model.generate(
|
106 |
-
input_ids=inputs.input_ids,
|
|
|
|
|
|
|
|
|
|
|
107 |
)
|
108 |
audio_arr = generation.cpu().numpy().squeeze()
|
109 |
-
|
110 |
-
return SAMPLE_RATE, audio_arr
|
111 |
-
|
112 |
|
113 |
css = """
|
114 |
#share-btn-container {
|
@@ -146,15 +145,12 @@ css = """
|
|
146 |
display: none !important;
|
147 |
}
|
148 |
"""
|
|
|
149 |
with gr.Blocks(css=css) as block:
|
150 |
gr.HTML(
|
151 |
"""
|
152 |
<div style="text-align: center; max-width: 700px; margin: 0 auto;">
|
153 |
-
<div
|
154 |
-
style="
|
155 |
-
display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;
|
156 |
-
"
|
157 |
-
>
|
158 |
<h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
|
159 |
Multi Parler-TTS 🗣️
|
160 |
</h1>
|
@@ -163,40 +159,59 @@ with gr.Blocks(css=css) as block:
|
|
163 |
"""
|
164 |
)
|
165 |
gr.HTML(
|
166 |
-
|
167 |
-
<p><a href="https://github.com/huggingface/parler-tts">Parler-TTS</a> is a training and inference library for
|
168 |
high-fidelity text-to-speech (TTS) models.</p>
|
169 |
-
<p>This multilingual model supports French, Spanish, Italian, Portuguese, Polish, German, Dutch, and English. It generates high-quality speech with features that can be controlled using a simple text prompt
|
170 |
-
|
171 |
-
<p>By default, Parler-TTS generates 🎲 random voice characteristics. To ensure 🎯 <b>speaker consistency</b> across generations, try to use consistent descriptions in your prompts.</p>
|
172 |
-
<p><b>Note:</b> you do not need to specify the nationality of the speaker in the description (do: "a male speaker", don't: "a french male speaker") </p>
|
173 |
-
"""
|
174 |
)
|
|
|
175 |
with gr.Row():
|
176 |
with gr.Column():
|
177 |
-
input_text = gr.Textbox(
|
178 |
-
|
179 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
180 |
with gr.Column():
|
181 |
-
audio_out = gr.Audio(label="Parler-TTS generation", type="numpy"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
182 |
|
183 |
-
inputs = [input_text, description]
|
184 |
-
outputs = [audio_out]
|
185 |
-
run_button.click(fn=gen_tts, inputs=inputs, outputs=outputs, queue=True)
|
186 |
-
gr.Examples(examples=examples, fn=gen_tts, inputs=inputs, outputs=outputs, cache_examples=True)
|
187 |
gr.HTML(
|
188 |
-
"""
|
189 |
-
<p>Tips for ensuring good generation:
|
190 |
<ul>
|
191 |
<li>Include the term "very clear audio" to generate the highest quality audio, and "very noisy audio" for high levels of background noise</li>
|
192 |
-
<li>Punctuation can be used to control the prosody of the generations
|
193 |
<li>The remaining speech features (gender, speaking rate, pitch and reverberation) can be controlled directly through the prompt</li>
|
194 |
</ul>
|
195 |
-
</p>
|
196 |
-
|
197 |
-
"""
|
198 |
)
|
199 |
|
200 |
-
|
201 |
block.queue()
|
202 |
block.launch(share=True)
|
|
|
1 |
import spaces
|
2 |
import gradio as gr
|
3 |
import torch
|
4 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
5 |
from transformers.models.speecht5.number_normalizer import EnglishNumberNormalizer
|
6 |
from string import punctuation
|
7 |
import re
|
|
|
|
|
8 |
from parler_tts import ParlerTTSForConditionalGeneration
|
9 |
from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
|
10 |
|
11 |
+
# Device setup
|
12 |
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
13 |
|
14 |
+
# SmolLM setup
|
15 |
+
checkpoint = "HuggingFaceTB/SmolLM-360M"
|
16 |
+
smol_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
|
17 |
+
smol_model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto", torch_dtype=torch.bfloat16)
|
18 |
|
19 |
+
# Original model setup
|
20 |
+
repo_id = "ylacombe/p-m-e"
|
21 |
model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
|
22 |
text_tokenizer = AutoTokenizer.from_pretrained(repo_id)
|
23 |
description_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
|
24 |
feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
|
25 |
|
|
|
26 |
SAMPLE_RATE = feature_extractor.sampling_rate
|
27 |
SEED = 42
|
28 |
|
29 |
default_text = "La voix humaine est un instrument de musique au-dessus de tous les autres."
|
30 |
+
default_description = "a woman with a slightly low-pitched voice speaks slowly in a clear and close-sounding environment, but her delivery is quite monotone."
|
31 |
examples = [
|
|
|
32 |
[
|
33 |
"La voix humaine est un instrument de musique au-dessus de tous les autres.",
|
34 |
+
"a woman with a slightly low-pitched voice speaks slowly in a clear and close-sounding environment, but her delivery is quite monotone.",
|
35 |
+
True,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
None,
|
37 |
],
|
|
|
38 |
[
|
39 |
"The human voice is nature's most perfect instrument.",
|
40 |
+
"A woman with a slightly low-pitched voice speaks slowly in a very distant-sounding environment with a clean audio quality, delivering her message in a very monotone manner.",
|
41 |
+
True,
|
42 |
None,
|
43 |
],
|
44 |
]
|
45 |
+
|
46 |
number_normalizer = EnglishNumberNormalizer()
|
47 |
|
48 |
+
def format_description(raw_description, do_format=True):
    """Rewrite a free-form voice description into the template Parler-TTS expects.

    Args:
        raw_description: The user-supplied description of the desired voice.
        do_format: When False, ``raw_description`` is returned untouched and
            the language model is never called.

    Returns:
        The reformatted description (first non-empty line generated by the
        SmolLM model). Returns ``""`` when formatting is requested for a blank
        or missing description, and falls back to the stripped raw description
        if the model produces no usable text.
    """
    if not do_format:
        return raw_description

    # Nothing to reformat: avoid asking the model to invent a description.
    if raw_description is None or not str(raw_description).strip():
        return ""

    prompt = (
        "Format this voice description to match exactly:\n"
        '"a [gender] with a [pitch] voice speaks [speed] in a [environment], [delivery style]"\n'
        "Where:\n"
        "- gender: man/woman\n"
        "- pitch: slightly low-pitched/moderate pitch/high-pitched\n"
        "- speed: slowly/moderately/quickly\n"
        "- environment: close-sounding and clear/distant-sounding and noisy\n"
        "- delivery style: with monotone delivery/with animated delivery\n"
        "\n"
        f"Description to format: {raw_description}\n"
        "Formatted description:"
    )

    # The model is loaded with device_map="auto", so follow the model's own
    # device instead of assuming it matches the global `device`.
    encoded = smol_tokenizer(prompt, return_tensors="pt").to(smol_model.device)
    prompt_length = encoded.input_ids.shape[-1]

    outputs = smol_model.generate(
        input_ids=encoded.input_ids,
        # pad_token_id is set to eos below, so the mask must be explicit.
        attention_mask=encoded.attention_mask,
        # Budget only the completion: max_length would also count the
        # (long) prompt and could leave no room for the answer.
        max_new_tokens=64,
        num_return_sequences=1,
        temperature=0.7,
        do_sample=True,
        pad_token_id=smol_tokenizer.eos_token_id,
    )

    # Decode only the newly generated tokens so that marker text inside the
    # user's description cannot confuse the extraction.
    completion = smol_tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    )

    # The model may keep writing after the answer; keep the first real line.
    for line in completion.splitlines():
        candidate = line.strip().strip('"').strip()
        if candidate:
            return candidate

    # Model produced nothing usable: fall back to what the user typed.
    return str(raw_description).strip()
|
75 |
+
|
76 |
def preprocess(text):
|
77 |
text = number_normalizer(text).strip()
|
78 |
text = text.replace("-", " ")
|
|
|
83 |
|
84 |
def separate_abb(chunk):
|
85 |
chunk = chunk.replace(".","")
|
|
|
86 |
return " ".join(chunk)
|
87 |
|
88 |
abbreviations = re.findall(abbreviations_pattern, text)
|
|
|
92 |
return text
|
93 |
|
94 |
@spaces.GPU
def gen_tts(text, description, do_format=True):
    """Synthesize speech for ``text`` in the voice described by ``description``.

    Args:
        text: The sentence to speak.
        description: Free-form description of the desired voice.
        do_format: Whether to pass ``description`` through
            ``format_description`` before conditioning the TTS model.

    Returns:
        A tuple ``(used_description, (SAMPLE_RATE, audio_array))`` where
        ``used_description`` is the exact string that was tokenized for the
        model and ``audio_array`` is the generated waveform as a NumPy array.

    Raises:
        gr.Error: If ``text`` is empty or only whitespace.
    """
    if text is None or not text.strip():
        # Surface a readable message in the UI instead of an opaque failure
        # further down the pipeline.
        raise gr.Error("Please enter some text to synthesize.")

    # Normalize once so the description shown to the user is exactly the one
    # fed to the model; `or ""` guards against a None description when
    # formatting is disabled.
    formatted_desc = (format_description(description, do_format) or "").strip()

    inputs = description_tokenizer(formatted_desc, return_tensors="pt").to(device)
    prompt = text_tokenizer(preprocess(text), return_tensors="pt").to(device)

    # Fixed seed keeps sampling reproducible across calls.
    set_seed(SEED)
    generation = model.generate(
        input_ids=inputs.input_ids,
        prompt_input_ids=prompt.input_ids,
        attention_mask=inputs.attention_mask,
        prompt_attention_mask=prompt.attention_mask,
        do_sample=True,
        temperature=1.0,
    )

    # Cast to float32 before leaving torch: NumPy cannot represent bfloat16,
    # and this is a no-op when the model already runs in float32.
    audio_arr = generation.float().cpu().numpy().squeeze()
    return formatted_desc, (SAMPLE_RATE, audio_arr)
|
|
|
|
|
111 |
|
112 |
css = """
|
113 |
#share-btn-container {
|
|
|
145 |
display: none !important;
|
146 |
}
|
147 |
"""
|
148 |
+
|
149 |
with gr.Blocks(css=css) as block:
|
150 |
gr.HTML(
|
151 |
"""
|
152 |
<div style="text-align: center; max-width: 700px; margin: 0 auto;">
|
153 |
+
<div style="display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;">
|
|
|
|
|
|
|
|
|
154 |
<h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
|
155 |
Multi Parler-TTS 🗣️
|
156 |
</h1>
|
|
|
159 |
"""
|
160 |
)
|
161 |
gr.HTML(
|
162 |
+
"""<p><a href="https://github.com/huggingface/parler-tts">Parler-TTS</a> is a training and inference library for
|
|
|
163 |
high-fidelity text-to-speech (TTS) models.</p>
|
164 |
+
<p>This multilingual model supports French, Spanish, Italian, Portuguese, Polish, German, Dutch, and English. It generates high-quality speech with features that can be controlled using a simple text prompt.</p>
|
165 |
+
<p>By default, Parler-TTS generates 🎲 random voice characteristics. To ensure 🎯 <b>speaker consistency</b> across generations, try to use consistent descriptions in your prompts.</p>"""
|
|
|
|
|
|
|
166 |
)
|
167 |
+
|
168 |
with gr.Row():
|
169 |
with gr.Column():
|
170 |
+
input_text = gr.Textbox(
|
171 |
+
label="Input Text",
|
172 |
+
lines=2,
|
173 |
+
value=default_text
|
174 |
+
)
|
175 |
+
raw_description = gr.Textbox(
|
176 |
+
label="Voice Description",
|
177 |
+
lines=2,
|
178 |
+
value=default_description
|
179 |
+
)
|
180 |
+
do_format = gr.Checkbox(
|
181 |
+
label="Reformat description using SmolLM",
|
182 |
+
value=True
|
183 |
+
)
|
184 |
+
formatted_description = gr.Textbox(
|
185 |
+
label="Used Description",
|
186 |
+
lines=2
|
187 |
+
)
|
188 |
+
generate_button = gr.Button("Generate Audio", variant="primary")
|
189 |
with gr.Column():
|
190 |
+
audio_out = gr.Audio(label="Parler-TTS generation", type="numpy")
|
191 |
+
|
192 |
+
generate_button.click(
|
193 |
+
fn=gen_tts,
|
194 |
+
inputs=[input_text, raw_description, do_format],
|
195 |
+
outputs=[formatted_description, audio_out]
|
196 |
+
)
|
197 |
+
|
198 |
+
gr.Examples(
|
199 |
+
examples=examples,
|
200 |
+
fn=gen_tts,
|
201 |
+
inputs=[input_text, raw_description, do_format],
|
202 |
+
outputs=[formatted_description, audio_out],
|
203 |
+
cache_examples=True
|
204 |
+
)
|
205 |
|
|
|
|
|
|
|
|
|
206 |
gr.HTML(
|
207 |
+
"""<p>Tips for ensuring good generation:
|
|
|
208 |
<ul>
|
209 |
<li>Include the term "very clear audio" to generate the highest quality audio, and "very noisy audio" for high levels of background noise</li>
|
210 |
+
<li>Punctuation can be used to control the prosody of the generations</li>
|
211 |
<li>The remaining speech features (gender, speaking rate, pitch and reverberation) can be controlled directly through the prompt</li>
|
212 |
</ul>
|
213 |
+
</p>"""
|
|
|
|
|
214 |
)
|
215 |
|
|
|
216 |
block.queue()
|
217 |
block.launch(share=True)
|