PHBJT committed on
Commit
d842ce0
1 Parent(s): a2e3b0a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +95 -80
app.py CHANGED
@@ -1,82 +1,78 @@
1
  import spaces
2
  import gradio as gr
3
  import torch
 
4
  from transformers.models.speecht5.number_normalizer import EnglishNumberNormalizer
5
  from string import punctuation
6
  import re
7
-
8
-
9
  from parler_tts import ParlerTTSForConditionalGeneration
10
  from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
11
 
 
12
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
13
 
 
 
 
 
14
 
15
- repo_id = "ylacombe/p-m-e"
16
-
17
  model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
18
  text_tokenizer = AutoTokenizer.from_pretrained(repo_id)
19
  description_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
20
  feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
21
 
22
-
23
  SAMPLE_RATE = feature_extractor.sampling_rate
24
  SEED = 42
25
 
26
  default_text = "La voix humaine est un instrument de musique au-dessus de tous les autres."
27
- default_description = "a woman with a slightly low- pitched voice speaks slowly in a clear and close- sounding environment, but her delivery is quite monotone."
28
  examples = [
29
- # French
30
  [
31
  "La voix humaine est un instrument de musique au-dessus de tous les autres.",
32
- "a woman with a slightly low- pitched voice speaks slowly in a clear and close- sounding environment, but her delivery is quite monotone.",
33
- None,
34
- ],
35
- # Spanish
36
- [
37
- "La voz es el reflejo del alma en el espejo del tiempo.",
38
- "a man with a moderate pitch voice speaks slowly with a slightly animated delivery in a very close- sounding environment with minimal background noise.",
39
- None,
40
- ],
41
- # Italian
42
- [
43
- "La voce umana è la più bella musica che esista al mondo.",
44
- "a man with a moderate pitch speaks slowly in a very noisy environment that sounds very distant, delivering his words in a monotone manner.",
45
- None,
46
- ],
47
- # Portuguese
48
- [
49
- "A voz é o espelho da alma e o som do coração.",
50
- "a man speaks slowly in a distant- sounding environment with a clean audio quality, delivering his message in a monotone voice at a moderate pitch. ",
51
- None,
52
- ],
53
- # Polish
54
- [
55
- "Głos ludzki jest najpiękniejszym instrumentem świata.",
56
- "a man with a moderate pitch speaks in a monotone manner at a slightly slow pace, but the recording is quite noisy and sounds very distant.",
57
- None,
58
- ],
59
- # German
60
- [
61
- "Die menschliche Stimme ist das schönste Instrument der Welt.",
62
- "a man with a moderate pitch speaks slowly in a noisy environment with a flat tone of voice, creating a slightly close- sounding effect.",
63
- None,
64
- ],
65
- # Dutch
66
- [
67
- "De menselijke stem is het mooiste instrument dat er bestaat.",
68
- "a man with a moderate pitch speaks slightly slowly with an expressive and animated delivery in a very close- sounding environment with a bit of background noise.",
69
  None,
70
  ],
71
- # English
72
  [
73
  "The human voice is nature's most perfect instrument.",
74
- "Aa woman with a slightly low- pitched voice speaks slowly in a very distant- sounding environment with a clean audio quality, delivering her message in a very monotone manner.",
 
75
  None,
76
  ],
77
  ]
 
78
  number_normalizer = EnglishNumberNormalizer()
79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  def preprocess(text):
81
  text = number_normalizer(text).strip()
82
  text = text.replace("-", " ")
@@ -87,7 +83,6 @@ def preprocess(text):
87
 
88
  def separate_abb(chunk):
89
  chunk = chunk.replace(".","")
90
- print(chunk)
91
  return " ".join(chunk)
92
 
93
  abbreviations = re.findall(abbreviations_pattern, text)
@@ -97,18 +92,22 @@ def preprocess(text):
97
  return text
98
 
99
  @spaces.GPU
100
- def gen_tts(text, description):
101
- inputs = description_tokenizer(description.strip(), return_tensors="pt").to(device)
 
102
  prompt = text_tokenizer(preprocess(text), return_tensors="pt").to(device)
103
 
104
  set_seed(SEED)
105
  generation = model.generate(
106
- input_ids=inputs.input_ids, prompt_input_ids=prompt.input_ids, attention_mask=inputs.attention_mask, prompt_attention_mask=prompt.attention_mask, do_sample=True, temperature=1.0
 
 
 
 
 
107
  )
108
  audio_arr = generation.cpu().numpy().squeeze()
109
-
110
- return SAMPLE_RATE, audio_arr
111
-
112
 
113
  css = """
114
  #share-btn-container {
@@ -146,15 +145,12 @@ css = """
146
  display: none !important;
147
  }
148
  """
 
149
  with gr.Blocks(css=css) as block:
150
  gr.HTML(
151
  """
152
  <div style="text-align: center; max-width: 700px; margin: 0 auto;">
153
- <div
154
- style="
155
- display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;
156
- "
157
- >
158
  <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
159
  Multi Parler-TTS 🗣️
160
  </h1>
@@ -163,40 +159,59 @@ with gr.Blocks(css=css) as block:
163
  """
164
  )
165
  gr.HTML(
166
- f"""
167
- <p><a href="https://github.com/huggingface/parler-tts">Parler-TTS</a> is a training and inference library for
168
  high-fidelity text-to-speech (TTS) models.</p>
169
- <p>This multilingual model supports French, Spanish, Italian, Portuguese, Polish, German, Dutch, and English. It generates high-quality speech with features that can be controlled using a simple text prompt (e.g. gender, background noise, speaking rate, pitch and reverberation). </p>
170
-
171
- <p>By default, Parler-TTS generates 🎲 random voice characteristics. To ensure 🎯 <b>speaker consistency</b> across generations, try to use consistent descriptions in your prompts.</p>
172
- <p><b>Note:</b> you do not need to specify the nationality of the speaker in the description (do: "a male speaker", don't: "a french male speaker") </p>
173
- """
174
  )
 
175
  with gr.Row():
176
  with gr.Column():
177
- input_text = gr.Textbox(label="Input Text", lines=2, value=default_text, elem_id="input_text")
178
- description = gr.Textbox(label="Description", lines=2, value=default_description, elem_id="input_description")
179
- run_button = gr.Button("Generate Audio", variant="primary")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  with gr.Column():
181
- audio_out = gr.Audio(label="Parler-TTS generation", type="numpy", elem_id="audio_out")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
 
183
- inputs = [input_text, description]
184
- outputs = [audio_out]
185
- run_button.click(fn=gen_tts, inputs=inputs, outputs=outputs, queue=True)
186
- gr.Examples(examples=examples, fn=gen_tts, inputs=inputs, outputs=outputs, cache_examples=True)
187
  gr.HTML(
188
- """
189
- <p>Tips for ensuring good generation:
190
  <ul>
191
  <li>Include the term "very clear audio" to generate the highest quality audio, and "very noisy audio" for high levels of background noise</li>
192
- <li>Punctuation can be used to control the prosody of the generations, e.g. use commas to add small breaks in speech</li>
193
  <li>The remaining speech features (gender, speaking rate, pitch and reverberation) can be controlled directly through the prompt</li>
194
  </ul>
195
- </p>
196
-
197
- """
198
  )
199
 
200
-
201
  block.queue()
202
  block.launch(share=True)
 
1
  import spaces
2
  import gradio as gr
3
  import torch
4
+ from transformers import AutoModelForCausalLM, AutoTokenizer
5
  from transformers.models.speecht5.number_normalizer import EnglishNumberNormalizer
6
  from string import punctuation
7
  import re
 
 
8
  from parler_tts import ParlerTTSForConditionalGeneration
9
  from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
10
 
11
+ # Device setup
12
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
13
 
14
+ # SmolLM setup
15
+ checkpoint = "HuggingFaceTB/SmolLM-360M"
16
+ smol_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
17
+ smol_model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto", torch_dtype=torch.bfloat16)
18
 
19
+ # Original model setup
20
+ repo_id = "ylacombe/p-m-e"
21
  model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
22
  text_tokenizer = AutoTokenizer.from_pretrained(repo_id)
23
  description_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
24
  feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
25
 
 
26
  SAMPLE_RATE = feature_extractor.sampling_rate
27
  SEED = 42
28
 
29
  default_text = "La voix humaine est un instrument de musique au-dessus de tous les autres."
30
+ default_description = "a woman with a slightly low-pitched voice speaks slowly in a clear and close-sounding environment, but her delivery is quite monotone."
31
  examples = [
 
32
  [
33
  "La voix humaine est un instrument de musique au-dessus de tous les autres.",
34
+ "a woman with a slightly low-pitched voice speaks slowly in a clear and close-sounding environment, but her delivery is quite monotone.",
35
+ True,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  None,
37
  ],
 
38
  [
39
  "The human voice is nature's most perfect instrument.",
40
+ "A woman with a slightly low-pitched voice speaks slowly in a very distant-sounding environment with a clean audio quality, delivering her message in a very monotone manner.",
41
+ True,
42
  None,
43
  ],
44
  ]
45
+
46
  number_normalizer = EnglishNumberNormalizer()
47
 
48
def _first_description_line(completion):
    """Return the first non-blank line of ``completion``, minus wrapping double quotes."""
    for line in completion.splitlines():
        candidate = line.strip().strip('"').strip()
        if candidate:
            return candidate
    return ""


def format_description(raw_description, do_format=True):
    """Rewrite a free-form voice description into the fixed template via SmolLM.

    Args:
        raw_description: Voice description text as typed by the user.
        do_format: When false, the description is returned untouched.

    Returns:
        The reformatted description. ``raw_description`` is returned
        unchanged when formatting is disabled, when the description is
        blank, or when SmolLM produces no usable text.
    """
    # Nothing to reformat: skip the LLM call entirely.
    if not do_format or not (raw_description or "").strip():
        return raw_description

    prompt = f"""Format this voice description to match exactly:
"a [gender] with a [pitch] voice speaks [speed] in a [environment], [delivery style]"
Where:
- gender: man/woman
- pitch: slightly low-pitched/moderate pitch/high-pitched
- speed: slowly/moderately/quickly
- environment: close-sounding and clear/distant-sounding and noisy
- delivery style: with monotone delivery/with animated delivery

Description to format: {raw_description}
Formatted description:"""

    # The model is placed by device_map="auto", so send the inputs to the
    # device the model actually lives on rather than the global `device`.
    encoded = smol_tokenizer(prompt, return_tensors="pt").to(smol_model.device)
    outputs = smol_model.generate(
        **encoded,  # input_ids plus attention_mask
        # max_length would also count the prompt tokens, leaving little or no
        # budget for long descriptions; bound only the newly generated part.
        max_new_tokens=64,
        num_return_sequences=1,
        temperature=0.7,
        do_sample=True,
        pad_token_id=smol_tokenizer.eos_token_id,
    )

    # Decode only what was generated after the prompt, so text inside the
    # user's description can never be mistaken for the answer.
    prompt_len = encoded["input_ids"].shape[-1]
    completion = smol_tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

    # A base LM keeps writing after the answer; keep the first line only and
    # fall back to the user's text if nothing usable came out.
    return _first_description_line(completion) or raw_description
75
+
76
  def preprocess(text):
77
  text = number_normalizer(text).strip()
78
  text = text.replace("-", " ")
 
83
 
84
  def separate_abb(chunk):
85
  chunk = chunk.replace(".","")
 
86
  return " ".join(chunk)
87
 
88
  abbreviations = re.findall(abbreviations_pattern, text)
 
92
  return text
93
 
94
@spaces.GPU
def gen_tts(text, description, do_format=True):
    """Generate speech for ``text`` using the voice described by ``description``.

    The description is first passed through ``format_description`` (which
    returns it unchanged when ``do_format`` is false), then both the
    description and the preprocessed text are tokenized and handed to the
    Parler-TTS model.

    Returns:
        A pair ``(formatted_desc, (SAMPLE_RATE, audio_arr))``: the description
        that was actually used, and the sample rate with the generated audio
        as a NumPy array.
    """
    formatted_desc = format_description(description, do_format)

    description_batch = description_tokenizer(
        formatted_desc.strip(), return_tensors="pt"
    ).to(device)
    text_batch = text_tokenizer(preprocess(text), return_tensors="pt").to(device)

    # Re-seed before every sampled generation (presumably so that identical
    # inputs give identical audio).
    set_seed(SEED)

    generate_kwargs = {
        "input_ids": description_batch.input_ids,
        "prompt_input_ids": text_batch.input_ids,
        "attention_mask": description_batch.attention_mask,
        "prompt_attention_mask": text_batch.attention_mask,
        "do_sample": True,
        "temperature": 1.0,
    }
    waveform = model.generate(**generate_kwargs)

    audio_arr = waveform.cpu().numpy().squeeze()
    return formatted_desc, (SAMPLE_RATE, audio_arr)
 
 
111
 
112
  css = """
113
  #share-btn-container {
 
145
  display: none !important;
146
  }
147
  """
148
+
149
  with gr.Blocks(css=css) as block:
150
  gr.HTML(
151
  """
152
  <div style="text-align: center; max-width: 700px; margin: 0 auto;">
153
+ <div style="display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;">
 
 
 
 
154
  <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
155
  Multi Parler-TTS 🗣️
156
  </h1>
 
159
  """
160
  )
161
  gr.HTML(
162
+ """<p><a href="https://github.com/huggingface/parler-tts">Parler-TTS</a> is a training and inference library for
 
163
  high-fidelity text-to-speech (TTS) models.</p>
164
+ <p>This multilingual model supports French, Spanish, Italian, Portuguese, Polish, German, Dutch, and English. It generates high-quality speech with features that can be controlled using a simple text prompt.</p>
165
+ <p>By default, Parler-TTS generates 🎲 random voice characteristics. To ensure 🎯 <b>speaker consistency</b> across generations, try to use consistent descriptions in your prompts.</p>"""
 
 
 
166
  )
167
+
168
  with gr.Row():
169
  with gr.Column():
170
+ input_text = gr.Textbox(
171
+ label="Input Text",
172
+ lines=2,
173
+ value=default_text
174
+ )
175
+ raw_description = gr.Textbox(
176
+ label="Voice Description",
177
+ lines=2,
178
+ value=default_description
179
+ )
180
+ do_format = gr.Checkbox(
181
+ label="Reformat description using SmolLM",
182
+ value=True
183
+ )
184
+ formatted_description = gr.Textbox(
185
+ label="Used Description",
186
+ lines=2
187
+ )
188
+ generate_button = gr.Button("Generate Audio", variant="primary")
189
  with gr.Column():
190
+ audio_out = gr.Audio(label="Parler-TTS generation", type="numpy")
191
+
192
+ generate_button.click(
193
+ fn=gen_tts,
194
+ inputs=[input_text, raw_description, do_format],
195
+ outputs=[formatted_description, audio_out]
196
+ )
197
+
198
+ gr.Examples(
199
+ examples=examples,
200
+ fn=gen_tts,
201
+ inputs=[input_text, raw_description, do_format],
202
+ outputs=[formatted_description, audio_out],
203
+ cache_examples=True
204
+ )
205
 
 
 
 
 
206
  gr.HTML(
207
+ """<p>Tips for ensuring good generation:
 
208
  <ul>
209
  <li>Include the term "very clear audio" to generate the highest quality audio, and "very noisy audio" for high levels of background noise</li>
210
+ <li>Punctuation can be used to control the prosody of the generations</li>
211
  <li>The remaining speech features (gender, speaking rate, pitch and reverberation) can be controlled directly through the prompt</li>
212
  </ul>
213
+ </p>"""
 
 
214
  )
215
 
 
216
  block.queue()
217
  block.launch(share=True)