PHBJT committed on
Commit
85185da
1 Parent(s): 364343c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -33
app.py CHANGED
@@ -12,7 +12,7 @@ from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
12
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
13
 
14
 
15
- repo_id = "PHBJT/parler_french_tts_mini_v0.1"
16
 
17
  model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
18
  tokenizer = AutoTokenizer.from_pretrained(repo_id)
@@ -22,36 +22,30 @@ feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
22
  SAMPLE_RATE = feature_extractor.sampling_rate
23
  SEED = 42
24
 
25
- default_text = "All of the data, pre-processing, training code, and weights are released publicly under a permissive license, enabling the community to build on our work and develop their own powerful models."
26
- default_description = "Laura's voice is monotone yet slightly fast in delivery, with a very close recording that almost has no background noise."
27
  examples = [
28
  [
29
- "This version introduces speaker consistency across generations, characterized by their name. For example, Jon, Lea, Gary, Jenna, Mike and Laura.",
30
- "Gary's voice is monotone yet slightly fast in delivery, with a very close recording that has no background noise.",
31
  None,
32
  ],
33
  [
34
- '''There's 34 speakers. To take advantage of this, simply adapt your text description to specify which speaker to use: "Mike speaks animatedly...".''',
35
- "Gary speaks slightly animatedly and slightly slowly in delivery, with a very close recording that has no background noise.",
36
- None
37
- ],
38
- [
39
- "'This is the best time of my life, Bartley,' she said happily.",
40
- "A female speaker delivers a slightly expressive and animated speech with a moderate speed. The recording features a low-pitch voice and slight background noise, creating a close-sounding audio experience.",
41
  None,
42
  ],
43
  [
44
- "Montrose also, after having experienced still more variety of good and bad fortune, threw down his arms, and retired out of the kingdom.",
45
- "A man voice speaks slightly slowly with very noisy background, carrying a low-pitch tone and displaying a touch of expressiveness and animation. The sound is very distant, adding an air of intrigue.",
46
- None
47
  ],
48
  [
49
- "Once upon a time, in the depth of winter, when the flakes of snow fell like feathers from the clouds, a queen sat sewing at her pal-ace window, which had a carved frame of black wood.",
50
- "In a very poor recording quality, a female speaker delivers her slightly expressive and animated words with a fast pace. There's high level of background noise and a very distant-sounding reverberation. Her voice is slightly higher pitched than average.",
51
  None,
52
  ],
53
  ]
54
-
55
  number_normalizer = EnglishNumberNormalizer()
56
 
57
  def preprocess(text):
@@ -133,7 +127,7 @@ with gr.Blocks(css=css) as block:
133
  "
134
  >
135
  <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
136
- Parler-TTS 🗣️
137
  </h1>
138
  </div>
139
  </div>
@@ -141,15 +135,14 @@ with gr.Blocks(css=css) as block:
141
  )
142
  gr.HTML(
143
  f"""
144
- <p><a href="https://github.com/huggingface/parler-tts"> Parler-TTS</a> is a training and inference library for
145
- high-fidelity text-to-speech (TTS) models.</p>
146
- <p>The models demonstrated here, Parler-TTS <a href="https://huggingface.co/parler-tts/parler-tts-mini-v1">Mini v1</a> and <a href="https://huggingface.co/parler-tts/parler-tts-large-v1">Large v1</a>,
147
- are trained using 45k hours of narrated English audiobooks. It generates high-quality speech
148
- with features that can be controlled using a simple text prompt (e.g. gender, background noise, speaking rate, pitch and reverberation).</p>
 
 
149
 
150
- <p>By default, Parler-TTS generates 🎲 random voice. To ensure 🎯 <b> speaker consistency </b> across generations, these checkpoints were also trained on 34 speakers, characterized by name (e.g. Jon, Lea, Gary, Jenna, Mike, Laura).</p>
151
-
152
- <p>To take advantage of this, simply adapt your text description to specify which speaker to use: `Jon's voice is monotone...`</p>
153
  """
154
  )
155
  with gr.Row():
@@ -173,13 +166,9 @@ with gr.Blocks(css=css) as block:
173
  <li>The remaining speech features (gender, speaking rate, pitch and reverberation) can be controlled directly through the prompt</li>
174
  </ul>
175
  </p>
176
-
177
- <p>Parler-TTS can be much faster. We give some tips on how to generate much more quickly in this <a href="https://github.com/huggingface/parler-tts/blob/main/INFERENCE.md"> inference guide</a>. Think SDPA, torch.compile, batching and streaming!</p>
178
 
179
- <p>If you want to find out more about how this model was trained and even fine-tune it yourself, check-out the
180
- <a href="https://github.com/huggingface/parler-tts"> Parler-TTS</a> repository on GitHub.</p>
181
-
182
- <p>The Parler-TTS codebase and its associated checkpoints are licensed under <a href='https://github.com/huggingface/parler-tts?tab=Apache-2.0-1-ov-file#readme'> Apache 2.0</a>.</p>
183
  """
184
  )
185
 
 
12
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
13
 
14
 
15
+ repo_id = "PHBJT/french_parler_tts_mini_v0.1"
16
 
17
  model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
18
  tokenizer = AutoTokenizer.from_pretrained(repo_id)
 
22
  SAMPLE_RATE = feature_extractor.sampling_rate
23
  SEED = 42
24
 
25
+ default_text = "La voix humaine est un instrument de musique au-dessus de tous les autres."
26
+ default_description = "The voice speaks slowly with a very noisy background, carrying a low-pitch tone and displaying a touch of expressiveness and animation. The sound is very distant, adding an air of intrigue."
27
  examples = [
28
  [
29
+ "La voix humaine est un instrument de musique au-dessus de tous les autres.",
30
+ "The voice speaks slowly with a very noisy background, carrying a low-pitch tone and displaying a touch of expressiveness and animation. The sound is very distant, adding an air of intrigue.",
31
  None,
32
  ],
33
  [
34
+ "Tout ce qu'un homme est capable d'imaginer, d'autres hommes seront capables de le réaliser.",
35
+ "A slightly expressive and animated speech with a moderate speed. The recording features a low-pitch voice and slight background noise, creating a close-sounding audio experience.",
 
 
 
 
 
36
  None,
37
  ],
38
  [
39
+ "La machine elle-même, si perfectionnée qu'on la suppose, n'est qu'un outil.",
40
+ "A monotone yet slightly fast delivery, with a very close recording that almost has no background noise.",
41
+ None,
42
  ],
43
  [
44
+ "Le progrès fait naître plus de besoins qu'il n'en satisfait.",
45
+ "In a very poor recording quality, the voice delivers slightly expressive and animated words with a fast pace. There's a high level of background noise and a very distant-sounding reverberation. The voice is slightly higher pitched than average.",
46
  None,
47
  ],
48
  ]
 
49
  number_normalizer = EnglishNumberNormalizer()
50
 
51
  def preprocess(text):
 
127
  "
128
  >
129
  <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
130
+ French Parler-TTS 🗣️
131
  </h1>
132
  </div>
133
  </div>
 
135
  )
136
  gr.HTML(
137
  f"""
138
+ <p><a href="https://github.com/huggingface/parler-tts">Parler-TTS</a> is a training and inference library for
139
+ high-fidelity text-to-speech (TTS) models.</p>
140
+ <p>The model demonstrated here, French Parler-TTS <a href="https://huggingface.co/PHBJT/french_parler_tts_mini_v0.1">Mini v0.1 French</a>,
141
+ has been fine-tuned on a French dataset. It generates high-quality male speech
142
+ with features that can be controlled using a simple text prompt (e.g. background noise, speaking rate, pitch and reverberation). Please note that this model currently supports only male voices (due to limitations on the dataset).</p>
143
+
144
+ <p>By default, Parler-TTS generates 🎲 random male voice characteristics. To ensure 🎯 <b>speaker consistency</b> across generations, try to use consistent descriptions in your prompts.</p>
145
 
 
 
 
146
  """
147
  )
148
  with gr.Row():
 
166
  <li>The remaining speech features (gender, speaking rate, pitch and reverberation) can be controlled directly through the prompt</li>
167
  </ul>
168
  </p>
 
 
169
 
170
+ <p>If you want to find out more about how this model was trained and even fine-tune Parler-TTS in any language, check out <a href="">this</a> post.</p>
171
+
 
 
172
  """
173
  )
174