Spaces:
Runtime error
Runtime error
divakaivan
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -6,12 +6,12 @@ import torch
|
|
6 |
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
|
7 |
|
8 |
|
9 |
-
checkpoint = "
|
10 |
processor = SpeechT5Processor.from_pretrained(checkpoint)
|
11 |
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
|
12 |
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
|
13 |
|
14 |
-
|
15 |
speaker_embeddings = {
|
16 |
"BDL": "spkemb/cmu_us_bdl_arctic-wav-arctic_a0009.npy",
|
17 |
"CLB": "spkemb/cmu_us_clb_arctic-wav-arctic_a0144.npy",
|
@@ -62,14 +62,10 @@ title = "SpeechT5: Speech Synthesis"
|
|
62 |
description = """
|
63 |
The <b>SpeechT5</b> model is pre-trained on text as well as speech inputs, with targets that are also a mix of text and speech.
|
64 |
By pre-training on text and speech at the same time, it learns unified representations for both, resulting in improved modeling capabilities.
|
65 |
-
|
66 |
SpeechT5 can be fine-tuned for different speech tasks. This space demonstrates the <b>text-to-speech</b> (TTS) checkpoint for the English language.
|
67 |
-
|
68 |
See also the <a href="https://huggingface.co/spaces/Matthijs/speecht5-asr-demo">speech recognition (ASR) demo</a>
|
69 |
and the <a href="https://huggingface.co/spaces/Matthijs/speecht5-vc-demo">voice conversion demo</a>.
|
70 |
-
|
71 |
Refer to <a href="https://colab.research.google.com/drive/1i7I5pzBcU3WDFarDnzweIj4-sVVoIUFJ">this Colab notebook</a> to learn how to fine-tune the SpeechT5 TTS model on your own dataset or language.
|
72 |
-
|
73 |
<b>How to use:</b> Enter some English text and choose a speaker. The output is a mel spectrogram, which is converted to a mono 16 kHz waveform by the
|
74 |
HiFi-GAN vocoder. Because the model always applies random dropout, each attempt will give slightly different results.
|
75 |
The <em>Surprise Me!</em> option creates a completely randomized speaker.
|
@@ -77,11 +73,9 @@ The <em>Surprise Me!</em> option creates a completely randomized speaker.
|
|
77 |
|
78 |
article = """
|
79 |
<div style='margin:20px auto;'>
|
80 |
-
|
81 |
<p>References: <a href="https://arxiv.org/abs/2110.07205">SpeechT5 paper</a> |
|
82 |
<a href="https://github.com/microsoft/SpeechT5/">original GitHub</a> |
|
83 |
<a href="https://huggingface.co/mechanicalsea/speecht5-tts">original weights</a></p>
|
84 |
-
|
85 |
<pre>
|
86 |
@article{Ao2021SpeechT5,
|
87 |
title = {SpeechT5: Unified-Modal Encoder-Decoder Pre-training for Spoken Language Processing},
|
@@ -92,9 +86,7 @@ article = """
|
|
92 |
year={2021}
|
93 |
}
|
94 |
</pre>
|
95 |
-
|
96 |
<p>Speaker embeddings were generated from <a href="http://www.festvox.org/cmu_arctic/">CMU ARCTIC</a> using <a href="https://huggingface.co/mechanicalsea/speecht5-vc/blob/main/manifest/utils/prep_cmu_arctic_spkemb.py">this script</a>.</p>
|
97 |
-
|
98 |
</div>
|
99 |
"""
|
100 |
|
@@ -111,6 +103,15 @@ gr.Interface(
|
|
111 |
fn=predict,
|
112 |
inputs=[
|
113 |
gr.Text(label="Input Text"),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
114 |
],
|
115 |
outputs=[
|
116 |
gr.Audio(label="Generated Speech", type="numpy"),
|
|
|
6 |
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
|
7 |
|
8 |
|
9 |
+
checkpoint = "microsoft/speecht5_tts"
|
10 |
processor = SpeechT5Processor.from_pretrained(checkpoint)
|
11 |
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
|
12 |
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
|
13 |
|
14 |
+
|
15 |
speaker_embeddings = {
|
16 |
"BDL": "spkemb/cmu_us_bdl_arctic-wav-arctic_a0009.npy",
|
17 |
"CLB": "spkemb/cmu_us_clb_arctic-wav-arctic_a0144.npy",
|
|
|
62 |
description = """
|
63 |
The <b>SpeechT5</b> model is pre-trained on text as well as speech inputs, with targets that are also a mix of text and speech.
|
64 |
By pre-training on text and speech at the same time, it learns unified representations for both, resulting in improved modeling capabilities.
|
|
|
65 |
SpeechT5 can be fine-tuned for different speech tasks. This space demonstrates the <b>text-to-speech</b> (TTS) checkpoint for the English language.
|
|
|
66 |
See also the <a href="https://huggingface.co/spaces/Matthijs/speecht5-asr-demo">speech recognition (ASR) demo</a>
|
67 |
and the <a href="https://huggingface.co/spaces/Matthijs/speecht5-vc-demo">voice conversion demo</a>.
|
|
|
68 |
Refer to <a href="https://colab.research.google.com/drive/1i7I5pzBcU3WDFarDnzweIj4-sVVoIUFJ">this Colab notebook</a> to learn how to fine-tune the SpeechT5 TTS model on your own dataset or language.
|
|
|
69 |
<b>How to use:</b> Enter some English text and choose a speaker. The output is a mel spectrogram, which is converted to a mono 16 kHz waveform by the
|
70 |
HiFi-GAN vocoder. Because the model always applies random dropout, each attempt will give slightly different results.
|
71 |
The <em>Surprise Me!</em> option creates a completely randomized speaker.
|
|
|
73 |
|
74 |
article = """
|
75 |
<div style='margin:20px auto;'>
|
|
|
76 |
<p>References: <a href="https://arxiv.org/abs/2110.07205">SpeechT5 paper</a> |
|
77 |
<a href="https://github.com/microsoft/SpeechT5/">original GitHub</a> |
|
78 |
<a href="https://huggingface.co/mechanicalsea/speecht5-tts">original weights</a></p>
|
|
|
79 |
<pre>
|
80 |
@article{Ao2021SpeechT5,
|
81 |
title = {SpeechT5: Unified-Modal Encoder-Decoder Pre-training for Spoken Language Processing},
|
|
|
86 |
year={2021}
|
87 |
}
|
88 |
</pre>
|
|
|
89 |
<p>Speaker embeddings were generated from <a href="http://www.festvox.org/cmu_arctic/">CMU ARCTIC</a> using <a href="https://huggingface.co/mechanicalsea/speecht5-vc/blob/main/manifest/utils/prep_cmu_arctic_spkemb.py">this script</a>.</p>
|
|
|
90 |
</div>
|
91 |
"""
|
92 |
|
|
|
103 |
fn=predict,
|
104 |
inputs=[
|
105 |
gr.Text(label="Input Text"),
|
106 |
+
gr.Radio(label="Speaker", choices=[
|
107 |
+
"BDL (male)",
|
108 |
+
"CLB (female)",
|
109 |
+
"KSP (male)",
|
110 |
+
"RMS (male)",
|
111 |
+
"SLT (female)",
|
112 |
+
"Surprise Me!"
|
113 |
+
],
|
114 |
+
value="BDL (male)"),
|
115 |
],
|
116 |
outputs=[
|
117 |
gr.Audio(label="Generated Speech", type="numpy"),
|