Spaces: Running on Zero (commit: "non-native langs")
Files changed:
- app.py (+34 -8)
- styletts2importable.py (+26 -4)
app.py
CHANGED
@@ -47,21 +47,24 @@ for v in voicelist:
|
|
47 |
if not torch.cuda.is_available(): INTROTXT += "\n\n### You are on a CPU-only system, inference will be much slower.\n\nYou can use the [online demo](https://huggingface.co/spaces/styletts2/styletts2) for fast inference."
|
48 |
|
49 |
@spaces.GPU(duration=10)
|
50 |
-
def synthesize(text, voice, lngsteps, password, progress=gr.Progress()):
|
51 |
if text.strip() == "":
|
52 |
raise gr.Error("You must enter some text")
|
53 |
if len(text) > 50000:
|
54 |
raise gr.Error("Text must be <50k characters")
|
55 |
-
print("*** saying ***")
|
56 |
-
print(text)
|
57 |
-
print("*** end ***")
|
58 |
texts = txtsplit(text)
|
59 |
v = voice.lower()
|
60 |
audios = []
|
|
|
61 |
for t in progress.tqdm(texts):
|
62 |
print(t)
|
63 |
-
|
64 |
-
|
|
|
|
|
65 |
# def longsynthesize(text, voice, lngsteps, password, progress=gr.Progress()):
|
66 |
# if password == os.environ['ACCESS_CODE']:
|
67 |
# if text.strip() == "":
|
@@ -99,7 +102,6 @@ def clsynthesize(text, voice, vcsteps, embscale, alpha, beta, progress=gr.Progre
|
|
99 |
print("*** end ***")
|
100 |
texts = txtsplit(text)
|
101 |
audios = []
|
102 |
-
# vs = styletts2importable.compute_style(voice)
|
103 |
vs = styletts2importable.compute_style(voice)
|
104 |
# print(vs)
|
105 |
for t in progress.tqdm(texts):
|
@@ -135,12 +137,36 @@ with gr.Blocks() as vctk:
|
|
135 |
with gr.Column(scale=1):
|
136 |
inp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True)
|
137 |
voice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.", value='m-us-2', interactive=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
138 |
multispeakersteps = gr.Slider(minimum=3, maximum=15, value=3, step=1, label="Diffusion Steps", info="Theoretically, higher should be better quality but slower, but we cannot notice a difference. Try with lower steps first - it is faster", interactive=True)
|
139 |
# use_gruut = gr.Checkbox(label="Use alternate phonemizer (Gruut) - Experimental")
|
140 |
with gr.Column(scale=1):
|
141 |
btn = gr.Button("Synthesize", variant="primary")
|
142 |
audio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_progress_color': '#3C82F6'})
|
143 |
-
|
|
|
144 |
with gr.Blocks() as clone:
|
145 |
with gr.Row():
|
146 |
with gr.Column(scale=1):
|
|
|
47 |
if not torch.cuda.is_available(): INTROTXT += "\n\n### You are on a CPU-only system, inference will be much slower.\n\nYou can use the [online demo](https://huggingface.co/spaces/styletts2/styletts2) for fast inference."
|
48 |
|
49 |
@spaces.GPU(duration=10)
def synthesize(text, voice, lang, lngsteps, password=None, progress=gr.Progress()):
    """Split `text` into chunks and synthesize each one with StyleTTS 2.

    Parameters
    ----------
    text : str
        Text to read aloud; must be non-empty and under 50k characters.
    voice : str
        Name of a preset voice; lower-cased and used as a key into `voices`.
    lang : str
        Language code (e.g. 'en-us') forwarded to the phonemizer backend.
    lngsteps : int
        Diffusion steps per chunk.
    password : str | None
        Unused legacy parameter. BUG FIX: it previously had no default, but the
        Gradio handler `btn.click(..., inputs=[inp, voice, lang, multispeakersteps])`
        passes only four inputs, so every click raised a TypeError. Defaulting to
        None keeps old positional callers working and fixes the click handler.
    progress : gr.Progress
        Gradio progress tracker used to iterate the chunk list.

    Returns
    -------
    tuple
        ((24000, np.ndarray) — concatenated 24 kHz audio, str — concatenated IPA).

    Raises
    ------
    gr.Error
        If `text` is empty/whitespace or longer than 50k characters.
    """
    if text.strip() == "":
        raise gr.Error("You must enter some text")
    if len(text) > 50000:
        raise gr.Error("Text must be <50k characters")
    texts = txtsplit(text)
    v = voice.lower()
    audios = []
    ipa_results = ''
    for t in progress.tqdm(texts):
        print(t)
        audio, ipa = styletts2importable.inference(t, voices[v], lang=lang, alpha=0.3, beta=0.7, diffusion_steps=lngsteps, embedding_scale=1)
        audios.append(audio)
        ipa_results += ipa
    return (24000, np.concatenate(audios)), ipa_results
|
68 |
# def longsynthesize(text, voice, lngsteps, password, progress=gr.Progress()):
|
69 |
# if password == os.environ['ACCESS_CODE']:
|
70 |
# if text.strip() == "":
|
|
|
102 |
print("*** end ***")
|
103 |
texts = txtsplit(text)
|
104 |
audios = []
|
|
|
105 |
vs = styletts2importable.compute_style(voice)
|
106 |
# print(vs)
|
107 |
for t in progress.tqdm(texts):
|
|
|
137 |
with gr.Column(scale=1):
|
138 |
inp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True)
|
139 |
voice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.", value='m-us-2', interactive=True)
|
140 |
+
lang =gr.Dropdown(
|
141 |
+
[
|
142 |
+
['English', 'en-us'],
|
143 |
+
['Czech (Non-native)', 'cs'],
|
144 |
+
['Danish (Non-native)', 'da'],
|
145 |
+
['Dutch (Non-native)', 'nl'],
|
146 |
+
['Estonian (Non-native)', 'et'],
|
147 |
+
['Finnish (Non-native)', 'fi'],
|
148 |
+
['French (Non-native)', 'fr'],
|
149 |
+
['German (Non-native)', 'de'],
|
150 |
+
['Greek (Non-native)', 'el'],
|
151 |
+
['Italian (Non-native)', 'it'],
|
152 |
+
['Norwegian (Non-native)', 'no'],
|
153 |
+
['Polish (Non-native)', 'pl'],
|
154 |
+
['Portuguese (Non-native)', 'pt'],
|
155 |
+
['Russian (Non-native)', 'ru'],
|
156 |
+
['Slovene (Non-native)', 'sl'],
|
157 |
+
['Spanish (Non-native)', 'es'],
|
158 |
+
['Swedish (Non-native)', 'sv'],
|
159 |
+
['Turkish (Non-native)', 'tr'],
|
160 |
+
],
|
161 |
+
label="Language",
|
162 |
+
)
|
163 |
multispeakersteps = gr.Slider(minimum=3, maximum=15, value=3, step=1, label="Diffusion Steps", info="Theoretically, higher should be better quality but slower, but we cannot notice a difference. Try with lower steps first - it is faster", interactive=True)
|
164 |
# use_gruut = gr.Checkbox(label="Use alternate phonemizer (Gruut) - Experimental")
|
165 |
with gr.Column(scale=1):
|
166 |
btn = gr.Button("Synthesize", variant="primary")
|
167 |
audio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_progress_color': '#3C82F6'})
|
168 |
+
ipa_result = gr.Textbox(label="IPA", interactive=False)
|
169 |
+
btn.click(synthesize, inputs=[inp, voice, lang, multispeakersteps], outputs=[audio, ipa_result], concurrency_limit=4)
|
170 |
with gr.Blocks() as clone:
|
171 |
with gr.Row():
|
172 |
with gr.Column(scale=1):
|
styletts2importable.py
CHANGED
@@ -135,7 +135,28 @@ sampler = DiffusionSampler(
|
|
135 |
clamp=False
|
136 |
)
|
137 |
|
138 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
139 |
text = text.strip()
|
140 |
|
141 |
# search for IPA within []
|
@@ -148,8 +169,9 @@ def inference(text, ref_s, alpha = 0.3, beta = 0.7, diffusion_steps=5, embedding
|
|
148 |
if (ipa_sections is not None):
|
149 |
text = re.sub(regex, '[]', text, 0, re.MULTILINE)
|
150 |
|
151 |
-
|
152 |
-
ps =
|
|
|
153 |
ps = ' '.join(ps)
|
154 |
|
155 |
# add the IPA back
|
@@ -219,7 +241,7 @@ def inference(text, ref_s, alpha = 0.3, beta = 0.7, diffusion_steps=5, embedding
|
|
219 |
F0_pred, N_pred, ref.squeeze().unsqueeze(0))
|
220 |
|
221 |
|
222 |
-
return out.squeeze().cpu().numpy()[..., :-50] # weird pulse at the end of the model, need to be fixed later
|
223 |
|
224 |
def LFinference(text, s_prev, ref_s, alpha = 0.3, beta = 0.7, t = 0.7, diffusion_steps=5, embedding_scale=1, use_gruut=False):
|
225 |
text = text.strip()
|
|
|
135 |
clamp=False
|
136 |
)
|
137 |
|
138 |
+
# Map from espeak language codes (as offered in the UI dropdown) to the
# language names NLTK's word_tokenize expects; inference() looks up
# LANG_NAMES[lang] to tokenize the phonemized text consistently with the
# chosen phonemizer language.
_ESPEAK_CODES = (
    'en-us', 'cs', 'da', 'nl', 'et', 'fi', 'fr', 'de', 'el',
    'it', 'no', 'pl', 'pt', 'ru', 'sl', 'es', 'sv', 'tr',
)
_NLTK_LANGUAGES = (
    'english', 'czech', 'danish', 'dutch', 'estonian', 'finnish',
    'french', 'german', 'greek', 'italian', 'norwegian', 'polish',
    'portuguese', 'russian', 'slovene', 'spanish', 'swedish', 'turkish',
)
LANG_NAMES = dict(zip(_ESPEAK_CODES, _NLTK_LANGUAGES))
|
158 |
+
|
159 |
+
def inference(text, ref_s, lang='en-us', alpha = 0.3, beta = 0.7, diffusion_steps=5, embedding_scale=1, use_gruut=False):
|
160 |
text = text.strip()
|
161 |
|
162 |
# search for IPA within []
|
|
|
169 |
if (ipa_sections is not None):
|
170 |
text = re.sub(regex, '[]', text, 0, re.MULTILINE)
|
171 |
|
172 |
+
local_phonemizer = phonemizer.backend.EspeakBackend(language=lang, preserve_punctuation=True, with_stress=True)
|
173 |
+
ps = local_phonemizer.phonemize([text])
|
174 |
+
ps = word_tokenize(ps[0], language=LANG_NAMES[lang])
|
175 |
ps = ' '.join(ps)
|
176 |
|
177 |
# add the IPA back
|
|
|
241 |
F0_pred, N_pred, ref.squeeze().unsqueeze(0))
|
242 |
|
243 |
|
244 |
+
return out.squeeze().cpu().numpy()[..., :-50], ps # weird pulse at the end of the model, need to be fixed later
|
245 |
|
246 |
def LFinference(text, s_prev, ref_s, alpha = 0.3, beta = 0.7, t = 0.7, diffusion_steps=5, embedding_scale=1, use_gruut=False):
|
247 |
text = text.strip()
|