Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Update app.py
Browse files
app.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
import json
|
|
|
2 |
import librosa
|
3 |
import numpy as np
|
4 |
import torch
|
@@ -7,14 +8,14 @@ import commons
|
|
7 |
import utils
|
8 |
import gradio as gr
|
9 |
from models import SynthesizerTrn
|
10 |
-
from text import text_to_sequence
|
11 |
from mel_processing import spectrogram_torch
|
12 |
|
13 |
-
limitation =
|
14 |
|
15 |
|
16 |
-
def get_text(text, hps):
|
17 |
-
text_norm = text_to_sequence(text, hps.symbols, hps.data.text_cleaners)
|
18 |
if hps.data.add_blank:
|
19 |
text_norm = commons.intersperse(text_norm, 0)
|
20 |
text_norm = LongTensor(text_norm)
|
@@ -22,11 +23,11 @@ def get_text(text, hps):
|
|
22 |
|
23 |
|
24 |
def create_tts_fn(model, hps, speaker_ids):
|
25 |
-
def tts_fn(text, speaker, speed):
|
26 |
-
if limitation and len(text) > 60:
|
27 |
return "Error: Text is too long", None
|
28 |
speaker_id = speaker_ids[speaker]
|
29 |
-
stn_tst = get_text(text, hps)
|
30 |
with no_grad():
|
31 |
x_tst = stn_tst.unsqueeze(0)
|
32 |
x_tst_lengths = LongTensor([stn_tst.size(0)])
|
@@ -72,6 +73,24 @@ def create_vc_fn(model, hps, speaker_ids):
|
|
72 |
return vc_fn
|
73 |
|
74 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
if __name__ == '__main__':
|
76 |
models = []
|
77 |
with open("saved_model/names.json", "r", encoding="utf-8") as f:
|
@@ -92,10 +111,10 @@ if __name__ == '__main__':
|
|
92 |
speaker_ids = [sid for sid, name in enumerate(hps.speakers) if name != "None"]
|
93 |
speakers = [name for sid, name in enumerate(hps.speakers) if name != "None"]
|
94 |
|
95 |
-
models.append((models_name, cover_path, speakers,
|
96 |
create_tts_fn(model, hps, speaker_ids), create_vc_fn(model, hps, speaker_ids)))
|
97 |
|
98 |
-
app = gr.Blocks()
|
99 |
|
100 |
with app:
|
101 |
gr.Markdown("# Moe Japanese TTS And Voice Conversion Using VITS Model\n\n"
|
@@ -107,7 +126,7 @@ if __name__ == '__main__':
|
|
107 |
with gr.Tabs():
|
108 |
with gr.TabItem("TTS"):
|
109 |
with gr.Tabs():
|
110 |
-
for i, (model_name, cover_path, speakers, tts_fn, vc_fn) in enumerate(models):
|
111 |
with gr.TabItem(f"model{i}"):
|
112 |
with gr.Column():
|
113 |
gr.Markdown(f"## {model_name}\n\n"
|
@@ -116,14 +135,31 @@ if __name__ == '__main__':
|
|
116 |
tts_input2 = gr.Dropdown(label="Speaker", choices=speakers,
|
117 |
type="index", value=speakers[0])
|
118 |
tts_input3 = gr.Slider(label="Speed", value=1, minimum=0.5, maximum=2, step=0.1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
119 |
tts_submit = gr.Button("Generate", variant="primary")
|
120 |
tts_output1 = gr.Textbox(label="Output Message")
|
121 |
tts_output2 = gr.Audio(label="Output Audio")
|
122 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
123 |
[tts_output1, tts_output2])
|
|
|
|
|
|
|
124 |
with gr.TabItem("Voice Conversion"):
|
125 |
with gr.Tabs():
|
126 |
-
for i, (model_name, cover_path, speakers, tts_fn, vc_fn) in enumerate(models):
|
127 |
with gr.TabItem(f"model{i}"):
|
128 |
gr.Markdown(f"## {model_name}\n\n"
|
129 |
f"![cover](file/{cover_path})")
|
@@ -136,4 +172,4 @@ if __name__ == '__main__':
|
|
136 |
vc_output1 = gr.Textbox(label="Output Message")
|
137 |
vc_output2 = gr.Audio(label="Output Audio")
|
138 |
vc_submit.click(vc_fn, [vc_input1, vc_input2, vc_input3], [vc_output1, vc_output2])
|
139 |
-
app.launch(
|
|
|
1 |
import json
|
2 |
+
import os
|
3 |
import librosa
|
4 |
import numpy as np
|
5 |
import torch
|
|
|
8 |
import utils
|
9 |
import gradio as gr
|
10 |
from models import SynthesizerTrn
|
11 |
+
from text import text_to_sequence, _clean_text
|
12 |
from mel_processing import spectrogram_torch
|
13 |
|
14 |
+
limitation = os.getenv("SYSTEM") == "spaces" # limit text and audio length in huggingface spaces
|
15 |
|
16 |
|
17 |
+
def get_text(text, hps, is_phoneme):
|
18 |
+
text_norm = text_to_sequence(text, hps.symbols, [] if is_phoneme else hps.data.text_cleaners)
|
19 |
if hps.data.add_blank:
|
20 |
text_norm = commons.intersperse(text_norm, 0)
|
21 |
text_norm = LongTensor(text_norm)
|
|
|
23 |
|
24 |
|
25 |
def create_tts_fn(model, hps, speaker_ids):
|
26 |
+
def tts_fn(text, speaker, speed, is_phoneme):
|
27 |
+
if limitation and ((len(text) > 60 and not is_phoneme) or (len(text) > 120 and is_phoneme)):
|
28 |
return "Error: Text is too long", None
|
29 |
speaker_id = speaker_ids[speaker]
|
30 |
+
stn_tst = get_text(text, hps, is_phoneme)
|
31 |
with no_grad():
|
32 |
x_tst = stn_tst.unsqueeze(0)
|
33 |
x_tst_lengths = LongTensor([stn_tst.size(0)])
|
|
|
73 |
return vc_fn
|
74 |
|
75 |
|
76 |
+
css = """
|
77 |
+
#advanced-btn {
|
78 |
+
color: white;
|
79 |
+
border-color: black;
|
80 |
+
background: black;
|
81 |
+
font-size: .7rem !important;
|
82 |
+
line-height: 19px;
|
83 |
+
margin-top: 24px;
|
84 |
+
margin-bottom: 12px;
|
85 |
+
padding: 2px 8px;
|
86 |
+
border-radius: 14px !important;
|
87 |
+
}
|
88 |
+
#advanced-options {
|
89 |
+
display: none;
|
90 |
+
margin-bottom: 20px;
|
91 |
+
}
|
92 |
+
"""
|
93 |
+
|
94 |
if __name__ == '__main__':
|
95 |
models = []
|
96 |
with open("saved_model/names.json", "r", encoding="utf-8") as f:
|
|
|
111 |
speaker_ids = [sid for sid, name in enumerate(hps.speakers) if name != "None"]
|
112 |
speakers = [name for sid, name in enumerate(hps.speakers) if name != "None"]
|
113 |
|
114 |
+
models.append((models_name, cover_path, speakers, hps.symbols,
|
115 |
create_tts_fn(model, hps, speaker_ids), create_vc_fn(model, hps, speaker_ids)))
|
116 |
|
117 |
+
app = gr.Blocks(css=css)
|
118 |
|
119 |
with app:
|
120 |
gr.Markdown("# Moe Japanese TTS And Voice Conversion Using VITS Model\n\n"
|
|
|
126 |
with gr.Tabs():
|
127 |
with gr.TabItem("TTS"):
|
128 |
with gr.Tabs():
|
129 |
+
for i, (model_name, cover_path, speakers, symbols, tts_fn, vc_fn) in enumerate(models):
|
130 |
with gr.TabItem(f"model{i}"):
|
131 |
with gr.Column():
|
132 |
gr.Markdown(f"## {model_name}\n\n"
|
|
|
135 |
tts_input2 = gr.Dropdown(label="Speaker", choices=speakers,
|
136 |
type="index", value=speakers[0])
|
137 |
tts_input3 = gr.Slider(label="Speed", value=1, minimum=0.5, maximum=2, step=0.1)
|
138 |
+
advanced_button = gr.Button("Advanced options", elem_id="advanced-btn")
|
139 |
+
advanced_options = gr.Column()
|
140 |
+
advanced_options.elem_id = "advanced-options"
|
141 |
+
with advanced_options:
|
142 |
+
phoneme_input = gr.Checkbox(value=False, label="Phoneme input")
|
143 |
+
to_phoneme_btn = gr.Button("Covert text to phoneme")
|
144 |
+
phoneme_list = gr.Json(label="Phoneme list", value=symbols, elem_id="phoneme_list")
|
145 |
+
|
146 |
tts_submit = gr.Button("Generate", variant="primary")
|
147 |
tts_output1 = gr.Textbox(label="Output Message")
|
148 |
tts_output2 = gr.Audio(label="Output Audio")
|
149 |
+
advanced_button.click(None, [], [],
|
150 |
+
_js="""
|
151 |
+
() => {
|
152 |
+
const options = document.querySelector("body > gradio-app").shadowRoot.querySelector("#advanced-options");
|
153 |
+
options.style.display = ["none", ""].includes(options.style.display) ? "flex" : "none";
|
154 |
+
}""")
|
155 |
+
tts_submit.click(tts_fn, [tts_input1, tts_input2, tts_input3, phoneme_input],
|
156 |
[tts_output1, tts_output2])
|
157 |
+
to_phoneme_btn.click(lambda x: _clean_text(x, hps.data.text_cleaners) if x != "" else x,
|
158 |
+
[tts_input1], [tts_input1])
|
159 |
+
|
160 |
with gr.TabItem("Voice Conversion"):
|
161 |
with gr.Tabs():
|
162 |
+
for i, (model_name, cover_path, speakers, symbols, tts_fn, vc_fn) in enumerate(models):
|
163 |
with gr.TabItem(f"model{i}"):
|
164 |
gr.Markdown(f"## {model_name}\n\n"
|
165 |
f"![cover](file/{cover_path})")
|
|
|
172 |
vc_output1 = gr.Textbox(label="Output Message")
|
173 |
vc_output2 = gr.Audio(label="Output Audio")
|
174 |
vc_submit.click(vc_fn, [vc_input1, vc_input2, vc_input3], [vc_output1, vc_output2])
|
175 |
+
app.launch()
|