Update voice_processing.py
voice_processing.py CHANGED (+23 -4)
@@ -1,3 +1,4 @@
+import asyncio
 import datetime
 import logging
 import os
@@ -33,7 +34,7 @@ limitation = os.getenv("SYSTEM") == "spaces"
 config = Config()
 
 # Edge TTS
-tts_voice_list = edge_tts.list_voices()
+tts_voice_list = asyncio.get_event_loop().run_until_complete(edge_tts.list_voices())
 tts_voices = ["mn-MN-BataaNeural", "mn-MN-YesuiNeural"]  # Specific voices
 
 # RVC models
@@ -117,7 +118,14 @@ def get_model_names():
     model_root = "weights"  # Assuming this is where your models are stored
     return [d for d in os.listdir(model_root) if os.path.isdir(f"{model_root}/{d}")]
 
-def tts(model_name, tts_text, tts_voice, index_rate, use_uploaded_voice, uploaded_voice):
+async def tts(
+    model_name,
+    tts_text,
+    tts_voice,
+    index_rate,
+    use_uploaded_voice,
+    uploaded_voice,
+):
     # Default values for parameters used in EdgeTTS
     speed = 0  # Default speech speed
     f0_up_key = 0  # Default pitch adjustment
@@ -130,6 +138,7 @@ def tts(model_name, tts_text, tts_voice, index_rate, use_uploaded_voice, uploaded_voice):
 
     edge_output_filename = get_unique_filename("mp3")
 
+
     try:
         if use_uploaded_voice:
             if uploaded_voice is None:
@@ -140,6 +149,7 @@ def tts(model_name, tts_text, tts_voice, index_rate, use_uploaded_voice, uploaded_voice):
                 tmp_file.write(uploaded_voice)
                 uploaded_file_path = tmp_file.name
 
+            #uploaded_file_path = uploaded_voice.name
             audio, sr = librosa.load(uploaded_file_path, sr=16000, mono=True)
         else:
             # EdgeTTS processing
@@ -153,7 +163,9 @@ def tts(model_name, tts_text, tts_voice, index_rate, use_uploaded_voice, uploaded_voice):
             # Invoke Edge TTS
            t0 = time.time()
             speed_str = f"+{speed}%" if speed >= 0 else f"{speed}%"
-            edge_tts.Communicate(tts_text, tts_voice, rate=speed_str).save(edge_output_filename)
+            await edge_tts.Communicate(
+                tts_text, tts_voice, rate=speed_str
+            ).save(edge_output_filename)
             t1 = time.time()
             edge_time = t1 - t0
 
@@ -212,7 +224,9 @@ def tts(model_name, tts_text, tts_voice, index_rate, use_uploaded_voice, uploaded_voice):
         )
 
     except EOFError:
-        info = "output not valid. This may occur when input text and speaker do not match."
+        info = (
+            "output not valid. This may occur when input text and speaker do not match."
+        )
         print(info)
         return info, None, None
     except Exception as e:
@@ -220,10 +234,15 @@ def tts(model_name, tts_text, tts_voice, index_rate, use_uploaded_voice, uploaded_voice):
         print(traceback_info)
         return str(e), None, None
 
+
 voice_mapping = {
     "Mongolian Male": "mn-MN-BataaNeural",
     "Mongolian Female": "mn-MN-YesuiNeural"
}
 
+
+
 hubert_model = load_hubert()
+
 rmvpe_model = RMVPE("rmvpe.pt", config.is_half, config.device)
+
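
Note: because tts is now a coroutine (async def), callers can no longer invoke it directly from synchronous code; it must be awaited or driven by an event loop. The sketch below is a minimal, hypothetical usage example and is not part of this commit: the argument values, the asyncio.run wrapper, and the three-element unpacking (which mirrors the error-path "return info, None, None" shown above) are assumptions.

import asyncio

from voice_processing import tts  # assumes the module is importable under this name


async def main():
    # Illustrative values only; model_name is a hypothetical folder under weights/.
    info, edge_audio, rvc_audio = await tts(
        model_name="some_model",        # hypothetical RVC model folder
        tts_text="Сайн байна уу",       # sample Mongolian text
        tts_voice="mn-MN-BataaNeural",  # one of the two configured voices
        index_rate=0.75,
        use_uploaded_voice=False,
        uploaded_voice=None,
    )
    print(info)


if __name__ == "__main__":
    asyncio.run(main())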
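A related caveat, not addressed by this commit: the module-level tts_voice_list assignment drives the edge_tts.list_voices() coroutine with asyncio.get_event_loop().run_until_complete(), which only works while no loop is already running and relies on get_event_loop() implicitly creating a loop, a behavior newer Python versions flag as deprecated when called outside a running loop. A possible alternative, sketched under the assumption that the module is imported from plain synchronous code, is:

import asyncio
import edge_tts

# asyncio.run() creates and closes its own event loop, so it does not depend on
# an implicitly created "current" loop; it raises RuntimeError if a loop is
# already running in this thread.
tts_voice_list = asyncio.run(edge_tts.list_voices())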