Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -3,23 +3,35 @@ import json
|
|
3 |
import argparse
|
4 |
import traceback
|
5 |
import logging
|
6 |
-
from datetime import datetime
|
7 |
import gradio as gr
|
8 |
import numpy as np
|
9 |
import librosa
|
10 |
import torch
|
|
|
|
|
|
|
11 |
from fairseq import checkpoint_utils
|
12 |
from infer_pack.models import SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono
|
13 |
from vc_infer_pipeline import VC
|
14 |
-
from config import
|
15 |
-
|
|
|
|
|
16 |
logging.getLogger("numba").setLevel(logging.WARNING)
|
17 |
|
18 |
def create_vc_fn(tgt_sr, net_g, vc, if_f0, file_index, file_big_npy):
|
19 |
-
def vc_fn(
|
|
|
|
|
|
|
|
|
|
|
20 |
try:
|
21 |
-
|
22 |
-
|
|
|
|
|
|
|
23 |
sampling_rate, audio = input_audio
|
24 |
duration = audio.shape[0] / sampling_rate
|
25 |
if duration > 10000000:
|
@@ -29,9 +41,6 @@ def create_vc_fn(tgt_sr, net_g, vc, if_f0, file_index, file_big_npy):
|
|
29 |
audio = librosa.to_mono(audio.transpose(1, 0))
|
30 |
if sampling_rate != 16000:
|
31 |
audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
|
32 |
-
else: # Assume it's a file path
|
33 |
-
audio, sr = librosa.load(input_audio, sr=16000, mono=True)
|
34 |
-
|
35 |
times = [0, 0, 0]
|
36 |
f0_up_key = int(f0_up_key)
|
37 |
audio_opt = vc.pipeline(
|
@@ -51,11 +60,10 @@ def create_vc_fn(tgt_sr, net_g, vc, if_f0, file_index, file_big_npy):
|
|
51 |
f"[{datetime.now().strftime('%Y-%m-%d %H:%M')}]: npy: {times[0]}, f0: {times[1]}s, infer: {times[2]}s"
|
52 |
)
|
53 |
return "Success", (tgt_sr, audio_opt)
|
54 |
-
except
|
55 |
info = traceback.format_exc()
|
56 |
print(info)
|
57 |
-
return
|
58 |
-
|
59 |
return vc_fn
|
60 |
|
61 |
def load_hubert():
|
@@ -78,13 +86,10 @@ if __name__ == '__main__':
|
|
78 |
parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
|
79 |
parser.add_argument("--files", action="store_true", default=False, help="load audio from path")
|
80 |
args, unknown = parser.parse_known_args()
|
81 |
-
|
82 |
load_hubert()
|
83 |
models = []
|
84 |
-
|
85 |
with open("weights/model_info.json", "r", encoding="utf-8") as f:
|
86 |
models_info = json.load(f)
|
87 |
-
|
88 |
for name, info in models_info.items():
|
89 |
if not info['enable']:
|
90 |
continue
|
@@ -95,7 +100,6 @@ if __name__ == '__main__':
|
|
95 |
cpt = torch.load(f"weights/{name}/{name}.pth", map_location="cpu")
|
96 |
tgt_sr = cpt["config"][-1]
|
97 |
if_f0 = cpt.get("f0", 1)
|
98 |
-
|
99 |
if if_f0 == 1:
|
100 |
net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=is_half)
|
101 |
else:
|
@@ -103,15 +107,12 @@ if __name__ == '__main__':
|
|
103 |
del net_g.enc_q
|
104 |
print(net_g.load_state_dict(cpt["weight"], strict=False))
|
105 |
net_g.eval().to(device)
|
106 |
-
|
107 |
if is_half:
|
108 |
net_g = net_g.half()
|
109 |
else:
|
110 |
net_g = net_g.float()
|
111 |
-
|
112 |
vc = VC(tgt_sr, device, is_half)
|
113 |
models.append((name, title, cover, create_vc_fn(tgt_sr, net_g, vc, if_f0, index, npy)))
|
114 |
-
|
115 |
with gr.Blocks() as app:
|
116 |
gr.Markdown(
|
117 |
"# <center> RVC generator\n"
|
@@ -153,18 +154,4 @@ if __name__ == '__main__':
|
|
153 |
vc_output1 = gr.Textbox(label="Output Message")
|
154 |
vc_output2 = gr.Audio(label="Output Audio")
|
155 |
vc_submit.click(vc_fn, [vc_input, vc_transpose, vc_f0method, vc_index_ratio], [vc_output1, vc_output2])
|
156 |
-
app.queue(concurrency_count=1, max_size=20, api_open=args.api).launch(share=args.share)
|
157 |
-
|
158 |
-
# Note: The following code is outside the `__main__` block, as it was causing indentation issues in the provided code.
|
159 |
-
iface = gr.Interface(
|
160 |
-
fn=create_vc_fn(tgt_sr, net_g, vc, if_f0, file_index, file_big_npy),
|
161 |
-
inputs=[
|
162 |
-
gr.inputs.Audio(source="microphone", type="numpy", sample_rate=16000),
|
163 |
-
gr.inputs.Slider(minimum=-12, maximum=12, step=1, default=0),
|
164 |
-
gr.inputs.Radio(["world", "dio"], label="F0 method"),
|
165 |
-
gr.inputs.Slider(minimum=0, maximum=1, step=0.01, default=0.5)
|
166 |
-
],
|
167 |
-
outputs="text",
|
168 |
-
)
|
169 |
-
|
170 |
-
iface.launch()
|
|
|
3 |
import argparse
|
4 |
import traceback
|
5 |
import logging
|
|
|
6 |
import gradio as gr
|
7 |
import numpy as np
|
8 |
import librosa
|
9 |
import torch
|
10 |
+
import asyncio
|
11 |
+
import edge_tts
|
12 |
+
from datetime import datetime
|
13 |
from fairseq import checkpoint_utils
|
14 |
from infer_pack.models import SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono
|
15 |
from vc_infer_pipeline import VC
|
16 |
+
from config import (
|
17 |
+
is_half,
|
18 |
+
device
|
19 |
+
)
|
20 |
logging.getLogger("numba").setLevel(logging.WARNING)
|
21 |
|
22 |
def create_vc_fn(tgt_sr, net_g, vc, if_f0, file_index, file_big_npy):
|
23 |
+
def vc_fn(
|
24 |
+
input_audio,
|
25 |
+
f0_up_key,
|
26 |
+
f0_method,
|
27 |
+
index_rate
|
28 |
+
):
|
29 |
try:
|
30 |
+
if args.files:
|
31 |
+
audio, sr = librosa.load(input_audio, sr=16000, mono=True)
|
32 |
+
else:
|
33 |
+
if input_audio is None:
|
34 |
+
return "You need to upload an audio", None
|
35 |
sampling_rate, audio = input_audio
|
36 |
duration = audio.shape[0] / sampling_rate
|
37 |
if duration > 10000000:
|
|
|
41 |
audio = librosa.to_mono(audio.transpose(1, 0))
|
42 |
if sampling_rate != 16000:
|
43 |
audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
|
|
|
|
|
|
|
44 |
times = [0, 0, 0]
|
45 |
f0_up_key = int(f0_up_key)
|
46 |
audio_opt = vc.pipeline(
|
|
|
60 |
f"[{datetime.now().strftime('%Y-%m-%d %H:%M')}]: npy: {times[0]}, f0: {times[1]}s, infer: {times[2]}s"
|
61 |
)
|
62 |
return "Success", (tgt_sr, audio_opt)
|
63 |
+
except:
|
64 |
info = traceback.format_exc()
|
65 |
print(info)
|
66 |
+
return info, (None, None)
|
|
|
67 |
return vc_fn
|
68 |
|
69 |
def load_hubert():
|
|
|
86 |
parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
|
87 |
parser.add_argument("--files", action="store_true", default=False, help="load audio from path")
|
88 |
args, unknown = parser.parse_known_args()
|
|
|
89 |
load_hubert()
|
90 |
models = []
|
|
|
91 |
with open("weights/model_info.json", "r", encoding="utf-8") as f:
|
92 |
models_info = json.load(f)
|
|
|
93 |
for name, info in models_info.items():
|
94 |
if not info['enable']:
|
95 |
continue
|
|
|
100 |
cpt = torch.load(f"weights/{name}/{name}.pth", map_location="cpu")
|
101 |
tgt_sr = cpt["config"][-1]
|
102 |
if_f0 = cpt.get("f0", 1)
|
|
|
103 |
if if_f0 == 1:
|
104 |
net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=is_half)
|
105 |
else:
|
|
|
107 |
del net_g.enc_q
|
108 |
print(net_g.load_state_dict(cpt["weight"], strict=False))
|
109 |
net_g.eval().to(device)
|
|
|
110 |
if is_half:
|
111 |
net_g = net_g.half()
|
112 |
else:
|
113 |
net_g = net_g.float()
|
|
|
114 |
vc = VC(tgt_sr, device, is_half)
|
115 |
models.append((name, title, cover, create_vc_fn(tgt_sr, net_g, vc, if_f0, index, npy)))
|
|
|
116 |
with gr.Blocks() as app:
|
117 |
gr.Markdown(
|
118 |
"# <center> RVC generator\n"
|
|
|
154 |
vc_output1 = gr.Textbox(label="Output Message")
|
155 |
vc_output2 = gr.Audio(label="Output Audio")
|
156 |
vc_submit.click(vc_fn, [vc_input, vc_transpose, vc_f0method, vc_index_ratio], [vc_output1, vc_output2])
|
157 |
+
app.queue(concurrency_count=1, max_size=20, api_open=args.api).launch(share=args.share)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|