Spaces:
Runtime error
Runtime error
SayaSS
committed on
Commit
·
357a89b
1
Parent(s):
5ea47d3
add TTS
Browse files- app-slice.py +135 -0
- app.py +68 -17
- cluster/__pycache__/__init__.cpython-38.pyc +0 -0
- hubert/__pycache__/__init__.cpython-38.pyc +0 -0
- hubert/__pycache__/hubert_model.cpython-38.pyc +0 -0
- inference/__pycache__/infer_tool.cpython-38.pyc +0 -0
- inference/infer_tool.py +30 -27
- modules/__pycache__/__init__.cpython-38.pyc +0 -0
- modules/__pycache__/attentions.cpython-38.pyc +0 -0
- modules/__pycache__/commons.cpython-38.pyc +0 -0
- modules/__pycache__/modules.cpython-38.pyc +0 -0
- vdecoder/__pycache__/__init__.cpython-38.pyc +0 -0
- vdecoder/hifigan/__pycache__/env.cpython-38.pyc +0 -0
- vdecoder/hifigan/__pycache__/models.cpython-38.pyc +0 -0
- vdecoder/hifigan/__pycache__/utils.cpython-38.pyc +0 -0
app-slice.py
ADDED
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import gradio as gr
|
3 |
+
import edge_tts
|
4 |
+
from pathlib import Path
|
5 |
+
import inference.infer_tool as infer_tool
|
6 |
+
import utils
|
7 |
+
from inference.infer_tool import Svc
|
8 |
+
import logging
|
9 |
+
import webbrowser
|
10 |
+
import argparse
|
11 |
+
import asyncio
|
12 |
+
import librosa
|
13 |
+
import soundfile
|
14 |
+
import gradio.processing_utils as gr_processing_utils
|
15 |
+
logging.getLogger('numba').setLevel(logging.WARNING)
|
16 |
+
logging.getLogger('markdown_it').setLevel(logging.WARNING)
|
17 |
+
logging.getLogger('urllib3').setLevel(logging.WARNING)
|
18 |
+
logging.getLogger('matplotlib').setLevel(logging.WARNING)
|
19 |
+
|
20 |
+
limitation = os.getenv("SYSTEM") == "spaces" # limit audio length in huggingface spaces
|
21 |
+
|
22 |
+
audio_postprocess_ori = gr.Audio.postprocess
|
23 |
+
|
24 |
+
def audio_postprocess(self, y):
    """Postprocess audio output and return it base64-encoded.

    Delegates to the saved original ``gr.Audio.postprocess`` and, when that
    yields a result, encodes the file referenced by its ``"name"`` entry.
    Returns ``None`` when the original postprocess returns ``None``.
    """
    processed = audio_postprocess_ori(self, y)
    if processed is not None:
        return gr_processing_utils.encode_url_or_file_to_base64(processed["name"])
    return None
|
29 |
+
|
30 |
+
|
31 |
+
gr.Audio.postprocess = audio_postprocess
|
32 |
+
def create_vc_fn(model, sid):
    """Build the click handler bound to one model and speaker.

    Args:
        model: object providing ``slice_inference(...)`` and ``clear_empty()``.
        sid: speaker identifier forwarded to ``model.slice_inference``.

    Returns:
        ``vc_fn``, which returns ``(message, audio)``. ``audio`` is
        ``(44100, samples)`` on success and ``None`` when validation fails.
    """
    def vc_fn(input_audio, vc_transform, auto_f0, slice_db, noise_scale, pad_seconds, tts_text, tts_voice, tts_mode):
        if tts_mode:
            # Check presence first: calling len() on a missing text would
            # raise TypeError instead of returning the user-facing message.
            if not tts_text or not tts_voice:
                return "You need to enter text and select a voice", None
            if len(tts_text) > 100 and limitation:
                return "Text is too long", None
            # Drop the last "-"-separated component of the selected entry
            # before handing the voice name to edge-tts.
            voice = "-".join(tts_voice.split('-')[:-1])
            asyncio.run(edge_tts.Communicate(tts_text, voice).save("tts.mp3"))
            audio, sr = librosa.load("tts.mp3")
            # Write with the rate librosa actually decoded at; a hard-coded
            # rate that differs from it would shift pitch and speed.
            soundfile.write("tts.wav", audio, sr, format="wav")
            wav_path = "tts.wav"
        else:
            if input_audio is None:
                return "You need to select an audio", None
            raw_audio_path = f"raw/{input_audio}"
            if "." not in raw_audio_path:
                raw_audio_path += ".wav"
            infer_tool.format_wav(raw_audio_path)
            wav_path = Path(raw_audio_path).with_suffix('.wav')
        _audio = model.slice_inference(
            wav_path, sid, vc_transform, slice_db,
            cluster_infer_ratio=0,
            auto_predict_f0=auto_f0,
            noice_scale=noise_scale,
            pad_seconds=pad_seconds)
        model.clear_empty()
        return "Success", (44100, _audio)
    return vc_fn
|
60 |
+
|
61 |
+
def refresh_raw_wav():
    """Return a Dropdown update listing the current entries of ``raw``."""
    available = os.listdir("raw")
    return gr.Dropdown.update(choices=available)
|
63 |
+
|
64 |
+
def change_to_tts_mode(tts_mode):
    """Toggle widget visibility between file-input mode and TTS mode.

    Returns four updates, in order: Audio, Button, Textbox, Dropdown.
    The first two are hidden in TTS mode; the last two are shown only in
    TTS mode.
    """
    tts_enabled = bool(tts_mode)
    file_widgets_visible = not tts_enabled
    return (
        gr.Audio.update(visible=file_widgets_visible),
        gr.Button.update(visible=file_widgets_visible),
        gr.Textbox.update(visible=tts_enabled),
        gr.Dropdown.update(visible=tts_enabled),
    )
|
69 |
+
|
70 |
+
if __name__ == '__main__':
    # Script entry point: parse CLI flags, load every model under "models/",
    # build one tab per model and launch the Gradio app.
    # NOTE(review): indentation was reconstructed from syntax because the
    # source view had lost it; confirm nesting against the real file.
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', type=str, default='cpu')
    parser.add_argument('--api', action="store_true", default=False)
    parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
    # The help text previously duplicated the --share description.
    parser.add_argument("--colab", action="store_true", default=False,
                        help="open http://127.0.0.1:7860 in a web browser")
    args = parser.parse_args()
    hubert_model = utils.get_hubert_model().to(args.device)
    models = []
    voices = []
    tts_voice_list = asyncio.get_event_loop().run_until_complete(edge_tts.list_voices())
    for r in tts_voice_list:
        # Entries are "<ShortName>-<Gender>".
        voices.append(f"{r['ShortName']}-{r['Gender']}")
    raw = os.listdir("raw")
    for f in os.listdir("models"):
        name = f
        model = Svc(fr"models/{f}/{f}.pth", f"models/{f}/config.json", device=args.device, hubert_model=hubert_model)
        cover = f"models/{f}/cover.png" if os.path.exists(f"models/{f}/cover.png") else None
        models.append((name, cover, create_vc_fn(model, name)))
    with gr.Blocks() as app:
        gr.Markdown(
            "# <center> Sovits Models\n"
            "## <center> The input audio should be clean and pure voice without background music.\n"
            "![visitor badge](https://visitor-badge.glitch.me/badge?page_id=sayashi.Sovits-Umamusume)\n\n"
            "[Open In Colab](https://colab.research.google.com/drive/1wfsBbMzmtLflOJeqc5ZnJiLY7L239hJW?usp=share_link)"
            " without queue and length limitation.\n\n"
            "[Original Repo](https://github.com/svc-develop-team/so-vits-svc)\n\n"
            "Other models:\n"
            "[rudolf](https://huggingface.co/spaces/sayashi/sovits-rudolf)\n"
            "[teio](https://huggingface.co/spaces/sayashi/sovits-teio)\n"
            "[goldship](https://huggingface.co/spaces/sayashi/sovits-goldship)\n"
            "[tannhauser](https://huggingface.co/spaces/sayashi/sovits-tannhauser)\n"
        )
        with gr.Tabs():
            for (name, cover, vc_fn) in models:
                with gr.TabItem(name):
                    with gr.Row():
                        # Build the optional <img> separately. Written inline,
                        # the conditional expression swallowed the adjacent
                        # string literals and left the <div> tags unbalanced.
                        cover_img = f'<img style="width:auto;height:300px;" src="file/{cover}">' if cover else ""
                        gr.Markdown('<div align="center">' + cover_img + '</div>')
                    with gr.Row():
                        with gr.Column():
                            with gr.Row():
                                vc_input = gr.Dropdown(label="Input audio", choices=raw)
                                vc_refresh = gr.Button("🔁", variant="primary")
                            vc_transform = gr.Number(label="vc_transform", value=0)
                            slice_db = gr.Number(label="slice_db", value=-40)
                            noise_scale = gr.Number(label="noise_scale", value=0.4)
                            pad_seconds = gr.Number(label="pad_seconds", value=0.5)
                            auto_f0 = gr.Checkbox(label="auto_f0", value=False)
                            tts_mode = gr.Checkbox(label="tts (use edge-tts as input)", value=False)
                            tts_text = gr.Textbox(visible=False, label="TTS text (100 words limitation)" if limitation else "TTS text")
                            tts_voice = gr.Dropdown(choices=voices, visible=False)
                            vc_submit = gr.Button("Generate", variant="primary")
                        with gr.Column():
                            vc_output1 = gr.Textbox(label="Output Message")
                            vc_output2 = gr.Audio(label="Output Audio")
                    vc_submit.click(vc_fn, [vc_input, vc_transform, auto_f0, slice_db, noise_scale, pad_seconds, tts_text, tts_voice, tts_mode], [vc_output1, vc_output2])
                    vc_refresh.click(refresh_raw_wav, [], [vc_input])
                    tts_mode.change(change_to_tts_mode, [tts_mode], [vc_input, vc_refresh, tts_text, tts_voice])
    if args.colab:
        webbrowser.open("http://127.0.0.1:7860")
    app.queue(concurrency_count=1, api_open=args.api).launch(share=args.share)
|
app.py
CHANGED
@@ -1,12 +1,15 @@
|
|
1 |
import os
|
|
|
2 |
import gradio as gr
|
3 |
import librosa
|
4 |
import numpy as np
|
5 |
import utils
|
6 |
from inference.infer_tool import Svc
|
7 |
import logging
|
8 |
-
import
|
|
|
9 |
import argparse
|
|
|
10 |
import gradio.processing_utils as gr_processing_utils
|
11 |
logging.getLogger('numba').setLevel(logging.WARNING)
|
12 |
logging.getLogger('markdown_it').setLevel(logging.WARNING)
|
@@ -26,32 +29,65 @@ def audio_postprocess(self, y):
|
|
26 |
|
27 |
gr.Audio.postprocess = audio_postprocess
|
28 |
def create_vc_fn(model, sid):
|
29 |
-
def vc_fn(input_audio, vc_transform, auto_f0):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
if input_audio is None:
|
31 |
return "You need to upload an audio", None
|
32 |
sampling_rate, audio = input_audio
|
33 |
duration = audio.shape[0] / sampling_rate
|
34 |
-
if duration >
|
35 |
-
return "Please upload an audio file that is less than
|
36 |
audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
|
37 |
if len(audio.shape) > 1:
|
38 |
audio = librosa.to_mono(audio.transpose(1, 0))
|
39 |
-
if sampling_rate !=
|
40 |
-
audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=
|
41 |
-
|
42 |
-
|
|
|
|
|
|
|
|
|
43 |
return "Success", (44100, out_audio.cpu().numpy())
|
44 |
return vc_fn
|
45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
if __name__ == '__main__':
|
47 |
parser = argparse.ArgumentParser()
|
48 |
parser.add_argument('--device', type=str, default='cpu')
|
49 |
parser.add_argument('--api', action="store_true", default=False)
|
50 |
parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
|
51 |
-
parser.add_argument("--colab", action="store_true", default=False, help="share gradio app")
|
52 |
args = parser.parse_args()
|
53 |
hubert_model = utils.get_hubert_model().to(args.device)
|
54 |
models = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
for f in os.listdir("models"):
|
56 |
name = f
|
57 |
model = Svc(fr"models/{f}/{f}.pth", f"models/{f}/config.json", device=args.device, hubert_model=hubert_model)
|
@@ -62,9 +98,10 @@ if __name__ == '__main__':
|
|
62 |
"# <center> Sovits Models\n"
|
63 |
"## <center> The input audio should be clean and pure voice without background music.\n"
|
64 |
"![visitor badge](https://visitor-badge.glitch.me/badge?page_id=sayashi.Sovits-Umamusume)\n\n"
|
65 |
-
"[
|
66 |
-
"
|
67 |
-
"[Original Repo](https://github.com/
|
|
|
68 |
)
|
69 |
with gr.Tabs():
|
70 |
for (name, cover, vc_fn) in models:
|
@@ -77,14 +114,28 @@ if __name__ == '__main__':
|
|
77 |
)
|
78 |
with gr.Row():
|
79 |
with gr.Column():
|
80 |
-
vc_input = gr.Audio(label="Input audio"+' (less than
|
81 |
vc_transform = gr.Number(label="vc_transform", value=0)
|
82 |
auto_f0 = gr.Checkbox(label="auto_f0", value=False)
|
|
|
|
|
|
|
83 |
vc_submit = gr.Button("Generate", variant="primary")
|
84 |
with gr.Column():
|
85 |
vc_output1 = gr.Textbox(label="Output Message")
|
86 |
vc_output2 = gr.Audio(label="Output Audio")
|
87 |
-
vc_submit.click(vc_fn, [vc_input, vc_transform, auto_f0], [vc_output1, vc_output2])
|
88 |
-
|
89 |
-
|
90 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
+
import io
|
3 |
import gradio as gr
|
4 |
import librosa
|
5 |
import numpy as np
|
6 |
import utils
|
7 |
from inference.infer_tool import Svc
|
8 |
import logging
|
9 |
+
import soundfile
|
10 |
+
import asyncio
|
11 |
import argparse
|
12 |
+
import edge_tts
|
13 |
import gradio.processing_utils as gr_processing_utils
|
14 |
logging.getLogger('numba').setLevel(logging.WARNING)
|
15 |
logging.getLogger('markdown_it').setLevel(logging.WARNING)
|
|
|
29 |
|
30 |
gr.Audio.postprocess = audio_postprocess
|
31 |
def create_vc_fn(model, sid):
    """Build the click handler bound to one model and speaker.

    Args:
        model: object providing ``infer(sid, tran, raw_path, auto_predict_f0=...)``.
        sid: speaker identifier forwarded to ``model.infer``.

    Returns:
        ``vc_fn``, which returns ``(message, audio)``. ``audio`` is
        ``(44100, samples)`` on success and ``None`` when validation fails.
    """
    def _infer_16k(audio, vc_transform, auto_f0):
        # Serialize 16 kHz samples to an in-memory WAV and run inference on it.
        raw_path = io.BytesIO()
        soundfile.write(raw_path, audio, 16000, format="wav")
        raw_path.seek(0)
        out_audio, out_sr = model.infer(sid, vc_transform, raw_path,
                                        auto_predict_f0=auto_f0,
                                        )
        return "Success", (44100, out_audio.cpu().numpy())

    def vc_fn(input_audio, vc_transform, auto_f0, tts_text, tts_voice, tts_mode):
        if tts_mode:
            # Check presence first: calling len() on a missing text would
            # raise TypeError instead of returning the user-facing message.
            if not tts_text or not tts_voice:
                return "You need to enter text and select a voice", None
            if len(tts_text) > 100 and limitation:
                return "Text is too long", None
            # Drop the last "-"-separated component of the selected entry
            # before handing the voice name to edge-tts.
            voice = "-".join(tts_voice.split('-')[:-1])
            asyncio.run(edge_tts.Communicate(tts_text, voice).save("tts.mp3"))
            audio, sr = librosa.load("tts.mp3", sr=16000, mono=True)
            return _infer_16k(audio, vc_transform, auto_f0)
        if input_audio is None:
            return "You need to upload an audio", None
        sampling_rate, audio = input_audio
        duration = audio.shape[0] / sampling_rate
        if duration > 20 and limitation:
            return "Please upload an audio file that is less than 20 seconds. If you need to generate a longer audio file, please use Colab.", None
        # Scale integer PCM to floats using the dtype's maximum value.
        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
        if len(audio.shape) > 1:
            audio = librosa.to_mono(audio.transpose(1, 0))
        if sampling_rate != 16000:
            audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
        return _infer_16k(audio, vc_transform, auto_f0)
    return vc_fn
|
66 |
|
67 |
+
def change_to_tts_mode(tts_mode):
    """Toggle the widgets between upload mode and TTS mode.

    Returns four updates, in order: Audio (hidden in TTS mode), Textbox and
    Dropdown (shown only in TTS mode), and Checkbox (value set to match the
    mode).
    """
    tts_enabled = bool(tts_mode)
    return (
        gr.Audio.update(visible=not tts_enabled),
        gr.Textbox.update(visible=tts_enabled),
        gr.Dropdown.update(visible=tts_enabled),
        gr.Checkbox.update(value=tts_enabled),
    )
|
72 |
+
|
73 |
if __name__ == '__main__':
|
74 |
parser = argparse.ArgumentParser()
|
75 |
parser.add_argument('--device', type=str, default='cpu')
|
76 |
parser.add_argument('--api', action="store_true", default=False)
|
77 |
parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
|
|
|
78 |
args = parser.parse_args()
|
79 |
hubert_model = utils.get_hubert_model().to(args.device)
|
80 |
models = []
|
81 |
+
others = {
|
82 |
+
"rudolf": "https://huggingface.co/spaces/sayashi/sovits-rudolf",
|
83 |
+
"teio": "https://huggingface.co/spaces/sayashi/sovits-teio",
|
84 |
+
"goldship": "https://huggingface.co/spaces/sayashi/sovits-goldship",
|
85 |
+
"alice": "https://huggingface.co/spaces/sayashi/sovits-models"
|
86 |
+
}
|
87 |
+
voices = []
|
88 |
+
tts_voice_list = asyncio.get_event_loop().run_until_complete(edge_tts.list_voices())
|
89 |
+
for r in tts_voice_list:
|
90 |
+
voices.append(f"{r['ShortName']}-{r['Gender']}")
|
91 |
for f in os.listdir("models"):
|
92 |
name = f
|
93 |
model = Svc(fr"models/{f}/{f}.pth", f"models/{f}/config.json", device=args.device, hubert_model=hubert_model)
|
|
|
98 |
"# <center> Sovits Models\n"
|
99 |
"## <center> The input audio should be clean and pure voice without background music.\n"
|
100 |
"![visitor badge](https://visitor-badge.glitch.me/badge?page_id=sayashi.Sovits-Umamusume)\n\n"
|
101 |
+
"[![image](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1wfsBbMzmtLflOJeqc5ZnJiLY7L239hJW?usp=share_link)\n\n"
|
102 |
+
"[![Duplicate this Space](https://huggingface.co/datasets/huggingface/badges/raw/main/duplicate-this-space-sm-dark.svg)](https://huggingface.co/spaces/sayashi/sovits-tannhauser?duplicate=true)"
|
103 |
+
"[![Original Repo](https://badgen.net/badge/icon/github?icon=github&label=Original%20Repo)](https://github.com/svc-develop-team/so-vits-svc)"
|
104 |
+
|
105 |
)
|
106 |
with gr.Tabs():
|
107 |
for (name, cover, vc_fn) in models:
|
|
|
114 |
)
|
115 |
with gr.Row():
|
116 |
with gr.Column():
|
117 |
+
vc_input = gr.Audio(label="Input audio"+' (less than 20 seconds)' if limitation else '')
|
118 |
vc_transform = gr.Number(label="vc_transform", value=0)
|
119 |
auto_f0 = gr.Checkbox(label="auto_f0", value=False)
|
120 |
+
tts_mode = gr.Checkbox(label="tts (use edge-tts as input)", value=False)
|
121 |
+
tts_text = gr.Textbox(visible=False, label="TTS text (100 words limitation)" if limitation else "TTS text")
|
122 |
+
tts_voice = gr.Dropdown(choices=voices, visible=False)
|
123 |
vc_submit = gr.Button("Generate", variant="primary")
|
124 |
with gr.Column():
|
125 |
vc_output1 = gr.Textbox(label="Output Message")
|
126 |
vc_output2 = gr.Audio(label="Output Audio")
|
127 |
+
vc_submit.click(vc_fn, [vc_input, vc_transform, auto_f0, tts_text, tts_voice, tts_mode], [vc_output1, vc_output2])
|
128 |
+
tts_mode.change(change_to_tts_mode, [tts_mode], [vc_input, tts_text, tts_voice, auto_f0])
|
129 |
+
for category, link in others.items():
|
130 |
+
with gr.TabItem(category):
|
131 |
+
gr.Markdown(
|
132 |
+
f'''
|
133 |
+
<center>
|
134 |
+
<h2>Click to Go</h2>
|
135 |
+
<a href="{link}">
|
136 |
+
<img src="https://huggingface.co/datasets/huggingface/badges/raw/main/open-in-hf-spaces-xl-dark.svg"
|
137 |
+
</a>
|
138 |
+
</center>
|
139 |
+
'''
|
140 |
+
)
|
141 |
+
app.queue(concurrency_count=1, api_open=args.api).launch(share=args.share)
|
cluster/__pycache__/__init__.cpython-38.pyc
CHANGED
Binary files a/cluster/__pycache__/__init__.cpython-38.pyc and b/cluster/__pycache__/__init__.cpython-38.pyc differ
|
|
hubert/__pycache__/__init__.cpython-38.pyc
CHANGED
Binary files a/hubert/__pycache__/__init__.cpython-38.pyc and b/hubert/__pycache__/__init__.cpython-38.pyc differ
|
|
hubert/__pycache__/hubert_model.cpython-38.pyc
CHANGED
Binary files a/hubert/__pycache__/hubert_model.cpython-38.pyc and b/hubert/__pycache__/hubert_model.cpython-38.pyc differ
|
|
inference/__pycache__/infer_tool.cpython-38.pyc
CHANGED
Binary files a/inference/__pycache__/infer_tool.cpython-38.pyc and b/inference/__pycache__/infer_tool.cpython-38.pyc differ
|
|
inference/infer_tool.py
CHANGED
@@ -142,7 +142,8 @@ class Svc(object):
|
|
142 |
|
143 |
|
144 |
|
145 |
-
def get_unit_f0(self,
|
|
|
146 |
f0 = utils.compute_f0_parselmouth(wav, sampling_rate=self.target_sample, hop_length=self.hop_size)
|
147 |
f0, uv = utils.interpolate_f0(f0)
|
148 |
f0 = torch.FloatTensor(f0)
|
@@ -151,12 +152,12 @@ class Svc(object):
|
|
151 |
f0 = f0.unsqueeze(0).to(self.dev)
|
152 |
uv = uv.unsqueeze(0).to(self.dev)
|
153 |
|
154 |
-
wav16k = librosa.resample(wav, orig_sr=
|
155 |
wav16k = torch.from_numpy(wav16k).to(self.dev)
|
156 |
c = utils.get_hubert_content(self.hubert_model, wav_16k_tensor=wav16k)
|
157 |
c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1])
|
158 |
|
159 |
-
if cluster_infer_ratio !=0:
|
160 |
cluster_c = cluster.get_cluster_center_result(self.cluster_model, c.cpu().numpy().T, speaker).T
|
161 |
cluster_c = torch.FloatTensor(cluster_c).to(self.dev)
|
162 |
c = cluster_infer_ratio * cluster_c + (1 - cluster_infer_ratio) * c
|
@@ -164,7 +165,7 @@ class Svc(object):
|
|
164 |
c = c.unsqueeze(0)
|
165 |
return c, f0, uv
|
166 |
|
167 |
-
def infer(self, speaker, tran,
|
168 |
cluster_infer_ratio=0,
|
169 |
auto_predict_f0=False,
|
170 |
noice_scale=0.4):
|
@@ -173,7 +174,7 @@ class Svc(object):
|
|
173 |
if len(self.spk2id.__dict__) >= speaker:
|
174 |
speaker_id = speaker
|
175 |
sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0)
|
176 |
-
c, f0, uv = self.get_unit_f0(
|
177 |
if "half" in self.net_g_path and torch.cuda.is_available():
|
178 |
c = c.half()
|
179 |
with torch.no_grad():
|
@@ -187,17 +188,18 @@ class Svc(object):
|
|
187 |
# 清理显存
|
188 |
torch.cuda.empty_cache()
|
189 |
|
190 |
-
def slice_inference(self,raw_audio_path, spk, tran, slice_db,cluster_infer_ratio, auto_predict_f0,noice_scale,
|
|
|
191 |
wav_path = raw_audio_path
|
192 |
chunks = slicer.cut(wav_path, db_thresh=slice_db)
|
193 |
audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)
|
194 |
-
per_size = int(clip_seconds*audio_sr)
|
195 |
-
lg_size = int(lg_num*audio_sr)
|
196 |
-
lg_size_r = int(lg_size*lgr_num)
|
197 |
-
lg_size_c_l = (lg_size-lg_size_r)//2
|
198 |
-
lg_size_c_r = lg_size-lg_size_r-lg_size_c_l
|
199 |
-
lg = np.linspace(0,1,lg_size_r) if lg_size!=0 else 0
|
200 |
-
|
201 |
audio = []
|
202 |
for (slice_tag, data) in audio_data:
|
203 |
print(f'#=====segment start, {round(len(data) / audio_sr, 3)}s======')
|
@@ -209,12 +211,12 @@ class Svc(object):
|
|
209 |
audio.extend(list(pad_array(_audio, length)))
|
210 |
continue
|
211 |
if per_size != 0:
|
212 |
-
datas = split_list_by_n(data, per_size,lg_size)
|
213 |
else:
|
214 |
datas = [data]
|
215 |
-
for k,dat in enumerate(datas):
|
216 |
-
per_length = int(np.ceil(len(dat) / audio_sr * self.target_sample)) if clip_seconds!=0 else length
|
217 |
-
if clip_seconds!=0: print(f'###=====segment clip start, {round(len(dat) / audio_sr, 3)}s======')
|
218 |
# padd
|
219 |
pad_len = int(audio_sr * pad_seconds)
|
220 |
dat = np.concatenate([np.zeros([pad_len]), dat, np.zeros([pad_len])])
|
@@ -222,24 +224,25 @@ class Svc(object):
|
|
222 |
soundfile.write(raw_path, dat, audio_sr, format="wav")
|
223 |
raw_path.seek(0)
|
224 |
out_audio, out_sr = self.infer(spk, tran, raw_path,
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
_audio = out_audio.cpu().numpy()
|
230 |
pad_len = int(self.target_sample * pad_seconds)
|
231 |
_audio = _audio[pad_len:-pad_len]
|
232 |
_audio = pad_array(_audio, per_length)
|
233 |
-
if lg_size!=0 and k!=0:
|
234 |
-
lg1 = audio[-(lg_size_r+lg_size_c_r):-lg_size_c_r] if lgr_num != 1 else audio[-lg_size:]
|
235 |
-
lg2 = _audio[lg_size_c_l:lg_size_c_l+lg_size_r]
|
236 |
-
lg_pre = lg1*(1-lg)+lg2*lg
|
237 |
-
audio = audio[0:-(lg_size_r+lg_size_c_r)] if lgr_num != 1 else audio[0:-lg_size]
|
238 |
audio.extend(lg_pre)
|
239 |
-
_audio = _audio[lg_size_c_l+lg_size_r:] if lgr_num != 1 else _audio[lg_size:]
|
240 |
audio.extend(list(_audio))
|
241 |
return np.array(audio)
|
242 |
|
|
|
243 |
class RealTimeVC:
|
244 |
def __init__(self):
|
245 |
self.last_chunk = None
|
|
|
142 |
|
143 |
|
144 |
|
145 |
+
def get_unit_f0(self, in_path, tran, cluster_infer_ratio, speaker):
|
146 |
+
wav, sr = librosa.load(in_path, sr=self.target_sample)
|
147 |
f0 = utils.compute_f0_parselmouth(wav, sampling_rate=self.target_sample, hop_length=self.hop_size)
|
148 |
f0, uv = utils.interpolate_f0(f0)
|
149 |
f0 = torch.FloatTensor(f0)
|
|
|
152 |
f0 = f0.unsqueeze(0).to(self.dev)
|
153 |
uv = uv.unsqueeze(0).to(self.dev)
|
154 |
|
155 |
+
wav16k = librosa.resample(wav, orig_sr=self.target_sample, target_sr=16000)
|
156 |
wav16k = torch.from_numpy(wav16k).to(self.dev)
|
157 |
c = utils.get_hubert_content(self.hubert_model, wav_16k_tensor=wav16k)
|
158 |
c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1])
|
159 |
|
160 |
+
if cluster_infer_ratio != 0:
|
161 |
cluster_c = cluster.get_cluster_center_result(self.cluster_model, c.cpu().numpy().T, speaker).T
|
162 |
cluster_c = torch.FloatTensor(cluster_c).to(self.dev)
|
163 |
c = cluster_infer_ratio * cluster_c + (1 - cluster_infer_ratio) * c
|
|
|
165 |
c = c.unsqueeze(0)
|
166 |
return c, f0, uv
|
167 |
|
168 |
+
def infer(self, speaker, tran, raw_path,
|
169 |
cluster_infer_ratio=0,
|
170 |
auto_predict_f0=False,
|
171 |
noice_scale=0.4):
|
|
|
174 |
if len(self.spk2id.__dict__) >= speaker:
|
175 |
speaker_id = speaker
|
176 |
sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0)
|
177 |
+
c, f0, uv = self.get_unit_f0(raw_path, tran, cluster_infer_ratio, speaker)
|
178 |
if "half" in self.net_g_path and torch.cuda.is_available():
|
179 |
c = c.half()
|
180 |
with torch.no_grad():
|
|
|
188 |
# 清理显存
|
189 |
torch.cuda.empty_cache()
|
190 |
|
191 |
+
def slice_inference(self, raw_audio_path, spk, tran, slice_db, cluster_infer_ratio, auto_predict_f0, noice_scale,
|
192 |
+
pad_seconds=0.5, clip_seconds=0, lg_num=0, lgr_num=0.75):
|
193 |
wav_path = raw_audio_path
|
194 |
chunks = slicer.cut(wav_path, db_thresh=slice_db)
|
195 |
audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)
|
196 |
+
per_size = int(clip_seconds * audio_sr)
|
197 |
+
lg_size = int(lg_num * audio_sr)
|
198 |
+
lg_size_r = int(lg_size * lgr_num)
|
199 |
+
lg_size_c_l = (lg_size - lg_size_r) // 2
|
200 |
+
lg_size_c_r = lg_size - lg_size_r - lg_size_c_l
|
201 |
+
lg = np.linspace(0, 1, lg_size_r) if lg_size != 0 else 0
|
202 |
+
|
203 |
audio = []
|
204 |
for (slice_tag, data) in audio_data:
|
205 |
print(f'#=====segment start, {round(len(data) / audio_sr, 3)}s======')
|
|
|
211 |
audio.extend(list(pad_array(_audio, length)))
|
212 |
continue
|
213 |
if per_size != 0:
|
214 |
+
datas = split_list_by_n(data, per_size, lg_size)
|
215 |
else:
|
216 |
datas = [data]
|
217 |
+
for k, dat in enumerate(datas):
|
218 |
+
per_length = int(np.ceil(len(dat) / audio_sr * self.target_sample)) if clip_seconds != 0 else length
|
219 |
+
if clip_seconds != 0: print(f'###=====segment clip start, {round(len(dat) / audio_sr, 3)}s======')
|
220 |
# padd
|
221 |
pad_len = int(audio_sr * pad_seconds)
|
222 |
dat = np.concatenate([np.zeros([pad_len]), dat, np.zeros([pad_len])])
|
|
|
224 |
soundfile.write(raw_path, dat, audio_sr, format="wav")
|
225 |
raw_path.seek(0)
|
226 |
out_audio, out_sr = self.infer(spk, tran, raw_path,
|
227 |
+
cluster_infer_ratio=cluster_infer_ratio,
|
228 |
+
auto_predict_f0=auto_predict_f0,
|
229 |
+
noice_scale=noice_scale
|
230 |
+
)
|
231 |
_audio = out_audio.cpu().numpy()
|
232 |
pad_len = int(self.target_sample * pad_seconds)
|
233 |
_audio = _audio[pad_len:-pad_len]
|
234 |
_audio = pad_array(_audio, per_length)
|
235 |
+
if lg_size != 0 and k != 0:
|
236 |
+
lg1 = audio[-(lg_size_r + lg_size_c_r):-lg_size_c_r] if lgr_num != 1 else audio[-lg_size:]
|
237 |
+
lg2 = _audio[lg_size_c_l:lg_size_c_l + lg_size_r] if lgr_num != 1 else _audio[0:lg_size]
|
238 |
+
lg_pre = lg1 * (1 - lg) + lg2 * lg
|
239 |
+
audio = audio[0:-(lg_size_r + lg_size_c_r)] if lgr_num != 1 else audio[0:-lg_size]
|
240 |
audio.extend(lg_pre)
|
241 |
+
_audio = _audio[lg_size_c_l + lg_size_r:] if lgr_num != 1 else _audio[lg_size:]
|
242 |
audio.extend(list(_audio))
|
243 |
return np.array(audio)
|
244 |
|
245 |
+
|
246 |
class RealTimeVC:
|
247 |
def __init__(self):
|
248 |
self.last_chunk = None
|
modules/__pycache__/__init__.cpython-38.pyc
CHANGED
Binary files a/modules/__pycache__/__init__.cpython-38.pyc and b/modules/__pycache__/__init__.cpython-38.pyc differ
|
|
modules/__pycache__/attentions.cpython-38.pyc
CHANGED
Binary files a/modules/__pycache__/attentions.cpython-38.pyc and b/modules/__pycache__/attentions.cpython-38.pyc differ
|
|
modules/__pycache__/commons.cpython-38.pyc
CHANGED
Binary files a/modules/__pycache__/commons.cpython-38.pyc and b/modules/__pycache__/commons.cpython-38.pyc differ
|
|
modules/__pycache__/modules.cpython-38.pyc
CHANGED
Binary files a/modules/__pycache__/modules.cpython-38.pyc and b/modules/__pycache__/modules.cpython-38.pyc differ
|
|
vdecoder/__pycache__/__init__.cpython-38.pyc
CHANGED
Binary files a/vdecoder/__pycache__/__init__.cpython-38.pyc and b/vdecoder/__pycache__/__init__.cpython-38.pyc differ
|
|
vdecoder/hifigan/__pycache__/env.cpython-38.pyc
CHANGED
Binary files a/vdecoder/hifigan/__pycache__/env.cpython-38.pyc and b/vdecoder/hifigan/__pycache__/env.cpython-38.pyc differ
|
|
vdecoder/hifigan/__pycache__/models.cpython-38.pyc
CHANGED
Binary files a/vdecoder/hifigan/__pycache__/models.cpython-38.pyc and b/vdecoder/hifigan/__pycache__/models.cpython-38.pyc differ
|
|
vdecoder/hifigan/__pycache__/utils.cpython-38.pyc
CHANGED
Binary files a/vdecoder/hifigan/__pycache__/utils.cpython-38.pyc and b/vdecoder/hifigan/__pycache__/utils.cpython-38.pyc differ
|
|