Spaces:
Running
on
A10G
Running
on
A10G
Adorable-Qin
committed on
Commit
·
2d4dca5
1
Parent(s):
7c4481d
Change F0 extractor to Crepe
Browse files
ckpts/svc/vocalist_l1_contentvec+whisper/args.json
CHANGED
@@ -140,7 +140,7 @@
|
|
140 |
"pin_memory": true,
|
141 |
"pitch_bin": 256,
|
142 |
"pitch_dir": "pitches",
|
143 |
-
"pitch_extractor": "parselmouth"
|
144 |
"pitch_max": 1100.0,
|
145 |
"pitch_min": 50.0,
|
146 |
"processed_dir": "ckpts/svc/vocalist_l1_contentvec+whisper/data",
|
|
|
140 |
"pin_memory": true,
|
141 |
"pitch_bin": 256,
|
142 |
"pitch_dir": "pitches",
|
143 |
+
"pitch_extractor": "crepe", // "parselmouth"
|
144 |
"pitch_max": 1100.0,
|
145 |
"pitch_min": 50.0,
|
146 |
"processed_dir": "ckpts/svc/vocalist_l1_contentvec+whisper/data",
|
utils/f0.py
CHANGED
@@ -207,7 +207,7 @@ def get_f0_features_using_harvest(audio, mel_len, fs, hop_length, f0_min, f0_max
|
|
207 |
return f0
|
208 |
|
209 |
|
210 |
-
def get_f0_features_using_crepe(
|
211 |
audio, mel_len, fs, hop_length, hop_length_new, f0_min, f0_max, threshold=0.3
|
212 |
):
|
213 |
"""Using torchcrepe to extract the f0 feature.
|
@@ -259,6 +259,25 @@ def get_f0_features_using_crepe(
|
|
259 |
f0 = np.interp(time_frame, time_org, f0, left=f0[0], right=f0[-1])
|
260 |
return f0
|
261 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
262 |
|
263 |
def get_f0(audio, cfg):
|
264 |
if cfg.pitch_extractor == "dio":
|
@@ -267,6 +286,8 @@ def get_f0(audio, cfg):
|
|
267 |
f0 = get_f0_features_using_pyin(audio, cfg)
|
268 |
elif cfg.pitch_extractor == "parselmouth":
|
269 |
f0, _ = get_f0_features_using_parselmouth(audio, cfg)
|
|
|
|
|
270 |
# elif cfg.data.f0_extractor == 'cwt': # todo
|
271 |
|
272 |
return f0
|
|
|
207 |
return f0
|
208 |
|
209 |
|
210 |
+
def get_f0_features_using_crepe_legacy(
|
211 |
audio, mel_len, fs, hop_length, hop_length_new, f0_min, f0_max, threshold=0.3
|
212 |
):
|
213 |
"""Using torchcrepe to extract the f0 feature.
|
|
|
259 |
f0 = np.interp(time_frame, time_org, f0, left=f0[0], right=f0[-1])
|
260 |
return f0
|
261 |
|
262 |
+
def get_f0_features_using_crepe(audio, cfg, threshold=0.3):
    """Extract an F0 contour from a waveform with torchcrepe.

    Args:
        audio: Waveform samples accepted by ``torch.FloatTensor``
            (presumably a 1-D array; a leading batch axis is added here).
        cfg: Config object providing ``sample_rate``, ``hop_size``,
            ``f0_min`` and ``f0_max``.
        threshold: Periodicity value below which a frame is marked
            unvoiced. Defaults to 0.3.

    Returns:
        numpy.ndarray: The pitch track of the single input, with unvoiced
        frames set to 0 Hz.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    audio_torch = torch.FloatTensor(audio).unsqueeze(0).to(device)

    # Pass the device explicitly: torchcrepe.predict defaults to CPU
    # inference regardless of where the input tensor lives.
    crepe_pitch, periodicity = torchcrepe.predict(
        audio_torch,
        cfg.sample_rate,
        cfg.hop_size,
        fmin=cfg.f0_min,
        fmax=cfg.f0_max,
        return_periodicity=True,
        device=device,
    )

    # Filter, de-silence, set up threshold for unvoiced part.
    periodicity = torchcrepe.filter.median(periodicity, 3)
    # The silence gate must use the same hop as the pitch track so that the
    # loudness frames line up one-to-one with the periodicity frames.
    periodicity = torchcrepe.threshold.Silence(-60.0)(
        periodicity, audio_torch, cfg.sample_rate, cfg.hop_size
    )
    crepe_pitch = torchcrepe.threshold.At(threshold)(crepe_pitch, periodicity)
    crepe_pitch = torchcrepe.filter.mean(crepe_pitch, 3)

    # Convert unvoiced part (NaN after thresholding) to 0 Hz.
    crepe_pitch = torch.where(
        torch.isnan(crepe_pitch), torch.zeros_like(crepe_pitch), crepe_pitch
    )

    return crepe_pitch[0].cpu().numpy()
|
280 |
+
|
281 |
|
282 |
def get_f0(audio, cfg):
|
283 |
if cfg.pitch_extractor == "dio":
|
|
|
286 |
f0 = get_f0_features_using_pyin(audio, cfg)
|
287 |
elif cfg.pitch_extractor == "parselmouth":
|
288 |
f0, _ = get_f0_features_using_parselmouth(audio, cfg)
|
289 |
+
elif cfg.pitch_extractor == "crepe":
|
290 |
+
f0 = get_f0_features_using_crepe(audio, cfg)
|
291 |
# elif cfg.data.f0_extractor == 'cwt': # todo
|
292 |
|
293 |
return f0
|