Politrees committed
Commit d2f2216
1 Parent(s): 278bd08

Update src/infer_pack/predictor/RMVPE.py

Files changed (1)
  1. src/infer_pack/predictor/RMVPE.py +16 -21
src/infer_pack/predictor/RMVPE.py CHANGED
@@ -334,28 +334,23 @@ class RMVPE:
         ckpt = torch.load(model_path, map_location="cpu")
         model.load_state_dict(ckpt)
         model.eval()
-        if is_half == True:
+        if is_half:
             model = model.half()
         self.model = model
-        self.resample_kernel = {}
         self.is_half = is_half
-        if device is None:
-            device = "cuda" if torch.cuda.is_available() else "cpu"
-        self.device = device
-        self.mel_extractor = MelSpectrogram(
-            is_half, 128, 16000, 1024, 160, None, 30, 8000
-        ).to(device)
-        self.model = self.model.to(device)
+        self.device = device if device else "cuda" if torch.cuda.is_available() else "cpu"
+        self.mel_extractor = MelSpectrogram(is_half, 128, 16000, 1024, 160, None, 30, 8000).to(self.device)
+        self.model = self.model.to(self.device)
         cents_mapping = 20 * np.arange(360) + 1997.3794084376191
-        self.cents_mapping = np.pad(cents_mapping, (4, 4))  # 368
+        self.cents_mapping = np.pad(cents_mapping, (4, 4))

     def mel2hidden(self, mel):
         with torch.no_grad():
             n_frames = mel.shape[-1]
-            mel = mel.float()  # cast to float32 before padding
-            mel = F.pad(
-                mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode="reflect"
-            )
+            mel = mel.float()
+            mel = F.pad(mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode="reflect")
+            if self.is_half:
+                mel = mel.half()
             hidden = self.model(mel)
             return hidden[:, :n_frames]
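A note on the mel2hidden change: reflect padding now runs in float32 (the removed comment said as much: cast to float32 before padding; reflect mode is not implemented for float16 on some PyTorch backends), and the new is_half branch casts the mel back to half afterwards so its dtype matches the half-precision model. A minimal sketch of the padding arithmetic, with an illustrative frame count:

    import torch
    import torch.nn.functional as F

    n_frames = 100                          # illustrative frame count, not from the repo
    mel = torch.randn(1, 128, n_frames)     # (batch, mel bins, frames); 128 bins as in the extractor
    pad = 32 * ((n_frames - 1) // 32 + 1) - n_frames  # pad the time axis up to the next multiple of 32
    mel = F.pad(mel.float(), (0, pad), mode="reflect")
    assert mel.shape[-1] % 32 == 0          # 128 frames here, divisible by 32 as the model expects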
@@ -370,7 +365,7 @@ class RMVPE:
         mel = self.mel_extractor(audio, center=True)
         hidden = self.mel2hidden(mel)
         hidden = hidden.squeeze(0).cpu().numpy()
-        if self.is_half == True:
+        if self.is_half:
             hidden = hidden.astype("float32")
         f0 = self.decode(hidden, thred=thred)
         return f0
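The is_half == True comparisons are simplified to plain truthiness checks throughout; no behavior changes. When the model runs in half precision, hidden arrives as float16 and is upcast before decoding, along these lines (toy array, not repo code):

    import numpy as np

    hidden = np.random.rand(100, 360).astype("float16")  # placeholder for the model output
    is_half = True
    if is_half:
        hidden = hidden.astype("float32")                # decode then works in full precision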
@@ -384,23 +379,23 @@ class RMVPE:
         starts = center - 4
         ends = center + 5
         for idx in range(salience.shape[0]):
-            todo_salience.append(salience[:, starts[idx] : ends[idx]][idx])
-            todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]])
+            todo_salience.append(salience[:, starts[idx]:ends[idx]][idx])
+            todo_cents_mapping.append(self.cents_mapping[starts[idx]:ends[idx]])
         todo_salience = np.array(todo_salience)
         todo_cents_mapping = np.array(todo_cents_mapping)
         product_sum = np.sum(todo_salience * todo_cents_mapping, 1)
         weight_sum = np.sum(todo_salience, 1)
-        devided = product_sum / weight_sum
+        divided = product_sum / weight_sum
         maxx = np.max(salience, axis=1)
-        devided[maxx <= thred] = 0
-        return devided
+        divided[maxx <= thred] = 0
+        return divided

     def infer_from_audio_with_pitch(self, audio, thred=0.03, f0_min=50, f0_max=1100):
         audio = torch.from_numpy(audio).float().to(self.device).unsqueeze(0)
         mel = self.mel_extractor(audio, center=True)
         hidden = self.mel2hidden(mel)
         hidden = hidden.squeeze(0).cpu().numpy()
-        if self.is_half == True:
+        if self.is_half:
             hidden = hidden.astype("float32")
         f0 = self.decode(hidden, thred=thred)
         f0[(f0 < f0_min) | (f0 > f0_max)] = 0
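On the decode hunk: each frame takes a nine-bin salience window around its peak (starts = center - 4, ends = center + 5) and averages the matching cents values weighted by salience; frames whose peak salience is at or below thred are zeroed (the devided -> divided rename is purely cosmetic). A simplified, self-contained re-implementation with toy values; the pre-padding of salience by 4 bins per side is an assumption, since that step sits above this hunk:

    import numpy as np

    cents_mapping = np.pad(20 * np.arange(360) + 1997.3794084376191, (4, 4))  # 368 entries, as in __init__

    salience = np.random.rand(5, 360)             # toy salience: 5 frames x 360 pitch bins
    padded = np.pad(salience, ((0, 0), (4, 4)))   # assumed pre-padding, mirroring the mapping
    center = np.argmax(salience, axis=1) + 4      # peak index shifted into padded coordinates
    starts, ends = center - 4, center + 5

    win_sal = np.array([padded[i, starts[i]:ends[i]] for i in range(5)])    # 9-bin windows
    win_cents = np.array([cents_mapping[starts[i]:ends[i]] for i in range(5)])
    cents = np.sum(win_sal * win_cents, axis=1) / np.sum(win_sal, axis=1)   # weighted average
    cents[np.max(salience, axis=1) <= 0.03] = 0   # zero out frames below the threshold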
 
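Finally, a hedged usage sketch of the updated class. The checkpoint filename and the constructor signature (model_path, is_half, device) are assumptions read off this diff, and the input is a placeholder at the 16 kHz rate implied by the MelSpectrogram arguments:

    import numpy as np
    from src.infer_pack.predictor.RMVPE import RMVPE

    rmvpe = RMVPE("rmvpe.pt", is_half=True, device=None)  # device=None now resolves to CUDA when available

    audio = np.random.randn(16000).astype(np.float32)     # placeholder: one second at 16 kHz
    f0 = rmvpe.infer_from_audio(audio, thred=0.03)

    # The pitch-bounded variant additionally zeroes F0 outside [f0_min, f0_max]:
    f0_bounded = rmvpe.infer_from_audio_with_pitch(audio, thred=0.03, f0_min=50, f0_max=1100)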