from typing import Optional
from cog import BasePredictor, Input, Path, BaseModel


class ModelOutput(BaseModel):
    prompt_npz: Optional[Path]
    audio_out: Path
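
# The .npz this predictor writes is a Bark voice prompt: arrays named
# semantic_prompt, coarse_prompt and fine_prompt, in the layout Bark
# accepts as a history_prompt.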

class Predictor(BasePredictor):
    def setup(self):
        """Load the model into memory to make running multiple predictions efficient"""

    def predict(
        self,
        speaker: Path = Input(description="Reference audio to clone."),
    ) -> ModelOutput:
        """Run a single prediction on the model"""
        # SETUP
        import numpy as np
        import torch
        import torchaudio
        from encodec import EncodecModel
        from encodec.utils import convert_audio
        from bark_hubert_quantizer.hubert_manager import HuBERTManager
        from bark_hubert_quantizer.pre_kmeans_hubert import CustomHubert
        from bark_hubert_quantizer.customtokenizer import CustomTokenizer

        large_quant_model = False  # Set True to use the larger pretrained quantizer
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model = ('quantifier_V1_hubert_base_ls960_23.pth', 'tokenizer_large.pth') if large_quant_model else (
            'quantifier_hubert_base_ls960_14.pth', 'tokenizer.pth')
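        # Each pair above appears to be (checkpoint to download, local filename)
        # for the quantizer that maps HuBERT features to Bark semantic tokens,
        # matching how model[0] and model[1] are passed below.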
        print('Loading HuBERT...')
        hubert_model = CustomHubert(
            HuBERTManager.make_sure_hubert_installed(), device=device)
        print('Loading Quantizer...')
        quant_model = CustomTokenizer.load_from_checkpoint(
            HuBERTManager.make_sure_tokenizer_installed(model=model[0], local_file=model[1]), device)
        print('Loading Encodec...')
        encodec_model = EncodecModel.encodec_model_24khz()
        encodec_model.set_target_bandwidth(6.0)
        encodec_model.to(device)
        print('Downloaded and loaded models!')
        # PREDICT
        wav_file = speaker  # Reference audio to clone
        out_file = 'speaker.npz'  # Path the cloned voice prompt is written to

        wav, sr = torchaudio.load(wav_file)
        wav_hubert = wav.to(device)
        if wav_hubert.shape[0] == 2:  # Stereo to mono if needed
            wav_hubert = wav_hubert.mean(0, keepdim=True)
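
        # Stage 1: HuBERT turns the waveform into continuous semantic feature
        # vectors, which the quantizer then discretizes into Bark-compatible
        # semantic tokens.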
        print('Extracting semantics...')
        semantic_vectors = hubert_model.forward(wav_hubert, input_sample_hz=sr)
        print('Tokenizing semantics...')
        semantic_tokens = quant_model.get_token(semantic_vectors)
        print('Creating coarse and fine prompts...')
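        # Stage 2: EnCodec encodes the resampled mono waveform into discrete
        # acoustic codebooks, from which the coarse and fine prompts are cut.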
        wav = convert_audio(wav, sr, encodec_model.sample_rate, 1).unsqueeze(0)
        wav = wav.to(device)
        with torch.no_grad():
            encoded_frames = encodec_model.encode(wav)
        codes = torch.cat([encoded[0]
                           for encoded in encoded_frames], dim=-1).squeeze()
        codes = codes.cpu()
        semantic_tokens = semantic_tokens.cpu()
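
        # Voice-prompt layout: semantic_prompt holds the quantized HuBERT
        # tokens; fine_prompt holds all 8 EnCodec codebooks (the 24 kHz model
        # emits 8 codebooks at 6 kbps); coarse_prompt keeps only the first 2
        # codebooks, which is what Bark's coarse model conditions on.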
        np.savez(out_file,
                 semantic_prompt=semantic_tokens,
                 fine_prompt=codes,
                 coarse_prompt=codes[:2, :]
                 )
        print('Done!')
        return ModelOutput(prompt_npz=Path(out_file), audio_out=Path(out_file))
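

# Usage sketch, assuming a standard Cog setup (`sample.wav` is a placeholder
# local file; `speaker` matches the Input name above):
#
#   cog predict -i speaker=@sample.wav
#
# The resulting speaker.npz can then be used as a Bark voice, e.g.
# generate_audio(text, history_prompt='speaker.npz'), since Bark accepts
# .npz paths as history prompts.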