File size: 3,171 Bytes
b5dba8a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d9382cf
b5dba8a
 
 
 
 
 
 
 
 
 
d9382cf
 
 
b5dba8a
 
 
 
37eb59e
b5dba8a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
from typing import Optional
from cog import BasePredictor, Input, Path, BaseModel


class ModelOutput(BaseModel):
    # Structured result returned by Predictor.predict().

    # Optional extra path; the predict() visible in this file never assigns
    # a file to it.
    prompt_npz: Optional[Path]
    # Path to the file produced by a prediction. NOTE(review): despite the
    # name, predict() stores the saved 'speaker.npz' archive here rather than
    # an audio file -- confirm whether consumers rely on this field name.
    audio_out: Path


class Predictor(BasePredictor):
    """Cog predictor that turns a reference recording into a speaker prompt.

    The prompt is saved as an ``.npz`` archive containing the arrays
    ``semantic_prompt``, ``coarse_prompt`` and ``fine_prompt``.
    """

    def setup(self):
        """Load the model into memory to make running multiple predictions efficient"""
        self._load_models()

    def _load_models(self):
        """Load HuBERT, the quantizer and EnCodec and cache them on ``self``."""
        # Heavy third-party imports stay function-local, as in the original
        # predict(), so importing this module remains cheap.
        import torch
        from encodec import EncodecModel
        from bark_hubert_quantizer.hubert_manager import HuBERTManager
        from bark_hubert_quantizer.pre_kmeans_hubert import CustomHubert
        from bark_hubert_quantizer.customtokenizer import CustomTokenizer

        large_quant_model = False  # Use the larger pretrained model
        # Prefer the GPU, but fall back to the CPU instead of crashing on
        # hosts without CUDA.
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # (remote checkpoint name, local file name) for the chosen quantizer.
        model = ('quantifier_V1_hubert_base_ls960_23.pth', 'tokenizer_large.pth') if large_quant_model else (
            'quantifier_hubert_base_ls960_14.pth', 'tokenizer.pth')

        print('Loading HuBERT...')
        hubert_model = CustomHubert(
            HuBERTManager.make_sure_hubert_installed(), device=device)
        print('Loading Quantizer...')
        quant_model = CustomTokenizer.load_from_checkpoint(
            HuBERTManager.make_sure_tokenizer_installed(model=model[0], local_file=model[1]), device)
        print('Loading Encodec...')
        encodec_model = EncodecModel.encodec_model_24khz()
        encodec_model.set_target_bandwidth(6.0)
        encodec_model.to(device)

        print('Downloaded and loaded models!')

        # Publish the attributes only after everything loaded, so a failed
        # load is retried on the next call instead of leaving partial state.
        self._device = device
        self._hubert_model = hubert_model
        self._quant_model = quant_model
        self._encodec_model = encodec_model

    def predict(
        self,
        speaker: Path = Input(
            description="Reference audio.", default=None),
    ) -> ModelOutput:
        """Run a single prediction on the model

        Args:
            speaker: Path to the reference audio file to build the prompt from.

        Returns:
            A ``ModelOutput`` whose ``audio_out`` field points at the saved
            ``speaker.npz`` archive.

        Raises:
            ValueError: If no reference audio was supplied.
        """
        # The input declares default=None, so reject a missing file up front
        # rather than failing inside the audio loader.
        if speaker is None:
            raise ValueError('A reference audio file must be provided via `speaker`.')

        import numpy as np
        import torch
        import torchaudio
        from encodec.utils import convert_audio

        # setup() normally loads the models once; load lazily so predict()
        # still works when it is called without a prior setup().
        if getattr(self, '_encodec_model', None) is None:
            self._load_models()
        device = self._device
        hubert_model = self._hubert_model
        quant_model = self._quant_model
        encodec_model = self._encodec_model

        # Path the cloned speaker prompt is saved to.
        out_file = 'speaker.npz'

        wav, sr = torchaudio.load(str(speaker))

        # Pure inference: no autograd graph is needed for any of the models.
        with torch.no_grad():
            wav_hubert = wav.to(device)
            if wav_hubert.shape[0] > 1:  # Downmix any multi-channel input to mono
                wav_hubert = wav_hubert.mean(0, keepdim=True)

            print('Extracting semantics...')
            semantic_vectors = hubert_model.forward(wav_hubert, input_sample_hz=sr)
            print('Tokenizing semantics...')
            semantic_tokens = quant_model.get_token(semantic_vectors)

            print('Creating coarse and fine prompts...')
            # Resample to EnCodec's rate, force mono and add a batch dimension.
            wav = convert_audio(wav, sr, encodec_model.sample_rate, 1).unsqueeze(0)
            wav = wav.to(device)
            encoded_frames = encodec_model.encode(wav)

        # Drop only the batch axis; a bare squeeze() would also remove a
        # length-1 time axis and break the 2-D slice below.
        codes = torch.cat([encoded[0]
                          for encoded in encoded_frames], dim=-1).squeeze(0)

        codes = codes.cpu().numpy()
        semantic_tokens = semantic_tokens.cpu().numpy()

        np.savez(out_file,
                 semantic_prompt=semantic_tokens,
                 fine_prompt=codes,
                 coarse_prompt=codes[:2, :]  # first two codebooks only
                 )

        print('Done!')

        # prompt_npz is passed explicitly so construction also succeeds when
        # the base model treats un-defaulted Optional fields as required.
        return ModelOutput(prompt_npz=None, audio_out=Path(out_file))