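# Gradio demo: voice anonymization with FreeVC.
# Pipeline as wired below: WavLM extracts content features from the source
# audio, a pretrained speaker encoder embeds a reference voice picked at
# random from mls_samples/, and SynthesizerTrn decodes the converted waveform.
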
import os
import random
import torch
import librosa
import gradio as gr
from scipy.io.wavfile import write
from transformers import WavLMModel

import utils
from models import SynthesizerTrn
from mel_processing import mel_spectrogram_torch
from speaker_encoder.voice_encoder import SpeakerEncoder

import logging
logging.basicConfig(level=logging.INFO)

'''
# One-time helper to fetch the WavLM checkpoint (needs `import shutil` if re-enabled):
def get_wavlm():
    os.system('gdown https://drive.google.com/uc?id=12-cB34qCTvByWT-QtOcZaqwwO21FLSqU')
    shutil.move('WavLM-Large.pt', 'wavlm')
'''

# Run on the GPU when available; every model below is moved to this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained speaker encoder used to embed the target (reference) voice.
smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt')

'''
print("Loading FreeVC...")
hps = utils.get_hparams_from_file("configs/freevc.json")
freevc = SynthesizerTrn(
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model).to(device)
_ = freevc.eval()
_ = utils.load_checkpoint("checkpoints/freevc.pth", freevc, None)
'''
print("Loading FreeVC(24k)...")
hps = utils.get_hparams_from_file("configs/freevc-24.json")
freevc_24 = SynthesizerTrn(
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model).to(device)
_ = freevc_24.eval()
_ = utils.load_checkpoint("checkpoints/freevc-24.pth", freevc_24, None)
'''
print("Loading FreeVC-s...")
hps = utils.get_hparams_from_file("configs/freevc-s.json")
freevc_s = SynthesizerTrn(
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model).to(device)
_ = freevc_s.eval()
_ = utils.load_checkpoint("checkpoints/freevc-s.pth", freevc_s, None)

print("Loading FreeVC-cvfr...")
hps = utils.get_hparams_from_file("configs/freevc_nosr_cvfr.json")
freevc_cvfr = SynthesizerTrn(
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model).to(device)
_ = freevc_cvfr.eval()
_ = utils.load_checkpoint("checkpoints/freevc-cvfr.pth", freevc_cvfr, None)
'''

print("Loading FreeVC-mls...")
hps = utils.get_hparams_from_file("configs/freevc_nosr_mls.json")
freevc_mls = SynthesizerTrn(
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model).to(device)
_ = freevc_mls.eval()
_ = utils.load_checkpoint("checkpoints/freevc-mls.pth", freevc_mls, None)

print("Loading WavLM for content...")
cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device)

def get_random_wav_from_directory(directory, gender=None):
    """
    Get a random WAV file from a directory.
    If a gender is specified, fetch a male or female WAV accordingly.
    """
    all_files = [f for f in os.listdir(directory) if f.endswith('.wav')]

    if gender == "male":
        # "female" contains the substring "male", so exclude it explicitly.
        all_files = [f for f in all_files if "male" in f and "female" not in f]
    elif gender == "female":
        all_files = [f for f in all_files if "female" in f]

    return os.path.join(directory, random.choice(all_files))
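
# Example (assumes mls_samples/ holds files whose names contain "male"/"female"):
#   get_random_wav_from_directory("mls_samples", "female")
#   -> "mls_samples/<some female sample>.wav"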



def convert(model, src_mic, src_file, reference_option):
    """
    Pick the source audio (microphone recording or uploaded file),
    select a random reference voice, and run the conversion.
    """
    src = None
    if src_mic and src_mic != "-":
        src = src_mic
    elif src_file:
        src = src_file
    
    if not src:
        logging.error("Source audio not provided")
        return

    if reference_option == "random":
        tgt = get_random_wav_from_directory("mls_samples")
    elif reference_option == "random (male)":
        tgt = get_random_wav_from_directory("mls_samples", "male")
    elif reference_option == "random (female)":
        tgt = get_random_wav_from_directory("mls_samples", "female")
    else:
        logging.error("Unrecognized reference option")
        return
    
    with torch.no_grad():
        # Target: load the reference audio and trim leading/trailing silence.
        wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate)
        wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
        if model == "FreeVC" or model == "FreeVC (24kHz)" or model == "FreeVC CVFR" or model == "FreeVC MLS":
            g_tgt = smodel.embed_utterance(wav_tgt)
            g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device)
        else:
            # FreeVC-s conditions directly on the target mel spectrogram instead.
            wav_tgt = torch.from_numpy(wav_tgt).unsqueeze(0).to(device)
            mel_tgt = mel_spectrogram_torch(
                wav_tgt,
                hps.data.filter_length,
                hps.data.n_mel_channels,
                hps.data.sampling_rate,
                hps.data.hop_length,
                hps.data.win_length,
                hps.data.mel_fmin,
                hps.data.mel_fmax
            )
        # Source: resample, then extract content features with WavLM.
        wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate)
        wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device)
        c = cmodel(wav_src).last_hidden_state.transpose(1, 2).to(device)
        # Infer. Only freevc_mls and freevc_24 are instantiated above; the other
        # branches require re-enabling the commented-out checkpoint loaders.
        if model == "FreeVC":
            audio = freevc.infer(c, g=g_tgt)
        elif model == "FreeVC-s":
            audio = freevc_s.infer(c, mel=mel_tgt)
        elif model == "FreeVC CVFR":
            audio = freevc_cvfr.infer(c, g=g_tgt)
        elif model == "FreeVC MLS":
            audio = freevc_mls.infer(c, g=g_tgt)
        else:
            audio = freevc_24.infer(c, g=g_tgt)
        audio = audio[0][0].data.cpu().float().numpy()
        if model in ("FreeVC", "FreeVC-s", "FreeVC CVFR", "FreeVC MLS"):
            write("out.wav", hps.data.sampling_rate, audio)
        else:
            # FreeVC (24kHz) decodes at 24 kHz regardless of the input rate.
            write("out.wav", 24000, audio)
    return "out.wav"
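
# Programmatic usage sketch (hypothetical file path, for illustration):
#   convert("FreeVC MLS", None, "my_recording.wav", "random (female)")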
    
model = gr.Dropdown(choices=["FreeVC MLS", "FreeVC (24kHz)"], value="FreeVC MLS", type="value", label="Model")
audio1_mic = gr.Audio(source="microphone", type="filepath", label="Record your voice", optional=True)
audio1_file = gr.Audio(source="upload", type="filepath", label="or upload an audio file", optional=True)
reference_dropdown = gr.Dropdown(choices=["random", "random (male)", "random (female)"], value="random", label="Reference voice")
inputs = [model, audio1_mic, audio1_file, reference_dropdown]
outputs = gr.Audio(label="Output Audio", type="filepath")

title = "Voice Anonymization Demo"
description = ("This Gradio demo anonymizes a voice using a simple implementation of FreeVC. "
               "It was trained on a subset of the French MLS dataset. To use it, upload an "
               "audio file, record your own voice, or pick one of the pre-recorded examples. "
               "Note: the WavLM checkpoint on HuggingFace appears to differ slightly from the "
               "one used to train FreeVC, which may affect performance. Voice similarity can "
               "also degrade when the reference audio contains long silences, so please trim "
               "silences before submitting your audio file. \n\n"
               "<strong>WARNING:</strong> This demo is for educational purposes, and the tool "
               "still needs substantial work. We strongly advise against using it in a "
               "production environment.")
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2210.15418' target='_blank'>FreeVC paper</a> | <a href='https://arxiv.org/abs/2110.13900' target='_blank'>WavLM paper</a> | <a href='http://www.openslr.org/94/' target='_blank'>MLS dataset</a></p>"

examples = [["FreeVC MLS", 'SAMPLE_NADINE_MALICIEUX.wav', 'SAMPLE_NADINE_MALICIEUX.wav', 'random (male)'],
            ["FreeVC MLS", 'SAMPLE_HUGO_METEO.wav', 'SAMPLE_HUGO_METEO.wav', 'random (female)'],
            ["FreeVC MLS", 'Julien30sec.wav', 'Julien30sec.wav', 'random (female)']]

gr.Interface(convert, inputs, outputs, title=title, description=description, article=article, examples=examples, enable_queue=True).launch()