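"""Gradio demo: voice anonymization with FreeVC.

Loads a WavLM content encoder, a speaker encoder, and FreeVC checkpoints
(FreeVC MLS and FreeVC 24kHz are active), then exposes a simple
voice-conversion interface.
"""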
import os
import random
import torch
import librosa
import gradio as gr
from scipy.io.wavfile import write
from transformers import WavLMModel
import utils
from models import SynthesizerTrn
from mel_processing import mel_spectrogram_torch
from speaker_encoder.voice_encoder import SpeakerEncoder
import logging
logging.basicConfig(level=logging.INFO)
# Disabled helper kept for reference: downloads the WavLM-Large checkpoint
# (would also require `import shutil` and the gdown CLI).
'''
def get_wavlm():
    os.system('gdown https://drive.google.com/uc?id=12-cB34qCTvByWT-QtOcZaqwwO21FLSqU')
    shutil.move('WavLM-Large.pt', 'wavlm')
'''
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
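# Speaker encoder used to compute target speaker embeddings for the
# speaker-conditioned FreeVC variants.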
smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt')
'''
print("Loading FreeVC...")
hps = utils.get_hparams_from_file("configs/freevc.json")
freevc = SynthesizerTrn(
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model).to(device)
_ = freevc.eval()
_ = utils.load_checkpoint("checkpoints/freevc.pth", freevc, None)
'''
print("Loading FreeVC(24k)...")
hps = utils.get_hparams_from_file("configs/freevc-24.json")
freevc_24 = SynthesizerTrn(
hps.data.filter_length // 2 + 1,
hps.train.segment_size // hps.data.hop_length,
**hps.model).to(device)
_ = freevc_24.eval()
_ = utils.load_checkpoint("checkpoints/freevc-24.pth", freevc_24, None)
'''
print("Loading FreeVC-s...")
hps = utils.get_hparams_from_file("configs/freevc-s.json")
freevc_s = SynthesizerTrn(
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model).to(device)
_ = freevc_s.eval()
_ = utils.load_checkpoint("checkpoints/freevc-s.pth", freevc_s, None)
print("Loading FreeVC-cvfr...")
hps = utils.get_hparams_from_file("configs/freevc_nosr_cvfr.json")
freevc_cvfr = SynthesizerTrn(
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model).to(device)
_ = freevc_cvfr.eval()
_ = utils.load_checkpoint("checkpoints/freevc-cvfr.pth", freevc_cvfr, None)
'''
print("Loading FreeVC-mls...")
hps = utils.get_hparams_from_file("configs/freevc_nosr_mls.json")
freevc_mls = SynthesizerTrn(
hps.data.filter_length // 2 + 1,
hps.train.segment_size // hps.data.hop_length,
**hps.model).to(device)
_ = freevc_mls.eval()
_ = utils.load_checkpoint("checkpoints/freevc-mls.pth", freevc_mls, None)
print("Loading WavLM for content...")
cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device)
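# WavLM returns hidden states shaped (batch, frames, 1024); convert() below
# transposes them to (batch, 1024, frames) for use as FreeVC content features.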
def get_random_wav_from_directory(directory, gender=None):
    """
    Get a random WAV file from a directory.
    If gender is specified, fetch a male or female WAV accordingly.
    """
    all_files = [f for f in os.listdir(directory) if f.endswith('.wav')]
    if gender == "male":
        # "female" also contains the substring "male", so exclude it explicitly.
        all_files = [f for f in all_files if "male" in f and "female" not in f]
    elif gender == "female":
        all_files = [f for f in all_files if "female" in f]
    if not all_files:
        raise FileNotFoundError(f"No matching .wav files in {directory}")
    return os.path.join(directory, random.choice(all_files))
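# Example (assuming mls_samples/ contains gendered filenames such as
# "*_female_*.wav"): get_random_wav_from_directory("mls_samples", "female")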
def convert(model, src_mic, src_file, reference_option):
    """
    Pick the source audio (microphone input takes precedence over an
    uploaded file), draw a random reference voice, and run conversion.
    """
    src = None
    # Gradio passes None (or a placeholder) when the microphone is unused.
    if src_mic and src_mic != "-":
        src = src_mic
    elif src_file:
        src = src_file
    if not src:
        logging.error("Source audio not provided")
        return
    if reference_option == "random":
        tgt = get_random_wav_from_directory("mls_samples")
    elif reference_option == "random (male)":
        tgt = get_random_wav_from_directory("mls_samples", "male")
    elif reference_option == "random (female)":
        tgt = get_random_wav_from_directory("mls_samples", "female")
    else:
        logging.error("Unrecognized reference option")
        return
    with torch.no_grad():
        # Target: speaker embedding for the speaker-conditioned models,
        # mel spectrogram for FreeVC-s.
        wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate)
        wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
        if model in ("FreeVC", "FreeVC (24kHz)", "FreeVC CVFR", "FreeVC MLS"):
            g_tgt = smodel.embed_utterance(wav_tgt)
            g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device)
        else:
            wav_tgt = torch.from_numpy(wav_tgt).unsqueeze(0).to(device)
            mel_tgt = mel_spectrogram_torch(
                wav_tgt,
                hps.data.filter_length,
                hps.data.n_mel_channels,
                hps.data.sampling_rate,
                hps.data.hop_length,
                hps.data.win_length,
                hps.data.mel_fmin,
                hps.data.mel_fmax
            )
        # Source: extract WavLM content features.
        wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate)
        wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device)
        c = cmodel(wav_src).last_hidden_state.transpose(1, 2).to(device)
        # Inference: only FreeVC MLS and FreeVC (24kHz) are loaded above;
        # the other branches need their loaders uncommented.
        if model == "FreeVC":
            audio = freevc.infer(c, g=g_tgt)
        elif model == "FreeVC-s":
            audio = freevc_s.infer(c, mel=mel_tgt)
        elif model == "FreeVC CVFR":
            audio = freevc_cvfr.infer(c, g=g_tgt)
        elif model == "FreeVC MLS":
            audio = freevc_mls.infer(c, g=g_tgt)
        else:
            audio = freevc_24.infer(c, g=g_tgt)
        audio = audio[0][0].data.cpu().float().numpy()
    # FreeVC (24kHz) outputs 24 kHz audio; the other models keep hps' rate.
    if model in ("FreeVC", "FreeVC-s", "FreeVC CVFR", "FreeVC MLS"):
        write("out.wav", hps.data.sampling_rate, audio)
    else:
        write("out.wav", 24000, audio)
    return "out.wav"
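# Minimal sketch of a direct (non-UI) call using one of the bundled examples:
#   out_path = convert("FreeVC MLS", None, "SAMPLE_HUGO_METEO.wav", "random (female)")
#   # -> writes and returns "out.wav" at hps.data.sampling_rate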
model = gr.Dropdown(choices=["FreeVC MLS", "FreeVC (24kHz)"], value="FreeVC MLS", type="value", label="Model")
audio1_mic = gr.Audio(source="microphone", type="filepath", label="Record your voice")
audio1_file = gr.Audio(source="upload", type="filepath", label="or upload an audio file")
reference_dropdown = gr.Dropdown(choices=["random", "random (male)", "random (female)"], value="random", label="Reference voice")
inputs = [model, audio1_mic, audio1_file, reference_dropdown]
outputs = gr.Audio(label="Output Audio", type="filepath")
title = "Démonstration d'Anonymisation de Voix"
description = ("Cette démo Gradio permet d'anonymiser une voix grâce à une implémentation simple de FreeVC. "
"Elle a été entraînée sur un extrait du jeu de données francophone MLS. Pour l'utiliser, vous pouvez "
"charger un fichier audio, enregistrer votre propre voix, ou choisir parmi des exemples pré-enregistrés. "
"À noter : le checkpoint WavLM dans HuggingFace semble différer légèrement de celui utilisé pour entraîner "
"FreeVC, ce qui pourrait impacter les performances. De plus, la ressemblance entre les voix peut être altérée "
"si l'audio de référence contient trop de silences. Veuillez donc retirer ces silences avant de soumettre "
"votre fichier audio. \n\n"
"<strong>AVERTISSEMENT :</strong> Cette démonstration est à visée pédagogique et il reste encore beaucoup "
"de travail à réaliser pour perfectionner l'outil. Nous déconseillons fortement son utilisation dans un "
"environnement de production.")
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2210.15418' target='_blank'>FreeVC paper</a> | <a href='https://arxiv.org/abs/2110.13900' target='_blank'>WavLM paper</a> | <a href='http://www.openslr.org/94/' target='_blank'>MLS dataset</a></p>"
examples = [
    ["FreeVC MLS", 'SAMPLE_NADINE_MALICIEUX.wav', 'SAMPLE_NADINE_MALICIEUX.wav', 'random (male)'],
    ["FreeVC MLS", 'SAMPLE_HUGO_METEO.wav', 'SAMPLE_HUGO_METEO.wav', 'random (female)'],
    ["FreeVC MLS", 'Julien30sec.wav', 'Julien30sec.wav', 'random (female)'],
]
gr.Interface(convert, inputs, outputs, title=title, description=description, article=article, examples=examples, enable_queue=True).launch()