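"""
Gradio demo: voice anonymization with FreeVC, trained on a subset of the
French MLS dataset. Content is extracted from the source recording with
WavLM and re-synthesized in the voice of a random reference speaker.
"""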
import os
import random
import torch
import librosa
import gradio as gr
from scipy.io.wavfile import write
from transformers import WavLMModel
import utils
from models import SynthesizerTrn
from mel_processing import mel_spectrogram_torch
from speaker_encoder.voice_encoder import SpeakerEncoder
import logging
logging.basicConfig(level=logging.INFO)
'''
def get_wavlm():
    os.system('gdown https://drive.google.com/uc?id=12-cB34qCTvByWT-QtOcZaqwwO21FLSqU')
    shutil.move('WavLM-Large.pt', 'wavlm')
'''
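# Run on GPU when available; the speaker encoder produces the utterance-level
# embedding used to condition FreeVC on the target voice (g_tgt in convert()).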
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt')
'''
print("Loading FreeVC...")
hps = utils.get_hparams_from_file("configs/freevc.json")
freevc = SynthesizerTrn(
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model).to(device)
_ = freevc.eval()
_ = utils.load_checkpoint("checkpoints/freevc.pth", freevc, None)
'''
print("Loading FreeVC(24k)...")
hps = utils.get_hparams_from_file("configs/freevc-24.json")
freevc_24 = SynthesizerTrn(
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model).to(device)
_ = freevc_24.eval()
_ = utils.load_checkpoint("checkpoints/freevc-24.pth", freevc_24, None)
'''
print("Loading FreeVC-s...")
hps = utils.get_hparams_from_file("configs/freevc-s.json")
freevc_s = SynthesizerTrn(
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model).to(device)
_ = freevc_s.eval()
_ = utils.load_checkpoint("checkpoints/freevc-s.pth", freevc_s, None)
print("Loading FreeVC-cvfr...")
hps = utils.get_hparams_from_file("configs/freevc_nosr_cvfr.json")
freevc_cvfr = SynthesizerTrn(
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model).to(device)
_ = freevc_cvfr.eval()
_ = utils.load_checkpoint("checkpoints/freevc-cvfr.pth", freevc_cvfr, None)
'''
print("Loading FreeVC-mls...")
hps = utils.get_hparams_from_file("configs/freevc_nosr_mls.json")
freevc_mls = SynthesizerTrn(
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model).to(device)
_ = freevc_mls.eval()
_ = utils.load_checkpoint("checkpoints/freevc-mls.pth", freevc_mls, None)
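# NOTE: `hps` now holds the last config loaded (freevc_nosr_mls.json);
# convert() below relies on it for the sampling rate.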
print("Loading WavLM for content...")
cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device)
def get_random_wav_from_directory(directory, gender=None):
    """
    Get a random WAV file from a directory.
    If gender is specified, it fetches a male or female WAV accordingly.
    """
    all_files = [f for f in os.listdir(directory) if f.endswith('.wav')]
    if gender == "male":
        # "female" contains the substring "male", so exclude it explicitly
        all_files = [f for f in all_files if "male" in f and "female" not in f]
    elif gender == "female":
        all_files = [f for f in all_files if "female" in f]
    return os.path.join(directory, random.choice(all_files))
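# Example (hypothetical filenames): if mls_samples/ contains "1234_male.wav" and
# "5678_female.wav", get_random_wav_from_directory("mls_samples", "female")
# would return "mls_samples/5678_female.wav".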
def convert(model, src_mic, src_file, reference_option):
    """
    Check where the source audio comes from (microphone or uploaded file),
    pick a random reference voice, and run the selected model.
    """
    src = None
    if src_mic and src_mic != "-":
        src = src_mic
    elif src_file:
        src = src_file
    if not src:
        logging.error("Source audio not provided")
        return
    if reference_option == "random":
        tgt = get_random_wav_from_directory("mls_samples")
    elif reference_option == "random (male)":
        tgt = get_random_wav_from_directory("mls_samples", "male")
    elif reference_option == "random (female)":
        tgt = get_random_wav_from_directory("mls_samples", "female")
    else:
        logging.error("Unrecognized reference option")
        return
    with torch.no_grad():
        # Target: trim silence and compute the speaker embedding g_tgt.
        wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate)
        wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
        g_tgt = smodel.embed_utterance(wav_tgt)
        g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device)
        # Source: extract content features with WavLM.
        wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate)
        wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device)
        c = cmodel(wav_src).last_hidden_state.transpose(1, 2).to(device)
        # Inference: only FreeVC MLS and FreeVC (24kHz) are loaded above;
        # the other variants (FreeVC, FreeVC-s, FreeVC-cvfr) are commented out.
        if model == "FreeVC MLS":
            audio = freevc_mls.infer(c, g=g_tgt)
        else:
            audio = freevc_24.infer(c, g=g_tgt)
        audio = audio[0][0].data.cpu().float().numpy()
        if model == "FreeVC MLS":
            write("out.wav", hps.data.sampling_rate, audio)
        else:
            write("out.wav", 24000, audio)
    out = "out.wav"
    return out
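# Minimal sketch of calling convert() directly, bypassing the Gradio UI
# (uses one of the bundled example files shipped with this Space):
#   out_path = convert("FreeVC MLS", None, "SAMPLE_HUGO_METEO.wav", "random (male)")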
model = gr.Dropdown(choices=["FreeVC MLS", "FreeVC (24kHz)"], value="FreeVC MLS", type="value", label="Model")
audio1_mic = gr.Audio(source="microphone", type="filepath", label="Record your voice")
audio1_file = gr.Audio(type="filepath", label="Or upload an audio file")
reference_dropdown = gr.Dropdown(choices=["random", "random (male)", "random (female)"], value="random", label="Reference voice")
inputs = [model, audio1_mic, audio1_file, reference_dropdown]
outputs = gr.Audio(label="Output Audio", type="filepath")
title = "Démonstration d'Anonymisation de Voix"
description = ("Cette démo Gradio permet d'anonymiser une voix grâce à une implémentation simple de FreeVC. "
"Elle a été entraînée sur un extrait du jeu de données francophone MLS. Pour l'utiliser, vous pouvez "
"charger un fichier audio, enregistrer votre propre voix, ou choisir parmi des exemples pré-enregistrés. "
"À noter : le checkpoint WavLM dans HuggingFace semble différer légèrement de celui utilisé pour entraîner "
"FreeVC, ce qui pourrait impacter les performances. De plus, la ressemblance entre les voix peut être altérée "
"si l'audio de référence contient trop de silences. Veuillez donc retirer ces silences avant de soumettre "
"votre fichier audio. \n\n"
"<strong>AVERTISSEMENT :</strong> Cette démonstration est à visée pédagogique et il reste encore beaucoup "
"de travail à réaliser pour perfectionner l'outil. Nous déconseillons fortement son utilisation dans un "
"environnement de production.")
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2210.15418' target='_blank'>Article FreeVC</a> | <a href='https://arxiv.org/abs/2110.13900' target='_blank'>Article WavLM</a> | <a href='http://www.openslr.org/94/' target='_blank'>Jeu de données MLS</a></p>"
examples=[["FreeVC MLS",'SAMPLE_NADINE_MALICIEUX.wav','SAMPLE_NADINE_MALICIEUX.wav', 'aléatoire (homme)'], ["FreeVC MLS",'SAMPLE_HUGO_METEO.wav','SAMPLE_HUGO_METEO.wav', 'aléatoire (femme)'],["FreeVC MLS",'Julien30sec.wav','Julien30sec.wav', 'aléatoire (femme)'],]
gr.Interface(convert, inputs, outputs, title=title, description=description, article=article, examples=examples, enable_queue=True).launch()