|
|
|
from pathlib import Path |
|
import torchaudio |
|
import gradio as gr |
|
|
|
import numpy as np |
|
|
|
import torch |
|
import json |
|
|
|
|
|
from hifigan.config import v1 |
|
from hifigan.denoiser import Denoiser |
|
from hifigan.env import AttrDict |
|
from hifigan.models import Generator as HiFiGAN |
|
|
|
|
|
from pflow.models.pflow_tts import pflowTTS |
|
from pflow.text import text_to_sequence, sequence_to_text |
|
from pflow.utils.utils import intersperse |
|
from pflow.data.text_mel_datamodule import mel_spectrogram |
|
from pflow.utils.model import normalize |
|
from vocos import Vocos |
|
|
|
|
|
|
|
|
|
|
|
# Checkpoint locations. They are relative paths, so they resolve against the
# directory the app is started from.
PFLOW_MODEL_PATH = "checkpoints/checkpoint_epoch=649.ckpt"  # loaded by pflowTTS.load_from_checkpoint
VOCODER_MODEL_PATH = "checkpoints/pytorch_model.bin"  # Vocos weights, passed to load_vocos
HIFIGAN_MODEL_PATH = "checkpoints/g_00120000"  # not referenced anywhere in the visible code
|
|
|
|
|
# Sample rate handed to mel_spectrogram below. It matches the rate that
# synthesise reports for the generated audio.
PROMPT_SAMPLE_RATE = 22050

# Volume transform with a -32 dB gain. It is not applied anywhere in the visible
# code; it stays because it is a module-level name other code may rely on.
transform = torchaudio.transforms.Vol(gain=-32, gain_type="db")

# Reference recording that is turned into the prompt used by every request.
wav, sr = torchaudio.load("prompt.wav")
if sr != PROMPT_SAMPLE_RATE:
    # The file's own rate used to be ignored, so a recording at any other rate
    # was analysed as if it were 22050 Hz. Resample it to the expected rate.
    wav = torchaudio.functional.resample(
        wav, orig_freq=sr, new_freq=PROMPT_SAMPLE_RATE
    )
    sr = PROMPT_SAMPLE_RATE
# NOTE(review): a multi-channel file is passed through unchanged -- confirm
# that prompt.wav is mono.

# NOTE(review): the positional arguments are presumably n_fft, number of mel
# bins, sample rate, hop size, window size, fmin and fmax -- confirm against
# pflow.data.text_mel_datamodule.mel_spectrogram.
prompt = mel_spectrogram(
    wav,
    1024,
    80,
    PROMPT_SAMPLE_RATE,
    256,
    1024,
    0,
    8000,
    center=False,
)[:, :, :264]  # keep at most the first 264 entries of the last axis
|
|
|
|
|
|
|
|
|
def process_text(text: str, device: torch.device):
    """Convert raw text into the tensors passed to the synthesiser.

    The text is turned into an id sequence by ``text_to_sequence`` using the
    ``ukr_cleaners`` cleaner, and that sequence is passed through
    ``intersperse`` with the value ``0``.

    Args:
        text: Text to convert.
        device: Device on which the tensors are created.

    Returns:
        A dict with four entries: ``x_orig`` is the text as given, ``x`` is the
        id tensor with a leading axis of size one, ``x_lengths`` is a
        one-element tensor holding the size of the last axis of ``x``, and
        ``x_phones`` is the id sequence rendered back by ``sequence_to_text``.
    """
    token_ids = intersperse(text_to_sequence(text, ["ukr_cleaners"]), 0)

    # unsqueeze(0) adds the leading size-one axis.
    x = torch.tensor(token_ids, dtype=torch.long, device=device).unsqueeze(0)
    x_lengths = torch.tensor([x.shape[-1]], dtype=torch.long, device=device)

    # Render the ids back to text so the caller can display them.
    x_phones = sequence_to_text(x.squeeze(0).tolist())

    return {
        "x_orig": text,
        "x": x,
        "x_lengths": x_lengths,
        "x_phones": x_phones,
    }
|
|
|
|
|
|
|
|
|
def load_hifigan(checkpoint_path, device):
    """Build a HiFi-GAN generator from the ``v1`` config and load its weights.

    Args:
        checkpoint_path: Checkpoint file whose ``"generator"`` entry holds the
            generator state dict.
        device: Device the generator is moved to; the checkpoint is mapped
            onto the same device while loading.

    Returns:
        The generator, in eval mode and with weight norm removed.
    """
    config = AttrDict(v1)
    generator = HiFiGAN(config).to(device)

    checkpoint = torch.load(checkpoint_path, map_location=device)
    generator.load_state_dict(checkpoint["generator"])

    generator.eval()
    generator.remove_weight_norm()
    return generator
|
|
|
|
|
|
|
|
|
def load_vocos(checkpoint_path, config_path, device):
    """Create a Vocos vocoder from a config file and load its weights.

    Args:
        checkpoint_path: File read with ``torch.load``. It is either a bare
            state dict or a dict that carries one under ``"state_dict"``.
        config_path: Config file passed to ``Vocos.from_hparams``.
        device: Device the returned model is moved to. ``None`` leaves the
            model where it was created.

    Returns:
        The Vocos model in eval mode, placed on ``device``.
    """
    model = Vocos.from_hparams(config_path)

    # Deserialise onto the CPU; the weights travel with the model when it is
    # moved to the target device below.
    checkpoint = torch.load(checkpoint_path, map_location=torch.device("cpu"))
    state_dict = checkpoint["state_dict"] if "state_dict" in checkpoint else checkpoint

    # strict=False: missing or unexpected keys are not treated as errors.
    model.load_state_dict(state_dict, strict=False)

    # The device argument used to be ignored, so the vocoder stayed on the CPU
    # even when the caller asked for another device.
    if device is not None:
        model.to(device)

    model.eval()
    return model
|
|
|
|
|
def to_waveform(mel, vocoder, denoiser=None):
    """Decode a mel spectrogram with ``vocoder`` and return the result on the CPU.

    Args:
        mel: Input passed unchanged to ``vocoder.decode``.
        vocoder: Object exposing a ``decode(mel)`` method that returns a tensor.
        denoiser: Accepted but not used.

    Returns:
        The decoded tensor, moved to the CPU with all size-one axes removed.
    """
    decoded = vocoder.decode(mel)
    on_cpu = decoded.cpu()
    return on_cpu.squeeze()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_device():
    """Choose the torch device to run on and print which one was picked.

    Returns:
        ``torch.device("cuda")`` when CUDA is available, otherwise
        ``torch.device("cpu")``.
    """
    if not torch.cuda.is_available():
        print("[-] GPU not available or forced CPU run! Using CPU")
        return torch.device("cpu")

    print("[+] GPU Available! Using GPU")
    return torch.device("cuda")
|
|
|
|
|
# Everything below runs once at import time; synthesise reads these globals on
# every call.
device = get_device()

# pflowTTS model restored from its checkpoint and switched to eval mode.
model = pflowTTS.load_from_checkpoint(PFLOW_MODEL_PATH, map_location=device)
_ = model.eval()

# Vocos vocoder that decodes the generated mel into audio.
vocos = load_vocos(
    checkpoint_path=VOCODER_MODEL_PATH,
    config_path="config.yaml",
    device=device,
)

# No denoiser is configured.
denoiser = None
|
|
|
|
|
@torch.inference_mode()
def synthesise(text, speed):
    """Synthesise speech for ``text`` and return the phonemes and the audio.

    Args:
        text: Text to speak. Its length is checked before whitespace is
            stripped and may not exceed 1000 characters.
        speed: Speaking-rate factor; the model's ``length_scale`` is set to
            ``1 / speed``.

    Returns:
        A tuple ``(phonemes, (22050, samples))``. ``phonemes`` is every second
        character of the phonemised text, starting at index 1. ``samples`` is
        the decoded audio as a NumPy array.

    Raises:
        gr.Error: If ``text`` is longer than 1000 characters, or is empty once
            surrounding whitespace is stripped.
    """
    if len(text) > 1000:
        raise gr.Error("Текст повинен бути коротшим за 1000 символів.")

    text = text.strip()
    if not text:
        # Nothing to speak: tell the user instead of running the model on an
        # empty sequence.
        raise gr.Error("Введіть текст для синтезу.")

    text_processed = process_text(text, device)

    output = model.synthesise(
        text_processed["x"].to(device),
        text_processed["x_lengths"].to(device),
        n_timesteps=40,
        temperature=0.0,
        length_scale=1 / speed,
        prompt=normalize(prompt, model.mel_mean, model.mel_std).to(device),
        guidance_scale=1.5,
    )

    # Decode on whichever device the vocoder lives on, so the call works
    # whether or not the vocoder shares a device with the acoustic model.
    vocoder_device = next(vocos.parameters()).device
    waveform_vocos = vocos.decode(output["mel"].to(vocoder_device)).squeeze()

    # .cpu() is required before .numpy() when the audio was produced on a GPU.
    # [1::2] presumably drops the interspersed blank symbols from the phoneme
    # string -- confirm against intersperse and sequence_to_text.
    return text_processed["x_phones"][1::2], (22050, waveform_vocos.cpu().numpy())
|
|
|
|
|
# Text passed to gr.Interface as its description; the checkpoint paths are
# interpolated so the page states which weights are being served.
description = f"""
# Експериментальна апка для генерації аудіо з тексту.

pflow checkpoint {PFLOW_MODEL_PATH}
vocoder: Vocos - {VOCODER_MODEL_PATH}
"""
|
|
|
|
|
if __name__ == "__main__":
    # Gradio UI: text and speed go in, phonemised text and audio come out.
    demo = gr.Interface(
        fn=synthesise,
        description=description,
        inputs=[
            gr.Text(label='Текст для синтезу:', lines=5, max_lines=10),
            gr.Slider(minimum=0.6, maximum=2.0, label="Швидкість", value=1.0),
        ],
        outputs=[
            gr.Text(label='Фонемізований текст:', lines=5),
            gr.Audio(
                label="Vocos аудіо:",
                autoplay=False,
                streaming=False,
                type="numpy",
            ),
        ],
        allow_flagging="manual",
        # Each option is a (label, value) pair; the label is the text shown to
        # the user. The label reads "погане" ("bad"), replacing the misspelt
        # "погоне".
        flagging_options=[("Якщо дуже погане аудіо, тисни цю кнопку.", "negative")],
        # NOTE(review): no examples are passed, so there appears to be nothing
        # for cache_examples to cache -- confirm whether it can be dropped.
        cache_examples=True,
        title="",
    )

    # Queue settings: max_size=20 and a default concurrency limit of 4.
    demo.queue(max_size=20, default_concurrency_limit=4)

    # Bind to 0.0.0.0 with sharing turned off.
    demo.launch(share=False, server_name="0.0.0.0")
|
|