|
|
|
from pathlib import Path |
|
import torchaudio |
|
import gradio as gr |
|
|
|
import numpy as np |
|
|
|
import torch |
|
|
|
|
|
from hifigan.config import v1 |
|
from hifigan.denoiser import Denoiser |
|
from hifigan.env import AttrDict |
|
from hifigan.models import Generator as HiFiGAN |
|
|
|
|
|
|
|
|
|
|
|
|
|
from pflow.models.pflow_tts import pflowTTS |
|
from pflow.text import text_to_sequence, sequence_to_text |
|
from pflow.utils.utils import intersperse |
|
from pflow.data.text_mel_datamodule import mel_spectrogram |
|
from pflow.utils.model import normalize |
|
|
|
|
|
|
|
# Configuration wrapped in an attribute dict and passed to the BigVGAN
# generator by ``load_bigvgan``; only ``seed`` is read directly in this file.
# The section headings below are inferred from the key names.
BIGVGAN_CONFIG = {
    # --- run / optimiser settings ---
    "resblock": "1",
    "num_gpus": 0,
    "batch_size": 32,
    "learning_rate": 0.0001,
    "adam_b1": 0.8,
    "adam_b2": 0.99,
    "lr_decay": 0.999,
    "seed": 1234,
    # --- upsampling / residual block layout ---
    "upsample_rates": [4, 4, 2, 2, 2, 2],
    "upsample_kernel_sizes": [8, 8, 4, 4, 4, 4],
    "upsample_initial_channel": 1536,
    "resblock_kernel_sizes": [3, 7, 11],
    "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
    # --- activation ---
    "activation": "snakebeta",
    "snake_logscale": True,
    # --- discriminator-related keys ---
    "resolutions": [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]],
    "mpd_reshapes": [2, 3, 5, 7, 11],
    "use_spectral_norm": False,
    "discriminator_channel_mult": 1,
    # --- audio framing / mel settings ---
    "segment_size": 8192,
    "num_mels": 80,
    "num_freq": 1025,
    "n_fft": 1024,
    "hop_size": 256,
    "win_size": 1024,
    "sampling_rate": 22050,
    "fmin": 0,
    "fmax": 8000,
    "fmax_for_loss": None,
    # --- data loading / distributed keys ---
    "num_workers": 4,
    "dist_config": {
        "dist_backend": "nccl",
        "dist_url": "tcp://localhost:54321",
        "world_size": 1,
    },
}
|
|
|
# Checkpoint locations, given relative to the process working directory.
PFLOW_MODEL_PATH = "checkpoint_epoch=499.ckpt"  # P-Flow acoustic model
VOCODER_MODEL_PATH = "g_00120000"  # HiFi-GAN generator
VOCODER_BIGVGAN_MODEL_PATH = "g_05000000"  # BigVGAN generator (not loaded in the code visible here)
|
|
|
# Reference ("prompt") audio. Its mel-spectrogram is normalised and passed to
# the model as ``prompt`` on every ``synthesise`` call.
_PROMPT_SAMPLE_RATE = 22050

wav, sr = torchaudio.load('prompt.wav')
if sr != _PROMPT_SAMPLE_RATE:
    # The mel settings below are fixed to 22.05 kHz. Resample instead of
    # silently computing a spectrogram on the wrong time/frequency scale.
    wav = torchaudio.functional.resample(wav, sr, _PROMPT_SAMPLE_RATE)
    sr = _PROMPT_SAMPLE_RATE

# Positional arguments are presumably n_fft, n_mels, sample rate, hop size,
# window size, fmin and fmax (they match BIGVGAN_CONFIG) - confirm against
# ``mel_spectrogram``'s signature.
prompt = mel_spectrogram(
    wav,
    1024,
    80,
    _PROMPT_SAMPLE_RATE,
    256,
    1024,
    0,
    8000,
    center=False,
)[:, :, :264]  # keep at most the first 264 entries along the last axis
|
|
|
|
|
|
|
def process_text(text: str, device: torch.device):
    """Turn raw text into the tensors consumed by the acoustic model.

    The text is converted with ``text_to_sequence`` using the
    ``ukr_cleaners`` pipeline and the result is passed through
    ``intersperse(..., 0)``.

    Args:
        text: Text to convert.
        device: Device on which the tensors are created.

    Returns:
        A dict with four entries:
            ``x_orig``: the text exactly as given;
            ``x``: long tensor of ids with a leading axis of size 1;
            ``x_lengths``: one-element long tensor holding ``x.shape[-1]``;
            ``x_phones``: the ids of ``x`` mapped back through
            ``sequence_to_text``.
    """
    token_ids = intersperse(text_to_sequence(text, ["ukr_cleaners"]), 0)

    # unsqueeze(0) adds the leading axis (same as indexing with [None]).
    x = torch.tensor(token_ids, dtype=torch.long, device=device).unsqueeze(0)
    x_lengths = torch.tensor([x.shape[-1]], dtype=torch.long, device=device)
    x_phones = sequence_to_text(x[0].tolist())

    return {
        "x_orig": text,
        "x": x,
        "x_lengths": x_lengths,
        "x_phones": x_phones,
    }
|
|
|
|
|
|
|
|
|
def load_hifigan(checkpoint_path, device):
    """Build a HiFi-GAN generator with the ``v1`` config and load its weights.

    Args:
        checkpoint_path: Path of a checkpoint readable by ``torch.load`` that
            stores the generator weights under the ``"generator"`` key.
        device: Device the generator is moved to and the checkpoint mapped to.

    Returns:
        The generator in eval mode with weight normalisation removed.
    """
    config = AttrDict(v1)
    generator = HiFiGAN(config).to(device)

    checkpoint = torch.load(checkpoint_path, map_location=device)
    generator.load_state_dict(checkpoint["generator"])

    generator.eval()
    generator.remove_weight_norm()
    return generator
|
|
|
|
|
def load_bigvgan(checkpoint_path, device):
    """Build a BigVGAN generator from ``BIGVGAN_CONFIG`` and load its weights.

    NOTE(review): ``BigVGANAttrDict`` and ``BigVGAN`` are not imported in the
    visible part of this file. Unless they are brought into scope elsewhere,
    calling this function fails with ``NameError`` after the checkpoint has
    been read.

    Side effects: prints the checkpoint path and seeds torch's global RNG
    with the ``seed`` value from the config.

    Args:
        checkpoint_path: Path of a checkpoint readable by ``torch.load`` that
            stores the generator weights under the ``"generator"`` key.
        device: Device the generator is moved to and the checkpoint mapped to.

    Returns:
        The generator in eval mode with weight normalisation removed.
    """
    print(f"Loading '{checkpoint_path}'")
    state = torch.load(checkpoint_path, map_location=device)

    config = BigVGANAttrDict(BIGVGAN_CONFIG)
    torch.manual_seed(config.seed)

    model = BigVGAN(config).to(device)
    model.load_state_dict(state['generator'])
    model.eval()
    model.remove_weight_norm()
    return model
|
|
|
|
|
def to_waveform(mel, vocoder, denoiser=None, denoiser_strength=0.00025):
    """Run a mel-spectrogram through the vocoder and return CPU audio.

    Args:
        mel: Mel-spectrogram tensor, passed to ``vocoder`` unchanged.
        vocoder: Callable mapping ``mel`` to an audio tensor.
        denoiser: Optional callable ``denoiser(audio, strength=...)`` applied
            to the squeezed vocoder output. Skipped when ``None``.
        denoiser_strength: Value forwarded as ``strength`` to ``denoiser``.
            The default keeps the previously hard-coded ``0.00025``.

    Returns:
        The audio tensor on the CPU with all size-1 dimensions removed. The
        vocoder output is clamped to ``[-1, 1]``; the denoiser output is not
        clamped again.
    """
    audio = vocoder(mel).clamp(-1, 1)
    if denoiser is not None:
        audio = denoiser(audio.squeeze(), strength=denoiser_strength)

    # One device transfer / squeeze covers both branches.
    return audio.cpu().squeeze()
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_device():
    """Return the CUDA device when available, otherwise the CPU device.

    Prints one line announcing which device was selected.
    """
    if not torch.cuda.is_available():
        print("[-] GPU not available or forced CPU run! Using CPU")
        return torch.device("cpu")

    print("[+] GPU Available! Using GPU")
    return torch.device("cuda")
|
|
|
|
|
# Module-level state shared by every ``synthesise`` call. It is created once,
# when this module is imported or run.
device = get_device()

# P-Flow acoustic model, restored from its checkpoint and put in eval mode.
model = pflowTTS.load_from_checkpoint(PFLOW_MODEL_PATH, map_location=device)
_ = model.eval()

# HiFi-GAN vocoder plus the denoiser built around it.
vocoder = load_hifigan(checkpoint_path=VOCODER_MODEL_PATH, device=device)
denoiser = Denoiser(vocoder, mode="zeros")
|
|
|
@torch.inference_mode()
def synthesise(text, temperature, speed):
    """Synthesise speech for ``text`` with the module-level model and vocoder.

    Args:
        text: Text to speak. Its length is checked before stripping; the
            stripped text is what gets processed.
        temperature: Forwarded to ``model.synthesise`` as ``temperature``.
        speed: Speaking-rate factor; the model receives ``1 / speed`` as
            ``length_scale``.

    Returns:
        A pair ``(phonemes, (22050, samples))`` where ``phonemes`` is every
        second element of the ``x_phones`` string starting at index 1
        (presumably skipping the symbols that come from the interspersed
        ``0`` ids - confirm) and ``samples`` is the waveform as a NumPy array.

    Raises:
        gr.Error: If ``text`` is longer than 1000 characters.
    """
    if len(text) > 1000:
        raise gr.Error("Текст повинен бути коротшим за 1000 символів.")

    inputs = process_text(text.strip(), device)

    # The prompt mel is normalised with the model's own mel statistics.
    conditioning = normalize(prompt, model.mel_mean, model.mel_std).to(device)

    result = model.synthesise(
        inputs["x"].to(device),
        inputs["x_lengths"].to(device),
        n_timesteps=40,
        temperature=temperature,
        length_scale=1 / speed,
        prompt=conditioning,
    )

    audio = to_waveform(result["mel"], vocoder, denoiser)
    phonemes = inputs['x_phones'][1::2]
    return phonemes, (22050, audio.numpy())
|
|
|
|
|
# Markdown shown at the top of the Gradio page. The text is user-facing and
# interpolates the checkpoint names in use.
description = f"""
# Експериментальна апка для генерації аудіо з тексту.

pflow checkpoint {PFLOW_MODEL_PATH}
vocoder: HIFIGAN(трейнутий на датасеті, з нуля) - {VOCODER_MODEL_PATH}
"""
|
|
|
|
|
if __name__ == "__main__":
    # Build the web UI: one text box and two sliders in, the phonemised text
    # and the generated audio out.
    demo = gr.Interface(
        fn=synthesise,
        description=description,
        inputs=[
            gr.Text(label='Текст для синтезу:', lines=5, max_lines=10),
            gr.Slider(minimum=0.0, maximum=1.0, label="Температура", value=0.2),
            gr.Slider(minimum=0.6, maximum=2.0, label="Швидкість", value=1.0),
        ],
        outputs=[
            gr.Text(label='Фонемізований текст:', lines=5),
            gr.Audio(
                label="Згенероване аудіо:",
                autoplay=False,
                streaming=False,
                type="numpy",
            ),
        ],
        allow_flagging='manual',
        # (button label, stored value); label typo fixed: "погоне" -> "погане".
        flagging_options=[("Якщо дуже погане аудіо, тисни цю кнопку.", "negative")],
        # NOTE(review): no ``examples`` are supplied, so this flag looks like
        # a no-op - confirm before removing.
        cache_examples=True,
        title='',
    )

    # At most 20 queued requests, default concurrency limit of 4.
    demo.queue(max_size=20, default_concurrency_limit=4)

    # Listen on all network interfaces; no public share link is created.
    demo.launch(share=False, server_name="0.0.0.0")
|
|