Spaces:

patriotyk
/

pflowtts_ukr_demo

Serhiy Stetskovych

Move prompt device.

428ef88 9 months ago

5.96 kB


	from pathlib import Path
	import torchaudio
	import gradio as gr

	import numpy as np

	import torch


	from hifigan.config import v1
	from hifigan.denoiser import Denoiser
	from hifigan.env import AttrDict
	from hifigan.models import Generator as HiFiGAN


	#from BigVGAN.models import BigVGAN
	#from BigVGAN.env import AttrDict as BigVGANAttrDict


	from pflow.models.pflow_tts import pflowTTS
	from pflow.text import text_to_sequence, sequence_to_text
	from pflow.utils.utils import intersperse
	from pflow.data.text_mel_datamodule import mel_spectrogram
	from pflow.utils.model import normalize



	# Hyper-parameter set for the optional BigVGAN vocoder.  In this file it is
	# only consumed by load_bigvgan(), which wraps it in an attribute dict and
	# reads `seed` directly; the rest is handed to the BigVGAN constructor.
	BIGVGAN_CONFIG = {
	    # -- optimisation / run settings (names follow the BigVGAN config file) --
	    "resblock": "1",
	    "num_gpus": 0,
	    "batch_size": 32,
	    "learning_rate": 0.0001,
	    "adam_b1": 0.8,
	    "adam_b2": 0.99,
	    "lr_decay": 0.999,
	    "seed": 1234,
	    # -- generator layout --
	    "upsample_rates": [4, 4, 2, 2, 2, 2],
	    "upsample_kernel_sizes": [8, 8, 4, 4, 4, 4],
	    "upsample_initial_channel": 1536,
	    "resblock_kernel_sizes": [3, 7, 11],
	    "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
	    # -- activation --
	    "activation": "snakebeta",
	    "snake_logscale": True,
	    # -- discriminator settings --
	    "resolutions": [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]],
	    "mpd_reshapes": [2, 3, 5, 7, 11],
	    "use_spectral_norm": False,
	    "discriminator_channel_mult": 1,
	    # -- audio / spectrogram settings --
	    "segment_size": 8192,
	    "num_mels": 80,
	    "num_freq": 1025,
	    "n_fft": 1024,
	    "hop_size": 256,
	    "win_size": 1024,
	    "sampling_rate": 22050,
	    "fmin": 0,
	    "fmax": 8000,
	    "fmax_for_loss": None,
	    # -- data loading / distributed settings --
	    "num_workers": 4,
	    "dist_config": {
	        "dist_backend": "nccl",
	        "dist_url": "tcp://localhost:54321",
	        "world_size": 1,
	    },
	}

	# Checkpoint file names (relative paths, so they are looked up from the
	# process working directory).
	PFLOW_MODEL_PATH = 'checkpoint_epoch=499.ckpt'
	VOCODER_MODEL_PATH = 'g_00120000'
	VOCODER_BIGVGAN_MODEL_PATH = 'g_05000000'

	# Reference recording that is turned into the mel prompt passed to the model
	# on every synthesis call.
	# NOTE(review): `sr` is never checked against the 22050 passed below --
	# confirm prompt.wav is stored at that rate.
	wav, sr = torchaudio.load('prompt.wav')

	# Positional arguments are presumably n_fft, num_mels, sampling_rate,
	# hop_size, win_size, fmin, fmax (the values equal the same-named
	# BIGVGAN_CONFIG entries) -- confirm against mel_spectrogram's signature.
	prompt = mel_spectrogram(wav, 1024, 80, 22050, 256, 1024, 0, 8000, center=False)
	# Keep at most the first 264 entries along the last axis.
	prompt = prompt[:, :, :264]



	def process_text(text: str, device: torch.device):
	    """Convert raw text into the tensors expected by the TTS model.

	    The text is run through ``text_to_sequence`` with the ``ukr_cleaners``
	    cleaner and then through ``intersperse(..., 0)`` (presumably inserting the
	    id 0 between symbols -- confirm in pflow.utils.utils).

	    Args:
	        text: Text to convert.
	        device: Device on which the returned tensors are created.

	    Returns:
	        A dict with:
	          * ``x_orig``   -- the text exactly as passed in,
	          * ``x``        -- long tensor of ids with a leading batch axis of 1,
	          * ``x_lengths``-- long tensor holding the size of ``x``'s last axis,
	          * ``x_phones`` -- ``sequence_to_text`` applied to the ids in ``x``.
	    """
	    token_ids = intersperse(text_to_sequence(text, ["ukr_cleaners"]), 0)
	    # [None] adds the leading batch axis.
	    x = torch.tensor(token_ids, dtype=torch.long, device=device)[None]
	    x_lengths = torch.tensor([x.shape[-1]], dtype=torch.long, device=device)
	    x_phones = sequence_to_text(x[0].tolist())
	    return {
	        "x_orig": text,
	        "x": x,
	        "x_lengths": x_lengths,
	        'x_phones': x_phones,
	    }




	def load_hifigan(checkpoint_path, device):
	    """Build a HiFi-GAN generator (``v1`` config) and load its weights.

	    Args:
	        checkpoint_path: Path of a checkpoint readable by ``torch.load`` that
	            contains a ``"generator"`` state dict.
	        device: Device the model is moved to and the checkpoint is mapped to.

	    Returns:
	        The generator in eval mode with weight norm removed.
	    """
	    generator = HiFiGAN(AttrDict(v1)).to(device)
	    checkpoint = torch.load(checkpoint_path, map_location=device)
	    generator.load_state_dict(checkpoint["generator"])
	    generator.eval()
	    generator.remove_weight_norm()
	    return generator


	def load_bigvgan(checkpoint_path, device):
	    """Build a BigVGAN generator from ``BIGVGAN_CONFIG`` and load its weights.

	    The BigVGAN imports at the top of the file are commented out, so the
	    classes are imported here, on first use.  This keeps the module importable
	    without BigVGAN installed while making this function callable (it used to
	    fail with NameError on ``BigVGANAttrDict``).

	    Args:
	        checkpoint_path: Path of a checkpoint readable by ``torch.load`` that
	            contains a ``'generator'`` state dict.
	        device: Device the model is moved to and the checkpoint is mapped to.

	    Returns:
	        The generator in eval mode with weight norm removed.

	    Raises:
	        ImportError: If the ``BigVGAN`` package cannot be imported.
	    """
	    # Function-scope import: optional dependency, only needed for this vocoder.
	    from BigVGAN.env import AttrDict as BigVGANAttrDict
	    from BigVGAN.models import BigVGAN

	    print(f"Loading '{checkpoint_path}'")
	    checkpoint_dict = torch.load(checkpoint_path, map_location=device)

	    h = BigVGANAttrDict(BIGVGAN_CONFIG)
	    # Seeds the global torch RNG with the configured seed (side effect).
	    torch.manual_seed(h.seed)

	    generator = BigVGAN(h).to(device)
	    generator.load_state_dict(checkpoint_dict['generator'])
	    generator.eval()
	    generator.remove_weight_norm()
	    return generator


	def to_waveform(mel, vocoder, denoiser=None):
	    """Run the vocoder on a mel spectrogram and return a CPU waveform.

	    Args:
	        mel: Input passed unchanged to ``vocoder``.
	        vocoder: Callable mapping ``mel`` to an audio tensor.
	        denoiser: Optional callable invoked as
	            ``denoiser(audio, strength=0.00025)`` on the squeezed, clamped
	            audio.  Skipped when ``None``.

	    Returns:
	        The audio tensor, clamped to [-1, 1] before optional denoising, moved
	        to the CPU with all size-1 axes squeezed out.
	    """
	    waveform = vocoder(mel).clamp(-1, 1)
	    if denoiser is not None:
	        waveform = denoiser(waveform.squeeze(), strength=0.00025)
	    # One cpu()/squeeze() pair covers both paths; both calls are idempotent.
	    return waveform.cpu().squeeze()






	def get_device():
	    """Pick the torch device to run on and print which one was chosen.

	    Returns:
	        ``torch.device("cuda")`` when ``torch.cuda.is_available()`` is true,
	        otherwise ``torch.device("cpu")``.
	    """
	    if not torch.cuda.is_available():
	        print("[-] GPU not available or forced CPU run! Using CPU")
	        return torch.device("cpu")

	    print("[+] GPU Available! Using GPU")
	    return torch.device("cuda")


	# Module-level setup: runs once at import time and creates the globals that
	# synthesise() reads (device, model, vocoder, denoiser).
	device = get_device()
	model = pflowTTS.load_from_checkpoint(PFLOW_MODEL_PATH, map_location=device)
	model.eval()
	# HiFi-GAN is the active vocoder.  BigVGAN alternative, kept for reference:
	#vocoder = load_bigvgan(VOCODER_BIGVGAN_MODEL_PATH, device)
	vocoder = load_hifigan(VOCODER_MODEL_PATH, device)
	denoiser = Denoiser(vocoder, mode="zeros")

	@torch.inference_mode()
	def synthesise(text, temperature, speed):
	    """Synthesise speech for ``text``; Gradio callback for the demo UI.

	    Args:
	        text: Text entered by the user.  ``None`` is treated as empty.
	        temperature: Passed to ``model.synthesise`` as ``temperature``.
	        speed: Speed factor; the model receives ``length_scale=1/speed``.

	    Returns:
	        A pair ``(phones, (22050, waveform))`` where ``phones`` is every second
	        element of the phonemised text starting at index 1 (presumably this
	        drops the interspersed blank symbols -- confirm in process_text) and
	        ``waveform`` is the generated audio as a NumPy array.

	    Raises:
	        gr.Error: If the text is longer than 1000 characters, or is empty or
	            whitespace-only.
	    """
	    text = text or ""
	    if len(text) > 1000:
	        raise gr.Error("Текст повинен бути коротшим за 1000 символів.")

	    cleaned = text.strip()
	    if not cleaned:
	        # Nothing to say: tell the user instead of running the model on an
	        # empty sequence.
	        raise gr.Error("Введіть текст для синтезу.")

	    text_processed = process_text(cleaned, device)

	    # process_text already creates its tensors on `device`; only the prompt,
	    # which is built at import time, still has to be moved there.
	    conditioning = normalize(prompt, model.mel_mean, model.mel_std).to(device)

	    output = model.synthesise(
	        text_processed["x"],
	        text_processed["x_lengths"],
	        n_timesteps=40,
	        temperature=temperature,
	        length_scale=1 / speed,
	        prompt=conditioning,
	    )
	    waveform = to_waveform(output["mel"], vocoder, denoiser)

	    return text_processed['x_phones'][1::2], (22050, waveform.numpy())


	# Text passed to gr.Interface(description=...) in the __main__ block; it
	# interpolates the pflow and vocoder checkpoint names defined above.
	description = f'''
	# Експериментальна апка для генерації аудіо з тексту.

	pflow checkpoint {PFLOW_MODEL_PATH}
	vocoder: HIFIGAN(трейнутий на датасеті, з нуля) - {VOCODER_MODEL_PATH}
	'''


	if __name__ == "__main__":
	    # Build and serve the Gradio UI: text + two sliders in, phonemised text +
	    # audio out.  Components are created in the same order as before
	    # (inputs first, then outputs).
	    text_input = gr.Text(label='Текст для синтезу:', lines=5, max_lines=10)
	    temperature_input = gr.Slider(minimum=0.0, maximum=1.0, label="Температура", value=0.2)
	    speed_input = gr.Slider(minimum=0.6, maximum=2.0, label="Швидкість", value=1.0)

	    phones_output = gr.Text(label='Фонемізований текст:', lines=5)
	    audio_output = gr.Audio(
	        label="Згенероване аудіо:",
	        autoplay=False,
	        streaming=False,
	        type="numpy",
	    )

	    demo = gr.Interface(
	        fn=synthesise,
	        description=description,
	        inputs=[text_input, temperature_input, speed_input],
	        outputs=[phones_output, audio_output],
	        allow_flagging='manual',
	        # (button label, stored value); label typo "погоне" corrected to "погане".
	        flagging_options=[("Якщо дуже погане аудіо, тисни цю кнопку.", "negative")],
	        cache_examples=True,
	        title='',
	    )
	    demo.queue(max_size=20, default_concurrency_limit=4)
	    # Bind to all interfaces so the app is reachable from outside the host.
	    demo.launch(share=False, server_name="0.0.0.0")