Spaces:
Sleeping
Sleeping
File size: 3,049 Bytes
5fdd9cc 5e385c1 5fdd9cc 5e385c1 5fdd9cc 5e385c1 7354bb7 5fdd9cc 5e385c1 5fdd9cc 5e385c1 5fdd9cc f38c401 5e385c1 f38c401 5e385c1 f38c401 5e385c1 f38c401 5e385c1 f38c401 cbc35da 5e385c1 e231341 cbc35da 5e385c1 cbc35da 5e385c1 cbc35da f38c401 2bc1f84 5e385c1 5fdd9cc f38c401 5fdd9cc f38c401 5fdd9cc 5e385c1 5fdd9cc 5e385c1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 |
import traceback

import gradio as gr
import numpy as np
import torch
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer, set_seed
# Device selection: the first CUDA GPU when torch reports one, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Load the TTS checkpoint and move it onto the selected device.
model = ParlerTTSForConditionalGeneration.from_pretrained(
    "TArtx/parler-tts-mini-v1-finetuned-12"
)
model = model.to(device)

# The tokenizer is loaded from the "parler-tts/parler-tts-mini-v1" repository,
# not from the checkpoint repository used for the model above.
tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-mini-v1")

# Constants
SEED = 42  # seed applied before every generation call
SAMPLE_RATE = model.config.sampling_rate  # output rate as reported by the model config

# Default inputs shown in the two text boxes of the UI
default_description = "moderate speed, very clear, monotone, wonderful speech quality"
default_text = "This is a demonstration of my ability to convert written words into spoken language, seamlessly and naturally. As a text-to-speech model, my goal is to sound as clear and engaging as a human, making sure every word I say leaves an impression."
# TTS generation function
def _fallback_noise():
    """Return ``(SAMPLE_RATE, samples)`` with 10 seconds of int16 white noise.

    This is the placeholder clip handed back whenever real audio cannot be
    produced.
    """
    # NOTE(review): the samples span almost the whole int16 range, so this
    # placeholder plays back very loud. Kept as-is because it is the existing
    # fallback contract; consider silence or a UI-level error instead.
    return SAMPLE_RATE, np.random.randint(-32768, 32767, SAMPLE_RATE * 10, dtype=np.int16)


def _to_int16_pcm(audio_arr):
    """Peak-normalize a float waveform and convert it to int16 samples.

    Returns ``None`` when there is no usable peak to normalize against: the
    array is empty, all zeros, or contains NaN/inf.
    """
    if audio_arr.size == 0:
        return None
    peak = np.max(np.abs(audio_arr))
    # NaN propagates through np.max and inf would turn the division below
    # into NaN/0 samples, so both are rejected together with silence.
    if not np.isfinite(peak) or peak <= 0:
        return None
    return (audio_arr / peak * np.iinfo(np.int16).max).astype(np.int16)


def gen_tts(text, description):
    """Generate speech for ``text`` conditioned on ``description``.

    Args:
        text: Text to speak; tokenized and passed to the model as
            ``prompt_input_ids``.
        description: Description of the desired speech; tokenized and passed
            to the model as ``input_ids``.

    Returns:
        A ``(SAMPLE_RATE, samples)`` tuple where ``samples`` is an int16
        NumPy array. On success it holds the peak-normalized model output.
        If generation raises, or the output has no usable peak, it holds
        10 seconds of white noise instead.

    Any ``Exception`` raised while generating is reported (message plus
    traceback) and converted into the white-noise fallback, so this function
    does not propagate ordinary errors to its caller.
    """
    try:
        # Re-seed on every call: sampling is enabled below, and a fixed seed
        # keeps the output reproducible.
        set_seed(SEED)

        # Prepare inputs
        input_ids = tokenizer(description.strip(), return_tensors="pt").input_ids.to(device)
        prompt_input_ids = tokenizer(text.strip(), return_tensors="pt").input_ids.to(device)

        # Generate audio
        generation = model.generate(
            input_ids=input_ids,
            prompt_input_ids=prompt_input_ids,
            do_sample=True,
            temperature=0.7,
        )

        # Upcast to float32 before leaving torch so a half-precision model
        # cannot break the NumPy conversion or the int16 scaling.
        audio_arr = generation.float().cpu().numpy().squeeze()
        pcm = _to_int16_pcm(audio_arr)
    except Exception as e:
        print(f"Error in TTS generation: {str(e)}")
        # The message alone rarely identifies the failing call; keep the
        # full traceback in the logs as well.
        traceback.print_exc()
        return _fallback_noise()

    if pcm is None:
        # Fallback to white noise if generation produced nothing usable
        return _fallback_noise()
    return SAMPLE_RATE, pcm
# Gradio interface
# Header text rendered at the top of the page.
_INTRO_MARKDOWN = """
## Parler-TTS 🗣️
Parler-TTS is a training and inference library for high-fidelity text-to-speech (TTS) models. This demo uses the Mini v1 model.
"""

with gr.Blocks() as block:
    gr.Markdown(_INTRO_MARKDOWN)
    with gr.Row():
        # Left column: the two text inputs and the trigger button.
        with gr.Column():
            input_text = gr.Textbox(label="Input Text", lines=2, value=default_text, elem_id="input_text")
            description = gr.Textbox(label="Description", lines=2, value=default_description, elem_id="input_description")
            run_button = gr.Button("Generate Audio", variant="primary")
        # Right column: the generated audio. type="numpy" matches the
        # (sample_rate, int16 array) tuple returned by gen_tts.
        with gr.Column():
            audio_out = gr.Audio(label="Parler-TTS generation", type="numpy", elem_id="audio_out")

    # Event wiring stays inside the Blocks context so the click handler is
    # registered on this app.
    inputs = [input_text, description]
    outputs = audio_out
    run_button.click(fn=gen_tts, inputs=inputs, outputs=outputs)

# Launch the interface
block.launch(debug=True)