File size: 3,049 Bytes
5fdd9cc
 
 
5e385c1
 
5fdd9cc
5e385c1
 
5fdd9cc
5e385c1
 
7354bb7
5fdd9cc
 
5e385c1
5fdd9cc
 
5e385c1
5fdd9cc
 
 
 
 
f38c401
5e385c1
f38c401
5e385c1
 
 
 
 
 
f38c401
5e385c1
 
f38c401
5e385c1
f38c401
cbc35da
5e385c1
e231341
cbc35da
5e385c1
 
 
 
 
 
 
cbc35da
5e385c1
cbc35da
f38c401
2bc1f84
5e385c1
 
5fdd9cc
 
f38c401
 
5fdd9cc
f38c401
 
5fdd9cc
 
 
 
 
 
 
 
 
 
 
5e385c1
 
5fdd9cc
 
5e385c1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import gradio as gr
import torch
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer, set_seed
import numpy as np

# Run on the first CUDA GPU when PyTorch reports one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Fine-tuned Parler-TTS checkpoint, moved onto the selected device.
model = ParlerTTSForConditionalGeneration.from_pretrained(
    "TArtx/parler-tts-mini-v1-finetuned-12"
).to(device)
# The tokenizer is loaded from the "parler-tts/parler-tts-mini-v1" checkpoint id,
# not from the fine-tuned one.
tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-mini-v1")

# Output sampling rate as reported by the model config, and the fixed RNG seed
# applied before every generation.
SAMPLE_RATE = model.config.sampling_rate
SEED = 42

# Values pre-filled in the two UI text boxes.
default_text = (
    "This is a demonstration of my ability to convert written words into "
    "spoken language, seamlessly and naturally. As a text-to-speech model, "
    "my goal is to sound as clear and engaging as a human, making sure "
    "every word I say leaves an impression."
)
default_description = "moderate speed, very clear, monotone, wonderful speech quality"

# TTS generation function
def _fallback_noise():
    """Return ten seconds of random int16 samples used as the failure placeholder."""
    return np.random.randint(-32768, 32767, SAMPLE_RATE * 10, dtype=np.int16)


def _to_int16_pcm(audio_arr):
    """Peak-normalize a waveform array and convert it to int16 samples.

    Returns the fallback noise instead when the array is empty or its peak
    absolute value is not greater than zero.
    """
    # np.max raises on an empty array, so handle that case explicitly.
    if audio_arr.size == 0:
        return _fallback_noise()

    peak = np.max(np.abs(audio_arr))
    # `peak > 0` is False for an all-zero signal and also for a NaN peak.
    if not peak > 0:
        return _fallback_noise()

    return (audio_arr / peak * np.iinfo(np.int16).max).astype(np.int16)


def gen_tts(text, description):
    """Generate speech for ``text`` in the voice described by ``description``.

    The RNG is re-seeded with ``SEED`` before every call. The stripped
    description is tokenized into ``input_ids`` and the stripped text into
    ``prompt_input_ids`` for ``model.generate``, which samples with
    temperature 0.7.

    Args:
        text: The words to be spoken.
        description: Free-text description of the desired voice/style.

    Returns:
        A ``(SAMPLE_RATE, samples)`` tuple where ``samples`` is an int16 NumPy
        array. If generation raises, or yields an empty or silent waveform,
        ``samples`` is ten seconds of random noise instead; exceptions are
        printed (with traceback) rather than propagated.
    """
    import traceback

    try:
        # Set seed for reproducibility
        set_seed(SEED)

        # Prepare inputs
        input_ids = tokenizer(description.strip(), return_tensors="pt").input_ids.to(device)
        prompt_input_ids = tokenizer(text.strip(), return_tensors="pt").input_ids.to(device)

        # Generate audio
        generation = model.generate(
            input_ids=input_ids,
            prompt_input_ids=prompt_input_ids,
            do_sample=True,
            temperature=0.7,
        )

        # Cast to float32 before leaving torch: a no-op for a float32 model,
        # and it keeps the NumPy conversion working for half-precision output.
        audio_arr = generation.float().cpu().numpy().squeeze()

        return SAMPLE_RATE, _to_int16_pcm(audio_arr)

    except Exception as e:
        # Deliberate best-effort: keep the UI responsive and return a
        # placeholder, but log the full traceback so the failure is diagnosable.
        print(f"Error in TTS generation: {str(e)}")
        traceback.print_exc()
        return SAMPLE_RATE, _fallback_noise()

# Gradio interface: a header, then one row with the inputs on the left and the
# generated audio on the right.
with gr.Blocks() as block:
    gr.Markdown(
        """
        ## Parler-TTS 🗣️
        Parler-TTS is a training and inference library for high-fidelity text-to-speech (TTS) models. This demo uses the Mini v1 model.
        """
    )
    with gr.Row():
        # Left column: the text to speak, the voice description, and the trigger button.
        with gr.Column():
            input_text = gr.Textbox(
                value=default_text,
                label="Input Text",
                lines=2,
                elem_id="input_text",
            )
            description = gr.Textbox(
                value=default_description,
                label="Description",
                lines=2,
                elem_id="input_description",
            )
            run_button = gr.Button("Generate Audio", variant="primary")
        # Right column: audio player fed with the (sample_rate, array) tuple.
        with gr.Column():
            audio_out = gr.Audio(
                type="numpy",
                label="Parler-TTS generation",
                elem_id="audio_out",
            )

    # Input order matches gen_tts(text, description).
    inputs, outputs = [input_text, description], audio_out
    run_button.click(fn=gen_tts, inputs=inputs, outputs=outputs)

# Start the app with Gradio's debug flag enabled.
block.launch(debug=True)