File size: 3,520 Bytes
5fdd9cc
 
 
 
 
 
 
 
 
f38c401
 
5fdd9cc
f38c401
5fdd9cc
1346f0a
7354bb7
1346f0a
5fdd9cc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f38c401
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5fdd9cc
 
f38c401
 
5fdd9cc
f38c401
 
5fdd9cc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f38c401
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import gradio as gr
import torch
from transformers.models.speecht5.number_normalizer import EnglishNumberNormalizer
from string import punctuation
import re

from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed

# Set device to CPU only
device = "cpu"

# Load the fine-tuned Mini model; the tokenizer and feature extractor are
# loaded from the "parler-tts/parler-tts-mini-v1" repo.
repo_id = "TArtx/parler-tts-mini-v1-finetuned-12"
model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-mini-v1")
# The repo id must be a quoted string: unquoted, it is evaluated as
# subtraction/division of undefined names and raises NameError at startup.
feature_extractor = AutoFeatureExtractor.from_pretrained("parler-tts/parler-tts-mini-v1")

# Constants
SAMPLE_RATE = feature_extractor.sampling_rate  # sampling rate returned alongside generated audio
SEED = 42  # fixed seed passed to set_seed() before each generation

# Default input text and description
default_text = "This is a demonstration of my ability to convert written words into spoken language, seamlessly and naturally. As a text-to-speech model, my goal is to sound as clear and engaging as a human, making sure every word I say leaves an impression."
default_description = "moderate speed, very clear, monotone, wonderful speech quality"

# Number normalizer
number_normalizer = EnglishNumberNormalizer()

# Preprocessing function
def preprocess(text):
    """Normalize raw input text before it is tokenized as the TTS prompt.

    Steps, in order:
      1. Run the text through ``number_normalizer`` and strip surrounding
         whitespace.
      2. Replace hyphens with spaces.
      3. Append a period if the text does not already end in punctuation.
      4. Spell out upper-case abbreviations letter by letter, dropping any
         dots inside them (e.g. ``"USA"`` / ``"U.S.A"`` -> ``"U S A"``).

    Args:
        text: The raw text to be spoken.

    Returns:
        The normalized text. An input that is empty after normalization is
        returned as an empty string instead of raising ``IndexError``.
    """
    text = number_normalizer(text).strip()
    text = text.replace("-", " ")
    if not text:
        # Nothing to punctuate or expand; indexing text[-1] would fail here.
        return text
    if text[-1] not in punctuation:
        text = f"{text}."
    abbreviations_pattern = r'\b[A-Z][A-Z\.]+\b'

    def separate_abb(match):
        # Drop the dots and put a space between the remaining letters.
        return " ".join(match.group(0).replace(".", ""))

    # Substitute each regex match in place. A findall + str.replace loop would
    # also rewrite substrings outside the match, so a short abbreviation could
    # corrupt a longer one ("US" inside "USA" -> "U SA").
    return re.sub(abbreviations_pattern, separate_abb, text)

# TTS generation function
def gen_tts(text, description):
    """Generate speech audio for ``text`` in the voice given by ``description``.

    The description and the preprocessed text are tokenized separately (each
    truncated to 128 tokens), the random seed is reset to ``SEED``, and the
    model samples a waveform conditioned on both.

    Args:
        text: The text to be spoken; passed through ``preprocess`` first.
        description: Free-form description of the desired voice/style.

    Returns:
        A ``(SAMPLE_RATE, audio_arr)`` tuple, where ``audio_arr`` is the
        generated output converted to a squeezed NumPy array.

    Raises:
        gr.Error: If tokenization or generation fails. Gradio shows the
            message in the UI; returning a string as audio data would not
            be valid for the audio output component.
    """
    try:
        # Tokenize inputs and prompts with truncation to avoid memory issues
        inputs = tokenizer(description.strip(), return_tensors="pt", truncation=True, max_length=128).to(device)
        prompt = tokenizer(preprocess(text), return_tensors="pt", truncation=True, max_length=128).to(device)

        # Re-seed on every call so generation starts from the same RNG state.
        set_seed(SEED)
        generation = model.generate(
            input_ids=inputs.input_ids,
            prompt_input_ids=prompt.input_ids,
            attention_mask=inputs.attention_mask,
            # The tokenizer output exposes `attention_mask`; it has no
            # `prompt_attention_mask` field.
            prompt_attention_mask=prompt.attention_mask,
            do_sample=True,
            temperature=1.0,
        )
        audio_arr = generation.cpu().numpy().squeeze()
    except Exception as e:
        raise gr.Error(f"Error: {str(e)}") from e
    return SAMPLE_RATE, audio_arr

# Gradio interface
with gr.Blocks() as block:
    # Page header
    gr.Markdown(
        """
        ## Parler-TTS 🗣️
        Parler-TTS is a training and inference library for high-fidelity text-to-speech (TTS) models. This demo uses the Mini v1 model.
        """
    )

    with gr.Row():
        # Left column: the two text inputs and the trigger button
        with gr.Column():
            input_text = gr.Textbox(
                value=default_text,
                label="Input Text",
                lines=2,
                elem_id="input_text",
            )
            description = gr.Textbox(
                value=default_description,
                label="Description",
                lines=2,
                elem_id="input_description",
            )
            run_button = gr.Button("Generate Audio", variant="primary")

        # Right column: the generated audio
        with gr.Column():
            audio_out = gr.Audio(
                label="Parler-TTS generation",
                type="numpy",
                elem_id="audio_out",
            )

    # Wire the button to the generation function
    inputs = [input_text, description]
    outputs = [audio_out]
    run_button.click(gen_tts, inputs, outputs, queue=True)

# Enable the request queue, then start the app
block.queue()
block.launch()