File size: 7,929 Bytes
dc780c5
837713d
 
a07119c
 
 
 
837713d
 
 
 
 
 
 
fc14b63
837713d
 
 
 
 
 
 
2b1d793
837713d
85185da
1612a44
837713d
a1cb9c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
837713d
a07119c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8d5a618
276c4d0
cf93985
a07119c
837713d
 
276c4d0
 
 
837713d
 
dc780c5
837713d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dc780c5
 
 
 
 
 
 
 
 
9ab236a
dc780c5
 
 
 
 
 
a1cb9c1
85185da
 
a1cb9c1
85185da
a1cb9c1
 
dc780c5
 
837713d
 
 
cf93985
837713d
 
 
 
364343c
837713d
 
cf93985
50a7cb9
 
cf93985
 
 
 
 
 
 
85185da
88dfd37
 
837713d
cf93985
837713d
a07119c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
import spaces
import gradio as gr
import torch
from transformers.models.speecht5.number_normalizer import EnglishNumberNormalizer
from string import punctuation
import re


from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed

# Run on the first CUDA device when torch reports one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Hub repository that supplies the model, the tokenizer and the feature extractor.
repo_id = "ylacombe/p-m-e"

model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id)
model = model.to(device)
tokenizer = AutoTokenizer.from_pretrained(repo_id)
feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)

# Seed applied before every generation, and the sampling rate returned with the audio.
SEED = 42
SAMPLE_RATE = feature_extractor.sampling_rate

# Values pre-filled in the two text boxes of the UI.
default_text = "La voix humaine est un instrument de musique au-dessus de tous les autres."
default_description = "A male voice speaks slowly with a very noisy background, displaying a touch of expressiveness and animation. The sound is very distant, adding an air of intrigue."

# One row per language: [text to speak, voice description, None].
# NOTE(review): the third column is always None while the UI binds only two
# input components — confirm whether it is still needed.
examples = [
    [sentence, voice, None]
    for sentence, voice in (
        # French (same strings as the defaults above)
        (default_text, default_description),
        # Spanish
        (
            "La voz es el reflejo del alma en el espejo del tiempo.",
            "A female voice speaks with moderate speed, showing warmth and clarity. The recording is clean with minimal background noise and has natural resonance.",
        ),
        # Italian
        (
            "La voce umana è la più bella musica che esista al mondo.",
            "A male voice delivers the message with passion and depth. The recording has good clarity with slight room acoustics and a medium-distance perspective.",
        ),
        # Portuguese
        (
            "A voz é o espelho da alma e o som do coração.",
            "A young female voice speaks with enthusiasm and energy. The recording is close-miked with crisp audio quality and subtle room ambiance.",
        ),
        # Polish
        (
            "Głos ludzki jest najpiękniejszym instrumentem świata.",
            "An elderly male voice speaks with wisdom and gravitas. The recording has a vintage quality with some characteristic analog warmth.",
        ),
        # German
        (
            "Die menschliche Stimme ist das schönste Instrument der Welt.",
            "A mature female voice speaks with authority and precision. The recording is studio-quality with perfect clarity and no background noise.",
        ),
        # Dutch
        (
            "De menselijke stem is het mooiste instrument dat er bestaat.",
            "A middle-aged male voice speaks with gentle inflection and warmth. The recording has natural room acoustics and balanced frequency response.",
        ),
        # English
        (
            "The human voice is nature's most perfect instrument.",
            "A young male voice speaks with dynamic expression and energy. The recording is professional quality with subtle environmental ambiance.",
        ),
    )
]

# Normalizer applied to the input text by preprocess().
number_normalizer = EnglishNumberNormalizer()

def preprocess(text):
    """Normalize raw input text before it is tokenized as the TTS prompt.

    Steps, in order:
      1. run the module-level ``number_normalizer`` over the text and strip
         surrounding whitespace;
      2. replace every hyphen with a space;
      3. append a period when the text does not already end in punctuation;
      4. spell out upper-case abbreviations letter by letter, dropping any
         dots inside them (``"U.S.A"`` becomes ``"U S A"``).

    Args:
        text: Raw text typed by the user.

    Returns:
        The normalized text. Input that is empty after normalization is
        returned as an empty string instead of raising ``IndexError``.
    """
    text = number_normalizer(text).strip()
    text = text.replace("-", " ")
    if not text:
        # Nothing to punctuate or expand; indexing text[-1] would fail here.
        return text
    if text[-1] not in punctuation:
        text = f"{text}."

    def separate_abb(match):
        # "U.S.A" -> "USA" -> "U S A"
        return " ".join(match.group(0).replace(".", ""))

    # Substitute each match where it occurs. A global str.replace per match
    # would also rewrite the same letters inside longer abbreviations
    # (e.g. "US" inside "USA"), leaving them half-expanded.
    return re.sub(r'\b[A-Z][A-Z\.]+\b', separate_abb, text)

@spaces.GPU
def gen_tts(text, description):
    """Generate speech for ``text`` in the voice described by ``description``.

    Args:
        text: Text to be spoken; it goes through ``preprocess`` before being
            tokenized as the prompt.
        description: Free-form description of the voice; stripped of
            surrounding whitespace before being tokenized.

    Returns:
        A ``(SAMPLE_RATE, audio_arr)`` tuple, where ``audio_arr`` is the model
        output moved to the CPU, converted to NumPy and squeezed.
    """
    description_tokens = tokenizer(description.strip(), return_tensors="pt").to(device)
    prompt_tokens = tokenizer(preprocess(text), return_tensors="pt").to(device)

    # Re-apply the fixed seed so every call starts sampling from the same state.
    set_seed(SEED)

    generate_kwargs = dict(
        input_ids=description_tokens.input_ids,
        prompt_input_ids=prompt_tokens.input_ids,
        attention_mask=description_tokens.attention_mask,
        prompt_attention_mask=prompt_tokens.attention_mask,
        do_sample=True,
        temperature=1.0,
    )
    waveform = model.generate(**generate_kwargs)

    return SAMPLE_RATE, waveform.cpu().numpy().squeeze()


# Custom stylesheet passed to gr.Blocks(css=...) when the UI is built.
# Every rule targets the element ids "share-btn-container" / "share-btn".
# NOTE(review): none of the components created in this file set those
# elem_ids — confirm whether these rules are still needed.
css = """
        #share-btn-container {
            display: flex;
            padding-left: 0.5rem !important;
            padding-right: 0.5rem !important;
            background-color: #000000;
            justify-content: center;
            align-items: center;
            border-radius: 9999px !important; 
            width: 13rem;
            margin-top: 10px;
            margin-left: auto;
            flex: unset !important;
        }
        #share-btn {
            all: initial;
            color: #ffffff;
            font-weight: 600;
            cursor: pointer;
            font-family: 'IBM Plex Sans', sans-serif;
            margin-left: 0.5rem !important;
            padding-top: 0.25rem !important;
            padding-bottom: 0.25rem !important;
            right:0;
        }
        #share-btn * {
            all: unset !important;
        }
        #share-btn-container div:nth-child(-n+2){
            width: auto !important;
            min-height: 0px !important;
        }
        #share-btn-container .wrap {
            display: none !important;
        }
"""
# Build the demo page: title banner, model blurb, input/output widgets,
# clickable examples and a list of usage tips.
with gr.Blocks(css=css) as block:
    # Title banner.
    gr.HTML(
        """
            <div style="text-align: center; max-width: 700px; margin: 0 auto;">
              <div
                style="
                  display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;
                "
              >
                <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
                  Multi Parler-TTS 🗣️
                </h1>
              </div>
            </div>
        """
    )
    # Introductory blurb. This is a plain string rather than an f-string: it has
    # no placeholders, and a plain literal stays safe if braces are ever added.
    gr.HTML(
"""
       <p><a href="https://github.com/huggingface/parler-tts">Parler-TTS</a> is a training and inference library for
high-fidelity text-to-speech (TTS) models.</p> 
<p>This multilingual model supports French, Spanish, Italian, Portuguese, Polish, German, Dutch, and English. It generates high-quality speech with features that can be controlled using a simple text prompt (e.g. gender, background noise, speaking rate, pitch and reverberation). </p>

<p>By default, Parler-TTS generates 🎲 random voice characteristics. To ensure 🎯 <b>speaker consistency</b> across generations, try to use consistent descriptions in your prompts.</p>
<p><b>Note:</b> you do not need to specify the nationality of the speaker in the description (do: "a male speaker", don't: "a french male speaker") </p>
        """
    )
    # Left column: the two text inputs and the trigger button.
    # Right column: the generated audio.
    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(label="Input Text", lines=2, value=default_text, elem_id="input_text")
            description = gr.Textbox(label="Description", lines=2, value=default_description, elem_id="input_description")
            run_button = gr.Button("Generate Audio", variant="primary")
        with gr.Column():
            audio_out = gr.Audio(label="Parler-TTS generation", type="numpy", elem_id="audio_out")

    # The button and the examples share the same callback and component wiring.
    inputs = [input_text, description]
    outputs = [audio_out]
    run_button.click(fn=gen_tts, inputs=inputs, outputs=outputs, queue=True)
    gr.Examples(examples=examples, fn=gen_tts, inputs=inputs, outputs=outputs, cache_examples=True)
    # Usage tips shown below the examples.
    gr.HTML(
        """
        <p>Tips for ensuring good generation:
        <ul>
            <li>Include the term "very clear audio" to generate the highest quality audio, and "very noisy audio" for high levels of background noise</li>
            <li>Punctuation can be used to control the prosody of the generations, e.g. use commas to add small breaks in speech</li>
            <li>The remaining speech features (gender, speaking rate, pitch and reverberation) can be controlled directly through the prompt</li>
        </ul>
        </p>

        """
    )


# Enable the request queue, then start the app asking for a public share link.
block.queue().launch(share=True)