File size: 6,694 Bytes
a2d19e9
468fe55
a2d19e9
468fe55
e7ab3a1
 
468fe55
 
 
 
ebf5126
41889ec
3b61dcb
 
 
882ecb2
 
a2d19e9
882ecb2
a2d19e9
468fe55
882ecb2
468fe55
 
 
 
78b1e75
 
 
a2d19e9
 
 
 
 
 
 
 
 
468fe55
 
 
3b61dcb
468fe55
3b61dcb
468fe55
 
 
a2d19e9
468fe55
 
 
 
 
f099617
468fe55
f099617
a2d19e9
ebf5126
 
 
468fe55
 
 
 
 
882ecb2
468fe55
 
 
1a38443
468fe55
 
 
 
 
 
a2d19e9
 
 
 
 
 
468fe55
 
 
 
 
b8c03ff
468fe55
 
 
 
 
 
 
 
 
 
 
 
78b1e75
 
a2d19e9
 
78b1e75
468fe55
a2d19e9
468fe55
b8c03ff
468fe55
a2d19e9
 
882ecb2
 
 
468fe55
a2d19e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
882ecb2
 
468fe55
41889ec
b8c03ff
78b1e75
ebf5126
 
 
 
 
 
 
 
 
 
 
 
 
78b1e75
 
468fe55
b8c03ff
468fe55
a2d19e9
 
882ecb2
a2d19e9
 
 
 
 
 
468fe55
b8c03ff
468fe55
 
 
78b1e75
468fe55
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
import tempfile
import time
import librosa

import streamlit as st

from zerovox.tts.synthesize import ZeroVoxTTS

# Sample rate passed to librosa.load() and st.audio() throughout this page.
SAMPLE_RATE = 24000  # FIXME

DEFAULT_SPEAKER = 'en_kevin.wav'

SAMPLE_SENTENCE_EN = "A rainbow is an optical phenomenon caused by refraction, internal reflection and dispersion of light in water droplets resulting in a continuous spectrum of light appearing in the sky."
SAMPLE_SENTENCE_DE = "Der Regenbogen ist ein atmosphärisch-optisches Phänomen, das als kreisbogenförmiges farbiges Lichtband in einem von der Sonne beschienenen Regenschauer erscheint."

DEFAULT_LANGUAGE = 'en'

# Seed the session state; values that already exist are never overwritten.
if "lang" not in st.session_state:
    st.session_state["lang"] = DEFAULT_LANGUAGE

# The initial text depends on the language, so "lang" is seeded first.
if "text" not in st.session_state:
    if st.session_state["lang"] == 'en':
        st.session_state["text"] = SAMPLE_SENTENCE_EN
    else:
        st.session_state["text"] = SAMPLE_SENTENCE_DE

# The remaining keys have fixed defaults that do not depend on each other.
for _key, _default in (
    ("message", "READY."),
    ("autoplay", False),
    ("speakerref", DEFAULT_SPEAKER),
    ("custom_voice", False),
    ("voice_wav", None),
):
    if _key not in st.session_state:
        st.session_state[_key] = _default

def update_text_input():
    """Replace the text to synthesize with the sample sentence of the current language.

    Reads ``st.session_state['lang']`` and overwrites ``st.session_state.text``.
    A language without a sample sentence leaves the text untouched.
    """
    samples = {
        "en": SAMPLE_SENTENCE_EN,
        "de": SAMPLE_SENTENCE_DE,
    }
    sample = samples.get(st.session_state['lang'])
    if sample is not None:
        st.session_state.text = sample

def do_synth():
    """Synthesize ``st.session_state.text`` and store the result in the session state.

    Reads ``lang``, ``meldec``, ``text``, ``speakerref``, ``custom_voice`` and
    ``voice_wav`` from ``st.session_state``.  Writes ``modelcfg`` and ``synth``
    whenever the model is (re)loaded, and always writes ``message`` (timing
    summary), ``wav`` (the synthesized audio) and ``autoplay`` (set to True).
    Progress is reported through the module-level ``status`` widget.
    """
    cached = st.session_state.synth if 'synth' in st.session_state else None

    # A loaded model is tied to one MEL decoder and one language; if either
    # setting changed since it was loaded, the whole model is reloaded.
    needs_reload = (
        not cached
        or cached.meldec_model != st.session_state['meldec']
        or cached.language != st.session_state.lang
    )

    if needs_reload:
        status.update(label="loading the model...", state="running")

        st.session_state.modelcfg, st.session_state.synth = ZeroVoxTTS.load_model(
            ZeroVoxTTS.get_default_model(st.session_state.lang),
            meldec_model=st.session_state['meldec'],
            infer_device='cpu',
            num_threads=-1,
            verbose=True,
        )

    synth = st.session_state.synth
    modelcfg = st.session_state.modelcfg
    sampling_rate = modelcfg['audio']['sampling_rate']

    status.update(label="computing speaker embedding...", state="running")

    # Use the uploaded sample only when custom voice is enabled AND a sample
    # is actually present; otherwise fall back to the selected built-in voice.
    if st.session_state.custom_voice and st.session_state.voice_wav is not None:
        speakerref = st.session_state.voice_wav
    else:
        speakerref = ZeroVoxTTS.get_speakerref(st.session_state.speakerref, sampling_rate)

    spkemb = synth.speaker_embed(speakerref)

    status.update(label="synthesizing...", state="running")

    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by wall-clock adjustments the way time.time() differences can.
    start_time = time.perf_counter()

    # tts() returns three values; only the waveform is needed here.
    wav, _, _ = synth.tts(st.session_state.text, spkemb)

    elapsed_time = time.perf_counter() - start_time

    wav_len = wav.shape[0] / sampling_rate
    # Guard the division in case the timer reports a zero-length interval.
    real_time_factor = wav_len / elapsed_time if elapsed_time > 0 else float('inf')

    st.session_state.message = (
        f"synth time: {elapsed_time:.2f} sec"
        f", voice length: {wav_len:.2f} sec"
        f", rtf: {real_time_factor:.2f}"
    )

    st.session_state.wav = wav
    st.session_state.autoplay = True

# Page-level configuration.
st.set_page_config(
    page_title="ZeroVOX TTS Demo",
    page_icon=':speech_balloon:',
    layout="centered",
    initial_sidebar_state="auto",
    menu_items=None,
)

# Intro text; the adjacent literals are joined into a single markdown string.
st.markdown(
    "# ZeroVOX TTS Demo\n\n"
    "ZeroVOX is a zero-shot realtime TTS system, fully offline, free and open source.\n\n"
    "For more information, check out\n"
    "[https://github.com/gooofy/zerovox](https://github.com/gooofy/zerovox)\n"
)

# Two settings tabs: voice selection and MEL decoder selection.
tab1, tab2 = st.tabs(["Voice", "MEL Decoder"])

with tab1:

    st.checkbox("Custom voice", key='custom_voice')

    col1, col2 = st.columns([0.6, 0.4], vertical_alignment="bottom")
    with col1:
        # Placeholder that is filled below with either the file uploader
        # (custom voice) or the built-in voice selectbox.
        speakerref = st.empty()

    if st.session_state.custom_voice:

        # Create a file uploader that accepts only .wav files
        uploaded_file = speakerref.file_uploader("Upload your voice sample", type=["wav"])

        # Process the uploaded file
        if uploaded_file is not None:
            with tempfile.NamedTemporaryFile() as f:
                f.write(uploaded_file.read())
                # librosa reopens the file by name, so the buffered bytes
                # must be pushed to disk before it is read back.
                f.flush()
                wav, sr = librosa.load(f.name, sr=SAMPLE_RATE)

            st.session_state.voice_wav = wav

            st.audio(wav, sample_rate=SAMPLE_RATE)

    else:

        speakers = list(ZeroVoxTTS.available_speakerrefs())
        speakerref.selectbox("Voice", speakers, key='speakerref')

        # Preview of the currently selected built-in voice.
        with col2:
            st.audio(ZeroVoxTTS.get_speakerref(st.session_state.speakerref, SAMPLE_RATE), sample_rate=SAMPLE_RATE)


with tab2:
    # Disabled alternatives (kept for reference, not offered in the UI):
    #   ../models/meldec-zerovox-de-hifigan-v1-0
    #   ../models/meldec-libritts-hifigan-v1
    #   ../models/meldec-libritts-multi-band-melgan-v2
    #   ../models/meldec-libritts-parallel-wavegan-v1
    #   ../models/meldec-libritts-parallel-wavegan-v1-long
    #   ../models/meldec-libritts-style-melgan-v1
    #   ../models/meldec-vctk-hifigan-v1
    #   ../models/meldec-vctk-multi-band-melgan-v2
    #   ../models/meldec-vctk-style-melgan-v1
    meldec_choices = [
        "meldec-libritts-multi-band-melgan-v2",
        "meldec-libritts-hifigan-v1",
    ]

    # The selection is stored under st.session_state['meldec'].
    meldec = st.selectbox("MEL decoder", meldec_choices, key='meldec')

# Status line showing the last message; do_synth() updates it while running.
status = st.status(st.session_state.message, state="complete")

col1, col2 = st.columns([0.8, 0.2])

# Wide column: the text to synthesize; editing it triggers a synthesis.
text = col1.text_area(
    "Text to synthesize",
    key='text',
    on_change=do_synth,
    height=128,
)

# Narrow column: language picker; changing it swaps in the sample sentence.
lang = col2.selectbox(
    "Language",
    ["en", "de"],
    on_change=update_text_input,
    key='lang',
)

st.button("Synthesize!", type="primary", on_click=do_synth)

# Show the player once audio has been synthesized, an empty placeholder otherwise.
playback = (
    st.audio(st.session_state.wav, sample_rate=SAMPLE_RATE, autoplay=st.session_state.autoplay)
    if 'wav' in st.session_state
    else st.empty()
)