djkesu committed on
Commit
ba3f0c0
·
1 Parent(s): 9d9b6e3

Simplified app.py

Browse files
Files changed (1) hide show
  1. app.py +90 -286
app.py CHANGED
@@ -1,304 +1,108 @@
1
- # AGPL: a notification must be added stating that changes have been made to that file.
2
-
3
  import os
4
  import shutil
5
  from pathlib import Path
6
-
7
  import streamlit as st
8
- from random import randint
9
-
10
- from tortoise.api import MODELS_DIR
11
- from tortoise.inference import (
12
- infer_on_texts,
13
- run_and_save_tts,
14
- split_and_recombine_text,
15
- )
16
- from tortoise.utils.diffusion import SAMPLERS
17
- from app_utils.filepicker import st_file_selector
18
- from app_utils.conf import TortoiseConfig
19
-
20
- from app_utils.funcs import (
21
- timeit,
22
- load_model,
23
- list_voices,
24
- load_voice_conditionings,
25
- )
26
-
27
 
28
- LATENT_MODES = [
29
- "Tortoise original (bad)",
30
- "average per 4.27s (broken on small files)",
31
- "average per voice file (broken on small files)",
32
- ]
33
 
34
- def main():
35
- conf = TortoiseConfig()
36
-
37
- with st.expander("Create New Voice", expanded=True):
38
- if "file_uploader_key" not in st.session_state:
39
- st.session_state["file_uploader_key"] = str(randint(1000, 100000000))
40
- st.session_state["text_input_key"] = str(randint(1000, 100000000))
41
-
42
- uploaded_files = st.file_uploader(
43
- "Upload Audio Samples for a New Voice",
44
- accept_multiple_files=True,
45
- type=["wav"],
46
- key=st.session_state["file_uploader_key"]
47
- )
48
-
49
- voice_name = st.text_input(
50
- "New Voice Name",
51
- help="Enter a name for your new voice.",
52
- value="",
53
- key=st.session_state["text_input_key"]
54
- )
55
-
56
- create_voice_button = st.button(
57
- "Create Voice",
58
- disabled = ((voice_name.strip() == "") | (len(uploaded_files) == 0))
59
- )
60
- if create_voice_button:
61
- st.write(st.session_state)
62
- with st.spinner(f"Creating new voice: {voice_name}"):
63
- new_voice_name = voice_name.strip().replace(" ", "_")
64
 
65
- voices_dir = f'./tortoise/voices/{new_voice_name}/'
66
- if os.path.exists(voices_dir):
67
- shutil.rmtree(voices_dir)
68
- os.makedirs(voices_dir)
69
 
70
- for index, uploaded_file in enumerate(uploaded_files):
71
- bytes_data = uploaded_file.read()
72
- with open(f"{voices_dir}voice_sample{index}.wav", "wb") as wav_file:
73
- wav_file.write(bytes_data)
74
 
75
- st.session_state["text_input_key"] = str(randint(1000, 100000000))
76
- st.session_state["file_uploader_key"] = str(randint(1000, 100000000))
77
 
78
- text = st.text_area(
79
- "Text",
80
- help="Text to speak.",
81
- value="The expressiveness of autoregressive transformers is literally nuts! I absolutely adore them.",
82
- )
 
 
83
 
84
- voices = [v for v in os.listdir("tortoise/voices") if v != "cond_latent_example"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
- voice = st.selectbox(
87
- "Voice",
88
- voices,
89
- help="Selects the voice to use for generation. See options in voices/ directory (and add your own!) ",
90
- index=0,
91
- )
92
- preset = st.selectbox(
93
- "Preset",
94
- (
95
- "single_sample",
96
- "ultra_fast",
97
- "very_fast",
98
- "ultra_fast_old",
99
- "fast",
100
- "standard",
101
- "high_quality",
102
- ),
103
- help="Which voice preset to use.",
104
- index=1,
105
- )
106
- with st.expander("Advanced"):
107
- col1, col2 = st.columns(2)
108
- with col1:
109
- """#### Model parameters"""
110
- candidates = st.number_input(
111
- "Candidates",
112
- help="How many output candidates to produce per-voice.",
113
- value=1,
114
- )
115
- latent_averaging_mode = st.radio(
116
- "Latent averaging mode",
117
- LATENT_MODES,
118
- help="How voice samples should be averaged together.",
119
- index=0,
120
- )
121
- sampler = st.radio(
122
- "Sampler",
123
- #SAMPLERS,
124
- ["dpm++2m", "p", "ddim"],
125
- help="Diffusion sampler. Note that dpm++2m is experimental and typically requires more steps.",
126
- index=1,
127
- )
128
- steps = st.number_input(
129
- "Steps",
130
- help="Override the steps used for diffusion (default depends on preset)",
131
- value=10,
132
- )
133
- seed = st.number_input(
134
- "Seed",
135
- help="Random seed which can be used to reproduce results.",
136
- value=-1,
137
- )
138
- if seed == -1:
139
- seed = None
140
- voice_fixer = st.checkbox(
141
- "Voice fixer",
142
- help="Use `voicefixer` to improve audio quality. This is a post-processing step which can be applied to any output.",
143
- value=True,
144
- )
145
- """#### Directories"""
146
- output_path = st.text_input(
147
- "Output Path", help="Where to store outputs.", value="results/"
148
- )
149
 
150
- with col2:
151
- """#### Optimizations"""
152
- high_vram = not st.checkbox(
153
- "Low VRAM",
154
- help="Re-enable default offloading behaviour of tortoise",
155
- value=True,
156
- )
157
- half = st.checkbox(
158
- "Half-Precision",
159
- help="Enable autocast to half precision for autoregressive model",
160
- value=False,
161
- )
162
- kv_cache = st.checkbox(
163
- "Key-Value Cache",
164
- help="Enable kv_cache usage, leading to drastic speedups but worse memory usage",
165
- value=True,
166
- )
167
- cond_free = st.checkbox(
168
- "Conditioning Free",
169
- help="Force conditioning free diffusion",
170
- value=True,
171
- )
172
- no_cond_free = st.checkbox(
173
- "Force Not Conditioning Free",
174
- help="Force disable conditioning free diffusion",
175
- value=False,
176
- )
177
 
178
- """#### Text Splitting"""
179
- min_chars_to_split = st.number_input(
180
- "Min Chars to Split",
181
- help="Minimum number of characters to split text on",
182
- min_value=50,
183
- value=200,
184
- step=1,
185
- )
186
 
187
- """#### Debug"""
188
- produce_debug_state = st.checkbox(
189
- "Produce Debug State",
190
- help="Whether or not to produce debug_state.pth, which can aid in reproducing problems. Defaults to true.",
191
- value=True,
192
- )
193
 
194
- ar_checkpoint = "."
195
- diff_checkpoint = "."
196
- if st.button("Update Basic Settings"):
197
- conf.update(
198
- EXTRA_VOICES_DIR=extra_voices_dir,
199
- LOW_VRAM=not high_vram,
200
- AR_CHECKPOINT=ar_checkpoint,
201
- DIFF_CHECKPOINT=diff_checkpoint,
202
  )
203
 
204
- ar_checkpoint = None
205
- diff_checkpoint = None
206
- tts = load_model(MODELS_DIR, high_vram, kv_cache, ar_checkpoint, diff_checkpoint)
207
-
208
- if st.button("Start"):
209
- assert latent_averaging_mode
210
- assert preset
211
- assert voice
212
-
213
- def show_generation(fp, filename: str):
214
- """
215
- audio_buffer = BytesIO()
216
- save_gen_with_voicefix(g, audio_buffer, squeeze=False)
217
- torchaudio.save(audio_buffer, g, 24000, format='wav')
218
- """
219
- st.audio(str(fp), format="audio/wav")
220
- st.download_button(
221
- "Download sample",
222
- str(fp),
223
- file_name=filename, # this doesn't actually seem to work lol
224
- )
225
-
226
- with st.spinner(
227
- f"Generating {candidates} candidates for voice {voice} (seed={seed}). You can see progress in the terminal"
228
- ):
229
- os.makedirs(output_path, exist_ok=True)
230
-
231
- selected_voices = voice.split(",")
232
- for k, selected_voice in enumerate(selected_voices):
233
- if "&" in selected_voice:
234
- voice_sel = selected_voice.split("&")
235
- else:
236
- voice_sel = [selected_voice]
237
- voice_samples, conditioning_latents = load_voice_conditionings(
238
- voice_sel, []
239
- )
240
-
241
- voice_path = Path(os.path.join(output_path, selected_voice))
242
-
243
- with timeit(
244
- f"Generating {candidates} candidates for voice {selected_voice} (seed={seed})"
245
- ):
246
- nullable_kwargs = {
247
- k: v
248
- for k, v in zip(
249
- ["sampler", "diffusion_iterations", "cond_free"],
250
- [sampler, steps, cond_free],
251
- )
252
- if v is not None
253
- }
254
-
255
- def call_tts(text: str):
256
- return tts.tts_with_preset(
257
- text,
258
- k=candidates,
259
- voice_samples=voice_samples,
260
- conditioning_latents=conditioning_latents,
261
- preset=preset,
262
- use_deterministic_seed=seed,
263
- return_deterministic_state=True,
264
- cvvp_amount=0.0,
265
- half=half,
266
- latent_averaging_mode=LATENT_MODES.index(
267
- latent_averaging_mode
268
- ),
269
- **nullable_kwargs,
270
- )
271
-
272
- if len(text) < min_chars_to_split:
273
- filepaths = run_and_save_tts(
274
- call_tts,
275
- text,
276
- voice_path,
277
- return_deterministic_state=True,
278
- return_filepaths=True,
279
- voicefixer=voice_fixer,
280
- )
281
- for i, fp in enumerate(filepaths):
282
- show_generation(fp, f"{selected_voice}-text-{i}.wav")
283
- else:
284
- desired_length = int(min_chars_to_split)
285
- texts = split_and_recombine_text(
286
- text, desired_length, desired_length + 100
287
- )
288
- filepaths = infer_on_texts(
289
- call_tts,
290
- texts,
291
- voice_path,
292
- return_deterministic_state=True,
293
- return_filepaths=True,
294
- lines_to_regen=set(range(len(texts))),
295
- voicefixer=voice_fixer,
296
- )
297
- for i, fp in enumerate(filepaths):
298
- show_generation(fp, f"{selected_voice}-text-{i}.wav")
299
- if produce_debug_state:
300
- """Debug states can be found in the output directory"""
301
-
302
-
303
- if __name__ == "__main__":
304
- main()
 
 
 
1
  import os
2
  import shutil
3
  from pathlib import Path
 
4
  import streamlit as st
5
+ import torchaudio
6
+ import IPython
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
+ from tortoise.api import TextToSpeech
9
+ from tortoise.utils.audio import load_voice
 
 
 
10
 
11
+ # Initialize TextToSpeech model
12
+ tts = TextToSpeech()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
+ # Constants
15
+ PRESETS = ["ultra_fast", "fast", "standard", "high_quality", "very_fast"]
16
+ UPLOAD_FOLDER = "./uploads"
17
+ OUTPUT_FOLDER = "./output"
18
 
19
+ # Create upload and output directories if they don't exist
20
+ os.makedirs(UPLOAD_FOLDER, exist_ok=True)
21
+ os.makedirs(OUTPUT_FOLDER, exist_ok=True)
 
22
 
23
+ # Streamlit UI elements
24
+ st.title("Tortoise Text-to-Speech App")
25
 
26
+ # Upload .wav files
27
+ st.sidebar.header("Upload Audio Samples")
28
+ uploaded_files = st.sidebar.file_uploader(
29
+ "Upload Audio Samples for a New Voice",
30
+ accept_multiple_files=True,
31
+ type=["wav"],
32
+ )
33
 
34
+ # Create a new voice
35
+ voice_name = st.sidebar.text_input("New Voice Name", help="Enter a name for your new voice.")
36
+
37
+ if st.sidebar.button("Create Voice") and voice_name.strip() != "":
38
+ new_voice_name = voice_name.strip().replace(" ", "_")
39
+ voices_dir = f"./tortoise/voices/{new_voice_name}/"
40
+ if os.path.exists(voices_dir):
41
+ shutil.rmtree(voices_dir)
42
+ os.makedirs(voices_dir)
43
+
44
+ for index, uploaded_file in enumerate(uploaded_files):
45
+ bytes_data = uploaded_file.read()
46
+ with open(f"{voices_dir}voice_sample{index}.wav", "wb") as wav_file:
47
+ wav_file.write(bytes_data)
48
+
49
+ st.sidebar.success(f"Voice '{voice_name}' created successfully!")
50
+
51
+ # Input text and settings
52
+ st.header("Text-to-Speech Generation")
53
+ text = st.text_area(
54
+ "Enter Text",
55
+ help="Enter the text you want to convert to speech.",
56
+ value="Joining two modalities results in a surprising increase in generalization! What would happen if we combined them all?",
57
+ )
58
 
59
+ preset = st.selectbox("Preset", PRESETS, help="Select a voice preset.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
+ voices = [v for v in os.listdir("tortoise/voices") if v != "cond_latent_example"]
62
+ voice = st.selectbox("Voice", voices, help="Select a voice to use for generation.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
+ # Generate speech
65
+ if st.button("Generate Speech"):
66
+ if voice_name.strip() == "":
67
+ st.warning("Please create a voice first.")
68
+ else:
69
+ st.info("Generating speech...")
 
 
70
 
71
+ # Load voice samples
72
+ voice_samples, conditioning_latents = load_voice(voice)
 
 
 
 
73
 
74
+ # Generate speech with Tortoise
75
+ gen = tts.tts_with_preset(
76
+ text,
77
+ voice_samples=voice_samples,
78
+ conditioning_latents=conditioning_latents,
79
+ preset=preset,
 
 
80
  )
81
 
82
+ # Save and display the generated audio
83
+ output_path = os.path.join(OUTPUT_FOLDER, "generated.wav")
84
+ torchaudio.save(output_path, gen.squeeze(0).cpu(), 24000)
85
+ st.audio(output_path, format="audio/wav")
86
+
87
+ # Show generated output
88
+ st.subheader("Generated Output")
89
+ st.audio(output_path, format="audio/wav")
90
+ if st.checkbox("Play Audio"):
91
+ IPython.display.Audio(output_path)
92
+
93
+ st.success("Speech generated successfully!")
94
+
95
+ # Clean up uploaded files and output directory
96
+ if st.sidebar.button("Clean Up"):
97
+ shutil.rmtree(UPLOAD_FOLDER)
98
+ os.makedirs(UPLOAD_FOLDER, exist_ok=True)
99
+ shutil.rmtree(OUTPUT_FOLDER)
100
+ os.makedirs(OUTPUT_FOLDER, exist_ok=True)
101
+ st.sidebar.success("Clean up completed!")
102
+
103
+ # Display information
104
+ st.sidebar.header("Information")
105
+ st.sidebar.markdown(
106
+ "This app allows you to create a new voice by uploading .wav files. You can then generate speech "
107
+ "using the selected voice and preset. You can play the generated audio and clean up uploaded files and the output directory when needed."
108
+ )