"""Streamlit front end for Tortoise text-to-speech.

The sidebar lets the user upload .wav samples and store them as a named voice
under ``VOICES_DIR``; the main panel synthesises speech for the entered text
with the selected voice and preset and plays the result.
"""

import os
import shutil
from pathlib import Path

import IPython
import streamlit as st
import torchaudio
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice

# Constants
# NOTE(review): "very_fast" is not a preset in every tortoise-tts release --
# confirm the installed version accepts it.
PRESETS = ["ultra_fast", "fast", "standard", "high_quality", "very_fast"]
UPLOAD_FOLDER = "./uploads"
OUTPUT_FOLDER = "./output"
VOICES_DIR = Path("tortoise/voices")
SAMPLE_RATE = 24000  # sample rate passed to torchaudio.save for generated audio


def _load_tts():
    """Build the TextToSpeech model."""
    return TextToSpeech()


# Streamlit re-executes this script on every widget interaction, so cache the
# model instead of constructing it on each rerun. Fall back to the older
# singleton API, or to no caching at all, on Streamlit versions lacking it.
_cache_resource = getattr(st, "cache_resource", None) or getattr(
    st, "experimental_singleton", None
)
if _cache_resource is not None:
    _load_tts = _cache_resource(_load_tts)

# Initialize TextToSpeech model
tts = _load_tts()


def _safe_voice_dir_name(raw_name):
    """Return a directory-safe voice name, or None if *raw_name* is unusable.

    Spaces become underscores. Names that are empty, are "." / "..", or
    contain a path separator are rejected so the resulting path (which is
    passed to shutil.rmtree) cannot point outside VOICES_DIR.
    """
    name = raw_name.strip().replace(" ", "_")
    if not name or name in {".", ".."}:
        return None
    if "/" in name or "\\" in name:
        return None
    return name


def _list_voices():
    """Return the sorted names of voice directories found in VOICES_DIR."""
    if not VOICES_DIR.is_dir():
        return []
    return sorted(
        entry.name
        for entry in VOICES_DIR.iterdir()
        if entry.is_dir() and entry.name != "cond_latent_example"
    )


# Create upload and output directories if they don't exist
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Streamlit UI elements
st.title("Tortoise Text-to-Speech App")

# Upload .wav files
st.sidebar.header("Upload Audio Samples")
uploaded_files = st.sidebar.file_uploader(
    "Upload Audio Samples for a New Voice",
    accept_multiple_files=True,
    type=["wav"],
)

# Create a new voice
voice_name = st.sidebar.text_input(
    "New Voice Name", help="Enter a name for your new voice."
)
if st.sidebar.button("Create Voice"):
    new_voice_name = _safe_voice_dir_name(voice_name)
    if new_voice_name is None:
        st.sidebar.error(
            "Please enter a voice name without path separators."
        )
    elif not uploaded_files:
        # Checked before touching the disk so an existing voice is not wiped
        # and replaced by an empty directory.
        st.sidebar.error("Please upload at least one .wav sample first.")
    else:
        voice_dir = VOICES_DIR / new_voice_name
        # Re-creating a voice replaces any samples stored under the same name.
        if voice_dir.exists():
            shutil.rmtree(voice_dir)
        voice_dir.mkdir(parents=True)
        for index, uploaded_file in enumerate(uploaded_files):
            # getvalue() returns the whole buffer regardless of the current
            # read position of the uploaded file object.
            sample_path = voice_dir / f"voice_sample{index}.wav"
            sample_path.write_bytes(uploaded_file.getvalue())
        st.sidebar.success(f"Voice '{voice_name}' created successfully!")

# Input text and settings
st.header("Text-to-Speech Generation")
text = st.text_area(
    "Enter Text",
    help="Enter the text you want to convert to speech.",
    value=(
        "Joining two modalities results in a surprising increase in "
        "generalization! What would happen if we combined them all?"
    ),
)
preset = st.selectbox("Preset", PRESETS, help="Select a voice preset.")
# Listed after the "Create Voice" handling so a voice created in this run
# already shows up in the selector.
voices = _list_voices()
voice = st.selectbox(
    "Voice", voices, help="Select a voice to use for generation."
)

# Generate speech
if st.button("Generate Speech"):
    if not voice:
        # The selector is empty when no voice directory exists yet.
        st.warning("Please create a voice first.")
    elif not text.strip():
        st.warning("Please enter some text to convert to speech.")
    else:
        st.info("Generating speech...")

        # Load voice samples
        # NOTE(review): load_voice resolves the voice by name -- confirm its
        # search directory matches VOICES_DIR for the way this app is launched.
        voice_samples, conditioning_latents = load_voice(voice)

        # Generate speech with Tortoise
        gen = tts.tts_with_preset(
            text,
            voice_samples=voice_samples,
            conditioning_latents=conditioning_latents,
            preset=preset,
        )

        # Save and display the generated audio
        output_path = os.path.join(OUTPUT_FOLDER, "generated.wav")
        torchaudio.save(output_path, gen.squeeze(0).cpu(), SAMPLE_RATE)

        # Show generated output
        st.subheader("Generated Output")
        st.audio(output_path, format="audio/wav")
        st.success("Speech generated successfully!")

# Clean up uploaded files and output directory
if st.sidebar.button("Clean Up"):
    # Both folders are created near the top of every run, so they exist here.
    shutil.rmtree(UPLOAD_FOLDER)
    os.makedirs(UPLOAD_FOLDER, exist_ok=True)
    shutil.rmtree(OUTPUT_FOLDER)
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)
    st.sidebar.success("Clean up completed!")

# Display information
st.sidebar.header("Information")
st.sidebar.markdown(
    "This app allows you to create a new voice by uploading .wav files. You can then generate speech "
    "using the selected voice and preset. You can play the generated audio and clean up uploaded files and the output directory when needed."
)