# kokoromic / app.py — Kokoro TTS Streamlit demo (originally published on Hugging Face Spaces)
# Standard library
import io
import os
import re
import shutil
import subprocess

# Third-party
import numpy as np
import streamlit as st
import torch
from phonemizer import phonemize
from phonemizer.backend.espeak.espeak import EspeakBackend
from scipy.io.wavfile import write as write_wav

# Local
from kokoro import generate
from models import build_model
@st.cache_resource
def load_model():
    """Build the Kokoro model once per session and cache it.

    Returns:
        tuple: ``(model, device)`` where ``device`` is ``'cuda'`` when a GPU
        is available, otherwise ``'cpu'``.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    return build_model('kokoro-v0_19.pth', device), device
@st.cache_data
def load_voicepack(voice_name, device):
    """Load the voicepack tensor for *voice_name* and move it to *device*.

    Returns:
        The voicepack tensor, or ``None`` (after showing a Streamlit error)
        when the voice file does not exist under ``voices/``.
    """
    pack_path = f'voices/{voice_name}.pt'
    try:
        # weights_only=True: safe deserialization, refuses arbitrary pickles.
        return torch.load(pack_path, weights_only=True).to(device)
    except FileNotFoundError:
        st.error(f"Errore: La voce '{voice_name}' non è stata trovata.")
        return None
def get_espeak_path():
    """Locate the espeak executable on PATH.

    Returns:
        str | None: Path to the espeak binary, or ``None`` (after showing a
        Streamlit error) when it is not installed.
    """
    # shutil.which is portable (also works on Windows) and avoids spawning a
    # subprocess — the previous implementation shelled out to `which`, which
    # fails on systems that lack that binary.
    path = shutil.which('espeak')
    if path is None:
        st.error("Errore: espeak non trovato. Assicurati che espeak sia installato e nel PATH.")
        return None
    return path
def count_tokens(text, lang='en-us'):
    """Count the phoneme tokens of *text* via the espeak phonemizer backend.

    Returns 0 when espeak cannot be located (an error is already shown by
    get_espeak_path in that case).
    """
    espeak_path = get_espeak_path()
    if not espeak_path:
        return 0
    # Point phonemizer at the located espeak binary before phonemizing.
    os.environ['PHONEMIZER_ESPEAK_PATH'] = espeak_path
    out_ps = phonemize(text, backend='espeak', language=lang, preserve_punctuation=True, with_stress=True)
    # NOTE(review): with a str input, phonemize returns a str, so this join
    # inserts a space between every character — the count below is therefore
    # effectively per phoneme *character*. That may match Kokoro's
    # character-level tokenizer, but confirm it is intentional.
    # NOTE(review): replace(' ', ' ') is a no-op as written — possibly a
    # garbled double-space collapse ('  ' -> ' '); verify against upstream.
    ps = ' '.join(out_ps).replace(' ', ' ').strip()
    ps = [x for x in ps.split(' ') if x != '']
    return len(ps)
def split_text(text, max_tokens=500, lang='en-us'):
    """Split *text* into sentence-aligned segments of at most ~max_tokens tokens.

    Sentences are detected with a regex that avoids splitting on common
    abbreviations and initials. A single sentence longer than *max_tokens*
    still becomes its own segment (it cannot be split further here).

    Args:
        text: Input text to segment.
        max_tokens: Soft upper bound on tokens per segment.
        lang: Language code forwarded to the phonemizer.

    Returns:
        list[str]: Non-empty, stripped segments in original order.
    """
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    segments = []
    current_segment = ""
    # Track the running token count incrementally: the previous version
    # re-phonemized the entire accumulated segment for every sentence,
    # i.e. O(n^2) expensive phonemizer/espeak invocations.
    current_tokens = 0
    for sentence in sentences:
        sentence_tokens = count_tokens(sentence, lang)
        if current_tokens + sentence_tokens <= max_tokens:
            current_segment += " " + sentence if current_segment else sentence
            current_tokens += sentence_tokens
        else:
            if current_segment:  # only flush non-empty segments
                segments.append(current_segment.strip())
            current_segment = sentence
            current_tokens = sentence_tokens
    if current_segment:
        segments.append(current_segment.strip())
    return segments
def generate_audio(model, text, voicepack, voice_name):
    """Synthesize speech for *text*, chunking long input via split_text.

    The first letter of *voice_name* selects the phonemization language:
    'b' maps to British English ('en-gb'); 'a' and anything else fall back
    to 'en-us'. Returns the concatenated waveform, or None when no segments
    produced audio.
    """
    prefix = voice_name[0]
    lang = 'en-gb' if prefix == 'b' else 'en-us'
    chunks = []
    for piece in split_text(text, lang=lang):
        waveform, _ = generate(model, piece, voicepack, lang=prefix)
        chunks.append(waveform)
    return np.concatenate(chunks) if chunks else None
def main():
    """Streamlit entry point: voice selection, text input, TTS playback and download."""
    st.title("Kokoro TTS Demo")
    model, device = load_model()
    voice_options = [
        'af', 'af_bella', 'af_sarah', 'am_adam', 'am_michael',
        'bf_emma', 'bf_isabella', 'bm_george', 'bm_lewis',
        'af_nicole', 'af_sky'
    ]
    selected_voice = st.selectbox("Seleziona una voce", voice_options)
    input_text = st.text_area("Inserisci il testo da convertire in audio:")
    if st.button("Genera Audio"):
        if not input_text:
            st.warning("Inserisci del testo.")
            return
        voicepack = load_voicepack(selected_voice, device)
        if voicepack is None:
            return
        with st.spinner("Generando audio..."):
            audio = generate_audio(model, input_text, voicepack, selected_voice)
        if audio is not None:
            st.audio(audio, sample_rate=24000)
            st.success("Audio generato con successo!")
            # Render the WAV into memory instead of a shared on-disk temp
            # file: the old temp_audio.wav leaked if any exception fired
            # before os.remove and raced between concurrent sessions.
            wav_buffer = io.BytesIO()
            write_wav(wav_buffer, 24000, audio)
            wav_buffer.seek(0)
            st.download_button(
                label="Scarica Audio",
                data=wav_buffer,
                file_name="output.wav",
                mime="audio/wav"
            )


if __name__ == "__main__":
    main()