# Kokoro TTS Streamlit demo.
# (The original paste carried Hugging Face Spaces page residue — "Spaces: /
# Running / Running" — which was not part of the program.)
import io
import os
import re
import shutil
import subprocess

import numpy as np
import streamlit as st
import torch
from phonemizer import phonemize
from phonemizer.backend.espeak.espeak import EspeakBackend
from scipy.io.wavfile import write as write_wav

from kokoro import generate
from models import build_model
def load_model():
    """Build the Kokoro model, preferring GPU when one is available.

    Returns:
        tuple: (model, device_string) where device_string is 'cuda' or 'cpu'.
    """
    target_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    return build_model('kokoro-v0_19.pth', target_device), target_device
def load_voicepack(voice_name, device):
    """Load the voice embedding pack ``voices/<voice_name>.pt`` onto *device*.

    Returns the loaded tensor, or None (after showing a UI error) when the
    voice file does not exist.
    """
    pack_path = f'voices/{voice_name}.pt'
    try:
        return torch.load(pack_path, weights_only=True).to(device)
    except FileNotFoundError:
        st.error(f"Errore: La voce '{voice_name}' non è stata trovata.")
        return None
def get_espeak_path():
    """Locate the espeak binary on PATH.

    Returns the absolute path to espeak, or None (after showing a UI error)
    when it cannot be found.
    """
    # shutil.which replaces the previous `subprocess.run(['which', ...])`
    # call: no child process, and it also works on Windows where `which`
    # does not exist.
    espeak = shutil.which('espeak')
    if espeak is None:
        st.error("Errore: espeak non trovato. Assicurati che espeak sia installato e nel PATH.")
        return None
    return espeak
def count_tokens(text, lang='en-us'):
    """Approximate the Kokoro token count of *text* via phonemization.

    Returns 0 when espeak cannot be located on this machine.
    """
    espeak_path = get_espeak_path()
    if not espeak_path:
        return 0
    # Point phonemizer at the espeak binary that was just discovered.
    os.environ['PHONEMIZER_ESPEAK_PATH'] = espeak_path
    out_ps = phonemize(text, backend='espeak', language=lang, preserve_punctuation=True, with_stress=True)
    # NOTE(review): phonemize() called with a plain string returns a string,
    # so the ' '.join below interleaves a space between every character; the
    # final count is therefore per phoneme *character*, not per word.
    # Presumably intentional (Kokoro tokenizes one phoneme character per
    # token) — confirm before changing.
    ps = ' '.join(out_ps).replace('  ', ' ').strip()
    ps = [x for x in ps.split(' ') if x != '']
    return len(ps)
def split_text(text, max_tokens=500, lang='en-us'):
    """Split *text* into sentence-aligned segments of at most *max_tokens* tokens.

    Sentences (split on '.'/'?' boundaries, skipping common abbreviations)
    are greedily packed into segments. A running token total is kept so the
    accumulated segment is never re-phonemized — the original re-counted
    `current_segment` on every iteration, costing one phonemizer call per
    sentence over ever-growing text (quadratic work).

    Note: a single sentence longer than max_tokens is still emitted as its
    own segment, unsplit.
    """
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    segments = []
    current_segment = ""
    current_tokens = 0  # token count of current_segment, maintained incrementally
    for sentence in sentences:
        if not sentence:
            # The regex split can yield empty strings; don't phonemize them.
            continue
        sentence_tokens = count_tokens(sentence, lang)
        if current_tokens + sentence_tokens <= max_tokens:
            current_segment += " " + sentence if current_segment else sentence
            current_tokens += sentence_tokens
        else:
            if current_segment:  # flush only a non-empty segment
                segments.append(current_segment.strip())
            current_segment = sentence
            current_tokens = sentence_tokens
    if current_segment:
        segments.append(current_segment.strip())
    return segments
def generate_audio(model, text, voicepack, voice_name):
    """Synthesize speech for *text*, chunking long input before generation.

    The first letter of *voice_name* selects the accent ('b' -> British
    English, anything else -> American English) and is also forwarded to
    ``generate`` as its language code. Returns the concatenated audio
    samples, or None when the text produced no segments.
    """
    accent = voice_name[0]
    lang = 'en-gb' if accent == 'b' else 'en-us'
    chunks = [
        generate(model, segment, voicepack, lang=accent)[0]
        for segment in split_text(text, lang=lang)
    ]
    if not chunks:
        return None
    return np.concatenate(chunks)
def main():
    """Streamlit entry point: voice picker, text input, synthesis, download.

    Builds the WAV for download entirely in memory. The original wrote a
    fixed-name temp file ("temp_audio.wav") — racy between concurrent
    sessions — and removed it while download_button still held the handle.
    """
    st.title("Kokoro TTS Demo")
    model, device = load_model()
    voice_options = [
        'af', 'af_bella', 'af_sarah', 'am_adam', 'am_michael',
        'bf_emma', 'bf_isabella', 'bm_george', 'bm_lewis',
        'af_nicole', 'af_sky'
    ]
    selected_voice = st.selectbox("Seleziona una voce", voice_options)
    input_text = st.text_area("Inserisci il testo da convertire in audio:")
    if st.button("Genera Audio"):
        if not input_text:
            st.warning("Inserisci del testo.")
            return
        voicepack = load_voicepack(selected_voice, device)
        if voicepack is None:
            return
        with st.spinner("Generando audio..."):
            audio = generate_audio(model, input_text, voicepack, selected_voice)
        if audio is not None:
            st.audio(audio, sample_rate=24000)
            st.success("Audio generato con successo!")
            # scipy's write accepts a file-like object: no file on disk,
            # nothing to clean up, no cross-session filename collision.
            wav_buffer = io.BytesIO()
            write_wav(wav_buffer, 24000, audio)
            wav_buffer.seek(0)
            st.download_button(
                label="Scarica Audio",
                data=wav_buffer,
                file_name="output.wav",
                mime="audio/wav"
            )
        else:
            # Previously a silent no-op; surface the failure to the user.
            st.error("Errore: nessun audio generato.")
# Run the Streamlit app when this file is executed directly.
if __name__ == "__main__":
    main()