File size: 4,270 Bytes
ff9c7de
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import os
import re
import shutil
import subprocess

import numpy as np
import streamlit as st
import torch
from phonemizer import phonemize
from phonemizer.backend.espeak.espeak import EspeakBackend
from scipy.io.wavfile import write as write_wav

from kokoro import generate
from models import build_model

@st.cache_resource
def load_model():
    """Build the Kokoro TTS model once per session and cache it.

    Returns:
        tuple: ``(model, device)`` where ``device`` is ``'cuda'`` when a GPU
        is available, otherwise ``'cpu'``.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    return build_model('kokoro-v0_19.pth', device), device

@st.cache_data
def load_voicepack(voice_name, device):
    """Load the voicepack tensor for *voice_name* and move it to *device*.

    The result is cached by Streamlit. On a missing voice file an error is
    shown in the UI and ``None`` is returned instead of raising.
    """
    pack_path = f'voices/{voice_name}.pt'
    try:
        # weights_only=True keeps torch.load from unpickling arbitrary objects.
        return torch.load(pack_path, weights_only=True).to(device)
    except FileNotFoundError:
        st.error(f"Errore: La voce '{voice_name}' non è stata trovata.")
        return None

def get_espeak_path():
    """Locate the ``espeak`` executable on PATH.

    Uses :func:`shutil.which`, which is portable (the previous implementation
    spawned the Unix-only ``which`` command via subprocess, so it failed on
    Windows even when espeak was installed).

    Returns:
        str | None: absolute path to espeak, or ``None`` after showing a
        Streamlit error when it cannot be found.
    """
    espeak_path = shutil.which('espeak')
    if espeak_path:
        return espeak_path
    st.error("Errore: espeak non trovato. Assicurati che espeak sia installato e nel PATH.")
    return None

def count_tokens(text, lang='en-us'):
    """Approximate the Kokoro token count of *text* via its phonemization.

    Phonemizes *text* with espeak and counts the resulting space-separated
    phoneme units. Returns 0 when espeak cannot be located (an error is
    already shown by get_espeak_path in that case).

    Note: assumes ``phonemize`` is given a plain string here — TODO confirm
    against phonemizer's return type for list inputs.
    """
    espeak_path = get_espeak_path()
    if not espeak_path:
        return 0
    # phonemizer resolves the espeak binary through this environment variable.
    os.environ['PHONEMIZER_ESPEAK_PATH'] = espeak_path
    out_ps = phonemize(text, backend='espeak', language=lang,
                       preserve_punctuation=True, with_stress=True)
    joined = ' '.join(out_ps).replace('  ', ' ').strip()
    # Split on single spaces and drop empty fragments left by space runs.
    return sum(1 for tok in joined.split(' ') if tok)

def split_text(text, max_tokens=500, lang='en-us'):
    """Split *text* into segments of at most roughly *max_tokens* tokens.

    Sentences are detected with a regex that splits after '.' or '?' while
    avoiding common abbreviation patterns. A running token total is kept so
    each sentence is phonemized exactly once; the original implementation
    re-phonemized the whole accumulated segment on every iteration, doing
    O(n²) phonemizer work on long inputs.

    Note: a single sentence longer than *max_tokens* is still emitted as its
    own (oversized) segment, matching the previous behavior.

    Args:
        text: input text to segment.
        max_tokens: soft upper bound on tokens per segment.
        lang: espeak language code passed to count_tokens.

    Returns:
        list[str]: stripped, non-empty text segments in original order.
    """
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    segments = []
    current_segment = ""
    current_tokens = 0  # running total for current_segment, avoids re-phonemizing
    for sentence in sentences:
        sentence_tokens = count_tokens(sentence, lang)
        if current_tokens + sentence_tokens <= max_tokens:
            current_segment = f"{current_segment} {sentence}" if current_segment else sentence
            current_tokens += sentence_tokens
        else:
            if current_segment:  # only flush non-empty segments
                segments.append(current_segment.strip())
            current_segment = sentence
            current_tokens = sentence_tokens
    if current_segment:
        segments.append(current_segment.strip())
    return segments

def generate_audio(model, text, voicepack, voice_name):
    """Synthesize speech for *text*, splitting long input into segments.

    The first letter of *voice_name* selects the phonemizer language:
    'b' maps to British English, anything else to American English, and the
    same letter is forwarded to kokoro's generate() as its lang code.

    Returns:
        np.ndarray | None: all segment waveforms concatenated, or ``None``
        when no segments produced audio.
    """
    lang = 'en-gb' if voice_name[0] == 'b' else 'en-us'
    chunks = [
        generate(model, segment, voicepack, lang=voice_name[0])[0]
        for segment in split_text(text, lang=lang)
    ]
    return np.concatenate(chunks) if chunks else None

def main():
    """Streamlit entry point: voice picker, text box, synthesis and download."""
    st.title("Kokoro TTS Demo")

    model, device = load_model()

    voices = [
        'af', 'af_bella', 'af_sarah', 'am_adam', 'am_michael',
        'bf_emma', 'bf_isabella', 'bm_george', 'bm_lewis',
        'af_nicole', 'af_sky'
    ]
    voice = st.selectbox("Seleziona una voce", voices)
    text = st.text_area("Inserisci il testo da convertire in audio:")

    # Nothing to do until the user presses the button.
    if not st.button("Genera Audio"):
        return
    if not text:
        st.warning("Inserisci del testo.")
        return

    voicepack = load_voicepack(voice, device)
    if voicepack is None:
        return

    with st.spinner("Generando audio..."):
        samples = generate_audio(model, text, voicepack, voice)
    if samples is None:
        return

    st.audio(samples, sample_rate=24000)
    st.success("Audio generato con successo!")

    # Round-trip through a temporary WAV file so the download button can
    # serve real file bytes; removed once the button is registered.
    wav_path = "temp_audio.wav"
    write_wav(wav_path, 24000, samples)
    with open(wav_path, "rb") as fh:
        st.download_button(
            label="Scarica Audio",
            data=fh,
            file_name="output.wav",
            mime="audio/wav"
        )
    os.remove(wav_path)

if __name__ == "__main__":
    main()