Spaces:
Running
Running
michecosta
committed on
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import torch
|
3 |
+
from models import build_model
|
4 |
+
from kokoro import generate
|
5 |
+
from scipy.io.wavfile import write as write_wav
|
6 |
+
import os
|
7 |
+
import re
|
8 |
+
import numpy as np
|
9 |
+
from phonemizer import phonemize
|
10 |
+
from phonemizer.backend.espeak.espeak import EspeakBackend
|
11 |
+
import subprocess
|
12 |
+
|
13 |
+
@st.cache_resource
def load_model():
    """Build the Kokoro model and return it together with its device.

    Returns:
        A ``(model, device)`` tuple. ``device`` is ``'cuda'`` when
        ``torch.cuda.is_available()`` is true and ``'cpu'`` otherwise.

    The function is wrapped in ``st.cache_resource`` so the checkpoint
    ``kokoro-v0_19.pth`` is not rebuilt on every call.
    """
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'
    return build_model('kokoro-v0_19.pth', device), device
|
18 |
+
|
19 |
+
@st.cache_data
def load_voicepack(voice_name, device):
    """Load the voicepack tensor for ``voice_name`` onto ``device``.

    Args:
        voice_name: Base name of the file under ``voices/`` (without ``.pt``).
        device: Torch device string the tensor should live on.

    Returns:
        The loaded voicepack moved to ``device``, or ``None`` when the file
        does not exist (an error message is shown in the UI in that case).
    """
    voice_path = f'voices/{voice_name}.pt'
    try:
        # map_location lets a voicepack that was saved from a CUDA device be
        # deserialised on a CPU-only host instead of failing inside torch.load.
        voicepack = torch.load(voice_path, map_location=device, weights_only=True)
    except FileNotFoundError:
        st.error(f"Errore: La voce '{voice_name}' non è stata trovata.")
        return None
    # Kept from the original; effectively a no-op once map_location is applied.
    return voicepack.to(device)
|
27 |
+
|
28 |
+
def get_espeak_path():
    """Find the path of the ``espeak`` executable.

    Returns:
        The absolute path of ``espeak`` as found on ``PATH``, or ``None`` when
        it is not installed (an error message is shown in the UI in that case).
    """
    # Local import so this block does not depend on the file's import header.
    import shutil

    # shutil.which searches PATH in-process: no child process is spawned and
    # it also works on platforms that have no `which` binary.
    espeak_path = shutil.which('espeak')
    if espeak_path is None:
        st.error("Errore: espeak non trovato. Assicurati che espeak sia installato e nel PATH.")
        return None
    return espeak_path
|
36 |
+
|
37 |
+
def count_tokens(text, lang='en-us'):
    """Count the tokens of ``text`` using phonemizer.

    Args:
        text: Text to phonemize.
        lang: espeak language code passed to ``phonemize``.

    Returns:
        The number of non-empty, space-separated pieces obtained after
        joining the phonemizer output with spaces. Returns 0 for blank text
        and when espeak cannot be located.
    """
    # Blank input has no tokens; answer directly instead of looking up espeak
    # and invoking the phonemizer for nothing.
    if isinstance(text, str) and not text.strip():
        return 0

    espeak_path = get_espeak_path()
    if not espeak_path:
        return 0
    os.environ['PHONEMIZER_ESPEAK_PATH'] = espeak_path

    out_ps = phonemize(
        text,
        backend='espeak',
        language=lang,
        preserve_punctuation=True,
        with_stress=True,
    )
    # NOTE(review): for a plain string input phonemize presumably returns a
    # string, in which case the join below separates individual characters
    # and the result is a per-character count -- confirm against phonemizer.
    pieces = ' '.join(out_ps).strip().split(' ')
    # Empty pieces (from runs of spaces) are not tokens.
    return sum(1 for piece in pieces if piece != '')
|
47 |
+
|
48 |
+
def split_text(text, max_tokens=500, lang='en-us'):
    """Split ``text`` into smaller segments based on token count.

    The text is cut at whitespace that follows a ``.`` or ``?`` (with
    look-behinds that skip some abbreviation-like patterns), and consecutive
    sentences are packed greedily into segments.

    Args:
        text: Text to split.
        max_tokens: Token budget per segment, measured with ``count_tokens``.
        lang: Language code forwarded to ``count_tokens``.

    Returns:
        A list of stripped segment strings. A single sentence that alone
        exceeds ``max_tokens`` is still emitted as its own (oversized)
        segment; it is not split further here.
    """
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    segments = []
    current_segment = ""
    # Running total of the sentence counts in current_segment. Tracking it
    # avoids re-phonemizing the whole growing segment for every sentence; the
    # budget check was already a sum-of-parts estimate.
    current_tokens = 0
    for sentence in sentences:
        sentence_tokens = count_tokens(sentence, lang)
        if current_tokens + sentence_tokens <= max_tokens:
            current_segment += " " + sentence if current_segment else sentence
            current_tokens += sentence_tokens
            continue
        # Budget exceeded: flush what we have and start over with this sentence.
        if current_segment:
            segments.append(current_segment.strip())
        current_segment = sentence
        current_tokens = sentence_tokens
    if current_segment:
        segments.append(current_segment.strip())
    return segments
|
64 |
+
|
65 |
+
def generate_audio(model, text, voicepack, voice_name):
    """Generate audio from text, handling the splitting of the text.

    The first character of ``voice_name`` selects the language: ``'b'`` maps
    to ``'en-gb'``, anything else to ``'en-us'``. That language drives the
    text splitting, while the raw first character is what gets passed to
    ``generate`` as ``lang``.

    Returns:
        The per-segment audio arrays concatenated with ``np.concatenate``, or
        ``None`` when the text produced no segments.
    """
    voice_prefix = voice_name[0]
    split_lang = 'en-gb' if voice_prefix == 'b' else 'en-us'

    pieces = []
    for chunk in split_text(text, lang=split_lang):
        # generate returns a pair; only the first element (the audio) is kept.
        chunk_audio, _ = generate(model, chunk, voicepack, lang=voice_prefix)
        pieces.append(chunk_audio)

    if not pieces:
        return None
    return np.concatenate(pieces)
|
82 |
+
|
83 |
+
def main():
    """Render the Kokoro TTS demo page and handle one button press.

    The page offers a voice selector and a text area. When the button is
    pressed with non-empty text, the selected voicepack is loaded, audio is
    generated, played back in the page and offered as a WAV download.
    """
    # Local import so this block does not depend on the file's import header.
    import io

    sample_rate = 24000

    st.title("Kokoro TTS Demo")

    model, device = load_model()

    voice_options = [
        'af', 'af_bella', 'af_sarah', 'am_adam', 'am_michael',
        'bf_emma', 'bf_isabella', 'bm_george', 'bm_lewis',
        'af_nicole', 'af_sky',
    ]
    selected_voice = st.selectbox("Seleziona una voce", voice_options)

    input_text = st.text_area("Inserisci il testo da convertire in audio:")

    if not st.button("Genera Audio"):
        return

    if not input_text:
        st.warning("Inserisci del testo.")
        return

    voicepack = load_voicepack(selected_voice, device)
    if voicepack is None:
        return

    with st.spinner("Generando audio..."):
        audio = generate_audio(model, input_text, voicepack, selected_voice)

    if audio is None:
        return

    st.audio(audio, sample_rate=sample_rate)
    st.success("Audio generato con successo!")

    # Encode the WAV in memory rather than through a fixed-name file in the
    # working directory: concurrent sessions would overwrite each other's
    # "temp_audio.wav", and a failure mid-way would leave the file behind.
    wav_buffer = io.BytesIO()
    write_wav(wav_buffer, sample_rate, audio)
    st.download_button(
        label="Scarica Audio",
        data=wav_buffer.getvalue(),
        file_name="output.wav",
        mime="audio/wav",
    )
|
123 |
+
|
124 |
+
# Run the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|