michecosta committed on
Commit
ff9c7de
·
verified ·
1 Parent(s): 1663b08

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +125 -0
app.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import torch
3
+ from models import build_model
4
+ from kokoro import generate
5
+ from scipy.io.wavfile import write as write_wav
6
+ import os
7
+ import re
8
+ import numpy as np
9
+ from phonemizer import phonemize
10
+ from phonemizer.backend.espeak.espeak import EspeakBackend
11
+ import subprocess
12
+
13
@st.cache_resource
def load_model():
    """Build the Kokoro model once and cache it across Streamlit reruns.

    Returns:
        A ``(model, device)`` tuple, where ``device`` is ``'cuda'`` when
        ``torch.cuda.is_available()`` reports a GPU and ``'cpu'`` otherwise.
    """
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'
    return build_model('kokoro-v0_19.pth', device), device
18
+
19
@st.cache_data
def load_voicepack(voice_name, device):
    """Load the voice pack stored at ``voices/<voice_name>.pt`` onto *device*.

    Args:
        voice_name: Base name of the voice file (without the ``.pt`` suffix).
        device: Target device string, e.g. ``'cpu'`` or ``'cuda'``.

    Returns:
        The loaded object moved to *device*, or ``None`` when the file does
        not exist (an error message is shown in the Streamlit page).
    """
    try:
        # map_location remaps storages while loading, so a pack that was
        # saved from a GPU can still be opened on a CPU-only host.
        voicepack = torch.load(
            f'voices/{voice_name}.pt',
            weights_only=True,
            map_location=device,
        )
    except FileNotFoundError:
        st.error(f"Errore: La voce '{voice_name}' non è stata trovata.")
        return None
    return voicepack.to(device)
27
+
28
def get_espeak_path():
    """Locate the ``espeak`` executable on the current ``PATH``.

    Returns:
        The path to ``espeak`` as a string, or ``None`` when it cannot be
        found (an error message is shown in the Streamlit page).
    """
    # Local import keeps this block self-contained. shutil.which performs the
    # PATH lookup in-process, so it also works where no external `which`
    # command exists (spawning it would raise FileNotFoundError there).
    import shutil

    espeak_path = shutil.which('espeak')
    if espeak_path is None:
        st.error("Errore: espeak non trovato. Assicurati che espeak sia installato e nel PATH.")
        return None
    return espeak_path
36
+
37
def count_tokens(text, lang='en-us'):
    """Count the tokens phonemizer produces for *text*.

    The phonemizer output is joined with single spaces, split on single
    spaces again, and the non-empty pieces are counted.

    Args:
        text: Text to phonemize.
        lang: espeak language code passed to ``phonemize``.

    Returns:
        The number of non-empty pieces, or ``0`` when espeak cannot be located.
    """
    espeak_binary = get_espeak_path()
    if espeak_binary:
        # Point phonemizer at the discovered espeak before invoking it.
        os.environ['PHONEMIZER_ESPEAK_PATH'] = espeak_binary
        phonemes = phonemize(
            text,
            backend='espeak',
            language=lang,
            preserve_punctuation=True,
            with_stress=True,
        )
        joined = ' '.join(phonemes).replace(' ', ' ').strip()
        return sum(1 for piece in joined.split(' ') if piece != '')
    return 0
47
+
48
def split_text(text, max_tokens=500, lang='en-us'):
    """Split *text* into segments whose estimated token count fits *max_tokens*.

    The text is first cut into sentences (after ``.`` or ``?`` followed by
    whitespace, skipping common abbreviation patterns). Consecutive sentences
    are then packed greedily into a segment until adding the next one would
    push the summed per-sentence token counts above *max_tokens*.

    Args:
        text: Input text.
        max_tokens: Upper bound on the summed token count of a segment.
        lang: Language code forwarded to ``count_tokens``.

    Returns:
        A list of stripped, non-empty segment strings. A single sentence that
        alone exceeds *max_tokens* is still emitted as its own segment.
    """
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    segments = []
    current_parts = []
    current_tokens = 0
    for sentence in sentences:
        # Blank pieces carry nothing to synthesize; skipping them also avoids
        # emitting an empty segment for whitespace-only input.
        if not sentence.strip():
            continue
        sentence_tokens = count_tokens(sentence, lang)
        # Keep a running total instead of re-counting the accumulated segment
        # for every sentence: each count invokes the phonemizer, so re-counting
        # made the work grow quadratically with the number of sentences.
        if current_parts and current_tokens + sentence_tokens > max_tokens:
            segments.append(" ".join(current_parts).strip())
            current_parts = []
            current_tokens = 0
        current_parts.append(sentence)
        current_tokens += sentence_tokens
    if current_parts:
        segments.append(" ".join(current_parts).strip())
    return segments
64
+
65
def generate_audio(model, text, voicepack, voice_name):
    """Synthesize *text* segment by segment and join the resulting audio.

    The first character of *voice_name* selects the language used for
    splitting: ``'b'`` maps to ``'en-gb'``, anything else to ``'en-us'``.
    That same character is passed to ``generate`` as its ``lang`` argument.

    Args:
        model: Model object handed to ``generate``.
        text: Text to synthesize.
        voicepack: Voice pack handed to ``generate``.
        voice_name: Name of the selected voice.

    Returns:
        The per-segment audio concatenated with ``np.concatenate``, or
        ``None`` when the text produced no segments.
    """
    voice_prefix = voice_name[0]
    lang = 'en-gb' if voice_prefix == 'b' else 'en-us'
    outputs = (
        generate(model, segment, voicepack, lang=voice_prefix)
        for segment in split_text(text, lang=lang)
    )
    # Each generate() result is a pair; only its first item (the audio) is kept.
    chunks = [audio for audio, _ in outputs]
    return np.concatenate(chunks) if chunks else None
82
+
83
def main():
    """Render the Streamlit page: pick a voice, enter text, generate audio.

    When the "Genera Audio" button is pressed with non-empty text, the
    selected voice pack is loaded, the text is synthesized, and the result is
    offered both as an inline player and as a downloadable WAV file.
    """
    # Local import keeps this block self-contained.
    import io

    st.title("Kokoro TTS Demo")

    model, device = load_model()

    voice_options = [
        'af', 'af_bella', 'af_sarah', 'am_adam', 'am_michael',
        'bf_emma', 'bf_isabella', 'bm_george', 'bm_lewis',
        'af_nicole', 'af_sky',
    ]
    selected_voice = st.selectbox("Seleziona una voce", voice_options)

    input_text = st.text_area("Inserisci il testo da convertire in audio:")

    if not st.button("Genera Audio"):
        return

    if not input_text:
        st.warning("Inserisci del testo.")
        return

    voicepack = load_voicepack(selected_voice, device)
    if voicepack is None:
        # load_voicepack already reported the missing voice file.
        return

    with st.spinner("Generando audio..."):
        audio = generate_audio(model, input_text, voicepack, selected_voice)

    if audio is None:
        return

    st.audio(audio, sample_rate=24000)
    st.success("Audio generato con successo!")

    # Encode the WAV in memory instead of a fixed "temp_audio.wav" on disk:
    # a shared file name can be clobbered by concurrent sessions and was left
    # behind whenever an exception occurred before it was removed.
    wav_buffer = io.BytesIO()
    write_wav(wav_buffer, 24000, audio)
    st.download_button(
        label="Scarica Audio",
        data=wav_buffer.getvalue(),
        file_name="output.wav",
        mime="audio/wav",
    )
123
+
124
# Script entry point: launch the app only when this file is executed directly.
if __name__ == "__main__":
    main()