Update app.py
app.py CHANGED
@@ -5,22 +5,24 @@ from kokoro import KPipeline
 import os
 from huggingface_hub import list_repo_files
 import uuid
 import re
 import gradio as gr
 key = os.getenv("SECRET_KEY", None)
 #translate langauge
 from deep_translator import GoogleTranslator
 def bulk_translate(text, target_language, chunk_size=500):
     language_map_local = {
         "American English": "en",
         "British English": "en",
         "Hindi": "hi",
         "Spanish": "es",
         "French": "fr",
         "Italian": "it",
         "Brazilian Portuguese": "pt",
         "Japanese": "ja",
-        "Mandarin Chinese": "zh-CN"
+        "Mandarin Chinese": "zh-CN",
+        "Russian": "ru",  # Added Russian
+        "German": "de"  # Added German
     }
     # lang_code = GoogleTranslator().get_supported_languages(as_dict=True).get(target_language.lower())
     lang_code=language_map_local[target_language]
@@ -41,7 +43,7 @@ def bulk_translate(text, target_language, chunk_size=500):
     translated_chunks = [GoogleTranslator(target=lang_code).translate(chunk) for chunk in chunks]
     result=" ".join(translated_chunks)
     return result.strip()

 # Language mapping dictionary
 language_map = {
     "American English": "a",
@@ -52,7 +54,9 @@ language_map = {
     "Italian": "i",
     "Brazilian Portuguese": "p",
     "Japanese": "j",
-    "Mandarin Chinese": "z"
+    "Mandarin Chinese": "z",
+    "Russian": "r",  # Added Russian code
+    "German": "g"  # Added German code
 }


@@ -65,7 +69,7 @@ def update_pipeline(Language):
     # Only update if the language is different
     if new_lang != last_used_language:
         pipeline = KPipeline(lang_code=new_lang)
         last_used_language = new_lang
         try:
             pipeline = KPipeline(lang_code=new_lang)
             last_used_language = new_lang  # Update last used language
@@ -123,7 +127,7 @@ def clean_text(text):
         r'[\U00002702-\U000027B0]|'  # Dingbats
         r'[\U0001F1E0-\U0001F1FF]'  # Flags (iOS)
         r'', flags=re.UNICODE)

     text = emoji_pattern.sub(r'', text)

     # Remove multiple spaces and extra line breaks
@@ -137,13 +141,13 @@ def tts_file_name(text,language):
     text = re.sub(r'[^a-zA-Z\s]', '', text)  # Retain only alphabets and spaces
     text = text.lower().strip()  # Convert to lowercase and strip leading/trailing spaces
     text = text.replace(" ", "_")  # Replace spaces with underscores
     language=language.replace(" ", "_").strip()
     # Truncate or handle empty text
     truncated_text = text[:20] if len(text) > 20 else text if len(text) > 0 else language

     # Generate a random string for uniqueness
     random_string = uuid.uuid4().hex[:8].upper()

     # Construct the file name
     file_name = f"{temp_folder}/{truncated_text}_{random_string}.wav"
     return file_name
@@ -164,7 +168,7 @@ def remove_silence_function(file_path,minimum_silence=50):
     audio_chunks = split_on_silence(sound,
                                     min_silence_len=100,
                                     silence_thresh=-45,
                                     keep_silence=minimum_silence)
     # Putting the file back together
     combined = AudioSegment.empty()
     for chunk in audio_chunks:
@@ -200,7 +204,7 @@ def generate_and_save_audio(text, Language="American English",voice="af_bella",
             audio_bytes = audio_int16.tobytes()  # Convert to bytes
             # Write the audio chunk to the WAV file
             wav_file.writeframes(audio_bytes)
     if remove_silence:
         keep_silence = int(keep_silence_up_to * 1000)
         new_wave_file=remove_silence_function(save_path,minimum_silence=keep_silence)
         return new_wave_file,timestamps
@@ -247,7 +251,7 @@ def write_word_srt(word_level_timestamps, output_file="word.srt", skip_punctuati

     for entry in word_level_timestamps:
         word = entry["word"]

         # Skip punctuation if enabled
         if skip_punctuation and all(char in string.punctuation for char in word):
             continue
@@ -286,13 +290,13 @@ def write_sentence_srt(word_level_timestamps, output_file="subtitles.srt", max_w

         # Skip selected punctuation from remove_punctuation list
         if word in remove_punctuation:
             continue

         # Attach punctuation to the previous word
         if word in string.punctuation:
             if subtitle_words:
                 subtitle_words[-1] = (subtitle_words[-1][0] + word, subtitle_words[-1][1])
             continue

         # Start a new subtitle block if needed
         if start_time is None:
@@ -348,16 +352,16 @@ import re
 def fix_punctuation(text):
     # Remove spaces before punctuation marks (., ?, !, ,)
     text = re.sub(r'\s([.,?!])', r'\1', text)

     # Handle quotation marks: remove spaces before and after them
     text = text.replace('" ', '"')
     text = text.replace(' "', '"')
     text = text.replace('" ', '"')

     # Track quotation marks to add space after closing quotes
     track = 0
     result = []

     for index, char in enumerate(text):
         if char == '"':
             track += 1
@@ -460,15 +464,16 @@ def save_current_data():
     if os.path.exists("./last"):
         shutil.rmtree("./last")
     os.makedirs("./last",exist_ok=True)


 def KOKORO_TTS_API(text, Language="American English",voice="af_bella", speed=1,translate_text=False,remove_silence=False, input_key=None, keep_silence_up_to=0.05):
     print(input_key, key)
     if input_key == key:
         if translate_text:
             text=bulk_translate(text, Language, chunk_size=500)
         save_path,timestamps=generate_and_save_audio(text=text, Language=Language,voice=voice, speed=speed,remove_silence=remove_silence,keep_silence_up_to=keep_silence_up_to)
         if remove_silence==False:
+            # Timestamps are currently only reliably supported for English
             if Language in ["American English", "British English"]:
                 word_level_timestamps=adjust_timestamps(timestamps)
                 word_level_srt = modify_filename(save_path.replace(".wav", ".srt"), prefix="word_level_")
@@ -483,11 +488,16 @@ def KOKORO_TTS_API(text, Language="American English",voice="af_bella", speed=1,t
                 shutil.copy(normal_srt, "./last/")
                 shutil.copy(json_file, "./last/")
                 return save_path,save_path,word_level_srt,normal_srt,json_file
-
+            else:
+                # For other languages, return audio but no timestamps/SRTs
+                return save_path, save_path, None, None, None
+        # If silence removal is enabled, only return audio (as timestamps become invalid)
+        return save_path,save_path,None,None,None
     else:
+        gr.Warning("Invalid API Key provided!", duration=5)
         return None,None,None,None,None




@@ -505,20 +515,22 @@ def ui():
         ["Ciao, come stai?", "Italian", "if_sara"],
         ["Olá, como você está?", "Brazilian Portuguese", "pf_dora"],
         ["こんにちは、お元気ですか?", "Japanese", "jf_nezumi"],
-        ["你好,你怎么样?", "Mandarin Chinese", "zf_xiaoni"]
+        ["你好,你怎么样?", "Mandarin Chinese", "zf_xiaoni"],
+        ["Привет, как дела?", "Russian", "rf_annika"],  # Added Russian example (using hypothetical voice)
+        ["Hallo, wie geht's?", "German", "gf_eva"]  # Added German example (using hypothetical voice)
     ]

     with gr.Blocks() as demo:
         # gr.Markdown("<center><h1 style='font-size: 40px;'>KOKORO TTS</h1></center>")  # Larger title with CSS
         gr.Markdown("[Install on Your Local System](https://github.com/NeuralFalconYT/kokoro_v1)")
-        lang_list = ['American English', 'British English', 'Hindi', 'Spanish', 'French', 'Italian', 'Brazilian Portuguese', 'Japanese', 'Mandarin Chinese']
+        lang_list = ['American English', 'British English', 'Hindi', 'Spanish', 'French', 'Italian', 'Brazilian Portuguese', 'Japanese', 'Mandarin Chinese', 'Russian', 'German']  # Added Russian and German
         voice_names = get_voice_names("hexgrad/Kokoro-82M")

         with gr.Row():
             with gr.Column():
                 text = gr.Textbox(label='📝 Enter Text', lines=3)
-                input_key = gr.Textbox(label='Input Key', lines=1)
+                input_key = gr.Textbox(label='Input Key', lines=1, type="password")  # Changed type to password for security

                 with gr.Row():
                     language_name = gr.Dropdown(lang_list, label="🌍 Select Language", value=lang_list[0])

@@ -532,6 +544,18 @@ def ui():
                 speed = gr.Slider(minimum=0.25, maximum=2, value=1, step=0.1, label='⚡️Speed', info='Adjust the speaking speed')
                 translate_text = gr.Checkbox(value=False, label='🌐 Translate Text to Selected Language')
                 remove_silence = gr.Checkbox(value=False, label='✂️ Remove Silence From TTS')
+                keep_silence_slider = gr.Slider(minimum=0.0, maximum=0.5, value=0.05, step=0.01, label='🤫 Keep Silence Up To (seconds)', info='Amount of silence to keep at start/end of segments if removing silence', interactive=True)  # Renamed for clarity
+
+                # Make keep_silence_slider visible only when remove_silence is checked
+                def update_silence_slider_visibility(remove_silence_checked):
+                    return gr.Slider(visible=remove_silence_checked)
+
+                remove_silence.change(
+                    fn=update_silence_slider_visibility,
+                    inputs=[remove_silence],
+                    outputs=[keep_silence_slider]
+                )
+

             with gr.Column():
                 audio = gr.Audio(interactive=False, label='🔊 Output Audio', autoplay=True)
@@ -546,8 +570,8 @@ def ui():
                 srt_file = gr.File(label='📜 Download Sentence-Level SRT')
                 sentence_duration_file = gr.File(label='⏳ Download Sentence Timestamp JSON')

-        text.submit(KOKORO_TTS_API, inputs=[text, language_name, voice_name, speed,translate_text, remove_silence,input_key], outputs=[audio, audio_file,word_level_srt_file,srt_file,sentence_duration_file])
-        generate_btn.click(KOKORO_TTS_API, inputs=[text, language_name, voice_name, speed,translate_text, remove_silence,input_key], outputs=[audio, audio_file,word_level_srt_file,srt_file,sentence_duration_file])
+        text.submit(KOKORO_TTS_API, inputs=[text, language_name, voice_name, speed,translate_text, remove_silence,input_key, keep_silence_slider], outputs=[audio, audio_file,word_level_srt_file,srt_file,sentence_duration_file])
+        generate_btn.click(KOKORO_TTS_API, inputs=[text, language_name, voice_name, speed,translate_text, remove_silence,input_key, keep_silence_slider], outputs=[audio, audio_file,word_level_srt_file,srt_file,sentence_duration_file])

         # Add examples to the interface
         gr.Examples(examples=dummy_examples, inputs=[text, language_name, voice_name])
@@ -558,11 +582,10 @@ def tutorial():
     # Markdown explanation for language code
     explanation = """
 ## Language Code Explanation:
 Example: `'af_bella'`
 - **'a'** stands for **American English**.
 - **'f_'** stands for **Female** (If it were 'm_', it would mean Male).
 - **'bella'** refers to the specific voice.
-
 The first character in the voice code stands for the language:
 - **"a"**: American English
 - **"b"**: British English
@@ -573,7 +596,8 @@ def tutorial():
 - **"p"**: Brazilian Portuguese
 - **"j"**: Japanese
 - **"z"**: Mandarin Chinese
-
+- **"r"**: Russian  # Added Russian
+- **"g"**: German  # Added German
 The second character stands for gender:
 - **"f_"**: Female
 - **"m_"**: Male
@@ -607,4 +631,4 @@ last_used_language = "a"
 pipeline = KPipeline(lang_code=last_used_language)
 temp_folder = create_audio_dir()
 if __name__ == "__main__":
     main()