Update app.py
app.py CHANGED
@@ -5,22 +5,24 @@ from kokoro import KPipeline
 import os
 from huggingface_hub import list_repo_files
 import uuid
 import re
 import gradio as gr
 key = os.getenv("SECRET_KEY", None)
 #translate langauge
 from deep_translator import GoogleTranslator
 def bulk_translate(text, target_language, chunk_size=500):
     language_map_local = {
         "American English": "en",
         "British English": "en",
         "Hindi": "hi",
         "Spanish": "es",
         "French": "fr",
         "Italian": "it",
         "Brazilian Portuguese": "pt",
         "Japanese": "ja",
-        "Mandarin Chinese": "zh-CN"
+        "Mandarin Chinese": "zh-CN",
+        "Russian": "ru",  # Added Russian
+        "German": "de"  # Added German
     }
     # lang_code = GoogleTranslator().get_supported_languages(as_dict=True).get(target_language.lower())
     lang_code=language_map_local[target_language]
@@ -41,7 +43,7 @@ def bulk_translate(text, target_language, chunk_size=500):
     translated_chunks = [GoogleTranslator(target=lang_code).translate(chunk) for chunk in chunks]
     result=" ".join(translated_chunks)
     return result.strip()

 # Language mapping dictionary
 language_map = {
     "American English": "a",
@@ -52,7 +54,9 @@ language_map = {
     "Italian": "i",
     "Brazilian Portuguese": "p",
     "Japanese": "j",
-    "Mandarin Chinese": "z"
+    "Mandarin Chinese": "z",
+    "Russian": "r",  # Added Russian code
+    "German": "g"  # Added German code
 }


@@ -65,7 +69,7 @@ def update_pipeline(Language):
     # Only update if the language is different
     if new_lang != last_used_language:
         pipeline = KPipeline(lang_code=new_lang)
         last_used_language = new_lang
         try:
             pipeline = KPipeline(lang_code=new_lang)
             last_used_language = new_lang  # Update last used language
@@ -123,7 +127,7 @@ def clean_text(text):
         r'[\U00002702-\U000027B0]|'  # Dingbats
         r'[\U0001F1E0-\U0001F1FF]'  # Flags (iOS)
         r'', flags=re.UNICODE)

     text = emoji_pattern.sub(r'', text)

     # Remove multiple spaces and extra line breaks
@@ -137,13 +141,13 @@ def tts_file_name(text,language):
     text = re.sub(r'[^a-zA-Z\s]', '', text)  # Retain only alphabets and spaces
     text = text.lower().strip()  # Convert to lowercase and strip leading/trailing spaces
     text = text.replace(" ", "_")  # Replace spaces with underscores
     language=language.replace(" ", "_").strip()
     # Truncate or handle empty text
     truncated_text = text[:20] if len(text) > 20 else text if len(text) > 0 else language

     # Generate a random string for uniqueness
     random_string = uuid.uuid4().hex[:8].upper()

     # Construct the file name
     file_name = f"{temp_folder}/{truncated_text}_{random_string}.wav"
     return file_name
@@ -164,7 +168,7 @@ def remove_silence_function(file_path,minimum_silence=50):
     audio_chunks = split_on_silence(sound,
                                     min_silence_len=100,
                                     silence_thresh=-45,
                                     keep_silence=minimum_silence)
     # Putting the file back together
     combined = AudioSegment.empty()
     for chunk in audio_chunks:
@@ -200,7 +204,7 @@ def generate_and_save_audio(text, Language="American English",voice="af_bella",
             audio_bytes = audio_int16.tobytes()  # Convert to bytes
             # Write the audio chunk to the WAV file
             wav_file.writeframes(audio_bytes)
     if remove_silence:
         keep_silence = int(keep_silence_up_to * 1000)
         new_wave_file=remove_silence_function(save_path,minimum_silence=keep_silence)
         return new_wave_file,timestamps
@@ -247,7 +251,7 @@ def write_word_srt(word_level_timestamps, output_file="word.srt", skip_punctuati

     for entry in word_level_timestamps:
         word = entry["word"]

         # Skip punctuation if enabled
         if skip_punctuation and all(char in string.punctuation for char in word):
             continue
@@ -286,13 +290,13 @@ def write_sentence_srt(word_level_timestamps, output_file="subtitles.srt", max_w

         # Skip selected punctuation from remove_punctuation list
         if word in remove_punctuation:
             continue

         # Attach punctuation to the previous word
         if word in string.punctuation:
             if subtitle_words:
                 subtitle_words[-1] = (subtitle_words[-1][0] + word, subtitle_words[-1][1])
             continue

         # Start a new subtitle block if needed
         if start_time is None:
@@ -348,16 +352,16 @@ import re
 def fix_punctuation(text):
     # Remove spaces before punctuation marks (., ?, !, ,)
     text = re.sub(r'\s([.,?!])', r'\1', text)

     # Handle quotation marks: remove spaces before and after them
     text = text.replace('" ', '"')
     text = text.replace(' "', '"')
     text = text.replace('" ', '"')

     # Track quotation marks to add space after closing quotes
     track = 0
     result = []

     for index, char in enumerate(text):
         if char == '"':
             track += 1
@@ -460,15 +464,16 @@ def save_current_data():
     if os.path.exists("./last"):
         shutil.rmtree("./last")
     os.makedirs("./last",exist_ok=True)


 def KOKORO_TTS_API(text, Language="American English",voice="af_bella", speed=1,translate_text=False,remove_silence=False, input_key=None, keep_silence_up_to=0.05):
     print(input_key, key)
     if input_key == key:
         if translate_text:
             text=bulk_translate(text, Language, chunk_size=500)
         save_path,timestamps=generate_and_save_audio(text=text, Language=Language,voice=voice, speed=speed,remove_silence=remove_silence,keep_silence_up_to=keep_silence_up_to)
         if remove_silence==False:
+            # Timestamps are currently only reliably supported for English
             if Language in ["American English", "British English"]:
                 word_level_timestamps=adjust_timestamps(timestamps)
                 word_level_srt = modify_filename(save_path.replace(".wav", ".srt"), prefix="word_level_")
@@ -483,11 +488,16 @@ def KOKORO_TTS_API(text, Language="American English",voice="af_bella", speed=1,t
                 shutil.copy(normal_srt, "./last/")
                 shutil.copy(json_file, "./last/")
                 return save_path,save_path,word_level_srt,normal_srt,json_file
-
+            else:
+                # For other languages, return audio but no timestamps/SRTs
+                return save_path, save_path, None, None, None
+        # If silence removal is enabled, only return audio (as timestamps become invalid)
+        return save_path,save_path,None,None,None
     else:
+        gr.Warning("Invalid API Key provided!", duration=5)
         return None,None,None,None,None




@@ -505,20 +515,22 @@ def ui():
         ["Ciao, come stai?", "Italian", "if_sara"],
         ["Olá, como você está?", "Brazilian Portuguese", "pf_dora"],
         ["こんにちは、お元気ですか?", "Japanese", "jf_nezumi"],
-        ["你好,你怎么样?", "Mandarin Chinese", "zf_xiaoni"]
+        ["你好,你怎么样?", "Mandarin Chinese", "zf_xiaoni"],
+        ["Привет, как дела?", "Russian", "rf_annika"],  # Added Russian example (using hypothetical voice)
+        ["Hallo, wie geht's?", "German", "gf_eva"]  # Added German example (using hypothetical voice)
     ]

     with gr.Blocks() as demo:
         # gr.Markdown("<center><h1 style='font-size: 40px;'>KOKORO TTS</h1></center>")  # Larger title with CSS
         gr.Markdown("[Install on Your Local System](https://github.com/NeuralFalconYT/kokoro_v1)")
-        lang_list = ['American English', 'British English', 'Hindi', 'Spanish', 'French', 'Italian', 'Brazilian Portuguese', 'Japanese', 'Mandarin Chinese']
+        lang_list = ['American English', 'British English', 'Hindi', 'Spanish', 'French', 'Italian', 'Brazilian Portuguese', 'Japanese', 'Mandarin Chinese', 'Russian', 'German']  # Added Russian and German
         voice_names = get_voice_names("hexgrad/Kokoro-82M")

         with gr.Row():
             with gr.Column():
                 text = gr.Textbox(label='📝 Enter Text', lines=3)
-                input_key = gr.Textbox(label='Input Key', lines=1)
+                input_key = gr.Textbox(label='Input Key', lines=1, type="password")  # Changed type to password for security

                 with gr.Row():
                     language_name = gr.Dropdown(lang_list, label="🌍 Select Language", value=lang_list[0])

@@ -532,6 +544,18 @@ def ui():
                 speed = gr.Slider(minimum=0.25, maximum=2, value=1, step=0.1, label='⚡️Speed', info='Adjust the speaking speed')
                 translate_text = gr.Checkbox(value=False, label='🌐 Translate Text to Selected Language')
                 remove_silence = gr.Checkbox(value=False, label='✂️ Remove Silence From TTS')
+                keep_silence_slider = gr.Slider(minimum=0.0, maximum=0.5, value=0.05, step=0.01, label='🤫 Keep Silence Up To (seconds)', info='Amount of silence to keep at start/end of segments if removing silence', interactive=True)  # Renamed for clarity
+
+                # Make keep_silence_slider visible only when remove_silence is checked
+                def update_silence_slider_visibility(remove_silence_checked):
+                    return gr.Slider(visible=remove_silence_checked)
+
+                remove_silence.change(
+                    fn=update_silence_slider_visibility,
+                    inputs=[remove_silence],
+                    outputs=[keep_silence_slider]
+                )
+

             with gr.Column():
                 audio = gr.Audio(interactive=False, label='🔊 Output Audio', autoplay=True)
@@ -546,8 +570,8 @@ def ui():
                 srt_file = gr.File(label='📜 Download Sentence-Level SRT')
                 sentence_duration_file = gr.File(label='⏳ Download Sentence Timestamp JSON')

-        text.submit(KOKORO_TTS_API, inputs=[text, language_name, voice_name, speed,translate_text, remove_silence,input_key], outputs=[audio, audio_file,word_level_srt_file,srt_file,sentence_duration_file])
-        generate_btn.click(KOKORO_TTS_API, inputs=[text, language_name, voice_name, speed,translate_text, remove_silence,input_key], outputs=[audio, audio_file,word_level_srt_file,srt_file,sentence_duration_file])
+        text.submit(KOKORO_TTS_API, inputs=[text, language_name, voice_name, speed,translate_text, remove_silence,input_key, keep_silence_slider], outputs=[audio, audio_file,word_level_srt_file,srt_file,sentence_duration_file])
+        generate_btn.click(KOKORO_TTS_API, inputs=[text, language_name, voice_name, speed,translate_text, remove_silence,input_key, keep_silence_slider], outputs=[audio, audio_file,word_level_srt_file,srt_file,sentence_duration_file])

         # Add examples to the interface
         gr.Examples(examples=dummy_examples, inputs=[text, language_name, voice_name])
@@ -558,11 +582,10 @@ def tutorial():
     # Markdown explanation for language code
     explanation = """
 ## Language Code Explanation:
 Example: `'af_bella'`
 - **'a'** stands for **American English**.
 - **'f_'** stands for **Female** (If it were 'm_', it would mean Male).
 - **'bella'** refers to the specific voice.
-
 The first character in the voice code stands for the language:
 - **"a"**: American English
 - **"b"**: British English
@@ -573,7 +596,8 @@ def tutorial():
 - **"p"**: Brazilian Portuguese
 - **"j"**: Japanese
 - **"z"**: Mandarin Chinese
-
+- **"r"**: Russian  # Added Russian
+- **"g"**: German  # Added German
 The second character stands for gender:
 - **"f_"**: Female
 - **"m_"**: Male
@@ -607,4 +631,4 @@ last_used_language = "a"
 pipeline = KPipeline(lang_code=last_used_language)
 temp_folder = create_audio_dir()
 if __name__ == "__main__":
     main()