sdafd committed on
Commit
1406b91
·
verified ·
1 Parent(s): d43d439

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +61 -37
app.py CHANGED
@@ -5,22 +5,24 @@ from kokoro import KPipeline
5
  import os
6
  from huggingface_hub import list_repo_files
7
  import uuid
8
- import re
9
  import gradio as gr
10
  key = os.getenv("SECRET_KEY", None)
11
- #translate language
12
  from deep_translator import GoogleTranslator
13
  def bulk_translate(text, target_language, chunk_size=500):
14
  language_map_local = {
15
- "American English": "en",
16
- "British English": "en",
17
  "Hindi": "hi",
18
  "Spanish": "es",
19
  "French": "fr",
20
  "Italian": "it",
21
  "Brazilian Portuguese": "pt",
22
  "Japanese": "ja",
23
- "Mandarin Chinese": "zh-CN"
 
 
24
  }
25
  # lang_code = GoogleTranslator().get_supported_languages(as_dict=True).get(target_language.lower())
26
  lang_code=language_map_local[target_language]
@@ -41,7 +43,7 @@ def bulk_translate(text, target_language, chunk_size=500):
41
  translated_chunks = [GoogleTranslator(target=lang_code).translate(chunk) for chunk in chunks]
42
  result=" ".join(translated_chunks)
43
  return result.strip()
44
-
45
  # Language mapping dictionary
46
  language_map = {
47
  "American English": "a",
@@ -52,7 +54,9 @@ language_map = {
52
  "Italian": "i",
53
  "Brazilian Portuguese": "p",
54
  "Japanese": "j",
55
- "Mandarin Chinese": "z"
 
 
56
  }
57
 
58
 
@@ -65,7 +69,7 @@ def update_pipeline(Language):
65
  # Only update if the language is different
66
  if new_lang != last_used_language:
67
  pipeline = KPipeline(lang_code=new_lang)
68
- last_used_language = new_lang
69
  try:
70
  pipeline = KPipeline(lang_code=new_lang)
71
  last_used_language = new_lang # Update last used language
@@ -123,7 +127,7 @@ def clean_text(text):
123
  r'[\U00002702-\U000027B0]|' # Dingbats
124
  r'[\U0001F1E0-\U0001F1FF]' # Flags (iOS)
125
  r'', flags=re.UNICODE)
126
-
127
  text = emoji_pattern.sub(r'', text)
128
 
129
  # Remove multiple spaces and extra line breaks
@@ -137,13 +141,13 @@ def tts_file_name(text,language):
137
  text = re.sub(r'[^a-zA-Z\s]', '', text) # Retain only alphabets and spaces
138
  text = text.lower().strip() # Convert to lowercase and strip leading/trailing spaces
139
  text = text.replace(" ", "_") # Replace spaces with underscores
140
- language=language.replace(" ", "_").strip()
141
  # Truncate or handle empty text
142
  truncated_text = text[:20] if len(text) > 20 else text if len(text) > 0 else language
143
-
144
  # Generate a random string for uniqueness
145
  random_string = uuid.uuid4().hex[:8].upper()
146
-
147
  # Construct the file name
148
  file_name = f"{temp_folder}/{truncated_text}_{random_string}.wav"
149
  return file_name
@@ -164,7 +168,7 @@ def remove_silence_function(file_path,minimum_silence=50):
164
  audio_chunks = split_on_silence(sound,
165
  min_silence_len=100,
166
  silence_thresh=-45,
167
- keep_silence=minimum_silence)
168
  # Putting the file back together
169
  combined = AudioSegment.empty()
170
  for chunk in audio_chunks:
@@ -200,7 +204,7 @@ def generate_and_save_audio(text, Language="American English",voice="af_bella",
200
  audio_bytes = audio_int16.tobytes() # Convert to bytes
201
  # Write the audio chunk to the WAV file
202
  wav_file.writeframes(audio_bytes)
203
- if remove_silence:
204
  keep_silence = int(keep_silence_up_to * 1000)
205
  new_wave_file=remove_silence_function(save_path,minimum_silence=keep_silence)
206
  return new_wave_file,timestamps
@@ -247,7 +251,7 @@ def write_word_srt(word_level_timestamps, output_file="word.srt", skip_punctuati
247
 
248
  for entry in word_level_timestamps:
249
  word = entry["word"]
250
-
251
  # Skip punctuation if enabled
252
  if skip_punctuation and all(char in string.punctuation for char in word):
253
  continue
@@ -286,13 +290,13 @@ def write_sentence_srt(word_level_timestamps, output_file="subtitles.srt", max_w
286
 
287
  # Skip selected punctuation from remove_punctuation list
288
  if word in remove_punctuation:
289
- continue
290
 
291
  # Attach punctuation to the previous word
292
  if word in string.punctuation:
293
  if subtitle_words:
294
  subtitle_words[-1] = (subtitle_words[-1][0] + word, subtitle_words[-1][1])
295
- continue
296
 
297
  # Start a new subtitle block if needed
298
  if start_time is None:
@@ -348,16 +352,16 @@ import re
348
  def fix_punctuation(text):
349
  # Remove spaces before punctuation marks (., ?, !, ,)
350
  text = re.sub(r'\s([.,?!])', r'\1', text)
351
-
352
  # Handle quotation marks: remove spaces before and after them
353
  text = text.replace('" ', '"')
354
  text = text.replace(' "', '"')
355
  text = text.replace('" ', '"')
356
-
357
  # Track quotation marks to add space after closing quotes
358
  track = 0
359
  result = []
360
-
361
  for index, char in enumerate(text):
362
  if char == '"':
363
  track += 1
@@ -460,15 +464,16 @@ def save_current_data():
460
  if os.path.exists("./last"):
461
  shutil.rmtree("./last")
462
  os.makedirs("./last",exist_ok=True)
463
-
464
-
465
  def KOKORO_TTS_API(text, Language="American English",voice="af_bella", speed=1,translate_text=False,remove_silence=False, input_key=None, keep_silence_up_to=0.05):
466
  print(input_key, key)
467
  if input_key == key:
468
- if translate_text:
469
  text=bulk_translate(text, Language, chunk_size=500)
470
  save_path,timestamps=generate_and_save_audio(text=text, Language=Language,voice=voice, speed=speed,remove_silence=remove_silence,keep_silence_up_to=keep_silence_up_to)
471
  if remove_silence==False:
 
472
  if Language in ["American English", "British English"]:
473
  word_level_timestamps=adjust_timestamps(timestamps)
474
  word_level_srt = modify_filename(save_path.replace(".wav", ".srt"), prefix="word_level_")
@@ -483,11 +488,16 @@ def KOKORO_TTS_API(text, Language="American English",voice="af_bella", speed=1,t
483
  shutil.copy(normal_srt, "./last/")
484
  shutil.copy(json_file, "./last/")
485
  return save_path,save_path,word_level_srt,normal_srt,json_file
486
- return save_path,save_path,None,None,None
 
 
 
 
487
  else:
 
488
  return None,None,None,None,None
489
-
490
-
491
 
492
 
493
 
@@ -505,20 +515,22 @@ def ui():
505
  ["Ciao, come stai?", "Italian", "if_sara"],
506
  ["Olá, como você está?", "Brazilian Portuguese", "pf_dora"],
507
  ["こんにちは、お元気ですか?", "Japanese", "jf_nezumi"],
508
- ["你好,你怎么样?", "Mandarin Chinese", "zf_xiaoni"]
 
 
509
  ]
510
-
511
  with gr.Blocks() as demo:
512
  # gr.Markdown("<center><h1 style='font-size: 40px;'>KOKORO TTS</h1></center>") # Larger title with CSS
513
  gr.Markdown("[Install on Your Local System](https://github.com/NeuralFalconYT/kokoro_v1)")
514
- lang_list = ['American English', 'British English', 'Hindi', 'Spanish', 'French', 'Italian', 'Brazilian Portuguese', 'Japanese', 'Mandarin Chinese']
515
  voice_names = get_voice_names("hexgrad/Kokoro-82M")
516
 
517
  with gr.Row():
518
  with gr.Column():
519
  text = gr.Textbox(label='📝 Enter Text', lines=3)
520
- input_key = gr.Textbox(label='Input Key', lines=1)
521
-
522
  with gr.Row():
523
  language_name = gr.Dropdown(lang_list, label="🌍 Select Language", value=lang_list[0])
524
 
@@ -532,6 +544,18 @@ def ui():
532
  speed = gr.Slider(minimum=0.25, maximum=2, value=1, step=0.1, label='⚡️Speed', info='Adjust the speaking speed')
533
  translate_text = gr.Checkbox(value=False, label='🌐 Translate Text to Selected Language')
534
  remove_silence = gr.Checkbox(value=False, label='✂️ Remove Silence From TTS')
 
 
 
 
 
 
 
 
 
 
 
 
535
 
536
  with gr.Column():
537
  audio = gr.Audio(interactive=False, label='🔊 Output Audio', autoplay=True)
@@ -546,8 +570,8 @@ def ui():
546
  srt_file = gr.File(label='📜 Download Sentence-Level SRT')
547
  sentence_duration_file = gr.File(label='⏳ Download Sentence Timestamp JSON')
548
 
549
- text.submit(KOKORO_TTS_API, inputs=[text, language_name, voice_name, speed,translate_text, remove_silence,input_key], outputs=[audio, audio_file,word_level_srt_file,srt_file,sentence_duration_file])
550
- generate_btn.click(KOKORO_TTS_API, inputs=[text, language_name, voice_name, speed,translate_text, remove_silence,input_key], outputs=[audio, audio_file,word_level_srt_file,srt_file,sentence_duration_file])
551
 
552
  # Add examples to the interface
553
  gr.Examples(examples=dummy_examples, inputs=[text, language_name, voice_name])
@@ -558,11 +582,10 @@ def tutorial():
558
  # Markdown explanation for language code
559
  explanation = """
560
  ## Language Code Explanation:
561
- Example: `'af_bella'`
562
  - **'a'** stands for **American English**.
563
  - **'f_'** stands for **Female** (If it were 'm_', it would mean Male).
564
  - **'bella'** refers to the specific voice.
565
-
566
  The first character in the voice code stands for the language:
567
  - **"a"**: American English
568
  - **"b"**: British English
@@ -573,7 +596,8 @@ def tutorial():
573
  - **"p"**: Brazilian Portuguese
574
  - **"j"**: Japanese
575
  - **"z"**: Mandarin Chinese
576
-
 
577
  The second character stands for gender:
578
  - **"f_"**: Female
579
  - **"m_"**: Male
@@ -607,4 +631,4 @@ last_used_language = "a"
607
  pipeline = KPipeline(lang_code=last_used_language)
608
  temp_folder = create_audio_dir()
609
  if __name__ == "__main__":
610
- main()
 
5
  import os
6
  from huggingface_hub import list_repo_files
7
  import uuid
8
+ import re
9
  import gradio as gr
10
  key = os.getenv("SECRET_KEY", None)
11
+ #translate language
12
  from deep_translator import GoogleTranslator
13
  def bulk_translate(text, target_language, chunk_size=500):
14
  language_map_local = {
15
+ "American English": "en",
16
+ "British English": "en",
17
  "Hindi": "hi",
18
  "Spanish": "es",
19
  "French": "fr",
20
  "Italian": "it",
21
  "Brazilian Portuguese": "pt",
22
  "Japanese": "ja",
23
+ "Mandarin Chinese": "zh-CN",
24
+ "Russian": "ru", # Added Russian
25
+ "German": "de" # Added German
26
  }
27
  # lang_code = GoogleTranslator().get_supported_languages(as_dict=True).get(target_language.lower())
28
  lang_code=language_map_local[target_language]
 
43
  translated_chunks = [GoogleTranslator(target=lang_code).translate(chunk) for chunk in chunks]
44
  result=" ".join(translated_chunks)
45
  return result.strip()
46
+
47
  # Language mapping dictionary
48
  language_map = {
49
  "American English": "a",
 
54
  "Italian": "i",
55
  "Brazilian Portuguese": "p",
56
  "Japanese": "j",
57
+ "Mandarin Chinese": "z",
58
+ "Russian": "r", # Added Russian code
59
+ "German": "g" # Added German code
60
  }
61
 
62
 
 
69
  # Only update if the language is different
70
  if new_lang != last_used_language:
71
  pipeline = KPipeline(lang_code=new_lang)
72
+ last_used_language = new_lang
73
  try:
74
  pipeline = KPipeline(lang_code=new_lang)
75
  last_used_language = new_lang # Update last used language
 
127
  r'[\U00002702-\U000027B0]|' # Dingbats
128
  r'[\U0001F1E0-\U0001F1FF]' # Flags (iOS)
129
  r'', flags=re.UNICODE)
130
+
131
  text = emoji_pattern.sub(r'', text)
132
 
133
  # Remove multiple spaces and extra line breaks
 
141
  text = re.sub(r'[^a-zA-Z\s]', '', text) # Retain only alphabets and spaces
142
  text = text.lower().strip() # Convert to lowercase and strip leading/trailing spaces
143
  text = text.replace(" ", "_") # Replace spaces with underscores
144
+ language=language.replace(" ", "_").strip()
145
  # Truncate or handle empty text
146
  truncated_text = text[:20] if len(text) > 20 else text if len(text) > 0 else language
147
+
148
  # Generate a random string for uniqueness
149
  random_string = uuid.uuid4().hex[:8].upper()
150
+
151
  # Construct the file name
152
  file_name = f"{temp_folder}/{truncated_text}_{random_string}.wav"
153
  return file_name
 
168
  audio_chunks = split_on_silence(sound,
169
  min_silence_len=100,
170
  silence_thresh=-45,
171
+ keep_silence=minimum_silence)
172
  # Putting the file back together
173
  combined = AudioSegment.empty()
174
  for chunk in audio_chunks:
 
204
  audio_bytes = audio_int16.tobytes() # Convert to bytes
205
  # Write the audio chunk to the WAV file
206
  wav_file.writeframes(audio_bytes)
207
+ if remove_silence:
208
  keep_silence = int(keep_silence_up_to * 1000)
209
  new_wave_file=remove_silence_function(save_path,minimum_silence=keep_silence)
210
  return new_wave_file,timestamps
 
251
 
252
  for entry in word_level_timestamps:
253
  word = entry["word"]
254
+
255
  # Skip punctuation if enabled
256
  if skip_punctuation and all(char in string.punctuation for char in word):
257
  continue
 
290
 
291
  # Skip selected punctuation from remove_punctuation list
292
  if word in remove_punctuation:
293
+ continue
294
 
295
  # Attach punctuation to the previous word
296
  if word in string.punctuation:
297
  if subtitle_words:
298
  subtitle_words[-1] = (subtitle_words[-1][0] + word, subtitle_words[-1][1])
299
+ continue
300
 
301
  # Start a new subtitle block if needed
302
  if start_time is None:
 
352
  def fix_punctuation(text):
353
  # Remove spaces before punctuation marks (., ?, !, ,)
354
  text = re.sub(r'\s([.,?!])', r'\1', text)
355
+
356
  # Handle quotation marks: remove spaces before and after them
357
  text = text.replace('" ', '"')
358
  text = text.replace(' "', '"')
359
  text = text.replace('" ', '"')
360
+
361
  # Track quotation marks to add space after closing quotes
362
  track = 0
363
  result = []
364
+
365
  for index, char in enumerate(text):
366
  if char == '"':
367
  track += 1
 
464
  if os.path.exists("./last"):
465
  shutil.rmtree("./last")
466
  os.makedirs("./last",exist_ok=True)
467
+
468
+
469
  def KOKORO_TTS_API(text, Language="American English",voice="af_bella", speed=1,translate_text=False,remove_silence=False, input_key=None, keep_silence_up_to=0.05):
470
  print(input_key, key)
471
  if input_key == key:
472
+ if translate_text:
473
  text=bulk_translate(text, Language, chunk_size=500)
474
  save_path,timestamps=generate_and_save_audio(text=text, Language=Language,voice=voice, speed=speed,remove_silence=remove_silence,keep_silence_up_to=keep_silence_up_to)
475
  if remove_silence==False:
476
+ # Timestamps are currently only reliably supported for English
477
  if Language in ["American English", "British English"]:
478
  word_level_timestamps=adjust_timestamps(timestamps)
479
  word_level_srt = modify_filename(save_path.replace(".wav", ".srt"), prefix="word_level_")
 
488
  shutil.copy(normal_srt, "./last/")
489
  shutil.copy(json_file, "./last/")
490
  return save_path,save_path,word_level_srt,normal_srt,json_file
491
+ else:
492
+ # For other languages, return audio but no timestamps/SRTs
493
+ return save_path, save_path, None, None, None
494
+ # If silence removal is enabled, only return audio (as timestamps become invalid)
495
+ return save_path,save_path,None,None,None
496
  else:
497
+ gr.Warning("Invalid API Key provided!", duration=5)
498
  return None,None,None,None,None
499
+
500
+
501
 
502
 
503
 
 
515
  ["Ciao, come stai?", "Italian", "if_sara"],
516
  ["Olá, como você está?", "Brazilian Portuguese", "pf_dora"],
517
  ["こんにちは、お元気ですか?", "Japanese", "jf_nezumi"],
518
+ ["你好,你怎么样?", "Mandarin Chinese", "zf_xiaoni"],
519
+ ["Привет, как дела?", "Russian", "rf_annika"], # Added Russian example (using hypothetical voice)
520
+ ["Hallo, wie geht's?", "German", "gf_eva"] # Added German example (using hypothetical voice)
521
  ]
522
+
523
  with gr.Blocks() as demo:
524
  # gr.Markdown("<center><h1 style='font-size: 40px;'>KOKORO TTS</h1></center>") # Larger title with CSS
525
  gr.Markdown("[Install on Your Local System](https://github.com/NeuralFalconYT/kokoro_v1)")
526
+ lang_list = ['American English', 'British English', 'Hindi', 'Spanish', 'French', 'Italian', 'Brazilian Portuguese', 'Japanese', 'Mandarin Chinese', 'Russian', 'German'] # Added Russian and German
527
  voice_names = get_voice_names("hexgrad/Kokoro-82M")
528
 
529
  with gr.Row():
530
  with gr.Column():
531
  text = gr.Textbox(label='📝 Enter Text', lines=3)
532
+ input_key = gr.Textbox(label='Input Key', lines=1, type="password") # Changed type to password for security
533
+
534
  with gr.Row():
535
  language_name = gr.Dropdown(lang_list, label="🌍 Select Language", value=lang_list[0])
536
 
 
544
  speed = gr.Slider(minimum=0.25, maximum=2, value=1, step=0.1, label='⚡️Speed', info='Adjust the speaking speed')
545
  translate_text = gr.Checkbox(value=False, label='🌐 Translate Text to Selected Language')
546
  remove_silence = gr.Checkbox(value=False, label='✂️ Remove Silence From TTS')
547
+ keep_silence_slider = gr.Slider(minimum=0.0, maximum=0.5, value=0.05, step=0.01, label='🤫 Keep Silence Up To (seconds)', info='Amount of silence to keep at start/end of segments if removing silence', interactive=True) # Renamed for clarity
548
+
549
+ # Make keep_silence_slider visible only when remove_silence is checked
550
+ def update_silence_slider_visibility(remove_silence_checked):
551
+ return gr.Slider(visible=remove_silence_checked)
552
+
553
+ remove_silence.change(
554
+ fn=update_silence_slider_visibility,
555
+ inputs=[remove_silence],
556
+ outputs=[keep_silence_slider]
557
+ )
558
+
559
 
560
  with gr.Column():
561
  audio = gr.Audio(interactive=False, label='🔊 Output Audio', autoplay=True)
 
570
  srt_file = gr.File(label='📜 Download Sentence-Level SRT')
571
  sentence_duration_file = gr.File(label='⏳ Download Sentence Timestamp JSON')
572
 
573
+ text.submit(KOKORO_TTS_API, inputs=[text, language_name, voice_name, speed,translate_text, remove_silence,input_key, keep_silence_slider], outputs=[audio, audio_file,word_level_srt_file,srt_file,sentence_duration_file])
574
+ generate_btn.click(KOKORO_TTS_API, inputs=[text, language_name, voice_name, speed,translate_text, remove_silence,input_key, keep_silence_slider], outputs=[audio, audio_file,word_level_srt_file,srt_file,sentence_duration_file])
575
 
576
  # Add examples to the interface
577
  gr.Examples(examples=dummy_examples, inputs=[text, language_name, voice_name])
 
582
  # Markdown explanation for language code
583
  explanation = """
584
  ## Language Code Explanation:
585
+ Example: `'af_bella'`
586
  - **'a'** stands for **American English**.
587
  - **'f_'** stands for **Female** (If it were 'm_', it would mean Male).
588
  - **'bella'** refers to the specific voice.
 
589
  The first character in the voice code stands for the language:
590
  - **"a"**: American English
591
  - **"b"**: British English
 
596
  - **"p"**: Brazilian Portuguese
597
  - **"j"**: Japanese
598
  - **"z"**: Mandarin Chinese
599
+ - **"r"**: Russian # Added Russian
600
+ - **"g"**: German # Added German
601
  The second character stands for gender:
602
  - **"f_"**: Female
603
  - **"m_"**: Male
 
631
  pipeline = KPipeline(lang_code=last_used_language)
632
  temp_folder = create_audio_dir()
633
  if __name__ == "__main__":
634
+ main()