Spaces:
Running
Running
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import numpy as np
|
3 |
+
import torch
|
4 |
+
import soundfile as sf
|
5 |
+
from kokoro import KPipeline
|
6 |
+
import re
|
7 |
+
import traceback
|
8 |
+
|
9 |
+
# Helper: Format seconds into SRT timestamp (hh:mm:ss,ms)
def format_time(seconds):
    """Format a duration in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    Works in whole milliseconds with integer ``divmod`` so that float
    values just under a minute roll over correctly. The previous
    ``f"{secs:06.3f}"`` approach could round e.g. 59.9996 to an invalid
    ``00:00:60,000`` instead of ``00:01:00,000``.
    """
    total_ms = int(round(seconds * 1000))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    # SRT uses a comma (not a dot) before the millisecond field.
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
+
def generate_audio(text, voice, speed, lang_code, split_pattern, debug):
    """Synthesize *text* with Kokoro TTS and return audio, subtitles, and a log.

    Returns a 3-tuple:
      * ``(24000, samples)`` for ``gr.Audio`` (24 kHz assumed), or ``None`` on failure,
      * the generated SRT subtitle text ("" on failure),
      * a newline-joined debug log (per-segment detail included when *debug* is on).
    """
    logs = ["Starting Kokoro TTS generation..."]

    # Build the pipeline; a failure here aborts the whole request.
    try:
        logs.append(f"Initializing Kokoro pipeline with lang_code: '{lang_code}' (CPU mode assumed)")
        tts = KPipeline(lang_code=lang_code)
        logs.append("Pipeline initialized successfully.")
    except Exception as exc:
        logs.append(f"Error initializing pipeline: {str(exc)}")
        return None, "", "\n".join(logs)

    clips = []              # per-segment audio arrays, concatenated at the end
    cue_blocks = []         # per-segment SRT cue text
    per_segment_notes = []  # extra detail shown only in debug mode
    elapsed = 0.0           # running cue start time, in seconds

    try:
        logs.append("Generating audio segments from input text...")
        # split_pattern is a regex controlling how the input text is segmented.
        stream = tts(
            text,
            voice=voice,
            speed=speed,
            split_pattern=split_pattern
        )

        for idx, (graphemes, phonemes, clip) in enumerate(stream, start=1):
            seconds = len(clip) / 24000.0  # assumes a 24000 Hz sample rate
            cue_end = elapsed + seconds
            # One SRT cue: index, time range, then the segment's text.
            cue_blocks.append(
                f"{idx}\n{format_time(elapsed)} --> {format_time(cue_end)}\n{graphemes}\n"
            )
            per_segment_notes.append(
                f"Segment {idx}: Duration = {seconds:.3f}s, Graphemes = {graphemes}, Phonemes = {phonemes}"
            )
            clips.append(clip)
            elapsed = cue_end

        logs.append("Audio segments generated successfully.")
    except Exception as exc:
        logs.append(f"Error during audio generation: {str(exc)}\n{traceback.format_exc()}")
        return None, "", "\n".join(logs)

    # Nothing synthesized (e.g. empty input after splitting) -> no audio to return.
    if not clips:
        logs.append("No audio segments were generated.")
        return None, "", "\n".join(logs)

    full_audio = np.concatenate(clips)
    srt_content = "\n".join(cue_blocks)
    debug_info = "\n".join(logs + per_segment_notes) if debug else "\n".join(logs)

    return (24000, full_audio), srt_content, debug_info
87 |
+
|
88 |
+
# Assemble the Gradio UI around generate_audio. The widget lists and the
# long description are pulled out into private names to keep the
# gr.Interface(...) call readable; the public `iface` object is unchanged.
_description = (
    "This app uses the Kokoro TTS model to generate audio from text. "
    "You can tweak parameters such as voice, speed, language code, and the text split pattern. "
    "When debug mode is enabled, detailed processing steps (including grapheme and phoneme outputs) are displayed."
)

_inputs = [
    gr.Textbox(label="Input Text", lines=10, placeholder="Enter the text to be synthesized here..."),
    gr.Textbox(label="Voice (e.g., af_heart)", value="af_heart"),
    gr.Slider(label="Speed", minimum=0.5, maximum=2.0, step=0.1, value=1.0),
    gr.Textbox(label="Language Code", value="a",
               placeholder="Enter language code ('a' for American English, 'b' for British, etc.)"),
    gr.Textbox(label="Split Pattern (Regex)", value=r'\n+',
               placeholder="Regex to split the input text (e.g., '\\n+')"),
    gr.Checkbox(label="Enable Debug Mode", value=True),
]

_outputs = [
    gr.Audio(label="Generated Audio", type="numpy"),
    gr.Textbox(label="Generated SRT"),
    gr.Textbox(label="Debug Information", lines=15),
]

iface = gr.Interface(
    fn=generate_audio,
    inputs=_inputs,
    outputs=_outputs,
    title="Kokoro TTS Gradio App (CPU Mode)",
    description=_description,
)

if __name__ == "__main__":
    iface.launch()