import gradio as gr
import numpy as np
import traceback

from kokoro import KPipeline

SAMPLE_RATE = 24000  # Kokoro outputs 24 kHz audio


# Helper: format seconds into an SRT timestamp (hh:mm:ss,ms).
def format_time(seconds):
    # Work in integer milliseconds so that e.g. 59.9999 s rounds to
    # 00:01:00,000 instead of the invalid 00:00:60,000.
    millis = int(round(seconds * 1000))
    hours, millis = divmod(millis, 3_600_000)
    minutes, millis = divmod(millis, 60_000)
    secs, millis = divmod(millis, 1_000)
    # SRT uses a comma as the decimal separator.
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


def generate_audio(text, voice, speed, lang_code, split_pattern, debug):
    debug_logs = ["Starting Kokoro TTS generation..."]

    try:
        debug_logs.append(f"Initializing Kokoro pipeline with lang_code: '{lang_code}' (CPU mode assumed)")
        # Initialize the pipeline; by default it runs on CPU when no GPU is available.
        pipeline = KPipeline(lang_code=lang_code)
        debug_logs.append("Pipeline initialized successfully.")
    except Exception as e:
        debug_logs.append(f"Error initializing pipeline: {e}")
        return None, "", "\n".join(debug_logs)

    # Collect audio segments, SRT entries, and segment-level debug info.
    audio_segments = []
    srt_entries = []
    current_time = 0.0  # cumulative time for SRT timestamps
    segment_index = 1
    segment_debug_info = []

    try:
        debug_logs.append("Generating audio segments from input text...")
        # Invoke the pipeline; split_pattern is a regex that controls how the
        # input text is segmented before synthesis.
        generator = pipeline(text, voice=voice, speed=speed, split_pattern=split_pattern)

        for gs, ps, audio in generator:
            # Kokoro typically yields torch tensors; convert to NumPy so that
            # np.concatenate and Gradio's numpy audio output behave predictably.
            if hasattr(audio, "numpy"):
                audio = audio.detach().cpu().numpy()

            duration = len(audio) / SAMPLE_RATE
            start_timestamp = current_time
            end_timestamp = current_time + duration

            # Create an SRT entry for this segment.
            srt_entries.append(
                f"{segment_index}\n{format_time(start_timestamp)} --> {format_time(end_timestamp)}\n{gs}\n"
            )
            current_time = end_timestamp

            # Record segment details for debugging.
            segment_debug_info.append(
                f"Segment {segment_index}: Duration = {duration:.3f}s, Graphemes = {gs}, Phonemes = {ps}"
            )
            audio_segments.append(audio)
            segment_index += 1

        debug_logs.append("Audio segments generated successfully.")
    except Exception as e:
        debug_logs.append(f"Error during audio generation: {e}\n{traceback.format_exc()}")
        return None, "", "\n".join(debug_logs)

    # Concatenate the generated segments into a single audio array.
    if not audio_segments:
        debug_logs.append("No audio segments were generated.")
        return None, "", "\n".join(debug_logs)
    full_audio = np.concatenate(audio_segments)

    # Joining with "\n" leaves the blank line SRT requires between entries.
    srt_content = "\n".join(srt_entries)

    # Combine the debug logs, optionally including per-segment details.
    if debug:
        debug_info = "\n".join(debug_logs + segment_debug_info)
    else:
        debug_info = "\n".join(debug_logs)

    # Return the audio as (sample_rate, samples), the SRT text, and the debug log.
    return (SAMPLE_RATE, full_audio), srt_content, debug_info
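
# For reference, one generated SRT entry looks like this (illustrative text;
# the timestamps come from format_time, and entries are separated by a blank
# line when joined):
#
#   1
#   00:00:00,000 --> 00:00:03,250
#   Hello, this is the first segment.
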
# Build the Gradio interface.
iface = gr.Interface(
    fn=generate_audio,
    inputs=[
        gr.Textbox(label="Input Text", lines=10, placeholder="Enter the text to be synthesized here..."),
        gr.Textbox(label="Voice (e.g., af_heart)", value="af_heart"),
        gr.Slider(label="Speed", minimum=0.5, maximum=2.0, step=0.1, value=1.0),
        gr.Textbox(
            label="Language Code",
            value="a",
            placeholder="Enter language code ('a' for American English, 'b' for British, etc.)",
        ),
        gr.Textbox(
            label="Split Pattern (Regex)",
            value=r"\n+",
            placeholder="Regex to split the input text (e.g., '\\n+')",
        ),
        gr.Checkbox(label="Enable Debug Mode", value=True),
    ],
    outputs=[
        gr.Audio(label="Generated Audio", type="numpy"),
        gr.Textbox(label="Generated SRT"),
        gr.Textbox(label="Debug Information", lines=15),
    ],
    title="Kokoro TTS Gradio App (CPU Mode)",
    description=(
        "This app uses the Kokoro TTS model to generate audio from text. "
        "You can tweak parameters such as voice, speed, language code, and the text split pattern. "
        "When debug mode is enabled, detailed processing steps (including grapheme and phoneme outputs) are displayed."
    ),
)

if __name__ == "__main__":
    iface.launch()
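
# Usage sketch (assumptions: the app is running locally on Gradio's default
# port 7860, the separate `gradio_client` package is installed, and
# "/predict" is the default endpoint name for a single gr.Interface):
#
#   from gradio_client import Client
#
#   client = Client("http://127.0.0.1:7860/")
#   audio_path, srt_text, debug_log = client.predict(
#       "Hello world.\nThis is a second segment.",  # text
#       "af_heart",                                 # voice
#       1.0,                                        # speed
#       "a",                                        # lang_code
#       r"\n+",                                     # split_pattern
#       True,                                       # debug
#       api_name="/predict",
#   )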