Spaces:
Running
Running
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import numpy as np
|
3 |
+
import torch
|
4 |
+
import soundfile as sf
|
5 |
+
from kokoro import KPipeline
|
6 |
+
import re
|
7 |
+
import traceback
|
8 |
+
|
9 |
+
# Helper: Format seconds into SRT timestamp (hh:mm:ss,ms)
def format_time(seconds):
    """Format a duration in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    Works in whole milliseconds with integer ``divmod`` so that float
    values just under a minute roll over correctly. The previous
    ``f"{secs:06.3f}"`` approach could round e.g. 59.9996 to an invalid
    ``00:00:60,000`` instead of ``00:01:00,000``.
    """
    total_ms = int(round(seconds * 1000))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    # SRT uses a comma (not a dot) before the millisecond field.
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
+
def generate_audio(text, voice, speed, lang_code, split_pattern, debug):
    """Synthesize *text* with Kokoro TTS and return audio, subtitles, and a log.

    Returns a 3-tuple:
      * ``(24000, samples)`` for ``gr.Audio`` (24 kHz assumed), or ``None`` on failure,
      * the generated SRT subtitle text ("" on failure),
      * a newline-joined debug log (per-segment detail included when *debug* is on).
    """
    logs = ["Starting Kokoro TTS generation..."]

    # Build the pipeline; a failure here aborts the whole request.
    try:
        logs.append(f"Initializing Kokoro pipeline with lang_code: '{lang_code}' (CPU mode assumed)")
        tts = KPipeline(lang_code=lang_code)
        logs.append("Pipeline initialized successfully.")
    except Exception as exc:
        logs.append(f"Error initializing pipeline: {str(exc)}")
        return None, "", "\n".join(logs)

    clips = []              # per-segment audio arrays, concatenated at the end
    cue_blocks = []         # per-segment SRT cue text
    per_segment_notes = []  # extra detail shown only in debug mode
    elapsed = 0.0           # running cue start time, in seconds

    try:
        logs.append("Generating audio segments from input text...")
        # split_pattern is a regex controlling how the input text is segmented.
        stream = tts(
            text,
            voice=voice,
            speed=speed,
            split_pattern=split_pattern
        )

        for idx, (graphemes, phonemes, clip) in enumerate(stream, start=1):
            seconds = len(clip) / 24000.0  # assumes a 24000 Hz sample rate
            cue_end = elapsed + seconds
            # One SRT cue: index, time range, then the segment's text.
            cue_blocks.append(
                f"{idx}\n{format_time(elapsed)} --> {format_time(cue_end)}\n{graphemes}\n"
            )
            per_segment_notes.append(
                f"Segment {idx}: Duration = {seconds:.3f}s, Graphemes = {graphemes}, Phonemes = {phonemes}"
            )
            clips.append(clip)
            elapsed = cue_end

        logs.append("Audio segments generated successfully.")
    except Exception as exc:
        logs.append(f"Error during audio generation: {str(exc)}\n{traceback.format_exc()}")
        return None, "", "\n".join(logs)

    # Nothing synthesized (e.g. empty input after splitting) -> no audio to return.
    if not clips:
        logs.append("No audio segments were generated.")
        return None, "", "\n".join(logs)

    full_audio = np.concatenate(clips)
    srt_content = "\n".join(cue_blocks)
    debug_info = "\n".join(logs + per_segment_notes) if debug else "\n".join(logs)

    return (24000, full_audio), srt_content, debug_info
87 |
+
|
88 |
+
# Assemble the Gradio UI around generate_audio. The widget lists and the
# long description are pulled out into private names to keep the
# gr.Interface(...) call readable; the public `iface` object is unchanged.
_description = (
    "This app uses the Kokoro TTS model to generate audio from text. "
    "You can tweak parameters such as voice, speed, language code, and the text split pattern. "
    "When debug mode is enabled, detailed processing steps (including grapheme and phoneme outputs) are displayed."
)

_inputs = [
    gr.Textbox(label="Input Text", lines=10, placeholder="Enter the text to be synthesized here..."),
    gr.Textbox(label="Voice (e.g., af_heart)", value="af_heart"),
    gr.Slider(label="Speed", minimum=0.5, maximum=2.0, step=0.1, value=1.0),
    gr.Textbox(label="Language Code", value="a",
               placeholder="Enter language code ('a' for American English, 'b' for British, etc.)"),
    gr.Textbox(label="Split Pattern (Regex)", value=r'\n+',
               placeholder="Regex to split the input text (e.g., '\\n+')"),
    gr.Checkbox(label="Enable Debug Mode", value=True),
]

_outputs = [
    gr.Audio(label="Generated Audio", type="numpy"),
    gr.Textbox(label="Generated SRT"),
    gr.Textbox(label="Debug Information", lines=15),
]

iface = gr.Interface(
    fn=generate_audio,
    inputs=_inputs,
    outputs=_outputs,
    title="Kokoro TTS Gradio App (CPU Mode)",
    description=_description,
)

if __name__ == "__main__":
    iface.launch()