sdafd committed
Commit b7ec89c · verified · 1 Parent(s): 744ca6c

Create app.py

Files changed (1)
  1. app.py +113 -0
app.py ADDED
@@ -0,0 +1,113 @@
import gradio as gr
import numpy as np
import torch
import soundfile as sf
from kokoro import KPipeline
import re
import traceback

# Helper: Format seconds into SRT timestamp (hh:mm:ss,ms)
def format_time(seconds):
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    secs = seconds % 60
    # Ensure milliseconds are comma separated
    return f"{hours:02d}:{minutes:02d}:{secs:06.3f}".replace('.', ',')

def generate_audio(text, voice, speed, lang_code, split_pattern, debug):
    debug_logs = []
    debug_logs.append("Starting Kokoro TTS generation...")

    try:
        debug_logs.append(f"Initializing Kokoro pipeline with lang_code: '{lang_code}' (CPU mode assumed)")
        # Initialize the pipeline; by default, it will run on CPU if no GPU is available.
        pipeline = KPipeline(lang_code=lang_code)
        debug_logs.append("Pipeline initialized successfully.")
    except Exception as e:
        error_msg = f"Error initializing pipeline: {str(e)}"
        debug_logs.append(error_msg)
        return None, "", "\n".join(debug_logs)

    # Prepare lists for audio segments, SRT entries, and segment-level debug info.
    audio_segments = []
    srt_entries = []
    current_time = 0.0  # cumulative time for SRT timestamps
    segment_index = 1
    segment_debug_info = []

    try:
        debug_logs.append("Generating audio segments from input text...")
        # Invoke the pipeline to process the text.
        # The split_pattern parameter (regex) allows you to define how text is segmented.
        generator = pipeline(
            text,
            voice=voice,
            speed=speed,
            split_pattern=split_pattern
        )

        for i, (gs, ps, audio) in enumerate(generator):
            duration = len(audio) / 24000.0  # assuming a sample rate of 24000 Hz
            start_timestamp = current_time
            end_timestamp = current_time + duration
            # Create an SRT entry for the segment.
            srt_entry = f"{segment_index}\n{format_time(start_timestamp)} --> {format_time(end_timestamp)}\n{gs}\n"
            srt_entries.append(srt_entry)
            current_time = end_timestamp

            # Record segment details for debugging.
            segment_debug_info.append(f"Segment {segment_index}: Duration = {duration:.3f}s, Graphemes = {gs}, Phonemes = {ps}")
            audio_segments.append(audio)
            segment_index += 1

        debug_logs.append("Audio segments generated successfully.")
    except Exception as e:
        error_msg = f"Error during audio generation: {str(e)}\n{traceback.format_exc()}"
        debug_logs.append(error_msg)
        return None, "", "\n".join(debug_logs)

    # Concatenate all the generated segments into a single audio array.
    if audio_segments:
        full_audio = np.concatenate(audio_segments)
    else:
        debug_logs.append("No audio segments were generated.")
        return None, "", "\n".join(debug_logs)

    # Combine all SRT entries into one string.
    srt_content = "\n".join(srt_entries)

    # Combine all debug logs (with optional segment details).
    if debug:
        debug_info = "\n".join(debug_logs + segment_debug_info)
    else:
        debug_info = "\n".join(debug_logs)

    # Return a tuple: audio (with sample rate), the SRT text, and the debug log.
    return (24000, full_audio), srt_content, debug_info

# Build the Gradio interface.
iface = gr.Interface(
    fn=generate_audio,
    inputs=[
        gr.Textbox(label="Input Text", lines=10, placeholder="Enter the text to be synthesized here..."),
        gr.Textbox(label="Voice (e.g., af_heart)", value="af_heart"),
        gr.Slider(label="Speed", minimum=0.5, maximum=2.0, step=0.1, value=1.0),
        gr.Textbox(label="Language Code", value="a",
                   placeholder="Enter language code ('a' for American English, 'b' for British, etc.)"),
        gr.Textbox(label="Split Pattern (Regex)", value=r'\n+',
                   placeholder="Regex to split the input text (e.g., '\\n+')"),
        gr.Checkbox(label="Enable Debug Mode", value=True)
    ],
    outputs=[
        gr.Audio(label="Generated Audio", type="numpy"),
        gr.Textbox(label="Generated SRT"),
        gr.Textbox(label="Debug Information", lines=15)
    ],
    title="Kokoro TTS Gradio App (CPU Mode)",
    description=("This app uses the Kokoro TTS model to generate audio from text. "
                 "You can tweak parameters such as voice, speed, language code, and the text split pattern. "
                 "When debug mode is enabled, detailed processing steps (including grapheme and phoneme outputs) are displayed.")
)

if __name__ == "__main__":
    iface.launch()
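
For reference, a minimal sketch of what the srt_entry f-string in generate_audio produces, using format_time from app.py. The segment texts and durations below are made up purely for illustration, and the snippet assumes app.py and its dependencies are importable:

# Minimal standalone sketch (hypothetical segment texts and durations),
# mirroring the srt_entry f-string in app.py.
from app import format_time

segments = [("Hello world.", 1.25), ("This is Kokoro speaking.", 2.10)]  # (text, seconds)
current, entries = 0.0, []
for idx, (gs, dur) in enumerate(segments, start=1):
    entries.append(f"{idx}\n{format_time(current)} --> {format_time(current + dur)}\n{gs}\n")
    current += dur
print("\n".join(entries))
# Expected output:
# 1
# 00:00:00,000 --> 00:00:01,250
# Hello world.
#
# 2
# 00:00:01,250 --> 00:00:03,350
# This is Kokoro speaking.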