kavehtaheri committed
Commit eaf7024 · verified · 1 parent: a48c358

Update app.py

Files changed (1):
  app.py  +248 −256
app.py CHANGED
@@ -4,8 +4,7 @@ import os
 import tempfile
 from pydub import AudioSegment
 import math
-import gc
-import re
 
 # --- Helper Functions ---
 
@@ -17,306 +16,299 @@ def format_time(seconds):
     milliseconds = int((seconds - int(seconds)) * 1000)
     return f"{hours:02d}:{minutes:02d}:{secs:02d},{milliseconds:03d}"
 
-def find_split_point(text, max_length=100):
-    """Find the best point to split text at punctuation or natural breaks"""
-    if len(text) <= max_length:
-        return len(text)
-
-    # Look for punctuation marks in reverse from max_length
-    for i in range(min(max_length, len(text) - 1), max_length // 2, -1):
-        if text[i] in '.!?;:':
-            return i + 1
-
-    # Look for commas
-    for i in range(min(max_length, len(text) - 1), max_length // 2, -1):
-        if text[i] == ',':
-            return i + 1
-
-    # Look for spaces
-    for i in range(min(max_length, len(text) - 1), max_length // 2, -1):
-        if text[i] == ' ':
-            return i
-
-    # If no good split point, just split at max_length
-    return max_length
-
-def calculate_word_end_time(words, word_index):
-    """Calculate the end time for a word based on its position and surrounding context"""
-    if word_index >= len(words):
-        return words[-1]['end'] if words else 0
-
-    current_word = words[word_index]
-    if 'end' in current_word and current_word['end'] is not None:
-        return current_word['end']
-
-    # Estimate based on start time and word length
-    start_time = current_word.get('start', 0)
-    word_length = len(current_word.get('word', '').strip())
-    estimated_duration = max(0.1, word_length * 0.08)  # ~80ms per character
-
-    return start_time + estimated_duration
-
-def process_advanced_segments(segments, max_words_per_segment):
-    """Process segments to ensure they don't exceed word limits while preserving timing"""
-    processed_segments = []
-
-    for segment in segments:
-        words = segment.get('words', [])
-        if not words or len(words) <= max_words_per_segment:
-            processed_segments.append(segment)
-            continue
-
-        # Split long segments
-        current_words = []
-
-        for word in words:
-            current_words.append(word)
-
-            if len(current_words) >= max_words_per_segment:
-                # Create new segment
-                if current_words:
-                    new_segment = {
-                        'start': current_words[0]['start'],
-                        'end': calculate_word_end_time(current_words, len(current_words) - 1),
-                        'text': ''.join([w['word'] for w in current_words]).strip(),
-                        'words': current_words.copy()
-                    }
-                    processed_segments.append(new_segment)
-                    current_words = []
-
-        # Handle remaining words
-        if current_words:
-            new_segment = {
-                'start': current_words[0]['start'],
-                'end': calculate_word_end_time(current_words, len(current_words) - 1),
-                'text': ''.join([w['word'] for w in current_words]).strip(),
-                'words': current_words.copy()
-            }
-            processed_segments.append(new_segment)
-
-    return processed_segments
-
-def transcribe_video(video_path, model_name, transcription_mode, chunk_length_min, max_words_per_segment, progress=gr.Progress()):
-    """Enhanced transcription with advanced segment processing"""
     if video_path is None:
         return "Please upload a video file first.", None
 
-    progress(0, desc="Loading model...")
-
     try:
         model = whisper.load_model(model_name)
-        yield f"Model '{model_name}' loaded. Extracting audio...", None
     except Exception as e:
-        return f"Error loading model: {str(e)}", None
 
     with tempfile.TemporaryDirectory() as temp_dir:
-        progress(0.1, desc="Extracting audio...")
-
         try:
-            # Extract audio from video
-            audio = AudioSegment.from_file(video_path)
-            audio_path = os.path.join(temp_dir, "audio.wav")
-            audio.export(audio_path, format="wav", parameters=["-ar", "16000", "-ac", "1"])
-
-            yield f"Audio extracted. Duration: {len(audio)/1000:.1f} seconds", None
-
         except Exception as e:
-            del model
-            gc.collect()
-            return f"Error extracting audio: {str(e)}", None
-
-        # **IMPORTANT: This is the chunking logic that was missing**
-        chunk_length_ms = chunk_length_min * 60 * 1000  # Convert minutes to milliseconds
-        audio_length_ms = len(audio)
 
-        if audio_length_ms <= chunk_length_ms:
-            # Single chunk
-            chunks = [audio]
-            num_chunks = 1
-        else:
-            # Multiple chunks
-            chunks = []
-            for i in range(0, audio_length_ms, chunk_length_ms):
-                chunk = audio[i:i + chunk_length_ms]
-                chunks.append(chunk)
-            num_chunks = len(chunks)
-
-        yield f"Processing {num_chunks} chunk(s)...", None
-
-        all_segments = []
-        total_offset = 0
-
         for i in range(num_chunks):
-            progress(0.2 + (i / num_chunks) * 0.6, desc=f"Transcribing chunk {i+1}/{num_chunks}...")
-
-            chunk = chunks[i]
             chunk_path = os.path.join(temp_dir, f"chunk_{i}.wav")
-            chunk.export(chunk_path, format="wav", parameters=["-ar", "16000", "-ac", "1"])
 
             try:
-                should_get_word_timestamps = (transcription_mode in ["Word-level", "Advanced Segment"])
-
                 result = model.transcribe(
                     chunk_path,
                     word_timestamps=should_get_word_timestamps,
-                    temperature=0.0,
-                    no_speech_threshold=0.6,
-                    logprob_threshold=-1.0
                 )
-
-                # Adjust timestamps for chunk offset
-                chunk_offset = total_offset
-
-                for segment in result['segments']:
-                    segment['start'] += chunk_offset
-                    segment['end'] += chunk_offset
-
-                    # Adjust word timestamps if available
-                    if 'words' in segment and segment['words']:
-                        for word in segment['words']:
-                            if 'start' in word and word['start'] is not None:
-                                word['start'] += chunk_offset
-                            if 'end' in word and word['end'] is not None:
-                                word['end'] += chunk_offset
-
-                all_segments.extend(result['segments'])
-                total_offset += len(chunk) / 1000.0  # Convert to seconds
-
             except Exception as e:
                 del model
                 gc.collect()
-                return f"Error during transcription of chunk {i+1}: {str(e)}", None
 
-        # Cleanup
         del model
         gc.collect()
 
-        progress(0.9, desc="Processing results...")
 
-        if not all_segments:
-            return "No speech detected in the video.", None
 
-        # Process segments based on mode
-        if transcription_mode == "Advanced Segment":
-            processed_segments = process_advanced_segments(all_segments, max_words_per_segment)
-        else:
-            processed_segments = all_segments
 
-        # Generate output based on transcription mode
-        if transcription_mode == "Segment-level":
-            result_text = "\n".join([segment['text'].strip() for segment in processed_segments])
 
-        elif transcription_mode == "Word-level":
-            word_list = []
-            for segment in processed_segments:
-                if 'words' in segment and segment['words']:
-                    for word in segment['words']:
-                        start_time = word.get('start', 0)
-                        end_time = word.get('end', start_time + 0.5)
-                        word_text = word.get('word', '').strip()
-                        if word_text:
-                            word_list.append(f"[{start_time:.2f}s - {end_time:.2f}s] {word_text}")
-            result_text = "\n".join(word_list)
 
-        else:  # Advanced Segment
-            advanced_list = []
-            for i, segment in enumerate(processed_segments, 1):
-                start_time = segment.get('start', 0)
-                end_time = segment.get('end', start_time + 1)
-                text = segment.get('text', '').strip()
-                word_count = len(segment.get('words', []))
-
-                advanced_list.append(f"Segment {i}: [{format_time(start_time)} --> {format_time(end_time)}] "
-                                     f"({word_count} words)")
-                advanced_list.append(f" {text}")
-                advanced_list.append("")
 
-            result_text = "\n".join(advanced_list)
-
-        # Generate SRT
-        srt_content = ""
-        for i, segment in enumerate(processed_segments, 1):
-            start_time = format_time(segment.get('start', 0))
-            end_time = format_time(segment.get('end', segment.get('start', 0) + 1))
-            text = segment.get('text', '').strip()
-
-            srt_content += f"{i}\n{start_time} --> {end_time}\n{text}\n\n"
-
-        progress(1.0, desc="Complete!")
-        yield result_text, srt_content
-
-# --- Gradio Interface ---
-
-def create_interface():
-    with gr.Blocks(title="Video Transcription Tool", theme=gr.themes.Soft()) as iface:
-        gr.Markdown("# 🎥 Video Transcription with Whisper")
-
-        with gr.Row():
-            with gr.Column(scale=1):
-                video_input = gr.File(
-                    label="Upload Video File",
-                    file_types=["video"],
-                    height=100
-                )
-
-                model_dropdown = gr.Dropdown(
-                    choices=["tiny", "base", "small", "medium", "large-v2", "large-v3"],
-                    value="base",
-                    label="Whisper Model",
-                    info="Larger models are more accurate but slower"
-                )
-
-                transcription_mode = gr.Radio(
-                    choices=["Segment-level", "Word-level", "Advanced Segment"],
-                    value="Segment-level",
-                    label="Transcription Mode",
-                    info="Choose output format"
-                )
-
-                with gr.Accordion("Advanced Settings", open=False):
-                    chunk_length = gr.Slider(
-                        minimum=5,
-                        maximum=60,
-                        value=20,
-                        step=5,
-                        label="Chunk Length (minutes)",
-                        info="Split long videos into chunks"
-                    )
-
-                    max_words = gr.Slider(
-                        minimum=5,
-                        maximum=50,
-                        value=15,
-                        step=1,
-                        label="Max Words per Segment",
-                        info="Only applies to Advanced Segment mode"
-                    )
-
-                transcribe_btn = gr.Button("🎯 Start Transcription", variant="primary" size="lg")
-
-            with gr.Column(scale=2):
-                result_text = gr.Textbox(
-                    label="Transcription Result",
-                    lines=20,
-                    max_lines=30,
-                    show_copy_button=True
-                )
-
-                srt_output = gr.File(
-                    label="Download SRT File",
-                    visible=True
-                )
-
-        transcribe_btn.click(
-            fn=transcribe_video,
-            inputs=[video_input, model_dropdown, transcription_mode, chunk_length, max_words],
-            outputs=[result_text, srt_output],
-            show_progress=True
-        )
-
-    return iface
 
 if __name__ == "__main__":
-    interface = create_interface()
-    interface.launch(share=True, server_name="0.0.0.0", server_port=7860)
 
@@ -4,8 +4,7 @@ import os
 import tempfile
 from pydub import AudioSegment
 import math
+import gc  # Garbage Collector interface
 
 # --- Helper Functions ---
 
 
@@ -17,306 +16,299 @@ def format_time(seconds):
     milliseconds = int((seconds - int(seconds)) * 1000)
     return f"{hours:02d}:{minutes:02d}:{secs:02d},{milliseconds:03d}"
 
+def generate_srt_from_result(result, transcription_mode):
+    """Generates SRT content from Whisper's result dictionary."""
+    srt_content = []
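+    # Each appended entry follows the standard SRT form:
+    #   <index>
+    #   HH:MM:SS,mmm --> HH:MM:SS,mmm
+    #   <text>
+    # and the final "\n".join leaves a blank line between entries.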
 
+    if transcription_mode == "word":
+        # Word-level SRT generation
+        entry_index = 1
+        for segment in result["segments"]:
+            for word_info in segment.get("words", []):
+                start_time = format_time(word_info["start"])
+                end_time = format_time(word_info["end"])
+                text = word_info["word"].strip()
+                if text:  # Ensure we don't add empty entries
+                    srt_content.append(f"{entry_index}\n{start_time} --> {end_time}\n{text}\n")
+                    entry_index += 1
+    else:  # Default to segment-level
+        for i, segment in enumerate(result["segments"], 1):
+            start_time = format_time(segment["start"])
+            end_time = format_time(segment["end"])
+            text = segment["text"].strip()
+            if text:
+                srt_content.append(f"{i}\n{start_time} --> {end_time}\n{text}\n")
+
+    return "\n".join(srt_content)
+
+# --- Function for Advanced Mode ---
+
+def process_advanced_segments(full_result, max_words):
+    """
+    Post-processes segments for Word-level Advanced mode.
+    Groups words into new segments with <= max_words per segment, splitting at the nearest punctuation.
+    Adjusts timestamps based on actual word times (or proportionally if needed).
+    Optimized: single pass with limited lookahead.
+    """
+    # Define punctuation for natural splits
+    punctuation = {'.', '!', '?', ';', ',', '--'}
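+    # Example: with max_words=3, the word stream ["Hello", "world.", "How", "are", "you"]
+    # is grouped into "Hello world." (the backward scan finds the period) and "How are you".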
+
+    # Flatten all words into a single list for continuous processing
+    all_words = []
+    for segment in full_result["segments"]:
+        all_words.extend(segment.get("words", []))
+
+    if not all_words:
+        return full_result  # Nothing to process
+
+    new_segments = []
+    current_words = []
+    i = 0
+    while i < len(all_words):
+        current_words.append(all_words[i])
+
+        if len(current_words) >= max_words:
+            # Find the nearest punctuation to split at
+            split_index = -1
+
+            # Look backward through the current words for the last punctuation
+            for j in range(len(current_words) - 1, -1, -1):
+                word_text = current_words[j]["word"].strip()
+                if word_text and word_text[-1] in punctuation:
+                    split_index = j + 1  # Split after this word
+                    break
+
+            # If none, look forward into the next words (limited lookahead to optimize)
+            if split_index == -1:
+                lookahead_end = min(i + 1 + 10, len(all_words))  # Cap lookahead for efficiency
+                for j in range(i + 1, lookahead_end):
+                    word_text = all_words[j]["word"].strip()
+                    current_words.append(all_words[j])  # Temporarily add to the current group
+                    i += 1  # Advance i as we add
+                    if word_text and word_text[-1] in punctuation:
+                        split_index = len(current_words)  # Split after this added word
+                        break
+
+            # Fallback: split at max_words if no punctuation was found
+            if split_index == -1:
+                split_index = max_words
+
+            # Create a new segment for the group up to the split point
+            group_words = current_words[:split_index]
+            if group_words:
+                text = " ".join(w["word"].strip() for w in group_words)
+                start = group_words[0]["start"]
+                end = group_words[-1]["end"]
+                new_segments.append({"start": start, "end": end, "text": text, "words": group_words})
+
+            # Remaining words carry over as the start of the next group
+            current_words = current_words[split_index:]
+
+        i += 1
+
+    # Add any remaining words as the last segment
+    if current_words:
+        text = " ".join(w["word"].strip() for w in current_words)
+        start = current_words[0]["start"]
+        end = current_words[-1]["end"]
+        new_segments.append({"start": start, "end": end, "text": text, "words": current_words})
+
+    # Rare case: if a segment has no word timestamps, fall back to a
+    # proportional adjustment based on the word-count ratio
+    for seg in new_segments:
+        if "words" not in seg or not seg["words"]:
+            orig_start = seg["start"]
+            orig_end = seg["end"]
+            word_count = len(seg["text"].split())
+            if word_count > max_words:
+                ratio = max_words / word_count
+                split_time = orig_start + (orig_end - orig_start) * ratio
+                seg["end"] = split_time  # Truncate this segment at the proportional split
+
+    # Replace the original segments with the new ones
+    full_result["segments"] = new_segments
+    return full_result
+
+# --- Main Transcription Logic ---
+
+def transcribe_video(video_path, model_name, transcription_mode, chunk_length_min, max_words):
+    """
+    Transcribes a video file by extracting audio, chunking it, processing the chunks,
+    and generating a full SRT file with corrected timestamps.
+    """
     if video_path is None:
         return "Please upload a video file first.", None
 
+    yield "Loading model...", None  # Update status for the user
+
+    # Load the Whisper model (reloaded on each call; the small English models load quickly)
     try:
         model = whisper.load_model(model_name)
     except Exception as e:
+        yield f"Error loading model: {e}", None
+        return  # A bare return ends the generator; a returned value would never reach the UI
 
+    yield f"Model '{model_name}' loaded. Extracting audio...", None
+
+    # Use a temporary directory for all intermediate files
     with tempfile.TemporaryDirectory() as temp_dir:
+        audio_path = os.path.join(temp_dir, "extracted_audio.wav")
+
+        # Extract audio from the video using pydub
         try:
+            video = AudioSegment.from_file(video_path)
+            # Export as WAV, 16 kHz, mono - ideal for Whisper
+            video.set_channels(1).set_frame_rate(16000).export(audio_path, format="wav")
+            audio = AudioSegment.from_wav(audio_path)
         except Exception as e:
+            yield f"Error processing video/audio: {e}", None
+            return
+
+        # --- Chunking Logic ---
+        chunk_length_ms = chunk_length_min * 60 * 1000
+        num_chunks = math.ceil(len(audio) / chunk_length_ms)
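+        # e.g. 25 minutes of audio with 10-minute chunks -> ceil(25 / 10) = 3 chunks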
+
+        full_result = {"segments": []}
+
+        yield f"Audio extracted. Splitting into {num_chunks} chunk(s) of {chunk_length_min} min...", None
+
         for i in range(num_chunks):
+            start_ms = i * chunk_length_ms
+            end_ms = start_ms + chunk_length_ms
+            chunk = audio[start_ms:end_ms]
 
             chunk_path = os.path.join(temp_dir, f"chunk_{i}.wav")
+            chunk.export(chunk_path, format="wav")
+
+            yield f"Transcribing chunk {i+1}/{num_chunks}...", None
+
+            # Word-level timestamps are needed for both word-based modes
+            # (the values must match the UI labels exactly)
+            should_get_word_timestamps = (transcription_mode in ["Word-level", "Word-level Advanced"])
+
+            # Transcribe the chunk
             try:
                 result = model.transcribe(
                     chunk_path,
                     word_timestamps=should_get_word_timestamps,
+                    fp16=False  # Set to False for CPU-only inference
                 )
             except Exception as e:
+                # Clean up and report the error
                 del model
                 gc.collect()
+                yield f"Error during transcription of chunk {i+1}: {e}", None
+                return
+
+            # --- Timestamp Correction ---
+            # Add the chunk's start time to all timestamps in the result
+            time_offset_s = start_ms / 1000.0
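+            # e.g. a word at 12.3 s inside the third 10-minute chunk lands at
+            # 1200.0 + 12.3 = 1212.3 s in the full video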
+
+            for segment in result["segments"]:
+                segment["start"] += time_offset_s
+                segment["end"] += time_offset_s
+
+                if "words" in segment:
+                    for word_info in segment["words"]:
+                        word_info["start"] += time_offset_s
+                        word_info["end"] += time_offset_s
+
+                full_result["segments"].append(segment)
+
+            # Clean up the chunk file immediately
+            os.remove(chunk_path)
+
+        # Clean up the model from memory to be safe
         del model
         gc.collect()
 
+        # --- Process for Advanced Mode ---
+        if transcription_mode == "Word-level Advanced":
+            yield "Processing advanced word-level grouping...", None
+            full_result = process_advanced_segments(full_result, max_words)
+
+        yield "All chunks transcribed. Generating SRT file...", None
+
+        # Generate the final SRT from the combined results.
+        # Plain Word-level maps to per-word entries; Segment-level and Advanced
+        # (whose grouped lines are already segments) map to segment entries.
+        srt_mode = "word" if transcription_mode == "Word-level" else "segment"
+        srt_output = generate_srt_from_result(full_result, srt_mode)
+
+        # Write the final SRT file in the temp directory so Gradio can return it
+        srt_file_path = os.path.join(temp_dir, "output.srt")
+        with open(srt_file_path, "w", encoding="utf-8") as srt_file:
+            srt_file.write(srt_output)
+
+        yield "Done!", srt_file_path
 
+# --- Gradio UI ---
+
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown(
+        """
+        # Whisper Video Transcriber 🎥 -> 📝
+        Upload a video, choose your settings, and get a timed SRT subtitle file.
+        This app handles large videos by automatically splitting them into manageable chunks.
+        """
+    )
+    with gr.Row():
+        with gr.Column():
+            video_input = gr.Video(label="Upload Video")
+
+            model_name = gr.Radio(
+                ["tiny.en", "base.en"],
+                label="Whisper Model",
+                value="base.en",
+                info="`tiny.en` is faster, `base.en` is more accurate."
+            )
+
+            transcription_mode = gr.Radio(
+                ["Segment-level", "Word-level", "Word-level Advanced"],
+                label="Transcription Granularity",
+                value="Segment-level",
+                info="Word-level is more detailed but may be slightly slower. Word-level Advanced groups words into lines of at most the chosen word count, splitting at punctuation."
+            )
+
+            chunk_length_min = gr.Slider(
+                minimum=5,
+                maximum=20,
+                value=10,
+                step=1,
+                label="Chunk Length (minutes)",
+                info="Shorter chunks use less RAM but may be slightly less accurate at boundaries."
+            )
+
+            max_words = gr.Slider(
+                minimum=5,
+                maximum=30,
+                value=10,
+                step=1,
+                label="Max Words per Line (Advanced mode only)",
+                info="For Word-level Advanced: limits words per subtitle line, splitting intelligently at punctuation."
+            )
+
+            submit_button = gr.Button("Transcribe Video", variant="primary")
+
+        with gr.Column():
+            status_output = gr.Textbox(label="Status", interactive=False, lines=5)
+            srt_output_file = gr.File(label="Download SRT File")
 
+    submit_button.click(
+        fn=transcribe_video,
+        inputs=[video_input, model_name, transcription_mode, chunk_length_min, max_words],
+        outputs=[status_output, srt_output_file]
+    )
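+    # transcribe_video is a generator, so Gradio streams each yielded
+    # (status, file) pair into these outputs as the run progresses
+    # (generator support relies on Gradio's queue, enabled by default in recent versions)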
 
+    gr.Markdown(
+        """
+        ### How to Use
+        1. **Upload a video file.**
+        2. **Select a Whisper model.** For English, `base.en` provides a great balance of speed and accuracy.
+        3. **Choose the granularity.** 'Segment-level' is good for standard subtitles. 'Word-level' is great for karaoke-style highlighting. 'Word-level Advanced' groups words into optimized subtitle lines.
+        4. **Click 'Transcribe Video'.** The status box will show the progress.
+        5. **Download the SRT file** when the process is complete. You can open it in any text editor or load it into a video player like VLC.
+        """
+    )
 
 if __name__ == "__main__":
+    demo.launch(debug=True)