Update app.py
app.py
CHANGED
@@ -4,8 +4,7 @@ import os
 import tempfile
 from pydub import AudioSegment
 import math
-import gc
-import re
+import gc  # Garbage Collector interface
 
 # --- Helper Functions ---
 
@@ -17,306 +16,299 @@ def format_time(seconds):
     milliseconds = int((seconds - int(seconds)) * 1000)
     return f"{hours:02d}:{minutes:02d}:{secs:02d},{milliseconds:03d}"
 
-def
-    """
-    return len(text)
-#
-def process_advanced_segments(segments, max_words_per_segment):
-    """Process segments to ensure they don't exceed word limits while preserving timing"""
-    processed_segments = []
-    # Split long segments
-    current_words = []
-    if current_words:
-        new_segment = {
-            'start': current_words[0]['start'],
-            'end': calculate_word_end_time(current_words, len(current_words) - 1),
-            'text': ''.join([w['word'] for w in current_words]).strip(),
-            'words': current_words.copy()
-        }
-        processed_segments.append(new_segment)
     if video_path is None:
         return "Please upload a video file first.", None
     try:
         model = whisper.load_model(model_name)
-        yield f"Model '{model_name}' loaded. Extracting audio...", None
     except Exception as e:
-        return f"Error loading model: {
     with tempfile.TemporaryDirectory() as temp_dir:
         try:
-            audio.
-            yield f"Audio extracted. Duration: {len(audio)/1000:.1f} seconds", None
         except Exception as e:
-        if audio_length_ms <= chunk_length_ms:
-            # Single chunk
-            chunks = [audio]
-            num_chunks = 1
-        else:
-            # Multiple chunks
-            chunks = []
-            for i in range(0, audio_length_ms, chunk_length_ms):
-                chunk = audio[i:i + chunk_length_ms]
-                chunks.append(chunk)
-            num_chunks = len(chunks)
-
-        yield f"Processing {num_chunks} chunk(s)...", None
-
-        all_segments = []
-        total_offset = 0
-
         for i in range(num_chunks):
-            chunk = chunks[i]
             chunk_path = os.path.join(temp_dir, f"chunk_{i}.wav")
-            chunk.export(chunk_path, format="wav"
             try:
-                should_get_word_timestamps = (transcription_mode in ["Word-level", "Advanced Segment"])
-
                 result = model.transcribe(
                     chunk_path,
                     word_timestamps=should_get_word_timestamps,
-                    no_speech_threshold=0.6,
-                    logprob_threshold=-1.0
                 )
-
-                # Adjust timestamps for chunk offset
-                chunk_offset = total_offset
-
-                for segment in result['segments']:
-                    segment['start'] += chunk_offset
-                    segment['end'] += chunk_offset
-
-                    # Adjust word timestamps if available
-                    if 'words' in segment and segment['words']:
-                        for word in segment['words']:
-                            if 'start' in word and word['start'] is not None:
-                                word['start'] += chunk_offset
-                            if 'end' in word and word['end'] is not None:
-                                word['end'] += chunk_offset
-
-                all_segments.extend(result['segments'])
-                total_offset += len(chunk) / 1000.0  # Convert to seconds
-
             except Exception as e:
                 del model
                 gc.collect()
-                return f"Error during transcription of chunk {i+1}: {
         del model
         gc.collect()
 
-        if not all_segments:
-            return "No speech detected in the video.", None
-
-        if transcription_mode == "Advanced Segment":
-            processed_segments = process_advanced_segments(all_segments, max_words_per_segment)
-        else:
-            processed_segments = all_segments
-                end_time = word.get('end', start_time + 0.5)
-                word_text = word.get('word', '').strip()
-                if word_text:
-                    word_list.append(f"[{start_time:.2f}s - {end_time:.2f}s] {word_text}")
-            result_text = "\n".join(word_list)
-            word_count = len(segment.get('words', []))
-
-            advanced_list.append(f"Segment {i}: [{format_time(start_time)} --> {format_time(end_time)}] "
-                                 f"({word_count} words)")
-            advanced_list.append(f"  {text}")
-            advanced_list.append("")
-
-with gr.Blocks(title="Video Transcription Tool", theme=gr.themes.Soft()) as iface:
-    gr.Markdown("# 🎥 Video Transcription with Whisper")
-
-    with gr.Row():
-        with gr.Column(scale=1):
-            video_input = gr.File(
-                label="Upload Video File",
-                file_types=["video"],
-                height=100
-            )
-
-            model_dropdown = gr.Dropdown(
-                choices=["tiny", "base", "small", "medium", "large-v2", "large-v3"],
-                value="base",
-                label="Whisper Model",
-                info="Larger models are more accurate but slower"
-            )
-
-            transcription_mode = gr.Radio(
-                choices=["Segment-level", "Word-level", "Advanced Segment"],
-                value="Segment-level",
-                label="Transcription Mode",
-                info="Choose output format"
-            )
-
-            with gr.Accordion("Advanced Settings", open=False):
-                chunk_length = gr.Slider(
-                    minimum=5,
-                    maximum=60,
-                    value=20,
-                    step=5,
-                    label="Chunk Length (minutes)",
-                    info="Split long videos into chunks"
-                )
-
-                max_words = gr.Slider(
-                    minimum=5,
-                    maximum=50,
-                    value=15,
-                    step=1,
-                    label="Max Words per Segment",
-                    info="Only applies to Advanced Segment mode"
-                )
-
-            transcribe_btn = gr.Button("🎯 Start Transcription", variant="primary" size="lg")
-                show_copy_button=True
-            )
-
-            srt_output = gr.File(
-                label="Download SRT File",
-                visible=True
-            )
 
 if __name__ == "__main__":
-    interface.launch(share=True, server_name="0.0.0.0", server_port=7860)
 
     milliseconds = int((seconds - int(seconds)) * 1000)
     return f"{hours:02d}:{minutes:02d}:{secs:02d},{milliseconds:03d}"
 
+def generate_srt_from_result(result, transcription_mode):
+    """Generates SRT content from Whisper's result dictionary."""
+    srt_content = []
 
+    if transcription_mode == "Word-level":
+        # Word-level SRT generation: one entry per word
+        entry_index = 1
+        for segment in result["segments"]:
+            for word_info in segment.get("words", []):
+                start_time = format_time(word_info["start"])
+                end_time = format_time(word_info["end"])
+                text = word_info["word"].strip()
+                if text:  # Ensure we don't add empty entries
+                    srt_content.append(f"{entry_index}\n{start_time} --> {end_time}\n{text}\n")
+                    entry_index += 1
+    else:  # Default to segment-level
+        for i, segment in enumerate(result["segments"], 1):
+            start_time = format_time(segment["start"])
+            end_time = format_time(segment["end"])
+            text = segment["text"].strip()
+            if text:
+                srt_content.append(f"{i}\n{start_time} --> {end_time}\n{text}\n")
 
+    return "\n".join(srt_content)
+
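For reference, each list entry built above is one complete SRT block, so joining them with blank lines yields a valid file. A minimal sketch of the expected output, assuming the app's `format_time` helper and a hand-made one-segment result dict:

```python
# Hand-made, Whisper-shaped result dict (values invented for illustration).
result = {"segments": [{"start": 0.0, "end": 2.5, "text": " Hello world."}]}

print(generate_srt_from_result(result, "Segment-level"))
# 1
# 00:00:00,000 --> 00:00:02,500
# Hello world.
```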
+# --- New Function for Advanced Mode ---
+
+def process_advanced_segments(full_result, max_words):
+    """
+    Post-processes segments for Word-level Advanced mode.
+    Groups words into new segments with <= max_words per segment, splitting at the nearest punctuation.
+    Adjusts timestamps based on actual word times (or proportionally if word times are missing).
+    Optimized: single pass with limited lookahead.
+    """
+    # Punctuation that marks a natural split point (checked with endswith,
+    # so the multi-character "--" also matches and empty words are safe)
+    punctuation = ('.', '!', '?', ';', ',', '--')
 
+    # Flatten all words into a single list for continuous processing
+    all_words = []
+    for segment in full_result["segments"]:
+        all_words.extend(segment.get("words", []))
 
+    if not all_words:
+        return full_result  # Nothing to process
 
+    new_segments = []
+    current_words = []
+    i = 0
+    while i < len(all_words):
+        current_words.append(all_words[i])
 
+        if len(current_words) >= max_words:
+            # Find the nearest punctuation for a split
+            split_index = -1
 
+            # Look backward in the current words for the last punctuation
+            for j in range(len(current_words) - 1, -1, -1):
+                word_text = current_words[j]["word"].strip()
+                if word_text.endswith(punctuation):
+                    split_index = j + 1  # Split after this word
+                    break
+
+            # If none, look forward in the next words (limited lookahead to keep this cheap)
+            if split_index == -1:
+                lookahead_end = min(i + 1 + 10, len(all_words))  # Cap lookahead for efficiency
+                for j in range(i + 1, lookahead_end):
+                    word_text = all_words[j]["word"].strip()
+                    current_words.append(all_words[j])  # Temporarily add to current
+                    i += 1  # Advance i as we add
+                    if word_text.endswith(punctuation):
+                        split_index = len(current_words)  # Split after this added word
+                        break
+
+            # Fallback: split at max_words if no punctuation was found
+            if split_index == -1:
+                split_index = max_words
+
+            # Create a new segment for the current group up to the split
+            group_words = current_words[:split_index]
+            if group_words:
+                text = " ".join(w["word"].strip() for w in group_words)
+                start = group_words[0]["start"]
+                end = group_words[-1]["end"]
+                new_segments.append({"start": start, "end": end, "text": text, "words": group_words})
+
+            # Remaining words become the start of the next group (their timestamps shift with them)
+            current_words = current_words[split_index:]
 
+        i += 1
 
+    # Add any remaining words as the last segment
+    if current_words:
+        text = " ".join(w["word"].strip() for w in current_words)
+        start = current_words[0]["start"]
+        end = current_words[-1]["end"]
+        new_segments.append({"start": start, "end": end, "text": text, "words": current_words})
+
+    # Handle a rare case: if a segment has no word timestamps, fall back to a proportional split
+    for seg in new_segments:
+        if "words" not in seg or not seg["words"]:
+            # Proportional adjustment based on the word-count ratio
+            orig_start = seg["start"]
+            orig_end = seg["end"]
+            word_count = len(seg["text"].split())
+            if word_count > max_words:
+                ratio = max_words / word_count
+                split_time = orig_start + (orig_end - orig_start) * ratio
+                seg["end"] = split_time  # Truncate this segment at the split point
+                # The next segment would start at split_time (handled when rebuilding in the loop)
 
+    # Replace the original segments with the new ones
+    full_result["segments"] = new_segments
+    return full_result
+
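To make the grouping behaviour concrete, here is a small sketch; the word dicts imitate the shape of Whisper's `word_timestamps` output, and all timings are invented:

```python
# Hypothetical word timings to illustrate the grouping (not real output).
words = [{"word": f" w{k}", "start": float(k), "end": k + 0.9} for k in range(7)]
words[3]["word"] = " w3."  # punctuation after the 4th word

demo_result = {"segments": [{"start": 0.0, "end": 6.9, "words": words}]}
out = process_advanced_segments(demo_result, max_words=5)
for seg in out["segments"]:
    print(round(seg["start"], 1), round(seg["end"], 1), seg["text"])
# 0.0 3.9 w0 w1 w2 w3.
# 4.0 6.9 w4 w5 w6
```

Note that the first line breaks at the punctuation after the fourth word rather than at the five-word cap.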
+# --- Main Transcription Logic ---
+
+def transcribe_video(video_path, model_name, transcription_mode, chunk_length_min, max_words):
+    """
+    Transcribes a video file by extracting audio, chunking it, processing the chunks,
+    and generating a full SRT file with corrected timestamps.
+    """
     if video_path is None:
         return "Please upload a video file first.", None
 
+    yield "Loading model...", None  # Update status for the user
+
+    # Load the Whisper model (the weights are cached on disk after the first download).
     try:
         model = whisper.load_model(model_name)
     except Exception as e:
+        yield f"Error loading model: {e}", None
+        return
 
+    yield f"Model '{model_name}' loaded. Extracting audio...", None
+
+    # Use a temporary directory for all our files
     with tempfile.TemporaryDirectory() as temp_dir:
+        audio_path = os.path.join(temp_dir, "extracted_audio.wav")
+
+        # Extract audio from the video using pydub
         try:
+            video = AudioSegment.from_file(video_path)
+            # Export as WAV, 16 kHz, mono - ideal for Whisper
+            video.set_channels(1).set_frame_rate(16000).export(audio_path, format="wav")
+            audio = AudioSegment.from_wav(audio_path)
         except Exception as e:
+            yield f"Error processing video/audio: {e}", None
+            return
+
+        # --- Chunking Logic ---
+        chunk_length_ms = chunk_length_min * 60 * 1000
+        num_chunks = math.ceil(len(audio) / chunk_length_ms)
+
+        full_result = {"segments": []}
+
+        yield f"Audio extracted. Splitting into {num_chunks} chunk(s) of {chunk_length_min} min...", None
 
         for i in range(num_chunks):
+            start_ms = i * chunk_length_ms
+            end_ms = start_ms + chunk_length_ms
+            chunk = audio[start_ms:end_ms]
 
             chunk_path = os.path.join(temp_dir, f"chunk_{i}.wav")
+            chunk.export(chunk_path, format="wav")
+
+            yield f"Transcribing chunk {i+1}/{num_chunks}...", None
+
+            # Determine whether word-level timestamps are needed
+            should_get_word_timestamps = (transcription_mode in ["Word-level", "Word-level Advanced"])
 
+            # Transcribe the chunk
             try:
                 result = model.transcribe(
                     chunk_path,
                     word_timestamps=should_get_word_timestamps,
+                    fp16=False  # Set to False for CPU-only inference
                 )
             except Exception as e:
+                # Clean up and report the error
                 del model
                 gc.collect()
+                yield f"Error during transcription of chunk {i+1}: {e}", None
+                return
+
+            # --- Timestamp Correction ---
+            # Add the chunk's start time to all timestamps in the result
+            time_offset_s = start_ms / 1000.0
+
+            for segment in result["segments"]:
+                segment["start"] += time_offset_s
+                segment["end"] += time_offset_s
+
+                if "words" in segment:
+                    for word_info in segment["words"]:
+                        word_info["start"] += time_offset_s
+                        word_info["end"] += time_offset_s
+
+                full_result["segments"].append(segment)
 
+            # Clean up the chunk file immediately
+            os.remove(chunk_path)
+
+        # Clean up the model from memory to be safe
         del model
         gc.collect()
 
+        # --- Process for Advanced Mode ---
+        if transcription_mode == "Word-level Advanced":
+            yield "Processing advanced word-level grouping...", None
+            full_result = process_advanced_segments(full_result, max_words)
+
+        yield "All chunks transcribed. Generating SRT file...", None
+
+        # Generate the final SRT from the combined results.
+        # For Advanced mode, force segment-level generation (the words are already grouped into lines).
+        srt_mode = "Segment-level" if transcription_mode == "Word-level Advanced" else transcription_mode
+        srt_output = generate_srt_from_result(full_result, srt_mode)
+
+        # Write the final SRT file in the temp directory to be returned by Gradio
+        srt_file_path = os.path.join(temp_dir, "output.srt")
+        with open(srt_file_path, "w", encoding="utf-8") as srt_file:
+            srt_file.write(srt_output)
+
+        yield "Done!", srt_file_path
 
+# --- Gradio UI ---
 
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown(
+        """
+        # Whisper Video Transcriber 🎥 -> 📝
+        Upload a video, choose your settings, and get a timed SRT subtitle file.
+        This app handles large videos by automatically splitting them into manageable chunks.
+        """
+    )
+    with gr.Row():
+        with gr.Column():
+            video_input = gr.Video(label="Upload Video")
 
+            model_name = gr.Radio(
+                ["tiny.en", "base.en"],
+                label="Whisper Model",
+                value="base.en",
+                info="`tiny.en` is faster, `base.en` is more accurate."
+            )
 
+            transcription_mode = gr.Radio(
+                ["Segment-level", "Word-level", "Word-level Advanced"],
+                label="Transcription Granularity",
+                value="Segment-level",
+                info="Word-level is more detailed but may be slightly slower. Word-level Advanced groups words into lines with a maximum word count, splitting at punctuation."
+            )
 
+            chunk_length_min = gr.Slider(
+                minimum=5,
+                maximum=20,
+                value=10,
+                step=1,
+                label="Chunk Length (minutes)",
+                info="Shorter chunks use less RAM but may be slightly less accurate at chunk boundaries."
+            )
 
+            max_words = gr.Slider(
+                minimum=5,
+                maximum=30,
+                value=10,
+                step=1,
+                label="Max Words per Line (Advanced mode only)",
+                info="For Word-level Advanced: limits words per subtitle line, splitting intelligently at punctuation."
+            )
 
+            submit_button = gr.Button("Transcribe Video", variant="primary")
+
+        with gr.Column():
+            status_output = gr.Textbox(label="Status", interactive=False, lines=5)
+            srt_output_file = gr.File(label="Download SRT File")
 
+    submit_button.click(
+        fn=transcribe_video,
+        inputs=[video_input, model_name, transcription_mode, chunk_length_min, max_words],
+        outputs=[status_output, srt_output_file]
+    )
 
+    gr.Markdown(
+        """
+        ### How to Use
+        1. **Upload a video file.**
+        2. **Select a Whisper model.** For English, `base.en` provides a great balance of speed and accuracy.
+        3. **Choose the granularity.** 'Segment-level' is good for standard subtitles. 'Word-level' is great for karaoke-style highlighting. 'Word-level Advanced' groups words into optimized subtitle lines.
+        4. **Click 'Transcribe Video'.** The status box will show the progress.
+        5. **Download the SRT file** when the process is complete. You can open it in any text editor or load it into a video player like VLC.
+        """
+    )
 
 if __name__ == "__main__":
+    demo.launch(debug=True)
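For local testing, the `transcribe_video` generator can also be driven without the UI. A hypothetical smoke test (`sample.mp4` is a stand-in file name; the first run downloads the model weights):

```python
# Hypothetical smoke test: stream status updates from the generator directly.
for status, srt_path in transcribe_video("sample.mp4", "tiny.en", "Segment-level", 10, 10):
    print(status, srt_path)  # srt_path stays None until the final "Done!" update
```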