Update app.py
app.py
CHANGED
@@ -4,8 +4,7 @@ import os
 import tempfile
 from pydub import AudioSegment
 import math
-import gc
-import re
+import gc  # Garbage Collector interface
 
 # --- Helper Functions ---
 
@@ -17,306 +16,299 @@ def format_time(seconds):
     milliseconds = int((seconds - int(seconds)) * 1000)
     return f"{hours:02d}:{minutes:02d}:{secs:02d},{milliseconds:03d}"
 
-def
-    """
-    return len(text)
-#
-def process_advanced_segments(segments, max_words_per_segment):
-    """Process segments to ensure they don't exceed word limits while preserving timing"""
-    processed_segments = []
-    # Split long segments
-    current_words = []
-    if current_words:
-        new_segment = {
-            'start': current_words[0]['start'],
-            'end': calculate_word_end_time(current_words, len(current_words) - 1),
-            'text': ''.join([w['word'] for w in current_words]).strip(),
-            'words': current_words.copy()
-        }
-        processed_segments.append(new_segment)
     if video_path is None:
         return "Please upload a video file first.", None
     try:
         model = whisper.load_model(model_name)
-        yield f"Model '{model_name}' loaded. Extracting audio...", None
     except Exception as e:
-        return f"Error loading model: {
     with tempfile.TemporaryDirectory() as temp_dir:
         try:
-            audio.
-            yield f"Audio extracted. Duration: {len(audio)/1000:.1f} seconds", None
         except Exception as e:
-        if audio_length_ms <= chunk_length_ms:
-            # Single chunk
-            chunks = [audio]
-            num_chunks = 1
-        else:
-            # Multiple chunks
-            chunks = []
-            for i in range(0, audio_length_ms, chunk_length_ms):
-                chunk = audio[i:i + chunk_length_ms]
-                chunks.append(chunk)
-            num_chunks = len(chunks)
-
-        yield f"Processing {num_chunks} chunk(s)...", None
-
-        all_segments = []
-        total_offset = 0
-
         for i in range(num_chunks):
-            chunk = chunks[i]
             chunk_path = os.path.join(temp_dir, f"chunk_{i}.wav")
-            chunk.export(chunk_path, format="wav"
             try:
-                should_get_word_timestamps = (transcription_mode in ["Word-level", "Advanced Segment"])
-
                 result = model.transcribe(
                     chunk_path,
                     word_timestamps=should_get_word_timestamps,
-                    no_speech_threshold=0.6,
-                    logprob_threshold=-1.0
                 )
-
-                # Adjust timestamps for chunk offset
-                chunk_offset = total_offset
-
-                for segment in result['segments']:
-                    segment['start'] += chunk_offset
-                    segment['end'] += chunk_offset
-
-                    # Adjust word timestamps if available
-                    if 'words' in segment and segment['words']:
-                        for word in segment['words']:
-                            if 'start' in word and word['start'] is not None:
-                                word['start'] += chunk_offset
-                            if 'end' in word and word['end'] is not None:
-                                word['end'] += chunk_offset
-
-                all_segments.extend(result['segments'])
-                total_offset += len(chunk) / 1000.0  # Convert to seconds
-
             except Exception as e:
                 del model
                 gc.collect()
-                return f"Error during transcription of chunk {i+1}: {
         del model
         gc.collect()
 
-        if not all_segments:
-            return "No speech detected in the video.", None
-
-        if transcription_mode == "Advanced Segment":
-            processed_segments = process_advanced_segments(all_segments, max_words_per_segment)
-        else:
-            processed_segments = all_segments
-                end_time = word.get('end', start_time + 0.5)
-                word_text = word.get('word', '').strip()
-                if word_text:
-                    word_list.append(f"[{start_time:.2f}s - {end_time:.2f}s] {word_text}")
-            result_text = "\n".join(word_list)
-            word_count = len(segment.get('words', []))
-
-            advanced_list.append(f"Segment {i}: [{format_time(start_time)} --> {format_time(end_time)}] "
-                                 f"({word_count} words)")
-            advanced_list.append(f"  {text}")
-            advanced_list.append("")
-
-with gr.Blocks(title="Video Transcription Tool", theme=gr.themes.Soft()) as iface:
-    gr.Markdown("# 🎥 Video Transcription with Whisper")
-
-    with gr.Row():
-        with gr.Column(scale=1):
-            video_input = gr.File(
-                label="Upload Video File",
-                file_types=["video"],
-                height=100
-            )
-
-            model_dropdown = gr.Dropdown(
-                choices=["tiny", "base", "small", "medium", "large-v2", "large-v3"],
-                value="base",
-                label="Whisper Model",
-                info="Larger models are more accurate but slower"
-            )
-
-            transcription_mode = gr.Radio(
-                choices=["Segment-level", "Word-level", "Advanced Segment"],
-                value="Segment-level",
-                label="Transcription Mode",
-                info="Choose output format"
-            )
-
-            with gr.Accordion("Advanced Settings", open=False):
-                chunk_length = gr.Slider(
-                    minimum=5,
-                    maximum=60,
-                    value=20,
-                    step=5,
-                    label="Chunk Length (minutes)",
-                    info="Split long videos into chunks"
-                )
-
-                max_words = gr.Slider(
-                    minimum=5,
-                    maximum=50,
-                    value=15,
-                    step=1,
-                    label="Max Words per Segment",
-                    info="Only applies to Advanced Segment mode"
-                )
-
-            transcribe_btn = gr.Button("🎯 Start Transcription", variant="primary" size="lg")
-                show_copy_button=True
-            )
-
-            srt_output = gr.File(
-                label="Download SRT File",
-                visible=True
-            )
 
 if __name__ == "__main__":
-    interface.launch(share=True, server_name="0.0.0.0", server_port=7860)
 
     milliseconds = int((seconds - int(seconds)) * 1000)
     return f"{hours:02d}:{minutes:02d}:{secs:02d},{milliseconds:03d}"
 
+def generate_srt_from_result(result, transcription_mode):
+    """Generates SRT content from Whisper's result dictionary."""
+    srt_content = []
 
+    if transcription_mode == "Word-level":
+        # Word-level SRT generation: one entry per word
+        entry_index = 1
+        for segment in result["segments"]:
+            for word_info in segment.get("words", []):
+                start_time = format_time(word_info["start"])
+                end_time = format_time(word_info["end"])
+                text = word_info["word"].strip()
+                if text:  # Ensure we don't add empty entries
+                    srt_content.append(f"{entry_index}\n{start_time} --> {end_time}\n{text}\n")
+                    entry_index += 1
+    else:  # Default to segment-level
+        for i, segment in enumerate(result["segments"], 1):
+            start_time = format_time(segment["start"])
+            end_time = format_time(segment["end"])
+            text = segment["text"].strip()
+            if text:
+                srt_content.append(f"{i}\n{start_time} --> {end_time}\n{text}\n")
 
+    return "\n".join(srt_content)
+
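For reference, each list entry built above is one complete SRT block, so joining them with blank lines yields a valid file. A minimal sketch of the expected output, assuming the app's `format_time` helper and a hand-made one-segment result dict:

```python
# Hand-made, Whisper-shaped result dict (values invented for illustration).
result = {"segments": [{"start": 0.0, "end": 2.5, "text": " Hello world."}]}

print(generate_srt_from_result(result, "Segment-level"))
# 1
# 00:00:00,000 --> 00:00:02,500
# Hello world.
```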
+# --- New Function for Advanced Mode ---
+
+def process_advanced_segments(full_result, max_words):
+    """
+    Post-processes segments for Word-level Advanced mode.
+    Groups words into new segments with <= max_words per segment, splitting at the nearest punctuation.
+    Adjusts timestamps based on actual word times (or proportionally if word times are missing).
+    Optimized: single pass with limited lookahead.
+    """
+    # Punctuation that marks a natural split point (checked with endswith,
+    # so the multi-character "--" also matches and empty words are safe)
+    punctuation = ('.', '!', '?', ';', ',', '--')
 
+    # Flatten all words into a single list for continuous processing
+    all_words = []
+    for segment in full_result["segments"]:
+        all_words.extend(segment.get("words", []))
 
+    if not all_words:
+        return full_result  # Nothing to process
 
+    new_segments = []
+    current_words = []
+    i = 0
+    while i < len(all_words):
+        current_words.append(all_words[i])
 
+        if len(current_words) >= max_words:
+            # Find the nearest punctuation for a split
+            split_index = -1
 
+            # Look backward in the current words for the last punctuation
+            for j in range(len(current_words) - 1, -1, -1):
+                word_text = current_words[j]["word"].strip()
+                if word_text.endswith(punctuation):
+                    split_index = j + 1  # Split after this word
+                    break
+
+            # If none, look forward in the next words (limited lookahead to keep this cheap)
+            if split_index == -1:
+                lookahead_end = min(i + 1 + 10, len(all_words))  # Cap lookahead for efficiency
+                for j in range(i + 1, lookahead_end):
+                    word_text = all_words[j]["word"].strip()
+                    current_words.append(all_words[j])  # Temporarily add to current
+                    i += 1  # Advance i as we add
+                    if word_text.endswith(punctuation):
+                        split_index = len(current_words)  # Split after this added word
+                        break
+
+            # Fallback: split at max_words if no punctuation was found
+            if split_index == -1:
+                split_index = max_words
+
+            # Create a new segment for the current group up to the split
+            group_words = current_words[:split_index]
+            if group_words:
+                text = " ".join(w["word"].strip() for w in group_words)
+                start = group_words[0]["start"]
+                end = group_words[-1]["end"]
+                new_segments.append({"start": start, "end": end, "text": text, "words": group_words})
+
+            # Remaining words become the start of the next group (their timestamps shift with them)
+            current_words = current_words[split_index:]
 
+        i += 1
 
+    # Add any remaining words as the last segment
+    if current_words:
+        text = " ".join(w["word"].strip() for w in current_words)
+        start = current_words[0]["start"]
+        end = current_words[-1]["end"]
+        new_segments.append({"start": start, "end": end, "text": text, "words": current_words})
+
+    # Handle a rare case: if a segment has no word timestamps, fall back to a proportional split
+    for seg in new_segments:
+        if "words" not in seg or not seg["words"]:
+            # Proportional adjustment based on the word-count ratio
+            orig_start = seg["start"]
+            orig_end = seg["end"]
+            word_count = len(seg["text"].split())
+            if word_count > max_words:
+                ratio = max_words / word_count
+                split_time = orig_start + (orig_end - orig_start) * ratio
+                seg["end"] = split_time  # Truncate this segment at the split point
+                # The next segment would start at split_time (handled when rebuilding in the loop)
 
+    # Replace the original segments with the new ones
+    full_result["segments"] = new_segments
+    return full_result
+
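To make the grouping behaviour concrete, here is a small sketch; the word dicts imitate the shape of Whisper's `word_timestamps` output, and all timings are invented:

```python
# Hypothetical word timings to illustrate the grouping (not real output).
words = [{"word": f" w{k}", "start": float(k), "end": k + 0.9} for k in range(7)]
words[3]["word"] = " w3."  # punctuation after the 4th word

demo_result = {"segments": [{"start": 0.0, "end": 6.9, "words": words}]}
out = process_advanced_segments(demo_result, max_words=5)
for seg in out["segments"]:
    print(round(seg["start"], 1), round(seg["end"], 1), seg["text"])
# 0.0 3.9 w0 w1 w2 w3.
# 4.0 6.9 w4 w5 w6
```

Note that the first line breaks at the punctuation after the fourth word rather than at the five-word cap.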
+# --- Main Transcription Logic ---
+
+def transcribe_video(video_path, model_name, transcription_mode, chunk_length_min, max_words):
+    """
+    Transcribes a video file by extracting audio, chunking it, processing the chunks,
+    and generating a full SRT file with corrected timestamps.
+    """
     if video_path is None:
         return "Please upload a video file first.", None
 
+    yield "Loading model...", None  # Update status for the user
+
+    # Load the Whisper model (the weights are cached on disk after the first download).
     try:
         model = whisper.load_model(model_name)
     except Exception as e:
+        yield f"Error loading model: {e}", None
+        return
 
+    yield f"Model '{model_name}' loaded. Extracting audio...", None
+
+    # Use a temporary directory for all our files
     with tempfile.TemporaryDirectory() as temp_dir:
+        audio_path = os.path.join(temp_dir, "extracted_audio.wav")
+
+        # Extract audio from the video using pydub
         try:
+            video = AudioSegment.from_file(video_path)
+            # Export as WAV, 16 kHz, mono - ideal for Whisper
+            video.set_channels(1).set_frame_rate(16000).export(audio_path, format="wav")
+            audio = AudioSegment.from_wav(audio_path)
         except Exception as e:
+            yield f"Error processing video/audio: {e}", None
+            return
+
+        # --- Chunking Logic ---
+        chunk_length_ms = chunk_length_min * 60 * 1000
+        num_chunks = math.ceil(len(audio) / chunk_length_ms)
+
+        full_result = {"segments": []}
+
+        yield f"Audio extracted. Splitting into {num_chunks} chunk(s) of {chunk_length_min} min...", None
 
         for i in range(num_chunks):
+            start_ms = i * chunk_length_ms
+            end_ms = start_ms + chunk_length_ms
+            chunk = audio[start_ms:end_ms]
 
             chunk_path = os.path.join(temp_dir, f"chunk_{i}.wav")
+            chunk.export(chunk_path, format="wav")
+
+            yield f"Transcribing chunk {i+1}/{num_chunks}...", None
+
+            # Determine whether word-level timestamps are needed
+            should_get_word_timestamps = (transcription_mode in ["Word-level", "Word-level Advanced"])
 
+            # Transcribe the chunk
             try:
                 result = model.transcribe(
                     chunk_path,
                     word_timestamps=should_get_word_timestamps,
+                    fp16=False  # Set to False for CPU-only inference
                 )
             except Exception as e:
+                # Clean up and report the error
                 del model
                 gc.collect()
+                yield f"Error during transcription of chunk {i+1}: {e}", None
+                return
+
+            # --- Timestamp Correction ---
+            # Add the chunk's start time to all timestamps in the result
+            time_offset_s = start_ms / 1000.0
+
+            for segment in result["segments"]:
+                segment["start"] += time_offset_s
+                segment["end"] += time_offset_s
+
+                if "words" in segment:
+                    for word_info in segment["words"]:
+                        word_info["start"] += time_offset_s
+                        word_info["end"] += time_offset_s
+
+                full_result["segments"].append(segment)
 
+            # Clean up the chunk file immediately
+            os.remove(chunk_path)
+
+        # Clean up the model from memory to be safe
         del model
         gc.collect()
 
+        # --- Process for Advanced Mode ---
+        if transcription_mode == "Word-level Advanced":
+            yield "Processing advanced word-level grouping...", None
+            full_result = process_advanced_segments(full_result, max_words)
+
+        yield "All chunks transcribed. Generating SRT file...", None
+
+        # Generate the final SRT from the combined results.
+        # For Advanced mode, force segment-level generation (the words are already grouped into lines).
+        srt_mode = "Segment-level" if transcription_mode == "Word-level Advanced" else transcription_mode
+        srt_output = generate_srt_from_result(full_result, srt_mode)
+
+        # Write the final SRT file in the temp directory to be returned by Gradio
+        srt_file_path = os.path.join(temp_dir, "output.srt")
+        with open(srt_file_path, "w", encoding="utf-8") as srt_file:
+            srt_file.write(srt_output)
+
+        yield "Done!", srt_file_path
 
+# --- Gradio UI ---
 
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown(
+        """
+        # Whisper Video Transcriber 🎥 -> 📝
+        Upload a video, choose your settings, and get a timed SRT subtitle file.
+        This app handles large videos by automatically splitting them into manageable chunks.
+        """
+    )
+    with gr.Row():
+        with gr.Column():
+            video_input = gr.Video(label="Upload Video")
 
+            model_name = gr.Radio(
+                ["tiny.en", "base.en"],
+                label="Whisper Model",
+                value="base.en",
+                info="`tiny.en` is faster, `base.en` is more accurate."
+            )
 
+            transcription_mode = gr.Radio(
+                ["Segment-level", "Word-level", "Word-level Advanced"],
+                label="Transcription Granularity",
+                value="Segment-level",
+                info="Word-level is more detailed but may be slightly slower. Word-level Advanced groups words into lines with a maximum word count, splitting at punctuation."
+            )
 
+            chunk_length_min = gr.Slider(
+                minimum=5,
+                maximum=20,
+                value=10,
+                step=1,
+                label="Chunk Length (minutes)",
+                info="Shorter chunks use less RAM but may be slightly less accurate at chunk boundaries."
+            )
 
+            max_words = gr.Slider(
+                minimum=5,
+                maximum=30,
+                value=10,
+                step=1,
+                label="Max Words per Line (Advanced mode only)",
+                info="For Word-level Advanced: limits words per subtitle line, splitting intelligently at punctuation."
+            )
 
+            submit_button = gr.Button("Transcribe Video", variant="primary")
+
+        with gr.Column():
+            status_output = gr.Textbox(label="Status", interactive=False, lines=5)
+            srt_output_file = gr.File(label="Download SRT File")
 
+    submit_button.click(
+        fn=transcribe_video,
+        inputs=[video_input, model_name, transcription_mode, chunk_length_min, max_words],
+        outputs=[status_output, srt_output_file]
+    )
 
+    gr.Markdown(
+        """
+        ### How to Use
+        1. **Upload a video file.**
+        2. **Select a Whisper model.** For English, `base.en` provides a great balance of speed and accuracy.
+        3. **Choose the granularity.** 'Segment-level' is good for standard subtitles. 'Word-level' is great for karaoke-style highlighting. 'Word-level Advanced' groups words into optimized subtitle lines.
+        4. **Click 'Transcribe Video'.** The status box will show the progress.
+        5. **Download the SRT file** when the process is complete. You can open it in any text editor or load it into a video player like VLC.
+        """
+    )
 
 if __name__ == "__main__":
+    demo.launch(debug=True)
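For local testing, the `transcribe_video` generator can also be driven without the UI. A hypothetical smoke test (`sample.mp4` is a stand-in file name; the first run downloads the model weights):

```python
# Hypothetical smoke test: stream status updates from the generator directly.
for status, srt_path in transcribe_video("sample.mp4", "tiny.en", "Segment-level", 10, 10):
    print(status, srt_path)  # srt_path stays None until the final "Done!" update
```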