Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -5,13 +5,15 @@ import tempfile
|
|
5 |
from pydub import AudioSegment
|
6 |
import math
|
7 |
import gc # Garbage Collector interface
|
8 |
-
import requests
|
9 |
-
import zipfile
|
|
|
10 |
|
11 |
# --- Helper Functions ---
|
12 |
|
13 |
def format_time(seconds):
|
14 |
-
"""Converts seconds to SRT
|
|
|
15 |
hours = int(seconds / 3600)
|
16 |
minutes = int((seconds % 3600) / 60)
|
17 |
secs = int(seconds % 60)
|
@@ -52,13 +54,16 @@ def process_advanced_segments(full_result, max_words):
|
|
52 |
Adjusts timestamps based on actual word times (or proportional if needed).
|
53 |
Optimized: Single pass with limited lookahead.
|
54 |
"""
|
|
|
55 |
punctuation = {'.', '!', '?', ';', ',', '--'}
|
|
|
|
|
56 |
all_words = []
|
57 |
for segment in full_result["segments"]:
|
58 |
all_words.extend(segment.get("words", []))
|
59 |
|
60 |
if not all_words:
|
61 |
-
return full_result
|
62 |
|
63 |
new_segments = []
|
64 |
current_words = []
|
@@ -67,27 +72,32 @@ def process_advanced_segments(full_result, max_words):
|
|
67 |
current_words.append(all_words[i])
|
68 |
|
69 |
if len(current_words) >= max_words:
|
|
|
70 |
split_index = -1
|
71 |
|
|
|
72 |
for j in range(len(current_words) - 1, -1, -1):
|
73 |
word_text = current_words[j]["word"].strip()
|
74 |
-
if word_text
|
75 |
-
split_index = j + 1
|
76 |
break
|
77 |
|
|
|
78 |
if split_index == -1:
|
79 |
-
lookahead_end = min(i + 1 + 10, len(all_words))
|
80 |
for j in range(i + 1, lookahead_end):
|
81 |
word_text = all_words[j]["word"].strip()
|
82 |
-
current_words.append(all_words[j])
|
83 |
-
i += 1
|
84 |
-
if word_text
|
85 |
-
split_index = len(current_words)
|
86 |
break
|
87 |
|
|
|
88 |
if split_index == -1:
|
89 |
split_index = max_words
|
90 |
|
|
|
91 |
group_words = current_words[:split_index]
|
92 |
if group_words:
|
93 |
text = " ".join(w["word"].strip() for w in group_words)
|
@@ -95,129 +105,691 @@ def process_advanced_segments(full_result, max_words):
|
|
95 |
end = group_words[-1]["end"]
|
96 |
new_segments.append({"start": start, "end": end, "text": text, "words": group_words})
|
97 |
|
|
|
98 |
current_words = current_words[split_index:]
|
99 |
|
100 |
i += 1
|
101 |
|
|
|
102 |
if current_words:
|
103 |
text = " ".join(w["word"].strip() for w in current_words)
|
104 |
start = current_words[0]["start"]
|
105 |
end = current_words[-1]["end"]
|
106 |
new_segments.append({"start": start, "end": end, "text": text, "words": current_words})
|
107 |
|
|
|
108 |
for seg in new_segments:
|
109 |
if "words" not in seg or not seg["words"]:
|
|
|
110 |
orig_start = seg["start"]
|
111 |
orig_end = seg["end"]
|
112 |
word_count = len(seg["text"].split())
|
113 |
if word_count > max_words:
|
114 |
ratio = max_words / word_count
|
115 |
split_time = orig_start + (orig_end - orig_start) * ratio
|
116 |
-
seg["end"] = split_time
|
|
|
117 |
|
|
|
118 |
full_result["segments"] = new_segments
|
119 |
return full_result
|
120 |
|
121 |
-
# ---
|
122 |
-
|
|
|
123 |
"""
|
124 |
-
|
125 |
-
|
|
|
126 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
127 |
try:
|
128 |
-
|
129 |
-
file_id = url.split('/d/')[1].split('/')[0]
|
130 |
-
download_url = f'https://drive.google.com/uc?export=download&id={file_id}'
|
131 |
-
|
132 |
-
zip_path = os.path.join(target_dir, "downloaded.zip")
|
133 |
-
|
134 |
-
with requests.get(download_url, stream=True) as r:
|
135 |
-
r.raise_for_status()
|
136 |
-
with open(zip_path, 'wb') as f:
|
137 |
-
for chunk in r.iter_content(chunk_size=8192):
|
138 |
-
f.write(chunk)
|
139 |
-
return zip_path
|
140 |
except Exception as e:
|
141 |
-
|
142 |
-
|
143 |
|
144 |
-
|
145 |
|
146 |
-
|
147 |
-
"""
|
148 |
-
Transcribes a video file by either direct upload or from a Google Drive ZIP link.
|
149 |
-
It extracts audio, chunks it, processes chunks, and generates a full SRT file.
|
150 |
-
"""
|
151 |
-
# Determine the source of the video
|
152 |
-
source_path = None
|
153 |
-
|
154 |
-
# Use a single temp directory for all operations
|
155 |
with tempfile.TemporaryDirectory() as temp_dir:
|
156 |
-
|
157 |
-
if gdrive_url:
|
158 |
-
yield "Input is a Google Drive URL. Starting download...", None
|
159 |
try:
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
176 |
for file in files:
|
177 |
-
|
178 |
-
|
179 |
-
found_video = os.path.join(root, file)
|
180 |
-
break
|
181 |
-
|
182 |
-
if not found_video:
|
183 |
-
return "Error: No video file found in the provided ZIP archive.", None
|
184 |
-
|
185 |
-
source_path = found_video
|
186 |
-
yield f"Video file found: {os.path.basename(source_path)}. Proceeding with transcription...", None
|
187 |
|
188 |
-
|
189 |
-
|
|
|
|
|
190 |
|
191 |
-
|
192 |
-
|
193 |
-
source_path = video_path
|
194 |
-
yield "Input is a direct upload. Proceeding with transcription...", None
|
195 |
|
196 |
-
|
197 |
-
|
198 |
-
return "Please upload a video file or provide a Google Drive ZIP link.", None
|
199 |
|
200 |
-
#
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
try:
|
205 |
-
|
|
|
|
|
|
|
206 |
except Exception as e:
|
207 |
-
return f"Error
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
208 |
|
209 |
yield f"Model '{model_name}' loaded. Extracting audio...", None
|
210 |
|
211 |
audio_path = os.path.join(temp_dir, "extracted_audio.wav")
|
|
|
|
|
212 |
try:
|
213 |
-
video = AudioSegment.from_file(
|
|
|
214 |
video.set_channels(1).set_frame_rate(16000).export(audio_path, format="wav")
|
215 |
audio = AudioSegment.from_wav(audio_path)
|
216 |
except Exception as e:
|
217 |
return f"Error processing video/audio: {e}", None
|
218 |
|
|
|
219 |
chunk_length_ms = chunk_length_min * 60 * 1000
|
220 |
num_chunks = math.ceil(len(audio) / chunk_length_ms)
|
|
|
221 |
full_result = {"segments": []}
|
222 |
|
223 |
yield f"Audio extracted. Splitting into {num_chunks} chunk(s) of {chunk_length_min} min...", None
|
@@ -231,75 +803,81 @@ def transcribe_video(video_path, gdrive_url, model_name, transcription_mode, chu
|
|
231 |
chunk.export(chunk_path, format="wav")
|
232 |
|
233 |
yield f"Transcribing chunk {i+1}/{num_chunks}...", None
|
234 |
-
should_get_word_timestamps = (transcription_mode in ["Word-level", "Word-level Advanced"])
|
235 |
|
|
|
|
|
|
|
|
|
236 |
try:
|
237 |
result = model.transcribe(
|
238 |
chunk_path,
|
239 |
word_timestamps=should_get_word_timestamps,
|
240 |
-
fp16=False
|
241 |
)
|
242 |
except Exception as e:
|
|
|
243 |
del model
|
244 |
gc.collect()
|
245 |
return f"Error during transcription of chunk {i+1}: {e}", None
|
246 |
|
|
|
|
|
|
|
247 |
time_offset_s = start_ms / 1000.0
|
|
|
248 |
for segment in result["segments"]:
|
249 |
segment["start"] += time_offset_s
|
250 |
segment["end"] += time_offset_s
|
|
|
251 |
if "words" in segment:
|
252 |
for word_info in segment["words"]:
|
253 |
word_info["start"] += time_offset_s
|
254 |
word_info["end"] += time_offset_s
|
|
|
255 |
full_result["segments"].append(segment)
|
256 |
|
|
|
257 |
os.remove(chunk_path)
|
258 |
|
|
|
259 |
del model
|
260 |
gc.collect()
|
261 |
|
|
|
262 |
if transcription_mode == "Word-level Advanced":
|
263 |
yield "Processing advanced word-level grouping...", None
|
264 |
full_result = process_advanced_segments(full_result, max_words)
|
265 |
|
266 |
yield "All chunks transcribed. Generating SRT file...", None
|
|
|
|
|
|
|
267 |
srt_mode = "segment" if transcription_mode == "Word-level Advanced" else transcription_mode
|
268 |
srt_output = generate_srt_from_result(full_result, srt_mode)
|
269 |
|
|
|
270 |
srt_file_path = os.path.join(temp_dir, "output.srt")
|
271 |
with open(srt_file_path, "w", encoding="utf-8") as srt_file:
|
272 |
srt_file.write(srt_output)
|
273 |
|
274 |
-
|
275 |
-
# Gradio handles the final temp file cleanup
|
276 |
-
return "Done!", srt_file_path
|
277 |
|
278 |
|
279 |
-
# ---
|
280 |
|
281 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
282 |
gr.Markdown(
|
283 |
"""
|
284 |
# Whisper Video Transcriber 🎥 -> 📝
|
285 |
-
Upload a video,
|
286 |
-
|
287 |
"""
|
288 |
)
|
289 |
with gr.Row():
|
290 |
-
with gr.Column(
|
291 |
-
|
292 |
-
|
293 |
-
with gr.TabItem("Upload Video"):
|
294 |
-
video_input = gr.Video(label="Upload a Video File")
|
295 |
-
with gr.TabItem("Google Drive Link"):
|
296 |
-
gdrive_url_input = gr.Textbox(
|
297 |
-
label="Public Google Drive ZIP File URL",
|
298 |
-
placeholder="e.g., https://drive.google.com/file/d/1a2b3c.../view?usp=sharing",
|
299 |
-
info="Paste the public share link to a ZIP file containing your video."
|
300 |
-
)
|
301 |
|
302 |
-
gr.Markdown("### Transcription Settings")
|
303 |
model_name = gr.Radio(
|
304 |
["tiny.en", "base.en"],
|
305 |
label="Whisper Model",
|
@@ -308,10 +886,10 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
308 |
)
|
309 |
|
310 |
transcription_mode = gr.Radio(
|
311 |
-
["Segment-level", "Word-level", "Word-level Advanced"],
|
312 |
label="Transcription Granularity",
|
313 |
value="Segment-level",
|
314 |
-
info="Word-level is more detailed. Word-level Advanced groups words
|
315 |
)
|
316 |
|
317 |
chunk_length_min = gr.Slider(
|
@@ -323,37 +901,35 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
323 |
info="Shorter chunks use less RAM but may be slightly less accurate at boundaries."
|
324 |
)
|
325 |
|
326 |
-
max_words = gr.Slider(
|
327 |
minimum=5,
|
328 |
maximum=30,
|
329 |
value=10,
|
330 |
step=1,
|
331 |
label="Max Words per Line (Advanced mode only)",
|
332 |
-
info="For Word-level Advanced: Limits words per subtitle line, splitting intelligently."
|
333 |
)
|
334 |
|
335 |
-
submit_button = gr.Button("Transcribe", variant="primary")
|
336 |
|
337 |
-
with gr.Column(
|
338 |
-
status_output = gr.Textbox(label="Status", interactive=False, lines=
|
339 |
srt_output_file = gr.File(label="Download SRT File")
|
340 |
|
341 |
-
# The click function now takes both video_input and gdrive_url_input
|
342 |
submit_button.click(
|
343 |
fn=transcribe_video,
|
344 |
-
inputs=[video_input,
|
345 |
outputs=[status_output, srt_output_file]
|
346 |
)
|
347 |
|
348 |
gr.Markdown(
|
349 |
"""
|
350 |
### How to Use
|
351 |
-
1. **
|
352 |
-
|
353 |
-
|
354 |
-
|
355 |
-
|
356 |
-
4. **Download the SRT file** when the process is complete.
|
357 |
"""
|
358 |
)
|
359 |
|
|
|
5 |
from pydub import AudioSegment
|
6 |
import math
|
7 |
import gc # Garbage Collector interface
|
8 |
+
import requests
|
9 |
+
import zipfile
|
10 |
+
import re
|
11 |
|
12 |
# --- Helper Functions ---
|
13 |
|
14 |
def format_time(seconds):
|
15 |
+
"""Converts seconds to SRT- Adding drive_link as an optional parameter to transcribe_video function helps manage inputs.
|
16 |
+
time format (HH:MM:SS,ms)"""
|
17 |
hours = int(seconds / 3600)
|
18 |
minutes = int((seconds % 3600) / 60)
|
19 |
secs = int(seconds % 60)
|
|
|
54 |
Adjusts timestamps based on actual word times (or proportional if needed).
|
55 |
Optimized: Single pass with limited lookahead.
|
56 |
"""
|
57 |
+
# Define punctuation for natural splits
|
58 |
punctuation = {'.', '!', '?', ';', ',', '--'}
|
59 |
+
|
60 |
+
# Flatten all words into a single list for continuous processing
|
61 |
all_words = []
|
62 |
for segment in full_result["segments"]:
|
63 |
all_words.extend(segment.get("words", []))
|
64 |
|
65 |
if not all_words:
|
66 |
+
return full_result # Nothing to process
|
67 |
|
68 |
new_segments = []
|
69 |
current_words = []
|
|
|
72 |
current_words.append(all_words[i])
|
73 |
|
74 |
if len(current_words) >= max_words:
|
75 |
+
# Find nearest punctuation for split
|
76 |
split_index = -1
|
77 |
|
78 |
+
# Look backward in current words for last punctuation
|
79 |
for j in range(len(current_words) - 1, -1, -1):
|
80 |
word_text = current_words[j]["word"].strip()
|
81 |
+
if word_text[-1] in punctuation:
|
82 |
+
split_index = j + 1 # Split after this word
|
83 |
break
|
84 |
|
85 |
+
# If none, look forward in next words (limited lookahead to optimize)
|
86 |
if split_index == -1:
|
87 |
+
lookahead_end = min(i + 1 + 10, len(all_words)) # Cap lookahead for efficiency
|
88 |
for j in range(i + 1, lookahead_end):
|
89 |
word_text = all_words[j]["word"].strip()
|
90 |
+
current_words.append(all_words[j]) # Temporarily add to current
|
91 |
+
i += 1 # Advance i as we add
|
92 |
+
if word_text[-1] in punctuation:
|
93 |
+
split_index = len(current_words) # Split after this added word
|
94 |
break
|
95 |
|
96 |
+
# Fallback: Split at max_words if no punctuation found
|
97 |
if split_index == -1:
|
98 |
split_index = max_words
|
99 |
|
100 |
+
# Create new segment for current group up to split
|
101 |
group_words = current_words[:split_index]
|
102 |
if group_words:
|
103 |
text = " ".join(w["word"].strip() for w in group_words)
|
|
|
105 |
end = group_words[-1]["end"]
|
106 |
new_segments.append({"start": start, "end": end, "text": text, "words": group_words})
|
107 |
|
108 |
+
# Remaining words become start of next group (timestamp adjustment: shifted to next)
|
109 |
current_words = current_words[split_index:]
|
110 |
|
111 |
i += 1
|
112 |
|
113 |
+
# Add any remaining words as last segment
|
114 |
if current_words:
|
115 |
text = " ".join(w["word"].strip() for w in current_words)
|
116 |
start = current_words[0]["start"]
|
117 |
end = current_words[-1]["end"]
|
118 |
new_segments.append({"start": start, "end": end, "text": text, "words": current_words})
|
119 |
|
120 |
+
# Handle rare case: If no word timestamps, fall back to proportional adjustment
|
121 |
for seg in new_segments:
|
122 |
if "words" not in seg or not seg["words"]:
|
123 |
+
# Proportional split (as per your description: adjust based on word count ratio)
|
124 |
orig_start = seg["start"]
|
125 |
orig_end = seg["end"]
|
126 |
word_count = len(seg["text"].split())
|
127 |
if word_count > max_words:
|
128 |
ratio = max_words / word_count
|
129 |
split_time = orig_start + (orig_end - orig_start) * ratio
|
130 |
+
seg["end"] = split_time # Minus from current
|
131 |
+
# Next segment would start at split_time (but since we're rebuilding, it's handled in loop)
|
132 |
|
133 |
+
# Replace original segments with new ones
|
134 |
full_result["segments"] = new_segments
|
135 |
return full_result
|
136 |
|
137 |
+
# --- Main Transcription Logic ---
|
138 |
+
|
139 |
+
def transcribe_video(video_path, model_name, transcription_mode, chunk_length_min, max_words, drive_link): # Added drive_link
|
140 |
"""
|
141 |
+
Transcribes a video file by extracting audio, chunking it, processing chunks,
|
142 |
+
and generating a full SRT file with corrected timestamps.
|
143 |
+
Supports either uploaded video or Google Drive public zip link containing the video.
|
144 |
"""
|
145 |
+
if video_path is None and not drive_link:
|
146 |
+
return "Please upload a video file or provide a Google Drive link.", None
|
147 |
+
|
148 |
+
yield "Loading model...", None # Update status for the user
|
149 |
+
|
150 |
+
# Load the Whisper model. This is cached by Gradio for subsequent calls.
|
151 |
+
# Note: On a Hugging Face Space, the model is loaded once when the app starts.
|
152 |
try:
|
153 |
+
model = whisper.load_model(model_name)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
154 |
except Exception as e:
|
155 |
+
return f"Error loading model: {e}", None
|
|
|
156 |
|
157 |
+
yield f"Model '{model_name}' loaded. Extracting audio...", None
|
158 |
|
159 |
+
# Use a temporary directory for all our files
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
160 |
with tempfile.TemporaryDirectory() as temp_dir:
|
161 |
+
if drive_link:
|
|
|
|
|
162 |
try:
|
163 |
+
yield "Parsing Google Drive link...", None
|
164 |
+
# Extract file ID from Google Drive link
|
165 |
+
match = re.search(r'/d/([a-zA-Z0-9_-]+)', drive_link)
|
166 |
+
if not match:
|
167 |
+
return "Invalid Google Drive link format.", None
|
168 |
+
file_id = match.group(1)
|
169 |
+
download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
|
170 |
+
|
171 |
+
yield "Downloading zip from Google Drive...", None
|
172 |
+
zip_path = os.path.join(temp_dir, "downloaded.zip")
|
173 |
+
response = requests.get(download_url, stream=True)
|
174 |
+
if response.status_code != 200:
|
175 |
+
return f"Download failed with status {response.status_code}. Ensure the link is public.", None
|
176 |
+
|
177 |
+
# Handle large file confirmation if needed
|
178 |
+
if "confirm" in response.text:
|
179 |
+
confirm_token = re.search(r'confirm=([0-9A-Za-z\_-]+)', response.text)
|
180 |
+
if confirm_token:
|
181 |
+
confirm_token = confirm_token.group(1)
|
182 |
+
download_url = f"https://drive.google.com/uc?export=download&confirm={confirm_token}&id={file_id}"
|
183 |
+
response = requests.get(download_url, stream=True)
|
184 |
+
|
185 |
+
with open(zip_path, 'wb') as f:
|
186 |
+
for chunk in response.iter_content(chunk_size=8192):
|
187 |
+
f.write(chunk)
|
188 |
+
|
189 |
+
yield "Unzipping file...", None
|
190 |
+
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
|
191 |
+
zip_ref.extractall(temp_dir)
|
192 |
+
|
193 |
+
# Find the video file in the extracted contents (search recursively)
|
194 |
+
video_files = []
|
195 |
+
for root, _, files in os.walk(temp_dir):
|
196 |
for file in files:
|
197 |
+
if file.lower().endswith(('.mp4', '.avi', '.mkv', '.mov', '.wmv')):
|
198 |
+
video_files.append(os.path.join(root, file))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
199 |
|
200 |
+
if not video_files:
|
201 |
+
return "No video file found in the zip archive.", None
|
202 |
+
elif len(video_files) > 1:
|
203 |
+
return "Multiple video files found in the zip; only one is supported.", None
|
204 |
|
205 |
+
video_path = video_files[0]
|
206 |
+
yield f"Video extracted: {os.path.basename(video_path)}. Proceeding with transcription...", None
|
|
|
|
|
207 |
|
208 |
+
except Exception as e:
|
209 |
+
return f"Error handling Google Drive zip: {str(e)}", None
|
|
|
210 |
|
211 |
+
# Now proceed with audio extraction from video_path (either uploaded or extracted)
|
212 |
+
audio_path = os.path.join(temp_dir, "extracted_audio.wav")
|
213 |
+
|
214 |
+
# Extract audio from video using pydub
|
215 |
try:
|
216 |
+
video = AudioSegment.from_file(video_path)
|
217 |
+
# Export as WAV, 16kHz, mono - ideal for Whisper
|
218 |
+
video.set_channels(1).set_frame_rate(16000).export(audio_path, format="wav")
|
219 |
+
audio = AudioSegment.from_wav(audio_path)
|
220 |
except Exception as e:
|
221 |
+
return f"Error processing video/audio: {e}", None
|
222 |
+
|
223 |
+
# --- Chunking Logic ---
|
224 |
+
chunk_length_ms = chunk_length_min * 60 * 1000
|
225 |
+
num_chunks = math.ceil(len(audio) / chunk_length_ms)
|
226 |
+
|
227 |
+
full_result = {"segments": []}
|
228 |
+
|
229 |
+
yield f"Audio extracted. Splitting into {num_chunks} chunk(s) of {chunk_length_min} min...", None
|
230 |
+
|
231 |
+
for i in range(num_chunks):
|
232 |
+
start_ms = i * chunk_length_ms
|
233 |
+
end_ms = start_ms + chunk_length_ms
|
234 |
+
chunk = audio[start_ms:end_ms]
|
235 |
+
|
236 |
+
chunk_path = os.path.join(temp_dir, f"chunk_{i}.wav")
|
237 |
+
chunk.export(chunk_path, format="wav")
|
238 |
+
|
239 |
+
yield f"Transcribing chunk {i+1}/{num_chunks}...", None
|
240 |
+
|
241 |
+
# Determine if word-level timestamps are needed
|
242 |
+
should_get_word_timestamps = (transcription_mode in ["word", "Word-level Advanced"]) # Updated for new mode
|
243 |
+
|
244 |
+
# Transcribe the chunk
|
245 |
+
try:
|
246 |
+
result = model.transcribe(
|
247 |
+
chunk_path,
|
248 |
+
word_timestamps=should_get_word_timestamps,
|
249 |
+
fp16=False # Set to False for CPU-only inference
|
250 |
+
)
|
251 |
+
except Exception as e:
|
252 |
+
# Clean up and report error
|
253 |
+
del model
|
254 |
+
gc.collect()
|
255 |
+
return f"Error during transcription of chunk {i+1}: {e}", None
|
256 |
+
|
257 |
+
|
258 |
+
# --- Timestamp Correction ---
|
259 |
+
# Add the chunk's start time to all timestamps in the result
|
260 |
+
time_offset_s = start_ms / 1000.0
|
261 |
+
|
262 |
+
for segment in result["segments"]:
|
263 |
+
segment["start"] += time_offset_s
|
264 |
+
segment["end"] += time_offset_s
|
265 |
+
|
266 |
+
if "words" in segment:
|
267 |
+
for word_info in segment["words"]:
|
268 |
+
word_info["start"] += time_offset_s
|
269 |
+
word_info["end"] += time_offset_s
|
270 |
+
The user wants to modify their code to support transcribing from a public Google Drive zip file link. The process should be: take the link, download the zip, unzip it, and then transcribe like a normal video.
|
271 |
+
|
272 |
+
The current code uses Gradio for a UI where you upload a video, choose settings, and transcribe to SRT.
|
273 |
+
|
274 |
+
I need to edit the essential parts to add this new feature. The rest stays the same.
|
275 |
+
|
276 |
+
First, add an input for the Google Drive link in the UI.
|
277 |
+
|
278 |
+
The input will be either a video upload or a Google Drive link.
|
279 |
+
|
280 |
+
The transcribe function needs to handle either a video path or a link.
|
281 |
+
|
282 |
+
If it's a link, download the zip, unzip, and assume it contains the video file, then proceed.
|
283 |
+
|
284 |
+
The user said "public link of google drive zip file", so it's a zip file on Google Drive.
|
285 |
+
|
286 |
+
To download from Google Drive public link in Python.
|
287 |
+
|
288 |
+
Google Drive links for direct download need to handle the confirmation if it's large, but for public links, we can construct the direct download URL.
|
289 |
+
|
290 |
+
For a Google Drive file shared publicly, the link is like https://drive.google.com/file/d/FILE_ID/view?usp=sharing
|
291 |
+
|
292 |
+
To direct download: https://drive.google.com/uc?export=download&id=FILE_ID
|
293 |
+
|
294 |
+
For large files, it might require confirmation, but for scripts, we can use requests to handle it.
|
295 |
+
|
296 |
+
I need to add code to download the zip using requests, save to temp, unzip using zipfile, and then find the video file inside.
|
297 |
+
|
298 |
+
Assume the zip contains one video file.
|
299 |
+
|
300 |
+
Then, use that video path in the transcription.
|
301 |
+
|
302 |
+
In the UI, add a textbox for the Google Drive link.
|
303 |
+
|
304 |
+
Make it optional, so if link is provided, use that, else use uploaded video.
|
305 |
+
|
306 |
+
The user said "give a public link of google drive zip file", so probably add a new input.
|
307 |
+
|
308 |
+
To keep it simple, add a textbox for the link, and in the function, check if link is provided, if yes, download and unzip, else use video_path.
|
309 |
+
|
310 |
+
But video_input is gr.Video, which provides the path.
|
311 |
+
|
312 |
+
Add the link input.
|
313 |
+
|
314 |
+
Modify the transcribe_video function to accept the link as well.
|
315 |
+
|
316 |
+
Inputs: add the link.
|
317 |
+
|
318 |
+
If link is not None, process the link.
|
319 |
+
|
320 |
+
For downloading, use requests.
|
321 |
+
|
322 |
+
Import requests and zipfile.
|
323 |
+
|
324 |
+
In the function, if google_drive_link:
|
325 |
+
|
326 |
+
# extract file id
|
327 |
+
|
328 |
+
import re
|
329 |
+
|
330 |
+
file_id = re.search(r'/d/([^/]+)', google_drive_link).group(1)
|
331 |
+
|
332 |
+
download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
|
333 |
+
|
334 |
+
# download
|
335 |
+
|
336 |
+
with requests.get(download_url, stream=True) as r:
|
337 |
+
|
338 |
+
r.raise_for_status()
|
339 |
+
|
340 |
+
zip_path = os.path.join(temp_dir, "downloaded.zip")
|
341 |
+
|
342 |
+
with open(zip_path, 'wb') as f:
|
343 |
+
|
344 |
+
for chunk in r.iter_content(chunk_size=8192):
|
345 |
+
|
346 |
+
f.write(chunk)
|
347 |
+
|
348 |
+
# unzip
|
349 |
+
|
350 |
+
import zipfile
|
351 |
+
|
352 |
+
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
|
353 |
+
|
354 |
+
zip_ref.extractall(temp_dir)
|
355 |
+
|
356 |
+
# find the video file, assume one .mp4 or something
|
357 |
+
|
358 |
+
video_files = [f for f in os.listdir(temp_dir) if f.endswith(('.mp4', '.avi', '.mkv', etc.))]
|
359 |
+
|
360 |
+
if not video_files:
|
361 |
+
|
362 |
+
error
|
363 |
+
|
364 |
+
video_path = os.path.join(temp_dir, video_files[0])
|
365 |
+
|
366 |
+
Then proceed with that video_path.
|
367 |
+
|
368 |
+
The temp_dir is already there.
|
369 |
+
|
370 |
+
For large files, Google Drive might return a virus scan page, so need to handle that.
|
371 |
+
|
372 |
+
For files that require confirmation, the initial GET returns a HTML with a confirm token.
|
373 |
+
|
374 |
+
To handle large files, need to:
|
375 |
+
|
376 |
+
- GET the download_url with params
|
377 |
+
|
378 |
+
- If content-type is not octet-stream, parse the cookie and confirm token, then POST or something.
|
379 |
+
|
380 |
+
But to keep it simple, assume small files or public without confirmation.
|
381 |
+
|
382 |
+
Since this is in Gradio, probably running on HF spaces, which might have limitations.
|
383 |
+
|
384 |
+
But anyway, I need to provide the code.
|
385 |
+
|
386 |
+
The user might expect a simple way.
|
387 |
+
|
388 |
+
Since it's Python, I can add the code.
|
389 |
+
|
390 |
+
Add imports: import requests, import zipfile, import re
|
391 |
+
|
392 |
+
In the UI, add google_drive_link = gr.Textbox(label="Or Google Drive Zip Link")
|
393 |
+
|
394 |
+
Then, in click, inputs add google_drive_link
|
395 |
+
|
396 |
+
In fn, def transcribe_video(video_path, model_name, ..., max_words, google_drive_link=None)
|
397 |
+
|
398 |
+
If google_drive_link:
|
399 |
+
|
400 |
+
yield "Downloading from Google Drive..."
|
401 |
+
|
402 |
+
try:
|
403 |
+
|
404 |
+
file_id = re.findall(r"\/d\/(.+?)\/", google_drive_link)[0]
|
405 |
+
|
406 |
+
# or better parsing
|
407 |
+
|
408 |
+
download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
|
409 |
+
|
410 |
+
response = requests.get(download_url, stream=True)
|
411 |
+
|
412 |
+
if "Content-Disposition" not in response.headers:
|
413 |
+
|
414 |
+
# Handle confirmation
|
415 |
+
|
416 |
+
# This is complicated, perhaps use gdown library, but can't install.
|
417 |
+
|
418 |
+
The tool says: Keep in mind you have no internet access. Therefore, you CANNOT install any additional packages via pip install
|
419 |
+
|
420 |
+
But in the code, it's the user's code that will run, so if they run it locally, they can, but assuming in HF space, can't.
|
421 |
+
|
422 |
+
The code is using whisper, which is installed.
|
423 |
+
|
424 |
+
For downloading from GDrive, to handle large files, often people use gdown, but can't install.
|
425 |
+
|
426 |
+
So, for simplicity, I'll implement a basic download, and note that for large files it might not work.
|
427 |
+
|
428 |
+
Use the browse_page tool? No, this is for the code.
|
429 |
+
|
430 |
+
I am Grok, I need to edit the code.
|
431 |
+
|
432 |
+
Let's implement a function to download from GDrive handling the confirmation.
|
433 |
+
|
434 |
+
I can search for a snippet.
|
435 |
+
|
436 |
+
Use web_search to find how to download from GDrive without gdown.
|
437 |
+
|
438 |
+
But perhaps it's overkill.
|
439 |
+
|
440 |
+
Let me think.
|
441 |
+
|
442 |
+
A common way:
|
443 |
+
|
444 |
+
def download_from_gdrive(url, dest_path):
|
445 |
+
|
446 |
+
import requests
|
447 |
+
|
448 |
+
def get_confirm_token(response):
|
449 |
+
|
450 |
+
for key, value in response.cookies.items():
|
451 |
+
|
452 |
+
if key.startswith('download_warning'):
|
453 |
+
|
454 |
+
return value
|
455 |
+
|
456 |
+
return None
|
457 |
+
|
458 |
+
URL = "https://drive.google.com/uc?export=download"
|
459 |
+
|
460 |
+
file_id = # extract
|
461 |
+
|
462 |
+
session = requests.Session()
|
463 |
+
|
464 |
+
response = session.get(URL, params={'id': file_id}, stream=True)
|
465 |
+
|
466 |
+
token = get_confirm_token(response)
|
467 |
+
|
468 |
+
if token:
|
469 |
+
|
470 |
+
params = {'id': file_id, 'confirm': token}
|
471 |
+
|
472 |
+
response = session.get(URL, params=params, stream=True)
|
473 |
+
|
474 |
+
# then save
|
475 |
+
|
476 |
+
Yes, this is a standard snippet.
|
477 |
+
|
478 |
+
I can add this.
|
479 |
+
|
480 |
+
In the code, add this function.
|
481 |
+
|
482 |
+
Now, for extracting file_id, from the link.
|
483 |
+
|
484 |
+
If link is https://drive.google.com/file/d/FILE_ID/view?usp=sharing
|
485 |
+
|
486 |
+
Use re.search(r'file/d/([a-zA-Z0-9_-]+)', link)
|
487 |
+
|
488 |
+
Yes.
|
489 |
+
|
490 |
+
For the zip, after download, unzip to temp_dir, then find files ending with video extensions.
|
491 |
+
|
492 |
+
What if zip contains multiple files? Assume it's the video zip, perhaps the video is the only one or the main one.
|
493 |
+
|
494 |
+
User said "zip file", probably contains the video.
|
495 |
+
|
496 |
+
To be safe, look for the largest file or something, but simple: find the first video file.
|
497 |
+
|
498 |
+
Video extensions: mp4, mkv, avi, mov, etc.
|
499 |
+
|
500 |
+
I can list some.
|
501 |
+
|
502 |
+
In the code, after unzip, video_path = None
|
503 |
+
|
504 |
+
for f in os.listdir(temp_dir):
|
505 |
+
|
506 |
+
if f.lower().endswith(('.mp4', '.mkv', '.avi', '.mov', '.wmv')):
|
507 |
+
|
508 |
+
video_path = os.path.join(temp_dir, f)
|
509 |
+
|
510 |
+
break
|
511 |
+
|
512 |
+
If not, error.
|
513 |
+
|
514 |
+
If video_path from upload, use that.
|
515 |
+
|
516 |
+
If video_path is None and not google_drive_link, error.
|
517 |
+
|
518 |
+
But in current, video_path is required, but in Gradio, if not uploaded, it's None.
|
519 |
+
|
520 |
+
Yes.
|
521 |
+
|
522 |
+
Also, since it's generator, yield "Unzipping..."
|
523 |
+
|
524 |
+
Yes.
|
525 |
+
|
526 |
+
For the audio extraction, same.
|
527 |
+
|
528 |
+
The temp_dir is with tempfile.TemporaryDirectory(), so all good.
|
529 |
+
|
530 |
+
Add the imports at top: import requests, import re, import zipfile
|
531 |
+
|
532 |
+
Add the download function.
|
533 |
+
|
534 |
+
Since it's stateful? No.
|
535 |
+
|
536 |
+
Ok.
|
537 |
+
|
538 |
+
In the transcribe_video, it's a generator, yield status, None
|
539 |
+
|
540 |
+
Yes.
|
541 |
+
|
542 |
+
The outputs are status_output which is Textbox, but since it's generator, Gradio handles streaming?
|
543 |
+
|
544 |
+
In current code, it's yield "message", None
|
545 |
+
|
546 |
+
But for outputs=[status, file], and file is updated at end.
|
547 |
+
|
548 |
+
In Gradio, for streaming, fn should be generator.
|
549 |
+
|
550 |
+
Yes.
|
551 |
+
|
552 |
+
Ok.
|
553 |
+
|
554 |
+
In the UI, add the input.
|
555 |
+
|
556 |
+
And in how to use, add option for link.
|
557 |
+
|
558 |
+
But user said "edit essential part", rest same.
|
559 |
+
|
560 |
+
But to make it work, need to add the input.
|
561 |
+
|
562 |
+
Yes.
|
563 |
+
|
564 |
+
I think that's it.```python
|
565 |
+
import gradio as gr
|
566 |
+
import whisper
|
567 |
+
import os
|
568 |
+
import tempfile
|
569 |
+
from pydub import AudioSegment
|
570 |
+
import math
|
571 |
+
import gc # Garbage Collector interface
|
572 |
+
import requests
|
573 |
+
import re
|
574 |
+
import zipfile
|
575 |
+
|
576 |
+
# --- Helper Functions ---
|
577 |
+
|
578 |
+
def format_time(seconds):
    """Convert a duration in seconds to the SRT timestamp format HH:MM:SS,mmm."""
    whole_seconds = int(seconds)
    milliseconds = int((seconds - whole_seconds) * 1000)
    minutes, secs = divmod(whole_seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{milliseconds:03d}"
|
585 |
+
|
586 |
+
def generate_srt_from_result(result, transcription_mode):
    """Generates SRT content from Whisper's result dictionary.

    Args:
        result: Whisper result dict with a "segments" list; for word-level
            output each segment must carry a "words" list with per-word
            "start"/"end" timestamps.
        transcription_mode: Any string starting with "word" (case-insensitive,
            e.g. "word" or the UI label "Word-level") produces one SRT entry
            per word; anything else produces one entry per segment.

    Returns:
        The complete SRT document as a single string.
    """
    srt_content = []

    # BUGFIX: the original compared against the exact string "word", but the
    # caller passes the UI label "Word-level", so word-level SRT was never
    # generated. Match any mode whose name starts with "word" instead.
    if str(transcription_mode).lower().startswith("word"):
        # Word-level SRT generation: one numbered entry per word.
        entry_index = 1
        for segment in result["segments"]:
            for word_info in segment.get("words", []):
                start_time = format_time(word_info["start"])
                end_time = format_time(word_info["end"])
                text = word_info["word"].strip()
                if text:  # Ensure we don't add empty entries
                    srt_content.append(f"{entry_index}\n{start_time} --> {end_time}\n{text}\n")
                    entry_index += 1
    else:  # Default to segment-level
        for i, segment in enumerate(result["segments"], 1):
            start_time = format_time(segment["start"])
            end_time = format_time(segment["end"])
            text = segment["text"].strip()
            if text:
                srt_content.append(f"{i}\n{start_time} --> {end_time}\n{text}\n")

    return "\n".join(srt_content)
|
610 |
+
|
611 |
+
# --- New Function for Advanced Mode ---
|
612 |
+
|
613 |
+
def process_advanced_segments(full_result, max_words):
    """
    Post-processes segments for Word-level Advanced mode.

    Groups words into new segments with <= max_words per segment, splitting at
    the nearest punctuation mark when one is available (looking backward first,
    then a short lookahead forward). Timestamps for each new segment come from
    the grouped words' actual times; a proportional fallback exists for
    segments that carry no word timing data.

    Args:
        full_result: Whisper result dict with a "segments" list, where each
            segment may carry a "words" list of {"word", "start", "end"} dicts.
        max_words: Maximum number of words per output segment/subtitle line.

    Returns:
        The same dict, with "segments" replaced by the regrouped segments.
    """
    # Define punctuation for natural splits
    punctuation = {'.', '!', '?', ';', ',', '--'}

    # Flatten all words into a single list for continuous processing
    all_words = []
    for segment in full_result["segments"]:
        all_words.extend(segment.get("words", []))

    if not all_words:
        return full_result  # Nothing to process

    new_segments = []
    current_words = []
    i = 0
    while i < len(all_words):
        current_words.append(all_words[i])

        if len(current_words) >= max_words:
            # Find nearest punctuation for split
            split_index = -1

            # Look backward in current words for last punctuation.
            # BUGFIX: guard against words that strip to an empty string
            # (e.g. whitespace-only tokens) — word_text[-1] would raise
            # IndexError on them.
            for j in range(len(current_words) - 1, -1, -1):
                word_text = current_words[j]["word"].strip()
                if word_text and word_text[-1] in punctuation:
                    split_index = j + 1  # Split after this word
                    break

            # If none, look forward in next words (limited lookahead to optimize)
            if split_index == -1:
                lookahead_end = min(i + 1 + 10, len(all_words))  # Cap lookahead for efficiency
                for j in range(i + 1, lookahead_end):
                    word_text = all_words[j]["word"].strip()
                    current_words.append(all_words[j])  # Temporarily add to current
                    i += 1  # Advance i as we add
                    if word_text and word_text[-1] in punctuation:  # Same empty-word guard
                        split_index = len(current_words)  # Split after this added word
                        break

            # Fallback: Split at max_words if no punctuation found
            if split_index == -1:
                split_index = max_words

            # Create new segment for current group up to split
            group_words = current_words[:split_index]
            if group_words:
                text = " ".join(w["word"].strip() for w in group_words)
                start = group_words[0]["start"]
                end = group_words[-1]["end"]
                new_segments.append({"start": start, "end": end, "text": text, "words": group_words})

            # Remaining words become start of next group (timestamp adjustment: shifted to next)
            current_words = current_words[split_index:]

        i += 1

    # Add any remaining words as last segment
    if current_words:
        text = " ".join(w["word"].strip() for w in current_words)
        start = current_words[0]["start"]
        end = current_words[-1]["end"]
        new_segments.append({"start": start, "end": end, "text": text, "words": current_words})

    # Handle rare case: If no word timestamps, fall back to proportional adjustment
    for seg in new_segments:
        if "words" not in seg or not seg["words"]:
            # Proportional split: shrink the segment end based on word-count ratio
            orig_start = seg["start"]
            orig_end = seg["end"]
            word_count = len(seg["text"].split())
            if word_count > max_words:
                ratio = max_words / word_count
                split_time = orig_start + (orig_end - orig_start) * ratio
                seg["end"] = split_time  # Minus from current
                # Next segment would start at split_time (handled while rebuilding)

    # Replace original segments with new ones
    full_result["segments"] = new_segments
    return full_result
|
699 |
+
|
700 |
+
def download_from_gdrive(gdrive_link, dest_path):
    """
    Downloads a file from a public Google Drive link, handling large file
    confirmation if needed.

    Args:
        gdrive_link: Public share link, either the "/file/d/<ID>/view" form
            or a query-parameter form such as "open?id=<ID>" / "uc?id=<ID>".
        dest_path: Local filesystem path the downloaded bytes are written to.

    Raises:
        ValueError: If no file ID can be extracted from the link.
        requests.HTTPError: If the download request fails (via raise_for_status).
    """
    # Extract file ID from the link. Generalized: accept both the
    # "/file/d/<ID>" path form and the "id=<ID>" query-parameter form.
    match = re.search(r'/file/d/([a-zA-Z0-9_-]+)', gdrive_link)
    if not match:
        match = re.search(r'[?&]id=([a-zA-Z0-9_-]+)', gdrive_link)
    if not match:
        raise ValueError("Invalid Google Drive link. Ensure it's a public share link.")
    file_id = match.group(1)

    URL = "https://drive.google.com/uc?export=download"
    session = requests.Session()
    response = session.get(URL, params={'id': file_id}, stream=True)
    token = get_confirm_token(response)

    # Large files trigger a "can't scan for viruses" interstitial; retry
    # with the confirmation token taken from the warning cookie.
    if token:
        params = {'id': file_id, 'confirm': token}
        response = session.get(URL, params=params, stream=True)

    response.raise_for_status()
    # Stream to disk in chunks so large files don't load fully into memory.
    with open(dest_path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:
                f.write(chunk)
|
724 |
+
|
725 |
+
def get_confirm_token(response):
    """Return Google Drive's download-warning cookie value, or None if absent."""
    return next(
        (value for key, value in response.cookies.items()
         if key.startswith('download_warning')),
        None,
    )
|
730 |
+
|
731 |
+
# --- Main Transcription Logic ---
|
732 |
+
|
733 |
+
def transcribe_video(video_path, model_name, transcription_mode, chunk_length_min, max_words, google_drive_link=None): # Added google_drive_link
|
734 |
+
"""
|
735 |
+
Transcribes a video file by extracting audio, chunking it, processing chunks,
|
736 |
+
and generating a full SRT file with corrected timestamps.
|
737 |
+
Supports downloading and unzipping from a Google Drive zip link.
|
738 |
+
"""
|
739 |
+
if video_path is None and not google_drive_link:
|
740 |
+
return "Please upload a video file or provide a Google Drive link.", None
|
741 |
+
|
742 |
+
yield "Loading model...", None # Update status for the user
|
743 |
+
|
744 |
+
# Load the Whisper model. This is cached by Gradio for subsequent calls.
|
745 |
+
# Note: On a Hugging Face Space, the model is loaded once when the app starts.
|
746 |
+
try:
|
747 |
+
model = whisper.load_model(model_name)
|
748 |
+
except Exception as e:
|
749 |
+
return f"Error loading model: {e}", None
|
750 |
+
|
751 |
+
# Use a temporary directory for all our files
|
752 |
+
with tempfile.TemporaryDirectory() as temp_dir:
|
753 |
+
if google_drive_link:
|
754 |
+
yield "Downloading zip from Google Drive...", None
|
755 |
+
try:
|
756 |
+
zip_path = os.path.join(temp_dir, "downloaded.zip")
|
757 |
+
download_from_gdrive(google_drive_link, zip_path)
|
758 |
+
except Exception as e:
|
759 |
+
return f"Error downloading from Google Drive: {e}", None
|
760 |
+
|
761 |
+
yield "Unzipping file...", None
|
762 |
+
try:
|
763 |
+
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
|
764 |
+
zip_ref.extractall(temp_dir)
|
765 |
+
os.remove(zip_path) # Clean up zip
|
766 |
+
except Exception as e:
|
767 |
+
return f"Error unzipping file: {e}", None
|
768 |
+
|
769 |
+
# Find the video file in the temp_dir
|
770 |
+
video_extensions = ('.mp4', '.mkv', '.avi', '.mov', '.wmv')
|
771 |
+
video_files = [f for f in os.listdir(temp_dir) if f.lower().endswith(video_extensions)]
|
772 |
+
if not video_files:
|
773 |
+
return "No video file found in the unzipped contents.", None
|
774 |
+
video_path = os.path.join(temp_dir, video_files[0]) # Assume first video file
|
775 |
|
776 |
yield f"Model '{model_name}' loaded. Extracting audio...", None
|
777 |
|
778 |
audio_path = os.path.join(temp_dir, "extracted_audio.wav")
|
779 |
+
|
780 |
+
# Extract audio from video using pydub
|
781 |
try:
|
782 |
+
video = AudioSegment.from_file(video_path)
|
783 |
+
# Export as WAV, 16kHz, mono - ideal for Whisper
|
784 |
video.set_channels(1).set_frame_rate(16000).export(audio_path, format="wav")
|
785 |
audio = AudioSegment.from_wav(audio_path)
|
786 |
except Exception as e:
|
787 |
return f"Error processing video/audio: {e}", None
|
788 |
|
789 |
+
# --- Chunking Logic ---
|
790 |
chunk_length_ms = chunk_length_min * 60 * 1000
|
791 |
num_chunks = math.ceil(len(audio) / chunk_length_ms)
|
792 |
+
|
793 |
full_result = {"segments": []}
|
794 |
|
795 |
yield f"Audio extracted. Splitting into {num_chunks} chunk(s) of {chunk_length_min} min...", None
|
|
|
803 |
chunk.export(chunk_path, format="wav")
|
804 |
|
805 |
yield f"Transcribing chunk {i+1}/{num_chunks}...", None
|
|
|
806 |
|
807 |
+
# Determine if word-level timestamps are needed
|
808 |
+
should_get_word_timestamps = (transcription_mode in ["word", "Word-level Advanced"]) # Updated for new mode
|
809 |
+
|
810 |
+
# Transcribe the chunk
|
811 |
try:
|
812 |
result = model.transcribe(
|
813 |
chunk_path,
|
814 |
word_timestamps=should_get_word_timestamps,
|
815 |
+
fp16=False # Set to False for CPU-only inference
|
816 |
)
|
817 |
except Exception as e:
|
818 |
+
# Clean up and report error
|
819 |
del model
|
820 |
gc.collect()
|
821 |
return f"Error during transcription of chunk {i+1}: {e}", None
|
822 |
|
823 |
+
|
824 |
+
# --- Timestamp Correction ---
|
825 |
+
# Add the chunk's start time to all timestamps in the result
|
826 |
time_offset_s = start_ms / 1000.0
|
827 |
+
|
828 |
for segment in result["segments"]:
|
829 |
segment["start"] += time_offset_s
|
830 |
segment["end"] += time_offset_s
|
831 |
+
|
832 |
if "words" in segment:
|
833 |
for word_info in segment["words"]:
|
834 |
word_info["start"] += time_offset_s
|
835 |
word_info["end"] += time_offset_s
|
836 |
+
|
837 |
full_result["segments"].append(segment)
|
838 |
|
839 |
+
# Clean up the chunk file immediately
|
840 |
os.remove(chunk_path)
|
841 |
|
842 |
+
# Clean up the model from memory to be safe
|
843 |
del model
|
844 |
gc.collect()
|
845 |
|
846 |
+
# --- New: Process for Advanced Mode ---
|
847 |
if transcription_mode == "Word-level Advanced":
|
848 |
yield "Processing advanced word-level grouping...", None
|
849 |
full_result = process_advanced_segments(full_result, max_words)
|
850 |
|
851 |
yield "All chunks transcribed. Generating SRT file...", None
|
852 |
+
|
853 |
+
# Generate the final SRT file from the combined results
|
854 |
+
# For Advanced mode, force segment-level generation (grouped lines)
|
855 |
srt_mode = "segment" if transcription_mode == "Word-level Advanced" else transcription_mode
|
856 |
srt_output = generate_srt_from_result(full_result, srt_mode)
|
857 |
|
858 |
+
# Create a final SRT file in the temp directory to be returned by Gradio
|
859 |
srt_file_path = os.path.join(temp_dir, "output.srt")
|
860 |
with open(srt_file_path, "w", encoding="utf-8") as srt_file:
|
861 |
srt_file.write(srt_output)
|
862 |
|
863 |
+
yield "Done!", srt_file_path
|
|
|
|
|
864 |
|
865 |
|
866 |
+
# --- Gradio UI ---
|
867 |
|
868 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
869 |
gr.Markdown(
|
870 |
"""
|
871 |
# Whisper Video Transcriber 🎥 -> 📝
|
872 |
+
Upload a video, choose your settings, and get a timed SRT subtitle file.
|
873 |
+
This app handles large videos by automatically splitting them into manageable chunks.
|
874 |
"""
|
875 |
)
|
876 |
with gr.Row():
|
877 |
+
with gr.Column():
|
878 |
+
video_input = gr.Video(label="Upload Video")
|
879 |
+
google_drive_link = gr.Textbox(label="Or Public Google Drive Zip Link", placeholder="https://drive.google.com/file/d/[FILE_ID]/view?usp=sharing")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
880 |
|
|
|
881 |
model_name = gr.Radio(
|
882 |
["tiny.en", "base.en"],
|
883 |
label="Whisper Model",
|
|
|
886 |
)
|
887 |
|
888 |
transcription_mode = gr.Radio(
|
889 |
+
["Segment-level", "Word-level", "Word-level Advanced"], # Added new mode
|
890 |
label="Transcription Granularity",
|
891 |
value="Segment-level",
|
892 |
+
info="Word-level is more detailed but may be slightly slower. Word-level Advanced groups into lines with max words, splitting at punctuation."
|
893 |
)
|
894 |
|
895 |
chunk_length_min = gr.Slider(
|
|
|
901 |
info="Shorter chunks use less RAM but may be slightly less accurate at boundaries."
|
902 |
)
|
903 |
|
904 |
+
max_words = gr.Slider( # New input for max_words
|
905 |
minimum=5,
|
906 |
maximum=30,
|
907 |
value=10,
|
908 |
step=1,
|
909 |
label="Max Words per Line (Advanced mode only)",
|
910 |
+
info="For Word-level Advanced: Limits words per subtitle line, splitting intelligently at punctuation."
|
911 |
)
|
912 |
|
913 |
+
submit_button = gr.Button("Transcribe Video", variant="primary")
|
914 |
|
915 |
+
with gr.Column():
|
916 |
+
status_output = gr.Textbox(label="Status", interactive=False, lines=5)
|
917 |
srt_output_file = gr.File(label="Download SRT File")
|
918 |
|
|
|
919 |
submit_button.click(
|
920 |
fn=transcribe_video,
|
921 |
+
inputs=[video_input, model_name, transcription_mode, chunk_length_min, max_words, google_drive_link], # Added google_drive_link
|
922 |
outputs=[status_output, srt_output_file]
|
923 |
)
|
924 |
|
925 |
gr.Markdown(
|
926 |
"""
|
927 |
### How to Use
|
928 |
+
1. **Upload a video file or provide a public Google Drive zip link containing the video.**
|
929 |
+
2. **Select a Whisper model.** For English, `base.en` provides a great balance of speed and accuracy.
|
930 |
+
3. **Choose the granularity.** 'Segment-level' is good for standard subtitles. 'Word-level' is great for karaoke-style highlighting. 'Word-level Advanced' groups into optimized subtitle lines.
|
931 |
+
4. **Click 'Transcribe Video'.** The status box will show the progress.
|
932 |
+
5. **Download the SRT file** when the process is complete. You can open this file in any text editor or load it into a video player like VLC.
|
|
|
933 |
"""
|
934 |
)
|
935 |
|