Update app.py
app.py
CHANGED
@@ -10,205 +10,131 @@ import arabic_reshaper
 from bidi.algorithm import get_display
 import os
 import time

 # --- CONFIGURATION ---
-
-# For local testing, you can uncomment the line below.
-# os.environ['GEMINI_API_KEY'] = "YOUR_API_KEY_HERE"
-API_KEY = "AIzaSyCu-tb3BRDIJjUt6G5ccWmrR51tOY0VZd4"
-
-# Ensure these font files are in your Hugging Face repository
 PERSIAN_FONT_PATH = "Vazir.ttf"
-
-
-# Video effect settings
-FADE_IN_DURATION_SECONDS = 1.0
-INITIAL_BLACK_SCREEN_SECONDS = 1.0

 # --- GLOBAL INITIALIZATION ---
 reader = None
-
 def initialize_reader():
-    """Initializes the EasyOCR reader if it hasn't been already."""
     global reader
     if reader is None:
         print("Loading EasyOCR model...")
-        # For a CPU-only environment like HF Spaces free tier, gpu=False is essential.
         reader = easyocr.Reader(['en'], gpu=False, verbose=False)
         print("EasyOCR model loaded successfully!")
     return reader

-# --- YOUR CORE FUNCTIONS (

 def extract_text_and_bbox(image: Image.Image):
-    """
-    Extracts text from a PIL Image and calculates a single consolidated
-    bounding box for all text found.
-    (This function is kept exactly as you wrote it)
-    """
     ocr_reader = initialize_reader()
     img_array = np.array(image)
     results = ocr_reader.readtext(img_array)
-
-    if not results:
-        return "No text detected in the image.", None
-
-    min_x, min_y = float('inf'), float('inf')
-    max_x, max_y = float('-inf'), float('-inf')
-
     text_parts = []
     for (bbox, text, prob) in results:
         text_parts.append(text)
         (tl, tr, br, bl) = bbox
-        min_x = min(min_x, tl[0], bl[0])
-        min_y = min(min_y, tl[1], tr[1])
-        max_x = max(max_x, tr[0], br[0])
-        max_y = max(max_y, bl[1], br[1])
-
     extracted_text = ' '.join(text_parts)
     consolidated_bbox = (int(min_x), int(min_y), int(max_x), int(max_y))
-
     return extracted_text, consolidated_bbox

 def translate_text_gemini(text: str) -> str:
-    """
-    Translates text into Persian using the Gemini API.
-    (This function is kept exactly as you wrote it, but with safer API key handling)
-    """
-    if not API_KEY:
-        raise gr.Error("GEMINI_API_KEY is not set. Please configure it in your Hugging Face Space Secrets.")
-    if not text or "No text" in text or "Error" in text:
-        return "No valid text to translate."
-
     try:
         genai.configure(api_key=API_KEY)
         model = genai.GenerativeModel('gemini-1.5-flash')
-        # Your prompt here
-        prompt = f"Translate the following English quotes into Persian. The translation should be colloquial, poetic, concise, and meaningful. Preserve the original message and tone. Avoid literal translations. Provide only the translated Persian text. Quotes: [{text}]"
-
         response = model.generate_content(prompt)
         return response.text.strip()
-    except Exception as e:
-        return f"Error during translation with Gemini: {str(e)}"

-# --- NEW FUNCTION: Renders a reusable overlay "stamp" ---
 def render_translated_overlay(original_image: Image.Image, text_to_overlay: str, bbox: tuple) -> Image.Image:
-    """
-    Creates a single, pre-rendered RGBA image of the translated text on a
-    background sampled from the original image. This "stamp" can be efficiently
-    overlaid on every video frame.
-
-    This function adapts the logic from your original 'overlay_text_on_image'.
-    """
-    # 1. Define the box where the new text will live (with padding)
     padding = 15
-    overlay_box = (
-        max(0, bbox[0] - padding),
-        max(0, bbox[1] - padding),
-        min(original_image.width, bbox[2] + padding),
-        min(original_image.height, bbox[3] + padding)
-    )
     overlay_width = overlay_box[2] - overlay_box[0]
     overlay_height = overlay_box[3] - overlay_box[1]
-
-    # 2. Sample the background color from the original image
     try:
-        sample_x = max(0, int(overlay_box[0]) - 5)
-        sample_y = int((overlay_box[1] + overlay_box[3]) / 2)
         bg_color = original_image.getpixel((sample_x, sample_y))
-    except (ValueError, IndexError):
-        bg_color = (25, 25, 25, 255)  # Fallback color

-    # 3. Create the base layer for our overlay "stamp"
-    # This is an RGBA image with the sampled background color
     overlay_layer = Image.new("RGBA", (overlay_width, overlay_height), bg_color)
     draw = ImageDraw.Draw(overlay_layer)
-
-    # 4. Dynamically find the best font size and wrap the text
     target_width = overlay_width * 0.90
     font_size = 100
     final_wrapped_lines = []
-
     while font_size > 10:
         font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
-        words = text_to_overlay.split()
         if not words: break
-
         raw_lines = []; current_line = ""
         for word in words:
             test_line = (current_line + " " + word).strip()
-            reshaped_test_line = arabic_reshaper.reshape(test_line)
-            bidi_test_line = get_display(reshaped_test_line)
-            line_width = draw.textbbox((0, 0), bidi_test_line, font=font)[2]
             if line_width <= target_width: current_line = test_line
             else: raw_lines.append(current_line); current_line = word
         raw_lines.append(current_line)

-        # Check total height
-        total_height = 0
-        for line in raw_lines:
-            reshaped_line = arabic_reshaper.reshape(line)
-            bidi_line = get_display(reshaped_line)
-            total_height += draw.textbbox((0,0), bidi_line, font=font)[3]
-        if total_height <= overlay_height * 0.9:
-            final_wrapped_lines = raw_lines
-            break
-        else:
-            font_size -= 2
-
-    if not final_wrapped_lines:
-        print("Warning: Text could not fit. It may be truncated.")
-        final_wrapped_lines = raw_lines  # Use last attempt if no fit found
-
-    # 5. Draw the final, wrapped text onto our stamp
     final_font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
     line_spacing = font_size * 0.3
-
-    reshaped_lines = [get_display(arabic_reshaper.reshape(l)) for l in final_wrapped_lines]
-    line_heights = [draw.textbbox((0,0), l, font=final_font)[3] - draw.textbbox((0,0), l, font=final_font)[1] for l in reshaped_lines]
-    total_text_height = sum(line_heights) + (len(reshaped_lines) - 1) * line_spacing
-
     y_start = (overlay_height - total_text_height) / 2

     current_y = y_start
-    for i, bidi_line in enumerate(reshaped_lines):

         # Draw shadow then text for readability
-        draw.text((
-        draw.text((

         current_y += line_heights[i] + line_spacing

     return overlay_layer, overlay_box

-# --- MAIN VIDEO PROCESSING PIPELINE ---

 def process_video(video_path, progress=gr.Progress()):
-    """
-    Main function to orchestrate the entire video translation process.
-    """
-    if video_path is None:
-        raise gr.Error("Please upload a video file first.")

-    progress(0, desc="Loading Video...")
     cap = cv2.VideoCapture(video_path)
     if not cap.isOpened(): raise gr.Error("Could not open video file.")

-    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-    fps = cap.get(cv2.CAP_PROP_FPS)
-    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

-    # 1. ANALYSIS (OCR & TRANSLATION) - Done only once
-    progress(0.1, desc="Extracting Middle Frame for Analysis...")
     cap.set(cv2.CAP_PROP_POS_FRAMES, total_frames // 2)
     ret, middle_frame_bgr = cap.read()
     if not ret: raise gr.Error("Could not read middle frame.")
     middle_frame_rgb_pil = Image.fromarray(cv2.cvtColor(middle_frame_bgr, cv2.COLOR_BGR2RGB))

     progress(0.2, desc="Detecting Text (EasyOCR)...")
@@ -219,96 +145,85 @@ def process_video(video_path, progress=gr.Progress()):
     translated_text = translate_text_gemini(extracted_text)
     if "Error" in translated_text: raise gr.Error(translated_text)

-    progress(0.
     overlay_stamp_pil, overlay_position_box = render_translated_overlay(middle_frame_rgb_pil, translated_text, bbox)
-
-    # Convert the PIL RGBA stamp to an OpenCV BGRA image for compositing
     overlay_stamp_cv = cv2.cvtColor(np.array(overlay_stamp_pil), cv2.COLOR_RGBA2BGRA)

-    # 2. VIDEO WRITER SETUP
-    output_path = f"translated_video_{int(time.time())}.mp4"
-    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
-    out = cv2.VideoWriter(output_path, fourcc, fps, (frame_width, frame_height))

-    # Start with an initial black screen
-    black_frame = np.zeros((frame_height, frame_width, 3), dtype=np.uint8)
-    for _ in range(int(INITIAL_BLACK_SCREEN_SECONDS * fps)):
-        out.write(black_frame)

-    # Add fade-in effect
-    num_fade_frames = int(FADE_IN_DURATION_SECONDS * fps)
-    cap.set(cv2.CAP_PROP_POS_FRAMES, 0)  # Rewind video
-    ret, first_frame = cap.read()
-    if ret:
-        for i in range(num_fade_frames):
-            alpha = (i + 1) / num_fade_frames
-            blended_frame = cv2.addWeighted(black_frame, 1 - alpha, first_frame, alpha, 0)
-            out.write(blended_frame)
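For reference, the removed fade-in built each early output frame as a plain weighted sum of a black frame and the first video frame via `cv2.addWeighted`. A minimal, self-contained sketch of that blend, using hypothetical 64x48 frames:

```python
import cv2
import numpy as np

# Fade-in as a weighted sum: result = black * (1 - alpha) + frame * alpha.
black = np.zeros((48, 64, 3), dtype=np.uint8)      # hypothetical 64x48 black frame
frame = np.full((48, 64, 3), 200, dtype=np.uint8)  # stand-in for the first video frame
alpha = 0.5                                        # halfway through the fade
blended = cv2.addWeighted(black, 1 - alpha, frame, alpha, 0)
print(blended[0, 0])                               # -> [100 100 100]
```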

-
-    cap.set(cv2.CAP_PROP_POS_FRAMES, 0)  # Rewind again
     frame_idx = 0
-
-    # Get position for stamping
     x_min, y_min, x_max, y_max = overlay_position_box

     while True:
         ret, frame = cap.read()
         if not ret: break

-        # Skip frames used in fade-in
-        if frame_idx < num_fade_frames:
-            frame_idx += 1
-            continue
-
-        # --- Efficient Alpha Blending (Stamping) ---
         roi = frame[y_min:y_max, x_min:x_max]
-
-        # Ensure ROI and stamp have same dimensions before blending
         stamp_h, stamp_w, _ = overlay_stamp_cv.shape
         roi_h, roi_w, _ = roi.shape
-        if stamp_h != roi_h or stamp_w != roi_w:
-            overlay_resized = cv2.resize(overlay_stamp_cv, (roi_w, roi_h))
-        else:
-            overlay_resized = overlay_stamp_cv
-
         alpha = overlay_resized[:, :, 3] / 255.0
         alpha_mask = cv2.merge([alpha, alpha, alpha])
-
         blended_roi = (roi.astype(float) * (1.0 - alpha_mask) + overlay_resized[:, :, :3].astype(float) * alpha_mask)
         frame[y_min:y_max, x_min:x_max] = blended_roi.astype(np.uint8)

         out.write(frame)
         frame_idx += 1
-        progress(0.

-    cap.release()
-    out.release()
-    progress(1.0, desc="Done!")
-    return output_path

-# --- GRADIO INTERFACE ---

 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("# 🎬 Persian Video Quote Translator")
-    gr.Markdown("Upload a short video with English text. The app will
     with gr.Row():
         video_input = gr.Video(label="Upload Video")
         video_output = gr.Video(label="Translated Video Output")
-
     translate_button = gr.Button("Translate Video", variant="primary")
-
-    translate_button.click(
-        fn=process_video,
-        inputs=[video_input],
-        outputs=[video_output]
-    )
-
     gr.Markdown("---")
-    gr.Markdown("### How it works:\n1. It

 if __name__ == "__main__":
     demo.launch(debug=True)
 from bidi.algorithm import get_display
 import os
 import time
+import ffmpeg  # ### --- CHANGE --- ###: Import the ffmpeg-python library

 # --- CONFIGURATION ---
+API_KEY = "AIzaSyCu-tb3BRDIJjUt6G5ccWmrR51tOY0VZd4"
 PERSIAN_FONT_PATH = "Vazir.ttf"
+FADE_IN_DURATION_SECONDS = 1.0  # The fade-in will be exactly 1 second long
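One note on the configuration: a key hardcoded like `API_KEY` above is visible to anyone who can read the repository. A safer sketch, reading the key from the environment as the old version's error message (Hugging Face Space Secrets) already suggested:

```python
import os

# Read the Gemini key from the environment (e.g. a Hugging Face Space Secret)
# instead of committing it to source control.
API_KEY = os.environ.get("GEMINI_API_KEY", "")
```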

 # --- GLOBAL INITIALIZATION ---
 reader = None
 def initialize_reader():
     global reader
     if reader is None:
         print("Loading EasyOCR model...")
         reader = easyocr.Reader(['en'], gpu=False, verbose=False)
         print("EasyOCR model loaded successfully!")
     return reader

+# --- YOUR CORE FUNCTIONS (Unchanged) ---

 def extract_text_and_bbox(image: Image.Image):
     ocr_reader = initialize_reader()
     img_array = np.array(image)
     results = ocr_reader.readtext(img_array)
+    if not results: return "No text detected in the image.", None
+    min_x, min_y, max_x, max_y = float('inf'), float('inf'), float('-inf'), float('-inf')
     text_parts = []
     for (bbox, text, prob) in results:
         text_parts.append(text)
         (tl, tr, br, bl) = bbox
+        min_x = min(min_x, tl[0], bl[0]); min_y = min(min_y, tl[1], tr[1])
+        max_x = max(max_x, tr[0], br[0]); max_y = max(max_y, bl[1], br[1])
     extracted_text = ' '.join(text_parts)
     consolidated_bbox = (int(min_x), int(min_y), int(max_x), int(max_y))
     return extracted_text, consolidated_bbox
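For reference, `readtext` returns one `(bbox, text, prob)` tuple per detected region, with `bbox` holding four corner points in `[top-left, top-right, bottom-right, bottom-left]` order; the function above folds all corners into one enclosing rectangle. A self-contained sketch of that consolidation on hypothetical OCR output:

```python
# Hypothetical EasyOCR-style results: two detected words on one line.
results = [
    ([[10, 12], [90, 12], [90, 40], [10, 40]], "HELLO", 0.98),
    ([[95, 14], [160, 14], [160, 42], [95, 42]], "WORLD", 0.95),
]

min_x = min(min(tl[0], bl[0]) for (tl, tr, br, bl), _, _ in results)
min_y = min(min(tl[1], tr[1]) for (tl, tr, br, bl), _, _ in results)
max_x = max(max(tr[0], br[0]) for (tl, tr, br, bl), _, _ in results)
max_y = max(max(bl[1], br[1]) for (tl, tr, br, bl), _, _ in results)
print((min_x, min_y, max_x, max_y))  # -> (10, 12, 160, 42)
```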
 def translate_text_gemini(text: str) -> str:
+    if not API_KEY: raise gr.Error("GEMINI_API_KEY is not set.")
+    if not text or "No text" in text: return "No valid text to translate."
     try:
         genai.configure(api_key=API_KEY)
         model = genai.GenerativeModel('gemini-1.5-flash')
+        # Your prompt here
+        prompt = f"Translate the following English quotes into Persian. The translation should be colloquial, poetic, concise, and meaningful. Preserve the original message and tone. Avoid literal translations. Provide only the translated Persian text. Quotes: [{text}]"
         response = model.generate_content(prompt)
         return response.text.strip()
+    except Exception as e: return f"Error during translation with Gemini: {str(e)}"
+
+# --- TEXT OVERLAY FUNCTION (RTL Logic Corrected) ---

 def render_translated_overlay(original_image: Image.Image, text_to_overlay: str, bbox: tuple) -> Image.Image:
     padding = 15
+    overlay_box = (max(0, bbox[0] - padding), max(0, bbox[1] - padding),
+                   min(original_image.width, bbox[2] + padding), min(original_image.height, bbox[3] + padding))
     overlay_width = overlay_box[2] - overlay_box[0]
     overlay_height = overlay_box[3] - overlay_box[1]
+
     try:
+        sample_x = max(0, int(overlay_box[0]) - 5); sample_y = int((overlay_box[1] + overlay_box[3]) / 2)
         bg_color = original_image.getpixel((sample_x, sample_y))
+    except (ValueError, IndexError): bg_color = (25, 25, 25, 255)

     overlay_layer = Image.new("RGBA", (overlay_width, overlay_height), bg_color)
     draw = ImageDraw.Draw(overlay_layer)
     target_width = overlay_width * 0.90
     font_size = 100
     final_wrapped_lines = []
+
     while font_size > 10:
         font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
+        words = text_to_overlay.split()
         if not words: break
         raw_lines = []; current_line = ""
         for word in words:
             test_line = (current_line + " " + word).strip()
+            line_width = draw.textbbox((0, 0), get_display(arabic_reshaper.reshape(test_line)), font=font)[2]
             if line_width <= target_width: current_line = test_line
             else: raw_lines.append(current_line); current_line = word
         raw_lines.append(current_line)
+        total_height = sum(draw.textbbox((0,0), get_display(arabic_reshaper.reshape(l)), font=font)[3] for l in raw_lines)
+        if total_height <= overlay_height * 0.9: final_wrapped_lines = raw_lines; break
+        else: font_size -= 2
+
+    if not final_wrapped_lines: final_wrapped_lines = raw_lines

     final_font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
     line_spacing = font_size * 0.3
+    line_heights = [draw.textbbox((0,0), get_display(arabic_reshaper.reshape(l)), font=final_font)[3] - draw.textbbox((0,0), get_display(arabic_reshaper.reshape(l)), font=final_font)[1] for l in final_wrapped_lines]
+    total_text_height = sum(line_heights) + (len(final_wrapped_lines) - 1) * line_spacing
     y_start = (overlay_height - total_text_height) / 2

     current_y = y_start
+    for i, line in enumerate(final_wrapped_lines):
+        # ### --- CHANGE --- ###: Reverted to your original, proven RTL centering logic
+        # This is the fix for the RTL text display issue.
+        reshaped_line = arabic_reshaper.reshape(line)
+        bidi_line = get_display(reshaped_line)
+
+        # Manually calculate line width and center position
+        line_bbox = draw.textbbox((0, 0), bidi_line, font=final_font)
+        line_width = line_bbox[2] - line_bbox[0]
+        x_position = (overlay_width - line_width) / 2

         # Draw shadow then text for readability
+        draw.text((x_position + 1, current_y + 1), bidi_line, font=final_font, fill=(0, 0, 0, 180))
+        draw.text((x_position, current_y), bidi_line, font=final_font, fill=(255, 255, 255, 255))

         current_y += line_heights[i] + line_spacing

     return overlay_layer, overlay_box

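The corrected RTL logic rests on a two-step pipeline from `arabic_reshaper` and `python-bidi`: reshape first, then reorder. A minimal sketch (the Persian sample string is illustrative):

```python
import arabic_reshaper
from bidi.algorithm import get_display

logical = "سلام دنیا"                      # logical order, as the string is stored
shaped = arabic_reshaper.reshape(logical)   # pick contextual (joined) letter forms
visual = get_display(shaped)                # reorder into visual right-to-left order
# PIL draws glyphs strictly left-to-right, so both measuring (draw.textbbox)
# and drawing (draw.text) must be given this *visual* string, never the logical one.
```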
+# --- MAIN VIDEO PROCESSING PIPELINE (Now with FFMPEG) ---

 def process_video(video_path, progress=gr.Progress()):
+    if video_path is None: raise gr.Error("Please upload a video file first.")

+    progress(0, desc="Loading Video & Analyzing...")
     cap = cv2.VideoCapture(video_path)
     if not cap.isOpened(): raise gr.Error("Could not open video file.")

+    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)); frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+    fps = cap.get(cv2.CAP_PROP_FPS); total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

     cap.set(cv2.CAP_PROP_POS_FRAMES, total_frames // 2)
     ret, middle_frame_bgr = cap.read()
     if not ret: raise gr.Error("Could not read middle frame.")
     middle_frame_rgb_pil = Image.fromarray(cv2.cvtColor(middle_frame_bgr, cv2.COLOR_BGR2RGB))

     progress(0.2, desc="Detecting Text (EasyOCR)...")

     translated_text = translate_text_gemini(extracted_text)
     if "Error" in translated_text: raise gr.Error(translated_text)

+    progress(0.5, desc="Rendering Translated Text Overlay...")
     overlay_stamp_pil, overlay_position_box = render_translated_overlay(middle_frame_rgb_pil, translated_text, bbox)
     overlay_stamp_cv = cv2.cvtColor(np.array(overlay_stamp_pil), cv2.COLOR_RGBA2BGRA)

+    # ### --- CHANGE --- ###: Define filenames for temporary and final outputs
+    timestamp = int(time.time())
+    temp_silent_path = f"temp_silent_{timestamp}.mp4"
+    final_output_path = f"translated_video_{timestamp}.mp4"

+    # Part 1: Create a silent video with the overlay using OpenCV
+    progress(0.6, desc="Composing Silent Video with Overlay...")
+    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+    out = cv2.VideoWriter(temp_silent_path, fourcc, fps, (frame_width, frame_height))
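A short aside on `cv2.VideoWriter`, since its argument conventions are an easy place to lose frames; a minimal round-trip sketch with a hypothetical clip:

```python
import cv2
import numpy as np

# Hypothetical 64x48, 30 fps clip. The size argument is (width, height),
# while each frame array is shaped (height, width, 3) in BGR order;
# frames with a mismatched size are silently dropped.
out = cv2.VideoWriter("demo.mp4", cv2.VideoWriter_fourcc(*"mp4v"), 30.0, (64, 48))
for _ in range(30):
    out.write(np.zeros((48, 64, 3), dtype=np.uint8))  # one second of black
out.release()  # finalizes the file; without it the mp4 is unreadable
```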

+    cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
     frame_idx = 0
     x_min, y_min, x_max, y_max = overlay_position_box

     while True:
         ret, frame = cap.read()
         if not ret: break

         roi = frame[y_min:y_max, x_min:x_max]
         stamp_h, stamp_w, _ = overlay_stamp_cv.shape
         roi_h, roi_w, _ = roi.shape
+        overlay_resized = cv2.resize(overlay_stamp_cv, (roi_w, roi_h)) if (stamp_h != roi_h or stamp_w != roi_w) else overlay_stamp_cv
+
         alpha = overlay_resized[:, :, 3] / 255.0
         alpha_mask = cv2.merge([alpha, alpha, alpha])
         blended_roi = (roi.astype(float) * (1.0 - alpha_mask) + overlay_resized[:, :, :3].astype(float) * alpha_mask)
         frame[y_min:y_max, x_min:x_max] = blended_roi.astype(np.uint8)

         out.write(frame)
         frame_idx += 1
+        progress(0.6 + (0.3 * frame_idx / total_frames), desc=f"Processing frame {frame_idx}/{total_frames}")

+    cap.release(); out.release()
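The stamping step is standard alpha ("over") compositing done with NumPy broadcasting; a tiny self-contained sketch with made-up values:

```python
import numpy as np

# "Over" compositing, exactly as in the loop above:
# result = overlay * alpha + frame * (1 - alpha), per pixel.
frame = np.full((2, 2, 3), 200, dtype=np.uint8)   # background patch from the frame
stamp = np.full((2, 2, 3), 40, dtype=np.uint8)    # overlay colour channels
alpha = np.full((2, 2, 1), 0.25)                  # 25% opaque overlay
out = (frame.astype(float) * (1 - alpha) + stamp.astype(float) * alpha).astype(np.uint8)
print(out[0, 0])  # -> [160 160 160]
```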
+
+    # ### --- CHANGE --- ###: Part 2: Use ffmpeg to combine video with original audio and add fade
+    progress(0.95, desc="Merging Audio and Applying Fade (ffmpeg)...")
+    try:
+        input_video = ffmpeg.input(temp_silent_path)
+        input_audio = ffmpeg.input(video_path)
+
+        (
+            ffmpeg
+            .output(
+                input_video.video.filter('fade', type='in', start_time=0, duration=FADE_IN_DURATION_SECONDS),  # Apply fade-in to the video stream
+                input_audio.audio,  # Take the audio stream from the original upload
+                final_output_path,
+                acodec='copy',  # Copy only the audio; the faded video stream must be re-encoded
+                shortest=None   # -shortest: stop at the end of the shorter stream
+            )
+            .run(overwrite_output=True, quiet=True)
+        )
+    except ffmpeg.Error as e:
+        # Provide more detailed ffmpeg error logging if something goes wrong
+        print('ffmpeg stdout:', e.stdout.decode('utf8'))
+        print('ffmpeg stderr:', e.stderr.decode('utf8'))
+        raise gr.Error(f"ffmpeg error: {e.stderr.decode('utf8')}")
+    finally:
+        # Clean up the temporary silent video file
+        if os.path.exists(temp_silent_path):
+            os.remove(temp_silent_path)

+    progress(1, desc="Done!")
+    return final_output_path
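To see what command line the stream graph above produces, `ffmpeg-python` can compile a specification without running it; a sketch with hypothetical file names (the printed command is approximate):

```python
import ffmpeg

video = ffmpeg.input("temp_silent.mp4")   # hypothetical file names
audio = ffmpeg.input("original.mp4")
stream = ffmpeg.output(
    video.video.filter("fade", type="in", start_time=0, duration=1.0),
    audio.audio,
    "out.mp4",
    acodec="copy",
)
print(" ".join(ffmpeg.compile(stream)))
# roughly: ffmpeg -i temp_silent.mp4 -i original.mp4
#   -filter_complex "[0:v]fade=duration=1.0:start_time=0:type=in[s0]"
#   -map [s0] -map 1:a -acodec copy out.mp4
```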

+# --- GRADIO INTERFACE (Unchanged) ---
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("# 🎬 Persian Video Quote Translator")
+    gr.Markdown("Upload a short video with English text. The app will preserve the original audio and duration.")
     with gr.Row():
         video_input = gr.Video(label="Upload Video")
         video_output = gr.Video(label="Translated Video Output")
     translate_button = gr.Button("Translate Video", variant="primary")
+    translate_button.click(fn=process_video, inputs=[video_input], outputs=[video_output])
     gr.Markdown("---")
+    gr.Markdown("### How it works:\n1. It analyzes the middle frame to find and translate text.\n2. It generates a temporary silent video with the correctly rendered Persian text overlaid.\n3. **(New)** It uses `ffmpeg` to merge the new video with the **original audio**, apply a 1-second fade-in, and ensure the final duration matches the input.")

 if __name__ == "__main__":
     demo.launch(debug=True)