Spaces:

kavehtaheri
/

ocrvideo2

Sleeping

App Files Files Community

kavehtaheri committed on Jul 28

Commit

2194011

verified ·

1 Parent(s): 24d53ed

Update app.py

Browse files

Files changed (1) hide show

app.py +170 -333

app.py CHANGED Viewed

@@ -11,18 +11,18 @@ from bidi.algorithm import get_display
 import os
 import time
 import ffmpeg
-import tempfile
-import shutil
 # --- CONFIGURATION ---
-API_KEY ="AIzaSyCu-tb3BRDIJjUt6G5ccWmrR51tOY0VZd4"
-PERSIAN_FONT_PATH = "Vazir.ttf"
 FADE_IN_DURATION_SECONDS = 1.0
 # --- GLOBAL INITIALIZATION ---
 reader = None
 def initialize_reader():
     global reader
     if reader is None:
         print("Loading EasyOCR model...")
@@ -30,106 +30,54 @@ def initialize_reader():
         print("EasyOCR model loaded successfully!")
     return reader
-# --- CORE FUNCTIONS ---
 def extract_text_and_bbox(image: Image.Image):
-    """Extract text and calculate consolidated bounding box"""
-    if image is None:
-        return "Please upload an image first.", None
-    try:
-        ocr_reader = initialize_reader()
-        img_array = np.array(image)
-        results = ocr_reader.readtext(img_array)
-        if not results:
-            return "No text detected in the image.", None
-        min_x, min_y = float('inf'), float('inf')
-        max_x, max_y = float('-inf'), float('-inf')
-        text_parts = []
-        for (bbox, text, prob) in results:
-            text_parts.append(text)
-            (tl, tr, br, bl) = bbox
-            min_x = min(min_x, tl[0], bl[0])
-            min_y = min(min_y, tl[1], tr[1])
-            max_x = max(max_x, tr[0], br[0])
-            max_y = max(max_y, bl[1], br[1])
-        extracted_text = ' '.join(text_parts)
-        consolidated_bbox = (int(min_x), int(min_y), int(max_x), int(max_y))
-        return extracted_text, consolidated_bbox
-    except Exception as e:
-        return f"Error processing image with OCR: {str(e)}", None
 def translate_text_gemini(text: str) -> str:
-    """Translate text to Persian using Gemini API"""
-    if not API_KEY:
-        raise gr.Error("GEMINI_API_KEY is not set.")
-    if not text or "No text" in text or "Error" in text or "Please upload" in text:
         return "No valid text to translate."
     try:
         genai.configure(api_key=API_KEY)
         model = genai.GenerativeModel('gemini-1.5-flash')
-        prompt = f"""Translate the following English quotes into Persian. The translation should be:
-        - Colloquial and natural
-        - Poetic and meaningful
-        - Concise (under 20 words)
-        - Preserving the original meaning and tone
-        - Using proper Persian grammar
-        Provide only the translated Persian text. Quotes: [{text}]"""
         response = model.generate_content(prompt)
         return response.text.strip()
     except Exception as e:
         return f"Error during translation with Gemini: {str(e)}"
-def wrap_persian_text_properly(text, font, max_width, draw):
-    """Properly wrap Persian text maintaining RTL flow"""
-    words = text.split()
-    lines = []
-    current_line_words = []
-    for word in words:
-        # Test with current line + new word
-        test_words = current_line_words + [word]
-        test_line = ' '.join(test_words)
-        # Process this test line for RTL to get actual display width
-        try:
-            reshaped_test = arabic_reshaper.reshape(test_line)
-            display_test = get_display(reshaped_test)
-            test_width = draw.textbbox((0, 0), display_test, font=font)[2]
-        except:
-            # Fallback if RTL processing fails
-            test_width = draw.textbbox((0, 0), test_line, font=font)[2]
-        if test_width <= max_width:
-            current_line_words.append(word)
-        else:
-            # Save current line and start new one
-            if current_line_words:
-                lines.append(' '.join(current_line_words))
-            current_line_words = [word]
-    # Don't forget the last line
-    if current_line_words:
-        lines.append(' '.join(current_line_words))
-    return lines
-def render_translated_overlay(original_image: Image.Image, text_to_overlay: str, bbox: tuple) -> tuple:
-    """Render Persian text overlay with proper RTL support"""
-    # Check for font file
-    if not os.path.exists(PERSIAN_FONT_PATH):
-        raise FileNotFoundError(f"Font file not found at '{PERSIAN_FONT_PATH}'. Please ensure Vazir.ttf is in the repository.")
     padding = 15
     overlay_box = (
         max(0, bbox[0] - padding),
@@ -137,232 +85,166 @@ def render_translated_overlay(original_image: Image.Image, text_to_overlay: str,
         min(original_image.width, bbox[2] + padding),
         min(original_image.height, bbox[3] + padding)
     )
     overlay_width = overlay_box[2] - overlay_box[0]
     overlay_height = overlay_box[3] - overlay_box[1]
-    # Sample background color
     try:
         sample_x = max(0, int(overlay_box[0]) - 5)
         sample_y = int((overlay_box[1] + overlay_box[3]) / 2)
         bg_color = original_image.getpixel((sample_x, sample_y))
     except (ValueError, IndexError):
-        bg_color = (25, 25, 25, 255)
     overlay_layer = Image.new("RGBA", (overlay_width, overlay_height), bg_color)
     draw = ImageDraw.Draw(overlay_layer)
     target_width = overlay_width * 0.90
     font_size = 100
     final_wrapped_lines = []
-    # Find optimal font size
     while font_size > 10:
         font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
-        # Wrap text with current font size
-        wrapped_lines = wrap_persian_text_properly(text_to_overlay, font, target_width, draw)
-        # Calculate total height needed
-        total_height = 0
-        for line in wrapped_lines:
-            try:
-                # Process each line for RTL to get accurate height
-                reshaped_line = arabic_reshaper.reshape(line)
-                display_line = get_display(reshaped_line)
-                line_bbox = draw.textbbox((0, 0), display_line, font=font)
-                line_height = line_bbox[3] - line_bbox[1]
-                total_height += line_height
-            except:
-                # Fallback height calculation
-                line_bbox = draw.textbbox((0, 0), line, font=font)
-                line_height = line_bbox[3] - line_bbox[1]
-                total_height += line_height
-        # Add spacing between lines
-        total_height += (len(wrapped_lines) - 1) * (font_size * 0.3)
-        if total_height <= overlay_height * 0.9:
-            final_wrapped_lines = wrapped_lines
             break
         else:
             font_size -= 2
     if not final_wrapped_lines:
-        final_wrapped_lines = [text_to_overlay]
-    # Render text with proper RTL processing
     final_font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
     line_spacing = font_size * 0.3
-    # Process each line for RTL and calculate positions
-    processed_lines = []
-    line_heights = []
-    for line in final_wrapped_lines:
-        try:
-            # CRITICAL: Process each line individually for RTL
-            reshaped_line = arabic_reshaper.reshape(line)
-            display_line = get_display(reshaped_line)
-            processed_lines.append(display_line)
-            line_bbox = draw.textbbox((0, 0), display_line, font=final_font)
-            line_height = line_bbox[3] - line_bbox[1]
-            line_heights.append(line_height)
-        except Exception as e:
-            print(f"RTL processing failed for line '{line}': {e}")
-            # Fallback to original line
-            processed_lines.append(line)
-            line_bbox = draw.textbbox((0, 0), line, font=final_font)
-            line_height = line_bbox[3] - line_bbox[1]
-            line_heights.append(line_height)
-    total_text_height = sum(line_heights) + (len(processed_lines) - 1) * line_spacing
     y_start = (overlay_height - total_text_height) / 2
-    # Draw each line
     current_y = y_start
-    for i, display_line in enumerate(processed_lines):
-        # Calculate line width and center position
-        line_bbox = draw.textbbox((0, 0), display_line, font=final_font)
-        line_width = line_bbox[2] - line_bbox[0]
-        x_position = (overlay_width - line_width) / 2
-        # Draw shadow for better readability
-        draw.text((x_position + 1, current_y + 1), display_line, font=final_font, fill=(0, 0, 0, 180))
-        # Draw main text
-        draw.text((x_position, current_y), display_line, font=final_font, fill=(255, 255, 255, 255))
         current_y += line_heights[i] + line_spacing
-    return overlay_layer, overlay_box
-def process_image(image):
-    """Process image: detect, translate, and overlay text"""
-    if image is None:
-        return "Please upload an image.", "Translation will appear here.", None
-    # Extract text
-    extracted_text, bbox = extract_text_and_bbox(image)
-    if bbox is None:
-        return extracted_text, "No text to translate.", None
-    # Translate text
-    translated_text = translate_text_gemini(extracted_text)
-    if "Error" in translated_text:
-        return extracted_text, translated_text, None
-    # Create overlay
-    overlay_layer, overlay_box = render_translated_overlay(image, translated_text, bbox)
-    # Apply overlay to image
-    image_copy = image.copy().convert("RGBA")
-    # Create background rectangle
-    draw = ImageDraw.Draw(image_copy)
-    draw.rectangle(overlay_box, fill=image.getpixel((overlay_box[0], overlay_box[1])))
-    # Paste overlay
-    image_copy.paste(overlay_layer, (overlay_box[0], overlay_box[1]), overlay_layer)
-    return extracted_text, translated_text, image_copy.convert("RGB")
 def process_video(video_path, progress=gr.Progress()):
-    """Process video: detect text in middle frame, translate, and overlay on all frames"""
-    if video_path is None:
-        raise gr.Error("Please upload a video file first.")
     progress(0, desc="Loading Video & Analyzing...")
     cap = cv2.VideoCapture(video_path)
-    if not cap.isOpened():
-        raise gr.Error("Could not open video file.")
-    # Get video properties
     frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
     frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
     fps = cap.get(cv2.CAP_PROP_FPS)
     total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-    # Extract middle frame for text detection
     cap.set(cv2.CAP_PROP_POS_FRAMES, total_frames // 2)
     ret, middle_frame_bgr = cap.read()
-    if not ret:
-        raise gr.Error("Could not read middle frame.")
     middle_frame_rgb_pil = Image.fromarray(cv2.cvtColor(middle_frame_bgr, cv2.COLOR_BGR2RGB))
     progress(0.2, desc="Detecting Text (EasyOCR)...")
     extracted_text, bbox = extract_text_and_bbox(middle_frame_rgb_pil)
-    if bbox is None:
-        raise gr.Error(extracted_text)
     progress(0.4, desc="Translating Text (Gemini API)...")
     translated_text = translate_text_gemini(extracted_text)
-    if "Error" in translated_text:
-        raise gr.Error(translated_text)
     progress(0.5, desc="Rendering Translated Text Overlay...")
     overlay_stamp_pil, overlay_position_box = render_translated_overlay(middle_frame_rgb_pil, translated_text, bbox)
     overlay_stamp_cv = cv2.cvtColor(np.array(overlay_stamp_pil), cv2.COLOR_RGBA2BGRA)
-    # Create temporary files
     timestamp = int(time.time())
-    temp_dir = tempfile.mkdtemp()
-    temp_silent_path = os.path.join(temp_dir, f"temp_silent_{timestamp}.mp4")
     final_output_path = f"translated_video_{timestamp}.mp4"
-    try:
-        progress(0.6, desc="Composing Silent Video with Overlay...")
-        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
-        out = cv2.VideoWriter(temp_silent_path, fourcc, fps, (frame_width, frame_height))
-        cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
-        frame_idx = 0
-        x_min, y_min, x_max, y_max = overlay_position_box
-        while True:
-            ret, frame = cap.read()
-            if not ret:
-                break
-            # Extract ROI and resize overlay if needed
-            roi = frame[y_min:y_max, x_min:x_max]
-            stamp_h, stamp_w, _ = overlay_stamp_cv.shape
-            roi_h, roi_w, _ = roi.shape
-            if stamp_h != roi_h or stamp_w != roi_w:
-                overlay_resized = cv2.resize(overlay_stamp_cv, (roi_w, roi_h))
-            else:
-                overlay_resized = overlay_stamp_cv
-            # Alpha blend
-            alpha = overlay_resized[:, :, 3] / 255.0
-            alpha_mask = cv2.merge([alpha, alpha, alpha])
-            blended_roi = (roi.astype(float) * (1.0 - alpha_mask) +
-                          overlay_resized[:, :, :3].astype(float) * alpha_mask)
-            frame[y_min:y_max, x_min:x_max] = blended_roi.astype(np.uint8)
-            out.write(frame)
-            frame_idx += 1
-            progress(0.6 + (0.3 * frame_idx / total_frames),
-                    desc=f"Processing frame {frame_idx}/{total_frames}")
-        cap.release()
-        out.release()
-        progress(0.95, desc="Merging Audio and Applying Fade (ffmpeg)...")
-        # Use ffmpeg to merge video and audio
         input_video = ffmpeg.input(temp_silent_path)
-        input_audio = ffmpeg.input(video_path)
         (
             ffmpeg
             .output(
                 input_video.video.filter('fade', type='in', start_time=0, duration=FADE_IN_DURATION_SECONDS),
-                input_audio.audio,
                 final_output_path,
                 vcodec='libx264',
                 acodec='copy',
@@ -370,79 +252,34 @@ def process_video(video_path, progress=gr.Progress()):
             )
             .run(overwrite_output=True, quiet=True)
         )
-        progress(1, desc="Done!")
-        return final_output_path
     except ffmpeg.Error as e:
-        print('ffmpeg stdout:', e.stdout.decode('utf8') if e.stdout else 'None')
-        print('ffmpeg stderr:', e.stderr.decode('utf8') if e.stderr else 'None')
-        raise gr.Error(f"ffmpeg error: {e.stderr.decode('utf8') if e.stderr else 'Unknown error'}")
     finally:
-        # Clean up temporary directory
-        if os.path.exists(temp_dir):
-            shutil.rmtree(temp_dir)
 # --- GRADIO INTERFACE ---
-with gr.Blocks(theme=gr.themes.Soft(), title="Persian Text Translator") as demo:
-    gr.Markdown("# 🎬📝 Persian Text Translator")
-    gr.Markdown("Upload an image or video with English text. The app will detect, translate to Persian, and overlay the text back properly.")
-    with gr.Tabs():
-        with gr.TabItem("📝 Image Translation"):
-            gr.Markdown("Upload an image with English text for Persian translation overlay.")
-            with gr.Row():
-                with gr.Column(scale=1):
-                    image_input = gr.Image(label="Upload Quote Image", type="pil", sources=["upload", "clipboard"])
-                    img_text_output = gr.Textbox(label="Extracted English Text", lines=3, show_copy_button=True)
-                    img_translated_output = gr.Textbox(label="Persian Translation", lines=3, show_copy_button=True)
-                with gr.Column(scale=1):
-                    image_output = gr.Image(label="Translated Image Output", type="pil")
-            image_input.change(
-                fn=process_image,
-                inputs=[image_input],
-                outputs=[img_text_output, img_translated_output, image_output]
-            )
-        with gr.TabItem("🎬 Video Translation"):
-            gr.Markdown("Upload a video with English text. The app will preserve audio and add Persian translation overlay.")
-            with gr.Row():
-                video_input = gr.Video(label="Upload Video")
-                video_output = gr.Video(label="Translated Video Output")
-            translate_button = gr.Button("Translate Video", variant="primary")
-            translate_button.click(
-                fn=process_video,
-                inputs=[video_input],
-                outputs=[video_output]
-            )
     gr.Markdown("---")
-    gr.Markdown("""
-    ### How it works:
-    **Image Mode:**
-    1. Detects English text using OCR
-    2. Translates to Persian using Gemini AI
-    3. Overlays properly formatted RTL Persian text
-    **Video Mode:**
-    1. Analyzes middle frame to detect text location
-    2. Translates English text to Persian
-    3. Applies Persian overlay to all frames with background
-    4. Merges with original audio and adds fade-in effect
-    **Features:**
-    - Proper RTL (Right-to-Left) Persian text rendering
-    - Automatic font sizing to fit available space
-    - Text wrapping for longer translations
-    - Background color sampling for natural overlay
-    - Audio preservation in video mode
-    """)
 if __name__ == "__main__":
     demo.launch(debug=True)

 import os
 import time
 import ffmpeg
 # --- CONFIGURATION ---
+# IMPORTANT: For deployment on Hugging Face, store the Gemini key as a "Secret" rather than in this file.
+# Paste a key here only for local testing, and never commit a real key to the repository.
+API_KEY = "AIzaSyCu-tb3BRDIJjUt6G5ccWmrR51tOY0VZd4"
+PERSIAN_FONT_PATH = "Vazir.ttf"  # Make sure this font file is in your repository
 FADE_IN_DURATION_SECONDS = 1.0
 # --- GLOBAL INITIALIZATION ---
 reader = None
 def initialize_reader():
+    """Initializes the EasyOCR reader if it hasn't been already."""
     global reader
     if reader is None:
         print("Loading EasyOCR model...")
         print("EasyOCR model loaded successfully!")
     return reader
+# --- CORE PROCESSING FUNCTIONS ---
 def extract_text_and_bbox(image: Image.Image):
     """Run OCR on a PIL image and merge every detection into one result.

     Returns a ``(text, bbox)`` pair. ``text`` is the detected strings joined
     by single spaces and ``bbox`` is ``(min_x, min_y, max_x, max_y)`` as ints,
     enclosing all detections. When the reader finds nothing, the pair is
     ``("No text detected in the image.", None)``.
     """
     detections = initialize_reader().readtext(np.array(image))
     if not detections:
         return "No text detected in the image.", None

     fragments = []
     lefts, tops, rights, bottoms = [], [], [], []
     for quad, fragment, _confidence in detections:
         # Each detection's box is four corner points: tl, tr, br, bl.
         top_left, top_right, bottom_right, bottom_left = quad
         fragments.append(fragment)
         lefts.extend((top_left[0], bottom_left[0]))
         tops.extend((top_left[1], top_right[1]))
         rights.extend((top_right[0], bottom_right[0]))
         bottoms.extend((bottom_left[1], bottom_right[1]))

     enclosing_box = (
         int(min(lefts)),
         int(min(tops)),
         int(max(rights)),
         int(max(bottoms)),
     )
     return ' '.join(fragments), enclosing_box
 def translate_text_gemini(text: str) -> str:
     """Translate English text to colloquial Persian using the Gemini API.

     The API key is taken from the ``GEMINI_API_KEY`` environment variable
     when it is set (the error message below tells users to configure it as a
     Space Secret), and otherwise from the module-level ``API_KEY`` constant.

     Args:
         text: English text to translate.

     Returns:
         The stripped Persian translation. Returns
         ``"No valid text to translate."`` when ``text`` is empty, only
         whitespace, or the OCR "no text detected" sentinel. Returns a string
         starting with ``"Error during translation with Gemini:"`` when the
         API call fails.

     Raises:
         gr.Error: If no usable API key is configured.
     """
     # Prefer the environment so a deployed Space never needs the key in source.
     api_key = os.environ.get("GEMINI_API_KEY") or API_KEY
     if not api_key or "YOUR_GEMINI_API_KEY_HERE" in api_key:
         raise gr.Error("GEMINI_API_KEY is not set. Please add it as a Secret in your Hugging Face Space.")

     # Match the OCR sentinel exactly: a substring test would also reject
     # genuine quotes that merely contain the words "No text".
     cleaned = text.strip() if text else ""
     if not cleaned or cleaned == "No text detected in the image.":
         return "No valid text to translate."

     try:
         genai.configure(api_key=api_key)
         model = genai.GenerativeModel('gemini-1.5-flash')
         prompt = f"Translate the following English quotes into Persian. The translation should be colloquial, poetic, concise, and meaningful. Preserve the original message and tone. Avoid literal translations. Provide only the translated Persian text. Quotes: [{text}]"
         response = model.generate_content(prompt)
         return response.text.strip()
     except Exception as e:
         # Callers detect failure by looking for "Error" in the returned string.
         return f"Error during translation with Gemini: {str(e)}"
+# --- TEXT OVERLAY RENDERING ---
+# The wrapping and shaping logic in this function is based on the textoverimage.txt script.
+def render_translated_overlay(original_image: Image.Image, text_to_overlay: str, bbox: tuple) -> Image.Image:
+    """
+    Creates an overlay layer with correctly rendered, wrapped Persian text.
+    This function erases the background area defined by the bbox and draws new text.
+    """
+    # 1. Define the area to work with, adding padding
     padding = 15
     overlay_box = (
         max(0, bbox[0] - padding),
         min(original_image.width, bbox[2] + padding),
         min(original_image.height, bbox[3] + padding)
     )
     overlay_width = overlay_box[2] - overlay_box[0]
     overlay_height = overlay_box[3] - overlay_box[1]
+    # 2. Create the background layer by sampling a color from the original image
     try:
+        # Sample color from just outside the original text box to get a clean background
         sample_x = max(0, int(overlay_box[0]) - 5)
         sample_y = int((overlay_box[1] + overlay_box[3]) / 2)
         bg_color = original_image.getpixel((sample_x, sample_y))
     except (ValueError, IndexError):
+        bg_color = (25, 25, 25) # Fallback color
     overlay_layer = Image.new("RGBA", (overlay_width, overlay_height), bg_color)
     draw = ImageDraw.Draw(overlay_layer)
+    # 3. Check for Font File
+    if not os.path.exists(PERSIAN_FONT_PATH):
+         raise FileNotFoundError(f"Font file not found at '{PERSIAN_FONT_PATH}'. Please upload it to your Space.")
+    # 4. Dynamically find the best font size and wrap the text
     target_width = overlay_width * 0.90
+    target_height = overlay_height * 0.90
     font_size = 100
     final_wrapped_lines = []
     while font_size > 10:
         font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
+        words = text_to_overlay.split()
+        if not words: break
+        raw_lines = []
+        current_line = ""
+        for word in words:
+            test_line = (current_line + " " + word).strip()
+            # To measure width correctly, we MUST reshape it first. This is the key.
+            reshaped_test_line = get_display(arabic_reshaper.reshape(test_line))
+            line_width = draw.textbbox((0, 0), reshaped_test_line, font=font)[2]
+            if line_width <= target_width:
+                current_line = test_line
+            else:
+                raw_lines.append(current_line)
+                current_line = word
+        raw_lines.append(current_line)
+        line_spacing = font_size * 0.3
+        reshaped_for_height_calc = [get_display(arabic_reshaper.reshape(l)) for l in raw_lines]
+        line_heights = [draw.textbbox((0,0), l, font=font)[3] - draw.textbbox((0,0), l, font=font)[1] for l in reshaped_for_height_calc]
+        total_height = sum(line_heights) + (len(raw_lines) - 1) * line_spacing
+        if total_height <= target_height:
+            final_wrapped_lines = raw_lines
             break
         else:
             font_size -= 2
     if not final_wrapped_lines:
+        final_wrapped_lines = [text_to_overlay] # Fallback
+    # 5. Draw the final, wrapped, and correctly shaped text
     final_font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
     line_spacing = font_size * 0.3
+    # Reshape final lines for drawing and calculate total height
+    final_display_lines = [get_display(arabic_reshaper.reshape(l)) for l in final_wrapped_lines]
+    line_heights = [draw.textbbox((0,0), l, font=final_font)[3] - draw.textbbox((0,0), l, font=final_font)[1] for l in final_display_lines]
+    total_text_height = sum(line_heights) + (len(final_display_lines) - 1) * line_spacing
     y_start = (overlay_height - total_text_height) / 2
     current_y = y_start
+    for i, display_line in enumerate(final_display_lines):
+        x_center = overlay_width / 2
+        line_y_center = current_y + line_heights[i] / 2
+        # Use anchor="mm" to perfectly center the text block horizontally and vertically
+        # Draw a subtle shadow for better readability
+        draw.text((x_center + 1, line_y_center + 1), display_line, font=final_font, fill=(0, 0, 0, 180), anchor="mm")
+        # Draw the main text
+        draw.text((x_center, line_y_center), display_line, font=final_font, fill=(255, 255, 255, 255), anchor="mm")
         current_y += line_heights[i] + line_spacing
+    return overlay_layer, overlay_box
+# --- MAIN VIDEO PROCESSING PIPELINE ---
 def process_video(video_path, progress=gr.Progress()):
+    if video_path is None: raise gr.Error("Please upload a video file first.")
     progress(0, desc="Loading Video & Analyzing...")
     cap = cv2.VideoCapture(video_path)
+    if not cap.isOpened(): raise gr.Error("Could not open video file.")
     frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
     frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
     fps = cap.get(cv2.CAP_PROP_FPS)
     total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    # Analyze the middle frame for text
     cap.set(cv2.CAP_PROP_POS_FRAMES, total_frames // 2)
     ret, middle_frame_bgr = cap.read()
+    if not ret: raise gr.Error("Could not read middle frame.")
     middle_frame_rgb_pil = Image.fromarray(cv2.cvtColor(middle_frame_bgr, cv2.COLOR_BGR2RGB))
     progress(0.2, desc="Detecting Text (EasyOCR)...")
     extracted_text, bbox = extract_text_and_bbox(middle_frame_rgb_pil)
+    if bbox is None: raise gr.Error(extracted_text)
     progress(0.4, desc="Translating Text (Gemini API)...")
     translated_text = translate_text_gemini(extracted_text)
+    if "Error" in translated_text: raise gr.Error(translated_text)
     progress(0.5, desc="Rendering Translated Text Overlay...")
     overlay_stamp_pil, overlay_position_box = render_translated_overlay(middle_frame_rgb_pil, translated_text, bbox)
+    # Convert the overlay to a format OpenCV can use (BGRA)
     overlay_stamp_cv = cv2.cvtColor(np.array(overlay_stamp_pil), cv2.COLOR_RGBA2BGRA)
     timestamp = int(time.time())
+    temp_silent_path = f"temp_silent_{timestamp}.mp4"
     final_output_path = f"translated_video_{timestamp}.mp4"
+    progress(0.6, desc="Composing Silent Video with Overlay...")
+    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+    out = cv2.VideoWriter(temp_silent_path, fourcc, fps, (frame_width, frame_height))
+    cap.set(cv2.CAP_PROP_POS_FRAMES, 0) # Rewind video to the beginning
+    frame_idx = 0
+    x_min, y_min, x_max, y_max = overlay_position_box
+    while True:
+        ret, frame = cap.read()
+        if not ret: break
+        # Define the region of interest (ROI) where the overlay will go
+        roi = frame[y_min:y_max, x_min:x_max]
+        # Simple alpha blending
+        alpha = overlay_stamp_cv[:, :, 3] / 255.0
+        alpha_mask = cv2.merge([alpha, alpha, alpha])
+        blended_roi = (roi.astype(float) * (1.0 - alpha_mask) + overlay_stamp_cv[:, :, :3].astype(float) * alpha_mask)
+        frame[y_min:y_max, x_min:x_max] = blended_roi.astype(np.uint8)
+        out.write(frame)
+        frame_idx += 1
+        progress(0.6 + (0.3 * frame_idx / total_frames), desc=f"Processing frame {frame_idx}/{total_frames}")
+    cap.release()
+    out.release()
+    progress(0.95, desc="Merging Audio and Applying Fade (ffmpeg)...")
+    try:
         input_video = ffmpeg.input(temp_silent_path)
+        input_audio = ffmpeg.input(video_path).audio # Select audio stream only
         (
             ffmpeg
             .output(
                 input_video.video.filter('fade', type='in', start_time=0, duration=FADE_IN_DURATION_SECONDS),
+                input_audio,
                 final_output_path,
                 vcodec='libx264',
                 acodec='copy',
             )
             .run(overwrite_output=True, quiet=True)
         )
     except ffmpeg.Error as e:
+        print('ffmpeg stdout:', e.stdout.decode('utf8', errors='ignore'))
+        print('ffmpeg stderr:', e.stderr.decode('utf8', errors='ignore'))
+        raise gr.Error(f"ffmpeg error: {e.stderr.decode('utf8', errors='ignore')}")
     finally:
+        if os.path.exists(temp_silent_path):
+            os.remove(temp_silent_path)
+    progress(1, desc="Done!")
+    return final_output_path
 # --- GRADIO INTERFACE ---
+with gr.Blocks(theme=gr.themes.Soft(), title="Persian Video Quote Translator") as demo:
+    gr.Markdown("# 🎬 Persian Video Quote Translator")
+    gr.Markdown("Upload a short video containing English text. The app will detect the text, replace it with a poetic Persian translation, and preserve the original audio and video duration.")
+    with gr.Row():
+        video_input = gr.Video(label="Upload Video")
+        video_output = gr.Video(label="Translated Video Output")
+    translate_button = gr.Button("Translate Video", variant="primary")
+    translate_button.click(
+        fn=process_video,
+        inputs=[video_input],
+        outputs=[video_output]
+    )
     gr.Markdown("---")
+    gr.Markdown("### How it works:\n1. It analyzes the middle frame to find the text and its location.\n2. It uses the Gemini API to get a high-quality, poetic Persian translation.\n3. It renders the Persian text correctly onto a background that matches the original video.\n4. It composites this new text overlay onto every frame of the video.\n5. Finally, it uses `ffmpeg` to merge the new video with the **original audio** and add a 1-second fade-in effect.")
 if __name__ == "__main__":
     demo.launch(debug=True)