Update app.py

app.py (CHANGED)
@@ -1,25 +1,35 @@
-#
 
 import gradio as gr
 import cv2
 import numpy as np
 from PIL import Image, ImageDraw, ImageFont
-import easyocr
 import google.generativeai as genai
-import arabic_reshaper
-# from bidi.algorithm import get_display # <<< REMOVED THIS LINE
 import os
 import time
 import ffmpeg
 
 # --- CONFIGURATION ---
-API_KEY = os.getenv("GEMINI_API_KEY", "AIzaSyCu-tb3BRDIJjUt6G5ccWmrR51tOY0VZd4")
 PERSIAN_FONT_PATH = "Vazir.ttf"
 FADE_IN_DURATION_SECONDS = 1.0
 
 # --- GLOBAL INITIALIZATION ---
 reader = None
-def initialize_reader():
     """Initializes the EasyOCR reader if it hasn't been already."""
     global reader
     if reader is None:
@@ -28,165 +38,286 @@ def initialize_reader():
         print("EasyOCR model loaded successfully!")
     return reader
 
-# --- CORE
-[…]
     img_array = np.array(image)
     results = ocr_reader.readtext(img_array)
-    if not results:
-[…]
     min_x, min_y = float('inf'), float('inf')
     max_x, max_y = float('-inf'), float('-inf')
-[…]
     for (bbox, text, prob) in results:
-[…]
-        (
-[…]
 
-# ### --- THE DEFINITIVELY CORRECTED TEXT OVERLAY FUNCTION --- ###
-# This version REMOVES `get_display` and uses `arabic_reshaper` only,
-# just like the working image script.
 def render_translated_overlay(original_image: Image.Image, text_to_overlay: str, bbox: tuple) -> (Image.Image, tuple):
     """
-    Creates an overlay
     """
-[…]
-        max(0, bbox[0] - padding),
-        max(0, bbox[1] - padding),
-        min(original_image.width, bbox[2] + padding),
-        min(original_image.height, bbox[3] + padding)
-    )
-    overlay_width = overlay_box[2] - overlay_box[0]
-    overlay_height = overlay_box[3] - overlay_box[1]
 
     try:
-        sample_x = max(0, int(
-        sample_y = int((
         bg_color = original_image.getpixel((sample_x, sample_y))
-    except (ValueError, IndexError):
-        bg_color = (25, 25, 25)
 
     overlay_layer = Image.new("RGBA", (overlay_width, overlay_height), bg_color)
     draw = ImageDraw.Draw(overlay_layer)
 
     if not os.path.exists(PERSIAN_FONT_PATH):
-        raise FileNotFoundError(f"Font file not found at '{PERSIAN_FONT_PATH}'. Please upload it
 
     target_width = overlay_width * 0.90
     target_height = overlay_height * 0.90
     font_size = 100
     final_wrapped_lines = []
 
     while font_size > 10:
         font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
-[…]
-            # To measure width, we MUST reshape it first. This is the key.
-            # We DO NOT use get_display().
-            reshaped_test_line = arabic_reshaper.reshape(test_line)
-            line_width = draw.textbbox((0, 0), reshaped_test_line, font=font)[2]
-
-            if line_width <= target_width:
-                current_line = test_line
-            else:
-                raw_lines.append(current_line)
-                current_line = word
-        raw_lines.append(current_line)
-
-        line_spacing = font_size * 0.3
-        reshaped_for_height_calc = [arabic_reshaper.reshape(l) for l in raw_lines]
-        line_heights = [draw.textbbox((0,0), l, font=font)[3] - draw.textbbox((0,0), l, font=font)[1] for l in reshaped_for_height_calc]
-        total_height = sum(line_heights) + (len(raw_lines) - 1) * line_spacing
-
-        if total_height <= target_height:
             final_wrapped_lines = raw_lines
             break
         else:
             font_size -= 2
 
     if not final_wrapped_lines:
-[…]
 
     final_font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
-[…]
-    # Reshape the final lines for drawing, WITHOUT get_display()
     final_reshaped_lines = [arabic_reshaper.reshape(l) for l in final_wrapped_lines]
-[…]
-    total_text_height = sum(
-[…]
     y_start = (overlay_height - total_text_height) / 2
     current_y = y_start
-[…]
     for i, reshaped_line in enumerate(final_reshaped_lines):
-[…]
-        draw.text((
-[…]
-        current_y += line_heights[i] + line_spacing
 
-[…]
 
-[…]
-    if video_path is None: raise gr.Error("Please upload a video file first.")
 
-    progress(0, desc="Loading Video & Analyzing...")
     cap = cv2.VideoCapture(video_path)
     if not cap.isOpened(): raise gr.Error("Could not open video file.")
-[…]
-    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-    fps = cap.get(cv2.CAP_PROP_FPS)
-    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-[…]
     cap.set(cv2.CAP_PROP_POS_FRAMES, total_frames // 2)
     ret, middle_frame_bgr = cap.read()
     if not ret: raise gr.Error("Could not read middle frame.")
     middle_frame_rgb_pil = Image.fromarray(cv2.cvtColor(middle_frame_bgr, cv2.COLOR_BGR2RGB))
 
-[…]
-    if bbox is None:
 
-    progress(0.
-    translated_text = translate_text_gemini(extracted_text)
-    if "Error" in translated_text: raise gr.Error(translated_text)
-
-    progress(0.5, desc="Rendering Translated Text Overlay...")
     overlay_stamp_pil, overlay_position_box = render_translated_overlay(middle_frame_rgb_pil, translated_text, bbox)
     overlay_stamp_cv = cv2.cvtColor(np.array(overlay_stamp_pil), cv2.COLOR_RGBA2BGRA)
 
@@ -194,39 +325,38 @@ def process_video(video_path, progress=gr.Progress()):
     temp_silent_path = f"temp_silent_{timestamp}.mp4"
     final_output_path = f"translated_video_{timestamp}.mp4"
 
-    progress(0.6, desc="Composing
     fourcc = cv2.VideoWriter_fourcc(*'mp4v')
     out = cv2.VideoWriter(temp_silent_path, fourcc, fps, (frame_width, frame_height))
-[…]
     cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
     frame_idx = 0
     x_min, y_min, x_max, y_max = overlay_position_box
-[…]
     while True:
         ret, frame = cap.read()
         if not ret: break
-[…]
         roi = frame[y_min:y_max, x_min:x_max]
-[…]
         alpha_mask = cv2.merge([alpha, alpha, alpha])
-        blended_roi = (roi.astype(float) * (1.0 - alpha_mask) +
         frame[y_min:y_max, x_min:x_max] = blended_roi.astype(np.uint8)
-[…]
         out.write(frame)
         frame_idx += 1
-        progress(0.6 + (0.
-[…]
     cap.release(); out.release()
 
-    progress(0.95, desc="Merging Audio and Applying Fade
     try:
         input_video = ffmpeg.input(temp_silent_path)
         input_audio = ffmpeg.input(video_path).audio
-[…]
         (ffmpeg.output(
             input_video.video.filter('fade', type='in', start_time=0, duration=FADE_IN_DURATION_SECONDS),
             input_audio, final_output_path, vcodec='libx264', acodec='copy', shortest=None
         ).run(overwrite_output=True, quiet=True))
     except ffmpeg.Error as e:
         print('ffmpeg stdout:', e.stdout.decode('utf8', errors='ignore'))
         print('ffmpeg stderr:', e.stderr.decode('utf8', errors='ignore'))
@@ -235,21 +365,122 @@ def process_video(video_path, progress=gr.Progress()):
         if os.path.exists(temp_silent_path): os.remove(temp_silent_path)
 
     progress(1, desc="Done!")
-    return final_output_path
-[…]
-# ---
-[…]
     with gr.Row():
-[…]
 
-    translate_button.click(
-[…]
 
     gr.Markdown("---")
-    gr.Markdown(
-[…]
 
 if __name__ == "__main__":
-[…]
+# advanced_video_transcreator_v3.4.py
 
 import gradio as gr
 import cv2
 import numpy as np
 from PIL import Image, ImageDraw, ImageFont
 import google.generativeai as genai
+import arabic_reshaper
 import os
 import time
 import ffmpeg
+import json
+import easyocr
+import requests
+import io
 
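+# NOTE: json.dumps is monkey-patched below with ensure_ascii=False forced on, so that
+# Persian text is rendered readably (e.g. in the Gradio JSON panel) instead of as
+# \uXXXX escape sequences.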
+original_dumps = json.dumps
+def custom_dumps(*args, **kwargs):
+    kwargs['ensure_ascii'] = False
+    return original_dumps(*args, **kwargs)
+
+json.dumps = custom_dumps
 # --- CONFIGURATION ---
+API_KEY = os.getenv("GEMINI_API_KEY", "AIzaSyCu-tb3BRDIJjUt6G5ccWmrR51tOY0VZd4")  # Replace with your actual API key or use os.getenv
+ONE_API_KEY = os.getenv("ONE_API_KEY", "268976:66f4f58a2a905")  # Key for the Instagram download service
 PERSIAN_FONT_PATH = "Vazir.ttf"
 FADE_IN_DURATION_SECONDS = 1.0
 
 # --- GLOBAL INITIALIZATION ---
 reader = None
+def initialize_easyocr_reader():
     """Initializes the EasyOCR reader if it hasn't been already."""
     global reader
     if reader is None:
         […]
         print("EasyOCR model loaded successfully!")
     return reader
 
+# --- CORE AI AND VIDEO FUNCTIONS ---
+
+def analyze_and_transcreate_with_gemini(video_path: str, english_caption: str, progress: gr.Progress):
+    """
+    Analyzes a video using the new comprehensive "Transcreation" prompt and extracts the result.
+    This single call performs analysis, translation, and caption generation, incorporating the user-provided English caption.
+    """
+    if not API_KEY or API_KEY == "YOUR_GEMINI_API_KEY":
+        raise gr.Error("GEMINI_API_KEY is not set.")
+
+    try:
+        genai.configure(api_key=API_KEY)
+        model = genai.GenerativeModel('gemini-1.5-flash')
+
+        progress(0.2, desc="[1/4] Performing deep analysis & transcreation with Gemini...")
+
+        ### MODIFIED PROMPT (Requirements 1, 2, 3: Author Name, Category Definitions, English Caption) ###
+        prompt_template = f"""
+        Objective: Analyze the provided video (containing text) across all modalities (visuals, audio, existing text) and the user-provided English caption to generate a superior Persian translation and a suitable Instagram caption. The translation must be contextually perfect, stylistically appropriate, and culturally resonant, avoiding the feel of a literal or AI-driven translation. The caption should be concise, engaging, and aligned with the video's mood, content, and the provided English caption, without hashtags.
+
+        User-Provided English Caption: "{english_caption if english_caption else 'No caption provided.'}"
+
+        Instructions:
+
+        1. **Multi-Modal Analysis**: Perform a deep analysis of the video. Synthesize information from all three channels: visual, audio, and textual. Additionally, incorporate the user-provided English caption to inform the tone, context, and intent of the Instagram caption.
+        2. **Isolate Essential Text**: Use OCR to find all text, but identify only the **core, persistent message** intended for the audience. **You MUST INCLUDE any author, poet, or famous person's name (e.g., '- Rumi') in the essential text if present.** **You MUST IGNORE temporary text such as usernames that flash on screen, watermarks, or English subtitles at the bottom of the frame.** The essential text is typically the main quote or statement that stays on screen.
+        3. **Category Selection**: Choose the most appropriate content category based on the video's text, audio, and visuals. Use the following definitions:
+           - **MEME_HUMOR**: Videos with a white text box at the top, often containing phrases like "POV", "Me when...", or similar humorous, casual text, typically with playful or comedic intent.
+           - **COLD_MOTIVATIONAL**: Videos with dark themes (visuals or mood) and intense, strong music that evokes motivation or a driven mindset.
+           - **WISE_QUOTE**: Videos with peaceful, calm music and literary, poetic grammar, often quoting famous figures.
+           - **TWITTER_JOKE**: Videos with a casual, friendly, simple text tone, accompanied by funny or lighthearted music.
+        4. **Synthesize and Guide**: Use the visual, audio, and textual analysis, together with the English caption (if provided), to define the exact emotional and stylistic parameters for the translation and Instagram caption.
+        5. **Instagram Caption**: Generate a concise, engaging Instagram caption in Persian that reflects the video's mood, content, cultural context, and the tone of the English caption (if provided). The caption should be standalone (not a direct translation of the text or English caption) and suitable for posting without hashtags.
+        6. **Format Output**: Respond ONLY with a single, raw JSON object as specified below. Do not include any explanatory text before or after the JSON.
+        7. **Author Formatting**: If an author's name is present (e.g., "- Rumi"), format the final translation so the author's name (in Persian) is on its own, separate line at the very end.
+
+        JSON Structure:
+        {{
+            "asset_id": "video_frame_01",
+            "content_category": "CHOOSE ONE: [MEME_HUMOR, COLD_MOTIVATIONAL, WISE_QUOTE, TWITTER_JOKE]",
+            "source_language": "en",
+            "target_language": "fa",
+            "comprehensive_analysis": {{
+                "visual_context": {{
+                    "mood_and_aesthetics": "Describe the emotional mood conveyed by the visuals. (e.g., 'Somber and melancholic, uses slow zooms and a desaturated color palette to evoke a sense of loneliness.')",
+                    "cinematic_style": "Describe the filming style. (e.g., 'UGC-style phone recording, shaky cam, feels raw and authentic.')",
+                    "subject_matter": "Briefly describe what is happening visually, independent of the text. (e.g., 'A person is walking alone on a rainy street at night.')"
+                }},
+                "audio_context": {{
+                    "music_analysis": "Describe the music's genre, tempo, and emotional impact. (e.g., 'Slow, ambient piano music, creates a feeling of introspection and sadness.')",
+                    "sfx_analysis": "Describe any relevant sound effects. (e.g., 'The sound of rain and distant city ambiance is prominent, enhancing the feeling of isolation.')"
+                }},
+                "textual_context": {{
+                    "full_text_detected": "The complete text from OCR, including ALL parts.",
+                    "essential_text": "The core message INCLUDING author attribution if present (e.g., 'The wound is the place where the light enters you - Rumi'). THIS IS THE MOST IMPORTANT FIELD. Remember to exclude temporary usernames and subtitles."
+                }}
+            }},
+            "transcreation_directive": {{
+                "target_emotional_impact": "Synthesize the analysis above to define the precise emotion the Persian translation should evoke. (e.g., 'The translation should feel like a quiet, personal realization; a mix of sadness and acceptance, not dramatic grief.')",
+                "stylistic_guidance": {{
+                    "formality": "CHOOSE ONE: [FORMAL_LITERARY, MODERN_POETIC, COLLOQUIAL_CASUAL, PROFESSIONAL_INFORMATIVE]",
+                    "register": "Describe the linguistic 'flavor'. (e.g., 'Use sophisticated but natural vocabulary. Avoid slang but don't be overly academic. It should sound like a thoughtful, well-spoken friend.')"
+                }},
+                "cultural_adaptation_notes": "Provide guidance on adapting cultural nuances for a Persian audience. (e.g., 'The English concept of 'just being okay with it' can be translated to a more poetic Persian concept of resignation, like «کنار آمدن» or «پذیرفتن».')"
+            }},
+            "final_output": {{
+                "recommended_translation": "ONLY the final, high-quality Persian translation goes here. It should be the direct result of following the transcreation_directive.",
+                "translation_rationale": "Briefly explain WHY this translation was chosen, referencing the analysis.",
+                "instagram_caption": "A concise, engaging Persian caption for the Instagram post, without hashtags, reflecting the video's mood, content, and the English caption (if provided)."
+            }}
+        }}
+        """
+
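+        # Upload the video to the Gemini Files API and poll until server-side
+        # processing completes before prompting against it.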
+        video_file = genai.upload_file(path=video_path)
+        while video_file.state.name == "PROCESSING":
+            time.sleep(2)
+            video_file = genai.get_file(video_file.name)
+
+        if video_file.state.name == "FAILED":
+            raise gr.Error("Gemini file upload failed.")
+
+        response = model.generate_content([prompt_template, video_file], request_options={"timeout": 180})
+        genai.delete_file(video_file.name)
+
+        analysis_json_text = response.text.strip()
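+        # Strip a Markdown ```json ... ``` fence if Gemini wrapped its response in one.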
+        if analysis_json_text.startswith("```json"):
+            analysis_json_text = analysis_json_text[7:-3].strip()
+
+        analysis_data = json.loads(analysis_json_text)
+
+        essential_text = analysis_data.get("comprehensive_analysis", {}).get("textual_context", {}).get("essential_text", "")
+        final_translation = analysis_data.get("final_output", {}).get("recommended_translation", "")
+        instagram_caption = analysis_data.get("final_output", {}).get("instagram_caption", "")
+
+        if not essential_text or not final_translation or not instagram_caption:
+            raise gr.Error("Gemini analysis did not return the essential text, final translation, or Instagram caption.")
+
+        return analysis_data, essential_text, final_translation, instagram_caption
+
+    except json.JSONDecodeError:
+        error_message = f"Gemini returned invalid JSON. The response was:\n{response.text.strip()}"
+        raise gr.Error(error_message)
+    except Exception as e:
+        error_message = f"An error occurred with the Gemini API: {str(e)}"
+        raise gr.Error(error_message)
+
+def detect_white_header_box(image: Image.Image, progress: gr.Progress):
+    """
+    Detects if a prominent white header box exists at the top of the video.
+    Returns the bounding box of this header if found, otherwise returns None.
+    """
+    progress(0.35, desc="[2/4] Checking for white header box...")
+    img_array = np.array(image.convert('L'))  # Convert to grayscale
+    frame_width, frame_height = image.size
+
+    # Analyze the top 25% of the image
+    scan_height = int(frame_height * 0.25)
+    top_section = img_array[0:scan_height, :]
+
+    # Threshold the image to find very light areas (potential white box)
+    _, thresh = cv2.threshold(top_section, 230, 255, cv2.THRESH_BINARY)
+
+    # Find contours
+    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+
+    for cnt in contours:
+        x, y, w, h = cv2.boundingRect(cnt)
+        # Check if the contour is a large, wide rectangle typical of a header
+        if w > frame_width * 0.8 and h > frame_height * 0.05:
+            print(f"Detected potential white header box of size {w}x{h}.")
+            # Give it a little padding
+            padding_x = int(frame_width * 0.02)
+            padding_y = int(frame_height * 0.02)
+            final_bbox = (
+                max(0, x - padding_x), max(0, y - padding_y),
+                min(frame_width, x + w + padding_x), min(frame_height, y + h + padding_y)
+            )
+            print(f"Using white header as final bounding box: {final_bbox}")
+            return final_bbox
+
+    print("No dominant white header box found. Proceeding with standard text detection.")
+    return None
+
+def get_bbox_for_essential_text(image: Image.Image, essential_text: str, progress: gr.Progress):
+    """
+    Uses EasyOCR to find the precise bounding box for the essential text identified by Gemini.
+    """
+    progress(0.4, desc="[2/4] Locating text with EasyOCR...")
+    ocr_reader = initialize_easyocr_reader()
     img_array = np.array(image)
     results = ocr_reader.readtext(img_array)
+    if not results: raise gr.Error("EasyOCR could not detect any text on the frame.")
+
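+    # Heuristic match: compares sets of alphanumeric *characters* (not whole words), so
+    # any OCR line sharing characters with Gemini's essential text is folded into the bbox.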
+    essential_words = set(char.lower() for char in essential_text if char.isalnum())
     min_x, min_y = float('inf'), float('inf')
     max_x, max_y = float('-inf'), float('-inf')
+    found_match = False
+
+    print(f"Gemini's essential text: '{essential_text}'")
+    print("EasyOCR Results:")
     for (bbox, text, prob) in results:
+        print(f"- Detected: '{text}'")
+        text_words = set(char.lower() for char in text if char.isalnum())
+        if len(essential_words.intersection(text_words)) > 0:
+            found_match = True
+            (tl, tr, br, bl) = bbox
+            min_x = min(min_x, tl[0], bl[0])
+            min_y = min(min_y, tl[1], tr[1])
+            max_x = max(max_x, tr[0], br[0])
+            max_y = max(max_y, bl[1], br[1])
+            print(f" ^-- Matched! Updating consolidated bbox.")
+
+    if not found_match: raise gr.Error(f"EasyOCR ran but could not locate the essential text '{essential_text}' on the video frame.")
+
+    original_height = max_y - min_y
+    height_reduction = original_height * 0.10
+    min_y += height_reduction / 2
+    max_y -= height_reduction / 2
+    print(f"Bbox height adjusted: Reduced by {height_reduction:.2f} pixels for a tighter fit.")
+
+    frame_width, frame_height = image.size
+    padding_x = int(frame_width * 0.02)
+    padding_y = int(frame_height * 0.02)
+    final_bbox = (
+        max(0, int(min_x) - padding_x), max(0, int(min_y) - padding_y),
+        min(frame_width, int(max_x) + padding_x), min(frame_height, int(max_y) + padding_y)
+    )
+    print(f"Final consolidated bbox (x1, y1, x2, y2): {final_bbox}")
+    return final_bbox
 
 def render_translated_overlay(original_image: Image.Image, text_to_overlay: str, bbox: tuple) -> (Image.Image, tuple):
     """
+    Creates an overlay with adaptive color and robust, auto-fitting wrapped Persian text.
     """
+    overlay_width = bbox[2] - bbox[0]
+    overlay_height = bbox[3] - bbox[1]
 
     try:
+        sample_x = max(0, int(bbox[0]) - 5)
+        sample_y = int((bbox[1] + bbox[3]) / 2)
         bg_color = original_image.getpixel((sample_x, sample_y))
+    except (ValueError, IndexError): bg_color = (25, 25, 25)
 
     overlay_layer = Image.new("RGBA", (overlay_width, overlay_height), bg_color)
     draw = ImageDraw.Draw(overlay_layer)
 
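+    # Perceived brightness from the BT.601 luma weights (0.299 R, 0.587 G, 0.114 B),
+    # used to pick a text color that contrasts with the sampled background.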
+    luminance = (0.299 * bg_color[0] + 0.587 * bg_color[1] + 0.114 * bg_color[2])
+    if luminance > 128:
+        text_color, shadow_color = (0, 0, 0, 255), (200, 200, 200, 100)
+        print("Light background detected. Using BLACK text.")
+    else:
+        text_color, shadow_color = (255, 255, 255, 255), (0, 0, 0, 180)
+        print("Dark background detected. Using WHITE text.")
+
     if not os.path.exists(PERSIAN_FONT_PATH):
+        raise FileNotFoundError(f"Font file not found at '{PERSIAN_FONT_PATH}'. Please upload it.")
 
     target_width = overlay_width * 0.90
     target_height = overlay_height * 0.90
     font_size = 100
     final_wrapped_lines = []
+    raw_lines = text_to_overlay.split('\n')
 
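+    # Auto-fit: shrink the font until every reshaped line fits within 90% of the overlay box.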
     while font_size > 10:
         font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
+        max_line_width = 0
+        reshaped_lines_for_calc = [arabic_reshaper.reshape(l) for l in raw_lines]
+        for line in reshaped_lines_for_calc:
+            max_line_width = max(max_line_width, font.getlength(line))
+        line_heights = [font.getbbox(l)[3] for l in reshaped_lines_for_calc if l]
+        total_height = sum(line_heights) + (len(raw_lines) - 1) * (font_size * 0.3)
+        if total_height <= target_height and max_line_width <= target_width:
             final_wrapped_lines = raw_lines
             break
         else:
             font_size -= 2
 
     if not final_wrapped_lines:
+        font_size = 10
+        final_wrapped_lines = raw_lines
+        print("Warning: Text was too long to fit perfectly. Using minimum font size.")
 
     final_font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
+    print(f"Final font size: {font_size}px")
     final_reshaped_lines = [arabic_reshaper.reshape(l) for l in final_wrapped_lines]
+    line_heights_render = [final_font.getbbox(l)[3] for l in final_reshaped_lines]
+    total_text_height = sum(line_heights_render) + (len(final_reshaped_lines) - 1) * (font_size * 0.3)
     y_start = (overlay_height - total_text_height) / 2
     current_y = y_start
     for i, reshaped_line in enumerate(final_reshaped_lines):
+        line_width = final_font.getlength(reshaped_line)
+        x_position = (overlay_width - line_width) / 2
+        draw.text((x_position + 1, current_y + 1), reshaped_line, font=final_font, fill=shadow_color)
+        draw.text((x_position, current_y), reshaped_line, font=final_font, fill=text_color)
+        current_y += line_heights_render[i] + (font_size * 0.3)
+    return overlay_layer, bbox
 
+# --- MAIN VIDEO PROCESSING PIPELINE ---
+def process_video(video_path, english_caption, progress=gr.Progress()):
+    if video_path is None: raise gr.Error("Please upload or download a video file first.")
 
+    progress(0, desc="Starting process...")
+    analysis_data, essential_text, translated_text, instagram_caption = analyze_and_transcreate_with_gemini(video_path, english_caption, progress)
 
     cap = cv2.VideoCapture(video_path)
     if not cap.isOpened(): raise gr.Error("Could not open video file.")
+    frame_width, frame_height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+    fps, total_frames = cap.get(cv2.CAP_PROP_FPS), int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
     cap.set(cv2.CAP_PROP_POS_FRAMES, total_frames // 2)
     ret, middle_frame_bgr = cap.read()
     if not ret: raise gr.Error("Could not read middle frame.")
     middle_frame_rgb_pil = Image.fromarray(cv2.cvtColor(middle_frame_bgr, cv2.COLOR_BGR2RGB))
 
+    # Prioritize white header box detection
+    bbox = detect_white_header_box(middle_frame_rgb_pil, progress)
+    if bbox is None:
+        # Fall back to the original EasyOCR method if no header is found
+        bbox = get_bbox_for_essential_text(middle_frame_rgb_pil, essential_text, progress)
 
+    progress(0.5, desc="[3/4] Rendering translated text overlay...")
     overlay_stamp_pil, overlay_position_box = render_translated_overlay(middle_frame_rgb_pil, translated_text, bbox)
     overlay_stamp_cv = cv2.cvtColor(np.array(overlay_stamp_pil), cv2.COLOR_RGBA2BGRA)
 
     […]
     temp_silent_path = f"temp_silent_{timestamp}.mp4"
     final_output_path = f"translated_video_{timestamp}.mp4"
 
+    progress(0.6, desc="[4/4] Composing video with overlay...")
     fourcc = cv2.VideoWriter_fourcc(*'mp4v')
     out = cv2.VideoWriter(temp_silent_path, fourcc, fps, (frame_width, frame_height))
     cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
     frame_idx = 0
     x_min, y_min, x_max, y_max = overlay_position_box
     while True:
         ret, frame = cap.read()
         if not ret: break
         roi = frame[y_min:y_max, x_min:x_max]
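+        # If the ROI and the overlay stamp differ in size (e.g. off-by-one rounding),
+        # resize the stamp to match the ROI exactly before blending.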
+        if roi.shape[:2] != (overlay_stamp_cv.shape[0], overlay_stamp_cv.shape[1]):
+            h, w = roi.shape[:2]
+            resized_overlay = cv2.resize(overlay_stamp_cv, (w, h))
+        else: resized_overlay = overlay_stamp_cv
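+        # Standard alpha compositing, out = background * (1 - alpha) + overlay * alpha,
+        # with the overlay's alpha channel scaled to [0, 1].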
+        alpha = resized_overlay[:, :, 3] / 255.0
         alpha_mask = cv2.merge([alpha, alpha, alpha])
+        blended_roi = (roi.astype(float) * (1.0 - alpha_mask) + resized_overlay[:, :, :3].astype(float) * alpha_mask)
         frame[y_min:y_max, x_min:x_max] = blended_roi.astype(np.uint8)
         out.write(frame)
         frame_idx += 1
+        progress(0.6 + (0.35 * frame_idx / total_frames), desc=f"Processing frame {frame_idx}/{total_frames}")
     cap.release(); out.release()
 
+    progress(0.95, desc="Merging Audio and Applying Fade...")
     try:
         input_video = ffmpeg.input(temp_silent_path)
         input_audio = ffmpeg.input(video_path).audio
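+        # The video stream is re-encoded with libx264 (adding the fade-in filter) while
+        # the original audio track is stream-copied unchanged.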
         (ffmpeg.output(
             input_video.video.filter('fade', type='in', start_time=0, duration=FADE_IN_DURATION_SECONDS),
             input_audio, final_output_path, vcodec='libx264', acodec='copy', shortest=None
         ).run(overwrite_output=True, quiet=True))
+
     except ffmpeg.Error as e:
         print('ffmpeg stdout:', e.stdout.decode('utf8', errors='ignore'))
         print('ffmpeg stderr:', e.stderr.decode('utf8', errors='ignore'))
     […]
         if os.path.exists(temp_silent_path): os.remove(temp_silent_path)
 
     progress(1, desc="Done!")
+    return final_output_path, analysis_data, instagram_caption
+
+# --- INSTAGRAM DOWNLOADER FUNCTION ---
+def download_instagram_video(ig_url: str, progress: gr.Progress = None):
+    """Fetch video from Instagram post using One-API and save it locally."""
+    if not ig_url:
+        raise gr.Error("Please provide an Instagram URL.")
+    if not ONE_API_KEY:
+        raise gr.Error("ONE_API_KEY is not set for Instagram downloads.")
+
+    if progress is not None:
+        progress(0, desc="Downloading from Instagram...")
+    try:
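+        # NOTE: assumes a URL of the form https://www.instagram.com/p/<shortcode>/
+        # (with a trailing slash); other URL shapes would need more robust parsing.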
+        shortcode = ig_url.split("/")[-2]
+        url_one = "https://api.one-api.ir/instagram/v1/post/?shortcode=" + shortcode
+        headers = {
+            "accept": "application/json",
+            "one-api-token": ONE_API_KEY,
+            "Content-Type": "application/json"
+        }
+        response = requests.get(url_one, headers=headers, timeout=30)
+        response.raise_for_status()
+
+        result = response.json().get("result", {})
+        media_list = result.get('media', [])
+
+        if not media_list:
+            raise ValueError("No media found in the API response.")
+
+        # Find the first video URL in the media list
+        video_url = None
+        for media_item in media_list:
+            if media_item.get("type") == "video":
+                video_url = media_item.get("url")
+                break
+
+        if not video_url:
+            raise ValueError("API response did not contain a direct video URL.")
+
+        if progress is not None:
+            progress(0.5, desc="Found video link. Downloading content...")
+        video_response = requests.get(video_url, stream=True, timeout=60)
+        video_response.raise_for_status()
+
+        # Save the video to a temporary file
+        timestamp = int(time.time())
+        local_filename = f"ig_download_{timestamp}.mp4"
+        with open(local_filename, 'wb') as f:
+            for chunk in video_response.iter_content(chunk_size=8192):
+                f.write(chunk)
+
+        print(f"Instagram video successfully downloaded to {local_filename}")
+        if progress is not None:
+            progress(1, desc="Download complete!")
+        return local_filename
+
+    except requests.exceptions.RequestException as e:
+        raise gr.Error(f"Network error while downloading from Instagram: {str(e)}")
+    except (ValueError, KeyError) as e:
+        print(f"API parsing error: {response.text}")
+        raise gr.Error(f"Could not process the Instagram API response: {str(e)}")
+    except Exception as e:
+        raise gr.Error(f"An unexpected error occurred during Instagram download: {str(e)}")
+
+# --- GRADIO INTERFACE (Updated) ---
+with gr.Blocks(theme=gr.themes.Soft(), title="Advanced Video Transcreator") as demo:
+    gr.Markdown("# 🎬 Advanced Video Transcreator v3.4")
+    gr.Markdown(
+        "**This version uses a powerful multi-modal prompt for superior, context-aware 'Transcreation'.**\n\n"
+        "Upload a short video with English text, or provide an Instagram URL and an optional English caption. Clicking 'Download from URL' will download and automatically process the video. The app will analyze the video's mood, style, and caption to generate a perfectly integrated Persian translation and an Instagram caption. Author names (e.g., '- Rumi') are included in the translation and overlaid on a separate line."
+    )
+
     with gr.Row():
+        with gr.Column(scale=2):
+            video_input = gr.Video(label="Upload Video or Use URL Below")
+            with gr.Row():
+                ig_url_input = gr.Textbox(label="Instagram Post URL", placeholder="e.g., https://www.instagram.com/p/C1a2b3Y4deF/")
+                english_caption_input = gr.Textbox(label="English Caption (Optional)", placeholder="e.g., A moment of reflection with Rumi's wisdom")
+            download_button = gr.Button("Download from URL")
+        with gr.Column(scale=3):
+            video_output = gr.Video(label="Translated Video Output")
+            caption_output = gr.Textbox(label="Instagram Caption (No Hashtags)", lines=3, interactive=False)
+            json_output = gr.JSON(label="Gemini Transcreation Analysis")
+
+    translate_button = gr.Button("Analyze and Transcreate Video", variant="primary")
+
+    # Define the logic flow
+    def chain_download_and_process(ig_url, english_caption):
+        """Chains Instagram download with video processing."""
+        video_path = download_instagram_video(ig_url)
+        return process_video(video_path, english_caption)
+
+    download_button.click(
+        fn=chain_download_and_process,
+        inputs=[ig_url_input, english_caption_input],
+        outputs=[video_output, json_output, caption_output]
+    )
 
+    translate_button.click(
+        fn=process_video,
+        inputs=[video_input, english_caption_input],
+        outputs=[video_output, json_output, caption_output]
+    )
 
     gr.Markdown("---")
+    gr.Markdown(
+        "### How it works:\n"
+        "1. **Gemini Transcreation:** The video and optional English caption are sent to Gemini for a deep, multi-modal analysis. Gemini is specifically instructed to **include author names** (e.g., '- Rumi') in the essential text, **ignore temporary text** (like usernames or subtitles), and generate a Persian Instagram caption based on the video and caption input.\n"
+        "2. **Category Classification:** The app selects a category (MEME_HUMOR, COLD_MOTIVATIONAL, WISE_QUOTE, TWITTER_JOKE) based on text, audio, and visuals, using clear definitions for accurate translation.\n"
+        "3. **Smart BBox Detection:** The app first checks for a **prominent white header box**. If found, it uses that for a clean overlay. If not, it falls back to `EasyOCR` to find the *exact pixel location* of the essential text Gemini identified.\n"
+        "4. **Render & Composite:** The Persian text, including author names on a separate line, is rendered with **adaptive color** inside the detected bounding box, with a font size that's **guaranteed to fit**, and placed precisely over the original.\n"
+        "5. **Finalize with Fade-In:** The original audio is merged back into the new video, and a **1-second fade-in** is applied using `ffmpeg`.\n"
+        "6. **Instagram Caption:** A concise, culturally appropriate caption is generated, incorporating the English caption (if provided), and displayed for use with the translated video."
+    )
 
 if __name__ == "__main__":
+    if not os.path.exists(PERSIAN_FONT_PATH):
+        print(f"WARNING: Font file '{PERSIAN_FONT_PATH}' not found. The app will likely fail. Please ensure it's in the same directory.")
+    demo.launch(debug=True)