kavehtaheri committed
Commit 606c838 · verified · 1 Parent(s): 14c703e

Update app.py

Files changed (1)
  1. app.py +272 -134
app.py CHANGED
@@ -7,57 +7,42 @@ import numpy as np
  import google.generativeai as genai
  import arabic_reshaper
  import os
- import subprocess
  import tempfile
- import shutil
- import glob
 
  # --- CONFIGURATION ---
- # IMPORTANT: Replace with your actual Gemini API key
  api_key = "AIzaSyAKI92YawOKQ1-HRLmvaryMEWk_y4alJgA"
- # Ensure the Vazir.ttf font file is in the same directory as this script
  PERSIAN_FONT_PATH = "Vazir.ttf"
 
  # --- GLOBAL INITIALIZATION ---
- # Lazily initialize the OCR reader to avoid loading it on script import
  reader = None
- translation_cache = {}
 
  def initialize_reader():
      """Initializes the EasyOCR reader if it hasn't been already."""
      global reader
-     # Output video paths need to persist outside the temporary directory for Gradio display.
-
      if reader is None:
          print("Loading EasyOCR model... (This may take a moment on first run)")
-         # We only need to detect English, as we are translating from it.
          reader = easyocr.Reader(['en'], gpu=False, verbose=False)
          print("EasyOCR model loaded successfully!")
      return reader
 
- # --- CORE FUNCTIONS ---
 
  def extract_text_and_bbox(image: Image.Image):
      """
      Extracts text from a PIL Image and calculates a single consolidated
-     bounding box for all text found. Uses resized image for faster OCR.
      """
      if image is None:
          return "Please upload an image first.", None
 
      try:
-         # Resize for faster OCR (max width 640)
-         ocr_width = 640
-         if image.width > ocr_width:
-             scale_factor = ocr_width / image.width
-             ocr_image = image.resize((ocr_width, int(image.height * scale_factor)))
-         else:
-             ocr_image = image
-             scale_factor = 1.0
-
          ocr_reader = initialize_reader()
-         img_array = np.array(ocr_image)
          results = ocr_reader.readtext(img_array)
 
          if not results:
@@ -76,18 +61,13 @@ def extract_text_and_bbox(image: Image.Image):
              max_y = max(max_y, bl[1], br[1])
 
          extracted_text = ' '.join(text_parts)
-         # Scale bbox back to original size
-         consolidated_bbox = (
-             int(min_x / scale_factor), int(min_y / scale_factor),
-             int(max_x / scale_factor), int(max_y / scale_factor)
-         )
 
          return extracted_text, consolidated_bbox
 
      except Exception as e:
          return f"Error processing image with OCR: {str(e)}", None
 
-
  def translate_text_gemini(text: str) -> str:
      """Translates text to colloquial Persian using the Gemini API."""
      if not text or "No text" in text or "Error" in text or "Please upload" in text:
@@ -96,18 +76,13 @@ def translate_text_gemini(text: str) -> str:
      try:
          genai.configure(api_key=api_key)
          model = genai.GenerativeModel('gemini-1.5-flash')
-         prompt =f"Translate the following English quotes into Persian, rephrasing only minimally if absolutely necessary for natural poetic flow, but strictly preserving the original meaning, intent, purpose, and nuances without any alterations or additions that could change the core message. Ensure the Persian versions are concise (under 20 words), deep, touching, poetic, and profound, using idiomatic Persian that evokes wisdom or inspiration while staying faithful to the source. Additionally, guarantee suitable grammar and natural sentence structure in Persian for smooth readability by native speakers, and ensure the translation conveys clear, substantive meaning that stands independently beyond its poetic tone (i.e., the wisdom or insight should be immediately understandable without relying solely on artistry). If the original quote includes an attribution (e.g., author name), incorporate it faithfully in the Persian translation on the last line, formatted similarly (e.g., ‘- Author Name -’ in Persian). Your response must contain ONLY the translated Persian texts in Perso-Arabic script, one per quote, numbered (e.g., 1., 2.) for separation, with no other text, labels, explanations, or information whatsoever Quotes: [{text}]"
 
          response = model.generate_content(prompt)
-         translated = response.text.strip()
-         # Strip numbering if present (assuming single quote)
-         if translated.startswith('1. '):
-             translated = translated[3:].strip()
-         return translated
      except Exception as e:
          return f"Error during translation: {str(e)}"
 
- # --- THE NEW AND CORRECTED IMAGE OVERLAY FUNCTION ---
  def overlay_text_on_image(original_image: Image.Image, text_to_overlay: str, bbox: tuple) -> Image.Image:
      """
      Overlays Persian text onto an image, erasing the content within the given
@@ -133,7 +108,7 @@ def overlay_text_on_image(original_image: Image.Image, text_to_overlay: str, bbo
          sample_y = int((erase_box[1] + erase_box[3]) / 2)
          bg_color = image_copy.getpixel((sample_x, sample_y))
      except (ValueError, IndexError):
-         bg_color = (255, 255, 255) # Fallback to white
 
      draw_erase.rectangle(erase_box, fill=bg_color)
 
@@ -156,9 +131,7 @@ def overlay_text_on_image(original_image: Image.Image, text_to_overlay: str, bbo
      current_line = ""
      for word in words:
          test_line = (current_line + " " + word).strip()
-         # To measure width, we MUST reshape it first. This is the key.
          reshaped_test_line = arabic_reshaper.reshape(test_line)
-         # Use textbbox for more accurate size calculation
          line_width = draw.textbbox((0, 0), reshaped_test_line, font=font)[2]
 
          if line_width <= target_width:
@@ -187,7 +160,6 @@ def overlay_text_on_image(original_image: Image.Image, text_to_overlay: str, bbo
      final_font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
      line_spacing = font_size * 0.3
 
-     # Reshape the final lines for drawing
      final_reshaped_lines = [arabic_reshaper.reshape(l) for l in final_wrapped_lines]
      line_heights = [draw.textbbox((0,0), l, font=final_font)[3] - draw.textbbox((0,0), l, font=final_font)[1] for l in final_reshaped_lines]
      total_text_height = sum(line_heights) + (len(final_reshaped_lines) - 1) * line_spacing
@@ -199,9 +171,7 @@ def overlay_text_on_image(original_image: Image.Image, text_to_overlay: str, bbo
          x_center = erase_box[0] + (erase_box[2] - erase_box[0]) / 2
          line_y_center = current_y + line_heights[i] / 2
 
-         # Draw a subtle shadow for better readability
          draw.text((x_center + 2, line_y_center + 2), reshaped_line, font=final_font, fill=(0, 0, 0, 180), anchor="mm")
-         # Draw the main text
          draw.text((x_center, line_y_center), reshaped_line, font=final_font, fill=(255, 255, 255, 255), anchor="mm")
 
          current_y += line_heights[i] + line_spacing
@@ -210,120 +180,288 @@ def overlay_text_on_image(original_image: Image.Image, text_to_overlay: str, bbo
      out_image = Image.alpha_composite(erase_layer, txt_layer)
      return out_image.convert("RGB")
 
 
- # --- VIDEO PROCESSING FUNCTION ---
-
- def process_video(video_path, progress=gr.Progress()):
-     if video_path is None:
          return None
 
-     progress(0, desc="Starting video processing...")
-
-     # Create persistent output file outside tmpdir
-     output_video_fd = tempfile.NamedTemporaryFile(suffix='.mp4', delete=False)
-     output_video_path = output_video_fd.name
-     output_video_fd.close()
-
-     with tempfile.TemporaryDirectory() as tmpdir:
-         frames_dir = os.path.join(tmpdir, 'frames')
-         out_frames_dir = os.path.join(tmpdir, 'out_frames')
-         audio_path = os.path.join(tmpdir, 'audio.mp3')
-
-         os.makedirs(frames_dir)
-         os.makedirs(out_frames_dir)
-
-         progress(0.1, desc="Extracting audio...")
-         # Extract audio (if any)
-         subprocess.run(['ffmpeg', '-y', '-i', video_path, '-vn', '-acodec', 'libmp3lame', audio_path], capture_output=True)
-
-         progress(0.2, desc="Getting video info...")
-         # Get FPS
-         ffprobe_cmd = ['ffprobe', '-v', 'error', '-select_streams', 'v:0', '-show_entries', 'stream=avg_frame_rate', '-of', 'default=noprint_wrappers=1:nokey=1', video_path]
-         fps_str = subprocess.check_output(ffprobe_cmd).decode().strip()
-         fps = eval(fps_str)
-
-         progress(0.3, desc="Extracting frames...")
-         # Extract frames
-         frame_pattern = os.path.join(frames_dir, 'frame_%06d.png')
-         subprocess.run(['ffmpeg', '-y', '-i', video_path, frame_pattern], capture_output=True, check=True)
-
-         # List and sort frames
-         frames = sorted(glob.glob(os.path.join(frames_dir, '*.png')))
-
-         progress(0.4, desc="Processing frames...")
-         num_frames = len(frames)
-         if num_frames == 0:
-             return None
-
-         prev_translated_text = None
-         prev_bbox = None
-
-         for i, frame_path in enumerate(frames):
-             progress(0.4 + (i / num_frames) * 0.4, desc=f"Processing frame {i+1}/{num_frames}")
-
-             image = Image.open(frame_path)
-             extracted_text, bbox = extract_text_and_bbox(image)
-
-             out_frame_path = os.path.join(out_frames_dir, os.path.basename(frame_path))
-
-             if bbox is None:
-                 # No text, copy original frame
-                 shutil.copy(frame_path, out_frame_path)
-                 prev_translated_text = None
-                 continue
-
-             # Check if text changed
-             if extracted_text in translation_cache:
-                 translated_text = translation_cache[extracted_text]
              else:
-                 translated_text = translate_text_gemini(extracted_text)
-                 if "Error" in translated_text:
-                     # On error, copy original
-                     shutil.copy(frame_path, out_frame_path)
-                     continue
-                 translation_cache[extracted_text] = translated_text
-
-             # If same as previous, and bbox similar, copy previous out frame (but since frames may differ, better to overlay again, but to save time, overlay is fast)
-             # Overlay is PIL, fast; OCR is slow, but we already did OCR.
-             # To further optimize, perhaps skip OCR if previous had no text, but for simplicity, keep as is.
-
-             # Overlay
-             final_image = overlay_text_on_image(image, translated_text, bbox)
-             final_image.save(out_frame_path)
-
-         progress(0.8, desc="Reassembling video...")
-         # Reassemble video
-         out_frame_pattern = os.path.join(out_frames_dir, 'frame_%06d.png')
-         video_cmd = ['ffmpeg', '-y', '-framerate', str(fps), '-i', out_frame_pattern, '-c:v', 'libx264', '-pix_fmt', 'yuv420p', output_video_path]
-
-         has_audio = os.path.exists(audio_path) and os.path.getsize(audio_path) > 0
-         if has_audio:
-             video_cmd = ['ffmpeg', '-y', '-framerate', str(fps), '-i', out_frame_pattern, '-i', audio_path, '-c:v', 'libx264', '-c:a', 'aac', '-pix_fmt', 'yuv420p', output_video_path]
-
-         subprocess.run(video_cmd, capture_output=True, check=True)
-
-         progress(1, desc="Done!")
-         # Return the output video path
-         return output_video_path
 
  # --- GRADIO INTERFACE ---
 
- with gr.Blocks(title="Persian Quote Video Translator", theme=gr.themes.Soft()) as demo:
-     gr.Markdown("# 📝 Persian Quote Video Translator")
-     gr.Markdown("Upload a video with English text. The app will automatically detect, erase, translate, and overlay the Persian text back onto each frame of the video.")
-     gr.Markdown("**Note:** For best performance on free tier, use short videos (<30s). Longer videos may take time and could reconnect.")
 
      with gr.Row():
          with gr.Column(scale=1):
-             video_input = gr.Video(label="Upload Quote Video", sources=["upload"])
          with gr.Column(scale=1):
-             video_output = gr.Video(label="Translated Video Output")
-
-     video_input.change(
-         fn=process_video,
-         inputs=[video_input],
-         outputs=[video_output]
      )
 
  if __name__ == "__main__":
      demo.launch()
  import google.generativeai as genai
  import arabic_reshaper
  import os
+ import cv2
+ from moviepy.editor import *
+ from moviepy.video.fx.all import resize, fadein, fadeout
+ from moviepy.audio.fx.all import audio_fadein, audio_fadeout
  import tempfile
+ import math
+ import random
 
  # --- CONFIGURATION ---
  api_key = "AIzaSyAKI92YawOKQ1-HRLmvaryMEWk_y4alJgA"
  PERSIAN_FONT_PATH = "Vazir.ttf"
 
  # --- GLOBAL INITIALIZATION ---
  reader = None
 
  def initialize_reader():
      """Initializes the EasyOCR reader if it hasn't been already."""
      global reader
      if reader is None:
          print("Loading EasyOCR model... (This may take a moment on first run)")
          reader = easyocr.Reader(['en'], gpu=False, verbose=False)
          print("EasyOCR model loaded successfully!")
      return reader
 
+ # --- CORE FUNCTIONS FROM YOUR ORIGINAL CODE ---
 
  def extract_text_and_bbox(image: Image.Image):
      """
      Extracts text from a PIL Image and calculates a single consolidated
+     bounding box for all text found.
      """
      if image is None:
          return "Please upload an image first.", None
 
      try:
          ocr_reader = initialize_reader()
+         img_array = np.array(image)
          results = ocr_reader.readtext(img_array)
 
          if not results:
@@ -76,18 +61,13 @@ def extract_text_and_bbox(image: Image.Image):
              max_y = max(max_y, bl[1], br[1])
 
          extracted_text = ' '.join(text_parts)
+         consolidated_bbox = (int(min_x), int(min_y), int(max_x), int(max_y))
 
          return extracted_text, consolidated_bbox
 
      except Exception as e:
          return f"Error processing image with OCR: {str(e)}", None
 
  def translate_text_gemini(text: str) -> str:
      """Translates text to colloquial Persian using the Gemini API."""
      if not text or "No text" in text or "Error" in text or "Please upload" in text:
@@ -96,18 +76,13 @@ def translate_text_gemini(text: str) -> str:
      try:
          genai.configure(api_key=api_key)
          model = genai.GenerativeModel('gemini-1.5-flash')
+         prompt = f"Translate the following English quotes into Persian, rephrasing only minimally if absolutely necessary for natural poetic flow, but strictly preserving the original meaning, intent, purpose, and nuances without any alterations or additions that could change the core message. Ensure the Persian versions are concise (under 20 words), deep, touching, poetic, and profound, using idiomatic Persian that evokes wisdom or inspiration while staying faithful to the source. Additionally, guarantee suitable grammar and natural sentence structure in Persian for smooth readability by native speakers, and ensure the translation conveys clear, substantive meaning that stands independently beyond its poetic tone (i.e., the wisdom or insight should be immediately understandable without relying solely on artistry). If the original quote includes an attribution (e.g., author name), incorporate it faithfully in the Persian translation on the last line, formatted similarly (e.g., '- Author Name -' in Persian). Your response must contain ONLY the translated Persian texts in Perso-Arabic script, one per quote, numbered (e.g., 1., 2.) for separation, with no other text, labels, explanations, or information whatsoever Quotes: [{text}]"
 
          response = model.generate_content(prompt)
+         return response.text.strip()
      except Exception as e:
          return f"Error during translation: {str(e)}"
 
  def overlay_text_on_image(original_image: Image.Image, text_to_overlay: str, bbox: tuple) -> Image.Image:
      """
      Overlays Persian text onto an image, erasing the content within the given
@@ -133,7 +108,7 @@ def overlay_text_on_image(original_image: Image.Image, text_to_overlay: str, bbo
          sample_y = int((erase_box[1] + erase_box[3]) / 2)
          bg_color = image_copy.getpixel((sample_x, sample_y))
      except (ValueError, IndexError):
+         bg_color = (255, 255, 255)
 
      draw_erase.rectangle(erase_box, fill=bg_color)
 
@@ -156,9 +131,7 @@ def overlay_text_on_image(original_image: Image.Image, text_to_overlay: str, bbo
      current_line = ""
      for word in words:
          test_line = (current_line + " " + word).strip()
          reshaped_test_line = arabic_reshaper.reshape(test_line)
          line_width = draw.textbbox((0, 0), reshaped_test_line, font=font)[2]
 
          if line_width <= target_width:
@@ -187,7 +160,6 @@ def overlay_text_on_image(original_image: Image.Image, text_to_overlay: str, bbo
      final_font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
      line_spacing = font_size * 0.3
 
      final_reshaped_lines = [arabic_reshaper.reshape(l) for l in final_wrapped_lines]
      line_heights = [draw.textbbox((0,0), l, font=final_font)[3] - draw.textbbox((0,0), l, font=final_font)[1] for l in final_reshaped_lines]
      total_text_height = sum(line_heights) + (len(final_reshaped_lines) - 1) * line_spacing
@@ -199,9 +171,7 @@ def overlay_text_on_image(original_image: Image.Image, text_to_overlay: str, bbo
          x_center = erase_box[0] + (erase_box[2] - erase_box[0]) / 2
          line_y_center = current_y + line_heights[i] / 2
 
          draw.text((x_center + 2, line_y_center + 2), reshaped_line, font=final_font, fill=(0, 0, 0, 180), anchor="mm")
          draw.text((x_center, line_y_center), reshaped_line, font=final_font, fill=(255, 255, 255, 255), anchor="mm")
 
          current_y += line_heights[i] + line_spacing
@@ -210,120 +180,288 @@ def overlay_text_on_image(original_image: Image.Image, text_to_overlay: str, bbo
      out_image = Image.alpha_composite(erase_layer, txt_layer)
      return out_image.convert("RGB")
 
+ # --- NEW VIDEO PROCESSING FUNCTIONS ---
 
+ def extract_middle_frame(video_path):
+     """Extract the middle frame from video for OCR processing."""
+     try:
+         cap = cv2.VideoCapture(video_path)
+         total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+         middle_frame_idx = total_frames // 2
+
+         cap.set(cv2.CAP_PROP_POS_FRAMES, middle_frame_idx)
+         ret, frame = cap.read()
+         cap.release()
+
+         if ret:
+             frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+             return Image.fromarray(frame_rgb)
          return None
+     except Exception as e:
+         print(f"Error extracting middle frame: {e}")
+         return None
+
+ def create_sama_intro_effect(duration=3, size=(1920, 1080), fps=30):
+     """Create a sama-style intro effect similar to the ukulele video."""
 
+     def make_frame(t):
+         # Create base frame
+         img = np.zeros((size[1], size[0], 3), dtype=np.uint8)
+
+         # Create warm gradient background
+         for i in range(size[1]):
+             warm_intensity = int(25 + 15 * math.sin(i * 0.01))
+             img[i, :] = [warm_intensity//2, warm_intensity//3, warm_intensity]
+
+         center_x, center_y = size[0]//2, size[1]//2
+
+         # Musical rhythm visualization (simulating ukulele strums)
+         beat_time = t * 4  # 4 beats per second like ukulele strumming
+         beat_intensity = abs(math.sin(beat_time * math.pi)) ** 0.5
+
+         # Create pulsing circles (like sound waves)
+         for radius_base in [100, 150, 200, 250]:
+             radius = int(radius_base + beat_intensity * 30)
+             alpha = max(0, 0.3 - (t / duration) * 0.2)
+             circle_intensity = int(alpha * 255 * beat_intensity)
+
+             if circle_intensity > 10:
+                 cv2.circle(img, (center_x, center_y), radius,
+                            (circle_intensity//3, circle_intensity//4, circle_intensity//2), 2)
+
+         # Add rotating elements (like guitar picks or musical notes)
+         for i in range(6):
+             angle = (t * 60 + i * 60) % 360  # Rotating elements
+             distance = 180 + 20 * math.sin(beat_time)
+
+             x = int(center_x + distance * math.cos(math.radians(angle)))
+             y = int(center_y + distance * math.sin(math.radians(angle)))
+
+             # Draw musical note-like shapes
+             note_size = int(8 + beat_intensity * 4)
+             cv2.circle(img, (x, y), note_size, (150, 100, 50), -1)
+             cv2.circle(img, (x, y), note_size + 2, (200, 150, 100), 2)
+
+         # Add string-like lines (simulating ukulele strings)
+         for i in range(4):
+             y_pos = center_y - 60 + i * 40
+             line_alpha = beat_intensity * 0.5
+             line_intensity = int(line_alpha * 255)
+
+             if line_intensity > 20:
+                 # Create wavy lines like vibrating strings
+                 points = []
+                 for x in range(0, size[0], 10):
+                     wave_y = y_pos + int(10 * math.sin(x * 0.02 + t * 8) * beat_intensity)
+                     points.append((x, wave_y))
+
+                 for j in range(len(points)-1):
+                     cv2.line(img, points[j], points[j+1],
+                              (line_intensity//2, line_intensity//3, line_intensity//4), 2)
+
+         # Add fade in/out effects
+         fade_alpha = 1.0
+         if t < 0.5:
+             fade_alpha = t / 0.5
+         elif t > duration - 0.5:
+             fade_alpha = (duration - t) / 0.5
+
+         img = (img * fade_alpha).astype(np.uint8)
+
+         return img
+
+     return VideoClip(make_frame, duration=duration)
 
+ def apply_text_overlay_to_frame(frame, text_to_overlay, bbox):
+     """Apply text overlay to a single frame using your existing function."""
+     pil_frame = Image.fromarray(frame)
+     overlaid_frame = overlay_text_on_image(pil_frame, text_to_overlay, bbox)
+     return np.array(overlaid_frame)
+
+ def process_video_with_text_overlay(video_path, translated_text, bbox):
+     """Process video and apply text overlay to all frames."""
+     def apply_overlay(get_frame, t):
+         frame = get_frame(t)
+         return apply_text_overlay_to_frame(frame, translated_text, bbox)
+
+     video = VideoFileClip(video_path)
+     video_with_overlay = video.fl(apply_overlay)
+     return video_with_overlay
+
+ def create_final_video_with_intro(video_path, translated_text, bbox, output_path):
+     """Create the final video with sama intro effect and original music."""
+     try:
+         # Load original video
+         original_video = VideoFileClip(video_path)
+
+         # Create intro with same dimensions as original video
+         intro_duration = 3
+         intro = create_sama_intro_effect(
+             duration=intro_duration,
+             size=(int(original_video.w), int(original_video.h)),
+             fps=original_video.fps
+         )
+         intro = intro.set_fps(original_video.fps)
+
+         # Apply text overlay to main video
+         main_video_with_text = process_video_with_text_overlay(video_path, translated_text, bbox)
+
+         # Add smooth transitions
+         intro = fadeout(intro, 0.3)
+         main_video_with_text = fadein(main_video_with_text, 0.3)
+
+         # Concatenate intro and main video
+         final_video = concatenate_videoclips([intro, main_video_with_text])
+
+         # Handle audio - extend original audio to cover intro + main video
+         if original_video.audio:
+             # Create a loop of the original audio to cover intro duration
+             original_audio = original_video.audio
+
+             # If original audio is shorter than intro, loop it
+             if original_audio.duration < intro_duration:
+                 loops_needed = int(intro_duration / original_audio.duration) + 1
+                 extended_audio = concatenate_audioclips([original_audio] * loops_needed)
+                 intro_audio = extended_audio.subclip(0, intro_duration)
              else:
+                 intro_audio = original_audio.subclip(0, intro_duration)
+
+             # Combine intro audio + full original audio
+             full_audio = concatenate_audioclips([intro_audio, original_audio])
+
+             # Apply fade effects to audio
+             full_audio = full_audio.fx(audio_fadein, 0.3).fx(audio_fadeout, 0.3)
+
+             # Set audio to final video
+             final_video = final_video.set_audio(full_audio)
+
+         # Write the final video
+         final_video.write_videofile(
+             output_path,
+             codec='libx264',
+             audio_codec='aac',
+             temp_audiofile='temp-audio.m4a',
+             remove_temp=True,
+             fps=original_video.fps,
+             preset='medium'
+         )
+
+         # Clean up
+         original_video.close()
+         final_video.close()
+
+         return output_path
+
+     except Exception as e:
+         print(f"Error creating final video: {e}")
+         return None
+
+ def process_video_pipeline(video_file):
+     """Main processing pipeline for video."""
+     if video_file is None:
+         return "Please upload a video.", "Translation will appear here.", None, None
+
+     try:
+         # Create temporary files
+         temp_input = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4')
+         temp_output = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4')
+
+         # Save uploaded video
+         with open(temp_input.name, 'wb') as f:
+             f.write(video_file)
+
+         # Extract middle frame for OCR
+         print("Extracting middle frame for OCR...")
+         middle_frame = extract_middle_frame(temp_input.name)
+         if middle_frame is None:
+             return "Error extracting frame from video.", "No text to translate.", None, None
+
+         # Extract text and bbox using your existing function
+         print("Performing OCR on middle frame...")
+         extracted_text, bbox = extract_text_and_bbox(middle_frame)
+         if bbox is None:
+             return extracted_text, "No text to translate.", middle_frame, None
+
+         # Translate text using your existing function
+         print("Translating text to Persian...")
+         translated_text = translate_text_gemini(extracted_text)
+         if "Error" in translated_text:
+             return extracted_text, translated_text, middle_frame, None
+
+         # Create final video with intro and text overlay
+         print("Creating final video with intro effect...")
+         output_path = create_final_video_with_intro(temp_input.name, translated_text, bbox, temp_output.name)
+         if output_path is None:
+             return extracted_text, translated_text, middle_frame, None
+
+         print("Video processing completed successfully!")
+         return extracted_text, translated_text, middle_frame, output_path
+
+     except Exception as e:
+         return f"Error processing video: {str(e)}", "Translation failed.", None, None
 
  # --- GRADIO INTERFACE ---
 
+ with gr.Blocks(title="Persian Video Quote Translator", theme=gr.themes.Soft()) as demo:
+     gr.Markdown("# 🎬 Persian Video Quote Translator with Sama Intro")
+     gr.Markdown("Upload a video with English text. The app will create a stylized intro effect, detect text from the middle frame, translate it to Persian, and overlay it on the entire video while preserving the original music.")
 
      with gr.Row():
          with gr.Column(scale=1):
+             video_input = gr.File(
+                 label="📹 Upload Quote Video",
+                 file_types=[".mp4", ".avi", ".mov", ".mkv", ".webm"],
+                 type="binary"
+             )
+
+             process_btn = gr.Button("🎯 Process Video", variant="primary", size="lg")
+
+             with gr.Row():
+                 text_output = gr.Textbox(
+                     label="📝 Extracted English Text",
+                     placeholder="Detected English text will appear here...",
+                     lines=3,
+                     show_copy_button=True
+                 )
+
+                 translated_output = gr.Textbox(
+                     label="🔀 Persian Translation",
+                     placeholder="Persian translation will appear here...",
+                     lines=3,
+                     show_copy_button=True
+                 )
+
          with gr.Column(scale=1):
+             frame_output = gr.Image(
+                 label="🖼️ Middle Frame (OCR Source)",
+                 type="pil"
+             )
+
+             video_output = gr.Video(
+                 label="🎥 Final Video with Sama Intro",
+                 format="mp4"
+             )
+
+     process_btn.click(
+         fn=process_video_pipeline,
+         inputs=[video_input],
+         outputs=[text_output, translated_output, frame_output, video_output]
      )
 
+     gr.Markdown("### 📋 How it works:")
+     gr.Markdown("""
+     1. **Upload** a video file containing English text
+     2. **Click** 'Process Video' to start the magic ✨
+     3. The app will:
+        - 🎼 Create a sama-style intro with musical rhythm effects (like your reference video)
+        - 👁️ Extract the middle frame and detect English text using OCR
+        - 🔄 Translate the text to beautiful Persian poetry
+        - 🎨 Overlay the Persian text on all video frames with proper styling
+        - 🎵 Preserve and extend the original audio/music throughout
+        - 🎬 Combine everything into a polished final video
+
+     **Supported formats:** MP4, AVI, MOV, MKV, WebM
+     """)
+
  if __name__ == "__main__":
      demo.launch()
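
A note on the moviepy imports in this commit: in moviepy 1.x, `from moviepy.video.fx import fadein` binds the `fadein` submodule rather than the function, so calls like `fadein(intro, 0.3)` would fail; the callable helpers live in the `.all` modules, and `audio_fadein`/`audio_fadeout` need their own import (reflected in the import lines above). A minimal sketch of the two equivalent idioms, assuming moviepy 1.x and a hypothetical input.mp4:

```python
# Sketch (moviepy 1.x): fx helpers called directly vs. via clip.fx().
# "input.mp4" and "faded.mp4" are hypothetical paths, not part of this commit.
from moviepy.editor import VideoFileClip
from moviepy.video.fx.all import fadein, fadeout
from moviepy.audio.fx.all import audio_fadein

clip = VideoFileClip("input.mp4")
clip = fadein(clip, 0.3)        # direct function call
clip = clip.fx(fadeout, 0.3)    # same effect, method-chaining style
if clip.audio is not None:
    clip = clip.set_audio(clip.audio.fx(audio_fadein, 0.3))
clip.write_videofile("faded.mp4", codec="libx264", audio_codec="aac")
```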
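process_video_with_text_overlay() leans on moviepy's `Clip.fl`, which calls the filter with `(get_frame, t)` and expects an ndarray of the same shape back, so the PIL round-trip must return RGB frames. A standalone sketch of that pattern, with a stand-in edit (grayscale instead of the Persian overlay):

```python
# Sketch (moviepy 1.x): per-frame editing via Clip.fl, as
# process_video_with_text_overlay() does; the edit below is a stand-in.
import numpy as np
from PIL import Image
from moviepy.editor import VideoFileClip

def edit_frame(frame: np.ndarray) -> np.ndarray:
    # Stand-in for overlay_text_on_image(): any PIL-based edit that
    # returns an RGB array of the same size works here.
    return np.array(Image.fromarray(frame).convert("L").convert("RGB"))

clip = VideoFileClip("input.mp4")  # hypothetical path
edited = clip.fl(lambda get_frame, t: edit_frame(get_frame(t)))
edited.write_videofile("output.mp4", codec="libx264", audio_codec="aac")
```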
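One behavioral change worth flagging: the prompt still asks Gemini to number its output ("1., 2."), but the new translate_text_gemini() returns `response.text` verbatim, dropping the old `startswith('1. ')` cleanup. If the numbering shows up in overlays, a small strip step would restore the old behavior; `strip_numbering` below is a hypothetical helper, not part of this commit:

```python
# Sketch: strip a leading "1. " / "2. " marker from the model's reply.
# strip_numbering() is a hypothetical helper, not part of this commit.
import re

def strip_numbering(translated: str) -> str:
    return re.sub(r'^\s*\d+\.\s*', '', translated.strip())

assert strip_numbering("1. some translated quote") == "some translated quote"
```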
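Finally, on the removed subprocess pipeline: it parsed ffprobe's `avg_frame_rate` with `eval()`, which executes whatever the probe prints. If that code path ever returns, `Fraction` parses the same "30000/1001" form safely; a minimal sketch, assuming ffprobe is installed and on PATH:

```python
# Sketch: parse ffprobe's avg_frame_rate (e.g. "30000/1001") without eval().
# Assumes the ffmpeg/ffprobe binaries are installed and on PATH.
import subprocess
from fractions import Fraction

def probe_fps(video_path: str) -> float:
    out = subprocess.check_output([
        'ffprobe', '-v', 'error', '-select_streams', 'v:0',
        '-show_entries', 'stream=avg_frame_rate',
        '-of', 'default=noprint_wrappers=1:nokey=1',
        video_path,
    ]).decode().strip()
    return float(Fraction(out))  # Fraction("30000/1001") ~= 29.97
```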