Update app.py
app.py
CHANGED
@@ -10,12 +10,12 @@ import arabic_reshaper
 from bidi.algorithm import get_display
 import os
 import time
-import ffmpeg #
+import ffmpeg # Import the ffmpeg-python library
 
 # --- CONFIGURATION ---
 API_KEY ="AIzaSyCu-tb3BRDIJjUt6G5ccWmrR51tOY0VZd4"
-PERSIAN_FONT_PATH = "Vazir.ttf"
-FADE_IN_DURATION_SECONDS = 1.0
+PERSIAN_FONT_PATH = "Vazir.ttf" # Make sure this font file is in your repository
+FADE_IN_DURATION_SECONDS = 1.0
 
 # --- GLOBAL INITIALIZATION ---
 reader = None
@@ -27,8 +27,7 @@ def initialize_reader():
     print("EasyOCR model loaded successfully!")
     return reader
 
-# ---
-
+# --- CORE FUNCTIONS ---
 def extract_text_and_bbox(image: Image.Image):
     ocr_reader = initialize_reader()
     img_array = np.array(image)
@@ -51,21 +50,19 @@ def translate_text_gemini(text: str) -> str:
     try:
         genai.configure(api_key=API_KEY)
         model = genai.GenerativeModel('gemini-1.5-flash')
-        # Your prompt here
         prompt =f"Translate the following English quotes into Persian. The translation should be colloquial, poetic, concise, and meaningful. Preserve the original message and tone. Avoid literal translations. Provide only the translated Persian text. Quotes: [{text}]"
         response = model.generate_content(prompt)
         return response.text.strip()
     except Exception as e: return f"Error during translation with Gemini: {str(e)}"
 
-# --- TEXT OVERLAY FUNCTION (RTL
-
+# --- TEXT OVERLAY FUNCTION (WITH RTL CORRECTION) ---
 def render_translated_overlay(original_image: Image.Image, text_to_overlay: str, bbox: tuple) -> Image.Image:
     padding = 15
     overlay_box = (max(0, bbox[0] - padding), max(0, bbox[1] - padding),
                    min(original_image.width, bbox[2] + padding), min(original_image.height, bbox[3] + padding))
     overlay_width = overlay_box[2] - overlay_box[0]
     overlay_height = overlay_box[3] - overlay_box[1]
-
+
     try:
         sample_x = max(0, int(overlay_box[0]) - 5); sample_y = int((overlay_box[1] + overlay_box[3]) / 2)
         bg_color = original_image.getpixel((sample_x, sample_y))
@@ -76,7 +73,7 @@ def render_translated_overlay(original_image: Image.Image, text_to_overlay: str,
     target_width = overlay_width * 0.90
     font_size = 100
     final_wrapped_lines = []
-
+
     while font_size > 10:
         font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
         words = text_to_overlay.split();
@@ -91,7 +88,7 @@ def render_translated_overlay(original_image: Image.Image, text_to_overlay: str,
             total_height = sum(draw.textbbox((0,0), get_display(arabic_reshaper.reshape(l)), font=font)[3] for l in raw_lines)
             if total_height <= overlay_height * 0.9: final_wrapped_lines = raw_lines; break
             else: font_size -= 2
-
+
     if not final_wrapped_lines: final_wrapped_lines = raw_lines
 
     final_font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
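Note: the loop above shrinks the font in 2 pt steps until the wrapped block fits 90% of the overlay height. A simplified, single-line sketch of the same shrink-to-fit search (no word wrapping; assumes Vazir.ttf is available locally):

# Simplified sketch of the shrink-to-fit search above (single line, no wrapping).
# Assumes Vazir.ttf is available locally.
from PIL import Image, ImageDraw, ImageFont

def fit_font_size(draw, text, box_w, box_h, font_path="Vazir.ttf"):
    size = 100
    while size > 10:
        font = ImageFont.truetype(font_path, size)
        bbox = draw.textbbox((0, 0), text, font=font)
        if bbox[2] - bbox[0] <= box_w * 0.9 and bbox[3] - bbox[1] <= box_h * 0.9:
            return size  # first size whose rendered bbox fits 90% of the box
        size -= 2        # same step size the app uses
    return size

draw = ImageDraw.Draw(Image.new("RGB", (1, 1)))
print(fit_font_size(draw, "sample text", 400, 120))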
@@ -99,62 +96,55 @@ def render_translated_overlay(original_image: Image.Image, text_to_overlay: str,
     line_heights = [draw.textbbox((0,0), get_display(arabic_reshaper.reshape(l)), font=final_font)[3] - draw.textbbox((0,0), get_display(arabic_reshaper.reshape(l)), font=final_font)[1] for l in final_wrapped_lines]
     total_text_height = sum(line_heights) + (len(final_wrapped_lines) - 1) * line_spacing
     y_start = (overlay_height - total_text_height) / 2
-
+
     current_y = y_start
     for i, line in enumerate(final_wrapped_lines):
-        # ### --- CHANGE --- ###: Reverted to your original, proven RTL centering logic
-        # This is the fix for the RTL text display issue.
         reshaped_line = arabic_reshaper.reshape(line)
         bidi_line = get_display(reshaped_line)
-
-        # Manually calculate line width and center position
+
         line_bbox = draw.textbbox((0, 0), bidi_line, font=final_font)
         line_width = line_bbox[2] - line_bbox[0]
         x_position = (overlay_width - line_width) / 2
-
-        # Draw shadow then text for readability
+
         draw.text((x_position + 1, current_y + 1), bidi_line, font=final_font, fill=(0, 0, 0, 180))
         draw.text((x_position, current_y), bidi_line, font=final_font, fill=(255, 255, 255, 255))
-
+
        current_y += line_heights[i] + line_spacing
 
     return overlay_layer, overlay_box
 
-# --- MAIN VIDEO PROCESSING PIPELINE (
-
+# --- MAIN VIDEO PROCESSING PIPELINE (WITH FFMPEG CORRECTION) ---
 def process_video(video_path, progress=gr.Progress()):
     if video_path is None: raise gr.Error("Please upload a video file first.")
 
     progress(0, desc="Loading Video & Analyzing...")
     cap = cv2.VideoCapture(video_path)
     if not cap.isOpened(): raise gr.Error("Could not open video file.")
-
+
     frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)); frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
     fps = cap.get(cv2.CAP_PROP_FPS); total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-
+
     cap.set(cv2.CAP_PROP_POS_FRAMES, total_frames // 2)
     ret, middle_frame_bgr = cap.read()
     if not ret: raise gr.Error("Could not read middle frame.")
     middle_frame_rgb_pil = Image.fromarray(cv2.cvtColor(middle_frame_bgr, cv2.COLOR_BGR2RGB))
-
+
     progress(0.2, desc="Detecting Text (EasyOCR)...")
     extracted_text, bbox = extract_text_and_bbox(middle_frame_rgb_pil)
     if bbox is None: raise gr.Error(extracted_text)
-
+
     progress(0.4, desc="Translating Text (Gemini API)...")
     translated_text = translate_text_gemini(extracted_text)
     if "Error" in translated_text: raise gr.Error(translated_text)
-
+
     progress(0.5, desc="Rendering Translated Text Overlay...")
     overlay_stamp_pil, overlay_position_box = render_translated_overlay(middle_frame_rgb_pil, translated_text, bbox)
     overlay_stamp_cv = cv2.cvtColor(np.array(overlay_stamp_pil), cv2.COLOR_RGBA2BGRA)
 
-    # ### --- CHANGE --- ###: Define filenames for temporary and final outputs
     timestamp = int(time.time())
     temp_silent_path = f"temp_silent_{timestamp}.mp4"
     final_output_path = f"translated_video_{timestamp}.mp4"
 
-    # Part 1: Create a silent video with the overlay using OpenCV
     progress(0.6, desc="Composing Silent Video with Overlay...")
     fourcc = cv2.VideoWriter_fourcc(*'mp4v')
     out = cv2.VideoWriter(temp_silent_path, fourcc, fps, (frame_width, frame_height))
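Note: the RTL centering logic restored above reshapes each line with arabic_reshaper and reorders it with python-bidi before measuring and drawing. A minimal standalone sketch of that same pipeline, useful for checking the rendering outside the app (assumes Vazir.ttf is available locally):

# Minimal sketch of the RTL rendering path used above.
# Assumes Vazir.ttf is available locally; pip install arabic-reshaper python-bidi pillow
import arabic_reshaper
from bidi.algorithm import get_display
from PIL import Image, ImageDraw, ImageFont

text = "سلام دنیا"                        # logical-order Persian text
reshaped = arabic_reshaper.reshape(text)  # join letters into contextual glyph forms
bidi_text = get_display(reshaped)         # reorder into visual (display) order

img = Image.new("RGBA", (400, 100), (30, 30, 30, 255))
draw = ImageDraw.Draw(img)
font = ImageFont.truetype("Vazir.ttf", 40)

# Center horizontally from the measured bbox, as render_translated_overlay does
bbox = draw.textbbox((0, 0), bidi_text, font=font)
x = (img.width - (bbox[2] - bbox[0])) / 2
draw.text((x, 25), bidi_text, font=font, fill=(255, 255, 255, 255))
img.save("rtl_check.png")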
@@ -162,58 +152,59 @@ def process_video(video_path, progress=gr.Progress()):
     cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
     frame_idx = 0
     x_min, y_min, x_max, y_max = overlay_position_box
-
+
     while True:
         ret, frame = cap.read()
         if not ret: break
-
+
         roi = frame[y_min:y_max, x_min:x_max]
         stamp_h, stamp_w, _ = overlay_stamp_cv.shape
         roi_h, roi_w, _ = roi.shape
         overlay_resized = cv2.resize(overlay_stamp_cv, (roi_w, roi_h)) if (stamp_h != roi_h or stamp_w != roi_w) else overlay_stamp_cv
-
+
         alpha = overlay_resized[:, :, 3] / 255.0
         alpha_mask = cv2.merge([alpha, alpha, alpha])
         blended_roi = (roi.astype(float) * (1.0 - alpha_mask) + overlay_resized[:, :, :3].astype(float) * alpha_mask)
         frame[y_min:y_max, x_min:x_max] = blended_roi.astype(np.uint8)
-
+
         out.write(frame)
         frame_idx += 1
         progress(0.6 + (0.3 * frame_idx / total_frames), desc=f"Processing frame {frame_idx}/{total_frames}")
 
     cap.release(); out.release()
 
-    # ### --- CHANGE --- ###: Part 2: Use ffmpeg to combine video with original audio and add fade
     progress(0.95, desc="Merging Audio and Applying Fade (ffmpeg)...")
     try:
         input_video = ffmpeg.input(temp_silent_path)
         input_audio = ffmpeg.input(video_path)
-
+
+        # ### --- KEY CHANGE --- ###: Corrected ffmpeg command
         (
             ffmpeg
             .output(
-                input_video.video.filter('fade', type='in', start_time=0, duration=FADE_IN_DURATION_SECONDS),
+                input_video.video.filter('fade', type='in', start_time=0, duration=FADE_IN_DURATION_SECONDS),
                 input_audio.audio, # Take audio stream from original
                 final_output_path,
-
-
+                vcodec='libx264', # Explicitly set video codec for re-encoding
+                acodec='copy', # Keep audio as is without re-encoding
+                shortest=None # Ensure full video duration is used
            )
            .run(overwrite_output=True, quiet=True)
        )
    except ffmpeg.Error as e:
-        # Provide
+        # Provide detailed ffmpeg error logs for easier debugging
        print('ffmpeg stdout:', e.stdout.decode('utf8'))
        print('ffmpeg stderr:', e.stderr.decode('utf8'))
        raise gr.Error(f"ffmpeg error: {e.stderr.decode('utf8')}")
    finally:
-        # Clean up the temporary silent
+        # Clean up the temporary silent file regardless of success or failure
        if os.path.exists(temp_silent_path):
            os.remove(temp_silent_path)
 
    progress(1, desc="Done!")
    return final_output_path
 
-# --- GRADIO INTERFACE
+# --- GRADIO INTERFACE ---
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("# 🎬 Persian Video Quote Translator")
     gr.Markdown("Upload a short video with English text. The app will preserve the original audio and duration.")
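Note: one way to sanity-check the corrected command is to print the argv that ffmpeg-python assembles instead of running it. A hedged sketch mirroring the output() call above; the file names are hypothetical placeholders, and it relies on ffmpeg-python's compile() method and its convention that None-valued kwargs become bare flags:

# Hedged sketch: print the ffmpeg argv built by the merge step, without running it.
# File names are hypothetical placeholders.
import ffmpeg

input_video = ffmpeg.input("temp_silent_1700000000.mp4")
input_audio = ffmpeg.input("original_upload.mp4")

stream = ffmpeg.output(
    input_video.video.filter("fade", type="in", start_time=0, duration=1.0),
    input_audio.audio,
    "translated_video_1700000000.mp4",
    vcodec="libx264",  # re-encode the composited video
    acodec="copy",     # pass the original audio through untouched
    shortest=None,     # None-valued kwargs emit bare flags, here -shortest
)
print(stream.compile())  # list of argv tokens, e.g. ['ffmpeg', '-i', ...]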
@@ -223,7 +214,8 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     translate_button = gr.Button("Translate Video", variant="primary")
     translate_button.click(fn=process_video, inputs=[video_input], outputs=[video_output])
     gr.Markdown("---")
-    gr.Markdown("### How it works:\n1. It analyzes the middle frame to find and translate text.\n2. It generates a temporary silent video with the correctly rendered Persian text overlaid.\n3.
+    gr.Markdown("### How it works:\n1. It analyzes the middle frame to find and translate text.\n2. It generates a temporary silent video with the correctly rendered Persian text overlaid.\n3. It uses `ffmpeg` to merge the new video with the **original audio**, apply a 1-second fade-in, and ensure the final duration matches the input.")
 
 if __name__ == "__main__":
     demo.launch(debug=True)
+
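Note: the frame loop in process_video composites each ROI pixel as roi * (1 - alpha) + overlay * alpha. A tiny NumPy check of that formula, with values chosen for easy arithmetic:

# Worked check of the alpha blend used in the frame loop.
import numpy as np

roi = np.full((2, 2, 3), 200, dtype=np.uint8)   # background pixels
overlay = np.zeros((2, 2, 4), dtype=np.uint8)   # BGRA overlay stamp
overlay[..., :3] = 50                           # dark overlay color
overlay[..., 3] = 128                           # ~50% opacity

alpha = overlay[..., 3:4] / 255.0               # broadcastable alpha mask
blended = roi * (1.0 - alpha) + overlay[..., :3] * alpha
print(blended.astype(np.uint8))                 # ~124: roughly halfway between 200 and 50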