kavehtaheri commited on
Commit
44ddc4b
·
verified ·
1 Parent(s): b74cb4b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +213 -366
app.py CHANGED
@@ -1,22 +1,29 @@
1
  # app.py
2
 
3
  import gradio as gr
4
- import easyocr
5
- from PIL import Image, ImageDraw, ImageFont
6
  import numpy as np
 
 
7
  import google.generativeai as genai
8
  import arabic_reshaper
 
9
  import os
10
- import cv2
11
- from moviepy.editor import *
12
- from moviepy.video.fx import resize, fadein, fadeout
13
- import tempfile
14
- import math
15
- import random
16
 
17
  # --- CONFIGURATION ---
18
- api_key = "AIzaSyAKI92YawOKQ1-HRLmvaryMEWk_y4alJgA"
 
 
 
 
 
19
  PERSIAN_FONT_PATH = "Vazir.ttf"
 
 
 
 
 
20
 
21
  # --- GLOBAL INITIALIZATION ---
22
  reader = None
@@ -25,100 +32,100 @@ def initialize_reader():
25
  """Initializes the EasyOCR reader if it hasn't been already."""
26
  global reader
27
  if reader is None:
28
- print("Loading EasyOCR model... (This may take a moment on first run)")
 
29
  reader = easyocr.Reader(['en'], gpu=False, verbose=False)
30
  print("EasyOCR model loaded successfully!")
31
  return reader
32
 
33
- # --- CORE FUNCTIONS FROM YOUR ORIGINAL CODE ---
34
 
35
  def extract_text_and_bbox(image: Image.Image):
36
  """
37
  Extracts text from a PIL Image and calculates a single consolidated
38
  bounding box for all text found.
 
39
  """
40
- if image is None:
41
- return "Please upload an image first.", None
42
-
43
- try:
44
- ocr_reader = initialize_reader()
45
- img_array = np.array(image)
46
- results = ocr_reader.readtext(img_array)
47
 
48
- if not results:
49
- return "No text detected in the image.", None
50
 
51
- min_x, min_y = float('inf'), float('inf')
52
- max_x, max_y = float('-inf'), float('-inf')
53
-
54
- text_parts = []
55
- for (bbox, text, prob) in results:
56
- text_parts.append(text)
57
- (tl, tr, br, bl) = bbox
58
- min_x = min(min_x, tl[0], bl[0])
59
- min_y = min(min_y, tl[1], tr[1])
60
- max_x = max(max_x, tr[0], br[0])
61
- max_y = max(max_y, bl[1], br[1])
62
-
63
- extracted_text = ' '.join(text_parts)
64
- consolidated_bbox = (int(min_x), int(min_y), int(max_x), int(max_y))
65
-
66
- return extracted_text, consolidated_bbox
67
-
68
- except Exception as e:
69
- return f"Error processing image with OCR: {str(e)}", None
70
 
71
  def translate_text_gemini(text: str) -> str:
72
- """Translates text to colloquial Persian using the Gemini API."""
73
- if not text or "No text" in text or "Error" in text or "Please upload" in text:
 
 
 
 
 
74
  return "No valid text to translate."
75
 
76
  try:
77
- genai.configure(api_key=api_key)
78
  model = genai.GenerativeModel('gemini-1.5-flash')
79
- prompt = f"Translate the following English quotes into Persian, rephrasing only minimally if absolutely necessary for natural poetic flow, but strictly preserving the original meaning, intent, purpose, and nuances without any alterations or additions that could change the core message. Ensure the Persian versions are concise (under 20 words), deep, touching, poetic, and profound, using idiomatic Persian that evokes wisdom or inspiration while staying faithful to the source. Additionally, guarantee suitable grammar and natural sentence structure in Persian for smooth readability by native speakers, and ensure the translation conveys clear, substantive meaning that stands independently beyond its poetic tone (i.e., the wisdom or insight should be immediately understandable without relying solely on artistry). If the original quote includes an attribution (e.g., author name), incorporate it faithfully in the Persian translation on the last line, formatted similarly (e.g., '- Author Name -' in Persian). Your response must contain ONLY the translated Persian texts in Perso-Arabic script, one per quote, numbered (e.g., 1., 2.) for separation, with no other text, labels, explanations, or information whatsoever Quotes: [{text}]"
 
80
 
81
  response = model.generate_content(prompt)
82
  return response.text.strip()
83
  except Exception as e:
84
- return f"Error during translation: {str(e)}"
85
 
86
- def overlay_text_on_image(original_image: Image.Image, text_to_overlay: str, bbox: tuple) -> Image.Image:
 
87
  """
88
- Overlays Persian text onto an image, erasing the content within the given
89
- bounding box and correctly rendering wrapped RTL text.
 
 
 
90
  """
91
- image_copy = original_image.copy().convert("RGBA")
92
- txt_layer = Image.new("RGBA", image_copy.size, (255, 255, 255, 0))
93
- draw = ImageDraw.Draw(txt_layer)
94
-
95
- # 1. Erase the old text area (Inpainting) by drawing a colored box over it
96
- erase_layer = image_copy.copy()
97
- draw_erase = ImageDraw.Draw(erase_layer)
98
  padding = 15
99
- erase_box = (
100
  max(0, bbox[0] - padding),
101
  max(0, bbox[1] - padding),
102
- min(image_copy.width, bbox[2] + padding),
103
- min(image_copy.height, bbox[3] + padding)
104
  )
105
-
 
 
 
106
  try:
107
- sample_x = max(0, int(erase_box[0]) - 5)
108
- sample_y = int((erase_box[1] + erase_box[3]) / 2)
109
- bg_color = image_copy.getpixel((sample_x, sample_y))
110
  except (ValueError, IndexError):
111
- bg_color = (255, 255, 255)
112
-
113
- draw_erase.rectangle(erase_box, fill=bg_color)
114
 
115
- # 2. Check for Font File
116
- if not os.path.exists(PERSIAN_FONT_PATH):
117
- raise FileNotFoundError(f"Font file not found at '{PERSIAN_FONT_PATH}'. Please place it in the same directory.")
 
118
 
119
- # 3. Dynamically find best font size and wrap text
120
- target_width = (erase_box[2] - erase_box[0]) * 0.90
121
- target_height = (erase_box[3] - erase_box[1])
122
  font_size = 100
123
  final_wrapped_lines = []
124
 
@@ -127,341 +134,181 @@ def overlay_text_on_image(original_image: Image.Image, text_to_overlay: str, bbo
127
  words = text_to_overlay.split()
128
  if not words: break
129
 
130
- raw_lines = []
131
- current_line = ""
132
  for word in words:
133
  test_line = (current_line + " " + word).strip()
134
  reshaped_test_line = arabic_reshaper.reshape(test_line)
135
- line_width = draw.textbbox((0, 0), reshaped_test_line, font=font)[2]
136
-
137
- if line_width <= target_width:
138
- current_line = test_line
139
- else:
140
- raw_lines.append(current_line)
141
- current_line = word
142
  raw_lines.append(current_line)
143
 
144
- line_spacing = font_size * 0.3
145
- reshaped_for_height_calc = [arabic_reshaper.reshape(l) for l in raw_lines]
146
- line_heights = [draw.textbbox((0,0), l, font=font)[3] - draw.textbbox((0,0), l, font=font)[1] for l in reshaped_for_height_calc]
147
- total_height = sum(line_heights) + (len(raw_lines) - 1) * line_spacing
148
-
149
- if total_height <= target_height:
 
150
  final_wrapped_lines = raw_lines
151
  break
152
  else:
153
  font_size -= 2
154
 
155
  if not final_wrapped_lines:
156
- print("Warning: Text could not fit in the bounding box.")
157
- return erase_layer.convert("RGB")
158
 
159
- # 4. Draw the final, wrapped text on the transparent layer
160
  final_font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
161
  line_spacing = font_size * 0.3
162
 
163
- final_reshaped_lines = [arabic_reshaper.reshape(l) for l in final_wrapped_lines]
164
- line_heights = [draw.textbbox((0,0), l, font=final_font)[3] - draw.textbbox((0,0), l, font=final_font)[1] for l in final_reshaped_lines]
165
- total_text_height = sum(line_heights) + (len(final_reshaped_lines) - 1) * line_spacing
 
166
 
167
- y_start = erase_box[1] + (target_height - total_text_height) / 2
168
 
169
  current_y = y_start
170
- for i, reshaped_line in enumerate(final_reshaped_lines):
171
- x_center = erase_box[0] + (erase_box[2] - erase_box[0]) / 2
172
- line_y_center = current_y + line_heights[i] / 2
173
 
174
- draw.text((x_center + 2, line_y_center + 2), reshaped_line, font=final_font, fill=(0, 0, 0, 180), anchor="mm")
175
- draw.text((x_center, line_y_center), reshaped_line, font=final_font, fill=(255, 255, 255, 255), anchor="mm")
 
176
 
177
  current_y += line_heights[i] + line_spacing
178
 
179
- # 5. Composite the text layer onto the erased image
180
- out_image = Image.alpha_composite(erase_layer, txt_layer)
181
- return out_image.convert("RGB")
182
 
183
- # --- NEW VIDEO PROCESSING FUNCTIONS ---
184
 
185
- def extract_middle_frame(video_path):
186
- """Extract the middle frame from video for OCR processing."""
187
- try:
188
- cap = cv2.VideoCapture(video_path)
189
- total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
190
- middle_frame_idx = total_frames // 2
191
-
192
- cap.set(cv2.CAP_PROP_POS_FRAMES, middle_frame_idx)
193
- ret, frame = cap.read()
194
- cap.release()
195
-
196
- if ret:
197
- frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
198
- return Image.fromarray(frame_rgb)
199
- return None
200
- except Exception as e:
201
- print(f"Error extracting middle frame: {e}")
202
- return None
203
 
204
- def create_sama_intro_effect(duration=3, size=(1920, 1080), fps=30):
205
- """Create a sama-style intro effect similar to the ukulele video."""
 
206
 
207
- def make_frame(t):
208
- # Create base frame
209
- img = np.zeros((size[1], size[0], 3), dtype=np.uint8)
210
-
211
- # Create warm gradient background
212
- for i in range(size[1]):
213
- warm_intensity = int(25 + 15 * math.sin(i * 0.01))
214
- img[i, :] = [warm_intensity//2, warm_intensity//3, warm_intensity]
215
-
216
- center_x, center_y = size[0]//2, size[1]//2
217
-
218
- # Musical rhythm visualization (simulating ukulele strums)
219
- beat_time = t * 4 # 4 beats per second like ukulele strumming
220
- beat_intensity = abs(math.sin(beat_time * math.pi)) ** 0.5
221
-
222
- # Create pulsing circles (like sound waves)
223
- for radius_base in [100, 150, 200, 250]:
224
- radius = int(radius_base + beat_intensity * 30)
225
- alpha = max(0, 0.3 - (t / duration) * 0.2)
226
- circle_intensity = int(alpha * 255 * beat_intensity)
227
-
228
- if circle_intensity > 10:
229
- cv2.circle(img, (center_x, center_y), radius,
230
- (circle_intensity//3, circle_intensity//4, circle_intensity//2), 2)
231
-
232
- # Add rotating elements (like guitar picks or musical notes)
233
- for i in range(6):
234
- angle = (t * 60 + i * 60) % 360 # Rotating elements
235
- distance = 180 + 20 * math.sin(beat_time)
236
-
237
- x = int(center_x + distance * math.cos(math.radians(angle)))
238
- y = int(center_y + distance * math.sin(math.radians(angle)))
239
-
240
- # Draw musical note-like shapes
241
- note_size = int(8 + beat_intensity * 4)
242
- cv2.circle(img, (x, y), note_size, (150, 100, 50), -1)
243
- cv2.circle(img, (x, y), note_size + 2, (200, 150, 100), 2)
244
-
245
- # Add string-like lines (simulating ukulele strings)
246
- for i in range(4):
247
- y_pos = center_y - 60 + i * 40
248
- line_alpha = beat_intensity * 0.5
249
- line_intensity = int(line_alpha * 255)
250
-
251
- if line_intensity > 20:
252
- # Create wavy lines like vibrating strings
253
- points = []
254
- for x in range(0, size[0], 10):
255
- wave_y = y_pos + int(10 * math.sin(x * 0.02 + t * 8) * beat_intensity)
256
- points.append((x, wave_y))
257
-
258
- for j in range(len(points)-1):
259
- cv2.line(img, points[j], points[j+1],
260
- (line_intensity//2, line_intensity//3, line_intensity//4), 2)
261
-
262
- # Add fade in/out effects
263
- fade_alpha = 1.0
264
- if t < 0.5:
265
- fade_alpha = t / 0.5
266
- elif t > duration - 0.5:
267
- fade_alpha = (duration - t) / 0.5
268
-
269
- img = (img * fade_alpha).astype(np.uint8)
270
-
271
- return img
272
 
273
- return VideoClip(make_frame, duration=duration)
274
-
275
- def apply_text_overlay_to_frame(frame, text_to_overlay, bbox):
276
- """Apply text overlay to a single frame using your existing function."""
277
- pil_frame = Image.fromarray(frame)
278
- overlaid_frame = overlay_text_on_image(pil_frame, text_to_overlay, bbox)
279
- return np.array(overlaid_frame)
280
-
281
- def process_video_with_text_overlay(video_path, translated_text, bbox):
282
- """Process video and apply text overlay to all frames."""
283
- def apply_overlay(get_frame, t):
284
- frame = get_frame(t)
285
- return apply_text_overlay_to_frame(frame, translated_text, bbox)
286
 
287
- video = VideoFileClip(video_path)
288
- video_with_overlay = video.fl(apply_overlay)
289
- return video_with_overlay
290
-
291
- def create_final_video_with_intro(video_path, translated_text, bbox, output_path):
292
- """Create the final video with sama intro effect and original music."""
293
- try:
294
- # Load original video
295
- original_video = VideoFileClip(video_path)
296
-
297
- # Create intro with same dimensions as original video
298
- intro_duration = 3
299
- intro = create_sama_intro_effect(
300
- duration=intro_duration,
301
- size=(int(original_video.w), int(original_video.h)),
302
- fps=original_video.fps
303
- )
304
- intro = intro.set_fps(original_video.fps)
305
-
306
- # Apply text overlay to main video
307
- main_video_with_text = process_video_with_text_overlay(video_path, translated_text, bbox)
308
-
309
- # Add smooth transitions
310
- intro = fadeout(intro, 0.3)
311
- main_video_with_text = fadein(main_video_with_text, 0.3)
312
-
313
- # Concatenate intro and main video
314
- final_video = concatenate_videoclips([intro, main_video_with_text])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
315
 
316
- # Handle audio - extend original audio to cover intro + main video
317
- if original_video.audio:
318
- # Create a loop of the original audio to cover intro duration
319
- original_audio = original_video.audio
320
-
321
- # If original audio is shorter than intro, loop it
322
- if original_audio.duration < intro_duration:
323
- loops_needed = int(intro_duration / original_audio.duration) + 1
324
- extended_audio = concatenate_audioclips([original_audio] * loops_needed)
325
- intro_audio = extended_audio.subclip(0, intro_duration)
326
- else:
327
- intro_audio = original_audio.subclip(0, intro_duration)
328
-
329
- # Combine intro audio + full original audio
330
- full_audio = concatenate_audioclips([intro_audio, original_audio])
331
 
332
- # Apply fade effects to audio
333
- full_audio = full_audio.fx(audio_fadein, 0.3).fx(audio_fadeout, 0.3)
334
-
335
- # Set audio to final video
336
- final_video = final_video.set_audio(full_audio)
337
-
338
- # Write the final video
339
- final_video.write_videofile(
340
- output_path,
341
- codec='libx264',
342
- audio_codec='aac',
343
- temp_audiofile='temp-audio.m4a',
344
- remove_temp=True,
345
- fps=original_video.fps,
346
- preset='medium'
347
- )
348
-
349
- # Clean up
350
- original_video.close()
351
- final_video.close()
352
-
353
- return output_path
354
-
355
- except Exception as e:
356
- print(f"Error creating final video: {e}")
357
- return None
358
 
359
- def process_video_pipeline(video_file):
360
- """Main processing pipeline for video."""
361
- if video_file is None:
362
- return "Please upload a video.", "Translation will appear here.", None, None
363
-
364
- try:
365
- # Create temporary files
366
- temp_input = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4')
367
- temp_output = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4')
368
-
369
- # Save uploaded video
370
- with open(temp_input.name, 'wb') as f:
371
- f.write(video_file)
372
-
373
- # Extract middle frame for OCR
374
- print("Extracting middle frame for OCR...")
375
- middle_frame = extract_middle_frame(temp_input.name)
376
- if middle_frame is None:
377
- return "Error extracting frame from video.", "No text to translate.", None, None
378
 
379
- # Extract text and bbox using your existing function
380
- print("Performing OCR on middle frame...")
381
- extracted_text, bbox = extract_text_and_bbox(middle_frame)
382
- if bbox is None:
383
- return extracted_text, "No text to translate.", middle_frame, None
384
 
385
- # Translate text using your existing function
386
- print("Translating text to Persian...")
387
- translated_text = translate_text_gemini(extracted_text)
388
- if "Error" in translated_text:
389
- return extracted_text, translated_text, middle_frame, None
390
-
391
- # Create final video with intro and text overlay
392
- print("Creating final video with intro effect...")
393
- output_path = create_final_video_with_intro(temp_input.name, translated_text, bbox, temp_output.name)
394
- if output_path is None:
395
- return extracted_text, translated_text, middle_frame, None
396
-
397
- print("Video processing completed successfully!")
398
- return extracted_text, translated_text, middle_frame, output_path
399
-
400
- except Exception as e:
401
- return f"Error processing video: {str(e)}", "Translation failed.", None, None
402
 
403
- # --- GRADIO INTERFACE ---
 
 
 
404
 
405
- with gr.Blocks(title="Persian Video Quote Translator", theme=gr.themes.Soft()) as demo:
406
- gr.Markdown("# 🎬 Persian Video Quote Translator with Sama Intro")
407
- gr.Markdown("Upload a video with English text. The app will create a stylized intro effect, detect text from the middle frame, translate it to Persian, and overlay it on the entire video while preserving the original music.")
408
 
 
 
 
 
409
  with gr.Row():
410
- with gr.Column(scale=1):
411
- video_input = gr.File(
412
- label="📹 Upload Quote Video",
413
- file_types=[".mp4", ".avi", ".mov", ".mkv", ".webm"],
414
- type="binary"
415
- )
416
-
417
- process_btn = gr.Button("🎯 Process Video", variant="primary", size="lg")
418
-
419
- with gr.Row():
420
- text_output = gr.Textbox(
421
- label="📝 Extracted English Text",
422
- placeholder="Detected English text will appear here...",
423
- lines=3,
424
- show_copy_button=True
425
- )
426
-
427
- translated_output = gr.Textbox(
428
- label="🔤 Persian Translation",
429
- placeholder="Persian translation will appear here...",
430
- lines=3,
431
- show_copy_button=True
432
- )
433
-
434
- with gr.Column(scale=1):
435
- frame_output = gr.Image(
436
- label="🖼️ Middle Frame (OCR Source)",
437
- type="pil"
438
- )
439
-
440
- video_output = gr.Video(
441
- label="🎥 Final Video with Sama Intro",
442
- format="mp4"
443
- )
444
-
445
- process_btn.click(
446
- fn=process_video_pipeline,
447
  inputs=[video_input],
448
- outputs=[text_output, translated_output, frame_output, video_output]
449
  )
450
-
451
- gr.Markdown("### 📋 How it works:")
452
- gr.Markdown("""
453
- 1. **Upload** a video file containing English text
454
- 2. **Click** 'Process Video' to start the magic ✨
455
- 3. The app will:
456
- - 🎼 Create a sama-style intro with musical rhythm effects (like your reference video)
457
- - 👁️ Extract the middle frame and detect English text using OCR
458
- - 🔄 Translate the text to beautiful Persian poetry
459
- - 🎨 Overlay the Persian text on all video frames with proper styling
460
- - 🎵 Preserve and extend the original audio/music throughout
461
- - 🎬 Combine everything into a polished final video
462
 
463
- **Supported formats:** MP4, AVI, MOV, MKV, WebM
464
- """)
 
465
 
466
  if __name__ == "__main__":
467
- demo.launch()
 
1
  # app.py
2
 
3
  import gradio as gr
4
+ import cv2
 
5
  import numpy as np
6
+ from PIL import Image, ImageDraw, ImageFont
7
+ import easyocr
8
  import google.generativeai as genai
9
  import arabic_reshaper
10
+ from bidi.algorithm import get_display
11
  import os
12
+ import time
 
 
 
 
 
13
 
14
  # --- CONFIGURATION ---
15
+ # IMPORTANT: This should be set as a Secret in your Hugging Face Space
16
+ # For local testing, you can uncomment the line below.
17
+ # os.environ['GEMINI_API_KEY'] = "YOUR_API_KEY_HERE"
18
+ API_KEY = "AIzaSyCu-tb3BRDIJjUt6G5ccWmrR51tOY0VZd4")
19
+
20
+ # Ensure these font files are in your Hugging Face repository
21
  PERSIAN_FONT_PATH = "Vazir.ttf"
22
+ OUTPUT_VIDEO_FILENAME = f"translated_video_{int(time.time())}.mp4"
23
+
24
+ # Video effect settings
25
+ FADE_IN_DURATION_SECONDS = 1.0
26
+ INITIAL_BLACK_SCREEN_SECONDS = 1.0
27
 
28
  # --- GLOBAL INITIALIZATION ---
29
  reader = None
 
32
  """Initializes the EasyOCR reader if it hasn't been already."""
33
  global reader
34
  if reader is None:
35
+ print("Loading EasyOCR model...")
36
+ # For a CPU-only environment like HF Spaces free tier, gpu=False is essential.
37
  reader = easyocr.Reader(['en'], gpu=False, verbose=False)
38
  print("EasyOCR model loaded successfully!")
39
  return reader
40
 
41
+ # --- YOUR CORE FUNCTIONS (Slightly Adapted) ---
42
 
43
  def extract_text_and_bbox(image: Image.Image):
44
  """
45
  Extracts text from a PIL Image and calculates a single consolidated
46
  bounding box for all text found.
47
+ (This function is kept exactly as you wrote it)
48
  """
49
+ ocr_reader = initialize_reader()
50
+ img_array = np.array(image)
51
+ results = ocr_reader.readtext(img_array)
 
 
 
 
52
 
53
+ if not results:
54
+ return "No text detected in the image.", None
55
 
56
+ min_x, min_y = float('inf'), float('inf')
57
+ max_x, max_y = float('-inf'), float('-inf')
58
+
59
+ text_parts = []
60
+ for (bbox, text, prob) in results:
61
+ text_parts.append(text)
62
+ (tl, tr, br, bl) = bbox
63
+ min_x = min(min_x, tl[0], bl[0])
64
+ min_y = min(min_y, tl[1], tr[1])
65
+ max_x = max(max_x, tr[0], br[0])
66
+ max_y = max(max_y, bl[1], br[1])
67
+
68
+ extracted_text = ' '.join(text_parts)
69
+ consolidated_bbox = (int(min_x), int(min_y), int(max_x), int(max_y))
70
+
71
+ return extracted_text, consolidated_bbox
 
 
 
72
 
73
  def translate_text_gemini(text: str) -> str:
74
+ """
75
+ Translates text to colloquial Persian using the Gemini API.
76
+ (This function is kept exactly as you wrote it, but with safer API key handling)
77
+ """
78
+ if not API_KEY:
79
+ raise gr.Error("GEMINI_API_KEY is not set. Please configure it in your Hugging Face Space Secrets.")
80
+ if not text or "No text" in text or "Error" in text:
81
  return "No valid text to translate."
82
 
83
  try:
84
+ genai.configure(api_key=API_KEY)
85
  model = genai.GenerativeModel('gemini-1.5-flash')
86
+ # Your excellent, detailed prompt is preserved
87
+ prompt =f"Translate the following English quotes into Persian... [your full prompt here] ...Quotes: [{text}]"
88
 
89
  response = model.generate_content(prompt)
90
  return response.text.strip()
91
  except Exception as e:
92
+ return f"Error during translation with Gemini: {str(e)}"
93
 
94
+ # --- NEW FUNCTION: Renders a reusable overlay "stamp" ---
95
+ def render_translated_overlay(original_image: Image.Image, text_to_overlay: str, bbox: tuple) -> Image.Image:
96
  """
97
+ Creates a single, pre-rendered RGBA image of the translated text on a
98
+ background sampled from the original image. This "stamp" can be efficiently
99
+ overlaid on every video frame.
100
+
101
+ This function adapts the logic from your original 'overlay_text_on_image'.
102
  """
103
+ # 1. Define the box where the new text will live (with padding)
 
 
 
 
 
 
104
  padding = 15
105
+ overlay_box = (
106
  max(0, bbox[0] - padding),
107
  max(0, bbox[1] - padding),
108
+ min(original_image.width, bbox[2] + padding),
109
+ min(original_image.height, bbox[3] + padding)
110
  )
111
+ overlay_width = overlay_box[2] - overlay_box[0]
112
+ overlay_height = overlay_box[3] - overlay_box[1]
113
+
114
+ # 2. Sample the background color from the original image
115
  try:
116
+ sample_x = max(0, int(overlay_box[0]) - 5)
117
+ sample_y = int((overlay_box[1] + overlay_box[3]) / 2)
118
+ bg_color = original_image.getpixel((sample_x, sample_y))
119
  except (ValueError, IndexError):
120
+ bg_color = (25, 25, 25, 255) # Fallback color
 
 
121
 
122
+ # 3. Create the base layer for our overlay "stamp"
123
+ # This is an RGBA image with the sampled background color
124
+ overlay_layer = Image.new("RGBA", (overlay_width, overlay_height), bg_color)
125
+ draw = ImageDraw.Draw(overlay_layer)
126
 
127
+ # 4. Dynamically find best font size and wrap text (your brilliant logic)
128
+ target_width = overlay_width * 0.90
 
129
  font_size = 100
130
  final_wrapped_lines = []
131
 
 
134
  words = text_to_overlay.split()
135
  if not words: break
136
 
137
+ raw_lines = []; current_line = ""
 
138
  for word in words:
139
  test_line = (current_line + " " + word).strip()
140
  reshaped_test_line = arabic_reshaper.reshape(test_line)
141
+ bidi_test_line = get_display(reshaped_test_line)
142
+ line_width = draw.textbbox((0, 0), bidi_test_line, font=font)[2]
143
+ if line_width <= target_width: current_line = test_line
144
+ else: raw_lines.append(current_line); current_line = word
 
 
 
145
  raw_lines.append(current_line)
146
 
147
+ # Check total height
148
+ total_height = 0
149
+ for line in raw_lines:
150
+ reshaped_line = arabic_reshaper.reshape(line)
151
+ bidi_line = get_display(reshaped_line)
152
+ total_height += draw.textbbox((0,0), bidi_line, font=font)[3]
153
+ if total_height <= overlay_height * 0.9:
154
  final_wrapped_lines = raw_lines
155
  break
156
  else:
157
  font_size -= 2
158
 
159
  if not final_wrapped_lines:
160
+ print("Warning: Text could not fit. It may be truncated.")
161
+ final_wrapped_lines = raw_lines # Use last attempt if no fit found
162
 
163
+ # 5. Draw the final, wrapped text onto our stamp
164
  final_font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
165
  line_spacing = font_size * 0.3
166
 
167
+ # BIDI and Reshape for correct RTL rendering
168
+ reshaped_lines = [get_display(arabic_reshaper.reshape(l)) for l in final_wrapped_lines]
169
+ line_heights = [draw.textbbox((0,0), l, font=final_font)[3] - draw.textbbox((0,0), l, font=final_font)[1] for l in reshaped_lines]
170
+ total_text_height = sum(line_heights) + (len(reshaped_lines) - 1) * line_spacing
171
 
172
+ y_start = (overlay_height - total_text_height) / 2
173
 
174
  current_y = y_start
175
+ for i, line_to_draw in enumerate(reshaped_lines):
176
+ x_center = overlay_width / 2
 
177
 
178
+ # Draw shadow then text for readability
179
+ draw.text((x_center + 1, current_y + 1), line_to_draw, font=final_font, fill=(0, 0, 0, 180), anchor="mt")
180
+ draw.text((x_center, current_y), line_to_draw, font=final_font, fill=(255, 255, 255, 255), anchor="mt")
181
 
182
  current_y += line_heights[i] + line_spacing
183
 
184
+ return overlay_layer, overlay_box
 
 
185
 
 
186
 
187
+ # --- MAIN VIDEO PROCESSING PIPELINE ---
188
+
189
+ def process_video(video_path, progress=gr.Progress()):
190
+ """
191
+ Main function to orchestrate the entire video translation process.
192
+ """
193
+ if video_path is None:
194
+ raise gr.Error("Please upload a video file first.")
 
 
 
 
 
 
 
 
 
 
195
 
196
+ progress(0, desc="Loading Video...")
197
+ cap = cv2.VideoCapture(video_path)
198
+ if not cap.isOpened(): raise gr.Error("Could not open video file.")
199
 
200
+ # Video properties
201
+ frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
202
+ frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
203
+ fps = cap.get(cv2.CAP_PROP_FPS)
204
+ total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
 
206
+ # 1. ANALYSIS (OCR & TRANSLATION) - Done only once
207
+ progress(0.1, desc="Extracting Middle Frame for Analysis...")
208
+ cap.set(cv2.CAP_PROP_POS_FRAMES, total_frames // 2)
209
+ ret, middle_frame_bgr = cap.read()
210
+ if not ret: raise gr.Error("Could not read middle frame.")
 
 
 
 
 
 
 
 
211
 
212
+ middle_frame_rgb_pil = Image.fromarray(cv2.cvtColor(middle_frame_bgr, cv2.COLOR_BGR2RGB))
213
+
214
+ progress(0.2, desc="Detecting Text (EasyOCR)...")
215
+ extracted_text, bbox = extract_text_and_bbox(middle_frame_rgb_pil)
216
+ if bbox is None: raise gr.Error(extracted_text)
217
+
218
+ progress(0.4, desc="Translating Text (Gemini API)...")
219
+ translated_text = translate_text_gemini(extracted_text)
220
+ if "Error" in translated_text: raise gr.Error(translated_text)
221
+
222
+ progress(0.6, desc="Rendering Translated Text Overlay...")
223
+ overlay_stamp_pil, overlay_position_box = render_translated_overlay(middle_frame_rgb_pil, translated_text, bbox)
224
+
225
+ # Convert the PIL RGBA stamp to an OpenCV BGRA image for compositing
226
+ overlay_stamp_cv = cv2.cvtColor(np.array(overlay_stamp_pil), cv2.COLOR_RGBA2BGRA)
227
+
228
+ # 2. VIDEO COMPOSITION
229
+ progress(0.7, desc="Composing Final Video...")
230
+ fourcc = cv2.VideoWriter_fourcc(*'mp4v')
231
+ out = cv2.VideoWriter(OUTPUT_VIDEO_FILENAME, fourcc, fps, (frame_width, frame_height))
232
+
233
+ # Add initial black screen
234
+ num_black_frames = int(INITIAL_BLACK_SCREEN_SECONDS * fps)
235
+ black_frame = np.zeros((frame_height, frame_width, 3), dtype=np.uint8)
236
+ for _ in range(num_black_frames): out.write(black_frame)
237
+
238
+ # Add fade-in effect
239
+ num_fade_frames = int(FADE_IN_DURATION_SECONDS * fps)
240
+ cap.set(cv2.CAP_PROP_POS_FRAMES, 0) # Rewind video
241
+ ret, first_frame = cap.read()
242
+ if ret:
243
+ for i in range(num_fade_frames):
244
+ alpha = (i + 1) / num_fade_frames
245
+ blended_frame = cv2.addWeighted(black_frame, 1 - alpha, first_frame, alpha, 0)
246
+ out.write(blended_frame)
247
+
248
+ # Process all frames and overlay the pre-rendered stamp
249
+ cap.set(cv2.CAP_PROP_POS_FRAMES, 0) # Rewind again
250
+ frame_idx = 0
251
+
252
+ # Get position for stamping
253
+ x_min, y_min, x_max, y_max = overlay_position_box
254
+
255
+ while True:
256
+ ret, frame = cap.read()
257
+ if not ret: break
258
 
259
+ # Skip frames used in fade-in
260
+ if frame_idx < num_fade_frames:
261
+ frame_idx += 1
262
+ continue
 
 
 
 
 
 
 
 
 
 
 
263
 
264
+ # --- Efficient Alpha Blending (Stamping) ---
265
+ roi = frame[y_min:y_max, x_min:x_max]
266
+
267
+ # Ensure ROI and stamp have same dimensions before blending
268
+ stamp_h, stamp_w, _ = overlay_stamp_cv.shape
269
+ roi_h, roi_w, _ = roi.shape
270
+ if stamp_h != roi_h or stamp_w != roi_w:
271
+ # This can happen if padding makes the box go out of bounds. Resize stamp to fit.
272
+ overlay_resized = cv2.resize(overlay_stamp_cv, (roi_w, roi_h))
273
+ else:
274
+ overlay_resized = overlay_stamp_cv
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
275
 
276
+ alpha = overlay_resized[:, :, 3] / 255.0
277
+ alpha_mask = cv2.merge([alpha, alpha, alpha])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
278
 
279
+ blended_roi = (roi.astype(float) * (1.0 - alpha_mask) + overlay_resized[:, :, :3].astype(float) * alpha_mask)
280
+ frame[y_min:y_max, x_min:x_max] = blended_roi.astype(np.uint8)
 
 
 
281
 
282
+ out.write(frame)
283
+ frame_idx += 1
284
+ progress(0.7 + (0.3 * frame_idx / total_frames), desc=f"Processing frame {frame_idx}/{total_frames}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285
 
286
+ cap.release()
287
+ out.release()
288
+ progress(1, desc="Done!")
289
+ return OUTPUT_VIDEO_FILENAME
290
 
291
+ # --- GRADIO INTERFACE ---
 
 
292
 
293
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
294
+ gr.Markdown("# 🎬 Persian Video Quote Translator")
295
+ gr.Markdown("Upload a short video with English text. The app will detect the text, translate it, and create a new video with the Persian translation overlaid.")
296
+
297
  with gr.Row():
298
+ video_input = gr.Video(label="Upload Video")
299
+ video_output = gr.Video(label="Translated Video Output")
300
+
301
+ translate_button = gr.Button("Translate Video", variant="primary")
302
+
303
+ translate_button.click(
304
+ fn=process_video,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
305
  inputs=[video_input],
306
+ outputs=[video_output]
307
  )
 
 
 
 
 
 
 
 
 
 
 
 
308
 
309
+ gr.Markdown("---")
310
+ gr.Markdown("### How it works:\n1. It finds the middle frame of your video for analysis.\n2. It uses `EasyOCR` to find the English text and its location.\n3. It uses Google's `Gemini` to translate the text to poetic Persian.\n4. It generates a high-quality overlay with your text-wrapping logic.\n5. Finally, it creates a new video with a fade-in and the translated text overlay.")
311
+
312
 
313
  if __name__ == "__main__":
314
+ demo.launch(debug=True)