Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,22 +1,29 @@
|
|
1 |
# app.py
|
2 |
|
3 |
import gradio as gr
|
4 |
-
import
|
5 |
-
from PIL import Image, ImageDraw, ImageFont
|
6 |
import numpy as np
|
|
|
|
|
7 |
import google.generativeai as genai
|
8 |
import arabic_reshaper
|
|
|
9 |
import os
|
10 |
-
import
|
11 |
-
from moviepy.editor import *
|
12 |
-
from moviepy.video.fx import resize, fadein, fadeout
|
13 |
-
import tempfile
|
14 |
-
import math
|
15 |
-
import random
|
16 |
|
17 |
# --- CONFIGURATION ---
|
18 |
-
|
|
|
|
|
|
|
|
|
|
|
19 |
PERSIAN_FONT_PATH = "Vazir.ttf"
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
# --- GLOBAL INITIALIZATION ---
|
22 |
reader = None
|
@@ -25,100 +32,100 @@ def initialize_reader():
|
|
25 |
"""Initializes the EasyOCR reader if it hasn't been already."""
|
26 |
global reader
|
27 |
if reader is None:
|
28 |
-
print("Loading EasyOCR model...
|
|
|
29 |
reader = easyocr.Reader(['en'], gpu=False, verbose=False)
|
30 |
print("EasyOCR model loaded successfully!")
|
31 |
return reader
|
32 |
|
33 |
-
# --- CORE FUNCTIONS
|
34 |
|
35 |
def extract_text_and_bbox(image: Image.Image):
|
36 |
"""
|
37 |
Extracts text from a PIL Image and calculates a single consolidated
|
38 |
bounding box for all text found.
|
|
|
39 |
"""
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
try:
|
44 |
-
ocr_reader = initialize_reader()
|
45 |
-
img_array = np.array(image)
|
46 |
-
results = ocr_reader.readtext(img_array)
|
47 |
|
48 |
-
|
49 |
-
|
50 |
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
except Exception as e:
|
69 |
-
return f"Error processing image with OCR: {str(e)}", None
|
70 |
|
71 |
def translate_text_gemini(text: str) -> str:
|
72 |
-
"""
|
73 |
-
|
|
|
|
|
|
|
|
|
|
|
74 |
return "No valid text to translate."
|
75 |
|
76 |
try:
|
77 |
-
genai.configure(api_key=
|
78 |
model = genai.GenerativeModel('gemini-1.5-flash')
|
79 |
-
|
|
|
80 |
|
81 |
response = model.generate_content(prompt)
|
82 |
return response.text.strip()
|
83 |
except Exception as e:
|
84 |
-
return f"Error during translation: {str(e)}"
|
85 |
|
86 |
-
|
|
|
87 |
"""
|
88 |
-
|
89 |
-
|
|
|
|
|
|
|
90 |
"""
|
91 |
-
|
92 |
-
txt_layer = Image.new("RGBA", image_copy.size, (255, 255, 255, 0))
|
93 |
-
draw = ImageDraw.Draw(txt_layer)
|
94 |
-
|
95 |
-
# 1. Erase the old text area (Inpainting) by drawing a colored box over it
|
96 |
-
erase_layer = image_copy.copy()
|
97 |
-
draw_erase = ImageDraw.Draw(erase_layer)
|
98 |
padding = 15
|
99 |
-
|
100 |
max(0, bbox[0] - padding),
|
101 |
max(0, bbox[1] - padding),
|
102 |
-
min(
|
103 |
-
min(
|
104 |
)
|
105 |
-
|
|
|
|
|
|
|
106 |
try:
|
107 |
-
sample_x = max(0, int(
|
108 |
-
sample_y = int((
|
109 |
-
bg_color =
|
110 |
except (ValueError, IndexError):
|
111 |
-
bg_color = (
|
112 |
-
|
113 |
-
draw_erase.rectangle(erase_box, fill=bg_color)
|
114 |
|
115 |
-
#
|
116 |
-
|
117 |
-
|
|
|
118 |
|
119 |
-
#
|
120 |
-
target_width =
|
121 |
-
target_height = (erase_box[3] - erase_box[1])
|
122 |
font_size = 100
|
123 |
final_wrapped_lines = []
|
124 |
|
@@ -127,341 +134,181 @@ def overlay_text_on_image(original_image: Image.Image, text_to_overlay: str, bbo
|
|
127 |
words = text_to_overlay.split()
|
128 |
if not words: break
|
129 |
|
130 |
-
raw_lines = []
|
131 |
-
current_line = ""
|
132 |
for word in words:
|
133 |
test_line = (current_line + " " + word).strip()
|
134 |
reshaped_test_line = arabic_reshaper.reshape(test_line)
|
135 |
-
|
136 |
-
|
137 |
-
if line_width <= target_width:
|
138 |
-
|
139 |
-
else:
|
140 |
-
raw_lines.append(current_line)
|
141 |
-
current_line = word
|
142 |
raw_lines.append(current_line)
|
143 |
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
|
|
150 |
final_wrapped_lines = raw_lines
|
151 |
break
|
152 |
else:
|
153 |
font_size -= 2
|
154 |
|
155 |
if not final_wrapped_lines:
|
156 |
-
print("Warning: Text could not fit
|
157 |
-
|
158 |
|
159 |
-
#
|
160 |
final_font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
|
161 |
line_spacing = font_size * 0.3
|
162 |
|
163 |
-
|
164 |
-
|
165 |
-
|
|
|
166 |
|
167 |
-
y_start =
|
168 |
|
169 |
current_y = y_start
|
170 |
-
for i,
|
171 |
-
x_center =
|
172 |
-
line_y_center = current_y + line_heights[i] / 2
|
173 |
|
174 |
-
|
175 |
-
draw.text((x_center,
|
|
|
176 |
|
177 |
current_y += line_heights[i] + line_spacing
|
178 |
|
179 |
-
|
180 |
-
out_image = Image.alpha_composite(erase_layer, txt_layer)
|
181 |
-
return out_image.convert("RGB")
|
182 |
|
183 |
-
# --- NEW VIDEO PROCESSING FUNCTIONS ---
|
184 |
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
ret, frame = cap.read()
|
194 |
-
cap.release()
|
195 |
-
|
196 |
-
if ret:
|
197 |
-
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
|
198 |
-
return Image.fromarray(frame_rgb)
|
199 |
-
return None
|
200 |
-
except Exception as e:
|
201 |
-
print(f"Error extracting middle frame: {e}")
|
202 |
-
return None
|
203 |
|
204 |
-
|
205 |
-
|
|
|
206 |
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
for i in range(size[1]):
|
213 |
-
warm_intensity = int(25 + 15 * math.sin(i * 0.01))
|
214 |
-
img[i, :] = [warm_intensity//2, warm_intensity//3, warm_intensity]
|
215 |
-
|
216 |
-
center_x, center_y = size[0]//2, size[1]//2
|
217 |
-
|
218 |
-
# Musical rhythm visualization (simulating ukulele strums)
|
219 |
-
beat_time = t * 4 # 4 beats per second like ukulele strumming
|
220 |
-
beat_intensity = abs(math.sin(beat_time * math.pi)) ** 0.5
|
221 |
-
|
222 |
-
# Create pulsing circles (like sound waves)
|
223 |
-
for radius_base in [100, 150, 200, 250]:
|
224 |
-
radius = int(radius_base + beat_intensity * 30)
|
225 |
-
alpha = max(0, 0.3 - (t / duration) * 0.2)
|
226 |
-
circle_intensity = int(alpha * 255 * beat_intensity)
|
227 |
-
|
228 |
-
if circle_intensity > 10:
|
229 |
-
cv2.circle(img, (center_x, center_y), radius,
|
230 |
-
(circle_intensity//3, circle_intensity//4, circle_intensity//2), 2)
|
231 |
-
|
232 |
-
# Add rotating elements (like guitar picks or musical notes)
|
233 |
-
for i in range(6):
|
234 |
-
angle = (t * 60 + i * 60) % 360 # Rotating elements
|
235 |
-
distance = 180 + 20 * math.sin(beat_time)
|
236 |
-
|
237 |
-
x = int(center_x + distance * math.cos(math.radians(angle)))
|
238 |
-
y = int(center_y + distance * math.sin(math.radians(angle)))
|
239 |
-
|
240 |
-
# Draw musical note-like shapes
|
241 |
-
note_size = int(8 + beat_intensity * 4)
|
242 |
-
cv2.circle(img, (x, y), note_size, (150, 100, 50), -1)
|
243 |
-
cv2.circle(img, (x, y), note_size + 2, (200, 150, 100), 2)
|
244 |
-
|
245 |
-
# Add string-like lines (simulating ukulele strings)
|
246 |
-
for i in range(4):
|
247 |
-
y_pos = center_y - 60 + i * 40
|
248 |
-
line_alpha = beat_intensity * 0.5
|
249 |
-
line_intensity = int(line_alpha * 255)
|
250 |
-
|
251 |
-
if line_intensity > 20:
|
252 |
-
# Create wavy lines like vibrating strings
|
253 |
-
points = []
|
254 |
-
for x in range(0, size[0], 10):
|
255 |
-
wave_y = y_pos + int(10 * math.sin(x * 0.02 + t * 8) * beat_intensity)
|
256 |
-
points.append((x, wave_y))
|
257 |
-
|
258 |
-
for j in range(len(points)-1):
|
259 |
-
cv2.line(img, points[j], points[j+1],
|
260 |
-
(line_intensity//2, line_intensity//3, line_intensity//4), 2)
|
261 |
-
|
262 |
-
# Add fade in/out effects
|
263 |
-
fade_alpha = 1.0
|
264 |
-
if t < 0.5:
|
265 |
-
fade_alpha = t / 0.5
|
266 |
-
elif t > duration - 0.5:
|
267 |
-
fade_alpha = (duration - t) / 0.5
|
268 |
-
|
269 |
-
img = (img * fade_alpha).astype(np.uint8)
|
270 |
-
|
271 |
-
return img
|
272 |
|
273 |
-
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
|
278 |
-
overlaid_frame = overlay_text_on_image(pil_frame, text_to_overlay, bbox)
|
279 |
-
return np.array(overlaid_frame)
|
280 |
-
|
281 |
-
def process_video_with_text_overlay(video_path, translated_text, bbox):
|
282 |
-
"""Process video and apply text overlay to all frames."""
|
283 |
-
def apply_overlay(get_frame, t):
|
284 |
-
frame = get_frame(t)
|
285 |
-
return apply_text_overlay_to_frame(frame, translated_text, bbox)
|
286 |
|
287 |
-
|
288 |
-
|
289 |
-
|
290 |
-
|
291 |
-
|
292 |
-
|
293 |
-
|
294 |
-
|
295 |
-
|
296 |
-
|
297 |
-
|
298 |
-
|
299 |
-
|
300 |
-
|
301 |
-
|
302 |
-
|
303 |
-
|
304 |
-
|
305 |
-
|
306 |
-
|
307 |
-
|
308 |
-
|
309 |
-
|
310 |
-
|
311 |
-
|
312 |
-
|
313 |
-
|
314 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
315 |
|
316 |
-
#
|
317 |
-
if
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
-
# If original audio is shorter than intro, loop it
|
322 |
-
if original_audio.duration < intro_duration:
|
323 |
-
loops_needed = int(intro_duration / original_audio.duration) + 1
|
324 |
-
extended_audio = concatenate_audioclips([original_audio] * loops_needed)
|
325 |
-
intro_audio = extended_audio.subclip(0, intro_duration)
|
326 |
-
else:
|
327 |
-
intro_audio = original_audio.subclip(0, intro_duration)
|
328 |
-
|
329 |
-
# Combine intro audio + full original audio
|
330 |
-
full_audio = concatenate_audioclips([intro_audio, original_audio])
|
331 |
|
332 |
-
|
333 |
-
|
334 |
-
|
335 |
-
|
336 |
-
|
337 |
-
|
338 |
-
|
339 |
-
|
340 |
-
|
341 |
-
|
342 |
-
|
343 |
-
temp_audiofile='temp-audio.m4a',
|
344 |
-
remove_temp=True,
|
345 |
-
fps=original_video.fps,
|
346 |
-
preset='medium'
|
347 |
-
)
|
348 |
-
|
349 |
-
# Clean up
|
350 |
-
original_video.close()
|
351 |
-
final_video.close()
|
352 |
-
|
353 |
-
return output_path
|
354 |
-
|
355 |
-
except Exception as e:
|
356 |
-
print(f"Error creating final video: {e}")
|
357 |
-
return None
|
358 |
|
359 |
-
|
360 |
-
|
361 |
-
if video_file is None:
|
362 |
-
return "Please upload a video.", "Translation will appear here.", None, None
|
363 |
-
|
364 |
-
try:
|
365 |
-
# Create temporary files
|
366 |
-
temp_input = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4')
|
367 |
-
temp_output = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4')
|
368 |
-
|
369 |
-
# Save uploaded video
|
370 |
-
with open(temp_input.name, 'wb') as f:
|
371 |
-
f.write(video_file)
|
372 |
-
|
373 |
-
# Extract middle frame for OCR
|
374 |
-
print("Extracting middle frame for OCR...")
|
375 |
-
middle_frame = extract_middle_frame(temp_input.name)
|
376 |
-
if middle_frame is None:
|
377 |
-
return "Error extracting frame from video.", "No text to translate.", None, None
|
378 |
|
379 |
-
|
380 |
-
|
381 |
-
extracted_text, bbox = extract_text_and_bbox(middle_frame)
|
382 |
-
if bbox is None:
|
383 |
-
return extracted_text, "No text to translate.", middle_frame, None
|
384 |
|
385 |
-
|
386 |
-
|
387 |
-
|
388 |
-
if "Error" in translated_text:
|
389 |
-
return extracted_text, translated_text, middle_frame, None
|
390 |
-
|
391 |
-
# Create final video with intro and text overlay
|
392 |
-
print("Creating final video with intro effect...")
|
393 |
-
output_path = create_final_video_with_intro(temp_input.name, translated_text, bbox, temp_output.name)
|
394 |
-
if output_path is None:
|
395 |
-
return extracted_text, translated_text, middle_frame, None
|
396 |
-
|
397 |
-
print("Video processing completed successfully!")
|
398 |
-
return extracted_text, translated_text, middle_frame, output_path
|
399 |
-
|
400 |
-
except Exception as e:
|
401 |
-
return f"Error processing video: {str(e)}", "Translation failed.", None, None
|
402 |
|
403 |
-
|
|
|
|
|
|
|
404 |
|
405 |
-
|
406 |
-
gr.Markdown("# 🎬 Persian Video Quote Translator with Sama Intro")
|
407 |
-
gr.Markdown("Upload a video with English text. The app will create a stylized intro effect, detect text from the middle frame, translate it to Persian, and overlay it on the entire video while preserving the original music.")
|
408 |
|
|
|
|
|
|
|
|
|
409 |
with gr.Row():
|
410 |
-
|
411 |
-
|
412 |
-
|
413 |
-
|
414 |
-
|
415 |
-
|
416 |
-
|
417 |
-
process_btn = gr.Button("🎯 Process Video", variant="primary", size="lg")
|
418 |
-
|
419 |
-
with gr.Row():
|
420 |
-
text_output = gr.Textbox(
|
421 |
-
label="📝 Extracted English Text",
|
422 |
-
placeholder="Detected English text will appear here...",
|
423 |
-
lines=3,
|
424 |
-
show_copy_button=True
|
425 |
-
)
|
426 |
-
|
427 |
-
translated_output = gr.Textbox(
|
428 |
-
label="🔤 Persian Translation",
|
429 |
-
placeholder="Persian translation will appear here...",
|
430 |
-
lines=3,
|
431 |
-
show_copy_button=True
|
432 |
-
)
|
433 |
-
|
434 |
-
with gr.Column(scale=1):
|
435 |
-
frame_output = gr.Image(
|
436 |
-
label="🖼️ Middle Frame (OCR Source)",
|
437 |
-
type="pil"
|
438 |
-
)
|
439 |
-
|
440 |
-
video_output = gr.Video(
|
441 |
-
label="🎥 Final Video with Sama Intro",
|
442 |
-
format="mp4"
|
443 |
-
)
|
444 |
-
|
445 |
-
process_btn.click(
|
446 |
-
fn=process_video_pipeline,
|
447 |
inputs=[video_input],
|
448 |
-
outputs=[
|
449 |
)
|
450 |
-
|
451 |
-
gr.Markdown("### 📋 How it works:")
|
452 |
-
gr.Markdown("""
|
453 |
-
1. **Upload** a video file containing English text
|
454 |
-
2. **Click** 'Process Video' to start the magic ✨
|
455 |
-
3. The app will:
|
456 |
-
- 🎼 Create a sama-style intro with musical rhythm effects (like your reference video)
|
457 |
-
- 👁️ Extract the middle frame and detect English text using OCR
|
458 |
-
- 🔄 Translate the text to beautiful Persian poetry
|
459 |
-
- 🎨 Overlay the Persian text on all video frames with proper styling
|
460 |
-
- 🎵 Preserve and extend the original audio/music throughout
|
461 |
-
- 🎬 Combine everything into a polished final video
|
462 |
|
463 |
-
|
464 |
-
""
|
|
|
465 |
|
466 |
if __name__ == "__main__":
|
467 |
-
demo.launch()
|
|
|
1 |
# app.py
|
2 |
|
3 |
import gradio as gr
|
4 |
+
import cv2
|
|
|
5 |
import numpy as np
|
6 |
+
from PIL import Image, ImageDraw, ImageFont
|
7 |
+
import easyocr
|
8 |
import google.generativeai as genai
|
9 |
import arabic_reshaper
|
10 |
+
from bidi.algorithm import get_display
|
11 |
import os
|
12 |
+
import time
|
|
|
|
|
|
|
|
|
|
|
13 |
|
14 |
# --- CONFIGURATION ---
|
15 |
+
# IMPORTANT: This should be set as a Secret in your Hugging Face Space
|
16 |
+
# For local testing, you can uncomment the line below.
|
17 |
+
# os.environ['GEMINI_API_KEY'] = "YOUR_API_KEY_HERE"
|
18 |
+
API_KEY = "AIzaSyCu-tb3BRDIJjUt6G5ccWmrR51tOY0VZd4")
|
19 |
+
|
20 |
+
# Ensure these font files are in your Hugging Face repository
|
21 |
PERSIAN_FONT_PATH = "Vazir.ttf"
|
22 |
+
OUTPUT_VIDEO_FILENAME = f"translated_video_{int(time.time())}.mp4"
|
23 |
+
|
24 |
+
# Video effect settings
|
25 |
+
FADE_IN_DURATION_SECONDS = 1.0
|
26 |
+
INITIAL_BLACK_SCREEN_SECONDS = 1.0
|
27 |
|
28 |
# --- GLOBAL INITIALIZATION ---
|
29 |
reader = None
|
|
|
32 |
"""Initializes the EasyOCR reader if it hasn't been already."""
|
33 |
global reader
|
34 |
if reader is None:
|
35 |
+
print("Loading EasyOCR model...")
|
36 |
+
# For a CPU-only environment like HF Spaces free tier, gpu=False is essential.
|
37 |
reader = easyocr.Reader(['en'], gpu=False, verbose=False)
|
38 |
print("EasyOCR model loaded successfully!")
|
39 |
return reader
|
40 |
|
41 |
+
# --- YOUR CORE FUNCTIONS (Slightly Adapted) ---
|
42 |
|
43 |
def extract_text_and_bbox(image: Image.Image):
|
44 |
"""
|
45 |
Extracts text from a PIL Image and calculates a single consolidated
|
46 |
bounding box for all text found.
|
47 |
+
(This function is kept exactly as you wrote it)
|
48 |
"""
|
49 |
+
ocr_reader = initialize_reader()
|
50 |
+
img_array = np.array(image)
|
51 |
+
results = ocr_reader.readtext(img_array)
|
|
|
|
|
|
|
|
|
52 |
|
53 |
+
if not results:
|
54 |
+
return "No text detected in the image.", None
|
55 |
|
56 |
+
min_x, min_y = float('inf'), float('inf')
|
57 |
+
max_x, max_y = float('-inf'), float('-inf')
|
58 |
+
|
59 |
+
text_parts = []
|
60 |
+
for (bbox, text, prob) in results:
|
61 |
+
text_parts.append(text)
|
62 |
+
(tl, tr, br, bl) = bbox
|
63 |
+
min_x = min(min_x, tl[0], bl[0])
|
64 |
+
min_y = min(min_y, tl[1], tr[1])
|
65 |
+
max_x = max(max_x, tr[0], br[0])
|
66 |
+
max_y = max(max_y, bl[1], br[1])
|
67 |
+
|
68 |
+
extracted_text = ' '.join(text_parts)
|
69 |
+
consolidated_bbox = (int(min_x), int(min_y), int(max_x), int(max_y))
|
70 |
+
|
71 |
+
return extracted_text, consolidated_bbox
|
|
|
|
|
|
|
72 |
|
73 |
def translate_text_gemini(text: str) -> str:
|
74 |
+
"""
|
75 |
+
Translates text to colloquial Persian using the Gemini API.
|
76 |
+
(This function is kept exactly as you wrote it, but with safer API key handling)
|
77 |
+
"""
|
78 |
+
if not API_KEY:
|
79 |
+
raise gr.Error("GEMINI_API_KEY is not set. Please configure it in your Hugging Face Space Secrets.")
|
80 |
+
if not text or "No text" in text or "Error" in text:
|
81 |
return "No valid text to translate."
|
82 |
|
83 |
try:
|
84 |
+
genai.configure(api_key=API_KEY)
|
85 |
model = genai.GenerativeModel('gemini-1.5-flash')
|
86 |
+
# Your excellent, detailed prompt is preserved
|
87 |
+
prompt =f"Translate the following English quotes into Persian... [your full prompt here] ...Quotes: [{text}]"
|
88 |
|
89 |
response = model.generate_content(prompt)
|
90 |
return response.text.strip()
|
91 |
except Exception as e:
|
92 |
+
return f"Error during translation with Gemini: {str(e)}"
|
93 |
|
94 |
+
# --- NEW FUNCTION: Renders a reusable overlay "stamp" ---
|
95 |
+
def render_translated_overlay(original_image: Image.Image, text_to_overlay: str, bbox: tuple) -> Image.Image:
|
96 |
"""
|
97 |
+
Creates a single, pre-rendered RGBA image of the translated text on a
|
98 |
+
background sampled from the original image. This "stamp" can be efficiently
|
99 |
+
overlaid on every video frame.
|
100 |
+
|
101 |
+
This function adapts the logic from your original 'overlay_text_on_image'.
|
102 |
"""
|
103 |
+
# 1. Define the box where the new text will live (with padding)
|
|
|
|
|
|
|
|
|
|
|
|
|
104 |
padding = 15
|
105 |
+
overlay_box = (
|
106 |
max(0, bbox[0] - padding),
|
107 |
max(0, bbox[1] - padding),
|
108 |
+
min(original_image.width, bbox[2] + padding),
|
109 |
+
min(original_image.height, bbox[3] + padding)
|
110 |
)
|
111 |
+
overlay_width = overlay_box[2] - overlay_box[0]
|
112 |
+
overlay_height = overlay_box[3] - overlay_box[1]
|
113 |
+
|
114 |
+
# 2. Sample the background color from the original image
|
115 |
try:
|
116 |
+
sample_x = max(0, int(overlay_box[0]) - 5)
|
117 |
+
sample_y = int((overlay_box[1] + overlay_box[3]) / 2)
|
118 |
+
bg_color = original_image.getpixel((sample_x, sample_y))
|
119 |
except (ValueError, IndexError):
|
120 |
+
bg_color = (25, 25, 25, 255) # Fallback color
|
|
|
|
|
121 |
|
122 |
+
# 3. Create the base layer for our overlay "stamp"
|
123 |
+
# This is an RGBA image with the sampled background color
|
124 |
+
overlay_layer = Image.new("RGBA", (overlay_width, overlay_height), bg_color)
|
125 |
+
draw = ImageDraw.Draw(overlay_layer)
|
126 |
|
127 |
+
# 4. Dynamically find best font size and wrap text (your brilliant logic)
|
128 |
+
target_width = overlay_width * 0.90
|
|
|
129 |
font_size = 100
|
130 |
final_wrapped_lines = []
|
131 |
|
|
|
134 |
words = text_to_overlay.split()
|
135 |
if not words: break
|
136 |
|
137 |
+
raw_lines = []; current_line = ""
|
|
|
138 |
for word in words:
|
139 |
test_line = (current_line + " " + word).strip()
|
140 |
reshaped_test_line = arabic_reshaper.reshape(test_line)
|
141 |
+
bidi_test_line = get_display(reshaped_test_line)
|
142 |
+
line_width = draw.textbbox((0, 0), bidi_test_line, font=font)[2]
|
143 |
+
if line_width <= target_width: current_line = test_line
|
144 |
+
else: raw_lines.append(current_line); current_line = word
|
|
|
|
|
|
|
145 |
raw_lines.append(current_line)
|
146 |
|
147 |
+
# Check total height
|
148 |
+
total_height = 0
|
149 |
+
for line in raw_lines:
|
150 |
+
reshaped_line = arabic_reshaper.reshape(line)
|
151 |
+
bidi_line = get_display(reshaped_line)
|
152 |
+
total_height += draw.textbbox((0,0), bidi_line, font=font)[3]
|
153 |
+
if total_height <= overlay_height * 0.9:
|
154 |
final_wrapped_lines = raw_lines
|
155 |
break
|
156 |
else:
|
157 |
font_size -= 2
|
158 |
|
159 |
if not final_wrapped_lines:
|
160 |
+
print("Warning: Text could not fit. It may be truncated.")
|
161 |
+
final_wrapped_lines = raw_lines # Use last attempt if no fit found
|
162 |
|
163 |
+
# 5. Draw the final, wrapped text onto our stamp
|
164 |
final_font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
|
165 |
line_spacing = font_size * 0.3
|
166 |
|
167 |
+
# BIDI and Reshape for correct RTL rendering
|
168 |
+
reshaped_lines = [get_display(arabic_reshaper.reshape(l)) for l in final_wrapped_lines]
|
169 |
+
line_heights = [draw.textbbox((0,0), l, font=final_font)[3] - draw.textbbox((0,0), l, font=final_font)[1] for l in reshaped_lines]
|
170 |
+
total_text_height = sum(line_heights) + (len(reshaped_lines) - 1) * line_spacing
|
171 |
|
172 |
+
y_start = (overlay_height - total_text_height) / 2
|
173 |
|
174 |
current_y = y_start
|
175 |
+
for i, line_to_draw in enumerate(reshaped_lines):
|
176 |
+
x_center = overlay_width / 2
|
|
|
177 |
|
178 |
+
# Draw shadow then text for readability
|
179 |
+
draw.text((x_center + 1, current_y + 1), line_to_draw, font=final_font, fill=(0, 0, 0, 180), anchor="mt")
|
180 |
+
draw.text((x_center, current_y), line_to_draw, font=final_font, fill=(255, 255, 255, 255), anchor="mt")
|
181 |
|
182 |
current_y += line_heights[i] + line_spacing
|
183 |
|
184 |
+
return overlay_layer, overlay_box
|
|
|
|
|
185 |
|
|
|
186 |
|
187 |
+
# --- MAIN VIDEO PROCESSING PIPELINE ---
|
188 |
+
|
189 |
+
def process_video(video_path, progress=gr.Progress()):
|
190 |
+
"""
|
191 |
+
Main function to orchestrate the entire video translation process.
|
192 |
+
"""
|
193 |
+
if video_path is None:
|
194 |
+
raise gr.Error("Please upload a video file first.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
195 |
|
196 |
+
progress(0, desc="Loading Video...")
|
197 |
+
cap = cv2.VideoCapture(video_path)
|
198 |
+
if not cap.isOpened(): raise gr.Error("Could not open video file.")
|
199 |
|
200 |
+
# Video properties
|
201 |
+
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
|
202 |
+
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
|
203 |
+
fps = cap.get(cv2.CAP_PROP_FPS)
|
204 |
+
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
205 |
|
206 |
+
# 1. ANALYSIS (OCR & TRANSLATION) - Done only once
|
207 |
+
progress(0.1, desc="Extracting Middle Frame for Analysis...")
|
208 |
+
cap.set(cv2.CAP_PROP_POS_FRAMES, total_frames // 2)
|
209 |
+
ret, middle_frame_bgr = cap.read()
|
210 |
+
if not ret: raise gr.Error("Could not read middle frame.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
211 |
|
212 |
+
middle_frame_rgb_pil = Image.fromarray(cv2.cvtColor(middle_frame_bgr, cv2.COLOR_BGR2RGB))
|
213 |
+
|
214 |
+
progress(0.2, desc="Detecting Text (EasyOCR)...")
|
215 |
+
extracted_text, bbox = extract_text_and_bbox(middle_frame_rgb_pil)
|
216 |
+
if bbox is None: raise gr.Error(extracted_text)
|
217 |
+
|
218 |
+
progress(0.4, desc="Translating Text (Gemini API)...")
|
219 |
+
translated_text = translate_text_gemini(extracted_text)
|
220 |
+
if "Error" in translated_text: raise gr.Error(translated_text)
|
221 |
+
|
222 |
+
progress(0.6, desc="Rendering Translated Text Overlay...")
|
223 |
+
overlay_stamp_pil, overlay_position_box = render_translated_overlay(middle_frame_rgb_pil, translated_text, bbox)
|
224 |
+
|
225 |
+
# Convert the PIL RGBA stamp to an OpenCV BGRA image for compositing
|
226 |
+
overlay_stamp_cv = cv2.cvtColor(np.array(overlay_stamp_pil), cv2.COLOR_RGBA2BGRA)
|
227 |
+
|
228 |
+
# 2. VIDEO COMPOSITION
|
229 |
+
progress(0.7, desc="Composing Final Video...")
|
230 |
+
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
|
231 |
+
out = cv2.VideoWriter(OUTPUT_VIDEO_FILENAME, fourcc, fps, (frame_width, frame_height))
|
232 |
+
|
233 |
+
# Add initial black screen
|
234 |
+
num_black_frames = int(INITIAL_BLACK_SCREEN_SECONDS * fps)
|
235 |
+
black_frame = np.zeros((frame_height, frame_width, 3), dtype=np.uint8)
|
236 |
+
for _ in range(num_black_frames): out.write(black_frame)
|
237 |
+
|
238 |
+
# Add fade-in effect
|
239 |
+
num_fade_frames = int(FADE_IN_DURATION_SECONDS * fps)
|
240 |
+
cap.set(cv2.CAP_PROP_POS_FRAMES, 0) # Rewind video
|
241 |
+
ret, first_frame = cap.read()
|
242 |
+
if ret:
|
243 |
+
for i in range(num_fade_frames):
|
244 |
+
alpha = (i + 1) / num_fade_frames
|
245 |
+
blended_frame = cv2.addWeighted(black_frame, 1 - alpha, first_frame, alpha, 0)
|
246 |
+
out.write(blended_frame)
|
247 |
+
|
248 |
+
# Process all frames and overlay the pre-rendered stamp
|
249 |
+
cap.set(cv2.CAP_PROP_POS_FRAMES, 0) # Rewind again
|
250 |
+
frame_idx = 0
|
251 |
+
|
252 |
+
# Get position for stamping
|
253 |
+
x_min, y_min, x_max, y_max = overlay_position_box
|
254 |
+
|
255 |
+
while True:
|
256 |
+
ret, frame = cap.read()
|
257 |
+
if not ret: break
|
258 |
|
259 |
+
# Skip frames used in fade-in
|
260 |
+
if frame_idx < num_fade_frames:
|
261 |
+
frame_idx += 1
|
262 |
+
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
263 |
|
264 |
+
# --- Efficient Alpha Blending (Stamping) ---
|
265 |
+
roi = frame[y_min:y_max, x_min:x_max]
|
266 |
+
|
267 |
+
# Ensure ROI and stamp have same dimensions before blending
|
268 |
+
stamp_h, stamp_w, _ = overlay_stamp_cv.shape
|
269 |
+
roi_h, roi_w, _ = roi.shape
|
270 |
+
if stamp_h != roi_h or stamp_w != roi_w:
|
271 |
+
# This can happen if padding makes the box go out of bounds. Resize stamp to fit.
|
272 |
+
overlay_resized = cv2.resize(overlay_stamp_cv, (roi_w, roi_h))
|
273 |
+
else:
|
274 |
+
overlay_resized = overlay_stamp_cv
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
275 |
|
276 |
+
alpha = overlay_resized[:, :, 3] / 255.0
|
277 |
+
alpha_mask = cv2.merge([alpha, alpha, alpha])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
278 |
|
279 |
+
blended_roi = (roi.astype(float) * (1.0 - alpha_mask) + overlay_resized[:, :, :3].astype(float) * alpha_mask)
|
280 |
+
frame[y_min:y_max, x_min:x_max] = blended_roi.astype(np.uint8)
|
|
|
|
|
|
|
281 |
|
282 |
+
out.write(frame)
|
283 |
+
frame_idx += 1
|
284 |
+
progress(0.7 + (0.3 * frame_idx / total_frames), desc=f"Processing frame {frame_idx}/{total_frames}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
285 |
|
286 |
+
cap.release()
|
287 |
+
out.release()
|
288 |
+
progress(1, desc="Done!")
|
289 |
+
return OUTPUT_VIDEO_FILENAME
|
290 |
|
291 |
+
# --- GRADIO INTERFACE ---
|
|
|
|
|
292 |
|
293 |
+
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
294 |
+
gr.Markdown("# 🎬 Persian Video Quote Translator")
|
295 |
+
gr.Markdown("Upload a short video with English text. The app will detect the text, translate it, and create a new video with the Persian translation overlaid.")
|
296 |
+
|
297 |
with gr.Row():
|
298 |
+
video_input = gr.Video(label="Upload Video")
|
299 |
+
video_output = gr.Video(label="Translated Video Output")
|
300 |
+
|
301 |
+
translate_button = gr.Button("Translate Video", variant="primary")
|
302 |
+
|
303 |
+
translate_button.click(
|
304 |
+
fn=process_video,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
305 |
inputs=[video_input],
|
306 |
+
outputs=[video_output]
|
307 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
308 |
|
309 |
+
gr.Markdown("---")
|
310 |
+
gr.Markdown("### How it works:\n1. It finds the middle frame of your video for analysis.\n2. It uses `EasyOCR` to find the English text and its location.\n3. It uses Google's `Gemini` to translate the text to poetic Persian.\n4. It generates a high-quality overlay with your text-wrapping logic.\n5. Finally, it creates a new video with a fade-in and the translated text overlay.")
|
311 |
+
|
312 |
|
313 |
if __name__ == "__main__":
|
314 |
+
demo.launch(debug=True)
|