Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -11,18 +11,18 @@ from bidi.algorithm import get_display
|
|
11 |
import os
|
12 |
import time
|
13 |
import ffmpeg
|
14 |
-
import tempfile
|
15 |
-
import shutil
|
16 |
|
17 |
# --- CONFIGURATION ---
|
18 |
-
|
19 |
-
|
|
|
|
|
20 |
FADE_IN_DURATION_SECONDS = 1.0
|
21 |
|
22 |
# --- GLOBAL INITIALIZATION ---
|
23 |
reader = None
|
24 |
-
|
25 |
def initialize_reader():
|
|
|
26 |
global reader
|
27 |
if reader is None:
|
28 |
print("Loading EasyOCR model...")
|
@@ -30,106 +30,54 @@ def initialize_reader():
|
|
30 |
print("EasyOCR model loaded successfully!")
|
31 |
return reader
|
32 |
|
33 |
-
# --- CORE FUNCTIONS ---
|
34 |
def extract_text_and_bbox(image: Image.Image):
|
35 |
-
"""
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
text_parts.append(text)
|
53 |
-
(tl, tr, br, bl) = bbox
|
54 |
-
min_x = min(min_x, tl[0], bl[0])
|
55 |
-
min_y = min(min_y, tl[1], tr[1])
|
56 |
-
max_x = max(max_x, tr[0], br[0])
|
57 |
-
max_y = max(max_y, bl[1], br[1])
|
58 |
-
|
59 |
-
extracted_text = ' '.join(text_parts)
|
60 |
-
consolidated_bbox = (int(min_x), int(min_y), int(max_x), int(max_y))
|
61 |
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
return f"Error processing image with OCR: {str(e)}", None
|
66 |
|
67 |
def translate_text_gemini(text: str) -> str:
|
68 |
-
"""
|
69 |
-
if not API_KEY:
|
70 |
-
raise gr.Error("GEMINI_API_KEY is not set.")
|
71 |
-
|
72 |
-
if not text or "No text" in text or "Error" in text or "Please upload" in text:
|
73 |
return "No valid text to translate."
|
74 |
-
|
75 |
try:
|
76 |
genai.configure(api_key=API_KEY)
|
77 |
model = genai.GenerativeModel('gemini-1.5-flash')
|
78 |
-
prompt = f"
|
79 |
-
- Colloquial and natural
|
80 |
-
- Poetic and meaningful
|
81 |
-
- Concise (under 20 words)
|
82 |
-
- Preserving the original meaning and tone
|
83 |
-
- Using proper Persian grammar
|
84 |
-
|
85 |
-
Provide only the translated Persian text. Quotes: [{text}]"""
|
86 |
-
|
87 |
response = model.generate_content(prompt)
|
88 |
return response.text.strip()
|
89 |
except Exception as e:
|
90 |
return f"Error during translation with Gemini: {str(e)}"
|
91 |
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
test_words = current_line_words + [word]
|
101 |
-
test_line = ' '.join(test_words)
|
102 |
-
|
103 |
-
# Process this test line for RTL to get actual display width
|
104 |
-
try:
|
105 |
-
reshaped_test = arabic_reshaper.reshape(test_line)
|
106 |
-
display_test = get_display(reshaped_test)
|
107 |
-
test_width = draw.textbbox((0, 0), display_test, font=font)[2]
|
108 |
-
except:
|
109 |
-
# Fallback if RTL processing fails
|
110 |
-
test_width = draw.textbbox((0, 0), test_line, font=font)[2]
|
111 |
-
|
112 |
-
if test_width <= max_width:
|
113 |
-
current_line_words.append(word)
|
114 |
-
else:
|
115 |
-
# Save current line and start new one
|
116 |
-
if current_line_words:
|
117 |
-
lines.append(' '.join(current_line_words))
|
118 |
-
current_line_words = [word]
|
119 |
-
|
120 |
-
# Don't forget the last line
|
121 |
-
if current_line_words:
|
122 |
-
lines.append(' '.join(current_line_words))
|
123 |
-
|
124 |
-
return lines
|
125 |
-
|
126 |
-
def render_translated_overlay(original_image: Image.Image, text_to_overlay: str, bbox: tuple) -> tuple:
|
127 |
-
"""Render Persian text overlay with proper RTL support"""
|
128 |
-
|
129 |
-
# Check for font file
|
130 |
-
if not os.path.exists(PERSIAN_FONT_PATH):
|
131 |
-
raise FileNotFoundError(f"Font file not found at '{PERSIAN_FONT_PATH}'. Please ensure Vazir.ttf is in the repository.")
|
132 |
-
|
133 |
padding = 15
|
134 |
overlay_box = (
|
135 |
max(0, bbox[0] - padding),
|
@@ -137,232 +85,166 @@ def render_translated_overlay(original_image: Image.Image, text_to_overlay: str,
|
|
137 |
min(original_image.width, bbox[2] + padding),
|
138 |
min(original_image.height, bbox[3] + padding)
|
139 |
)
|
140 |
-
|
141 |
overlay_width = overlay_box[2] - overlay_box[0]
|
142 |
overlay_height = overlay_box[3] - overlay_box[1]
|
143 |
-
|
144 |
-
#
|
145 |
try:
|
|
|
146 |
sample_x = max(0, int(overlay_box[0]) - 5)
|
147 |
sample_y = int((overlay_box[1] + overlay_box[3]) / 2)
|
148 |
bg_color = original_image.getpixel((sample_x, sample_y))
|
149 |
except (ValueError, IndexError):
|
150 |
-
bg_color = (25, 25, 25
|
151 |
-
|
152 |
overlay_layer = Image.new("RGBA", (overlay_width, overlay_height), bg_color)
|
153 |
draw = ImageDraw.Draw(overlay_layer)
|
154 |
-
|
|
|
|
|
|
|
|
|
|
|
155 |
target_width = overlay_width * 0.90
|
|
|
156 |
font_size = 100
|
157 |
final_wrapped_lines = []
|
158 |
-
|
159 |
-
# Find optimal font size
|
160 |
while font_size > 10:
|
161 |
font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
total_height
|
184 |
-
|
185 |
-
if total_height <=
|
186 |
-
final_wrapped_lines =
|
187 |
break
|
188 |
else:
|
189 |
font_size -= 2
|
190 |
-
|
191 |
if not final_wrapped_lines:
|
192 |
-
final_wrapped_lines = [text_to_overlay]
|
193 |
-
|
194 |
-
#
|
195 |
final_font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
|
196 |
line_spacing = font_size * 0.3
|
197 |
-
|
198 |
-
#
|
199 |
-
|
200 |
-
line_heights = []
|
201 |
-
|
202 |
-
|
203 |
-
try:
|
204 |
-
# CRITICAL: Process each line individually for RTL
|
205 |
-
reshaped_line = arabic_reshaper.reshape(line)
|
206 |
-
display_line = get_display(reshaped_line)
|
207 |
-
processed_lines.append(display_line)
|
208 |
-
|
209 |
-
line_bbox = draw.textbbox((0, 0), display_line, font=final_font)
|
210 |
-
line_height = line_bbox[3] - line_bbox[1]
|
211 |
-
line_heights.append(line_height)
|
212 |
-
except Exception as e:
|
213 |
-
print(f"RTL processing failed for line '{line}': {e}")
|
214 |
-
# Fallback to original line
|
215 |
-
processed_lines.append(line)
|
216 |
-
line_bbox = draw.textbbox((0, 0), line, font=final_font)
|
217 |
-
line_height = line_bbox[3] - line_bbox[1]
|
218 |
-
line_heights.append(line_height)
|
219 |
-
|
220 |
-
total_text_height = sum(line_heights) + (len(processed_lines) - 1) * line_spacing
|
221 |
y_start = (overlay_height - total_text_height) / 2
|
222 |
-
|
223 |
-
# Draw each line
|
224 |
current_y = y_start
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
x_position = (overlay_width - line_width) / 2
|
230 |
-
|
231 |
-
# Draw shadow for better readability
|
232 |
-
draw.text((x_position + 1, current_y + 1), display_line, font=final_font, fill=(0, 0, 0, 180))
|
233 |
-
# Draw main text
|
234 |
-
draw.text((x_position, current_y), display_line, font=final_font, fill=(255, 255, 255, 255))
|
235 |
|
|
|
|
|
|
|
|
|
|
|
|
|
236 |
current_y += line_heights[i] + line_spacing
|
237 |
-
|
238 |
-
return overlay_layer, overlay_box
|
239 |
|
240 |
-
|
241 |
-
"""Process image: detect, translate, and overlay text"""
|
242 |
-
if image is None:
|
243 |
-
return "Please upload an image.", "Translation will appear here.", None
|
244 |
-
|
245 |
-
# Extract text
|
246 |
-
extracted_text, bbox = extract_text_and_bbox(image)
|
247 |
-
if bbox is None:
|
248 |
-
return extracted_text, "No text to translate.", None
|
249 |
-
|
250 |
-
# Translate text
|
251 |
-
translated_text = translate_text_gemini(extracted_text)
|
252 |
-
if "Error" in translated_text:
|
253 |
-
return extracted_text, translated_text, None
|
254 |
-
|
255 |
-
# Create overlay
|
256 |
-
overlay_layer, overlay_box = render_translated_overlay(image, translated_text, bbox)
|
257 |
-
|
258 |
-
# Apply overlay to image
|
259 |
-
image_copy = image.copy().convert("RGBA")
|
260 |
-
|
261 |
-
# Create background rectangle
|
262 |
-
draw = ImageDraw.Draw(image_copy)
|
263 |
-
draw.rectangle(overlay_box, fill=image.getpixel((overlay_box[0], overlay_box[1])))
|
264 |
-
|
265 |
-
# Paste overlay
|
266 |
-
image_copy.paste(overlay_layer, (overlay_box[0], overlay_box[1]), overlay_layer)
|
267 |
-
|
268 |
-
return extracted_text, translated_text, image_copy.convert("RGB")
|
269 |
|
|
|
270 |
def process_video(video_path, progress=gr.Progress()):
|
271 |
-
|
272 |
-
|
273 |
-
raise gr.Error("Please upload a video file first.")
|
274 |
-
|
275 |
progress(0, desc="Loading Video & Analyzing...")
|
276 |
cap = cv2.VideoCapture(video_path)
|
277 |
-
if not cap.isOpened():
|
278 |
-
|
279 |
-
|
280 |
-
# Get video properties
|
281 |
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
|
282 |
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
|
283 |
fps = cap.get(cv2.CAP_PROP_FPS)
|
284 |
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
|
285 |
-
|
286 |
-
#
|
287 |
cap.set(cv2.CAP_PROP_POS_FRAMES, total_frames // 2)
|
288 |
ret, middle_frame_bgr = cap.read()
|
289 |
-
if not ret:
|
290 |
-
raise gr.Error("Could not read middle frame.")
|
291 |
-
|
292 |
middle_frame_rgb_pil = Image.fromarray(cv2.cvtColor(middle_frame_bgr, cv2.COLOR_BGR2RGB))
|
293 |
-
|
294 |
progress(0.2, desc="Detecting Text (EasyOCR)...")
|
295 |
extracted_text, bbox = extract_text_and_bbox(middle_frame_rgb_pil)
|
296 |
-
if bbox is None:
|
297 |
-
|
298 |
-
|
299 |
progress(0.4, desc="Translating Text (Gemini API)...")
|
300 |
translated_text = translate_text_gemini(extracted_text)
|
301 |
-
if "Error" in translated_text:
|
302 |
-
|
303 |
-
|
304 |
progress(0.5, desc="Rendering Translated Text Overlay...")
|
305 |
overlay_stamp_pil, overlay_position_box = render_translated_overlay(middle_frame_rgb_pil, translated_text, bbox)
|
|
|
306 |
overlay_stamp_cv = cv2.cvtColor(np.array(overlay_stamp_pil), cv2.COLOR_RGBA2BGRA)
|
307 |
-
|
308 |
-
# Create temporary files
|
309 |
timestamp = int(time.time())
|
310 |
-
|
311 |
-
temp_silent_path = os.path.join(temp_dir, f"temp_silent_{timestamp}.mp4")
|
312 |
final_output_path = f"translated_video_{timestamp}.mp4"
|
313 |
-
|
314 |
-
|
315 |
-
|
316 |
-
|
317 |
-
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
-
|
322 |
-
|
323 |
-
|
324 |
-
|
325 |
-
|
326 |
-
|
327 |
-
|
328 |
-
# Extract ROI and resize overlay if needed
|
329 |
-
roi = frame[y_min:y_max, x_min:x_max]
|
330 |
-
stamp_h, stamp_w, _ = overlay_stamp_cv.shape
|
331 |
-
roi_h, roi_w, _ = roi.shape
|
332 |
-
|
333 |
-
if stamp_h != roi_h or stamp_w != roi_w:
|
334 |
-
overlay_resized = cv2.resize(overlay_stamp_cv, (roi_w, roi_h))
|
335 |
-
else:
|
336 |
-
overlay_resized = overlay_stamp_cv
|
337 |
-
|
338 |
-
# Alpha blend
|
339 |
-
alpha = overlay_resized[:, :, 3] / 255.0
|
340 |
-
alpha_mask = cv2.merge([alpha, alpha, alpha])
|
341 |
-
|
342 |
-
blended_roi = (roi.astype(float) * (1.0 - alpha_mask) +
|
343 |
-
overlay_resized[:, :, :3].astype(float) * alpha_mask)
|
344 |
-
|
345 |
-
frame[y_min:y_max, x_min:x_max] = blended_roi.astype(np.uint8)
|
346 |
-
|
347 |
-
out.write(frame)
|
348 |
-
frame_idx += 1
|
349 |
-
progress(0.6 + (0.3 * frame_idx / total_frames),
|
350 |
-
desc=f"Processing frame {frame_idx}/{total_frames}")
|
351 |
-
|
352 |
-
cap.release()
|
353 |
-
out.release()
|
354 |
|
355 |
-
|
|
|
|
|
356 |
|
357 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
358 |
input_video = ffmpeg.input(temp_silent_path)
|
359 |
-
input_audio = ffmpeg.input(video_path)
|
360 |
-
|
361 |
(
|
362 |
ffmpeg
|
363 |
.output(
|
364 |
input_video.video.filter('fade', type='in', start_time=0, duration=FADE_IN_DURATION_SECONDS),
|
365 |
-
input_audio
|
366 |
final_output_path,
|
367 |
vcodec='libx264',
|
368 |
acodec='copy',
|
@@ -370,79 +252,34 @@ def process_video(video_path, progress=gr.Progress()):
|
|
370 |
)
|
371 |
.run(overwrite_output=True, quiet=True)
|
372 |
)
|
373 |
-
|
374 |
-
progress(1, desc="Done!")
|
375 |
-
return final_output_path
|
376 |
-
|
377 |
except ffmpeg.Error as e:
|
378 |
-
print('ffmpeg stdout:', e.stdout.decode('utf8'
|
379 |
-
print('ffmpeg stderr:', e.stderr.decode('utf8'
|
380 |
-
raise gr.Error(f"ffmpeg error: {e.stderr.decode('utf8'
|
381 |
-
|
382 |
finally:
|
383 |
-
|
384 |
-
|
385 |
-
|
|
|
|
|
386 |
|
387 |
# --- GRADIO INTERFACE ---
|
388 |
-
with gr.Blocks(theme=gr.themes.Soft(), title="Persian
|
389 |
-
gr.Markdown("#
|
390 |
-
gr.Markdown("Upload
|
391 |
-
|
392 |
-
|
393 |
-
|
394 |
-
|
395 |
-
|
396 |
-
|
397 |
-
|
398 |
-
|
399 |
-
|
400 |
-
|
401 |
-
|
402 |
-
with gr.Column(scale=1):
|
403 |
-
image_output = gr.Image(label="Translated Image Output", type="pil")
|
404 |
-
|
405 |
-
image_input.change(
|
406 |
-
fn=process_image,
|
407 |
-
inputs=[image_input],
|
408 |
-
outputs=[img_text_output, img_translated_output, image_output]
|
409 |
-
)
|
410 |
-
|
411 |
-
with gr.TabItem("🎬 Video Translation"):
|
412 |
-
gr.Markdown("Upload a video with English text. The app will preserve audio and add Persian translation overlay.")
|
413 |
-
|
414 |
-
with gr.Row():
|
415 |
-
video_input = gr.Video(label="Upload Video")
|
416 |
-
video_output = gr.Video(label="Translated Video Output")
|
417 |
-
|
418 |
-
translate_button = gr.Button("Translate Video", variant="primary")
|
419 |
-
translate_button.click(
|
420 |
-
fn=process_video,
|
421 |
-
inputs=[video_input],
|
422 |
-
outputs=[video_output]
|
423 |
-
)
|
424 |
|
425 |
gr.Markdown("---")
|
426 |
-
gr.Markdown(""
|
427 |
-
### How it works:
|
428 |
-
**Image Mode:**
|
429 |
-
1. Detects English text using OCR
|
430 |
-
2. Translates to Persian using Gemini AI
|
431 |
-
3. Overlays properly formatted RTL Persian text
|
432 |
-
|
433 |
-
**Video Mode:**
|
434 |
-
1. Analyzes middle frame to detect text location
|
435 |
-
2. Translates English text to Persian
|
436 |
-
3. Applies Persian overlay to all frames with background
|
437 |
-
4. Merges with original audio and adds fade-in effect
|
438 |
-
|
439 |
-
**Features:**
|
440 |
-
- Proper RTL (Right-to-Left) Persian text rendering
|
441 |
-
- Automatic font sizing to fit available space
|
442 |
-
- Text wrapping for longer translations
|
443 |
-
- Background color sampling for natural overlay
|
444 |
-
- Audio preservation in video mode
|
445 |
-
""")
|
446 |
|
447 |
if __name__ == "__main__":
|
448 |
demo.launch(debug=True)
|
|
|
11 |
import os
|
12 |
import time
|
13 |
import ffmpeg
|
|
|
|
|
14 |
|
15 |
# --- CONFIGURATION ---
# The Gemini API key must never be committed to source control: anything
# pushed to a public repository or Space is readable by everyone. It is read
# from the GEMINI_API_KEY environment variable instead (on Hugging Face, add
# it as a "Secret"; for local testing, export it in your shell before
# launching the app). Any key that was previously hard-coded here should be
# treated as leaked and revoked.
API_KEY = os.environ.get("GEMINI_API_KEY", "")
PERSIAN_FONT_PATH = "Vazir.ttf"  # Make sure this font file is in your repository
FADE_IN_DURATION_SECONDS = 1.0  # Length of the fade-in applied to the output video
|
21 |
|
22 |
# --- GLOBAL INITIALIZATION ---
|
23 |
reader = None
|
|
|
24 |
def initialize_reader():
|
25 |
+
"""Initializes the EasyOCR reader if it hasn't been already."""
|
26 |
global reader
|
27 |
if reader is None:
|
28 |
print("Loading EasyOCR model...")
|
|
|
30 |
print("EasyOCR model loaded successfully!")
|
31 |
return reader
|
32 |
|
33 |
+
# --- CORE PROCESSING FUNCTIONS ---
|
34 |
def extract_text_and_bbox(image: Image.Image):
    """Run OCR on a PIL image and merge every detection into one text/box pair.

    Args:
        image: The frame or picture to scan.

    Returns:
        A ``(text, bbox)`` tuple. ``text`` is the detected fragments joined
        by single spaces, in the order the OCR reader returned them, and
        ``bbox`` is ``(min_x, min_y, max_x, max_y)`` as integers enclosing
        all detections. When nothing is detected, ``text`` is a
        human-readable message and ``bbox`` is ``None``.
    """
    ocr_reader = initialize_reader()
    results = ocr_reader.readtext(np.array(image))
    if not results:
        return "No text detected in the image.", None

    text_parts = []
    xs = []
    ys = []
    for bbox, text, _prob in results:
        text_parts.append(text)
        # Consider all four corners on both axes. Looking only at the two
        # "left" corners for min_x (and likewise for the other sides) is
        # correct for axis-aligned boxes but can cut off part of a rotated
        # detection, whose extreme point may be any corner.
        for x, y in bbox:
            xs.append(x)
            ys.append(y)

    extracted_text = ' '.join(text_parts)
    consolidated_bbox = (int(min(xs)), int(min(ys)), int(max(xs)), int(max(ys)))
    return extracted_text, consolidated_bbox
|
|
|
56 |
|
57 |
def translate_text_gemini(text: str) -> str:
    """Translate English text to colloquial Persian using the Gemini API.

    Args:
        text: The English text (typically OCR output) to translate.

    Returns:
        The model's reply with surrounding whitespace stripped. If there is
        nothing to translate, returns "No valid text to translate."; if the
        API call fails, returns a message starting with
        "Error during translation with Gemini:" instead of raising.

    Raises:
        gr.Error: If no API key is configured.
    """
    if not API_KEY or "YOUR_GEMINI_API_KEY_HERE" in API_KEY:
        raise gr.Error("GEMINI_API_KEY is not set. Please add it as a Secret in your Hugging Face Space.")

    # Reject only empty/blank input or the OCR step's "nothing found"
    # sentinel. A plain substring test for "No text" would also discard
    # genuine quotes that merely contain that phrase.
    if not text or not text.strip() or text.startswith("No text detected"):
        return "No valid text to translate."

    try:
        genai.configure(api_key=API_KEY)
        model = genai.GenerativeModel('gemini-1.5-flash')
        prompt = (
            "Translate the following English quotes into Persian. "
            "The translation should be colloquial, poetic, concise, and meaningful. "
            "Preserve the original message and tone. Avoid literal translations. "
            f"Provide only the translated Persian text. Quotes: [{text}]"
        )
        response = model.generate_content(prompt)
        return response.text.strip()
    except Exception as e:
        # Deliberately broad: any API/network/response failure is reported
        # back as text so the caller can surface it to the user.
        return f"Error during translation with Gemini: {str(e)}"
|
72 |
|
73 |
+
# ### --- THE NEW AND CORRECTED TEXT OVERLAY FUNCTION --- ###
|
74 |
+
# This function is now based on the superior logic from your textoverimage.txt script.
|
75 |
+
def render_translated_overlay(original_image: Image.Image, text_to_overlay: str, bbox: tuple) -> Image.Image:
|
76 |
+
"""
|
77 |
+
Creates an overlay layer with correctly rendered, wrapped Persian text.
|
78 |
+
This function erases the background area defined by the bbox and draws new text.
|
79 |
+
"""
|
80 |
+
# 1. Define the area to work with, adding padding
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
padding = 15
|
82 |
overlay_box = (
|
83 |
max(0, bbox[0] - padding),
|
|
|
85 |
min(original_image.width, bbox[2] + padding),
|
86 |
min(original_image.height, bbox[3] + padding)
|
87 |
)
|
|
|
88 |
overlay_width = overlay_box[2] - overlay_box[0]
|
89 |
overlay_height = overlay_box[3] - overlay_box[1]
|
90 |
+
|
91 |
+
# 2. Create the background layer by sampling a color from the original image
|
92 |
try:
|
93 |
+
# Sample color from just outside the original text box to get a clean background
|
94 |
sample_x = max(0, int(overlay_box[0]) - 5)
|
95 |
sample_y = int((overlay_box[1] + overlay_box[3]) / 2)
|
96 |
bg_color = original_image.getpixel((sample_x, sample_y))
|
97 |
except (ValueError, IndexError):
|
98 |
+
bg_color = (25, 25, 25) # Fallback color
|
99 |
+
|
100 |
overlay_layer = Image.new("RGBA", (overlay_width, overlay_height), bg_color)
|
101 |
draw = ImageDraw.Draw(overlay_layer)
|
102 |
+
|
103 |
+
# 3. Check for Font File
|
104 |
+
if not os.path.exists(PERSIAN_FONT_PATH):
|
105 |
+
raise FileNotFoundError(f"Font file not found at '{PERSIAN_FONT_PATH}'. Please upload it to your Space.")
|
106 |
+
|
107 |
+
# 4. Dynamically find the best font size and wrap the text
|
108 |
target_width = overlay_width * 0.90
|
109 |
+
target_height = overlay_height * 0.90
|
110 |
font_size = 100
|
111 |
final_wrapped_lines = []
|
112 |
+
|
|
|
113 |
while font_size > 10:
|
114 |
font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
|
115 |
+
words = text_to_overlay.split()
|
116 |
+
if not words: break
|
117 |
+
|
118 |
+
raw_lines = []
|
119 |
+
current_line = ""
|
120 |
+
for word in words:
|
121 |
+
test_line = (current_line + " " + word).strip()
|
122 |
+
# To measure width correctly, we MUST reshape it first. This is the key.
|
123 |
+
reshaped_test_line = get_display(arabic_reshaper.reshape(test_line))
|
124 |
+
line_width = draw.textbbox((0, 0), reshaped_test_line, font=font)[2]
|
125 |
+
|
126 |
+
if line_width <= target_width:
|
127 |
+
current_line = test_line
|
128 |
+
else:
|
129 |
+
raw_lines.append(current_line)
|
130 |
+
current_line = word
|
131 |
+
raw_lines.append(current_line)
|
132 |
+
|
133 |
+
line_spacing = font_size * 0.3
|
134 |
+
reshaped_for_height_calc = [get_display(arabic_reshaper.reshape(l)) for l in raw_lines]
|
135 |
+
line_heights = [draw.textbbox((0,0), l, font=font)[3] - draw.textbbox((0,0), l, font=font)[1] for l in reshaped_for_height_calc]
|
136 |
+
total_height = sum(line_heights) + (len(raw_lines) - 1) * line_spacing
|
137 |
+
|
138 |
+
if total_height <= target_height:
|
139 |
+
final_wrapped_lines = raw_lines
|
140 |
break
|
141 |
else:
|
142 |
font_size -= 2
|
143 |
+
|
144 |
if not final_wrapped_lines:
|
145 |
+
final_wrapped_lines = [text_to_overlay] # Fallback
|
146 |
+
|
147 |
+
# 5. Draw the final, wrapped, and correctly shaped text
|
148 |
final_font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
|
149 |
line_spacing = font_size * 0.3
|
150 |
+
|
151 |
+
# Reshape final lines for drawing and calculate total height
|
152 |
+
final_display_lines = [get_display(arabic_reshaper.reshape(l)) for l in final_wrapped_lines]
|
153 |
+
line_heights = [draw.textbbox((0,0), l, font=final_font)[3] - draw.textbbox((0,0), l, font=final_font)[1] for l in final_display_lines]
|
154 |
+
total_text_height = sum(line_heights) + (len(final_display_lines) - 1) * line_spacing
|
155 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
156 |
y_start = (overlay_height - total_text_height) / 2
|
|
|
|
|
157 |
current_y = y_start
|
158 |
+
|
159 |
+
for i, display_line in enumerate(final_display_lines):
|
160 |
+
x_center = overlay_width / 2
|
161 |
+
line_y_center = current_y + line_heights[i] / 2
|
|
|
|
|
|
|
|
|
|
|
|
|
162 |
|
163 |
+
# Use anchor="mm" to perfectly center the text block horizontally and vertically
|
164 |
+
# Draw a subtle shadow for better readability
|
165 |
+
draw.text((x_center + 1, line_y_center + 1), display_line, font=final_font, fill=(0, 0, 0, 180), anchor="mm")
|
166 |
+
# Draw the main text
|
167 |
+
draw.text((x_center, line_y_center), display_line, font=final_font, fill=(255, 255, 255, 255), anchor="mm")
|
168 |
+
|
169 |
current_y += line_heights[i] + line_spacing
|
|
|
|
|
170 |
|
171 |
+
return overlay_layer, overlay_box
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
172 |
|
173 |
+
# --- MAIN VIDEO PROCESSING PIPELINE ---
|
174 |
def process_video(video_path, progress=gr.Progress()):
|
175 |
+
if video_path is None: raise gr.Error("Please upload a video file first.")
|
176 |
+
|
|
|
|
|
177 |
progress(0, desc="Loading Video & Analyzing...")
|
178 |
cap = cv2.VideoCapture(video_path)
|
179 |
+
if not cap.isOpened(): raise gr.Error("Could not open video file.")
|
180 |
+
|
|
|
|
|
181 |
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
|
182 |
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
|
183 |
fps = cap.get(cv2.CAP_PROP_FPS)
|
184 |
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
|
185 |
+
|
186 |
+
# Analyze the middle frame for text
|
187 |
cap.set(cv2.CAP_PROP_POS_FRAMES, total_frames // 2)
|
188 |
ret, middle_frame_bgr = cap.read()
|
189 |
+
if not ret: raise gr.Error("Could not read middle frame.")
|
|
|
|
|
190 |
middle_frame_rgb_pil = Image.fromarray(cv2.cvtColor(middle_frame_bgr, cv2.COLOR_BGR2RGB))
|
191 |
+
|
192 |
progress(0.2, desc="Detecting Text (EasyOCR)...")
|
193 |
extracted_text, bbox = extract_text_and_bbox(middle_frame_rgb_pil)
|
194 |
+
if bbox is None: raise gr.Error(extracted_text)
|
195 |
+
|
|
|
196 |
progress(0.4, desc="Translating Text (Gemini API)...")
|
197 |
translated_text = translate_text_gemini(extracted_text)
|
198 |
+
if "Error" in translated_text: raise gr.Error(translated_text)
|
199 |
+
|
|
|
200 |
progress(0.5, desc="Rendering Translated Text Overlay...")
|
201 |
overlay_stamp_pil, overlay_position_box = render_translated_overlay(middle_frame_rgb_pil, translated_text, bbox)
|
202 |
+
# Convert the overlay to a format OpenCV can use (BGRA)
|
203 |
overlay_stamp_cv = cv2.cvtColor(np.array(overlay_stamp_pil), cv2.COLOR_RGBA2BGRA)
|
204 |
+
|
|
|
205 |
timestamp = int(time.time())
|
206 |
+
temp_silent_path = f"temp_silent_{timestamp}.mp4"
|
|
|
207 |
final_output_path = f"translated_video_{timestamp}.mp4"
|
208 |
+
|
209 |
+
progress(0.6, desc="Composing Silent Video with Overlay...")
|
210 |
+
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
|
211 |
+
out = cv2.VideoWriter(temp_silent_path, fourcc, fps, (frame_width, frame_height))
|
212 |
+
|
213 |
+
cap.set(cv2.CAP_PROP_POS_FRAMES, 0) # Rewind video to the beginning
|
214 |
+
frame_idx = 0
|
215 |
+
x_min, y_min, x_max, y_max = overlay_position_box
|
216 |
+
|
217 |
+
while True:
|
218 |
+
ret, frame = cap.read()
|
219 |
+
if not ret: break
|
220 |
+
|
221 |
+
# Define the region of interest (ROI) where the overlay will go
|
222 |
+
roi = frame[y_min:y_max, x_min:x_max]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
223 |
|
224 |
+
# Simple alpha blending
|
225 |
+
alpha = overlay_stamp_cv[:, :, 3] / 255.0
|
226 |
+
alpha_mask = cv2.merge([alpha, alpha, alpha])
|
227 |
|
228 |
+
blended_roi = (roi.astype(float) * (1.0 - alpha_mask) + overlay_stamp_cv[:, :, :3].astype(float) * alpha_mask)
|
229 |
+
frame[y_min:y_max, x_min:x_max] = blended_roi.astype(np.uint8)
|
230 |
+
|
231 |
+
out.write(frame)
|
232 |
+
frame_idx += 1
|
233 |
+
progress(0.6 + (0.3 * frame_idx / total_frames), desc=f"Processing frame {frame_idx}/{total_frames}")
|
234 |
+
|
235 |
+
cap.release()
|
236 |
+
out.release()
|
237 |
+
|
238 |
+
progress(0.95, desc="Merging Audio and Applying Fade (ffmpeg)...")
|
239 |
+
try:
|
240 |
input_video = ffmpeg.input(temp_silent_path)
|
241 |
+
input_audio = ffmpeg.input(video_path).audio # Select audio stream only
|
242 |
+
|
243 |
(
|
244 |
ffmpeg
|
245 |
.output(
|
246 |
input_video.video.filter('fade', type='in', start_time=0, duration=FADE_IN_DURATION_SECONDS),
|
247 |
+
input_audio,
|
248 |
final_output_path,
|
249 |
vcodec='libx264',
|
250 |
acodec='copy',
|
|
|
252 |
)
|
253 |
.run(overwrite_output=True, quiet=True)
|
254 |
)
|
|
|
|
|
|
|
|
|
255 |
except ffmpeg.Error as e:
|
256 |
+
print('ffmpeg stdout:', e.stdout.decode('utf8', errors='ignore'))
|
257 |
+
print('ffmpeg stderr:', e.stderr.decode('utf8', errors='ignore'))
|
258 |
+
raise gr.Error(f"ffmpeg error: {e.stderr.decode('utf8', errors='ignore')}")
|
|
|
259 |
finally:
|
260 |
+
if os.path.exists(temp_silent_path):
|
261 |
+
os.remove(temp_silent_path)
|
262 |
+
|
263 |
+
progress(1, desc="Done!")
|
264 |
+
return final_output_path
|
265 |
|
266 |
# --- GRADIO INTERFACE ---
|
267 |
+
with gr.Blocks(theme=gr.themes.Soft(), title="Persian Video Quote Translator") as demo:
|
268 |
+
gr.Markdown("# 🎬 Persian Video Quote Translator")
|
269 |
+
gr.Markdown("Upload a short video containing English text. The app will detect the text, replace it with a poetic Persian translation, and preserve the original audio and video duration.")
|
270 |
+
with gr.Row():
|
271 |
+
video_input = gr.Video(label="Upload Video")
|
272 |
+
video_output = gr.Video(label="Translated Video Output")
|
273 |
+
translate_button = gr.Button("Translate Video", variant="primary")
|
274 |
+
|
275 |
+
translate_button.click(
|
276 |
+
fn=process_video,
|
277 |
+
inputs=[video_input],
|
278 |
+
outputs=[video_output]
|
279 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
280 |
|
281 |
gr.Markdown("---")
|
282 |
+
gr.Markdown("### How it works:\n1. It analyzes the middle frame to find the text and its location.\n2. It uses the Gemini API to get a high-quality, poetic Persian translation.\n3. It renders the Persian text correctly onto a background that matches the original video.\n4. It composites this new text overlay onto every frame of the video.\n5. Finally, it uses `ffmpeg` to merge the new video with the **original audio** and add a 1-second fade-in effect.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
283 |
|
284 |
if __name__ == "__main__":
|
285 |
demo.launch(debug=True)
|