Update app.py
app.py
CHANGED
@@ -10,205 +10,131 @@ import arabic_reshaper
 from bidi.algorithm import get_display
 import os
 import time

 # --- CONFIGURATION ---
-
-# For local testing, you can uncomment the line below.
-# os.environ['GEMINI_API_KEY'] = "YOUR_API_KEY_HERE"
-API_KEY = "AIzaSyCu-tb3BRDIJjUt6G5ccWmrR51tOY0VZd4"
-
-# Ensure these font files are in your Hugging Face repository
 PERSIAN_FONT_PATH = "Vazir.ttf"
-
-
-# Video effect settings
-FADE_IN_DURATION_SECONDS = 1.0
-INITIAL_BLACK_SCREEN_SECONDS = 1.0

 # --- GLOBAL INITIALIZATION ---
 reader = None
-
 def initialize_reader():
-    """Initializes the EasyOCR reader if it hasn't been already."""
     global reader
     if reader is None:
         print("Loading EasyOCR model...")
-        # For a CPU-only environment like HF Spaces free tier, gpu=False is essential.
         reader = easyocr.Reader(['en'], gpu=False, verbose=False)
         print("EasyOCR model loaded successfully!")
     return reader

-# --- YOUR CORE FUNCTIONS (

 def extract_text_and_bbox(image: Image.Image):
-    """
-    Extracts text from a PIL Image and calculates a single consolidated
-    bounding box for all text found.
-    (This function is kept exactly as you wrote it)
-    """
     ocr_reader = initialize_reader()
     img_array = np.array(image)
     results = ocr_reader.readtext(img_array)
-
-    if not results:
-        return "No text detected in the image.", None
-
-    min_x, min_y = float('inf'), float('inf')
-    max_x, max_y = float('-inf'), float('-inf')
-
     text_parts = []
     for (bbox, text, prob) in results:
         text_parts.append(text)
         (tl, tr, br, bl) = bbox
-        min_x = min(min_x, tl[0], bl[0])
-        min_y = min(min_y, tl[1], tr[1])
-        max_x = max(max_x, tr[0], br[0])
-        max_y = max(max_y, bl[1], br[1])
-
     extracted_text = ' '.join(text_parts)
     consolidated_bbox = (int(min_x), int(min_y), int(max_x), int(max_y))
-
     return extracted_text, consolidated_bbox

 def translate_text_gemini(text: str) -> str:
-    """
-    Translates text into Persian using the Gemini API.
-    (This function is kept exactly as you wrote it, but with safer API key handling)
-    """
-    if not API_KEY:
-        raise gr.Error("GEMINI_API_KEY is not set. Please configure it in your Hugging Face Space Secrets.")
-    if not text or "No text" in text or "Error" in text:
-        return "No valid text to translate."
-
     try:
         genai.configure(api_key=API_KEY)
         model = genai.GenerativeModel('gemini-1.5-flash')
-        # Your prompt here
-        prompt = f"Translate the following English quotes into Persian. The translation should be colloquial, poetic, concise, and meaningful. Preserve the original message and tone. Avoid literal translations. Provide only the translated Persian text. Quotes: [{text}]"
-
         response = model.generate_content(prompt)
         return response.text.strip()
-    except Exception as e:
-        return f"Error during translation with Gemini: {str(e)}"

-# --- NEW FUNCTION: Renders a reusable overlay "stamp" ---
 def render_translated_overlay(original_image: Image.Image, text_to_overlay: str, bbox: tuple) -> Image.Image:
-    """
-    Creates a single, pre-rendered RGBA image of the translated text on a
-    background sampled from the original image. This "stamp" can be efficiently
-    overlaid on every video frame.
-
-    This function adapts the logic from your original 'overlay_text_on_image'.
-    """
-    # 1. Define the box where the new text will live (with padding)
     padding = 15
-    overlay_box = (
-        max(0, bbox[0] - padding),
-        max(0, bbox[1] - padding),
-        min(original_image.width, bbox[2] + padding),
-        min(original_image.height, bbox[3] + padding)
-    )
     overlay_width = overlay_box[2] - overlay_box[0]
     overlay_height = overlay_box[3] - overlay_box[1]
-
-    # 2. Sample the background color from the original image
     try:
-        sample_x = max(0, int(overlay_box[0]) - 5)
-        sample_y = int((overlay_box[1] + overlay_box[3]) / 2)
         bg_color = original_image.getpixel((sample_x, sample_y))
-    except (ValueError, IndexError):
-        bg_color = (25, 25, 25, 255)  # Fallback color

-    # 3. Create the base layer for our overlay "stamp"
-    # This is an RGBA image with the sampled background color
     overlay_layer = Image.new("RGBA", (overlay_width, overlay_height), bg_color)
     draw = ImageDraw.Draw(overlay_layer)
-
-    # 4. Dynamically find the best font size and wrap the text
     target_width = overlay_width * 0.90
     font_size = 100
     final_wrapped_lines = []
-
     while font_size > 10:
         font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
-        words = text_to_overlay.split()
         if not words: break
-
         raw_lines = []; current_line = ""
         for word in words:
             test_line = (current_line + " " + word).strip()
-            reshaped_test_line = arabic_reshaper.reshape(test_line)
-            bidi_test_line = get_display(reshaped_test_line)
-            line_width = draw.textbbox((0, 0), bidi_test_line, font=font)[2]
             if line_width <= target_width: current_line = test_line
             else: raw_lines.append(current_line); current_line = word
         raw_lines.append(current_line)

-        # Check total height
-        total_height = 0
-        for line in raw_lines:
-            reshaped_line = arabic_reshaper.reshape(line)
-            bidi_line = get_display(reshaped_line)
-            total_height += draw.textbbox((0,0), bidi_line, font=font)[3]
-        if total_height <= overlay_height * 0.9:
-            final_wrapped_lines = raw_lines
-            break
-        else:
-            font_size -= 2
-
-    if not final_wrapped_lines:
-        print("Warning: Text could not fit. It may be truncated.")
-        final_wrapped_lines = raw_lines  # Use last attempt if no fit found
-
-    # 5. Draw the final, wrapped text onto our stamp
     final_font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
     line_spacing = font_size * 0.3
-
-    reshaped_lines = [get_display(arabic_reshaper.reshape(l)) for l in final_wrapped_lines]
-    line_heights = [draw.textbbox((0,0), l, font=final_font)[3] - draw.textbbox((0,0), l, font=final_font)[1] for l in reshaped_lines]
-    total_text_height = sum(line_heights) + (len(reshaped_lines) - 1) * line_spacing
-
     y_start = (overlay_height - total_text_height) / 2

     current_y = y_start
-    for i, bidi_line in enumerate(reshaped_lines):

         # Draw shadow then text for readability
-        draw.text((
-        draw.text((

         current_y += line_heights[i] + line_spacing

     return overlay_layer, overlay_box

-# --- MAIN VIDEO PROCESSING PIPELINE ---

 def process_video(video_path, progress=gr.Progress()):
-    """
-    Main function to orchestrate the entire video translation process.
-    """
-    if video_path is None:
-        raise gr.Error("Please upload a video file first.")

-    progress(0, desc="Loading Video...")
     cap = cv2.VideoCapture(video_path)
     if not cap.isOpened(): raise gr.Error("Could not open video file.")

-    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-    fps = cap.get(cv2.CAP_PROP_FPS)
-    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

-    # 1. ANALYSIS (OCR & TRANSLATION) - Done only once
-    progress(0.1, desc="Extracting Middle Frame for Analysis...")
     cap.set(cv2.CAP_PROP_POS_FRAMES, total_frames // 2)
     ret, middle_frame_bgr = cap.read()
     if not ret: raise gr.Error("Could not read middle frame.")
     middle_frame_rgb_pil = Image.fromarray(cv2.cvtColor(middle_frame_bgr, cv2.COLOR_BGR2RGB))

     progress(0.2, desc="Detecting Text (EasyOCR)...")
@@ -219,96 +145,85 @@ def process_video(video_path, progress=gr.Progress()):
     translated_text = translate_text_gemini(extracted_text)
     if "Error" in translated_text: raise gr.Error(translated_text)

-    progress(0.
     overlay_stamp_pil, overlay_position_box = render_translated_overlay(middle_frame_rgb_pil, translated_text, bbox)
-
-    # Convert the PIL RGBA stamp to an OpenCV BGRA image for compositing
     overlay_stamp_cv = cv2.cvtColor(np.array(overlay_stamp_pil), cv2.COLOR_RGBA2BGRA)

-    # 2. VIDEO WRITER SETUP
-    output_path = f"translated_video_{int(time.time())}.mp4"
-    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
-    out = cv2.VideoWriter(output_path, fourcc, fps, (frame_width, frame_height))

-    # Start with an initial black screen
-    black_frame = np.zeros((frame_height, frame_width, 3), dtype=np.uint8)
-    for _ in range(int(INITIAL_BLACK_SCREEN_SECONDS * fps)):
-        out.write(black_frame)

-    # Add fade-in effect
-    num_fade_frames = int(FADE_IN_DURATION_SECONDS * fps)
-    cap.set(cv2.CAP_PROP_POS_FRAMES, 0)  # Rewind video
-    ret, first_frame = cap.read()
-    if ret:
-        for i in range(num_fade_frames):
-            alpha = (i + 1) / num_fade_frames
-            blended_frame = cv2.addWeighted(black_frame, 1 - alpha, first_frame, alpha, 0)
-            out.write(blended_frame)
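For reference, the removed fade-in built each early output frame as a plain weighted sum of a black frame and the first video frame via `cv2.addWeighted`. A minimal, self-contained sketch of that blend, using hypothetical 64x48 frames:

```python
import cv2
import numpy as np

# Fade-in as a weighted sum: result = black * (1 - alpha) + frame * alpha.
black = np.zeros((48, 64, 3), dtype=np.uint8)      # hypothetical 64x48 black frame
frame = np.full((48, 64, 3), 200, dtype=np.uint8)  # stand-in for the first video frame
alpha = 0.5                                        # halfway through the fade
blended = cv2.addWeighted(black, 1 - alpha, frame, alpha, 0)
print(blended[0, 0])                               # -> [100 100 100]
```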

-
-    cap.set(cv2.CAP_PROP_POS_FRAMES, 0)  # Rewind again
     frame_idx = 0
-
-    # Get position for stamping
     x_min, y_min, x_max, y_max = overlay_position_box

     while True:
         ret, frame = cap.read()
         if not ret: break

-        # Skip frames used in fade-in
-        if frame_idx < num_fade_frames:
-            frame_idx += 1
-            continue
-
-        # --- Efficient Alpha Blending (Stamping) ---
         roi = frame[y_min:y_max, x_min:x_max]
-
-        # Ensure ROI and stamp have same dimensions before blending
         stamp_h, stamp_w, _ = overlay_stamp_cv.shape
         roi_h, roi_w, _ = roi.shape
-        if stamp_h != roi_h or stamp_w != roi_w:
-            overlay_resized = cv2.resize(overlay_stamp_cv, (roi_w, roi_h))
-        else:
-            overlay_resized = overlay_stamp_cv
-
         alpha = overlay_resized[:, :, 3] / 255.0
         alpha_mask = cv2.merge([alpha, alpha, alpha])
-
         blended_roi = (roi.astype(float) * (1.0 - alpha_mask) + overlay_resized[:, :, :3].astype(float) * alpha_mask)
         frame[y_min:y_max, x_min:x_max] = blended_roi.astype(np.uint8)

         out.write(frame)
         frame_idx += 1
-        progress(0.

-    cap.release()
-    out.release()
-    progress(1.0, desc="Done!")
-    return output_path

-# --- GRADIO INTERFACE ---

 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("# 🎬 Persian Video Quote Translator")
-    gr.Markdown("Upload a short video with English text. The app will
     with gr.Row():
         video_input = gr.Video(label="Upload Video")
         video_output = gr.Video(label="Translated Video Output")
-
     translate_button = gr.Button("Translate Video", variant="primary")
-
-    translate_button.click(
-        fn=process_video,
-        inputs=[video_input],
-        outputs=[video_output]
-    )
-
     gr.Markdown("---")
-    gr.Markdown("### How it works:\n1. It

 if __name__ == "__main__":
     demo.launch(debug=True)
 from bidi.algorithm import get_display
 import os
 import time
+import ffmpeg  # ### --- CHANGE --- ###: Import the ffmpeg-python library

 # --- CONFIGURATION ---
+API_KEY = "AIzaSyCu-tb3BRDIJjUt6G5ccWmrR51tOY0VZd4"
 PERSIAN_FONT_PATH = "Vazir.ttf"
+FADE_IN_DURATION_SECONDS = 1.0  # The fade-in will be exactly 1 second long
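One note on the configuration: a key hardcoded like `API_KEY` above is visible to anyone who can read the repository. A safer sketch, reading the key from the environment as the old version's error message (Hugging Face Space Secrets) already suggested:

```python
import os

# Read the Gemini key from the environment (e.g. a Hugging Face Space Secret)
# instead of committing it to source control.
API_KEY = os.environ.get("GEMINI_API_KEY", "")
```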

 # --- GLOBAL INITIALIZATION ---
 reader = None
 def initialize_reader():
     global reader
     if reader is None:
         print("Loading EasyOCR model...")
         reader = easyocr.Reader(['en'], gpu=False, verbose=False)
         print("EasyOCR model loaded successfully!")
     return reader

+# --- YOUR CORE FUNCTIONS (Unchanged) ---

 def extract_text_and_bbox(image: Image.Image):
     ocr_reader = initialize_reader()
     img_array = np.array(image)
     results = ocr_reader.readtext(img_array)
+    if not results: return "No text detected in the image.", None
+    min_x, min_y, max_x, max_y = float('inf'), float('inf'), float('-inf'), float('-inf')
     text_parts = []
     for (bbox, text, prob) in results:
         text_parts.append(text)
         (tl, tr, br, bl) = bbox
+        min_x = min(min_x, tl[0], bl[0]); min_y = min(min_y, tl[1], tr[1])
+        max_x = max(max_x, tr[0], br[0]); max_y = max(max_y, bl[1], br[1])
     extracted_text = ' '.join(text_parts)
     consolidated_bbox = (int(min_x), int(min_y), int(max_x), int(max_y))
     return extracted_text, consolidated_bbox
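For reference, `readtext` returns one `(bbox, text, prob)` tuple per detected region, with `bbox` holding four corner points in `[top-left, top-right, bottom-right, bottom-left]` order; the function above folds all corners into one enclosing rectangle. A self-contained sketch of that consolidation on hypothetical OCR output:

```python
# Hypothetical EasyOCR-style results: two detected words on one line.
results = [
    ([[10, 12], [90, 12], [90, 40], [10, 40]], "HELLO", 0.98),
    ([[95, 14], [160, 14], [160, 42], [95, 42]], "WORLD", 0.95),
]

min_x = min(min(tl[0], bl[0]) for (tl, tr, br, bl), _, _ in results)
min_y = min(min(tl[1], tr[1]) for (tl, tr, br, bl), _, _ in results)
max_x = max(max(tr[0], br[0]) for (tl, tr, br, bl), _, _ in results)
max_y = max(max(bl[1], br[1]) for (tl, tr, br, bl), _, _ in results)
print((min_x, min_y, max_x, max_y))  # -> (10, 12, 160, 42)
```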
 def translate_text_gemini(text: str) -> str:
+    if not API_KEY: raise gr.Error("GEMINI_API_KEY is not set.")
+    if not text or "No text" in text: return "No valid text to translate."
     try:
         genai.configure(api_key=API_KEY)
         model = genai.GenerativeModel('gemini-1.5-flash')
+        # Your prompt here
+        prompt = f"Translate the following English quotes into Persian. The translation should be colloquial, poetic, concise, and meaningful. Preserve the original message and tone. Avoid literal translations. Provide only the translated Persian text. Quotes: [{text}]"
         response = model.generate_content(prompt)
         return response.text.strip()
+    except Exception as e: return f"Error during translation with Gemini: {str(e)}"
+
+# --- TEXT OVERLAY FUNCTION (RTL Logic Corrected) ---

 def render_translated_overlay(original_image: Image.Image, text_to_overlay: str, bbox: tuple) -> Image.Image:
     padding = 15
+    overlay_box = (max(0, bbox[0] - padding), max(0, bbox[1] - padding),
+                   min(original_image.width, bbox[2] + padding), min(original_image.height, bbox[3] + padding))
     overlay_width = overlay_box[2] - overlay_box[0]
     overlay_height = overlay_box[3] - overlay_box[1]
+
     try:
+        sample_x = max(0, int(overlay_box[0]) - 5); sample_y = int((overlay_box[1] + overlay_box[3]) / 2)
         bg_color = original_image.getpixel((sample_x, sample_y))
+    except (ValueError, IndexError): bg_color = (25, 25, 25, 255)

     overlay_layer = Image.new("RGBA", (overlay_width, overlay_height), bg_color)
     draw = ImageDraw.Draw(overlay_layer)
     target_width = overlay_width * 0.90
     font_size = 100
     final_wrapped_lines = []
+
     while font_size > 10:
         font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
+        words = text_to_overlay.split()
         if not words: break
         raw_lines = []; current_line = ""
         for word in words:
             test_line = (current_line + " " + word).strip()
+            line_width = draw.textbbox((0, 0), get_display(arabic_reshaper.reshape(test_line)), font=font)[2]
             if line_width <= target_width: current_line = test_line
             else: raw_lines.append(current_line); current_line = word
         raw_lines.append(current_line)
+        total_height = sum(draw.textbbox((0,0), get_display(arabic_reshaper.reshape(l)), font=font)[3] for l in raw_lines)
+        if total_height <= overlay_height * 0.9: final_wrapped_lines = raw_lines; break
+        else: font_size -= 2
+
+    if not final_wrapped_lines: final_wrapped_lines = raw_lines

     final_font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
     line_spacing = font_size * 0.3
+    line_heights = [draw.textbbox((0,0), get_display(arabic_reshaper.reshape(l)), font=final_font)[3] - draw.textbbox((0,0), get_display(arabic_reshaper.reshape(l)), font=final_font)[1] for l in final_wrapped_lines]
+    total_text_height = sum(line_heights) + (len(final_wrapped_lines) - 1) * line_spacing
     y_start = (overlay_height - total_text_height) / 2

     current_y = y_start
+    for i, line in enumerate(final_wrapped_lines):
+        # ### --- CHANGE --- ###: Reverted to your original, proven RTL centering logic
+        # This is the fix for the RTL text display issue.
+        reshaped_line = arabic_reshaper.reshape(line)
+        bidi_line = get_display(reshaped_line)
+
+        # Manually calculate line width and center position
+        line_bbox = draw.textbbox((0, 0), bidi_line, font=final_font)
+        line_width = line_bbox[2] - line_bbox[0]
+        x_position = (overlay_width - line_width) / 2

         # Draw shadow then text for readability
+        draw.text((x_position + 1, current_y + 1), bidi_line, font=final_font, fill=(0, 0, 0, 180))
+        draw.text((x_position, current_y), bidi_line, font=final_font, fill=(255, 255, 255, 255))

         current_y += line_heights[i] + line_spacing

     return overlay_layer, overlay_box

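The corrected RTL logic rests on a two-step pipeline from `arabic_reshaper` and `python-bidi`: reshape first, then reorder. A minimal sketch (the Persian sample string is illustrative):

```python
import arabic_reshaper
from bidi.algorithm import get_display

logical = "سلام دنیا"                      # logical order, as the string is stored
shaped = arabic_reshaper.reshape(logical)   # pick contextual (joined) letter forms
visual = get_display(shaped)                # reorder into visual right-to-left order
# PIL draws glyphs strictly left-to-right, so both measuring (draw.textbbox)
# and drawing (draw.text) must be given this *visual* string, never the logical one.
```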
+# --- MAIN VIDEO PROCESSING PIPELINE (Now with FFMPEG) ---

 def process_video(video_path, progress=gr.Progress()):
+    if video_path is None: raise gr.Error("Please upload a video file first.")

+    progress(0, desc="Loading Video & Analyzing...")
     cap = cv2.VideoCapture(video_path)
     if not cap.isOpened(): raise gr.Error("Could not open video file.")

+    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)); frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+    fps = cap.get(cv2.CAP_PROP_FPS); total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

     cap.set(cv2.CAP_PROP_POS_FRAMES, total_frames // 2)
     ret, middle_frame_bgr = cap.read()
     if not ret: raise gr.Error("Could not read middle frame.")
     middle_frame_rgb_pil = Image.fromarray(cv2.cvtColor(middle_frame_bgr, cv2.COLOR_BGR2RGB))

     progress(0.2, desc="Detecting Text (EasyOCR)...")

     translated_text = translate_text_gemini(extracted_text)
     if "Error" in translated_text: raise gr.Error(translated_text)

+    progress(0.5, desc="Rendering Translated Text Overlay...")
     overlay_stamp_pil, overlay_position_box = render_translated_overlay(middle_frame_rgb_pil, translated_text, bbox)
     overlay_stamp_cv = cv2.cvtColor(np.array(overlay_stamp_pil), cv2.COLOR_RGBA2BGRA)

+    # ### --- CHANGE --- ###: Define filenames for temporary and final outputs
+    timestamp = int(time.time())
+    temp_silent_path = f"temp_silent_{timestamp}.mp4"
+    final_output_path = f"translated_video_{timestamp}.mp4"

+    # Part 1: Create a silent video with the overlay using OpenCV
+    progress(0.6, desc="Composing Silent Video with Overlay...")
+    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+    out = cv2.VideoWriter(temp_silent_path, fourcc, fps, (frame_width, frame_height))
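A short aside on `cv2.VideoWriter`, since its argument conventions are an easy place to lose frames; a minimal round-trip sketch with a hypothetical clip:

```python
import cv2
import numpy as np

# Hypothetical 64x48, 30 fps clip. The size argument is (width, height),
# while each frame array is shaped (height, width, 3) in BGR order;
# frames with a mismatched size are silently dropped.
out = cv2.VideoWriter("demo.mp4", cv2.VideoWriter_fourcc(*"mp4v"), 30.0, (64, 48))
for _ in range(30):
    out.write(np.zeros((48, 64, 3), dtype=np.uint8))  # one second of black
out.release()  # finalizes the file; without it the mp4 is unreadable
```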

+    cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
     frame_idx = 0
     x_min, y_min, x_max, y_max = overlay_position_box

     while True:
         ret, frame = cap.read()
         if not ret: break

         roi = frame[y_min:y_max, x_min:x_max]
         stamp_h, stamp_w, _ = overlay_stamp_cv.shape
         roi_h, roi_w, _ = roi.shape
+        overlay_resized = cv2.resize(overlay_stamp_cv, (roi_w, roi_h)) if (stamp_h != roi_h or stamp_w != roi_w) else overlay_stamp_cv
+
         alpha = overlay_resized[:, :, 3] / 255.0
         alpha_mask = cv2.merge([alpha, alpha, alpha])
         blended_roi = (roi.astype(float) * (1.0 - alpha_mask) + overlay_resized[:, :, :3].astype(float) * alpha_mask)
         frame[y_min:y_max, x_min:x_max] = blended_roi.astype(np.uint8)

         out.write(frame)
         frame_idx += 1
+        progress(0.6 + (0.3 * frame_idx / total_frames), desc=f"Processing frame {frame_idx}/{total_frames}")

+    cap.release(); out.release()
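The stamping step is standard alpha ("over") compositing done with NumPy broadcasting; a tiny self-contained sketch with made-up values:

```python
import numpy as np

# "Over" compositing, exactly as in the loop above:
# result = overlay * alpha + frame * (1 - alpha), per pixel.
frame = np.full((2, 2, 3), 200, dtype=np.uint8)   # background patch from the frame
stamp = np.full((2, 2, 3), 40, dtype=np.uint8)    # overlay colour channels
alpha = np.full((2, 2, 1), 0.25)                  # 25% opaque overlay
out = (frame.astype(float) * (1 - alpha) + stamp.astype(float) * alpha).astype(np.uint8)
print(out[0, 0])  # -> [160 160 160]
```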
+
+    # ### --- CHANGE --- ###: Part 2: Use ffmpeg to combine video with original audio and add fade
+    progress(0.95, desc="Merging Audio and Applying Fade (ffmpeg)...")
+    try:
+        input_video = ffmpeg.input(temp_silent_path)
+        input_audio = ffmpeg.input(video_path)
+
+        (
+            ffmpeg
+            .output(
+                input_video.video.filter('fade', type='in', start_time=0, duration=FADE_IN_DURATION_SECONDS),  # Apply fade-in to the video stream
+                input_audio.audio,  # Take the audio stream from the original upload
+                final_output_path,
+                acodec='copy',  # Copy only the audio; the faded video stream must be re-encoded
+                shortest=None   # -shortest: stop at the end of the shorter stream
+            )
+            .run(overwrite_output=True, quiet=True)
+        )
+    except ffmpeg.Error as e:
+        # Provide more detailed ffmpeg error logging if something goes wrong
+        print('ffmpeg stdout:', e.stdout.decode('utf8'))
+        print('ffmpeg stderr:', e.stderr.decode('utf8'))
+        raise gr.Error(f"ffmpeg error: {e.stderr.decode('utf8')}")
+    finally:
+        # Clean up the temporary silent video file
+        if os.path.exists(temp_silent_path):
+            os.remove(temp_silent_path)

+    progress(1, desc="Done!")
+    return final_output_path
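To see what command line the stream graph above produces, `ffmpeg-python` can compile a specification without running it; a sketch with hypothetical file names (the printed command is approximate):

```python
import ffmpeg

video = ffmpeg.input("temp_silent.mp4")   # hypothetical file names
audio = ffmpeg.input("original.mp4")
stream = ffmpeg.output(
    video.video.filter("fade", type="in", start_time=0, duration=1.0),
    audio.audio,
    "out.mp4",
    acodec="copy",
)
print(" ".join(ffmpeg.compile(stream)))
# roughly: ffmpeg -i temp_silent.mp4 -i original.mp4
#   -filter_complex "[0:v]fade=duration=1.0:start_time=0:type=in[s0]"
#   -map [s0] -map 1:a -acodec copy out.mp4
```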

+# --- GRADIO INTERFACE (Unchanged) ---
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("# 🎬 Persian Video Quote Translator")
+    gr.Markdown("Upload a short video with English text. The app will preserve the original audio and duration.")
     with gr.Row():
         video_input = gr.Video(label="Upload Video")
         video_output = gr.Video(label="Translated Video Output")
     translate_button = gr.Button("Translate Video", variant="primary")
+    translate_button.click(fn=process_video, inputs=[video_input], outputs=[video_output])
     gr.Markdown("---")
+    gr.Markdown("### How it works:\n1. It analyzes the middle frame to find and translate text.\n2. It generates a temporary silent video with the correctly rendered Persian text overlaid.\n3. **(New)** It uses `ffmpeg` to merge the new video with the **original audio**, apply a 1-second fade-in, and ensure the final duration matches the input.")

 if __name__ == "__main__":
     demo.launch(debug=True)