Update app.py
app.py
CHANGED
@@ -10,12 +10,12 @@ import arabic_reshaper
 from bidi.algorithm import get_display
 import os
 import time
-import ffmpeg #
+import ffmpeg # Import the ffmpeg-python library
 
 # --- CONFIGURATION ---
 API_KEY ="AIzaSyCu-tb3BRDIJjUt6G5ccWmrR51tOY0VZd4"
-PERSIAN_FONT_PATH = "Vazir.ttf"
-FADE_IN_DURATION_SECONDS = 1.0
+PERSIAN_FONT_PATH = "Vazir.ttf" # Make sure this font file is in your repository
+FADE_IN_DURATION_SECONDS = 1.0
 
 # --- GLOBAL INITIALIZATION ---
 reader = None
@@ -27,8 +27,7 @@ def initialize_reader():
     print("EasyOCR model loaded successfully!")
     return reader
 
-# ---
-
+# --- CORE FUNCTIONS ---
 def extract_text_and_bbox(image: Image.Image):
     ocr_reader = initialize_reader()
     img_array = np.array(image)
@@ -51,21 +50,19 @@ def translate_text_gemini(text: str) -> str:
     try:
         genai.configure(api_key=API_KEY)
         model = genai.GenerativeModel('gemini-1.5-flash')
-        # Your prompt here
         prompt =f"Translate the following English quotes into Persian. The translation should be colloquial, poetic, concise, and meaningful. Preserve the original message and tone. Avoid literal translations. Provide only the translated Persian text. Quotes: [{text}]"
         response = model.generate_content(prompt)
         return response.text.strip()
     except Exception as e: return f"Error during translation with Gemini: {str(e)}"
 
-# --- TEXT OVERLAY FUNCTION (RTL
-
+# --- TEXT OVERLAY FUNCTION (WITH RTL CORRECTION) ---
 def render_translated_overlay(original_image: Image.Image, text_to_overlay: str, bbox: tuple) -> Image.Image:
     padding = 15
     overlay_box = (max(0, bbox[0] - padding), max(0, bbox[1] - padding),
                    min(original_image.width, bbox[2] + padding), min(original_image.height, bbox[3] + padding))
     overlay_width = overlay_box[2] - overlay_box[0]
     overlay_height = overlay_box[3] - overlay_box[1]
-
+
     try:
         sample_x = max(0, int(overlay_box[0]) - 5); sample_y = int((overlay_box[1] + overlay_box[3]) / 2)
         bg_color = original_image.getpixel((sample_x, sample_y))
@@ -76,7 +73,7 @@ def render_translated_overlay(original_image: Image.Image, text_to_overlay: str,
     target_width = overlay_width * 0.90
     font_size = 100
     final_wrapped_lines = []
-
+
     while font_size > 10:
         font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
         words = text_to_overlay.split();
@@ -91,7 +88,7 @@ def render_translated_overlay(original_image: Image.Image, text_to_overlay: str,
             total_height = sum(draw.textbbox((0,0), get_display(arabic_reshaper.reshape(l)), font=font)[3] for l in raw_lines)
             if total_height <= overlay_height * 0.9: final_wrapped_lines = raw_lines; break
             else: font_size -= 2
-
+
     if not final_wrapped_lines: final_wrapped_lines = raw_lines
 
     final_font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
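Note: the loop above shrinks the font in 2 pt steps until the wrapped block fits 90% of the overlay height. A simplified, single-line sketch of the same shrink-to-fit search (no word wrapping; assumes Vazir.ttf is available locally):

# Simplified sketch of the shrink-to-fit search above (single line, no wrapping).
# Assumes Vazir.ttf is available locally.
from PIL import Image, ImageDraw, ImageFont

def fit_font_size(draw, text, box_w, box_h, font_path="Vazir.ttf"):
    size = 100
    while size > 10:
        font = ImageFont.truetype(font_path, size)
        bbox = draw.textbbox((0, 0), text, font=font)
        if bbox[2] - bbox[0] <= box_w * 0.9 and bbox[3] - bbox[1] <= box_h * 0.9:
            return size  # first size whose rendered bbox fits 90% of the box
        size -= 2        # same step size the app uses
    return size

draw = ImageDraw.Draw(Image.new("RGB", (1, 1)))
print(fit_font_size(draw, "sample text", 400, 120))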
@@ -99,62 +96,55 @@ def render_translated_overlay(original_image: Image.Image, text_to_overlay: str,
     line_heights = [draw.textbbox((0,0), get_display(arabic_reshaper.reshape(l)), font=final_font)[3] - draw.textbbox((0,0), get_display(arabic_reshaper.reshape(l)), font=final_font)[1] for l in final_wrapped_lines]
     total_text_height = sum(line_heights) + (len(final_wrapped_lines) - 1) * line_spacing
     y_start = (overlay_height - total_text_height) / 2
-
+
     current_y = y_start
     for i, line in enumerate(final_wrapped_lines):
-        # ### --- CHANGE --- ###: Reverted to your original, proven RTL centering logic
-        # This is the fix for the RTL text display issue.
         reshaped_line = arabic_reshaper.reshape(line)
         bidi_line = get_display(reshaped_line)
-
-        # Manually calculate line width and center position
+
         line_bbox = draw.textbbox((0, 0), bidi_line, font=final_font)
         line_width = line_bbox[2] - line_bbox[0]
         x_position = (overlay_width - line_width) / 2
-
-        # Draw shadow then text for readability
+
         draw.text((x_position + 1, current_y + 1), bidi_line, font=final_font, fill=(0, 0, 0, 180))
         draw.text((x_position, current_y), bidi_line, font=final_font, fill=(255, 255, 255, 255))
-
+
        current_y += line_heights[i] + line_spacing
 
     return overlay_layer, overlay_box
 
-# --- MAIN VIDEO PROCESSING PIPELINE (
-
+# --- MAIN VIDEO PROCESSING PIPELINE (WITH FFMPEG CORRECTION) ---
 def process_video(video_path, progress=gr.Progress()):
     if video_path is None: raise gr.Error("Please upload a video file first.")
 
     progress(0, desc="Loading Video & Analyzing...")
     cap = cv2.VideoCapture(video_path)
     if not cap.isOpened(): raise gr.Error("Could not open video file.")
-
+
     frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)); frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
     fps = cap.get(cv2.CAP_PROP_FPS); total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-
+
     cap.set(cv2.CAP_PROP_POS_FRAMES, total_frames // 2)
     ret, middle_frame_bgr = cap.read()
     if not ret: raise gr.Error("Could not read middle frame.")
     middle_frame_rgb_pil = Image.fromarray(cv2.cvtColor(middle_frame_bgr, cv2.COLOR_BGR2RGB))
-
+
     progress(0.2, desc="Detecting Text (EasyOCR)...")
     extracted_text, bbox = extract_text_and_bbox(middle_frame_rgb_pil)
     if bbox is None: raise gr.Error(extracted_text)
-
+
     progress(0.4, desc="Translating Text (Gemini API)...")
     translated_text = translate_text_gemini(extracted_text)
     if "Error" in translated_text: raise gr.Error(translated_text)
-
+
     progress(0.5, desc="Rendering Translated Text Overlay...")
     overlay_stamp_pil, overlay_position_box = render_translated_overlay(middle_frame_rgb_pil, translated_text, bbox)
     overlay_stamp_cv = cv2.cvtColor(np.array(overlay_stamp_pil), cv2.COLOR_RGBA2BGRA)
 
-    # ### --- CHANGE --- ###: Define filenames for temporary and final outputs
     timestamp = int(time.time())
     temp_silent_path = f"temp_silent_{timestamp}.mp4"
     final_output_path = f"translated_video_{timestamp}.mp4"
 
-    # Part 1: Create a silent video with the overlay using OpenCV
     progress(0.6, desc="Composing Silent Video with Overlay...")
     fourcc = cv2.VideoWriter_fourcc(*'mp4v')
     out = cv2.VideoWriter(temp_silent_path, fourcc, fps, (frame_width, frame_height))
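Note: the RTL centering logic restored above reshapes each line with arabic_reshaper and reorders it with python-bidi before measuring and drawing. A minimal standalone sketch of that same pipeline, useful for checking the rendering outside the app (assumes Vazir.ttf is available locally):

# Minimal sketch of the RTL rendering path used above.
# Assumes Vazir.ttf is available locally; pip install arabic-reshaper python-bidi pillow
import arabic_reshaper
from bidi.algorithm import get_display
from PIL import Image, ImageDraw, ImageFont

text = "سلام دنیا"                        # logical-order Persian text
reshaped = arabic_reshaper.reshape(text)  # join letters into contextual glyph forms
bidi_text = get_display(reshaped)         # reorder into visual (display) order

img = Image.new("RGBA", (400, 100), (30, 30, 30, 255))
draw = ImageDraw.Draw(img)
font = ImageFont.truetype("Vazir.ttf", 40)

# Center horizontally from the measured bbox, as render_translated_overlay does
bbox = draw.textbbox((0, 0), bidi_text, font=font)
x = (img.width - (bbox[2] - bbox[0])) / 2
draw.text((x, 25), bidi_text, font=font, fill=(255, 255, 255, 255))
img.save("rtl_check.png")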
@@ -162,58 +152,59 @@ def process_video(video_path, progress=gr.Progress()):
     cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
     frame_idx = 0
     x_min, y_min, x_max, y_max = overlay_position_box
-
+
     while True:
         ret, frame = cap.read()
         if not ret: break
-
+
         roi = frame[y_min:y_max, x_min:x_max]
         stamp_h, stamp_w, _ = overlay_stamp_cv.shape
         roi_h, roi_w, _ = roi.shape
         overlay_resized = cv2.resize(overlay_stamp_cv, (roi_w, roi_h)) if (stamp_h != roi_h or stamp_w != roi_w) else overlay_stamp_cv
-
+
         alpha = overlay_resized[:, :, 3] / 255.0
         alpha_mask = cv2.merge([alpha, alpha, alpha])
         blended_roi = (roi.astype(float) * (1.0 - alpha_mask) + overlay_resized[:, :, :3].astype(float) * alpha_mask)
         frame[y_min:y_max, x_min:x_max] = blended_roi.astype(np.uint8)
-
+
         out.write(frame)
         frame_idx += 1
         progress(0.6 + (0.3 * frame_idx / total_frames), desc=f"Processing frame {frame_idx}/{total_frames}")
 
     cap.release(); out.release()
 
-    # ### --- CHANGE --- ###: Part 2: Use ffmpeg to combine video with original audio and add fade
     progress(0.95, desc="Merging Audio and Applying Fade (ffmpeg)...")
     try:
         input_video = ffmpeg.input(temp_silent_path)
         input_audio = ffmpeg.input(video_path)
-
+
+        # ### --- KEY CHANGE --- ###: Corrected ffmpeg command
         (
             ffmpeg
             .output(
-                input_video.video.filter('fade', type='in', start_time=0, duration=FADE_IN_DURATION_SECONDS),
+                input_video.video.filter('fade', type='in', start_time=0, duration=FADE_IN_DURATION_SECONDS),
                 input_audio.audio, # Take audio stream from original
                 final_output_path,
-
-
+                vcodec='libx264', # Explicitly set video codec for re-encoding
+                acodec='copy', # Keep audio as is without re-encoding
+                shortest=None # Ensure full video duration is used
            )
            .run(overwrite_output=True, quiet=True)
        )
    except ffmpeg.Error as e:
-        # Provide
+        # Provide detailed ffmpeg error logs for easier debugging
        print('ffmpeg stdout:', e.stdout.decode('utf8'))
        print('ffmpeg stderr:', e.stderr.decode('utf8'))
        raise gr.Error(f"ffmpeg error: {e.stderr.decode('utf8')}")
    finally:
-        # Clean up the temporary silent
+        # Clean up the temporary silent file regardless of success or failure
        if os.path.exists(temp_silent_path):
            os.remove(temp_silent_path)
 
    progress(1, desc="Done!")
    return final_output_path
 
-# --- GRADIO INTERFACE
+# --- GRADIO INTERFACE ---
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("# 🎬 Persian Video Quote Translator")
     gr.Markdown("Upload a short video with English text. The app will preserve the original audio and duration.")
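Note: one way to sanity-check the corrected command is to print the argv that ffmpeg-python assembles instead of running it. A hedged sketch mirroring the output() call above; the file names are hypothetical placeholders, and it relies on ffmpeg-python's compile() method and its convention that None-valued kwargs become bare flags:

# Hedged sketch: print the ffmpeg argv built by the merge step, without running it.
# File names are hypothetical placeholders.
import ffmpeg

input_video = ffmpeg.input("temp_silent_1700000000.mp4")
input_audio = ffmpeg.input("original_upload.mp4")

stream = ffmpeg.output(
    input_video.video.filter("fade", type="in", start_time=0, duration=1.0),
    input_audio.audio,
    "translated_video_1700000000.mp4",
    vcodec="libx264",  # re-encode the composited video
    acodec="copy",     # pass the original audio through untouched
    shortest=None,     # None-valued kwargs emit bare flags, here -shortest
)
print(stream.compile())  # list of argv tokens, e.g. ['ffmpeg', '-i', ...]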
@@ -223,7 +214,8 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     translate_button = gr.Button("Translate Video", variant="primary")
     translate_button.click(fn=process_video, inputs=[video_input], outputs=[video_output])
     gr.Markdown("---")
-    gr.Markdown("### How it works:\n1. It analyzes the middle frame to find and translate text.\n2. It generates a temporary silent video with the correctly rendered Persian text overlaid.\n3.
+    gr.Markdown("### How it works:\n1. It analyzes the middle frame to find and translate text.\n2. It generates a temporary silent video with the correctly rendered Persian text overlaid.\n3. It uses `ffmpeg` to merge the new video with the **original audio**, apply a 1-second fade-in, and ensure the final duration matches the input.")
 
 if __name__ == "__main__":
     demo.launch(debug=True)
+
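Note: the frame loop in process_video composites each ROI pixel as roi * (1 - alpha) + overlay * alpha. A tiny NumPy check of that formula, with values chosen for easy arithmetic:

# Worked check of the alpha blend used in the frame loop.
import numpy as np

roi = np.full((2, 2, 3), 200, dtype=np.uint8)   # background pixels
overlay = np.zeros((2, 2, 4), dtype=np.uint8)   # BGRA overlay stamp
overlay[..., :3] = 50                           # dark overlay color
overlay[..., 3] = 128                           # ~50% opacity

alpha = overlay[..., 3:4] / 255.0               # broadcastable alpha mask
blended = roi * (1.0 - alpha) + overlay[..., :3] * alpha
print(blended.astype(np.uint8))                 # ~124: roughly halfway between 200 and 50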