kavehtaheri committed on
Commit 48ce277 · verified · 1 Parent(s): 307e11d

Update app.py

Files changed (1)
  1. app.py +94 -179
app.py CHANGED
@@ -10,205 +10,131 @@ import arabic_reshaper
  from bidi.algorithm import get_display
  import os
  import time

  # --- CONFIGURATION ---
- # IMPORTANT: This should be set as a Secret in your Hugging Face Space
- # For local testing, you can uncomment the line below.
- # os.environ['GEMINI_API_KEY'] = "YOUR_API_KEY_HERE"
- API_KEY = "***REDACTED***"
-
- # Ensure these font files are in your Hugging Face repository
  PERSIAN_FONT_PATH = "Vazir.ttf"
- OUTPUT_VIDEO_FILENAME = f"translated_video_{int(time.time())}.mp4"
-
- # Video effect settings
- FADE_IN_DURATION_SECONDS = 1.0
- INITIAL_BLACK_SCREEN_SECONDS = 1.0

  # --- GLOBAL INITIALIZATION ---
  reader = None
-
  def initialize_reader():
-     """Initializes the EasyOCR reader if it hasn't been already."""
      global reader
      if reader is None:
          print("Loading EasyOCR model...")
-         # For a CPU-only environment like HF Spaces free tier, gpu=False is essential.
          reader = easyocr.Reader(['en'], gpu=False, verbose=False)
          print("EasyOCR model loaded successfully!")
      return reader

- # --- YOUR CORE FUNCTIONS (Slightly Adapted) ---

  def extract_text_and_bbox(image: Image.Image):
-     """
-     Extracts text from a PIL Image and calculates a single consolidated
-     bounding box for all text found.
-     (This function is kept exactly as you wrote it)
-     """
      ocr_reader = initialize_reader()
      img_array = np.array(image)
      results = ocr_reader.readtext(img_array)
-
-     if not results:
-         return "No text detected in the image.", None
-
-     min_x, min_y = float('inf'), float('inf')
-     max_x, max_y = float('-inf'), float('-inf')
-
      text_parts = []
      for (bbox, text, prob) in results:
          text_parts.append(text)
          (tl, tr, br, bl) = bbox
-         min_x = min(min_x, tl[0], bl[0])
-         min_y = min(min_y, tl[1], tr[1])
-         max_x = max(max_x, tr[0], br[0])
-         max_y = max(max_y, bl[1], br[1])
-
      extracted_text = ' '.join(text_parts)
      consolidated_bbox = (int(min_x), int(min_y), int(max_x), int(max_y))
-
      return extracted_text, consolidated_bbox

  def translate_text_gemini(text: str) -> str:
-     """
-     Translates text to colloquial Persian using the Gemini API.
-     (This function is kept exactly as you wrote it, but with safer API key handling)
-     """
-     if not API_KEY:
-         raise gr.Error("GEMINI_API_KEY is not set. Please configure it in your Hugging Face Space Secrets.")
-     if not text or "No text" in text or "Error" in text:
-         return "No valid text to translate."
-
      try:
          genai.configure(api_key=API_KEY)
          model = genai.GenerativeModel('gemini-1.5-flash')
-         # Your excellent, detailed prompt is preserved
-         prompt = f"Translate the following English quotes into Persian, rephrasing only minimally if absolutely necessary for natural poetic flow, but strictly preserving the original meaning, intent, purpose, and nuances without any alterations or additions that could change the core message. Ensure the Persian versions are concise (under 20 words), deep, touching, poetic, and profound, using idiomatic Persian that evokes wisdom or inspiration while staying faithful to the source. Additionally, guarantee suitable grammar and natural sentence structure in Persian for smooth readability by native speakers, and ensure the translation conveys clear, substantive meaning that stands independently beyond its poetic tone (i.e., the wisdom or insight should be immediately understandable without relying solely on artistry). If the original quote includes an attribution (e.g., author name), incorporate it faithfully in the Persian translation on the last line, formatted similarly (e.g., '- Author Name -' in Persian). Your response must contain ONLY the translated Persian texts in Perso-Arabic script, one per quote, numbered (e.g., 1., 2.) for separation, with no other text, labels, explanations, or information whatsoever. Quotes: [{text}]"
-
          response = model.generate_content(prompt)
          return response.text.strip()
-     except Exception as e:
-         return f"Error during translation with Gemini: {str(e)}"

- # --- NEW FUNCTION: Renders a reusable overlay "stamp" ---
  def render_translated_overlay(original_image: Image.Image, text_to_overlay: str, bbox: tuple) -> tuple:
-     """
-     Creates a single, pre-rendered RGBA image of the translated text on a
-     background sampled from the original image. This "stamp" can be efficiently
-     overlaid on every video frame.
-
-     This function adapts the logic from your original 'overlay_text_on_image'.
-     """
-     # 1. Define the box where the new text will live (with padding)
      padding = 15
-     overlay_box = (
-         max(0, bbox[0] - padding),
-         max(0, bbox[1] - padding),
-         min(original_image.width, bbox[2] + padding),
-         min(original_image.height, bbox[3] + padding)
-     )
      overlay_width = overlay_box[2] - overlay_box[0]
      overlay_height = overlay_box[3] - overlay_box[1]
-
-     # 2. Sample the background color from the original image
      try:
-         sample_x = max(0, int(overlay_box[0]) - 5)
-         sample_y = int((overlay_box[1] + overlay_box[3]) / 2)
          bg_color = original_image.getpixel((sample_x, sample_y))
-     except (ValueError, IndexError):
-         bg_color = (25, 25, 25, 255)  # Fallback color

-     # 3. Create the base layer for our overlay "stamp"
-     # This is an RGBA image with the sampled background color
      overlay_layer = Image.new("RGBA", (overlay_width, overlay_height), bg_color)
      draw = ImageDraw.Draw(overlay_layer)
-
-     # 4. Dynamically find best font size and wrap text (your brilliant logic)
      target_width = overlay_width * 0.90
      font_size = 100
      final_wrapped_lines = []
-
      while font_size > 10:
          font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
-         words = text_to_overlay.split()
          if not words: break
-
          raw_lines = []; current_line = ""
          for word in words:
              test_line = (current_line + " " + word).strip()
-             reshaped_test_line = arabic_reshaper.reshape(test_line)
-             bidi_test_line = get_display(reshaped_test_line)
-             line_width = draw.textbbox((0, 0), bidi_test_line, font=font)[2]
              if line_width <= target_width: current_line = test_line
              else: raw_lines.append(current_line); current_line = word
          raw_lines.append(current_line)

-         # Check total height
-         total_height = 0
-         for line in raw_lines:
-             reshaped_line = arabic_reshaper.reshape(line)
-             bidi_line = get_display(reshaped_line)
-             total_height += draw.textbbox((0,0), bidi_line, font=font)[3]
-         if total_height <= overlay_height * 0.9:
-             final_wrapped_lines = raw_lines
-             break
-         else:
-             font_size -= 2
-
-     if not final_wrapped_lines:
-         print("Warning: Text could not fit. It may be truncated.")
-         final_wrapped_lines = raw_lines  # Use last attempt if no fit found
-
-     # 5. Draw the final, wrapped text onto our stamp
      final_font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
      line_spacing = font_size * 0.3
-
-     # BIDI and Reshape for correct RTL rendering
-     reshaped_lines = [get_display(arabic_reshaper.reshape(l)) for l in final_wrapped_lines]
-     line_heights = [draw.textbbox((0,0), l, font=final_font)[3] - draw.textbbox((0,0), l, font=final_font)[1] for l in reshaped_lines]
-     total_text_height = sum(line_heights) + (len(reshaped_lines) - 1) * line_spacing
-
      y_start = (overlay_height - total_text_height) / 2

      current_y = y_start
-     for i, line_to_draw in enumerate(reshaped_lines):
-         x_center = overlay_width / 2

          # Draw shadow then text for readability
-         draw.text((x_center + 1, current_y + 1), line_to_draw, font=final_font, fill=(0, 0, 0, 180), anchor="mt")
-         draw.text((x_center, current_y), line_to_draw, font=final_font, fill=(255, 255, 255, 255), anchor="mt")

          current_y += line_heights[i] + line_spacing

      return overlay_layer, overlay_box

-
- # --- MAIN VIDEO PROCESSING PIPELINE ---

  def process_video(video_path, progress=gr.Progress()):
-     """
-     Main function to orchestrate the entire video translation process.
-     """
-     if video_path is None:
-         raise gr.Error("Please upload a video file first.")

-     progress(0, desc="Loading Video...")
      cap = cv2.VideoCapture(video_path)
      if not cap.isOpened(): raise gr.Error("Could not open video file.")

-     # Video properties
-     frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-     frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-     fps = cap.get(cv2.CAP_PROP_FPS)
-     total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

-     # 1. ANALYSIS (OCR & TRANSLATION) - Done only once
-     progress(0.1, desc="Extracting Middle Frame for Analysis...")
      cap.set(cv2.CAP_PROP_POS_FRAMES, total_frames // 2)
      ret, middle_frame_bgr = cap.read()
      if not ret: raise gr.Error("Could not read middle frame.")
-
      middle_frame_rgb_pil = Image.fromarray(cv2.cvtColor(middle_frame_bgr, cv2.COLOR_BGR2RGB))

      progress(0.2, desc="Detecting Text (EasyOCR)...")
@@ -219,96 +145,85 @@ def process_video(video_path, progress=gr.Progress()):
      translated_text = translate_text_gemini(extracted_text)
      if "Error" in translated_text: raise gr.Error(translated_text)

-     progress(0.6, desc="Rendering Translated Text Overlay...")
      overlay_stamp_pil, overlay_position_box = render_translated_overlay(middle_frame_rgb_pil, translated_text, bbox)
-
-     # Convert the PIL RGBA stamp to an OpenCV BGRA image for compositing
      overlay_stamp_cv = cv2.cvtColor(np.array(overlay_stamp_pil), cv2.COLOR_RGBA2BGRA)

-     # 2. VIDEO COMPOSITION
-     progress(0.7, desc="Composing Final Video...")
-     fourcc = cv2.VideoWriter_fourcc(*'mp4v')
-     out = cv2.VideoWriter(OUTPUT_VIDEO_FILENAME, fourcc, fps, (frame_width, frame_height))

-     # Add initial black screen
-     num_black_frames = int(INITIAL_BLACK_SCREEN_SECONDS * fps)
-     black_frame = np.zeros((frame_height, frame_width, 3), dtype=np.uint8)
-     for _ in range(num_black_frames): out.write(black_frame)
-
-     # Add fade-in effect
-     num_fade_frames = int(FADE_IN_DURATION_SECONDS * fps)
-     cap.set(cv2.CAP_PROP_POS_FRAMES, 0)  # Rewind video
-     ret, first_frame = cap.read()
-     if ret:
-         for i in range(num_fade_frames):
-             alpha = (i + 1) / num_fade_frames
-             blended_frame = cv2.addWeighted(black_frame, 1 - alpha, first_frame, alpha, 0)
-             out.write(blended_frame)

-     # Process all frames and overlay the pre-rendered stamp
-     cap.set(cv2.CAP_PROP_POS_FRAMES, 0)  # Rewind again
      frame_idx = 0
-
-     # Get position for stamping
      x_min, y_min, x_max, y_max = overlay_position_box

      while True:
          ret, frame = cap.read()
          if not ret: break

-         # Skip frames used in fade-in
-         if frame_idx < num_fade_frames:
-             frame_idx += 1
-             continue
-
-         # --- Efficient Alpha Blending (Stamping) ---
          roi = frame[y_min:y_max, x_min:x_max]
-
-         # Ensure ROI and stamp have same dimensions before blending
          stamp_h, stamp_w, _ = overlay_stamp_cv.shape
          roi_h, roi_w, _ = roi.shape
-         if stamp_h != roi_h or stamp_w != roi_w:
-             # This can happen if padding makes the box go out of bounds. Resize stamp to fit.
-             overlay_resized = cv2.resize(overlay_stamp_cv, (roi_w, roi_h))
-         else:
-             overlay_resized = overlay_stamp_cv
-
          alpha = overlay_resized[:, :, 3] / 255.0
          alpha_mask = cv2.merge([alpha, alpha, alpha])
-
          blended_roi = (roi.astype(float) * (1.0 - alpha_mask) + overlay_resized[:, :, :3].astype(float) * alpha_mask)
          frame[y_min:y_max, x_min:x_max] = blended_roi.astype(np.uint8)

          out.write(frame)
          frame_idx += 1
-         progress(0.7 + (0.3 * frame_idx / total_frames), desc=f"Processing frame {frame_idx}/{total_frames}")

-     cap.release()
-     out.release()
-     progress(1, desc="Done!")
-     return OUTPUT_VIDEO_FILENAME
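Note on the removed fade-in math: each of the N = FADE_IN_DURATION_SECONDS * fps fade frames is the linear blend frame_i = (1 - a) * black + a * first_frame with a = (i + 1) / N, which is exactly what cv2.addWeighted computes above. ffmpeg's fade filter applies the same linear ramp by default, which is why the rewritten pipeline below can drop this loop and the skip-ahead bookkeeping that went with it.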

- # --- GRADIO INTERFACE ---

  with gr.Blocks(theme=gr.themes.Soft()) as demo:
      gr.Markdown("# 🎬 Persian Video Quote Translator")
-     gr.Markdown("Upload a short video with English text. The app will detect the text, translate it, and create a new video with the Persian translation overlaid.")
-
      with gr.Row():
          video_input = gr.Video(label="Upload Video")
          video_output = gr.Video(label="Translated Video Output")
-
      translate_button = gr.Button("Translate Video", variant="primary")
-
-     translate_button.click(
-         fn=process_video,
-         inputs=[video_input],
-         outputs=[video_output]
-     )
-
      gr.Markdown("---")
-     gr.Markdown("### How it works:\n1. It finds the middle frame of your video for analysis.\n2. It uses `EasyOCR` to find the English text and its location.\n3. It uses Google's `Gemini` to translate the text to poetic Persian.\n4. It generates a high-quality overlay with your text-wrapping logic.\n5. Finally, it creates a new video with a fade-in and the translated text overlay.")

  if __name__ == "__main__":
      demo.launch(debug=True)

  from bidi.algorithm import get_display
  import os
  import time
+ import ffmpeg  # ### --- CHANGE --- ###: Import the ffmpeg-python library

  # --- CONFIGURATION ---
+ API_KEY = "***REDACTED***"
  PERSIAN_FONT_PATH = "Vazir.ttf"
+ FADE_IN_DURATION_SECONDS = 1.0  # The fade-in will be exactly 1 second long
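A safer way to load the key, sketched here on the assumption that GEMINI_API_KEY is configured as a Secret in the Space settings (the setup the removed comments in the old version describe):

    import os
    API_KEY = os.environ.get("GEMINI_API_KEY")  # read from the environment; never commit the literal key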
 
  # --- GLOBAL INITIALIZATION ---
  reader = None
  def initialize_reader():
      global reader
      if reader is None:
          print("Loading EasyOCR model...")
          reader = easyocr.Reader(['en'], gpu=False, verbose=False)
          print("EasyOCR model loaded successfully!")
      return reader

+ # --- YOUR CORE FUNCTIONS (Unchanged) ---

  def extract_text_and_bbox(image: Image.Image):
      ocr_reader = initialize_reader()
      img_array = np.array(image)
      results = ocr_reader.readtext(img_array)
+     if not results: return "No text detected in the image.", None
+     min_x, min_y, max_x, max_y = float('inf'), float('inf'), float('-inf'), float('-inf')
      text_parts = []
      for (bbox, text, prob) in results:
          text_parts.append(text)
          (tl, tr, br, bl) = bbox
+         min_x = min(min_x, tl[0], bl[0]); min_y = min(min_y, tl[1], tr[1])
+         max_x = max(max_x, tr[0], br[0]); max_y = max(max_y, bl[1], br[1])
      extracted_text = ' '.join(text_parts)
      consolidated_bbox = (int(min_x), int(min_y), int(max_x), int(max_y))
      return extracted_text, consolidated_bbox
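For reference, a minimal sketch of what the min/max consolidation computes, on made-up EasyOCR-style results (each bbox is four [x, y] corner points ordered top-left, top-right, bottom-right, bottom-left; values here are illustrative):

    results = [([[10, 5], [120, 5], [120, 40], [10, 40]], "HELLO", 0.99),
               ([[12, 50], [180, 50], [180, 90], [12, 90]], "WORLD", 0.97)]
    xs = [p[0] for box, _, _ in results for p in box]
    ys = [p[1] for box, _, _ in results for p in box]
    print((min(xs), min(ys), max(xs), max(ys)))  # (10, 5, 180, 90): one rectangle covering both detections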
 
  def translate_text_gemini(text: str) -> str:
+     if not API_KEY: raise gr.Error("GEMINI_API_KEY is not set.")
+     if not text or "No text" in text: return "No valid text to translate."
      try:
          genai.configure(api_key=API_KEY)
          model = genai.GenerativeModel('gemini-1.5-flash')
+         # Your prompt here
+         prompt = f"Translate the following English quotes into Persian. The translation should be colloquial, poetic, concise, and meaningful. Preserve the original message and tone. Avoid literal translations. Provide only the translated Persian text. Quotes: [{text}]"
          response = model.generate_content(prompt)
          return response.text.strip()
+     except Exception as e: return f"Error during translation with Gemini: {str(e)}"
+
+ # --- TEXT OVERLAY FUNCTION (RTL Logic Corrected) ---

  def render_translated_overlay(original_image: Image.Image, text_to_overlay: str, bbox: tuple) -> tuple:
      padding = 15
+     overlay_box = (max(0, bbox[0] - padding), max(0, bbox[1] - padding),
+                    min(original_image.width, bbox[2] + padding), min(original_image.height, bbox[3] + padding))
      overlay_width = overlay_box[2] - overlay_box[0]
      overlay_height = overlay_box[3] - overlay_box[1]
+
      try:
+         sample_x = max(0, int(overlay_box[0]) - 5); sample_y = int((overlay_box[1] + overlay_box[3]) / 2)
          bg_color = original_image.getpixel((sample_x, sample_y))
+     except (ValueError, IndexError): bg_color = (25, 25, 25, 255)

      overlay_layer = Image.new("RGBA", (overlay_width, overlay_height), bg_color)
      draw = ImageDraw.Draw(overlay_layer)
      target_width = overlay_width * 0.90
      font_size = 100
      final_wrapped_lines = []
+
      while font_size > 10:
          font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
+         words = text_to_overlay.split()
          if not words: break
          raw_lines = []; current_line = ""
          for word in words:
              test_line = (current_line + " " + word).strip()
+             line_width = draw.textbbox((0, 0), get_display(arabic_reshaper.reshape(test_line)), font=font)[2]
              if line_width <= target_width: current_line = test_line
              else: raw_lines.append(current_line); current_line = word
          raw_lines.append(current_line)
+         total_height = sum(draw.textbbox((0,0), get_display(arabic_reshaper.reshape(l)), font=font)[3] for l in raw_lines)
+         if total_height <= overlay_height * 0.9: final_wrapped_lines = raw_lines; break
+         else: font_size -= 2
+
+     if not final_wrapped_lines: final_wrapped_lines = raw_lines

      final_font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
      line_spacing = font_size * 0.3
+     line_heights = [draw.textbbox((0,0), get_display(arabic_reshaper.reshape(l)), font=final_font)[3] - draw.textbbox((0,0), get_display(arabic_reshaper.reshape(l)), font=final_font)[1] for l in final_wrapped_lines]
+     total_text_height = sum(line_heights) + (len(final_wrapped_lines) - 1) * line_spacing
      y_start = (overlay_height - total_text_height) / 2

      current_y = y_start
+     for i, line in enumerate(final_wrapped_lines):
+         # ### --- CHANGE --- ###: Reverted to your original, proven RTL centering logic
+         # This is the fix for the RTL text display issue.
+         reshaped_line = arabic_reshaper.reshape(line)
+         bidi_line = get_display(reshaped_line)
+
+         # Manually calculate line width and center position
+         line_bbox = draw.textbbox((0, 0), bidi_line, font=final_font)
+         line_width = line_bbox[2] - line_bbox[0]
+         x_position = (overlay_width - line_width) / 2

          # Draw shadow then text for readability
+         draw.text((x_position + 1, current_y + 1), bidi_line, font=final_font, fill=(0, 0, 0, 180))
+         draw.text((x_position, current_y), bidi_line, font=final_font, fill=(255, 255, 255, 255))

          current_y += line_heights[i] + line_spacing

      return overlay_layer, overlay_box
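A minimal illustration of the reshape-then-reorder step this function relies on: Pillow draws strings in visual order, so logical Persian text must first be converted to contextual glyph forms by arabic_reshaper and then reordered by python-bidi before draw.text sees it (the sample string is arbitrary):

    import arabic_reshaper
    from bidi.algorithm import get_display

    logical = "سلام دنیا"  # 'hello world', in logical (typing) order
    visual = get_display(arabic_reshaper.reshape(logical))
    # draw.text((x, y), visual, ...) now renders joined glyphs right-to-left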
 
+ # --- MAIN VIDEO PROCESSING PIPELINE (Now with FFMPEG) ---

  def process_video(video_path, progress=gr.Progress()):
+     if video_path is None: raise gr.Error("Please upload a video file first.")

+     progress(0, desc="Loading Video & Analyzing...")
      cap = cv2.VideoCapture(video_path)
      if not cap.isOpened(): raise gr.Error("Could not open video file.")

+     frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)); frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+     fps = cap.get(cv2.CAP_PROP_FPS); total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

      cap.set(cv2.CAP_PROP_POS_FRAMES, total_frames // 2)
      ret, middle_frame_bgr = cap.read()
      if not ret: raise gr.Error("Could not read middle frame.")
      middle_frame_rgb_pil = Image.fromarray(cv2.cvtColor(middle_frame_bgr, cv2.COLOR_BGR2RGB))

      progress(0.2, desc="Detecting Text (EasyOCR)...")
 
      translated_text = translate_text_gemini(extracted_text)
      if "Error" in translated_text: raise gr.Error(translated_text)

+     progress(0.5, desc="Rendering Translated Text Overlay...")
      overlay_stamp_pil, overlay_position_box = render_translated_overlay(middle_frame_rgb_pil, translated_text, bbox)
      overlay_stamp_cv = cv2.cvtColor(np.array(overlay_stamp_pil), cv2.COLOR_RGBA2BGRA)

+     # ### --- CHANGE --- ###: Define filenames for temporary and final outputs
+     timestamp = int(time.time())
+     temp_silent_path = f"temp_silent_{timestamp}.mp4"
+     final_output_path = f"translated_video_{timestamp}.mp4"

+     # Part 1: Create a silent video with the overlay using OpenCV
+     progress(0.6, desc="Composing Silent Video with Overlay...")
+     fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+     out = cv2.VideoWriter(temp_silent_path, fourcc, fps, (frame_width, frame_height))
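A note on this intermediate file: the 'mp4v' FourCC selects MPEG-4 Part 2, which many browsers cannot play directly. That is harmless here, since the file is only a temporary, silent intermediate; the final output is re-encoded by ffmpeg in Part 2 below (with ffmpeg's default video encoder, typically libx264 when it is available in the build).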
 
+     cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
      frame_idx = 0
      x_min, y_min, x_max, y_max = overlay_position_box

      while True:
          ret, frame = cap.read()
          if not ret: break

          roi = frame[y_min:y_max, x_min:x_max]
          stamp_h, stamp_w, _ = overlay_stamp_cv.shape
          roi_h, roi_w, _ = roi.shape
+         overlay_resized = cv2.resize(overlay_stamp_cv, (roi_w, roi_h)) if (stamp_h != roi_h or stamp_w != roi_w) else overlay_stamp_cv
+
          alpha = overlay_resized[:, :, 3] / 255.0
          alpha_mask = cv2.merge([alpha, alpha, alpha])
          blended_roi = (roi.astype(float) * (1.0 - alpha_mask) + overlay_resized[:, :, :3].astype(float) * alpha_mask)
          frame[y_min:y_max, x_min:x_max] = blended_roi.astype(np.uint8)

          out.write(frame)
          frame_idx += 1
+         progress(0.6 + (0.3 * frame_idx / total_frames), desc=f"Processing frame {frame_idx}/{total_frames}")
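The compositing above is the standard alpha "over" blend, out = roi * (1 - alpha) + overlay * alpha per pixel. An equivalent sketch using NumPy broadcasting in place of cv2.merge (same math, one fewer temporary array):

    alpha = overlay_resized[:, :, 3:4] / 255.0  # shape (h, w, 1) broadcasts across the BGR channels
    blended = roi * (1.0 - alpha) + overlay_resized[:, :, :3] * alpha
    frame[y_min:y_max, x_min:x_max] = blended.astype(np.uint8)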
 
+     cap.release(); out.release()
+
+     # ### --- CHANGE --- ###: Part 2: Use ffmpeg to combine video with original audio and add fade
+     progress(0.95, desc="Merging Audio and Applying Fade (ffmpeg)...")
+     try:
+         input_video = ffmpeg.input(temp_silent_path)
+         input_audio = ffmpeg.input(video_path)
+
+         (
+             ffmpeg
+             .output(
+                 input_video.video.filter('fade', type='in', start_time=0, duration=FADE_IN_DURATION_SECONDS),  # Apply fade-in to video stream
+                 input_audio.audio,  # Take audio stream from original
+                 final_output_path,
+                 acodec='copy',  # Copy only the audio; the faded video stream must be re-encoded (a bare c='copy' would conflict with the fade filter)
+                 shortest=None
+             )
+             .run(overwrite_output=True, quiet=True)
+         )
+     except ffmpeg.Error as e:
+         # Provide more detailed ffmpeg error logging if something goes wrong
+         print('ffmpeg stdout:', e.stdout.decode('utf8'))
+         print('ffmpeg stderr:', e.stderr.decode('utf8'))
+         raise gr.Error(f"ffmpeg error: {e.stderr.decode('utf8')}")
+     finally:
+         # Clean up the temporary silent video file
+         if os.path.exists(temp_silent_path):
+             os.remove(temp_silent_path)
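To inspect what this stream graph asks ffmpeg to do without running it, ffmpeg-python can emit the argv via .compile(); a debugging sketch with placeholder filenames:

    import ffmpeg
    v = ffmpeg.input("temp_silent.mp4").video.filter("fade", type="in", start_time=0, duration=1.0)
    a = ffmpeg.input("original.mp4").audio
    print(ffmpeg.output(v, a, "out.mp4", acodec="copy", shortest=None).compile())
    # Roughly: ffmpeg -i temp_silent.mp4 -i original.mp4 -filter_complex '...fade...' -acodec copy -shortest out.mp4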
 
+     progress(1, desc="Done!")
+     return final_output_path

+ # --- GRADIO INTERFACE (Unchanged) ---
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
      gr.Markdown("# 🎬 Persian Video Quote Translator")
+     gr.Markdown("Upload a short video with English text. The app will preserve the original audio and duration.")
      with gr.Row():
          video_input = gr.Video(label="Upload Video")
          video_output = gr.Video(label="Translated Video Output")
      translate_button = gr.Button("Translate Video", variant="primary")
+     translate_button.click(fn=process_video, inputs=[video_input], outputs=[video_output])
      gr.Markdown("---")
+     gr.Markdown("### How it works:\n1. It analyzes the middle frame to find and translate text.\n2. It generates a temporary silent video with the correctly rendered Persian text overlaid.\n3. **(New)** It uses `ffmpeg` to merge the new video with the **original audio**, apply a 1-second fade-in, and ensure the final duration matches the input.")

  if __name__ == "__main__":
      demo.launch(debug=True)