kavehtaheri committed
Commit 606c838 · verified · 1 Parent(s): 14c703e

Update app.py

Files changed (1)
  1. app.py +272 -134
app.py CHANGED
@@ -7,57 +7,42 @@ import numpy as np
  import google.generativeai as genai
  import arabic_reshaper
  import os
- import subprocess
  import tempfile
- import shutil
- import glob
 
  # --- CONFIGURATION ---
- # IMPORTANT: Replace with your actual Gemini API key
  api_key = "AIzaSyAKI92YawOKQ1-HRLmvaryMEWk_y4alJgA"
- # Ensure the Vazir.ttf font file is in the same directory as this script
  PERSIAN_FONT_PATH = "Vazir.ttf"
 
  # --- GLOBAL INITIALIZATION ---
- # Lazily initialize the OCR reader to avoid loading it on script import
  reader = None
- translation_cache = {}
 
  def initialize_reader():
      """Initializes the EasyOCR reader if it hasn't been already."""
      global reader
-     # Output video paths need to persist outside the temporary directory for Gradio display.
-
      if reader is None:
          print("Loading EasyOCR model... (This may take a moment on first run)")
-         # We only need to detect English, as we are translating from it.
          reader = easyocr.Reader(['en'], gpu=False, verbose=False)
          print("EasyOCR model loaded successfully!")
      return reader
 
- # --- CORE FUNCTIONS ---
 
  def extract_text_and_bbox(image: Image.Image):
      """
      Extracts text from a PIL Image and calculates a single consolidated
-     bounding box for all text found. Uses resized image for faster OCR.
      """
      if image is None:
          return "Please upload an image first.", None
 
      try:
-         # Resize for faster OCR (max width 640)
-         ocr_width = 640
-         if image.width > ocr_width:
-             scale_factor = ocr_width / image.width
-             ocr_image = image.resize((ocr_width, int(image.height * scale_factor)))
-         else:
-             ocr_image = image
-             scale_factor = 1.0
-
          ocr_reader = initialize_reader()
-         img_array = np.array(ocr_image)
          results = ocr_reader.readtext(img_array)
 
          if not results:
@@ -76,18 +61,13 @@ def extract_text_and_bbox(image: Image.Image):
              max_y = max(max_y, bl[1], br[1])
 
          extracted_text = ' '.join(text_parts)
-         # Scale bbox back to original size
-         consolidated_bbox = (
-             int(min_x / scale_factor), int(min_y / scale_factor),
-             int(max_x / scale_factor), int(max_y / scale_factor)
-         )
 
          return extracted_text, consolidated_bbox
 
      except Exception as e:
          return f"Error processing image with OCR: {str(e)}", None
 
-
  def translate_text_gemini(text: str) -> str:
      """Translates text to colloquial Persian using the Gemini API."""
      if not text or "No text" in text or "Error" in text or "Please upload" in text:
@@ -96,18 +76,13 @@ def translate_text_gemini(text: str) -> str:
      try:
          genai.configure(api_key=api_key)
          model = genai.GenerativeModel('gemini-1.5-flash')
-         prompt =f"Translate the following English quotes into Persian, rephrasing only minimally if absolutely necessary for natural poetic flow, but strictly preserving the original meaning, intent, purpose, and nuances without any alterations or additions that could change the core message. Ensure the Persian versions are concise (under 20 words), deep, touching, poetic, and profound, using idiomatic Persian that evokes wisdom or inspiration while staying faithful to the source. Additionally, guarantee suitable grammar and natural sentence structure in Persian for smooth readability by native speakers, and ensure the translation conveys clear, substantive meaning that stands independently beyond its poetic tone (i.e., the wisdom or insight should be immediately understandable without relying solely on artistry). If the original quote includes an attribution (e.g., author name), incorporate it faithfully in the Persian translation on the last line, formatted similarly (e.g., ‘- Author Name -’ in Persian). Your response must contain ONLY the translated Persian texts in Perso-Arabic script, one per quote, numbered (e.g., 1., 2.) for separation, with no other text, labels, explanations, or information whatsoever Quotes: [{text}]"
 
          response = model.generate_content(prompt)
-         translated = response.text.strip()
-         # Strip numbering if present (assuming single quote)
-         if translated.startswith('1. '):
-             translated = translated[3:].strip()
-         return translated
      except Exception as e:
          return f"Error during translation: {str(e)}"
 
- # --- THE NEW AND CORRECTED IMAGE OVERLAY FUNCTION ---
  def overlay_text_on_image(original_image: Image.Image, text_to_overlay: str, bbox: tuple) -> Image.Image:
      """
      Overlays Persian text onto an image, erasing the content within the given
@@ -133,7 +108,7 @@ def overlay_text_on_image(original_image: Image.Image, text_to_overlay: str, bbo
          sample_y = int((erase_box[1] + erase_box[3]) / 2)
          bg_color = image_copy.getpixel((sample_x, sample_y))
      except (ValueError, IndexError):
-         bg_color = (255, 255, 255) # Fallback to white
 
      draw_erase.rectangle(erase_box, fill=bg_color)
 
@@ -156,9 +131,7 @@ def overlay_text_on_image(original_image: Image.Image, text_to_overlay: str, bbo
      current_line = ""
      for word in words:
          test_line = (current_line + " " + word).strip()
-         # To measure width, we MUST reshape it first. This is the key.
          reshaped_test_line = arabic_reshaper.reshape(test_line)
-         # Use textbbox for more accurate size calculation
          line_width = draw.textbbox((0, 0), reshaped_test_line, font=font)[2]
 
          if line_width <= target_width:
@@ -187,7 +160,6 @@ def overlay_text_on_image(original_image: Image.Image, text_to_overlay: str, bbo
      final_font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
      line_spacing = font_size * 0.3
 
-     # Reshape the final lines for drawing
      final_reshaped_lines = [arabic_reshaper.reshape(l) for l in final_wrapped_lines]
      line_heights = [draw.textbbox((0,0), l, font=final_font)[3] - draw.textbbox((0,0), l, font=final_font)[1] for l in final_reshaped_lines]
      total_text_height = sum(line_heights) + (len(final_reshaped_lines) - 1) * line_spacing
@@ -199,9 +171,7 @@ def overlay_text_on_image(original_image: Image.Image, text_to_overlay: str, bbo
          x_center = erase_box[0] + (erase_box[2] - erase_box[0]) / 2
          line_y_center = current_y + line_heights[i] / 2
 
-         # Draw a subtle shadow for better readability
          draw.text((x_center + 2, line_y_center + 2), reshaped_line, font=final_font, fill=(0, 0, 0, 180), anchor="mm")
-         # Draw the main text
          draw.text((x_center, line_y_center), reshaped_line, font=final_font, fill=(255, 255, 255, 255), anchor="mm")
 
          current_y += line_heights[i] + line_spacing
@@ -210,120 +180,288 @@ def overlay_text_on_image(original_image: Image.Image, text_to_overlay: str, bbo
      out_image = Image.alpha_composite(erase_layer, txt_layer)
      return out_image.convert("RGB")
 
 
- # --- VIDEO PROCESSING FUNCTION ---
-
- def process_video(video_path, progress=gr.Progress()):
-     if video_path is None:
          return None
 
-     progress(0, desc="Starting video processing...")
-
-     # Create persistent output file outside tmpdir
-     output_video_fd = tempfile.NamedTemporaryFile(suffix='.mp4', delete=False)
-     output_video_path = output_video_fd.name
-     output_video_fd.close()
-
-     with tempfile.TemporaryDirectory() as tmpdir:
-         frames_dir = os.path.join(tmpdir, 'frames')
-         out_frames_dir = os.path.join(tmpdir, 'out_frames')
-         audio_path = os.path.join(tmpdir, 'audio.mp3')
-
-         os.makedirs(frames_dir)
-         os.makedirs(out_frames_dir)
-
-         progress(0.1, desc="Extracting audio...")
-         # Extract audio (if any)
-         subprocess.run(['ffmpeg', '-y', '-i', video_path, '-vn', '-acodec', 'libmp3lame', audio_path], capture_output=True)
-
-         progress(0.2, desc="Getting video info...")
-         # Get FPS
-         ffprobe_cmd = ['ffprobe', '-v', 'error', '-select_streams', 'v:0', '-show_entries', 'stream=avg_frame_rate', '-of', 'default=noprint_wrappers=1:nokey=1', video_path]
-         fps_str = subprocess.check_output(ffprobe_cmd).decode().strip()
-         fps = eval(fps_str)
-
-         progress(0.3, desc="Extracting frames...")
-         # Extract frames
-         frame_pattern = os.path.join(frames_dir, 'frame_%06d.png')
-         subprocess.run(['ffmpeg', '-y', '-i', video_path, frame_pattern], capture_output=True, check=True)
-
-         # List and sort frames
-         frames = sorted(glob.glob(os.path.join(frames_dir, '*.png')))
-
-         progress(0.4, desc="Processing frames...")
-         num_frames = len(frames)
-         if num_frames == 0:
-             return None
-
-         prev_translated_text = None
-         prev_bbox = None
-
-         for i, frame_path in enumerate(frames):
-             progress(0.4 + (i / num_frames) * 0.4, desc=f"Processing frame {i+1}/{num_frames}")
-
-             image = Image.open(frame_path)
-             extracted_text, bbox = extract_text_and_bbox(image)
-
-             out_frame_path = os.path.join(out_frames_dir, os.path.basename(frame_path))
-
-             if bbox is None:
-                 # No text, copy original frame
-                 shutil.copy(frame_path, out_frame_path)
-                 prev_translated_text = None
-                 continue
-
-             # Check if text changed
-             if extracted_text in translation_cache:
-                 translated_text = translation_cache[extracted_text]
              else:
-                 translated_text = translate_text_gemini(extracted_text)
-                 if "Error" in translated_text:
-                     # On error, copy original
-                     shutil.copy(frame_path, out_frame_path)
-                     continue
-                 translation_cache[extracted_text] = translated_text
-
-             # If same as previous, and bbox similar, copy previous out frame (but since frames may differ, better to overlay again, but to save time, overlay is fast)
-             # Overlay is PIL, fast; OCR is slow, but we already did OCR.
-             # To further optimize, perhaps skip OCR if previous had no text, but for simplicity, keep as is.
-
-             # Overlay
-             final_image = overlay_text_on_image(image, translated_text, bbox)
-             final_image.save(out_frame_path)
-
-         progress(0.8, desc="Reassembling video...")
-         # Reassemble video
-         out_frame_pattern = os.path.join(out_frames_dir, 'frame_%06d.png')
-         video_cmd = ['ffmpeg', '-y', '-framerate', str(fps), '-i', out_frame_pattern, '-c:v', 'libx264', '-pix_fmt', 'yuv420p', output_video_path]
-
-         has_audio = os.path.exists(audio_path) and os.path.getsize(audio_path) > 0
-         if has_audio:
-             video_cmd = ['ffmpeg', '-y', '-framerate', str(fps), '-i', out_frame_pattern, '-i', audio_path, '-c:v', 'libx264', '-c:a', 'aac', '-pix_fmt', 'yuv420p', output_video_path]
-
-         subprocess.run(video_cmd, capture_output=True, check=True)
-
-         progress(1, desc="Done!")
-         # Return the output video path
-         return output_video_path
 
  # --- GRADIO INTERFACE ---
 
- with gr.Blocks(title="Persian Quote Video Translator", theme=gr.themes.Soft()) as demo:
-     gr.Markdown("# 📝 Persian Quote Video Translator")
-     gr.Markdown("Upload a video with English text. The app will automatically detect, erase, translate, and overlay the Persian text back onto each frame of the video.")
-     gr.Markdown("**Note:** For best performance on free tier, use short videos (<30s). Longer videos may take time and could reconnect.")
 
      with gr.Row():
          with gr.Column(scale=1):
-             video_input = gr.Video(label="Upload Quote Video", sources=["upload"])
          with gr.Column(scale=1):
-             video_output = gr.Video(label="Translated Video Output")
-
-     video_input.change(
-         fn=process_video,
-         inputs=[video_input],
-         outputs=[video_output]
      )
 
  if __name__ == "__main__":
      demo.launch()
  import google.generativeai as genai
  import arabic_reshaper
  import os
+ import cv2
+ from moviepy.editor import *
+ from moviepy.video.fx.all import resize, fadein, fadeout
+ from moviepy.audio.fx.all import audio_fadein, audio_fadeout
  import tempfile
+ import math
+ import random
 
  # --- CONFIGURATION ---
  api_key = "AIzaSyAKI92YawOKQ1-HRLmvaryMEWk_y4alJgA"
  PERSIAN_FONT_PATH = "Vazir.ttf"
 
  # --- GLOBAL INITIALIZATION ---
  reader = None
 
  def initialize_reader():
      """Initializes the EasyOCR reader if it hasn't been already."""
      global reader
      if reader is None:
          print("Loading EasyOCR model... (This may take a moment on first run)")
          reader = easyocr.Reader(['en'], gpu=False, verbose=False)
          print("EasyOCR model loaded successfully!")
      return reader
 
+ # --- CORE FUNCTIONS FROM YOUR ORIGINAL CODE ---
 
  def extract_text_and_bbox(image: Image.Image):
      """
      Extracts text from a PIL Image and calculates a single consolidated
+     bounding box for all text found.
      """
      if image is None:
          return "Please upload an image first.", None
 
      try:
          ocr_reader = initialize_reader()
+         img_array = np.array(image)
          results = ocr_reader.readtext(img_array)
 
          if not results:
@@ -76,18 +61,13 @@ def extract_text_and_bbox(image: Image.Image):
              max_y = max(max_y, bl[1], br[1])
 
          extracted_text = ' '.join(text_parts)
+         consolidated_bbox = (int(min_x), int(min_y), int(max_x), int(max_y))
 
          return extracted_text, consolidated_bbox
 
      except Exception as e:
          return f"Error processing image with OCR: {str(e)}", None
 
  def translate_text_gemini(text: str) -> str:
      """Translates text to colloquial Persian using the Gemini API."""
      if not text or "No text" in text or "Error" in text or "Please upload" in text:
@@ -96,18 +76,13 @@ def translate_text_gemini(text: str) -> str:
      try:
          genai.configure(api_key=api_key)
          model = genai.GenerativeModel('gemini-1.5-flash')
+         prompt = f"Translate the following English quotes into Persian, rephrasing only minimally if absolutely necessary for natural poetic flow, but strictly preserving the original meaning, intent, purpose, and nuances without any alterations or additions that could change the core message. Ensure the Persian versions are concise (under 20 words), deep, touching, poetic, and profound, using idiomatic Persian that evokes wisdom or inspiration while staying faithful to the source. Additionally, guarantee suitable grammar and natural sentence structure in Persian for smooth readability by native speakers, and ensure the translation conveys clear, substantive meaning that stands independently beyond its poetic tone (i.e., the wisdom or insight should be immediately understandable without relying solely on artistry). If the original quote includes an attribution (e.g., author name), incorporate it faithfully in the Persian translation on the last line, formatted similarly (e.g., '- Author Name -' in Persian). Your response must contain ONLY the translated Persian texts in Perso-Arabic script, one per quote, numbered (e.g., 1., 2.) for separation, with no other text, labels, explanations, or information whatsoever Quotes: [{text}]"
 
          response = model.generate_content(prompt)
+         return response.text.strip()
      except Exception as e:
          return f"Error during translation: {str(e)}"
 
  def overlay_text_on_image(original_image: Image.Image, text_to_overlay: str, bbox: tuple) -> Image.Image:
      """
      Overlays Persian text onto an image, erasing the content within the given
@@ -133,7 +108,7 @@ def overlay_text_on_image(original_image: Image.Image, text_to_overlay: str, bbo
          sample_y = int((erase_box[1] + erase_box[3]) / 2)
          bg_color = image_copy.getpixel((sample_x, sample_y))
      except (ValueError, IndexError):
+         bg_color = (255, 255, 255)
 
      draw_erase.rectangle(erase_box, fill=bg_color)
 
@@ -156,9 +131,7 @@ def overlay_text_on_image(original_image: Image.Image, text_to_overlay: str, bbo
      current_line = ""
      for word in words:
          test_line = (current_line + " " + word).strip()
          reshaped_test_line = arabic_reshaper.reshape(test_line)
          line_width = draw.textbbox((0, 0), reshaped_test_line, font=font)[2]
 
          if line_width <= target_width:
@@ -187,7 +160,6 @@ def overlay_text_on_image(original_image: Image.Image, text_to_overlay: str, bbo
      final_font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
      line_spacing = font_size * 0.3
 
      final_reshaped_lines = [arabic_reshaper.reshape(l) for l in final_wrapped_lines]
      line_heights = [draw.textbbox((0,0), l, font=final_font)[3] - draw.textbbox((0,0), l, font=final_font)[1] for l in final_reshaped_lines]
      total_text_height = sum(line_heights) + (len(final_reshaped_lines) - 1) * line_spacing
@@ -199,9 +171,7 @@ def overlay_text_on_image(original_image: Image.Image, text_to_overlay: str, bbo
          x_center = erase_box[0] + (erase_box[2] - erase_box[0]) / 2
          line_y_center = current_y + line_heights[i] / 2
 
          draw.text((x_center + 2, line_y_center + 2), reshaped_line, font=final_font, fill=(0, 0, 0, 180), anchor="mm")
          draw.text((x_center, line_y_center), reshaped_line, font=final_font, fill=(255, 255, 255, 255), anchor="mm")
 
          current_y += line_heights[i] + line_spacing
@@ -210,120 +180,288 @@ def overlay_text_on_image(original_image: Image.Image, text_to_overlay: str, bbo
      out_image = Image.alpha_composite(erase_layer, txt_layer)
      return out_image.convert("RGB")
 
+ # --- NEW VIDEO PROCESSING FUNCTIONS ---
 
+ def extract_middle_frame(video_path):
+     """Extract the middle frame from video for OCR processing."""
+     try:
+         cap = cv2.VideoCapture(video_path)
+         total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+         middle_frame_idx = total_frames // 2
+
+         cap.set(cv2.CAP_PROP_POS_FRAMES, middle_frame_idx)
+         ret, frame = cap.read()
+         cap.release()
+
+         if ret:
+             frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+             return Image.fromarray(frame_rgb)
          return None
+     except Exception as e:
+         print(f"Error extracting middle frame: {e}")
+         return None
+
+ def create_sama_intro_effect(duration=3, size=(1920, 1080), fps=30):
+     """Create a sama-style intro effect similar to the ukulele video."""
 
+     def make_frame(t):
+         # Create base frame
+         img = np.zeros((size[1], size[0], 3), dtype=np.uint8)
+
+         # Create warm gradient background
+         for i in range(size[1]):
+             warm_intensity = int(25 + 15 * math.sin(i * 0.01))
+             img[i, :] = [warm_intensity//2, warm_intensity//3, warm_intensity]
+
+         center_x, center_y = size[0]//2, size[1]//2
+
+         # Musical rhythm visualization (simulating ukulele strums)
+         beat_time = t * 4  # 4 beats per second like ukulele strumming
+         beat_intensity = abs(math.sin(beat_time * math.pi)) ** 0.5
+
+         # Create pulsing circles (like sound waves)
+         for radius_base in [100, 150, 200, 250]:
+             radius = int(radius_base + beat_intensity * 30)
+             alpha = max(0, 0.3 - (t / duration) * 0.2)
+             circle_intensity = int(alpha * 255 * beat_intensity)
+
+             if circle_intensity > 10:
+                 cv2.circle(img, (center_x, center_y), radius,
+                            (circle_intensity//3, circle_intensity//4, circle_intensity//2), 2)
+
+         # Add rotating elements (like guitar picks or musical notes)
+         for i in range(6):
+             angle = (t * 60 + i * 60) % 360  # Rotating elements
+             distance = 180 + 20 * math.sin(beat_time)
+
+             x = int(center_x + distance * math.cos(math.radians(angle)))
+             y = int(center_y + distance * math.sin(math.radians(angle)))
+
+             # Draw musical note-like shapes
+             note_size = int(8 + beat_intensity * 4)
+             cv2.circle(img, (x, y), note_size, (150, 100, 50), -1)
+             cv2.circle(img, (x, y), note_size + 2, (200, 150, 100), 2)
+
+         # Add string-like lines (simulating ukulele strings)
+         for i in range(4):
+             y_pos = center_y - 60 + i * 40
+             line_alpha = beat_intensity * 0.5
+             line_intensity = int(line_alpha * 255)
+
+             if line_intensity > 20:
+                 # Create wavy lines like vibrating strings
+                 points = []
+                 for x in range(0, size[0], 10):
+                     wave_y = y_pos + int(10 * math.sin(x * 0.02 + t * 8) * beat_intensity)
+                     points.append((x, wave_y))
+
+                 for j in range(len(points)-1):
+                     cv2.line(img, points[j], points[j+1],
+                              (line_intensity//2, line_intensity//3, line_intensity//4), 2)
+
+         # Add fade in/out effects
+         fade_alpha = 1.0
+         if t < 0.5:
+             fade_alpha = t / 0.5
+         elif t > duration - 0.5:
+             fade_alpha = (duration - t) / 0.5
+
+         img = (img * fade_alpha).astype(np.uint8)
+
+         return img
+
+     return VideoClip(make_frame, duration=duration)
 
+ def apply_text_overlay_to_frame(frame, text_to_overlay, bbox):
+     """Apply text overlay to a single frame using your existing function."""
+     pil_frame = Image.fromarray(frame)
+     overlaid_frame = overlay_text_on_image(pil_frame, text_to_overlay, bbox)
+     return np.array(overlaid_frame)
+
+ def process_video_with_text_overlay(video_path, translated_text, bbox):
+     """Process video and apply text overlay to all frames."""
+     def apply_overlay(get_frame, t):
+         frame = get_frame(t)
+         return apply_text_overlay_to_frame(frame, translated_text, bbox)
+
+     video = VideoFileClip(video_path)
+     video_with_overlay = video.fl(apply_overlay)
+     return video_with_overlay
+
+ def create_final_video_with_intro(video_path, translated_text, bbox, output_path):
+     """Create the final video with sama intro effect and original music."""
+     try:
+         # Load original video
+         original_video = VideoFileClip(video_path)
+
+         # Create intro with same dimensions as original video
+         intro_duration = 3
+         intro = create_sama_intro_effect(
+             duration=intro_duration,
+             size=(int(original_video.w), int(original_video.h)),
+             fps=original_video.fps
+         )
+         intro = intro.set_fps(original_video.fps)
+
+         # Apply text overlay to main video
+         main_video_with_text = process_video_with_text_overlay(video_path, translated_text, bbox)
+
+         # Add smooth transitions
+         intro = fadeout(intro, 0.3)
+         main_video_with_text = fadein(main_video_with_text, 0.3)
+
+         # Concatenate intro and main video
+         final_video = concatenate_videoclips([intro, main_video_with_text])
+
+         # Handle audio - extend original audio to cover intro + main video
+         if original_video.audio:
+             # Create a loop of the original audio to cover intro duration
+             original_audio = original_video.audio
+
+             # If original audio is shorter than intro, loop it
+             if original_audio.duration < intro_duration:
+                 loops_needed = int(intro_duration / original_audio.duration) + 1
+                 extended_audio = concatenate_audioclips([original_audio] * loops_needed)
+                 intro_audio = extended_audio.subclip(0, intro_duration)
              else:
+                 intro_audio = original_audio.subclip(0, intro_duration)
+
+             # Combine intro audio + full original audio
+             full_audio = concatenate_audioclips([intro_audio, original_audio])
+
+             # Apply fade effects to audio
+             full_audio = full_audio.fx(audio_fadein, 0.3).fx(audio_fadeout, 0.3)
+
+             # Set audio to final video
+             final_video = final_video.set_audio(full_audio)
+
+         # Write the final video
+         final_video.write_videofile(
+             output_path,
+             codec='libx264',
+             audio_codec='aac',
+             temp_audiofile='temp-audio.m4a',
+             remove_temp=True,
+             fps=original_video.fps,
+             preset='medium'
+         )
+
+         # Clean up
+         original_video.close()
+         final_video.close()
+
+         return output_path
+
+     except Exception as e:
+         print(f"Error creating final video: {e}")
+         return None
+
+ def process_video_pipeline(video_file):
+     """Main processing pipeline for video."""
+     if video_file is None:
+         return "Please upload a video.", "Translation will appear here.", None, None
+
+     try:
+         # Create temporary files
+         temp_input = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4')
+         temp_output = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4')
+
+         # Save uploaded video
+         with open(temp_input.name, 'wb') as f:
+             f.write(video_file)
+
+         # Extract middle frame for OCR
+         print("Extracting middle frame for OCR...")
+         middle_frame = extract_middle_frame(temp_input.name)
+         if middle_frame is None:
+             return "Error extracting frame from video.", "No text to translate.", None, None
+
+         # Extract text and bbox using your existing function
+         print("Performing OCR on middle frame...")
+         extracted_text, bbox = extract_text_and_bbox(middle_frame)
+         if bbox is None:
+             return extracted_text, "No text to translate.", middle_frame, None
+
+         # Translate text using your existing function
+         print("Translating text to Persian...")
+         translated_text = translate_text_gemini(extracted_text)
+         if "Error" in translated_text:
+             return extracted_text, translated_text, middle_frame, None
+
+         # Create final video with intro and text overlay
+         print("Creating final video with intro effect...")
+         output_path = create_final_video_with_intro(temp_input.name, translated_text, bbox, temp_output.name)
+         if output_path is None:
+             return extracted_text, translated_text, middle_frame, None
+
+         print("Video processing completed successfully!")
+         return extracted_text, translated_text, middle_frame, output_path
+
+     except Exception as e:
+         return f"Error processing video: {str(e)}", "Translation failed.", None, None
 
  # --- GRADIO INTERFACE ---
 
+ with gr.Blocks(title="Persian Video Quote Translator", theme=gr.themes.Soft()) as demo:
+     gr.Markdown("# 🎬 Persian Video Quote Translator with Sama Intro")
+     gr.Markdown("Upload a video with English text. The app will create a stylized intro effect, detect text from the middle frame, translate it to Persian, and overlay it on the entire video while preserving the original music.")
 
      with gr.Row():
          with gr.Column(scale=1):
+             video_input = gr.File(
+                 label="📹 Upload Quote Video",
+                 file_types=[".mp4", ".avi", ".mov", ".mkv", ".webm"],
+                 type="binary"
+             )
+
+             process_btn = gr.Button("🎯 Process Video", variant="primary", size="lg")
+
+             with gr.Row():
+                 text_output = gr.Textbox(
+                     label="📝 Extracted English Text",
+                     placeholder="Detected English text will appear here...",
+                     lines=3,
+                     show_copy_button=True
+                 )
+
+                 translated_output = gr.Textbox(
+                     label="🔀 Persian Translation",
+                     placeholder="Persian translation will appear here...",
+                     lines=3,
+                     show_copy_button=True
+                 )
+
          with gr.Column(scale=1):
+             frame_output = gr.Image(
+                 label="🖼️ Middle Frame (OCR Source)",
+                 type="pil"
+             )
+
+             video_output = gr.Video(
+                 label="🎥 Final Video with Sama Intro",
+                 format="mp4"
+             )
+
+     process_btn.click(
+         fn=process_video_pipeline,
+         inputs=[video_input],
+         outputs=[text_output, translated_output, frame_output, video_output]
      )
 
+     gr.Markdown("### 📋 How it works:")
+     gr.Markdown("""
+     1. **Upload** a video file containing English text
+     2. **Click** 'Process Video' to start the magic ✨
+     3. The app will:
+        - 🎼 Create a sama-style intro with musical rhythm effects (like your reference video)
+        - 👁️ Extract the middle frame and detect English text using OCR
+        - 🔄 Translate the text to beautiful Persian poetry
+        - 🎨 Overlay the Persian text on all video frames with proper styling
+        - 🎵 Preserve and extend the original audio/music throughout
+        - 🎬 Combine everything into a polished final video
+
+     **Supported formats:** MP4, AVI, MOV, MKV, WebM
+     """)
+
  if __name__ == "__main__":
      demo.launch()
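
A note on the moviepy imports in this commit: in moviepy 1.x, `from moviepy.video.fx import fadein` binds the `fadein` submodule rather than the function, so calls like `fadein(intro, 0.3)` would fail; the callable helpers live in the `.all` modules, and `audio_fadein`/`audio_fadeout` need their own import (reflected in the import lines above). A minimal sketch of the two equivalent idioms, assuming moviepy 1.x and a hypothetical input.mp4:

```python
# Sketch (moviepy 1.x): fx helpers called directly vs. via clip.fx().
# "input.mp4" and "faded.mp4" are hypothetical paths, not part of this commit.
from moviepy.editor import VideoFileClip
from moviepy.video.fx.all import fadein, fadeout
from moviepy.audio.fx.all import audio_fadein

clip = VideoFileClip("input.mp4")
clip = fadein(clip, 0.3)        # direct function call
clip = clip.fx(fadeout, 0.3)    # same effect, method-chaining style
if clip.audio is not None:
    clip = clip.set_audio(clip.audio.fx(audio_fadein, 0.3))
clip.write_videofile("faded.mp4", codec="libx264", audio_codec="aac")
```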
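process_video_with_text_overlay() leans on moviepy's `Clip.fl`, which calls the filter with `(get_frame, t)` and expects an ndarray of the same shape back, so the PIL round-trip must return RGB frames. A standalone sketch of that pattern, with a stand-in edit (grayscale instead of the Persian overlay):

```python
# Sketch (moviepy 1.x): per-frame editing via Clip.fl, as
# process_video_with_text_overlay() does; the edit below is a stand-in.
import numpy as np
from PIL import Image
from moviepy.editor import VideoFileClip

def edit_frame(frame: np.ndarray) -> np.ndarray:
    # Stand-in for overlay_text_on_image(): any PIL-based edit that
    # returns an RGB array of the same size works here.
    return np.array(Image.fromarray(frame).convert("L").convert("RGB"))

clip = VideoFileClip("input.mp4")  # hypothetical path
edited = clip.fl(lambda get_frame, t: edit_frame(get_frame(t)))
edited.write_videofile("output.mp4", codec="libx264", audio_codec="aac")
```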
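One behavioral change worth flagging: the prompt still asks Gemini to number its output ("1., 2."), but the new translate_text_gemini() returns `response.text` verbatim, dropping the old `startswith('1. ')` cleanup. If the numbering shows up in overlays, a small strip step would restore the old behavior; `strip_numbering` below is a hypothetical helper, not part of this commit:

```python
# Sketch: strip a leading "1. " / "2. " marker from the model's reply.
# strip_numbering() is a hypothetical helper, not part of this commit.
import re

def strip_numbering(translated: str) -> str:
    return re.sub(r'^\s*\d+\.\s*', '', translated.strip())

assert strip_numbering("1. some translated quote") == "some translated quote"
```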
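Finally, on the removed subprocess pipeline: it parsed ffprobe's `avg_frame_rate` with `eval()`, which executes whatever the probe prints. If that code path ever returns, `Fraction` parses the same "30000/1001" form safely; a minimal sketch, assuming ffprobe is installed and on PATH:

```python
# Sketch: parse ffprobe's avg_frame_rate (e.g. "30000/1001") without eval().
# Assumes the ffmpeg/ffprobe binaries are installed and on PATH.
import subprocess
from fractions import Fraction

def probe_fps(video_path: str) -> float:
    out = subprocess.check_output([
        'ffprobe', '-v', 'error', '-select_streams', 'v:0',
        '-show_entries', 'stream=avg_frame_rate',
        '-of', 'default=noprint_wrappers=1:nokey=1',
        video_path,
    ]).decode().strip()
    return float(Fraction(out))  # Fraction("30000/1001") ~= 29.97
```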