kavehtaheri committed (verified)
Commit ce18e78 · 1 Parent(s): 2ba7dc2

Update app.py

Files changed (1):
  1. app.py +33 -41
app.py CHANGED
@@ -10,12 +10,12 @@ import arabic_reshaper
 from bidi.algorithm import get_display
 import os
 import time
-import ffmpeg # ### --- CHANGE --- ###: Import the ffmpeg-python library
+import ffmpeg # Import the ffmpeg-python library
 
 # --- CONFIGURATION ---
 API_KEY ="AIzaSyCu-tb3BRDIJjUt6G5ccWmrR51tOY0VZd4"
-PERSIAN_FONT_PATH = "Vazir.ttf"
-FADE_IN_DURATION_SECONDS = 1.0 # The fade-in will be exactly 1 second long
+PERSIAN_FONT_PATH = "Vazir.ttf" # Make sure this font file is in your repository
+FADE_IN_DURATION_SECONDS = 1.0
 
 # --- GLOBAL INITIALIZATION ---
 reader = None
@@ -27,8 +27,7 @@ def initialize_reader():
     print("EasyOCR model loaded successfully!")
     return reader
 
-# --- YOUR CORE FUNCTIONS (Unchanged) ---
-
+# --- CORE FUNCTIONS ---
 def extract_text_and_bbox(image: Image.Image):
     ocr_reader = initialize_reader()
     img_array = np.array(image)
@@ -51,21 +50,19 @@ def translate_text_gemini(text: str) -> str:
     try:
         genai.configure(api_key=API_KEY)
         model = genai.GenerativeModel('gemini-1.5-flash')
-        # Your prompt here
         prompt =f"Translate the following English quotes into Persian. The translation should be colloquial, poetic, concise, and meaningful. Preserve the original message and tone. Avoid literal translations. Provide only the translated Persian text. Quotes: [{text}]"
         response = model.generate_content(prompt)
         return response.text.strip()
     except Exception as e: return f"Error during translation with Gemini: {str(e)}"
 
-# --- TEXT OVERLAY FUNCTION (RTL Logic Corrected) ---
-
+# --- TEXT OVERLAY FUNCTION (WITH RTL CORRECTION) ---
 def render_translated_overlay(original_image: Image.Image, text_to_overlay: str, bbox: tuple) -> Image.Image:
     padding = 15
     overlay_box = (max(0, bbox[0] - padding), max(0, bbox[1] - padding),
                    min(original_image.width, bbox[2] + padding), min(original_image.height, bbox[3] + padding))
     overlay_width = overlay_box[2] - overlay_box[0]
     overlay_height = overlay_box[3] - overlay_box[1]
-
+
     try:
         sample_x = max(0, int(overlay_box[0]) - 5); sample_y = int((overlay_box[1] + overlay_box[3]) / 2)
         bg_color = original_image.getpixel((sample_x, sample_y))
@@ -76,7 +73,7 @@ def render_translated_overlay(original_image: Image.Image, text_to_overlay: str,
     target_width = overlay_width * 0.90
     font_size = 100
     final_wrapped_lines = []
-
+
     while font_size > 10:
         font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
         words = text_to_overlay.split();
@@ -91,7 +88,7 @@ def render_translated_overlay(original_image: Image.Image, text_to_overlay: str,
         total_height = sum(draw.textbbox((0,0), get_display(arabic_reshaper.reshape(l)), font=font)[3] for l in raw_lines)
         if total_height <= overlay_height * 0.9: final_wrapped_lines = raw_lines; break
         else: font_size -= 2
-
+
     if not final_wrapped_lines: final_wrapped_lines = raw_lines
 
     final_font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
@@ -99,62 +96,55 @@ def render_translated_overlay(original_image: Image.Image, text_to_overlay: str,
     line_heights = [draw.textbbox((0,0), get_display(arabic_reshaper.reshape(l)), font=final_font)[3] - draw.textbbox((0,0), get_display(arabic_reshaper.reshape(l)), font=final_font)[1] for l in final_wrapped_lines]
     total_text_height = sum(line_heights) + (len(final_wrapped_lines) - 1) * line_spacing
     y_start = (overlay_height - total_text_height) / 2
-
+
     current_y = y_start
     for i, line in enumerate(final_wrapped_lines):
-        # ### --- CHANGE --- ###: Reverted to your original, proven RTL centering logic
-        # This is the fix for the RTL text display issue.
         reshaped_line = arabic_reshaper.reshape(line)
         bidi_line = get_display(reshaped_line)
-
-        # Manually calculate line width and center position
+
         line_bbox = draw.textbbox((0, 0), bidi_line, font=final_font)
         line_width = line_bbox[2] - line_bbox[0]
        x_position = (overlay_width - line_width) / 2
-
-        # Draw shadow then text for readability
+
        draw.text((x_position + 1, current_y + 1), bidi_line, font=final_font, fill=(0, 0, 0, 180))
        draw.text((x_position, current_y), bidi_line, font=final_font, fill=(255, 255, 255, 255))
-
+
        current_y += line_heights[i] + line_spacing
 
     return overlay_layer, overlay_box
 
-# --- MAIN VIDEO PROCESSING PIPELINE (Now with FFMPEG) ---
-
+# --- MAIN VIDEO PROCESSING PIPELINE (WITH FFMPEG CORRECTION) ---
 def process_video(video_path, progress=gr.Progress()):
     if video_path is None: raise gr.Error("Please upload a video file first.")
 
     progress(0, desc="Loading Video & Analyzing...")
     cap = cv2.VideoCapture(video_path)
     if not cap.isOpened(): raise gr.Error("Could not open video file.")
-
+
     frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)); frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
     fps = cap.get(cv2.CAP_PROP_FPS); total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-
+
     cap.set(cv2.CAP_PROP_POS_FRAMES, total_frames // 2)
     ret, middle_frame_bgr = cap.read()
     if not ret: raise gr.Error("Could not read middle frame.")
     middle_frame_rgb_pil = Image.fromarray(cv2.cvtColor(middle_frame_bgr, cv2.COLOR_BGR2RGB))
-
+
     progress(0.2, desc="Detecting Text (EasyOCR)...")
     extracted_text, bbox = extract_text_and_bbox(middle_frame_rgb_pil)
     if bbox is None: raise gr.Error(extracted_text)
-
+
     progress(0.4, desc="Translating Text (Gemini API)...")
     translated_text = translate_text_gemini(extracted_text)
     if "Error" in translated_text: raise gr.Error(translated_text)
-
+
     progress(0.5, desc="Rendering Translated Text Overlay...")
     overlay_stamp_pil, overlay_position_box = render_translated_overlay(middle_frame_rgb_pil, translated_text, bbox)
     overlay_stamp_cv = cv2.cvtColor(np.array(overlay_stamp_pil), cv2.COLOR_RGBA2BGRA)
 
-    # ### --- CHANGE --- ###: Define filenames for temporary and final outputs
     timestamp = int(time.time())
     temp_silent_path = f"temp_silent_{timestamp}.mp4"
     final_output_path = f"translated_video_{timestamp}.mp4"
 
-    # Part 1: Create a silent video with the overlay using OpenCV
     progress(0.6, desc="Composing Silent Video with Overlay...")
     fourcc = cv2.VideoWriter_fourcc(*'mp4v')
     out = cv2.VideoWriter(temp_silent_path, fourcc, fps, (frame_width, frame_height))
@@ -162,58 +152,59 @@ def process_video(video_path, progress=gr.Progress()):
     cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
     frame_idx = 0
     x_min, y_min, x_max, y_max = overlay_position_box
-
+
     while True:
         ret, frame = cap.read()
         if not ret: break
-
+
         roi = frame[y_min:y_max, x_min:x_max]
         stamp_h, stamp_w, _ = overlay_stamp_cv.shape
         roi_h, roi_w, _ = roi.shape
         overlay_resized = cv2.resize(overlay_stamp_cv, (roi_w, roi_h)) if (stamp_h != roi_h or stamp_w != roi_w) else overlay_stamp_cv
-
+
         alpha = overlay_resized[:, :, 3] / 255.0
         alpha_mask = cv2.merge([alpha, alpha, alpha])
         blended_roi = (roi.astype(float) * (1.0 - alpha_mask) + overlay_resized[:, :, :3].astype(float) * alpha_mask)
         frame[y_min:y_max, x_min:x_max] = blended_roi.astype(np.uint8)
-
+
         out.write(frame)
         frame_idx += 1
         progress(0.6 + (0.3 * frame_idx / total_frames), desc=f"Processing frame {frame_idx}/{total_frames}")
 
     cap.release(); out.release()
 
-    # ### --- CHANGE --- ###: Part 2: Use ffmpeg to combine video with original audio and add fade
     progress(0.95, desc="Merging Audio and Applying Fade (ffmpeg)...")
     try:
         input_video = ffmpeg.input(temp_silent_path)
         input_audio = ffmpeg.input(video_path)
-
+
+        # ### --- KEY CHANGE --- ###: Corrected ffmpeg command
         (
             ffmpeg
             .output(
-                input_video.video.filter('fade', type='in', start_time=0, duration=FADE_IN_DURATION_SECONDS), # Apply fade-in to video stream
+                input_video.video.filter('fade', type='in', start_time=0, duration=FADE_IN_DURATION_SECONDS),
                input_audio.audio, # Take audio stream from original
                final_output_path,
-                c='copy', # Use 'copy' for audio codec to prevent re-encoding
-                shortest=None
+                vcodec='libx264', # Explicitly set video codec for re-encoding
+                acodec='copy', # Keep audio as is without re-encoding
+                shortest=None # Ensure full video duration is used
            )
            .run(overwrite_output=True, quiet=True)
        )
    except ffmpeg.Error as e:
-        # Provide more detailed ffmpeg error logging if something goes wrong
+        # Provide detailed ffmpeg error logs for easier debugging
        print('ffmpeg stdout:', e.stdout.decode('utf8'))
        print('ffmpeg stderr:', e.stderr.decode('utf8'))
        raise gr.Error(f"ffmpeg error: {e.stderr.decode('utf8')}")
    finally:
-        # Clean up the temporary silent video file
+        # Clean up the temporary silent file regardless of success or failure
        if os.path.exists(temp_silent_path):
            os.remove(temp_silent_path)
 
    progress(1, desc="Done!")
    return final_output_path
 
-# --- GRADIO INTERFACE (Unchanged) ---
+# --- GRADIO INTERFACE ---
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎬 Persian Video Quote Translator")
    gr.Markdown("Upload a short video with English text. The app will preserve the original audio and duration.")
@@ -223,7 +214,8 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
    translate_button = gr.Button("Translate Video", variant="primary")
    translate_button.click(fn=process_video, inputs=[video_input], outputs=[video_output])
    gr.Markdown("---")
-    gr.Markdown("### How it works:\n1. It analyzes the middle frame to find and translate text.\n2. It generates a temporary silent video with the correctly rendered Persian text overlaid.\n3. **(New)** It uses `ffmpeg` to merge the new video with the **original audio**, apply a 1-second fade-in, and ensure the final duration matches the input.")
+    gr.Markdown("### How it works:\n1. It analyzes the middle frame to find and translate text.\n2. It generates a temporary silent video with the correctly rendered Persian text overlaid.\n3. It uses `ffmpeg` to merge the new video with the **original audio**, apply a 1-second fade-in, and ensure the final duration matches the input.")
 
 if __name__ == "__main__":
    demo.launch(debug=True)
+
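
Side note on the RTL logic kept in the overlay hunk: Pillow does no Arabic-script shaping of its own, so each Persian line is reshaped (arabic_reshaper) and reordered visually (bidi's get_display) before it is measured and drawn. A minimal standalone sketch of that step (the sample string is illustrative, not from the commit):

import arabic_reshaper
from bidi.algorithm import get_display

# Persian text is stored in logical order; drawn as-is, Pillow would
# render the letters detached and left-to-right. Reshaping joins the
# letter forms, and get_display reorders them for visual RTL display.
logical_line = "سلام دنیا"  # illustrative sample text
visual_line = get_display(arabic_reshaper.reshape(logical_line))
print(visual_line)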
 
 
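For reference, the corrected merge step can be previewed without executing it: ffmpeg-python can compile the stream graph into the command-line arguments it would run. A minimal sketch, assuming placeholder filenames in place of the timestamped paths used in app.py:

import ffmpeg

# Mirror the corrected pipeline: fade in the silent video, take the
# audio from the original file, re-encode video with libx264.
video = ffmpeg.input("temp_silent.mp4").video.filter(
    "fade", type="in", start_time=0, duration=1.0
)
audio = ffmpeg.input("original.mp4").audio
stream = ffmpeg.output(
    video, audio, "translated_video.mp4",
    vcodec="libx264",  # video must be re-encoded for the fade filter to apply
    acodec="copy",     # audio passes through untouched
    shortest=None,     # a None value is emitted as the bare -shortest flag
)
# Print the equivalent ffmpeg CLI invocation instead of running it.
print(" ".join(ffmpeg.compile(stream, overwrite_output=True)))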