kavehtaheri committed
Commit 45c5aeb · verified · 1 Parent(s): 587bc00

Update app.py

Files changed (1)
  1. app.py +371 -140
app.py CHANGED
@@ -1,25 +1,35 @@
- # app.py

import gradio as gr
import cv2
import numpy as np
from PIL import Image, ImageDraw, ImageFont
- import easyocr
import google.generativeai as genai
- import arabic_reshaper # We only need this one
- # from bidi.algorithm import get_display # <<< REMOVED THIS LINE
import os
import time
import ffmpeg

# --- CONFIGURATION ---
- API_KEY = os.getenv("GEMINI_API_KEY", "AIzaSyCu-tb3BRDIJjUt6G5ccWmrR51tOY0VZd4")
PERSIAN_FONT_PATH = "Vazir.ttf"
FADE_IN_DURATION_SECONDS = 1.0

# --- GLOBAL INITIALIZATION ---
reader = None
- def initialize_reader():
    """Initializes the EasyOCR reader if it hasn't been already."""
    global reader
    if reader is None:
@@ -28,165 +38,286 @@ def initialize_reader():
        print("EasyOCR model loaded successfully!")
    return reader

- # --- CORE PROCESSING FUNCTIONS ---
- def extract_text_and_bbox(image: Image.Image):
-     """Extracts text and a consolidated bounding box from a PIL Image."""
-     ocr_reader = initialize_reader()
    img_array = np.array(image)
    results = ocr_reader.readtext(img_array)
-     if not results: return "No text detected in the image.", None
- 
    min_x, min_y = float('inf'), float('inf')
    max_x, max_y = float('-inf'), float('-inf')
-     text_parts = []
- 
    for (bbox, text, prob) in results:
-         text_parts.append(text)
-         (tl, tr, br, bl) = bbox
-         min_x = min(min_x, tl[0], bl[0])
-         min_y = min(min_y, tl[1], tr[1])
-         max_x = max(max_x, tr[0], br[0])
-         max_y = max(max_y, bl[1], br[1])
- 
-     extracted_text = ' '.join(text_parts)
-     consolidated_bbox = (int(min_x), int(min_y), int(max_x), int(max_y))
-     return extracted_text, consolidated_bbox
- 
- def translate_text_gemini(text: str) -> str:
-     """Translates text to colloquial Persian using the Gemini API."""
-     if not API_KEY or "YOUR_GEMINI_API_KEY_HERE" in API_KEY:
-         raise gr.Error("GEMINI_API_KEY is not set. Please add it as a Secret in your Hugging Face Space.")
-     if not text or "No text" in text:
-         return "No valid text to translate."
- 
-     try:
-         genai.configure(api_key=API_KEY)
-         model = genai.GenerativeModel('gemini-2.5-flash')
-         prompt = f"Translate the following English quotes into Persian. The translation should be colloquial, poetic, concise, and meaningful. Preserve the original message and tone. Avoid literal translations. Provide only the translated Persian text. Quotes: [{text}]"
-         response = model.generate_content(prompt)
-         return response.text.strip()
-     except Exception as e:
-         return f"Error during translation with Gemini: {str(e)}"

- # ### --- THE DEFINITIVELY CORRECTED TEXT OVERLAY FUNCTION --- ###
- # This version REMOVES `get_display` and uses `arabic_reshaper` only,
- # just like the working image script.
def render_translated_overlay(original_image: Image.Image, text_to_overlay: str, bbox: tuple) -> (Image.Image, tuple):
    """
-     Creates an overlay layer with correctly rendered, wrapped Persian text.
    """
-     padding = 15
-     overlay_box = (
-         max(0, bbox[0] - padding),
-         max(0, bbox[1] - padding),
-         min(original_image.width, bbox[2] + padding),
-         min(original_image.height, bbox[3] + padding)
-     )
-     overlay_width = overlay_box[2] - overlay_box[0]
-     overlay_height = overlay_box[3] - overlay_box[1]

    try:
-         sample_x = max(0, int(overlay_box[0]) - 5)
-         sample_y = int((overlay_box[1] + overlay_box[3]) / 2)
        bg_color = original_image.getpixel((sample_x, sample_y))
-     except (ValueError, IndexError):
-         bg_color = (25, 25, 25)

    overlay_layer = Image.new("RGBA", (overlay_width, overlay_height), bg_color)
    draw = ImageDraw.Draw(overlay_layer)

    if not os.path.exists(PERSIAN_FONT_PATH):
-         raise FileNotFoundError(f"Font file not found at '{PERSIAN_FONT_PATH}'. Please upload it to your Space.")

    target_width = overlay_width * 0.90
    target_height = overlay_height * 0.90
    font_size = 100
    final_wrapped_lines = []

    while font_size > 10:
        font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
-         words = text_to_overlay.split()
-         if not words: break
- 
-         raw_lines = []
-         current_line = ""
-         for word in words:
-             test_line = (current_line + " " + word).strip()
-             # To measure width, we MUST reshape it first. This is the key.
-             # We DO NOT use get_display().
-             reshaped_test_line = arabic_reshaper.reshape(test_line)
-             line_width = draw.textbbox((0, 0), reshaped_test_line, font=font)[2]
- 
-             if line_width <= target_width:
-                 current_line = test_line
-             else:
-                 raw_lines.append(current_line)
-                 current_line = word
-         raw_lines.append(current_line)
- 
-         line_spacing = font_size * 0.3
-         reshaped_for_height_calc = [arabic_reshaper.reshape(l) for l in raw_lines]
-         line_heights = [draw.textbbox((0,0), l, font=font)[3] - draw.textbbox((0,0), l, font=font)[1] for l in reshaped_for_height_calc]
-         total_height = sum(line_heights) + (len(raw_lines) - 1) * line_spacing
- 
-         if total_height <= target_height:
            final_wrapped_lines = raw_lines
            break
        else:
            font_size -= 2

    if not final_wrapped_lines:
-         final_wrapped_lines = [text_to_overlay]

    final_font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
-     line_spacing = font_size * 0.3
- 
-     # Reshape the final lines for drawing, WITHOUT get_display()
    final_reshaped_lines = [arabic_reshaper.reshape(l) for l in final_wrapped_lines]
-     line_heights = [draw.textbbox((0,0), l, font=final_font)[3] - draw.textbbox((0,0), l, font=final_font)[1] for l in final_reshaped_lines]
-     total_text_height = sum(line_heights) + (len(final_reshaped_lines) - 1) * line_spacing
- 
    y_start = (overlay_height - total_text_height) / 2
    current_y = y_start
- 
    for i, reshaped_line in enumerate(final_reshaped_lines):
-         x_center = overlay_width / 2
-         line_y_center = current_y + line_heights[i] / 2
- 
-         draw.text((x_center + 1, line_y_center + 1), reshaped_line, font=final_font, fill=(0, 0, 0, 180), anchor="mm")
-         draw.text((x_center, line_y_center), reshaped_line, font=final_font, fill=(255, 255, 255, 255), anchor="mm")
- 
-         current_y += line_heights[i] + line_spacing

-     return overlay_layer, overlay_box

- # --- MAIN VIDEO PROCESSING PIPELINE (No changes needed here) ---
- def process_video(video_path, progress=gr.Progress()):
-     if video_path is None: raise gr.Error("Please upload a video file first.")
- 
-     progress(0, desc="Loading Video & Analyzing...")
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened(): raise gr.Error("Could not open video file.")
- 
-     frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-     frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-     fps = cap.get(cv2.CAP_PROP_FPS)
-     total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
- 
    cap.set(cv2.CAP_PROP_POS_FRAMES, total_frames // 2)
    ret, middle_frame_bgr = cap.read()
    if not ret: raise gr.Error("Could not read middle frame.")
    middle_frame_rgb_pil = Image.fromarray(cv2.cvtColor(middle_frame_bgr, cv2.COLOR_BGR2RGB))

-     progress(0.2, desc="Detecting Text (EasyOCR)...")
-     extracted_text, bbox = extract_text_and_bbox(middle_frame_rgb_pil)
-     if bbox is None: raise gr.Error(extracted_text)

-     progress(0.4, desc="Translating Text (Gemini API)...")
-     translated_text = translate_text_gemini(extracted_text)
-     if "Error" in translated_text: raise gr.Error(translated_text)
- 
-     progress(0.5, desc="Rendering Translated Text Overlay...")
    overlay_stamp_pil, overlay_position_box = render_translated_overlay(middle_frame_rgb_pil, translated_text, bbox)
    overlay_stamp_cv = cv2.cvtColor(np.array(overlay_stamp_pil), cv2.COLOR_RGBA2BGRA)

@@ -194,39 +325,38 @@ def process_video(video_path, progress=gr.Progress()):
    temp_silent_path = f"temp_silent_{timestamp}.mp4"
    final_output_path = f"translated_video_{timestamp}.mp4"

-     progress(0.6, desc="Composing Silent Video with Overlay...")
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(temp_silent_path, fourcc, fps, (frame_width, frame_height))
- 
    cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
    frame_idx = 0
    x_min, y_min, x_max, y_max = overlay_position_box
- 
    while True:
        ret, frame = cap.read()
        if not ret: break
- 
        roi = frame[y_min:y_max, x_min:x_max]
-         alpha = overlay_stamp_cv[:, :, 3] / 255.0
        alpha_mask = cv2.merge([alpha, alpha, alpha])
-         blended_roi = (roi.astype(float) * (1.0 - alpha_mask) + overlay_stamp_cv[:, :, :3].astype(float) * alpha_mask)
        frame[y_min:y_max, x_min:x_max] = blended_roi.astype(np.uint8)
- 
        out.write(frame)
        frame_idx += 1
-         progress(0.6 + (0.3 * frame_idx / total_frames), desc=f"Processing frame {frame_idx}/{total_frames}")
- 
    cap.release(); out.release()

-     progress(0.95, desc="Merging Audio and Applying Fade (ffmpeg)...")
    try:
        input_video = ffmpeg.input(temp_silent_path)
        input_audio = ffmpeg.input(video_path).audio
- 
        (ffmpeg.output(
            input_video.video.filter('fade', type='in', start_time=0, duration=FADE_IN_DURATION_SECONDS),
            input_audio, final_output_path, vcodec='libx264', acodec='copy', shortest=None
        ).run(overwrite_output=True, quiet=True))
    except ffmpeg.Error as e:
        print('ffmpeg stdout:', e.stdout.decode('utf8', errors='ignore'))
        print('ffmpeg stderr:', e.stderr.decode('utf8', errors='ignore'))
@@ -235,21 +365,122 @@ def process_video(video_path, progress=gr.Progress()):
    if os.path.exists(temp_silent_path): os.remove(temp_silent_path)

    progress(1, desc="Done!")
-     return final_output_path
- 
- # --- GRADIO INTERFACE (No changes needed here) ---
- with gr.Blocks(theme=gr.themes.Soft(), title="Persian Video Quote Translator") as demo:
-     gr.Markdown("# 🎬 Persian Video Quote Translator")
-     gr.Markdown("Upload a short video containing English text. The app will detect the text, replace it with a poetic Persian translation, and preserve the original audio and video duration.")
    with gr.Row():
-         video_input = gr.Video(label="Upload Video")
-         video_output = gr.Video(label="Translated Video Output")
-     translate_button = gr.Button("Translate Video", variant="primary")

-     translate_button.click(fn=process_video, inputs=[video_input], outputs=[video_output])

    gr.Markdown("---")
-     gr.Markdown("### How it works:\n1. It analyzes the middle frame to find the text and its location.\n2. It uses the Gemini API to get a high-quality, poetic Persian translation.\n3. It renders the Persian text correctly onto a background that matches the original video.\n4. It composites this new text overlay onto every frame of the video.\n5. Finally, it uses `ffmpeg` to merge the new video with the **original audio** and add a 1-second fade-in effect.")

if __name__ == "__main__":
-     demo.launch(debug=True)
+ # advanced_video_transcreator_v3.4.py

import gradio as gr
import cv2
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import google.generativeai as genai
+ import arabic_reshaper
import os
import time
import ffmpeg
+ import json
+ import easyocr
+ import requests
+ import io
+ import json

+ original_dumps = json.dumps
+ def custom_dumps(*args, **kwargs):
+     kwargs['ensure_ascii'] = False
+     return original_dumps(*args, **kwargs)
+ 
+ json.dumps = custom_dumps
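
Note: the `json.dumps` override above forces `ensure_ascii=False` process-wide, presumably so Persian strings in the analysis JSON render as readable text rather than `\uXXXX` escapes (e.g., in the `gr.JSON` panel). A quick demonstration of the difference:

import json

print(json.dumps({"text": "سلام"}))                      # {"text": "\u0633\u0644\u0627\u0645"}
print(json.dumps({"text": "سلام"}, ensure_ascii=False))  # {"text": "سلام"}

Monkeypatching a stdlib function affects every library in the process; passing `ensure_ascii=False` at the relevant call sites would be the narrower alternative.
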
# --- CONFIGURATION ---
+ API_KEY = os.getenv("GEMINI_API_KEY", "AIzaSyCu-tb3BRDIJjUt6G5ccWmrR51tOY0VZd4") # Replace with your actual API key or use os.getenv
+ ONE_API_KEY = os.getenv("ONE_API_KEY", "268976:66f4f58a2a905") # Key for the Instagram download service
PERSIAN_FONT_PATH = "Vazir.ttf"
FADE_IN_DURATION_SECONDS = 1.0
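
Note: both fallback strings above embed literal API tokens in the source, which anyone can read on a public Space. The previous revision's own error message points at the safer pattern of reading secrets from the environment only. A minimal sketch of that approach (hypothetical `require_env` helper, not part of this commit):

import os

def require_env(name: str) -> str:
    # Fail fast with a clear message instead of falling back to a hardcoded key.
    value = os.getenv(name)
    if not value:
        raise RuntimeError(f"{name} is not set. Add it as a Secret in your Space settings.")
    return value

API_KEY = require_env("GEMINI_API_KEY")
ONE_API_KEY = require_env("ONE_API_KEY")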
 
# --- GLOBAL INITIALIZATION ---
reader = None
+ def initialize_easyocr_reader():
    """Initializes the EasyOCR reader if it hasn't been already."""
    global reader
    if reader is None:

        print("EasyOCR model loaded successfully!")
    return reader

+ # --- CORE AI AND VIDEO FUNCTIONS ---
+ 
+ def analyze_and_transcreate_with_gemini(video_path: str, english_caption: str, progress: gr.Progress):
+     """
+     Analyzes a video using the new comprehensive "Transcreation" prompt and extracts the result.
+     This single call performs analysis, translation, and caption generation, incorporating the user-provided English caption.
+     """
+     if not API_KEY or API_KEY == "YOUR_GEMINI_API_KEY":
+         raise gr.Error("GEMINI_API_KEY is not set.")
+ 
+     try:
+         genai.configure(api_key=API_KEY)
+         model = genai.GenerativeModel('gemini-1.5-flash')
+ 
+         progress(0.2, desc="[1/4] Performing deep analysis & transcreation with Gemini...")
+ 
+         ### MODIFIED PROMPT (Requirements 1, 2, 3: Author Name, Category Definitions, English Caption) ###
+         prompt_template = f"""
+ Objective: Analyze the provided video (containing text) across all modalities (visuals, audio, existing text) and the user-provided English caption to generate a superior Persian translation and a suitable Instagram caption. The translation must be contextually perfect, stylistically appropriate, and culturally resonant, avoiding the feel of a literal or AI-driven translation. The caption should be concise, engaging, and aligned with the video's mood, content, and the provided English caption, without hashtags.
+ 
+ User-Provided English Caption: "{english_caption if english_caption else 'No caption provided.'}"
+ 
+ Instructions:
+ 
+ 1. **Multi-Modal Analysis**: Perform a deep analysis of the video. Synthesize information from all three channels: visual, audio, and textual. Additionally, incorporate the user-provided English caption to inform the tone, context, and intent of the Instagram caption.
+ 2. **Isolate Essential Text**: Use OCR to find all text, but identify only the **core, persistent message** intended for the audience. **You MUST INCLUDE any author, poet, or famous person's name (e.g., '- Rumi') in the essential text if present.** **You MUST IGNORE temporary text such as usernames that flash on screen, watermarks, or English subtitles at the bottom of the frame.** The essential text is typically the main quote or statement that stays on screen.
+ 3. **Category Selection**: Choose the most appropriate content category based on the video's text, audio, and visuals. Use the following definitions:
+    - **MEME_HUMOR**: Videos with a white text box at the top, often containing phrases like "POV", "Me when...", or similar humorous, casual text, typically with playful or comedic intent.
+    - **COLD_MOTIVATIONAL**: Videos with dark themes (visuals or mood) and intense, strong music that evokes motivation or a driven mindset.
+    - **WISE_QUOTE**: Videos with peaceful, calm music and literary, poetic grammar, often quoting famous figures.
+    - **TWITTER_JOKE**: Videos with a casual, friendly, simple text tone, accompanied by funny or lighthearted music.
+ 4. **Synthesize and Guide**: Use the visual, audio, and textual analysis, and the English caption (if provided), to define the exact emotional and stylistic parameters for the translation and Instagram caption.
+ 5. **Instagram Caption**: Generate a concise, engaging Instagram caption in Persian that reflects the video's mood, content, cultural context, and the tone of the English caption (if provided). The caption should be standalone (not a direct translation of the text or English caption) and suitable for posting without hashtags.
+ 6. **Format Output**: Respond ONLY with a single, raw JSON object as specified below. Do not include any explanatory text before or after the JSON.
+ 7. **Author Formatting**: If an author's name is present (e.g., "- Rumi"), format the final translation so the author's name (in Persian) is on its own, separate line at the very end.
+ 
+ JSON Structure:
+ {{
+     "asset_id": "video_frame_01",
+     "content_category": "CHOOSE ONE: [MEME_HUMOR, COLD_MOTIVATIONAL, WISE_QUOTE, TWITTER_JOKE]",
+     "source_language": "en",
+     "target_language": "fa",
+     "comprehensive_analysis": {{
+         "visual_context": {{
+             "mood_and_aesthetics": "Describe the emotional mood conveyed by the visuals. (e.g., 'Somber and melancholic, uses slow zooms and a desaturated color palette to evoke a sense of loneliness.')",
+             "cinematic_style": "Describe the filming style. (e.g., 'UGC-style phone recording, shaky cam, feels raw and authentic.')",
+             "subject_matter": "Briefly describe what is happening visually, independent of the text. (e.g., 'A person is walking alone on a rainy street at night.')"
+         }},
+         "audio_context": {{
+             "music_analysis": "Describe the music's genre, tempo, and emotional impact. (e.g., 'Slow, ambient piano music, creates a feeling of introspection and sadness.')",
+             "sfx_analysis": "Describe any relevant sound effects. (e.g., 'The sound of rain and distant city ambiance is prominent, enhancing the feeling of isolation.')"
+         }},
+         "textual_context": {{
+             "full_text_detected": "The complete text from OCR, including ALL parts.",
+             "essential_text": "The core message INCLUDING author attribution if present (e.g., 'The wound is the place where the light enters you - Rumi'). THIS IS THE MOST IMPORTANT FIELD. Remember to exclude temporary usernames and subtitles."
+         }}
+     }},
+     "transcreation_directive": {{
+         "target_emotional_impact": "Synthesize the analysis above to define the precise emotion the Persian translation should evoke. (e.g., 'The translation should feel like a quiet, personal realization; a mix of sadness and acceptance, not dramatic grief.')",
+         "stylistic_guidance": {{
+             "formality": "CHOOSE ONE: [FORMAL_LITERARY, MODERN_POETIC, COLLOQUIAL_CASUAL, PROFESSIONAL_INFORMATIVE]",
+             "register": "Describe the linguistic 'flavor'. (e.g., 'Use sophisticated but natural vocabulary. Avoid slang but don't be overly academic. It should sound like a thoughtful, well-spoken friend.')"
+         }},
+         "cultural_adaptation_notes": "Provide guidance on adapting cultural nuances for a Persian audience. (e.g., 'The English concept of 'just being okay with it' can be translated to a more poetic Persian concept of resignation, like «کنار آمدن» or «پذیرفتن».')"
+     }},
+     "final_output": {{
+         "recommended_translation": "ONLY the final, high-quality Persian translation goes here. It should be the direct result of following the transcreation_directive.",
+         "translation_rationale": "Briefly explain WHY this translation was chosen, referencing the analysis.",
+         "instagram_caption": "A concise, engaging Persian caption for the Instagram post, without hashtags, reflecting the video's mood, content, and the English caption (if provided)."
+     }}
+ }}
+ """
+ 
+         video_file = genai.upload_file(path=video_path)
+         while video_file.state.name == "PROCESSING":
+             time.sleep(2)
+             video_file = genai.get_file(video_file.name)
+ 
+         if video_file.state.name == "FAILED":
+             raise gr.Error("Gemini file upload failed.")
+ 
+         response = model.generate_content([prompt_template, video_file], request_options={"timeout": 180})
+         genai.delete_file(video_file.name)
+ 
+         analysis_json_text = response.text.strip()
+         if analysis_json_text.startswith("```json"):
+             analysis_json_text = analysis_json_text[7:-3].strip()
+ 
+         analysis_data = json.loads(analysis_json_text)
+ 
+         essential_text = analysis_data.get("comprehensive_analysis", {}).get("textual_context", {}).get("essential_text", "")
+         final_translation = analysis_data.get("final_output", {}).get("recommended_translation", "")
+         instagram_caption = analysis_data.get("final_output", {}).get("instagram_caption", "")
+ 
+         if not essential_text or not final_translation or not instagram_caption:
+             raise gr.Error("Gemini analysis did not return the essential text, final translation, or Instagram caption.")
+ 
+         return analysis_data, essential_text, final_translation, instagram_caption
+ 
+     except json.JSONDecodeError:
+         error_message = f"Gemini returned invalid JSON. The response was:\n{response.text.strip()}"
+         raise gr.Error(error_message)
+     except Exception as e:
+         error_message = f"An error occurred with the Gemini API: {str(e)}"
+         raise gr.Error(error_message)

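Note: the fence handling above assumes the model reply both starts with ```json and ends with a closing ```; a reply missing the trailing fence would silently lose its last three characters to the [7:-3] slice. A slightly more defensive variant (hypothetical strip_code_fence helper, shown only as a sketch):

import re

def strip_code_fence(text: str) -> str:
    # Remove an optional ```json ... ``` (or bare ``` ... ```) wrapper around a model reply.
    match = re.match(r"^```(?:json)?\s*(.*?)\s*```$", text.strip(), re.DOTALL)
    return match.group(1) if match else text.strip()

assert strip_code_fence('```json\n{"a": 1}\n```') == '{"a": 1}'
assert strip_code_fence('{"a": 1}') == '{"a": 1}'
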
+ def detect_white_header_box(image: Image.Image, progress: gr.Progress):
+     """
+     Detects if a prominent white header box exists at the top of the video.
+     Returns the bounding box of this header if found, otherwise returns None.
+     """
+     progress(0.35, desc="[2/4] Checking for white header box...")
+     img_array = np.array(image.convert('L'))  # Convert to grayscale
+     frame_width, frame_height = image.size
+ 
+     # Analyze the top 25% of the image
+     scan_height = int(frame_height * 0.25)
+     top_section = img_array[0:scan_height, :]
+ 
+     # Threshold the image to find very light areas (potential white box)
+     _, thresh = cv2.threshold(top_section, 230, 255, cv2.THRESH_BINARY)
+ 
+     # Find contours
+     contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+ 
+     for cnt in contours:
+         x, y, w, h = cv2.boundingRect(cnt)
+         # Check if the contour is a large, wide rectangle typical of a header
+         if w > frame_width * 0.8 and h > frame_height * 0.05:
+             print(f"Detected potential white header box of size {w}x{h}.")
+             # Give it a little padding
+             padding_x = int(frame_width * 0.02)
+             padding_y = int(frame_height * 0.02)
+             final_bbox = (
+                 max(0, x - padding_x), max(0, y - padding_y),
+                 min(frame_width, x + w + padding_x), min(frame_height, y + h + padding_y)
+             )
+             print(f"Using white header as final bounding box: {final_bbox}")
+             return final_bbox
+ 
+     print("No dominant white header box found. Proceeding with standard text detection.")
+     return None

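Note: the detector is a plain threshold-plus-contour test: pixels brighter than 230 in the top quarter of the frame, grouped into a contour at least 80% of the frame width and 5% of its height. A self-contained check on a synthetic frame (dimensions and values illustrative):

import cv2
import numpy as np

frame = np.full((1920, 1080), 30, dtype=np.uint8)   # dark portrait frame, grayscale
frame[0:230, :] = 245                                # near-white header band (~12% of height)

top_section = frame[0:int(1920 * 0.25), :]           # scan only the top 25%
_, thresh = cv2.threshold(top_section, 230, 255, cv2.THRESH_BINARY)
contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

for cnt in contours:
    x, y, w, h = cv2.boundingRect(cnt)
    if w > 1080 * 0.8 and h > 1920 * 0.05:           # same acceptance rule as above
        print("header candidate:", (x, y, w, h))     # expect roughly (0, 0, 1080, 230)
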
+ def get_bbox_for_essential_text(image: Image.Image, essential_text: str, progress: gr.Progress):
+     """
+     Uses EasyOCR to find the precise bounding box for the essential text identified by Gemini.
+     """
+     progress(0.4, desc="[2/4] Locating text with EasyOCR...")
+     ocr_reader = initialize_easyocr_reader()
    img_array = np.array(image)
    results = ocr_reader.readtext(img_array)
+     if not results: raise gr.Error("EasyOCR could not detect any text on the frame.")
+ 
+     essential_words = set(char.lower() for char in essential_text if char.isalnum())
    min_x, min_y = float('inf'), float('inf')
    max_x, max_y = float('-inf'), float('-inf')
+     found_match = False
+ 
+     print(f"Gemini's essential text: '{essential_text}'")
+     print("EasyOCR Results:")
    for (bbox, text, prob) in results:
+         print(f"- Detected: '{text}'")
+         text_words = set(char.lower() for char in text if char.isalnum())
+         if len(essential_words.intersection(text_words)) > 0:
+             found_match = True
+             (tl, tr, br, bl) = bbox
+             min_x = min(min_x, tl[0], bl[0])
+             min_y = min(min_y, tl[1], tr[1])
+             max_x = max(max_x, tr[0], br[0])
+             max_y = max(max_y, bl[1], br[1])
+             print(f"  ^-- Matched! Updating consolidated bbox.")
+ 
+     if not found_match: raise gr.Error(f"EasyOCR ran but could not locate the essential text '{essential_text}' on the video frame.")
+ 
+     original_height = max_y - min_y
+     height_reduction = original_height * 0.10
+     min_y += height_reduction / 2
+     max_y -= height_reduction / 2
+     print(f"Bbox height adjusted: Reduced by {height_reduction:.2f} pixels for a tighter fit.")
+ 
+     frame_width, frame_height = image.size
+     padding_x = int(frame_width * 0.02)
+     padding_y = int(frame_height * 0.02)
+     final_bbox = (
+         max(0, int(min_x) - padding_x), max(0, int(min_y) - padding_y),
+         min(frame_width, int(max_x) + padding_x), min(frame_height, int(max_y) + padding_y)
+     )
+     print(f"Final consolidated bbox (x1, y1, x2, y2): {final_bbox}")
+     return final_bbox
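
Note: despite the `essential_words` name, the sets above contain individual characters, so any OCR fragment sharing a single alphanumeric character with the essential text widens the consolidated box. A quick illustration of how permissive that test is:

essential_text = "The wound is the place where the light enters you - Rumi"
essential_chars = set(ch.lower() for ch in essential_text if ch.isalnum())

for detected in ["the light enters", "@some_username", "12:47"]:
    detected_chars = set(ch.lower() for ch in detected if ch.isalnum())
    print(f"{detected!r} matches: {bool(essential_chars & detected_chars)}")
# 'the light enters' matches: True
# '@some_username' matches: True   (shares s, o, m, e, u, ...)
# '12:47' matches: False           (no alphanumeric overlap)

If watermarks start matching by accident, a word-level intersection (splitting on whitespace) would be the stricter drop-in check.
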
def render_translated_overlay(original_image: Image.Image, text_to_overlay: str, bbox: tuple) -> (Image.Image, tuple):
    """
+     Creates an overlay with adaptive color and robust, auto-fitting wrapped Persian text.
    """
+     overlay_width = bbox[2] - bbox[0]
+     overlay_height = bbox[3] - bbox[1]

    try:
+         sample_x = max(0, int(bbox[0]) - 5)
+         sample_y = int((bbox[1] + bbox[3]) / 2)
        bg_color = original_image.getpixel((sample_x, sample_y))
+     except (ValueError, IndexError): bg_color = (25, 25, 25)

    overlay_layer = Image.new("RGBA", (overlay_width, overlay_height), bg_color)
    draw = ImageDraw.Draw(overlay_layer)

+     luminance = (0.299 * bg_color[0] + 0.587 * bg_color[1] + 0.114 * bg_color[2])
+     if luminance > 128:
+         text_color, shadow_color = (0, 0, 0, 255), (200, 200, 200, 100)
+         print("Light background detected. Using BLACK text.")
+     else:
+         text_color, shadow_color = (255, 255, 255, 255), (0, 0, 0, 180)
+         print("Dark background detected. Using WHITE text.")
+ 
    if not os.path.exists(PERSIAN_FONT_PATH):
+         raise FileNotFoundError(f"Font file not found at '{PERSIAN_FONT_PATH}'. Please upload it.")

    target_width = overlay_width * 0.90
    target_height = overlay_height * 0.90
    font_size = 100
    final_wrapped_lines = []
+     raw_lines = text_to_overlay.split('\n')

    while font_size > 10:
        font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
+         max_line_width = 0
+         reshaped_lines_for_calc = [arabic_reshaper.reshape(l) for l in raw_lines]
+         for line in reshaped_lines_for_calc:
+             max_line_width = max(max_line_width, font.getlength(line))
+         line_heights = [font.getbbox(l)[3] for l in reshaped_lines_for_calc if l]
+         total_height = sum(line_heights) + (len(raw_lines) - 1) * (font_size * 0.3)
+         if total_height <= target_height and max_line_width <= target_width:
            final_wrapped_lines = raw_lines
            break
        else:
            font_size -= 2

    if not final_wrapped_lines:
+         font_size = 10
+         final_wrapped_lines = raw_lines
+         print("Warning: Text was too long to fit perfectly. Using minimum font size.")

    final_font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
+     print(f"Final font size: {font_size}px")
    final_reshaped_lines = [arabic_reshaper.reshape(l) for l in final_wrapped_lines]
+     line_heights_render = [final_font.getbbox(l)[3] for l in final_reshaped_lines]
+     total_text_height = sum(line_heights_render) + (len(final_reshaped_lines) - 1) * (font_size * 0.3)
    y_start = (overlay_height - total_text_height) / 2
    current_y = y_start
    for i, reshaped_line in enumerate(final_reshaped_lines):
+         line_width = final_font.getlength(reshaped_line)
+         x_position = (overlay_width - line_width) / 2
+         draw.text((x_position + 1, current_y + 1), reshaped_line, font=final_font, fill=shadow_color)
+         draw.text((x_position, current_y), reshaped_line, font=final_font, fill=text_color)
+         current_y += line_heights_render[i] + (font_size * 0.3)
+     return overlay_layer, bbox

298
+ # --- MAIN VIDEO PROCESSING PIPELINE ---
299
+ def process_video(video_path, english_caption, progress=gr.Progress()):
300
+ if video_path is None: raise gr.Error("Please upload or download a video file first.")
301
 
302
+ progress(0, desc="Starting process...")
303
+ analysis_data, essential_text, translated_text, instagram_caption = analyze_and_transcreate_with_gemini(video_path, english_caption, progress)
 
304
 
 
305
  cap = cv2.VideoCapture(video_path)
306
  if not cap.isOpened(): raise gr.Error("Could not open video file.")
307
+ frame_width, frame_height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
308
+ fps, total_frames = cap.get(cv2.CAP_PROP_FPS), int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
 
 
 
 
309
  cap.set(cv2.CAP_PROP_POS_FRAMES, total_frames // 2)
310
  ret, middle_frame_bgr = cap.read()
311
  if not ret: raise gr.Error("Could not read middle frame.")
312
  middle_frame_rgb_pil = Image.fromarray(cv2.cvtColor(middle_frame_bgr, cv2.COLOR_BGR2RGB))
313
 
314
+ # Prioritize white header box detection
315
+ bbox = detect_white_header_box(middle_frame_rgb_pil, progress)
316
+ if bbox is None:
317
+ # Fallback to the original EasyOCR method if no header is found
318
+ bbox = get_bbox_for_essential_text(middle_frame_rgb_pil, essential_text, progress)
319
 
320
+ progress(0.5, desc="[3/4] Rendering translated text overlay...")
 
 
 
 
321
  overlay_stamp_pil, overlay_position_box = render_translated_overlay(middle_frame_rgb_pil, translated_text, bbox)
322
  overlay_stamp_cv = cv2.cvtColor(np.array(overlay_stamp_pil), cv2.COLOR_RGBA2BGRA)
323
 
 
325
  temp_silent_path = f"temp_silent_{timestamp}.mp4"
326
  final_output_path = f"translated_video_{timestamp}.mp4"
327
 
328
+ progress(0.6, desc="[4/4] Composing video with overlay...")
329
  fourcc = cv2.VideoWriter_fourcc(*'mp4v')
330
  out = cv2.VideoWriter(temp_silent_path, fourcc, fps, (frame_width, frame_height))
 
331
  cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
332
  frame_idx = 0
333
  x_min, y_min, x_max, y_max = overlay_position_box
 
334
  while True:
335
  ret, frame = cap.read()
336
  if not ret: break
 
337
  roi = frame[y_min:y_max, x_min:x_max]
338
+ if roi.shape[:2] != (overlay_stamp_cv.shape[0], overlay_stamp_cv.shape[1]):
339
+ h, w = roi.shape[:2]
340
+ resized_overlay = cv2.resize(overlay_stamp_cv, (w, h))
341
+ else: resized_overlay = overlay_stamp_cv
342
+ alpha = resized_overlay[:, :, 3] / 255.0
343
  alpha_mask = cv2.merge([alpha, alpha, alpha])
344
+ blended_roi = (roi.astype(float) * (1.0 - alpha_mask) + resized_overlay[:, :, :3].astype(float) * alpha_mask)
345
  frame[y_min:y_max, x_min:x_max] = blended_roi.astype(np.uint8)
 
346
  out.write(frame)
347
  frame_idx += 1
348
+ progress(0.6 + (0.35 * frame_idx / total_frames), desc=f"Processing frame {frame_idx}/{total_frames}")
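
Note: per frame, the loop performs standard straight-alpha "over" compositing restricted to the bounding-box ROI: out = overlay·α + background·(1 − α). The same operation in isolation (synthetic arrays; NumPy broadcasting stands in for cv2.merge):

import numpy as np

rng = np.random.default_rng(0)
roi = rng.integers(0, 256, (40, 120, 3), dtype=np.uint8)   # background patch (BGR)
overlay = np.zeros((40, 120, 4), dtype=np.uint8)
overlay[:, :, :3] = 255                                    # white overlay pixels
overlay[:, :, 3] = 180                                     # ~70% opacity

alpha = overlay[:, :, 3:4] / 255.0                         # (40, 120, 1), broadcastable
blended = roi.astype(float) * (1.0 - alpha) + overlay[:, :, :3].astype(float) * alpha
print(blended.astype(np.uint8).shape)                      # (40, 120, 3)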
 
    cap.release(); out.release()

+     progress(0.95, desc="Merging Audio and Applying Fade...")
    try:
        input_video = ffmpeg.input(temp_silent_path)
        input_audio = ffmpeg.input(video_path).audio
        (ffmpeg.output(
            input_video.video.filter('fade', type='in', start_time=0, duration=FADE_IN_DURATION_SECONDS),
            input_audio, final_output_path, vcodec='libx264', acodec='copy', shortest=None
        ).run(overwrite_output=True, quiet=True))
+ 
    except ffmpeg.Error as e:
        print('ffmpeg stdout:', e.stdout.decode('utf8', errors='ignore'))
        print('ffmpeg stderr:', e.stderr.decode('utf8', errors='ignore'))

    if os.path.exists(temp_silent_path): os.remove(temp_silent_path)

    progress(1, desc="Done!")
+     return final_output_path, analysis_data, instagram_caption

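Note: to see exactly what the graph above runs, ffmpeg-python can compile the output stream to its argv list without executing it; `shortest=None` is the library's convention for passing the bare `-shortest` flag. A sketch (paths illustrative, printed argv approximate):

import ffmpeg

video = ffmpeg.input("temp_silent_demo.mp4")
audio = ffmpeg.input("original_demo.mp4").audio
stream = ffmpeg.output(
    video.video.filter('fade', type='in', start_time=0, duration=1.0),
    audio, "translated_demo.mp4",
    vcodec='libx264', acodec='copy', shortest=None,
)
print(stream.compile())
# e.g. ['ffmpeg', '-i', 'temp_silent_demo.mp4', '-i', 'original_demo.mp4',
#       '-filter_complex', '[0:v]fade=duration=1.0:start_time=0:type=in[s0]',
#       '-map', '[s0]', '-map', '1:a', '-acodec', 'copy', '-vcodec', 'libx264',
#       '-shortest', 'translated_demo.mp4']
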
+ # --- INSTAGRAM DOWNLOADER FUNCTION ---
+ def download_instagram_video(ig_url: str, progress: gr.Progress = None):
+     """Fetch video from Instagram post using One-API and save it locally."""
+     if not ig_url:
+         raise gr.Error("Please provide an Instagram URL.")
+     if not ONE_API_KEY:
+         raise gr.Error("ONE_API_KEY is not set for Instagram downloads.")
+ 
+     if progress is not None:
+         progress(0, desc="Downloading from Instagram...")
+     try:
+         shortcode = ig_url.split("/")[-2]
+         url_one = "https://api.one-api.ir/instagram/v1/post/?shortcode=" + shortcode
+         headers = {
+             "accept": "application/json",
+             "one-api-token": ONE_API_KEY,
+             "Content-Type": "application/json"
+         }
+         response = requests.get(url_one, headers=headers, timeout=30)
+         response.raise_for_status()
+ 
+         result = response.json().get("result", {})
+         media_list = result.get('media', [])
+ 
+         if not media_list:
+             raise ValueError("No media found in the API response.")
+ 
+         # Find the first video URL in the media list
+         video_url = None
+         for media_item in media_list:
+             if media_item.get("type") == "video":
+                 video_url = media_item.get("url")
+                 break
+ 
+         if not video_url:
+             raise ValueError("API response did not contain a direct video URL.")
+ 
+         if progress is not None:
+             progress(0.5, desc="Found video link. Downloading content...")
+         video_response = requests.get(video_url, stream=True, timeout=60)
+         video_response.raise_for_status()
+ 
+         # Save the video to a temporary file
+         timestamp = int(time.time())
+         local_filename = f"ig_download_{timestamp}.mp4"
+         with open(local_filename, 'wb') as f:
+             for chunk in video_response.iter_content(chunk_size=8192):
+                 f.write(chunk)
+ 
+         print(f"Instagram video successfully downloaded to {local_filename}")
+         if progress is not None:
+             progress(1, desc="Download complete!")
+         return local_filename
+ 
+     except requests.exceptions.RequestException as e:
+         raise gr.Error(f"Network error while downloading from Instagram: {str(e)}")
+     except (ValueError, KeyError) as e:
+         print(f"API parsing error: {response.text}")
+         raise gr.Error(f"Could not process the Instagram API response: {str(e)}")
+     except Exception as e:
+         raise gr.Error(f"An unexpected error occurred during Instagram download: {str(e)}")

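Note: `ig_url.split("/")[-2]` only yields the shortcode when the URL ends with a trailing slash (as in the placeholder .../p/C1a2b3Y4deF/); a missing slash or a ?igsh=... query string breaks it. A more tolerant extraction (hypothetical extract_shortcode helper, not part of this commit):

from urllib.parse import urlparse

def extract_shortcode(ig_url: str) -> str:
    # Expected path shapes: /p/<shortcode>/, /reel/<shortcode>, /tv/<shortcode>
    parts = [p for p in urlparse(ig_url).path.split("/") if p]
    if len(parts) >= 2 and parts[0] in ("p", "reel", "tv"):
        return parts[1]
    raise ValueError(f"Unrecognized Instagram URL: {ig_url}")

print(extract_shortcode("https://www.instagram.com/p/C1a2b3Y4deF/"))            # C1a2b3Y4deF
print(extract_shortcode("https://www.instagram.com/reel/C1a2b3Y4deF?igsh=xyz")) # C1a2b3Y4deF
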
+ # --- GRADIO INTERFACE (Updated) ---
+ with gr.Blocks(theme=gr.themes.Soft(), title="Advanced Video Transcreator") as demo:
+     gr.Markdown("# 🎬 Advanced Video Transcreator v3.4")
+     gr.Markdown(
+         "**This version uses a powerful multi-modal prompt for superior, context-aware 'Transcreation'.**\n\n"
+         "Upload a short video with English text, or provide an Instagram URL and an optional English caption. Clicking 'Download from URL' will download and automatically process the video. The app will analyze the video's mood, style, and caption to generate a perfectly integrated Persian translation and an Instagram caption. Author names (e.g., '- Rumi') are included in the translation and overlaid on a separate line."
+     )
+ 
    with gr.Row():
+         with gr.Column(scale=2):
+             video_input = gr.Video(label="Upload Video or Use URL Below")
+             with gr.Row():
+                 ig_url_input = gr.Textbox(label="Instagram Post URL", placeholder="e.g., https://www.instagram.com/p/C1a2b3Y4deF/")
+                 english_caption_input = gr.Textbox(label="English Caption (Optional)", placeholder="e.g., A moment of reflection with Rumi's wisdom")
+             download_button = gr.Button("Download from URL")
+         with gr.Column(scale=3):
+             video_output = gr.Video(label="Translated Video Output")
+             caption_output = gr.Textbox(label="Instagram Caption (No Hashtags)", lines=3, interactive=False)
+             json_output = gr.JSON(label="Gemini Transcreation Analysis")
+ 
+     translate_button = gr.Button("Analyze and Transcreate Video", variant="primary")
+ 
+     # Define the logic flow
+     def chain_download_and_process(ig_url, english_caption):
+         """Chains Instagram download with video processing."""
+         video_path = download_instagram_video(ig_url)
+         return process_video(video_path, english_caption)
+ 
+     download_button.click(
+         fn=chain_download_and_process,
+         inputs=[ig_url_input, english_caption_input],
+         outputs=[video_output, json_output, caption_output]
+     )

+     translate_button.click(
+         fn=process_video,
+         inputs=[video_input, english_caption_input],
+         outputs=[video_output, json_output, caption_output]
+     )
 
    gr.Markdown("---")
+     gr.Markdown(
+         "### How it works:\n"
+         "1. **Gemini Transcreation:** The video and optional English caption are sent to Gemini for a deep, multi-modal analysis. Gemini is specifically instructed to **include author names** (e.g., '- Rumi') in the essential text, **ignore temporary text** (like usernames or subtitles), and generate a Persian Instagram caption based on the video and caption input.\n"
+         "2. **Category Classification:** The app selects a category (MEME_HUMOR, COLD_MOTIVATIONAL, WISE_QUOTE, TWITTER_JOKE) based on text, audio, and visuals, using clear definitions for accurate translation.\n"
+         "3. **Smart BBox Detection:** The app first checks for a **prominent white header box**. If found, it uses that for a clean overlay. If not, it falls back to `EasyOCR` to find the *exact pixel location* of the essential text Gemini identified.\n"
+         "4. **Render & Composite:** The Persian text, including author names on a separate line, is rendered with **adaptive color** inside the detected bounding box, with a font size that's **guaranteed to fit**, and placed precisely over the original.\n"
+         "5. **Finalize with Fade-In:** The original audio is merged back into the new video, and a **1-second fade-in** is applied using `ffmpeg`.\n"
+         "6. **Instagram Caption:** A concise, culturally appropriate caption is generated, incorporating the English caption (if provided), and displayed for use with the translated video."
+     )

if __name__ == "__main__":
+     if not os.path.exists(PERSIAN_FONT_PATH):
+         print(f"WARNING: Font file '{PERSIAN_FONT_PATH}' not found. The app will likely fail. Please ensure it's in the same directory.")
+     demo.launch(debug=True)