Update app.py

app.py (CHANGED)
@@ -1,25 +1,35 @@
-#
 
 import gradio as gr
 import cv2
 import numpy as np
 from PIL import Image, ImageDraw, ImageFont
-import easyocr
 import google.generativeai as genai
-import arabic_reshaper
-# from bidi.algorithm import get_display # <<< REMOVED THIS LINE
 import os
 import time
 import ffmpeg
 
 # --- CONFIGURATION ---
-API_KEY = os.getenv("GEMINI_API_KEY", "AIzaSyCu-tb3BRDIJjUt6G5ccWmrR51tOY0VZd4")
 PERSIAN_FONT_PATH = "Vazir.ttf"
 FADE_IN_DURATION_SECONDS = 1.0
 
 # --- GLOBAL INITIALIZATION ---
 reader = None
-def initialize_reader():
     """Initializes the EasyOCR reader if it hasn't been already."""
     global reader
     if reader is None:
@@ -28,165 +38,286 @@ def initialize_reader():
         print("EasyOCR model loaded successfully!")
     return reader
 
-# --- CORE
-[…]
     img_array = np.array(image)
     results = ocr_reader.readtext(img_array)
-    if not results:
-[…]
     min_x, min_y = float('inf'), float('inf')
     max_x, max_y = float('-inf'), float('-inf')
-[…]
     for (bbox, text, prob) in results:
-[…]
-        (
-[…]
 
-# ### --- THE DEFINITIVELY CORRECTED TEXT OVERLAY FUNCTION --- ###
-# This version REMOVES `get_display` and uses `arabic_reshaper` only,
-# just like the working image script.
 def render_translated_overlay(original_image: Image.Image, text_to_overlay: str, bbox: tuple) -> (Image.Image, tuple):
     """
-    Creates an overlay
     """
-[…]
-        max(0, bbox[0] - padding),
-        max(0, bbox[1] - padding),
-        min(original_image.width, bbox[2] + padding),
-        min(original_image.height, bbox[3] + padding)
-    )
-    overlay_width = overlay_box[2] - overlay_box[0]
-    overlay_height = overlay_box[3] - overlay_box[1]
 
     try:
-        sample_x = max(0, int(
-        sample_y = int((
         bg_color = original_image.getpixel((sample_x, sample_y))
-    except (ValueError, IndexError):
-        bg_color = (25, 25, 25)
 
     overlay_layer = Image.new("RGBA", (overlay_width, overlay_height), bg_color)
     draw = ImageDraw.Draw(overlay_layer)
 
     if not os.path.exists(PERSIAN_FONT_PATH):
-        raise FileNotFoundError(f"Font file not found at '{PERSIAN_FONT_PATH}'. Please upload it
 
     target_width = overlay_width * 0.90
     target_height = overlay_height * 0.90
     font_size = 100
     final_wrapped_lines = []
 
     while font_size > 10:
         font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
-[…]
-            # To measure width, we MUST reshape it first. This is the key.
-            # We DO NOT use get_display().
-            reshaped_test_line = arabic_reshaper.reshape(test_line)
-            line_width = draw.textbbox((0, 0), reshaped_test_line, font=font)[2]
-
-            if line_width <= target_width:
-                current_line = test_line
-            else:
-                raw_lines.append(current_line)
-                current_line = word
-        raw_lines.append(current_line)
-
-        line_spacing = font_size * 0.3
-        reshaped_for_height_calc = [arabic_reshaper.reshape(l) for l in raw_lines]
-        line_heights = [draw.textbbox((0,0), l, font=font)[3] - draw.textbbox((0,0), l, font=font)[1] for l in reshaped_for_height_calc]
-        total_height = sum(line_heights) + (len(raw_lines) - 1) * line_spacing
-
-        if total_height <= target_height:
             final_wrapped_lines = raw_lines
             break
         else:
             font_size -= 2
 
     if not final_wrapped_lines:
-[…]
 
     final_font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
-[…]
-    # Reshape the final lines for drawing, WITHOUT get_display()
     final_reshaped_lines = [arabic_reshaper.reshape(l) for l in final_wrapped_lines]
-[…]
-    total_text_height = sum(
-[…]
     y_start = (overlay_height - total_text_height) / 2
     current_y = y_start
-[…]
     for i, reshaped_line in enumerate(final_reshaped_lines):
-[…]
-        draw.text((
-[…]
-        current_y += line_heights[i] + line_spacing
 
-[…]
 
-[…]
-    if video_path is None: raise gr.Error("Please upload a video file first.")
 
-    progress(0, desc="Loading Video & Analyzing...")
     cap = cv2.VideoCapture(video_path)
     if not cap.isOpened(): raise gr.Error("Could not open video file.")
-[…]
-    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-    fps = cap.get(cv2.CAP_PROP_FPS)
-    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-[…]
     cap.set(cv2.CAP_PROP_POS_FRAMES, total_frames // 2)
     ret, middle_frame_bgr = cap.read()
     if not ret: raise gr.Error("Could not read middle frame.")
     middle_frame_rgb_pil = Image.fromarray(cv2.cvtColor(middle_frame_bgr, cv2.COLOR_BGR2RGB))
 
-[…]
-    if bbox is None:
 
-    progress(0.
-    translated_text = translate_text_gemini(extracted_text)
-    if "Error" in translated_text: raise gr.Error(translated_text)
-
-    progress(0.5, desc="Rendering Translated Text Overlay...")
     overlay_stamp_pil, overlay_position_box = render_translated_overlay(middle_frame_rgb_pil, translated_text, bbox)
     overlay_stamp_cv = cv2.cvtColor(np.array(overlay_stamp_pil), cv2.COLOR_RGBA2BGRA)
 
@@ -194,39 +325,38 @@ def process_video(video_path, progress=gr.Progress()):
     temp_silent_path = f"temp_silent_{timestamp}.mp4"
     final_output_path = f"translated_video_{timestamp}.mp4"
 
-    progress(0.6, desc="Composing
     fourcc = cv2.VideoWriter_fourcc(*'mp4v')
     out = cv2.VideoWriter(temp_silent_path, fourcc, fps, (frame_width, frame_height))
-[…]
     cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
     frame_idx = 0
     x_min, y_min, x_max, y_max = overlay_position_box
-[…]
     while True:
         ret, frame = cap.read()
         if not ret: break
-[…]
         roi = frame[y_min:y_max, x_min:x_max]
-[…]
         alpha_mask = cv2.merge([alpha, alpha, alpha])
-        blended_roi = (roi.astype(float) * (1.0 - alpha_mask) +
         frame[y_min:y_max, x_min:x_max] = blended_roi.astype(np.uint8)
-[…]
         out.write(frame)
         frame_idx += 1
-        progress(0.6 + (0.
-[…]
     cap.release(); out.release()
 
-    progress(0.95, desc="Merging Audio and Applying Fade
     try:
         input_video = ffmpeg.input(temp_silent_path)
         input_audio = ffmpeg.input(video_path).audio
-[…]
         (ffmpeg.output(
             input_video.video.filter('fade', type='in', start_time=0, duration=FADE_IN_DURATION_SECONDS),
             input_audio, final_output_path, vcodec='libx264', acodec='copy', shortest=None
         ).run(overwrite_output=True, quiet=True))
     except ffmpeg.Error as e:
         print('ffmpeg stdout:', e.stdout.decode('utf8', errors='ignore'))
         print('ffmpeg stderr:', e.stderr.decode('utf8', errors='ignore'))
@@ -235,21 +365,122 @@ def process_video(video_path, progress=gr.Progress()):
         if os.path.exists(temp_silent_path): os.remove(temp_silent_path)
 
     progress(1, desc="Done!")
-    return final_output_path
-[…]
-# ---
-[…]
     with gr.Row():
-[…]
 
-    translate_button.click(
-[…]
 
     gr.Markdown("---")
-    gr.Markdown(
-[…]
 
 if __name__ == "__main__":
-[…]
+# advanced_video_transcreator_v3.4.py
 
 import gradio as gr
 import cv2
 import numpy as np
 from PIL import Image, ImageDraw, ImageFont
 import google.generativeai as genai
+import arabic_reshaper
 import os
 import time
 import ffmpeg
+import json
+import easyocr
+import requests
+import io
 
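+# NOTE: json.dumps is monkey-patched below with ensure_ascii=False forced on, so that
+# Persian text is rendered readably (e.g. in the Gradio JSON panel) instead of as
+# \uXXXX escape sequences.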
+original_dumps = json.dumps
+def custom_dumps(*args, **kwargs):
+    kwargs['ensure_ascii'] = False
+    return original_dumps(*args, **kwargs)
+
+json.dumps = custom_dumps
 # --- CONFIGURATION ---
+API_KEY = os.getenv("GEMINI_API_KEY", "AIzaSyCu-tb3BRDIJjUt6G5ccWmrR51tOY0VZd4")  # Replace with your actual API key or use os.getenv
+ONE_API_KEY = os.getenv("ONE_API_KEY", "268976:66f4f58a2a905")  # Key for the Instagram download service
 PERSIAN_FONT_PATH = "Vazir.ttf"
 FADE_IN_DURATION_SECONDS = 1.0
 
 # --- GLOBAL INITIALIZATION ---
 reader = None
+def initialize_easyocr_reader():
     """Initializes the EasyOCR reader if it hasn't been already."""
     global reader
     if reader is None:
         […]
         print("EasyOCR model loaded successfully!")
     return reader
 
+# --- CORE AI AND VIDEO FUNCTIONS ---
+
+def analyze_and_transcreate_with_gemini(video_path: str, english_caption: str, progress: gr.Progress):
+    """
+    Analyzes a video using the new comprehensive "Transcreation" prompt and extracts the result.
+    This single call performs analysis, translation, and caption generation, incorporating the user-provided English caption.
+    """
+    if not API_KEY or API_KEY == "YOUR_GEMINI_API_KEY":
+        raise gr.Error("GEMINI_API_KEY is not set.")
+
+    try:
+        genai.configure(api_key=API_KEY)
+        model = genai.GenerativeModel('gemini-1.5-flash')
+
+        progress(0.2, desc="[1/4] Performing deep analysis & transcreation with Gemini...")
+
+        ### MODIFIED PROMPT (Requirements 1, 2, 3: Author Name, Category Definitions, English Caption) ###
+        prompt_template = f"""
+        Objective: Analyze the provided video (containing text) across all modalities (visuals, audio, existing text) and the user-provided English caption to generate a superior Persian translation and a suitable Instagram caption. The translation must be contextually perfect, stylistically appropriate, and culturally resonant, avoiding the feel of a literal or AI-driven translation. The caption should be concise, engaging, and aligned with the video's mood, content, and the provided English caption, without hashtags.
+
+        User-Provided English Caption: "{english_caption if english_caption else 'No caption provided.'}"
+
+        Instructions:
+
+        1. **Multi-Modal Analysis**: Perform a deep analysis of the video. Synthesize information from all three channels: visual, audio, and textual. Additionally, incorporate the user-provided English caption to inform the tone, context, and intent of the Instagram caption.
+        2. **Isolate Essential Text**: Use OCR to find all text, but identify only the **core, persistent message** intended for the audience. **You MUST INCLUDE any author, poet, or famous person's name (e.g., '- Rumi') in the essential text if present.** **You MUST IGNORE temporary text such as usernames that flash on screen, watermarks, or English subtitles at the bottom of the frame.** The essential text is typically the main quote or statement that stays on screen.
+        3. **Category Selection**: Choose the most appropriate content category based on the video's text, audio, and visuals. Use the following definitions:
+           - **MEME_HUMOR**: Videos with a white text box at the top, often containing phrases like "POV", "Me when...", or similar humorous, casual text, typically with playful or comedic intent.
+           - **COLD_MOTIVATIONAL**: Videos with dark themes (visuals or mood) and intense, strong music that evokes motivation or a driven mindset.
+           - **WISE_QUOTE**: Videos with peaceful, calm music and literary, poetic grammar, often quoting famous figures.
+           - **TWITTER_JOKE**: Videos with a casual, friendly, simple text tone, accompanied by funny or lighthearted music.
+        4. **Synthesize and Guide**: Use the visual, audio, and textual analysis, together with the English caption (if provided), to define the exact emotional and stylistic parameters for the translation and Instagram caption.
+        5. **Instagram Caption**: Generate a concise, engaging Instagram caption in Persian that reflects the video's mood, content, cultural context, and the tone of the English caption (if provided). The caption should be standalone (not a direct translation of the text or English caption) and suitable for posting without hashtags.
+        6. **Format Output**: Respond ONLY with a single, raw JSON object as specified below. Do not include any explanatory text before or after the JSON.
+        7. **Author Formatting**: If an author's name is present (e.g., "- Rumi"), format the final translation so the author's name (in Persian) is on its own, separate line at the very end.
+
+        JSON Structure:
+        {{
+            "asset_id": "video_frame_01",
+            "content_category": "CHOOSE ONE: [MEME_HUMOR, COLD_MOTIVATIONAL, WISE_QUOTE, TWITTER_JOKE]",
+            "source_language": "en",
+            "target_language": "fa",
+            "comprehensive_analysis": {{
+                "visual_context": {{
+                    "mood_and_aesthetics": "Describe the emotional mood conveyed by the visuals. (e.g., 'Somber and melancholic, uses slow zooms and a desaturated color palette to evoke a sense of loneliness.')",
+                    "cinematic_style": "Describe the filming style. (e.g., 'UGC-style phone recording, shaky cam, feels raw and authentic.')",
+                    "subject_matter": "Briefly describe what is happening visually, independent of the text. (e.g., 'A person is walking alone on a rainy street at night.')"
+                }},
+                "audio_context": {{
+                    "music_analysis": "Describe the music's genre, tempo, and emotional impact. (e.g., 'Slow, ambient piano music, creates a feeling of introspection and sadness.')",
+                    "sfx_analysis": "Describe any relevant sound effects. (e.g., 'The sound of rain and distant city ambiance is prominent, enhancing the feeling of isolation.')"
+                }},
+                "textual_context": {{
+                    "full_text_detected": "The complete text from OCR, including ALL parts.",
+                    "essential_text": "The core message INCLUDING author attribution if present (e.g., 'The wound is the place where the light enters you - Rumi'). THIS IS THE MOST IMPORTANT FIELD. Remember to exclude temporary usernames and subtitles."
+                }}
+            }},
+            "transcreation_directive": {{
+                "target_emotional_impact": "Synthesize the analysis above to define the precise emotion the Persian translation should evoke. (e.g., 'The translation should feel like a quiet, personal realization; a mix of sadness and acceptance, not dramatic grief.')",
+                "stylistic_guidance": {{
+                    "formality": "CHOOSE ONE: [FORMAL_LITERARY, MODERN_POETIC, COLLOQUIAL_CASUAL, PROFESSIONAL_INFORMATIVE]",
+                    "register": "Describe the linguistic 'flavor'. (e.g., 'Use sophisticated but natural vocabulary. Avoid slang but don't be overly academic. It should sound like a thoughtful, well-spoken friend.')"
+                }},
+                "cultural_adaptation_notes": "Provide guidance on adapting cultural nuances for a Persian audience. (e.g., 'The English concept of 'just being okay with it' can be translated to a more poetic Persian concept of resignation, like «کنار آمدن» or «پذیرفتن».')"
+            }},
+            "final_output": {{
+                "recommended_translation": "ONLY the final, high-quality Persian translation goes here. It should be the direct result of following the transcreation_directive.",
+                "translation_rationale": "Briefly explain WHY this translation was chosen, referencing the analysis.",
+                "instagram_caption": "A concise, engaging Persian caption for the Instagram post, without hashtags, reflecting the video's mood, content, and the English caption (if provided)."
+            }}
+        }}
+        """
+
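+        # Upload the video to the Gemini Files API and poll until server-side
+        # processing completes before prompting against it.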
+        video_file = genai.upload_file(path=video_path)
+        while video_file.state.name == "PROCESSING":
+            time.sleep(2)
+            video_file = genai.get_file(video_file.name)
+
+        if video_file.state.name == "FAILED":
+            raise gr.Error("Gemini file upload failed.")
+
+        response = model.generate_content([prompt_template, video_file], request_options={"timeout": 180})
+        genai.delete_file(video_file.name)
+
+        analysis_json_text = response.text.strip()
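+        # Strip a Markdown ```json ... ``` fence if Gemini wrapped its response in one.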
+        if analysis_json_text.startswith("```json"):
+            analysis_json_text = analysis_json_text[7:-3].strip()
+
+        analysis_data = json.loads(analysis_json_text)
+
+        essential_text = analysis_data.get("comprehensive_analysis", {}).get("textual_context", {}).get("essential_text", "")
+        final_translation = analysis_data.get("final_output", {}).get("recommended_translation", "")
+        instagram_caption = analysis_data.get("final_output", {}).get("instagram_caption", "")
+
+        if not essential_text or not final_translation or not instagram_caption:
+            raise gr.Error("Gemini analysis did not return the essential text, final translation, or Instagram caption.")
+
+        return analysis_data, essential_text, final_translation, instagram_caption
+
+    except json.JSONDecodeError:
+        error_message = f"Gemini returned invalid JSON. The response was:\n{response.text.strip()}"
+        raise gr.Error(error_message)
+    except Exception as e:
+        error_message = f"An error occurred with the Gemini API: {str(e)}"
+        raise gr.Error(error_message)
+
+def detect_white_header_box(image: Image.Image, progress: gr.Progress):
+    """
+    Detects if a prominent white header box exists at the top of the video.
+    Returns the bounding box of this header if found, otherwise returns None.
+    """
+    progress(0.35, desc="[2/4] Checking for white header box...")
+    img_array = np.array(image.convert('L'))  # Convert to grayscale
+    frame_width, frame_height = image.size
+
+    # Analyze the top 25% of the image
+    scan_height = int(frame_height * 0.25)
+    top_section = img_array[0:scan_height, :]
+
+    # Threshold the image to find very light areas (potential white box)
+    _, thresh = cv2.threshold(top_section, 230, 255, cv2.THRESH_BINARY)
+
+    # Find contours
+    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+
+    for cnt in contours:
+        x, y, w, h = cv2.boundingRect(cnt)
+        # Check if the contour is a large, wide rectangle typical of a header
+        if w > frame_width * 0.8 and h > frame_height * 0.05:
+            print(f"Detected potential white header box of size {w}x{h}.")
+            # Give it a little padding
+            padding_x = int(frame_width * 0.02)
+            padding_y = int(frame_height * 0.02)
+            final_bbox = (
+                max(0, x - padding_x), max(0, y - padding_y),
+                min(frame_width, x + w + padding_x), min(frame_height, y + h + padding_y)
+            )
+            print(f"Using white header as final bounding box: {final_bbox}")
+            return final_bbox
+
+    print("No dominant white header box found. Proceeding with standard text detection.")
+    return None
+
+def get_bbox_for_essential_text(image: Image.Image, essential_text: str, progress: gr.Progress):
+    """
+    Uses EasyOCR to find the precise bounding box for the essential text identified by Gemini.
+    """
+    progress(0.4, desc="[2/4] Locating text with EasyOCR...")
+    ocr_reader = initialize_easyocr_reader()
     img_array = np.array(image)
     results = ocr_reader.readtext(img_array)
+    if not results: raise gr.Error("EasyOCR could not detect any text on the frame.")
+
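+    # Heuristic match: compares sets of alphanumeric *characters* (not whole words), so
+    # any OCR line sharing characters with Gemini's essential text is folded into the bbox.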
+    essential_words = set(char.lower() for char in essential_text if char.isalnum())
     min_x, min_y = float('inf'), float('inf')
     max_x, max_y = float('-inf'), float('-inf')
+    found_match = False
+
+    print(f"Gemini's essential text: '{essential_text}'")
+    print("EasyOCR Results:")
     for (bbox, text, prob) in results:
+        print(f"- Detected: '{text}'")
+        text_words = set(char.lower() for char in text if char.isalnum())
+        if len(essential_words.intersection(text_words)) > 0:
+            found_match = True
+            (tl, tr, br, bl) = bbox
+            min_x = min(min_x, tl[0], bl[0])
+            min_y = min(min_y, tl[1], tr[1])
+            max_x = max(max_x, tr[0], br[0])
+            max_y = max(max_y, bl[1], br[1])
+            print(f" ^-- Matched! Updating consolidated bbox.")
+
+    if not found_match: raise gr.Error(f"EasyOCR ran but could not locate the essential text '{essential_text}' on the video frame.")
+
+    original_height = max_y - min_y
+    height_reduction = original_height * 0.10
+    min_y += height_reduction / 2
+    max_y -= height_reduction / 2
+    print(f"Bbox height adjusted: Reduced by {height_reduction:.2f} pixels for a tighter fit.")
+
+    frame_width, frame_height = image.size
+    padding_x = int(frame_width * 0.02)
+    padding_y = int(frame_height * 0.02)
+    final_bbox = (
+        max(0, int(min_x) - padding_x), max(0, int(min_y) - padding_y),
+        min(frame_width, int(max_x) + padding_x), min(frame_height, int(max_y) + padding_y)
+    )
+    print(f"Final consolidated bbox (x1, y1, x2, y2): {final_bbox}")
+    return final_bbox
 
 def render_translated_overlay(original_image: Image.Image, text_to_overlay: str, bbox: tuple) -> (Image.Image, tuple):
     """
+    Creates an overlay with adaptive color and robust, auto-fitting wrapped Persian text.
     """
+    overlay_width = bbox[2] - bbox[0]
+    overlay_height = bbox[3] - bbox[1]
 
     try:
+        sample_x = max(0, int(bbox[0]) - 5)
+        sample_y = int((bbox[1] + bbox[3]) / 2)
         bg_color = original_image.getpixel((sample_x, sample_y))
+    except (ValueError, IndexError): bg_color = (25, 25, 25)
 
     overlay_layer = Image.new("RGBA", (overlay_width, overlay_height), bg_color)
     draw = ImageDraw.Draw(overlay_layer)
 
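+    # Perceived brightness from the BT.601 luma weights (0.299 R, 0.587 G, 0.114 B),
+    # used to pick a text color that contrasts with the sampled background.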
+    luminance = (0.299 * bg_color[0] + 0.587 * bg_color[1] + 0.114 * bg_color[2])
+    if luminance > 128:
+        text_color, shadow_color = (0, 0, 0, 255), (200, 200, 200, 100)
+        print("Light background detected. Using BLACK text.")
+    else:
+        text_color, shadow_color = (255, 255, 255, 255), (0, 0, 0, 180)
+        print("Dark background detected. Using WHITE text.")
+
     if not os.path.exists(PERSIAN_FONT_PATH):
+        raise FileNotFoundError(f"Font file not found at '{PERSIAN_FONT_PATH}'. Please upload it.")
 
     target_width = overlay_width * 0.90
     target_height = overlay_height * 0.90
     font_size = 100
     final_wrapped_lines = []
+    raw_lines = text_to_overlay.split('\n')
 
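+    # Auto-fit: shrink the font until every reshaped line fits within 90% of the overlay box.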
     while font_size > 10:
         font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
+        max_line_width = 0
+        reshaped_lines_for_calc = [arabic_reshaper.reshape(l) for l in raw_lines]
+        for line in reshaped_lines_for_calc:
+            max_line_width = max(max_line_width, font.getlength(line))
+        line_heights = [font.getbbox(l)[3] for l in reshaped_lines_for_calc if l]
+        total_height = sum(line_heights) + (len(raw_lines) - 1) * (font_size * 0.3)
+        if total_height <= target_height and max_line_width <= target_width:
             final_wrapped_lines = raw_lines
             break
         else:
             font_size -= 2
 
     if not final_wrapped_lines:
+        font_size = 10
+        final_wrapped_lines = raw_lines
+        print("Warning: Text was too long to fit perfectly. Using minimum font size.")
 
     final_font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
+    print(f"Final font size: {font_size}px")
     final_reshaped_lines = [arabic_reshaper.reshape(l) for l in final_wrapped_lines]
+    line_heights_render = [final_font.getbbox(l)[3] for l in final_reshaped_lines]
+    total_text_height = sum(line_heights_render) + (len(final_reshaped_lines) - 1) * (font_size * 0.3)
     y_start = (overlay_height - total_text_height) / 2
     current_y = y_start
     for i, reshaped_line in enumerate(final_reshaped_lines):
+        line_width = final_font.getlength(reshaped_line)
+        x_position = (overlay_width - line_width) / 2
+        draw.text((x_position + 1, current_y + 1), reshaped_line, font=final_font, fill=shadow_color)
+        draw.text((x_position, current_y), reshaped_line, font=final_font, fill=text_color)
+        current_y += line_heights_render[i] + (font_size * 0.3)
+    return overlay_layer, bbox
 
+# --- MAIN VIDEO PROCESSING PIPELINE ---
+def process_video(video_path, english_caption, progress=gr.Progress()):
+    if video_path is None: raise gr.Error("Please upload or download a video file first.")
 
+    progress(0, desc="Starting process...")
+    analysis_data, essential_text, translated_text, instagram_caption = analyze_and_transcreate_with_gemini(video_path, english_caption, progress)
 
     cap = cv2.VideoCapture(video_path)
     if not cap.isOpened(): raise gr.Error("Could not open video file.")
+    frame_width, frame_height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+    fps, total_frames = cap.get(cv2.CAP_PROP_FPS), int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
     cap.set(cv2.CAP_PROP_POS_FRAMES, total_frames // 2)
     ret, middle_frame_bgr = cap.read()
     if not ret: raise gr.Error("Could not read middle frame.")
     middle_frame_rgb_pil = Image.fromarray(cv2.cvtColor(middle_frame_bgr, cv2.COLOR_BGR2RGB))
 
+    # Prioritize white header box detection
+    bbox = detect_white_header_box(middle_frame_rgb_pil, progress)
+    if bbox is None:
+        # Fall back to the original EasyOCR method if no header is found
+        bbox = get_bbox_for_essential_text(middle_frame_rgb_pil, essential_text, progress)
 
+    progress(0.5, desc="[3/4] Rendering translated text overlay...")
     overlay_stamp_pil, overlay_position_box = render_translated_overlay(middle_frame_rgb_pil, translated_text, bbox)
     overlay_stamp_cv = cv2.cvtColor(np.array(overlay_stamp_pil), cv2.COLOR_RGBA2BGRA)
 
     […]
     temp_silent_path = f"temp_silent_{timestamp}.mp4"
     final_output_path = f"translated_video_{timestamp}.mp4"
 
+    progress(0.6, desc="[4/4] Composing video with overlay...")
     fourcc = cv2.VideoWriter_fourcc(*'mp4v')
     out = cv2.VideoWriter(temp_silent_path, fourcc, fps, (frame_width, frame_height))
     cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
     frame_idx = 0
     x_min, y_min, x_max, y_max = overlay_position_box
     while True:
         ret, frame = cap.read()
         if not ret: break
         roi = frame[y_min:y_max, x_min:x_max]
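+        # If the ROI and the overlay stamp differ in size (e.g. off-by-one rounding),
+        # resize the stamp to match the ROI exactly before blending.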
+        if roi.shape[:2] != (overlay_stamp_cv.shape[0], overlay_stamp_cv.shape[1]):
+            h, w = roi.shape[:2]
+            resized_overlay = cv2.resize(overlay_stamp_cv, (w, h))
+        else: resized_overlay = overlay_stamp_cv
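+        # Standard alpha compositing, out = background * (1 - alpha) + overlay * alpha,
+        # with the overlay's alpha channel scaled to [0, 1].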
+        alpha = resized_overlay[:, :, 3] / 255.0
         alpha_mask = cv2.merge([alpha, alpha, alpha])
+        blended_roi = (roi.astype(float) * (1.0 - alpha_mask) + resized_overlay[:, :, :3].astype(float) * alpha_mask)
         frame[y_min:y_max, x_min:x_max] = blended_roi.astype(np.uint8)
         out.write(frame)
         frame_idx += 1
+        progress(0.6 + (0.35 * frame_idx / total_frames), desc=f"Processing frame {frame_idx}/{total_frames}")
     cap.release(); out.release()
 
+    progress(0.95, desc="Merging Audio and Applying Fade...")
     try:
         input_video = ffmpeg.input(temp_silent_path)
         input_audio = ffmpeg.input(video_path).audio
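+        # The video stream is re-encoded with libx264 (adding the fade-in filter) while
+        # the original audio track is stream-copied unchanged.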
         (ffmpeg.output(
             input_video.video.filter('fade', type='in', start_time=0, duration=FADE_IN_DURATION_SECONDS),
             input_audio, final_output_path, vcodec='libx264', acodec='copy', shortest=None
         ).run(overwrite_output=True, quiet=True))
+
     except ffmpeg.Error as e:
         print('ffmpeg stdout:', e.stdout.decode('utf8', errors='ignore'))
         print('ffmpeg stderr:', e.stderr.decode('utf8', errors='ignore'))
     […]
         if os.path.exists(temp_silent_path): os.remove(temp_silent_path)
 
     progress(1, desc="Done!")
+    return final_output_path, analysis_data, instagram_caption
+
+# --- INSTAGRAM DOWNLOADER FUNCTION ---
+def download_instagram_video(ig_url: str, progress: gr.Progress = None):
+    """Fetch video from Instagram post using One-API and save it locally."""
+    if not ig_url:
+        raise gr.Error("Please provide an Instagram URL.")
+    if not ONE_API_KEY:
+        raise gr.Error("ONE_API_KEY is not set for Instagram downloads.")
+
+    if progress is not None:
+        progress(0, desc="Downloading from Instagram...")
+    try:
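+        # NOTE: assumes a URL of the form https://www.instagram.com/p/<shortcode>/
+        # (with a trailing slash); other URL shapes would need more robust parsing.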
+        shortcode = ig_url.split("/")[-2]
+        url_one = "https://api.one-api.ir/instagram/v1/post/?shortcode=" + shortcode
+        headers = {
+            "accept": "application/json",
+            "one-api-token": ONE_API_KEY,
+            "Content-Type": "application/json"
+        }
+        response = requests.get(url_one, headers=headers, timeout=30)
+        response.raise_for_status()
+
+        result = response.json().get("result", {})
+        media_list = result.get('media', [])
+
+        if not media_list:
+            raise ValueError("No media found in the API response.")
+
+        # Find the first video URL in the media list
+        video_url = None
+        for media_item in media_list:
+            if media_item.get("type") == "video":
+                video_url = media_item.get("url")
+                break
+
+        if not video_url:
+            raise ValueError("API response did not contain a direct video URL.")
+
+        if progress is not None:
+            progress(0.5, desc="Found video link. Downloading content...")
+        video_response = requests.get(video_url, stream=True, timeout=60)
+        video_response.raise_for_status()
+
+        # Save the video to a temporary file
+        timestamp = int(time.time())
+        local_filename = f"ig_download_{timestamp}.mp4"
+        with open(local_filename, 'wb') as f:
+            for chunk in video_response.iter_content(chunk_size=8192):
+                f.write(chunk)
+
+        print(f"Instagram video successfully downloaded to {local_filename}")
+        if progress is not None:
+            progress(1, desc="Download complete!")
+        return local_filename
+
+    except requests.exceptions.RequestException as e:
+        raise gr.Error(f"Network error while downloading from Instagram: {str(e)}")
+    except (ValueError, KeyError) as e:
+        print(f"API parsing error: {response.text}")
+        raise gr.Error(f"Could not process the Instagram API response: {str(e)}")
+    except Exception as e:
+        raise gr.Error(f"An unexpected error occurred during Instagram download: {str(e)}")
+
+# --- GRADIO INTERFACE (Updated) ---
+with gr.Blocks(theme=gr.themes.Soft(), title="Advanced Video Transcreator") as demo:
+    gr.Markdown("# 🎬 Advanced Video Transcreator v3.4")
+    gr.Markdown(
+        "**This version uses a powerful multi-modal prompt for superior, context-aware 'Transcreation'.**\n\n"
+        "Upload a short video with English text, or provide an Instagram URL and an optional English caption. Clicking 'Download from URL' will download and automatically process the video. The app will analyze the video's mood, style, and caption to generate a perfectly integrated Persian translation and an Instagram caption. Author names (e.g., '- Rumi') are included in the translation and overlaid on a separate line."
+    )
+
     with gr.Row():
+        with gr.Column(scale=2):
+            video_input = gr.Video(label="Upload Video or Use URL Below")
+            with gr.Row():
+                ig_url_input = gr.Textbox(label="Instagram Post URL", placeholder="e.g., https://www.instagram.com/p/C1a2b3Y4deF/")
+                english_caption_input = gr.Textbox(label="English Caption (Optional)", placeholder="e.g., A moment of reflection with Rumi's wisdom")
+            download_button = gr.Button("Download from URL")
+        with gr.Column(scale=3):
+            video_output = gr.Video(label="Translated Video Output")
+            caption_output = gr.Textbox(label="Instagram Caption (No Hashtags)", lines=3, interactive=False)
+            json_output = gr.JSON(label="Gemini Transcreation Analysis")
+
+    translate_button = gr.Button("Analyze and Transcreate Video", variant="primary")
+
+    # Define the logic flow
+    def chain_download_and_process(ig_url, english_caption):
+        """Chains Instagram download with video processing."""
+        video_path = download_instagram_video(ig_url)
+        return process_video(video_path, english_caption)
+
+    download_button.click(
+        fn=chain_download_and_process,
+        inputs=[ig_url_input, english_caption_input],
+        outputs=[video_output, json_output, caption_output]
+    )
 
+    translate_button.click(
+        fn=process_video,
+        inputs=[video_input, english_caption_input],
+        outputs=[video_output, json_output, caption_output]
+    )
 
     gr.Markdown("---")
+    gr.Markdown(
+        "### How it works:\n"
+        "1. **Gemini Transcreation:** The video and optional English caption are sent to Gemini for a deep, multi-modal analysis. Gemini is specifically instructed to **include author names** (e.g., '- Rumi') in the essential text, **ignore temporary text** (like usernames or subtitles), and generate a Persian Instagram caption based on the video and caption input.\n"
+        "2. **Category Classification:** The app selects a category (MEME_HUMOR, COLD_MOTIVATIONAL, WISE_QUOTE, TWITTER_JOKE) based on text, audio, and visuals, using clear definitions for accurate translation.\n"
+        "3. **Smart BBox Detection:** The app first checks for a **prominent white header box**. If found, it uses that for a clean overlay. If not, it falls back to `EasyOCR` to find the *exact pixel location* of the essential text Gemini identified.\n"
+        "4. **Render & Composite:** The Persian text, including author names on a separate line, is rendered with **adaptive color** inside the detected bounding box, with a font size that's **guaranteed to fit**, and placed precisely over the original.\n"
+        "5. **Finalize with Fade-In:** The original audio is merged back into the new video, and a **1-second fade-in** is applied using `ffmpeg`.\n"
+        "6. **Instagram Caption:** A concise, culturally appropriate caption is generated, incorporating the English caption (if provided), and displayed for use with the translated video."
+    )
 
 if __name__ == "__main__":
+    if not os.path.exists(PERSIAN_FONT_PATH):
+        print(f"WARNING: Font file '{PERSIAN_FONT_PATH}' not found. The app will likely fail. Please ensure it's in the same directory.")
+    demo.launch(debug=True)