Spaces:
Sleeping
Update app.py
app.py
CHANGED
@@ -1,207 +1,271 @@
import gradio as gr
- import
import os
- import json
import time
-
-

-
-
-
-
-

-
-
- You are an expert producer of viral short-form content. Analyze the provided SRT transcript to find the single most impactful, hook-worthy segment for a video under 3 minutes (ideally 60-90 seconds).

-

-
-

-
-

-
-
-
-
-
- }}
- """

-
-

-

-
- {transcript_content}

-
- Your output MUST be a single, valid JSON object and nothing else. Do not include any text, code blocks, or explanations before or after the JSON object.

- {{
-   "narrative_summary": "A one-sentence summary of the story told in the extracted clip.",
-   "reasoning": "Explain why this segment works as a standalone narrative, mentioning the peak moment and how the start/end points provide a full arc.",
-   "final_clip_start_seconds": <The precise start time in total seconds from the SRT>,
-   "final_clip_end_seconds": <The precise end time in total seconds from the SRT>
- }}
- """

- # ---

-
-
-     if not
-         raise ValueError("ONE_API_KEY and ONE_API_URL secrets are not set correctly.")

-
-
-
-
-     final_prompt = prompt_template.format(transcript_content=transcript_content)
-     payload = [{"role": "user", "content": final_prompt}]

-
-
-
-
-
-     if "result" not in result or not isinstance(result.get("result"), list) or len(result["result"]) == 0:
-         return f"Error: Unexpected JSON structure from API. 'result' list not found or empty.\nFull response: {json.dumps(result)}"

-

-
-
-
-             return message_content
-         else:
-             return f"Error: 'content' key not found in API response dictionary.\nFull response: {json.dumps(result)}"
-     elif isinstance(first_item_in_result, str):
-         return first_item_in_result
-     else:
-         return f"Error: Unknown item type in API 'result' list.\nFull response: {json.dumps(result)}"

-
-
-
-
-
-         return f"Error: Failed to decode JSON from API response.\nResponse Body: {response.text}"

-

-
-
-
-     if not ONE_API_KEY or not ONE_API_URL:
-         return "Error: API keys for OneAPI are not configured correctly in the Space secrets.", None

-
-
-
-     progress(0.1, desc="Reading SRT file...")
-     with open(srt_file, 'r', encoding='utf-8') as f:
-         transcript_content = f.read()
-
-     progress(0.2, desc="Preparing analysis prompt...")
-     prompt_template = PROMPT_SHORTS_MODE if analysis_mode == "Viral Spot for Shorts (< 3 mins)" else PROMPT_NARRATIVE_MODE
-
-     progress(0.4, desc="Calling AI for analysis...")
-     llm_response_str = call_gpt4o_oneapi(transcript_content, prompt_template)
-
-     progress(0.7, desc="Parsing AI response...")
-     if llm_response_str.startswith("Error") or llm_response_str.startswith("HTTP Error"):
-         return llm_response_str, None
-
-     try:
-         cleaned_response = llm_response_str.strip()
-         if cleaned_response.startswith("```json"):
-             cleaned_response = cleaned_response[7:]
-         if cleaned_response.endswith("```"):
-             cleaned_response = cleaned_response[:-3]
-
-         parsed_response = json.loads(cleaned_response)
-
-         if not isinstance(parsed_response, dict):
-             raise TypeError(f"AI did not return a valid JSON object. It returned a {type(parsed_response).__name__}.")
-
-         start_time = float(parsed_response['final_clip_start_seconds'])
-         end_time = float(parsed_response['final_clip_end_seconds'])
-         reasoning = parsed_response.get('reasoning', 'No reasoning provided.')
-
-         summary = (f"✅ Analysis Complete!\n\n"
-                    f"Reasoning: {reasoning}\n\n"
-                    f"Title Suggestion: {parsed_response.get('clip_title_suggestion', 'N/A')}\n"
-                    f"Narrative Summary: {parsed_response.get('narrative_summary', 'N/A')}\n\n"
-                    f"Clipping video from {time.strftime('%H:%M:%S', time.gmtime(start_time))} to {time.strftime('%H:%M:%S', time.gmtime(end_time))}.")
-
-     except (json.JSONDecodeError, KeyError, TypeError) as e:
-         error_msg = f"Error: Failed to parse AI response. Details: {e}\n\nRaw AI Response:\n---\n{llm_response_str}"
-         return error_msg, None
-
-     progress(0.8, desc="Clipping video...")
-     output_filename = "viral_clip.mp4"
-
-     # FIX: Load video using the correct high-level object
-     video = VideoFileClip(video_file)
-
-     if end_time > video.duration:
-         end_time = video.duration
-         summary += f"\n\n⚠️ Warning: End time was beyond video duration, adjusted to {end_time:.2f}s."
-
-     # Now .subclip() will exist because we imported from moviepy.editor
-     new_clip = video.subclip(start_time, end_time)
-     new_clip.write_videofile(output_filename, codec="libx264", audio_codec="aac")

-
-

-
-
-
-
-
    finally:
-
-
-
-
-
-
-
-
-     gr.Markdown(
-         """
-         # 🎬 AI Viral Video Extractor
-         This tool uses an AI agent to analyze a video transcript and automatically clip the most viral segment.
-         **⚠️ Important Setup:** For best security, configure `ONE_API_KEY` in your Hugging Face **Space Settings > Secrets**.
-         """
-     )
    with gr.Row():
-
-
-
-
-
-
-
-
-             submit_button = gr.Button("🚀 Generate Viral Clip", variant="primary")
-
-         with gr.Column(scale=2):
-             summary_output = gr.Textbox(label="Analysis Summary", lines=12, interactive=False)
-             video_output = gr.Video(label="Generated Clip", interactive=False)
-
-     submit_button.click(
-         fn=generate_viral_clip,
-         inputs=[video_input, srt_input, mode_input],
-         outputs=[summary_output, video_output],
-     )

if __name__ == "__main__":
    demo.launch(debug=True)
+ # app.py
+
import gradio as gr
+ import cv2
+ import numpy as np
+ from PIL import Image, ImageDraw, ImageFont
+ import easyocr
+ import google.generativeai as genai
+ import arabic_reshaper
+ from bidi.algorithm import get_display  # reorders reshaped RTL text for correct display
import os
import time
+ import ffmpeg
+
+ # --- CONFIGURATION ---
+ # The fallback must stay a placeholder (the guard in translate_text_gemini checks
+ # for it); never commit a real key here — set GEMINI_API_KEY as a Space secret.
+ API_KEY = os.getenv("GEMINI_API_KEY", "YOUR_GEMINI_API_KEY_HERE")
+ PERSIAN_FONT_PATH = "Vazir.ttf"
+ FADE_IN_DURATION_SECONDS = 1.0
+
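+ # (Deployment assumptions, both checked at runtime below: Vazir.ttf must be
+ # uploaded next to app.py, and GEMINI_API_KEY must be set as a Space secret.)
+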
+ # --- GLOBAL INITIALIZATION ---
+ reader = None
+ def initialize_reader():
+     """Initializes the EasyOCR reader if it hasn't been already."""
+     global reader
+     if reader is None:
+         print("Loading EasyOCR model...")
+         reader = easyocr.Reader(['en'], gpu=False, verbose=False)
+         print("EasyOCR model loaded successfully!")
+     return reader
+
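+ # (The reader lives in a module-level global because loading the EasyOCR model
+ # takes a noticeable time on CPU; every Gradio request then reuses one instance.)
+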
+ # --- CORE PROCESSING FUNCTIONS ---
+
+ ### NEW ###: This function now also returns the average height of the original text.
+ def extract_text_and_bbox(image: Image.Image):
+     """Extracts text, a consolidated bounding box, and the average original text height."""
+     ocr_reader = initialize_reader()
+     img_array = np.array(image)
+     results = ocr_reader.readtext(img_array)
+     if not results: return "No text detected in the image.", None, None
+
+     min_x, min_y = float('inf'), float('inf')
+     max_x, max_y = float('-inf'), float('-inf')
+     text_parts = []
+     original_heights = []  ### NEW ###: List to store heights of each detected text box.
+
+     for (bbox, text, prob) in results:
+         text_parts.append(text)
+         (tl, tr, br, bl) = bbox
+         min_x = min(min_x, tl[0], bl[0])
+         min_y = min(min_y, tl[1], tr[1])
+         max_x = max(max_x, tr[0], br[0])
+         max_y = max(max_y, bl[1], br[1])
+
+         # ### NEW ###: Calculate the height of this specific text box and add it.
+         # This is a direct measure of the original font's pixel size.
+         original_heights.append(br[1] - tr[1])
+
+     extracted_text = ' '.join(text_parts)
+     consolidated_bbox = (int(min_x), int(min_y), int(max_x), int(max_y))
+
+     # ### NEW ###: Calculate the average height from all detected text parts.
+     average_original_height = sum(original_heights) / len(original_heights) if original_heights else 30  # Fallback
+
+     return extracted_text, consolidated_bbox, average_original_height
+
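+ # (readtext returns a list of (bbox, text, confidence) tuples, where bbox holds
+ # four (x, y) corner points in top-left, top-right, bottom-right, bottom-left
+ # order; the consolidation loop above relies on that ordering.)
+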
+ def translate_text_gemini(text: str) -> str:
+     """Translates text to colloquial Persian using the Gemini API."""
+     if not API_KEY or "YOUR_GEMINI_API_KEY_HERE" in API_KEY:
+         raise gr.Error("GEMINI_API_KEY is not set. Please add it as a Secret in your Hugging Face Space.")
+     if not text or "No text" in text:
+         return "No valid text to translate."
+
+     try:
+         genai.configure(api_key=API_KEY)
+         model = genai.GenerativeModel('gemini-1.5-flash')
+         prompt = f"Translate the following English quotes into Persian. The translation should be colloquial, poetic, concise, and meaningful. Preserve the original message and tone. Avoid literal translations. Provide only the translated Persian text. Quotes: [{text}]"
+         response = model.generate_content(prompt)
+         return response.text.strip()
+     except Exception as e:
+         return f"Error during translation with Gemini: {str(e)}"
+
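+ # (Note the error contract: translation failures are returned as an "Error ..."
+ # string rather than raised, which is why process_video inspects the returned
+ # text before using it.)
+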
+ ### NEW ###: This function now accepts `average_original_height` to guide its font sizing.
+ def render_translated_overlay(original_image: Image.Image, text_to_overlay: str, bbox: tuple, average_original_height: float) -> (Image.Image, tuple):
+     """
+     Creates an overlay layer with correctly rendered, wrapped Persian text,
+     sized to match the original text's height.
+     """
+     padding = 15
+     overlay_box = (
+         max(0, bbox[0] - padding),
+         max(0, bbox[1] - padding),
+         min(original_image.width, bbox[2] + padding),
+         min(original_image.height, bbox[3] + padding)
+     )
+     overlay_width = overlay_box[2] - overlay_box[0]
+     overlay_height = overlay_box[3] - overlay_box[1]

+     try:
+         sample_x = max(0, int(overlay_box[0]) - 5)
+         sample_y = int((overlay_box[1] + overlay_box[3]) / 2)
+         bg_color = original_image.getpixel((sample_x, sample_y))
+     except (ValueError, IndexError):
+         bg_color = (25, 25, 25)

+     overlay_layer = Image.new("RGBA", (overlay_width, overlay_height), bg_color)
+     draw = ImageDraw.Draw(overlay_layer)

+     if not os.path.exists(PERSIAN_FONT_PATH):
+         raise FileNotFoundError(f"Font file not found at '{PERSIAN_FONT_PATH}'. Please upload it to your Space.")

+     target_width = overlay_width * 0.90
+     target_height = overlay_height * 0.90
+
+     # ### NEW ###: This is the key change! We start the font size based on the original text's measured height.
+     # The 0.95 multiplier accounts for typical font padding, giving a closer visual match.
+     font_size = int(average_original_height * 0.95)
+
+     # Fix: reshape() only joins the Persian letters into their contextual forms;
+     # get_display() (python-bidi) then reorders them right-to-left so PIL, which
+     # draws strictly left-to-right, shows the words in the correct order.
+     def shape_rtl(s):
+         return get_display(arabic_reshaper.reshape(s))
+
+     final_wrapped_lines = []
+
+     # This loop now starts with an intelligent font size and only shrinks if the wrapped
+     # text is too tall for the bounding box (a necessary fallback).
+     while font_size > 10:
+         font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
+         words = text_to_overlay.split()
+         if not words: break
+
+         raw_lines = []
+         current_line = ""
+         for word in words:
+             test_line = (current_line + " " + word).strip()
+             reshaped_test_line = shape_rtl(test_line)
+             line_width = draw.textbbox((0, 0), reshaped_test_line, font=font)[2]
+
+             if line_width <= target_width:
+                 current_line = test_line
+             else:
+                 raw_lines.append(current_line)
+                 current_line = word
+         raw_lines.append(current_line)
+
+         line_spacing = font_size * 0.3
+         reshaped_for_height_calc = [shape_rtl(l) for l in raw_lines]
+         line_heights = [draw.textbbox((0, 0), l, font=font)[3] - draw.textbbox((0, 0), l, font=font)[1] for l in reshaped_for_height_calc]
+         total_height = sum(line_heights) + (len(raw_lines) - 1) * line_spacing
+
+         if total_height <= target_height:
+             final_wrapped_lines = raw_lines
+             break
+         else:
+             font_size -= 2  # Shrink font and try again if it doesn't fit

+     if not final_wrapped_lines:
+         final_wrapped_lines = [text_to_overlay]

+     final_font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
+     line_spacing = font_size * 0.3
+     final_reshaped_lines = [shape_rtl(l) for l in final_wrapped_lines]
+     line_heights = [draw.textbbox((0, 0), l, font=final_font)[3] - draw.textbbox((0, 0), l, font=final_font)[1] for l in final_reshaped_lines]
+     total_text_height = sum(line_heights) + (len(final_reshaped_lines) - 1) * line_spacing

+     y_start = (overlay_height - total_text_height) / 2
+     current_y = y_start

+     for i, reshaped_line in enumerate(final_reshaped_lines):
+         x_center = overlay_width / 2
+         line_y_center = current_y + line_heights[i] / 2
+
+         draw.text((x_center + 1, line_y_center + 1), reshaped_line, font=final_font, fill=(0, 0, 0, 180), anchor="mm")
+         draw.text((x_center, line_y_center), reshaped_line, font=final_font, fill=(255, 255, 255, 255), anchor="mm")

+         current_y += line_heights[i] + line_spacing

+     return overlay_layer, overlay_box

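+ # (render_translated_overlay hands back both the rendered RGBA stamp and the box
+ # it belongs in, so the frame loop below can blend one stamp into every frame.)
+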
+ # --- MAIN VIDEO PROCESSING PIPELINE ---
+ def process_video(video_path, progress=gr.Progress()):
+     if video_path is None: raise gr.Error("Please upload a video file first.")

+     progress(0, desc="Loading Video & Analyzing...")
+     cap = cv2.VideoCapture(video_path)
+     if not cap.isOpened(): raise gr.Error("Could not open video file.")

+     frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+     frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+     fps = cap.get(cv2.CAP_PROP_FPS)
+     total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

+     cap.set(cv2.CAP_PROP_POS_FRAMES, total_frames // 2)
+     ret, middle_frame_bgr = cap.read()
+     if not ret: raise gr.Error("Could not read middle frame.")
+     middle_frame_rgb_pil = Image.fromarray(cv2.cvtColor(middle_frame_bgr, cv2.COLOR_BGR2RGB))

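+     # The middle frame is assumed to be representative: the quote text is
+     # expected to stay static for the whole clip, so OCR runs only once.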
+     progress(0.2, desc="Detecting & Measuring Text (EasyOCR)...")
+     # ### NEW ###: Capture the average_original_height from our updated function.
+     extracted_text, bbox, avg_height = extract_text_and_bbox(middle_frame_rgb_pil)
+     if bbox is None: raise gr.Error(extracted_text)

+     progress(0.4, desc="Translating Text (Gemini API)...")
+     translated_text = translate_text_gemini(extracted_text)
+     if "Error" in translated_text: raise gr.Error(translated_text)

+     progress(0.5, desc="Rendering Translated Text Overlay...")
+     # ### NEW ###: Pass the measured average height to the rendering function.
+     overlay_stamp_pil, overlay_position_box = render_translated_overlay(middle_frame_rgb_pil, translated_text, bbox, avg_height)
+
+     overlay_stamp_cv = cv2.cvtColor(np.array(overlay_stamp_pil), cv2.COLOR_RGBA2BGRA)

+     timestamp = int(time.time())
+     temp_silent_path = f"temp_silent_{timestamp}.mp4"
+     final_output_path = f"translated_video_{timestamp}.mp4"

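+     # OpenCV's VideoWriter cannot carry audio, so a silent intermediate file is
+     # written first; the original audio track is merged back in with ffmpeg below.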
+     progress(0.6, desc="Composing Silent Video with Overlay...")
+     fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+     out = cv2.VideoWriter(temp_silent_path, fourcc, fps, (frame_width, frame_height))

+     cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
+     frame_idx = 0
+     x_min, y_min, x_max, y_max = overlay_position_box

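+     # Per-pixel alpha composite over the region of interest:
+     #   out = overlay * alpha + frame * (1 - alpha)
+     # using the overlay's alpha channel (BGRA channel 3) as the mask.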
+     while True:
+         ret, frame = cap.read()
+         if not ret: break

+         roi = frame[y_min:y_max, x_min:x_max]
+         alpha = overlay_stamp_cv[:, :, 3] / 255.0
+         alpha_mask = cv2.merge([alpha, alpha, alpha])
+         blended_roi = (roi.astype(float) * (1.0 - alpha_mask) + overlay_stamp_cv[:, :, :3].astype(float) * alpha_mask)
+         frame[y_min:y_max, x_min:x_max] = blended_roi.astype(np.uint8)
+
+         out.write(frame)
+         frame_idx += 1
+         progress(0.6 + (0.3 * frame_idx / total_frames), desc=f"Processing frame {frame_idx}/{total_frames}")
+
+     cap.release(); out.release()
+
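+     # ffmpeg-python note: a keyword argument set to None (shortest=None below) is
+     # emitted as a bare flag (-shortest); acodec='copy' passes the audio stream
+     # through untouched while the silent video is re-encoded with libx264.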
+     progress(0.95, desc="Merging Audio and Applying Fade (ffmpeg)...")
+     try:
+         input_video = ffmpeg.input(temp_silent_path)
+         input_audio = ffmpeg.input(video_path).audio
+
+         (ffmpeg.output(
+             input_video.video.filter('fade', type='in', start_time=0, duration=FADE_IN_DURATION_SECONDS),
+             input_audio, final_output_path, vcodec='libx264', acodec='copy', shortest=None
+         ).run(overwrite_output=True, quiet=True))
+     except ffmpeg.Error as e:
+         print('ffmpeg stdout:', e.stdout.decode('utf8', errors='ignore'))
+         print('ffmpeg stderr:', e.stderr.decode('utf8', errors='ignore'))
+         raise gr.Error(f"ffmpeg error: {e.stderr.decode('utf8', errors='ignore')}")
    finally:
+         if os.path.exists(temp_silent_path): os.remove(temp_silent_path)
+
+     progress(1, desc="Done!")
+     return final_output_path
+
+ # --- GRADIO INTERFACE ---
+ with gr.Blocks(theme=gr.themes.Soft(), title="Persian Video Quote Translator") as demo:
+     gr.Markdown("# 🎬 Persian Video Quote Translator")
+     gr.Markdown("Upload a short video containing English text. The app will detect the text, replace it with a poetic Persian translation, and preserve the original audio and video duration.")
    with gr.Row():
+         video_input = gr.Video(label="Upload Video")
+         video_output = gr.Video(label="Translated Video Output")
+     translate_button = gr.Button("Translate Video", variant="primary")
+
+     translate_button.click(fn=process_video, inputs=[video_input], outputs=[video_output])
+
+     gr.Markdown("---")
+     gr.Markdown("### How it works:\n1. It analyzes the middle frame to **measure the original text's height** and find its location.\n2. It uses the Gemini API to get a high-quality, poetic Persian translation.\n3. It renders the Persian text at a size that **matches the original**, wrapping it smartly to fit.\n4. It composites this new text overlay onto every frame of the video.\n5. Finally, it uses `ffmpeg` to merge the new video with the **original audio** and add a 1-second fade-in effect.")
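+     # (The five steps above map onto extract_text_and_bbox, translate_text_gemini,
+     # render_translated_overlay, the frame loop in process_video, and the ffmpeg
+     # merge at the end of process_video, respectively.)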

if __name__ == "__main__":
    demo.launch(debug=True)
+