kavehtaheri committed
Commit b12b399 · verified · 1 Parent(s): 32860f3

Update app.py

Files changed (1): app.py +176 -252
app.py CHANGED
@@ -1,270 +1,194 @@
-# app.py
-
 import gradio as gr
-import cv2
-import numpy as np
-from PIL import Image, ImageDraw, ImageFont
-import easyocr
-import google.generativeai as genai
-import arabic_reshaper
 import os
 import time
-import ffmpeg
-
-# --- CONFIGURATION ---
-API_KEY = os.getenv("GEMINI_API_KEY", "AIzaSyCu-tb3BRDIJjUt6G5ccWmrR51tOY0VZd4")
-PERSIAN_FONT_PATH = "Vazir.ttf"
-FADE_IN_DURATION_SECONDS = 1.0
-
-# --- GLOBAL INITIALIZATION ---
-reader = None
-def initialize_reader():
-    """Initializes the EasyOCR reader if it hasn't been already."""
-    global reader
-    if reader is None:
-        print("Loading EasyOCR model...")
-        reader = easyocr.Reader(['en'], gpu=False, verbose=False)
-        print("EasyOCR model loaded successfully!")
-    return reader
-
-# --- CORE PROCESSING FUNCTIONS ---
-
-### NEW ###: This function now also returns the average height of the original text.
-def extract_text_and_bbox(image: Image.Image):
-    """Extracts text, a consolidated bounding box, and the average original text height."""
-    ocr_reader = initialize_reader()
-    img_array = np.array(image)
-    results = ocr_reader.readtext(img_array)
-    if not results: return "No text detected in the image.", None, None
-
-    min_x, min_y = float('inf'), float('inf')
-    max_x, max_y = float('-inf'), float('-inf')
-    text_parts = []
-    original_heights = []  ### NEW ###: List to store heights of each detected text box.
-
-    for (bbox, text, prob) in results:
-        text_parts.append(text)
-        (tl, tr, br, bl) = bbox
-        min_x = min(min_x, tl[0], bl[0])
-        min_y = min(min_y, tl[1], tr[1])
-        max_x = max(max_x, tr[0], br[0])
-        max_y = max(max_y, bl[1], br[1])
-
-        # ### NEW ###: Calculate the height of this specific text box and add it.
-        # This is a direct measure of the original font's pixel size.
-        original_heights.append(br[1] - tr[1])
-
-    extracted_text = ' '.join(text_parts)
-    consolidated_bbox = (int(min_x), int(min_y), int(max_x), int(max_y))
-
-    # ### NEW ###: Calculate the average height from all detected text parts.
-    average_original_height = sum(original_heights) / len(original_heights) if original_heights else 30  # Fallback
-
-    return extracted_text, consolidated_bbox, average_original_height
-
-def translate_text_gemini(text: str) -> str:
-    """Translates text to colloquial Persian using the Gemini API."""
-    if not API_KEY or "YOUR_GEMINI_API_KEY_HERE" in API_KEY:
-        raise gr.Error("GEMINI_API_KEY is not set. Please add it as a Secret in your Hugging Face Space.")
-    if not text or "No text" in text:
-        return "No valid text to translate."
-
     try:
-        genai.configure(api_key=API_KEY)
-        model = genai.GenerativeModel('gemini-1.5-flash')
-        prompt = f"Translate the following English quotes into Persian. The translation should be colloquial, poetic, concise, and meaningful. Preserve the original message and tone. Avoid literal translations. Provide only the translated Persian text. Quotes: [{text}]"
-        response = model.generate_content(prompt)
-        return response.text.strip()
-    except Exception as e:
-        return f"Error during translation with Gemini: {str(e)}"
-
-### NEW ###: This function now accepts `average_original_height` to guide its font sizing.
-def render_translated_overlay(original_image: Image.Image, text_to_overlay: str, bbox: tuple, average_original_height: float) -> (Image.Image, tuple):
-    """
-    Creates an overlay layer with correctly rendered, wrapped Persian text,
-    sized to match the original text's height.
-    """
-    padding = 15
-    overlay_box = (
-        max(0, bbox[0] - padding),
-        max(0, bbox[1] - padding),
-        min(original_image.width, bbox[2] + padding),
-        min(original_image.height, bbox[3] + padding)
-    )
-    overlay_width = overlay_box[2] - overlay_box[0]
-    overlay_height = overlay_box[3] - overlay_box[1]
-
-    try:
-        sample_x = max(0, int(overlay_box[0]) - 5)
-        sample_y = int((overlay_box[1] + overlay_box[3]) / 2)
-        bg_color = original_image.getpixel((sample_x, sample_y))
-    except (ValueError, IndexError):
-        bg_color = (25, 25, 25)
-
-    overlay_layer = Image.new("RGBA", (overlay_width, overlay_height), bg_color)
-    draw = ImageDraw.Draw(overlay_layer)
-
-    if not os.path.exists(PERSIAN_FONT_PATH):
-        raise FileNotFoundError(f"Font file not found at '{PERSIAN_FONT_PATH}'. Please upload it to your Space.")
-
-    target_width = overlay_width * 0.90
-    target_height = overlay_height * 0.90
-
-    # ### NEW ###: This is the key change! We start the font size based on the original text's measured height.
-    # The 0.95 multiplier accounts for typical font padding, giving a closer visual match.
-    font_size = int(average_original_height * 0.95)
-
-    final_wrapped_lines = []
-
-    # This loop now starts with an intelligent font size and only shrinks if the wrapped
-    # text is too tall for the bounding box (a necessary fallback).
-    while font_size > 10:
-        font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
-        words = text_to_overlay.split()
-        if not words: break
-
-        raw_lines = []
-        current_line = ""
-        for word in words:
-            test_line = (current_line + " " + word).strip()
-            reshaped_test_line = arabic_reshaper.reshape(test_line)
-            line_width = draw.textbbox((0, 0), reshaped_test_line, font=font)[2]
-
-            if line_width <= target_width:
-                current_line = test_line
             else:
-                raw_lines.append(current_line)
-                current_line = word
-        raw_lines.append(current_line)
-
-        line_spacing = font_size * 0.3
-        reshaped_for_height_calc = [arabic_reshaper.reshape(l) for l in raw_lines]
-        line_heights = [draw.textbbox((0,0), l, font=font)[3] - draw.textbbox((0,0), l, font=font)[1] for l in reshaped_for_height_calc]
-        total_height = sum(line_heights) + (len(raw_lines) - 1) * line_spacing
-
-        if total_height <= target_height:
-            final_wrapped_lines = raw_lines
-            break
         else:
-            font_size -= 2  # Shrink font and try again if it doesn't fit
-
-    if not final_wrapped_lines:
-        final_wrapped_lines = [text_to_overlay]
-
-    final_font = ImageFont.truetype(PERSIAN_FONT_PATH, font_size)
-    line_spacing = font_size * 0.3
-    final_reshaped_lines = [arabic_reshaper.reshape(l) for l in final_wrapped_lines]
-    line_heights = [draw.textbbox((0,0), l, font=final_font)[3] - draw.textbbox((0,0), l, font=final_font)[1] for l in final_reshaped_lines]
-    total_text_height = sum(line_heights) + (len(final_reshaped_lines) - 1) * line_spacing
-
-    y_start = (overlay_height - total_text_height) / 2
-    current_y = y_start
-
-    for i, reshaped_line in enumerate(final_reshaped_lines):
-        x_center = overlay_width / 2
-        line_y_center = current_y + line_heights[i] / 2
-
-        draw.text((x_center + 1, line_y_center + 1), reshaped_line, font=final_font, fill=(0, 0, 0, 180), anchor="mm")
-        draw.text((x_center, line_y_center), reshaped_line, font=final_font, fill=(255, 255, 255, 255), anchor="mm")
-
-        current_y += line_heights[i] + line_spacing
-
-    return overlay_layer, overlay_box
-
-
-# --- MAIN VIDEO PROCESSING PIPELINE ---
-def process_video(video_path, progress=gr.Progress()):
-    if video_path is None: raise gr.Error("Please upload a video file first.")
-
-    progress(0, desc="Loading Video & Analyzing...")
-    cap = cv2.VideoCapture(video_path)
-    if not cap.isOpened(): raise gr.Error("Could not open video file.")
-
-    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-    fps = cap.get(cv2.CAP_PROP_FPS)
-    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-
-    cap.set(cv2.CAP_PROP_POS_FRAMES, total_frames // 2)
-    ret, middle_frame_bgr = cap.read()
-    if not ret: raise gr.Error("Could not read middle frame.")
-    middle_frame_rgb_pil = Image.fromarray(cv2.cvtColor(middle_frame_bgr, cv2.COLOR_BGR2RGB))
-
-    progress(0.2, desc="Detecting & Measuring Text (EasyOCR)...")
-    # ### NEW ###: Capture the average_original_height from our updated function.
-    extracted_text, bbox, avg_height = extract_text_and_bbox(middle_frame_rgb_pil)
-    if bbox is None: raise gr.Error(extracted_text)
-
-    progress(0.4, desc="Translating Text (Gemini API)...")
-    translated_text = translate_text_gemini(extracted_text)
-    if "Error" in translated_text: raise gr.Error(translated_text)
-
-    progress(0.5, desc="Rendering Translated Text Overlay...")
-    # ### NEW ###: Pass the measured average height to the rendering function.
-    overlay_stamp_pil, overlay_position_box = render_translated_overlay(middle_frame_rgb_pil, translated_text, bbox, avg_height)
-
-    overlay_stamp_cv = cv2.cvtColor(np.array(overlay_stamp_pil), cv2.COLOR_RGBA2BGRA)
-
-    timestamp = int(time.time())
-    temp_silent_path = f"temp_silent_{timestamp}.mp4"
-    final_output_path = f"translated_video_{timestamp}.mp4"
-
-    progress(0.6, desc="Composing Silent Video with Overlay...")
-    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
-    out = cv2.VideoWriter(temp_silent_path, fourcc, fps, (frame_width, frame_height))
-
-    cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
-    frame_idx = 0
-    x_min, y_min, x_max, y_max = overlay_position_box
-
-    while True:
-        ret, frame = cap.read()
-        if not ret: break
-
-        roi = frame[y_min:y_max, x_min:x_max]
-        alpha = overlay_stamp_cv[:, :, 3] / 255.0
-        alpha_mask = cv2.merge([alpha, alpha, alpha])
-        blended_roi = (roi.astype(float) * (1.0 - alpha_mask) + overlay_stamp_cv[:, :, :3].astype(float) * alpha_mask)
-        frame[y_min:y_max, x_min:x_max] = blended_roi.astype(np.uint8)
-
-        out.write(frame)
-        frame_idx += 1
-        progress(0.6 + (0.3 * frame_idx / total_frames), desc=f"Processing frame {frame_idx}/{total_frames}")
-
-    cap.release(); out.release()
-
-    progress(0.95, desc="Merging Audio and Applying Fade (ffmpeg)...")
-    try:
-        input_video = ffmpeg.input(temp_silent_path)
-        input_audio = ffmpeg.input(video_path).audio
-
-        (ffmpeg.output(
-            input_video.video.filter('fade', type='in', start_time=0, duration=FADE_IN_DURATION_SECONDS),
-            input_audio, final_output_path, vcodec='libx264', acodec='copy', shortest=None
-        ).run(overwrite_output=True, quiet=True))
-    except ffmpeg.Error as e:
-        print('ffmpeg stdout:', e.stdout.decode('utf8', errors='ignore'))
-        print('ffmpeg stderr:', e.stderr.decode('utf8', errors='ignore'))
-        raise gr.Error(f"ffmpeg error: {e.stderr.decode('utf8', errors='ignore')}")
     finally:
-        if os.path.exists(temp_silent_path): os.remove(temp_silent_path)
-
-    progress(1, desc="Done!")
-    return final_output_path
-
-# --- GRADIO INTERFACE ---
-with gr.Blocks(theme=gr.themes.Soft(), title="Persian Video Quote Translator") as demo:
-    gr.Markdown("# 🎬 Persian Video Quote Translator")
-    gr.Markdown("Upload a short video containing English text. The app will detect the text, replace it with a poetic Persian translation, and preserve the original audio and video duration.")
     with gr.Row():
-        video_input = gr.Video(label="Upload Video")
-        video_output = gr.Video(label="Translated Video Output")
-        translate_button = gr.Button("Translate Video", variant="primary")
-
-    translate_button.click(fn=process_video, inputs=[video_input], outputs=[video_output])
-
-    gr.Markdown("---")
-    gr.Markdown("### How it works:\n1. It analyzes the middle frame to **measure the original text's height** and find its location.\n2. It uses the Gemini API to get a high-quality, poetic Persian translation.\n3. It renders the Persian text at a size that **matches the original**, wrapping it smartly to fit.\n4. It composites this new text overlay onto every frame of the video.\n5. Finally, it uses `ffmpeg` to merge the new video with the **original audio** and add a 1-second fade-in effect.")

 if __name__ == "__main__":
     demo.launch(debug=True)

 import gradio as gr
+import requests
 import os
+import json
 import time
+# CORRECT, HIGH-LEVEL IMPORT for MoviePy. This brings in all editing functions.
+from moviepy.editor import VideoFileClip
+import traceback  # Import traceback for detailed error logging
+
+# --- 1. CONFIGURATION & CONSTANTS ---
+# Securely load API key from Hugging Face Space secrets.
+ONE_API_KEY = os.environ.get("ONE_API_KEY", "268976:66f4f58a2a905")
+# The custom endpoint for the one-api.ir service.
+ONE_API_URL = "https://api.one-api.ir/chatbot/v1/gpt4o/"
+
+# --- MASTER PROMPTS ---
+PROMPT_SHORTS_MODE = """
+You are an expert producer of viral short-form content. Analyze the provided SRT transcript to find the single most impactful, hook-worthy segment for a video under 3 minutes (ideally 60-90 seconds).
+PRIORITIES: Strong Hook, Single Clear Point, High Energy, Clear Payoff.
+**Input SRT Content:**
+{transcript_content}
+**Instructions:**
+Your output MUST be a single, valid JSON object and nothing else. Do not include any text, code blocks, or explanations before or after the JSON object.
+{{
+    "clip_title_suggestion": "A catchy, clickbait-style title for this short clip.",
+    "reasoning": "Briefly explain why this segment is perfect for a short video, referencing the hook and payoff.",
+    "final_clip_start_seconds": <The precise start time in total seconds from the SRT>,
+    "final_clip_end_seconds": <The precise end time in total seconds from the SRT>
+}}
+"""
+
+PROMPT_NARRATIVE_MODE = """
+You are an expert video editor and storyteller. Analyze the provided transcript to find the most compelling narrative segment between 5 and 12 minutes long.
+METHODOLOGY: Identify the peak moment, then find the corresponding setup and resolution to create a complete narrative arc.
+**Input SRT Content:**
+{transcript_content}
+**Instructions:**
+Your output MUST be a single, valid JSON object and nothing else. Do not include any text, code blocks, or explanations before or after the JSON object.
+{{
+    "narrative_summary": "A one-sentence summary of the story told in the extracted clip.",
+    "reasoning": "Explain why this segment works as a standalone narrative, mentioning the peak moment and how the start/end points provide a full arc.",
+    "final_clip_start_seconds": <The precise start time in total seconds from the SRT>,
+    "final_clip_end_seconds": <The precise end time in total seconds from the SRT>
+}}
+"""
+
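Both prompts ask the model to return timestamps as total seconds from the SRT. For reference, a minimal sketch (not part of the committed file; the helper name is hypothetical) of how an SRT timestamp maps onto those values:

def srt_timestamp_to_seconds(ts: str) -> float:
    """Convert an SRT 'HH:MM:SS,mmm' timestamp to total seconds, e.g. '00:01:23,500' -> 83.5."""
    hms, millis = ts.split(",")
    hours, minutes, seconds = (int(part) for part in hms.split(":"))
    return hours * 3600 + minutes * 60 + seconds + int(millis) / 1000.0

assert srt_timestamp_to_seconds("00:01:23,500") == 83.5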
+# --- 2. LLM AGENT WRAPPER ---
+def call_gpt4o_oneapi(transcript_content, prompt_template):
+    """Makes a robust, custom API call to the one-api.ir service."""
+    if not ONE_API_KEY or not ONE_API_URL:
+        raise ValueError("ONE_API_KEY and ONE_API_URL secrets are not set correctly.")
+
+    headers = {
+        "one-api-token": ONE_API_KEY,
+        "Content-Type": "application/json"
+    }
+    final_prompt = prompt_template.format(transcript_content=transcript_content)
+    payload = [{"role": "user", "content": final_prompt}]
+
     try:
+        response = requests.post(ONE_API_URL, headers=headers, json=payload, timeout=180)
+        response.raise_for_status()
+        result = response.json()
+
+        if "result" not in result or not isinstance(result.get("result"), list) or len(result["result"]) == 0:
+            return f"Error: Unexpected JSON structure from API. 'result' list not found or empty.\nFull response: {json.dumps(result)}"
+
+        first_item_in_result = result["result"][0]
+
+        if isinstance(first_item_in_result, dict):
+            message_content = first_item_in_result.get("content")
+            if message_content:
+                return message_content
             else:
+                return f"Error: 'content' key not found in API response dictionary.\nFull response: {json.dumps(result)}"
+        elif isinstance(first_item_in_result, str):
+            return first_item_in_result
         else:
+            return f"Error: Unknown item type in API 'result' list.\nFull response: {json.dumps(result)}"
+
+    except requests.exceptions.HTTPError as e:
+        return f"HTTP Error calling API: {e}\nResponse Body: {e.response.text}"
+    except requests.exceptions.RequestException as e:
+        return f"Error connecting to API: {str(e)}"
+    except json.JSONDecodeError:
+        return f"Error: Failed to decode JSON from API response.\nResponse Body: {response.text}"
+
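The wrapper above accepts two shapes for the service's "result" list: a list of message dictionaries or a list of plain strings. A minimal sketch of that parsing contract, not part of the committed file, using invented response bodies (the exact payloads one-api.ir returns are an assumption):

# Invented examples of the two accepted shapes; not captured API output.
dict_shape = {"result": [{"role": "assistant", "content": "..."}]}
str_shape = {"result": ["..."]}

def extract_content(body: dict) -> str:
    # Mirrors the branching in call_gpt4o_oneapi for an already-parsed body.
    first = body["result"][0]
    if isinstance(first, dict):
        return first.get("content", "")
    return first

assert extract_content(dict_shape) == extract_content(str_shape) == "..."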
+# --- 3. CORE ORCHESTRATOR FUNCTION ---
+def generate_viral_clip(video_file, srt_file, analysis_mode, progress=gr.Progress()):
+    if not video_file or not srt_file:
+        return "Error: Please upload both a video file and an SRT file.", None
+    if not ONE_API_KEY or not ONE_API_URL:
+        return "Error: API keys for OneAPI are not configured correctly in the Space secrets.", None
+
+    video = None
+    new_clip = None
+    try:
+        progress(0.1, desc="Reading SRT file...")
+        with open(srt_file, 'r', encoding='utf-8') as f:
+            transcript_content = f.read()
+
+        progress(0.2, desc="Preparing analysis prompt...")
+        prompt_template = PROMPT_SHORTS_MODE if analysis_mode == "Viral Spot for Shorts (< 3 mins)" else PROMPT_NARRATIVE_MODE
+
+        progress(0.4, desc="Calling AI for analysis...")
+        llm_response_str = call_gpt4o_oneapi(transcript_content, prompt_template)
+
+        progress(0.7, desc="Parsing AI response...")
+        if llm_response_str.startswith("Error") or llm_response_str.startswith("HTTP Error"):
+            return llm_response_str, None
+
+        try:
+            cleaned_response = llm_response_str.strip()
+            if cleaned_response.startswith("```json"):
+                cleaned_response = cleaned_response[7:]
+            if cleaned_response.endswith("```"):
+                cleaned_response = cleaned_response[:-3]
+
+            parsed_response = json.loads(cleaned_response)
+
+            if not isinstance(parsed_response, dict):
+                raise TypeError(f"AI did not return a valid JSON object. It returned a {type(parsed_response).__name__}.")
+
+            start_time = float(parsed_response['final_clip_start_seconds'])
+            end_time = float(parsed_response['final_clip_end_seconds'])
+            reasoning = parsed_response.get('reasoning', 'No reasoning provided.')
+
+            summary = (f"✅ Analysis Complete!\n\n"
+                       f"Reasoning: {reasoning}\n\n"
+                       f"Title Suggestion: {parsed_response.get('clip_title_suggestion', 'N/A')}\n"
+                       f"Narrative Summary: {parsed_response.get('narrative_summary', 'N/A')}\n\n"
+                       f"Clipping video from {time.strftime('%H:%M:%S', time.gmtime(start_time))} to {time.strftime('%H:%M:%S', time.gmtime(end_time))}.")
+
+        except (json.JSONDecodeError, KeyError, TypeError) as e:
+            error_msg = f"Error: Failed to parse AI response. Details: {e}\n\nRaw AI Response:\n---\n{llm_response_str}"
+            return error_msg, None
+
+        progress(0.8, desc="Clipping video...")
+        output_filename = "viral_clip.mp4"
+
+        video = VideoFileClip(video_file)
+
+        if end_time > video.duration:
+            end_time = video.duration
+            summary += f"\n\n⚠️ Warning: End time was beyond video duration, adjusted to {end_time:.2f}s."
+
+        new_clip = video.subclip(start_time, end_time)
+        new_clip.write_videofile(output_filename, codec="libx264", audio_codec="aac")
+
+        progress(1.0, desc="Done!")
+        return summary, output_filename
+
+    except Exception as e:
+        tb_str = traceback.format_exc()
+        return f"An unexpected error occurred in the main process: {str(e)}\n\nTraceback:\n{tb_str}", None
+
     finally:
+        if new_clip:
+            new_clip.close()
+        if video:
+            video.close()
+
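Note that `from moviepy.editor import VideoFileClip` and `clip.subclip(...)` are the MoviePy 1.x API; MoviePy 2.x removed the `moviepy.editor` module and renamed `subclip` to `subclipped`. A purely illustrative compatibility shim, not part of this commit, in case the Space's environment is ever upgraded:

try:
    # MoviePy 1.x layout, as used by this commit.
    from moviepy.editor import VideoFileClip

    def cut(clip, start, end):
        return clip.subclip(start, end)
except ImportError:
    # MoviePy 2.x layout: moviepy.editor is gone and subclip was renamed.
    from moviepy import VideoFileClip

    def cut(clip, start, end):
        return clip.subclipped(start, end)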
+# --- 4. GRADIO UI DEFINITION ---
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown(
+        """
+        # 🎬 AI Viral Video Extractor
+        This tool uses an AI agent to analyze a video transcript and automatically clip the most viral segment.
+        **⚠️ Important Setup:** For best security, configure `ONE_API_KEY` in your Hugging Face **Space Settings > Secrets**.
+        """
+    )
     with gr.Row():
+        with gr.Column(scale=1):
+            video_input = gr.Video(label="1. Upload Original Video")
+            srt_input = gr.File(label="2. Upload English SRT File", file_types=['.srt'])
+            mode_input = gr.Radio(
+                label="3. Select Analysis Mode",
+                choices=["Viral Spot for Shorts (< 3 mins)", "Viral Narrative Clip (5-12 mins)"],
+                value="Viral Narrative Clip (5-12 mins)"
+            )
+            submit_button = gr.Button("🚀 Generate Viral Clip", variant="primary")
+
+        with gr.Column(scale=2):
+            summary_output = gr.Textbox(label="Analysis Summary", lines=12, interactive=False)
+            video_output = gr.Video(label="Generated Clip", interactive=False)
+
+    submit_button.click(
+        fn=generate_viral_clip,
+        inputs=[video_input, srt_input, mode_input],
+        outputs=[summary_output, video_output],
+    )
+
 if __name__ == "__main__":
     demo.launch(debug=True)
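For quick testing outside the Gradio UI, a minimal command-line sketch of the same pipeline (assuming the functions above are importable); "episode.mp4" and "episode.srt" are hypothetical local files, and a valid ONE_API_KEY must be available in the environment for the API call to succeed:

import json
from moviepy.editor import VideoFileClip

with open("episode.srt", "r", encoding="utf-8") as f:
    srt_text = f.read()

# Ask the model for a clip spec, then strip any ```json fence before parsing.
answer = call_gpt4o_oneapi(srt_text, PROMPT_SHORTS_MODE)
spec = json.loads(answer.strip().removeprefix("```json").removesuffix("```"))

with VideoFileClip("episode.mp4") as video:
    clip = video.subclip(spec["final_clip_start_seconds"],
                         min(spec["final_clip_end_seconds"], video.duration))
    clip.write_videofile("viral_clip.mp4", codec="libx264", audio_codec="aac")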