from moviepy.editor import VideoFileClip, concatenate_videoclips
from pydub import AudioSegment
import numpy as np
import torch
from silero_vad import load_silero_vad, get_speech_timestamps
import os
import json
from google import genai
import pandas as pd
import re
import time
from dotenv import load_dotenv

torch.set_num_threads(1)
load_dotenv()
client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))


def set_torch_threads(safe_ratio=0.5):
    """Sets the torch thread count to a fraction of the available CPU cores."""
    try:
        total_cores = os.cpu_count()
        optimal_threads = max(1, int(total_cores * safe_ratio))
        torch.set_num_threads(optimal_threads)
        print(f"Set torch threads to: {optimal_threads} (out of {total_cores} cores)")
    except Exception as e:
        print(f"Failed to set torch threads dynamically: {e}")
        torch.set_num_threads(1)


def analyze_single_video(video_path):
    """Analyzes a single video for emotions using the GenAI model."""
    prompt = """
    Detect emotion from this video and classify into 3 categories: happy, sad, normal.
    Return only JSON format without any extra text.
    Return this JSON schema:
    {
        "Vocal": {
            "sad_score": (%),
            "happy_score": (%),
            "normal_score": (%),
            "sad_reason": (list of timestamps),
            "happy_reason": (list of timestamps),
            "normal_reason": (list of timestamps)
        },
        "Verbal": {
            "sad_score": (%),
            "happy_score": (%),
            "normal_score": (%),
            "sad_reason": (list of timestamps),
            "happy_reason": (list of timestamps),
            "normal_reason": (list of timestamps)
        },
        "Vision": {
            "sad_score": (%),
            "happy_score": (%),
            "normal_score": (%),
            "sad_reason": (list of timestamps),
            "happy_reason": (list of timestamps),
            "normal_reason": (list of timestamps)
        }
    }
    Reasons (sad_reason, happy_reason, normal_reason) should be a list of beginning-ending timestamps.
    For example: ['0:11-0:14', '0:23-0:25', '0:27-0:29']
    """
    try:
        with open(video_path, 'rb') as video_file:
            video_bytes = video_file.read()

        print(f"Processing: {video_path}")
        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=[
                {"text": prompt},
                {"inline_data": {"data": video_bytes, "mime_type": "video/mp4"}}
            ],
            config={"http_options": {"timeout": 60000}}
        )

        # Extract token usage information
        input_token = response.usage_metadata.prompt_token_count
        output_token = response.usage_metadata.candidates_token_count
        total_token = response.usage_metadata.total_token_count

        # Strip an optional ```json ... ``` fence before parsing the response
        response_text = response.text.strip()
        json_match = re.search(r'```json\s*([\s\S]*?)\s*```', response_text)
        json_string = json_match.group(1).strip() if json_match else response_text
        result = json.loads(json_string)

        return (video_path, result, input_token, output_token, total_token)
    except Exception as e:
        print(f"Error processing {video_path}: {e}")
        return (video_path, None, 0, 0, 0)


def wrapper_with_delay(video_path):
    time.sleep(2)  # Add delay to avoid throttling
    return analyze_single_video(video_path)


def process_multiple_videos_from_results(results):
    """Processes results directly without re-analyzing."""
    records = []
    for video_path, result, _, _, _ in results:
        if result is None:
            continue
        video_title = os.path.basename(video_path)
        for category in ['Verbal', 'Vocal', 'Vision']:
            for emotion in ['normal', 'happy', 'sad']:
                score = result[category].get(f"{emotion}_score", 0)
                reasons = result[category].get(f"{emotion}_reason", [])
                records.append({
                    'title': video_title,
                    'category': category,
                    'emotion': emotion,
                    'score': score,
                    'reasons': json.dumps(reasons)
                })
    df = pd.DataFrame(records)
    return df


def getting_video_length(vid):
    """Returns the total duration of a video in seconds."""
    clip = VideoFileClip(vid)
    duration = clip.duration
    clip.close()  # Release the file handle before returning
    return np.round(duration, decimals=2)
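
# --- Usage sketch (not part of the original pipeline) ---
# A minimal example of how the analysis helpers above might be driven for a
# batch of clips. The function name, the list of .mp4 paths, and the thread
# pool size are illustrative assumptions, not values from the original code;
# wrapper_with_delay already sleeps briefly per call to avoid throttling.
def example_analyze_batch(video_paths, max_workers=3):
    """Sketch: analyze several videos concurrently and tabulate emotion scores."""
    from concurrent.futures import ThreadPoolExecutor

    # Each video is sent to the model exactly once; the raw results are reused
    # by the reporting helpers instead of re-analyzing.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(executor.map(wrapper_with_delay, video_paths))

    emotion_df = process_multiple_videos_from_results(results)
    return results, emotion_df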
def get_speech_only_video_duration(video_path: str, sampling_rate: int = 16000, use_onnx: bool = False) -> float:
    """Returns the duration (in seconds) of the speech-only portions of a video."""
    # Load VAD model
    model = load_silero_vad(onnx=use_onnx)

    # Extract audio from the video using pydub (mono, resampled, scaled to [-1, 1] floats)
    audio = AudioSegment.from_file(video_path).set_frame_rate(sampling_rate).set_channels(1)
    samples = np.array(audio.get_array_of_samples()).astype("float32") / (2**15)
    audio_tensor = torch.from_numpy(samples)

    # Get speech timestamps
    speech_timestamps = get_speech_timestamps(audio_tensor, model, sampling_rate=sampling_rate)
    if not speech_timestamps:
        return 0.0  # No speech detected

    # Convert sample indices to seconds
    for ts in speech_timestamps:
        ts['start'] /= sampling_rate
        ts['end'] /= sampling_rate

    # Load the video and extract the speech-only clips
    video = VideoFileClip(video_path)
    clips = [video.subclip(ts['start'], ts['end']) for ts in speech_timestamps]

    # Concatenate and return the combined duration
    final_video = concatenate_videoclips(clips)
    duration = final_video.duration
    video.close()
    return duration


def getting_usage_info_from_results(video_paths, results):
    """Use pre-fetched results to avoid double processing."""
    filenames = np.vectorize(os.path.basename)(video_paths).reshape(-1, 1)
    durations = np.vectorize(getting_video_length)(video_paths).reshape(-1, 1)
    speech_durations = np.vectorize(get_speech_only_video_duration)(video_paths).reshape(-1, 1)

    # Keep one token row per video (zeros for failed analyses) so the row count
    # always matches filenames/durations when the arrays are concatenated.
    token_data = np.array([
        [r[2], r[3], r[4]] if r[1] is not None else [0, 0, 0]
        for r in results
    ], dtype=float)
    if token_data.size == 0:
        token_data = np.zeros((len(video_paths), 3))

    # Per-million-token prices: $0.10 for input tokens, $0.40 for output tokens
    per_million = 1_000_000
    input_token_price = np.round(token_data[:, 0] * 0.10 / per_million, decimals=4).reshape(-1, 1)
    output_token_price = np.round(token_data[:, 1] * 0.40 / per_million, decimals=4).reshape(-1, 1)
    total_token_price = input_token_price + output_token_price

    final_arr = np.concatenate(
        (filenames, durations, speech_durations, token_data,
         input_token_price, output_token_price, total_token_price),
        axis=1
    )
    df = pd.DataFrame(
        final_arr,
        columns=[
            'title', 'total_duration(s)', 'speech_duration(s)',
            'input_token', 'output_token', 'total_token',
            'input_price($)', 'output_price($)', 'total_price($)'
        ]
    )
    return df
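
# --- Usage sketch (not part of the original pipeline) ---
# Shows how the emotion table and the usage/cost table could be produced from
# a single analysis pass. It relies on the hypothetical example_analyze_batch()
# sketch above; the output CSV filenames are illustrative assumptions.
def example_full_report(video_paths):
    """Sketch: run the analysis once, then write both DataFrames to CSV."""
    results, emotion_df = example_analyze_batch(video_paths)
    usage_df = getting_usage_info_from_results(video_paths, results)

    emotion_df.to_csv("emotion_scores.csv", index=False)
    usage_df.to_csv("usage_report.csv", index=False)
    return emotion_df, usage_df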