# app.py
import streamlit as st
import pandas as pd
import plotly.express as px
import subprocess
import threading
import re
from io import StringIO

# -----------------------------
# Data Preparation using Provided Data
# -----------------------------
# NOTE(review): the original embedded the benchmark table as a tab-delimited
# string; the tabs were fragile (lost in copy/paste), so the same records are
# kept here as explicit Python literals producing an identical DataFrame.
# Each record: (source audio, target audio, number of speakers, whisper model,
# processing time, video length, run type, comments).
_COLUMNS = [
    "Original (Source) Audio", "Target Audio", "number of speakers",
    "whisper model", "Time required", "video length", "run", "comments",
]
_ROWS = [
    ("french(fr)", "english(en)", 2, "large-v3", "12:20 min", "30s", "Subsequent Run", "Harshad"),
    ("french(fr)", "korean(ko)", 2, "large-v3", "0:13 min", "30s", "Subsequent Run", "Harshad"),
    ("french(fr)", "japanes(ja)", 2, "large-v3", "3:05 min", "30s", "Subsequent Run", "Harshad"),
    ("french(fr)", "chinese(zh)", 2, "large-v3", "9:02 min", "30s", "Subsequent Run", "Harshad"),
    ("french(fr)", "german(de)", 2, "large-v3", "17:48 min", "30s", "First Run", "Harshad"),
    ("french(fr)", "spanish(es)", 2, "large-v3", "0:14 min", "30s", "Subsequent Run", "Harshad"),
    ("french(fr)", "korean(ko)", 2, "large-v3", "2:47 min", "30s", "Subsequent Run", "Harshad"),
    ("french(fr)", "english(en)", 2, "medium", "2:16 min", "30s", "First Run", "om"),
    ("french(fr)", "korean(ko)", 2, "medium", "20s", "30s", "Subsequent Run", "om"),
    ("french(fr)", "japanes(ja)", 2, "medium", "30s", "30s", "Subsequent Run", "om"),
    ("french(fr)", "chinese(zh)", 2, "medium", "17s", "30s", "Subsequent Run", "om"),
    ("french(fr)", "german(de)", 2, "medium", "19s", "30s", "Subsequent Run", "om"),
    ("french(fr)", "spanish(es)", 2, "medium", "12s", "30s", "Subsequent Run", "om"),
    ("french(fr)", "spanish(es)", 2, "small", "2:51 min", "30s", "First Run", "Harshad"),
    ("french(fr)", "korean(ko)", 2, "small", "0:09 min", "30s", "Subsequent Run",
     "Harshad,As per our observation for the 30 seconds video, the change in models is just "
     "affecting the processing time and the quality is not much affected"),
    ("french(fr)", "english(en)", 2, "base", "3min 28s", "1 min", "First Run", "om"),
    ("french(fr)", "korean(ko)", 2, "base", "3min 21s", "1 min", "First Run", "om"),
    ("french(fr)", "japanes(ja)", 2, "base", "18s", "1 min", "Subsequent Run", "om"),
    ("french(fr)", "chinese(zh)", 2, "base", "27s", "1 min", "Subsequent Run", "om"),
    ("french(fr)", "german(de)", 2, "base", "25s", "1 min", "Subsequent Run", "om"),
    ("french(fr)", "spanish(es)", 2, "base", "24s", "1 min", "Subsequent Run", "om"),
    ("french(fr)", "korean(ko)", 2, "base", "15s", "1 min", "Subsequent Run", "om"),
    ("french(fr)", "english(en)", 2, "medium", "24s", "1 min", "Subsequent Run",
     "om, It starts from translating step, skipping upto the diarization step in the subsequent run"),
    ("french(fr)", "korean(ko)", 2, "medium", "17s", "1 min", "Subsequent Run", "om"),
    ("french(fr)", "japanes(ja)", 2, "medium", "28s", "1 min", "Subsequent Run", "om"),
    ("french(fr)", "chinese(zh)", 2, "medium", "20s", "1 min", "Subsequent Run", "om"),
    ("french(fr)", "german(de)", 2, "medium", "30s", "1 min", "Subsequent Run", "om"),
    ("french(fr)", "spanish(es)", 2, "medium", "4min 57s", "1 min", "First Run", "om"),
    ("french(fr)", "english(en)", 2, "medium", "3min 31s", "2 min", "First Run", "om"),
    ("french(fr)", "korean(ko)", 2, "medium", "10min 41s", "2 min", "Subsequent Run", "om"),
    ("french(fr)", "japanes(ja)", 2, "medium", "1min 18s", "2 min", "Subsequent Run", "om"),
    ("french(fr)", "chinese(zh)", 2, "medium", "1min 23s", "2 min", "Subsequent Run", "om"),
    ("french(fr)", "german(de)", 2, "medium", "0:40 min", "2 min", "Subsequent Run", "Harshad"),
    ("french(fr)", "spanish(es)", 2, "medium", "10:16 min", "2 min", "First Run", "Harshad"),
    ("french(fr)", "english(en)", 2, "small", "8:53 min", "2 min", "First Run", "Harshad"),
    ("french(fr)", "korean(ko)", 2, "small", "1:09 min", "2 min", "Subsequent Run", "Harshad"),
    ("french(fr)", "japanes(ja)", 2, "small", "1:10 min", "2 min", "Subsequent Run", "Harshad"),
    ("french(fr)", "german(de)", 2, "small", "0:42 min", "2 min", "Subsequent Run", "Harshad"),
    ("french(fr)", "spanish(es)", 2, "small", "0:43 min", "2 min", "Subsequent Run", "Harshad"),
    ("french(fr)", "chinese(zh)", 2, "small", "0:22 min", "2 min", "Subsequent Run", "Harshad"),
    ("french(fr)", "english(en)", 2, "small", "18:42 min", "5 min", "First Run", "Harshad"),
    ("french(fr)", "korean(ko)", 2, "small", "1:33 min", "5 min", "Subsequent Run",
     "Harshad, Saw some error but processing continued"),
    ("french(fr)", "japanes(ja)", 2, "small", "1:50 min", "5 min", "Subsequent Run", "Harshad"),
    ("french(fr)", "german(de)", 2, "small", "2:11 min", "5 min", "Subsequent Run", "Harshad"),
    ("french(fr)", "spanish(es)", 2, "small", "2:06 min", "5 min", "Subsequent Run", "Harshad"),
    ("french(fr)", "chinese(zh)", 2, "small", "1:34 min", "5 min", "Subsequent Run", "Harshad"),
]

df = pd.DataFrame(_ROWS, columns=_COLUMNS)

# Rename "Original (Source) Audio" to "Original Audio" for consistency
df = df.rename(columns={"Original (Source) Audio": "Original Audio"})


# -----------------------------
# Utility Function: Parse Time Strings
# -----------------------------
def parse_time(time_str):
    """
    Convert a time string (e.g., '17:48 min', '3min 28s', '30s', or '1 min')
    into total seconds.

    Returns 0 for missing (NaN), empty, or unparseable values.
    """
    if pd.isna(time_str):
        return 0
    time_str = str(time_str).strip()
    if not time_str:
        return 0
    # If the string contains a colon, assume mm:ss or hh:mm:ss format.
    if ":" in time_str:
        clean_str = re.sub(r"[^\d:]", "", time_str)
        parts = clean_str.split(":")
        try:
            parts = [int(p) for p in parts]
        except ValueError:
            return 0
        if len(parts) == 2:
            minutes, seconds = parts
            return minutes * 60 + seconds
        if len(parts) == 3:
            hours, minutes, seconds = parts
            return hours * 3600 + minutes * 60 + seconds
        # BUG FIX: previously fell through and implicitly returned None for
        # colon strings that did not split into 2 or 3 parts.
        return 0
    # Use regex to capture hours, minutes, seconds if available.
    # BUG FIX: the named groups had lost their names ("(?P\d+)"), which is
    # invalid regex syntax and would raise re.error at match time.
    pattern = (
        r'(?:(?P<hours>\d+)\s*hr)?\s*'
        r'(?:(?P<minutes>\d+)\s*min)?\s*'
        r'(?:(?P<seconds>\d+)\s*s)?'
    )
    match = re.search(pattern, time_str)
    if match:
        hours = int(match.group("hours")) if match.group("hours") else 0
        minutes = int(match.group("minutes")) if match.group("minutes") else 0
        seconds = int(match.group("seconds")) if match.group("seconds") else 0
        return hours * 3600 + minutes * 60 + seconds
    return 0


# Create new numeric columns (processing times in seconds)
df["Time_required_seconds"] = df["Time required"].apply(parse_time)
df["Video_length_seconds"] = df["video length"].apply(parse_time)

# -----------------------------
# Compute Aggregated Insights
# -----------------------------
# Overall average processing time by whisper model
avg_time_by_model = df.groupby("whisper model")["Time_required_seconds"].mean().reset_index()

# Average processing time by target audio
avg_time_by_target = df.groupby("Target Audio")["Time_required_seconds"].mean().reset_index()

# Mark run type based on the "run" column (if "First" appears then it's First Run)
df["Run_type"] = df["run"].apply(lambda x: "First Run" if "First" in str(x) else "Subsequent Run")

# Run counts by whisper model and run type
run_counts = df.groupby(["whisper model", "Run_type"]).size().reset_index(name="count")
# -----------------------------
# Create Plotly Figures
# -----------------------------
# Axis/legend labels shared by several figures below.
_LBL_AVG = "Avg Time (seconds)"
_LBL_TIME = "Processing Time (seconds)"
_LBL_VIDEO = "Video Length (seconds)"
_LBL_MODEL = "Whisper Model"

# 1. Bar chart: Average Processing Time by Whisper Model
fig_model = px.bar(
    avg_time_by_model,
    x="whisper model",
    y="Time_required_seconds",
    title="Average Processing Time by Whisper Model",
    labels={"Time_required_seconds": _LBL_AVG, "whisper model": _LBL_MODEL},
)

# 2. Bar chart: Average Processing Time by Target Audio
fig_target = px.bar(
    avg_time_by_target,
    x="Target Audio",
    y="Time_required_seconds",
    title="Average Processing Time by Target Audio",
    labels={"Time_required_seconds": _LBL_AVG, "Target Audio": "Target Audio"},
)

# 3. Scatter Plot: Processing Time vs Video Length
fig_scatter = px.scatter(
    df,
    x="Video_length_seconds",
    y="Time_required_seconds",
    color="whisper model",
    title="Processing Time vs Video Length",
    labels={"Video_length_seconds": _LBL_VIDEO, "Time_required_seconds": _LBL_TIME},
)

# 4. Bar chart: Run Counts by Whisper Model and Run Type
fig_run = px.bar(
    run_counts,
    x="whisper model",
    y="count",
    color="Run_type",
    barmode="group",
    title="Run Counts by Whisper Model",
    labels={"count": "Number of Runs", "whisper model": _LBL_MODEL, "Run_type": "Run Type"},
)

# Additional Graphs

# Histogram of Processing Times
fig_hist_time = px.histogram(
    df,
    x="Time_required_seconds",
    nbins=10,
    title="Distribution of Processing Times",
    labels={"Time_required_seconds": _LBL_TIME},
)

# Box Plot of Processing Times by Whisper Model
fig_box_model = px.box(
    df,
    x="whisper model",
    y="Time_required_seconds",
    title="Processing Time Distribution by Whisper Model",
    labels={"Time_required_seconds": _LBL_TIME},
)

# Pie Chart for Distribution of Whisper Models
model_counts = (
    df["whisper model"]
    .value_counts()
    .rename_axis("whisper model")
    .reset_index(name="count")
)
fig_pie_model = px.pie(
    model_counts,
    names="whisper model",
    values="count",
    title="Distribution of Whisper Models",
)

# Histogram of Video Lengths
fig_hist_video = px.histogram(
    df,
    x="Video_length_seconds",
    nbins=5,
    title="Distribution of Video Lengths",
    labels={"Video_length_seconds": _LBL_VIDEO},
)

# Scatter Plot: Processing Time vs Video Length by Run Type
fig_scatter_run = px.scatter(
    df,
    x="Video_length_seconds",
    y="Time_required_seconds",
    color="Run_type",
    title="Processing Time vs Video Length by Run Type",
    labels={"Video_length_seconds": _LBL_VIDEO, "Time_required_seconds": _LBL_TIME},
)

# Correlation Heatmap (using only numeric columns)
corr = df[["Time_required_seconds", "Video_length_seconds"]].corr()
fig_corr = px.imshow(
    corr,
    text_auto=True,
    title="Correlation Matrix: Processing Time & Video Length",
    labels={"color": "Correlation"},
)

# 3D Scatter Plot: Whisper Model vs Processing Time vs Video Length
fig_model_time_video = px.scatter_3d(
    df,
    x="whisper model",
    y="Time_required_seconds",
    z="Video_length_seconds",
    color="whisper model",
    title="Whisper Model vs Processing Time vs Video Length",
    labels={
        "whisper model": _LBL_MODEL,
        "Time_required_seconds": _LBL_TIME,
        "Video_length_seconds": _LBL_VIDEO,
    },
)
# Enlarge the 3D graph layout
fig_model_time_video.update_layout(height=1000, width=1200)

# New Graph:
# First Run Average Processing Time by Whisper Model Grouped by Video Duration
first_run_df = df.query('Run_type == "First Run"')
avg_time_first_run = (
    first_run_df
    .groupby(["whisper model", "video length"])["Time_required_seconds"]
    .mean()
    .reset_index()
)
fig_first_run = px.bar(
    avg_time_first_run,
    x="whisper model",
    y="Time_required_seconds",
    color="video length",
    barmode="group",
    title="First Run Average Processing Time by Whisper Model (Grouped by Video Duration)",
    labels={"Time_required_seconds": _LBL_AVG, "whisper model": _LBL_MODEL},
)

# -----------------------------
# Build the Streamlit App Layout
# -----------------------------
st.title("Video Processing Dashboard")
st.markdown("""
This dashboard provides insights into audio processing tasks using
different Whisper models. It displays processing times, target languages,
video lengths, run types, and more.
""")
# Display the raw data
st.subheader("Raw Data")
st.dataframe(df)

# Render every figure under its section heading; the order matches the
# original hand-written sequence of st.subheader/st.plotly_chart calls.
_SECTIONS = [
    ("Average Processing Time by Whisper Model", fig_model),
    ("Average Processing Time by Target Audio", fig_target),
    ("Processing Time vs Video Length", fig_scatter),
    ("Run Counts by Whisper Model", fig_run),
    ("Distribution of Processing Times", fig_hist_time),
    ("Processing Time Distribution by Whisper Model (Box Plot)", fig_box_model),
    ("Distribution of Whisper Models", fig_pie_model),
    ("Distribution of Video Lengths", fig_hist_video),
    ("Processing Time vs Video Length by Run Type", fig_scatter_run),
    ("Correlation Matrix: Processing Time & Video Length", fig_corr),
    ("Whisper Model vs Processing Time vs Video Length (3D)", fig_model_time_video),
    ("First Run Avg Processing Time by Whisper Model and Video Duration", fig_first_run),
]
for _heading, _figure in _SECTIONS:
    st.subheader(_heading)
    st.plotly_chart(_figure, use_container_width=True)


# -----------------------------
# Optional: Start LocalTunnel for Public Access
# -----------------------------
def start_localtunnel(port=8501):
    """
    Launch LocalTunnel via the command line and surface its public URL.

    (Make sure you have it installed: npm install -g localtunnel)

    Args:
        port: Local port to expose (defaults to Streamlit's 8501).

    Errors while spawning the process are reported via st.error rather
    than raised, so the dashboard keeps rendering.
    """
    try:
        # BUG FIX: stderr used to be a separate PIPE that was never read,
        # which can deadlock the child once the OS pipe buffer fills.
        # Merging it into stdout keeps every line flowing through one reader.
        proc = subprocess.Popen(
            ["lt", "--port", str(port)],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
        )

        def read_tunnel_output(process):
            # Stream the tunnel's output until EOF (process exit).
            # NOTE(review): st.* calls from a background thread lack a
            # Streamlit script-run context and may be dropped — confirm
            # against the Streamlit version in use.
            for line in process.stdout:
                st.write(line.strip())
                if "your url is:" in line.lower():
                    public_url = line.split("your url is:")[-1].strip()
                    st.success(f"LocalTunnel URL: {public_url}")

        thread = threading.Thread(target=read_tunnel_output, args=(proc,), daemon=True)
        thread.start()
    except Exception as e:
        st.error(f"Error starting LocalTunnel: {e}")


# Uncomment the following line to start LocalTunnel when the app runs.
# start_localtunnel(port=8501)