# video_dashboard / app.py
# NOTE(review): the following lines were non-Python residue from the hosting
# page (author / commit metadata) and broke parsing; kept here as comments.
# mohsinmubaraksk's picture — Update app.py — 2832c16 verified
# app.py
import streamlit as st
import pandas as pd
import plotly.express as px
import subprocess
import threading
import re
from io import StringIO
# -----------------------------
# Data Preparation using Provided Data
# -----------------------------
data_str = """
Original (Source) Audio Target Audio number of speakers whisper model Time required video length run comments
french(fr) english(en) 2 large-v3 12:20 min 30s Subsequent Run Harshad
french(fr) korean(ko) 2 large-v3 0:13 min 30s Subsequent Run Harshad
french(fr) japanes(ja) 2 large-v3 3:05 min 30s Subsequent Run Harshad
french(fr) chinese(zh) 2 large-v3 9:02 min 30s Subsequent Run Harshad
french(fr) german(de) 2 large-v3 17:48 min 30s First Run Harshad
french(fr) spanish(es) 2 large-v3 0:14 min 30s Subsequent Run Harshad
french(fr) korean(ko) 2 large-v3 2:47 min 30s Subsequent Run Harshad
french(fr) english(en) 2 medium 2:16 min 30s First Run om
french(fr) korean(ko) 2 medium 20s 30s Subsequent Run om
french(fr) japanes(ja) 2 medium 30s 30s Subsequent Run om
french(fr) chinese(zh) 2 medium 17s 30s Subsequent Run om
french(fr) german(de) 2 medium 19s 30s Subsequent Run om
french(fr) spanish(es) 2 medium 12s 30s Subsequent Run om
french(fr) spanish(es) 2 small 2:51 min 30s First Run Harshad
french(fr) korean(ko) 2 small 0:09 min 30s Subsequent Run "Harshad,As per our observation for the 30 seconds video, the change in models is just affecting the processing time and the quality is not much affected"
french(fr) english(en) 2 base 3min 28s 1 min First Run om
french(fr) korean(ko) 2 base 3min 21s 1 min First Run om
french(fr) japanes(ja) 2 base 18s 1 min Subsequent Run om
french(fr) chinese(zh) 2 base 27s 1 min Subsequent Run om
french(fr) german(de) 2 base 25s 1 min Subsequent Run om
french(fr) spanish(es) 2 base 24s 1 min Subsequent Run om
french(fr) korean(ko) 2 base 15s 1 min Subsequent Run om
french(fr) english(en) 2 medium 24s 1 min Subsequent Run "om, It starts from translating step, skipping upto the diarization step in the subsequent run"
french(fr) korean(ko) 2 medium 17s 1 min Subsequent Run om
french(fr) japanes(ja) 2 medium 28s 1 min Subsequent Run om
french(fr) chinese(zh) 2 medium 20s 1 min Subsequent Run om
french(fr) german(de) 2 medium 30s 1 min Subsequent Run om
french(fr) spanish(es) 2 medium 4min 57s 1 min First Run om
french(fr) english(en) 2 medium 3min 31s 2 min First Run om
french(fr) korean(ko) 2 medium 10min 41s 2 min Subsequent Run om
french(fr) japanes(ja) 2 medium 1min 18s 2 min Subsequent Run om
french(fr) chinese(zh) 2 medium 1min 23s 2 min Subsequent Run om
french(fr) german(de) 2 medium 0:40 min 2 min Subsequent Run Harshad
french(fr) spanish(es) 2 medium 10:16 min 2 min First Run Harshad
french(fr) english(en) 2 small 8:53 min 2 min First Run Harshad
french(fr) korean(ko) 2 small 1:09 min 2 min Subsequent Run Harshad
french(fr) japanes(ja) 2 small 1:10 min 2 min Subsequent Run Harshad
french(fr) german(de) 2 small 0:42 min 2 min Subsequent Run Harshad
french(fr) spanish(es) 2 small 0:43 min 2 min Subsequent Run Harshad
french(fr) chinese(zh) 2 small 0:22 min 2 min Subsequent Run Harshad
french(fr) english(en) 2 small 18:42 min 5 min First Run Harshad
french(fr) korean(ko) 2 small 1:33 min 5 min Subsequent Run "Harshad, Saw some error but processing continued"
french(fr) japanes(ja) 2 small 1:50 min 5 min Subsequent Run Harshad
french(fr) german(de) 2 small 2:11 min 5 min Subsequent Run Harshad
french(fr) spanish(es) 2 small 2:06 min 5 min Subsequent Run Harshad
french(fr) chinese(zh) 2 small 1:34 min 5 min Subsequent Run Harshad
"""
# Load the tab-separated benchmark table into a DataFrame, then normalise
# the source-language column name for consistency with the rest of the app.
df = pd.read_csv(StringIO(data_str), delimiter='\t')
df.rename(columns={"Original (Source) Audio": "Original Audio"}, inplace=True)
# -----------------------------
# Utility Function: Parse Time Strings
# -----------------------------
def parse_time(time_str):
    """
    Convert a duration string into total seconds.

    Accepted shapes (case-insensitive):
      * colon form:  '12:20 min' (mm:ss) or '1:02:03' (hh:mm:ss)
      * unit form:   '3min 28s', '30s', '1 min', '2 hr 5 min'
      * bare number: '45' (interpreted as seconds)

    Parameters
    ----------
    time_str : str or NaN
        Raw duration text from the spreadsheet.

    Returns
    -------
    int
        Total seconds; 0 for NaN, empty, or unparseable input (never None).
    """
    if pd.isna(time_str):
        return 0
    time_str = str(time_str).strip()
    if not time_str:
        return 0
    # Colon form: strip every non-digit/non-colon char (e.g. a trailing
    # ' min') and read as mm:ss or hh:mm:ss.
    if ":" in time_str:
        clean_str = re.sub(r"[^\d:]", "", time_str)
        try:
            parts = [int(p) for p in clean_str.split(":")]
        except ValueError:
            return 0
        if len(parts) == 2:
            minutes, seconds = parts
            return minutes * 60 + seconds
        if len(parts) == 3:
            hours, minutes, seconds = parts
            return hours * 3600 + minutes * 60 + seconds
        # Fix: an unexpected part count previously fell through with no
        # explicit result; report it as 0 like other malformed input.
        return 0
    # Unit form: optional hour / minute / second components. Matching is
    # case-insensitive and tolerates plural / long unit spellings.
    pattern = (
        r'(?:(?P<hours>\d+)\s*(?:hr|hour)s?)?\s*'
        r'(?:(?P<minutes>\d+)\s*min(?:ute)?s?)?\s*'
        r'(?:(?P<seconds>\d+)\s*s(?:ec(?:ond)?s?)?)?'
    )
    match = re.search(pattern, time_str, flags=re.IGNORECASE)
    if match:
        hours = int(match.group("hours") or 0)
        minutes = int(match.group("minutes") or 0)
        seconds = int(match.group("seconds") or 0)
        if hours or minutes or seconds:
            return hours * 3600 + minutes * 60 + seconds
    # Generalisation: a bare integer with no unit is treated as seconds.
    if time_str.isdigit():
        return int(time_str)
    return 0
# Add numeric helper columns: both durations expressed in whole seconds.
for source_col, numeric_col in (
    ("Time required", "Time_required_seconds"),
    ("video length", "Video_length_seconds"),
):
    df[numeric_col] = df[source_col].apply(parse_time)
# -----------------------------
# Compute Aggregated Insights
# -----------------------------
# Mean processing time for each whisper model.
avg_time_by_model = (
    df.groupby("whisper model", as_index=False)["Time_required_seconds"].mean()
)
# Mean processing time for each target language.
avg_time_by_target = (
    df.groupby("Target Audio", as_index=False)["Time_required_seconds"].mean()
)
# Classify each row as a first or subsequent run from the "run" text.
df["Run_type"] = df["run"].map(
    lambda value: "First Run" if "First" in str(value) else "Subsequent Run"
)
# Number of runs per (whisper model, run type) combination.
run_counts = (
    df.groupby(["whisper model", "Run_type"])
    .size()
    .reset_index(name="count")
)
# -----------------------------
# Create Plotly Figures
# -----------------------------

# 1. Bar chart: average processing time for each whisper model.
fig_model = px.bar(
    data_frame=avg_time_by_model,
    x="whisper model",
    y="Time_required_seconds",
    labels={
        "whisper model": "Whisper Model",
        "Time_required_seconds": "Avg Time (seconds)",
    },
    title="Average Processing Time by Whisper Model",
)

# 2. Bar chart: average processing time for each target language.
fig_target = px.bar(
    data_frame=avg_time_by_target,
    x="Target Audio",
    y="Time_required_seconds",
    labels={
        "Target Audio": "Target Audio",
        "Time_required_seconds": "Avg Time (seconds)",
    },
    title="Average Processing Time by Target Audio",
)

# 3. Scatter: how processing time scales with clip length, coloured by model.
fig_scatter = px.scatter(
    data_frame=df,
    x="Video_length_seconds",
    y="Time_required_seconds",
    color="whisper model",
    labels={
        "Video_length_seconds": "Video Length (seconds)",
        "Time_required_seconds": "Processing Time (seconds)",
    },
    title="Processing Time vs Video Length",
)

# 4. Grouped bars: number of first vs subsequent runs per model.
fig_run = px.bar(
    data_frame=run_counts,
    x="whisper model",
    y="count",
    color="Run_type",
    barmode="group",
    labels={
        "whisper model": "Whisper Model",
        "count": "Number of Runs",
        "Run_type": "Run Type",
    },
    title="Run Counts by Whisper Model",
)
# Additional Graphs

# Histogram: spread of processing times across all runs.
fig_hist_time = px.histogram(
    data_frame=df,
    x="Time_required_seconds",
    nbins=10,
    labels={"Time_required_seconds": "Processing Time (seconds)"},
    title="Distribution of Processing Times",
)

# Box plot: processing-time distribution per whisper model.
fig_box_model = px.box(
    data_frame=df,
    x="whisper model",
    y="Time_required_seconds",
    labels={"Time_required_seconds": "Processing Time (seconds)"},
    title="Processing Time Distribution by Whisper Model",
)

# Pie chart: how often each whisper model appears in the data.
model_counts = df["whisper model"].value_counts().reset_index()
# Explicit rename keeps the column names stable across pandas versions.
model_counts.columns = ["whisper model", "count"]
fig_pie_model = px.pie(
    data_frame=model_counts,
    names="whisper model",
    values="count",
    title="Distribution of Whisper Models",
)

# Histogram: spread of the tested video lengths.
fig_hist_video = px.histogram(
    data_frame=df,
    x="Video_length_seconds",
    nbins=5,
    labels={"Video_length_seconds": "Video Length (seconds)"},
    title="Distribution of Video Lengths",
)

# Scatter: processing time vs clip length, coloured by run type.
fig_scatter_run = px.scatter(
    data_frame=df,
    x="Video_length_seconds",
    y="Time_required_seconds",
    color="Run_type",
    labels={
        "Video_length_seconds": "Video Length (seconds)",
        "Time_required_seconds": "Processing Time (seconds)",
    },
    title="Processing Time vs Video Length by Run Type",
)
# Heatmap of the correlation between the two numeric columns.
fig_corr = px.imshow(
    df[["Time_required_seconds", "Video_length_seconds"]].corr(),
    text_auto=True,
    labels=dict(color="Correlation"),
    title="Correlation Matrix: Processing Time & Video Length",
)

# 3D scatter: whisper model vs processing time vs video length.
fig_model_time_video = px.scatter_3d(
    data_frame=df,
    x="whisper model",
    y="Time_required_seconds",
    z="Video_length_seconds",
    color="whisper model",
    labels={
        "whisper model": "Whisper Model",
        "Time_required_seconds": "Processing Time (seconds)",
        "Video_length_seconds": "Video Length (seconds)",
    },
    title="Whisper Model vs Processing Time vs Video Length",
)
# Enlarge the 3D chart so the axes stay readable.
fig_model_time_video.update_layout(width=1200, height=1000)
# New Graph:
# First-run rows only: mean processing time per model, grouped by duration.
first_run_df = df[df["Run_type"] == "First Run"]
avg_time_first_run = (
    first_run_df
    .groupby(["whisper model", "video length"], as_index=False)
    ["Time_required_seconds"]
    .mean()
)
fig_first_run = px.bar(
    data_frame=avg_time_first_run,
    x="whisper model",
    y="Time_required_seconds",
    color="video length",
    barmode="group",
    labels={
        "whisper model": "Whisper Model",
        "Time_required_seconds": "Avg Time (seconds)",
    },
    title="First Run Average Processing Time by Whisper Model (Grouped by Video Duration)",
)
# -----------------------------
# Build the Streamlit App Layout
# -----------------------------
st.title("Video Processing Dashboard")
st.markdown("""
This dashboard provides insights into audio processing tasks using different Whisper models.
It displays processing times, target languages, video lengths, run types, and more.
""")

# Raw data first, then every chart section in a fixed order.
st.subheader("Raw Data")
st.dataframe(df)

_sections = [
    ("Average Processing Time by Whisper Model", fig_model),
    ("Average Processing Time by Target Audio", fig_target),
    ("Processing Time vs Video Length", fig_scatter),
    ("Run Counts by Whisper Model", fig_run),
    ("Distribution of Processing Times", fig_hist_time),
    ("Processing Time Distribution by Whisper Model (Box Plot)", fig_box_model),
    ("Distribution of Whisper Models", fig_pie_model),
    ("Distribution of Video Lengths", fig_hist_video),
    ("Processing Time vs Video Length by Run Type", fig_scatter_run),
    ("Correlation Matrix: Processing Time & Video Length", fig_corr),
    ("Whisper Model vs Processing Time vs Video Length (3D)", fig_model_time_video),
    ("First Run Avg Processing Time by Whisper Model and Video Duration", fig_first_run),
]
for _section_title, _section_fig in _sections:
    st.subheader(_section_title)
    st.plotly_chart(_section_fig, use_container_width=True)
# -----------------------------
# Optional: Start LocalTunnel for Public Access
# -----------------------------
def start_localtunnel(port=8501):
    """
    Launch LocalTunnel via the command line and surface its public URL.

    Requires the `lt` CLI (npm install -g localtunnel). The process output
    is consumed on a daemon thread; when a line containing "your url is:"
    appears, the URL is highlighted via st.success().

    Parameters
    ----------
    port : int, optional
        Local port to expose (default 8501, Streamlit's default).
    """
    try:
        proc = subprocess.Popen(
            ["lt", "--port", str(port)],
            stdout=subprocess.PIPE,
            # Fix: stderr previously went to its own PIPE that was never
            # read, so a chatty lt process could fill the OS pipe buffer
            # and stall. Merging it into stdout lets the single reader
            # thread drain both streams.
            stderr=subprocess.STDOUT,
            text=True,
        )

        def read_tunnel_output(process):
            # Stream output line-by-line until EOF (process exit).
            while True:
                line = process.stdout.readline()
                if not line:
                    break
                st.write(line.strip())
                if "your url is:" in line.lower():
                    public_url = line.split("your url is:")[-1].strip()
                    st.success(f"LocalTunnel URL: {public_url}")

        # Daemon thread: never blocks interpreter shutdown.
        # NOTE(review): calling st.* from a background thread may log
        # "missing ScriptRunContext" warnings on newer Streamlit — confirm.
        thread = threading.Thread(target=read_tunnel_output, args=(proc,), daemon=True)
        thread.start()
    except Exception as e:
        st.error(f"Error starting LocalTunnel: {e}")
# Uncomment the following line to start LocalTunnel when the app runs.
# start_localtunnel(port=8501)