# NOTE(review): the original paste began with "Spaces:" / "Sleeping" lines —
# Hugging Face Spaces page chrome captured by the scraper, not app source.
# app.py | |
import streamlit as st | |
import pandas as pd | |
import plotly.express as px | |
import subprocess | |
import threading | |
import re | |
from io import StringIO | |
# -----------------------------
# Data Preparation using Provided Data
# -----------------------------
# Benchmark results stored as explicit Python records.  The previous version
# embedded a tab-separated string and parsed it with pd.read_csv(delimiter='\t'),
# which is fragile: any whitespace normalization (tabs flattened to spaces, as
# happened in this paste) silently breaks the parse.  Literal tuples are robust,
# readable, and diffable, and need no rename step afterwards.
_COLUMNS = [
    "Original Audio",       # source language of the clip
    "Target Audio",         # language the clip is dubbed into
    "number of speakers",
    "whisper model",        # whisper checkpoint used (base/small/medium/large-v3)
    "Time required",        # raw processing-time string; parsed to seconds later
    "video length",         # raw clip-length string; parsed to seconds later
    "run",                  # "First Run" (cold) vs "Subsequent Run" (cached)
    "comments",
]

_ROWS = [
    ("french(fr)", "english(en)", 2, "large-v3", "12:20 min", "30s", "Subsequent Run", "Harshad"),
    ("french(fr)", "korean(ko)", 2, "large-v3", "0:13 min", "30s", "Subsequent Run", "Harshad"),
    ("french(fr)", "japanes(ja)", 2, "large-v3", "3:05 min", "30s", "Subsequent Run", "Harshad"),
    ("french(fr)", "chinese(zh)", 2, "large-v3", "9:02 min", "30s", "Subsequent Run", "Harshad"),
    ("french(fr)", "german(de)", 2, "large-v3", "17:48 min", "30s", "First Run", "Harshad"),
    ("french(fr)", "spanish(es)", 2, "large-v3", "0:14 min", "30s", "Subsequent Run", "Harshad"),
    ("french(fr)", "korean(ko)", 2, "large-v3", "2:47 min", "30s", "Subsequent Run", "Harshad"),
    ("french(fr)", "english(en)", 2, "medium", "2:16 min", "30s", "First Run", "om"),
    ("french(fr)", "korean(ko)", 2, "medium", "20s", "30s", "Subsequent Run", "om"),
    ("french(fr)", "japanes(ja)", 2, "medium", "30s", "30s", "Subsequent Run", "om"),
    ("french(fr)", "chinese(zh)", 2, "medium", "17s", "30s", "Subsequent Run", "om"),
    ("french(fr)", "german(de)", 2, "medium", "19s", "30s", "Subsequent Run", "om"),
    ("french(fr)", "spanish(es)", 2, "medium", "12s", "30s", "Subsequent Run", "om"),
    ("french(fr)", "spanish(es)", 2, "small", "2:51 min", "30s", "First Run", "Harshad"),
    ("french(fr)", "korean(ko)", 2, "small", "0:09 min", "30s", "Subsequent Run",
     "Harshad,As per our observation for the 30 seconds video, the change in models is just affecting the processing time and the quality is not much affected"),
    ("french(fr)", "english(en)", 2, "base", "3min 28s", "1 min", "First Run", "om"),
    ("french(fr)", "korean(ko)", 2, "base", "3min 21s", "1 min", "First Run", "om"),
    ("french(fr)", "japanes(ja)", 2, "base", "18s", "1 min", "Subsequent Run", "om"),
    ("french(fr)", "chinese(zh)", 2, "base", "27s", "1 min", "Subsequent Run", "om"),
    ("french(fr)", "german(de)", 2, "base", "25s", "1 min", "Subsequent Run", "om"),
    ("french(fr)", "spanish(es)", 2, "base", "24s", "1 min", "Subsequent Run", "om"),
    ("french(fr)", "korean(ko)", 2, "base", "15s", "1 min", "Subsequent Run", "om"),
    ("french(fr)", "english(en)", 2, "medium", "24s", "1 min", "Subsequent Run",
     "om, It starts from translating step, skipping upto the diarization step in the subsequent run"),
    ("french(fr)", "korean(ko)", 2, "medium", "17s", "1 min", "Subsequent Run", "om"),
    ("french(fr)", "japanes(ja)", 2, "medium", "28s", "1 min", "Subsequent Run", "om"),
    ("french(fr)", "chinese(zh)", 2, "medium", "20s", "1 min", "Subsequent Run", "om"),
    ("french(fr)", "german(de)", 2, "medium", "30s", "1 min", "Subsequent Run", "om"),
    ("french(fr)", "spanish(es)", 2, "medium", "4min 57s", "1 min", "First Run", "om"),
    ("french(fr)", "english(en)", 2, "medium", "3min 31s", "2 min", "First Run", "om"),
    ("french(fr)", "korean(ko)", 2, "medium", "10min 41s", "2 min", "Subsequent Run", "om"),
    ("french(fr)", "japanes(ja)", 2, "medium", "1min 18s", "2 min", "Subsequent Run", "om"),
    ("french(fr)", "chinese(zh)", 2, "medium", "1min 23s", "2 min", "Subsequent Run", "om"),
    ("french(fr)", "german(de)", 2, "medium", "0:40 min", "2 min", "Subsequent Run", "Harshad"),
    ("french(fr)", "spanish(es)", 2, "medium", "10:16 min", "2 min", "First Run", "Harshad"),
    ("french(fr)", "english(en)", 2, "small", "8:53 min", "2 min", "First Run", "Harshad"),
    ("french(fr)", "korean(ko)", 2, "small", "1:09 min", "2 min", "Subsequent Run", "Harshad"),
    ("french(fr)", "japanes(ja)", 2, "small", "1:10 min", "2 min", "Subsequent Run", "Harshad"),
    ("french(fr)", "german(de)", 2, "small", "0:42 min", "2 min", "Subsequent Run", "Harshad"),
    ("french(fr)", "spanish(es)", 2, "small", "0:43 min", "2 min", "Subsequent Run", "Harshad"),
    ("french(fr)", "chinese(zh)", 2, "small", "0:22 min", "2 min", "Subsequent Run", "Harshad"),
    ("french(fr)", "english(en)", 2, "small", "18:42 min", "5 min", "First Run", "Harshad"),
    ("french(fr)", "korean(ko)", 2, "small", "1:33 min", "5 min", "Subsequent Run",
     "Harshad, Saw some error but processing continued"),
    ("french(fr)", "japanes(ja)", 2, "small", "1:50 min", "5 min", "Subsequent Run", "Harshad"),
    ("french(fr)", "german(de)", 2, "small", "2:11 min", "5 min", "Subsequent Run", "Harshad"),
    ("french(fr)", "spanish(es)", 2, "small", "2:06 min", "5 min", "Subsequent Run", "Harshad"),
    ("french(fr)", "chinese(zh)", 2, "small", "1:34 min", "5 min", "Subsequent Run", "Harshad"),
]

# Build the DataFrame directly; the source column was already renamed to
# "Original Audio" in _COLUMNS, so no post-hoc df.rename() is needed.
df = pd.DataFrame(_ROWS, columns=_COLUMNS)
# ----------------------------- | |
# Utility Function: Parse Time Strings | |
# ----------------------------- | |
def parse_time(time_str):
    """
    Convert a human-entered duration string into total seconds.

    Handles the two formats present in the data:
      * colon notation -> '17:48 min' (mm:ss) or '1:02:03' (hh:mm:ss)
      * unit notation  -> '3min 28s', '30s', '1 min', '2 hr'

    Returns 0 for NaN, empty, or unparsable input instead of raising, so a
    column-wide ``.apply()`` never aborts on a stray cell.
    """
    if pd.isna(time_str):
        return 0
    time_str = str(time_str).strip()
    if not time_str:
        return 0

    # Colon notation: keep only digits and ':' then interpret as mm:ss or
    # hh:mm:ss (e.g. '17:48 min' -> '17:48').
    if ":" in time_str:
        clean_str = re.sub(r"[^\d:]", "", time_str)
        try:
            parts = [int(p) for p in clean_str.split(":")]
        except ValueError:
            return 0
        if len(parts) == 2:
            minutes, seconds = parts
            return minutes * 60 + seconds
        if len(parts) == 3:
            hours, minutes, seconds = parts
            return hours * 3600 + minutes * 60 + seconds
        return 0

    # Unit notation.  Search for each component independently.  The previous
    # single regex made every group optional, so re.search() could succeed on
    # the empty string at position 0 (for any input not starting with a digit)
    # and silently return 0; per-component searches find the units wherever
    # they appear in the string.
    hours_m = re.search(r"(\d+)\s*hr", time_str)
    minutes_m = re.search(r"(\d+)\s*min", time_str)
    seconds_m = re.search(r"(\d+)\s*s", time_str)

    total = 0
    if hours_m:
        total += int(hours_m.group(1)) * 3600
    if minutes_m:
        total += int(minutes_m.group(1)) * 60
    if seconds_m:
        total += int(seconds_m.group(1))
    return total
# Derived numeric columns: processing time and clip length, both in seconds.
df["Time_required_seconds"] = df["Time required"].map(parse_time)
df["Video_length_seconds"] = df["video length"].map(parse_time)

# -----------------------------
# Compute Aggregated Insights
# -----------------------------
# Mean processing time per whisper model.
avg_time_by_model = df.groupby("whisper model", as_index=False)["Time_required_seconds"].mean()
# Mean processing time per target language.
avg_time_by_target = df.groupby("Target Audio", as_index=False)["Time_required_seconds"].mean()


def _classify_run(value):
    # Any cell mentioning "First" counts as a cold start; everything else
    # (including NaN, which stringifies to 'nan') is a cached/subsequent run.
    return "First Run" if "First" in str(value) else "Subsequent Run"


df["Run_type"] = df["run"].map(_classify_run)

# Number of runs for each (model, run type) pair.
run_counts = df.groupby(["whisper model", "Run_type"]).size().reset_index(name="count")
# -----------------------------
# Create Plotly Figures
# -----------------------------
# Derived-column names used repeatedly below.
_TIME_COL = "Time_required_seconds"
_LEN_COL = "Video_length_seconds"

# 1. Bar chart: mean processing time for each whisper model.
fig_model = px.bar(
    avg_time_by_model,
    x="whisper model",
    y=_TIME_COL,
    title="Average Processing Time by Whisper Model",
    labels={_TIME_COL: "Avg Time (seconds)", "whisper model": "Whisper Model"},
)

# 2. Bar chart: mean processing time for each target language.
fig_target = px.bar(
    avg_time_by_target,
    x="Target Audio",
    y=_TIME_COL,
    title="Average Processing Time by Target Audio",
    labels={_TIME_COL: "Avg Time (seconds)", "Target Audio": "Target Audio"},
)

# 3. Scatter plot: processing time against clip length, colored per model.
fig_scatter = px.scatter(
    df,
    x=_LEN_COL,
    y=_TIME_COL,
    color="whisper model",
    title="Processing Time vs Video Length",
    labels={_LEN_COL: "Video Length (seconds)", _TIME_COL: "Processing Time (seconds)"},
)

# 4. Grouped bar chart: first-run vs subsequent-run counts per model.
fig_run = px.bar(
    run_counts,
    x="whisper model",
    y="count",
    color="Run_type",
    barmode="group",
    title="Run Counts by Whisper Model",
    labels={"count": "Number of Runs", "whisper model": "Whisper Model", "Run_type": "Run Type"},
)
# Additional Graphs
# Shared column names and axis labels for the charts in this section.
_TCOL = "Time_required_seconds"
_VCOL = "Video_length_seconds"
_TIME_LABEL = "Processing Time (seconds)"
_VIDEO_LABEL = "Video Length (seconds)"

# Histogram of processing times.
fig_hist_time = px.histogram(
    df, x=_TCOL, nbins=10,
    title="Distribution of Processing Times",
    labels={_TCOL: _TIME_LABEL},
)

# Box plot of processing times, one box per whisper model.
fig_box_model = px.box(
    df, x="whisper model", y=_TCOL,
    title="Processing Time Distribution by Whisper Model",
    labels={_TCOL: _TIME_LABEL},
)

# Pie chart: how often each whisper model appears in the data.
model_counts = (
    df["whisper model"]
    .value_counts()
    .rename_axis("whisper model")
    .reset_index(name="count")
)
fig_pie_model = px.pie(
    model_counts, names="whisper model", values="count",
    title="Distribution of Whisper Models",
)

# Histogram of clip lengths.
fig_hist_video = px.histogram(
    df, x=_VCOL, nbins=5,
    title="Distribution of Video Lengths",
    labels={_VCOL: _VIDEO_LABEL},
)

# Scatter plot: processing time vs clip length, colored by run type.
fig_scatter_run = px.scatter(
    df, x=_VCOL, y=_TCOL, color="Run_type",
    title="Processing Time vs Video Length by Run Type",
    labels={_VCOL: _VIDEO_LABEL, _TCOL: _TIME_LABEL},
)

# Correlation heatmap over the two numeric columns only.
corr = df[[_TCOL, _VCOL]].corr()
fig_corr = px.imshow(
    corr, text_auto=True,
    title="Correlation Matrix: Processing Time & Video Length",
    labels=dict(color="Correlation"),
)

# 3D scatter: model x processing time x clip length.
fig_model_time_video = px.scatter_3d(
    df, x="whisper model", y=_TCOL, z=_VCOL, color="whisper model",
    title="Whisper Model vs Processing Time vs Video Length",
    labels={
        "whisper model": "Whisper Model",
        _TCOL: _TIME_LABEL,
        _VCOL: _VIDEO_LABEL,
    },
)
# Enlarge the 3D figure so the axes stay readable.
fig_model_time_video.update_layout(height=1000, width=1200)
# New graph: first-run (cold-start) average processing time per whisper model,
# grouped by the raw video-duration label.
first_run_df = df[df["Run_type"] == "First Run"]
avg_time_first_run = (
    first_run_df
    .groupby(["whisper model", "video length"], as_index=False)["Time_required_seconds"]
    .mean()
)
fig_first_run = px.bar(
    avg_time_first_run,
    x="whisper model",
    y="Time_required_seconds",
    color="video length",
    barmode="group",
    title="First Run Average Processing Time by Whisper Model (Grouped by Video Duration)",
    labels={"Time_required_seconds": "Avg Time (seconds)", "whisper model": "Whisper Model"},
)
# -----------------------------
# Build the Streamlit App Layout
# -----------------------------
st.title("Video Processing Dashboard")
st.markdown("""
This dashboard provides insights into audio processing tasks using different Whisper models.
It displays processing times, target languages, video lengths, run types, and more.
""")

# Raw benchmark table first, charts below.
st.subheader("Raw Data")
st.dataframe(df)

# (heading, figure) pairs, rendered in order as subheader + chart.
_SECTIONS = [
    ("Average Processing Time by Whisper Model", fig_model),
    ("Average Processing Time by Target Audio", fig_target),
    ("Processing Time vs Video Length", fig_scatter),
    ("Run Counts by Whisper Model", fig_run),
    ("Distribution of Processing Times", fig_hist_time),
    ("Processing Time Distribution by Whisper Model (Box Plot)", fig_box_model),
    ("Distribution of Whisper Models", fig_pie_model),
    ("Distribution of Video Lengths", fig_hist_video),
    ("Processing Time vs Video Length by Run Type", fig_scatter_run),
    ("Correlation Matrix: Processing Time & Video Length", fig_corr),
    ("Whisper Model vs Processing Time vs Video Length (3D)", fig_model_time_video),
    ("First Run Avg Processing Time by Whisper Model and Video Duration", fig_first_run),
]
for _heading, _figure in _SECTIONS:
    st.subheader(_heading)
    st.plotly_chart(_figure, use_container_width=True)
# -----------------------------
# Optional: Start LocalTunnel for Public Access
# -----------------------------
def start_localtunnel(port=8501):
    """
    Launch LocalTunnel as a background subprocess and surface its public URL.

    Requires the LocalTunnel CLI (npm install -g localtunnel). The child's
    output is streamed from a daemon thread; the line containing
    "your url is:" is highlighted via st.success(). Any failure to spawn the
    process is reported with st.error() instead of raising.

    :param port: local port to expose (the default Streamlit port).
    """
    try:
        # Merge stderr into stdout: the original code opened two PIPEs but
        # only ever drained stdout, so a chatty stderr could fill its OS pipe
        # buffer and deadlock the child process.
        proc = subprocess.Popen(
            ["lt", "--port", str(port)],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
        )

        def read_tunnel_output(process):
            # Stream lines until the child closes its stdout (EOF).
            for line in iter(process.stdout.readline, ""):
                st.write(line.strip())
                if "your url is:" in line.lower():
                    public_url = line.split("your url is:")[-1].strip()
                    st.success(f"LocalTunnel URL: {public_url}")

        # Daemon thread so a hung tunnel never blocks interpreter shutdown.
        thread = threading.Thread(target=read_tunnel_output, args=(proc,), daemon=True)
        thread.start()
    except Exception as e:
        # Covers FileNotFoundError when the `lt` binary is not installed.
        st.error(f"Error starting LocalTunnel: {e}")
# Uncomment the following line to start LocalTunnel when the app runs.
# start_localtunnel(port=8501)