"""Streamlit leaderboard dashboard for the OpenElla and MiniMaid model series.

The script builds a small hard-coded benchmark table, lets the user filter and
sort it from the sidebar, and renders four tabs: a ranking table with bar
charts, comparison charts (radar / scatter / heatmap), per-model cards, and a
description of the metrics and methodology.
"""

import numpy as np  # kept from the original import list (unused here)
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st
from plotly.subplots import make_subplots  # kept from the original import list (unused here)

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

# Score columns, in the order used by the sidebar selector, radar and heatmap.
SCORE_METRICS = ["Length Score", "Character Consistency", "Immersion", "Overall Score"]

# One colour per model family; third-party families share a neutral grey.
FAMILY_COLORS = {
    "OpenElla": "#4e8df5",
    "MiniMaid": "#f5854e",
    "DeepSeek": "#666666",
    "Dolphin": "#666666",
    "Hermes": "#666666",
}

# Shown instead of charts when the sidebar selection leaves no rows.
NO_MODELS_MESSAGE = "Select at least one model in the sidebar to see results."

INTRO_TEXT = """

This interactive dashboard showcases the performance of OpenElla and MiniMaid model series on roleplay benchmarks. Explore different metrics, compare models, and discover performance insights.

"""

# Hand-written cards for the highlighted models, keyed by model name.
# NOTE(review): the parameter counts quoted in these cards do not match the
# "Parameters (B)" column of the data table below (OpenElla: "3B" vs 2.0,
# MiniMaid-L2: "1B" vs 1.5, MiniMaid-L3: "1B" vs 2.5), and the OpenElla card
# ("Llama 3.2", "3B") disagrees with the About text ("Llama 3", "2B").
# Confirm the real figures and align all three places.
MODEL_CARDS = {
    "OpenElla-Llama-3-2B": """

OpenElla-Llama-3-2B

OpenElla

3B Parameters

Released: February 2024

OpenElla-Llama-3-2B is optimized for roleplay with excellent character consistency and good immersion capabilities. Built on the Llama 3.2 architecture, this model delivers impressively balanced performance despite its compact 3B parameter size.

Overall Score

0.83

Character Consistency

0.83

Immersion

0.67

""",
    "MiniMaid-L1": """

MiniMaid-L1

MiniMaid

1B Parameters

Released: January 2024

MiniMaid-L1 is the first generation of the MiniMaid series, designed for maximum speed and efficiency. With only 1B parameters, it's optimized for low-resource environments while still maintaining good length handling capabilities.

Overall Score

0.51

Character Consistency

0.50

Speed

320 t/s

""",
    "MiniMaid-L2": """

MiniMaid-L2

MiniMaid

1B Parameters

Released: March 2024

MiniMaid-L2 represents a significant improvement over L1, with enhanced immersion capabilities and better overall roleplay performance. The model retains excellent efficiency while delivering more engaging and consistent character portrayals.

Overall Score

0.71

Immersion

0.60

Speed

280 t/s

""",
    "MiniMaid-L3": """

MiniMaid-L3

MiniMaid

1B Parameters

Released: April 2024

MiniMaid-L3 is the latest and most advanced model in the MiniMaid series. With 1B parameters, it achieves the highest immersion score of all models while maintaining excellent length handling. This model represents the pinnacle of the MiniMaid series' development.

Overall Score

0.76

Immersion

0.73

Length Score

1.00

""",
}

# Template for the auto-generated cards of every non-highlighted model.
OTHER_MODEL_CARD = """

{model}

{family}

{parameters}B

{description}

Overall Score: {overall:.2f}

"""

ABOUT_METRICS = """

Understanding the Metrics

Length Score: Measures the model's ability to generate appropriately lengthy responses without being too verbose or too brief.

Character Consistency: Evaluates how well the model maintains character personality, backstory, and traits throughout the conversation.

Immersion: Assesses the model's ability to create an engaging, believable experience that draws users into the roleplay scenario.

Overall Score: A weighted combination of the above metrics, representing the model's general roleplay capability.

"""

# List items are written as markdown bullets; as bare consecutive lines they
# would be rendered as a single run-on paragraph.
ABOUT_METHODOLOGY = """

Evaluation Methodology

Models were evaluated using a comprehensive roleplay benchmark suite consisting of:

- 20 diverse character archetypes
- 15 different scenarios per character
- 5 conversation turns per scenario

Responses were scored by a panel of expert evaluators using standardized rubrics for each metric.

"""

ABOUT_MINIMAID = """

MiniMaid Series Development

The MiniMaid series represents an evolution in efficient roleplay models:

- MiniMaid-L1: Initial release focusing on speed and efficiency
- MiniMaid-L2: Improved version with better immersion and consistency
- MiniMaid-L3: Latest generation with enhanced immersion capabilities

Each iteration builds upon the strengths of the previous version while addressing identified weaknesses.

"""

ABOUT_OPENELLA = """

OpenElla Development

OpenElla represents a parallel development track focused on maximizing roleplay quality in a compact model size.

Built on the Llama 3 architecture, OpenElla achieves exceptional character consistency and overall performance despite its relatively small 2B parameter size.

"""

# Create sample data based on the images provided
data = {
    "Model": [
        "DeepSeek-RL-3B",
        "Dolphin-RL-GGUF",
        "Hermes-3-GGUF",
        "MiniMaid-L1",
        "OpenElla-Llama-3-2B",
        "MiniMaid-L2",
        "MiniMaid-L3",
    ],
    "Length Score": [1.0, 1.0, 1.0, 0.9, 1.0, 1.0, 1.0],
    "Character Consistency": [1.0, 0.83, 0.83, 0.5, 0.83, 0.54, 0.54],
    "Immersion": [0.63, 0.46, 0.43, 0.13, 0.67, 0.6, 0.73],
    "Overall Score": [0.88, 0.76, 0.75, 0.51, 0.83, 0.71, 0.76],
    "Parameters (B)": [3.0, 7.0, 7.0, 1.0, 2.0, 1.5, 2.5],
    "Speed (tokens/s)": [180, 75, 70, 320, 250, 280, 220],
    "Family": ["DeepSeek", "Dolphin", "Hermes", "MiniMaid", "OpenElla", "MiniMaid", "MiniMaid"],
    "Release Date": ["2023-10", "2023-11", "2023-12", "2024-01", "2024-02", "2024-03", "2024-04"],
    "Description": [
        "General-purpose model with strong instruction following capabilities",
        "Dolphin-based model optimized for roleplay",
        "Fine-tuned Hermes model for creative tasks",
        "Lightweight model optimized for speed and efficiency",
        "Optimized for roleplay with high character consistency",
        "Improved version with better immersion capabilities",
        "Latest generation with the best immersion scores",
    ],
}

# Your models filter. These get hand-written cards and optional highlighting.
# Instead of creating a separate column, we'll use the 'Family' column for coloring
your_models = ["OpenElla-Llama-3-2B", "MiniMaid-L1", "MiniMaid-L2", "MiniMaid-L3"]


# ---------------------------------------------------------------------------
# Rendering helpers
# ---------------------------------------------------------------------------

def _family_bar_chart(frame: pd.DataFrame, metric: str, *, title: str,
                      height: int, color_by_family: bool):
    """Return a per-model bar chart of *metric* with value labels above bars.

    Bars are coloured by the "Family" column when *color_by_family* is true.
    """
    fig = px.bar(
        frame,
        x="Model",
        y=metric,
        color="Family" if color_by_family else None,
        color_discrete_map=FAMILY_COLORS,
        text_auto=".2f",
        title=title,
        height=height,
    )
    fig.update_traces(textposition="outside")
    return fig


def _render_leaderboard_tab(frame: pd.DataFrame, metrics: list, highlight: bool) -> None:
    """Draw the ranking table and the bar charts for the selected metrics.

    *frame* is the filtered and sorted table, *metrics* the metric names picked
    in the sidebar, *highlight* the state of "Highlight Your Models".
    """
    st.markdown("## 📊 Model Rankings")
    if frame.empty:
        st.info(NO_MODELS_MESSAGE)
        return

    # One background colour per row; the single inner list is the per-row
    # colour column handed to the table.
    row_colors = [
        "#e6f7ff" if model in your_models and highlight else "#f0f0f0"
        for model in frame["Model"]
    ]
    table_metrics = ["Overall Score", "Character Consistency", "Immersion", "Length Score"]
    fig = go.Figure(data=[go.Table(
        header=dict(
            values=["Rank", "Model"] + table_metrics,
            fill_color="#4e8df5",
            align="center",
            font=dict(color="white", size=16),
            height=40,
        ),
        cells=dict(
            values=[list(range(1, len(frame) + 1)), frame["Model"]]
            + [frame[metric].apply(lambda x: f"{x:.2f}") for metric in table_metrics],
            fill_color=[row_colors],
            align="center",
            font=dict(color="#333333", size=14),
            height=35,
        ),
    )])
    fig.update_layout(
        margin=dict(l=0, r=0, t=0, b=0),
        height=min(100 + len(frame) * 35, 500),
    )
    st.plotly_chart(fig, use_container_width=True)

    # Performance overview
    st.markdown("## 💯 Performance Overview")
    if "Overall Score" in metrics:
        fig = _family_bar_chart(
            frame,
            "Overall Score",
            title="Overall Roleplay Performance",
            height=400,
            color_by_family=highlight,
        )
        fig.update_layout(
            xaxis_title="",
            yaxis_title="Score",
            yaxis=dict(range=[0, 1.1]),
            plot_bgcolor="white",
            legend_title_text="",
            legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="center", x=0.5),
        )
        st.plotly_chart(fig, use_container_width=True)

    # Metrics comparison. "Overall Score" is already shown above, so columns
    # are allocated only for the remaining metrics (no empty column, and the
    # charts still appear when every metric is selected).
    extra_metrics = [metric for metric in metrics if metric != "Overall Score"]
    if not extra_metrics:
        return
    for column, metric in zip(st.columns(len(extra_metrics)), extra_metrics):
        with column:
            fig = _family_bar_chart(
                frame,
                metric,
                title=f"{metric}",
                height=350,
                color_by_family=highlight,
            )
            fig.update_layout(
                xaxis_title="",
                yaxis_title="Score",
                yaxis=dict(range=[0, 1.1]),
                plot_bgcolor="white",
                showlegend=False,
            )
            st.plotly_chart(fig, use_container_width=True)


def _render_charts_tab(frame: pd.DataFrame, highlight: bool) -> None:
    """Draw the radar, efficiency scatter and heatmap charts for *frame*."""
    st.markdown("## 📈 Performance Charts")
    if frame.empty:
        st.info(NO_MODELS_MESSAGE)
        return

    # Radar chart for model comparison
    st.markdown("### Model Comparison (Radar Chart)")
    fig = go.Figure()
    # Repeat the first axis/value at the end so each polygon is closed.
    closed_axes = SCORE_METRICS + [SCORE_METRICS[0]]
    score_rows = frame[SCORE_METRICS].to_numpy().tolist()
    for model, scores in zip(frame["Model"], score_rows):
        # Emphasise "your" models only while the sidebar checkbox is on,
        # matching the table and bar charts.
        emphasized = highlight and model in your_models
        fig.add_trace(go.Scatterpolar(
            r=scores + [scores[0]],
            theta=closed_axes,
            fill="toself",
            name=model,
            line=dict(width=3 if emphasized else 1.5),
            opacity=0.9 if emphasized else 0.6,
        ))
    fig.update_layout(
        polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
        showlegend=True,
        legend=dict(orientation="h", yanchor="bottom", y=-0.2, xanchor="center", x=0.5),
        height=600,
    )
    st.plotly_chart(fig, use_container_width=True)

    # Scatter plot: Parameters vs Performance (marker size encodes speed)
    st.markdown("### Efficiency Analysis")
    fig = px.scatter(
        frame,
        x="Parameters (B)",
        y="Overall Score",
        size="Speed (tokens/s)",
        color="Family",
        hover_name="Model",
        text="Model",
        size_max=40,
        height=500,
        color_discrete_map=FAMILY_COLORS,
    )
    fig.update_traces(
        textposition="top center",
        marker=dict(line=dict(width=2, color="DarkSlateGrey")),
    )
    fig.update_layout(
        title="Model Size vs Performance",
        xaxis_title="Parameters (Billions)",
        yaxis_title="Overall Score",
        yaxis=dict(range=[0.4, 1.0]),
        legend_title="Model Family",
        plot_bgcolor="white",
    )
    st.plotly_chart(fig, use_container_width=True)

    # Heatmap of all metrics
    st.markdown("### Metrics Heatmap")
    heatmap_df = frame.set_index("Model")[SCORE_METRICS]
    fig = px.imshow(
        heatmap_df.values,
        x=SCORE_METRICS,
        y=heatmap_df.index,
        color_continuous_scale="Blues",
        labels=dict(x="Metric", y="Model", color="Score"),
        text_auto=".2f",
        height=500,
    )
    fig.update_layout(
        xaxis_title="",
        yaxis_title="",
        coloraxis_colorbar=dict(title="Score"),
        plot_bgcolor="white",
    )
    # Only the font colour is changed here. text_auto already installed the
    # cell text template; replacing it with "%{text}" would blank the labels
    # because no `text` array is supplied to the trace.
    fig.update_traces(textfont={"color": "black"})
    st.plotly_chart(fig, use_container_width=True)


def _render_details_tab(all_models: pd.DataFrame, selected: list) -> None:
    """Show the hand-written cards, then generated cards for other models.

    *all_models* is the unfiltered table, *selected* the chosen model names.
    """
    st.markdown("## 🔍 Model Details")

    # Hand-written cards, in the fixed order of `your_models`.
    for model in your_models:
        if model in selected:
            st.markdown(MODEL_CARDS[model], unsafe_allow_html=True)

    # Remaining models get a generated card, laid out in up to three columns.
    other_models = [model for model in selected if model not in your_models]
    if not other_models:
        return
    st.markdown("### Other Models")
    cols = st.columns(min(3, len(other_models)))
    for i, model in enumerate(other_models):
        model_data = all_models[all_models["Model"] == model].iloc[0]
        with cols[i % len(cols)]:
            st.markdown(
                OTHER_MODEL_CARD.format(
                    model=model,
                    family=model_data["Family"],
                    parameters=model_data["Parameters (B)"],
                    description=model_data["Description"],
                    overall=model_data["Overall Score"],
                ),
                unsafe_allow_html=True,
            )


def _render_about_tab() -> None:
    """Show the static explanation of metrics, methodology and model lines."""
    st.markdown("## 📘 About This Leaderboard")
    for section in (ABOUT_METRICS, ABOUT_METHODOLOGY, ABOUT_MINIMAID, ABOUT_OPENELLA):
        st.markdown(section, unsafe_allow_html=True)


# ---------------------------------------------------------------------------
# Page
# ---------------------------------------------------------------------------

# Page configuration
st.set_page_config(
    page_title="AI Model Leaderboard",
    page_icon="🏆",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Custom CSS hook (the style block is currently blank).
st.markdown(""" """, unsafe_allow_html=True)

# Title and introduction
st.title("🏆 OpenElla & MiniMaid Models Leaderboard")
st.markdown(INTRO_TEXT, unsafe_allow_html=True)

df = pd.DataFrame(data)

# Sidebar
# Written with explicit escapes: a plain double-quoted literal cannot span
# physical lines.
st.sidebar.markdown("\n\nLeaderboard Controls\n\n", unsafe_allow_html=True)

# Model selection
st.sidebar.markdown("### Models to Display")
all_models = st.sidebar.checkbox("All Models", value=True)
if all_models:
    selected_models = list(df["Model"])
else:
    selected_models = st.sidebar.multiselect(
        "Select Models",
        options=list(df["Model"]),
        default=your_models,
    )

# Metric selection
st.sidebar.markdown("### Metrics to Display")
selected_metrics = st.sidebar.multiselect(
    "Select Metrics",
    options=SCORE_METRICS,
    default=["Overall Score"],
)

# Highlight your models
highlight_yours = st.sidebar.checkbox("Highlight Your Models", value=True)

# Sort options
sort_by = st.sidebar.selectbox(
    "Sort By",
    options=[
        "Overall Score",
        "Character Consistency",
        "Immersion",
        "Length Score",
        "Parameters (B)",
        "Speed (tokens/s)",
    ],
    index=0,
)
ascending = st.sidebar.checkbox("Ascending Order", value=False)

# Filter data and ensure proper sorting; the index is reset so that row
# position equals rank.
filtered_df = (
    df[df["Model"].isin(selected_models)]
    .sort_values(by=sort_by, ascending=ascending)
    .reset_index(drop=True)
)

# Create tabs
tab1, tab2, tab3, tab4 = st.tabs(
    ["📊 Leaderboard", "📈 Performance Charts", "🔍 Model Details", "📘 About"]
)

# Tab 1: Leaderboard
with tab1:
    _render_leaderboard_tab(filtered_df, selected_metrics, highlight_yours)

# Tab 2: Performance Charts
with tab2:
    _render_charts_tab(filtered_df, highlight_yours)

# Tab 3: Model Details
with tab3:
    _render_details_tab(df, selected_models)

# Tab 4: About
with tab4:
    _render_about_tab()

# Footer hook (the footer block is currently blank).
st.markdown(""" """, unsafe_allow_html=True)