Spaces:

sartifyllc
/

Swahili-Text-Embeddings-Leaderboard

Running

File size: 3,364 Bytes

bda7c4e
 
6b2b26c
 
 
00b7e99
bda7c4e
 
00b7e99
bda7c4e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
00b7e99
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6b2b26c
 
bda7c4e
 
 
 
00b7e99
6b2b26c
bda7c4e
 
 
6b2b26c
1c32a9e
 
 
bda7c4e
1c32a9e
bda7c4e
 
 
1c32a9e
bda7c4e
 
00b7e99
bda7c4e
 
 
 
 
00b7e99
6b2b26c
 
bda7c4e
 
 
6b2b26c
bda7c4e
 
 
 
 
 
 
 
 
 
 
 
 
6b2b26c
 
00b7e99

import streamlit as st
import pandas as pd
import io
import re


# Constants
# Presumably the project's source repository; not referenced by the code
# visible in this part of the file — TODO confirm it is still needed.
GITHUB_URL = "https://github.com/Sartify/STEL"
# Column names that display_leaderboard treats as model metadata rather than
# benchmark scores: whichever of these exist in the table are always shown,
# and every other column is offered as a selectable benchmark.
POSSIBLE_NON_BENCHMARK_COLS = ["Model Name", "Publisher", "Open?", "Basemodel", "Matryoshka", "Dimension", "Average"]

def extract_table_from_markdown(markdown_text, table_start):
    """Return the first markdown table whose header line begins with ``table_start``.

    Scanning starts at the first line that starts with ``table_start`` and
    collects lines until a blank (whitespace-only) line or the end of the
    text. The collected lines are returned joined by newlines; an empty
    string is returned when no line matches.
    """
    collected = []
    inside_table = False
    for row in markdown_text.split('\n'):
        if not inside_table:
            # Still looking for the header line; ignore everything before it.
            if not row.startswith(table_start):
                continue
            inside_table = True
        # A blank line terminates the table.
        if not row.strip():
            break
        collected.append(row)
    return '\n'.join(collected)

def _split_markdown_row(line):
    """Split one markdown table row into stripped cells, keeping empty cells in place."""
    text = line.strip()
    # Remove only the outer pipes; interior empty cells must keep their
    # position so values stay under the right header.
    if text.startswith('|'):
        text = text[1:]
    if text.endswith('|'):
        text = text[:-1]
    return [cell.strip() for cell in text.split('|')]


def markdown_table_to_df(table_content):
    """Convert a markdown table to a pandas DataFrame.

    The first line is the header row, the second line (the ``---`` separator)
    is skipped, and every following non-empty line is a data row.

    Cells are matched to headers by position: an empty cell becomes a missing
    value instead of shifting its neighbours left, rows with too few cells are
    padded with missing values, and surplus cells are ignored.

    Columns whose values are all numeric (or missing) are converted to float;
    all other columns keep their string values.

    Args:
        table_content: The table as a newline-separated string.

    Returns:
        A DataFrame with one column per header cell. An empty DataFrame is
        returned when ``table_content`` is blank.
    """
    if not table_content.strip():
        return pd.DataFrame()

    lines = table_content.split('\n')
    headers = _split_markdown_row(lines[0])
    width = len(headers)

    data = []
    for line in lines[2:]:  # Skip the header separator line
        cells = _split_markdown_row(line)
        if not any(cells):
            continue  # Blank line or a row made only of empty cells
        # Normalise the row to exactly one value per header.
        cells = cells[:width] + [None] * (width - len(cells))
        data.append([cell or None for cell in cells])

    df = pd.DataFrame(data, columns=headers)

    # Try every column rather than checking for object dtype, so the
    # conversion still happens when pandas infers a dedicated string dtype.
    for col in df.columns:
        try:
            df[col] = pd.to_numeric(df[col]).astype(float)
        except (ValueError, TypeError):
            pass  # Keep as string if conversion fails

    return df

def setup_page():
    """Configure the Streamlit page, then render the title and the logo image."""
    page_options = {
        "page_title": "Swahili Text Embeddings Leaderboard",
        "page_icon": "⚡",
        "layout": "wide",
    }
    # Page configuration is issued before the other Streamlit calls here.
    st.set_page_config(**page_options)
    st.title("⚡ Swahili Text Embeddings Leaderboard (STEL)")
    # NOTE(review): the "username/repo" path looks like an unfilled
    # placeholder — confirm the real location of STEL.jpg.
    logo_url = "https://raw.githubusercontent.com/username/repo/main/STEL.jpg"
    st.image(logo_url, width=300)

def display_leaderboard(df):
    """Display the leaderboard table with a benchmark filter and a CSV download.

    Columns listed in POSSIBLE_NON_BENCHMARK_COLS that exist in ``df`` are
    always shown; every other column is treated as a benchmark the user can
    toggle through a multiselect (all selected by default).

    Args:
        df: Leaderboard DataFrame, one row per model.
    """
    st.header("📊 Leaderboard")

    # Determine which non-benchmark columns are present
    present_non_benchmark_cols = [col for col in POSSIBLE_NON_BENCHMARK_COLS if col in df.columns]

    # Add filters
    columns_to_filter = [col for col in df.columns if col not in present_non_benchmark_cols]
    selected_columns = st.multiselect("Select benchmarks to display:", columns_to_filter, default=columns_to_filter)

    # Filter dataframe
    df_display = df[present_non_benchmark_cols + selected_columns]

    # Apply the float format only to numeric benchmark columns: a column that
    # stayed as strings (e.g. it contains "-" or "N/A") would otherwise make
    # "{:.4f}" raise ValueError while the table is rendered.
    numeric_cols = set(df_display.select_dtypes(include="number").columns)
    numeric_selected = [col for col in selected_columns if col in numeric_cols]

    # Display dataframe
    st.dataframe(df_display.style.format("{:.4f}", subset=numeric_selected))

    # Download button
    csv = df_display.to_csv(index=False)
    st.download_button(label="Download as CSV", data=csv, file_name="leaderboard.csv", mime="text/csv")

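# NOTE: display_evaluation(), display_contribution() and display_sponsorship()
# are called from main() but their definitions are not present in this part of
# the file ("rest of the code remains the same"); they must be defined before
# main() runs.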

def main():
    """Build the page: leaderboard from README.md, then the remaining sections.

    Reads ``README.md`` from the current working directory, extracts the table
    whose header line starts with ``| Model Name``, converts it to a DataFrame
    and renders it, followed by the evaluation, contribution and sponsorship
    sections and a closing note.
    """
    setup_page()

    # Read README content. The encoding is explicit so non-ASCII text in the
    # README decodes the same way regardless of the platform's default locale.
    with open("README.md", "r", encoding="utf-8") as f:
        readme_content = f.read()

    # Extract and process leaderboard table
    leaderboard_table = extract_table_from_markdown(readme_content, "| Model Name")
    df_leaderboard = markdown_table_to_df(leaderboard_table)

    display_leaderboard(df_leaderboard)
    display_evaluation()
    display_contribution()
    display_sponsorship()

    st.markdown("---")
    st.markdown("Thank you for being part of this effort to advance Swahili language technologies!")

# Build the page only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()