File size: 3,364 Bytes
bda7c4e
 
6b2b26c
 
 
00b7e99
bda7c4e
 
00b7e99
bda7c4e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
00b7e99
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6b2b26c
 
bda7c4e
 
 
 
00b7e99
6b2b26c
bda7c4e
 
 
6b2b26c
1c32a9e
 
 
bda7c4e
1c32a9e
bda7c4e
 
 
1c32a9e
bda7c4e
 
00b7e99
bda7c4e
 
 
 
 
00b7e99
6b2b26c
 
bda7c4e
 
 
6b2b26c
bda7c4e
 
 
 
 
 
 
 
 
 
 
 
 
6b2b26c
 
00b7e99
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import streamlit as st
import pandas as pd
import io
import re


# Module-level configuration

# Project repository URL.
GITHUB_URL = "https://github.com/Sartify/STEL"

# Column names that, when present in the leaderboard table, are always shown
# and are not offered as selectable benchmarks.
POSSIBLE_NON_BENCHMARK_COLS = [
    "Model Name",
    "Publisher",
    "Open?",
    "Basemodel",
    "Matryoshka",
    "Dimension",
    "Average",
]

def extract_table_from_markdown(markdown_text, table_start):
    """Return the first table in *markdown_text* that begins with *table_start*.

    The table runs from the first line starting with ``table_start`` up to
    (but not including) the next blank or whitespace-only line. An empty
    string is returned when no line starts with ``table_start``.
    """
    rows = markdown_text.split('\n')

    # Locate the first line that opens the table.
    first = next(
        (idx for idx, row in enumerate(rows) if row.startswith(table_start)),
        None,
    )
    if first is None:
        return ''

    # Collect lines until the first blank one terminates the table.
    collected = []
    for row in rows[first:]:
        if not row.strip():
            break
        collected.append(row)
    return '\n'.join(collected)

def _split_markdown_row(line):
    """Split one markdown table row into stripped cells, keeping empty cells."""
    stripped = line.strip()
    if not stripped:
        return []
    # Remove only the outer border pipes so that interior empty cells keep
    # their position instead of shifting later values to the left.
    if stripped.startswith('|'):
        stripped = stripped[1:]
    if stripped.endswith('|'):
        stripped = stripped[:-1]
    return [cell.strip() for cell in stripped.split('|')]


def markdown_table_to_df(table_content):
    """Convert a markdown table to a pandas DataFrame.

    The first line is the header, the second line (the ``---`` separator) is
    skipped, and every following non-empty line is a data row. Empty cells
    become missing values in their own column. Columns whose values all
    parse as numbers are converted to float; other columns stay as text.

    Args:
        table_content: The markdown table as a single newline-joined string.

    Returns:
        A DataFrame with one column per header cell.

    Raises:
        ValueError: If a data row has more cells than the header.
    """
    lines = table_content.split('\n')
    headers = _split_markdown_row(lines[0])

    data = []
    for line in lines[2:]:  # lines[1] is the header separator
        cells = _split_markdown_row(line)
        if not any(cells):
            continue  # blank line or a row made only of empty cells
        if len(cells) > len(headers):
            raise ValueError(
                f"Table row has {len(cells)} cells but the header has "
                f"{len(headers)}: {line!r}"
            )
        # Pad short rows so every row matches the header width.
        cells += [''] * (len(headers) - len(cells))
        # Represent empty cells as missing values rather than empty strings.
        data.append([cell or None for cell in cells])

    df = pd.DataFrame(data, columns=headers)

    # Convert columns that are fully numeric to float; leave the rest as text.
    for col in df.columns:
        try:
            df[col] = pd.to_numeric(df[col]).astype(float)
        except (ValueError, TypeError):
            continue  # not a numeric column; keep the original strings

    return df

def setup_page():
    """Configure the Streamlit page, then render the title and logo image."""
    page_options = {
        "page_title": "Swahili Text Embeddings Leaderboard",
        "page_icon": "⚡",
        "layout": "wide",
    }
    st.set_page_config(**page_options)

    st.title("⚡ Swahili Text Embeddings Leaderboard (STEL)")

    # NOTE(review): the "username/repo" path segment looks like a placeholder;
    # confirm the intended image location.
    logo_url = "https://raw.githubusercontent.com/username/repo/main/STEL.jpg"
    st.image(logo_url, width=300)

def display_leaderboard(df):
    """Render the leaderboard table, a benchmark selector and a CSV download.

    Columns named in ``POSSIBLE_NON_BENCHMARK_COLS`` that exist in *df* are
    always shown. Every other column is offered in a multiselect (all
    selected by default) and shown only when selected.

    Args:
        df: The leaderboard DataFrame.
    """
    st.header("📊 Leaderboard")

    # Determine which non-benchmark columns are present
    present_non_benchmark_cols = [col for col in POSSIBLE_NON_BENCHMARK_COLS if col in df.columns]

    # Everything else is treated as a benchmark the user can toggle
    columns_to_filter = [col for col in df.columns if col not in present_non_benchmark_cols]
    selected_columns = st.multiselect("Select benchmarks to display:", columns_to_filter, default=columns_to_filter)

    # Filter dataframe
    df_display = df[present_non_benchmark_cols + selected_columns]

    # Apply the float format only to numeric columns: "{:.4f}" raises
    # ValueError on string values, which would break rendering for any
    # selected column that could not be parsed as numbers.
    numeric_columns = [
        col for col in selected_columns
        if pd.api.types.is_numeric_dtype(df_display[col])
    ]
    st.dataframe(df_display.style.format("{:.4f}", subset=numeric_columns))

    # Download button
    csv = df_display.to_csv(index=False)
    st.download_button(label="Download as CSV", data=csv, file_name="leaderboard.csv", mime="text/csv")

# ... (rest of the code remains the same)

def main():
    """Build the page: read the README, parse the leaderboard, render sections."""
    setup_page()

    # Read README content. The encoding is explicit so the file is decoded
    # the same way regardless of the platform's default locale encoding.
    with open("README.md", "r", encoding="utf-8") as f:
        readme_content = f.read()

    # Extract and process leaderboard table
    leaderboard_table = extract_table_from_markdown(readme_content, "| Model Name")
    df_leaderboard = markdown_table_to_df(leaderboard_table)

    display_leaderboard(df_leaderboard)
    display_evaluation()
    display_contribution()
    display_sponsorship()

    st.markdown("---")
    st.markdown("Thank you for being part of this effort to advance Swahili language technologies!")

# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()