Spaces:

sartifyllc
/

Swahili-Text-Embeddings-Leaderboard

Running

App Files Community

Mollel committed on Jul 13, 2024

Commit

f5b2e86

verified ·

1 Parent(s): 4a04f21

Update app.py

Browse files

Files changed (1) hide show

app.py +71 -31

app.py CHANGED Viewed

@@ -3,7 +3,6 @@ import pandas as pd
 import io
 import re
 # Constants
 GITHUB_URL = "https://github.com/Sartify/STEL"
 POSSIBLE_NON_BENCHMARK_COLS = ["Model Name", "Publisher", "Open?", "Basemodel", "Matryoshka", "Dimension", "Average"]
@@ -16,8 +15,8 @@ def extract_table_from_markdown(markdown_text, table_start):
     for line in lines:
         if line.startswith(table_start):
             capture = True
-        if capture and line.strip() == '':
-            break
         if capture:
             table_content.append(line)
     return '\n'.join(table_content)
@@ -34,7 +33,7 @@ def markdown_table_to_df(table_content):
     data = []
     for line in lines[2:]:  # Skip the header separator line
         row = [cell.strip() for cell in line.split('|') if cell.strip()]
-        if row:
             data.append(row)
     # Create DataFrame
@@ -42,11 +41,8 @@ def markdown_table_to_df(table_content):
     # Convert numeric columns to float
     for col in df.columns:
-        if df[col].dtype == object:
-            try:
-                df[col] = df[col].astype(float)
-            except ValueError:
-                pass  # Keep as string if conversion fails
     return df
@@ -56,27 +52,6 @@ def setup_page():
     st.title("⚡ Swahili Text Embeddings Leaderboard (STEL)")
     st.image("https://raw.githubusercontent.com/username/repo/main/files/STEL.jpg", width=300)
-# def display_leaderboard(df):
-#     """Display the leaderboard."""
-#     st.header("📊 Leaderboard")
-#     # Determine which non-benchmark columns are present
-#     present_non_benchmark_cols = [col for col in POSSIBLE_NON_BENCHMARK_COLS if col in df.columns]
-#     # Add filters
-#     columns_to_filter = [col for col in df.columns if col not in present_non_benchmark_cols]
-#     selected_columns = st.multiselect("Select benchmarks to display:", columns_to_filter, default=columns_to_filter)
-#     # Filter dataframe
-#     df_display = df[present_non_benchmark_cols + selected_columns]
-#     # Display dataframe
-#     st.dataframe(df_display.style.format("{:.4f}", subset=selected_columns))
-#     # Download buttons
-#     csv = df_display.to_csv(index=False)
-#     st.download_button(label="Download as CSV", data=csv, file_name="leaderboard.csv", mime="text/csv")
 def display_leaderboard(df):
     """Display the leaderboard."""
     st.header("📊 Leaderboard")
@@ -98,8 +73,73 @@ def display_leaderboard(df):
     csv = df_display.to_csv(index=False)
     st.download_button(label="Download as CSV", data=csv, file_name="leaderboard.csv", mime="text/csv")
-# ... (rest of the code remains the same)
 def main():
     setup_page()

 import io
 import re
 # Constants
 GITHUB_URL = "https://github.com/Sartify/STEL"
 POSSIBLE_NON_BENCHMARK_COLS = ["Model Name", "Publisher", "Open?", "Basemodel", "Matryoshka", "Dimension", "Average"]
     for line in lines:
         if line.startswith(table_start):
             capture = True
+        elif capture and (line.startswith('#') or line.strip() == ''):
+            break  # Stop capturing when we reach a new section or an empty line
         if capture:
             table_content.append(line)
     return '\n'.join(table_content)
     data = []
     for line in lines[2:]:  # Skip the header separator line
         row = [cell.strip() for cell in line.split('|') if cell.strip()]
+        if row and len(row) == len(headers):  # Ensure row has the correct number of columns
             data.append(row)
     # Create DataFrame
     # Convert numeric columns to float
     for col in df.columns:
+        if col not in ["Model Name", "Publisher", "Open?", "Basemodel", "Matryoshka"]:
+            df[col] = pd.to_numeric(df[col], errors='coerce')
     return df
     st.title("⚡ Swahili Text Embeddings Leaderboard (STEL)")
     st.image("https://raw.githubusercontent.com/username/repo/main/files/STEL.jpg", width=300)
 def display_leaderboard(df):
     """Display the leaderboard."""
     st.header("📊 Leaderboard")
     csv = df_display.to_csv(index=False)
     st.download_button(label="Download as CSV", data=csv, file_name="leaderboard.csv", mime="text/csv")
+def display_evaluation():
+    """Display the evaluation section."""
+    st.header("🧪 Evaluation")
+    st.markdown("""
+    To evaluate a model on the Swahili Embeddings Text Benchmark, you can use the following Python script:
+    ```python
+    pip install mteb
+    pip install sentence-transformers
+    import mteb
+    from sentence_transformers import SentenceTransformer
+    models = ["sartifyllc/MultiLinguSwahili-bert-base-sw-cased-nli-matryoshka"]
+    for model_name in models:
+        truncate_dim = 768
+        language = "swa"
+        device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
+        model = SentenceTransformer(model_name, device=device, trust_remote_code=True)
+        tasks = [
+            mteb.get_task("AfriSentiClassification", languages=["swa"]),
+            mteb.get_task("AfriSentiLangClassification", languages=["swa"]),
+            mteb.get_task("MasakhaNEWSClassification", languages=["swa"]),
+            mteb.get_task("MassiveIntentClassification", languages=["swa"]),
+            mteb.get_task("MassiveScenarioClassification", languages=["swa"]),
+            mteb.get_task("SwahiliNewsClassification", languages=["swa"]),
+        ]
+        evaluation = mteb.MTEB(tasks=tasks)
+        results = evaluation.run(model, output_folder=f"{model_name}")
+        tasks = mteb.get_tasks(task_types=["PairClassification", "Reranking", "BitextMining", "Clustering", "Retrieval"], languages=["swa"])
+        evaluation = mteb.MTEB(tasks=tasks)
+        results = evaluation.run(model, output_folder=f"{model_name}")
+    ```
+    """)
+def display_contribution():
+    """Display the contribution section."""
+    st.header("🤝 How to Contribute")
+    st.markdown("""
+    We welcome and appreciate all contributions! You can help by:
+    ### Table Work
+    - Filling in missing entries.
+    - New models are added as new rows to the leaderboard (maintaining descending order).
+    - Add new benchmarks as new columns in the leaderboard and include them in the benchmarks table (maintaining descending order).
+    ### Code Work
+    - Improving the existing code.
+    - Requesting and implementing new features.
+    """)
+def display_sponsorship():
+    """Display the sponsorship section."""
+    st.header("🤝 Sponsorship")
+    st.markdown("""
+    This benchmark is Swahili-based, and we need support translating and curating more tasks into Swahili.
+    Sponsorships are welcome to help advance this endeavour. Your sponsorship will facilitate essential
+    translation efforts, bridge language barriers, and make the benchmark accessible to a broader audience.
+    We are grateful for the dedication shown by our collaborators and aim to extend this impact further
+    with the support of sponsors committed to advancing language technologies.
+    """)
 def main():
     setup_page()