Mollel committed on
Commit
f5b2e86
1 Parent(s): 4a04f21

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +71 -31
app.py CHANGED
@@ -3,7 +3,6 @@ import pandas as pd
3
  import io
4
  import re
5
 
6
-
7
  # Constants
8
  GITHUB_URL = "https://github.com/Sartify/STEL"
9
  POSSIBLE_NON_BENCHMARK_COLS = ["Model Name", "Publisher", "Open?", "Basemodel", "Matryoshka", "Dimension", "Average"]
@@ -16,8 +15,8 @@ def extract_table_from_markdown(markdown_text, table_start):
16
  for line in lines:
17
  if line.startswith(table_start):
18
  capture = True
19
- if capture and line.strip() == '':
20
- break
21
  if capture:
22
  table_content.append(line)
23
  return '\n'.join(table_content)
@@ -34,7 +33,7 @@ def markdown_table_to_df(table_content):
34
  data = []
35
  for line in lines[2:]: # Skip the header separator line
36
  row = [cell.strip() for cell in line.split('|') if cell.strip()]
37
- if row:
38
  data.append(row)
39
 
40
  # Create DataFrame
@@ -42,11 +41,8 @@ def markdown_table_to_df(table_content):
42
 
43
  # Convert numeric columns to float
44
  for col in df.columns:
45
- if df[col].dtype == object:
46
- try:
47
- df[col] = df[col].astype(float)
48
- except ValueError:
49
- pass # Keep as string if conversion fails
50
 
51
  return df
52
 
@@ -56,27 +52,6 @@ def setup_page():
56
  st.title("⚡ Swahili Text Embeddings Leaderboard (STEL)")
57
  st.image("https://raw.githubusercontent.com/username/repo/main/files/STEL.jpg", width=300)
58
 
59
- # def display_leaderboard(df):
60
- # """Display the leaderboard."""
61
- # st.header("📊 Leaderboard")
62
-
63
- # # Determine which non-benchmark columns are present
64
- # present_non_benchmark_cols = [col for col in POSSIBLE_NON_BENCHMARK_COLS if col in df.columns]
65
-
66
- # # Add filters
67
- # columns_to_filter = [col for col in df.columns if col not in present_non_benchmark_cols]
68
- # selected_columns = st.multiselect("Select benchmarks to display:", columns_to_filter, default=columns_to_filter)
69
-
70
- # # Filter dataframe
71
- # df_display = df[present_non_benchmark_cols + selected_columns]
72
-
73
- # # Display dataframe
74
- # st.dataframe(df_display.style.format("{:.4f}", subset=selected_columns))
75
-
76
- # # Download buttons
77
- # csv = df_display.to_csv(index=False)
78
- # st.download_button(label="Download as CSV", data=csv, file_name="leaderboard.csv", mime="text/csv")
79
-
80
  def display_leaderboard(df):
81
  """Display the leaderboard."""
82
  st.header("📊 Leaderboard")
@@ -98,8 +73,73 @@ def display_leaderboard(df):
98
  csv = df_display.to_csv(index=False)
99
  st.download_button(label="Download as CSV", data=csv, file_name="leaderboard.csv", mime="text/csv")
100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
 
102
- # ... (rest of the code remains the same)
 
 
 
 
 
 
 
 
 
103
 
104
  def main():
105
  setup_page()
 
3
  import io
4
  import re
5
 
 
6
  # Constants
7
  GITHUB_URL = "https://github.com/Sartify/STEL"
8
  POSSIBLE_NON_BENCHMARK_COLS = ["Model Name", "Publisher", "Open?", "Basemodel", "Matryoshka", "Dimension", "Average"]
 
15
  for line in lines:
16
  if line.startswith(table_start):
17
  capture = True
18
+ elif capture and (line.startswith('#') or line.strip() == ''):
19
+ break # Stop capturing when we reach a new section or an empty line
20
  if capture:
21
  table_content.append(line)
22
  return '\n'.join(table_content)
 
33
  data = []
34
  for line in lines[2:]: # Skip the header separator line
35
  row = [cell.strip() for cell in line.split('|') if cell.strip()]
36
+ if row and len(row) == len(headers): # Ensure row has the correct number of columns
37
  data.append(row)
38
 
39
  # Create DataFrame
 
41
 
42
  # Convert numeric columns to float
43
  for col in df.columns:
44
+ if col not in ["Model Name", "Publisher", "Open?", "Basemodel", "Matryoshka"]:
45
+ df[col] = pd.to_numeric(df[col], errors='coerce')
 
 
 
46
 
47
  return df
48
 
 
52
  st.title("⚡ Swahili Text Embeddings Leaderboard (STEL)")
53
  st.image("https://raw.githubusercontent.com/username/repo/main/files/STEL.jpg", width=300)
54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  def display_leaderboard(df):
56
  """Display the leaderboard."""
57
  st.header("📊 Leaderboard")
 
73
  csv = df_display.to_csv(index=False)
74
  st.download_button(label="Download as CSV", data=csv, file_name="leaderboard.csv", mime="text/csv")
75
 
76
+ def display_evaluation():
77
+ """Display the evaluation section."""
78
+ st.header("🧪 Evaluation")
79
+ st.markdown("""
80
+ To evaluate a model on the Swahili Embeddings Text Benchmark, you can use the following Python script:
81
+ ```python
82
+ pip install mteb
83
+ pip install sentence-transformers
84
+ import mteb
85
+ from sentence_transformers import SentenceTransformer
86
+
87
+ models = ["sartifyllc/MultiLinguSwahili-bert-base-sw-cased-nli-matryoshka"]
88
+
89
+ for model_name in models:
90
+ truncate_dim = 768
91
+ language = "swa"
92
+
93
+ device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
94
+ model = SentenceTransformer(model_name, device=device, trust_remote_code=True)
95
+
96
+ tasks = [
97
+ mteb.get_task("AfriSentiClassification", languages=["swa"]),
98
+ mteb.get_task("AfriSentiLangClassification", languages=["swa"]),
99
+ mteb.get_task("MasakhaNEWSClassification", languages=["swa"]),
100
+ mteb.get_task("MassiveIntentClassification", languages=["swa"]),
101
+ mteb.get_task("MassiveScenarioClassification", languages=["swa"]),
102
+ mteb.get_task("SwahiliNewsClassification", languages=["swa"]),
103
+ ]
104
+
105
+ evaluation = mteb.MTEB(tasks=tasks)
106
+ results = evaluation.run(model, output_folder=f"{model_name}")
107
+
108
+ tasks = mteb.get_tasks(task_types=["PairClassification", "Reranking", "BitextMining", "Clustering", "Retrieval"], languages=["swa"])
109
+
110
+ evaluation = mteb.MTEB(tasks=tasks)
111
+ results = evaluation.run(model, output_folder=f"{model_name}")
112
+ ```
113
+ """)
114
+
115
+ def display_contribution():
116
+ """Display the contribution section."""
117
+ st.header("🤝 How to Contribute")
118
+ st.markdown("""
119
+ We welcome and appreciate all contributions! You can help by:
120
+
121
+ ### Table Work
122
+
123
+ - Filling in missing entries.
124
+ - New models are added as new rows to the leaderboard (maintaining descending order).
125
+ - Add new benchmarks as new columns in the leaderboard and include them in the benchmarks table (maintaining descending order).
126
+
127
+ ### Code Work
128
+
129
+ - Improving the existing code.
130
+ - Requesting and implementing new features.
131
+ """)
132
 
133
+ def display_sponsorship():
134
+ """Display the sponsorship section."""
135
+ st.header("🤝 Sponsorship")
136
+ st.markdown("""
137
+ This benchmark is Swahili-based, and we need support translating and curating more tasks into Swahili.
138
+ Sponsorships are welcome to help advance this endeavour. Your sponsorship will facilitate essential
139
+ translation efforts, bridge language barriers, and make the benchmark accessible to a broader audience.
140
+ We are grateful for the dedication shown by our collaborators and aim to extend this impact further
141
+ with the support of sponsors committed to advancing language technologies.
142
+ """)
143
 
144
  def main():
145
  setup_page()