Spaces:

sartifyllc
/

Swahili-Text-Embeddings-Leaderboard

Running

App Files Files Community

Swahili-Text-Embeddings-Leaderboard / app.py

Mollel

Update app.py

97556d1 verified 7 months ago

raw

history blame

7.18 kB

	import streamlit as st
	import pandas as pd
	import io
	import re

	# Constants
	# Project repository URL (not referenced elsewhere in the visible code).
	GITHUB_URL = "https://github.com/Sartify/STEL"
	# Descriptive (non-benchmark) columns. display_leaderboard always shows the
	# ones present in the table and offers every other column except 'URL' as a
	# selectable benchmark.
	POSSIBLE_NON_BENCHMARK_COLS = ["Model Name", "Publisher", "Open?", "Basemodel", "Matryoshka", "Dimension", "Average"]

	def extract_table_from_markdown(markdown_text, table_start):
	    """Return the block of lines that begins at a line starting with ``table_start``.

	    The text is scanned line by line. Capturing starts at the first line
	    that starts with ``table_start`` and stops just before the next blank
	    line or line starting with ``#``. The captured lines are returned joined
	    by newlines; an empty string is returned when no line matches.
	    """
	    captured = []
	    in_table = False
	    for row in markdown_text.split('\n'):
	        opens_table = row.startswith(table_start)
	        # A heading or blank line closes the table, unless this very line
	        # is itself a table-start line.
	        if in_table and not opens_table and (row.strip() == '' or row.startswith('#')):
	            break
	        if opens_table:
	            in_table = True
	        if in_table:
	            captured.append(row)
	    return '\n'.join(captured)

	def parse_markdown_link(text):
	    """Parse a Markdown link and return the display text and URL.

	    Args:
	        text: A string that may start with a ``[label](url)`` link.

	    Returns:
	        ``(label, url)`` when ``text`` starts with a Markdown link,
	        otherwise ``(text, None)`` with the input returned unchanged.
	    """
	    # Non-greedy ``.*?`` groups capture the whole label and URL. The
	    # previous ``.?`` matched at most one character, so real links never
	    # matched and the URL was always None.
	    match = re.match(r'\[(.*?)\]\((.*?)\)', text)
	    if match is None:
	        return text, None
	    return match.group(1), match.group(2)

	def _split_markdown_row(line):
	    """Split one markdown table line into stripped cells, keeping empty cells."""
	    stripped = line.strip()
	    if not stripped:
	        return []
	    # Drop the optional outer pipes so they do not produce phantom cells.
	    if stripped.startswith('|'):
	        stripped = stripped[1:]
	    if stripped.endswith('|'):
	        stripped = stripped[:-1]
	    return [cell.strip() for cell in stripped.split('|')]


	def markdown_table_to_df(table_content):
	    """Convert a markdown table to a pandas DataFrame.

	    The first line supplies the column headers and the second line (the
	    header separator) is skipped. The first cell of each data row is parsed
	    as a Markdown link: its label replaces the cell and its target is stored
	    in an extra ``URL`` column (``None`` when the cell is not a link).

	    Args:
	        table_content: The table as newline-separated markdown lines.

	    Returns:
	        A DataFrame with the table headers plus ``URL``. Columns other than
	        the known text columns are converted with ``pd.to_numeric``;
	        unparseable or empty values become NaN.
	    """
	    lines = table_content.split('\n')

	    # Split on a plain '|'. The previous '\|' was a literal backslash-pipe
	    # and never matched, so nothing was split.
	    headers = _split_markdown_row(lines[0])

	    data = []
	    for line in lines[2:]:  # Skip the header separator line
	        row = _split_markdown_row(line)
	        # Empty cells are kept so that rows with missing entries still line
	        # up with the headers instead of being dropped; only blank rows and
	        # rows with the wrong number of columns are skipped.
	        if not any(row) or len(row) != len(headers):
	            continue
	        model_name, url = parse_markdown_link(row[0])
	        row[0] = model_name
	        data.append(row + [url])  # Add URL as a new column

	    df = pd.DataFrame(data, columns=headers + ['URL'])

	    # Everything that is not a known text column is treated as numeric.
	    text_columns = {"Model Name", "Publisher", "Open?", "Basemodel", "Matryoshka", "URL"}
	    for col in df.columns:
	        if col not in text_columns:
	            df[col] = pd.to_numeric(df[col], errors='coerce')

	    return df

	def setup_page():
	    """Configure the Streamlit page, then render the title and the logo image."""
	    page_options = {
	        "page_title": "Swahili Text Embeddings Leaderboard",
	        "page_icon": "⚡",
	        "layout": "wide",
	    }
	    # NOTE(review): "username/repo" looks like an unfilled placeholder in the
	    # image URL -- confirm the real repository path.
	    logo_url = "https://raw.githubusercontent.com/username/repo/main/files/STEL.jpg"

	    st.set_page_config(**page_options)
	    st.title("⚡ Swahili Text Embeddings Leaderboard (STEL)")
	    st.image(logo_url, width=300)

	def display_leaderboard(df):
	    """Display the leaderboard table with a benchmark selector and CSV download.

	    Args:
	        df: Leaderboard DataFrame. Columns listed in
	            POSSIBLE_NON_BENCHMARK_COLS are always shown when present; every
	            other column except ``URL`` is offered as a selectable benchmark.
	            When a ``URL`` column exists, non-null values turn the matching
	            ``Model Name`` cell into a link.
	    """
	    import html  # local import: only needed for escaping the rendered cells

	    st.header("📊 Leaderboard")

	    # Determine which non-benchmark columns are present
	    present_non_benchmark_cols = [col for col in POSSIBLE_NON_BENCHMARK_COLS if col in df.columns]

	    # Add filters
	    columns_to_filter = [col for col in df.columns if col not in present_non_benchmark_cols and col != 'URL']
	    selected_columns = st.multiselect("Select benchmarks to display:", columns_to_filter, default=columns_to_filter)

	    # Filter dataframe (the URL column is deliberately not displayed)
	    df_display = df[present_non_benchmark_cols + selected_columns]

	    # Create a copy of the dataframe for display
	    df_display_with_links = df_display.copy()

	    # Create clickable links in the Model Name column. URLs are read from
	    # the full frame: df_display has no 'URL' column, so looking it up on
	    # the display rows raised KeyError. Column selection keeps row order,
	    # so the two sequences stay aligned.
	    if 'Model Name' in df_display_with_links.columns:
	        urls = df['URL'] if 'URL' in df.columns else [None] * len(df)
	        linked_names = []
	        for name, url in zip(df_display['Model Name'], urls):
	            # The table is rendered with escape=False, so escape the pieces
	            # interpolated into the markup.
	            safe_name = html.escape(str(name))
	            if pd.notnull(url):
	                safe_url = html.escape(str(url), quote=True)
	                linked_names.append(f'<a href="{safe_url}" target="_blank">{safe_name}</a>')
	            else:
	                linked_names.append(safe_name)
	        df_display_with_links['Model Name'] = linked_names

	    # Display dataframe with clickable links
	    st.write(df_display_with_links.to_html(escape=False, index=False), unsafe_allow_html=True)

	    # Download buttons (plain names, no HTML markup)
	    csv = df_display.to_csv(index=False)
	    st.download_button(label="Download as CSV", data=csv, file_name="leaderboard.csv", mime="text/csv")


	def display_evaluation():
	    """Display the evaluation section.

	    Renders a header and a markdown guide. The install commands are shown in
	    a shell block, separate from the Python script, and the script imports
	    ``torch`` because it calls ``torch.device``.
	    """
	    st.header("🧪 Evaluation")
	    st.markdown("""
	To evaluate a model on the Swahili Embeddings Text Benchmark, first install the dependencies:
	```bash
	pip install mteb
	pip install sentence-transformers
	```
	Then you can use the following Python script:
	```python
	import mteb
	import torch
	from sentence_transformers import SentenceTransformer

	models = ["sartifyllc/MultiLinguSwahili-bert-base-sw-cased-nli-matryoshka"]

	for model_name in models:
	    truncate_dim = 768
	    language = "swa"

	    device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
	    model = SentenceTransformer(model_name, device=device, trust_remote_code=True)

	    tasks = [
	        mteb.get_task("AfriSentiClassification", languages=["swa"]),
	        mteb.get_task("AfriSentiLangClassification", languages=["swa"]),
	        mteb.get_task("MasakhaNEWSClassification", languages=["swa"]),
	        mteb.get_task("MassiveIntentClassification", languages=["swa"]),
	        mteb.get_task("MassiveScenarioClassification", languages=["swa"]),
	        mteb.get_task("SwahiliNewsClassification", languages=["swa"]),
	    ]

	    evaluation = mteb.MTEB(tasks=tasks)
	    results = evaluation.run(model, output_folder=f"{model_name}")

	    tasks = mteb.get_tasks(task_types=["PairClassification", "Reranking", "BitextMining", "Clustering", "Retrieval"], languages=["swa"])

	    evaluation = mteb.MTEB(tasks=tasks)
	    results = evaluation.run(model, output_folder=f"{model_name}")
	```
	""")

	def display_contribution():
	    """Render the "How to Contribute" header followed by the contribution guide."""
	    section_title = "🤝 How to Contribute"
	    guide = """
	We welcome and appreciate all contributions! You can help by:

	### Table Work

	- Filling in missing entries.
	- New models are added as new rows to the leaderboard (maintaining descending order).
	- Add new benchmarks as new columns in the leaderboard and include them in the benchmarks table (maintaining descending order).

	### Code Work

	- Improving the existing code.
	- Requesting and implementing new features.
	"""
	    st.header(section_title)
	    st.markdown(guide)

	def display_sponsorship():
	    """Render the "Sponsorship" header followed by the sponsorship appeal text."""
	    section_title = "🤝 Sponsorship"
	    appeal = """
	This benchmark is Swahili-based, and we need support translating and curating more tasks into Swahili.
	Sponsorships are welcome to help advance this endeavour. Your sponsorship will facilitate essential
	translation efforts, bridge language barriers, and make the benchmark accessible to a broader audience.
	We are grateful for the dedication shown by our collaborators and aim to extend this impact further
	with the support of sponsors committed to advancing language technologies.
	"""
	    st.header(section_title)
	    st.markdown(appeal)

	def main():
	    """Build the page: read README.md, render the leaderboard, then the static sections.

	    Raises:
	        OSError: If README.md cannot be opened from the working directory.
	    """
	    setup_page()

	    # Read README content. The encoding is explicit so that decoding does
	    # not depend on the platform default.
	    with open("README.md", "r", encoding="utf-8") as f:
	        readme_content = f.read()

	    # Extract and process leaderboard table. The marker is a plain pipe: the
	    # previous "\| Model Name" contained a literal backslash and could never
	    # match a table line.
	    leaderboard_table = extract_table_from_markdown(readme_content, "| Model Name")
	    df_leaderboard = markdown_table_to_df(leaderboard_table)

	    display_leaderboard(df_leaderboard)
	    display_evaluation()
	    display_contribution()
	    display_sponsorship()

	    st.markdown("---")
	    st.markdown("Thank you for being part of this effort to advance Swahili language technologies!")

	# Script entry point: build the page when this file is executed as the main module.
	if __name__ == "__main__":
	    main()