AMR-KELEG committed
Commit f818d64 · 1 Parent(s): 31760f5

Add the two tabs to the Space

Files changed (1)
  1. app.py +58 -145
app.py CHANGED
@@ -1,158 +1,71 @@
 # TODO: requirments.txt
 import os
-import numpy as np
-import pandas as pd
 import streamlit as st

-import torch
 import datasets
 from tqdm import tqdm
 from transformers import AutoModelForSequenceClassification, AutoTokenizer
-from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
-
-model_name = st.text_input("Enter a model's name on HF")
-# MODEL_NAME = "AMR-KELEG/NADI2024-baseline"
-DIALECTS = [
-    "Algeria",
-    "Bahrain",
-    "Egypt",
-    "Iraq",
-    "Jordan",
-    "Kuwait",
-    "Lebanon",
-    "Libya",
-    "Morocco",
-    "Oman",
-    "Palestine",
-    "Qatar",
-    "Saudi_Arabia",
-    "Sudan",
-    "Syria",
-    "Tunisia",
-    "UAE",
-    "Yemen",
-]
-assert len(DIALECTS) == 18
-
-DIALECTS_WITH_LABELS = [
-    "Algeria",
-    "Egypt",
-    "Iraq",
-    "Jordan",
-    "Morocco",
-    "Palestine",
-    "Saudi_Arabia",
-    "Sudan",
-    "Syria",
-    "Tunisia",
-    "Yemen",
-]
-assert len(DIALECTS_WITH_LABELS) == 11
-
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForSequenceClassification.from_pretrained(model_name)
-
-
-def predict_top_p(text, P=0.9):
-    """Predict the top dialects with an accumulative confidence of at least P."""
-    assert P <= 1 and P >= 0
-
-    logits = model(**tokenizer(text, return_tensors="pt")).logits
-    probabilities = torch.softmax(logits, dim=1).flatten().tolist()
-    topk_predictions = torch.topk(logits, 18).indices.flatten().tolist()
-
-    predictions = [0 for _ in range(18)]
-    total_prob = 0
-
-    for i in range(18):
-        total_prob += probabilities[topk_predictions[i]]
-        predictions[topk_predictions[i]] = 1
-        if total_prob >= P:
-            break

-    return [
-        predictions[i]
-        for i, dialect in enumerate(DIALECTS)
-        if dialect in DIALECTS_WITH_LABELS
-    ]
-    return [DIALECTS[i] for i, p in enumerate(predictions) if p == 1]
-
-
-# Load the dataset
-dataset_name = "AMR-KELEG/test-dataset"
-dataset = datasets.load_dataset(dataset_name, token=os.environ["HF_TOKEN"])["test"]
-
-sentences_labels, sentences_predictions = [], []
+from constants import DIALECTS_WITH_LABELS
+from inspect import getmembers, isfunction
+import eval_utils
+import numpy as np
+from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

-for sample in tqdm(dataset):
-    text = sample["sentence"]
-    labels = [
-        1
-        if DIALECTS_WITH_LABELS[i] in sample.keys()
-        and int(sample[DIALECTS_WITH_LABELS[i]]) == 1
-        else 0
-        for i in range(len(DIALECTS_WITH_LABELS))
-    ]
-    pred = predict_top_p(text)
-    sentences_labels.append(labels)
-    sentences_predictions.append(pred)

-st.table(
-    data=pd.DataFrame(
-        {
-            "text": dataset["sentence"],
-            "labels": sentences_labels,
-            "predictions": sentences_predictions,
-        }
-    )
-)
+tab1, tab2 = st.tabs(["Leaderboard", "Submit a Model"])

-gold_matrix = np.array(sentences_labels)
-prediction_matrix = np.array(sentences_predictions)
+with tab1:
+    st.write("Leaderboard")

-# Compute the scores for each label (country) on its own
-accuracy_scores = [
-    accuracy_score(y_true=gold_matrix[:, i], y_pred=prediction_matrix[:, i]) * 100
-    for i in range(gold_matrix.shape[1])
-]
-precision_scores = [
-    precision_score(
-        y_true=gold_matrix[:, i],
-        y_pred=prediction_matrix[:, i],
-        average="binary",
-        pos_label=1,
-    )
-    * 100
-    for i in range(gold_matrix.shape[1])
-]
-recall_scores = [
-    recall_score(
-        y_true=gold_matrix[:, i],
-        y_pred=prediction_matrix[:, i],
-        average="binary",
-        pos_label=1,
-    )
-    * 100
-    for i in range(gold_matrix.shape[1])
-]
-f1_scores = [
-    f1_score(
-        y_true=gold_matrix[:, i],
-        y_pred=prediction_matrix[:, i],
-        average="binary",
-        pos_label=1,
+with tab2:
+    model_name = st.text_input("Enter a model's name on HF")
+    inference_function = st.selectbox(
+        "Inference Method",
+        [func_name for func_name, _ in getmembers(eval_utils, isfunction)],
     )
-    * 100
-    for i in range(gold_matrix.shape[1])
-]
-
-# Compute the averaged scores
-average_accuracy = np.mean(accuracy_scores)
-average_precision = np.mean(precision_scores)
-average_recall = np.mean(recall_scores)
-average_f1 = np.mean(f1_scores)
-
-st.write(f"Average Accuracy: {average_accuracy:.2f}%")
-st.write(f"Average Precision: {average_precision:.2f}%")
-st.write(f"Average Recall: {average_recall:.2f}%")
-st.write(f"Average F1: {average_f1:.2f}%")
+    if model_name:
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        model = AutoModelForSequenceClassification.from_pretrained(model_name)
+
+        # Load the dataset
+        dataset_name = os.environ["DATASET_NAME"]
+        dataset = datasets.load_dataset(dataset_name)["test"]
+        # dataset = datasets.load_dataset(dataset_name, token=os.environ["HF_TOKEN"])["test"]
+
+        sentences = dataset["sentence"]
+        labels = {dialect: dataset[dialect] for dialect in DIALECTS_WITH_LABELS}
+
+        # TODO: Perform the inference in batches?
+        predictions = [
+            getattr(eval_utils, inference_function)(model, tokenizer, sentence)
+            for sentence in tqdm(sentences)
+        ]
+
+        # TODO: Store the predictions in a private dataset
+
+        # Evaluate the model
+        accuracy_scores = {}
+        f1_scores = {}
+        recall_scores = {}
+        precision_scores = {}
+
+        for dialect in DIALECTS_WITH_LABELS:
+            y_true = labels[dialect]
+            y_pred = [dialect in prediction for prediction in predictions]
+            accuracy = accuracy_score(y_true, y_pred)
+            f1 = f1_score(y_true, y_pred)
+            recall = recall_score(y_true, y_pred)
+            precision = precision_score(y_true, y_pred)
+
+            accuracy_scores[dialect] = accuracy
+            f1_scores[dialect] = f1
+            recall_scores[dialect] = recall
+            precision_scores[dialect] = precision
+
+        macro_avg_accuracy = np.mean(list(accuracy_scores.values()))
+        macro_avg_f1 = np.mean(list(f1_scores.values()))
+        macro_avg_recall = np.mean(list(recall_scores.values()))
+        macro_avg_precision = np.mean(list(precision_scores.values()))
+
+        st.toast(f"Evaluation completed!")
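Two local modules referenced by the new code are not part of this diff. `constants.py` must define `DIALECTS_WITH_LABELS`; judging by the list this commit removes from `app.py`, it presumably carries over the same 11 entries. A minimal sketch:

# constants.py: assumed contents, reconstructed from the list removed above
DIALECTS_WITH_LABELS = [
    "Algeria",
    "Egypt",
    "Iraq",
    "Jordan",
    "Morocco",
    "Palestine",
    "Saudi_Arabia",
    "Sudan",
    "Syria",
    "Tunisia",
    "Yemen",
]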
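`eval_utils` is likewise absent from the diff. The app enumerates every function in that module via `getmembers(eval_utils, isfunction)` and invokes the selected one as `f(model, tokenizer, sentence)`; the later membership test `dialect in prediction` implies each function returns a list of predicted dialect names. A hypothetical sketch that adapts the removed `predict_top_p` to this signature (and drops its unreachable second `return`):

# eval_utils.py: hypothetical sketch, not part of this commit.
# Every public function here must accept (model, tokenizer, sentence) and
# return a list of dialect names, since app.py tests `dialect in prediction`.
import torch

# The 18-label order used by the classifier, taken from the DIALECTS list
# this commit removes from app.py.
DIALECTS = [
    "Algeria", "Bahrain", "Egypt", "Iraq", "Jordan", "Kuwait",
    "Lebanon", "Libya", "Morocco", "Oman", "Palestine", "Qatar",
    "Saudi_Arabia", "Sudan", "Syria", "Tunisia", "UAE", "Yemen",
]


def predict_top_p(model, tokenizer, sentence, P=0.9):
    """Return the smallest set of dialects whose cumulative probability reaches P."""
    with torch.no_grad():
        logits = model(**tokenizer(sentence, return_tensors="pt")).logits
    probabilities = torch.softmax(logits, dim=1).flatten()
    ranked = torch.argsort(probabilities, descending=True).tolist()

    # Greedily take the most probable dialects until their mass reaches P.
    predicted, total_prob = [], 0.0
    for i in ranked:
        predicted.append(DIALECTS[i])
        total_prob += probabilities[i].item()
        if total_prob >= P:
            break
    return predicted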
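Finally, the `# TODO: requirments.txt` comment at the top of the file notes that the Space still lacks a requirements file. Inferred from the imports in this commit (plus torch, which the transformers model needs at inference time), it would presumably list:

# requirements.txt: assumed, inferred from the imports above
streamlit
datasets
transformers
torch
scikit-learn
numpy
tqdm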