AMR-KELEG committed
Commit f818d64 · 1 Parent(s): 31760f5

Add the two tabs to the Space

Files changed (1)
  1. app.py +58 -145
app.py CHANGED
@@ -1,158 +1,71 @@
 # TODO: requirments.txt
 import os
-import numpy as np
-import pandas as pd
 import streamlit as st

-import torch
 import datasets
 from tqdm import tqdm
 from transformers import AutoModelForSequenceClassification, AutoTokenizer
-from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
-
-model_name = st.text_input("Enter a model's name on HF")
-# MODEL_NAME = "AMR-KELEG/NADI2024-baseline"
-DIALECTS = [
-    "Algeria",
-    "Bahrain",
-    "Egypt",
-    "Iraq",
-    "Jordan",
-    "Kuwait",
-    "Lebanon",
-    "Libya",
-    "Morocco",
-    "Oman",
-    "Palestine",
-    "Qatar",
-    "Saudi_Arabia",
-    "Sudan",
-    "Syria",
-    "Tunisia",
-    "UAE",
-    "Yemen",
-]
-assert len(DIALECTS) == 18
-
-DIALECTS_WITH_LABELS = [
-    "Algeria",
-    "Egypt",
-    "Iraq",
-    "Jordan",
-    "Morocco",
-    "Palestine",
-    "Saudi_Arabia",
-    "Sudan",
-    "Syria",
-    "Tunisia",
-    "Yemen",
-]
-assert len(DIALECTS_WITH_LABELS) == 11
-
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForSequenceClassification.from_pretrained(model_name)
-
-
-def predict_top_p(text, P=0.9):
-    """Predict the top dialects with an accumulative confidence of at least P."""
-    assert P <= 1 and P >= 0
-
-    logits = model(**tokenizer(text, return_tensors="pt")).logits
-    probabilities = torch.softmax(logits, dim=1).flatten().tolist()
-    topk_predictions = torch.topk(logits, 18).indices.flatten().tolist()
-
-    predictions = [0 for _ in range(18)]
-    total_prob = 0
-
-    for i in range(18):
-        total_prob += probabilities[topk_predictions[i]]
-        predictions[topk_predictions[i]] = 1
-        if total_prob >= P:
-            break

-    return [
-        predictions[i]
-        for i, dialect in enumerate(DIALECTS)
-        if dialect in DIALECTS_WITH_LABELS
-    ]
-    return [DIALECTS[i] for i, p in enumerate(predictions) if p == 1]
-
-
-# Load the dataset
-dataset_name = "AMR-KELEG/test-dataset"
-dataset = datasets.load_dataset(dataset_name, token=os.environ["HF_TOKEN"])["test"]
-
-sentences_labels, sentences_predictions = [], []
+from constants import DIALECTS_WITH_LABELS
+from inspect import getmembers, isfunction
+import eval_utils
+import numpy as np
+from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

-for sample in tqdm(dataset):
-    text = sample["sentence"]
-    labels = [
-        1
-        if DIALECTS_WITH_LABELS[i] in sample.keys()
-        and int(sample[DIALECTS_WITH_LABELS[i]]) == 1
-        else 0
-        for i in range(len(DIALECTS_WITH_LABELS))
-    ]
-    pred = predict_top_p(text)
-    sentences_labels.append(labels)
-    sentences_predictions.append(pred)

-st.table(
-    data=pd.DataFrame(
-        {
-            "text": dataset["sentence"],
-            "labels": sentences_labels,
-            "predictions": sentences_predictions,
-        }
-    )
-)
+tab1, tab2 = st.tabs(["Leaderboard", "Submit a Model"])

-gold_matrix = np.array(sentences_labels)
-prediction_matrix = np.array(sentences_predictions)
+with tab1:
+    st.write("Leaderboard")

-# Compute the scores for each label (country) on its own
-accuracy_scores = [
-    accuracy_score(y_true=gold_matrix[:, i], y_pred=prediction_matrix[:, i]) * 100
-    for i in range(gold_matrix.shape[1])
-]
-precision_scores = [
-    precision_score(
-        y_true=gold_matrix[:, i],
-        y_pred=prediction_matrix[:, i],
-        average="binary",
-        pos_label=1,
-    )
-    * 100
-    for i in range(gold_matrix.shape[1])
-]
-recall_scores = [
-    recall_score(
-        y_true=gold_matrix[:, i],
-        y_pred=prediction_matrix[:, i],
-        average="binary",
-        pos_label=1,
-    )
-    * 100
-    for i in range(gold_matrix.shape[1])
-]
-f1_scores = [
-    f1_score(
-        y_true=gold_matrix[:, i],
-        y_pred=prediction_matrix[:, i],
-        average="binary",
-        pos_label=1,
+with tab2:
+    model_name = st.text_input("Enter a model's name on HF")
+    inference_function = st.selectbox(
+        "Inference Method",
+        [func_name for func_name, _ in getmembers(eval_utils, isfunction)],
     )
-    * 100
-    for i in range(gold_matrix.shape[1])
-]
-
-# Compute the averaged scores
-average_accuracy = np.mean(accuracy_scores)
-average_precision = np.mean(precision_scores)
-average_recall = np.mean(recall_scores)
-average_f1 = np.mean(f1_scores)
-
-st.write(f"Average Accuracy: {average_accuracy:.2f}%")
-st.write(f"Average Precision: {average_precision:.2f}%")
-st.write(f"Average Recall: {average_recall:.2f}%")
-st.write(f"Average F1: {average_f1:.2f}%")
+    if model_name:
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        model = AutoModelForSequenceClassification.from_pretrained(model_name)
+
+        # Load the dataset
+        dataset_name = os.environ["DATASET_NAME"]
+        dataset = datasets.load_dataset(dataset_name)["test"]
+        # dataset = datasets.load_dataset(dataset_name, token=os.environ["HF_TOKEN"])["test"]
+
+        sentences = dataset["sentence"]
+        labels = {dialect: dataset[dialect] for dialect in DIALECTS_WITH_LABELS}
+
+        # TODO: Perform the inference in batches?
+        predictions = [
+            getattr(eval_utils, inference_function)(model, tokenizer, sentence)
+            for sentence in tqdm(sentences)
+        ]
+
+        # TODO: Store the predictions in a private dataset
+
+        # Evaluate the model
+        accuracy_scores = {}
+        f1_scores = {}
+        recall_scores = {}
+        precision_scores = {}
+
+        for dialect in DIALECTS_WITH_LABELS:
+            y_true = labels[dialect]
+            y_pred = [dialect in prediction for prediction in predictions]
+            accuracy = accuracy_score(y_true, y_pred)
+            f1 = f1_score(y_true, y_pred)
+            recall = recall_score(y_true, y_pred)
+            precision = precision_score(y_true, y_pred)
+
+            accuracy_scores[dialect] = accuracy
+            f1_scores[dialect] = f1
+            recall_scores[dialect] = recall
+            precision_scores[dialect] = precision
+
+        macro_avg_accuracy = np.mean(list(accuracy_scores.values()))
+        macro_avg_f1 = np.mean(list(f1_scores.values()))
+        macro_avg_recall = np.mean(list(recall_scores.values()))
+        macro_avg_precision = np.mean(list(precision_scores.values()))
+
+        st.toast(f"Evaluation completed!")
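Two local modules referenced by the new code are not part of this diff. `constants.py` must define `DIALECTS_WITH_LABELS`; judging by the list this commit removes from `app.py`, it presumably carries over the same 11 entries. A minimal sketch:

# constants.py: assumed contents, reconstructed from the list removed above
DIALECTS_WITH_LABELS = [
    "Algeria",
    "Egypt",
    "Iraq",
    "Jordan",
    "Morocco",
    "Palestine",
    "Saudi_Arabia",
    "Sudan",
    "Syria",
    "Tunisia",
    "Yemen",
]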
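`eval_utils` is likewise absent from the diff. The app enumerates every function in that module via `getmembers(eval_utils, isfunction)` and invokes the selected one as `f(model, tokenizer, sentence)`; the later membership test `dialect in prediction` implies each function returns a list of predicted dialect names. A hypothetical sketch that adapts the removed `predict_top_p` to this signature (and drops its unreachable second `return`):

# eval_utils.py: hypothetical sketch, not part of this commit.
# Every public function here must accept (model, tokenizer, sentence) and
# return a list of dialect names, since app.py tests `dialect in prediction`.
import torch

# The 18-label order used by the classifier, taken from the DIALECTS list
# this commit removes from app.py.
DIALECTS = [
    "Algeria", "Bahrain", "Egypt", "Iraq", "Jordan", "Kuwait",
    "Lebanon", "Libya", "Morocco", "Oman", "Palestine", "Qatar",
    "Saudi_Arabia", "Sudan", "Syria", "Tunisia", "UAE", "Yemen",
]


def predict_top_p(model, tokenizer, sentence, P=0.9):
    """Return the smallest set of dialects whose cumulative probability reaches P."""
    with torch.no_grad():
        logits = model(**tokenizer(sentence, return_tensors="pt")).logits
    probabilities = torch.softmax(logits, dim=1).flatten()
    ranked = torch.argsort(probabilities, descending=True).tolist()

    # Greedily take the most probable dialects until their mass reaches P.
    predicted, total_prob = [], 0.0
    for i in ranked:
        predicted.append(DIALECTS[i])
        total_prob += probabilities[i].item()
        if total_prob >= P:
            break
    return predicted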
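Finally, the `# TODO: requirments.txt` comment at the top of the file notes that the Space still lacks a requirements file. Inferred from the imports in this commit (plus torch, which the transformers model needs at inference time), it would presumably list:

# requirements.txt: assumed, inferred from the imports above
streamlit
datasets
transformers
torch
scikit-learn
numpy
tqdm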