AMR-KELEG committed
Commit 48a308f · 1 Parent(s): 80852b8

Compute the Evaluation Metrics

Files changed (2)
  1. app.py +73 -2
  2. requirements.txt +1 -0
app.py CHANGED
@@ -1,5 +1,6 @@
  # TODO: requirements.txt
  import os
+ import numpy as np
  import pandas as pd
  import streamlit as st
@@ -7,6 +8,7 @@ import torch
  import datasets
  from tqdm import tqdm
  from transformers import AutoModelForSequenceClassification, AutoTokenizer
+ from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

  model_name = st.text_input("Enter a model's name on HF")
  # MODEL_NAME = "AMR-KELEG/NADI2024-baseline"
@@ -32,6 +34,21 @@ DIALECTS = [
  ]
  assert len(DIALECTS) == 18

+ DIALECTS_WITH_LABELS = [
+     "Algeria",
+     "Egypt",
+     "Iraq",
+     "Jordan",
+     "Morocco",
+     "Palestine",
+     "Saudi_Arabia",
+     "Sudan",
+     "Syria",
+     "Tunisia",
+     "Yemen",
+ ]
+ assert len(DIALECTS_WITH_LABELS) == 11
+
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  model = AutoModelForSequenceClassification.from_pretrained(model_name)
 
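Note on the two label sets: the model predicts over all 18 DIALECTS, while gold annotations exist only for the 11 entries of DIALECTS_WITH_LABELS. The commit's asserts check the list lengths but not that the smaller set is contained in the larger one; a one-line sanity check (ours, not in the commit) would be:

    # Not in the commit: every labelled dialect must also be one of the 18
    # predictable dialects, or the index filtering below would silently drop it.
    assert set(DIALECTS_WITH_LABELS) <= set(DIALECTS)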
@@ -53,6 +70,11 @@ def predict_top_p(text, P=0.9):
          if total_prob >= P:
              break

+     return [
+         predictions[i]
+         for i, dialect in enumerate(DIALECTS)
+         if dialect in DIALECTS_WITH_LABELS
+     ]
      return [DIALECTS[i] for i, p in enumerate(predictions) if p == 1]
 
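This hunk shows only the tail of predict_top_p, and the new early return makes the old name-based return below it unreachable dead code, presumably left in place during the refactor. For readers of the commit in isolation, here is a hedged reconstruction of the whole function, assuming a standard top-p (nucleus) scheme over the softmax of the classifier's logits; everything above the lines shown in the hunk is our guess, not part of the commit:

    import torch  # already imported at the top of app.py

    def predict_top_p_sketch(text, P=0.9):
        # Assumed setup: tokenizer, model, DIALECTS, and DIALECTS_WITH_LABELS
        # are the module-level objects defined earlier in app.py.
        inputs = tokenizer(text, return_tensors="pt")
        with torch.no_grad():
            logits = model(**inputs).logits
        probs = torch.softmax(logits, dim=-1).squeeze(0)
        predictions = [0] * len(DIALECTS)
        sorted_probs, sorted_indices = torch.sort(probs, descending=True)
        total_prob = 0.0
        # Greedily mark the most probable dialects until their cumulative
        # probability mass reaches P.
        for prob, index in zip(sorted_probs.tolist(), sorted_indices.tolist()):
            predictions[index] = 1
            total_prob += prob
            if total_prob >= P:
                break
        # As in the commit: keep only the dialects that have gold labels.
        return [
            predictions[i]
            for i, dialect in enumerate(DIALECTS)
            if dialect in DIALECTS_WITH_LABELS
        ]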
@@ -65,9 +87,8 @@ sentences_labels, sentences_predictions = [], []
  for sample in tqdm(dataset):
      text = sample["sentence"]
      labels = [
-         DIALECTS[i]
+         1 if DIALECTS[i] in sample.keys() and int(sample[DIALECTS[i]]) == 1 else 0
          for i in range(len(DIALECTS))
-         if DIALECTS[i] in sample.keys() and int(sample[DIALECTS[i]]) == 1
      ]
      pred = predict_top_p(text)
      sentences_labels.append(labels)
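The labels change is the crux of the commit: each sample now yields a fixed-length 0/1 vector aligned with DIALECTS instead of a variable-length list of dialect names, so the per-sample lists can be stacked into a matrix. Note, though, that these gold vectors span all 18 DIALECTS while the updated predict_top_p returns 11-wide vectors over DIALECTS_WITH_LABELS, so the gold columns appear to need the same filtering before the per-column scoring below. A minimal sketch of that alignment (the helper is ours, not from the commit):

    import numpy as np

    # Hypothetical helper, not in the commit: keep only the gold columns for
    # the labelled dialects, in the order predict_top_p emits them.
    LABELLED_COLUMNS = [i for i, d in enumerate(DIALECTS) if d in DIALECTS_WITH_LABELS]

    def align_gold(sentences_labels):
        # (num_samples, 18) -> (num_samples, 11)
        return np.array(sentences_labels)[:, LABELLED_COLUMNS]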
@@ -82,3 +103,53 @@ st.table(
      }
  )
)
+
+ gold_matrix = np.array(sentences_labels)
+ prediction_matrix = np.array(sentences_predictions)
+
+ # Compute the scores for each label (country) on its own
+ accuracy_scores = [
+     accuracy_score(y_true=gold_matrix[:, i], y_pred=prediction_matrix[:, i]) * 100
+     for i in range(gold_matrix.shape[1])
+ ]
+ precision_scores = [
+     precision_score(
+         y_true=gold_matrix[:, i],
+         y_pred=prediction_matrix[:, i],
+         average="binary",
+         pos_label=1,
+     )
+     * 100
+     for i in range(gold_matrix.shape[1])
+ ]
+ recall_scores = [
+     recall_score(
+         y_true=gold_matrix[:, i],
+         y_pred=prediction_matrix[:, i],
+         average="binary",
+         pos_label=1,
+     )
+     * 100
+     for i in range(gold_matrix.shape[1])
+ ]
+ f1_scores = [
+     f1_score(
+         y_true=gold_matrix[:, i],
+         y_pred=prediction_matrix[:, i],
+         average="binary",
+         pos_label=1,
+     )
+     * 100
+     for i in range(gold_matrix.shape[1])
+ ]
+
+ # Compute the averaged scores
+ average_accuracy = np.mean(accuracy_scores)
+ average_precision = np.mean(precision_scores)
+ average_recall = np.mean(recall_scores)
+ average_f1 = np.mean(f1_scores)
+
+ st.write(f"Average Accuracy: {average_accuracy:.2f}%")
+ st.write(f"Average Precision: {average_precision:.2f}%")
+ st.write(f"Average Recall: {average_recall:.2f}%")
+ st.write(f"Average F1: {average_f1:.2f}%")
requirements.txt CHANGED
@@ -2,3 +2,4 @@ transformers
  torch
  datasets
  pandas
+ numpy