Add small descriptions of each of the datasets
- src/about.py +39 -39
- src/leaderboard/read_evals.py +6 -4
src/about.py
CHANGED
@@ -100,45 +100,45 @@ If the icon is "?", it indicates that there is insufficient information about th
 
 Our evaluation metrics include, but are not limited to, Accuracy, F1 Score, ROUGE score, BERTScore, and Matthews correlation coefficient (MCC), providing a multidimensional assessment of model performance. Metrics for specific tasks are as follows:
 
-- **FPB**: F1
-- **FiQA-SA**: F1
-- **TSA**:
-- **Headlines**: AvgF1
-- **FOMC**: F1
-- **FinArg-ACC**:
-- **FinArg-ARC**:
-- **
-- **MA**:
-- **MLESG**:
-- **NER**: EntityF1
-- **FINER-ORD**: EntityF1
-- **FinRED**: F1
-- **SC**: F1
-- **CD**: F1
-- **FinQA**: EmAcc
-- **TATQA**: EmAcc
-- **ConvFinQA**: EmAcc
-- **FNXL**:
-- **FSRL**:
-- **EDTSUM**:
-- **ECTSUM**:
-- **BigData22**:
-- **ACL18**:
-- **CIKM18**:
-- **German**: MCC
-- **Australian**: MCC
-- **LendingClub**: MCC
-- **ccf**: MCC
-- **ccfraud**: MCC
-- **polish**: MCC
-- **taiwan**: MCC
-- **portoseguro**: MCC
-- **travelinsurance**: MCC
-- **MultiFin-ES**: F1
-- **EFP**: F1
-- **EFPA**: F1
-- **FinanceES**: F1
-- **TSA-Spanish**: F1
+- **FPB**: F1, Accuracy. Financial PhraseBank classification task.
+- **FiQA-SA**: F1. Sentiment analysis on FiQA financial domain.
+- **TSA**: F1, Accuracy. Sentiment analysis.
+- **Headlines**: AvgF1. News headline classification.
+- **FOMC**: F1, Accuracy. Hawkish-dovish classification.
+- **FinArg-ACC**: F1, Accuracy. Financial argument unit classification.
+- **FinArg-ARC**: F1, Accuracy. Financial argument relation classification.
+- **MultiFin**: F1, Accuracy. Multi-class financial sentiment analysis.
+- **MA**: F1, Accuracy. Deal completeness classification.
+- **MLESG**: F1, Accuracy. ESG issue identification.
+- **NER**: EntityF1. Named entity recognition in financial texts.
+- **FINER-ORD**: EntityF1. Ordinal classification in financial NER.
+- **FinRED**: F1, EntityF1. Financial relation extraction from text.
+- **SC**: F1, EntityF1. Causal classification task in the financial domain.
+- **CD**: F1, EntityF1. Causal detection.
+- **FinQA**: EmAcc. Numerical question answering in finance.
+- **TATQA**: F1, EmAcc. Table-based question answering in financial documents.
+- **ConvFinQA**: EmAcc. Multi-turn question answering in finance.
+- **FNXL**: F1, EmAcc. Numeric labeling in financial texts.
+- **FSRL**: F1, EmAcc. Financial statement relation linking.
+- **EDTSUM**: ROUGE, BERTScore, BARTScore. Extractive document summarization in finance.
+- **ECTSUM**: ROUGE, BERTScore, BARTScore. Extractive content summarization.
+- **BigData22**: Accuracy, MCC. Stock movement prediction.
+- **ACL18**: Accuracy, MCC. Financial news-based stock prediction.
+- **CIKM18**: Accuracy, MCC. Financial market prediction using news.
+- **German**: F1, MCC. Credit scoring in the German market.
+- **Australian**: F1, MCC. Credit scoring in the Australian market.
+- **LendingClub**: F1, MCC. Peer-to-peer lending risk prediction.
+- **ccf**: F1, MCC. Credit card fraud detection.
+- **ccfraud**: F1, MCC. Credit card transaction fraud detection.
+- **polish**: F1, MCC. Credit risk prediction in the Polish market.
+- **taiwan**: F1, MCC. Credit risk prediction in the Taiwanese market.
+- **portoseguro**: F1, MCC. Claim analysis in the Brazilian market.
+- **travelinsurance**: F1, MCC. Travel insurance claim prediction.
+- **MultiFin-ES**: F1. Multi-class financial sentiment analysis in Spanish.
+- **EFP**: F1. Financial phrase classification in Spanish.
+- **EFPA**: F1. Financial argument classification in Spanish.
+- **FinanceES**: F1. Financial sentiment classification in Spanish.
+- **TSA-Spanish**: F1. Sentiment analysis in Spanish.
 
 
 To ensure a fair and unbiased assessment of the models' true capabilities, all evaluations are conducted in zero-shot settings (0-shots). This approach eliminates any potential advantage from task-specific fine-tuning, providing a clear indication of how well the models can generalize to new tasks.
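For orientation, here is a minimal sketch, not the leaderboard's own evaluation code, of how the classification metrics named above (Accuracy, F1, MCC) can be computed with scikit-learn; the labels below are made up for illustration:

```python
# Minimal sketch only: toy gold/predicted labels for a three-class sentiment
# task in the style of FPB or TSA (hypothetical data, not leaderboard output).
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef

gold = ["positive", "negative", "neutral", "neutral", "positive"]
pred = ["positive", "neutral", "neutral", "neutral", "negative"]

accuracy = accuracy_score(gold, pred)             # fraction of exact label matches
macro_f1 = f1_score(gold, pred, average="macro")  # unweighted mean of per-class F1
mcc = matthews_corrcoef(gold, pred)               # correlation-style score in [-1, 1]

print(f"Accuracy={accuracy:.3f}  F1={macro_f1:.3f}  MCC={mcc:.3f}")
```
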
src/leaderboard/read_evals.py
CHANGED
@@ -11,6 +11,7 @@ from src.display.formatting import make_clickable_model
 from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
 from src.submission.check_validity import is_model_on_hub
 
+task_benchmarks = {task.value.benchmark for task in Tasks}
 
 @dataclass
 class EvalResult:
@@ -38,8 +39,6 @@ class EvalResult:
         with open(json_filepath) as fp:
             data = json.load(fp)
 
-        print(f"Processing file: {json_filepath}")
-
         config = data.get("config")
         # Precision
         precision = Precision.from_str(config.get("model_dtype"))
@@ -83,7 +82,11 @@ class EvalResult:
             mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc
 
-
+        # Print missing benchmarks if any
+        missing_benchmarks = task_benchmarks - results.keys()
+        if missing_benchmarks:
+            print(f"(Missing results) Model {model} is missing {', '.join(missing_benchmarks)} from result files")
+
 
         return self(
             eval_name=result_key,
@@ -102,7 +105,6 @@ class EvalResult:
     def update_with_request_file(self, requests_path):
         """Finds the relevant request file for the current model and updates info with it"""
         request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
-
         try:
             with open(request_file, "r") as f:
                 request = json.load(f)