Add FinTrade SR dataset
- src/about.py +2 -2
- src/leaderboard/read_evals.py +9 -2
- src/populate.py +5 -1
src/about.py
CHANGED
@@ -52,7 +52,7 @@ class Tasks(Enum):
     task53 = Task("EFPA", "F1", "EFPA", category="Spanish")
     task54 = Task("FinanceES", "F1", "FinanceES", category="Spanish")
     task55 = Task("TSA-Spanish", "F1", "TSA-Spanish", category="Spanish")
-    task56 = Task("FinTrade", "
+    task56 = Task("FinTrade", "SR", "FinTrade", category="Decision-Making (DM)")

 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
@@ -140,7 +140,7 @@ Our evaluation metrics include, but are not limited to, Accuracy, F1 Score, ROUG
 - **EFPA**: F1. Financial argument classification in Spanish. This dataset requires the classification of arguments in Spanish financial documents, focusing on identifying claims, evidence, and other argumentative structures.
 - **FinanceES**: F1. Financial sentiment classification in Spanish. The task involves classifying sentiment in a broad range of Spanish financial documents, including news articles and reports. It tests the model's ability to adapt sentiment analysis techniques to a non-English language.
 - **TSA-Spanish**: F1. Sentiment analysis in Spanish. This dataset involves sentiment analysis on Spanish-language tweets and short texts, similar to the English TSA dataset but tailored for Spanish speakers. It evaluates the model's ability to process and analyze sentiment in social media content.
-- **FinTrade**:
+- **FinTrade**: SR. Stock trading dataset. FinTrade is a novel dataset developed specifically for evaluating stock trading tasks using LLMs. It incorporates historical stock prices, financial news, and sentiment data from 10 different stocks over a year. This dataset is designed to simulate real-world trading scenarios, allowing models to perform agent-based financial trading. The task evaluates the models on multiple financial metrics such as Cumulative Return (CR), Sharpe Ratio (SR), Daily Volatility (DV), Annualized Volatility (AV), and Maximum Drawdown (MD). These metrics provide a comprehensive assessment of the model's profitability, risk management, and decision-making capabilities.


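For reference, the new entry relies on the existing Task wrapper defined earlier in src/about.py. Below is a minimal sketch of how it plugs into the Tasks enum, assuming Task is a simple dataclass whose fields mirror the positional arguments used above (benchmark, metric, col_name, plus a category keyword); the field names are inferred, not copied from the file.

# Minimal sketch (field names are assumptions inferred from the call sites above).
from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str   # dataset key looked up in the result files
    metric: str      # headline metric, e.g. "F1" or "SR" (Sharpe Ratio)
    col_name: str    # column name shown on the leaderboard
    category: str = "Other"

class Tasks(Enum):
    task55 = Task("TSA-Spanish", "F1", "TSA-Spanish", category="Spanish")
    task56 = Task("FinTrade", "SR", "FinTrade", category="Decision-Making (DM)")

# read_evals.py branches on task.value.category and task.value.benchmark.
print(Tasks.task56.value.metric)  # -> SR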
src/leaderboard/read_evals.py
CHANGED
@@ -37,6 +37,7 @@ class EvalResult:
     def init_from_json_file(self, json_filepath):
         """Inits the result from the specific model result file"""
         with open(json_filepath) as fp:
+            print(json_filepath)
             data = json.load(fp)

         config = data.get("config")
@@ -149,11 +150,17 @@ class EvalResult:
             elif task.value.category == "Text Generation (TG)":
                 category_averages["average_TG"].append(score)
             elif task.value.category == "Risk Management (RM)":
-                category_averages["average_RM"].append(score)
+                if score == "missing":
+                    category_averages["average_RM"].append(score)
+                else:
+                    category_averages["average_RM"].append((score + 100) / 2)
             elif task.value.category == "Forecasting (FO)":
                 category_averages["average_FO"].append(score)
             elif task.value.category == "Decision-Making (DM)":
-                category_averages["average_DM"].append(score)
+                if task.value.benchmark == "FinTrade" and score != "missing":
+                    category_averages["average_DM"].append((score + 3)/6)
+                else:
+                    category_averages["average_DM"].append(score)
             elif task.value.category == "Spanish":
                 category_averages["average_Spanish"].append(score)

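The two new branches are plain linear rescalings applied before the category average is taken; the "missing" sentinel is passed through unchanged. A worked example of the arithmetic follows, assuming RM scores are MCC-style values on a [-100, 100] scale and FinTrade's Sharpe Ratio is treated as roughly bounded by [-3, 3]; the normalize_* helper names are illustrative, not part of the codebase.

# Illustrative helpers mirroring the branches above (names are hypothetical).
def normalize_rm(score):
    # MCC-style score assumed on [-100, 100] -> [0, 100]
    return score if score == "missing" else (score + 100) / 2

def normalize_fintrade(score):
    # Sharpe Ratio assumed roughly on [-3, 3] -> [0, 1]
    return score if score == "missing" else (score + 3) / 6

print(normalize_rm(-100), normalize_rm(0), normalize_rm(100))  # 0.0 50.0 100.0
print(normalize_fintrade(-3), normalize_fintrade(1.5))         # 0.0 0.75
print(normalize_rm("missing"))                                 # missing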
src/populate.py
CHANGED
@@ -35,7 +35,11 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     mcc_tasks = ["German", "Australian", "LendingClub", "ccf", "ccfraud", "polish", "taiwan", "portoseguro", "travelinsurance"]
     for task in mcc_tasks:
         if task in df.columns:
-            df[task] = (
+            df[task] = df.apply(lambda row: (row[task] + 100) / 2.0 if row[task] != "missing" else row[task], axis=1)
+
+    for index, row in df.iterrows():
+        if "FinTrade" in row and row["FinTrade"] != "missing":
+            df.loc[index, "FinTrade"] = (row["FinTrade"] + 3) / 6

     # Now, select the columns that were passed to the function
     df = df[cols].round(decimals=2)