Spaces:

finosfoundation
/

Open-Financial-LLM-Leaderboard

Running

App Files Community

mirageco committed on Sep 2, 2024

Commit

cfd9447

1 Parent(s): 5858f03

Add FinTrade SR dataset

Browse files

Files changed (3) hide show

src/about.py +2 -2
src/leaderboard/read_evals.py +9 -2
src/populate.py +5 -1

src/about.py CHANGED Viewed

@@ -52,7 +52,7 @@ class Tasks(Enum):
     task53 = Task("EFPA", "F1", "EFPA", category="Spanish")
     task54 = Task("FinanceES", "F1", "FinanceES", category="Spanish")
     task55 = Task("TSA-Spanish", "F1", "TSA-Spanish", category="Spanish")
-    task56 = Task("FinTrade", "CR", "FinTrade", category="Decision-Making (DM)")
 NUM_FEWSHOT = 0  # Change with your few shot
 # ---------------------------------------------------
@@ -140,7 +140,7 @@ Our evaluation metrics include, but are not limited to, Accuracy, F1 Score, ROUG
 - **EFPA**: F1. Financial argument classification in Spanish. This dataset requires the classification of arguments in Spanish financial documents, focusing on identifying claims, evidence, and other argumentative structures.
 - **FinanceES**: F1. Financial sentiment classification in Spanish. The task involves classifying sentiment in a broad range of Spanish financial documents, including news articles and reports. It tests the model's ability to adapt sentiment analysis techniques to a non-English language.
 - **TSA-Spanish**: F1. Sentiment analysis in Spanish. This dataset involves sentiment analysis on Spanish-language tweets and short texts, similar to the English TSA dataset but tailored for Spanish speakers. It evaluates the model's ability to process and analyze sentiment in social media content.
-- **FinTrade**: CR, SR, DV, AV, MD. Stock trading dataset. FinTrade is a novel dataset developed specifically for evaluating stock trading tasks using LLMs. It incorporates historical stock prices, financial news, and sentiment data from 10 different stocks over a year. This dataset is designed to simulate real-world trading scenarios, allowing models to perform agent-based financial trading. The task evaluates the models on multiple financial metrics such as Cumulative Return (CR), Sharpe Ratio (SR), Daily Volatility (DV), Annualized Volatility (AV), and Maximum Drawdown (MD). These metrics provide a comprehensive assessment of the model's profitability, risk management, and decision-making capabilities.

     task53 = Task("EFPA", "F1", "EFPA", category="Spanish")
     task54 = Task("FinanceES", "F1", "FinanceES", category="Spanish")
     task55 = Task("TSA-Spanish", "F1", "TSA-Spanish", category="Spanish")
+    task56 = Task("FinTrade", "SR", "FinTrade", category="Decision-Making (DM)")
 NUM_FEWSHOT = 0  # Change with your few shot
 # ---------------------------------------------------
 - **EFPA**: F1. Financial argument classification in Spanish. This dataset requires the classification of arguments in Spanish financial documents, focusing on identifying claims, evidence, and other argumentative structures.
 - **FinanceES**: F1. Financial sentiment classification in Spanish. The task involves classifying sentiment in a broad range of Spanish financial documents, including news articles and reports. It tests the model's ability to adapt sentiment analysis techniques to a non-English language.
 - **TSA-Spanish**: F1. Sentiment analysis in Spanish. This dataset involves sentiment analysis on Spanish-language tweets and short texts, similar to the English TSA dataset but tailored for Spanish speakers. It evaluates the model's ability to process and analyze sentiment in social media content.
+- **FinTrade**: SR. Stock trading dataset. FinTrade is a novel dataset developed specifically for evaluating stock trading tasks using LLMs. It incorporates historical stock prices, financial news, and sentiment data from 10 different stocks over a year. This dataset is designed to simulate real-world trading scenarios, allowing models to perform agent-based financial trading. The task evaluates the models on multiple financial metrics such as Cumulative Return (CR), Sharpe Ratio (SR), Daily Volatility (DV), Annualized Volatility (AV), and Maximum Drawdown (MD). These metrics provide a comprehensive assessment of the model's profitability, risk management, and decision-making capabilities.

src/leaderboard/read_evals.py CHANGED Viewed

@@ -37,6 +37,7 @@ class EvalResult:
     def init_from_json_file(self, json_filepath):
         """Inits the result from the specific model result file"""
         with open(json_filepath) as fp:
             data = json.load(fp)
         config = data.get("config")
@@ -149,11 +150,17 @@ class EvalResult:
                 elif task.value.category == "Text Generation (TG)":
                     category_averages["average_TG"].append(score)
                 elif task.value.category == "Risk Management (RM)":
-                    category_averages["average_RM"].append((score + 100) / 2)
                 elif task.value.category == "Forecasting (FO)":
                     category_averages["average_FO"].append(score)
                 elif task.value.category == "Decision-Making (DM)":
-                    category_averages["average_DM"].append(score)
                 elif task.value.category == "Spanish":
                     category_averages["average_Spanish"].append(score)

     def init_from_json_file(self, json_filepath):
         """Inits the result from the specific model result file"""
         with open(json_filepath) as fp:
+            print(json_filepath)
             data = json.load(fp)
         config = data.get("config")
                 elif task.value.category == "Text Generation (TG)":
                     category_averages["average_TG"].append(score)
                 elif task.value.category == "Risk Management (RM)":
+                    if score == "missing":
+                        category_averages["average_RM"].append(score)
+                    else:
+                        category_averages["average_RM"].append((score + 100) / 2)
                 elif task.value.category == "Forecasting (FO)":
                     category_averages["average_FO"].append(score)
                 elif task.value.category == "Decision-Making (DM)":
+                    if task.value.benchmark == "FinTrade" and score != "missing":
+                        category_averages["average_DM"].append((score + 3)/6)
+                    else:
+                        category_averages["average_DM"].append(score)
                 elif task.value.category == "Spanish":
                     category_averages["average_Spanish"].append(score)

src/populate.py CHANGED Viewed

@@ -35,7 +35,11 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     mcc_tasks = ["German", "Australian", "LendingClub", "ccf", "ccfraud", "polish", "taiwan", "portoseguro", "travelinsurance"]
     for task in mcc_tasks:
         if task in df.columns:
-            df[task] = (df[task] + 100) / 2.0
     # Now, select the columns that were passed to the function
     df = df[cols].round(decimals=2)

     mcc_tasks = ["German", "Australian", "LendingClub", "ccf", "ccfraud", "polish", "taiwan", "portoseguro", "travelinsurance"]
     for task in mcc_tasks:
         if task in df.columns:
+            df[task] = df.apply(lambda row: (row[task] + 100) / 2.0 if row[task] != "missing" else row[task], axis=1)
+    for index, row in df.iterrows():
+        if "FinTrade" in row and row["FinTrade"] != "missing":
+            df.loc[index, "FinTrade"] = (row["FinTrade"] + 3) / 6
     # Now, select the columns that were passed to the function
     df = df[cols].round(decimals=2)