mirageco committed on
Commit
cfd9447
1 Parent(s): 5858f03

Add FinTrade SR dataset

Browse files
src/about.py CHANGED
@@ -52,7 +52,7 @@ class Tasks(Enum):
52
  task53 = Task("EFPA", "F1", "EFPA", category="Spanish")
53
  task54 = Task("FinanceES", "F1", "FinanceES", category="Spanish")
54
  task55 = Task("TSA-Spanish", "F1", "TSA-Spanish", category="Spanish")
55
- task56 = Task("FinTrade", "CR", "FinTrade", category="Decision-Making (DM)")
56
 
57
  NUM_FEWSHOT = 0 # Change with your few shot
58
  # ---------------------------------------------------
@@ -140,7 +140,7 @@ Our evaluation metrics include, but are not limited to, Accuracy, F1 Score, ROUG
140
  - **EFPA**: F1. Financial argument classification in Spanish. This dataset requires the classification of arguments in Spanish financial documents, focusing on identifying claims, evidence, and other argumentative structures.
141
  - **FinanceES**: F1. Financial sentiment classification in Spanish. The task involves classifying sentiment in a broad range of Spanish financial documents, including news articles and reports. It tests the model's ability to adapt sentiment analysis techniques to a non-English language.
142
  - **TSA-Spanish**: F1. Sentiment analysis in Spanish. This dataset involves sentiment analysis on Spanish-language tweets and short texts, similar to the English TSA dataset but tailored for Spanish speakers. It evaluates the model's ability to process and analyze sentiment in social media content.
143
- - **FinTrade**: CR, SR, DV, AV, MD. Stock trading dataset. FinTrade is a novel dataset developed specifically for evaluating stock trading tasks using LLMs. It incorporates historical stock prices, financial news, and sentiment data from 10 different stocks over a year. This dataset is designed to simulate real-world trading scenarios, allowing models to perform agent-based financial trading. The task evaluates the models on multiple financial metrics such as Cumulative Return (CR), Sharpe Ratio (SR), Daily Volatility (DV), Annualized Volatility (AV), and Maximum Drawdown (MD). These metrics provide a comprehensive assessment of the model's profitability, risk management, and decision-making capabilities.
144
 
145
 
146
 
 
52
  task53 = Task("EFPA", "F1", "EFPA", category="Spanish")
53
  task54 = Task("FinanceES", "F1", "FinanceES", category="Spanish")
54
  task55 = Task("TSA-Spanish", "F1", "TSA-Spanish", category="Spanish")
55
+ task56 = Task("FinTrade", "SR", "FinTrade", category="Decision-Making (DM)")
56
 
57
  NUM_FEWSHOT = 0 # Change with your few shot
58
  # ---------------------------------------------------
 
140
  - **EFPA**: F1. Financial argument classification in Spanish. This dataset requires the classification of arguments in Spanish financial documents, focusing on identifying claims, evidence, and other argumentative structures.
141
  - **FinanceES**: F1. Financial sentiment classification in Spanish. The task involves classifying sentiment in a broad range of Spanish financial documents, including news articles and reports. It tests the model's ability to adapt sentiment analysis techniques to a non-English language.
142
  - **TSA-Spanish**: F1. Sentiment analysis in Spanish. This dataset involves sentiment analysis on Spanish-language tweets and short texts, similar to the English TSA dataset but tailored for Spanish speakers. It evaluates the model's ability to process and analyze sentiment in social media content.
143
+ - **FinTrade**: SR. Stock trading dataset. FinTrade is a novel dataset developed specifically for evaluating stock trading tasks using LLMs. It incorporates historical stock prices, financial news, and sentiment data from 10 different stocks over a year. This dataset is designed to simulate real-world trading scenarios, allowing models to perform agent-based financial trading. The task evaluates the models on multiple financial metrics such as Cumulative Return (CR), Sharpe Ratio (SR), Daily Volatility (DV), Annualized Volatility (AV), and Maximum Drawdown (MD). These metrics provide a comprehensive assessment of the model's profitability, risk management, and decision-making capabilities.
144
 
145
 
146
 
src/leaderboard/read_evals.py CHANGED
@@ -37,6 +37,7 @@ class EvalResult:
37
  def init_from_json_file(self, json_filepath):
38
  """Inits the result from the specific model result file"""
39
  with open(json_filepath) as fp:
 
40
  data = json.load(fp)
41
 
42
  config = data.get("config")
@@ -149,11 +150,17 @@ class EvalResult:
149
  elif task.value.category == "Text Generation (TG)":
150
  category_averages["average_TG"].append(score)
151
  elif task.value.category == "Risk Management (RM)":
152
- category_averages["average_RM"].append((score + 100) / 2)
 
 
 
153
  elif task.value.category == "Forecasting (FO)":
154
  category_averages["average_FO"].append(score)
155
  elif task.value.category == "Decision-Making (DM)":
156
- category_averages["average_DM"].append(score)
 
 
 
157
  elif task.value.category == "Spanish":
158
  category_averages["average_Spanish"].append(score)
159
 
 
37
  def init_from_json_file(self, json_filepath):
38
  """Inits the result from the specific model result file"""
39
  with open(json_filepath) as fp:
40
+ print(json_filepath)
41
  data = json.load(fp)
42
 
43
  config = data.get("config")
 
150
  elif task.value.category == "Text Generation (TG)":
151
  category_averages["average_TG"].append(score)
152
  elif task.value.category == "Risk Management (RM)":
153
+ if score == "missing":
154
+ category_averages["average_RM"].append(score)
155
+ else:
156
+ category_averages["average_RM"].append((score + 100) / 2)
157
  elif task.value.category == "Forecasting (FO)":
158
  category_averages["average_FO"].append(score)
159
  elif task.value.category == "Decision-Making (DM)":
160
+ if task.value.benchmark == "FinTrade" and score != "missing":
161
+ category_averages["average_DM"].append((score + 3)/6)
162
+ else:
163
+ category_averages["average_DM"].append(score)
164
  elif task.value.category == "Spanish":
165
  category_averages["average_Spanish"].append(score)
166
 
src/populate.py CHANGED
@@ -35,7 +35,11 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
35
  mcc_tasks = ["German", "Australian", "LendingClub", "ccf", "ccfraud", "polish", "taiwan", "portoseguro", "travelinsurance"]
36
  for task in mcc_tasks:
37
  if task in df.columns:
38
- df[task] = (df[task] + 100) / 2.0
 
 
 
 
39
 
40
  # Now, select the columns that were passed to the function
41
  df = df[cols].round(decimals=2)
 
35
  mcc_tasks = ["German", "Australian", "LendingClub", "ccf", "ccfraud", "polish", "taiwan", "portoseguro", "travelinsurance"]
36
  for task in mcc_tasks:
37
  if task in df.columns:
38
+ df[task] = df.apply(lambda row: (row[task] + 100) / 2.0 if row[task] != "missing" else row[task], axis=1)
39
+
40
+ for index, row in df.iterrows():
41
+ if "FinTrade" in row and row["FinTrade"] != "missing":
42
+ df.loc[index, "FinTrade"] = (row["FinTrade"] + 3) / 6
43
 
44
  # Now, select the columns that were passed to the function
45
  df = df[cols].round(decimals=2)