aaditya committed on
Commit
565f4e3
·
1 Parent(s): 0d18408
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
README.md CHANGED
@@ -1,12 +1,13 @@
1
  ---
2
- title: Open Medical Llm Leaderboard
3
- emoji: 📊
4
  colorFrom: pink
5
- colorTo: green
6
  sdk: gradio
7
- sdk_version: 4.19.1
8
  app_file: app.py
9
  pinned: false
 
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Test Leaderboard
3
+ emoji: 🐢
4
  colorFrom: pink
5
+ colorTo: red
6
  sdk: gradio
7
+ sdk_version: 4.15.0
8
  app_file: app.py
9
  pinned: false
10
+ license: apache-2.0
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -39,7 +39,7 @@ from src.display.utils import Tasks
39
 
40
  from huggingface_hub import snapshot_download
41
 
42
- ## ------- ## ------- ## ------- ## ------- ## ------- ## ------- ## ------- ## ------- ## ------- ## ------- ## ------- ## -------##
43
 
44
  def restart_space():
45
  API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
@@ -100,6 +100,8 @@ def filter_queries(query: str, filtered_df: pd.DataFrame):
100
 
101
  def filter_models(df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool) -> pd.DataFrame:
102
 
 
 
103
  print(f"filter_models()'s df: {df}\n")
104
  # Show all models
105
  if show_deleted:
@@ -108,7 +110,10 @@ def filter_models(df: pd.DataFrame, type_query: list, size_query: list, precisio
108
  filtered_df = df[df[AutoEvalColumn.still_on_hub.name] is True]
109
 
110
  type_emoji = [t[0] for t in type_query]
 
 
111
  filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
 
112
  filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
113
 
114
  numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
@@ -353,7 +358,7 @@ scheduler = BackgroundScheduler()
353
  scheduler.add_job(restart_space, "interval", seconds=6 * 60 * 60)
354
 
355
  scheduler.start()
356
- # demo.queue(default_concurrency_limit=40).launch()
357
 
358
  # demo.launch(show_api=False, enable_queue=False)
359
- demo.launch() # TypeError: Blocks.launch() got an unexpected keyword argument 'enable_queue'
 
39
 
40
  from huggingface_hub import snapshot_download
41
 
42
+ ## ------- ## ------- ## ------- ## ------- ## ------- ## ------- ## ------- ## ------- ## ------- ## ------- ## ------- ## -------
43
 
44
  def restart_space():
45
  API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
 
100
 
101
  def filter_models(df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool) -> pd.DataFrame:
102
 
103
+
104
+ print("aa this is an example", df)
105
  print(f"filter_models()'s df: {df}\n")
106
  # Show all models
107
  if show_deleted:
 
110
  filtered_df = df[df[AutoEvalColumn.still_on_hub.name] is True]
111
 
112
  type_emoji = [t[0] for t in type_query]
113
+ print("aa this is an example", df, AutoEvalColumn.model_type_symbol.name, "thhhthht")
114
+ print("type", type_emoji)
115
  filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
116
+ print("bb", filtered_df)
117
  filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
118
 
119
  numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
 
358
  scheduler.add_job(restart_space, "interval", seconds=6 * 60 * 60)
359
 
360
  scheduler.start()
361
+ demo.queue().launch()
362
 
363
  # demo.launch(show_api=False, enable_queue=False)
364
+ # demo.launch(enable_queue=False).queue() # TypeError: Blocks.launch() got an unexpected keyword argument 'enable_queue'
app_empty.py CHANGED
@@ -4,5 +4,4 @@ def greet(name):
4
  return "Hello " + name + "!!"
5
 
6
  # iface = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- # iface.launch()
8
- # autocomplete
 
4
  return "Hello " + name + "!!"
5
 
6
  # iface = gr.Interface(fn=greet, inputs="text", outputs="text")
7
+ # iface.launch()
 
requirements.txt CHANGED
@@ -22,6 +22,7 @@ accelerate
22
  sentencepiece
23
  langdetect
24
  sacrebleu
 
25
  rouge_score
26
  bert-score
27
  evaluate
 
22
  sentencepiece
23
  langdetect
24
  sacrebleu
25
+ cchardet
26
  rouge_score
27
  bert-score
28
  evaluate
src/backend/envs.py CHANGED
@@ -27,7 +27,7 @@ class Tasks(Enum):
27
  task5 = Task("college_medicine (mmlu)", "MMLU College Medicine", 0)
28
  task6 = Task("medical_genetics (mmlu)", "MMLU Medical Genetics", 0)
29
  task7 = Task("professional_medicine (mmlu)", "MMLU Professional Medicine", 0)
30
- task8 = Task("pubmedqa", "PubMedQA", 0)
31
 
32
 
33
 
 
27
  task5 = Task("college_medicine (mmlu)", "MMLU College Medicine", 0)
28
  task6 = Task("medical_genetics (mmlu)", "MMLU Medical Genetics", 0)
29
  task7 = Task("professional_medicine (mmlu)", "MMLU Professional Medicine", 0)
30
+ task8 = Task("pubmedqa", "PubMedQA", 0)
31
 
32
 
33
 
src/backend/run_eval_suite.py CHANGED
@@ -33,7 +33,7 @@ def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_siz
33
  # indexes all tasks from the `lm_eval/tasks` subdirectory.
34
  # Alternatively, you can set `TaskManager(include_path="path/to/my/custom/task/configs")`
35
  # to include a set of tasks in a separate directory.
36
- task_manager = TaskManager(include_path="src/backend/open_medical_llm_leaderboard_tasks")
37
 
38
  if "gpt" in eval_request.model:
39
  model = "openai-chat-completions"
 
33
  # indexes all tasks from the `lm_eval/tasks` subdirectory.
34
  # Alternatively, you can set `TaskManager(include_path="path/to/my/custom/task/configs")`
35
  # to include a set of tasks in a separate directory.
36
+ task_manager = TaskManager(include_path="src/backend/probing_tasks")
37
 
38
  if "gpt" in eval_request.model:
39
  model = "openai-chat-completions"
src/display/utils.py CHANGED
@@ -1,13 +1,11 @@
1
- from dataclasses import dataclass, field, make_dataclass
2
  from enum import Enum
3
 
4
  import pandas as pd
5
 
6
 
7
  def fields(raw_class):
8
- return [
9
- v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"
10
- ]
11
 
12
 
13
  @dataclass
@@ -28,13 +26,10 @@ class Tasks(Enum):
28
  mmlu_mg = Task("medical_genetics (mmlu)", "acc", "MMLU Medical Genetics")
29
  mmlu_pm = Task("professional_medicine (mmlu)", "acc", "MMLU Professional Medicine")
30
  pubmedqa = Task("pubmedqa", "acc", "PubMedQA")
31
-
32
-
33
  # These classes are for user facing column names,
34
  # to avoid having to change them all around the code
35
  # when a modif is needed
36
-
37
-
38
  @dataclass
39
  class ColumnContent:
40
  name: str
@@ -45,103 +40,29 @@ class ColumnContent:
45
  dummy: bool = False
46
  is_task: bool = False
47
 
48
-
49
- # Define a function to generate ColumnContent instances
50
- def column_content_factory(
51
- name: str,
52
- type: str,
53
- displayed_by_default: bool,
54
- hidden: bool = False,
55
- never_hidden: bool = False,
56
- dummy: bool = False,
57
- is_task: bool = False,
58
- ):
59
- return lambda: ColumnContent(
60
- name=name,
61
- type=type,
62
- displayed_by_default=displayed_by_default,
63
- hidden=hidden,
64
- never_hidden=never_hidden,
65
- dummy=dummy,
66
- is_task=is_task,
67
- )
68
-
69
-
70
  auto_eval_column_dict = []
71
  # Init
72
- auto_eval_column_dict.append(
73
- [
74
- "model_type_symbol",
75
- ColumnContent,
76
- ColumnContent("T", "str", True, never_hidden=True),
77
- ]
78
- )
79
- auto_eval_column_dict.append(
80
- [
81
- "model",
82
- ColumnContent,
83
- ColumnContent("Model", "markdown", True, never_hidden=True),
84
- ]
85
- )
86
- # Scores
87
- auto_eval_column_dict.append(
88
- ["average", ColumnContent, ColumnContent("Avg", "number", True)]
89
- )
90
  for task in Tasks:
91
- auto_eval_column_dict.append(
92
- [
93
- task.name,
94
- ColumnContent,
95
- ColumnContent(task.value.col_name, "number", True, is_task=True),
96
- ]
97
- ) # hidden was true by default
98
  # Model information
99
- auto_eval_column_dict.append(
100
- ["model_type", ColumnContent, ColumnContent("Type", "str", False)]
101
- )
102
- auto_eval_column_dict.append(
103
- ["architecture", ColumnContent, ColumnContent("Architecture", "str", False)]
104
- )
105
- auto_eval_column_dict.append(
106
- ["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)]
107
- )
108
- auto_eval_column_dict.append(
109
- ["precision", ColumnContent, ColumnContent("Precision", "str", False)]
110
- )
111
- auto_eval_column_dict.append(
112
- ["license", ColumnContent, ColumnContent("Hub License", "str", False)]
113
- )
114
- auto_eval_column_dict.append(
115
- ["params", ColumnContent, ColumnContent("#Params (B)", "number", False)]
116
- )
117
- auto_eval_column_dict.append(
118
- ["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)]
119
- )
120
- auto_eval_column_dict.append(
121
- [
122
- "still_on_hub",
123
- ColumnContent,
124
- ColumnContent("Available on the hub", "bool", False),
125
- ]
126
- )
127
- auto_eval_column_dict.append(
128
- ["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)]
129
- )
130
  # Dummy column for the search bar (hidden by the custom CSS)
131
- # Define the structure of your dataclass fields with default_factory for mutable defaults
132
- auto_eval_column_fields = [
133
- (
134
- "model_type_symbol",
135
- ColumnContent,
136
- field(
137
- default_factory=column_content_factory("T", "str", True, never_hidden=True)
138
- ),
139
- ),
140
- # Add other fields similarly...
141
- ]
142
 
143
  # We use make dataclass to dynamically fill the scores from Tasks
144
- AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_fields, frozen=True)
145
 
146
 
147
  @dataclass(frozen=True)
@@ -189,6 +110,9 @@ class WeightType(Enum):
189
  Delta = ModelDetails("Delta")
190
 
191
 
 
 
 
192
  class Precision(Enum):
193
  float32 = ModelDetails("float32")
194
  float16 = ModelDetails("float16")
@@ -213,17 +137,13 @@ class Precision(Enum):
213
  if precision in ["GPTQ", "None"]:
214
  return Precision.qt_GPTQ
215
  return Precision.Unknown
216
-
217
 
218
  # Column selection
219
  COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
220
  TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
221
- COLS_LITE = [
222
- c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden
223
- ]
224
- TYPES_LITE = [
225
- c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden
226
- ]
227
 
228
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
229
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
1
+ from dataclasses import dataclass, make_dataclass
2
  from enum import Enum
3
 
4
  import pandas as pd
5
 
6
 
7
  def fields(raw_class):
8
+ return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
 
 
9
 
10
 
11
  @dataclass
 
26
  mmlu_mg = Task("medical_genetics (mmlu)", "acc", "MMLU Medical Genetics")
27
  mmlu_pm = Task("professional_medicine (mmlu)", "acc", "MMLU Professional Medicine")
28
  pubmedqa = Task("pubmedqa", "acc", "PubMedQA")
29
+
 
30
  # These classes are for user facing column names,
31
  # to avoid having to change them all around the code
32
  # when a modif is needed
 
 
33
  @dataclass
34
  class ColumnContent:
35
  name: str
 
40
  dummy: bool = False
41
  is_task: bool = False
42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  auto_eval_column_dict = []
44
  # Init
45
+ auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
46
+ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
47
+ #Scores
48
+ auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Avg", "number", True)])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  for task in Tasks:
50
+ auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, is_task=True)]) # hidden was true by default
 
 
 
 
 
 
51
  # Model information
52
+ auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
53
+ auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
54
+ auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
55
+ auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
56
+ auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
57
+ auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
58
+ auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
59
+ auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
60
+ auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  # Dummy column for the search bar (hidden by the custom CSS)
62
+ auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
 
 
 
 
 
 
 
 
 
 
63
 
64
  # We use make dataclass to dynamically fill the scores from Tasks
65
+ AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
66
 
67
 
68
  @dataclass(frozen=True)
 
110
  Delta = ModelDetails("Delta")
111
 
112
 
113
+
114
+
115
+
116
  class Precision(Enum):
117
  float32 = ModelDetails("float32")
118
  float16 = ModelDetails("float16")
 
137
  if precision in ["GPTQ", "None"]:
138
  return Precision.qt_GPTQ
139
  return Precision.Unknown
140
+
141
 
142
  # Column selection
143
  COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
144
  TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
145
+ COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
146
+ TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
 
 
 
 
147
 
148
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
149
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
src/envs.py CHANGED
@@ -5,6 +5,7 @@ from huggingface_hub import HfApi
5
 
6
  H4_TOKEN = os.environ.get("HF_SECRET", None)
7
 
 
8
  REPO_ID = "openlifescienceai/open_medical_llm_leaderboard"
9
 
10
  QUEUE_REPO = "openlifescienceai/test_requests"
@@ -16,7 +17,7 @@ PRIVATE_RESULTS_REPO = "openlifescienceai/test_private-results"
16
 
17
  IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
18
 
19
-
20
  CACHE_PATH = os.getenv("HF_HOME", ".")
21
 
22
  print(f"CACHE_PATH = {CACHE_PATH}")
@@ -27,6 +28,7 @@ EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
27
  EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
28
  EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
29
 
 
30
 
31
  # Rate limit variables
32
  RATE_LIMIT_PERIOD = 7
@@ -34,4 +36,4 @@ RATE_LIMIT_QUOTA = 5
34
  HAS_HIGHER_RATE_LIMIT = ["TheBloke"]
35
 
36
  API = HfApi(token=H4_TOKEN)
37
- # API = HfApi()
 
5
 
6
  H4_TOKEN = os.environ.get("HF_SECRET", None)
7
 
8
+ # REPO_ID = "pminervini/hallucinations-leaderboard"
9
  REPO_ID = "openlifescienceai/open_medical_llm_leaderboard"
10
 
11
  QUEUE_REPO = "openlifescienceai/test_requests"
 
17
 
18
  IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
19
 
20
+ # CACHE_PATH = "/Users/chaeeunlee/Documents/VSC_workspaces/test_leaderboard" #
21
  CACHE_PATH = os.getenv("HF_HOME", ".")
22
 
23
  print(f"CACHE_PATH = {CACHE_PATH}")
 
28
  EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
29
  EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
30
 
31
+ # PATH_TO_COLLECTION = "hallucinations-leaderboard/llm-leaderboard-best-models-652d6c7965a4619fb5c27a03" # ??
32
 
33
  # Rate limit variables
34
  RATE_LIMIT_PERIOD = 7
 
36
  HAS_HIGHER_RATE_LIMIT = ["TheBloke"]
37
 
38
  API = HfApi(token=H4_TOKEN)
39
+ # API = HfApi()