sheonhan committed
Commit 2102b66
1 Parent(s): d06dc21

format utils.py

Files changed (1)
  1. utils.py +71 -52
utils.py CHANGED
@@ -11,6 +11,7 @@ import datetime
 import glob
 from dataclasses import dataclass
 from typing import List, Tuple, Dict
+
 # clone / pull the lmeh eval data
 H4_TOKEN = os.environ.get("H4_TOKEN", None)
 LMEH_REPO = "HuggingFaceH4/lmeh_evaluations"
@@ -18,67 +19,74 @@ LMEH_REPO = "HuggingFaceH4/lmeh_evaluations"
 METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]
 BENCHMARKS = ["arc_challenge", "hellaswag", "hendrycks", "truthfulqa_mc"]
 BENCH_TO_NAME = {
-    "arc_challenge":"ARC (25-shot) ⬆️",
-    "hellaswag":"HellaSwag (10-shot) ⬆️",
-    "hendrycks":"MMLU (5-shot) ⬆️",
-    "truthfulqa_mc":"TruthfulQA (0-shot) ⬆️",
+    "arc_challenge": "ARC (25-shot) ⬆️",
+    "hellaswag": "HellaSwag (10-shot) ⬆️",
+    "hendrycks": "MMLU (5-shot) ⬆️",
+    "truthfulqa_mc": "TruthfulQA (0-shot) ⬆️",
 }
-def make_clickable_model(model_name):
-    LLAMAS = ["huggingface/llama-7b", "huggingface/llama-13b", "huggingface/llama-30b", "huggingface/llama-65b"]
+
+
+def make_clickable_model(model_name):
+    LLAMAS = [
+        "huggingface/llama-7b",
+        "huggingface/llama-13b",
+        "huggingface/llama-30b",
+        "huggingface/llama-65b",
+    ]
     if model_name in LLAMAS:
         model = model_name.split("/")[1]
         return f'<a target="_blank" href="https://ai.facebook.com/blog/large-language-model-llama-meta-ai/" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model}</a>'
-
+
     if model_name == "HuggingFaceH4/stable-vicuna-13b-2904":
         link = "https://huggingface.co/" + "CarperAI/stable-vicuna-13b-delta"
         return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">stable-vicuna-13b</a>'
-
+
     if model_name == "HuggingFaceH4/llama-7b-ift-alpaca":
         link = "https://crfm.stanford.edu/2023/03/13/alpaca.html"
         return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">alpaca-13b</a>'
 
     # remove user from model name
-    #model_name_show = ' '.join(model_name.split('/')[1:])
+    # model_name_show = ' '.join(model_name.split('/')[1:])
 
     link = "https://huggingface.co/" + model_name
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
 
+
 @dataclass
 class EvalResult:
-    eval_name : str
-    org : str
-    model : str
-    revision : str
-    is_8bit : bool
-    results : dict
-
+    eval_name: str
+    org: str
+    model: str
+    revision: str
+    is_8bit: bool
+    results: dict
+
     def to_dict(self):
-
         if self.org is not None:
-            base_model =f"{self.org}/{self.model}"
+            base_model = f"{self.org}/{self.model}"
         else:
-            base_model =f"{self.model}"
+            base_model = f"{self.model}"
         data_dict = {}
-
+
         data_dict["eval_name"] = self.eval_name
         data_dict["8bit"] = self.is_8bit
         data_dict["Model"] = make_clickable_model(base_model)
         data_dict["Revision"] = self.revision
-        data_dict["Average ⬆️"] = round(sum([v for k,v in self.results.items()])/4.0,1)
-        #data_dict["# params"] = get_n_params(base_model)
-
+        data_dict["Average ⬆️"] = round(
+            sum([v for k, v in self.results.items()]) / 4.0, 1
+        )
+        # data_dict["# params"] = get_n_params(base_model)
+
         for benchmark in BENCHMARKS:
             if not benchmark in self.results.keys():
                 self.results[benchmark] = None
-
-        for k,v in BENCH_TO_NAME.items():
+
+        for k, v in BENCH_TO_NAME.items():
             data_dict[v] = self.results[k]
-
+
         return data_dict
-
-
-
-
+
+
 def parse_eval_result(json_filepath: str) -> Tuple[str, dict]:
     with open(json_filepath) as fp:
         data = json.load(fp)
@@ -88,49 +96,60 @@ def parse_eval_result(json_filepath: str) -> Tuple[str, dict]:
     model = path_split[-4]
     is_8bit = path_split[-2] == "8bit"
     revision = path_split[-3]
-    if len(path_split)== 7:
+    if len(path_split) == 7:
         # handles gpt2 type models that don't have an org
         result_key = f"{path_split[-4]}_{path_split[-3]}_{path_split[-2]}"
     else:
-        result_key = f"{path_split[-5]}_{path_split[-4]}_{path_split[-3]}_{path_split[-2]}"
+        result_key = (
+            f"{path_split[-5]}_{path_split[-4]}_{path_split[-3]}_{path_split[-2]}"
+        )
         org = path_split[-5]
-
+
     eval_result = None
-    for benchmark, metric in zip(BENCHMARKS, METRICS):
+    for benchmark, metric in zip(BENCHMARKS, METRICS):
         if benchmark in json_filepath:
             accs = np.array([v[metric] for k, v in data["results"].items()])
-            mean_acc = round(np.mean(accs)*100.0,1)
-            eval_result = EvalResult(result_key, org, model, revision, is_8bit, {benchmark:mean_acc})
-
+            mean_acc = round(np.mean(accs) * 100.0, 1)
+            eval_result = EvalResult(
+                result_key, org, model, revision, is_8bit, {benchmark: mean_acc}
+            )
+
     return result_key, eval_result
-
-
-
-
+
+
 def get_eval_results(is_public) -> List[EvalResult]:
-    json_filepaths = glob.glob("evals/eval_results/public/**/16bit/*.json", recursive=True)
+    json_filepaths = glob.glob(
+        "evals/eval_results/public/**/16bit/*.json", recursive=True
+    )
     if not is_public:
-        json_filepaths += glob.glob("evals/eval_results/private/**/*.json", recursive=True)
-        json_filepaths += glob.glob("evals/eval_results/private/**/*.json", recursive=True)
-        json_filepaths += glob.glob("evals/eval_results/public/**/8bit/*.json", recursive=True) # include the 8bit evals of public models
+        json_filepaths += glob.glob(
+            "evals/eval_results/private/**/*.json", recursive=True
+        )
+        json_filepaths += glob.glob(
+            "evals/eval_results/private/**/*.json", recursive=True
+        )
+        json_filepaths += glob.glob(
+            "evals/eval_results/public/**/8bit/*.json", recursive=True
+        )  # include the 8bit evals of public models
     eval_results = {}
-
+
     for json_filepath in json_filepaths:
         result_key, eval_result = parse_eval_result(json_filepath)
         if result_key in eval_results.keys():
            eval_results[result_key].results.update(eval_result.results)
         else:
             eval_results[result_key] = eval_result
-
-
-    eval_results = [v for k,v in eval_results.items()]
-
+
+    eval_results = [v for k, v in eval_results.items()]
+
    return eval_results
-
+
+
 def get_eval_results_dicts(is_public=True) -> List[Dict]:
     eval_results = get_eval_results(is_public)
-
+
     return [e.to_dict() for e in eval_results]
 
+
 eval_results_dict = get_eval_results_dicts()
 # print(eval_results_dict)
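
Side note on the code above: this is a format-only commit, so the repeated private-results glob in get_eval_results is preserved; the same JSON files can be parsed twice (harmless, since duplicate result_keys just re-merge identical results, but wasteful). A minimal de-duplicating sketch, using the same patterns as above; collect_result_paths is a hypothetical helper, not part of this commit:

import glob


def collect_result_paths(is_public: bool) -> list:
    # Same glob patterns as get_eval_results(); set() drops the duplicate
    # entries that the repeated private glob would otherwise produce.
    paths = glob.glob("evals/eval_results/public/**/16bit/*.json", recursive=True)
    if not is_public:
        paths += glob.glob("evals/eval_results/private/**/*.json", recursive=True)
        paths += glob.glob("evals/eval_results/public/**/8bit/*.json", recursive=True)
    return sorted(set(paths))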
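
For context, a short usage sketch of how the dicts produced by get_eval_results_dicts() could feed a leaderboard table. The pandas calls are an assumption for illustration, not code from this repo:

import pandas as pd

from utils import get_eval_results_dicts

# One record per model/revision, with the display columns set in
# EvalResult.to_dict() ("Model", "Revision", "Average ⬆️", ...).
records = get_eval_results_dicts(is_public=True)
df = pd.DataFrame.from_records(records)
df = df.sort_values(by="Average ⬆️", ascending=False)
print(df[["Model", "Average ⬆️", "Revision", "8bit"]].head())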