idolezal committed on
Commit
a03ea09
·
1 Parent(s): b299ccd

Reduce "compare_significance.py"

Browse files
Files changed (1) hide show
  1. compare_significance.py +0 -293
compare_significance.py CHANGED
@@ -1,14 +1,3 @@
1
- import argparse
2
- import json
3
- from collections import defaultdict
4
- from typing import Sequence
5
-
6
- import numpy as np
7
- from numba import njit, prange
8
- from scipy.stats import ttest_rel
9
- from sklearn.metrics import roc_curve, auc
10
- from tqdm import tqdm
11
-
12
  SUPPORTED_METRICS = [
13
  "avg_mcauroc", # for classification tasks
14
  "exact_match", # for QA tasks
@@ -16,285 +5,3 @@ SUPPORTED_METRICS = [
16
  "rouge_raw_r2_mid_f", # for summarization tasks
17
  "word_perplexity", # for language modeling tasks
18
  ]
19
-
20
-
21
- def _get_CMs(i, probabilities, references, thresholds):
22
- confusion_matrices = []
23
- for threshold in thresholds[i]:
24
- TP = 0
25
- FP = 0
26
- TN = 0
27
- FN = 0
28
- for j in range(len(probabilities)):
29
- if probabilities[j][i] >= threshold:
30
- if references[j] == i:
31
- TP += 1
32
- else:
33
- FP += 1
34
- else:
35
- if references[j] == i:
36
- FN += 1
37
- else:
38
- TN += 1
39
- cm = {
40
- "TP": TP,
41
- "FP": FP,
42
- "TN": TN,
43
- "FN": FN,
44
- "threshold": threshold,
45
- "class": i,
46
- }
47
- confusion_matrices.append(cm)
48
-
49
- return confusion_matrices
50
-
51
-
52
- def compute_significance_ttest(scores_A, scores_B):
53
- delta = np.mean(scores_A) - np.mean(scores_B)
54
- if delta <= 0:
55
- return 1.0, delta
56
- t, p = ttest_rel(scores_A, scores_B)
57
- # correct for one-tailed test
58
- p_value = p / 2
59
- return p_value, delta
60
-
61
-
62
- @njit(parallel=True)
63
- def compute_significance_bootstrap(scores_A, scores_B):
64
- n = len(scores_A)
65
- R = 1_000
66
- delta_orig = np.mean(scores_A) - np.mean(scores_B)
67
-
68
- if delta_orig <= 0:
69
- return 1.0, delta_orig
70
- r = 0
71
- for _ in prange(R):
72
- samples = np.random.choice(n, n, replace=True)
73
- temp_A = scores_A[samples]
74
- temp_B = scores_B[samples]
75
- delta = np.mean(temp_A) - np.mean(temp_B)
76
- if delta > 2 * delta_orig:
77
- r += 1
78
-
79
- pval = r / R
80
- return pval, delta_orig
81
-
82
-
83
- def compute_significance_avg_mcauroc(
84
- probsA: Sequence[Sequence[float]], referencesA: Sequence[int],
85
- probsB: Sequence[Sequence[float]], referencesB: Sequence[int],
86
- ):
87
- # compute MC-AUC for model A
88
- model_A_scores = get_mc_auc_samples(probsA, referencesA, Nsamples=100)
89
- model_B_scores = get_mc_auc_samples(probsB, referencesB, Nsamples=100)
90
- delta = np.mean(model_A_scores) - np.mean(model_B_scores)
91
-
92
- # one-tailed test
93
- p_value = (
94
- (model_A_scores[:, np.newaxis] <= model_B_scores[np.newaxis, :]).sum()
95
- / (len(model_A_scores) * len(model_B_scores))
96
- )
97
-
98
- return p_value, delta
99
-
100
-
101
- # Helper function to convert confusion matrices to numba-compatible arrays
102
- def convert_confusion_matrices(confusion_matrices):
103
- num_thresholds = len(confusion_matrices)
104
- tp = np.empty(num_thresholds)
105
- fn = np.empty(num_thresholds)
106
- for k in range(num_thresholds):
107
- tp[k] = confusion_matrices[k]["TP"]
108
- fn[k] = confusion_matrices[k]["FN"]
109
- return tp, fn
110
-
111
-
112
- @njit(parallel=True)
113
- def compute_tpr_variates(tp, fn, λ, Nsamples, num_thresholds):
114
- tpr_variates_for_each_fpr = np.empty((num_thresholds, Nsamples))
115
- for k in prange(num_thresholds):
116
- tpr_variates_for_each_fpr[k, :] = np.random.beta(tp[k] + λ, fn[k] + λ, Nsamples)
117
- return tpr_variates_for_each_fpr
118
-
119
-
120
- def get_mc_auc_samples(probs, references, Nsamples=1_000_000):
121
- n_classes = list(range(len(probs[0])))
122
- fpr = dict()
123
- thresholds = dict()
124
- # compute AUC for every class
125
- auc_scores_per_class = []
126
- for i in range(len(n_classes)):
127
- # for i-th class vs all others
128
- fpr[i], _, thresholds[i] = roc_curve(
129
- y_true=[1 if x == n_classes[i] else 0 for x in references],
130
- y_score=[prob[i] for prob in probs],
131
- )
132
-
133
- confusion_matrices = _get_CMs(i, probs, references, thresholds)
134
- tp, fn = convert_confusion_matrices(confusion_matrices)
135
-
136
- λ = 1.0 # <- Flat prior
137
- # λ = 0.5 # <- Jeffrey's prior
138
-
139
- # sample variates for every threshold
140
- # tpr_variates_for_each_fpr = []
141
- # for k in range(len(thresholds[i])):
142
- # tpr_variates_for_each_fpr.append(
143
- # numpy.random.beta(confusion_matrices[k]["TP"] + λ, confusion_matrices[k]["FN"] + λ, Nsamples))
144
- tpr_variates_for_each_fpr = compute_tpr_variates(tp, fn, λ, Nsamples, len(thresholds[i]))
145
-
146
- # fprs x tpr_variates
147
- # tpr_variates_for_each_fpr = np.array(tpr_variates_for_each_fpr)
148
-
149
- # now pick 1 variate for each fpr, and compute AUC
150
- auc_scores = []
151
- for tpr_variates in tpr_variates_for_each_fpr.T:
152
- auc_score = auc(fpr[i], tpr_variates)
153
- # if numpy.isnan(auc_score):
154
- # auc_score = 0
155
- auc_scores.append(auc_score)
156
- auc_scores_per_class.append(auc_scores)
157
-
158
- auc_scores_per_class = np.array(auc_scores_per_class)
159
- mcauc_scores = np.mean(auc_scores_per_class, axis=0)
160
- return mcauc_scores
161
-
162
-
163
- def read_json(file_path):
164
- data = defaultdict(list)
165
- with open(file_path, "r") as f:
166
- fc = json.load(f)
167
- for task, results in fc["predictions"].items():
168
- # determine the metric
169
- metric = None
170
- for key in SUPPORTED_METRICS:
171
- if key in results[0]:
172
- metric = key
173
- break
174
- if metric is None:
175
- raise ValueError(f"Unsupported metric in {file_path}")
176
-
177
- if metric == "avg_mcauroc":
178
- local_data = [line[metric] for line in fc["predictions"][task]]
179
- unzipped_list = list(zip(*local_data))
180
- golds = unzipped_list[0]
181
- probs = unzipped_list[1]
182
- data[task] = (golds, probs), metric
183
- else:
184
- scores = [line[metric] for line in fc["predictions"][task]]
185
- data[task] = scores, metric
186
-
187
- # make sure all tasks are submitted
188
- METADATA_FILE = "tasks_metadata.json"
189
- with open(METADATA_FILE, "r") as f:
190
- metadata = json.load(f)
191
-
192
- all_tasks = list(metadata.keys())
193
- all_missing_tasks = []
194
- for task in all_tasks:
195
- if task not in data:
196
- all_missing_tasks.append(task)
197
- if len(all_missing_tasks) > 0:
198
- EOLN = "\n"
199
- raise ValueError(f"Missing tasks in {file_path}: {EOLN.join(all_missing_tasks)}")
200
- return data
201
-
202
-
203
- def process_task(task, dataA, dataB, significance_level):
204
- metricA = dataA[task][1]
205
- metricB = dataB[task][1]
206
- assert metricA == metricB
207
- assert len(dataA[task]) == len(dataB[task])
208
-
209
- if metricA == "avg_mcauroc":
210
- p_value, delta = compute_significance_avg_mcauroc(
211
- probsA=dataA[task][0][1], referencesA=dataA[task][0][0],
212
- probsB=dataB[task][0][1], referencesB=dataB[task][0][0],
213
- )
214
- elif metricA in ["acc", "exact_match"]:
215
- p_value, delta = compute_significance_ttest(
216
- scores_A=dataA[task][0],
217
- scores_B=dataB[task][0]
218
- )
219
- elif metricA in ["rouge_raw_r2_mid_f", "word_perplexity"]:
220
- p_value, delta = compute_significance_bootstrap(
221
- scores_A=np.array(dataA[task][0]),
222
- scores_B=np.array(dataB[task][0])
223
- )
224
- else:
225
- raise ValueError(f"Unsupported metric {metricA}")
226
-
227
- if delta <= 0:
228
- p_value = 1.0
229
-
230
- return task, {
231
- "significant": not (p_value > significance_level),
232
- "p_value": p_value,
233
- "delta": delta,
234
- }
235
-
236
-
237
- def check_significance(fileA, fileB, significance_level=0.05):
238
- dataA = read_json(fileA)
239
- dataB = read_json(fileB)
240
-
241
- decisions = dict()
242
- _iter = tqdm(list(dataA.keys()))
243
- for task in _iter:
244
- _iter.set_description(f"Processing task: {task}")
245
- metricA = dataA[task][1]
246
- metricB = dataB[task][1]
247
- assert metricA == metricB
248
- assert len(dataA[task]) == len(dataB[task])
249
-
250
- if metricA == "avg_mcauroc":
251
- p_value, delta = compute_significance_avg_mcauroc(
252
- probsA=dataA[task][0][1], referencesA=dataA[task][0][0],
253
- probsB=dataB[task][0][1], referencesB=dataB[task][0][0],
254
- )
255
-
256
- elif metricA in ["acc", "exact_match"]:
257
- p_value, delta = compute_significance_ttest(
258
- scores_A=dataA[task][0],
259
- scores_B=dataB[task][0]
260
- )
261
- elif metricA in ["rouge_raw_r2_mid_f", "word_perplexity"]:
262
- p_value, delta = compute_significance_bootstrap(
263
- scores_A=np.array(dataA[task][0]),
264
- scores_B=np.array(dataB[task][0])
265
- )
266
- else:
267
- raise ValueError(f"Unsupported metric {metricA}")
268
- if delta <= 0:
269
- p_value = 1.0
270
- decisions[task] = {
271
- "significant": not (p_value > significance_level),
272
- "p_value": p_value,
273
- "delta": delta,
274
- }
275
-
276
- return decisions
277
-
278
-
279
- def main():
280
- parser = argparse.ArgumentParser(description="One-tailed test if model A improves over model B.")
281
- parser.add_argument("--modelA", help="ModelA JSON file from lm harness.")
282
- parser.add_argument("--modelB", help="ModelB JSON file from lm harness.")
283
- parser.add_argument(
284
- "--significance_level",
285
- type=float,
286
- default=0.05,
287
- help="Significance level (e.g., 0.05)",
288
- )
289
- args = parser.parse_args()
290
-
291
- result = check_significance(args.modelA, args.modelB, args.significance_level)
292
- print(json.dumps(result, indent=2))
293
-
294
-
295
- # harness already returns stderr estimate for sampling distribution
296
- # see https://github.com/EleutherAI/lm-evaluation-harness/blob/6433bd3fe3033d302b22cdcd53af237e9039ef29/lm_eval/api/metrics.py#L213
297
-
298
- if __name__ == "__main__":
299
- check_significance("../csmpt.json", "../llama3_instruct.json", 0.05)
300
- main()
 
 
 
 
 
 
 
 
 
 
 
 
1
  SUPPORTED_METRICS = [
2
  "avg_mcauroc", # for classification tasks
3
  "exact_match", # for QA tasks
 
5
  "rouge_raw_r2_mid_f", # for summarization tasks
6
  "word_perplexity", # for language modeling tasks
7
  ]