Spaces:
Running
Running
Jae-Won Chung
committed on
Commit
•
dcc2472
1
Parent(s):
b559f9a
Fix NLP evaluation result paths
Browse files
pegasus/nlp-eval.yaml
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
- command:
|
2 |
-
- docker exec leaderboard{{ gpu }} python lm-evaluation-harness/main.py --device cuda --no_cache --model hf-causal-experimental --model_args pretrained={{ model }},trust_remote_code=True,use_accelerate=True --tasks arc_challenge --num_fewshot 25 --output_path /data/leaderboard/benchmark/nlp/{{ replace model "/" "--" }}.json
|
3 |
model:
|
4 |
- /data/leaderboard/weights/metaai/llama-7B
|
5 |
- /data/leaderboard/weights/metaai/llama-13B
|
@@ -21,7 +21,7 @@
|
|
21 |
- togethercomputer/RedPajama-INCITE-7B-Chat
|
22 |
|
23 |
- command:
|
24 |
-
- docker exec leaderboard{{ gpu }} python lm-evaluation-harness/main.py --device cuda --no_cache --model hf-causal-experimental --model_args pretrained={{ model }},trust_remote_code=True,use_accelerate=True --tasks hellaswag --num_fewshot 10 --output_path /data/leaderboard/benchmark/nlp/{{ replace model "/" "--" }}.json
|
25 |
model:
|
26 |
- /data/leaderboard/weights/metaai/llama-7B
|
27 |
- /data/leaderboard/weights/metaai/llama-13B
|
@@ -43,7 +43,7 @@
|
|
43 |
- togethercomputer/RedPajama-INCITE-7B-Chat
|
44 |
|
45 |
- command:
|
46 |
-
- docker exec leaderboard{{ gpu }} python lm-evaluation-harness/main.py --device cuda --no_cache --model hf-causal-experimental --model_args pretrained={{ model }},trust_remote_code=True,use_accelerate=True --tasks truthfulqa_mc --num_fewshot 0 --output_path /data/leaderboard/benchmark/nlp/{{ replace model "/" "--" }}.json
|
47 |
model:
|
48 |
- /data/leaderboard/weights/metaai/llama-7B
|
49 |
- /data/leaderboard/weights/metaai/llama-13B
|
@@ -65,8 +65,8 @@
|
|
65 |
- togethercomputer/RedPajama-INCITE-7B-Chat
|
66 |
|
67 |
- command:
|
68 |
-
- docker exec leaderboard{{ gpu }} python lm-evaluation-harness/main.py --device cuda --no_cache --model hf-seq2seq --model_args pretrained={{ model }},trust_remote_code=True,use_accelerate=True --tasks arc_challenge --num_fewshot 25 --output_path /data/leaderboard/benchmark/nlp/{{ replace model "/" "--" }}.json
|
69 |
-
- docker exec leaderboard{{ gpu }} python lm-evaluation-harness/main.py --device cuda --no_cache --model hf-seq2seq --model_args pretrained={{ model }},trust_remote_code=True,use_accelerate=True --tasks hellaswag --num_fewshot 10 --output_path /data/leaderboard/benchmark/nlp/{{ replace model "/" "--" }}.json
|
70 |
-
- docker exec leaderboard{{ gpu }} python lm-evaluation-harness/main.py --device cuda --no_cache --model hf-seq2seq --model_args pretrained={{ model }},trust_remote_code=True,use_accelerate=True --tasks truthfulqa_mc --num_fewshot 0 --output_path /data/leaderboard/benchmark/nlp/{{ replace model "/" "--" }}.json
|
71 |
model:
|
72 |
- lmsys/fastchat-t5-3b-v1.0
|
|
|
1 |
- command:
|
2 |
+
- docker exec leaderboard{{ gpu }} python lm-evaluation-harness/main.py --device cuda --no_cache --model hf-causal-experimental --model_args pretrained={{ model }},trust_remote_code=True,use_accelerate=True --tasks arc_challenge --num_fewshot 25 --output_path /data/leaderboard/benchmark/nlp/{{ replace model "/" "--" }}/arc_challenge.json
|
3 |
model:
|
4 |
- /data/leaderboard/weights/metaai/llama-7B
|
5 |
- /data/leaderboard/weights/metaai/llama-13B
|
|
|
21 |
- togethercomputer/RedPajama-INCITE-7B-Chat
|
22 |
|
23 |
- command:
|
24 |
+
- docker exec leaderboard{{ gpu }} python lm-evaluation-harness/main.py --device cuda --no_cache --model hf-causal-experimental --model_args pretrained={{ model }},trust_remote_code=True,use_accelerate=True --tasks hellaswag --num_fewshot 10 --output_path /data/leaderboard/benchmark/nlp/{{ replace model "/" "--" }}/hellaswag.json
|
25 |
model:
|
26 |
- /data/leaderboard/weights/metaai/llama-7B
|
27 |
- /data/leaderboard/weights/metaai/llama-13B
|
|
|
43 |
- togethercomputer/RedPajama-INCITE-7B-Chat
|
44 |
|
45 |
- command:
|
46 |
+
- docker exec leaderboard{{ gpu }} python lm-evaluation-harness/main.py --device cuda --no_cache --model hf-causal-experimental --model_args pretrained={{ model }},trust_remote_code=True,use_accelerate=True --tasks truthfulqa_mc --num_fewshot 0 --output_path /data/leaderboard/benchmark/nlp/{{ replace model "/" "--" }}/truthfulqa_mc.json
|
47 |
model:
|
48 |
- /data/leaderboard/weights/metaai/llama-7B
|
49 |
- /data/leaderboard/weights/metaai/llama-13B
|
|
|
65 |
- togethercomputer/RedPajama-INCITE-7B-Chat
|
66 |
|
67 |
- command:
|
68 |
+
- docker exec leaderboard{{ gpu }} python lm-evaluation-harness/main.py --device cuda --no_cache --model hf-seq2seq --model_args pretrained={{ model }},trust_remote_code=True,use_accelerate=True --tasks arc_challenge --num_fewshot 25 --output_path /data/leaderboard/benchmark/nlp/{{ replace model "/" "--" }}/arc_challenge.json
|
69 |
+
- docker exec leaderboard{{ gpu }} python lm-evaluation-harness/main.py --device cuda --no_cache --model hf-seq2seq --model_args pretrained={{ model }},trust_remote_code=True,use_accelerate=True --tasks hellaswag --num_fewshot 10 --output_path /data/leaderboard/benchmark/nlp/{{ replace model "/" "--" }}/hellaswag.json
|
70 |
+
- docker exec leaderboard{{ gpu }} python lm-evaluation-harness/main.py --device cuda --no_cache --model hf-seq2seq --model_args pretrained={{ model }},trust_remote_code=True,use_accelerate=True --tasks truthfulqa_mc --num_fewshot 0 --output_path /data/leaderboard/benchmark/nlp/{{ replace model "/" "--" }}/truthfulqa_mc.json
|
71 |
model:
|
72 |
- lmsys/fastchat-t5-3b-v1.0
|
scripts/aggregate_nlp_metrics.py
CHANGED
@@ -31,7 +31,7 @@ def main(data_dir: str, out_file: str = "score.csv") -> None:
|
|
31 |
for model_dir in models:
|
32 |
for task, metric in TASK_METRICS.items():
|
33 |
model_name = "/".join(model_dir.split("--")[-2:])
|
34 |
-
results = json.load(open(f"{data_dir}/{model_dir}/{task}"))
|
35 |
df.loc[model_name, TASK_SHORT_NAMES[task]] = float(results["results"][task][metric]) * 100.0
|
36 |
df = df.reset_index().rename(columns={"index": "model"})
|
37 |
|
|
|
31 |
for model_dir in models:
|
32 |
for task, metric in TASK_METRICS.items():
|
33 |
model_name = "/".join(model_dir.split("--")[-2:])
|
34 |
+
results = json.load(open(f"{data_dir}/{model_dir}/{task}.json"))
|
35 |
df.loc[model_name, TASK_SHORT_NAMES[task]] = float(results["results"][task][metric]) * 100.0
|
36 |
df = df.reset_index().rename(columns={"index": "model"})
|
37 |
|