zstanjj committed on
Commit
b5a12e3
·
1 Parent(s): 921b8ba

add auto eval

Browse files
Files changed (28) hide show
  1. app.py +10 -5
  2. eval-results/omnieval-auto/CLOSE_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json +34 -0
  3. eval-results/omnieval-auto/CLOSE_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json +34 -0
  4. eval-results/omnieval-auto/CLOSE_qwen2-72b/results_2023-12-08 15:46:20.425378.json +34 -0
  5. eval-results/omnieval-auto/CLOSE_yi15-34b/results_2023-12-08 15:46:20.425378.json +34 -0
  6. eval-results/omnieval-auto/bge-large-zh_qwen2-72b/results_2023-12-08 15:46:20.425378.json +35 -0
  7. eval-results/omnieval-auto/bge-m3_qwen2-72b/results_2023-12-08 15:46:20.425378.json +35 -0
  8. eval-results/omnieval-auto/e5-mistral-7b_qwen2-72b/results_2023-12-08 15:46:20.425378.json +35 -0
  9. eval-results/omnieval-auto/gte-qwen2-1.5b_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json +35 -0
  10. eval-results/omnieval-auto/gte-qwen2-1.5b_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json +35 -0
  11. eval-results/{demo-leaderboard → omnieval-auto}/gte-qwen2-1.5b_qwen2-72b/results_2023-12-08 15:46:20.425378.json +12 -12
  12. eval-results/omnieval-auto/gte-qwen2-1.5b_yi15-34b/results_2023-12-08 15:46:20.425378.json +35 -0
  13. eval-results/omnieval-auto/jina-zh_qwen2-72b/results_2023-12-08 15:46:20.425378.json +35 -0
  14. eval-results/{demo-leaderboard → omnieval-human}/CLOSE_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json +0 -0
  15. eval-results/{demo-leaderboard → omnieval-human}/CLOSE_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json +0 -0
  16. eval-results/{demo-leaderboard → omnieval-human}/CLOSE_qwen2-72b/results_2023-12-08 15:46:20.425378.json +0 -0
  17. eval-results/{demo-leaderboard → omnieval-human}/CLOSE_yi15-34b/results_2023-12-08 15:46:20.425378.json +0 -0
  18. eval-results/{demo-leaderboard/qwen2-72b_bge-large-zh → omnieval-human/bge-large-zh_qwen2-72b}/results_2023-12-08 15:46:20.425378.json +1 -1
  19. eval-results/{demo-leaderboard/qwen2-72b_bge-m3 → omnieval-human/bge-m3_qwen2-72b}/results_2023-12-08 15:46:20.425378.json +1 -1
  20. eval-results/{demo-leaderboard/qwen2-72b_e5-mistral-7b → omnieval-human/e5-mistral-7b_qwen2-72b}/results_2023-12-08 15:46:20.425378.json +1 -1
  21. eval-results/{demo-leaderboard → omnieval-human}/gte-qwen2-1.5b_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json +0 -0
  22. eval-results/{demo-leaderboard → omnieval-human}/gte-qwen2-1.5b_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json +0 -0
  23. eval-results/{demo-leaderboard/qwen2-72b_gte-qwen2-1.5b → omnieval-human/gte-qwen2-1.5b_qwen2-72b}/results_2023-12-08 15:46:20.425378.json +1 -1
  24. eval-results/{demo-leaderboard → omnieval-human}/gte-qwen2-1.5b_yi15-34b/results_2023-12-08 15:46:20.425378.json +0 -0
  25. eval-results/{demo-leaderboard/qwen2-72b_jina-zh → omnieval-human/jina-zh_qwen2-72b}/results_2023-12-08 15:46:20.425378.json +1 -1
  26. src/about.py +5 -5
  27. src/envs.py +3 -2
  28. src/leaderboard/read_evals.py +1 -1
app.py CHANGED
@@ -24,7 +24,7 @@ from src.display.utils import (
24
  WeightType,
25
  Precision
26
  )
27
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
28
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
29
  from src.submission.submit import add_new_eval
30
 
@@ -41,7 +41,8 @@ try:
41
  except Exception:
42
  restart_space()
43
  try:
44
- print(EVAL_RESULTS_PATH)
 
45
  # snapshot_download(
46
  # repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
47
  # )
@@ -49,7 +50,8 @@ except Exception:
49
  restart_space()
50
 
51
 
52
- LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 
53
 
54
  # (
55
  # finished_eval_queue_df,
@@ -97,8 +99,11 @@ with demo:
97
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
98
 
99
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
100
- with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
101
- leaderboard = init_leaderboard(LEADERBOARD_DF)
 
 
 
102
 
103
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
104
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
24
  WeightType,
25
  Precision
26
  )
27
+ from src.envs import API, EVAL_REQUESTS_PATH, AUTO_RESULTS_PATH, HUMAN_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
28
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
29
  from src.submission.submit import add_new_eval
30
 
 
41
  except Exception:
42
  restart_space()
43
  try:
44
+ print(AUTO_RESULTS_PATH)
45
+ print(HUMAN_RESULTS_PATH)
46
  # snapshot_download(
47
  # repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
48
  # )
 
50
  restart_space()
51
 
52
 
53
+ AUTO_LEADERBOARD_DF = get_leaderboard_df(AUTO_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
54
+ HUMAN_LEADERBOARD_DF = get_leaderboard_df(HUMAN_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
55
 
56
  # (
57
  # finished_eval_queue_df,
 
99
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
100
 
101
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
102
+ with gr.TabItem("🏆OmniEval-Human", elem_id="llm-benchmark-tab-table", id=0):
103
+ leaderboard = init_leaderboard(HUMAN_LEADERBOARD_DF)
104
+
105
+ with gr.TabItem("🤖OmniEval-Auto", elem_id="llm-benchmark-tab-table", id=1):
106
+ leaderboard = init_leaderboard(AUTO_LEADERBOARD_DF)
107
 
108
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
109
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
eval-results/omnieval-auto/CLOSE_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "retrieval": {
4
+ "mrr": 0.0,
5
+ "map": 0.0
6
+ },
7
+ "generation": {
8
+ "em": 0.0011680767773708802,
9
+ "f1": 0.3709233008524321,
10
+ "rouge1": 0.2570830224992733,
11
+ "rouge2": 0.09085043984411759,
12
+ "rougeL": 0.1860727124152372,
13
+ "accuracy": 0.35869427958075517,
14
+ "completeness": 0.5755086661642803,
15
+ "hallucination": 0.0,
16
+ "utilization": 0.0,
17
+ "numerical_accuracy": 0.11213720316622691
18
+ }
19
+ },
20
+ "config": {
21
+ "eval_name": "CLOSE_deepseek-v2-chat",
22
+ "generative_model": "deepseek-ai/DeepSeek-V2-Chat-0628",
23
+ "generative_model_args": {
24
+ "name": "deepseek-ai/DeepSeek-V2-Chat-0628",
25
+ "num_params": 236,
26
+ "open_source": true
27
+ },
28
+ "retrieval_model": "CLOSE",
29
+ "retrieval_model_args": {
30
+ "num_params": 0.0,
31
+ "open_source": true
32
+ }
33
+ }
34
+ }
eval-results/omnieval-auto/CLOSE_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "retrieval": {
4
+ "mrr": 0.0,
5
+ "map": 0.0
6
+ },
7
+ "generation": {
8
+ "em": 0.0008839499936860714,
9
+ "f1": 0.39891051266403244,
10
+ "rouge1": 0.2679937299203498,
11
+ "rouge2": 0.09293819886242284,
12
+ "rougeL": 0.19931718897529843,
13
+ "accuracy": 0.3238413941154186,
14
+ "completeness": 0.52843637454982,
15
+ "hallucination": 0.0,
16
+ "utilization": 0.0,
17
+ "numerical_accuracy": 0.06765619606489472
18
+ }
19
+ },
20
+ "config": {
21
+ "eval_name": "CLOSE_llama3-70b-instruct",
22
+ "generative_model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
23
+ "generative_model_args": {
24
+ "name": "meta-llama/Meta-Llama-3.1-70B-Instruct",
25
+ "num_params": 70.6,
26
+ "open_source": true
27
+ },
28
+ "retrieval_model": "CLOSE",
29
+ "retrieval_model_args": {
30
+ "num_params": 0.0,
31
+ "open_source": true
32
+ }
33
+ }
34
+ }
eval-results/omnieval-auto/CLOSE_qwen2-72b/results_2023-12-08 15:46:20.425378.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "retrieval": {
4
+ "mrr": 0.0,
5
+ "map": 0.0
6
+ },
7
+ "generation": {
8
+ "em": 0.0002525571410531633,
9
+ "f1": 0.32215271896313463,
10
+ "rouge1": 0.2352109086389165,
11
+ "rouge2": 0.08060449522198783,
12
+ "rougeL": 0.16073680618083347,
13
+ "accuracy": 0.37883571157974494,
14
+ "completeness": 0.6016923768159353,
15
+ "hallucination": 0.0,
16
+ "utilization": 0.0,
17
+ "numerical_accuracy": 0.1255931667193926
18
+ }
19
+ },
20
+ "config": {
21
+ "eval_name": "CLOSE_qwen2-72b",
22
+ "generative_model": "Qwen/Qwen2.5-72B-Instruct",
23
+ "generative_model_args": {
24
+ "name": "Qwen/Qwen2.5-72B-Instruct",
25
+ "num_params": 72.7,
26
+ "open_source": true
27
+ },
28
+ "retrieval_model": "CLOSE",
29
+ "retrieval_model_args": {
30
+ "num_params": 0.0,
31
+ "open_source": true
32
+ }
33
+ }
34
+ }
eval-results/omnieval-auto/CLOSE_yi15-34b/results_2023-12-08 15:46:20.425378.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "retrieval": {
4
+ "mrr": 0.0,
5
+ "map": 0.0
6
+ },
7
+ "generation": {
8
+ "em": 0.0,
9
+ "f1": 0.06725057117657031,
10
+ "rouge1": 0.1277764944666756,
11
+ "rouge2": 0.03211441875898112,
12
+ "rougeL": 0.03257144660565082,
13
+ "accuracy": 0.15734309887612072,
14
+ "completeness": 0.5063249001331558,
15
+ "hallucination": 0.0,
16
+ "utilization": 0.0,
17
+ "numerical_accuracy": 0.06932865291794647
18
+ }
19
+ },
20
+ "config": {
21
+ "eval_name": "CLOSE_yi15-34b",
22
+ "generative_model": "01ai/Yi-1.5-34B-Chat-16K",
23
+ "generative_model_args": {
24
+ "name": "01ai/Yi-1.5-34B-Chat-16K",
25
+ "num_params": 34.4,
26
+ "open_source": true
27
+ },
28
+ "retrieval_model": "CLOSE",
29
+ "retrieval_model_args": {
30
+ "num_params": 0.0,
31
+ "open_source": true
32
+ }
33
+ }
34
+ }
eval-results/omnieval-auto/bge-large-zh_qwen2-72b/results_2023-12-08 15:46:20.425378.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "retrieval": {
4
+ "mrr": 0.3097634381445468,
5
+ "map": 0.30402197247127166
6
+ },
7
+ "generation": {
8
+ "em": 0.0026518499810582142,
9
+ "f1": 0.2480828824153542,
10
+ "rouge1": 0.2493538725800514,
11
+ "rouge2": 0.1235656068292625,
12
+ "rougeL": 0.16098924930699862,
13
+ "accuracy": 0.3906427579239803,
14
+ "completeness": 0.5930474914396308,
15
+ "hallucination": 0.0,
16
+ "utilization": 0.5045650189122212,
17
+ "numerical_accuracy": 0.28149656401119877
18
+ }
19
+ },
20
+ "config": {
21
+ "eval_name": "bge-large-zh_qwen2-72b",
22
+ "generative_model": "Qwen/Qwen2.5-72B-Instruct",
23
+ "generative_model_args": {
24
+ "name": "Qwen/Qwen2.5-72B-Instruct",
25
+ "num_params": 72.7,
26
+ "open_source": true
27
+ },
28
+ "retrieval_model": "BAAI/bge-large-zh",
29
+ "retrieval_model_args": {
30
+ "name": "BAAI/bge-large-zh",
31
+ "num_params": 0.326,
32
+ "open_source": true
33
+ }
34
+ }
35
+ }
eval-results/omnieval-auto/bge-m3_qwen2-72b/results_2023-12-08 15:46:20.425378.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "retrieval": {
4
+ "mrr": 0.33076566906595944,
5
+ "map": 0.32402765500694536
6
+ },
7
+ "generation": {
8
+ "em": 0.002525571410531633,
9
+ "f1": 0.2524796046548042,
10
+ "rouge1": 0.2542055585319881,
11
+ "rouge2": 0.12967013110722864,
12
+ "rougeL": 0.16623387811734364,
13
+ "accuracy": 0.0,
14
+ "completeness": 0.0,
15
+ "hallucination": 0.0,
16
+ "utilization": 0.0,
17
+ "numerical_accuracy": 0.0
18
+ }
19
+ },
20
+ "config": {
21
+ "eval_name": "bge-m3_qwen2-72b",
22
+ "generative_model": "Qwen/Qwen2.5-72B-Instruct",
23
+ "generative_model_args": {
24
+ "name": "Qwen/Qwen2.5-72B-Instruct",
25
+ "num_params": 72.7,
26
+ "open_source": true
27
+ },
28
+ "retrieval_model": "BAAI/bge-m3",
29
+ "retrieval_model_args": {
30
+ "name": "BAAI/bge-m3",
31
+ "num_params": 0.5,
32
+ "open_source": true
33
+ }
34
+ }
35
+ }
eval-results/omnieval-auto/e5-mistral-7b_qwen2-72b/results_2023-12-08 15:46:20.425378.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "retrieval": {
4
+ "mrr": 0.26059266742433806,
5
+ "map": 0.25533526960474806
6
+ },
7
+ "generation": {
8
+ "em": 0.002146735698951888,
9
+ "f1": 0.24207930410773865,
10
+ "rouge1": 0.24073805243800728,
11
+ "rouge2": 0.1162276261848681,
12
+ "rougeL": 0.1534679545927458,
13
+ "accuracy": 0.37713095087763604,
14
+ "completeness": 0.5855007473841555,
15
+ "hallucination": 0.0,
16
+ "utilization": 0.49136152656008253,
17
+ "numerical_accuracy": 0.2582123758594347
18
+ }
19
+ },
20
+ "config": {
21
+ "eval_name": "e5-mistral-7b_qwen2-72b",
22
+ "generative_model": "Qwen/Qwen2.5-72B-Instruct",
23
+ "generative_model_args": {
24
+ "name": "Qwen/Qwen2.5-72B-Instruct",
25
+ "num_params": 72.7,
26
+ "open_source": true
27
+ },
28
+ "retrieval_model": "intfloat/e5-mistral-7b-instruct",
29
+ "retrieval_model_args": {
30
+ "name": "intfloat/e5-mistral-7b-instruct",
31
+ "num_params": 7.11,
32
+ "open_source": true
33
+ }
34
+ }
35
+ }
eval-results/omnieval-auto/gte-qwen2-1.5b_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "retrieval": {
4
+ "mrr": 0.3406848507808225,
5
+ "map": 0.3337426863661236
6
+ },
7
+ "generation": {
8
+ "em": 0.0035568464031653824,
9
+ "f1": 0.3226028700822056,
10
+ "rouge1": 0.29804464952499493,
11
+ "rouge2": 0.1619392409911174,
12
+ "rougeL": 0.21536150159516076,
13
+ "accuracy": 0.3783377209477247,
14
+ "completeness": 0.5935541629364369,
15
+ "hallucination": 0.06668379802132854,
16
+ "utilization": 0.48314821907315203,
17
+ "numerical_accuracy": 0.2761605035405193
18
+ }
19
+ },
20
+ "config": {
21
+ "eval_name": "gte-qwen2-1.5b_deepseek-v2-chat",
22
+ "generative_model": "deepseek-ai/DeepSeek-V2-Chat-0628",
23
+ "generative_model_args": {
24
+ "name": "deepseek-ai/DeepSeek-V2-Chat-0628",
25
+ "num_params": 236,
26
+ "open_source": true
27
+ },
28
+ "retrieval_model": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
29
+ "retrieval_model_args": {
30
+ "name": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
31
+ "num_params": 1.78,
32
+ "open_source": true
33
+ }
34
+ }
35
+ }
eval-results/omnieval-auto/gte-qwen2-1.5b_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "retrieval": {
4
+ "mrr": 0.3406848507808225,
5
+ "map": 0.3337426863661236
6
+ },
7
+ "generation": {
8
+ "em": 0.030906680136380857,
9
+ "f1": 0.4704248712273675,
10
+ "rouge1": 0.3844331865430577,
11
+ "rouge2": 0.21544656691735142,
12
+ "rougeL": 0.3082188596657867,
13
+ "accuracy": 0.4181714862987751,
14
+ "completeness": 0.586105675146771,
15
+ "hallucination": 0.0880543450397334,
16
+ "utilization": 0.45601078859491395,
17
+ "numerical_accuracy": 0.2751721876024926
18
+ }
19
+ },
20
+ "config": {
21
+ "eval_name": "gte-qwen2-1.5b_llama3-70b-instruct",
22
+ "generative_model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
23
+ "generative_model_args": {
24
+ "name": "meta-llama/Meta-Llama-3.1-70B-Instruct",
25
+ "num_params": 70.6,
26
+ "open_source": true
27
+ },
28
+ "retrieval_model": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
29
+ "retrieval_model_args": {
30
+ "name": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
31
+ "num_params": 1.78,
32
+ "open_source": true
33
+ }
34
+ }
35
+ }
eval-results/{demo-leaderboard → omnieval-auto}/gte-qwen2-1.5b_qwen2-72b/results_2023-12-08 15:46:20.425378.json RENAMED
@@ -1,20 +1,20 @@
1
  {
2
  "results": {
3
  "retrieval": {
4
- "mrr": 0.36173120728929387,
5
- "map": 0.3512338648443432
6
  },
7
  "generation": {
8
- "em": 0.002277904328018223,
9
- "f1": 0.3804001391052641,
10
- "rouge1": 0.34576336184459094,
11
- "rouge2": 0.1928778762677512,
12
- "rougeL": 0.2383694455084706,
13
- "accuracy": 0.4145785876993166,
14
- "completeness": 0.598297213622291,
15
- "hallucination": 0.07213496218731821,
16
- "utilization": 1.13922942206655,
17
- "numerical_accuracy": 0.3218694885361552
18
  }
19
  },
20
  "config": {
 
1
  {
2
  "results": {
3
  "retrieval": {
4
+ "mrr": 0.3406848507808225,
5
+ "map": 0.3337426863661236
6
  },
7
  "generation": {
8
+ "em": 0.0028412678368480867,
9
+ "f1": 0.2477112059712835,
10
+ "rouge1": 0.25666135328401396,
11
+ "rouge2": 0.13256084364546591,
12
+ "rougeL": 0.1669344569228441,
13
+ "accuracy": 0.40573304710190683,
14
+ "completeness": 0.6131668895824045,
15
+ "hallucination": 0.0,
16
+ "utilization": 0.5346272891410885,
17
+ "numerical_accuracy": 0.2971301335972291
18
  }
19
  },
20
  "config": {
eval-results/omnieval-auto/gte-qwen2-1.5b_yi15-34b/results_2023-12-08 15:46:20.425378.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "retrieval": {
4
+ "mrr": 0.3406848507808225,
5
+ "map": 0.3337426863661236
6
+ },
7
+ "generation": {
8
+ "em": 0.0,
9
+ "f1": 0.09732568803130702,
10
+ "rouge1": 0.1642342072893325,
11
+ "rouge2": 0.06542075931397044,
12
+ "rougeL": 0.059256539829821125,
13
+ "accuracy": 0.3304375804375804,
14
+ "completeness": 0.5735068912710567,
15
+ "hallucination": 0.06555017663221248,
16
+ "utilization": 0.4132755170113409,
17
+ "numerical_accuracy": 0.175
18
+ }
19
+ },
20
+ "config": {
21
+ "eval_name": "gte-qwen2-1.5b_yi15-34b",
22
+ "generative_model": "01ai/Yi-1.5-34B-Chat-16K",
23
+ "generative_model_args": {
24
+ "name": "01ai/Yi-1.5-34B-Chat-16K",
25
+ "num_params": 34.4,
26
+ "open_source": true
27
+ },
28
+ "retrieval_model": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
29
+ "retrieval_model_args": {
30
+ "name": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
31
+ "num_params": 1.78,
32
+ "open_source": true
33
+ }
34
+ }
35
+ }
eval-results/omnieval-auto/jina-zh_qwen2-72b/results_2023-12-08 15:46:20.425378.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "retrieval": {
4
+ "mrr": 0.25315906890600665,
5
+ "map": 0.24830681483352277
6
+ },
7
+ "generation": {
8
+ "em": 0.0026518499810582142,
9
+ "f1": 0.24837825152624493,
10
+ "rouge1": 0.24111819423215256,
11
+ "rouge2": 0.11665848753826197,
12
+ "rougeL": 0.1558018779014647,
13
+ "accuracy": 0.3705644652102538,
14
+ "completeness": 0.5820335932813437,
15
+ "hallucination": 0.0,
16
+ "utilization": 0.4738984364905027,
17
+ "numerical_accuracy": 0.24648820567187915
18
+ }
19
+ },
20
+ "config": {
21
+ "eval_name": "jina-zh_qwen2-72b",
22
+ "generative_model": "Qwen/Qwen2.5-72B-Instruct",
23
+ "generative_model_args": {
24
+ "name": "Qwen/Qwen2.5-72B-Instruct",
25
+ "num_params": 72.7,
26
+ "open_source": true
27
+ },
28
+ "retrieval_model": "jinaai/jina-embeddings-v2-base-zh",
29
+ "retrieval_model_args": {
30
+ "name": "jinaai/jina-embeddings-v2-base-zh",
31
+ "num_params": 0.161,
32
+ "open_source": true
33
+ }
34
+ }
35
+ }
eval-results/{demo-leaderboard → omnieval-human}/CLOSE_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json RENAMED
File without changes
eval-results/{demo-leaderboard → omnieval-human}/CLOSE_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json RENAMED
File without changes
eval-results/{demo-leaderboard → omnieval-human}/CLOSE_qwen2-72b/results_2023-12-08 15:46:20.425378.json RENAMED
File without changes
eval-results/{demo-leaderboard → omnieval-human}/CLOSE_yi15-34b/results_2023-12-08 15:46:20.425378.json RENAMED
File without changes
eval-results/{demo-leaderboard/qwen2-72b_bge-large-zh → omnieval-human/bge-large-zh_qwen2-72b}/results_2023-12-08 15:46:20.425378.json RENAMED
@@ -18,7 +18,7 @@
18
  }
19
  },
20
  "config": {
21
- "eval_name": "qwen2-72b_bge-large-zh",
22
  "generative_model": "Qwen/Qwen2.5-72B-Instruct",
23
  "generative_model_args": {
24
  "name": "Qwen/Qwen2.5-72B-Instruct",
 
18
  }
19
  },
20
  "config": {
21
+ "eval_name": "bge-large-zh_qwen2-72b",
22
  "generative_model": "Qwen/Qwen2.5-72B-Instruct",
23
  "generative_model_args": {
24
  "name": "Qwen/Qwen2.5-72B-Instruct",
eval-results/{demo-leaderboard/qwen2-72b_bge-m3 → omnieval-human/bge-m3_qwen2-72b}/results_2023-12-08 15:46:20.425378.json RENAMED
@@ -18,7 +18,7 @@
18
  }
19
  },
20
  "config": {
21
- "eval_name": "qwen2-72b_bge-m3",
22
  "generative_model": "Qwen/Qwen2.5-72B-Instruct",
23
  "generative_model_args": {
24
  "name": "Qwen/Qwen2.5-72B-Instruct",
 
18
  }
19
  },
20
  "config": {
21
+ "eval_name": "bge-m3_qwen2-72b",
22
  "generative_model": "Qwen/Qwen2.5-72B-Instruct",
23
  "generative_model_args": {
24
  "name": "Qwen/Qwen2.5-72B-Instruct",
eval-results/{demo-leaderboard/qwen2-72b_e5-mistral-7b → omnieval-human/e5-mistral-7b_qwen2-72b}/results_2023-12-08 15:46:20.425378.json RENAMED
@@ -18,7 +18,7 @@
18
  }
19
  },
20
  "config": {
21
- "eval_name": "qwen2-72b_e5-mistral-7b",
22
  "generative_model": "Qwen/Qwen2.5-72B-Instruct",
23
  "generative_model_args": {
24
  "name": "Qwen/Qwen2.5-72B-Instruct",
 
18
  }
19
  },
20
  "config": {
21
+ "eval_name": "e5-mistral-7b_qwen2-72b",
22
  "generative_model": "Qwen/Qwen2.5-72B-Instruct",
23
  "generative_model_args": {
24
  "name": "Qwen/Qwen2.5-72B-Instruct",
eval-results/{demo-leaderboard → omnieval-human}/gte-qwen2-1.5b_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json RENAMED
File without changes
eval-results/{demo-leaderboard → omnieval-human}/gte-qwen2-1.5b_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json RENAMED
File without changes
eval-results/{demo-leaderboard/qwen2-72b_gte-qwen2-1.5b → omnieval-human/gte-qwen2-1.5b_qwen2-72b}/results_2023-12-08 15:46:20.425378.json RENAMED
@@ -18,7 +18,7 @@
18
  }
19
  },
20
  "config": {
21
- "eval_name": "qwen2-72b_gte-qwen2-1.5b",
22
  "generative_model": "Qwen/Qwen2.5-72B-Instruct",
23
  "generative_model_args": {
24
  "name": "Qwen/Qwen2.5-72B-Instruct",
 
18
  }
19
  },
20
  "config": {
21
+ "eval_name": "gte-qwen2-1.5b_qwen2-72b",
22
  "generative_model": "Qwen/Qwen2.5-72B-Instruct",
23
  "generative_model_args": {
24
  "name": "Qwen/Qwen2.5-72B-Instruct",
eval-results/{demo-leaderboard → omnieval-human}/gte-qwen2-1.5b_yi15-34b/results_2023-12-08 15:46:20.425378.json RENAMED
File without changes
eval-results/{demo-leaderboard/qwen2-72b_jina-zh → omnieval-human/jina-zh_qwen2-72b}/results_2023-12-08 15:46:20.425378.json RENAMED
@@ -18,7 +18,7 @@
18
  }
19
  },
20
  "config": {
21
- "eval_name": "qwen2-72b_jina-zh",
22
  "generative_model": "Qwen/Qwen2.5-72B-Instruct",
23
  "generative_model_args": {
24
  "name": "Qwen/Qwen2.5-72B-Instruct",
 
18
  }
19
  },
20
  "config": {
21
+ "eval_name": "jina-zh_qwen2-72b",
22
  "generative_model": "Qwen/Qwen2.5-72B-Instruct",
23
  "generative_model_args": {
24
  "name": "Qwen/Qwen2.5-72B-Instruct",
src/about.py CHANGED
@@ -83,12 +83,12 @@ LLM_BENCHMARKS_TEXT = f"""
83
  With FlashRAG and provided resources, you can effortlessly reproduce existing SOTA works in the RAG domain or implement your custom RAG processes and components. -->
84
 
85
 
86
- ## :wrench: Installation
87
  `conda env create -f environment.yml && conda activate finrag`
88
 
89
- <!-- ## :sparkles: Features
90
  1. -->
91
- ## :rocket: Quick-Start
92
  Notion:
93
  1. The code run path is `./OpenFinBench`
94
  2. We provide our auto-generated evaluation dataset in <a href="https://huggingface.co/datasets/RUC-NLPIR/FlashRAG_datasets/" target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Dataset-27b3b4></a>
@@ -136,11 +136,11 @@ Then conduct the model-based evaluate using the following codes, (change the par
136
  sh evaluator/judgement/judger.sh
137
  ```
138
 
139
- ## :bookmark: License
140
 
141
  OmniEval is licensed under the [<u>MIT License</u>](./LICENSE).
142
 
143
- ## :star2: Citation
144
  The paper is waiting to be released!
145
 
146
  <!-- # Check Infos
 
83
  With FlashRAG and provided resources, you can effortlessly reproduce existing SOTA works in the RAG domain or implement your custom RAG processes and components. -->
84
 
85
 
86
+ ## 🔧 Installation
87
  `conda env create -f environment.yml && conda activate finrag`
88
 
89
+ <!-- ## ✨ Features
90
  1. -->
91
+ ## 🚀 Quick-Start
92
  Notion:
93
  1. The code run path is `./OpenFinBench`
94
  2. We provide our auto-generated evaluation dataset in <a href="https://huggingface.co/datasets/RUC-NLPIR/FlashRAG_datasets/" target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Dataset-27b3b4></a>
 
136
  sh evaluator/judgement/judger.sh
137
  ```
138
 
139
+ ## 🔖 License
140
 
141
  OmniEval is licensed under the [<u>MIT License</u>](./LICENSE).
142
 
143
+ ## 🌟 Citation
144
  The paper is waiting to be released!
145
 
146
  <!-- # Check Infos
src/envs.py CHANGED
@@ -6,7 +6,7 @@ from huggingface_hub import HfApi
6
  # ----------------------------------
7
  TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
8
 
9
- OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
10
  # ----------------------------------
11
 
12
  REPO_ID = f"{OWNER}/leaderboard"
@@ -18,7 +18,8 @@ CACHE_PATH=os.getenv("HF_HOME", ".")
18
 
19
  # Local caches
20
  EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
21
- EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
 
22
  EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
23
  EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
24
 
 
6
  # ----------------------------------
7
  TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
8
 
9
+ OWNER = "RUC-NLPIR" # Change to your org - don't forget to create a results and request dataset, with the correct format!
10
  # ----------------------------------
11
 
12
  REPO_ID = f"{OWNER}/leaderboard"
 
18
 
19
  # Local caches
20
  EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
21
+ HUMAN_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results", "omnieval-human")
22
+ AUTO_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results", "omnieval-auto")
23
  EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
24
  EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
25
 
src/leaderboard/read_evals.py CHANGED
@@ -183,7 +183,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
183
  def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
184
  """From the path of the results folder root, extract all needed info for results"""
185
  model_result_filepaths = []
186
-
187
  for root, _, files in os.walk(results_path):
188
  # We should only have json files in model results
189
  if len(files) == 0 or any([not f.endswith(".json") for f in files]):
 
183
  def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
184
  """From the path of the results folder root, extract all needed info for results"""
185
  model_result_filepaths = []
186
+ print(f"Reading results from {results_path}")
187
  for root, _, files in os.walk(results_path):
188
  # We should only have json files in model results
189
  if len(files) == 0 or any([not f.endswith(".json") for f in files]):