Remove duplicate EvalRequest.model_type #13
by albertvillanova

Files changed:
- Makefile +2 -4
- app.py +19 -21
- custom_tasks.py +1 -0
- main_backend_harness.py +50 -17
- main_backend_lighteval.py +65 -25
- pyproject.toml +6 -9
- requirements.txt +0 -1
- scripts/create_request_file.py +6 -2
- scripts/fix_harness_import.py +3 -1
- src/backend/manage_requests.py +25 -22
- src/backend/run_eval_suite_harness.py +17 -6
- src/backend/run_eval_suite_lighteval.py +36 -26
- src/backend/sort_queue.py +5 -2
- src/display/log_visualizer.py +5 -6
- src/envs.py +8 -8
- src/logging.py +6 -8
Makefile CHANGED
@@ -2,12 +2,10 @@
 
 
 style:
-	python -m black --line-length 119 .
-	python -m isort .
 	ruff check --fix .
+	ruff format .
 
 
 quality:
-	python -m black --check --line-length 119 .
-	python -m isort --check-only .
 	ruff check .
+	ruff format --check .
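The style and quality targets now run ruff end to end (linter plus formatter) instead of black and isort. For contributors who prefer driving the same checks from Python, here is a small optional helper; it is not part of the repository, it only assumes ruff is installed, and the command lines are taken verbatim from the Makefile above.

# Optional convenience wrapper around the new Makefile targets (ruff-only toolchain).
# Assumes `ruff` is on PATH; purely illustrative.
import subprocess


def style() -> None:
    # Mirrors `make style`: autofix lint issues, then apply the formatter.
    subprocess.run(["ruff", "check", "--fix", "."], check=True)
    subprocess.run(["ruff", "format", "."], check=True)


def quality() -> None:
    # Mirrors `make quality`: fail if lint or formatting checks do not pass.
    subprocess.run(["ruff", "check", "."], check=True)
    subprocess.run(["ruff", "format", "--check", "."], check=True)


if __name__ == "__main__":
    quality()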
app.py CHANGED
@@ -1,32 +1,31 @@
 import logging
-from apscheduler.schedulers.background import BackgroundScheduler
-
-from src.logging import configure_root_logger
-
-logging.getLogger("httpx").setLevel(logging.WARNING)
-logging.getLogger("numexpr").setLevel(logging.WARNING)
-logging.getLogger("absl").setLevel(logging.WARNING)
-configure_root_logger()
-
 from functools import partial
 
 import gradio as gr
+from apscheduler.schedulers.background import BackgroundScheduler
+
 # Choose ligtheval or harness backend
+# from main_backend_harness import run_auto_eval
 from main_backend_lighteval import run_auto_eval
-#from main_backend_harness import run_auto_eval
 
-from src.display.log_visualizer import log_file_to_html_string
 from src.display.css_html_js import dark_mode_gradio_js
-from src.
-from src.
+from src.display.log_visualizer import log_file_to_html_string
+from src.envs import QUEUE_REPO, REFRESH_RATE, REPO_ID, RESULTS_REPO
+from src.logging import configure_root_logger, log_file, setup_logger
+
+
+logging.getLogger("httpx").setLevel(logging.WARNING)
+logging.getLogger("numexpr").setLevel(logging.WARNING)
+logging.getLogger("absl").setLevel(logging.WARNING)
+configure_root_logger()
 
 logging.basicConfig(level=logging.INFO)
 logger = setup_logger(__name__)
 
 
-intro_md =
+intro_md = """
 # Intro
-This is a visual for the auto evaluator.
+This is a visual for the auto evaluator.
 """
 
 links_md = f"""
@@ -39,6 +38,7 @@ links_md = f"""
 | Results Repo | [{RESULTS_REPO}](https://huggingface.co/datasets/{RESULTS_REPO}) |
 """
 
+
 def auto_eval():
     logger.info("Triggering Auto Eval")
     run_auto_eval()
@@ -52,20 +52,18 @@ with gr.Blocks(js=dark_mode_gradio_js) as demo:
     output_html = gr.HTML(partial(log_file_to_html_string, reverse=reverse_order_checkbox), every=1)
     with gr.Row():
         download_button = gr.DownloadButton("Download Log File", value=log_file)
-        with gr.Accordion(
+        with gr.Accordion("Log View Configuration", open=False):
            reverse_order_checkbox.render()
    # Add a button that when pressed, triggers run_auto_eval
    button = gr.Button("Manually Run Evaluation")
    gr.Markdown(links_md)
 
-    #dummy = gr.Markdown(auto_eval, every=REFRESH_RATE, visible=False)
+    # dummy = gr.Markdown(auto_eval, every=REFRESH_RATE, visible=False)
 
    button.click(fn=auto_eval, inputs=[], outputs=[])
 
-if __name__ ==
+if __name__ == "__main__":
     scheduler = BackgroundScheduler()
     scheduler.add_job(auto_eval, "interval", seconds=REFRESH_RATE)
     scheduler.start()
-    demo.queue(default_concurrency_limit=40).launch(server_name="0.0.0.0",
-                                                    show_error=True,
-                                                    server_port=7860)
+    demo.queue(default_concurrency_limit=40).launch(server_name="0.0.0.0", show_error=True, server_port=7860)
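For reference, the pattern app.py lands on after this change is a Gradio Blocks UI plus an APScheduler interval job that both call the same auto_eval function. The sketch below is a stripped-down, self-contained illustration of that wiring, not the Space itself: REFRESH_RATE mirrors src/envs.py and the auto_eval body is a placeholder.

# Minimal sketch of the scheduler + manual-trigger pattern used in app.py (assumes Gradio 4.x).
import gradio as gr
from apscheduler.schedulers.background import BackgroundScheduler

REFRESH_RATE = 10 * 60  # seconds, same value as src/envs.py


def auto_eval():
    print("Triggering Auto Eval")  # placeholder for run_auto_eval()


with gr.Blocks() as demo:
    button = gr.Button("Manually Run Evaluation")
    button.click(fn=auto_eval, inputs=[], outputs=[])

if __name__ == "__main__":
    scheduler = BackgroundScheduler()
    scheduler.add_job(auto_eval, "interval", seconds=REFRESH_RATE)
    scheduler.start()
    demo.queue(default_concurrency_limit=40).launch(server_name="0.0.0.0", server_port=7860)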
custom_tasks.py CHANGED
@@ -6,6 +6,7 @@ This file generally create just a TASKS_TABLE and TASKS_GROUPS which are then im
 
 Author:
 """
+
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
 from lighteval.tasks.requests import Doc
 from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES
main_backend_harness.py CHANGED
@@ -3,25 +3,56 @@ import pprint
 
 from huggingface_hub import snapshot_download
 
-
-
+from src.backend.manage_requests import (
+    FAILED_STATUS,
+    FINISHED_STATUS,
+    PENDING_STATUS,
+    RUNNING_STATUS,
+    check_completed_evals,
+    get_eval_requests,
+    set_eval_request,
+)
 from src.backend.run_eval_suite_harness import run_evaluation
-from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request, PENDING_STATUS, RUNNING_STATUS, FINISHED_STATUS, FAILED_STATUS
 from src.backend.sort_queue import sort_models_by_priority
-
-
-
+from src.envs import (
+    API,
+    DEVICE,
+    EVAL_REQUESTS_PATH_BACKEND,
+    EVAL_RESULTS_PATH_BACKEND,
+    LIMIT,
+    NUM_FEWSHOT,
+    QUEUE_REPO,
+    RESULTS_REPO,
+    TASKS_HARNESS,
+    TOKEN,
+)
 from src.logging import setup_logger
 
 
+logging.getLogger("openai").setLevel(logging.WARNING)
 
 # logging.basicConfig(level=logging.ERROR)
 logger = setup_logger(__name__)
 pp = pprint.PrettyPrinter(width=80)
 
 
-snapshot_download(
-
+snapshot_download(
+    repo_id=RESULTS_REPO,
+    revision="main",
+    local_dir=EVAL_RESULTS_PATH_BACKEND,
+    repo_type="dataset",
+    max_workers=60,
+    token=TOKEN,
+)
+snapshot_download(
+    repo_id=QUEUE_REPO,
+    revision="main",
+    local_dir=EVAL_REQUESTS_PATH_BACKEND,
+    repo_type="dataset",
+    max_workers=60,
+    token=TOKEN,
+)
+
 
 def run_auto_eval():
     current_pending_status = [PENDING_STATUS]
@@ -36,11 +67,13 @@ def run_auto_eval():
         hf_repo=QUEUE_REPO,
         local_dir=EVAL_REQUESTS_PATH_BACKEND,
         hf_repo_results=RESULTS_REPO,
-        local_dir_results=EVAL_RESULTS_PATH_BACKEND
+        local_dir_results=EVAL_RESULTS_PATH_BACKEND,
     )
 
     # Get all eval request that are PENDING, if you want to run other evals, change this parameter
-    eval_requests = get_eval_requests(
+    eval_requests = get_eval_requests(
+        job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND
+    )
     # Sort the evals by priority (first submitted first run)
     eval_requests = sort_models_by_priority(api=API, models=eval_requests)
 
@@ -61,16 +94,16 @@ def run_auto_eval():
     )
 
     run_evaluation(
-        eval_request=eval_request,
-        task_names=TASKS_HARNESS,
-        num_fewshot=NUM_FEWSHOT,
+        eval_request=eval_request,
+        task_names=TASKS_HARNESS,
+        num_fewshot=NUM_FEWSHOT,
         local_dir=EVAL_RESULTS_PATH_BACKEND,
         results_repo=RESULTS_REPO,
         batch_size="auto",
-        device=DEVICE,
-        limit=LIMIT
-
+        device=DEVICE,
+        limit=LIMIT,
+    )
 
 
 if __name__ == "__main__":
-    run_auto_eval()
+    run_auto_eval()
main_backend_lighteval.py CHANGED
@@ -3,22 +3,57 @@ import pprint
 
 from huggingface_hub import snapshot_download
 
-
-
+from src.backend.manage_requests import (
+    FAILED_STATUS,
+    FINISHED_STATUS,
+    PENDING_STATUS,
+    RUNNING_STATUS,
+    check_completed_evals,
+    get_eval_requests,
+    set_eval_request,
+)
 from src.backend.run_eval_suite_lighteval import run_evaluation
-from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request, PENDING_STATUS, RUNNING_STATUS, FINISHED_STATUS, FAILED_STATUS
 from src.backend.sort_queue import sort_models_by_priority
-
-
+from src.envs import (
+    ACCELERATOR,
+    API,
+    EVAL_REQUESTS_PATH_BACKEND,
+    EVAL_RESULTS_PATH_BACKEND,
+    LIMIT,
+    QUEUE_REPO,
+    REGION,
+    RESULTS_REPO,
+    TASKS_LIGHTEVAL,
+    TOKEN,
+    VENDOR,
+)
 from src.logging import setup_logger
 
+
+logging.getLogger("openai").setLevel(logging.WARNING)
+
 logger = setup_logger(__name__)
 
 # logging.basicConfig(level=logging.ERROR)
 pp = pprint.PrettyPrinter(width=80)
 
-snapshot_download(
-
+snapshot_download(
+    repo_id=RESULTS_REPO,
+    revision="main",
+    local_dir=EVAL_RESULTS_PATH_BACKEND,
+    repo_type="dataset",
+    max_workers=60,
+    token=TOKEN,
+)
+snapshot_download(
+    repo_id=QUEUE_REPO,
+    revision="main",
+    local_dir=EVAL_REQUESTS_PATH_BACKEND,
+    repo_type="dataset",
+    max_workers=60,
+    token=TOKEN,
+)
+
 
 def run_auto_eval():
     current_pending_status = [PENDING_STATUS]
@@ -33,11 +68,13 @@ def run_auto_eval():
         hf_repo=QUEUE_REPO,
         local_dir=EVAL_REQUESTS_PATH_BACKEND,
         hf_repo_results=RESULTS_REPO,
-        local_dir_results=EVAL_RESULTS_PATH_BACKEND
+        local_dir_results=EVAL_RESULTS_PATH_BACKEND,
    )
 
    # Get all eval request that are PENDING, if you want to run other evals, change this parameter
-    eval_requests = get_eval_requests(
+    eval_requests = get_eval_requests(
+        job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND
+    )
    # Sort the evals by priority (first submitted first run)
    eval_requests = sort_models_by_priority(api=API, models=eval_requests)
 
@@ -49,7 +86,6 @@ def run_auto_eval():
     eval_request = eval_requests[0]
     logger.info(pp.pformat(eval_request))
 
-
     set_eval_request(
         api=API,
         eval_request=eval_request,
@@ -59,29 +95,33 @@ def run_auto_eval():
     )
 
     # This needs to be done
-    #instance_size, instance_type = get_instance_for_model(eval_request)
+    # instance_size, instance_type = get_instance_for_model(eval_request)
     # For GPU
-    # instance_size, instance_type = "small", "g4dn.xlarge"
+    # instance_size, instance_type = "small", "g4dn.xlarge"
     # For CPU
     # Updated naming available at https://huggingface.co/docs/inference-endpoints/pricing
     instance_size, instance_type = "x4", "intel-icl"
-    logger.info(
+    logger.info(
+        f"Starting Evaluation of {eval_request.json_filepath} on Inference endpoints: {instance_size} {instance_type}"
+    )
 
     run_evaluation(
-        eval_request=eval_request,
-        task_names=TASKS_LIGHTEVAL,
+        eval_request=eval_request,
+        task_names=TASKS_LIGHTEVAL,
         local_dir=EVAL_RESULTS_PATH_BACKEND,
-        batch_size=1,
-        accelerator=ACCELERATOR,
-        region=REGION,
-        vendor=VENDOR,
-        instance_size=instance_size,
-        instance_type=instance_type,
-        limit=LIMIT
-
+        batch_size=1,
+        accelerator=ACCELERATOR,
+        region=REGION,
+        vendor=VENDOR,
+        instance_size=instance_size,
+        instance_type=instance_type,
+        limit=LIMIT,
+    )
 
-    logger.info(
+    logger.info(
+        f"Completed Evaluation of {eval_request.json_filepath} on Inference endpoints: {instance_size} {instance_type}"
+    )
 
 
 if __name__ == "__main__":
-    run_auto_eval()
+    run_auto_eval()
pyproject.toml CHANGED
@@ -1,13 +1,10 @@
 [tool.ruff]
-# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
-select = ["E", "F"]
-ignore = ["E501"] # line too long (black is taking care of this)
 line-length = 119
-fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
 
-[tool.
-
-
+[tool.ruff.lint]
+select = ["C", "E", "F", "I", "W"]
+ignore = ["E501"] # line too long (the formatter is taking care of this)
 
-[tool.
-
+[tool.ruff.lint.isort]
+lines-after-imports = 2
+known-local-folder = ["src"]
requirements.txt CHANGED
@@ -1,5 +1,4 @@
 APScheduler==3.10.1
-black==23.11.0
 click==8.1.3
 huggingface-hub>=0.18.0
 python-dateutil==2.8.2
scripts/create_request_file.py CHANGED
@@ -7,7 +7,9 @@ from datetime import datetime, timezone
 import click
 from colorama import Fore
 from huggingface_hub import HfApi, snapshot_download
-
+
+from src.envs import EVAL_REQUESTS_PATH, QUEUE_REPO, TOKEN
+
 
 precisions = ("float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ", "float32")
 model_types = ("pretrained", "fine-tuned", "RL-tuned", "instruction-tuned")
@@ -34,7 +36,9 @@ def get_model_size(model_info, precision: str):
 def main():
     api = HfApi()
     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
-    snapshot_download(
+    snapshot_download(
+        repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", token=TOKEN
+    )
 
     model_name = click.prompt("Enter model name")
     revision = click.prompt("Enter revision", default="main")
scripts/fix_harness_import.py CHANGED
@@ -2,10 +2,12 @@
 It creates a folder not ported during harness package creation (as they don't use a Manifest file atm and it ignore `.json` files).
 It will need to be updated if we want to use the harness' version of big bench to actually copy the json files.
 """
+
 import os
 
 import lm_eval
 
+
 if __name__ == "__main__":
     lm_eval_path = lm_eval.__path__[0]
-    os.makedirs(os.path.join(lm_eval_path, "datasets", "bigbench_resources"), exist_ok=True)
+    os.makedirs(os.path.join(lm_eval_path, "datasets", "bigbench_resources"), exist_ok=True)
src/backend/manage_requests.py CHANGED
@@ -4,9 +4,11 @@ from dataclasses import dataclass
 from typing import Optional
 
 from huggingface_hub import HfApi, snapshot_download
+
 from src.envs import TOKEN
 from src.logging import setup_logger
 
+
 logger = setup_logger(__name__)
 
 PENDING_STATUS = "PENDING"
@@ -14,27 +16,29 @@ RUNNING_STATUS = "RUNNING"
 FINISHED_STATUS = "FINISHED"
 FAILED_STATUS = "FAILED"
 
+
 @dataclass
 class EvalRequest:
-    """This class represents one evaluation request file.
-
+    """This class represents one evaluation request file."""
+
     model: str
     status: str
     json_filepath: str
     weight_type: str = "Original"
-    model_type: str =
+    model_type: Optional[str] = None  # pretrained, fine-tuned, etc. - define your own categories in
     precision: str = ""  # float16, bfloat16
-    revision: str = "main"
-    submitted_time: Optional[str] =
-
+    revision: str = "main"  # commit hash
+    submitted_time: Optional[str] = (
+        "2022-05-18T11:40:22.519222"  # random date just so that we can still order requests by date
+    )
     likes: Optional[int] = 0
     params: Optional[int] = None
     license: Optional[str] = ""
     base_model: Optional[str] = ""
     private: Optional[bool] = False
-
+
     def get_model_args(self):
-        """Edit this function if you want to manage more complex quantization issues. You'll need to map it to
+        """Edit this function if you want to manage more complex quantization issues. You'll need to map it to
         the evaluation suite you chose.
         """
         model_args = f"pretrained={self.model},revision={self.revision}"
@@ -45,7 +49,7 @@ class EvalRequest:
         # Quantized models need some added config, the install of bits and bytes, etc
         else:
             raise Exception(f"Unknown precision {self.precision}.")
-
+
         return model_args
 
 
@@ -77,7 +81,9 @@ def get_eval_requests(job_status: list, local_dir: str, hf_repo: str) -> list[Ev
     Returns:
         `list[EvalRequest]`: a list of model info dicts.
     """
-    snapshot_download(
+    snapshot_download(
+        repo_id=hf_repo, revision="main", local_dir=local_dir, repo_type="dataset", max_workers=60, token=TOKEN
+    )
     json_files = glob.glob(f"{local_dir}/**/*.json", recursive=True)
 
     eval_requests = []
@@ -102,6 +108,7 @@ def eval_was_running(eval_request: EvalRequest):
     status = data["status"]
     return status == RUNNING_STATUS
 
+
 def check_completed_evals(
     api: HfApi,
     hf_repo: str,
@@ -114,12 +121,12 @@ def check_completed_evals(
 ):
     """Checks if the currently running evals are completed, if yes, update their status on the hub."""
     snapshot_download(
-        repo_id=hf_repo_results,
-        revision="main",
-        local_dir=local_dir_results,
-        repo_type="dataset",
-        max_workers=60,
-        token=TOKEN
+        repo_id=hf_repo_results,
+        revision="main",
+        local_dir=local_dir_results,
+        repo_type="dataset",
+        max_workers=60,
+        token=TOKEN,
     )
 
     running_evals = get_eval_requests(checked_status, hf_repo=hf_repo, local_dir=local_dir)
@@ -134,13 +141,9 @@ def check_completed_evals(
         output_file_exists = len(glob.glob(output_file)) > 0
 
         if output_file_exists:
-            logger.info(
-                f"EXISTS output file exists for {model} setting it to {completed_status}"
-            )
+            logger.info(f"EXISTS output file exists for {model} setting it to {completed_status}")
             set_eval_request(api, eval_request, completed_status, hf_repo, local_dir)
         else:
            if eval_was_running(eval_request=eval_request):
-                logger.info(
-                    f"No result file found for {model} setting it to {failed_status}"
-                )
+                logger.info(f"No result file found for {model} setting it to {failed_status}")
                set_eval_request(api, eval_request, failed_status, hf_repo, local_dir)
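This file is where the PR's title lands: EvalRequest now declares model_type exactly once, as an Optional[str]. Below is a self-contained sketch of the resulting shape, trimmed to the fields visible in the diff; the real class lives in src/backend/manage_requests.py.

# Pared-down illustration of the deduplicated EvalRequest; field list abridged.
from dataclasses import dataclass
from typing import Optional


@dataclass
class EvalRequest:
    model: str
    status: str
    json_filepath: str
    weight_type: str = "Original"
    model_type: Optional[str] = None  # pretrained, fine-tuned, etc.
    precision: str = ""  # float16, bfloat16
    revision: str = "main"  # commit hash

    def get_model_args(self) -> str:
        # Same shape as the repository helper: a harness-style model_args string.
        return f"pretrained={self.model},revision={self.revision}"


if __name__ == "__main__":
    req = EvalRequest(
        model="org/my-model", status="PENDING", json_filepath="req.json", model_type="pretrained"
    )
    print(req.get_model_args())  # pretrained=org/my-model,revision=main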
src/backend/run_eval_suite_harness.py CHANGED
@@ -1,21 +1,32 @@
 import json
-import os
 import logging
+import os
 from datetime import datetime
+from typing import Union
 
-from lm_eval import
+from lm_eval import evaluator, utils
 from lm_eval.tasks import TaskManager
 
-from src.envs import RESULTS_REPO, API
 from src.backend.manage_requests import EvalRequest
+from src.envs import API
 from src.logging import setup_logger
 
-from typing import Union
 
 logging.getLogger("openai").setLevel(logging.WARNING)
 logger = setup_logger(__name__)
 
-
+
+def run_evaluation(
+    eval_request: EvalRequest,
+    task_names: list,
+    num_fewshot: int,
+    batch_size: Union[int, str],
+    device: str,
+    local_dir: str,
+    results_repo: str,
+    no_cache: bool = True,
+    limit: int = None,
+):
     """Runs one evaluation for the current evaluation request file, then pushes the results to the hub.
 
     Args:
@@ -51,7 +62,7 @@ def run_evaluation(eval_request: EvalRequest, task_names: list, num_fewshot: int
         batch_size=batch_size,
         device=device,
         limit=limit,
-        write_out=True
+        write_out=True,  # Whether to write out an example document and model input, for checking task integrity
     )
 
     results["config"]["model_dtype"] = eval_request.precision
src/backend/run_eval_suite_lighteval.py CHANGED
@@ -1,23 +1,31 @@
 import json
-import argparse
 import logging
-from datetime import datetime
 
-import lighteval
 from lighteval.logging.evaluation_tracker import EvaluationTracker
 from lighteval.models.model_config import InferenceEndpointModelConfig
 from lighteval.pipeline import ParallelismManager, Pipeline, PipelineParameters
 
-from lighteval.main_accelerate import main, EnvConfig, create_model_config
-
-from src.envs import RESULTS_REPO, CACHE_PATH, TOKEN
 from src.backend.manage_requests import EvalRequest
+from src.envs import RESULTS_REPO
 from src.logging import setup_logger
 
+
 logging.getLogger("openai").setLevel(logging.WARNING)
 logger = setup_logger(__name__)
 
-
+
+def run_evaluation(
+    eval_request: EvalRequest,
+    task_names: str,
+    batch_size: int,
+    local_dir: str,
+    accelerator: str,
+    region: str,
+    vendor: str,
+    instance_size: str,
+    instance_type: str,
+    limit=None,
+):
     """Runs one evaluation for the current evaluation request file using lighteval, then pushes the results to the hub.
 
     Args:
@@ -32,18 +40,20 @@ def run_evaluation(eval_request: EvalRequest, task_names: str, batch_size: int,
         local_dir (str): Where to save the results locally
         no_cache (bool, optional): Whether to use a cache or not.
         limit (int, optional): Whether to use a number of samples only for the evaluation - only for debugging
-    """
+    """
 
     if limit:
-        logger.info(
+        logger.info(
+            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
+        )
 
     evaluation_tracker = EvaluationTracker(
         output_dir="./results",
-        save_details
-        push_to_hub
-        push_to_tensorboard
-        hub_results_org=
-        public
+        save_details=True,
+        push_to_hub=True,
+        push_to_tensorboard=False,
+        hub_results_org=RESULTS_REPO,
+        public=False,
     )
 
     pipeline_params = PipelineParameters(
@@ -52,21 +62,21 @@ def run_evaluation(eval_request: EvalRequest, task_names: str, batch_size: int,
         max_samples=limit,
         use_chat_template=False,
         system_prompt=None,
-        custom_tasks_directory="custom_tasks.py",
+        custom_tasks_directory="custom_tasks.py",  # if using a custom task
     )
 
     model_config = InferenceEndpointModelConfig(
         # Endpoint parameters
-        name
-        repository
-        accelerator
-        vendor=
-        region=
-        instance_size=
-        instance_type=
-        should_reuse_existing=
-        model_dtype=
-        revision=
+        name=eval_request.model.replace(".", "-").lower(),
+        repository=eval_request.model,
+        accelerator=accelerator,
+        vendor=vendor,
+        region=region,
+        instance_size=instance_size,
+        instance_type=instance_type,
+        should_reuse_existing=False,
+        model_dtype=eval_request.precision,
+        revision=eval_request.revision,
     )
 
     pipeline = Pipeline(
@@ -85,7 +95,7 @@ def run_evaluation(eval_request: EvalRequest, task_names: str, batch_size: int,
         dumped = json.dumps(results, indent=2)
         logger.info(dumped)
 
-    except Exception
+    except Exception:  # if eval failed, we force a cleanup
         pipeline.model.cleanup()
 
     return results
src/backend/sort_queue.py CHANGED
@@ -1,4 +1,3 @@
-import re
 from dataclasses import dataclass
 
 from huggingface_hub import HfApi
@@ -11,6 +10,7 @@ class ModelMetadata:
     likes: int = 0
     size: int = 15
 
+
 # All the functions below sort the models in the queue based on different parameters
 def sort_models_by_priority(api: HfApi, models: list[EvalRequest]) -> list[EvalRequest]:
     private_models = [model for model in models if model.private]
@@ -18,11 +18,14 @@ def sort_models_by_priority(api: HfApi, models: list[EvalRequest]) -> list[EvalR
 
     return sort_by_submit_date(private_models) + sort_by_submit_date(public_models)
 
+
 def sort_by_submit_date(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
     return sorted(eval_requests, key=lambda x: x.submitted_time, reverse=False)
 
+
 def sort_by_size(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
     return sorted(eval_requests, key=lambda x: x.params, reverse=False)
 
+
 def sort_by_likes(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
-    return sorted(eval_requests, key=lambda x: x.likes, reverse=False)
+    return sorted(eval_requests, key=lambda x: x.likes, reverse=False)
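The queue ordering itself is unchanged by this PR (only blank lines were added), but for readers skimming the diff: private requests are served first, and each group is ordered by submission date, oldest first. A toy illustration with a stand-in dataclass (not the repository's EvalRequest, and without the HfApi argument the real function takes):

from dataclasses import dataclass


@dataclass
class EvalRequest:  # stand-in with just the fields the sort needs
    model: str
    submitted_time: str
    private: bool = False


def _by_date(requests: list[EvalRequest]) -> list[EvalRequest]:
    # ISO-8601 timestamps sort chronologically when sorted lexicographically.
    return sorted(requests, key=lambda x: x.submitted_time)


def sort_models_by_priority(models: list[EvalRequest]) -> list[EvalRequest]:
    private_models = [m for m in models if m.private]
    public_models = [m for m in models if not m.private]
    return _by_date(private_models) + _by_date(public_models)


queue = [
    EvalRequest("org/a", "2024-02-01T00:00:00Z"),
    EvalRequest("org/b", "2024-01-01T00:00:00Z", private=True),
    EvalRequest("org/c", "2024-01-15T00:00:00Z"),
]
print([r.model for r in sort_models_by_priority(queue)])  # ['org/b', 'org/c', 'org/a']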
src/display/log_visualizer.py CHANGED
@@ -1,5 +1,4 @@
 from io import StringIO
-from pathlib import Path
 
 from bs4 import BeautifulSoup
 from rich.console import Console
@@ -12,8 +11,8 @@ from src.logging import log_file
 
 def log_file_to_html_string(reverse=True):
     with open(log_file, "rt") as f:
-
-
+        lines = f.readlines()
+        lines = lines[-NUM_LINES_VISUALIZE:]
 
     if reverse:
         lines = reversed(lines)
@@ -26,12 +25,12 @@ def log_file_to_html_string(reverse=True):
     html_content = console.export_html(inline_styles=True)
 
     # Parse the HTML content using BeautifulSoup
-    soup = BeautifulSoup(html_content,
+    soup = BeautifulSoup(html_content, "lxml")
 
     # Modify the <pre> tag and add custom styles
     pre_tag = soup.pre
-    pre_tag[
-    del pre_tag[
+    pre_tag["class"] = "scrollable"
+    del pre_tag["style"]
 
     # Add your custom styles and the .scrollable CSS to the <style> tag
     style_tag = soup.style
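The reconstructed lines above show what the formatter touched here: the log HTML exported by Rich is post-processed with BeautifulSoup so that scrolling is handled by the .scrollable CSS class rather than Rich's inline style. A standalone demo of just that step (using html.parser to avoid the lxml dependency; the repository passes "lxml"):

from bs4 import BeautifulSoup

# Stand-in for console.export_html(inline_styles=True) output.
html_content = '<html><body><pre style="white-space:pre">line 1\nline 2</pre></body></html>'

soup = BeautifulSoup(html_content, "html.parser")
pre_tag = soup.pre
pre_tag["class"] = "scrollable"  # let the page CSS control overflow/scrolling
del pre_tag["style"]             # drop the inline style Rich emitted

print(soup.prettify())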
src/envs.py CHANGED
@@ -2,23 +2,24 @@ import os
 
 from huggingface_hub import HfApi
 
+
 # Info to change for your repository
 # ----------------------------------
-TOKEN = os.environ.get("HF_TOKEN")
+TOKEN = os.environ.get("HF_TOKEN")  # A read/write token for your org
 
-OWNER = "demo-leaderboard-backend"
+OWNER = "demo-leaderboard-backend"  # Change to your org - don't forget to create a results and request dataset
 
 # For harness evaluations
-DEVICE = "cpu"
-LIMIT = 20
-NUM_FEWSHOT = 0
+DEVICE = "cpu"  # "cuda:0" if you add compute, for harness evaluations
+LIMIT = 20  # !!!! For testing, should be None for actual evaluations!!!
+NUM_FEWSHOT = 0  # Change with your few shot for the Harness evaluations
 TASKS_HARNESS = ["anli_r1", "logiqa"]
 
 # For lighteval evaluations
 ACCELERATOR = "cpu"
 REGION = "us-east-1"
 VENDOR = "aws"
-TASKS_LIGHTEVAL = "lighteval|anli:r1|0|0,lighteval|logiqa|0|0"
+TASKS_LIGHTEVAL = "lighteval|anli:r1|0|0,lighteval|logiqa|0|0"
 # To add your own tasks, edit the custom file and launch it with `custom|myothertask|0|0``
 
 # ---------------------------------------------------
@@ -27,7 +28,7 @@ QUEUE_REPO = f"{OWNER}/requests"
 RESULTS_REPO = f"{OWNER}/results"
 
 # If you setup a cache later, just change HF_HOME
-CACHE_PATH=os.getenv("HF_HOME", ".")
+CACHE_PATH = os.getenv("HF_HOME", ".")
 
 # Local caches
 EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
@@ -39,4 +40,3 @@ REFRESH_RATE = 10 * 60 # 10 min
 NUM_LINES_VISUALIZE = 300
 
 API = HfApi(token=TOKEN)
-
src/logging.py CHANGED
@@ -1,19 +1,17 @@
-import
+import logging
 from pathlib import Path
 
-proj_dir = Path(__file__).parents[1]
-
-log_file = proj_dir/"output.log"
+proj_dir = Path(__file__).parents[1]
 
+log_file = proj_dir / "output.log"
 
-
 
 
 def setup_logger(name: str):
     logger = logging.getLogger(name)
     logger.setLevel(logging.INFO)
 
-    formatter = logging.Formatter(
+    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
 
     # Create a file handler to write logs to a file
     file_handler = logging.FileHandler(log_file)
@@ -29,10 +27,10 @@ def configure_root_logger():
     logging.basicConfig(level=logging.INFO)
     root_logger = logging.getLogger()
 
-    formatter = logging.Formatter(
+    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
 
     file_handler = logging.FileHandler(log_file)
     file_handler.setLevel(logging.INFO)
     file_handler.setFormatter(formatter)
 
-    root_logger.addHandler(file_handler)
+    root_logger.addHandler(file_handler)
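To close, a compact sketch of the logging setup this file provides after the reformat: every named logger gets a file handler writing to output.log with one shared format. The path here is illustrative; the real module derives log_file from the project directory.

import logging
from pathlib import Path

log_file = Path(".") / "output.log"  # the repo uses proj_dir / "output.log"


def setup_logger(name: str) -> logging.Logger:
    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)

    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    file_handler = logging.FileHandler(log_file)
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)
    return logger


logger = setup_logger(__name__)
logger.info("backend started")  # lands in output.log with the timestamped format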