Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Clémentine
committed on
Commit
·
0c0a603
1
Parent(s):
412f8e5
change to lighteval's latest
Browse files
- README.md +1 -1
- requirements.txt +1 -1
- src/backend/run_eval_suite_lighteval.py +50 -47
README.md
CHANGED
@@ -4,7 +4,7 @@ emoji: 🥇
|
|
4 |
colorFrom: green
|
5 |
colorTo: indigo
|
6 |
sdk: gradio
|
7 |
-
sdk_version:
|
8 |
app_file: app.py
|
9 |
pinned: true
|
10 |
license: apache-2.0
|
|
|
4 |
colorFrom: green
|
5 |
colorTo: indigo
|
6 |
sdk: gradio
|
7 |
+
sdk_version: 5.1.0
|
8 |
app_file: app.py
|
9 |
pinned: true
|
10 |
license: apache-2.0
|
requirements.txt
CHANGED
@@ -9,7 +9,7 @@ accelerate>=0.26.0
|
|
9 |
sentencepiece
|
10 |
|
11 |
# Evaluation suites
|
12 |
-
lighteval
|
13 |
lm_eval==0.4.3
|
14 |
|
15 |
# Log Visualizer
|
|
|
9 |
sentencepiece
|
10 |
|
11 |
# Evaluation suites
|
12 |
+
lighteval>=0.5.0
|
13 |
lm_eval==0.4.3
|
14 |
|
15 |
# Log Visualizer
|
src/backend/run_eval_suite_lighteval.py
CHANGED
@@ -3,7 +3,12 @@ import argparse
|
|
3 |
import logging
|
4 |
from datetime import datetime
|
5 |
|
6 |
-
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
from src.envs import RESULTS_REPO, CACHE_PATH, TOKEN
|
9 |
from src.backend.manage_requests import EvalRequest
|
@@ -32,57 +37,55 @@ def run_evaluation(eval_request: EvalRequest, task_names: str, batch_size: int,
|
|
32 |
if limit:
|
33 |
logger.info("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
|
34 |
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
"reuse_existing": False,
|
44 |
-
"model_dtype": eval_request.precision,
|
45 |
-
"revision": eval_request.revision,
|
46 |
-
# Save parameters
|
47 |
-
"push_results_to_hub": True,
|
48 |
-
"save_details": True,
|
49 |
-
"push_details_to_hub": True,
|
50 |
-
"public_run": False,
|
51 |
-
"cache_dir": CACHE_PATH,
|
52 |
-
"results_org": RESULTS_REPO,
|
53 |
-
"output_dir": local_dir,
|
54 |
-
"job_id": str(datetime.now()),
|
55 |
-
# Experiment parameters
|
56 |
-
"override_batch_size": batch_size,
|
57 |
-
"custom_tasks": "custom_tasks.py",
|
58 |
-
"tasks": task_names,
|
59 |
-
"max_samples": limit,
|
60 |
-
"use_chat_template": False,
|
61 |
-
"system_prompt": None,
|
62 |
-
# Parameters which would be set to things by the kwargs if actually using argparse
|
63 |
-
"inference_server_address": None,
|
64 |
-
"model_args": None,
|
65 |
-
"num_fewshot_seeds": None,
|
66 |
-
"delta_weights": False,
|
67 |
-
"adapter_weights": False
|
68 |
-
}
|
69 |
-
args = argparse.Namespace(**args_dict)
|
70 |
|
71 |
-
|
72 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
|
74 |
-
|
75 |
-
|
76 |
-
|
|
|
|
|
77 |
|
78 |
dumped = json.dumps(results, indent=2)
|
79 |
logger.info(dumped)
|
80 |
-
except Exception as e: # if eval failed, we force a cleanup
|
81 |
-
env_config = EnvConfig(token=TOKEN, cache_dir=args.cache_dir)
|
82 |
-
|
83 |
-
model_config = create_model_config(args=args, accelerator=accelerator)
|
84 |
-
model, _ = load_model(config=model_config, env_config=env_config)
|
85 |
-
model.cleanup()
|
86 |
|
|
|
|
|
87 |
|
88 |
return results
|
|
|
3 |
import logging
|
4 |
from datetime import datetime
|
5 |
|
6 |
+
import lighteval
|
7 |
+
from lighteval.logging.evaluation_tracker import EvaluationTracker
|
8 |
+
from lighteval.models.model_config import InferenceEndpointModelConfig
|
9 |
+
from lighteval.pipeline import ParallelismManager, Pipeline, PipelineParameters
|
10 |
+
|
11 |
+
from lighteval.main_accelerate import main, EnvConfig, create_model_config
|
12 |
|
13 |
from src.envs import RESULTS_REPO, CACHE_PATH, TOKEN
|
14 |
from src.backend.manage_requests import EvalRequest
|
|
|
37 |
if limit:
|
38 |
logger.info("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
|
39 |
|
40 |
+
evaluation_tracker = EvaluationTracker(
|
41 |
+
output_dir="./results",
|
42 |
+
save_details = True,
|
43 |
+
push_to_hub = True,
|
44 |
+
push_to_tensorboard = False,
|
45 |
+
hub_results_org= RESULTS_REPO,
|
46 |
+
public = False,
|
47 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
|
49 |
+
pipeline_params = PipelineParameters(
|
50 |
+
launcher_type=ParallelismManager.ACCELERATE,
|
51 |
+
override_batch_size=batch_size,
|
52 |
+
max_samples=limit,
|
53 |
+
use_chat_template=False,
|
54 |
+
system_prompt=None,
|
55 |
+
custom_tasks_directory="custom_tasks.py", # if using a custom task
|
56 |
+
)
|
57 |
+
|
58 |
+
model_config = InferenceEndpointModelConfig(
|
59 |
+
# Endpoint parameters
|
60 |
+
name = eval_request.model.replace(".", "-").lower(),
|
61 |
+
repository = eval_request.model,
|
62 |
+
accelerator = accelerator,
|
63 |
+
vendor= vendor,
|
64 |
+
region= region,
|
65 |
+
instance_size= instance_size,
|
66 |
+
instance_type= instance_type,
|
67 |
+
should_reuse_existing= False,
|
68 |
+
model_dtype= eval_request.precision,
|
69 |
+
revision= eval_request.revision,
|
70 |
+
)
|
71 |
+
|
72 |
+
pipeline = Pipeline(
|
73 |
+
tasks=task_names,
|
74 |
+
pipeline_parameters=pipeline_params,
|
75 |
+
evaluation_tracker=evaluation_tracker,
|
76 |
+
model_config=model_config,
|
77 |
+
)
|
78 |
|
79 |
+
try:
|
80 |
+
pipeline.evaluate()
|
81 |
+
pipeline.show_results()
|
82 |
+
pipeline.save_and_push_results()
|
83 |
+
results = pipeline.get_results()
|
84 |
|
85 |
dumped = json.dumps(results, indent=2)
|
86 |
logger.info(dumped)
|
|
|
|
|
|
|
|
|
|
|
|
|
87 |
|
88 |
+
except Exception as e: # if eval failed, we force a cleanup
|
89 |
+
pipeline.model.cleanup()
|
90 |
|
91 |
return results
|