Maharshi Gor committed on
Commit 54e2d5b · 1 Parent(s): 3283369

Update leaderboard download, refactored hf_datasets_utils

Files changed (6)
  1. app.py +3 -16
  2. run_local.sh +17 -0
  3. src/components/leaderboard.py +15 -11
  4. src/envs.py +1 -1
  5. src/hf_datasets_utils.py +161 -0
  6. src/utils.py +2 -0
app.py CHANGED
@@ -29,8 +29,9 @@ from envs import (
     QUEUE_REPO,
     REPO_ID,
     RESULTS_REPO,
-    SERVER_REFRESH_INTERVAL,
+    SERVER_RESTART_INTERVAL,
 )
+from hf_datasets_utils import download_dataset_snapshot
 from shared.workflows import factory
 from shared.workflows.configs import AVAILABLE_MODELS
 
@@ -39,20 +40,6 @@ def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
 
-def download_dataset_snapshot(repo_id, local_dir):
-    try:
-        logger.info(f"Downloading dataset snapshot from {repo_id} to {local_dir}")
-        snapshot_download(
-            repo_id=repo_id,
-            local_dir=local_dir,
-            repo_type="dataset",
-            tqdm_class=None,
-        )
-    except Exception as e:
-        logger.error(f"Error downloading dataset snapshot from {repo_id} to {local_dir}: {e}. Restarting space.")
-        restart_space()
-
-
 download_dataset_snapshot(QUEUE_REPO, EVAL_REQUESTS_PATH)
 
 
@@ -97,7 +84,7 @@ def presave_pipeline_state(
 
 if __name__ == "__main__":
     scheduler = BackgroundScheduler()
-    scheduler.add_job(restart_space, "interval", seconds=SERVER_REFRESH_INTERVAL)
+    scheduler.add_job(restart_space, "interval", seconds=SERVER_RESTART_INTERVAL)
     scheduler.start()
 
     css = css_pipeline + css_tossup + css_bonus + leaderboard_css
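For reference, the scheduled self-restart now reads in isolation roughly as below (a minimal sketch; the placeholder Space id and the inlined interval are assumptions, the real values come from envs.py):

from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi

api = HfApi()
SERVER_RESTART_INTERVAL = 2 * 24 * 60 * 60  # seconds (2 days)

def restart_space():
    # Restarting the Space makes it re-download fresh queue/results snapshots at boot.
    api.restart_space(repo_id="username/space-name")  # placeholder Space id

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=SERVER_RESTART_INTERVAL)
scheduler.start()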
run_local.sh ADDED
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+# Source the environment variables
+source .env
+
+# Remove directories if they exist
+if [ -d "eval-queue" ]; then
+    echo "Removing eval-queue"
+    rm -r eval-queue
+fi
+
+if [ -d "eval-results" ]; then
+    echo "Removing eval-results"
+    rm -r eval-results
+fi
+
+python app.py
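With a populated .env in the repo root, the Space can then be reproduced locally via `bash run_local.sh`; the script clears any stale eval-queue and eval-results snapshots first, so every local run starts from freshly downloaded data.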
src/components/leaderboard.py CHANGED
@@ -7,11 +7,15 @@ from gradio_leaderboard import Leaderboard
 from loguru import logger
 
 import populate
-from envs import EVAL_RESULTS_PATH, LEADERBOARD_REFRESH_INTERVAL
+from envs import EVAL_RESULTS_PATH, LEADERBOARD_REFRESH_INTERVAL, RESULTS_REPO
+from hf_datasets_utils import download_dataset_snapshot
+
+
+def download_leaderboard_data():
+    download_dataset_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH)
 
 
 def fetch_tossup_leaderboard(style: bool = True):
-    # download_dataset_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH)
     df = populate.get_tossups_leaderboard_df(EVAL_RESULTS_PATH, "tiny_eval")
 
     def colour_pos_neg(v):
@@ -35,7 +39,6 @@ def fetch_tossup_leaderboard(style: bool = True):
 
 
 def fetch_bonus_leaderboard(style: bool = True):
-    # download_dataset_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH)
     df = populate.get_bonuses_leaderboard_df(EVAL_RESULTS_PATH, "tiny_eval")
 
     # Apply formatting and styling
@@ -49,6 +52,13 @@ def fetch_bonus_leaderboard(style: bool = True):
     return styled_df if style else df
 
 
+def refresh_leaderboard(style: bool = True):
+    download_leaderboard_data()
+    tossup_df = fetch_tossup_leaderboard(style)
+    bonus_df = fetch_bonus_leaderboard(style)
+    return tossup_df, bonus_df
+
+
 def create_leaderboard_interface(app):
     leaderboard_timer = gr.Timer(LEADERBOARD_REFRESH_INTERVAL)
     refresh_btn = gr.Button("🔄 Refresh")
@@ -77,13 +87,7 @@ def create_leaderboard_interface(app):
 
     gr.on(
         triggers=[leaderboard_timer.tick, refresh_btn.click, app.load],
-        fn=fetch_tossup_leaderboard,
-        inputs=[],
-        outputs=tossup_leaderboard,
-    )
-    gr.on(
-        triggers=[leaderboard_timer.tick, refresh_btn.click, app.load],
-        fn=fetch_bonus_leaderboard,
+        fn=refresh_leaderboard,
         inputs=[],
-        outputs=bonus_leaderboard,
+        outputs=[tossup_leaderboard, bonus_leaderboard],
     )
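Routing all three triggers through the single refresh_leaderboard handler means the results snapshot is downloaded once per timer tick, button click, or page load, and both leaderboards are rebuilt from the same data, rather than each leaderboard fetching independently as before.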
src/envs.py CHANGED
@@ -45,7 +45,7 @@ EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
 
 
 LLM_CACHE_REFRESH_INTERVAL = 600  # seconds (10 minutes)
-SERVER_REFRESH_INTERVAL = 86400  # seconds (one day)
+SERVER_RESTART_INTERVAL = 2 * 24 * 60 * 60  # seconds (2 days)
 LEADERBOARD_REFRESH_INTERVAL = 600  # seconds (10 minutes)
 
 API = HfApi(token=TOKEN)
src/hf_datasets_utils.py ADDED
@@ -0,0 +1,161 @@
+from huggingface_hub import HfApi, snapshot_download
+from loguru import logger
+
+api = HfApi()
+
+
+def download_dataset_snapshot(repo_id, local_dir):
+    try:
+        logger.info(f"Downloading dataset snapshot from {repo_id} to {local_dir}")
+        snapshot_download(
+            repo_id=repo_id,
+            local_dir=local_dir,
+            repo_type="dataset",
+            tqdm_class=None,
+        )
+    except Exception as e:
+        logger.error(f"Error downloading dataset snapshot from {repo_id} to {local_dir}: {e}. Restarting space.")
+        api.restart_space(repo_id=repo_id)
+
+
+def remove_files_from_dataset_repo(repo_id: str, path_patterns: list[str], commit_message: str = "Remove files"):
+    """
+    Remove files or directories matching specified patterns from a Hugging Face dataset repository.
+
+    Args:
+        repo_id: The ID of the dataset repository (e.g., "username/dataset-name")
+        path_patterns: List of file or directory path patterns to remove
+        commit_message: Message for the commit that removes the files
+    """
+    import fnmatch
+    import os
+
+    # Get all files in the repository
+    repo_files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
+
+    # Find files matching the patterns
+    files_to_remove = []
+    for pattern in path_patterns:
+        matching_files = fnmatch.filter(repo_files, pattern)
+        files_to_remove.extend(matching_files)
+
+    # Delete each matching file
+    for path in files_to_remove:
+        try:
+            api.delete_file(
+                path_in_repo=path, repo_id=repo_id, repo_type="dataset", commit_message=f"{commit_message}: {path}"
+            )
+            print(f"Successfully removed {path} from {repo_id}")
+        except Exception as e:
+            print(f"Error removing {path}: {e}")
+
+
+def update_dataset_info_readme(
+    repo_id: str,
+    dataset_info: dict,
+    license_id: str = None,
+    commit_message: str = "Update dataset_info in README.md",
+):
+    """
+    Update the dataset_info section in the README.md file of a Hugging Face dataset repository.
+
+    Args:
+        repo_id: The ID of the dataset repository (e.g., "username/dataset-name")
+        dataset_info: Dictionary containing dataset information to include in the README
+        license_id: Optional license identifier (e.g., "mit", "cc-by-4.0")
+        commit_message: Message for the commit
+
+    Example dataset_info structure:
+    {
+        "features": [
+            {"name": "text", "dtype": "string"},
+            {"name": "label", "dtype": "int64"}
+        ],
+        "splits": [
+            {"name": "train", "num_examples": 10000, "num_bytes": 1000000},
+            {"name": "test", "num_examples": 1000, "num_bytes": 100000}
+        ],
+        "download_size": 1200000,
+        "dataset_size": 1100000,
+        "configs": [
+            {
+                "config_name": "default",
+                "data_files": [
+                    {"split": "train", "path": "data/train.csv"},
+                    {"split": "test", "path": "data/test.csv"}
+                ]
+            }
+        ]
+    }
+    """
+    import re
+
+    import yaml
+    from huggingface_hub import HfApi
+
+    api = HfApi()
+
+    # Check if README.md exists
+    try:
+        readme_content = api.hf_hub_download(repo_id=repo_id, repo_type="dataset", filename="README.md", token=None)
+        with open(readme_content, "r", encoding="utf-8") as f:
+            content = f.read()
+    except Exception:
+        # Create a new README.md if it doesn't exist
+        content = ""
+
+    # Parse existing YAML front matter if it exists
+    yaml_block = None
+    yaml_match = re.search(r"---\s*\n(.*?)\n\s*---", content, re.DOTALL)
+
+    if yaml_match:
+        yaml_text = yaml_match.group(1)
+        try:
+            yaml_block = yaml.safe_load(yaml_text)
+        except Exception as e:
+            print(f"Error parsing existing YAML front matter: {e}")
+            yaml_block = {}
+    else:
+        yaml_block = {}
+
+    # Update or add dataset_info and license
+    if dataset_info:
+        yaml_block["dataset_info"] = dataset_info
+
+    if license_id:
+        yaml_block["license"] = license_id
+
+    # Generate new YAML front matter
+    new_yaml = yaml.dump(yaml_block, sort_keys=False, default_flow_style=False)
+    new_yaml_block = f"---\n{new_yaml}---\n"
+
+    # Replace existing YAML front matter or add it at the beginning
+    if yaml_match:
+        new_content = content[: yaml_match.start()] + new_yaml_block + content[yaml_match.end() :]
+    else:
+        new_content = new_yaml_block + content
+
+    # Create a temporary file with the new content
+    import tempfile
+
+    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".md") as temp_file:
+        temp_file.write(new_content)
+        temp_path = temp_file.name
+
+    # Upload the updated README.md
+    try:
+        api.upload_file(
+            path_or_fileobj=temp_path,
+            path_in_repo="README.md",
+            repo_id=repo_id,
+            repo_type="dataset",
+            commit_message=commit_message,
+        )
+        print(f"Successfully updated README.md in {repo_id}")
+    except Exception as e:
+        print(f"Error updating README.md: {e}")
+
+    # Clean up temporary file
+    import os
+
+    os.unlink(temp_path)
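For context, a minimal usage sketch of the new module (the repo ids, file patterns, and dataset_info payload below are illustrative placeholders, not values from this commit):

from hf_datasets_utils import (
    download_dataset_snapshot,
    remove_files_from_dataset_repo,
    update_dataset_info_readme,
)

# Mirror a results dataset into a local working directory.
download_dataset_snapshot("username/eval-results", "eval-results")

# Delete repo files matching fnmatch-style patterns, one commit per file.
remove_files_from_dataset_repo("username/eval-results", ["results/old_run_*.json"])

# Rewrite the dataset_info/license keys in the dataset card's YAML front matter.
update_dataset_info_readme(
    "username/eval-results",
    dataset_info={
        "features": [{"name": "text", "dtype": "string"}],
        "splits": [{"name": "train", "num_examples": 10000, "num_bytes": 1000000}],
    },
    license_id="mit",
)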
src/utils.py CHANGED
@@ -1,5 +1,7 @@
 # Description: Utility functions for the model_step component.
 
+from loguru import logger
+
 from app_configs import AVAILABLE_MODELS, UNSELECTED_MODEL_NAME
 
 