Spaces:
Build error
Build error
Yacine Jernite
committed on
Commit
·
0069e8c
1
Parent(s):
e2f899f
in deployment mode, check whether the cache_dir exists
Browse files- app.py +69 -57
- data_measurements/dataset_statistics.py +15 -3
app.py
CHANGED
|
@@ -14,7 +14,7 @@
|
|
| 14 |
|
| 15 |
import logging
|
| 16 |
from os import mkdir
|
| 17 |
-
from os.path import isdir
|
| 18 |
from pathlib import Path
|
| 19 |
|
| 20 |
import streamlit as st
|
|
@@ -143,63 +143,63 @@ def load_or_prepare_widgets(ds_args, show_embeddings, use_cache=False):
|
|
| 143 |
|
| 144 |
"""
|
| 145 |
|
| 146 |
-
if not isdir(CACHE_DIR):
|
| 147 |
-
logs.warning("Creating cache")
|
| 148 |
-
# We need to preprocess everything.
|
| 149 |
-
# This should eventually all go into a prepare_dataset CLI
|
| 150 |
-
mkdir(CACHE_DIR)
|
| 151 |
if use_cache:
|
| 152 |
logs.warning("Using cache")
|
| 153 |
-
|
|
|
|
| 154 |
dstats = dataset_statistics.DatasetStatisticsCacheClass(CACHE_DIR, **ds_args, use_cache=use_cache)
|
| 155 |
# Don't recalculate; we're live
|
| 156 |
dstats.set_deployment(True)
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
logs.warning("Missing a cache for load or prepare dataset")
|
| 164 |
-
try:
|
| 165 |
-
# Header widget
|
| 166 |
-
dstats.load_or_prepare_dset_peek()
|
| 167 |
-
except:
|
| 168 |
-
logs.warning("Missing a cache for dset peek")
|
| 169 |
-
try:
|
| 170 |
-
# General stats widget
|
| 171 |
-
dstats.load_or_prepare_general_stats()
|
| 172 |
-
except:
|
| 173 |
-
logs.warning("Missing a cache for general stats")
|
| 174 |
-
try:
|
| 175 |
-
# Labels widget
|
| 176 |
-
dstats.load_or_prepare_labels()
|
| 177 |
-
except:
|
| 178 |
-
logs.warning("Missing a cache for prepare labels")
|
| 179 |
-
try:
|
| 180 |
-
# Text lengths widget
|
| 181 |
-
dstats.load_or_prepare_text_lengths()
|
| 182 |
-
except:
|
| 183 |
-
logs.warning("Missing a cache for text lengths")
|
| 184 |
-
if show_embeddings:
|
| 185 |
try:
|
| 186 |
-
#
|
| 187 |
-
dstats.
|
| 188 |
except:
|
| 189 |
-
logs.warning("Missing a cache for
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
|
| 204 |
def show_column(dstats, ds_name_to_dict, show_embeddings, column_id):
|
| 205 |
"""
|
|
@@ -257,21 +257,33 @@ def main():
|
|
| 257 |
dataset_args_left = st_utils.sidebar_selection(ds_name_to_dict, " A")
|
| 258 |
dataset_args_right = st_utils.sidebar_selection(ds_name_to_dict, " B")
|
| 259 |
left_col, _, right_col = st.columns([10, 1, 10])
|
| 260 |
-
dstats_left = load_or_prepare_widgets(
|
| 261 |
dataset_args_left, show_embeddings, use_cache=use_cache
|
| 262 |
)
|
| 263 |
with left_col:
|
| 264 |
-
|
| 265 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 266 |
dataset_args_right, show_embeddings, use_cache=use_cache
|
| 267 |
)
|
| 268 |
with right_col:
|
| 269 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 270 |
else:
|
| 271 |
logs.warning("Using Single Dataset Mode")
|
| 272 |
dataset_args = st_utils.sidebar_selection(ds_name_to_dict, "")
|
| 273 |
-
dstats = load_or_prepare_widgets(dataset_args, show_embeddings, use_cache=use_cache)
|
| 274 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 275 |
|
| 276 |
|
| 277 |
if __name__ == "__main__":
|
|
|
|
| 14 |
|
| 15 |
import logging
|
| 16 |
from os import mkdir
|
| 17 |
+
from os.path import exists, isdir
|
| 18 |
from pathlib import Path
|
| 19 |
|
| 20 |
import streamlit as st
|
|
|
|
| 143 |
|
| 144 |
"""
|
| 145 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
if use_cache:
|
| 147 |
logs.warning("Using cache")
|
| 148 |
+
if True:
|
| 149 |
+
#try:
|
| 150 |
dstats = dataset_statistics.DatasetStatisticsCacheClass(CACHE_DIR, **ds_args, use_cache=use_cache)
|
| 151 |
# Don't recalculate; we're live
|
| 152 |
dstats.set_deployment(True)
|
| 153 |
+
# checks whether the cache_dir exists in deployment mode
|
| 154 |
+
# creates cache_dir if not and if in development mode
|
| 155 |
+
cache_dir_exists = dstats.check_cache_dir()
|
| 156 |
+
#except:
|
| 157 |
+
# logs.warning("We're screwed")
|
| 158 |
+
if cache_dir_exists:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
try:
|
| 160 |
+
# We need to have the text_dset loaded for further load_or_prepare
|
| 161 |
+
dstats.load_or_prepare_dataset()
|
| 162 |
except:
|
| 163 |
+
logs.warning("Missing a cache for load or prepare dataset")
|
| 164 |
+
try:
|
| 165 |
+
# Header widget
|
| 166 |
+
dstats.load_or_prepare_dset_peek()
|
| 167 |
+
except:
|
| 168 |
+
logs.warning("Missing a cache for dset peek")
|
| 169 |
+
try:
|
| 170 |
+
# General stats widget
|
| 171 |
+
dstats.load_or_prepare_general_stats()
|
| 172 |
+
except:
|
| 173 |
+
logs.warning("Missing a cache for general stats")
|
| 174 |
+
try:
|
| 175 |
+
# Labels widget
|
| 176 |
+
dstats.load_or_prepare_labels()
|
| 177 |
+
except:
|
| 178 |
+
logs.warning("Missing a cache for prepare labels")
|
| 179 |
+
try:
|
| 180 |
+
# Text lengths widget
|
| 181 |
+
dstats.load_or_prepare_text_lengths()
|
| 182 |
+
except:
|
| 183 |
+
logs.warning("Missing a cache for text lengths")
|
| 184 |
+
if show_embeddings:
|
| 185 |
+
try:
|
| 186 |
+
# Embeddings widget
|
| 187 |
+
dstats.load_or_prepare_embeddings()
|
| 188 |
+
except:
|
| 189 |
+
logs.warning("Missing a cache for embeddings")
|
| 190 |
+
try:
|
| 191 |
+
dstats.load_or_prepare_text_duplicates()
|
| 192 |
+
except:
|
| 193 |
+
logs.warning("Missing a cache for text duplicates")
|
| 194 |
+
try:
|
| 195 |
+
dstats.load_or_prepare_npmi()
|
| 196 |
+
except:
|
| 197 |
+
logs.warning("Missing a cache for npmi")
|
| 198 |
+
try:
|
| 199 |
+
dstats.load_or_prepare_zipf()
|
| 200 |
+
except:
|
| 201 |
+
logs.warning("Missing a cache for zipf")
|
| 202 |
+
return dstats, cache_dir_exists
|
| 203 |
|
| 204 |
def show_column(dstats, ds_name_to_dict, show_embeddings, column_id):
|
| 205 |
"""
|
|
|
|
| 257 |
dataset_args_left = st_utils.sidebar_selection(ds_name_to_dict, " A")
|
| 258 |
dataset_args_right = st_utils.sidebar_selection(ds_name_to_dict, " B")
|
| 259 |
left_col, _, right_col = st.columns([10, 1, 10])
|
| 260 |
+
dstats_left, cache_exists_left = load_or_prepare_widgets(
|
| 261 |
dataset_args_left, show_embeddings, use_cache=use_cache
|
| 262 |
)
|
| 263 |
with left_col:
|
| 264 |
+
if cache_exists_left:
|
| 265 |
+
show_column(dstats_left, ds_name_to_dict, show_embeddings, " A")
|
| 266 |
+
else:
|
| 267 |
+
st.markdown("### Missing pre-computed data measures!")
|
| 268 |
+
st.write(dataset_args_left)
|
| 269 |
+
dstats_right, cache_exists_right = load_or_prepare_widgets(
|
| 270 |
dataset_args_right, show_embeddings, use_cache=use_cache
|
| 271 |
)
|
| 272 |
with right_col:
|
| 273 |
+
if cache_exists_right:
|
| 274 |
+
show_column(dstats_right, ds_name_to_dict, show_embeddings, " B")
|
| 275 |
+
else:
|
| 276 |
+
st.markdown("### Missing pre-computed data measures!")
|
| 277 |
+
st.write(dataset_args_right)
|
| 278 |
else:
|
| 279 |
logs.warning("Using Single Dataset Mode")
|
| 280 |
dataset_args = st_utils.sidebar_selection(ds_name_to_dict, "")
|
| 281 |
+
dstats, cache_exists = load_or_prepare_widgets(dataset_args, show_embeddings, use_cache=use_cache)
|
| 282 |
+
if cache_exists:
|
| 283 |
+
show_column(dstats, ds_name_to_dict, show_embeddings, "")
|
| 284 |
+
else:
|
| 285 |
+
st.markdown("### Missing pre-computed data measures!")
|
| 286 |
+
st.write(dataset_args)
|
| 287 |
|
| 288 |
|
| 289 |
if __name__ == "__main__":
|
data_measurements/dataset_statistics.py
CHANGED
|
@@ -245,9 +245,6 @@ class DatasetStatisticsCacheClass:
|
|
| 245 |
self.cache_dir,
|
| 246 |
f"{dset_name}_{dset_config}_{split_name}_{text_field}", # {label_field},
|
| 247 |
)
|
| 248 |
-
if not isdir(self.cache_path):
|
| 249 |
-
logs.warning("Creating cache directory %s." % self.cache_path)
|
| 250 |
-
mkdir(self.cache_path)
|
| 251 |
|
| 252 |
# Cache files not needed for UI
|
| 253 |
self.dset_fid = pjoin(self.cache_path, "base_dset")
|
|
@@ -302,6 +299,21 @@ class DatasetStatisticsCacheClass:
|
|
| 302 |
"""
|
| 303 |
self.live = live
|
| 304 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 305 |
def get_base_dataset(self):
|
| 306 |
"""Gets a pointer to the truncated base dataset object."""
|
| 307 |
if not self.dset:
|
|
|
|
| 245 |
self.cache_dir,
|
| 246 |
f"{dset_name}_{dset_config}_{split_name}_{text_field}", # {label_field},
|
| 247 |
)
|
|
|
|
|
|
|
|
|
|
| 248 |
|
| 249 |
# Cache files not needed for UI
|
| 250 |
self.dset_fid = pjoin(self.cache_path, "base_dset")
|
|
|
|
| 299 |
"""
|
| 300 |
self.live = live
|
| 301 |
|
| 302 |
+
def check_cache_dir(self):
|
| 303 |
+
"""
|
| 304 |
+
First function to call to create the cache directory.
|
| 305 |
+
If in deployment mode and cache directory does not already exist,
|
| 306 |
+
return False.
|
| 307 |
+
"""
|
| 308 |
+
if self.live:
|
| 309 |
+
return isdir(self.cache_path)
|
| 310 |
+
else:
|
| 311 |
+
if not isdir(self.cache_path):
|
| 312 |
+
logs.warning("Creating cache directory %s." % self.cache_path)
|
| 313 |
+
mkdir(self.cache_path)
|
| 314 |
+
return isdir(self.cache_path)
|
| 315 |
+
|
| 316 |
+
|
| 317 |
def get_base_dataset(self):
|
| 318 |
"""Gets a pointer to the truncated base dataset object."""
|
| 319 |
if not self.dset:
|