amaye15 committed · Commit 60283f6 · Parent(s): 6ea28ef

webhook - complete
app.py CHANGED

@@ -6,8 +6,8 @@ from pathlib import Path
 from huggingface_hub import WebhooksServer, WebhookPayload
 from datasets import Dataset, load_dataset, disable_caching
 from fastapi import BackgroundTasks, Response, status
-from huggingface_hub.utils import build_hf_headers, get_session
 
+# Disable caching globally for Hugging Face datasets
 disable_caching()
 
 # Set up the logger
@@ -23,7 +23,7 @@ logger.addHandler(console_handler)
 
 # Environment variables
 DS_NAME = "amaye15/object-segmentation"
-DATA_DIR = "data"
+DATA_DIR = Path("data")  # Use pathlib for path handling
 TARGET_REPO = "amaye15/object-segmentation-processed"
 WEBHOOK_SECRET = os.getenv("HF_WEBHOOK_SECRET")
 
@@ -31,10 +31,13 @@ WEBHOOK_SECRET = os.getenv("HF_WEBHOOK_SECRET")
 def get_data():
     """
     Generator function to stream data from the dataset.
+
+    Uses streaming to avoid loading the entire dataset into memory at once,
+    which is useful for handling large datasets.
     """
     ds = load_dataset(
         DS_NAME,
-        cache_dir=
+        cache_dir=DATA_DIR,
         streaming=True,
         download_mode="force_redownload",
     )
@@ -46,16 +49,18 @@ def get_data():
 def process_and_push_data():
     """
     Function to process and push new data to the target repository.
-    """
-    p = os.path.join(os.getcwd(), DATA_DIR)
-
-    if os.path.exists(p):
-        shutil.rmtree(p)
 
-
+    Removes existing data directory if it exists, recreates it, processes
+    the dataset, and pushes the processed dataset to the hub.
+    """
+    if DATA_DIR.exists():
+        shutil.rmtree(DATA_DIR)
+    DATA_DIR.mkdir(parents=True, exist_ok=True)
 
+    # Process data using the generator and push it to the hub
     ds_processed = Dataset.from_generator(get_data)
     ds_processed.push_to_hub(TARGET_REPO)
+
     logger.info("Data processed and pushed to the hub.")
     gc.collect()
 
@@ -70,6 +75,9 @@ async def handle_repository_changes(
 ):
     """
     Webhook endpoint that triggers data processing when the dataset is updated.
+
+    Adds a task to the background task queue to process the dataset
+    asynchronously.
     """
     logger.info(
         f"Webhook received from {payload.repo.name} indicating a repo {payload.event.action}"
@@ -79,13 +87,19 @@
 
 
 def _process_webhook():
-
-
-
+    """
+    Private function to handle the processing of the dataset when a webhook
+    is triggered.
+
+    Loads the dataset, processes it, and pushes the processed data to the hub.
+    """
+    logger.info("Loading new dataset...")
+    # Dataset loading is handled inside process_and_push_data, no need to load here
+    logger.info("Loaded new dataset")
 
-    logger.info(
+    logger.info("Processing and updating dataset...")
     process_and_push_data()
-    logger.info(
+    logger.info("Processing and updating dataset completed!")
 
 
 if __name__ == "__main__":
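For reference, the hunks above stop at the load_dataset(...) call inside get_data, so the generator's yield loop is not visible in this diff. A minimal sketch of how such a streaming generator is typically completed and then re-materialized with Dataset.from_generator follows; the "train" split name and the row loop are assumptions, not part of this commit.

from datasets import Dataset, load_dataset

DS_NAME = "amaye15/object-segmentation"


def get_data():
    ds = load_dataset(
        DS_NAME,
        streaming=True,                    # iterate lazily instead of downloading everything into memory
        download_mode="force_redownload",  # always pull the latest revision
    )
    for row in ds["train"]:  # split name assumed; not visible in the diff
        yield row


# Drains the generator row by row and materializes an Arrow dataset,
# which can then be pushed with ds_processed.push_to_hub(...).
ds_processed = Dataset.from_generator(get_data)

Streaming plus a generator keeps peak memory flat: rows are pulled from the source dataset one at a time and written straight into the new Arrow dataset.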
|
|
|
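The decorator on handle_repository_changes and the server launch both sit outside the hunks shown above. As a hedged sketch of how huggingface_hub's WebhooksServer is typically wired around these functions; the endpoint path "/dataset_repo" and the 202 response body are assumptions, not taken from the commit.

import os

from fastapi import BackgroundTasks, Response, status
from huggingface_hub import WebhookPayload, WebhooksServer


def _process_webhook():
    ...  # as in the diff above: log, then process_and_push_data()


# webhook_secret guards the endpoint; same env var as in the diff
app = WebhooksServer(webhook_secret=os.getenv("HF_WEBHOOK_SECRET"))


@app.add_webhook("/dataset_repo")  # path is an assumption, not shown in the diff
async def handle_repository_changes(
    payload: WebhookPayload, task_queue: BackgroundTasks
):
    # Schedule the heavy processing in the background and acknowledge
    # immediately so the Hub's webhook delivery does not time out.
    task_queue.add_task(_process_webhook)
    return Response("Task scheduled.", status_code=status.HTTP_202_ACCEPTED)


if __name__ == "__main__":
    app.launch()

Returning 202 right away and delegating to BackgroundTasks keeps the webhook response fast, so the Hub is not left waiting while push_to_hub runs.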