amaye15 committed · Commit 60283f6 · Parent(s): 6ea28ef

webhook - complete
app.py CHANGED

@@ -6,8 +6,8 @@ from pathlib import Path
 from huggingface_hub import WebhooksServer, WebhookPayload
 from datasets import Dataset, load_dataset, disable_caching
 from fastapi import BackgroundTasks, Response, status
-from huggingface_hub.utils import build_hf_headers, get_session
 
+# Disable caching globally for Hugging Face datasets
 disable_caching()
 
 # Set up the logger
@@ -23,7 +23,7 @@ logger.addHandler(console_handler)
 
 # Environment variables
 DS_NAME = "amaye15/object-segmentation"
-DATA_DIR = "data"
+DATA_DIR = Path("data")  # Use pathlib for path handling
 TARGET_REPO = "amaye15/object-segmentation-processed"
 WEBHOOK_SECRET = os.getenv("HF_WEBHOOK_SECRET")
 
@@ -31,10 +31,13 @@ WEBHOOK_SECRET = os.getenv("HF_WEBHOOK_SECRET")
 def get_data():
     """
     Generator function to stream data from the dataset.
+
+    Uses streaming to avoid loading the entire dataset into memory at once,
+    which is useful for handling large datasets.
     """
     ds = load_dataset(
         DS_NAME,
-        cache_dir=
+        cache_dir=DATA_DIR,
         streaming=True,
         download_mode="force_redownload",
     )
@@ -46,16 +49,18 @@ def get_data():
 def process_and_push_data():
     """
     Function to process and push new data to the target repository.
-    """
-    p = os.path.join(os.getcwd(), DATA_DIR)
-
-    if os.path.exists(p):
-        shutil.rmtree(p)
 
-
+    Removes existing data directory if it exists, recreates it, processes
+    the dataset, and pushes the processed dataset to the hub.
+    """
+    if DATA_DIR.exists():
+        shutil.rmtree(DATA_DIR)
+    DATA_DIR.mkdir(parents=True, exist_ok=True)
 
+    # Process data using the generator and push it to the hub
     ds_processed = Dataset.from_generator(get_data)
     ds_processed.push_to_hub(TARGET_REPO)
+
     logger.info("Data processed and pushed to the hub.")
     gc.collect()
 
@@ -70,6 +75,9 @@ async def handle_repository_changes(
 ):
     """
     Webhook endpoint that triggers data processing when the dataset is updated.
+
+    Adds a task to the background task queue to process the dataset
+    asynchronously.
     """
     logger.info(
         f"Webhook received from {payload.repo.name} indicating a repo {payload.event.action}"
@@ -79,13 +87,19 @@
 
 
 def _process_webhook():
-
-
-
+    """
+    Private function to handle the processing of the dataset when a webhook
+    is triggered.
+
+    Loads the dataset, processes it, and pushes the processed data to the hub.
+    """
+    logger.info("Loading new dataset...")
+    # Dataset loading is handled inside process_and_push_data, no need to load here
+    logger.info("Loaded new dataset")
 
-    logger.info(
+    logger.info("Processing and updating dataset...")
     process_and_push_data()
-    logger.info(
+    logger.info("Processing and updating dataset completed!")
 
 
 if __name__ == "__main__":
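For reference, the hunks above stop at the load_dataset(...) call inside get_data, so the generator's yield loop is not visible in this diff. A minimal sketch of how such a streaming generator is typically completed and then re-materialized with Dataset.from_generator follows; the "train" split name and the row loop are assumptions, not part of this commit.

from datasets import Dataset, load_dataset

DS_NAME = "amaye15/object-segmentation"


def get_data():
    ds = load_dataset(
        DS_NAME,
        streaming=True,                    # iterate lazily instead of downloading everything into memory
        download_mode="force_redownload",  # always pull the latest revision
    )
    for row in ds["train"]:  # split name assumed; not visible in the diff
        yield row


# Drains the generator row by row and materializes an Arrow dataset,
# which can then be pushed with ds_processed.push_to_hub(...).
ds_processed = Dataset.from_generator(get_data)

Streaming plus a generator keeps peak memory flat: rows are pulled from the source dataset one at a time and written straight into the new Arrow dataset.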
|
|
|
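The decorator on handle_repository_changes and the server launch both sit outside the hunks shown above. As a hedged sketch of how huggingface_hub's WebhooksServer is typically wired around these functions; the endpoint path "/dataset_repo" and the 202 response body are assumptions, not taken from the commit.

import os

from fastapi import BackgroundTasks, Response, status
from huggingface_hub import WebhookPayload, WebhooksServer


def _process_webhook():
    ...  # as in the diff above: log, then process_and_push_data()


# webhook_secret guards the endpoint; same env var as in the diff
app = WebhooksServer(webhook_secret=os.getenv("HF_WEBHOOK_SECRET"))


@app.add_webhook("/dataset_repo")  # path is an assumption, not shown in the diff
async def handle_repository_changes(
    payload: WebhookPayload, task_queue: BackgroundTasks
):
    # Schedule the heavy processing in the background and acknowledge
    # immediately so the Hub's webhook delivery does not time out.
    task_queue.add_task(_process_webhook)
    return Response("Task scheduled.", status_code=status.HTTP_202_ACCEPTED)


if __name__ == "__main__":
    app.launch()

Returning 202 right away and delegating to BackgroundTasks keeps the webhook response fast, so the Hub is not left waiting while push_to_hub runs.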