import asyncio
import json
import os
from datetime import datetime

from crawl4ai import AsyncWebCrawler
from huggingface_hub import HfApi
async def crawl_website():
    async with AsyncWebCrawler() as crawler:
        # Crawl the website. These keyword arguments (max_depth, content_types,
        # exclude_patterns) follow the crawl4ai version this script was written
        # against; newer releases move them into a run config (see the sketch
        # after this function).
        result = await crawler.arun(
            url="https://sletchersystems.com",
            max_depth=2,  # adjust crawl depth as needed
            content_types=["text/html"],
            exclude_patterns=[
                "*/admin/*",
                "*/login/*",
                "*/logout/*"
            ]
        )

        # Shape the crawl output into a single dataset record
        crawl_data = {
            "url": "https://sletchersystems.com",
            "timestamp": datetime.now().isoformat(),
            "content": result.markdown,
            "metadata": {
                "pages_crawled": len(result.urls),
                # whitespace-split word count; a rough token estimate
                "total_tokens": len(result.markdown.split())
            }
        }
        return crawl_data
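
# A hedged alternative sketch, not part of the original script: in recent
# crawl4ai releases, depth-limited crawling is configured through a
# CrawlerRunConfig carrying a deep-crawl strategy instead of keyword
# arguments on arun(), which then returns one result per page visited.
# The import path and class names below are assumptions; verify them
# against your installed version.
async def crawl_website_deep():
    from crawl4ai import CrawlerRunConfig                    # assumption: exported name
    from crawl4ai.deep_crawling import BFSDeepCrawlStrategy  # assumption: module path

    config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=2)  # breadth-first, two levels deep
    )
    async with AsyncWebCrawler() as crawler:
        # With a deep-crawl strategy, arun() yields one CrawlResult per page
        results = await crawler.arun(url="https://sletchersystems.com", config=config)
        return results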
def update_huggingface_dataset(crawl_data):
    # Initialize the Hugging Face API client. Passing a token is optional:
    # HfApi also falls back to the HF_TOKEN environment variable or the
    # cached `huggingface-cli login` credentials.
    api = HfApi(token=os.environ.get("HF_TOKEN"))

    # Write the crawl record to a local JSON file
    dataset_path = "website_data.json"
    with open(dataset_path, "w") as f:
        json.dump(crawl_data, f, indent=2)

    # Push the file to the dataset repo on the Hugging Face Hub
    api.upload_file(
        path_or_fileobj=dataset_path,
        path_in_repo="data/latest_crawl.json",
        repo_id="SmokeyBandit/SletcherSystems",
        repo_type="dataset"
    )
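
# An optional sanity check, not in the original script: pull the uploaded
# file back from the Hub with hf_hub_download and print its metadata to
# confirm the upload landed where expected.
def verify_latest_crawl():
    from huggingface_hub import hf_hub_download

    local_path = hf_hub_download(
        repo_id="SmokeyBandit/SletcherSystems",
        filename="data/latest_crawl.json",
        repo_type="dataset",
    )
    with open(local_path) as f:
        record = json.load(f)
    print(record["metadata"])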
if __name__ == "__main__":
    # Run the crawler, then push the result to the dataset repo
    crawl_data = asyncio.run(crawl_website())
    update_huggingface_dataset(crawl_data)
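
# Note: upload_file() needs write access to SmokeyBandit/SletcherSystems.
# Set HF_TOKEN in the environment (or run `huggingface-cli login`) before
# executing this script, e.g.:
#
#   HF_TOKEN=hf_... python <this script>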