import asyncio
import json
from datetime import datetime

from crawl4ai import AsyncWebCrawler
from huggingface_hub import HfApi


async def crawl_website():
    async with AsyncWebCrawler() as crawler:
        # Crawl the website. Note: max_depth, content_types, and
        # exclude_patterns are passed straight to arun() here; whether they
        # are honored depends on the installed crawl4ai version (newer
        # releases move crawl options into CrawlerRunConfig -- see the
        # deep-crawl sketch at the bottom of this file).
        result = await crawler.arun(
            url="https://sletchersystems.com",
            max_depth=2,  # adjust crawl depth as needed
            content_types=["text/html"],
            exclude_patterns=[
                "*/admin/*",
                "*/login/*",
                "*/logout/*",
            ],
        )

        # Shape the crawl output into a single dataset record.
        crawl_data = {
            "url": "https://sletchersystems.com",
            "timestamp": datetime.now().isoformat(),
            "content": result.markdown,
            "metadata": {
                # result.urls assumes the result object exposes the list of
                # crawled page URLs; adjust for your crawl4ai version.
                "pages_crawled": len(result.urls),
                # Rough size proxy: whitespace-separated word count, not
                # model tokens.
                "total_tokens": len(result.markdown.split()),
            },
        }
        return crawl_data


def update_huggingface_dataset(crawl_data):
    # Initialize the Hugging Face API client (auth comes from a prior
    # `huggingface-cli login` or the HF_TOKEN environment variable).
    api = HfApi()

    # Write the crawl record to a local JSON file.
    dataset_path = "website_data.json"
    with open(dataset_path, "w") as f:
        json.dump(crawl_data, f)

    # Upload the file into the dataset repo, overwriting the previous crawl.
    api.upload_file(
        path_or_fileobj=dataset_path,
        path_in_repo="data/latest_crawl.json",
        repo_id="SmokeyBandit/SletcherSystems",
        repo_type="dataset",
    )


if __name__ == "__main__":
    # Run the crawler, then push the result to the dataset repo.
    crawl_data = asyncio.run(crawl_website())
    update_huggingface_dataset(crawl_data)
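
# --- Deep-crawl variant: a minimal sketch, not part of the original script ---
# Newer crawl4ai releases (roughly 0.4.2+) configure multi-page crawls through
# CrawlerRunConfig and a deep-crawl strategy instead of arun() kwargs, and
# arun() then returns one CrawlResult per crawled page. The imports and
# parameters below follow the crawl4ai deep-crawling docs; verify them against
# your installed version before swapping this in for the arun() call above.
from crawl4ai import CrawlerRunConfig
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy


async def crawl_website_deep():
    config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(
            max_depth=2,             # same depth as the call above
            include_external=False,  # stay on sletchersystems.com
        ),
    )
    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun(
            url="https://sletchersystems.com", config=config
        )
    # One CrawlResult per page, so pages_crawled falls out of len(results).
    return {
        "pages_crawled": len(results),
        "content": "\n\n".join(str(r.markdown) for r in results if r.markdown),
    }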
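
# --- Optional sanity check: a minimal sketch, assuming the same repo as above ---
# Pull the uploaded file back from the Hub and confirm it parses.
# hf_hub_download caches the file locally and returns its path; it uses the
# same authentication as HfApi.
from huggingface_hub import hf_hub_download


def verify_latest_crawl():
    local_path = hf_hub_download(
        repo_id="SmokeyBandit/SletcherSystems",
        filename="data/latest_crawl.json",
        repo_type="dataset",
    )
    with open(local_path) as f:
        data = json.load(f)
    print(f"{data['metadata']['pages_crawled']} pages crawled at {data['timestamp']}")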