Spaces:

SmokeyBandit
/

SletcherSystems

Running

File size: 1,600 Bytes

25beff5

import asyncio
from crawl4ai import AsyncWebCrawler
from huggingface_hub import HfApi
import json
from datetime import datetime
import os

async def crawl_website(url: str = "https://sletchersystems.com") -> dict:
    """Crawl *url* and package the result as a JSON-serialisable record.

    Args:
        url: Address handed to the crawler and recorded in the output.
            Defaults to the Sletcher Systems site.

    Returns:
        A dict with the keys ``url``, ``timestamp`` (ISO-8601 string that
        includes the local UTC offset), ``content`` (the crawler's markdown,
        or ``""`` when it produced none) and ``metadata`` (a dict holding
        ``pages_crawled`` and ``total_tokens``).

    Raises:
        RuntimeError: If the crawl result reports ``success`` as false, so a
            failed crawl is never packaged as an empty snapshot.
    """
    async with AsyncWebCrawler() as crawler:
        # NOTE(review): max_depth / content_types / exclude_patterns are
        # forwarded unchanged; confirm the installed crawl4ai version
        # actually honours them on arun().
        result = await crawler.arun(
            url=url,
            max_depth=2,  # Adjust depth as needed
            content_types=["text/html"],
            exclude_patterns=[
                "*/admin/*",
                "*/login/*",
                "*/logout/*",
            ],
        )

    # Fail loudly instead of crashing later on a missing markdown attribute.
    # getattr keeps this safe if the result object has no such fields.
    if not getattr(result, "success", True):
        reason = getattr(result, "error_message", None) or "unknown error"
        raise RuntimeError(f"Crawl of {url} failed: {reason}")

    # markdown can be None when nothing was extracted; normalise to "" so the
    # word count below cannot raise AttributeError.
    markdown = result.markdown or ""

    # NOTE(review): crawl4ai's documented CrawlResult does not list a `urls`
    # field. Use it when present, otherwise count the single page requested;
    # confirm which is intended.
    crawled_urls = getattr(result, "urls", None)
    pages_crawled = len(crawled_urls) if crawled_urls is not None else 1

    return {
        "url": url,
        # astimezone() attaches the local UTC offset so the published
        # timestamp is unambiguous for readers in other time zones.
        "timestamp": datetime.now().astimezone().isoformat(),
        "content": markdown,
        "metadata": {
            "pages_crawled": pages_crawled,
            # Whitespace-separated word count, not model tokens.
            "total_tokens": len(markdown.split()),
        },
    }

def update_huggingface_dataset(crawl_data):
    """Serialise *crawl_data* to JSON on disk and upload it to the Hub.

    The record is written to ``website_data.json`` in the current working
    directory, then uploaded to the ``SmokeyBandit/SletcherSystems`` dataset
    repository under ``data/latest_crawl.json``.

    Args:
        crawl_data: JSON-serialisable object to publish.
    """
    hub = HfApi()

    # Stage the record in a local file that the upload call reads from.
    local_file = "website_data.json"
    with open(local_file, mode="w") as handle:
        json.dump(crawl_data, fp=handle)

    # Destination of the upload on the Hugging Face Hub.
    upload_args = {
        "path_or_fileobj": local_file,
        "path_in_repo": "data/latest_crawl.json",
        "repo_id": "SmokeyBandit/SletcherSystems",
        "repo_type": "dataset",
    }
    hub.upload_file(**upload_args)

if __name__ == "__main__":
    # Script entry point: run the async crawl to completion, then publish
    # the resulting record to the dataset repository.
    update_huggingface_dataset(asyncio.run(crawl_website()))