# SletcherSystems / crawl_website.py
# Author: SmokeyBandit — "Create crawl_website.py"
# Commit: 25beff5 (verified), 1.6 kB
import asyncio
from crawl4ai import AsyncWebCrawler
from huggingface_hub import HfApi
import json
from datetime import datetime
import os
async def crawl_website():
    """Crawl the Sletcher Systems website and package the result for a dataset.

    Returns:
        dict: Crawl payload with keys ``url``, ``timestamp`` (ISO-8601),
        ``content`` (markdown text of the crawled pages), and ``metadata``
        (``pages_crawled`` and a whitespace-token count ``total_tokens``).
    """
    async with AsyncWebCrawler() as crawler:
        # Crawl the website, skipping auth-related areas.
        result = await crawler.arun(
            url="https://sletchersystems.com",
            max_depth=2,  # Adjust depth as needed
            content_types=["text/html"],
            exclude_patterns=[
                "*/admin/*",
                "*/login/*",
                "*/logout/*"
            ]
        )

        # Guard against a failed/empty crawl: result.markdown may be None.
        markdown = result.markdown or ""
        # NOTE(review): crawl4ai result objects commonly expose `links`
        # rather than `urls` — confirm against the installed crawl4ai
        # version; fall back to an empty list so len() cannot raise.
        crawled_urls = getattr(result, "urls", None) or []

        # Format the data for the dataset.
        crawl_data = {
            "url": "https://sletchersystems.com",
            "timestamp": datetime.now().isoformat(),
            "content": markdown,
            "metadata": {
                "pages_crawled": len(crawled_urls),
                # Whitespace-token count, a rough size proxy (not an
                # LLM-tokenizer count).
                "total_tokens": len(markdown.split())
            }
        }
        return crawl_data
def update_huggingface_dataset(crawl_data):
    """Serialize *crawl_data* to JSON and push it to the Hugging Face dataset.

    Writes ``website_data.json`` locally, then uploads it to
    ``SmokeyBandit/SletcherSystems`` as ``data/latest_crawl.json``.

    Args:
        crawl_data: JSON-serializable dict produced by ``crawl_website``.
    """
    # Initialize the Hugging Face API client (uses ambient credentials,
    # e.g. HF_TOKEN or a cached login).
    api = HfApi()

    # Save crawl data to a local file. Write UTF-8 explicitly — the
    # platform default encoding is not guaranteed to round-trip JSON text.
    dataset_path = "website_data.json"
    with open(dataset_path, "w", encoding="utf-8") as f:
        json.dump(crawl_data, f)

    # Push the file to the Hugging Face dataset repo.
    api.upload_file(
        path_or_fileobj=dataset_path,
        path_in_repo="data/latest_crawl.json",
        repo_id="SmokeyBandit/SletcherSystems",
        repo_type="dataset"
    )
def main():
    """Run one crawl and publish the result to the Hugging Face dataset."""
    # Drive the async crawler to completion, then upload its output.
    crawl_data = asyncio.run(crawl_website())
    update_huggingface_dataset(crawl_data)


if __name__ == "__main__":
    main()