# SletcherSystems / crawl_website.py
# Author: SmokeyBandit — "Create crawl_website.py"
# Commit: 25beff5 (verified), 1.6 kB
import asyncio
from crawl4ai import AsyncWebCrawler
from huggingface_hub import HfApi
import json
from datetime import datetime
import os
async def crawl_website():
    """Crawl the Sletcher Systems website and package the result for a dataset.

    Returns:
        dict: Crawl payload with keys ``url``, ``timestamp`` (ISO-8601),
        ``content`` (markdown text of the crawled pages), and ``metadata``
        (``pages_crawled`` and a whitespace-token count ``total_tokens``).
    """
    async with AsyncWebCrawler() as crawler:
        # Crawl the website, skipping auth-related areas.
        result = await crawler.arun(
            url="https://sletchersystems.com",
            max_depth=2,  # Adjust depth as needed
            content_types=["text/html"],
            exclude_patterns=[
                "*/admin/*",
                "*/login/*",
                "*/logout/*"
            ]
        )

        # Guard against a failed/empty crawl: result.markdown may be None.
        markdown = result.markdown or ""
        # NOTE(review): crawl4ai result objects commonly expose `links`
        # rather than `urls` — confirm against the installed crawl4ai
        # version; fall back to an empty list so len() cannot raise.
        crawled_urls = getattr(result, "urls", None) or []

        # Format the data for the dataset.
        crawl_data = {
            "url": "https://sletchersystems.com",
            "timestamp": datetime.now().isoformat(),
            "content": markdown,
            "metadata": {
                "pages_crawled": len(crawled_urls),
                # Whitespace-token count, a rough size proxy (not an
                # LLM-tokenizer count).
                "total_tokens": len(markdown.split())
            }
        }
        return crawl_data
def update_huggingface_dataset(crawl_data):
    """Serialize *crawl_data* to JSON and push it to the Hugging Face dataset.

    Writes ``website_data.json`` locally, then uploads it to
    ``SmokeyBandit/SletcherSystems`` as ``data/latest_crawl.json``.

    Args:
        crawl_data: JSON-serializable dict produced by ``crawl_website``.
    """
    # Initialize the Hugging Face API client (uses ambient credentials,
    # e.g. HF_TOKEN or a cached login).
    api = HfApi()

    # Save crawl data to a local file. Write UTF-8 explicitly — the
    # platform default encoding is not guaranteed to round-trip JSON text.
    dataset_path = "website_data.json"
    with open(dataset_path, "w", encoding="utf-8") as f:
        json.dump(crawl_data, f)

    # Push the file to the Hugging Face dataset repo.
    api.upload_file(
        path_or_fileobj=dataset_path,
        path_in_repo="data/latest_crawl.json",
        repo_id="SmokeyBandit/SletcherSystems",
        repo_type="dataset"
    )
def main():
    """Run one crawl and publish the result to the Hugging Face dataset."""
    # Drive the async crawler to completion, then upload its output.
    crawl_data = asyncio.run(crawl_website())
    update_huggingface_dataset(crawl_data)


if __name__ == "__main__":
    main()