from firecrawl import FirecrawlApp
import os
import time
from dotenv import load_dotenv
from urllib.parse import urlparse

load_dotenv()

base_url = os.getenv('BASE_URL')


def map_website(url):
    # Initialize the Firecrawl application with the API key
    app = FirecrawlApp(api_key=os.getenv('FIRECRAWL_API_KEY'))

    # Use the /map endpoint to get all URLs from the website
    map_status = app.map_url(url)

    # Check if the mapping was successful
    if isinstance(map_status, list):
        return map_status
    else:
        print("Failed to map the website:", map_status)
        return []


def scrape_url(url):
    # Initialize the Firecrawl application with the API key
    app = FirecrawlApp(api_key=os.getenv('FIRECRAWL_API_KEY'))

    # Use the /scrape endpoint to scrape the URL
    scrape_status = app.scrape_url(url)

    # Print the scrape_status to understand its structure
    print(f"Scrape status for {url}: {scrape_status}")

    # Check if the scraping was successful
    if 'markdown' in scrape_status:
        return scrape_status['markdown']
    else:
        print(f"Failed to scrape {url}: {scrape_status}")
        return ""


def scrape_all_urls(base_url):
    # Map the URLs
    urls = map_website(base_url)

    # Parse the base URL to get the domain without 'www' and scheme
    parsed_url = urlparse(base_url)
    domain = parsed_url.netloc.replace("www.", "")

    # Create the output directory if it doesn't exist
    os.makedirs('scraped_documentation', exist_ok=True)

    # Generate the output file name and save location
    output_file = os.path.join('scraped_documentation', f"{domain}.md")

    # Open the output file in write mode
    with open(output_file, 'w', encoding='utf-8') as md_file:
        # Iterate over the mapped URLs
        for i, url in enumerate(urls):
            # Print the URL being scraped
            print(f"Scraping {url} ({i+1}/{len(urls)})")

            # Scrape the URL
            markdown_content = scrape_url(url)

            # Write the scraped content to the file, separated by a horizontal rule
            md_file.write(f"# {url}\n\n")
            md_file.write(markdown_content)
            md_file.write("\n\n---\n\n")

            # Rate limiting: pause after every 10 scrapes (10 scrapes per minute)
            if os.getenv('LIMIT_RATE') == 'True':
                if (i + 1) % 10 == 0:
                    print("Rate limit reached, waiting for 60 seconds...")
                    time.sleep(60)


if __name__ == "__main__":
    scrape_all_urls(base_url)
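
# --- Expected environment configuration (illustrative sketch) ---
# A minimal .env file this script assumes python-dotenv will load from the
# working directory. The variable names match the os.getenv calls above;
# the values below are placeholders, not real credentials:
#
#   FIRECRAWL_API_KEY=fc-your-api-key     # your Firecrawl API key
#   BASE_URL=https://docs.example.com     # site whose pages get mapped and scraped
#   LIMIT_RATE=True                       # optional; enables the 10-per-minute pause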