Commit 709e431 · some improvements
Parent(s): 5566cb9

app.py CHANGED

@@ -1,80 +1,82 @@
 from firecrawl import FirecrawlApp
 import os
 import time
+import asyncio
 from dotenv import load_dotenv
 from urllib.parse import urlparse
 
-
 load_dotenv()
 
 base_url = os.getenv('BASE_URL')
- … (old lines 11-29 not recovered)
-    # Use the /scrape endpoint to scrape the URL
-    scrape_status = app.scrape_url(url)
-
-    # Print the scrape_status to understand its structure
-    print(f"Scrape status for {url}: {scrape_status}")
-
-    # Check if the scraping was successful
-    if 'markdown' in scrape_status:
-        return scrape_status['markdown']
-    else:
-        print(f"Failed to scrape {url}: {scrape_status}")
+api_key = os.getenv('FIRECRAWL_API_KEY')
+limit_rate = os.getenv('LIMIT_RATE', 'False').lower() == 'true'
+
+# Get Firecrawl App instance
+def get_firecrawl_app(api_key):
+    return FirecrawlApp(api_key=api_key)
+
+# Asynchronous scrape URL
+async def async_scrape_url(app, url):
+    try:
+        scrape_status = app.scrape_url(url)
+        print(f"Scrape status for {url}: {scrape_status}")
+        if 'markdown' in scrape_status:
+            return scrape_status['markdown']
+        else:
+            print(f"Failed to scrape {url}: {scrape_status}")
+            return ""
+    except Exception as e:
+        print(f"Error scraping {url}: {e}")
         return ""
 
- … (old lines 43-54 not recovered)
-    output_file = os.path.join('scraped_documentation', f"{domain}.md")
-
-    # Open the output file in write mode
-    with open(output_file, 'w', encoding='utf-8') as md_file:
-        # Iterate over the URLs
-        for i, url in enumerate(urls):
-            # Print the URL being scraped
-            print(f"Scraping {url} ({i+1}/{len(urls)})")
-
-            # Scrape the URL
-            markdown_content = scrape_url(url)
-
-            # Write the scraped content to the file
-            md_file.write(f"# {url}\n\n")
-            md_file.write(markdown_content)
-            md_file.write("\n\n---\n\n")
+# Synchronously map website URLs
+def map_website(app, url):
+    try:
+        map_status = app.map_url(url)
+        if isinstance(map_status, list):
+            return map_status
+        else:
+            print("Failed to map the website:", map_status)
+            return []
+    except Exception as e:
+        print(f"Error mapping website {url}: {e}")
+        return []
+
+# Asynchronously scrape all URLs
+def scrape_all_urls(base_url, api_key, limit_rate):
+    async def scrape_process():
+        app = get_firecrawl_app(api_key)
+        urls = map_website(app, base_url)
+        if not urls:
+            print("No URLs found. Please check if the base URL is correct.")
+            return
+
+        parsed_url = urlparse(base_url)
+        domain = parsed_url.netloc.replace("www.", "")
+        os.makedirs('scraped_documentation', exist_ok=True)
+        output_file = os.path.join('scraped_documentation', f"{domain}.md")
+
+        with open(output_file, 'w', encoding='utf-8') as md_file:
+            for i, url in enumerate(urls):
+                print(f"Scraping {url} ({i+1}/{len(urls)})")
+                markdown_content = await async_scrape_url(app, url)
+                md_file.write(f"# {url}\n\n")
+                md_file.write(markdown_content)
+                md_file.write("\n\n---\n\n")
 
- … (old lines 72-74 not recovered)
+                # Rate limiting: 10 scrapes per minute
+                if limit_rate and (i + 1) % 10 == 0:
                     print("Rate limit reached, waiting for 60 seconds...")
                     time.sleep(60)
 
+        print(f"Scraping completed. Output saved to {output_file}")
+
+    asyncio.run(scrape_process())
+
 if __name__ == "__main__":
- … (old lines 79-80 not recovered)
+    if not base_url:
+        print("Error: BASE_URL not specified in environment variables.")
+    elif not api_key:
+        print("Error: FIRECRAWL_API_KEY not specified in environment variables.")
+    else:
+        scrape_all_urls(base_url, api_key, limit_rate)
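
With this change, app.py is driven entirely by environment variables (BASE_URL, FIRECRAWL_API_KEY, LIMIT_RATE) and exposes scrape_all_urls as a plain function, so it can be exercised without the UI. A minimal smoke-test sketch, assuming app.py is importable and firecrawl-py is installed; the URL and key below are hypothetical placeholders:

```python
# Hypothetical smoke test for the refactored entry point.
import os

# Set the variables app.py expects before importing it; load_dotenv() leaves
# values already present in the environment untouched.
os.environ["BASE_URL"] = "https://docs.example.com"  # hypothetical target site
os.environ["FIRECRAWL_API_KEY"] = "fc-YOUR-KEY"      # hypothetical key
os.environ["LIMIT_RATE"] = "true"                    # opt in to free-tier pacing

from app import scrape_all_urls

scrape_all_urls(base_url=os.environ["BASE_URL"],
                api_key=os.environ["FIRECRAWL_API_KEY"],
                limit_rate=True)
# On success this writes scraped_documentation/docs.example.com.md
```

Note that async_scrape_url awaits nothing internally (app.scrape_url is a blocking call), so URLs are still scraped one at a time; the asyncio plumbing mainly leaves room for concurrent scraping later.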
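
The new opt-in rate limiting sleeps for 60 seconds after every 10th scrape, which keeps a long run at roughly 10 requests per minute, the free-tier limit the UI text mentions. A sketch of the pattern in isolation, with hypothetical names:

```python
import time

def paced(items, per_minute=10, pause=60):
    """Yield items, sleeping after every per_minute-th one."""
    for i, item in enumerate(items):
        yield item
        if (i + 1) % per_minute == 0:
            print(f"Rate limit reached after {i + 1} items, waiting {pause} seconds...")
            time.sleep(pause)

# Usage sketch: for url in paced(urls): markdown = scrape(url)
```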
web_ui.py CHANGED

@@ -1,5 +1,6 @@
 import os
 import time
+import asyncio
 from dotenv import load_dotenv
 from urllib.parse import urlparse
 from firecrawl import FirecrawlApp
@@ -7,62 +8,81 @@ import gradio as gr
 
 load_dotenv()
 
-def …
- … (old line 11 not recovered)
-    map_status = app.map_url(url)
-    if isinstance(map_status, list):
-        return map_status
-    else:
-        print("Failed to map the website:", map_status)
-        return []
+def get_firecrawl_app(api_key):
+    return FirecrawlApp(api_key=api_key)
 
-def …
- … (old lines 20-26 not recovered)
+async def async_scrape_url(app, url):
+    try:
+        scrape_status = app.scrape_url(url)
+        print(f"Scrape status for {url}: {scrape_status}")
+        if 'markdown' in scrape_status:
+            return scrape_status['markdown']
+        else:
+            print(f"Failed to scrape {url}: {scrape_status}")
+            return ""
+    except Exception as e:
+        print(f"Error scraping {url}: {e}")
         return ""
 
-def …
- … (old line 30 not recovered)
+def map_website(app, url):
+    try:
+        map_status = app.map_url(url)
+        if isinstance(map_status, list):
+            return map_status
+        else:
+            print("Failed to map the website:", map_status)
+            return []
+    except Exception as e:
+        print(f"Error mapping website {url}: {e}")
+        return []
+
+async def scrape_all_urls(base_url, api_key, limit_rate, progress=gr.Progress()):
+    app = get_firecrawl_app(api_key)
+    urls = map_website(app, base_url)
+    if not urls:
+        return "No URLs found. Please check if the base URL is correct."
+
     parsed_url = urlparse(base_url)
     domain = parsed_url.netloc.replace("www.", "")
     os.makedirs('scraped_documentation', exist_ok=True)
     output_file = os.path.join('scraped_documentation', f"{domain}.md")
 
     with open(output_file, 'w', encoding='utf-8') as md_file:
         for i, url in enumerate(progress.tqdm(urls)):
             progress(i / len(urls), f"Scraping {url}")
-            markdown_content = …
+            markdown_content = await async_scrape_url(app, url)
             md_file.write(f"# {url}\n\n")
             md_file.write(markdown_content)
             md_file.write("\n\n---\n\n")
-            if limit_rate:
- … (old lines 44-46 not recovered)
+            if limit_rate and (i + 1) % 10 == 0:
+                time.sleep(60)
+
     return f"Scraping completed. Output saved to {output_file}"
 
 def count_urls(base_url, api_key):
     if not api_key:
         return "Please enter your Firecrawl API key first."
- … (old lines 52-53 not recovered)
+    app = get_firecrawl_app(api_key)
+    urls = map_website(app, base_url)
+    if urls:
+        return f"{len(urls)} URLs found. Do you want to proceed with scraping?"
+    else:
+        return "No URLs found. Please check the base URL or API key."
 
-def gradio_scrape(base_url, api_key, limit_rate):
+async def gradio_scrape(base_url, api_key, limit_rate):
     if not api_key:
         return "Please enter your Firecrawl API key."
     if not base_url:
         return "Please enter a base URL to scrape."
-    return scrape_all_urls(base_url, api_key, limit_rate)
+    return await scrape_all_urls(base_url, api_key, limit_rate)
 
 with gr.Blocks() as iface:
     gr.Markdown("# Docs Scraper")
-    gr.Markdown(" …
- … (old line 65 not recovered)
+    gr.Markdown("""
+    ## Map and Scrape Website URLs with Firecrawl API
+    Enter a base URL, your Firecrawl API key, and choose whether to limit the scraping rate.
+    Scraped content will be saved as a markdown file named after the domain.
+    """)
     gr.HTML('Don\'t have an API key? <a href="https://firecrawl.dev/" target="_blank" rel="noopener noreferrer">Get one from Firecrawl</a>')
 
     with gr.Row():
@@ -74,7 +94,8 @@ with gr.Blocks() as iface:
         info="Enable to limit scraping to 10 URLs per minute. This adheres to Firecrawl API's free tier rate limit."
     )
 
-    gr.Markdown("After entering your API key, click 'Count URLs' to determine the number of URLs to be scraped. Then, click 'Scrape URLs' to begin the process. …
+    gr.Markdown("After entering your API key, click 'Count URLs' to determine the number of URLs to be scraped. Then, click 'Scrape URLs' to begin the process.")
+
     with gr.Row():
         count_button = gr.Button("Count URLs")
         url_count = gr.Textbox(label="URL Count")
@@ -83,8 +104,11 @@ with gr.Blocks() as iface:
         scrape_button = gr.Button("Scrape URLs")
         output = gr.Textbox(label="Output", elem_id="output_textbox")
 
-    gr.Markdown(" …
- … (old line 87 not recovered)
+    gr.Markdown("""
+    #### Note:
+    The free tier of the Firecrawl API allows for 500 credits per month.
+    If you need to scrape more, consider upgrading to a paid plan.
+    """)
 
     count_button.click(count_urls, inputs=[base_url, api_key], outputs=[url_count])
     scrape_button.click(gradio_scrape, inputs=[base_url, api_key, limit_rate], outputs=[output])
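
In web_ui.py, scrape_all_urls and gradio_scrape are now coroutines. Gradio accepts async functions as event handlers, so scrape_button.click(gradio_scrape, ...) awaits the coroutine for you; the hunks end before the bottom of the file, which presumably still launches the interface. A self-contained sketch of the same wiring, with hypothetical names:

```python
# Minimal async Gradio callback, mirroring the gradio_scrape pattern above.
import asyncio
import gradio as gr

async def fake_scrape(base_url):
    await asyncio.sleep(1)  # stand-in for `await scrape_all_urls(...)`
    return f"Scraping completed for {base_url}"

with gr.Blocks() as demo:
    url_box = gr.Textbox(label="Base URL")
    out_box = gr.Textbox(label="Output")
    go_btn = gr.Button("Scrape URLs")
    go_btn.click(fake_scrape, inputs=[url_box], outputs=[out_box])  # async handler is awaited by Gradio

if __name__ == "__main__":
    demo.launch()
```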