patrickacraig committed
Commit dde0c58 · 1 Parent(s): 0616405

adding download button

Files changed (1): app.py (+25 -8)
app.py CHANGED
@@ -36,11 +36,11 @@ def map_website(app, url):
         print(f"Error mapping website {url}: {e}")
         return []
 
-async def scrape_all_urls(base_url, api_key, limit_rate, progress=gr.Progress()):
+async def scrape_all_urls(base_url, api_key, limit_rate, progress=gr.Progress(), cancel_event=None):
     app = get_firecrawl_app(api_key)
     urls = map_website(app, base_url)
     if not urls:
-        return "No URLs found. Please check if the base URL is correct."
+        return "No URLs found. Please check if the base URL is correct.", None
 
     parsed_url = urlparse(base_url)
     domain = parsed_url.netloc.replace("www.", "")
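
The signature gains an optional `cancel_event` hook, and both return paths become `(status_message, file_path)` pairs because the handler will feed two output components (a Textbox and a File) later in the diff. A minimal sketch of that contract; `scrape_handler` and `output.md` are illustrative, not part of app.py:

```python
def scrape_handler(ok: bool):
    # A handler wired to two outputs must return one value per component,
    # in the same order as its outputs= list; None leaves a component empty.
    if not ok:
        return "No URLs found.", None          # status message only, no download
    return "Scraping completed.", "output.md"  # status plus a path to an existing file
```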
@@ -49,6 +49,8 @@ async def scrape_all_urls(base_url, api_key, limit_rate, progress=gr.Progress())
 
     with open(output_file, 'w', encoding='utf-8') as md_file:
         for i, url in enumerate(progress.tqdm(urls)):
+            if cancel_event and cancel_event.is_set():
+                return "Scraping cancelled.", None
             progress(i / len(urls), f"Scraping {url}")
             markdown_content = await async_scrape_url(app, url)
             md_file.write(f"# {url}\n\n")
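
Checking the event at the top of the loop gives cooperative cancellation: a scrape stops at the next iteration boundary rather than mid-request. A runnable sketch of the pattern, with a `worker` loop standing in for the real per-URL scraping:

```python
import asyncio

async def worker(items, cancel_event: asyncio.Event):
    done = []
    for item in items:
        # Cancellation is only observed here, between iterations, so a
        # long-running single item is never interrupted partway through.
        if cancel_event.is_set():
            return done, "cancelled"
        await asyncio.sleep(0.1)  # stand-in for the real async scrape call
        done.append(item)
    return done, "completed"

async def main():
    cancel = asyncio.Event()
    task = asyncio.create_task(worker(range(50), cancel))
    await asyncio.sleep(0.35)  # let a few iterations complete...
    cancel.set()               # ...then request cancellation
    print(await task)          # roughly ([0, 1, 2], 'cancelled')

asyncio.run(main())
```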
@@ -57,7 +59,7 @@ async def scrape_all_urls(base_url, api_key, limit_rate, progress=gr.Progress())
             if limit_rate and (i + 1) % 10 == 0:
                 time.sleep(60)
 
-    return f"Scraping completed. Output saved to {output_file}"
+    return f"Scraping completed. Output saved to {output_file}", output_file
 
 def count_urls(base_url, api_key):
     if not api_key:
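
One caveat this hunk leaves in place: `time.sleep(60)` inside an `async def` blocks the entire event loop for the minute, so progress updates and the cancel check above cannot run until it returns; `await asyncio.sleep(60)` would pause only this coroutine. A small runnable demo of the difference (all names illustrative):

```python
import asyncio

async def heartbeat():
    # Prints only while the event loop is free to schedule other tasks.
    while True:
        print("loop alive")
        await asyncio.sleep(0.5)

async def rate_limited_work():
    hb = asyncio.create_task(heartbeat())
    for i in range(3):
        print(f"scraped batch {i}")
        # Replacing the next line with time.sleep(2) silences the heartbeat
        # for two seconds at a stretch, because it blocks the whole loop.
        await asyncio.sleep(2)
    hb.cancel()

asyncio.run(rate_limited_work())
```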
@@ -69,12 +71,21 @@ def count_urls(base_url, api_key):
     else:
         return "No URLs found. Please check the base URL or API key."
 
-async def gradio_scrape(base_url, api_key, limit_rate):
+async def gradio_scrape(base_url, api_key, limit_rate, progress=gr.Progress()):
     if not api_key:
-        return "Please enter your Firecrawl API key."
+        return "Please enter your Firecrawl API key.", None
     if not base_url:
-        return "Please enter a base URL to scrape."
-    return await scrape_all_urls(base_url, api_key, limit_rate)
+        return "Please enter a base URL to scrape.", None
+    cancel_event = asyncio.Event()
+    result, file_path = await scrape_all_urls(base_url, api_key, limit_rate, progress, cancel_event)
+    return result, file_path
+
+def cancel_scrape():
+    # This function will be called when the cancel button is clicked
+    global cancel_event
+    if cancel_event:
+        cancel_event.set()
+    return "Cancelling scrape operation..."
 
 with gr.Blocks() as iface:
     gr.Markdown("# Docs Scraper")
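
As wired in this hunk, the cancel button most likely cannot stop a running scrape: `gradio_scrape` creates a fresh local `asyncio.Event` on every call, while `cancel_scrape` sets the separate module-level event created under `__main__` at the end of the diff. One way to share a single event between both handlers, sketched with a stub in place of the real `scrape_all_urls`:

```python
import asyncio

# One module-level event visible to both handlers. In the diff as written,
# the event the scraper polls and the event the cancel button sets are two
# different objects, so setting one never stops the other.
cancel_event = asyncio.Event()

async def scrape_all_urls(urls, cancel_event):
    # Stub standing in for app.py's real scraping loop.
    for url in urls:
        if cancel_event.is_set():
            return "Scraping cancelled.", None
        await asyncio.sleep(0.1)
    return "Scraping completed.", "output.md"

async def gradio_scrape(urls):
    cancel_event.clear()  # reset any cancellation left over from a previous run
    return await scrape_all_urls(urls, cancel_event)

def cancel_scrape():
    cancel_event.set()    # observed by the loop on its next iteration
    return "Cancelling scrape operation..."
```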
@@ -102,8 +113,11 @@ with gr.Blocks() as iface:
 
     with gr.Row():
         scrape_button = gr.Button("Scrape URLs")
+        cancel_button = gr.Button("Cancel Scrape")
     output = gr.Textbox(label="Output", elem_id="output_textbox")
 
+    file_output = gr.File(label="Download Scraped Content")
+
     gr.Markdown("""
     #### Note:
     The free tier of the Firecrawl API allows for 500 credits per month.
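
Filling the new `gr.File` component only requires the bound handler to return a path to an existing file, or `None` to leave it empty. A self-contained sketch of the download wiring; the component and function names are illustrative, not app.py's:

```python
import tempfile
import gradio as gr

def make_report(text):
    # gr.File serves whatever existing path the handler returns, so the
    # content must be written to disk before the path is handed back.
    with tempfile.NamedTemporaryFile("w", suffix=".md", delete=False) as f:
        f.write(f"# Report\n\n{text}\n")
        path = f.name
    return f"Saved to {path}", path

with gr.Blocks() as demo:
    content = gr.Textbox(label="Content")
    run_button = gr.Button("Make report")
    status = gr.Textbox(label="Output")
    download = gr.File(label="Download")
    run_button.click(make_report, inputs=[content], outputs=[status, download])

if __name__ == "__main__":
    demo.launch()
```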
@@ -111,7 +125,10 @@ with gr.Blocks() as iface:
     """)
 
     count_button.click(count_urls, inputs=[base_url, api_key], outputs=[url_count])
-    scrape_button.click(gradio_scrape, inputs=[base_url, api_key, limit_rate], outputs=[output])
+    scrape_button.click(gradio_scrape, inputs=[base_url, api_key, limit_rate], outputs=[output, file_output])
+    cancel_button.click(cancel_scrape, outputs=[output])
 
 if __name__ == "__main__":
+    global cancel_event
+    cancel_event = asyncio.Event()
     iface.launch()
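
Two small notes on this final hunk: the `if __name__ == "__main__":` block already executes at module (global) scope, so `global cancel_event` is legal there but has no effect, and the new `asyncio.Event()` calls assume an `import asyncio` at the top of app.py, which the diff does not show. For contrast:

```python
import asyncio

cancel_event = None  # bound at import time so every handler can see the name

def reset_event():
    # Inside a function, 'global' is required to rebind the module name...
    global cancel_event
    cancel_event = asyncio.Event()

if __name__ == "__main__":
    # ...but this block already runs at module scope, so a bare assignment
    # suffices; a 'global' statement here is a no-op.
    cancel_event = asyncio.Event()
```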
 