# webspider / app.py

import streamlit as st
import os
import subprocess
import json
import re

def clean_string_for_filename(s):
    """Cleans a string to make it safe for use as a filename."""
    s = re.sub(r"[^\w\s-]", "", s)  # Drop anything that is not a word char, whitespace, or hyphen
    s = re.sub(r"\s+", "_", s)  # Collapse whitespace runs into single underscores
    return s.strip("_")
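
# For reference: the default URL "https://bsenst.github.io/toscrape/app-website/"
# cleans to "httpsbsenstgithubiotoscrapeapp-website", so its results land in
# "output_httpsbsenstgithubiotoscrapeapp-website.json".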

def check_scraping_status(log_file="scraping_status.log"):
    try:
        with open(log_file, "r") as file:
            lines = file.readlines()
        if not lines:
            return "Scraping not run yet"
        return lines[-1].strip()  # Strip the newline so the equality checks below work
    except FileNotFoundError:
        return "Scraping not run yet"  # No log file means scraping has never started

def run_scraping(url, depth_limit, pagecount_limit):
    # Derive a filesystem-safe output name from the URL
    identifier = clean_string_for_filename(url)
    output_filename = f"output_{identifier}.json"
    # Remove stale output from a previous run; scrapy's -o flag appends to existing files
    if os.path.exists(output_filename):
        os.remove(output_filename)
    # Launch the spider in the background; an argument list avoids shell-quoting issues
    subprocess.Popen([
        "scrapy", "runspider", "homespider.py",
        "-a", f"start_url={url}",
        "-a", f"depth_limit={depth_limit}",
        "-a", f"pagecount_limit={pagecount_limit}",
        "-o", output_filename,
    ])
    st.success(f"Scraping started. Output will be saved to {output_filename}.")
    return output_filename
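
# For orientation, a minimal sketch of the homespider.py this command assumes
# (the real spider ships next to this app; only the -a attribute names and the
# "url"/"title" item keys come from this file, the rest is illustrative):
#
#   import scrapy
#
#   class HomeSpider(scrapy.Spider):
#       name = "homespider"
#
#       def __init__(self, start_url=None, depth_limit=2, pagecount_limit=10, *args, **kwargs):
#           super().__init__(*args, **kwargs)
#           self.start_urls = [start_url]
#
#       def parse(self, response):
#           # Each item needs "url" and "title" keys; the status view reads both
#           yield {"url": response.url, "title": response.css("title::text").get()}
#           for href in response.css("a::attr(href)").getall():
#               yield response.follow(href, callback=self.parse)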

# Streamlit interface
st.title("Scraping Tool with URL-based Output File")

col1, col2 = st.columns(2)
with col1:
    depth_limit = st.slider("Depth Limit", min_value=1, max_value=5, value=2, step=1)
with col2:
    pagecount_limit = st.slider("Page Count Limit", min_value=10, max_value=50, value=10, step=10)

url = st.text_input("Enter URL", value="https://bsenst.github.io/toscrape/app-website/")

if st.button("Run Scraping"):
    if check_scraping_status() == "Scraping running":
        st.warning("Scraping in progress...")
    else:
        output_filename = run_scraping(url, depth_limit, pagecount_limit)
if st.button("Status Scraping"):
identifier = clean_string_for_filename(url)
output_file = f"output_{identifier}.json"
if check_scraping_status() == "Scraping running":
st.warning("Scraping is running.")
elif os.path.exists(output_file):
try:
with open(output_file, "r") as f:
scraped_data = json.load(f)
page_count = len(scraped_data)
# Show download button if output file exists
st.download_button(
"Download Scraping Output",
data=json.dumps(scraped_data),
file_name=output_file,
)
# Display number of pages scraped
st.write(f"{page_count} pages scraped:")
# Display scraping results
st.write([(el["url"], el["title"]) for el in scraped_data])
except Exception as e:
st.warning(f"Error with opening {output_file}: {e}")
else:
st.warning("No output file found. Please run the scraping command.")