# webspider / app.py

import streamlit as st
import os
import subprocess
import json
import re

def clean_string_for_filename(s):
    """Cleans a string to make it safe for use as a filename."""
    s = re.sub(r"[^\w\s-]", "", s)  # Drop anything that is not a word char, whitespace, or hyphen
    s = re.sub(r"\s+", "_", s)  # Collapse whitespace runs into single underscores
    return s.strip("_")
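
# For reference: the default URL "https://bsenst.github.io/toscrape/app-website/"
# cleans to "httpsbsenstgithubiotoscrapeapp-website", so its results land in
# "output_httpsbsenstgithubiotoscrapeapp-website.json".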

def check_scraping_status(log_file="scraping_status.log"):
    try:
        with open(log_file, "r") as file:
            lines = file.readlines()
        if not lines:
            return "Scraping not run yet"
        return lines[-1].strip()  # Strip the newline so the equality checks below work
    except FileNotFoundError:
        return "Scraping not run yet"  # No log file means scraping has never started

def run_scraping(url, depth_limit, pagecount_limit):
    # Derive a filesystem-safe output name from the URL
    identifier = clean_string_for_filename(url)
    output_filename = f"output_{identifier}.json"
    # Remove stale output from a previous run; scrapy's -o flag appends to existing files
    if os.path.exists(output_filename):
        os.remove(output_filename)
    # Launch the spider in the background; an argument list avoids shell-quoting issues
    subprocess.Popen([
        "scrapy", "runspider", "homespider.py",
        "-a", f"start_url={url}",
        "-a", f"depth_limit={depth_limit}",
        "-a", f"pagecount_limit={pagecount_limit}",
        "-o", output_filename,
    ])
    st.success(f"Scraping started. Output will be saved to {output_filename}.")
    return output_filename
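
# For orientation, a minimal sketch of the homespider.py this command assumes
# (the real spider ships next to this app; only the -a attribute names and the
# "url"/"title" item keys come from this file, the rest is illustrative):
#
#   import scrapy
#
#   class HomeSpider(scrapy.Spider):
#       name = "homespider"
#
#       def __init__(self, start_url=None, depth_limit=2, pagecount_limit=10, *args, **kwargs):
#           super().__init__(*args, **kwargs)
#           self.start_urls = [start_url]
#
#       def parse(self, response):
#           # Each item needs "url" and "title" keys; the status view reads both
#           yield {"url": response.url, "title": response.css("title::text").get()}
#           for href in response.css("a::attr(href)").getall():
#               yield response.follow(href, callback=self.parse)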

# Streamlit interface
st.title("Scraping Tool with URL-based Output File")

col1, col2 = st.columns(2)
with col1:
    depth_limit = st.slider("Depth Limit", min_value=1, max_value=5, value=2, step=1)
with col2:
    pagecount_limit = st.slider("Page Count Limit", min_value=10, max_value=50, value=10, step=10)

url = st.text_input("Enter URL", value="https://bsenst.github.io/toscrape/app-website/")

if st.button("Run Scraping"):
    if check_scraping_status() == "Scraping running":
        st.warning("Scraping in progress...")
    else:
        output_filename = run_scraping(url, depth_limit, pagecount_limit)
if st.button("Status Scraping"):
identifier = clean_string_for_filename(url)
output_file = f"output_{identifier}.json"
if check_scraping_status() == "Scraping running":
st.warning("Scraping is running.")
elif os.path.exists(output_file):
try:
with open(output_file, "r") as f:
scraped_data = json.load(f)
page_count = len(scraped_data)
# Show download button if output file exists
st.download_button(
"Download Scraping Output",
data=json.dumps(scraped_data),
file_name=output_file,
)
# Display number of pages scraped
st.write(f"{page_count} pages scraped:")
# Display scraping results
st.write([(el["url"], el["title"]) for el in scraped_data])
except Exception as e:
st.warning(f"Error with opening {output_file}: {e}")
else:
st.warning("No output file found. Please run the scraping command.")