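# Streamlit front end for a Scrapy crawl: launches homespider.py in the
# background and reads crawl progress from scraping_status.log.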
import json
import os
import re
import subprocess

import streamlit as st

def clean_string_for_filename(s):
    """Clean a string so it is safe to use as a filename."""
    s = re.sub(r"[^\w\s-]", "", s)  # Drop characters that are unsafe in filenames
    s = re.sub(r"\s+", "_", s)      # Replace whitespace runs with underscores
    return s.strip("_")
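
# Example: clean_string_for_filename("https://bsenst.github.io/toscrape/app-website/")
# returns "httpsbsenstgithubiotoscrapeapp-website"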

def check_scraping_status(log_file="scraping_status.log"):
    """Return the last line of the status log, or a default if none exists."""
    try:
        with open(log_file, "r") as file:
            lines = file.readlines()
        # Strip the trailing newline so the status compares cleanly below
        return lines[-1].strip() if lines else "Scraping not run yet"
    except FileNotFoundError:
        return "Scraping not run yet"  # No log file yet; no crawl has started

def run_scraping(url, depth_limit, pagecount_limit):
    # Derive a safe output filename from the URL
    identifier = clean_string_for_filename(url)
    output_filename = f"output_{identifier}.json"
    # Remove stale output so the new crawl starts from a clean file
    if os.path.exists(output_filename):
        os.remove(output_filename)
    # Launch the spider in the background; an argument list avoids shell-quoting
    # problems with URLs containing characters such as & or ?
    subprocess.Popen(
        [
            "scrapy", "runspider", "homespider.py",
            "-a", f"start_url={url}",
            "-a", f"depth_limit={depth_limit}",
            "-a", f"pagecount_limit={pagecount_limit}",
            "-o", output_filename,
        ]
    )
    st.success(f"Scraping started. Output will be saved to {output_filename}.")
    return output_filename

# Streamlit interface
st.title("Scraping Tool with URL-based Output File")

col1, col2 = st.columns(2)
with col1:
    depth_limit = st.slider("Depth Limit", min_value=1, value=2, max_value=5, step=1)
with col2:
    pagecount_limit = st.slider("Page Count Limit", min_value=10, value=10, max_value=50, step=10)

url = st.text_input("Enter URL", value="https://bsenst.github.io/toscrape/app-website/")
if st.button("Run Scraping"): | |
if check_scraping_status() == "Scraping running": | |
st.warning("Scraping in progress...") | |
else: | |
output_filename = run_scraping(url, depth_limit, pagecount_limit) | |
if st.button("Status Scraping"): | |
identifier = clean_string_for_filename(url) | |
output_file = f"output_{identifier}.json" | |
if check_scraping_status() == "Scraping running": | |
st.warning("Scraping is running.") | |
elif os.path.exists(output_file): | |
try: | |
with open(output_file, "r") as f: | |
scraped_data = json.load(f) | |
page_count = len(scraped_data) | |
# Show download button if output file exists | |
st.download_button( | |
"Download Scraping Output", | |
data=json.dumps(scraped_data), | |
file_name=output_file, | |
) | |
# Display number of pages scraped | |
st.write(f"{page_count} pages scraped:") | |
# Display scraping results | |
st.write([(el["url"], el["title"]) for el in scraped_data]) | |
except Exception as e: | |
st.warning(f"Error with opening {output_file}: {e}") | |
else: | |
st.warning("No output file found. Please run the scraping command.") | |