"""Gradio app that downloads a webpage plus its assets and serves them as a ZIP."""

import os
import tempfile
from io import BytesIO
from urllib.parse import urljoin, urlparse
from zipfile import ZipFile

import gradio as gr
import requests
from bs4 import BeautifulSoup

# Upper bound (seconds) for any single HTTP request so that one stalled
# server cannot hang a worker forever.
REQUEST_TIMEOUT = 30

# Tag name -> attribute that holds the URL of the referenced asset.
_ASSET_ATTRIBUTES = {"img": "src", "link": "href", "script": "src"}


def download_file(url, session, timeout=REQUEST_TIMEOUT):
    """Download a file and return its content.

    Args:
        url: Absolute URL to fetch.
        session: ``requests.Session`` used to perform the request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as ``bytes``, or ``None`` if the request failed
        (the error is printed, not raised, so one bad asset does not abort
        the whole download).
    """
    try:
        response = session.get(url, timeout=timeout)
        response.raise_for_status()
        return response.content
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {url}: {e}")
        return None


def _collect_asset_refs(soup):
    """Return the unique asset references of a parsed page, in document order."""
    refs = []
    for tag in soup.find_all(list(_ASSET_ATTRIBUTES)):
        ref = tag.get(_ASSET_ATTRIBUTES[tag.name])
        if ref:
            refs.append(ref)
    # dict.fromkeys de-duplicates while preserving first-seen order, so an
    # asset referenced many times is only downloaded once.
    return list(dict.fromkeys(refs))


def _resolve_asset_path(root_dir, asset_url):
    """Map an asset URL to a file path inside ``root_dir``.

    Returns ``None`` when the URL is not HTTP(S), names a directory (empty
    path or trailing slash), or would resolve outside ``root_dir``.
    """
    parsed = urlparse(asset_url)
    if parsed.scheme not in ("http", "https"):
        # data:, javascript:, mailto:, ... cannot be fetched as files.
        return None

    relative = parsed.path.lstrip("/")
    if not relative or relative.endswith("/"):
        return None

    candidate = os.path.realpath(os.path.join(root_dir, relative))
    try:
        inside = os.path.commonpath([root_dir, candidate]) == root_dir
    except ValueError:
        # Raised e.g. on Windows when the paths are on different drives.
        inside = False
    # urljoin() leaves ".." segments untouched in absolute URLs, so a hostile
    # page could otherwise make us write outside the working directory.
    return candidate if inside else None


def save_webpage_as_zip(url):
    """Save a webpage and its assets as a ZIP file.

    The page is stored as ``index.html``; every ``<img src>``, ``<link href>``
    and ``<script src>`` asset is stored under the path component of its URL.
    Assets that cannot be downloaded or stored are skipped with a printed
    message.

    Args:
        url: URL of the page to archive.

    Returns:
        A ``BytesIO`` positioned at offset 0 containing the ZIP archive.

    Raises:
        requests.exceptions.RequestException: If the main page itself cannot
            be fetched.
    """
    with requests.Session() as session:
        response = session.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # A unique, auto-removed working directory: concurrent requests do not
        # share files and nothing is left behind when an error occurs.
        with tempfile.TemporaryDirectory(prefix="webpage_") as temp_dir:
            # realpath so containment checks are not fooled by a symlinked
            # temp location (e.g. /var -> /private/var on macOS).
            root_dir = os.path.realpath(temp_dir)

            main_html_path = os.path.join(root_dir, 'index.html')
            with open(main_html_path, 'wb') as f:
                f.write(response.content)

            for asset in _collect_asset_refs(soup):
                try:
                    # Resolve against the final URL so relative references
                    # stay correct after redirects.
                    asset_url = urljoin(response.url, asset)
                    asset_full_path = _resolve_asset_path(root_dir, asset_url)
                except ValueError as e:
                    print(f"Skipping malformed asset reference {asset!r}: {e}")
                    continue

                if asset_full_path is None:
                    print(f"Skipping asset {asset_url}")
                    continue
                if asset_full_path == main_html_path:
                    # Never let an asset overwrite the saved page itself.
                    continue

                content = download_file(asset_url, session)
                if not content:
                    continue

                try:
                    os.makedirs(os.path.dirname(asset_full_path), exist_ok=True)
                    if os.path.isdir(asset_full_path):
                        print(f"Skipping directory {asset_full_path}")
                        continue
                    with open(asset_full_path, 'wb') as f:
                        f.write(content)
                except OSError as e:
                    # E.g. one asset is a file named "a" and another lives
                    # under "a/..."; skip it rather than abort the archive.
                    print(f"Error saving {asset_url}: {e}")

            zip_buffer = BytesIO()
            with ZipFile(zip_buffer, 'w') as zipf:
                for root, _, files in os.walk(root_dir):
                    for file in files:
                        file_path = os.path.join(root, file)
                        zipf.write(file_path, os.path.relpath(file_path, root_dir))

    zip_buffer.seek(0)
    return zip_buffer


def generate_zip_file(url):
    """Generate ZIP file from a webpage URL.

    Args:
        url: URL of the page to archive.

    Returns:
        Filesystem path of the written ``webpage.zip``.
    """
    zip_buffer = save_webpage_as_zip(url)
    # Each call gets its own directory so simultaneous requests cannot
    # overwrite each other's archive; the file keeps the name "webpage.zip".
    output_dir = tempfile.mkdtemp(prefix="webpage_zip_")
    temp_zip_path = os.path.join(output_dir, "webpage.zip")
    with open(temp_zip_path, 'wb') as f:
        f.write(zip_buffer.getvalue())
    return temp_zip_path


examples = [
    "https://www.bmw.com/en/index.html",
    "https://www.ferrari.com/en-EN",
    "https://streamlit.io/"
]

DESCRIPTION = """
## Webpage to ZIP Downloader 🔗
"""

with gr.Blocks(theme="bethecloud/storj_theme") as demo:
    gr.Markdown(DESCRIPTION)
    gr.Markdown("Enter a URL to download the webpage and its assets as a ZIP file.")

    url_input = gr.Textbox(label="Website URL", placeholder="Enter a URL (e.g., https://www.example.com)")
    download_button = gr.Button("Download as ZIP")
    output_file = gr.File(label="Download")

    # NOTE(review): this helper is not wired to any event in this file;
    # gr.Examples below is what fills the textbox. Kept for compatibility.
    def set_example_url(url):
        url_input.value = url

    download_button.click(fn=generate_zip_file, inputs=url_input, outputs=output_file)

    gr.Examples(
        examples=examples,
        inputs=url_input,
        outputs=output_file,
        fn=generate_zip_file
    )

# Only start the server when run as a script, so importing this module
# (e.g. to reuse save_webpage_as_zip) has no side effects.
if __name__ == "__main__":
    demo.launch()