"""Scraper that crawls a handful of SletcherSystems pages and saves the
extracted content to JSON."""

import json
import os
import time
from datetime import datetime

import requests
from bs4 import BeautifulSoup


class SletcherScraper:
    def __init__(self):
        self.base_url = "https://www.sletchersystems.com"
        self.pages = ["/", "/clients", "/solutions", "/services", "/about"]
        self.content = {
            "timestamp": datetime.now().isoformat(),
            "pages": {},
            "company_info": {},
            "services": [],
            "solutions": [],
            "clients": [],
        }

    def clean_text(self, text):
        """Collapse runs of whitespace; return "" for None or empty input."""
        if text:
            return " ".join(text.strip().split())
        return ""

    def scrape_page(self, url_path):
        full_url = self.base_url + url_path
        try:
            response = requests.get(full_url, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, "html.parser")
            page_data = {
                "url": full_url,
                "title": self.clean_text(soup.title.string) if soup.title else "",
                "sections": [],
            }

            # Extract main content sections: any <section> or <div> whose
            # class attribute contains "section".
            for section in soup.find_all(
                ["section", "div"],
                class_=lambda x: x and "section" in x.lower(),
            ):
                section_data = {"heading": "", "content": ""}

                # Get the section's first heading, if any.
                heading = section.find(["h1", "h2", "h3"])
                if heading:
                    section_data["heading"] = self.clean_text(heading.text)

                # Get content paragraphs, skipping empty ones.
                paragraphs = section.find_all("p")
                section_data["content"] = "\n".join(
                    self.clean_text(p.text)
                    for p in paragraphs
                    if self.clean_text(p.text)
                )

                if section_data["heading"] or section_data["content"]:
                    page_data["sections"].append(section_data)

            return page_data
        except Exception as e:
            print(f"Error scraping {full_url}: {e}")
            return None

    def extract_specific_content(self):
        # Extract services.
        if "/services" in self.content["pages"]:
            services_page = self.content["pages"]["/services"]
            for section in services_page["sections"]:
                if section["heading"] and section["content"]:
                    self.content["services"].append(
                        {"name": section["heading"], "description": section["content"]}
                    )

        # Extract solutions.
        if "/solutions" in self.content["pages"]:
            solutions_page = self.content["pages"]["/solutions"]
            for section in solutions_page["sections"]:
                if section["heading"] and section["content"]:
                    self.content["solutions"].append(
                        {"name": section["heading"], "description": section["content"]}
                    )

        # Extract company info from the about page.
        if "/about" in self.content["pages"]:
            about_page = self.content["pages"]["/about"]
            self.content["company_info"] = {
                "name": "SletcherSystems",
                "description": "\n".join(
                    section["content"]
                    for section in about_page["sections"]
                    if section["content"]
                ),
            }

    def scrape_all(self):
        # Scrape each page, pausing between requests.
        for page in self.pages:
            print(f"Scraping {self.base_url}{page}")
            page_data = self.scrape_page(page)
            if page_data:
                self.content["pages"][page] = page_data
            time.sleep(1)  # Be nice to the server.

        # Derive the structured fields from the raw page data.
        self.extract_specific_content()
        return self.content

    def save_to_json(self, filename="site_content.json"):
        # Create the output directory if it doesn't exist yet; otherwise
        # open() would fail for a path like "data/site_content.json".
        os.makedirs(os.path.dirname(filename) or ".", exist_ok=True)
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(self.content, f, indent=2, ensure_ascii=False)


def main():
    scraper = SletcherScraper()
    scraper.scrape_all()
    scraper.save_to_json("data/site_content.json")
    print("Scraping completed and saved to data/site_content.json")


if __name__ == "__main__":
    main()
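
# Consuming the saved output (a minimal sketch; it assumes the scrape
# succeeded and that data/site_content.json exists). Kept as comments so
# importing this module stays side-effect free:
#
#     import json
#     with open("data/site_content.json", encoding="utf-8") as f:
#         content = json.load(f)
#     for service in content["services"]:
#         print(service["name"], "-", service["description"][:80])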