Update crawl_website.py
crawl_website.py  CHANGED  (+114 -74)
@@ -1,88 +1,128 @@
-import
-from scrapy.crawler import CrawlerProcess
-from datetime import datetime
-import json
 from bs4 import BeautifulSoup
-import

-class
-    name = 'sletcher'
-    start_urls = ['https://sletchersystems.com']

     def __init__(self):
-        self.
         }

     def clean_text(self, text):

-    def extract_content(self, response, selector):
-        text = ' '.join(response.css(selector + ' ::text').getall()).strip()
-        return self.clean_text(text)

-    def
-            section_data = {
-                'heading': self.extract_content(section, 'h1, h2, h3'),
-                'content': self.clean_text(' '.join(section.css('::text').getall())),
             }

-        #
-        self.

-        # Extract company info
-        if
-            self.
             }

-        # Follow internal links
-        for href in response.css('a::attr(href)').getall():
-            if href and href.startswith('/'):
-                yield response.follow(href, self.parse)

-def run_crawler():
-    # Initialize crawler process
-    process = CrawlerProcess({
-        'USER_AGENT': 'SletcherSystems Content Crawler',
-        'LOG_LEVEL': 'INFO'
-    })

-    # Save to JSON
-    with open("data/site_content.json", "w") as f:
-        json.dump(final_data, f, indent=2)

 if __name__ == "__main__":
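Only fragments of the removed scrapy-based crawler survive above; the truncated stubs mark lines whose content is lost. Going by what remains (the CrawlerProcess settings, the 'sletcher' spider name, extract_content, and the link-following loop), the deleted file plausibly fit together roughly as in the sketch below. The spider class name, the parse method, and every elided body are hypothetical, not the original code. The new requests-based version follows after it.

# Hypothetical reconstruction of the removed crawler; everything not visible
# in the fragments above (class name, parse method, elided bodies) is a guess.
import scrapy
from scrapy.crawler import CrawlerProcess
import json

class SletcherSpider(scrapy.Spider):  # class name is hypothetical
    name = 'sletcher'
    start_urls = ['https://sletchersystems.com']

    def clean_text(self, text):
        # Collapse whitespace; the original body is elided above.
        return ' '.join(text.split()) if text else ''

    def extract_content(self, response, selector):
        text = ' '.join(response.css(selector + ' ::text').getall()).strip()
        return self.clean_text(text)

    def parse(self, response):  # method name is hypothetical
        # The surviving fragment builds one dict per page section.
        for section in response.css('section'):  # loop selector is a guess
            section_data = {
                'heading': self.extract_content(section, 'h1, h2, h3'),
                'content': self.clean_text(' '.join(section.css('::text').getall())),
            }
            yield section_data

        # Follow internal links
        for href in response.css('a::attr(href)').getall():
            if href and href.startswith('/'):
                yield response.follow(href, self.parse)

def run_crawler():
    # Initialize crawler process
    process = CrawlerProcess({
        'USER_AGENT': 'SletcherSystems Content Crawler',
        'LOG_LEVEL': 'INFO'
    })
    process.crawl(SletcherSpider)
    process.start()
    # The original also dumped a final_data dict to data/site_content.json;
    # how final_data was collected is not recoverable from the fragments.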
+import requests
 from bs4 import BeautifulSoup
+import json
+from datetime import datetime
+import time

+class SletcherScraper:
     def __init__(self):
+        self.base_url = "https://www.sletchersystems.com"
+        self.pages = [
+            "/",
+            "/clients",
+            "/solutions",
+            "/services",
+            "/about"
+        ]
+        self.content = {
+            "timestamp": datetime.now().isoformat(),
+            "pages": {},
+            "company_info": {},
+            "services": [],
+            "solutions": [],
+            "clients": []
         }
+
     def clean_text(self, text):
+        if text:
+            return " ".join(text.strip().split())
+        return ""

+    def scrape_page(self, url_path):
+        full_url = self.base_url + url_path
+        try:
+            response = requests.get(full_url)
+            response.raise_for_status()
+            soup = BeautifulSoup(response.text, 'html.parser')
+
+            page_data = {
+                "url": full_url,
+                "title": self.clean_text(soup.title.string) if soup.title else "",
+                "sections": []
             }
+
+            # Extract main content sections
+            for section in soup.find_all(['section', 'div'], class_=lambda x: x and 'section' in x.lower()):
+                section_data = {
+                    "heading": "",
+                    "content": ""
+                }
+
+                # Get heading
+                heading = section.find(['h1', 'h2', 'h3'])
+                if heading:
+                    section_data["heading"] = self.clean_text(heading.text)
+
+                # Get content paragraphs
+                paragraphs = section.find_all('p')
+                section_data["content"] = "\n".join([
+                    self.clean_text(p.text) for p in paragraphs if self.clean_text(p.text)
+                ])
+
+                if section_data["heading"] or section_data["content"]:
+                    page_data["sections"].append(section_data)
+
+            return page_data
+
+        except Exception as e:
+            print(f"Error scraping {full_url}: {e}")
+            return None
+
+    def extract_specific_content(self):
+        # Extract services
+        if "/services" in self.content["pages"]:
+            services_page = self.content["pages"]["/services"]
+            for section in services_page["sections"]:
+                if section["heading"] and section["content"]:
+                    self.content["services"].append({
+                        "name": section["heading"],
+                        "description": section["content"]
+                    })

+        # Extract solutions
+        if "/solutions" in self.content["pages"]:
+            solutions_page = self.content["pages"]["/solutions"]
+            for section in solutions_page["sections"]:
+                if section["heading"] and section["content"]:
+                    self.content["solutions"].append({
+                        "name": section["heading"],
+                        "description": section["content"]
+                    })

+        # Extract company info from about page
+        if "/about" in self.content["pages"]:
+            about_page = self.content["pages"]["/about"]
+            self.content["company_info"] = {
+                "name": "SletcherSystems",
+                "description": "\n".join([
+                    section["content"] for section in about_page["sections"]
+                    if section["content"]
+                ])
             }

+    def scrape_all(self):
+        # Scrape each page
+        for page in self.pages:
+            print(f"Scraping {self.base_url}{page}")
+            page_data = self.scrape_page(page)
+            if page_data:
+                self.content["pages"][page] = page_data
+            time.sleep(1)  # Be nice to the server
+
+        # Extract specific content
+        self.extract_specific_content()
+
+        return self.content

+    def save_to_json(self, filename="site_content.json"):
+        with open(filename, "w", encoding="utf-8") as f:
+            json.dump(self.content, f, indent=2, ensure_ascii=False)
+
+def main():
+    scraper = SletcherScraper()
+    content = scraper.scrape_all()
+    scraper.save_to_json("data/site_content.json")
+    print("Scraping completed and saved to data/site_content.json")

 if __name__ == "__main__":
+    main()
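For reference, a minimal, hypothetical way to drive the new scraper (not part of the commit): save_to_json() writes to data/site_content.json, but nothing in the script creates the data/ directory, so it must exist beforehand. The import below assumes the file is named crawl_website.py and sits on the import path.

# Hypothetical usage sketch; not part of the commit.
import os
from crawl_website import SletcherScraper  # assumes crawl_website.py is importable

os.makedirs("data", exist_ok=True)  # the scraper does not create data/ itself
scraper = SletcherScraper()
scraper.scrape_all()                # fetches each page with a 1 s pause between requests
scraper.save_to_json("data/site_content.json")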