SmokeyBandit committed
Commit 1fc4326 · verified · 1 Parent(s): 4f0f0ed

Update crawl_website.py
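
Replaces the Scrapy-based SletcherSpider, which crawled the site by following internal links from the homepage, with a plain requests + BeautifulSoup scraper that fetches a fixed list of pages (/, /clients, /solutions, /services, /about), extracts headed sections from each, derives the services, solutions, and company-info summaries, and saves everything to data/site_content.json.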

Files changed (1)
  1. crawl_website.py +114 -74
crawl_website.py CHANGED
@@ -1,88 +1,128 @@
-import scrapy
-from scrapy.crawler import CrawlerProcess
-from datetime import datetime
-import json
+import requests
 from bs4 import BeautifulSoup
-import re
+import json
+from datetime import datetime
+import time
 
-class SletcherSpider(scrapy.Spider):
-    name = 'sletcher'
-    start_urls = ['https://sletchersystems.com']
-
+class SletcherScraper:
     def __init__(self):
-        super().__init__()
-        self.structured_content = {
-            'company_info': {},
-            'services': [],
-            'pages': []
+        self.base_url = "https://www.sletchersystems.com"
+        self.pages = [
+            "/",
+            "/clients",
+            "/solutions",
+            "/services",
+            "/about"
+        ]
+        self.content = {
+            "timestamp": datetime.now().isoformat(),
+            "pages": {},
+            "company_info": {},
+            "services": [],
+            "solutions": [],
+            "clients": []
         }
 
     def clean_text(self, text):
-        # Remove extra whitespace and normalize
-        return ' '.join(text.split())
-
-    def extract_content(self, response, selector):
-        text = ' '.join(response.css(selector + ' ::text').getall()).strip()
-        return self.clean_text(text)
+        if text:
+            return " ".join(text.strip().split())
+        return ""
 
-    def parse(self, response):
-        # Extract main content sections
-        page_data = {
-            'url': response.url,
-            'title': self.extract_content(response, 'title'),
-            'main_content': self.extract_content(response, 'main'),
-            'sections': []
-        }
-
-        # Extract content from different sections
-        for section in response.css('section, div[class*="section"]'):
-            section_data = {
-                'heading': self.extract_content(section, 'h1, h2, h3'),
-                'content': self.clean_text(' '.join(section.css('::text').getall())),
+    def scrape_page(self, url_path):
+        full_url = self.base_url + url_path
+        try:
+            response = requests.get(full_url)
+            response.raise_for_status()
+            soup = BeautifulSoup(response.text, 'html.parser')
+
+            page_data = {
+                "url": full_url,
+                "title": self.clean_text(soup.title.string) if soup.title else "",
+                "sections": []
             }
-            if section_data['content'].strip():
-                page_data['sections'].append(section_data)
 
-        # Store page data
-        self.structured_content['pages'].append(page_data)
+            # Extract main content sections
+            for section in soup.find_all(['section', 'div'], class_=lambda x: x and 'section' in x.lower()):
+                section_data = {
+                    "heading": "",
+                    "content": ""
+                }
 
-        # Extract company info if on main page
-        if response.url == self.start_urls[0]:
-            self.structured_content['company_info'] = {
-                'name': 'SletcherSystems',
-                'description': self.extract_content(response, 'main p'),
+                # Get heading
+                heading = section.find(['h1', 'h2', 'h3'])
+                if heading:
+                    section_data["heading"] = self.clean_text(heading.text)
+
+                # Get content paragraphs
+                paragraphs = section.find_all('p')
+                section_data["content"] = "\n".join([
+                    self.clean_text(p.text) for p in paragraphs if self.clean_text(p.text)
+                ])
+
+                if section_data["heading"] or section_data["content"]:
+                    page_data["sections"].append(section_data)
+
+            return page_data
+
+        except Exception as e:
+            print(f"Error scraping {full_url}: {e}")
+            return None
+
+    def extract_specific_content(self):
+        # Extract services
+        if "/services" in self.content["pages"]:
+            services_page = self.content["pages"]["/services"]
+            for section in services_page["sections"]:
+                if section["heading"] and section["content"]:
+                    self.content["services"].append({
+                        "name": section["heading"],
+                        "description": section["content"]
+                    })
+
+        # Extract solutions
+        if "/solutions" in self.content["pages"]:
+            solutions_page = self.content["pages"]["/solutions"]
+            for section in solutions_page["sections"]:
+                if section["heading"] and section["content"]:
+                    self.content["solutions"].append({
+                        "name": section["heading"],
+                        "description": section["content"]
+                    })
+
+        # Extract company info from about page
+        if "/about" in self.content["pages"]:
+            about_page = self.content["pages"]["/about"]
+            self.content["company_info"] = {
+                "name": "SletcherSystems",
+                "description": "\n".join([
+                    section["content"] for section in about_page["sections"]
+                    if section["content"]
+                ])
             }
 
-        # Follow internal links
-        for href in response.css('a::attr(href)').getall():
-            if href and href.startswith('/'):
-                yield response.follow(href, self.parse)
-
-def run_crawler():
-    # Initialize crawler process
-    process = CrawlerProcess({
-        'USER_AGENT': 'SletcherSystems Content Crawler',
-        'LOG_LEVEL': 'INFO'
-    })
-
-    spider = SletcherSpider()
-    process.crawl(spider)
-    process.start()
-
-    # Format final data
-    final_data = {
-        "timestamp": datetime.now().isoformat(),
-        "content": spider.structured_content,
-        "metadata": {
-            "pages_crawled": len(spider.structured_content['pages'])
-        }
-    }
-
-    # Save to JSON
-    with open("data/site_content.json", "w") as f:
-        json.dump(final_data, f, indent=2)
-
-    return final_data
+    def scrape_all(self):
+        # Scrape each page
+        for page in self.pages:
+            print(f"Scraping {self.base_url}{page}")
+            page_data = self.scrape_page(page)
+            if page_data:
+                self.content["pages"][page] = page_data
+            time.sleep(1)  # Be nice to the server
+
+        # Extract specific content
+        self.extract_specific_content()
+
+        return self.content
+
+    def save_to_json(self, filename="site_content.json"):
+        with open(filename, "w", encoding="utf-8") as f:
+            json.dump(self.content, f, indent=2, ensure_ascii=False)
+
+def main():
+    scraper = SletcherScraper()
+    content = scraper.scrape_all()
+    scraper.save_to_json("data/site_content.json")
+    print("Scraping completed and saved to data/site_content.json")
 
 if __name__ == "__main__":
-    run_crawler()
+    main()
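
Two fragile spots in the new script are worth flagging. requests.get(full_url) in scrape_page is called without a timeout, so a single unresponsive page can hang the whole run, and main() writes to data/site_content.json without ensuring the data/ directory exists, which raises FileNotFoundError on a fresh checkout. Below is a minimal sketch of both fixes; the 10-second timeout and the User-Agent header are assumptions, not part of this commit (the UA string merely echoes the USER_AGENT from the removed Scrapy settings).

import json
import os
import requests

def fetch_html(url, timeout=10):
    # Bound how long a dead server can stall the crawl. The timeout value
    # and the User-Agent header are illustrative assumptions; the UA string
    # reuses the one from the removed Scrapy configuration.
    headers = {"User-Agent": "SletcherSystems Content Crawler"}
    resp = requests.get(url, headers=headers, timeout=timeout)
    resp.raise_for_status()
    return resp.text

def save_json(data, filename="data/site_content.json"):
    # Create the parent directory first so the final write cannot fail
    # with FileNotFoundError when data/ does not exist yet.
    parent = os.path.dirname(filename)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

Wiring these in is a two-line change: pass headers and timeout to requests.get in scrape_page, and add the makedirs guard at the top of save_to_json; the rest of the scraper can stay as committed.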