SmokeyBandit committed
Commit 7df492c · verified · 1 Parent(s): 583b2a6

Create update_content.py

Files changed (1):
  1. update_content.py +114 -0
update_content.py ADDED
@@ -0,0 +1,114 @@
import requests
from bs4 import BeautifulSoup
import json
from datetime import datetime
from huggingface_hub import HfApi

def clean_text(text):
    if text:
        return " ".join(text.strip().split())
    return ""

def scrape_page(url):
    try:
        response = requests.get(url, timeout=30)  # timeout so a stalled page can't hang the run
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        sections = []
        # Treat any <section> or <div> whose class mentions "section" as a content block
        for section in soup.find_all(['section', 'div'], class_=lambda x: x and 'section' in x.lower()):
            section_data = {"heading": "", "content": ""}

            # Get heading
            heading = section.find(['h1', 'h2', 'h3'])
            if heading:
                section_data["heading"] = clean_text(heading.text)

            # Get content
            paragraphs = section.find_all('p')
            content = "\n".join([clean_text(p.text) for p in paragraphs if clean_text(p.text)])
            if not content:  # If no paragraphs, get all text
                content = clean_text(section.get_text())
            section_data["content"] = content

            if section_data["heading"] or section_data["content"]:
                sections.append(section_data)

        return {
            "title": clean_text(soup.title.string) if soup.title else "",
            "sections": sections
        }
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return {"title": "", "sections": []}

def update_content():
    # Load existing structure
    try:
        with open("data/site_content.json", "r") as f:
            content = json.load(f)
    except Exception as e:
        print(f"Error loading JSON: {e}")
        return
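
    # The loader above assumes data/site_content.json roughly follows this
    # shape (a sketch; the paths and URLs here are placeholders, not confirmed):
    #
    # {
    #     "timestamp": "...",
    #     "pages": {
    #         "/services": {"url": "https://example.com/services"},
    #         "/about": {"url": "https://example.com/about"}
    #     }
    # }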

    # Update timestamp
    content["timestamp"] = datetime.now().isoformat()

    # Scrape each page
    for path in content["pages"]:
        url = content["pages"][path]["url"]
        print(f"Scraping {url}")
        page_data = scrape_page(url)
        content["pages"][path].update(page_data)

    # Extract specific content
    services = []
    solutions = []
    clients = []  # declared for symmetry; no clients page is processed yet

    # Process services
    if "/services" in content["pages"]:
        for section in content["pages"]["/services"]["sections"]:
            if section["heading"] or section["content"]:
                services.append({
                    "name": section["heading"],
                    "description": section["content"]
                })
    content["services"] = services

    # Process solutions
    if "/solutions" in content["pages"]:
        for section in content["pages"]["/solutions"]["sections"]:
            if section["heading"] or section["content"]:
                solutions.append({
                    "name": section["heading"],
                    "description": section["content"]
                })
    content["solutions"] = solutions

    # Process about page for company info
    if "/about" in content["pages"]:
        about_sections = content["pages"]["/about"]["sections"]
        content["company_info"] = {
            "name": "SletcherSystems",
            "description": "\n".join([
                section["content"] for section in about_sections
                if section["content"]
            ])
        }

    # Save updated content
    with open("data/site_content.json", "w") as f:
        json.dump(content, f, indent=2)

    # Upload to Hugging Face Space
    api = HfApi()
    api.upload_file(
        path_or_fileobj="data/site_content.json",
        path_in_repo="data/site_content.json",
        repo_id="SmokeyBandit/SletcherSystems",
        repo_type="space"
    )
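
    # Note: HfApi() resolves credentials from a cached `huggingface-cli login`
    # or the HF_TOKEN environment variable; in CI you may need to pass an
    # explicit token, e.g. HfApi(token=...).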

if __name__ == "__main__":
    update_content()
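
The script expects requests, beautifulsoup4, and huggingface_hub to be installed, and update_content() bails out early if data/site_content.json does not exist yet. For a quick local smoke test, scrape_page can be exercised on its own; a minimal sketch, assuming the module is importable from the working directory (the URL is a placeholder):

    from update_content import scrape_page

    data = scrape_page("https://example.com")  # placeholder URL
    print(data["title"])
    print(len(data["sections"]), "sections found")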