SmokeyBandit committed
Create update_content.py
update_content.py +114 -0
update_content.py
ADDED
@@ -0,0 +1,114 @@
import requests
from bs4 import BeautifulSoup
import json
from datetime import datetime
from huggingface_hub import HfApi

def clean_text(text):
    # Collapse runs of whitespace into single spaces; tolerate None.
    if text:
        return " ".join(text.strip().split())
    return ""

def scrape_page(url):
    try:
        response = requests.get(url, timeout=30)  # timeout added so a dead page can't hang the run
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        sections = []
        # Treat any <section> or <div> whose class mentions "section" as a content block.
        for section in soup.find_all(['section', 'div'], class_=lambda x: x and 'section' in x.lower()):
            section_data = {"heading": "", "content": ""}

            # Get heading
            heading = section.find(['h1', 'h2', 'h3'])
            if heading:
                section_data["heading"] = clean_text(heading.text)

            # Get content
            paragraphs = section.find_all('p')
            content = "\n".join([clean_text(p.text) for p in paragraphs if clean_text(p.text)])
            if not content:  # If no paragraphs, get all text
                content = clean_text(section.get_text())
            section_data["content"] = content

            if section_data["heading"] or section_data["content"]:
                sections.append(section_data)

        return {
            "title": clean_text(soup.title.string) if soup.title else "",
            "sections": sections
        }
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return {"title": "", "sections": []}

def update_content():
    # Load existing structure
    try:
        with open("data/site_content.json", "r") as f:
            content = json.load(f)
    except Exception as e:
        print(f"Error loading JSON: {e}")
        return

    # Update timestamp
    content["timestamp"] = datetime.now().isoformat()

    # Scrape each page
    for path in content["pages"]:
        url = content["pages"][path]["url"]
        print(f"Scraping {url}")
        page_data = scrape_page(url)
        content["pages"][path].update(page_data)

    # Extract specific content
    services = []
    solutions = []
    clients = []  # reserved for parity with services/solutions; not populated yet

    # Process services
    if "/services" in content["pages"]:
        for section in content["pages"]["/services"]["sections"]:
            if section["heading"] or section["content"]:
                services.append({
                    "name": section["heading"],
                    "description": section["content"]
                })
        content["services"] = services

    # Process solutions
    if "/solutions" in content["pages"]:
        for section in content["pages"]["/solutions"]["sections"]:
            if section["heading"] or section["content"]:
                solutions.append({
                    "name": section["heading"],
                    "description": section["content"]
                })
        content["solutions"] = solutions

    # Process about page for company info
    if "/about" in content["pages"]:
        about_sections = content["pages"]["/about"]["sections"]
        content["company_info"] = {
            "name": "SletcherSystems",
            "description": "\n".join([
                section["content"] for section in about_sections
                if section["content"]
            ])
        }

    # Save updated content
    with open("data/site_content.json", "w") as f:
        json.dump(content, f, indent=2)

    # Upload to Hugging Face Space
    api = HfApi()
    api.upload_file(
        path_or_fileobj="data/site_content.json",
        path_in_repo="data/site_content.json",
        repo_id="SmokeyBandit/SletcherSystems",
        repo_type="space"
    )

if __name__ == "__main__":
    update_content()
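For context, update_content() only runs if data/site_content.json already exists with a "pages" map whose entries each carry a "url"; otherwise it prints an error and returns. Below is a minimal sketch of a seed-file writer, inferred from the fields the script reads. The page paths mirror the ones handled above, but the URLs are placeholders, not values taken from this commit:

import json
from pathlib import Path

# Minimal seed structure inferred from what update_content() reads:
# every entry under "pages" needs at least a "url"; the scraper fills
# in "title" and "sections" on the next run. URLs here are placeholders.
seed = {
    "timestamp": "",
    "pages": {
        "/": {"url": "https://example.com/"},
        "/services": {"url": "https://example.com/services"},
        "/solutions": {"url": "https://example.com/solutions"},
        "/about": {"url": "https://example.com/about"},
    },
}

Path("data").mkdir(exist_ok=True)
with open("data/site_content.json", "w") as f:
    json.dump(seed, f, indent=2)

Note that the final upload step needs write access to the Space: HfApi resolves a token from a prior huggingface-cli login or from the HF_TOKEN environment variable, so one of those must be in place before the script runs.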