latterworks committed: Create noaa_incidents.py
noaa_incidents.py  ADDED  (+422 -0)
@@ -0,0 +1,422 @@
# noaa_incidents.py

import os
import hashlib
import json
import pickle
import threading
import time
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Optional, Tuple
import logging

import pandas as pd
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

import chromadb
from chromadb.utils import embedding_functions
from chromadb.config import Settings

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Constants
BASE_URL = "https://incidentnews.noaa.gov"
BROWSE_URL = f"{BASE_URL}/browse/date"
OUTPUT_DIR = Path("output")
CACHE_DIR = Path("cache")
MAX_RETRIES = 3
BATCH_SIZE = 500
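
# Note: the imports above imply roughly this dependency set (an inferred list,
# not taken from a requirements file in this commit):
#
#   pip install pandas requests beautifulsoup4 tqdm chromadb sentence-transformers
#
# sentence-transformers backs the SentenceTransformerEmbeddingFunction used below.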

class NOAAIncidentScraper:
    """
    Scrapes NOAA Incident News data with caching and multi-threading support.
    Optimized for Hugging Face Spaces environment.
    """
    def __init__(self, max_workers: int = 5, cache_dir: str = 'cache'):
        """
        Initialize the scraper with custom configuration.

        Args:
            max_workers (int): Maximum number of concurrent threads
            cache_dir (str): Directory for caching web responses
        """
        self.base_url = BASE_URL
        self.browse_url = BROWSE_URL
        self.headers = {
            'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                           '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
        }
        self.incidents_data = []
        self.data_lock = threading.Lock()
        self.max_workers = max_workers
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(exist_ok=True)
        self.session = requests.Session()
        self.session.headers.update(self.headers)

    def get_cache_path(self, url: str) -> Path:
        """Generate a cache file path for a given URL."""
        url_hash = hashlib.md5(url.encode()).hexdigest()
        return self.cache_dir / f"{url_hash}.cache"

    def get_cached_response(self, url: str) -> Optional[str]:
        """Retrieve cached response for a URL."""
        cache_path = self.get_cache_path(url)
        if cache_path.exists():
            try:
                with cache_path.open('rb') as f:
                    return pickle.load(f)
            except Exception as e:
                logger.warning(f"Error loading cache for {url}: {e}")
                return None
        return None

    def cache_response(self, url: str, content: str):
        """Cache response content for a URL."""
        cache_path = self.get_cache_path(url)
        try:
            with cache_path.open('wb') as f:
                pickle.dump(content, f)
        except Exception as e:
            logger.warning(f"Error caching response for {url}: {e}")

    def fetch_url(self, url: str, use_cache: bool = True) -> Optional[str]:
        """
        Fetch URL content with caching and retry mechanism.

        Args:
            url (str): URL to fetch
            use_cache (bool): Whether to use cached response

        Returns:
            Optional[str]: Response content or None if failed
        """
        if use_cache:
            cached = self.get_cached_response(url)
            if cached:
                return cached

        for attempt in range(MAX_RETRIES):
            try:
                response = self.session.get(url, timeout=10)
                response.raise_for_status()
                content = response.text

                if use_cache:
                    self.cache_response(url, content)
                return content

            except requests.RequestException as e:
                wait_time = min(2 ** attempt, 10)  # Exponential backoff with max 10s
                logger.warning(f"Attempt {attempt + 1}/{MAX_RETRIES} failed for {url}: {e}")

                if attempt < MAX_RETRIES - 1:
                    logger.info(f"Retrying in {wait_time} seconds...")
                    time.sleep(wait_time)
                else:
                    logger.error(f"Failed to fetch {url} after {MAX_RETRIES} attempts")

        return None
    def validate_incident_data(self, incident_data: Dict) -> bool:
        """
        Validate scraped incident data.

        Args:
            incident_data (Dict): Incident data to validate

        Returns:
            bool: True if data is valid
        """
        required_fields = ['title', 'date', 'location']
        return all(incident_data.get(field) for field in required_fields)

    def get_total_pages(self) -> int:
        """Get the total number of pages to scrape."""
        content = self.fetch_url(self.browse_url)
        if not content:
            return 0

        try:
            soup = BeautifulSoup(content, 'html.parser')
            pagination = soup.find('ul', class_='pagination')
            if pagination:
                last_page = int(pagination.find_all('li')[-2].text)
                return last_page
        except Exception as e:
            logger.error(f"Error determining total pages: {e}")

        return 1

    def scrape_incident_page(self, incident_url: str, use_cache: bool = True) -> Optional[Dict]:
        """
        Scrape detailed information from an individual incident page.

        Args:
            incident_url (str): URL of the incident page
            use_cache (bool): Whether to use cached response

        Returns:
            Optional[Dict]: Incident data or None if failed
        """
        try:
            full_url = f"{self.base_url}{incident_url}"
            content = self.fetch_url(full_url, use_cache)
            if not content:
                return None

            soup = BeautifulSoup(content, 'html.parser')
            incident_details = {}

            # Extract Title
            title = soup.find('h1')
            incident_details['title'] = title.text.strip() if title else None

            # Extract Initial Notification
            initial_notification = soup.find('p', class_='sub')
            incident_details['initial_notification'] = (
                initial_notification.text.strip() if initial_notification else None
            )

            # Extract Location and Date
            location_date = soup.find('p', class_='')
            if location_date:
                location = location_date.find('span', class_='glyphicon-map-marker')
                if location and location.next_sibling:
                    incident_details['location'] = location.next_sibling.strip()

                date = location_date.find('span', class_='incident-date')
                if date:
                    incident_details['date'] = date.text.strip()

            # Extract Additional Details from Table
            details_table = soup.find('table', class_='table')
            if details_table:
                for row in details_table.find_all('tr'):
                    cols = row.find_all('td')
                    if len(cols) == 2:
                        key = cols[0].text.strip().rstrip(':').lower().replace(' ', '_')
                        value = cols[1].text.strip()
                        incident_details[key] = value

            if not self.validate_incident_data(incident_details):
                logger.warning(f"Invalid incident data for {incident_url}")
                return None

            return incident_details

        except Exception as e:
            logger.error(f"Error scraping incident page {incident_url}: {e}")
            return None

    def scrape_listing_page(self, page_number: int, use_cache: bool = True) -> List[str]:
        """
        Scrape a single listing page of incidents.

        Args:
            page_number (int): Page number to scrape
            use_cache (bool): Whether to use cached response

        Returns:
            List[str]: List of incident URLs
        """
        url = f"{self.browse_url}?page={page_number}"
        try:
            content = self.fetch_url(url, use_cache)
            if not content:
                return []

            soup = BeautifulSoup(content, 'html.parser')
            incidents_table = soup.find('table', class_='incident-links')
            if not incidents_table:
                logger.warning(f"No incidents table found on page {page_number}")
                return []

            incident_urls = []
            for row in incidents_table.find_all('tr')[1:]:  # Skip header row
                cols = row.find_all('td')
                if len(cols) >= 3:
                    incident_link = cols[0].find('a')
                    if incident_link and 'href' in incident_link.attrs:
                        incident_urls.append(incident_link['href'])

            return incident_urls

        except Exception as e:
            logger.error(f"Error scraping listing page {page_number}: {e}")
            return []

    def process_incident(self, incident_url: str, use_cache: bool = True) -> Optional[str]:
        """Process a single incident URL with thread safety."""
        incident_data = self.scrape_incident_page(incident_url, use_cache)
        if incident_data:
            with self.data_lock:
                self.incidents_data.append(incident_data)
            return incident_url
        return None

    def save_data(self) -> Tuple[str, str]:
        """
        Save the scraped data to CSV and JSON files.

        Returns:
            Tuple[str, str]: Paths to saved CSV and JSON files
        """
        OUTPUT_DIR.mkdir(exist_ok=True)
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

        try:
            # Save to CSV
            csv_filename = OUTPUT_DIR / f'noaa_incidents_{timestamp}.csv'
            df = pd.DataFrame(self.incidents_data)
            df.to_csv(csv_filename, index=False)

            # Save to JSON
            json_filename = OUTPUT_DIR / f'noaa_incidents_{timestamp}.json'
            with open(json_filename, 'w', encoding='utf-8') as f:
                json.dump(self.incidents_data, f, indent=4)

            logger.info(f"Data saved to {csv_filename} and {json_filename}")
            return str(csv_filename), str(json_filename)

        except Exception as e:
            logger.error(f"Error saving data: {e}")
            return None, None

    def run(self, validate_first: bool = True) -> Tuple[Optional[str], Optional[str]]:
        """
        Run the complete scraping process.

        Args:
            validate_first (bool): Whether to validate first page before full scrape

        Returns:
            Tuple[Optional[str], Optional[str]]: Paths to saved CSV and JSON files
        """
        logger.info("Starting NOAA IncidentNews scraper...")

        if validate_first:
            logger.info("Performing initial validation run...")
            test_page = self.scrape_listing_page(1, use_cache=False)
            if not test_page:
                logger.error("Unable to scrape the first page. Aborting.")
                return None, None

            test_incident = self.scrape_incident_page(test_page[0], use_cache=False)
            if not test_incident:
                logger.error("Unable to scrape test incident. Aborting.")
                return None, None

            logger.info("Validation successful, proceeding with full scrape...")

        total_pages = self.get_total_pages()
        logger.info(f"Found {total_pages} pages to scrape")

        # Collect incident URLs
        all_incident_urls = []
        logger.info("Collecting incident URLs...")
        with tqdm(total=total_pages, desc="Collecting URLs") as pbar:
            for page in range(1, total_pages + 1):
                urls = self.scrape_listing_page(page)
                all_incident_urls.extend(urls)
                pbar.update(1)

        total_incidents = len(all_incident_urls)
        logger.info(f"Found {total_incidents} incidents to process")

        if total_incidents == 0:
            logger.error("No incidents found to process")
            return None, None

        # Process incidents
        logger.info("Processing incidents...")
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            futures = [
                executor.submit(self.process_incident, url)
                for url in all_incident_urls
            ]

            with tqdm(total=total_incidents, desc="Scraping incidents") as pbar:
                for future in as_completed(futures):
                    try:
                        future.result()
                        pbar.update(1)
                    except Exception as e:
                        logger.error(f"Error processing incident: {e}")

        logger.info(f"Scraped {len(self.incidents_data)} incidents")

        if self.incidents_data:
            return self.save_data()
        else:
            logger.error("No incident data scraped")
            return None, None
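
# Usage sketch for the scraper above (illustrative only; the output file names
# are whatever save_data() generates at run time, not fixed paths):
#
#   scraper = NOAAIncidentScraper(max_workers=5, cache_dir="cache")
#   csv_path, json_path = scraper.run(validate_first=True)
#   if csv_path:
#       print(f"Incidents written to {csv_path} and {json_path}")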

class NOAAIncidentDB:
    """
    Manages NOAA incident data using ChromaDB for vector storage and search.
    """
    def __init__(self,
                 persist_directory: str = "noaa_db",
                 embedding_model: str = "all-MiniLM-L6-v2"):
        """
        Initialize the database with ChromaDB settings.

        Args:
            persist_directory (str): Directory for ChromaDB storage
            embedding_model (str): Model name for embeddings
        """
        self.persist_directory = persist_directory
        os.makedirs(self.persist_directory, exist_ok=True)

        # Initialize ChromaDB client
        self.client = chromadb.Client(Settings(
            persist_directory=self.persist_directory,
            anonymized_telemetry=False
        ))

        # Setup embedding function
        self.embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name=embedding_model
        )

        # Get or create collection
        self.collection = self.client.get_or_create_collection(
            name="noaa_incidents",
            embedding_function=self.embedding_function,
            metadata={"description": "NOAA incident reports database"}
        )

    def load_incidents(self, csv_path: str) -> int:
        """
        Load incidents from CSV into ChromaDB.

        Args:
            csv_path (str): Path to CSV file

        Returns:
            int: Number of incidents loaded
        """
        logger.info(f"Loading incidents from {csv_path}")

        try:
            df = pd.read_csv(csv_path, dtype=str)
            df = df.where(pd.notna(df), None)

            documents = []
            metadatas = []
            ids = []

            for idx, row in df.iterrows():
                # Generate unique ID
                unique_string = f"{