latterworks committed · verified
Commit 1bf21f5 · 1 Parent(s): d46ba59

Create noaa_incidents.py

Files changed (1)
  1. noaa_incidents.py +422 -0
noaa_incidents.py ADDED
@@ -0,0 +1,422 @@
# noaa_incidents.py

import os
import hashlib
import json
import pickle
import threading
import time
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Optional, Tuple
import logging

import pandas as pd
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

import chromadb
from chromadb.utils import embedding_functions
from chromadb.config import Settings

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Constants
BASE_URL = "https://incidentnews.noaa.gov"
BROWSE_URL = f"{BASE_URL}/browse/date"
OUTPUT_DIR = Path("output")
CACHE_DIR = Path("cache")
MAX_RETRIES = 3
BATCH_SIZE = 500

class NOAAIncidentScraper:
    """
    Scrapes NOAA Incident News data with caching and multi-threading support.
    Optimized for Hugging Face Spaces environment.
    """
    def __init__(self, max_workers: int = 5, cache_dir: str = 'cache'):
        """
        Initialize the scraper with custom configuration.

        Args:
            max_workers (int): Maximum number of concurrent threads
            cache_dir (str): Directory for caching web responses
        """
        self.base_url = BASE_URL
        self.browse_url = BROWSE_URL
        self.headers = {
            'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                           '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
        }
        self.incidents_data = []
        self.data_lock = threading.Lock()
        self.max_workers = max_workers
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(exist_ok=True)
        self.session = requests.Session()
        self.session.headers.update(self.headers)

    def get_cache_path(self, url: str) -> Path:
        """Generate a cache file path for a given URL."""
        url_hash = hashlib.md5(url.encode()).hexdigest()
        return self.cache_dir / f"{url_hash}.cache"

    def get_cached_response(self, url: str) -> Optional[str]:
        """Retrieve cached response for a URL."""
        cache_path = self.get_cache_path(url)
        if cache_path.exists():
            try:
                with cache_path.open('rb') as f:
                    return pickle.load(f)
            except Exception as e:
                logger.warning(f"Error loading cache for {url}: {e}")
                return None
        return None

    def cache_response(self, url: str, content: str):
        """Cache response content for a URL."""
        cache_path = self.get_cache_path(url)
        try:
            with cache_path.open('wb') as f:
                pickle.dump(content, f)
        except Exception as e:
            logger.warning(f"Error caching response for {url}: {e}")

    def fetch_url(self, url: str, use_cache: bool = True) -> Optional[str]:
        """
        Fetch URL content with caching and retry mechanism.

        Args:
            url (str): URL to fetch
            use_cache (bool): Whether to use cached response

        Returns:
            Optional[str]: Response content or None if failed
        """
        if use_cache:
            cached = self.get_cached_response(url)
            if cached:
                return cached

        for attempt in range(MAX_RETRIES):
            try:
                response = self.session.get(url, timeout=10)
                response.raise_for_status()
                content = response.text

                if use_cache:
                    self.cache_response(url, content)
                return content

            except requests.RequestException as e:
                wait_time = min(2 ** attempt, 10)  # Exponential backoff with max 10s
                logger.warning(f"Attempt {attempt + 1}/{MAX_RETRIES} failed for {url}: {e}")

                if attempt < MAX_RETRIES - 1:
                    logger.info(f"Retrying in {wait_time} seconds...")
                    time.sleep(wait_time)
                else:
                    logger.error(f"Failed to fetch {url} after {MAX_RETRIES} attempts")

        return None

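    # With MAX_RETRIES = 3, the backoff above sleeps min(2 ** attempt, 10) seconds
    # between attempts: 1 s after the first failure and 2 s after the second,
    # then gives up after the third.
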
    def validate_incident_data(self, incident_data: Dict) -> bool:
        """
        Validate scraped incident data.

        Args:
            incident_data (Dict): Incident data to validate

        Returns:
            bool: True if data is valid
        """
        required_fields = ['title', 'date', 'location']
        return all(incident_data.get(field) for field in required_fields)

    def get_total_pages(self) -> int:
        """Get the total number of pages to scrape."""
        content = self.fetch_url(self.browse_url)
        if not content:
            return 0

        try:
            soup = BeautifulSoup(content, 'html.parser')
            pagination = soup.find('ul', class_='pagination')
            if pagination:
                last_page = int(pagination.find_all('li')[-2].text)
                return last_page
        except Exception as e:
            logger.error(f"Error determining total pages: {e}")

        return 1

    def scrape_incident_page(self, incident_url: str, use_cache: bool = True) -> Optional[Dict]:
        """
        Scrape detailed information from an individual incident page.

        Args:
            incident_url (str): URL of the incident page
            use_cache (bool): Whether to use cached response

        Returns:
            Optional[Dict]: Incident data or None if failed
        """
        try:
            full_url = f"{self.base_url}{incident_url}"
            content = self.fetch_url(full_url, use_cache)
            if not content:
                return None

            soup = BeautifulSoup(content, 'html.parser')
            incident_details = {}

            # Extract Title
            title = soup.find('h1')
            incident_details['title'] = title.text.strip() if title else None

            # Extract Initial Notification
            initial_notification = soup.find('p', class_='sub')
            incident_details['initial_notification'] = (
                initial_notification.text.strip() if initial_notification else None
            )

            # Extract Location and Date
            location_date = soup.find('p', class_='')
            if location_date:
                location = location_date.find('span', class_='glyphicon-map-marker')
                if location and location.next_sibling:
                    incident_details['location'] = location.next_sibling.strip()

                date = location_date.find('span', class_='incident-date')
                if date:
                    incident_details['date'] = date.text.strip()

            # Extract Additional Details from Table
            details_table = soup.find('table', class_='table')
            if details_table:
                for row in details_table.find_all('tr'):
                    cols = row.find_all('td')
                    if len(cols) == 2:
                        key = cols[0].text.strip().rstrip(':').lower().replace(' ', '_')
                        value = cols[1].text.strip()
                        incident_details[key] = value

            if not self.validate_incident_data(incident_details):
                logger.warning(f"Invalid incident data for {incident_url}")
                return None

            return incident_details

        except Exception as e:
            logger.error(f"Error scraping incident page {incident_url}: {e}")
            return None

    def scrape_listing_page(self, page_number: int, use_cache: bool = True) -> List[str]:
        """
        Scrape a single listing page of incidents.

        Args:
            page_number (int): Page number to scrape
            use_cache (bool): Whether to use cached response

        Returns:
            List[str]: List of incident URLs
        """
        url = f"{self.browse_url}?page={page_number}"
        try:
            content = self.fetch_url(url, use_cache)
            if not content:
                return []

            soup = BeautifulSoup(content, 'html.parser')
            incidents_table = soup.find('table', class_='incident-links')
            if not incidents_table:
                logger.warning(f"No incidents table found on page {page_number}")
                return []

            incident_urls = []
            for row in incidents_table.find_all('tr')[1:]:  # Skip header row
                cols = row.find_all('td')
                if len(cols) >= 3:
                    incident_link = cols[0].find('a')
                    if incident_link and 'href' in incident_link.attrs:
                        incident_urls.append(incident_link['href'])

            return incident_urls

        except Exception as e:
            logger.error(f"Error scraping listing page {page_number}: {e}")
            return []

    def process_incident(self, incident_url: str, use_cache: bool = True) -> Optional[str]:
        """Process a single incident URL with thread safety."""
        incident_data = self.scrape_incident_page(incident_url, use_cache)
        if incident_data:
            with self.data_lock:
                self.incidents_data.append(incident_data)
            return incident_url
        return None

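    # process_incident is the unit of work submitted to the ThreadPoolExecutor in
    # run(); data_lock serializes appends to the shared incidents_data list.
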
    def save_data(self) -> Tuple[Optional[str], Optional[str]]:
        """
        Save the scraped data to CSV and JSON files.

        Returns:
            Tuple[Optional[str], Optional[str]]: Paths to saved CSV and JSON files,
            or (None, None) if saving failed
        """
        OUTPUT_DIR.mkdir(exist_ok=True)
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

        try:
            # Save to CSV
            csv_filename = OUTPUT_DIR / f'noaa_incidents_{timestamp}.csv'
            df = pd.DataFrame(self.incidents_data)
            df.to_csv(csv_filename, index=False)

            # Save to JSON
            json_filename = OUTPUT_DIR / f'noaa_incidents_{timestamp}.json'
            with open(json_filename, 'w', encoding='utf-8') as f:
                json.dump(self.incidents_data, f, indent=4)

            logger.info(f"Data saved to {csv_filename} and {json_filename}")
            return str(csv_filename), str(json_filename)

        except Exception as e:
            logger.error(f"Error saving data: {e}")
            return None, None

    def run(self, validate_first: bool = True) -> Tuple[Optional[str], Optional[str]]:
        """
        Run the complete scraping process.

        Args:
            validate_first (bool): Whether to validate the first page before the full scrape

        Returns:
            Tuple[Optional[str], Optional[str]]: Paths to saved CSV and JSON files
        """
        logger.info("Starting NOAA IncidentNews scraper...")

        if validate_first:
            logger.info("Performing initial validation run...")
            test_page = self.scrape_listing_page(1, use_cache=False)
            if not test_page:
                logger.error("Unable to scrape the first page. Aborting.")
                return None, None

            test_incident = self.scrape_incident_page(test_page[0], use_cache=False)
            if not test_incident:
                logger.error("Unable to scrape test incident. Aborting.")
                return None, None

            logger.info("Validation successful, proceeding with full scrape...")

        total_pages = self.get_total_pages()
        logger.info(f"Found {total_pages} pages to scrape")

        # Collect incident URLs
        all_incident_urls = []
        logger.info("Collecting incident URLs...")
        with tqdm(total=total_pages, desc="Collecting URLs") as pbar:
            for page in range(1, total_pages + 1):
                urls = self.scrape_listing_page(page)
                all_incident_urls.extend(urls)
                pbar.update(1)

        total_incidents = len(all_incident_urls)
        logger.info(f"Found {total_incidents} incidents to process")

        if total_incidents == 0:
            logger.error("No incidents found to process")
            return None, None

        # Process incidents
        logger.info("Processing incidents...")
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            futures = [
                executor.submit(self.process_incident, url)
                for url in all_incident_urls
            ]

            with tqdm(total=total_incidents, desc="Scraping incidents") as pbar:
                for future in as_completed(futures):
                    try:
                        future.result()
                        pbar.update(1)
                    except Exception as e:
                        logger.error(f"Error processing incident: {e}")

        logger.info(f"Scraped {len(self.incidents_data)} incidents")

        if self.incidents_data:
            return self.save_data()
        else:
            logger.error("No incident data scraped")
            return None, None
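
# A minimal usage sketch (illustrative; a full run hits the live incidentnews.noaa.gov
# site and writes timestamped CSV/JSON files under output/):
#
#     scraper = NOAAIncidentScraper(max_workers=5)
#     csv_path, json_path = scraper.run(validate_first=True)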


class NOAAIncidentDB:
    """
    Manages NOAA incident data using ChromaDB for vector storage and search.
    """
    def __init__(self,
                 persist_directory: str = "noaa_db",
                 embedding_model: str = "all-MiniLM-L6-v2"):
        """
        Initialize the database with ChromaDB settings.

        Args:
            persist_directory (str): Directory for ChromaDB storage
            embedding_model (str): Model name for embeddings
        """
        self.persist_directory = persist_directory
        os.makedirs(self.persist_directory, exist_ok=True)

        # Initialize ChromaDB client
        self.client = chromadb.Client(Settings(
            persist_directory=self.persist_directory,
            anonymized_telemetry=False
        ))

        # Setup embedding function
        self.embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name=embedding_model
        )

        # Get or create collection
        self.collection = self.client.get_or_create_collection(
            name="noaa_incidents",
            embedding_function=self.embedding_function,
            metadata={"description": "NOAA incident reports database"}
        )

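    # Illustrative query sketch (assumes incidents have already been loaded into the
    # collection; uses ChromaDB's standard query API):
    #
    #     db = NOAAIncidentDB()
    #     hits = db.collection.query(query_texts=["oil spill near Galveston"], n_results=5)
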
    def load_incidents(self, csv_path: str) -> int:
        """
        Load incidents from CSV into ChromaDB.

        Args:
            csv_path (str): Path to CSV file

        Returns:
            int: Number of incidents loaded
        """
        logger.info(f"Loading incidents from {csv_path}")

        try:
            df = pd.read_csv(csv_path, dtype=str)
            df = df.where(pd.notna(df), None)

            documents = []
            metadatas = []
            ids = []

            for idx, row in df.iterrows():
                # Generate unique ID
                unique_string = f"{