import os
import time
from random import randint

import pandas as pd
import requests
from bs4 import BeautifulSoup


def scrape_tariffs(urls):
    data = []
    # Ensure the 'data' directory exists before saving the CSV
    os.makedirs("data", exist_ok=True)

    for url in urls:
        try:
            response = requests.get(url, timeout=10)  # Timeout guards against hanging requests
            response.raise_for_status()  # Raise an exception for bad status codes (4xx, 5xx)
            soup = BeautifulSoup(response.content, "html.parser")
            # Extract category/rate pairs from every table row with at least two cells
            for row in soup.find_all("tr"):
                cells = row.find_all("td")
                if len(cells) >= 2:
                    try:
                        data.append({
                            "category": cells[0].text.strip(),
                            "rate": float(cells[1].text.strip().replace(",", "")),
                        })
                    except ValueError:
                        # Skip rows whose rate cell is not numeric
                        continue
        except requests.exceptions.RequestException as e:
            print(f"Error fetching data from {url}: {e}")
            print("Retrying...")
            # Retry logic in case of failure (max 3 retries with a random delay)
            retries = 3
            while retries > 0:
                time.sleep(randint(1, 3))  # Sleep for a random time before retrying
                retries -= 1
                try:
                    response = requests.get(url, timeout=10)
                    response.raise_for_status()
                    soup = BeautifulSoup(response.content, "html.parser")
                    for row in soup.find_all("tr"):
                        cells = row.find_all("td")
                        if len(cells) >= 2:
                            try:
                                data.append({
                                    "category": cells[0].text.strip(),
                                    "rate": float(cells[1].text.strip().replace(",", "")),
                                })
                            except ValueError:
                                continue
                    break  # Retry succeeded; stop retrying
                except requests.exceptions.RequestException as retry_error:
                    print(f"Retry failed: {retry_error}")
                    continue

        # Sleep between requests to avoid hitting the servers too quickly
        time.sleep(randint(2, 5))

    if data:
        df = pd.DataFrame(data)
        # Save the scraped data to the 'data' directory
        df.to_csv("data/tariffs.csv", index=False)
        print("Tariff data saved successfully.")
    else:
        print("No tariff data found.")
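

# Example usage (a minimal sketch): the URLs below are hypothetical placeholders,
# not real tariff pages; replace them with the pages you actually want to scrape.
if __name__ == "__main__":
    example_urls = [
        "https://example.com/tariffs/import",  # placeholder URL (assumption)
        "https://example.com/tariffs/export",  # placeholder URL (assumption)
    ]
    scrape_tariffs(example_urls)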