# Energyguru / scraper.py
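"""Scrape tariff tables (a category column and a numeric rate column) from a
list of URLs and save the combined results to data/tariffs.csv."""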
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from random import randint


def scrape_tariffs(urls):
    data = []
    # Ensure the 'data' directory exists before saving the CSV
    os.makedirs("data", exist_ok=True)
    for url in urls:
        try:
            response = requests.get(url, timeout=10)  # Timeout guards against hanging requests
            response.raise_for_status()  # Raise an exception for bad status codes (4xx, 5xx)
            # raise_for_status() already guarantees a successful response here
            soup = BeautifulSoup(response.content, "html.parser")
            rows = soup.find_all("tr")
            for row in rows:
                cells = row.find_all("td")
                if len(cells) >= 2:
                    try:
                        data.append({
                            "category": cells[0].text.strip(),
                            "rate": float(cells[1].text.strip().replace(",", "")),
                        })
                    except ValueError:
                        continue  # Skip rows whose rate cell is not numeric
        except requests.exceptions.RequestException as e:
            print(f"Error fetching data from {url}: {e}")
            print("Retrying...")
            # Retry logic in case of failure (max 3 retries with random delay)
            retries = 3
            while retries > 0:
                time.sleep(randint(1, 3))  # Random back-off before retrying
                retries -= 1
                try:
                    response = requests.get(url, timeout=10)
                    response.raise_for_status()
                    soup = BeautifulSoup(response.content, "html.parser")
                    rows = soup.find_all("tr")
                    for row in rows:
                        cells = row.find_all("td")
                        if len(cells) >= 2:
                            try:
                                data.append({
                                    "category": cells[0].text.strip(),
                                    "rate": float(cells[1].text.strip().replace(",", "")),
                                })
                            except ValueError:
                                continue  # Skip rows whose rate cell is not numeric
                    break  # Success: stop retrying this URL
                except requests.exceptions.RequestException as retry_error:
                    print(f"Retry failed: {retry_error}")
                    continue
        # Sleep between requests to avoid hitting the servers too quickly
        time.sleep(randint(2, 5))
    if data:
        df = pd.DataFrame(data)
        # Save the scraped data to the 'data' directory
        df.to_csv("data/tariffs.csv", index=False)
        print("Tariff data saved successfully.")
    else:
        print("No tariff data found.")