# Energyguru / scraper.py
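"""Scrape tariff tables (a category column and a numeric rate column) from a
list of URLs and save the combined results to data/tariffs.csv."""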
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from random import randint


def scrape_tariffs(urls):
    data = []
    # Ensure the 'data' directory exists before saving the CSV
    os.makedirs("data", exist_ok=True)
    for url in urls:
        try:
            response = requests.get(url, timeout=10)  # Timeout guards against hanging requests
            response.raise_for_status()  # Raise an exception for bad status codes (4xx, 5xx)
            # raise_for_status() already guarantees a successful response here
            soup = BeautifulSoup(response.content, "html.parser")
            rows = soup.find_all("tr")
            for row in rows:
                cells = row.find_all("td")
                if len(cells) >= 2:
                    try:
                        data.append({
                            "category": cells[0].text.strip(),
                            "rate": float(cells[1].text.strip().replace(",", "")),
                        })
                    except ValueError:
                        continue  # Skip rows whose rate cell is not numeric
        except requests.exceptions.RequestException as e:
            print(f"Error fetching data from {url}: {e}")
            print("Retrying...")
            # Retry logic in case of failure (max 3 retries with random delay)
            retries = 3
            while retries > 0:
                time.sleep(randint(1, 3))  # Random back-off before retrying
                retries -= 1
                try:
                    response = requests.get(url, timeout=10)
                    response.raise_for_status()
                    soup = BeautifulSoup(response.content, "html.parser")
                    rows = soup.find_all("tr")
                    for row in rows:
                        cells = row.find_all("td")
                        if len(cells) >= 2:
                            try:
                                data.append({
                                    "category": cells[0].text.strip(),
                                    "rate": float(cells[1].text.strip().replace(",", "")),
                                })
                            except ValueError:
                                continue  # Skip rows whose rate cell is not numeric
                    break  # Success: stop retrying this URL
                except requests.exceptions.RequestException as retry_error:
                    print(f"Retry failed: {retry_error}")
                    continue
        # Sleep between requests to avoid hitting the servers too quickly
        time.sleep(randint(2, 5))
    if data:
        df = pd.DataFrame(data)
        # Save the scraped data to the 'data' directory
        df.to_csv("data/tariffs.csv", index=False)
        print("Tariff data saved successfully.")
    else:
        print("No tariff data found.")