Spaces:

akazmi
/

Energyguru

Sleeping

App Files Files Community

akazmi committed on Jan 8

Commit

ccaec45

verified ·

1 Parent(s): 0d1b440

Create scraper.py

Browse files

Files changed (1) hide show

scraper.py +75 -0

scraper.py ADDED Viewed

	@@ -0,0 +1,75 @@

+import os
+import requests
+from bs4 import BeautifulSoup
+import pandas as pd
+import time
+from random import randint
def _parse_tariff_rows(html):
    """Extract tariff records from the table rows of an HTML document.

    Args:
        html: Raw HTML (bytes or str) handed to BeautifulSoup.

    Returns:
        A list of ``{"category": str, "rate": float}`` dicts, one for each
        ``<tr>`` that has at least two ``<td>`` cells and whose second cell
        parses as a float. All other rows are skipped.
    """
    soup = BeautifulSoup(html, "html.parser")
    records = []
    for row in soup.find_all("tr"):
        cells = row.find_all("td")
        if len(cells) < 2:
            continue
        try:
            # Drop thousands separators so values such as "1,234.5" parse.
            rate = float(cells[1].text.strip().replace(",", ""))
        except ValueError:
            # Second cell is not numeric; ignore the row.
            continue
        records.append({"category": cells[0].text.strip(), "rate": rate})
    return records


def _fetch_tariff_rows(url, max_retries=3, timeout=10):
    """Download *url* and parse its tariff rows, retrying failed requests.

    Args:
        url: Page to download.
        max_retries: Number of additional attempts after the first failure.
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        The records parsed from the page, or an empty list when every
        attempt failed or the response status was not exactly 200.
    """
    for attempt in range(max_retries + 1):
        if attempt:
            # Random back-off before each retry.
            time.sleep(randint(1, 3))
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()  # 4xx/5xx become RequestException
        except requests.exceptions.RequestException as error:
            if attempt == 0:
                print(f"Error fetching data from {url}: {error}")
                print("Retrying...")
            else:
                # Report the error of THIS retry, not the initial failure.
                print(f"Retry failed: {error}")
            continue
        if response.status_code != 200:
            # Successful but non-200 response: nothing to scrape, and a
            # retry would not change that.
            return []
        return _parse_tariff_rows(response.content)
    print(f"Giving up on {url} after {max_retries} retries.")
    return []


def scrape_tariffs(urls):
    """Scrape category/rate pairs from HTML tables and save them as CSV.

    Each URL is fetched (with up to three retries on request errors) and
    every table row with at least two cells and a numeric second cell
    contributes one record. If any records were collected they are written
    to ``data/tariffs.csv``; otherwise a notice is printed and no file is
    written. The ``data`` directory is created in either case.

    Args:
        urls: Iterable of page URLs. A single URL string is also accepted
            and treated as a one-element list.

    Returns:
        None. Results are reported via the CSV file and printed messages.
    """
    if isinstance(urls, str):
        # Iterating a bare string would try to fetch one character at a time.
        urls = [urls]

    # Ensure the output directory exists before saving the CSV.
    os.makedirs("data", exist_ok=True)

    data = []
    for index, url in enumerate(urls):
        if index:
            # Pause between requests to avoid hitting the servers too
            # quickly; no pause is needed before the first or after the last.
            time.sleep(randint(2, 5))
        data.extend(_fetch_tariff_rows(url))

    if not data:
        print("No tariff data found.")
        return

    pd.DataFrame(data).to_csv("data/tariffs.csv", index=False)
    print("Tariff data saved successfully.")