akazmi committed on
Commit
ccaec45
·
verified ·
1 Parent(s): 0d1b440

Create scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +75 -0
scraper.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+ import pandas as pd
5
+ import time
6
+ from random import randint
7
+
def scrape_tariffs(urls):
    """Scrape tariff (category, rate) pairs from each URL and save them as CSV.

    For every URL, fetches the page, parses each HTML table row with at
    least two <td> cells into ``{"category": str, "rate": float}``, and
    finally writes the collected rows to ``data/tariffs.csv``.

    Parameters
    ----------
    urls : iterable of str
        Pages expected to contain an HTML table: first cell is the tariff
        category, second cell a numeric rate (commas allowed as thousands
        separators).

    Side effects
    ------------
    Creates the ``data`` directory if missing; writes ``data/tariffs.csv``
    when at least one row was parsed, otherwise prints a notice. Network
    failures for a URL are retried up to 3 times, then that URL is skipped.
    """
    data = []

    # Ensure the 'data' directory exists before saving the CSV.
    os.makedirs("data", exist_ok=True)

    for url in urls:
        try:
            response = requests.get(url, timeout=10)
            # raise_for_status() raises RequestException for 4xx/5xx, so a
            # separate `status_code == 200` check is redundant (and would
            # wrongly skip other successful 2xx responses).
            response.raise_for_status()
            data.extend(_parse_tariff_rows(response.content))
        except requests.exceptions.RequestException as e:
            print(f"Error fetching data from {url}: {e}")
            print("Retrying...")

            # Retry logic in case of failure (max 3 retries with random delay).
            retries = 3
            while retries > 0:
                time.sleep(randint(1, 3))  # Random back-off before retrying.
                retries -= 1
                try:
                    response = requests.get(url, timeout=10)
                    response.raise_for_status()
                    data.extend(_parse_tariff_rows(response.content))
                    break
                except requests.exceptions.RequestException as retry_err:
                    # BUG FIX: the original printed the stale outer exception
                    # `e` here; report the exception raised by *this* retry.
                    print(f"Retry failed: {retry_err}")

        # Sleep between requests to avoid hitting the servers too quickly.
        time.sleep(randint(2, 5))

    if data:
        df = pd.DataFrame(data)
        # Save the scraped data to the 'data' directory.
        df.to_csv("data/tariffs.csv", index=False)
        print("Tariff data saved successfully.")
    else:
        print("No tariff data found.")


def _parse_tariff_rows(html):
    """Parse tariff rows out of an HTML document.

    Returns a list of ``{"category": str, "rate": float}`` dicts, one per
    <tr> that has at least two <td> cells and whose second cell is numeric
    after stripping commas. Non-numeric rows are skipped silently, matching
    the original inline parsing behavior.
    """
    parsed = []
    soup = BeautifulSoup(html, "html.parser")
    for row in soup.find_all("tr"):
        cells = row.find_all("td")
        if len(cells) < 2:
            continue
        try:
            parsed.append({
                "category": cells[0].text.strip(),
                "rate": float(cells[1].text.strip().replace(",", "")),
            })
        except ValueError:
            # Header rows / non-numeric rates are expected; skip them.
            continue
    return parsed