UPDATE: urls
functions.py CHANGED  +15 -18
@@ -261,27 +261,24 @@ def listTables(username: str):
 
 def getLinks(url: str, timeout = 30):
     start = time.time()
-    def getLinksFromPage(url: str):
-        response = requests.get(url
-        …
-                    fullUrl = urljoin(url, "/"+urljoin("/"+parseObject.path, urljoin("/"+parseObject.params, urljoin("/"+parseObject.query, "/"+parseObject.fragment))))
-                else:
-                    fullUrl = href
-                if urlparse(fullUrl).netloc == urlparse(url).netloc:
-                    allLinks.append(fullUrl)
+    def getLinksFromPage(url: str) -> list:
+        response = requests.get(url)
+        soup = BeautifulSoup(response.content, "lxml")
+        anchors = soup.find_all("a")
+        links = []
+        for anchor in anchors:
+            if anchor.attrs["href"]:
+                if urlparse(anchor.get("href")).netloc == urlparse(url).netloc:
+                    newUrl = anchors.get("href")
+                elif anchor.get("href").startswith("/"):
+                    newUrl = urljoin(url + "/", anchor.get("href"))
                 else:
-                    …
+                    pass
+                links.append(newUrl)
+                links = list(set(links))
             else:
                 continue
-        return
+        return links
     links = getLinksFromPage(url)
     uniqueLinks = set()
     for link in links:
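For reference, a minimal runnable sketch of the new getLinksFromPage follows. The imports (requests, BeautifulSoup from bs4, urljoin/urlparse from urllib.parse) and the "lxml" parser are assumptions read off the names used in the diff. The sketch also works around two apparent slips in the committed version: `anchors.get("href")` inside the loop looks like a typo for `anchor.get("href")`, and when neither the same-host branch nor the leading-slash branch matches, the committed code falls through to `links.append(newUrl)` with `newUrl` unbound or left over from a previous iteration.

```python
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse


def getLinksFromPage(url: str) -> list:
    # Sketch of the committed helper: collect same-host links found on one page.
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "lxml")  # "lxml" parser, as in the diff
    links = []
    for anchor in soup.find_all("a"):
        href = anchor.get("href")
        if not href:
            continue  # <a> tag without an href attribute
        if urlparse(href).netloc == urlparse(url).netloc:
            # Absolute URL already pointing at the same host.
            # (The diff reads anchors.get("href"); anchor.get("href") is assumed intended.)
            newUrl = href
        elif href.startswith("/"):
            # Root-relative URL: resolve it against the page URL.
            newUrl = urljoin(url, href)
        else:
            # External absolute URL or a plain relative path like "page.html":
            # skip it rather than appending a stale newUrl.
            continue
        links.append(newUrl)
    # Deduplicate once after the loop instead of rebuilding the set per anchor.
    return list(set(links))
```

This keeps the behaviour the hunk relies on further down, where getLinks calls `links = getLinksFromPage(url)` and then folds the results into `uniqueLinks`.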