Rauhan committed on
Commit
fce68f1
1 Parent(s): 51ed650

UPDATE: urls

Browse files
Files changed (1) hide show
  1. functions.py +15 -18
functions.py CHANGED
@@ -261,27 +261,24 @@ def listTables(username: str):
261
 
262
  def getLinks(url: str, timeout = 30):
263
  start = time.time()
264
def getLinksFromPage(url: str):
    """Fetch *url* and return every link in its <a> tags that points to the same host.

    Relative hrefs are resolved against *url*; links to other hosts are dropped.
    Duplicates are NOT removed here (the caller dedupes).
    """
    # NOTE(review): verify=False disables TLS certificate verification — kept
    # from the original, but confirm this is intentional for these targets.
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.content, "lxml")
    baseNetloc = urlparse(url).netloc
    allLinks = []
    for tag in soup.find_all("a"):
        # .get returns None when the tag has no href attribute.
        href = tag.attrs.get("href")
        if not href:
            continue
        parseObject = urlparse(href)
        if parseObject.scheme == "" or parseObject.netloc == "":
            # Relative reference: let urljoin resolve it. The previous
            # hand-rolled chain of nested urljoin calls mangled any href
            # that carried params, a query string, or a fragment.
            fullUrl = urljoin(url, href)
        else:
            fullUrl = href
        # Keep only links on the same host as the page we fetched.
        if urlparse(fullUrl).netloc == baseNetloc:
            allLinks.append(fullUrl)
    return allLinks
285
  links = getLinksFromPage(url)
286
  uniqueLinks = set()
287
  for link in links:
 
261
 
262
  def getLinks(url: str, timeout = 30):
263
  start = time.time()
264
def getLinksFromPage(url: str) -> list:
    """Fetch *url* and return the unique same-host links found in its <a> tags.

    Absolute links on the same host are kept as-is; relative links are
    resolved against *url*; links to other hosts are discarded.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "lxml")
    baseNetloc = urlparse(url).netloc
    links = []
    for anchor in soup.find_all("a"):
        # .get returns None when the tag has no href; the old
        # anchor.attrs["href"] raised KeyError on such anchors.
        href = anchor.get("href")
        if not href:
            continue
        parsed = urlparse(href)
        if parsed.netloc == baseNetloc and parsed.netloc:
            # Absolute link on the same host. (Was "anchors.get(...)" — a typo
            # that called .get on the ResultSet instead of the tag.)
            newUrl = href
        elif not parsed.netloc:
            # Relative reference (with or without a leading "/"): resolve it.
            # The old code only handled hrefs starting with "/" and otherwise
            # appended a stale/unbound newUrl from a previous iteration.
            newUrl = urljoin(url, href)
        else:
            # External host: skip — never append.
            continue
        links.append(newUrl)
    # Dedupe once at the end instead of rebuilding the set every iteration.
    return list(set(links))
282
  links = getLinksFromPage(url)
283
  uniqueLinks = set()
284
  for link in links: