Rauhan committed on
Commit
fce68f1
1 Parent(s): 51ed650

UPDATE: urls

Browse files
Files changed (1) hide show
  1. functions.py +15 -18
functions.py CHANGED
@@ -261,27 +261,24 @@ def listTables(username: str):
261
 
262
  def getLinks(url: str, timeout = 30):
263
  start = time.time()
264
def getLinksFromPage(url: str):
    """Fetch *url* and return every link in its <a> tags that points to the same host.

    Relative hrefs are resolved against *url*; links to other hosts are dropped.
    Duplicates are NOT removed here (the caller dedupes).
    """
    # NOTE(review): verify=False disables TLS certificate verification — kept
    # from the original, but confirm this is intentional for these targets.
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.content, "lxml")
    baseNetloc = urlparse(url).netloc
    allLinks = []
    for tag in soup.find_all("a"):
        # .get returns None when the tag has no href attribute.
        href = tag.attrs.get("href")
        if not href:
            continue
        parseObject = urlparse(href)
        if parseObject.scheme == "" or parseObject.netloc == "":
            # Relative reference: let urljoin resolve it. The previous
            # hand-rolled chain of nested urljoin calls mangled any href
            # that carried params, a query string, or a fragment.
            fullUrl = urljoin(url, href)
        else:
            fullUrl = href
        # Keep only links on the same host as the page we fetched.
        if urlparse(fullUrl).netloc == baseNetloc:
            allLinks.append(fullUrl)
    return allLinks
285
  links = getLinksFromPage(url)
286
  uniqueLinks = set()
287
  for link in links:
 
261
 
262
  def getLinks(url: str, timeout = 30):
263
  start = time.time()
264
def getLinksFromPage(url: str) -> list:
    """Fetch *url* and return the unique same-host links found in its <a> tags.

    Absolute links on the same host are kept as-is; relative links are
    resolved against *url*; links to other hosts are discarded.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "lxml")
    baseNetloc = urlparse(url).netloc
    links = []
    for anchor in soup.find_all("a"):
        # .get returns None when the tag has no href; the old
        # anchor.attrs["href"] raised KeyError on such anchors.
        href = anchor.get("href")
        if not href:
            continue
        parsed = urlparse(href)
        if parsed.netloc == baseNetloc and parsed.netloc:
            # Absolute link on the same host. (Was "anchors.get(...)" — a typo
            # that called .get on the ResultSet instead of the tag.)
            newUrl = href
        elif not parsed.netloc:
            # Relative reference (with or without a leading "/"): resolve it.
            # The old code only handled hrefs starting with "/" and otherwise
            # appended a stale/unbound newUrl from a previous iteration.
            newUrl = urljoin(url, href)
        else:
            # External host: skip — never append.
            continue
        links.append(newUrl)
    # Dedupe once at the end instead of rebuilding the set every iteration.
    return list(set(links))
282
  links = getLinksFromPage(url)
283
  uniqueLinks = set()
284
  for link in links: