Spaces:
Sleeping
Sleeping
Update helpers.py
Browse files- helpers.py +19 -15
helpers.py
CHANGED
@@ -34,21 +34,25 @@ from tqdm.asyncio import tqdm
|
|
34 |
async def _fetch_urls(base_url):
|
35 |
"""Extract all links from a JavaScript-rendered webpage."""
|
36 |
async with async_playwright() as p:
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
|
|
|
|
|
|
|
|
52 |
return list(urls)
|
53 |
|
54 |
async def _fetch_web_content(urls):
|
|
|
34 |
async def _fetch_urls(base_url):
    """Extract all links from a JavaScript-rendered webpage.

    Launches headless Chromium, opens ``base_url``, waits for the
    "networkidle" load state, then reads the ``href`` of every ``<a>``
    element on the page.

    Args:
        base_url: Address of the page to open. It is also used to resolve
            relative links and as the prefix a link must start with in
            order to be kept.

    Returns:
        A list of unique absolute URLs (in no particular order) that start
        with ``base_url``. Links whose ``href`` is missing, empty, or
        contains ``#`` are skipped. An empty list is returned if the page
        cannot be loaded or the links cannot be read.
    """
    async with async_playwright() as p:
        browser = None
        try:
            browser = await p.chromium.launch(headless=True)
            page = await browser.new_page()
            await page.goto(base_url)
            await page.wait_for_load_state("networkidle")

            urls = set()
            for link in await page.locator("a").all():
                href = await link.get_attribute("href")
                # Skip anchors without a target and any link with a fragment.
                if not href or "#" in href:
                    continue
                full_url = urljoin(base_url, href)
                # Plain string-prefix check: keep only links under base_url.
                if full_url.startswith(base_url):
                    urls.add(full_url)
        except Exception as e:
            print(f"⚠️ Không thể truy cập {base_url}: {e}")
            return []  # Return an empty list on any error
        finally:
            # Close the browser on the failure path too, not only on success.
            if browser is not None:
                try:
                    await browser.close()
                except Exception:
                    # Best-effort cleanup: a failed close must not discard
                    # the result or raise out of this helper.
                    pass
        return list(urls)
|
57 |
|
58 |
async def _fetch_web_content(urls):
|