quoc-khanh committed on
Commit
a55659f
·
verified ·
1 Parent(s): 4f35988

Update helpers.py

Browse files
Files changed (1) hide show
  1. helpers.py +19 -15
helpers.py CHANGED
@@ -34,21 +34,25 @@ from tqdm.asyncio import tqdm
34
  async def _fetch_urls(base_url):
35
  """Extract all links from a JavaScript-rendered webpage."""
36
  async with async_playwright() as p:
37
- browser = await p.chromium.launch(headless=True)
38
- page = await browser.new_page()
39
- await page.goto(base_url)
40
- await page.wait_for_load_state("networkidle")
41
-
42
- urls = set()
43
- links = await page.locator("a").all()
44
- for link in links:
45
- href = await link.get_attribute("href")
46
- if href and "#" not in href:
47
- full_url = urljoin(base_url, href)
48
- if full_url.startswith(base_url):
49
- urls.add(full_url)
50
-
51
- await browser.close()
 
 
 
 
52
  return list(urls)
53
 
54
  async def _fetch_web_content(urls):
 
async def _fetch_urls(base_url):
    """Collect same-prefix links from a JavaScript-rendered webpage.

    Launches headless Chromium through Playwright, loads ``base_url``, waits
    for the network to become idle, then reads the ``href`` attribute of
    every ``<a>`` element on the page.

    Args:
        base_url: Page to load. Each ``href`` is resolved against it, and a
            resolved link is kept only if it starts with this string.

    Returns:
        A list of unique absolute URLs that start with ``base_url`` and whose
        ``href`` contains no ``#``. An empty list is returned if launching
        the browser, loading the page, or reading the links fails.
    """
    async with async_playwright() as p:
        urls = set()
        try:
            browser = await p.chromium.launch(headless=True)
            try:
                page = await browser.new_page()
                await page.goto(base_url)
                await page.wait_for_load_state("networkidle")

                for link in await page.locator("a").all():
                    href = await link.get_attribute("href")
                    # Skip anchors without a target and fragment links.
                    if href and "#" not in href:
                        full_url = urljoin(base_url, href)
                        # Keep only links located under the starting URL.
                        if full_url.startswith(base_url):
                            urls.add(full_url)
            finally:
                # Close the browser on the failure path too, not only on success.
                await browser.close()
        except Exception as e:
            # Best-effort boundary: report the failure and return an empty list.
            print(f"⚠️ Không thể truy cập {base_url}: {e}")
            return []
        return list(urls)
57
 
58
  async def _fetch_web_content(urls):