quoc-khanh committed · verified
Commit 6c5699f · 1 Parent(s): 07a8a1f

Update helpers.py

Files changed (1)
  1. helpers.py +64 -67
helpers.py CHANGED
@@ -16,80 +16,77 @@ import requests
from bs4 import BeautifulSoup
import os

-# os.system("playwright install-deps chromium")
-os.system("playwright install chromium")
-
# from file_loader import get_vectorstore
if "GOOGLE_API_KEY" not in os.environ:
    os.environ["GOOGLE_API_KEY"] = "AIzaSyDJ4vIKuIBIPNHATLxnoHlagXWbsAz-vRs"
key = "AIzaSyDJ4vIKuIBIPNHATLxnoHlagXWbsAz-vRs"

-import asyncio
-from urllib.parse import urljoin
-from playwright.async_api import async_playwright
-from langchain_community.document_loaders import AsyncHtmlLoader
-from langchain_community.document_transformers import Html2TextTransformer
-from tqdm.asyncio import tqdm
-
-async def _fetch_urls(base_url):
-    """Extract all links from a JavaScript-rendered webpage."""
-    async with async_playwright() as p:
-        try:
-            browser = await p.chromium.launch(headless=True)
-            page = await browser.new_page()
-            await page.goto(base_url)
-            await page.wait_for_load_state("networkidle")

-            urls = set()
-            links = await page.locator("a").all()
-            for link in links:
-                href = await link.get_attribute("href")
-                if href and "#" not in href:
-                    full_url = urljoin(base_url, href)
-                    if full_url.startswith(base_url):
-                        urls.add(full_url)

-            await browser.close()
-        except Exception as e:
-            print(f"⚠️ Could not access {base_url}: {e}")
-            return []  # Return an empty list on error
-    return list(urls)
-
-async def _fetch_web_content(urls):
-    """Fetch HTML content and convert it to text, with a progress bar."""
-    docs = []
-    progress_bar = tqdm(total=len(urls), desc="Scraping Pages", unit="page")
-
-    for page_url in urls:
-        try:
-            loader = AsyncHtmlLoader(page_url)
-            html2text = Html2TextTransformer()
-            html = await loader.aload()
-            doc = html2text.transform_documents(html)
-            docs.extend(doc)
-        except Exception as e:
-            print(f"Error loading {page_url}: {e}")
-
-        progress_bar.update(1)  # Update progress bar
-
-    progress_bar.close()
-    return docs
-
-def scrape_website(base_urls):
-    """
-    Scrapes a list of base URLs and extracts their content.
-    Includes a progress bar for tracking.
-    """
-    async def _main():
-        all_urls = []
-        for base_url in base_urls:
-            urls = await _fetch_urls(base_url)
-            all_urls.extend(urls)
-
-        docs = await _fetch_web_content(all_urls)
-        return docs
-
-    return asyncio.run(_main())
+# import asyncio
+# from urllib.parse import urljoin
+# from playwright.async_api import async_playwright
+# from langchain_community.document_loaders import AsyncHtmlLoader
+# from langchain_community.document_transformers import Html2TextTransformer
+# from tqdm.asyncio import tqdm
+
+# async def _fetch_urls(base_url):
+#     """Extract all links from a JavaScript-rendered webpage."""
+#     async with async_playwright() as p:
+#         try:
+#             browser = await p.chromium.launch(headless=True)
+#             page = await browser.new_page()
+#             await page.goto(base_url)
+#             await page.wait_for_load_state("networkidle")

+#             urls = set()
+#             links = await page.locator("a").all()
+#             for link in links:
+#                 href = await link.get_attribute("href")
+#                 if href and "#" not in href:
+#                     full_url = urljoin(base_url, href)
+#                     if full_url.startswith(base_url):
+#                         urls.add(full_url)

+#             await browser.close()
+#         except Exception as e:
+#             print(f"⚠️ Could not access {base_url}: {e}")
+#             return []  # Return an empty list on error
+#     return list(urls)
+
+# async def _fetch_web_content(urls):
+#     """Fetch HTML content and convert it to text, with a progress bar."""
+#     docs = []
+#     progress_bar = tqdm(total=len(urls), desc="Scraping Pages", unit="page")
+
+#     for page_url in urls:
+#         try:
+#             loader = AsyncHtmlLoader(page_url)
+#             html2text = Html2TextTransformer()
+#             html = await loader.aload()
+#             doc = html2text.transform_documents(html)
+#             docs.extend(doc)
+#         except Exception as e:
+#             print(f"Error loading {page_url}: {e}")
+
+#         progress_bar.update(1)  # Update progress bar
+
+#     progress_bar.close()
+#     return docs
+
+# def scrape_website(base_urls):
+#     """
+#     Scrapes a list of base URLs and extracts their content.
+#     Includes a progress bar for tracking.
+#     """
+#     async def _main():
+#         all_urls = []
+#         for base_url in base_urls:
+#             urls = await _fetch_urls(base_url)
+#             all_urls.extend(urls)
+
+#         docs = await _fetch_web_content(all_urls)
+#         return docs
+
+#     return asyncio.run(_main())
 
def log_message(messages, filename="chat_log.txt"):
    """Write the message history to a log file"""