Spaces:
Sleeping
Sleeping
Update helpers.py
Browse files- helpers.py +64 -67
helpers.py
CHANGED
@@ -16,80 +16,77 @@ import requests
|
|
16 |
from bs4 import BeautifulSoup
|
17 |
import os
|
18 |
|
19 |
-
# os.system("playwright install-deps chromium")
|
20 |
-
os.system("playwright install chromium")
|
21 |
-
|
22 |
# from file_loader import get_vectorstore
|
23 |
if "GOOGLE_API_KEY" not in os.environ:
|
24 |
os.environ["GOOGLE_API_KEY"] = "AIzaSyDJ4vIKuIBIPNHATLxnoHlagXWbsAz-vRs"
|
25 |
key = "AIzaSyDJ4vIKuIBIPNHATLxnoHlagXWbsAz-vRs"
|
26 |
|
27 |
-
import asyncio
|
28 |
-
from urllib.parse import urljoin
|
29 |
-
from playwright.async_api import async_playwright
|
30 |
-
from langchain_community.document_loaders import AsyncHtmlLoader
|
31 |
-
from langchain_community.document_transformers import Html2TextTransformer
|
32 |
-
from tqdm.asyncio import tqdm
|
33 |
-
|
34 |
-
async def _fetch_urls(base_url):
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
async def _fetch_web_content(urls):
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
def scrape_website(base_urls):
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
|
94 |
def log_message(messages, filename="chat_log.txt"):
|
95 |
"""Ghi lịch sử tin nhắn vào file log"""
|
|
|
16 |
from bs4 import BeautifulSoup
|
17 |
import os
|
18 |
|
|
|
|
|
|
|
19 |
# from file_loader import get_vectorstore
|
20 |
if "GOOGLE_API_KEY" not in os.environ:
|
21 |
os.environ["GOOGLE_API_KEY"] = "AIzaSyDJ4vIKuIBIPNHATLxnoHlagXWbsAz-vRs"
|
22 |
key = "AIzaSyDJ4vIKuIBIPNHATLxnoHlagXWbsAz-vRs"
|
23 |
|
24 |
+
# import asyncio
|
25 |
+
# from urllib.parse import urljoin
|
26 |
+
# from playwright.async_api import async_playwright
|
27 |
+
# from langchain_community.document_loaders import AsyncHtmlLoader
|
28 |
+
# from langchain_community.document_transformers import Html2TextTransformer
|
29 |
+
# from tqdm.asyncio import tqdm
|
30 |
+
|
31 |
+
# async def _fetch_urls(base_url):
|
32 |
+
# """Extract all links from a JavaScript-rendered webpage."""
|
33 |
+
# async with async_playwright() as p:
|
34 |
+
# try:
|
35 |
+
# browser = await p.chromium.launch(headless=True)
|
36 |
+
# page = await browser.new_page()
|
37 |
+
# await page.goto(base_url)
|
38 |
+
# await page.wait_for_load_state("networkidle")
|
39 |
|
40 |
+
# urls = set()
|
41 |
+
# links = await page.locator("a").all()
|
42 |
+
# for link in links:
|
43 |
+
# href = await link.get_attribute("href")
|
44 |
+
# if href and "#" not in href:
|
45 |
+
# full_url = urljoin(base_url, href)
|
46 |
+
# if full_url.startswith(base_url):
|
47 |
+
# urls.add(full_url)
|
48 |
|
49 |
+
# await browser.close()
|
50 |
+
# except Exception as e:
|
51 |
+
# print(f"⚠️ Không thể truy cập {base_url}: {e}")
|
52 |
+
# return [] # Return an empty list if an error occurs
|
53 |
+
# return list(urls)
|
54 |
+
|
55 |
+
# async def _fetch_web_content(urls):
|
56 |
+
# """Fetch HTML content and convert it to text, with a progress bar."""
|
57 |
+
# docs = []
|
58 |
+
# progress_bar = tqdm(total=len(urls), desc="Scraping Pages", unit="page")
|
59 |
+
|
60 |
+
# for page_url in urls:
|
61 |
+
# try:
|
62 |
+
# loader = AsyncHtmlLoader(page_url)
|
63 |
+
# html2text = Html2TextTransformer()
|
64 |
+
# html = await loader.aload()
|
65 |
+
# doc = html2text.transform_documents(html)
|
66 |
+
# docs.extend(doc)
|
67 |
+
# except Exception as e:
|
68 |
+
# print(f"Error loading {page_url}: {e}")
|
69 |
+
|
70 |
+
# progress_bar.update(1) # Update progress bar
|
71 |
+
|
72 |
+
# progress_bar.close()
|
73 |
+
# return docs
|
74 |
+
|
75 |
+
# def scrape_website(base_urls):
|
76 |
+
# """
|
77 |
+
# Scrapes a list of base URLs and extracts their content.
|
78 |
+
# Includes a progress bar for tracking.
|
79 |
+
# """
|
80 |
+
# async def _main():
|
81 |
+
# all_urls = []
|
82 |
+
# for base_url in base_urls:
|
83 |
+
# urls = await _fetch_urls(base_url)
|
84 |
+
# all_urls.extend(urls)
|
85 |
+
|
86 |
+
# docs = await _fetch_web_content(all_urls)
|
87 |
+
# return docs
|
88 |
+
|
89 |
+
# return asyncio.run(_main())
|
90 |
|
91 |
def log_message(messages, filename="chat_log.txt"):
|
92 |
"""Ghi lịch sử tin nhắn vào file log"""
|