Update helpers.py
helpers.py  +4 -8

helpers.py  CHANGED
@@ -86,29 +86,25 @@ key = "AIzaSyDJ4vIKuIBIPNHATLxnoHlagXWbsAz-vRs"
     # docs = await _fetch_web_content(all_urls)
     # return docs
 
-    # return asyncio.run(_main
-
+    # return asyncio.run(_main)
 
 def get_web_documents(base_urls=['https://nct.neu.edu.vn/']):
-
-    # urls = asyncio.run(get_urls_playwright(base_url))
+    """Load content from a list of URLs"""
     docs = []
-    for page_url in urls
+    for page_url in base_urls:  # check whether the variable urls was defined earlier
         try:
-            # loader = WebBaseLoader(web_paths=[page_url])
             loader = AsyncHtmlLoader(page_url)
             html2text = Html2TextTransformer()
 
             html = loader.load()
             doc = html2text.transform_documents(html)
-            docs.extend(doc)
+            docs.extend(doc)
         except Exception as e:
             print(f"Error loading {page_url}: {e}")
 
     print(f"Successfully loaded {len(docs)} pages.")
     return docs
 
-
 def log_message(messages, filename="chat_log.txt"):
     """Write the message history to a log file"""
     with open(filename, "a", encoding="utf-8") as f:
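For reference, below is a minimal, self-contained sketch of how the patched get_web_documents reads once this hunk is applied. The import lines are an assumption: the hunk does not show the top of helpers.py, and the paths used here follow the current langchain_community layout, so adjust them to whatever the file actually imports.

# Sketch of the patched function with assumed imports; the diff hunk does not
# show the top of helpers.py, so the import paths below are a guess.
from langchain_community.document_loaders import AsyncHtmlLoader
from langchain_community.document_transformers import Html2TextTransformer


def get_web_documents(base_urls=['https://nct.neu.edu.vn/']):
    """Load content from a list of URLs."""
    docs = []
    for page_url in base_urls:
        try:
            loader = AsyncHtmlLoader(page_url)      # fetch raw HTML for one URL
            html2text = Html2TextTransformer()      # convert HTML Documents to plain text

            html = loader.load()                    # returns a list of Document objects
            doc = html2text.transform_documents(html)
            docs.extend(doc)
        except Exception as e:
            print(f"Error loading {page_url}: {e}")

    print(f"Successfully loaded {len(docs)} pages.")
    return docs


if __name__ == "__main__":
    documents = get_web_documents()
    print(f"{len(documents)} documents loaded")

AsyncHtmlLoader also accepts a list of URLs, so the whole loop could in principle be collapsed into a single loader call; fetching one page at a time, as the patch does, keeps the try/except granularity per URL so one failing page does not abort the rest.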