Spaces:
Sleeping
Sleeping
Update helpers.py
Browse files- helpers.py +55 -19
helpers.py
CHANGED
@@ -16,7 +16,7 @@ import shutil
|
|
16 |
import requests
|
17 |
from bs4 import BeautifulSoup
|
18 |
import os
|
19 |
-
from langchain_docling import DoclingLoader
|
20 |
|
21 |
|
22 |
# from file_loader import get_vectorstore
|
@@ -91,24 +91,67 @@ key = "<REDACTED — hard-coded Google API key was committed here; revoke/rotate it immediately and load it from an environment variable instead of source>"
|
|
91 |
|
92 |
# return asyncio.run(_main)
|
93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
def get_web_documents(base_urls=['https://nct.neu.edu.vn/']):
|
95 |
-
"""
|
96 |
-
docs = []
|
97 |
-
|
98 |
-
for page_url in tqdm(base_urls, desc="Đang tải trang", unit="url"):
|
99 |
try:
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
doc =
|
105 |
docs.extend(doc)
|
106 |
except Exception as e:
|
107 |
-
print(f"
|
108 |
|
109 |
-
print(f"
|
110 |
return docs
|
111 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
112 |
|
113 |
def log_message(messages, filename="chat_log.txt"):
|
114 |
"""Ghi lịch sử tin nhắn vào file log"""
|
@@ -134,13 +177,6 @@ def remove_tables_from_docx(file_path):
|
|
134 |
|
135 |
return temp_path # ✅ Trả về đường dẫn file mới, không làm hỏng file gốc
|
136 |
|
137 |
-
def load_text_data(file_path):
|
138 |
-
"""Tải nội dung văn bản từ file DOCX (đã loại bảng)."""
|
139 |
-
# cleaned_file = Document(file_path) #remove_tables_from_docx(file_path)
|
140 |
-
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=100)
|
141 |
-
return DoclingLoader(file_path=file_path, chunker=text_splitter # This will break your doc into manageable pieces.
|
142 |
-
).load()
|
143 |
-
|
144 |
|
145 |
def extract_tables_from_docx(file_path):
|
146 |
doc = Document(file_path)
|
|
|
16 |
import requests
|
17 |
from bs4 import BeautifulSoup
|
18 |
import os
|
19 |
+
from langchain_docling import DoclingLoader, ExportType
|
20 |
|
21 |
|
22 |
# from file_loader import get_vectorstore
|
|
|
91 |
|
92 |
# return asyncio.run(_main)
|
93 |
|
94 |
+
# class ChunkerWrapper:
|
95 |
+
# def __init__(self, splitter):
|
96 |
+
# self.splitter = splitter
|
97 |
+
|
98 |
+
# def chunk(self, text):
|
99 |
+
# # Use the 'split_text' method of the splitter to divide the text
|
100 |
+
# return self.splitter.split_text(text)
|
101 |
+
|
102 |
+
# def get_web_documents(base_urls=['https://nct.neu.edu.vn/']):
|
103 |
+
# """Tải nội dung từ danh sách URL với thanh tiến trình"""
|
104 |
+
# docs = []
|
105 |
+
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=100)
|
106 |
+
# chunker = ChunkerWrapper(text_splitter)
|
107 |
+
# for page_url in tqdm(base_urls, desc="Đang tải trang", unit="url"):
|
108 |
+
# try:
|
109 |
+
# # loader = WebBaseLoader(page_url)
|
110 |
+
# loader = DoclingLoader(file_path=page_url,chunker=chunker # This will break your doc into manageable pieces.
|
111 |
+
# )
|
112 |
+
# html = loader.load()
|
113 |
+
# doc = html
|
114 |
+
# docs.extend(doc)
|
115 |
+
# except Exception as e:
|
116 |
+
# print(f"Lỗi khi tải {page_url}: {e}")
|
117 |
+
|
118 |
+
# print(f"Tải thành công {len(docs)} trang.")
|
119 |
+
# return docs
|
120 |
+
|
121 |
+
# def load_text_data(file_path):
|
122 |
+
# """Tải nội dung văn bản từ file DOCX (đã loại bảng)."""
|
123 |
+
# # cleaned_file = Document(file_path) #remove_tables_from_docx(file_path)
|
124 |
+
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=100)
|
125 |
+
# chunker = ChunkerWrapper(text_splitter)
|
126 |
+
# return DoclingLoader(file_path=file_path, chunker=chunker # This will break your doc into manageable pieces.
|
127 |
+
# ).load()
|
128 |
+
|
129 |
+
|
130 |
def get_web_documents(base_urls=None):
    """Fetch and chunk content from a list of URLs, with a progress bar.

    Args:
        base_urls: Iterable of page URLs to load. Defaults to
            ['https://nct.neu.edu.vn/'] when not supplied.

    Returns:
        list: Document chunks aggregated across all URLs. URLs that fail
        to load are skipped (the error is printed, not raised).
    """
    # Avoid a mutable default argument; reproduce the original default here.
    if base_urls is None:
        base_urls = ['https://nct.neu.edu.vn/']
    docs = []
    for page_url in tqdm(base_urls, desc="Loading page", unit="url"):
        try:
            loader = DoclingLoader(
                file_path=page_url,
                export_type=ExportType.DOC_CHUNKS,  # enable Docling's internal chunking
            )
            docs.extend(loader.load())
        except Exception as e:
            # Deliberately broad: a single bad URL must not abort the crawl.
            print(f"Error loading {page_url}: {e}")
    print(f"Successfully loaded {len(docs)} documents.")
    return docs
|
146 |
|
147 |
+
def load_text_data(file_path):
    """Load and chunk a document file (e.g. DOCX) via Docling.

    NOTE(review): the previous docstring claimed tables were removed, but no
    table stripping happens here — the file is handed to DoclingLoader as-is.
    Use remove_tables_from_docx() first if a table-free copy is required.

    Args:
        file_path: Path to the document file to load.

    Returns:
        list: Document chunks produced by Docling's internal chunker.
    """
    loader = DoclingLoader(
        file_path=file_path,
        export_type=ExportType.DOC_CHUNKS,  # enable Docling's internal chunking
    )
    return loader.load()
|
154 |
+
|
155 |
|
156 |
def log_message(messages, filename="chat_log.txt"):
|
157 |
"""Ghi lịch sử tin nhắn vào file log"""
|
|
|
177 |
|
178 |
return temp_path # ✅ Trả về đường dẫn file mới, không làm hỏng file gốc
|
179 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
180 |
|
181 |
def extract_tables_from_docx(file_path):
|
182 |
doc = Document(file_path)
|