Update helpers.py

helpers.py CHANGED (+12 -7)
@@ -94,9 +94,12 @@ key = "AIzaSyDJ4vIKuIBIPNHATLxnoHlagXWbsAz-vRs"
 def get_web_documents(base_urls=['https://nct.neu.edu.vn/']):
     """Load content from a list of URLs, with a progress bar"""
     docs = []
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=100)
     for page_url in tqdm(base_urls, desc="Loading pages", unit="url"):
         try:
-            loader = WebBaseLoader(page_url)
+            # loader = WebBaseLoader(page_url)
+            loader = DoclingLoader(file_path=page_url, chunker=text_splitter  # This will break your doc into manageable pieces.
+            )
             html = loader.load()
             doc = html
             docs.extend(doc)
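Review note: in the langchain-docling integration, DoclingLoader's chunker parameter is documented to take a Docling chunker such as HybridChunker, not a LangChain text splitter, so passing RecursiveCharacterTextSplitter here may not behave as the inline comment suggests. A minimal sketch of the documented usage, assuming the langchain-docling and docling packages are installed:

from langchain_docling import DoclingLoader
from docling.chunking import HybridChunker

def load_chunked(url):
    # DoclingLoader accepts URLs as well as local paths; HybridChunker is
    # Docling's own chunker, and load() yields one Document per chunk.
    loader = DoclingLoader(file_path=url, chunker=HybridChunker())
    return loader.load()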
@@ -134,7 +137,9 @@ def remove_tables_from_docx(file_path):
 def load_text_data(file_path):
     """Load text content from a DOCX file (tables removed)."""
     # cleaned_file = Document(file_path) #remove_tables_from_docx(file_path)
-
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=100)
+    return DoclingLoader(file_path=file_path, chunker=text_splitter  # This will break your doc into manageable pieces.
+    ).load()
 
 
 def extract_tables_from_docx(file_path):
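load_text_data now delegates both parsing and chunking to DoclingLoader. A hypothetical call (the file name is invented for illustration):

docs = load_text_data("quy_che_dao_tao.docx")  # hypothetical DOCX path
print(len(docs), docs[0].page_content[:200])   # number of chunks and a preview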
@@ -216,12 +221,12 @@ def get_splits(file_path, output_json_path):
 
     # Split the text into chunks
     # json_splitter = RecursiveJsonSplitter(max_chunk_size = 1000)
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=250)
+    # text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=250)
 
     # table_splits = json_splitter.create_documents(texts=[table_data])
-    text_splits = text_splitter.split_documents(text_data)
+    # text_splits = text_splitter.split_documents(text_data)
     # all_splits = table_splits + text_splits DoclingLoader
-    return text_splits
+    return text_data #text_splits
 
 def get_json_splits_only(file_path):
     table_data = load_json_manually(file_path)
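With the splitter calls commented out, get_splits now returns text_data unchanged, presumably because chunking already happens inside DoclingLoader in load_text_data. For reference, a sketch of what the retired splitter did, assuming text_data is a list of LangChain Document objects:

from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=250)
splits = splitter.split_documents(text_data)  # chunks of up to 1000 chars, overlapping by 250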
@@ -238,8 +243,8 @@ def get_json_splits_only(file_path):
     cleaned_data = remove_accents(table_data)
     wrapped_data = {"data": cleaned_data} if isinstance(cleaned_data, list) else cleaned_data
 
-    json_splitter = RecursiveJsonSplitter(max_chunk_size =
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=
+    json_splitter = RecursiveJsonSplitter(max_chunk_size = 2000)
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=100)
 
     table_splits = json_splitter.create_documents(texts=[wrapped_data])
     table_splits = text_splitter.split_documents(table_splits)
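A self-contained sketch of the two-stage split above, with a toy payload standing in for wrapped_data (sample data invented for illustration): the JSON splitter makes a structure-aware first pass, then the character splitter re-splits the resulting documents down to retrieval-sized chunks.

from langchain_text_splitters import RecursiveCharacterTextSplitter, RecursiveJsonSplitter

wrapped_data = {"data": [{"course": f"C{i}", "credits": i % 5} for i in range(200)]}

json_splitter = RecursiveJsonSplitter(max_chunk_size=2000)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=100)

table_splits = json_splitter.create_documents(texts=[wrapped_data])  # JSON chunks of <= 2000 chars
table_splits = text_splitter.split_documents(table_splits)           # re-split to <= 512 chars
print(len(table_splits), "chunks")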
|
|