Update helpers.py

helpers.py CHANGED (+13 −11)

@@ -16,6 +16,8 @@ import shutil
 import requests
 from bs4 import BeautifulSoup
 import os
+from langchain_docling import DoclingLoader
+
 
 # from file_loader import get_vectorstore
 if "GOOGLE_API_KEY" not in os.environ:
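
The only import added is DoclingLoader from the langchain-docling integration package. As a minimal sketch of what the loader does on its own (using the URL this module already scrapes), it takes a path or URL and returns LangChain Document objects:

    from langchain_docling import DoclingLoader

    # Docling accepts local file paths as well as URLs.
    loader = DoclingLoader("https://nct.neu.edu.vn/")
    docs = loader.load()  # list of LangChain Document objects
    print(docs[0].page_content[:200])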

@@ -94,11 +96,11 @@ def get_web_documents(base_urls=['https://nct.neu.edu.vn/']):
     docs = []
     for page_url in base_urls:  # Check that the urls variable was defined beforehand
         try:
-            loader =
-            html2text = Html2TextTransformer()
+            loader = DoclingLoader(page_url)
+            # html2text = Html2TextTransformer()
 
             html = loader.load()
-            doc = html2text.transform_documents(html)
+            doc = html  # html2text.transform_documents(html)
             docs.extend(doc)
         except Exception as e:
             print(f"Error loading {page_url}: {e}")
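
Since DoclingLoader already returns text-bearing Documents, the Html2TextTransformer pass becomes unnecessary and is commented out. Putting the hunk together, the loop now amounts to the sketch below (the final return docs is assumed, since the hunk does not show the end of the function):

    from langchain_docling import DoclingLoader

    def get_web_documents(base_urls=['https://nct.neu.edu.vn/']):
        """Fetch each URL with Docling and collect the resulting Documents."""
        docs = []
        for page_url in base_urls:
            try:
                loader = DoclingLoader(page_url)
                docs.extend(loader.load())  # no HTML-to-text pass needed
            except Exception as e:
                print(f"Error loading {page_url}: {e}")
        return docs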

@@ -132,8 +134,8 @@ def remove_tables_from_docx(file_path):
 
 def load_text_data(file_path):
     """Load the text content from a DOCX file (tables removed)."""
-    cleaned_file = remove_tables_from_docx(file_path)
-    return
+    cleaned_file = Document(file_path)  # remove_tables_from_docx(file_path)
+    return DoclingLoader(cleaned_file).load()
 
 
 def extract_tables_from_docx(file_path):
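
One caveat with the new body: if Document here is python-docx's Document class, then cleaned_file is an in-memory object, while DoclingLoader expects a path or URL, so this call would likely fail at load time. A sketch that avoids the mismatch by handing Docling the original path directly (skipping table removal, as the commit already does):

    from langchain_docling import DoclingLoader

    def load_text_data(file_path):
        """Load the text content from a DOCX file via Docling."""
        # Pass the path itself; DoclingLoader does not take python-docx objects.
        return DoclingLoader(file_path).load()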

@@ -210,17 +212,17 @@ def load_table_data(file_path, output_json_path):
     return table_data
 
 def get_splits(file_path, output_json_path):
-    table_data = load_table_data(file_path, output_json_path)
-    text_data = load_text_data(file_path)
+    # table_data = load_table_data(file_path, output_json_path)
+    # text_data = load_text_data(file_path)
 
     # Split the text into chunks
-    json_splitter = RecursiveJsonSplitter(max_chunk_size = 1000)
+    # json_splitter = RecursiveJsonSplitter(max_chunk_size = 1000)
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=250)
 
-    table_splits = json_splitter.create_documents(texts=[table_data])
+    # table_splits = json_splitter.create_documents(texts=[table_data])
     text_splits = text_splitter.split_documents(text_data)
-    all_splits = table_splits + text_splits
-    return
+    # all_splits = table_splits + text_splits
+    return text_splits
 
 def get_json_splits_only(file_path):
     table_data = load_json_manually(file_path)
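
Note that this hunk comments out text_data = load_text_data(file_path) while the surviving split_documents(text_data) call still references it, so get_splits as committed would raise a NameError. A sketch that keeps that one assignment (the splitter import is assumed to come from langchain_text_splitters):

    from langchain_text_splitters import RecursiveCharacterTextSplitter

    def get_splits(file_path, output_json_path):
        """Split the document text into overlapping chunks (table path disabled)."""
        text_data = load_text_data(file_path)  # still needed by split_documents
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=250)
        return text_splitter.split_documents(text_data)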