Update helpers.py

helpers.py CHANGED (+12 -7)
@@ -94,9 +94,12 @@ key = "AIzaSyDJ4vIKuIBIPNHATLxnoHlagXWbsAz-vRs"
 def get_web_documents(base_urls=['https://nct.neu.edu.vn/']):
     """Load content from a list of URLs, with a progress bar"""
     docs = []
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=100)
     for page_url in tqdm(base_urls, desc="Loading pages", unit="url"):
         try:
-            loader = WebBaseLoader(page_url)
+            # loader = WebBaseLoader(page_url)
+            loader = DoclingLoader(file_path=page_url, chunker=text_splitter  # This will break your doc into manageable pieces.
+            )
             html = loader.load()
             doc = html
             docs.extend(doc)
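Review note: in the langchain-docling integration, DoclingLoader's chunker parameter is documented to take a Docling chunker such as HybridChunker, not a LangChain text splitter, so passing RecursiveCharacterTextSplitter here may not behave as the inline comment suggests. A minimal sketch of the documented usage, assuming the langchain-docling and docling packages are installed:

from langchain_docling import DoclingLoader
from docling.chunking import HybridChunker

def load_chunked(url):
    # DoclingLoader accepts URLs as well as local paths; HybridChunker is
    # Docling's own chunker, and load() yields one Document per chunk.
    loader = DoclingLoader(file_path=url, chunker=HybridChunker())
    return loader.load()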
@@ -134,7 +137,9 @@ def remove_tables_from_docx(file_path):
 def load_text_data(file_path):
     """Load text content from a DOCX file (tables removed)."""
     # cleaned_file = Document(file_path) #remove_tables_from_docx(file_path)
-
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=100)
+    return DoclingLoader(file_path=file_path, chunker=text_splitter  # This will break your doc into manageable pieces.
+    ).load()
 
 
 def extract_tables_from_docx(file_path):
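load_text_data now delegates both parsing and chunking to DoclingLoader. A hypothetical call (the file name is invented for illustration):

docs = load_text_data("quy_che_dao_tao.docx")  # hypothetical DOCX path
print(len(docs), docs[0].page_content[:200])   # number of chunks and a preview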
@@ -216,12 +221,12 @@ def get_splits(file_path, output_json_path):
 
     # Split the text into chunks
     # json_splitter = RecursiveJsonSplitter(max_chunk_size = 1000)
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=250)
+    # text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=250)
 
     # table_splits = json_splitter.create_documents(texts=[table_data])
-    text_splits = text_splitter.split_documents(text_data)
+    # text_splits = text_splitter.split_documents(text_data)
     # all_splits = table_splits + text_splits DoclingLoader
-    return text_splits
+    return text_data #text_splits
 
 def get_json_splits_only(file_path):
     table_data = load_json_manually(file_path)
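With the splitter calls commented out, get_splits now returns text_data unchanged, presumably because chunking already happens inside DoclingLoader in load_text_data. For reference, a sketch of what the retired splitter did, assuming text_data is a list of LangChain Document objects:

from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=250)
splits = splitter.split_documents(text_data)  # chunks of up to 1000 chars, overlapping by 250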
@@ -238,8 +243,8 @@ def get_json_splits_only(file_path):
     cleaned_data = remove_accents(table_data)
     wrapped_data = {"data": cleaned_data} if isinstance(cleaned_data, list) else cleaned_data
 
-    json_splitter = RecursiveJsonSplitter(max_chunk_size =
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=
+    json_splitter = RecursiveJsonSplitter(max_chunk_size = 2000)
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=100)
 
     table_splits = json_splitter.create_documents(texts=[wrapped_data])
     table_splits = text_splitter.split_documents(table_splits)
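A self-contained sketch of the two-stage split above, with a toy payload standing in for wrapped_data (sample data invented for illustration): the JSON splitter makes a structure-aware first pass, then the character splitter re-splits the resulting documents down to retrieval-sized chunks.

from langchain_text_splitters import RecursiveCharacterTextSplitter, RecursiveJsonSplitter

wrapped_data = {"data": [{"course": f"C{i}", "credits": i % 5} for i in range(200)]}

json_splitter = RecursiveJsonSplitter(max_chunk_size=2000)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=100)

table_splits = json_splitter.create_documents(texts=[wrapped_data])  # JSON chunks of <= 2000 chars
table_splits = text_splitter.split_documents(table_splits)           # re-split to <= 512 chars
print(len(table_splits), "chunks")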
|
|