Spaces:
Sleeping
Sleeping
Update helpers.py
Browse files
- helpers.py +3 -1
helpers.py
CHANGED
@@ -147,9 +147,11 @@ def get_web_documents(base_urls=['https://nct.neu.edu.vn/']):
|
|
147 |
|
148 |
def load_text_data(file_path):
|
149 |
"""Load text content from a DOCX file (tables removed)."""
|
|
|
150 |
loader = DoclingLoader(
|
151 |
file_path=file_path,
|
152 |
-
export_type=ExportType.
|
|
|
153 |
)
|
154 |
return loader.load()
|
155 |
|
|
|
147 |
|
148 |
def load_text_data(file_path):
|
149 |
"""Load text content from a DOCX file (tables removed)."""
|
150 |
+
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=100)
|
151 |
loader = DoclingLoader(
|
152 |
file_path=file_path,
|
153 |
+
export_type=ExportType.MARKDOWN, # Enable internal chunking,
|
154 |
+
chunker = text_splitter
|
155 |
)
|
156 |
return loader.load()
|
157 |
|