quoc-khanh committed on
Commit
9e543ad
·
verified ·
1 Parent(s): 6d361ab

Update helpers.py

Browse files
Files changed (1) hide show
  1. helpers.py +1 -1
helpers.py CHANGED
@@ -154,7 +154,7 @@ def get_web_documents(base_urls=['https://nct.neu.edu.vn/']):
154
 
155
  def load_text_data(file_path):
156
  """Load text content from a DOCX file (tables removed)."""
157
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=100)
158
  loader = DoclingLoader(
159
  file_path=file_path,
160
  export_type=ExportType.MARKDOWN, # Enable internal chunking,
 
154
 
155
  def load_text_data(file_path):
156
  """Load text content from a DOCX file (tables removed)."""
157
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=1000)
158
  loader = DoclingLoader(
159
  file_path=file_path,
160
  export_type=ExportType.MARKDOWN, # Enable internal chunking,