quoc-khanh committed on
Commit
82258a9
·
verified ·
1 Parent(s): 7382bd4

Update helpers.py

Browse files
Files changed (1) hide show
  1. helpers.py +3 -1
helpers.py CHANGED
@@ -147,9 +147,11 @@ def get_web_documents(base_urls=['https://nct.neu.edu.vn/']):
147
 
148
  def load_text_data(file_path):
149
  """Load text content from a DOCX file (tables removed)."""
 
150
  loader = DoclingLoader(
151
  file_path=file_path,
152
- export_type=ExportType.DOC_CHUNKS # Enable internal chunking
 
153
  )
154
  return loader.load()
155
 
 
147
 
148
  def load_text_data(file_path):
149
  """Load text content from a DOCX file (tables removed)."""
150
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=100)
151
  loader = DoclingLoader(
152
  file_path=file_path,
153
+ export_type=ExportType.MARKDOWN, # Enable internal chunking,
154
+ chunker = text_splitter
155
  )
156
  return loader.load()
157