quoc-khanh committed on
Commit
b65f639
·
verified ·
1 Parent(s): c09b200

Update helpers.py

Browse files
Files changed (1) hide show
  1. helpers.py +12 -7
helpers.py CHANGED
@@ -94,9 +94,12 @@ key = "AIzaSyDJ4vIKuIBIPNHATLxnoHlagXWbsAz-vRs"
94
  def get_web_documents(base_urls=['https://nct.neu.edu.vn/']):
95
  """Tải nội dung từ danh sách URL với thanh tiến trình"""
96
  docs = []
 
97
  for page_url in tqdm(base_urls, desc="Đang tải trang", unit="url"):
98
  try:
99
- loader = WebBaseLoader(page_url)
 
 
100
  html = loader.load()
101
  doc = html
102
  docs.extend(doc)
@@ -134,7 +137,9 @@ def remove_tables_from_docx(file_path):
134
  def load_text_data(file_path):
135
  """Tải nội dung văn bản từ file DOCX (đã loại bảng)."""
136
  # cleaned_file = Document(file_path) #remove_tables_from_docx(file_path)
137
- return DoclingLoader(file_path).load()
 
 
138
 
139
 
140
  def extract_tables_from_docx(file_path):
@@ -216,12 +221,12 @@ def get_splits(file_path, output_json_path):
216
 
217
  # Chia nhỏ văn bản
218
  # json_splitter = RecursiveJsonSplitter(max_chunk_size = 1000)
219
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=250)
220
 
221
  # table_splits = json_splitter.create_documents(texts=[table_data])
222
- text_splits = text_splitter.split_documents(text_data)
223
  # all_splits = table_splits + text_splits DoclingLoader
224
- return text_splits
225
 
226
  def get_json_splits_only(file_path):
227
  table_data = load_json_manually(file_path)
@@ -238,8 +243,8 @@ def get_json_splits_only(file_path):
238
  cleaned_data = remove_accents(table_data)
239
  wrapped_data = {"data": cleaned_data} if isinstance(cleaned_data, list) else cleaned_data
240
 
241
- json_splitter = RecursiveJsonSplitter(max_chunk_size = 512)
242
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=250)
243
 
244
  table_splits = json_splitter.create_documents(texts=[wrapped_data])
245
  table_splits = text_splitter.split_documents(table_splits)
 
94
  def get_web_documents(base_urls=['https://nct.neu.edu.vn/']):
95
  """Tải nội dung từ danh sách URL với thanh tiến trình"""
96
  docs = []
97
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=100)
98
  for page_url in tqdm(base_urls, desc="Đang tải trang", unit="url"):
99
  try:
100
+ # loader = WebBaseLoader(page_url)
101
+ loader = DoclingLoader(file_path=page_url,chunker=text_splitter # This will break your doc into manageable pieces.
102
+ )
103
  html = loader.load()
104
  doc = html
105
  docs.extend(doc)
 
137
  def load_text_data(file_path):
138
  """Tải nội dung văn bản từ file DOCX (đã loại bảng)."""
139
  # cleaned_file = Document(file_path) #remove_tables_from_docx(file_path)
140
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=100)
141
+ return DoclingLoader(file_path=file_path, chunker=text_splitter # This will break your doc into manageable pieces.
142
+ ).load()
143
 
144
 
145
  def extract_tables_from_docx(file_path):
 
221
 
222
  # Chia nhỏ văn bản
223
  # json_splitter = RecursiveJsonSplitter(max_chunk_size = 1000)
224
+ # text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=250)
225
 
226
  # table_splits = json_splitter.create_documents(texts=[table_data])
227
+ # text_splits = text_splitter.split_documents(text_data)
228
  # all_splits = table_splits + text_splits DoclingLoader
229
+ return text_data #text_splits
230
 
231
  def get_json_splits_only(file_path):
232
  table_data = load_json_manually(file_path)
 
243
  cleaned_data = remove_accents(table_data)
244
  wrapped_data = {"data": cleaned_data} if isinstance(cleaned_data, list) else cleaned_data
245
 
246
+ json_splitter = RecursiveJsonSplitter(max_chunk_size = 2000)
247
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=100)
248
 
249
  table_splits = json_splitter.create_documents(texts=[wrapped_data])
250
  table_splits = text_splitter.split_documents(table_splits)