quoc-khanh commited on
Commit
53833ab
·
verified ·
1 Parent(s): 56d2620

Update helpers.py

Browse files
Files changed (1) hide show
  1. helpers.py +13 -11
helpers.py CHANGED
@@ -16,6 +16,8 @@ import shutil
16
  import requests
17
  from bs4 import BeautifulSoup
18
  import os
 
 
19
 
20
  # from file_loader import get_vectorstore
21
  if "GOOGLE_API_KEY" not in os.environ:
@@ -94,11 +96,11 @@ def get_web_documents(base_urls=['https://nct.neu.edu.vn/']):
94
  docs = []
95
  for page_url in base_urls: # Kiểm tra biến urls có được định nghĩa trước đó không
96
  try:
97
- loader = AsyncHtmlLoader(page_url)
98
- html2text = Html2TextTransformer()
99
 
100
  html = loader.load()
101
- doc = html2text.transform_documents(html)
102
  docs.extend(doc)
103
  except Exception as e:
104
  print(f"Lỗi khi tải {page_url}: {e}")
@@ -132,8 +134,8 @@ def remove_tables_from_docx(file_path):
132
 
133
  def load_text_data(file_path):
134
  """Tải nội dung văn bản từ file DOCX (đã loại bảng)."""
135
- cleaned_file = remove_tables_from_docx(file_path)
136
- return UnstructuredWordDocumentLoader(cleaned_file).load()
137
 
138
 
139
  def extract_tables_from_docx(file_path):
@@ -210,17 +212,17 @@ def load_table_data(file_path, output_json_path):
210
  return table_data
211
 
212
  def get_splits(file_path, output_json_path):
213
- table_data = load_table_data(file_path, output_json_path)
214
- text_data = load_text_data(file_path)
215
 
216
  # Chia nhỏ văn bản
217
- json_splitter = RecursiveJsonSplitter(max_chunk_size = 1000)
218
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=250)
219
 
220
- table_splits = json_splitter.create_documents(texts=[table_data])
221
  text_splits = text_splitter.split_documents(text_data)
222
- all_splits = table_splits + text_splits
223
- return all_splits
224
 
225
  def get_json_splits_only(file_path):
226
  table_data = load_json_manually(file_path)
 
16
  import requests
17
  from bs4 import BeautifulSoup
18
  import os
19
+ from langchain_docling import DoclingLoader
20
+
21
 
22
  # from file_loader import get_vectorstore
23
  if "GOOGLE_API_KEY" not in os.environ:
 
96
  docs = []
97
  for page_url in base_urls: # Kiểm tra biến urls có được định nghĩa trước đó không
98
  try:
99
+ loader = DoclingLoader(page_url)
100
+ # html2text = Html2TextTransformer()
101
 
102
  html = loader.load()
103
+ doc = html#html2text.transform_documents(html)
104
  docs.extend(doc)
105
  except Exception as e:
106
  print(f"Lỗi khi tải {page_url}: {e}")
 
134
 
135
def load_text_data(file_path):
    """Load text content from a DOCX file.

    Builds a python-docx ``Document`` from *file_path* and hands it to
    ``DoclingLoader``, returning the loaded documents.

    NOTE(review): table stripping via ``remove_tables_from_docx`` was
    disabled in this revision — tables are now included in the output.
    NOTE(review): ``DoclingLoader`` is usually given a file path or URL;
    confirm it actually accepts a python-docx ``Document`` object.
    """
    cleaned_file = Document(file_path)  # previously: remove_tables_from_docx(file_path)
    return DoclingLoader(cleaned_file).load()
139
 
140
 
141
  def extract_tables_from_docx(file_path):
 
212
  return table_data
213
 
214
def get_splits(file_path, output_json_path):
    """Load the text of a DOCX file and split it into overlapping chunks.

    Parameters
    ----------
    file_path : path to the source DOCX file.
    output_json_path : kept for backward compatibility with existing
        callers; unused since table extraction was disabled in this
        revision.

    Returns
    -------
    list of chunk documents produced by RecursiveCharacterTextSplitter
    (chunk_size=1000, chunk_overlap=250).
    """
    # BUG FIX: the previous revision commented out this assignment but the
    # split call below still referenced `text_data`, raising NameError on
    # every call. Restore the text loading step.
    text_data = load_text_data(file_path)

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=250)
    text_splits = text_splitter.split_documents(text_data)
    return text_splits
226
 
227
  def get_json_splits_only(file_path):
228
  table_data = load_json_manually(file_path)