quoc-khanh committed on
Commit
49ecf68
·
verified ·
1 Parent(s): b65f639

Update helpers.py

Browse files
Files changed (1) hide show
  1. helpers.py +55 -19
helpers.py CHANGED
@@ -16,7 +16,7 @@ import shutil
16
  import requests
17
  from bs4 import BeautifulSoup
18
  import os
19
- from langchain_docling import DoclingLoader
20
 
21
 
22
  # from file_loader import get_vectorstore
@@ -91,24 +91,67 @@ key = "AIzaSyDJ4vIKuIBIPNHATLxnoHlagXWbsAz-vRs"
91
 
92
  # return asyncio.run(_main)
93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  def get_web_documents(base_urls=['https://nct.neu.edu.vn/']):
95
- """Tải nội dung từ danh sách URL với thanh tiến trình"""
96
- docs = []
97
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=100)
98
- for page_url in tqdm(base_urls, desc="Đang tải trang", unit="url"):
99
  try:
100
- # loader = WebBaseLoader(page_url)
101
- loader = DoclingLoader(file_path=page_url,chunker=text_splitter # This will break your doc into manageable pieces.
102
- )
103
- html = loader.load()
104
- doc = html
105
  docs.extend(doc)
106
  except Exception as e:
107
- print(f"Lỗi khi tải {page_url}: {e}")
108
 
109
- print(f"Tải thành công {len(docs)} trang.")
110
  return docs
111
 
 
 
 
 
 
 
 
 
112
 
113
  def log_message(messages, filename="chat_log.txt"):
114
  """Ghi lịch sử tin nhắn vào file log"""
@@ -134,13 +177,6 @@ def remove_tables_from_docx(file_path):
134
 
135
  return temp_path # ✅ Trả về đường dẫn file mới, không làm hỏng file gốc
136
 
137
- def load_text_data(file_path):
138
- """Tải nội dung văn bản từ file DOCX (đã loại bảng)."""
139
- # cleaned_file = Document(file_path) #remove_tables_from_docx(file_path)
140
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=100)
141
- return DoclingLoader(file_path=file_path, chunker=text_splitter # This will break your doc into manageable pieces.
142
- ).load()
143
-
144
 
145
  def extract_tables_from_docx(file_path):
146
  doc = Document(file_path)
 
16
  import requests
17
  from bs4 import BeautifulSoup
18
  import os
19
+ from langchain_docling import DoclingLoader, ExportType
20
 
21
 
22
  # from file_loader import get_vectorstore
 
91
 
92
  # return asyncio.run(_main)
93
 
94
+ # class ChunkerWrapper:
95
+ # def __init__(self, splitter):
96
+ # self.splitter = splitter
97
+
98
+ # def chunk(self, text):
99
+ # # Use the 'split_text' method of the splitter to divide the text
100
+ # return self.splitter.split_text(text)
101
+
102
+ # def get_web_documents(base_urls=['https://nct.neu.edu.vn/']):
103
+ # """Tải nội dung từ danh sách URL với thanh tiến trình"""
104
+ # docs = []
105
+ # text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=100)
106
+ # chunker = ChunkerWrapper(text_splitter)
107
+ # for page_url in tqdm(base_urls, desc="Đang tải trang", unit="url"):
108
+ # try:
109
+ # # loader = WebBaseLoader(page_url)
110
+ # loader = DoclingLoader(file_path=page_url,chunker=chunker # This will break your doc into manageable pieces.
111
+ # )
112
+ # html = loader.load()
113
+ # doc = html
114
+ # docs.extend(doc)
115
+ # except Exception as e:
116
+ # print(f"Lỗi khi tải {page_url}: {e}")
117
+
118
+ # print(f"Tải thành công {len(docs)} trang.")
119
+ # return docs
120
+
121
+ # def load_text_data(file_path):
122
+ # """Tải nội dung văn bản từ file DOCX (đã loại bảng)."""
123
+ # # cleaned_file = Document(file_path) #remove_tables_from_docx(file_path)
124
+ # text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=100)
125
+ # chunker = ChunkerWrapper(text_splitter)
126
+ # return DoclingLoader(file_path=file_path, chunker=chunker # This will break your doc into manageable pieces.
127
+ # ).load()
128
+
129
+
130
def get_web_documents(base_urls=('https://nct.neu.edu.vn/',)):
    """Fetch and chunk content from a list of URLs, with a progress bar.

    Args:
        base_urls: Iterable of page URLs to load. Defaults to the NEU NCT
            homepage. (Default is a tuple, not a list, to avoid the
            mutable-default-argument pitfall; any iterable of URLs works.)

    Returns:
        list: Documents produced by ``DoclingLoader`` with
        ``ExportType.DOC_CHUNKS`` (Docling performs the chunking internally),
        concatenated across all successfully loaded URLs.

    Notes:
        Loading is best-effort: a URL that raises is reported via ``print``
        and skipped, so one bad page does not abort the whole batch.
    """
    docs = []
    for page_url in tqdm(base_urls, desc="Loading page", unit="url"):
        try:
            loader = DoclingLoader(
                file_path=page_url,
                export_type=ExportType.DOC_CHUNKS,  # let Docling chunk internally
            )
            docs.extend(loader.load())
        except Exception as e:
            # Best-effort: report the failure and continue with remaining URLs.
            print(f"Error loading {page_url}: {e}")

    print(f"Successfully loaded {len(docs)} documents.")
    return docs
146
 
147
def load_text_data(file_path):
    """Load the document at *file_path* and return its Docling-chunked docs.

    Uses ``DoclingLoader`` with ``ExportType.DOC_CHUNKS`` so chunking happens
    inside Docling rather than via an external text splitter.

    NOTE(review): the previous docstring said "DOCX file (tables removed)",
    but no table stripping occurs here — the file is loaded as-is. Confirm
    the intended contract with callers.
    """
    loader = DoclingLoader(
        file_path=file_path,
        export_type=ExportType.DOC_CHUNKS,
    )
    return loader.load()
154
+
155
 
156
  def log_message(messages, filename="chat_log.txt"):
157
  """Ghi lịch sử tin nhắn vào file log"""
 
177
 
178
  return temp_path # ✅ Trả về đường dẫn file mới, không làm hỏng file gốc
179
 
 
 
 
 
 
 
 
180
 
181
  def extract_tables_from_docx(file_path):
182
  doc = Document(file_path)