quoc-khanh committed on
Commit
b618a75
·
verified ·
1 Parent(s): 333458b

Update helpers.py

Browse files
Files changed (1) hide show
  1. helpers.py +27 -1
helpers.py CHANGED
@@ -15,7 +15,33 @@ import shutil
15
  import requests
16
  from bs4 import BeautifulSoup
17
 
18
- from file_loader import get_vectorstore
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  async def get_urls_splits(url='https://nct.neu.edu.vn/', char='https://nct.neu.edu.vn/'):
21
  reqs = requests.get(url)
 
15
  import requests
16
  from bs4 import BeautifulSoup
17
 
18
+ # from file_loader import get_vectorstore
19
+
20
+ ###
21
+
22
def get_vectorstore(
    folder_path="syllabus_nct_word_format/",
    faq_path="syllabus_nct_word_format/FAQ.json",
):
    """Build a FAISS vector store from the syllabus documents and the FAQ file.

    Every file returned by ``list_docx_files(folder_path)`` is passed through
    ``get_splits``; the splits from ``get_json_splits_only(faq_path)`` are
    appended afterwards, and the combined list is embedded with the Google
    Generative AI ``models/text-embedding-004`` model and indexed via
    ``FAISS.from_documents``.

    Args:
        folder_path: Folder handed to ``list_docx_files``. Defaults to the
            location that was previously hard-coded.
        faq_path: FAQ JSON file handed to ``get_json_splits_only``. Defaults
            to the location that was previously hard-coded.

    Returns:
        The vector store object returned by ``FAISS.from_documents``.
    """
    # Process every document and collect its splits for the database.
    docx_files = list_docx_files(folder_path)

    all_splits = []  # Accumulates the splits of every processed source.
    for i, file_path in enumerate(tqdm(docx_files, desc="Đang xử lý", unit="file")):
        # One output path per input file so successive files do not share a
        # name. NOTE(review): presumably get_splits writes an intermediate
        # JSON there, relative to the current working directory -- confirm.
        output_json_path = f"output_{i}.json"
        all_splits.extend(get_splits(file_path, output_json_path))

    # Process the FAQ file; its splits go after the document splits.
    all_splits.extend(get_json_splits_only(faq_path))

    # Store everything in the vector store using Google GenAI embeddings.
    embedding = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
    vectorstore = FAISS.from_documents(documents=all_splits, embedding=embedding)

    return vectorstore
43
+
44
+ ###
45
 
46
  async def get_urls_splits(url='https://nct.neu.edu.vn/', char='https://nct.neu.edu.vn/'):
47
  reqs = requests.get(url)