quoc-khanh committed on
Commit
7ecbb5e
·
verified ·
1 Parent(s): c927341

Update file_loader.py

Browse files
Files changed (1) hide show
  1. file_loader.py +9 -9
file_loader.py CHANGED
@@ -20,17 +20,17 @@ def get_vectorstore():
20
  docx_files = list_docx_files(folder_path)
21
 
22
  all_splits = [] # Khởi tạo danh sách lưu kết quả
23
- print("Feeding relevent websites' contents")
24
- #
25
- with open('syllabus_nct_word_format/urls.txt', 'r') as f:
26
- base_urls = [line.strip() for line in f]
27
- # urls_list
28
- # base_urls =['https://fda.neu.edu.vn/hoi-nghi-khoa-hoc-cong-nghe-dai-hoc-kinh-te-quoc-dan-nam-2025/']
29
- # ['https://nct.neu.edu.vn/', 'https://fsf.neu.edu.vn/', 'https://mfe.neu.edu.vn/', 'https://mis.neu.edu.vn/', 'https://fda.neu.edu.vn/', 'https://khoathongke.neu.edu.vn/', 'https://fit.neu.edu.vn/']
30
 
31
 
32
- website_contents = get_web_documents(base_urls=base_urls)
33
- all_splits += website_contents
34
 
35
  print('Feeding .docx files')
36
  for i, file_path in enumerate(tqdm(docx_files, desc="Đang xử lý", unit="file")):
 
20
  docx_files = list_docx_files(folder_path)
21
 
22
  all_splits = [] # Khởi tạo danh sách lưu kết quả
23
+ # print("Feeding relevent websites' contents")
24
+ # #
25
+ # with open('syllabus_nct_word_format/urls.txt', 'r') as f:
26
+ # base_urls = [line.strip() for line in f]
27
+ # # urls_list
28
+ # # base_urls =['https://fda.neu.edu.vn/hoi-nghi-khoa-hoc-cong-nghe-dai-hoc-kinh-te-quoc-dan-nam-2025/']
29
+ # # ['https://nct.neu.edu.vn/', 'https://fsf.neu.edu.vn/', 'https://mfe.neu.edu.vn/', 'https://mis.neu.edu.vn/', 'https://fda.neu.edu.vn/', 'https://khoathongke.neu.edu.vn/', 'https://fit.neu.edu.vn/']
30
 
31
 
32
+ # website_contents = get_web_documents(base_urls=base_urls)
33
+ # all_splits += website_contents
34
 
35
  print('Feeding .docx files')
36
  for i, file_path in enumerate(tqdm(docx_files, desc="Đang xử lý", unit="file")):