quoc-khanh committed on
Commit
7baa217
·
verified ·
1 Parent(s): 3dd246e

Update file_loader.py

Browse files
Files changed (1) hide show
  1. file_loader.py +19 -5
file_loader.py CHANGED
@@ -63,13 +63,27 @@ def get_vectorstore():
63
  folder_path = "syllabus_nct_docx_format_K66/" #'/content/chatbot4nct_test2/syllabus_nct_word_format'
64
  docx_files = list_docx_files(folder_path)
65
  all_splits = [] # Khởi tạo danh sách lưu kết quả
 
66
  for i, file_path in enumerate(tqdm(docx_files, desc="Đang xử lý", unit="file")):
 
67
  output_json_path = f"output_{i}.json"
68
- metadata = define_metadata(file_path)
69
- splits = get_splits(file_path, output_json_path)
70
- splits_with_metadata = update_documents_metadata(splits, metadata)
71
- all_splits += splits_with_metadata
72
- # if i == 1: break
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
  FAQ_path = "syllabus_nct_word_format/FAQ.json"
75
  FAQ_splits = get_json_splits_only(FAQ_path)
 
63
  folder_path = "syllabus_nct_docx_format_K66/" #'/content/chatbot4nct_test2/syllabus_nct_word_format'
64
  docx_files = list_docx_files(folder_path)
65
  all_splits = [] # Khởi tạo danh sách lưu kết quả
66
+ error_files = [] # List to store files that caused errors
67
  for i, file_path in enumerate(tqdm(docx_files, desc="Đang xử lý", unit="file")):
68
+ # print(f'Reading {file_path}')
69
  output_json_path = f"output_{i}.json"
70
+
71
+ try:
72
+ metadata = define_metadata(file_path)
73
+ splits = get_splits(file_path, output_json_path)
74
+ splits_with_metadata = update_documents_metadata(splits, metadata)
75
+ all_splits += splits_with_metadata
76
+ except Exception as e:
77
+ print(f"Error processing {file_path}: {e}")
78
+ error_files.append(file_path) # Store the problematic file
79
+
80
+ # Print all files that caused errors
81
+ if error_files:
82
+ print("\nFiles that caused errors:")
83
+ for file in error_files:
84
+ print(file)
85
+ else:
86
+ print("\nAll files processed successfully!")
87
 
88
  FAQ_path = "syllabus_nct_word_format/FAQ.json"
89
  FAQ_splits = get_json_splits_only(FAQ_path)