Spaces:
Sleeping
Sleeping
Update file_loader.py
Browse files
- file_loader.py +19 -5
file_loader.py
CHANGED
@@ -63,13 +63,27 @@ def get_vectorstore():
|
|
63 |
folder_path = "syllabus_nct_docx_format_K66/" #'/content/chatbot4nct_test2/syllabus_nct_word_format'
|
64 |
docx_files = list_docx_files(folder_path)
|
65 |
all_splits = [] # Khởi tạo danh sách lưu kết quả
|
|
|
66 |
for i, file_path in enumerate(tqdm(docx_files, desc="Đang xử lý", unit="file")):
|
|
|
67 |
output_json_path = f"output_{i}.json"
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
|
74 |
FAQ_path = "syllabus_nct_word_format/FAQ.json"
|
75 |
FAQ_splits = get_json_splits_only(FAQ_path)
|
|
|
63 |
folder_path = "syllabus_nct_docx_format_K66/" #'/content/chatbot4nct_test2/syllabus_nct_word_format'
|
64 |
docx_files = list_docx_files(folder_path)
|
65 |
all_splits = [] # Khởi tạo danh sách lưu kết quả
|
66 |
+
error_files = [] # List to store files that caused errors
|
67 |
for i, file_path in enumerate(tqdm(docx_files, desc="Đang xử lý", unit="file")):
|
68 |
+
# print(f'Reading {file_path}')
|
69 |
output_json_path = f"output_{i}.json"
|
70 |
+
|
71 |
+
try:
|
72 |
+
metadata = define_metadata(file_path)
|
73 |
+
splits = get_splits(file_path, output_json_path)
|
74 |
+
splits_with_metadata = update_documents_metadata(splits, metadata)
|
75 |
+
all_splits += splits_with_metadata
|
76 |
+
except Exception as e:
|
77 |
+
print(f"Error processing {file_path}: {e}")
|
78 |
+
error_files.append(file_path) # Store the problematic file
|
79 |
+
|
80 |
+
# Print all files that caused errors
|
81 |
+
if error_files:
|
82 |
+
print("\nFiles that caused errors:")
|
83 |
+
for file in error_files:
|
84 |
+
print(file)
|
85 |
+
else:
|
86 |
+
print("\nAll files processed successfully!")
|
87 |
|
88 |
FAQ_path = "syllabus_nct_word_format/FAQ.json"
|
89 |
FAQ_splits = get_json_splits_only(FAQ_path)
|