import os
import json
from tqdm import tqdm
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
# from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Imports from helpers
from helpers import (
    list_docx_files,            # List all .docx files in a folder
    get_splits,                 # Split a .docx file into document chunks
    get_json_splits_only,       # Split a JSON (FAQ) file into document chunks
    get_web_documents,          # Load documents from web pages
    define_metadata,            # Build metadata from a file path
    update_documents_metadata   # Attach metadata to document chunks
)
def get_vectorstore():
    # ### Old pipeline: process every document and feed it into the database
    # folder_path = "syllabus_nct_word_format/"
    # docx_files = list_docx_files(folder_path)
    # all_splits = []  # List holding all document chunks
    #
    # # print("Feeding relevant websites' contents")
    # # with open('syllabus_nct_word_format/urls.txt', 'r') as f:
    # #     base_urls = [line.strip() for line in f]
    # # # urls_list
    # # # base_urls = ['https://fda.neu.edu.vn/hoi-nghi-khoa-hoc-cong-nghe-dai-hoc-kinh-te-quoc-dan-nam-2025/']
    # # # ['https://nct.neu.edu.vn/', 'https://fsf.neu.edu.vn/', 'https://mfe.neu.edu.vn/', 'https://mis.neu.edu.vn/', 'https://fda.neu.edu.vn/', 'https://khoathongke.neu.edu.vn/', 'https://fit.neu.edu.vn/']
    # # website_contents = get_web_documents(base_urls=base_urls)
    # # all_splits += website_contents
    #
    # print('Feeding .docx files')
    # for i, file_path in enumerate(tqdm(docx_files, desc="Processing", unit="file")):
    #     output_json_path = f"output_{i}.json"
    #     splits = get_splits(file_path, output_json_path)
    #     all_splits += splits
    #
    # print('Feeding .json files')
    # # Process the FAQ files
    # FAQ_path = "syllabus_nct_word_format/FAQ.json"
    # FAQ_splits = get_json_splits_only(FAQ_path)
    # all_splits += FAQ_splits
    # FAQ_path = "syllabus_nct_word_format/FAQ2.json"
    # FAQ_splits = get_json_splits_only(FAQ_path)
    # all_splits += FAQ_splits
    #
    # # Store in a vectorstore using Google GenAI embeddings
    # # embedding = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
    # print('Get embedding model paraphrase-multilingual-MiniLM-L12-v2')
    # embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
    # print('Set vectorstore FAISS')
    # vectorstore = FAISS.from_documents(documents=all_splits, embedding=embedding)
    # print('Vectorstore ready!')
    # return vectorstore
folder_path = "syllabus_nct_docx_format_K66/" #'/content/chatbot4nct_test2/syllabus_nct_word_format'
# 'syllabus_nct_word_format/Trường Công nghệ/Chương trình An toàn thông tin'
docx_files = list_docx_files(folder_path)
all_splits = [] # Khởi tạo danh sách lưu kết quả
error_files = [] # List to store files that caused errors
for i, file_path in enumerate(tqdm(docx_files, desc="Đang xử lý", unit="file")):
# print(f'Reading {file_path}')
output_json_path = f"output_{i}.json"
try:
metadata = define_metadata(file_path)
splits = get_splits(file_path, output_json_path)
splits_with_metadata = update_documents_metadata(splits, metadata)
all_splits += splits_with_metadata
except Exception as e:
print(f"Error processing {file_path}: {e}")
error_files.append(file_path) # Store the problematic file
# Print all files that caused errors
if error_files:
print("\nFiles that caused errors:")
for file in error_files:
print(file)
else:
print("\nAll files processed successfully!")
FAQ_path = "syllabus_nct_word_format/FAQ.json"
FAQ_splits = get_json_splits_only(FAQ_path)
all_splits += FAQ_splits
FAQ_path = "syllabus_nct_word_format/FAQ2.json"
FAQ_splits = get_json_splits_only(FAQ_path)
all_splits += FAQ_splits
print('Get embedding model /paraphrase-multilingual-MiniLM-L12-v2')
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2") #"VoVanPhuc/sup-SimCSE-VietNamese-phobert-base")
print('Set vectorstore FAISS')
vectorstore = FAISS.from_documents(documents=all_splits, embedding=embedding)
print('Vectorstore ready!')
return vectorstore
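

# A minimal usage sketch, assuming this module is run directly to build and
# persist the index. The "faiss_index" folder name, k=4, and the sample query
# are illustrative and not part of the original file; save_local() and
# as_retriever() are standard LangChain FAISS/VectorStore APIs.
if __name__ == "__main__":
    vectorstore = get_vectorstore()
    # Persist the index so later runs can skip re-embedding
    vectorstore.save_local("faiss_index")
    # Expose the store as a retriever for a RAG chain
    retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
    docs = retriever.invoke("Chương trình An toàn thông tin gồm những học phần nào?")
    for doc in docs:
        print(doc.metadata, doc.page_content[:100])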