import os
import json
from tqdm import tqdm
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
# from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Imports from helpers
from helpers import (
    list_docx_files,        # List the .docx files in a folder
    get_splits,             # Split a .docx file into document chunks
    get_json_splits_only,   # Split a JSON (FAQ) file
    get_web_documents,      # Fetch and split web content
    define_metadata,
    update_documents_metadata
)
def get_vectorstore():
    # ### Process all documents and feed them into the database
    # folder_path = "syllabus_nct_word_format/"
    # docx_files = list_docx_files(folder_path)
    # all_splits = []  # Initialize the list that collects all splits
    # # print("Feeding relevant websites' contents")
    # # with open('syllabus_nct_word_format/urls.txt', 'r') as f:
    # #     base_urls = [line.strip() for line in f]
    # # # urls_list
    # # # base_urls = ['https://fda.neu.edu.vn/hoi-nghi-khoa-hoc-cong-nghe-dai-hoc-kinh-te-quoc-dan-nam-2025/']
    # # # ['https://nct.neu.edu.vn/', 'https://fsf.neu.edu.vn/', 'https://mfe.neu.edu.vn/', 'https://mis.neu.edu.vn/', 'https://fda.neu.edu.vn/', 'https://khoathongke.neu.edu.vn/', 'https://fit.neu.edu.vn/']
    # # website_contents = get_web_documents(base_urls=base_urls)
    # # all_splits += website_contents
    # print('Feeding .docx files')
    # for i, file_path in enumerate(tqdm(docx_files, desc="Processing", unit="file")):
    #     output_json_path = f"output_{i}.json"
    #     splits = get_splits(file_path, output_json_path)
    #     all_splits += splits
    # print('Feeding .json files')
    # # Process the FAQ files
    # FAQ_path = "syllabus_nct_word_format/FAQ.json"
    # FAQ_splits = get_json_splits_only(FAQ_path)
    # all_splits += FAQ_splits
    # FAQ_path = "syllabus_nct_word_format/FAQ2.json"
    # FAQ_splits = get_json_splits_only(FAQ_path)
    # all_splits += FAQ_splits
    # # Store in the vectorstore with Google GenAI embeddings
    # # embedding = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
    # print('Get embedding model paraphrase-multilingual-MiniLM-L12-v2')
    # embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
    # print('Set vectorstore FAISS')
    # vectorstore = FAISS.from_documents(documents=all_splits, embedding=embedding)
    # print('Vectorstore ready!')
    # return vectorstore
folder_path = "syllabus_nct_pdf_format/" #'/content/chatbot4nct_test2/syllabus_nct_word_format'
docx_files = list_docx_files(folder_path)
all_splits = [] # Khởi tạo danh sách lưu kết quả
for i, file_path in enumerate(tqdm(docx_files, desc="Đang xử lý", unit="file")):
output_json_path = f"output_{i}.json"
metadata = define_metadata(file_path)
splits = get_splits(file_path, output_json_path)
splits_with_metadata = update_documents_metadata(splits, metadata)
all_splits += splits_with_metadata
# if i == 1: break
FAQ_path = "syllabus_nct_word_format/FAQ.json"
FAQ_splits = get_json_splits_only(FAQ_path)
all_splits += FAQ_splits
FAQ_path = "syllabus_nct_word_format/FAQ2.json"
FAQ_splits = get_json_splits_only(FAQ_path)
all_splits += FAQ_splits
print('Get embedding model /paraphrase-multilingual-MiniLM-L12-v2')
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2") #"VoVanPhuc/sup-SimCSE-VietNamese-phobert-base")
print('Set vectorstore FAISS')
vectorstore = FAISS.from_documents(documents=all_splits, embedding=embedding)
print('Vectorstore ready!')
return vectorstore
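

# Minimal usage sketch (not part of the original source): builds the vectorstore
# and runs a sample similarity search through a retriever. The query string and
# k value are illustrative assumptions only.
if __name__ == "__main__":
    vectorstore = get_vectorstore()
    retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
    docs = retriever.invoke("Chương trình đào tạo gồm những học phần nào?")
    for doc in docs:
        print(doc.metadata, doc.page_content[:100])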