import os
import json

from tqdm import tqdm
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
# from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Imports from helpers
from helpers import (
    list_docx_files,       # list the .docx files in a folder
    get_splits,            # split a .docx file into document chunks
    get_json_splits_only,  # split a JSON (FAQ) file into document chunks
    get_web_documents,     # load documents from web pages
    define_metadata,       # build metadata for a source file
    update_documents_metadata,  # attach that metadata to each split
)

def get_vectorstore():
    # ### Earlier version (kept commented out): process every document and load it into the database
    # folder_path = "syllabus_nct_word_format/"
    # docx_files = list_docx_files(folder_path)

    # all_splits = []  # list that accumulates every split
    # # print("Feeding relevant websites' contents")
    # # with open('syllabus_nct_word_format/urls.txt', 'r') as f:
    # #     base_urls = [line.strip() for line in f]
    # # # base_urls = ['https://fda.neu.edu.vn/hoi-nghi-khoa-hoc-cong-nghe-dai-hoc-kinh-te-quoc-dan-nam-2025/']
    # # # base_urls = ['https://nct.neu.edu.vn/', 'https://fsf.neu.edu.vn/', 'https://mfe.neu.edu.vn/', 'https://mis.neu.edu.vn/', 'https://fda.neu.edu.vn/', 'https://khoathongke.neu.edu.vn/', 'https://fit.neu.edu.vn/']
    # # website_contents = get_web_documents(base_urls=base_urls)
    # # all_splits += website_contents

    # print('Feeding .docx files')
    # for i, file_path in enumerate(tqdm(docx_files, desc="Processing", unit="file")):
    #     output_json_path = f"output_{i}.json"
    #     splits = get_splits(file_path, output_json_path)
    #     all_splits += splits

    # print('Feeding .json files')
    # # Process the FAQ files
    # FAQ_path = "syllabus_nct_word_format/FAQ.json"
    # FAQ_splits = get_json_splits_only(FAQ_path)
    # all_splits += FAQ_splits

    # FAQ_path = "syllabus_nct_word_format/FAQ2.json"
    # FAQ_splits = get_json_splits_only(FAQ_path)
    # all_splits += FAQ_splits

    # # Store everything in a vectorstore; Google GenAI embeddings were an earlier option
    # # embedding = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
    # print('Get embedding model paraphrase-multilingual-MiniLM-L12-v2')
    # embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
    # print('Set vectorstore FAISS')
    # vectorstore = FAISS.from_documents(documents=all_splits, embedding=embedding)
    # print('Vectorstore ready!')
    # return vectorstore

    folder_path = "syllabus_nct_pdf_format/"  # '/content/chatbot4nct_test2/syllabus_nct_word_format'
    docx_files = list_docx_files(folder_path)
    all_splits = []  # list that accumulates every split
    for i, file_path in enumerate(tqdm(docx_files, desc="Processing", unit="file")):
        output_json_path = f"output_{i}.json"
        metadata = define_metadata(file_path)  # metadata derived from the source file
        splits = get_splits(file_path, output_json_path)
        splits_with_metadata = update_documents_metadata(splits, metadata)
        all_splits += splits_with_metadata
        # if i == 1: break  # debug: stop after the first two files
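
    # Note (added for clarity): at this point each entry in all_splits is a
    # LangChain Document tagged with the metadata returned by define_metadata
    # for its source file; the exact fields depend on the helpers implementation.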

    FAQ_path = "syllabus_nct_word_format/FAQ.json"
    FAQ_splits = get_json_splits_only(FAQ_path)
    all_splits += FAQ_splits

    FAQ_path = "syllabus_nct_word_format/FAQ2.json"
    FAQ_splits = get_json_splits_only(FAQ_path)
    all_splits += FAQ_splits
    
    print('Get embedding model paraphrase-multilingual-MiniLM-L12-v2')
    embedding = HuggingFaceEmbeddings(
        model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
        # alternative: "VoVanPhuc/sup-SimCSE-VietNamese-phobert-base"
    )
    print('Set vectorstore FAISS')
    vectorstore = FAISS.from_documents(documents=all_splits, embedding=embedding)
    
    print('Vectorstore ready!')
    return vectorstore
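
# Usage sketch (added for illustration; not part of the original module). It
# assumes the standard LangChain FAISS API: build the index, persist it with
# save_local, and run a quick similarity search. The index path "faiss_index"
# and the query string below are illustrative assumptions, not values taken
# from this repository.
if __name__ == "__main__":
    vs = get_vectorstore()
    vs.save_local("faiss_index")  # assumed location for the persisted index
    # illustrative query ("training program"); the corpus is Vietnamese
    for doc in vs.similarity_search("chương trình đào tạo", k=3):
        print(doc.metadata.get("source", "?"), "-", doc.page_content[:80])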