import os
import json

from tqdm import tqdm
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
# from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Imports from helpers
from helpers import (
    list_docx_files,           # Get the list of .docx files in a folder
    get_splits,                # Split a .docx file into document chunks
    get_json_splits_only,      # Process a JSON file (FAQ)
    get_web_documents,         # Process content fetched from the web
    define_metadata,
    update_documents_metadata
)

def get_vectorstore():
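    """Build and return a FAISS vectorstore over the syllabus corpus.

    Reads every .docx file in the configured folder, attaches per-file
    metadata to the resulting chunks, appends the FAQ JSON files, embeds
    everything with a multilingual sentence-transformers model, and indexes
    the chunks in FAISS.
    """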
    # --- Previous version (kept for reference): also ingested web pages, without per-file metadata ---
    # ### Process all documents and push them into the database
    # folder_path = "syllabus_nct_word_format/"
    # docx_files = list_docx_files(folder_path)

    # all_splits = []  # Initialize the list that collects all splits
    # # print("Feeding relevant websites' contents")
    # # with open('syllabus_nct_word_format/urls.txt', 'r') as f:
    # #     base_urls = [line.strip() for line in f]
    # # # base_urls = ['https://fda.neu.edu.vn/hoi-nghi-khoa-hoc-cong-nghe-dai-hoc-kinh-te-quoc-dan-nam-2025/']
    # # # ['https://nct.neu.edu.vn/', 'https://fsf.neu.edu.vn/', 'https://mfe.neu.edu.vn/', 'https://mis.neu.edu.vn/', 'https://fda.neu.edu.vn/', 'https://khoathongke.neu.edu.vn/', 'https://fit.neu.edu.vn/']
    # # website_contents = get_web_documents(base_urls=base_urls)
    # # all_splits += website_contents

    # print('Feeding .docx files')
    # for i, file_path in enumerate(tqdm(docx_files, desc="Processing", unit="file")):
    #     output_json_path = f"output_{i}.json"
    #     splits = get_splits(file_path, output_json_path)
    #     all_splits += splits

    # print('Feeding .json files')
    # # Process the FAQ files
    # FAQ_path = "syllabus_nct_word_format/FAQ.json"
    # FAQ_splits = get_json_splits_only(FAQ_path)
    # all_splits += FAQ_splits

    # FAQ_path = "syllabus_nct_word_format/FAQ2.json"
    # FAQ_splits = get_json_splits_only(FAQ_path)
    # all_splits += FAQ_splits

    # # Build the vectorstore (Google GenAI embeddings were an earlier option)
    # # embedding = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
    # print('Get embedding model /paraphrase-multilingual-MiniLM-L12-v2')
    # embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
    # print('Set vectorstore FAISS')
    # vectorstore = FAISS.from_documents(documents=all_splits, embedding=embedding)
    # print('Vectorstore ready!')
    # return vectorstore

    folder_path = "syllabus_nct_docx_format_K66/"  # e.g. '/content/chatbot4nct_test2/syllabus_nct_word_format' when running on Colab
    # 'syllabus_nct_word_format/Trường Công nghệ/Chương trình An toàn thông tin'
    docx_files = list_docx_files(folder_path)
    all_splits = []   # Collected splits from all sources
    error_files = []  # Files that raised an error during processing
    for i, file_path in enumerate(tqdm(docx_files, desc="Processing", unit="file")):
        # print(f'Reading {file_path}')
        output_json_path = f"output_{i}.json"
        
        try:
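            # Chunk the .docx file, then propagate the metadata that
            # define_metadata derives for this file onto every resulting chunk.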
            metadata = define_metadata(file_path)
            splits = get_splits(file_path, output_json_path)
            splits_with_metadata = update_documents_metadata(splits, metadata)
            all_splits += splits_with_metadata    
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
            error_files.append(file_path)  # Store the problematic file
    
    # Print all files that caused errors
    if error_files:
        print("\nFiles that caused errors:")
        for file in error_files:
            print(file)
    else:
        print("\nAll files processed successfully!")

    FAQ_path = "syllabus_nct_word_format/FAQ.json"
    FAQ_splits = get_json_splits_only(FAQ_path)
    all_splits += FAQ_splits

    FAQ_path = "syllabus_nct_word_format/FAQ2.json"
    FAQ_splits = get_json_splits_only(FAQ_path)
    all_splits += FAQ_splits
    
    print('Get embedding model /paraphrase-multilingual-MiniLM-L12-v2')
    embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")   #"VoVanPhuc/sup-SimCSE-VietNamese-phobert-base")
    print('Set vectorstore FAISS')
    vectorstore = FAISS.from_documents(documents=all_splits, embedding=embedding)
    
    print('Vectorstore ready!')
    return vectorstore
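

if __name__ == "__main__":
    # Minimal usage sketch, assuming the script is run from the repo root with
    # the syllabus folders and FAQ JSON files available locally.
    vectorstore = get_vectorstore()

    # Persist the index so later runs can reload it instead of re-embedding;
    # the "faiss_index" folder name is an arbitrary choice for this sketch.
    vectorstore.save_local("faiss_index")

    # Quick sanity check: retrieve the most similar chunks for a sample query.
    retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
    for doc in retriever.invoke("What courses are in the An toàn thông tin program?"):
        print(doc.metadata, doc.page_content[:200])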