from docx import Document
import json
import datetime
import tempfile
from pathlib import Path
from unidecode import unidecode
from langchain_community.document_loaders import JSONLoader, UnstructuredWordDocumentLoader, WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter, RecursiveJsonSplitter
from langchain_community.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
import google.generativeai as genai
from tqdm import tqdm
import shutil
import requests
from bs4 import BeautifulSoup
import os
# from file_loader import get_vectorstore

# The API key is read from the environment; a real key must never be committed
# to source control.
if "GOOGLE_API_KEY" not in os.environ:
    raise RuntimeError("Set the GOOGLE_API_KEY environment variable before running.")
key = os.environ["GOOGLE_API_KEY"]


###
def get_vectorstore():
    """Process every document in the folder and load it into the vector database."""
    folder_path = "syllabus_nct_word_format/"
    docx_files = list_docx_files(folder_path)

    all_splits = []  # Collects the splits from every file
    for i, file_path in enumerate(tqdm(docx_files, desc="Processing", unit="file")):
        output_json_path = f"output_{i}.json"
        splits = get_splits(file_path, output_json_path)
        all_splits += splits

    # Process the FAQ file
    FAQ_path = "syllabus_nct_word_format/FAQ.json"
    FAQ_splits = get_json_splits_only(FAQ_path)
    all_splits += FAQ_splits

    # Store everything in a vectorstore backed by Google GenAI embeddings
    embedding = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
    vectorstore = FAISS.from_documents(documents=all_splits, embedding=embedding)
    return vectorstore
###


def get_urls_splits(url='https://nct.neu.edu.vn/', char='https://nct.neu.edu.vn/'):
    """Collect every link on the page whose href starts with `char`.

    Note: the body is fully synchronous, so this no longer needs to be async.
    """
    reqs = requests.get(url)
    soup = BeautifulSoup(reqs.text, 'html.parser')

    urls = []
    for link in soup.find_all('a', href=True):  # only anchor tags that carry an 'href'
        href = link.get('href')
        if href.startswith(char):
            urls.append(href)
    return urls
    # docs = []
    # for page_url in urls:
    #     loader = WebBaseLoader(web_paths=[page_url])
    #     async for doc in loader.alazy_load():
    #         docs.append(doc)
    #     assert len(docs) == 1
    # return docs

# Example usage
# nct_urls = get_urls_splits('https://nct.neu.edu.vn/')
# print(nct_urls)
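# --- Optional: persist the FAISS index --------------------------------------
# get_vectorstore() re-embeds every document on each run. Below is a minimal
# sketch of caching the index on disk; the directory name and the helper name
# are illustrative, not part of the original pipeline.
def get_or_build_vectorstore(index_dir="faiss_index"):
    embedding = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
    if Path(index_dir).exists():
        # FAISS.save_local() pickles document metadata, so loading requires an
        # explicit opt-in; only load indexes you created yourself.
        return FAISS.load_local(index_dir, embedding,
                                allow_dangerous_deserialization=True)
    vectorstore = get_vectorstore()
    vectorstore.save_local(index_dir)
    return vectorstore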
def log_message(messages, filename="chat_log.txt"):
    """Append the message history to a log file."""
    with open(filename, "a", encoding="utf-8") as f:
        log_entry = {
            "timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "conversation": messages
        }
        f.write(json.dumps(log_entry, ensure_ascii=False) + "\n")


def remove_tables_from_docx(file_path):
    """Create a copy of the DOCX file with every table removed."""
    doc = Document(file_path)
    new_doc = Document()

    for para in doc.paragraphs:
        new_doc.add_paragraph(para.text)

    # Save to a temporary file, making sure it is closed properly
    with tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as temp_file:
        temp_path = temp_file.name
        new_doc.save(temp_path)

    return temp_path  # Return the path of the copy; the original file is untouched


def load_text_data(file_path):
    """Load the text content of a DOCX file (tables already removed)."""
    cleaned_file = remove_tables_from_docx(file_path)
    return UnstructuredWordDocumentLoader(cleaned_file).load()


def extract_tables_from_docx(file_path):
    doc = Document(file_path)
    tables = []
    # Every non-empty paragraph in the document
    all_paragraphs = [p.text.strip() for p in doc.paragraphs if p.text.strip()]

    table_index = 0
    para_index = 0
    table_positions = []

    # Record where each table sits within the document body
    for element in doc.element.body:
        if element.tag.endswith("tbl"):
            table_positions.append((table_index, para_index))
            table_index += 1
        elif element.tag.endswith("p"):
            para_index += 1

    for idx, (table_idx, para_idx) in enumerate(table_positions):
        data = []
        for row in doc.tables[table_idx].rows:
            data.append([cell.text.strip() for cell in row.cells])

        if len(data) > 1:  # Keep only tables that actually contain data
            # Take up to 5 paragraphs before and after the table as context
            related_start = max(0, para_idx - 5)
            related_end = min(len(all_paragraphs), para_idx + 5)
            related_text = all_paragraphs[related_start:related_end]

            tables.append({"table": idx + 1, "content": data, "related": related_text})

    return tables


def convert_to_json(tables):
    structured_data = {}
    for table in tables:
        headers = [unidecode(h) for h in table["content"][0]]  # Strip accents from the headers
        rows = [[unidecode(cell) for cell in row] for row in table["content"][1:]]  # ...from the data
        json_table = [dict(zip(headers, row)) for row in rows if len(row) == len(headers)]
        related_text = [unidecode(text) for text in table["related"]]  # ...from the related text

        structured_data[table["table"]] = {
            "content": json_table,
            "related": related_text
        }
    return json.dumps(structured_data, indent=4, ensure_ascii=False)


def save_json_to_file(json_data, output_path):
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(json.loads(json_data), f, ensure_ascii=False, indent=4)


# def load_json_with_langchain(json_path):
#     loader = JSONLoader(file_path=json_path, jq_schema='.. | .content?', text_content=False)
#     data = loader.load()
#     # # Check whether the data came through intact
#     # print("Sample Data:", data[:2])  # Print the first two entries
#     return data


def load_json_manually(json_path):
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data


def load_table_data(file_path, output_json_path):
    tables = extract_tables_from_docx(file_path)
    json_output = convert_to_json(tables)
    save_json_to_file(json_output, output_json_path)
    table_data = load_json_manually(output_json_path)
    return table_data


def get_splits(file_path, output_json_path):
    table_data = load_table_data(file_path, output_json_path)
    text_data = load_text_data(file_path)

    # Split the content into chunks
    json_splitter = RecursiveJsonSplitter(max_chunk_size=1000)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=250)

    table_splits = json_splitter.create_documents(texts=[table_data])
    text_splits = text_splitter.split_documents(text_data)

    all_splits = table_splits + text_splits
    return all_splits


def get_json_splits_only(file_path):
    table_data = load_json_manually(file_path)

    def remove_accents(obj):  # Strip Vietnamese accents recursively
        if isinstance(obj, str):
            return unidecode(obj)
        elif isinstance(obj, list):
            return [remove_accents(item) for item in obj]
        elif isinstance(obj, dict):
            return {remove_accents(k): remove_accents(v) for k, v in obj.items()}
        return obj

    cleaned_data = remove_accents(table_data)
    wrapped_data = {"data": cleaned_data} if isinstance(cleaned_data, list) else cleaned_data

    json_splitter = RecursiveJsonSplitter(max_chunk_size=512)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=250)
    table_splits = json_splitter.create_documents(texts=[wrapped_data])
    table_splits = text_splitter.split_documents(table_splits)
    return table_splits


def list_docx_files(folder_path):
    return [str(file) for file in Path(folder_path).rglob("*.docx")]


def prompt_order(queries):
    text = 'IMPORTANT: Here are the questions of the user in order; use them and the context above to give the best answer:\n'
    for i, q in enumerate(queries, start=1):
        text += f'Question {i}: {str(q)}\n'
    return text
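# --- Example: previewing the splits for one file -----------------------------
# A minimal sketch of what get_splits() returns for a single syllabus file;
# the .docx file name below is illustrative, any file in the folder works.
def preview_splits(file_path="syllabus_nct_word_format/example.docx"):
    splits = get_splits(file_path, "output_preview.json")
    print(f"{len(splits)} chunks extracted from {file_path}")
    for doc in splits[:3]:
        print("---")
        print(doc.page_content[:200])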
# Module-level cache so the FAISS index is only built once
vectorstore = None


# Define the augment_prompt function
def augment_prompt(query: str, k: int = 10):
    global vectorstore
    if vectorstore is None:
        vectorstore = get_vectorstore()

    retriever = vectorstore.as_retriever(search_kwargs={"k": k})
    results = retriever.invoke(query)
    if results:
        source_knowledge = "\n\n".join([doc.page_content for doc in results])
        return f"""Using the contexts below, answer the query.

Contexts:
{source_knowledge}
"""
    else:
        return "No relevant context found.\n"


def get_answer(query, queries_list=None):
    if queries_list is None:
        queries_list = []

    messages = [
        {"role": "user", "parts": [{"text": "IMPORTANT: You are a super energetic, helpful, polite, Vietnamese-speaking assistant. If you cannot see the answer in the contexts, try to look it up online yourself, but remember to cite the source."}]},
        {"role": "user", "parts": [{"text": augment_prompt(query)}]}
    ]
    # bonus = '''
    # You can also consult these sources:
    # Official websites: https://neu.edu.vn ; https://daotao.neu.edu.vn
    # Social media pages with admissions information: https://www.facebook.com/ktqdNEU ; https://www.facebook.com/tvtsneu
    # Admissions email: tuvantuyensinh@neu.edu.vn
    # Admissions phone number: 0888.128.558
    # '''

    queries_list.append(query)
    queries = {"role": "user", "parts": [{"text": prompt_order(queries_list)}]}
    messages_with_queries = messages.copy()
    messages_with_queries.append(queries)
    # messages_with_queries.insert(0, queries)

    # Configure the API key
    genai.configure(api_key=key)

    # Initialize the Gemini model
    model = genai.GenerativeModel("gemini-2.0-flash")
    response = model.generate_content(contents=messages_with_queries, stream=True)

    response_text = ""
    for chunk in response:
        response_text += chunk.text
        yield response_text

    messages.append({"role": "model", "parts": [{"text": response_text}]})
    # user_feedback = yield "\nEnter your feedback (or 'q' to quit): "
    # if user_feedback.lower() == "q":
    #     break
    # messages.append({"role": "user", "parts": [{"text": query}]})
    log_message(messages)
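# --- Example usage -----------------------------------------------------------
# get_answer() is a generator that yields the accumulated answer as the model
# streams; below is a minimal sketch of a console loop around it (the prompt
# text is illustrative).
if __name__ == "__main__":
    history = []
    question = input("Question: ")
    final_answer = ""
    for partial in get_answer(question, queries_list=history):
        final_answer = partial  # each yield is the full answer so far
    print(final_answer)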