Spaces:

quoc-khanh
/

chatbot4nct_test1

Sleeping

App Files Files Community

quoc-khanh committed on Feb 25

Commit

4993b07

verified ·

1 Parent(s): 24c98f4

Create helpers.py

Browse files

Files changed (1) hide show

helpers.py +255 -0

helpers.py ADDED Viewed

	@@ -0,0 +1,255 @@

+from docx import Document
+import json
+import datetime
+import tempfile
+from pathlib import Path
+from unidecode import unidecode
+from langchain.document_loaders import JSONLoader, UnstructuredWordDocumentLoader, WebBaseLoader
+from langchain_text_splitters import RecursiveCharacterTextSplitter, RecursiveJsonSplitter
+from langchain_community.vectorstores import FAISS
+from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
+import google.generativeai as genai
+from tqdm import tqdm
+from pathlib import Path
+import shutil
+import requests
+from bs4 import BeautifulSoup
+async def get_urls_splits(url='https://nct.neu.edu.vn/', char='https://nct.neu.edu.vn/'):
+    reqs = requests.get(url)
+    soup = BeautifulSoup(reqs.text, 'html.parser')
+    urls = []
+    for link in soup.find_all('a', href=True):  # Chỉ lấy thẻ có 'href'
+        href = link.get('href')
+        if href.startswith(char):
+            urls.append(href)
+    return urls
+    # docs = []
+    # for page_url in url:
+    #     loader = WebBaseLoader(web_paths=[page_url])
+    #     async for doc in loader.alazy_load():
+    #         docs.append(doc)
+    #     assert len(docs) == 1
+    #     # doc = docs[0]
+    # return docs
+# Example usage (get_urls_splits is a coroutine, so it must be awaited):
+# nct_urls = await get_urls_splits('https://nct.neu.edu.vn/')
+# print(nct_urls)
+def log_message(messages, filename="chat_log.txt"):
+    """Ghi lịch sử tin nhắn vào file log"""
+    with open(filename, "a", encoding="utf-8") as f:
+        log_entry = {
+            "timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+            "conversation": messages
+        }
+        f.write(json.dumps(log_entry, ensure_ascii=False) + "\n")
+def remove_tables_from_docx(file_path):
+    """Tạo bản sao của file DOCX nhưng loại bỏ tất cả bảng bên trong."""
+    doc = Document(file_path)
+    new_doc = Document()
+    for para in doc.paragraphs:
+        new_doc.add_paragraph(para.text)
+    # 📌 Lưu vào file tạm, đảm bảo đóng đúng cách
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as temp_file:
+        temp_path = temp_file.name
+        new_doc.save(temp_path)
+    return temp_path  # ✅ Trả về đường dẫn file mới, không làm hỏng file gốc
+def load_text_data(file_path):
+    """Tải nội dung văn bản từ file DOCX (đã loại bảng)."""
+    cleaned_file = remove_tables_from_docx(file_path)
+    return UnstructuredWordDocumentLoader(cleaned_file).load()
+def extract_tables_from_docx(file_path):
+    doc = Document(file_path)
+    tables = []
+    all_paragraphs = [p.text.strip() for p in doc.paragraphs if p.text.strip()]  # Lấy tất cả đoạn văn bản không rỗng
+    table_index = 0
+    para_index = 0
+    table_positions = []
+    # Xác định vị trí của bảng trong tài liệu
+    for element in doc.element.body:
+        if element.tag.endswith("tbl"):
+            table_positions.append((table_index, para_index))
+            table_index += 1
+        elif element.tag.endswith("p"):
+            para_index += 1
+    for idx, (table_idx, para_idx) in enumerate(table_positions):
+        data = []
+        for row in doc.tables[table_idx].rows:
+            data.append([cell.text.strip() for cell in row.cells])
+        if len(data) > 1:  # Chỉ lấy bảng có dữ liệu
+            # Lấy 5 dòng trước và sau bảng
+            related_start = max(0, para_idx - 5)
+            related_end = min(len(all_paragraphs), para_idx + 5)
+            related_text = all_paragraphs[related_start:related_end]
+            tables.append({"table": idx + 1, "content": data, "related": related_text})
+    return tables
+def convert_to_json(tables):
+    structured_data = {}
+    for table in tables:
+        headers = [unidecode(h) for h in table["content"][0]]  # Bỏ dấu ở headers
+        rows = [[unidecode(cell) for cell in row] for row in table["content"][1:]]  # Bỏ dấu ở dữ liệu
+        json_table = [dict(zip(headers, row)) for row in rows if len(row) == len(headers)]
+        related_text = [unidecode(text) for text in table["related"]]  # Bỏ dấu ở văn bản liên quan
+        structured_data[table["table"]] = {
+            "content": json_table,
+            "related": related_text
+        }
+    return json.dumps(structured_data, indent=4, ensure_ascii=False)
+def save_json_to_file(json_data, output_path):
+    with open(output_path, 'w', encoding='utf-8') as f:
+        json.dump(json.loads(json_data), f, ensure_ascii=False, indent=4)
+# def load_json_with_langchain(json_path):
+#     loader = JSONLoader(file_path=json_path, jq_schema='.. | .content?', text_content=False)
+#     data = loader.load()
+#     # # Kiểm tra xem dữ liệu có bị lỗi không
+#     # print("Sample Data:", data[:2])  # In thử 2 dòng đầu
+#     return data
+def load_json_manually(json_path):
+    with open(json_path, 'r', encoding='utf-8') as f:
+        data = json.load(f)
+    return data
+def load_table_data(filepath, output_json_path):
+    tables = extract_tables_from_docx(file_path)
+    json_output = convert_to_json(tables)
+    save_json_to_file(json_output, output_json_path)
+    table_data = load_json_manually(output_json_path)
+    return table_data
+def get_splits(file_path, output_json_path):
+    table_data = load_table_data(file_path, output_json_path)
+    text_data = load_text_data(file_path)
+    # Chia nhỏ văn bản
+    json_splitter = RecursiveJsonSplitter(max_chunk_size = 1000)
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=250)
+    table_splits = json_splitter.create_documents(texts=[table_data])
+    text_splits = text_splitter.split_documents(text_data)
+    all_splits = table_splits + text_splits
+    return all_splits
+def get_json_splits_only(file_path):
+    table_data = load_json_manually(file_path)
+    def remove_accents(obj): #xoa dau tieng viet
+        if isinstance(obj, str):
+            return unidecode(obj)
+        elif isinstance(obj, list):
+            return [remove_accents(item) for item in obj]
+        elif isinstance(obj, dict):
+            return {remove_accents(k): remove_accents(v) for k, v in obj.items()}
+        return obj
+    cleaned_data = remove_accents(table_data)
+    wrapped_data = {"data": cleaned_data} if isinstance(cleaned_data, list) else cleaned_data
+    json_splitter = RecursiveJsonSplitter(max_chunk_size = 512)
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=250)
+    table_splits = json_splitter.create_documents(texts=[wrapped_data])
+    table_splits = text_splitter.split_documents(table_splits)
+    return table_splits
+def list_docx_files(folder_path):
+    return [str(file) for file in Path(folder_path).rglob("*.docx")]
+def prompt_order(queries):
+    text = 'IMPORTANT: Here is the questions of user in order, use that and the context above to know the best answer:\n'
+    i = 0
+    for q in queries:
+        i += 1
+        text += f'Question {i}: {str(q)}\n'
+    return text
+# Define the augment_prompt function
+def augment_prompt(query: str, k: int = 10):
+    queries = []
+    queries.append(query)
+    retriever = vectorstore.as_retriever(search_kwargs={"k": k})
+    results = retriever.invoke(query)
+    if results:
+        source_knowledge = "\n\n".join([doc.page_content for doc in results])
+        return f"""Using the contexts below, answer the query.
+Contexts:
+{source_knowledge}
+"""
+    else:
+        return f"No relevant context found.\n."
+def get_answer(query, queries_list=None):
+    if queries_list is None:
+        queries_list = []
+    messages = [
+    {"role": "user", "parts": [{"text": "IMPORTANT: You are a super energetic, helpful, polite, Vietnamese-speaking assistant. If you can not see the answer in contexts, try to search it up online by yourself but remember to give the source."}]},
+    {"role": "user", "parts": [{"text": augment_prompt(query)}]}
+]
+#     bonus = '''
+# Bạn tham kháo thêm các nguồn thông tin tại:
+# Trang thông tin điện tử: https://neu.edu.vn ; https://daotao.neu.edu.vn
+# Trang mạng xã hội có thông tin tuyển sinh: https://www.facebook.com/ktqdNEU ; https://www.facebook.com/tvtsneu ;
+# Email tuyển sinh: [email protected]
+# Số điện thoại tuyển sinh: 0888.128.558
+#   '''
+    queries_list.append(query)
+    queries = {"role": "user", "parts": [{"text": prompt_order(queries_list)}]}
+    messages_with_queries = messages.copy()
+    messages_with_queries.append(queries)
+    # messages_with_queries.insert(0, queries)
+  # Configure API key
+    genai.configure(api_key=key)
+  # Initialize the Gemini model
+    model = genai.GenerativeModel("gemini-2.0-flash")
+    response = model.generate_content(contents=messages_with_queries, stream=True)
+    response_text = ""
+    for chunk in response:
+        response_text += chunk.text
+        yield response_text
+    messages.append({"role": "model", "parts": [{"text": response_text}]})
+        # user_feedback = yield "\nNhập phản hồi của bạn (hoặc nhập 'q' để thoát): "
+        # if user_feedback.lower() == "q":
+        #     break
+        # messages.append({"role": "user", "parts": [{"text": query}]})
+    log_message(messages)