Spaces:

quoc-khanh
/

chatbot4nct_test1

Sleeping

App Files Files Community

quoc-khanh committed on Mar 19

Commit

611e958

verified ·

1 Parent(s): 291a353

Update helpers.py

Browse files

Files changed (1) hide show

helpers.py +109 -5

helpers.py CHANGED Viewed

@@ -19,6 +19,10 @@ import os
 from langchain_docling import DoclingLoader#, ExportType
 from langchain_docling.loader import ExportType
 import logging
 # logging.getLogger("langchain").setLevel(logging.ERROR)
 logging.getLogger().setLevel(logging.ERROR)
@@ -297,9 +301,109 @@ def list_docx_files(folder_path):
     return [str(file) for file in Path(folder_path).rglob("*.docx")]
 def prompt_order(queries):
-    text = 'IMPORTANT: Here is the questions of user in order, use that and the context above to know the best answer:\n'
-    i = 0
     for q in queries:
-        i += 1
-        text += f'Question {i}: {str(q)}\n'
-    return text

 from langchain_docling import DoclingLoader#, ExportType
 from langchain_docling.loader import ExportType
 import logging
+from langchain.schema import Document
+import re
+import ast
 # logging.getLogger("langchain").setLevel(logging.ERROR)
 logging.getLogger().setLevel(logging.ERROR)
     return [str(file) for file in Path(folder_path).rglob("*.docx")]
 def prompt_order(queries):
+    text = 'Câu hỏi: '
     for q in queries:
+        text += f'{str(q)}. '
+    return text
def update_documents_metadata(documents, new_metadata):
    """Merge ``new_metadata`` into each document without changing ``source``.

    The documents are mutated in place: every key of ``new_metadata`` is
    written into ``doc.metadata``, except that a ``source`` entry the document
    already had is restored afterwards.

    Args:
        documents: Iterable of objects exposing a mutable ``metadata`` dict.
        new_metadata: Mapping of key/value pairs to merge into each document.

    Returns:
        A list containing the same document objects, in input order.
    """
    updated_documents = []
    for doc in documents:
        # Remember whether the key existed at all, so that a falsy source
        # ("" or None) is still protected from being overwritten.
        had_source = "source" in doc.metadata
        original_source = doc.metadata.get("source")

        doc.metadata.update(new_metadata)

        if had_source:
            doc.metadata["source"] = original_source
        updated_documents.append(doc)
    return updated_documents
def extract_metadata(response):
    """Return the shortest dict literal found inside ``response``.

    The input is converted to ``str`` if necessary, every non-greedy
    ``{...}`` span is located, and each span is parsed with
    ``ast.literal_eval``. Spans that fail to parse or that do not evaluate to
    a ``dict`` are ignored.

    Args:
        response: Text (or any object convertible with ``str``) to scan.

    Returns:
        The parsed dict whose ``str()`` form is shortest (the first one on a
        tie), or ``None`` when no span parses to a dict.
    """
    text = response if isinstance(response, str) else str(response)

    candidates = []
    for snippet in re.findall(r'\{.*?\}', text, re.DOTALL):
        try:
            value = ast.literal_eval(snippet)
        except Exception:
            # Not a valid Python literal; skip this span.
            continue
        if isinstance(value, dict):
            candidates.append(value)

    # min() keeps the first of equally short candidates, like a strict "<" scan.
    return min(candidates, key=lambda found: len(str(found)), default=None)
def update_metadata(metadata, metadata_child):
    """Accumulate the values of ``metadata_child`` into ``metadata`` as lists.

    For every key in ``metadata_child`` the value stored in ``metadata``
    becomes a list: an existing non-list value is wrapped first, then the new
    value is appended (or, if it is itself a list, its items are added).

    Args:
        metadata: Dict to update; it is modified in place.
        metadata_child: Dict whose values are merged in; it is left unmodified.

    Returns:
        The same ``metadata`` dict that was passed in.
    """
    for key, new_value in metadata_child.items():
        if key in metadata:
            # Promote a scalar already stored under this key to a list.
            if not isinstance(metadata[key], list):
                metadata[key] = [metadata[key]]
            if isinstance(new_value, list):
                metadata[key].extend(new_value)
            else:
                metadata[key].append(new_value)
        elif isinstance(new_value, list):
            # Copy so that later merges into ``metadata`` never mutate the
            # list object owned by ``metadata_child``.
            metadata[key] = list(new_value)
        else:
            metadata[key] = [new_value]
    return metadata
def define_metadata(input_text):
    """Derive document-type, faculty and major metadata from a piece of text.

    Matching is plain, case-sensitive substring search.

    Args:
        input_text: Text to inspect (for example a file name or a title).

    Returns:
        A dict that may contain:
          * ``'Tai lieu ve'`` - the document type; the first keyword found
            wins, in the order 'Đề cương', 'Chương trình', 'Đề án'.
          * ``'Khoa'`` - the faculty owning the detected major.
          * ``'Nganh'`` - the detected major, carrying its programme code for
            the three Toán Kinh tế majors.
        Keys whose keyword is not found are omitted. When several majors
        occur, the one listed last in the table below wins.
    """
    result = {}

    # Document type: stop at the first keyword that occurs.
    doc_types = (
        ('Đề cương', 'Đề cương'),
        ('Chương trình', 'Chương trình đào tạo'),
        ('Đề án', 'Đề án'),
    )
    for keyword, doc_type in doc_types:
        if keyword in input_text:
            result['Tai lieu ve'] = doc_type
            break

    fit = 'Công nghệ thông tin (FIT)'
    mfe = 'Toán Kinh tế (MFE)'
    fda = 'Khoa học dữ liệu và Trí tuệ nhân tạo (FDA)'
    mis = 'Hệ thống thông tin quản lý (MIS)'

    # (name searched in the text, faculty, value stored under 'Nganh').
    # Order matters: a later match overwrites an earlier one.
    majors = (
        ('Trí tuệ nhân tạo', fda, 'Trí tuệ nhân tạo'),
        ('Toán kinh tế', mfe, 'Toán kinh tế (TOKT)'),
        ('Thống kê kinh tế', 'Thống kê', 'Thống kê kinh tế'),
        ('Phân tích dữ liệu trong Kinh tế', mfe,
         'Phân tích dữ liệu trong Kinh tế (DSEB)'),
        ('Kỹ thuật phần mềm', fit, 'Kỹ thuật phần mềm'),
        ('Khoa học máy tính', fit, 'Khoa học máy tính'),
        ('Khoa học dữ liệu', fda, 'Khoa học dữ liệu'),
        ('Hệ thống thông tin quản lý', mis, 'Hệ thống thông tin quản lý'),
        ('Hệ thống thông tin', mis, 'Hệ thống thông tin'),
        ('Định phí bảo hiểm và Quản trị rủi ro', mfe,
         'Định phí bảo hiểm và Quản trị rủi ro (Actuary)'),
        ('Công nghệ thông tin', fit, 'Công nghệ thông tin'),
        ('An toàn thông tin', fit, 'An toàn thông tin'),
    )
    names = [name for name, _, _ in majors]

    for name, faculty, label in majors:
        # A short name must not match merely because it is the prefix of a
        # longer listed name ('Hệ thống thông tin' inside
        # 'Hệ thống thông tin quản lý'), so mask the longer names first.
        searchable = input_text
        for longer in names:
            if longer != name and name in longer:
                searchable = searchable.replace(longer, '\x00')
        if name in searchable:
            result['Khoa'] = faculty
            result['Nganh'] = label
    return result