TahaRasouli committed
Commit 660dd5e · verified · 1 Parent(s): a0be55a

Update app.py

Files changed (1):
  1. app.py +400 -110
app.py CHANGED
@@ -1,14 +1,29 @@
  import streamlit as st
  import chromadb
  from chromadb.utils import embedding_functions
- from chromadb.config import Settings
- from groq import Groq
- import xml.etree.ElementTree as ET
- from datetime import datetime
- import os

- # Reuse the helper functions from the original script
  def extract_node_details(element):
      details = {
          "NodeId": element.attrib.get("NodeId", "N/A"),
          "Description": None,
@@ -31,8 +46,12 @@ def extract_node_details(element):
      return details

  def extract_value_content(value_element):
-     if not list(value_element):
          return value_element.text or "No value provided."
      content = []
      for child in value_element:
          tag = child.tag.split('}')[-1]
@@ -40,10 +59,16 @@ def extract_value_content(value_element):
          content.append(f"<{tag}>{child_text}</{tag}>")
      return "".join(content)

- def parse_nodes_to_dict(uploaded_file):
-     tree = ET.parse(uploaded_file)
      root = tree.getroot()
      namespace = root.tag.split('}')[0].strip('{')
      node_types = ["UAObject", "UAVariable", "UAObjectType"]
      nodes_dict = {}
      for node_type in node_types:
@@ -54,8 +79,28 @@ def parse_nodes_to_dict(uploaded_file):
              nodes_dict[node_id] = details
      return nodes_dict

  def convert_to_natural_language(details):
-     client = Groq(api_key=st.secrets["GROQ_API_KEY"])
      messages = [
          {
              "role": "user",
@@ -68,112 +113,357 @@ def convert_to_natural_language(details):
      )
      return chat_completion.choices[0].message.content

- # Streamlit app
- def main():
-     st.title("OPC UA Node Query System")
-
-     # Create persistent storage directory
-     os.makedirs("chroma_db", exist_ok=True)
-
-     # Initialize ChromaDB with persistent storage
-     chroma_client = chromadb.Client(Settings(
-         chroma_db_impl="duckdb+parquet",
-         persist_directory="chroma_db"
-     ))
-
-     # Initialize session state
-     if 'collection_name' not in st.session_state:
-         st.session_state.collection_name = None
-     if 'initialized' not in st.session_state:
-         st.session_state.initialized = False
-
-     # File upload
-     uploaded_file = st.file_uploader("Upload OPC UA XML file", type=['xml'])
-
-     if uploaded_file and not st.session_state.initialized:
-         with st.spinner("Processing XML file and initializing database..."):
              try:
-                 # Parse nodes
-                 nodes_dict = parse_nodes_to_dict(uploaded_file)
-
-                 # Convert to natural language
-                 node_NL = {}
-                 for node_id, details in nodes_dict.items():
-                     nl_description = convert_to_natural_language(details)
-                     node_NL[node_id] = nl_description
-
-                 # Create collection with unique name
-                 collection_name = f"node_embeddings_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
-                 collection = chroma_client.create_collection(
-                     name=collection_name,
-                     embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction(
-                         model_name="all-MiniLM-L6-v2"
-                     )
-                 )
-
-                 # Add nodes to ChromaDB
                  collection.add(
-                     documents=[desc for desc in node_NL.values()],
-                     metadatas=[{"NodeId": node_id} for node_id in node_NL.keys()],
-                     ids=[node_id for node_id in node_NL.keys()]
                  )
-
-                 # Persist the database
-                 st.session_state.collection_name = collection_name
-                 st.session_state.initialized = True
-                 st.success("Database initialized successfully!")
-
-             except Exception as e:
-                 st.error(f"An error occurred: {str(e)}")

-     # Query section
-     if st.session_state.initialized and st.session_state.collection_name:
-         st.header("Query Nodes")
-
-         # Get the existing collection
-         collection = chroma_client.get_collection(
-             name=st.session_state.collection_name,
-             embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction(
-                 model_name="all-MiniLM-L6-v2"
-             )
          )
-
-         user_query = st.text_input("Enter your query:")
-
-         if user_query:
-             with st.spinner("Searching and generating response..."):
-                 try:
-                     # Retrieve matches
-                     results = collection.query(
-                         query_texts=[user_query],
-                         n_results=5
-                     )
-
-                     # Display results
-                     st.subheader("Top Matches")
-                     for i, (doc, metadata) in enumerate(zip(results["documents"][0], results["metadatas"][0]), 1):
-                         with st.expander(f"Match {i}: NodeId = {metadata['NodeId']}"):
-                             st.write(doc)
-
-                     # Generate LLM response
-                     retrieved_context = "\n".join(results["documents"][0])
-                     client = Groq(api_key=st.secrets["GROQ_API_KEY"])
-                     messages = [
-                         {
-                             "role": "user",
-                             "content": f"Answer the following query based on the provided context:\n\nQuery: {user_query}\n\nContext: {retrieved_context}"
                          }
-                     ]
-                     chat_completion = client.chat.completions.create(
-                         messages=messages,
-                         model="llama3-8b-8192",
-                     )
-
-                     st.subheader("Generated Answer")
-                     st.write(chat_completion.choices[0].message.content)
-
-                 except Exception as e:
-                     st.error(f"An error occurred during query: {str(e)}")

  if __name__ == "__main__":
      main()
 
  import streamlit as st
+ import os
+ import tempfile
+ from typing import Dict, List, Tuple
+ import xml.etree.ElementTree as ET
+ from sentence_transformers import SentenceTransformer
+ from sklearn.metrics.pairwise import cosine_similarity
+ from groq import Groq
  import chromadb
  from chromadb.utils import embedding_functions
+ import PyPDF2
+ import numpy as np

+ # Initialize session state for storing processed files
+ if 'processed_files' not in st.session_state:
+     st.session_state.processed_files = {}
+ if 'current_collection' not in st.session_state:
+     st.session_state.current_collection = None
+ if 'current_raw_nodes' not in st.session_state:
+     st.session_state.current_raw_nodes = {}
+
+ # Original XML processing functions remain unchanged
  def extract_node_details(element):
+     """
+     Extracts details like description, value, NodeId, DisplayName, and references from an XML element.
+     """
      details = {
          "NodeId": element.attrib.get("NodeId", "N/A"),
          "Description": None,

      return details

  def extract_value_content(value_element):
+     """
+     Recursively extracts the content of a <Value> element, handling any embedded child elements.
+     """
+     if not list(value_element):  # No child elements, return text directly
          return value_element.text or "No value provided."
+     # Process child elements
      content = []
      for child in value_element:
          tag = child.tag.split('}')[-1]

          content.append(f"<{tag}>{child_text}</{tag}>")
      return "".join(content)

+ def parse_nodes_to_dict(filename):
+     """
+     Parses the XML file and saves node details into a dictionary.
+     Each node's NodeId serves as the key, and the value is a dictionary of the node's details.
+     """
+     tree = ET.parse(filename)
      root = tree.getroot()
+     # Retrieve namespace from the root
      namespace = root.tag.split('}')[0].strip('{')
+     # Node types to extract
      node_types = ["UAObject", "UAVariable", "UAObjectType"]
      nodes_dict = {}
      for node_type in node_types:

              nodes_dict[node_id] = details
      return nodes_dict

+
+ def format_node_content(details):
+     """
+     Formats raw node details into a single string for semantic comparison.
+     """
+     content_parts = []
+
+     if details["Description"]:
+         content_parts.append(f"Description: {details['Description']}")
+     if details["DisplayName"]:
+         content_parts.append(f"DisplayName: {details['DisplayName']}")
+     if details["Value"]:
+         content_parts.append(f"Value: {details['Value']}")
+
+     return " | ".join(content_parts)
+
+
  def convert_to_natural_language(details):
+     """
+     Converts node details to natural language using Groq LLM.
+     """
+     client = Groq(api_key=os.getenv("GROQ_API_KEY"))
      messages = [
          {
              "role": "user",

      )
      return chat_completion.choices[0].message.content

+ # New file type detection and processing functions without magic library
+ def detect_file_type(file_path):
+     """
+     Detects if the input file is PDF or XML using file extension and content analysis.
+     """
+     try:
+         # Check file extension
+         file_extension = os.path.splitext(file_path)[1].lower()
+
+         # Read the first few bytes of the file to check its content
+         with open(file_path, 'rb') as f:
+             header = f.read(8)  # Read first 8 bytes
+
+         # Check for PDF signature
+         if file_extension == '.pdf' or header.startswith(b'%PDF'):
+             # Verify it's actually a PDF by trying to open it
              try:
+                 with open(file_path, 'rb') as f:
+                     PyPDF2.PdfReader(f)
+                     return 'pdf'
+             except:
+                 return 'unknown'
+
+         # Check for XML
+         elif file_extension == '.xml':
+             # Try to parse as XML
+             try:
+                 with open(file_path, 'r', encoding='utf-8') as f:
+                     content_start = f.read(1024)  # Read first 1KB
+                     # Check for XML declaration or root element
+                     if content_start.strip().startswith(('<?xml', '<')):
+                         ET.parse(file_path)  # Verify it's valid XML
+                         return 'xml'
+             except:
+                 return 'unknown'
+
+         return 'unknown'
+
+     except Exception as e:
+         print(f"Error detecting file type: {str(e)}")
+         return 'unknown'
+
+ def process_pdf(file_path):
+     """
+     Extracts text content from PDF and splits it into meaningful chunks.
+     """
+     try:
+         chunks = []
+         with open(file_path, 'rb') as file:
+             pdf_reader = PyPDF2.PdfReader(file)
+
+             for page_num in range(len(pdf_reader.pages)):
+                 page = pdf_reader.pages[page_num]
+                 text = page.extract_text()
+
+                 # Split text into paragraphs
+                 paragraphs = text.split('\n\n')
+
+                 # Process each paragraph
+                 for para_num, paragraph in enumerate(paragraphs):
+                     if len(paragraph.strip()) > 0:  # Skip empty paragraphs
+                         chunk = {
+                             'content': paragraph.strip(),
+                             'metadata': {
+                                 'page_number': page_num + 1,
+                                 'paragraph_number': para_num + 1,
+                                 'source_type': 'pdf',
+                                 'file_name': os.path.basename(file_path)
+                             }
+                         }
+                         chunks.append(chunk)
+
+         return chunks
+
+     except Exception as e:
+         print(f"Error processing PDF: {str(e)}")
+         return []
+
+ def add_to_vector_db(collection, chunks, embedder):
+     """
+     Adds processed chunks to the vector database with proper metadata.
+     """
+     try:
+         for i, chunk in enumerate(chunks):
+             # Create unique ID for each chunk
+             chunk_id = f"{chunk['metadata']['file_name']}_{chunk['metadata']['page_number']}_{chunk['metadata']['paragraph_number']}"
+
+             collection.add(
+                 documents=[chunk['content']],
+                 metadatas=[chunk['metadata']],
+                 ids=[chunk_id]
+             )
+
+     except Exception as e:
+         print(f"Error adding to vector database: {str(e)}")
+
+ def process_file(file_path):
+     """
+     Main function to process either PDF or XML file and add to vector database.
+     Also returns the raw node details for XML files.
+     """
+     try:
+         # Initialize ChromaDB and embedding function
+         client = chromadb.Client()
+         embedder = embedding_functions.SentenceTransformerEmbeddingFunction(
+             model_name="all-MiniLM-L6-v2"
+         )
+
+         # Create or get collection, using the embedding function defined above
+         collection = client.create_collection(
+             name="document_embeddings",
+             embedding_function=embedder,
+             get_or_create=True
+         )
+
+         # Store for raw node details
+         raw_nodes = {}
+
+         # Detect file type
+         file_type = detect_file_type(file_path)
+
+         if file_type == 'pdf':
+             # Process PDF
+             chunks = process_pdf(file_path)
+             add_to_vector_db(collection, chunks, embedder)
+
+         elif file_type == 'xml':
+             # Parse XML and store raw nodes
+             raw_nodes = parse_nodes_to_dict(file_path)
+
+             # Convert to natural language for RAG
+             for node_id, details in raw_nodes.items():
+                 nl_description = convert_to_natural_language(details)
+
+                 # Add to vector DB
                  collection.add(
+                     documents=[nl_description],
+                     metadatas=[{"NodeId": node_id, "source_type": "xml"}],
+                     ids=[node_id]
                  )
+         else:
+             raise ValueError("Unsupported file type")
+
+         return collection, raw_nodes
+
+     except Exception as e:
+         print(f"Error processing file: {str(e)}")
+         return None, {}
+
+ def generate_rag_response(query_text, context):
+     """
+     Generates a RAG response using the Groq LLM based on the query and retrieved context.

+     Args:
+         query_text (str): The user's query
+         context (str): The retrieved context from the vector database
+
+     Returns:
+         str: The generated response from the LLM
+     """
+     try:
+         client = Groq(api_key=os.getenv("GROQ_API_KEY"))
+         messages = [
+             {
+                 "role": "system",
+                 "content": "You are a helpful assistant that answers questions based on the provided context. "
+                            "If the context doesn't contain relevant information, acknowledge that."
+             },
+             {
+                 "role": "user",
+                 "content": f"Answer the following query based on the provided context:\n\n"
+                            f"Query: {query_text}\n\n"
+                            f"Context: {context}"
+             }
+         ]
+
+         chat_completion = client.chat.completions.create(
+             messages=messages,
+             model="llama3-8b-8192",
          )
+
+         return chat_completion.choices[0].message.content
+
+     except Exception as e:
+         print(f"Error generating RAG response: {str(e)}")
+         return "Error generating response"
+
+
+ def find_similar_nodes(query_text, raw_nodes, top_k=5):
+     """
+     Finds the most semantically similar nodes to the query using raw node content.
+
+     Args:
+         query_text (str): The user's query
+         raw_nodes (dict): Dictionary of node_id: node_details pairs
+         top_k (int): Number of top results to return
+     """
+     try:
+         # Initialize the sentence transformer model
+         model = SentenceTransformer('all-MiniLM-L6-v2')
+
+         # Format node contents and create mapping
+         node_contents = {}
+         for node_id, details in raw_nodes.items():
+             formatted_content = format_node_content(details)
+             if formatted_content:  # Only include nodes with content
+                 node_contents[node_id] = formatted_content
+
+         # Generate embeddings for the query
+         query_embedding = model.encode([query_text])[0]
+
+         # Create a list of (node_id, content) tuples
+         nodes = list(node_contents.items())
+         contents = [content for _, content in nodes]
+
+         # Generate embeddings for all node contents
+         content_embeddings = model.encode(contents)
+
+         # Calculate cosine similarities
+         similarities = cosine_similarity([query_embedding], content_embeddings)[0]
+
+         # Get indices of top-k similar nodes
+         top_indices = np.argsort(similarities)[-top_k:][::-1]
+
+         # Format results
+         results = []
+         for idx in top_indices:
+             node_id, content = nodes[idx]
+             similarity_score = similarities[idx]
+             results.append({
+                 'node_id': node_id,
+                 'raw_content': content,
+                 'original_details': raw_nodes[node_id],
+                 'similarity_score': similarity_score
+             })
+
+         return results
+
+     except Exception as e:
+         print(f"Error finding similar nodes: {str(e)}")
+         return []
+
+ def query_documents(collection, raw_nodes, query_text, n_results=5):
+     """
+     Query the vector database and perform semantic similarity search on raw nodes.
+     """
+     try:
+         # Get results from vector database
+         results = collection.query(
+             query_texts=[query_text],
+             n_results=n_results
+         )
+
+         # Combine the retrieved results into context for RAG
+         retrieved_context = "\n".join(results["documents"][0])
+
+         # Generate RAG response
+         rag_response = generate_rag_response(query_text, retrieved_context)
+
+         # Find semantically similar nodes using raw node content
+         similar_nodes = find_similar_nodes(query_text, raw_nodes) if raw_nodes else []
+
+         # Format vector DB results
+         formatted_results = []
+         for i in range(len(results["documents"][0])):
+             result = {
+                 "content": results["documents"][0][i],
+                 "metadata": results["metadatas"][0][i],
+                 "score": results["distances"][0][i] if "distances" in results else None,
+                 "rag_response": rag_response if i == 0 else None
+             }
+             formatted_results.append(result)
+
+         return formatted_results, similar_nodes
+
+     except Exception as e:
+         print(f"Error querying documents: {str(e)}")
+         return [], []
+
+ def main():
+     st.title("Document Query System")
+     st.write("Upload PDF or XML files and query their contents")
+
+     # File upload section
+     uploaded_files = st.file_uploader(
+         "Upload PDF or XML files",
+         type=['pdf', 'xml'],
+         accept_multiple_files=True
+     )
+
+     # Process uploaded files
+     if uploaded_files:
+         for uploaded_file in uploaded_files:
407
+ for uploaded_file in uploaded_files:
408
+ if uploaded_file.name not in st.session_state.processed_files:
409
+ with st.spinner(f'Processing {uploaded_file.name}...'):
410
+ collection, raw_nodes = process_file(uploaded_file)
411
+ if collection:
412
+ st.session_state.processed_files[uploaded_file.name] = {
413
+ 'collection': collection,
414
+ 'raw_nodes': raw_nodes
415
  }
416
+ st.success(f"Successfully processed {uploaded_file.name}")
417
+ else:
418
+ st.error(f"Failed to process {uploaded_file.name}")
419
+
420
+ # File selection and querying section
421
+ if st.session_state.processed_files:
422
+ selected_file = st.selectbox(
423
+ "Select file to query",
424
+ options=list(st.session_state.processed_files.keys())
425
+ )
426
+
427
+ if selected_file:
428
+ st.session_state.current_collection = st.session_state.processed_files[selected_file]['collection']
429
+ st.session_state.current_raw_nodes = st.session_state.processed_files[selected_file]['raw_nodes']
430
+
431
+ query = st.text_input("Enter your query:")
432
+ if st.button("Search"):
433
+ if query:
434
+ with st.spinner('Searching...'):
435
+ results, similar_nodes = query_documents(
436
+ st.session_state.current_collection,
437
+ st.session_state.current_raw_nodes,
438
+ query
439
+ )
440
+
441
+ # Display RAG response
442
+ if results and results[0]['rag_response']:
443
+ st.subheader("Generated Answer")
444
+ st.write(results[0]['rag_response'])
445
+
446
+ # Display vector DB results
447
+ st.subheader("Search Results")
448
+ for i, result in enumerate(results, 1):
449
+ with st.expander(f"Match {i}"):
450
+ st.write(f"Content: {result['content']}")
451
+ st.write(f"Source: {result['metadata']['source_type']}")
452
+ if result['metadata']['source_type'] == 'pdf':
453
+ st.write(f"Page: {result['metadata']['page_number']}")
454
+ elif result['metadata']['source_type'] == 'xml':
455
+ st.write(f"NodeId: {result['metadata']['NodeId']}")
456
+
457
+ # Display semantic similarity results
458
+ if similar_nodes:
459
+ st.subheader("Similar Nodes")
460
+ for i, node in enumerate(similar_nodes, 1):
461
+ with st.expander(f"Similar Node {i}"):
462
+ st.write(f"NodeId: {node['node_id']}")
463
+ st.write(f"Description: {node['original_details'].get('Description', 'N/A')}")
464
+ st.write(f"DisplayName: {node['original_details'].get('DisplayName', 'N/A')}")
465
+ st.write(f"Value: {node['original_details'].get('Value', 'N/A')}")
466
+ st.write(f"Similarity Score: {node['similarity_score']:.4f}")
467
 
468
  if __name__ == "__main__":
469
  main()
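
A minimal driver sketch of how the new pipeline fits together outside the Streamlit UI, assuming app.py is importable from the working directory, GROQ_API_KEY is set in the environment, and "nodeset.xml" is an illustrative path to an OPC UA NodeSet XML file (the module-level st.session_state initialisation may emit warnings when app.py is imported without streamlit run):

# Hypothetical usage sketch; file path and query are placeholders, not part of the commit.
from app import process_file, query_documents

collection, raw_nodes = process_file("nodeset.xml")   # parse nodes, describe them via Groq, embed into ChromaDB
results, similar_nodes = query_documents(
    collection, raw_nodes, "Which variables report temperature?"
)
print(results[0]["rag_response"])                     # RAG answer generated from the retrieved node descriptions
for node in similar_nodes:                            # cosine-similarity ranking over raw node content
    print(node["node_id"], f"{node['similarity_score']:.3f}")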