Upload 3 files
- app.py +235 -0
- requirements.txt.txt +5 -0
- unified_document_processor.py +380 -0
app.py
ADDED
@@ -0,0 +1,235 @@
import streamlit as st
import os
import tempfile
from typing import List
from unified_document_processor import UnifiedDocumentProcessor, CustomEmbeddingFunction
import chromadb
from chromadb.config import Settings
from groq import Groq

def initialize_session_state():
    """Initialize all session state variables"""
    if 'CHROMADB_DIR' not in st.session_state:
        st.session_state.CHROMADB_DIR = os.path.join(os.getcwd(), 'chromadb_data')
        os.makedirs(st.session_state.CHROMADB_DIR, exist_ok=True)

    if 'processed_files' not in st.session_state:
        st.session_state.processed_files = set()

    if 'processor' not in st.session_state:
        st.session_state.processor = None  # Will be initialized in StreamlitDocProcessor

class StreamlitDocProcessor:
    def __init__(self):
        if st.session_state.processor is None:
            try:
                groq_api_key = st.secrets["GROQ_API_KEY"]
                # Initialize processor with persistent ChromaDB
                st.session_state.processor = self.initialize_processor(groq_api_key)
                # Update processed files after initializing processor
                st.session_state.processed_files = self.get_processed_files()
            except Exception as e:
                st.error(f"Error initializing processor: {str(e)}")
                return

    def initialize_processor(self, groq_api_key):
        """Initialize the processor with persistent ChromaDB"""
        class PersistentUnifiedDocumentProcessor(UnifiedDocumentProcessor):
            def __init__(self, api_key, collection_name="unified_content", persist_dir=None):
                self.groq_client = Groq(api_key=api_key)
                self.max_elements_per_chunk = 50
                self.pdf_chunk_size = 500
                self.pdf_overlap = 50
                self._initialize_nltk()

                # Initialize persistent ChromaDB
                self.chroma_client = chromadb.PersistentClient(
                    path=persist_dir,
                    settings=Settings(
                        allow_reset=True,
                        is_persistent=True
                    )
                )

                # Get or create collection
                try:
                    self.collection = self.chroma_client.get_collection(
                        name=collection_name,
                        embedding_function=CustomEmbeddingFunction()
                    )
                except Exception:
                    self.collection = self.chroma_client.create_collection(
                        name=collection_name,
                        embedding_function=CustomEmbeddingFunction()
                    )

        return PersistentUnifiedDocumentProcessor(
            groq_api_key,
            persist_dir=st.session_state.CHROMADB_DIR
        )

    def get_processed_files(self) -> set:
        """Get the set of processed file names from ChromaDB"""
        try:
            if st.session_state.processor:
                available_files = st.session_state.processor.get_available_files()
                return set(available_files['pdf'] + available_files['xml'])
            return set()
        except Exception as e:
            st.error(f"Error getting processed files: {str(e)}")
            return set()

    def run(self):
        st.title("AAS Assistant")

        # Create sidebar for navigation
        page = st.sidebar.selectbox(
            "Choose a page",
            ["Upload & Process", "Query"]
        )

        if page == "Upload & Process":
            self.upload_and_process_page()
        else:
            self.qa_page()

    def upload_and_process_page(self):
        st.header("Upload and Process Documents")

        # File uploader
        uploaded_files = st.file_uploader(
            "Upload PDF or XML files",
            type=['pdf', 'xml'],
            accept_multiple_files=True
        )

        if uploaded_files:
            for uploaded_file in uploaded_files:
                # Create progress bar
                progress_bar = st.progress(0)
                status_text = st.empty()

                if uploaded_file.name not in st.session_state.processed_files:
                    temp_path = None
                    try:
                        # Create a temporary file
                        with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(uploaded_file.name)[1]) as tmp_file:
                            tmp_file.write(uploaded_file.getbuffer())
                            temp_path = tmp_file.name

                        # Process the file
                        status_text.text(f'Processing {uploaded_file.name}...')
                        progress_bar.progress(25)

                        result = st.session_state.processor.process_file(temp_path)
                        progress_bar.progress(75)

                        if result['success']:
                            st.session_state.processed_files.add(uploaded_file.name)
                            progress_bar.progress(100)
                            status_text.success(f"Successfully processed {uploaded_file.name}")
                        else:
                            progress_bar.progress(100)
                            status_text.error(f"Failed to process {uploaded_file.name}: {result['error']}")

                    except Exception as e:
                        status_text.error(f"Error processing {uploaded_file.name}: {str(e)}")
                    finally:
                        # Clean up temporary file
                        if temp_path:
                            try:
                                os.unlink(temp_path)
                            except OSError:
                                pass
                else:
                    status_text.info(f"{uploaded_file.name} has already been processed")
                    progress_bar.progress(100)

        # Display processed files
        if st.session_state.processed_files:
            st.subheader("Processed Files")
            for file in sorted(st.session_state.processed_files):
                st.text(f"✓ {file}")

    def qa_page(self):
        st.header("Query our database")

        try:
            # Refresh available files
            st.session_state.processed_files = self.get_processed_files()

            if not st.session_state.processed_files:
                st.warning("No processed files available. Please upload and process some files first.")
                return

            # Enhanced file selection with type indicators
            available_files = st.session_state.processor.get_available_files()
            xml_files = [f"📱 {f}" for f in available_files['xml']]
            pdf_files = [f"📄 {f}" for f in available_files['pdf']]
            all_files = sorted(xml_files + pdf_files)

            selected_files = st.multiselect(
                "Select files to search through",
                all_files,
                default=all_files,
                help="📱 = XML files, 📄 = PDF files"
            )

            # Clean up the file names (remove emoji prefix) for processing
            selected_files = [f[2:] for f in selected_files]

            if not selected_files:
                st.warning("Please select at least one file to search through.")
                return

            # Question input with suggested prompts for XML
            xml_selected = any(f.endswith('.xml') for f in selected_files)
            if xml_selected:
                st.info("Suggested questions for XML content:\n" +
                        "• What are the main components and their relationships?\n" +
                        "• What data types and properties are defined?\n" +
                        "• How are the elements structured and organized?")

            question = st.text_input("Enter your question:")

            if st.button("Ask Question") and question:
                try:
                    with st.spinner("Searching for answer..."):
                        answer = st.session_state.processor.ask_question_selective(
                            question,
                            selected_files
                        )

                    # Display the answer in a structured way
                    st.write("Answer:", answer)

                    # If XML files were queried, show additional metadata
                    # (only when the processor provides get_xml_structure_info)
                    if xml_selected and hasattr(st.session_state.processor, 'get_xml_structure_info'):
                        with st.expander("Show XML Structure Details"):
                            st.write("Related XML Elements:")
                            # Get the structure information from the processor
                            xml_details = st.session_state.processor.get_xml_structure_info(
                                selected_files,
                                question
                            )
                            for detail in xml_details:
                                st.code(detail, language="xml")

                except Exception as e:
                    st.error(f"Error getting answer: {str(e)}")

        except Exception as e:
            st.error(f"Error in Q&A interface: {str(e)}")


def main():
    # Initialize session state
    initialize_session_state()

    # Create and run app
    app = StreamlitDocProcessor()
    app.run()

if __name__ == "__main__":
    main()
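Note: app.py reads the Groq key via st.secrets["GROQ_API_KEY"], so the Space (or a local .streamlit/secrets.toml) must provide that secret. To check what the app has persisted, the snippet below is a minimal, hypothetical inspection sketch (not part of this commit); it assumes it is run from the repository root after the app has written at least one chunk into ./chromadb_data.

# Hypothetical inspection sketch; assumes ./chromadb_data already exists
# and was populated by the app (not part of the uploaded files).
import chromadb
from chromadb.config import Settings
from unified_document_processor import CustomEmbeddingFunction

client = chromadb.PersistentClient(
    path="chromadb_data",
    settings=Settings(allow_reset=True, is_persistent=True),
)
collection = client.get_collection(
    name="unified_content",
    embedding_function=CustomEmbeddingFunction(),
)
print(f"{collection.count()} chunks stored")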
requirements.txt.txt
ADDED
@@ -0,0 +1,5 @@
groq
chromadb
sentence-transformers
PyPDF2
nltk
unified_document_processor.py
ADDED
@@ -0,0 +1,380 @@
from typing import List, Dict, Union
from groq import Groq
import chromadb
import os
import datetime
import json
import xml.etree.ElementTree as ET
import nltk
from nltk.tokenize import sent_tokenize
import PyPDF2
from sentence_transformers import SentenceTransformer

class CustomEmbeddingFunction:
    def __init__(self):
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

    def __call__(self, input: List[str]) -> List[List[float]]:
        embeddings = self.model.encode(input)
        return embeddings.tolist()

class UnifiedDocumentProcessor:
    def __init__(self, groq_api_key, collection_name="unified_content"):
        """Initialize the processor with necessary clients"""
        self.groq_client = Groq(api_key=groq_api_key)

        # XML-specific settings
        self.max_elements_per_chunk = 50

        # PDF-specific settings
        self.pdf_chunk_size = 500
        self.pdf_overlap = 50

        # Initialize NLTK
        self._initialize_nltk()

        # Initialize ChromaDB with a single collection for all document types
        self.chroma_client = chromadb.Client()
        existing_collections = self.chroma_client.list_collections()
        collection_exists = any(col.name == collection_name for col in existing_collections)

        if collection_exists:
            print(f"Using existing collection: {collection_name}")
            self.collection = self.chroma_client.get_collection(
                name=collection_name,
                embedding_function=CustomEmbeddingFunction()
            )
        else:
            print(f"Creating new collection: {collection_name}")
            self.collection = self.chroma_client.create_collection(
                name=collection_name,
                embedding_function=CustomEmbeddingFunction()
            )

    def _initialize_nltk(self):
        """Ensure both NLTK resources are available."""
        try:
            nltk.download('punkt')
            try:
                nltk.data.find('tokenizers/punkt_tab')
            except LookupError:
                nltk.download('punkt_tab')
        except Exception as e:
            print(f"Warning: Error downloading NLTK resources: {str(e)}")
            print("Falling back to basic sentence splitting...")

    def _basic_sentence_split(self, text: str) -> List[str]:
        """Fallback method for sentence tokenization"""
        sentences = []
        current = ""

        for char in text:
            current += char
            if char in ['.', '!', '?'] and len(current.strip()) > 0:
                sentences.append(current.strip())
                current = ""

        if current.strip():
            sentences.append(current.strip())

        return sentences

    def extract_text_from_pdf(self, pdf_path: str) -> str:
        """Extract text from PDF file"""
        try:
            text = ""
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                for page in pdf_reader.pages:
                    text += page.extract_text() + " "
            return text.strip()
        except Exception as e:
            raise Exception(f"Error extracting text from PDF: {str(e)}")

    def chunk_text(self, text: str) -> List[str]:
        """Split text into chunks while preserving sentence boundaries"""
        try:
            sentences = sent_tokenize(text)
        except Exception as e:
            print(f"Warning: Using fallback sentence splitting: {str(e)}")
            sentences = self._basic_sentence_split(text)

        chunks = []
        current_chunk = []
        current_size = 0

        for sentence in sentences:
            words = sentence.split()
            sentence_size = len(words)

            if current_size + sentence_size > self.pdf_chunk_size:
                if current_chunk:
                    chunks.append(' '.join(current_chunk))
                    overlap_words = current_chunk[-self.pdf_overlap:] if self.pdf_overlap > 0 else []
                    current_chunk = overlap_words + words
                    current_size = len(current_chunk)
                else:
                    current_chunk = words
                    current_size = sentence_size
            else:
                current_chunk.extend(words)
                current_size += sentence_size

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks

    def store_in_vector_db(self, text: str, metadata: Dict) -> str:
        """Store content in vector database"""
        doc_id = f"{metadata['source_file']}_{metadata['content_type']}_{metadata['chunk_id']}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}"

        self.collection.add(
            documents=[text],
            metadatas=[metadata],
            ids=[doc_id]
        )

        return doc_id

    def process_file(self, file_path: str) -> Dict:
        """Process any supported file type"""
        try:
            file_extension = os.path.splitext(file_path)[1].lower()

            if file_extension == '.xml':
                return self.process_xml_file(file_path)
            elif file_extension == '.pdf':
                return self.process_pdf_file(file_path)
            else:
                return {
                    'success': False,
                    'error': f'Unsupported file type: {file_extension}'
                }
        except Exception as e:
            return {
                'success': False,
                'error': f'Error processing file: {str(e)}'
            }

    def process_xml_file(self, xml_file_path: str) -> Dict:
        """Process XML file with direct embedding"""
        try:
            tree = ET.parse(xml_file_path)
            root = tree.getroot()

            # Process XML into semantic chunks with context
            chunks = []
            current_path = []

            def process_element(element, context=None):
                if context is None:
                    context = {}

                # Create element description
                current_path.append(element.tag)
                element_info = []

                # Add tag information
                element_info.append(f"Element: {element.tag}")
                element_info.append(f"Path: {'/' + '/'.join(current_path)}")

                # Process namespace if present
                if '}' in element.tag:
                    namespace = element.tag.split('}')[0].strip('{')
                    element_info.append(f"Namespace: {namespace}")

                # Process attributes with improved structure
                if element.attrib:
                    for key, value in element.attrib.items():
                        element_info.append(f"Attribute - {key}: {value}")

                # Process text content
                if element.text and element.text.strip():
                    element_info.append(f"Content: {element.text.strip()}")

                # Create chunk text
                chunk_text = " | ".join(element_info)

                # Store chunk with metadata
                chunks.append({
                    'text': chunk_text,
                    'path': '/' + '/'.join(current_path),
                    'context': context.copy(),
                    'element_type': element.tag
                })

                # Process children
                child_context = context.copy()
                if element.attrib:
                    child_context[element.tag] = element.attrib

                for child in element:
                    process_element(child, child_context)

                current_path.pop()

            # Start processing from root
            process_element(root)
            print(f"Generated {len(chunks)} XML chunks")

            results = []
            for i, chunk in enumerate(chunks):
                try:
                    metadata = {
                        'source_file': os.path.basename(xml_file_path),
                        'content_type': 'xml',
                        'chunk_id': i,
                        'total_chunks': len(chunks),
                        'xml_path': chunk['path'],
                        'element_type': chunk['element_type'],
                        'context': json.dumps(chunk['context']),
                        'timestamp': str(datetime.datetime.now())
                    }

                    # Store directly in vector database
                    doc_id = self.store_in_vector_db(chunk['text'], metadata)

                    results.append({
                        'chunk': i,
                        'success': True,
                        'doc_id': doc_id,
                        'text': chunk['text']
                    })

                except Exception as e:
                    print(f"Error processing chunk {i}: {str(e)}")
                    results.append({
                        'chunk': i,
                        'success': False,
                        'error': str(e)
                    })

            return {
                'success': True,
                'total_chunks': len(chunks),
                'results': results
            }

        except Exception as e:
            return {
                'success': False,
                'error': str(e)
            }

    def process_pdf_file(self, pdf_file_path: str) -> Dict:
        """Process PDF file with direct embedding"""
        try:
            full_text = self.extract_text_from_pdf(pdf_file_path)
            chunks = self.chunk_text(full_text)

            print(f"Split PDF into {len(chunks)} chunks")
            results = []

            for i, chunk in enumerate(chunks):
                try:
                    metadata = {
                        'source_file': os.path.basename(pdf_file_path),
                        'content_type': 'pdf',
                        'chunk_id': i,
                        'total_chunks': len(chunks),
                        'timestamp': str(datetime.datetime.now()),
                        'chunk_size': len(chunk.split())
                    }

                    # Store directly in vector database
                    doc_id = self.store_in_vector_db(chunk, metadata)

                    results.append({
                        'chunk': i,
                        'success': True,
                        'doc_id': doc_id,
                        'text': chunk[:200] + "..." if len(chunk) > 200 else chunk
                    })
                except Exception as e:
                    results.append({
                        'chunk': i,
                        'success': False,
                        'error': str(e)
                    })

            return {
                'success': True,
                'total_chunks': len(chunks),
                'results': results
            }

        except Exception as e:
            return {
                'success': False,
                'error': str(e)
            }

    def get_available_files(self) -> Dict[str, List[str]]:
        """Get list of all files in the database"""
        try:
            all_entries = self.collection.get(
                include=['metadatas']
            )

            files = {
                'pdf': set(),
                'xml': set()
            }

            for metadata in all_entries['metadatas']:
                file_type = metadata['content_type']
                file_name = metadata['source_file']
                files[file_type].add(file_name)

            return {
                'pdf': sorted(list(files['pdf'])),
                'xml': sorted(list(files['xml']))
            }
        except Exception as e:
            print(f"Error getting available files: {str(e)}")
            return {'pdf': [], 'xml': []}

    def ask_question_selective(self, question: str, selected_files: List[str], n_results: int = 5) -> str:
        """Ask a question using only the selected files"""
        try:
            filter_dict = {
                'source_file': {'$in': selected_files}
            }

            results = self.collection.query(
                query_texts=[question],
                n_results=n_results,
                where=filter_dict,
                include=["documents", "metadatas"]
            )

            if not results['documents'][0]:
                return "No relevant content found in the selected files."

            # Format answer based on content type
            formatted_answer = []
            for doc, meta in zip(results['documents'][0], results['metadatas'][0]):
                if meta['content_type'] == 'xml':
                    formatted_answer.append(f"Found in XML path: {meta['xml_path']}\n{doc}")
                else:
                    formatted_answer.append(doc)

            # Create response using the matched content
            prompt = f"""Based on these relevant sections, please answer: {question}

Relevant Content:
{' '.join(formatted_answer)}

Please provide a clear, concise answer based on the above content."""

            response = self.groq_client.chat.completions.create(
                messages=[{"role": "user", "content": prompt}],
                model="llama3-8b-8192",
                temperature=0.2
            )

            return response.choices[0].message.content

        except Exception as e:
            return f"Error processing your question: {str(e)}"
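For reference, the processor can also be exercised outside Streamlit. The following is a minimal usage sketch, not part of the commit; reading the key from the GROQ_API_KEY environment variable and the file name example.pdf are assumptions for illustration only.

# Standalone usage sketch (assumptions: GROQ_API_KEY is set in the environment
# and example.pdf exists locally; neither is part of the committed files).
import os
from unified_document_processor import UnifiedDocumentProcessor

processor = UnifiedDocumentProcessor(groq_api_key=os.environ["GROQ_API_KEY"])

result = processor.process_file("example.pdf")
if result["success"]:
    print(f"Stored {result['total_chunks']} chunks")
    answer = processor.ask_question_selective(
        "What is this document about?",
        selected_files=["example.pdf"],
    )
    print(answer)
else:
    print("Processing failed:", result["error"])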