Upload 3 files
- app.py +235 -0
- requirements.txt.txt +5 -0
- unified_document_processor.py +380 -0
app.py
ADDED
@@ -0,0 +1,235 @@
import streamlit as st
import os
import tempfile
from typing import List
from unified_document_processor import UnifiedDocumentProcessor, CustomEmbeddingFunction
import chromadb
from chromadb.config import Settings
from groq import Groq

def initialize_session_state():
    """Initialize all session state variables"""
    if 'CHROMADB_DIR' not in st.session_state:
        st.session_state.CHROMADB_DIR = os.path.join(os.getcwd(), 'chromadb_data')
        os.makedirs(st.session_state.CHROMADB_DIR, exist_ok=True)

    if 'processed_files' not in st.session_state:
        st.session_state.processed_files = set()

    if 'processor' not in st.session_state:
        st.session_state.processor = None  # Will be initialized in StreamlitDocProcessor

class StreamlitDocProcessor:
    def __init__(self):
        if st.session_state.processor is None:
            try:
                groq_api_key = st.secrets["GROQ_API_KEY"]
                # Initialize processor with persistent ChromaDB
                st.session_state.processor = self.initialize_processor(groq_api_key)
                # Update processed files after initializing processor
                st.session_state.processed_files = self.get_processed_files()
            except Exception as e:
                st.error(f"Error initializing processor: {str(e)}")
                return

    def initialize_processor(self, groq_api_key):
        """Initialize the processor with persistent ChromaDB"""
        class PersistentUnifiedDocumentProcessor(UnifiedDocumentProcessor):
            def __init__(self, api_key, collection_name="unified_content", persist_dir=None):
                self.groq_client = Groq(api_key=api_key)
                self.max_elements_per_chunk = 50
                self.pdf_chunk_size = 500
                self.pdf_overlap = 50
                self._initialize_nltk()

                # Initialize persistent ChromaDB
                self.chroma_client = chromadb.PersistentClient(
                    path=persist_dir,
                    settings=Settings(
                        allow_reset=True,
                        is_persistent=True
                    )
                )

                # Get or create collection
                try:
                    self.collection = self.chroma_client.get_collection(
                        name=collection_name,
                        embedding_function=CustomEmbeddingFunction()
                    )
                except Exception:
                    self.collection = self.chroma_client.create_collection(
                        name=collection_name,
                        embedding_function=CustomEmbeddingFunction()
                    )

        return PersistentUnifiedDocumentProcessor(
            groq_api_key,
            persist_dir=st.session_state.CHROMADB_DIR
        )

    def get_processed_files(self) -> set:
        """Get the set of processed file names from ChromaDB"""
        try:
            if st.session_state.processor:
                available_files = st.session_state.processor.get_available_files()
                return set(available_files['pdf'] + available_files['xml'])
            return set()
        except Exception as e:
            st.error(f"Error getting processed files: {str(e)}")
            return set()

    def run(self):
        st.title("AAS Assistant")

        # Create sidebar for navigation
        page = st.sidebar.selectbox(
            "Choose a page",
            ["Upload & Process", "Query"]
        )

        if page == "Upload & Process":
            self.upload_and_process_page()
        else:
            self.qa_page()

    def upload_and_process_page(self):
        st.header("Upload and Process Documents")

        # File uploader
        uploaded_files = st.file_uploader(
            "Upload PDF or XML files",
            type=['pdf', 'xml'],
            accept_multiple_files=True
        )

        if uploaded_files:
            for uploaded_file in uploaded_files:
                # Create progress bar
                progress_bar = st.progress(0)
                status_text = st.empty()

                if uploaded_file.name not in st.session_state.processed_files:
                    temp_path = None
                    try:
                        # Create a temporary file
                        with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(uploaded_file.name)[1]) as tmp_file:
                            tmp_file.write(uploaded_file.getbuffer())
                            temp_path = tmp_file.name

                        # Process the file
                        status_text.text(f'Processing {uploaded_file.name}...')
                        progress_bar.progress(25)

                        result = st.session_state.processor.process_file(temp_path)
                        progress_bar.progress(75)

                        if result['success']:
                            st.session_state.processed_files.add(uploaded_file.name)
                            progress_bar.progress(100)
                            status_text.success(f"Successfully processed {uploaded_file.name}")
                        else:
                            progress_bar.progress(100)
                            status_text.error(f"Failed to process {uploaded_file.name}: {result['error']}")

                    except Exception as e:
                        status_text.error(f"Error processing {uploaded_file.name}: {str(e)}")
                    finally:
                        # Clean up temporary file
                        if temp_path:
                            try:
                                os.unlink(temp_path)
                            except OSError:
                                pass
                else:
                    status_text.info(f"{uploaded_file.name} has already been processed")
                    progress_bar.progress(100)

        # Display processed files
        if st.session_state.processed_files:
            st.subheader("Processed Files")
            for file in sorted(st.session_state.processed_files):
                st.text(f"✓ {file}")

    def qa_page(self):
        st.header("Query our database")

        try:
            # Refresh available files
            st.session_state.processed_files = self.get_processed_files()

            if not st.session_state.processed_files:
                st.warning("No processed files available. Please upload and process some files first.")
                return

            # Enhanced file selection with type indicators
            available_files = st.session_state.processor.get_available_files()
            xml_files = [f"📱 {f}" for f in available_files['xml']]
            pdf_files = [f"📄 {f}" for f in available_files['pdf']]
            all_files = sorted(xml_files + pdf_files)

            selected_files = st.multiselect(
                "Select files to search through",
                all_files,
                default=all_files,
                help="📱 = XML files, 📄 = PDF files"
            )

            # Clean up the file names (remove emoji prefix) for processing
            selected_files = [f[2:] for f in selected_files]

            if not selected_files:
                st.warning("Please select at least one file to search through.")
                return

            # Question input with suggested prompts for XML
            xml_selected = any(f.endswith('.xml') for f in selected_files)
            if xml_selected:
                st.info("Suggested questions for XML content:\n" +
                        "• What are the main components and their relationships?\n" +
                        "• What data types and properties are defined?\n" +
                        "• How are the elements structured and organized?")

            question = st.text_input("Enter your question:")

            if st.button("Ask Question") and question:
                try:
                    with st.spinner("Searching for answer..."):
                        answer = st.session_state.processor.ask_question_selective(
                            question,
                            selected_files
                        )

                    # Display the answer in a structured way
                    st.write("Answer:", answer)

                    # If XML files were queried, show additional metadata
                    # (only when the processor provides get_xml_structure_info)
                    if xml_selected and hasattr(st.session_state.processor, 'get_xml_structure_info'):
                        with st.expander("Show XML Structure Details"):
                            st.write("Related XML Elements:")
                            # Get the structure information from the processor
                            xml_details = st.session_state.processor.get_xml_structure_info(
                                selected_files,
                                question
                            )
                            for detail in xml_details:
                                st.code(detail, language="xml")

                except Exception as e:
                    st.error(f"Error getting answer: {str(e)}")

        except Exception as e:
            st.error(f"Error in Q&A interface: {str(e)}")


def main():
    # Initialize session state
    initialize_session_state()

    # Create and run app
    app = StreamlitDocProcessor()
    app.run()

if __name__ == "__main__":
    main()
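Note: app.py reads the Groq key via st.secrets["GROQ_API_KEY"], so the Space (or a local .streamlit/secrets.toml) must provide that secret. To check what the app has persisted, the snippet below is a minimal, hypothetical inspection sketch (not part of this commit); it assumes it is run from the repository root after the app has written at least one chunk into ./chromadb_data.

# Hypothetical inspection sketch; assumes ./chromadb_data already exists
# and was populated by the app (not part of the uploaded files).
import chromadb
from chromadb.config import Settings
from unified_document_processor import CustomEmbeddingFunction

client = chromadb.PersistentClient(
    path="chromadb_data",
    settings=Settings(allow_reset=True, is_persistent=True),
)
collection = client.get_collection(
    name="unified_content",
    embedding_function=CustomEmbeddingFunction(),
)
print(f"{collection.count()} chunks stored")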
requirements.txt.txt
ADDED
@@ -0,0 +1,5 @@
groq
chromadb
sentence-transformers
PyPDF2
nltk
unified_document_processor.py
ADDED
@@ -0,0 +1,380 @@
from typing import List, Dict, Union
from groq import Groq
import chromadb
import os
import datetime
import json
import xml.etree.ElementTree as ET
import nltk
from nltk.tokenize import sent_tokenize
import PyPDF2
from sentence_transformers import SentenceTransformer

class CustomEmbeddingFunction:
    def __init__(self):
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

    def __call__(self, input: List[str]) -> List[List[float]]:
        embeddings = self.model.encode(input)
        return embeddings.tolist()

class UnifiedDocumentProcessor:
    def __init__(self, groq_api_key, collection_name="unified_content"):
        """Initialize the processor with necessary clients"""
        self.groq_client = Groq(api_key=groq_api_key)

        # XML-specific settings
        self.max_elements_per_chunk = 50

        # PDF-specific settings
        self.pdf_chunk_size = 500
        self.pdf_overlap = 50

        # Initialize NLTK
        self._initialize_nltk()

        # Initialize ChromaDB with a single collection for all document types
        self.chroma_client = chromadb.Client()
        existing_collections = self.chroma_client.list_collections()
        collection_exists = any(col.name == collection_name for col in existing_collections)

        if collection_exists:
            print(f"Using existing collection: {collection_name}")
            self.collection = self.chroma_client.get_collection(
                name=collection_name,
                embedding_function=CustomEmbeddingFunction()
            )
        else:
            print(f"Creating new collection: {collection_name}")
            self.collection = self.chroma_client.create_collection(
                name=collection_name,
                embedding_function=CustomEmbeddingFunction()
            )

    def _initialize_nltk(self):
        """Ensure both NLTK resources are available."""
        try:
            nltk.download('punkt')
            try:
                nltk.data.find('tokenizers/punkt_tab')
            except LookupError:
                nltk.download('punkt_tab')
        except Exception as e:
            print(f"Warning: Error downloading NLTK resources: {str(e)}")
            print("Falling back to basic sentence splitting...")

    def _basic_sentence_split(self, text: str) -> List[str]:
        """Fallback method for sentence tokenization"""
        sentences = []
        current = ""

        for char in text:
            current += char
            if char in ['.', '!', '?'] and len(current.strip()) > 0:
                sentences.append(current.strip())
                current = ""

        if current.strip():
            sentences.append(current.strip())

        return sentences

    def extract_text_from_pdf(self, pdf_path: str) -> str:
        """Extract text from PDF file"""
        try:
            text = ""
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                for page in pdf_reader.pages:
                    text += page.extract_text() + " "
            return text.strip()
        except Exception as e:
            raise Exception(f"Error extracting text from PDF: {str(e)}")

    def chunk_text(self, text: str) -> List[str]:
        """Split text into chunks while preserving sentence boundaries"""
        try:
            sentences = sent_tokenize(text)
        except Exception as e:
            print(f"Warning: Using fallback sentence splitting: {str(e)}")
            sentences = self._basic_sentence_split(text)

        chunks = []
        current_chunk = []
        current_size = 0

        for sentence in sentences:
            words = sentence.split()
            sentence_size = len(words)

            if current_size + sentence_size > self.pdf_chunk_size:
                if current_chunk:
                    chunks.append(' '.join(current_chunk))
                    overlap_words = current_chunk[-self.pdf_overlap:] if self.pdf_overlap > 0 else []
                    current_chunk = overlap_words + words
                    current_size = len(current_chunk)
                else:
                    current_chunk = words
                    current_size = sentence_size
            else:
                current_chunk.extend(words)
                current_size += sentence_size

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks

    def store_in_vector_db(self, text: str, metadata: Dict) -> str:
        """Store content in vector database"""
        doc_id = f"{metadata['source_file']}_{metadata['content_type']}_{metadata['chunk_id']}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}"

        self.collection.add(
            documents=[text],
            metadatas=[metadata],
            ids=[doc_id]
        )

        return doc_id

    def process_file(self, file_path: str) -> Dict:
        """Process any supported file type"""
        try:
            file_extension = os.path.splitext(file_path)[1].lower()

            if file_extension == '.xml':
                return self.process_xml_file(file_path)
            elif file_extension == '.pdf':
                return self.process_pdf_file(file_path)
            else:
                return {
                    'success': False,
                    'error': f'Unsupported file type: {file_extension}'
                }
        except Exception as e:
            return {
                'success': False,
                'error': f'Error processing file: {str(e)}'
            }

    def process_xml_file(self, xml_file_path: str) -> Dict:
        """Process XML file with direct embedding"""
        try:
            tree = ET.parse(xml_file_path)
            root = tree.getroot()

            # Process XML into semantic chunks with context
            chunks = []
            current_path = []

            def process_element(element, context=None):
                if context is None:
                    context = {}

                # Create element description
                current_path.append(element.tag)
                element_info = []

                # Add tag information
                element_info.append(f"Element: {element.tag}")
                element_info.append(f"Path: {'/' + '/'.join(current_path)}")

                # Process namespace if present
                if '}' in element.tag:
                    namespace = element.tag.split('}')[0].strip('{')
                    element_info.append(f"Namespace: {namespace}")

                # Process attributes with improved structure
                if element.attrib:
                    for key, value in element.attrib.items():
                        element_info.append(f"Attribute - {key}: {value}")

                # Process text content
                if element.text and element.text.strip():
                    element_info.append(f"Content: {element.text.strip()}")

                # Create chunk text
                chunk_text = " | ".join(element_info)

                # Store chunk with metadata
                chunks.append({
                    'text': chunk_text,
                    'path': '/' + '/'.join(current_path),
                    'context': context.copy(),
                    'element_type': element.tag
                })

                # Process children
                child_context = context.copy()
                if element.attrib:
                    child_context[element.tag] = element.attrib

                for child in element:
                    process_element(child, child_context)

                current_path.pop()

            # Start processing from root
            process_element(root)
            print(f"Generated {len(chunks)} XML chunks")

            results = []
            for i, chunk in enumerate(chunks):
                try:
                    metadata = {
                        'source_file': os.path.basename(xml_file_path),
                        'content_type': 'xml',
                        'chunk_id': i,
                        'total_chunks': len(chunks),
                        'xml_path': chunk['path'],
                        'element_type': chunk['element_type'],
                        'context': json.dumps(chunk['context']),
                        'timestamp': str(datetime.datetime.now())
                    }

                    # Store directly in vector database
                    doc_id = self.store_in_vector_db(chunk['text'], metadata)

                    results.append({
                        'chunk': i,
                        'success': True,
                        'doc_id': doc_id,
                        'text': chunk['text']
                    })

                except Exception as e:
                    print(f"Error processing chunk {i}: {str(e)}")
                    results.append({
                        'chunk': i,
                        'success': False,
                        'error': str(e)
                    })

            return {
                'success': True,
                'total_chunks': len(chunks),
                'results': results
            }

        except Exception as e:
            return {
                'success': False,
                'error': str(e)
            }

    def process_pdf_file(self, pdf_file_path: str) -> Dict:
        """Process PDF file with direct embedding"""
        try:
            full_text = self.extract_text_from_pdf(pdf_file_path)
            chunks = self.chunk_text(full_text)

            print(f"Split PDF into {len(chunks)} chunks")
            results = []

            for i, chunk in enumerate(chunks):
                try:
                    metadata = {
                        'source_file': os.path.basename(pdf_file_path),
                        'content_type': 'pdf',
                        'chunk_id': i,
                        'total_chunks': len(chunks),
                        'timestamp': str(datetime.datetime.now()),
                        'chunk_size': len(chunk.split())
                    }

                    # Store directly in vector database
                    doc_id = self.store_in_vector_db(chunk, metadata)

                    results.append({
                        'chunk': i,
                        'success': True,
                        'doc_id': doc_id,
                        'text': chunk[:200] + "..." if len(chunk) > 200 else chunk
                    })
                except Exception as e:
                    results.append({
                        'chunk': i,
                        'success': False,
                        'error': str(e)
                    })

            return {
                'success': True,
                'total_chunks': len(chunks),
                'results': results
            }

        except Exception as e:
            return {
                'success': False,
                'error': str(e)
            }

    def get_available_files(self) -> Dict[str, List[str]]:
        """Get list of all files in the database"""
        try:
            all_entries = self.collection.get(
                include=['metadatas']
            )

            files = {
                'pdf': set(),
                'xml': set()
            }

            for metadata in all_entries['metadatas']:
                file_type = metadata['content_type']
                file_name = metadata['source_file']
                files[file_type].add(file_name)

            return {
                'pdf': sorted(list(files['pdf'])),
                'xml': sorted(list(files['xml']))
            }
        except Exception as e:
            print(f"Error getting available files: {str(e)}")
            return {'pdf': [], 'xml': []}

    def ask_question_selective(self, question: str, selected_files: List[str], n_results: int = 5) -> str:
        """Ask a question using only the selected files"""
        try:
            filter_dict = {
                'source_file': {'$in': selected_files}
            }

            results = self.collection.query(
                query_texts=[question],
                n_results=n_results,
                where=filter_dict,
                include=["documents", "metadatas"]
            )

            if not results['documents'][0]:
                return "No relevant content found in the selected files."

            # Format answer based on content type
            formatted_answer = []
            for doc, meta in zip(results['documents'][0], results['metadatas'][0]):
                if meta['content_type'] == 'xml':
                    formatted_answer.append(f"Found in XML path: {meta['xml_path']}\n{doc}")
                else:
                    formatted_answer.append(doc)

            # Create response using the matched content
            prompt = f"""Based on these relevant sections, please answer: {question}

Relevant Content:
{' '.join(formatted_answer)}

Please provide a clear, concise answer based on the above content."""

            response = self.groq_client.chat.completions.create(
                messages=[{"role": "user", "content": prompt}],
                model="llama3-8b-8192",
                temperature=0.2
            )

            return response.choices[0].message.content

        except Exception as e:
            return f"Error processing your question: {str(e)}"
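For reference, the processor can also be exercised outside Streamlit. The following is a minimal usage sketch, not part of the commit; reading the key from the GROQ_API_KEY environment variable and the file name example.pdf are assumptions for illustration only.

# Standalone usage sketch (assumptions: GROQ_API_KEY is set in the environment
# and example.pdf exists locally; neither is part of the committed files).
import os
from unified_document_processor import UnifiedDocumentProcessor

processor = UnifiedDocumentProcessor(groq_api_key=os.environ["GROQ_API_KEY"])

result = processor.process_file("example.pdf")
if result["success"]:
    print(f"Stored {result['total_chunks']} chunks")
    answer = processor.ask_question_selective(
        "What is this document about?",
        selected_files=["example.pdf"],
    )
    print(answer)
else:
    print("Processing failed:", result["error"])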