Nde Dilan committed on
Commit
d4e21df
·
1 Parent(s): a5363fd

Add application file

Browse files
Files changed (1) hide show
  1. streamlit.py +130 -0
streamlit.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ from pathlib import Path
4
+ import time
5
+ from main import PDFProcessor, SecurityException
6
+
7
# Page-level settings: browser tab title, tab icon and a full-width layout.
st.set_page_config(page_title="PDF Query Engine", page_icon="📚", layout="wide")
13
+
14
# Build the PDF processor through Streamlit's resource cache.
@st.cache_resource
def get_processor():
    """Return the PDFProcessor used by this app.

    The function is wrapped in ``st.cache_resource`` so that the processor
    is constructed through Streamlit's resource cache instead of being
    rebuilt directly each time this script body executes.
    """
    shared_processor = PDFProcessor()
    return shared_processor


processor = get_processor()
20
+
21
# Folder that receives uploaded PDFs; created on first run, and an
# already-existing folder is left as it is.
upload_dir = Path("./uploads")
upload_dir.mkdir(exist_ok=True)
24
+
25
# Main heading followed by a short explanation of what the app does.
st.title("PDF Query Engine 🔍")
st.markdown(
    """
This application allows you to extract information from PDF documents using natural language queries.
Upload a PDF, wait for it to be processed, then ask questions about its content!
"""
)

# Sidebar: an "About" box with the feature list, then usage instructions.
st.sidebar.header("About")
st.sidebar.info(
    """
    This tool uses natural language processing to extract and query information from PDFs.

    **Features:**
    - Extract text from PDFs
    - Process into semantic chunks
    - Query using natural language
    - Get relevant context from the document
    """
)

st.sidebar.header("Instructions")
st.sidebar.markdown(
    """
    1. Upload a PDF file (max 26MB)
    2. Wait for processing to complete
    3. Type your question in the query box
    4. Review the results
    """
)
52
+
53
# File uploader (restricted to PDF files by the widget's type filter)
uploaded_file = st.file_uploader("Upload a PDF document", type=["pdf"])

# Everything below depends on an uploaded file: the saved copy's path and the
# vector-store directory are only defined inside this branch.
if uploaded_file is not None:
    # The file name is supplied by the client. Keep only its final path
    # component so a name containing separators or ".." cannot make the
    # write below land outside the upload directory.
    safe_name = Path(uploaded_file.name).name
    if safe_name in ("", ".", ".."):
        # Degenerate names have no usable final component; use a fixed one.
        safe_name = "uploaded.pdf"

    # Save the uploaded file so the processor can read it from disk
    temp_file_path = os.path.join(upload_dir, safe_name)
    with open(temp_file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())

    # A directory named after the file hash under the configured database
    # directory marks the file as already processed.
    file_hash = processor.get_file_hash(temp_file_path)
    persist_directory = os.path.join(processor.config["db_directory"], file_hash)
    already_processed = os.path.exists(persist_directory)

    # Display file info (left column) and the processing control (right column)
    col1, col2 = st.columns(2)
    with col1:
        st.success(f"File uploaded: {uploaded_file.name}")

        # Show file size
        file_size = os.path.getsize(temp_file_path) / (1024 * 1024)  # bytes -> MB
        st.info(f"File size: {file_size:.2f} MB")

    with col2:
        if already_processed:
            st.info("This file has already been processed and is ready for querying.")
            process_button = st.button("Re-process file")
        else:
            st.warning("This file needs to be processed before querying.")
            process_button = st.button("Process file")

    # Process the file when the button is clicked
    if process_button:
        try:
            with st.spinner("Processing PDF... This may take a minute."):
                vector_store = processor.process_file(temp_file_path)

            # A falsy result is reported as a failed processing run.
            if vector_store:
                st.success("PDF processed successfully! You can now query the document.")
            else:
                st.error("Failed to process PDF. The file might be empty or corrupted.")
        except SecurityException as e:
            st.error(f"Security error: {e}")
        except Exception as e:
            # Top-level UI boundary: show the failure instead of crashing the app.
            st.error(f"Error processing file: {e}")

    # Query interface
    st.header("Ask questions about the document")

    # Re-check on disk, since processing above may have just created the directory.
    can_query = os.path.exists(persist_directory)

    if can_query:
        query = st.text_input("Enter your question:")
        k_value = st.slider("Number of results to return", min_value=1, max_value=10, value=3)

        # Only search when the button was pressed and the question is non-empty.
        if st.button("Search") and query:
            with st.spinner("Searching for answers..."):
                try:
                    results = processor.query_document(temp_file_path, query, k=k_value)

                    if not results:
                        st.info("No relevant information found. Try rephrasing your question.")
                    else:
                        st.subheader("Search Results")
                        # One collapsible panel per returned chunk, numbered from 1.
                        for position, doc in enumerate(results, start=1):
                            with st.expander(f"Result {position}"):
                                st.markdown(doc.page_content)
                except Exception as e:
                    st.error(f"Error during query: {e}")
    else:
        st.info("Please process the document before querying.")
127
+
128
# Footer: a horizontal rule followed by the credits line.
for _footer_part in ("---", "PDF Query Engine | Built with Streamlit and LangChain"):
    st.markdown(_footer_part)