sidbhasin commited on
Commit
1be21c8
·
verified ·
1 Parent(s): 6fce530

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +137 -105
app.py CHANGED
@@ -4,7 +4,6 @@ import pdfplumber
4
  import torch
5
  from PyPDF2 import PdfReader
6
  import re
7
- import textwrap
8
 
9
  # Set page config
10
  st.set_page_config(
@@ -13,74 +12,71 @@ st.set_page_config(
13
  layout="wide"
14
  )
15
 
16
- # Custom CSS with improved styling
17
  st.markdown("""
18
  <style>
19
- .stApp {
20
- max-width: 1200px;
21
- margin: 0 auto;
 
 
 
 
22
  }
23
- .chat-message {
24
- padding: 1.5rem;
25
- border-radius: 0.5rem;
26
- margin-bottom: 1rem;
27
  display: flex;
28
  flex-direction: column;
29
- color: #ffffff;
 
 
 
30
  }
31
- .chat-message.user {
32
  background-color: #2b313e;
 
 
33
  }
34
- .chat-message.assistant {
35
- background-color: #475063;
 
 
36
  }
37
- .chat-message .content {
38
- display: flex;
39
- margin-bottom: 0.5rem;
40
- padding: 1rem;
41
- border-radius: 0.5rem;
 
42
  }
43
- .chat-message .metadata {
44
- font-size: 0.85rem;
45
- color: #a8a8a8;
46
- margin-top: 0.5rem;
 
 
47
  }
48
  .chat-input {
49
  position: fixed;
50
  bottom: 0;
51
  left: 0;
52
  right: 0;
53
- padding: 1rem;
54
- background-color: #262730;
55
- }
56
- .source-info {
57
- font-size: 0.8rem;
58
- color: #666;
59
- margin-top: 0.5rem;
60
- padding: 0.5rem;
61
- background-color: #f0f2f6;
62
- border-radius: 0.3rem;
63
  }
64
  </style>
65
  """, unsafe_allow_html=True)
66
 
67
- # Initialize session state
68
- if 'messages' not in st.session_state:
69
- st.session_state.messages = []
70
- if 'text_data' not in st.session_state:
71
- st.session_state.text_data = None
72
-
73
  @st.cache_resource
74
- def load_model():
75
  return pipeline(
76
  "question-answering",
77
  model="deepset/roberta-base-squad2",
78
  tokenizer="deepset/roberta-base-squad2"
79
  )
80
 
81
- def extract_text_with_metadata(pdf_file):
82
  text_data = []
83
-
84
  with pdfplumber.open(pdf_file) as pdf:
85
  for page_num, page in enumerate(pdf.pages, 1):
86
  text = page.extract_text()
@@ -88,46 +84,58 @@ def extract_text_with_metadata(pdf_file):
88
  paragraphs = text.split('\n\n')
89
  for para_num, paragraph in enumerate(paragraphs, 1):
90
  if paragraph.strip():
91
- lines = paragraph.split('\n')
92
- for line_num, line in enumerate(lines, 1):
93
- text_data.append({
94
- 'text': line.strip(),
95
- 'page': page_num,
96
- 'paragraph': para_num,
97
- 'line': line_num,
98
- 'full_paragraph': paragraph.strip()
99
- })
100
  return text_data
101
 
102
- def find_answer(question, text_data, qa_model):
103
- full_text = ' '.join([item['text'] for item in text_data])
104
-
105
- try:
106
- result = qa_model(question=question, context=full_text)
107
-
108
- answer_text = result['answer']
109
- answer_score = result['score']
110
-
111
- # Find the source paragraph
112
- for item in text_data:
113
- if answer_text in item['text']:
114
- return {
115
- 'answer': answer_text,
116
- 'confidence': answer_score,
117
- 'page': item['page'],
118
- 'paragraph': item['paragraph'],
119
- 'line': item['line'],
120
- 'context': item['full_paragraph']
 
 
121
  }
122
- except Exception as e:
123
- st.error(f"Error processing question: {str(e)}")
124
- return None
 
 
 
 
 
 
125
 
126
  def main():
127
- st.title("📚 PDF AI Chat")
128
 
 
 
 
 
 
 
 
129
  try:
130
- qa_model = load_model()
131
  except Exception as e:
132
  st.error(f"Error loading model: {str(e)}")
133
  return
@@ -135,67 +143,91 @@ def main():
135
  # File upload
136
  pdf_file = st.file_uploader("Upload PDF Document", type=['pdf'])
137
 
138
- if pdf_file and not st.session_state.text_data:
139
  with st.spinner("Processing PDF..."):
140
  try:
141
- st.session_state.text_data = extract_text_with_metadata(pdf_file)
142
- st.success("PDF processed successfully!")
143
  except Exception as e:
144
  st.error(f"Error processing PDF: {str(e)}")
145
  return
146
 
147
- # Display chat messages
 
 
 
148
  for message in st.session_state.messages:
149
- with st.chat_message(message["role"]):
150
- st.write(message["content"])
151
- if "metadata" in message:
152
- st.markdown(f"""
 
 
 
 
 
 
153
  <div class="source-info">
154
- Source: Page {message['metadata']['page']},
155
- Paragraph {message['metadata']['paragraph']},
156
- Line {message['metadata']['line']}
157
- <br>Confidence: {message['metadata']['confidence']:.2%}
 
 
158
  </div>
159
- """, unsafe_allow_html=True)
 
160
 
161
- # Chat input
162
- if st.session_state.text_data:
163
- if question := st.chat_input("Ask a question about the document"):
164
- # Add user message
 
 
 
 
165
  st.session_state.messages.append({"role": "user", "content": question})
166
 
167
  # Generate answer
168
  with st.spinner("Finding answer..."):
169
- result = find_answer(question, st.session_state.text_data, qa_model)
 
 
 
 
170
 
171
- if result:
172
- # Add assistant message
173
  st.session_state.messages.append({
174
  "role": "assistant",
175
- "content": result['answer'],
176
  "metadata": {
177
- "page": result['page'],
178
- "paragraph": result['paragraph'],
179
- "line": result['line'],
180
- "confidence": result['confidence']
181
  }
182
  })
183
 
184
- # Rerun to update chat display
185
  st.rerun()
 
 
 
186
  else:
187
  st.markdown("""
188
  ### Instructions:
189
  1. Upload a PDF document using the file uploader above
190
  2. Wait for the document to be processed
191
- 3. Start asking questions about the document
192
- 4. Get detailed answers with source information
193
 
194
  ### Features:
195
- - Chat-like interface
196
- - Source tracking
197
  - Confidence scores
198
- - Context preservation
 
199
  """)
200
 
201
  if __name__ == "__main__":
 
4
  import torch
5
  from PyPDF2 import PdfReader
6
  import re
 
7
 
8
  # Set page config
9
  st.set_page_config(
 
12
  layout="wide"
13
  )
14
 
15
# Custom CSS injected through st.markdown: styles the chat transcript
# (user/assistant bubbles), the per-answer source/context boxes, and the
# fixed bottom input bar. unsafe_allow_html is required for raw <style>.
st.markdown("""
<style>
.chat-container {
    display: flex;
    flex-direction: column;
    gap: 20px;
    padding: 20px;
    height: calc(100vh - 200px);
    overflow-y: auto;
}
.message-container {
    display: flex;
    flex-direction: column;
    gap: 10px;
    padding: 15px;
    border-radius: 10px;
    max-width: 90%;
}
.user-message {
    background-color: #2b313e;
    color: white;
    align-self: flex-end;
}
.assistant-message {
    background-color: #f0f2f6;
    color: black;
    align-self: flex-start;
}
.source-info {
    font-size: 0.8em;
    color: #666;
    border-top: 1px solid #ddd;
    margin-top: 10px;
    padding-top: 10px;
}
.context-box {
    background-color: #f8f9fa;
    border-left: 3px solid #1f77b4;
    padding: 10px;
    margin-top: 10px;
    font-size: 0.9em;
}
.chat-input {
    position: fixed;
    bottom: 0;
    left: 0;
    right: 0;
    padding: 20px;
    background: white;
    border-top: 1px solid #ddd;
}
</style>
""", unsafe_allow_html=True)
69
 
 
 
 
 
 
 
70
@st.cache_resource
def load_qa_model():
    """Build and cache the extractive question-answering pipeline.

    Cached via st.cache_resource so the model is loaded once per process,
    not on every Streamlit rerun.
    """
    model_name = "deepset/roberta-base-squad2"
    return pipeline(
        "question-answering",
        model=model_name,
        tokenizer=model_name,
    )
77
 
78
def process_pdf(pdf_file):
    """Extract paragraph-level chunks with location metadata from a PDF.

    Args:
        pdf_file: A path or file-like object accepted by pdfplumber.open.

    Returns:
        A list of dicts, one per non-empty paragraph, with keys:
        'text' (paragraph text), 'page' (1-based page number),
        'paragraph' (1-based paragraph number within the page), and
        'context' (currently identical to 'text'; kept as a separate key
        so display context could later be widened without changing callers).
    """
    text_data = []
    with pdfplumber.open(pdf_file) as pdf:
        for page_num, page in enumerate(pdf.pages, 1):
            text = page.extract_text()
            # extract_text() returns None for pages with no extractable
            # text (e.g. scanned images) — skip them instead of crashing
            # on None.split().
            if not text:
                continue
            for para_num, paragraph in enumerate(text.split('\n\n'), 1):
                stripped = paragraph.strip()
                if stripped:
                    text_data.append({
                        'text': stripped,
                        'page': page_num,
                        'paragraph': para_num,
                        'context': stripped,
                    })
    return text_data
94
 
95
def find_best_answer(question, text_data, qa_model):
    """Run the QA model over every chunk and keep the best-scoring answer.

    Args:
        question: The user's question string.
        text_data: Chunk dicts from process_pdf (keys 'text', 'page',
            'paragraph', 'context').
        qa_model: A question-answering pipeline — any callable accepting
            (question=..., context=..., max_answer_len=...) and returning a
            dict with at least 'answer' and 'score'.

    Returns:
        A (best_answer, relevant_contexts) pair. best_answer is a dict with
        'answer', 'confidence', 'page', 'paragraph', 'context', or None if
        no chunk yielded an answer. relevant_contexts holds the contexts of
        the top three chunks scoring above the relevance threshold, ordered
        by descending score.
    """
    RELEVANCE_THRESHOLD = 0.1
    best_answer = None
    max_score = 0.0
    scored_contexts = []  # (score, context) pairs above the threshold

    for chunk in text_data:
        try:
            result = qa_model(
                question=question,
                context=chunk['text'],
                max_answer_len=100,
            )
        except Exception:
            # Best-effort: one failing chunk should not abort the search.
            # (Previously bound the exception to an unused variable.)
            continue

        score = result['score']
        if score > max_score:
            max_score = score
            best_answer = {
                'answer': result['answer'],
                'confidence': score,
                'page': chunk['page'],
                'paragraph': chunk['paragraph'],
                'context': chunk['context'],
            }

        if score > RELEVANCE_THRESHOLD:
            scored_contexts.append((score, chunk['context']))

    # Fix: return the *top* three contexts by score, as the original comment
    # promised — not merely the first three encountered in document order.
    scored_contexts.sort(key=lambda pair: pair[0], reverse=True)
    return best_answer, [ctx for _, ctx in scored_contexts[:3]]
126
 
127
def main():
    """Streamlit entry point: upload a PDF, then ask questions about it.

    Flow: init session state -> load (cached) QA model -> process uploaded
    PDF once -> render chat history as HTML bubbles -> handle a new question
    and rerun to refresh the transcript.
    """
    st.title("📚 Advanced PDF Question Answering")

    # Initialize session state
    if 'messages' not in st.session_state:
        st.session_state.messages = []
    if 'pdf_data' not in st.session_state:
        st.session_state.pdf_data = None

    # Load QA model (cached across reruns)
    try:
        qa_model = load_qa_model()
    except Exception as e:
        st.error(f"Error loading model: {str(e)}")
        return

    # File upload — process only once per session (pdf_data acts as the flag)
    pdf_file = st.file_uploader("Upload PDF Document", type=['pdf'])

    if pdf_file and not st.session_state.pdf_data:
        with st.spinner("Processing PDF..."):
            try:
                st.session_state.pdf_data = process_pdf(pdf_file)
                st.success("PDF processed successfully! You can now ask questions.")
            except Exception as e:
                st.error(f"Error processing PDF: {str(e)}")
                return

    # Chat interface
    st.markdown('<div class="chat-container">', unsafe_allow_html=True)

    # Display chat history
    for message in st.session_state.messages:
        if message["role"] == "user":
            st.markdown(f"""
            <div class="message-container user-message">
                {message["content"]}
            </div>
            """, unsafe_allow_html=True)
        else:
            st.markdown(f"""
            <div class="message-container assistant-message">
                <div>{message["content"]}</div>
                <div class="source-info">
                    Source: Page {message["metadata"]["page"]},
                    Paragraph {message["metadata"]["paragraph"]}
                    (Confidence: {message["metadata"]["confidence"]:.1%})
                </div>
                <div class="context-box">
                    {message["metadata"]["context"]}
                </div>
            </div>
            """, unsafe_allow_html=True)

    st.markdown('</div>', unsafe_allow_html=True)

    # Question input
    if st.session_state.pdf_data:
        question = st.text_input("Ask a question about the document:", key="question_input")

        # BUG FIX: st.text_input keeps its value across reruns, so after
        # st.rerun() the same question would be re-appended on every run,
        # producing an endless append/rerun loop. Only handle a question
        # that differs from the last one already in the transcript.
        last_user_question = next(
            (m["content"] for m in reversed(st.session_state.messages)
             if m["role"] == "user"),
            None,
        )

        if question and question != last_user_question:
            # Add user question to chat history
            st.session_state.messages.append({"role": "user", "content": question})

            # Generate answer
            with st.spinner("Finding answer..."):
                # relevant_contexts is currently unused by the UI; kept for
                # parity with find_best_answer's return contract.
                answer, relevant_contexts = find_best_answer(
                    question,
                    st.session_state.pdf_data,
                    qa_model
                )

            if answer:
                # Add assistant response to chat history
                st.session_state.messages.append({
                    "role": "assistant",
                    "content": answer["answer"],
                    "metadata": {
                        "page": answer["page"],
                        "paragraph": answer["paragraph"],
                        "confidence": answer["confidence"],
                        "context": answer["context"]
                    }
                })

                # Force refresh so the new exchange renders in the transcript
                st.rerun()
            else:
                st.error("Sorry, I couldn't find a relevant answer in the document.")

    else:
        st.markdown("""
        ### Instructions:
        1. Upload a PDF document using the file uploader above
        2. Wait for the document to be processed
        3. Start asking questions about the content
        4. Get detailed answers with source information and context

        ### Features:
        - Natural conversation interface
        - Source tracking with page numbers
        - Confidence scores
        - Relevant context display
        - Multiple question support
        """)
232
 
233
  if __name__ == "__main__":