Jashan1 commited on
Commit
a5ecab2
1 Parent(s): cca5fb8

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +520 -0
app.py ADDED
@@ -0,0 +1,520 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import asyncio
import io
import json
import logging
import os
import random
import time
import urllib.parse
from io import BytesIO

import aiohttp
import pandas as pd
import requests
import streamlit as st
from dotenv import load_dotenv
from openai import OpenAI
from PyPDF2 import PdfReader
from PyPDF2 import PdfWriter
from st_aggrid import AgGrid, GridOptionsBuilder, GridUpdateMode, DataReturnMode
from streamlit_extras.add_vertical_space import add_vertical_space
from streamlit_extras.colored_header import colored_header
from streamlit_extras.switch_page_button import switch_page
22
+
23
# ---------------------- Configuration ----------------------
# Load environment variables (e.g. OPENAI_API_KEY) from a local .env file.
# One call is enough; load_dotenv does not override already-set variables.
load_dotenv()

# Page configuration is issued before any other Streamlit command.
st.set_page_config(
    page_title="Building Regulations Chatbot",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Fail with a readable message instead of a traceback when the key is absent.
_openai_api_key = os.getenv("OPENAI_API_KEY")
if not _openai_api_key:
    st.error("OPENAI_API_KEY is not set. Add it to the environment or to the .env file.")
    st.stop()

# Shared OpenAI client used by every helper below.
client = OpenAI(api_key=_openai_api_key)

# ---------------------- Session State Initialization ----------------------
# Factories (not shared literals) so each session gets its own list objects.
_SESSION_DEFAULTS = {
    'pdf_contents': list,
    'chat_history': list,
    'processed_pdfs': lambda: False,
    'id_counter': lambda: 0,
    'assistant_id': lambda: None,
    'thread_id': lambda: None,
    'file_ids': list,
}
for _state_key, _make_default in _SESSION_DEFAULTS.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _make_default()
49
+
50
+
51
+ # ---------------------- Helper Functions ----------------------
52
+
53
def get_vector_stores():
    """Return the vector stores listed by the OpenAI client.

    Returns:
        The object returned by ``client.beta.vector_stores.list()``, or a
        string of the form ``"Error retrieving vector stores: ..."`` if the
        call raises. Callers must check for the string case.
    """
    try:
        return client.beta.vector_stores.list()
    except Exception as exc:
        return "Error retrieving vector stores: " + str(exc)
59
+
60
+
61
def fetch_pdfs(city_code):
    """Fetch the PDF metadata records available for a city code.

    Sends a GET request to the OEREBlex endpoint and returns the list stored
    under the ``data`` key of the JSON reply.

    Args:
        city_code: Code identifying the city; inserted into the URL path.

    Returns:
        The value under ``data`` (an empty list if the key is absent), or
        ``None`` when the request fails, times out, answers with a non-200
        status, or returns a body that is not a JSON object. A Streamlit
        error is displayed in every ``None`` case.
    """
    # Quote the user-supplied code so characters like "/" or "?" cannot
    # change the request path.
    quoted_code = urllib.parse.quote(str(city_code).strip(), safe="")
    url = f"http://91.203.213.50:5000/oereblex/{quoted_code}"

    try:
        # requests has no default timeout; without one a stalled server
        # would block the Streamlit script indefinitely.
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        st.error(f"Failed to fetch PDFs for city code {city_code}")
        return None

    if response.status_code != 200:
        st.error(f"Failed to fetch PDFs for city code {city_code}")
        return None

    try:
        data = response.json()
    except ValueError:
        # Raised when the body is not valid JSON.
        st.error(f"Failed to fetch PDFs for city code {city_code}")
        return None

    if not isinstance(data, dict):
        st.error(f"Failed to fetch PDFs for city code {city_code}")
        return None

    return data.get('data', [])
71
+
72
+
73
def download_pdf(url, doc_title):
    """Download a PDF and save it in the current working directory.

    Args:
        url: Location of the PDF. ``https://`` is prepended when the value
            has no ``http://`` or ``https://`` scheme.
        doc_title: Title used to derive the local file name. Only
            alphanumerics, spaces, ``_`` and ``-`` are kept, and spaces
            become underscores.

    Returns:
        The name of the written file, or ``None`` when the URL is missing,
        the download fails, or the file cannot be written. A Streamlit error
        is displayed in every ``None`` case.
    """
    if not isinstance(url, str) or not url.strip():
        # Guards against missing values (e.g. NaN/None from a DataFrame row).
        st.error(f"Failed to download PDF from {url}. Error: missing or invalid URL")
        return None
    url = url.strip()

    # Add 'https://' scheme if it's missing
    if not url.startswith(('http://', 'https://')):
        url = 'https://' + url

    try:
        # Explicit timeout: requests would otherwise wait forever.
        response = requests.get(url, timeout=60)
        response.raise_for_status()  # Raise an exception for bad status codes
    except requests.RequestException as e:
        st.error(f"Failed to download PDF from {url}. Error: {str(e)}")
        return None

    # Sanitize doc_title to create a valid filename
    sanitized_title = ''.join(
        c for c in str(doc_title) if c.isalnum() or c in (' ', '_', '-')
    ).rstrip()
    # Fall back to a generic stem so an all-symbol title does not yield ".pdf".
    sanitized_title = sanitized_title.replace(' ', '_') or "document"
    filename = f"{sanitized_title}.pdf"

    # Keep appending the session counter until the name is actually free;
    # a single check could still overwrite an earlier counter-suffixed file.
    while os.path.exists(filename):
        filename = f"{sanitized_title}_{st.session_state.id_counter}.pdf"
        st.session_state.id_counter += 1

    try:
        with open(filename, 'wb') as f:
            f.write(response.content)
    except OSError as e:
        st.error(f"Failed to save PDF to {filename}. Error: {str(e)}")
        return None

    return filename
100
+
101
+
102
+ # Helper function to upload file to OpenAI
103
def upload_file_to_openai(file_path):
    """Upload a local file to OpenAI for use with assistants.

    Args:
        file_path: Path of the file to upload.

    Returns:
        The id of the created OpenAI file, or ``None`` if opening or
        uploading the file fails (a Streamlit error is displayed).
    """
    try:
        # The context manager closes the handle even when the upload raises;
        # previously the file object was never closed.
        with open(file_path, 'rb') as pdf_file:
            uploaded = client.files.create(
                file=pdf_file,
                purpose='assistants',
            )
        return uploaded.id
    except Exception as e:
        st.error(f"Failed to upload file {file_path}. Error: {str(e)}")
        return None
113
+
114
+
115
def create_assistant():
    """Create the building-regulations assistant and remember its id.

    The assistant is created with the ``file_search`` tool on the
    ``gpt-4o-mini`` model. Its id is stored in
    ``st.session_state.assistant_id`` and also returned.
    """
    assistant_settings = {
        "name": "Building Regulations Assistant",
        "instructions": "You are an expert on building regulations. Use the provided documents to answer questions accurately.",
        "model": "gpt-4o-mini",
        "tools": [{"type": "file_search"}],
    }
    new_assistant_id = client.beta.assistants.create(**assistant_settings).id
    st.session_state.assistant_id = new_assistant_id
    return new_assistant_id
124
+
125
+
126
def format_response(response, citations):
    """Format the response with proper markdown structure.

    Args:
        response: Answer text placed under a ``### Response`` heading.
        citations: Iterable of citation strings rendered as a bullet list
            under a ``### Citations`` heading; a falsy value omits the
            section.

    Returns:
        The markdown text with leading and trailing whitespace stripped.
    """
    # Built with joins rather than a nested f-string: a backslash inside an
    # f-string expression is a SyntaxError before Python 3.12.
    sections = [f"### Response\n{response}"]
    if citations:
        bullet_list = "\n".join(f"- {citation}" for citation in citations)
        sections.append(f"### Citations\n{bullet_list}")
    return "\n\n".join(sections).strip()
136
+
137
def response_generator(response, citations, word_delay=0.05, pause_delay=0.1):
    """Generator for streaming response with structured output.

    Args:
        response: Answer text; it is split on whitespace and yielded one
            word at a time, each followed by a single space.
        citations: Iterable of citation strings yielded as markdown bullets
            after the answer; a falsy value omits the section.
        word_delay: Seconds to sleep after an ordinary word and after each
            citation line.
        pause_delay: Seconds to sleep after a heading and after a word that
            ends with ``.``, ``!``, ``?`` or ``:``.

    Yields:
        Markdown fragments in display order.
    """
    sentence_endings = ('.', '!', '?', ':')

    # First yield the response header
    yield "### Response\n\n"
    time.sleep(pause_delay)

    # Yield the main response word by word, pausing longer at punctuation.
    for word in response.split():
        yield word + " "
        time.sleep(pause_delay if word.endswith(sentence_endings) else word_delay)

    # If there are citations, yield them with proper formatting
    if citations:
        yield "\n\n### Citations\n\n"
        time.sleep(pause_delay)

        for citation in citations:
            yield f"- {citation}\n"
            time.sleep(word_delay)
162
+
163
def _number_citations(message_text):
    """Replace annotation markers in a reply with bracketed numbers.

    Args:
        message_text: Text object of an assistant message; its ``value`` and
            ``annotations`` attributes are read.

    Returns:
        tuple: ``(text, citations)`` where ``text`` is the reply with every
        annotation marker replaced by ``[n]`` and ``citations`` is a list of
        ``"[n] <filename>"`` strings, one per distinct cited file.
    """
    log = logging.getLogger(__name__)
    text = message_text.value
    citations = []
    marker_numbers = {}

    for annotation in message_text.annotations or []:
        file_citation = getattr(annotation, "file_citation", None)
        # Number by cited file so repeated references to one document share a
        # marker and the file name is fetched only once. Annotations without
        # a file citation are keyed by their own text.
        key = file_citation.file_id if file_citation else annotation.text
        first_seen = key not in marker_numbers
        if first_seen:
            marker_numbers[key] = len(marker_numbers)
        number = marker_numbers[key]
        text = text.replace(annotation.text, f"[{number}]")

        if file_citation and first_seen:
            try:
                cited_file = client.files.retrieve(file_citation.file_id)
            except Exception:
                log.exception("Error retrieving cited file %s", file_citation.file_id)
            else:
                citations.append(f"[{number}] {cited_file.filename}")

    return text, citations


def chat_with_assistant(file_ids, user_message):
    """Send a user message to the assistant and render the streamed reply.

    A thread is created on first use and its id kept in
    ``st.session_state.thread_id``; later calls add the message to that
    thread. The run uses ``st.session_state.assistant_id``. The reply is
    streamed into a Streamlit placeholder and then replaced by the final
    formatted markdown.

    Args:
        file_ids: OpenAI file ids attached to the message for ``file_search``.
        user_message: Text of the user's question.

    Returns:
        tuple: ``(formatted_response, citations)``. When the run or message
        retrieval fails, or no text reply exists, the first element is a
        human-readable message and the second an empty list.

    Raises:
        Exception: Errors raised while creating the thread or adding the
            message are not caught here and propagate to the caller.
    """
    log = logging.getLogger(__name__)

    # Create attachments for each file_id
    attachments = [
        {"file_id": file_id, "tools": [{"type": "file_search"}]}
        for file_id in file_ids
    ]

    if st.session_state.thread_id is None:
        thread = client.beta.threads.create(
            messages=[
                {
                    "role": "user",
                    "content": user_message,
                    "attachments": attachments,
                }
            ]
        )
        st.session_state.thread_id = thread.id
        log.debug("Created thread %s", thread.id)
    else:
        message = client.beta.threads.messages.create(
            thread_id=st.session_state.thread_id,
            role="user",
            content=user_message,
            attachments=attachments,
        )
        log.debug("Added message %s to thread %s", message.id, st.session_state.thread_id)

    # The id is already known, so no extra round-trip to retrieve the thread.
    thread_id = st.session_state.thread_id

    try:
        run = client.beta.threads.runs.create_and_poll(
            thread_id=thread_id, assistant_id=st.session_state.assistant_id
        )
    except Exception:
        log.exception("Error during run creation and polling")
        return "An error occurred while processing your request.", []

    if getattr(run, "status", None) != "completed":
        # Still try to read messages below; an unfinished run may have
        # produced partial output.
        log.warning("Run %s finished with status %s", run.id, getattr(run, "status", None))

    try:
        messages = list(client.beta.threads.messages.list(thread_id=thread_id, run_id=run.id))
    except Exception:
        log.exception("Error retrieving messages")
        return "An error occurred while retrieving messages.", []

    if not (messages and messages[0].content):
        return "No response received from the assistant.", []

    # The first content part may not be text (e.g. an image); treat that as
    # "no response" instead of raising AttributeError.
    message_content = getattr(messages[0].content[0], "text", None)
    if message_content is None:
        return "No response received from the assistant.", []

    answer_text, citations = _number_citations(message_content)

    # Create a container for the response with proper styling
    with st.container():
        message_placeholder = st.empty()
        streaming_content = ""

        # Stream the response with structure
        for chunk in response_generator(answer_text, citations):
            streaming_content += chunk
            # Use markdown for proper formatting during streaming
            message_placeholder.markdown(streaming_content + "▌")

        # Final formatted response
        final_formatted_response = format_response(answer_text, citations)
        message_placeholder.markdown(final_formatted_response)

    return final_formatted_response, citations
259
+
260
+
261
+ # ---------------------- Streamlit App ----------------------
262
+
263
+ # ---------------------- Custom CSS Injection ----------------------
264
+
265
# Stylesheet for the chat transcript; kept in a constant so the injection call
# below stays a one-liner.
_CHAT_CSS = """
<style>
/* Style for the chat container */
.chat-container {
    display: flex;
    flex-direction: column;
    gap: 1.5rem;
}

/* Style for individual chat messages */
.chat-message {
    margin-bottom: 1.5rem;
}

/* Style for user messages */
.chat-message.user > div:first-child {
    color: #1E90FF; /* Dodger Blue for "You" */
    font-weight: bold;
    margin-bottom: 0.5rem;
}

/* Style for assistant messages */
.chat-message.assistant > div:first-child {
    color: #32CD32; /* Lime Green for "Assistant" */
    font-weight: bold;
    margin-bottom: 0.5rem;
}

/* Style for the message content */
.message-content {
    padding: 1rem;
    border-radius: 0.5rem;
    line-height: 1.5;
}

.message-content h3 {
    color: #444;
    margin-top: 1rem;
    margin-bottom: 0.5rem;
    font-size: 1.1rem;
}

.message-content ul {
    margin-top: 0.5rem;
    margin-bottom: 0.5rem;
    padding-left: 1.5rem;
}

.message-content li {
    margin-bottom: 0.25rem;
}
</style>
"""

# Inject the custom CSS into the page.
st.markdown(_CHAT_CSS, unsafe_allow_html=True)
319
+
320
# Page selector; each branch below renders one page of the app.
page = st.sidebar.selectbox("Choose a page", ["Documents", "Home", "Admin"])

if page == "Home":
    st.title("Building Regulations Chatbot", anchor=False)

    # Sidebar: list the documents chosen on the Documents page.
    with st.sidebar:
        colored_header("Selected Documents", description="Documents for chat")
        if 'selected_pdfs' in st.session_state and not st.session_state.selected_pdfs.empty:
            for _, pdf in st.session_state.selected_pdfs.iterrows():
                st.write(f"- {pdf['Doc Title']}")
        else:
            st.write("No documents selected. Please go to the Documents page.")

    # Main chat area
    colored_header("Chat", description="Ask questions about building regulations")

    # Chat container with custom CSS class
    st.markdown('<div class="chat-container" id="chat-container">', unsafe_allow_html=True)
    for chat in st.session_state.chat_history:
        if chat['role'] == 'user':
            role_class, speaker = "user", "You"
        else:
            role_class, speaker = "assistant", "Assistant"
        with st.container():
            # Only the fixed speaker label is emitted as raw HTML.
            st.markdown(
                f'<div class="chat-message {role_class}">'
                f'<div><strong>{speaker}</strong></div>'
                '</div>',
                unsafe_allow_html=True,
            )
            # Message text goes through plain st.markdown: HTML typed by the
            # user or returned by the model is escaped rather than injected
            # into the page, and markdown headings/lists render properly.
            st.markdown(str(chat['content']))
    st.markdown('</div>', unsafe_allow_html=True)

    # Inject JavaScript to auto-scroll the chat container
    # NOTE(review): script tags passed through st.markdown are probably not
    # executed by the browser; confirm whether this has any effect.
    st.markdown("""
<script>
const chatContainer = document.getElementById('chat-container');
if (chatContainer) {
    chatContainer.scrollTop = chatContainer.scrollHeight;
}
</script>
""", unsafe_allow_html=True)

    # Chat input
    with st.form("chat_form", clear_on_submit=True):
        user_input = st.text_area("Ask a question about building regulations...", height=100)
        col1, col2 = st.columns([3, 1])
        with col2:
            submit = st.form_submit_button("Send", use_container_width=True)

    if submit and user_input.strip() != "":
        # Add user message to chat history
        st.session_state.chat_history.append({"role": "user", "content": user_input})

        if not st.session_state.file_ids:
            st.error("Please process PDFs first.")
        else:
            with st.spinner("Generating response..."):
                try:
                    response, _ = chat_with_assistant(st.session_state.file_ids, user_input)
                except Exception as e:
                    st.error(f"Error generating response: {str(e)}")
                else:
                    # The response is already formatted, so it is stored as-is.
                    st.session_state.chat_history.append({
                        "role": "assistant",
                        "content": response
                    })
                    # Rerun only after a successful reply so the transcript
                    # refreshes; rerunning after st.error would wipe the
                    # message before the user can read it.
                    st.rerun()

    # Footer
    add_vertical_space(2)
    st.markdown("---")
    col1, col2 = st.columns(2)
    with col1:
        st.caption("Powered by OpenAI GPT-4 and Pinecone")
    with col2:
        st.caption("© 2023 Your Company Name")
403
+
404
+ elif page == "Documents":
405
+ st.title("Document Selection")
406
+
407
+ city_code_input = st.text_input("Enter city code:", key="city_code_input")
408
+ load_documents_button = st.button("Load Documents", key="load_documents_button")
409
+
410
+ if load_documents_button and city_code_input:
411
+ with st.spinner("Fetching PDFs..."):
412
+ pdfs = fetch_pdfs(city_code_input)
413
+ if pdfs:
414
+ st.session_state.available_pdfs = pdfs
415
+ st.success(f"Found {len(pdfs)} PDFs")
416
+ else:
417
+ st.error("No PDFs found")
418
+
419
+ if 'available_pdfs' in st.session_state:
420
+ st.write(f"Total PDFs: {len(st.session_state.available_pdfs)}")
421
+
422
+ # Create a DataFrame from the available PDFs
423
+ df = pd.DataFrame(st.session_state.available_pdfs)
424
+
425
+ # Select and rename only the specified columns
426
+ df = df[['municipality', 'abbreviation', 'doc_title', 'file_title', 'file_href', 'enactment_date', 'prio']]
427
+ df = df.rename(columns={
428
+ "municipality": "Municipality",
429
+ "abbreviation": "Abbreviation",
430
+ "doc_title": "Doc Title",
431
+ "file_title": "File Title",
432
+ "file_href": "File Href",
433
+ "enactment_date": "Enactment Date",
434
+ "prio": "Prio"
435
+ })
436
+
437
+ # Add a checkbox column to the DataFrame at the beginning
438
+ df.insert(0, "Select", False)
439
+
440
+ # Configure grid options
441
+ gb = GridOptionsBuilder.from_dataframe(df)
442
+ gb.configure_default_column(enablePivot=True, enableValue=True, enableRowGroup=True)
443
+ gb.configure_column("Select", header_name="Select", cellRenderer='checkboxRenderer')
444
+ gb.configure_column("File Href", cellRenderer='linkRenderer')
445
+ gb.configure_selection(selection_mode="multiple", use_checkbox=True)
446
+ gb.configure_side_bar()
447
+ gridOptions = gb.build()
448
+
449
+ # Display the AgGrid
450
+ grid_response = AgGrid(
451
+ df,
452
+ gridOptions=gridOptions,
453
+ enable_enterprise_modules=True,
454
+ update_mode=GridUpdateMode.MODEL_CHANGED,
455
+ data_return_mode=DataReturnMode.FILTERED_AND_SORTED,
456
+ fit_columns_on_grid_load=False,
457
+ )
458
+
459
+ # Get the selected rows
460
+ selected_rows = grid_response['selected_rows']
461
+
462
+ # Debug: Print the structure of selected_rows
463
+ st.write("Debug - Selected Rows Structure:", selected_rows)
464
+
465
+ if st.button("Process Selected PDFs"):
466
+ if len(selected_rows) > 0: # Check if there are any selected rows
467
+ # Convert selected_rows to a DataFrame
468
+ st.session_state.selected_pdfs = pd.DataFrame(selected_rows)
469
+ st.session_state.assistant_id = create_assistant()
470
+ with st.spinner("Processing PDFs and creating/updating assistant..."):
471
+ file_ids = []
472
+
473
+ for _, pdf in st.session_state.selected_pdfs.iterrows():
474
+ # Debug: Print each pdf item
475
+ st.write("Debug - PDF item:", pdf)
476
+
477
+ file_href = pdf['File Href']
478
+ doc_title = pdf['Doc Title']
479
+
480
+ # Pass doc_title to download_pdf
481
+ file_name = download_pdf(file_href, doc_title)
482
+ if file_name:
483
+ file_path = f"./{file_name}"
484
+ file_id = upload_file_to_openai(file_path)
485
+ if file_id:
486
+ file_ids.append(file_id)
487
+ else:
488
+ st.warning(f"Failed to upload {doc_title}. Skipping this file.")
489
+ else:
490
+ st.warning(f"Failed to download {doc_title}. Skipping this file.")
491
+
492
+ st.session_state.file_ids = file_ids
493
+ st.success("PDFs processed successfully. You can now chat on the Home page.")
494
+ else:
495
+ st.warning("Select at least one PDF.")
496
+
497
+
498
+ elif page == "Admin":
499
+ st.title("Admin Panel")
500
+ st.header("Vector Stores Information")
501
+
502
+ vector_stores = get_vector_stores()
503
+ json_vector_stores = json.dumps([vs.model_dump() for vs in vector_stores])
504
+ st.write(json_vector_stores)
505
+
506
+ # Add a button to go back to the main page
507
+
508
+
509
+
510
+
511
+
512
+
513
+
514
+
515
+
516
+
517
+
518
+
519
+
520
+