# Streamlit app: select building-regulation PDFs for a city, upload them to
# OpenAI, and chat about them through an Assistants API thread that uses the
# file_search tool.
import io
import json
import logging
import os
import urllib.parse
from io import BytesIO

import pandas as pd
import requests
import streamlit as st
from dotenv import load_dotenv
from openai import OpenAI
from PyPDF2 import PdfReader
from st_aggrid import AgGrid, GridOptionsBuilder, GridUpdateMode, DataReturnMode
from streamlit_extras.add_vertical_space import add_vertical_space
from streamlit_extras.colored_header import colored_header
from streamlit_extras.switch_page_button import switch_page

logger = logging.getLogger(__name__)

# ---------------------- Configuration ----------------------
st.set_page_config(page_title="Building Regulations Chatbot", layout="wide", initial_sidebar_state="expanded")

# Load environment variables from .env file
load_dotenv()

# Single shared OpenAI client, configured from the environment
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Endpoint that lists the PDFs available for a city code
PDF_INDEX_BASE_URL = "http://91.203.213.50:5000/oereblex"

# Applied to every outgoing HTTP request so a stalled server cannot hang the UI
REQUEST_TIMEOUT_SECONDS = 30

# ---------------------- Session State Initialization ----------------------
# 'file_ids' is read by the Home page before the Documents page has ever run,
# so it must exist from the first script execution.
_SESSION_DEFAULTS = {
    'pdf_contents': [],
    'chat_history': [],
    'processed_pdfs': False,
    'id_counter': 0,
    'assistant_id': None,
    'thread_id': None,
    'file_ids': [],
}
for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default


# ---------------------- Helper Functions ----------------------
def get_vector_stores():
    """Return the account's vector stores, or an error string on failure.

    Returns:
        The object returned by ``client.beta.vector_stores.list()``, or a
        ``str`` starting with "Error retrieving vector stores:" if the call
        raised. Callers must check for the string case before iterating.
    """
    try:
        return client.beta.vector_stores.list()
    except Exception as e:
        return f"Error retrieving vector stores: {str(e)}"


def fetch_pdfs(city_code):
    """Fetch the list of PDF records for ``city_code`` from the index service.

    Args:
        city_code: Value typed by the user; it is URL-encoded before being
            placed in the request path.

    Returns:
        The value stored under the ``data`` key of the JSON response
        (``[]`` when the key is absent), or ``None`` when the request fails,
        returns a non-200 status, or the body is not valid JSON. An error is
        shown in the UI for every ``None`` result.
    """
    encoded_code = urllib.parse.quote(str(city_code), safe="")
    url = f"{PDF_INDEX_BASE_URL}/{encoded_code}"

    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as e:
        logger.warning("PDF index request failed for %s: %s", url, e)
        st.error(f"Failed to fetch PDFs for city code {city_code}")
        return None

    if response.status_code != 200:
        st.error(f"Failed to fetch PDFs for city code {city_code}")
        return None

    try:
        data = response.json()
    except ValueError as e:
        logger.warning("PDF index returned invalid JSON for %s: %s", url, e)
        st.error(f"Failed to fetch PDFs for city code {city_code}")
        return None

    records = data.get('data', [])
    logger.debug("First data: %s", records[0] if records else None)
    return records


def download_pdf(url):
    """Download ``url`` into the working directory and return the file name.

    A missing scheme is assumed to be ``https://``. The file name is derived
    from ``st.session_state.id_counter``, which is incremented on success.

    Returns:
        The name of the written file, or ``None`` if the request failed (an
        error is shown in the UI).
    """
    # Add 'https://' scheme if it's missing
    if not url.startswith(('http://', 'https://')):
        url = 'https://' + url

    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()  # Raise an exception for bad status codes
    except requests.RequestException as e:
        st.error(f"Failed to download PDF from {url}. Error: {str(e)}")
        return None

    # Generate a unique filename
    filename = f"downloaded_pdf_{st.session_state.id_counter}.pdf"
    st.session_state.id_counter += 1

    # Save the PDF content to a file
    with open(filename, 'wb') as f:
        f.write(response.content)
    return filename


def upload_file_to_openai(file_path):
    """Upload the file at ``file_path`` to OpenAI with purpose 'assistants'.

    Returns:
        The uploaded file's id, or ``None`` if opening or uploading failed
        (an error is shown in the UI).
    """
    try:
        # Context manager guarantees the handle is closed even if the upload raises.
        with open(file_path, 'rb') as pdf_file:
            uploaded = client.files.create(file=pdf_file, purpose='assistants')
        return uploaded.id
    except Exception as e:
        st.error(f"Failed to upload file {file_path}. Error: {str(e)}")
        return None


def create_assistant():
    """Create the file_search-enabled assistant and remember its id.

    Returns:
        The new assistant's id, which is also stored in
        ``st.session_state.assistant_id``.
    """
    assistant = client.beta.assistants.create(
        name="Building Regulations Assistant",
        instructions="You are an expert on building regulations. Use the provided documents to answer questions accurately.",
        model="gpt-4o-mini",
        tools=[{"type": "file_search"}]
    )
    st.session_state.assistant_id = assistant.id
    return assistant.id


def chat_with_assistant(file_ids, user_message):
    """Send ``user_message`` to the assistant and return its reply.

    The first call creates a thread and stores its id in
    ``st.session_state.thread_id``; later calls append to that thread. Every
    message attaches all ``file_ids`` for the file_search tool.

    Args:
        file_ids: Ids of already-uploaded OpenAI files.
        user_message: The user's question.

    Returns:
        A ``(text, citations)`` tuple. In ``text`` each annotation is replaced
        by ``[index]``; ``citations`` holds one ``"[index] filename"`` entry
        per file citation.

    Raises:
        RuntimeError: If the run does not complete or yields no reply.
        Exception: Any error raised by the OpenAI client is propagated.
    """
    attachments = [
        {"file_id": file_id, "tools": [{"type": "file_search"}]}
        for file_id in file_ids
    ]
    logger.debug("attachments: %s", attachments)

    if st.session_state.thread_id is None:
        thread = client.beta.threads.create(
            messages=[
                {
                    "role": "user",
                    "content": user_message,
                    "attachments": attachments,
                }
            ]
        )
        st.session_state.thread_id = thread.id
    else:
        # messages.create returns a Message, not a Thread, so its result must
        # not be used as the thread below.
        client.beta.threads.messages.create(
            thread_id=st.session_state.thread_id,
            role="user",
            content=user_message,
            attachments=attachments
        )
    thread_id = st.session_state.thread_id

    logger.debug("assistant_id: %s", st.session_state.assistant_id)
    logger.debug("thread_id: %s", thread_id)

    run = client.beta.threads.runs.create_and_poll(
        thread_id=thread_id,
        assistant_id=st.session_state.assistant_id
    )
    logger.debug("run: %s", run)
    if run.status != "completed":
        raise RuntimeError(f"Assistant run ended with status '{run.status}'")

    messages = list(client.beta.threads.messages.list(thread_id=thread_id, run_id=run.id))
    if not messages or not messages[0].content:
        raise RuntimeError("Assistant returned no reply")

    message_content = messages[0].content[0].text
    citations = []
    for index, annotation in enumerate(message_content.annotations):
        message_content.value = message_content.value.replace(annotation.text, f"[{index}]")
        if file_citation := getattr(annotation, "file_citation", None):
            cited_file = client.files.retrieve(file_citation.file_id)
            citations.append(f"[{index}] {cited_file.filename}")

    return message_content.value, citations


# ---------------------- Streamlit App ----------------------
page = st.sidebar.selectbox("Choose a page", ["Documents", "Home", "Admin"])

if page == "Home":
    st.title("Building Regulations Chatbot", anchor=False)

    # Sidebar: list the documents chosen on the Documents page
    with st.sidebar:
        colored_header("Selected Documents", description="Documents for chat")
        if 'selected_pdfs' in st.session_state and not st.session_state.selected_pdfs.empty:
            for _, pdf in st.session_state.selected_pdfs.iterrows():
                st.write(f"- {pdf['Doc Title']}")
        else:
            st.write("No documents selected. Please go to the Documents page.")

    # Main chat area
    colored_header("Chat", description="Ask questions about building regulations")

    # Render the history with Streamlit's chat elements so the message text is
    # actually displayed, without injecting user/model text as raw HTML.
    for chat in st.session_state.chat_history:
        with st.chat_message(chat['role']):
            st.markdown(chat['content'])

    # Chat input
    with st.form("chat_form", clear_on_submit=True):
        user_input = st.text_area("Ask a question about building regulations...", height=100)
        col1, col2 = st.columns([3, 1])
        with col2:
            submit = st.form_submit_button("Send", use_container_width=True)

    if submit and user_input.strip():
        if not st.session_state.file_ids:
            # No rerun here: a rerun would clear this message before it is seen.
            st.error("Please process PDFs first.")
        else:
            # Add user message to chat history
            st.session_state.chat_history.append({"role": "user", "content": user_input})
            reply = None
            with st.spinner("Generating response..."):
                try:
                    response, citations = chat_with_assistant(st.session_state.file_ids, user_input)
                except Exception as e:
                    st.error(f"Error generating response: {str(e)}")
                else:
                    reply = response + "\n\n" + "\n".join(citations)

            if reply is not None:
                # Add assistant response to chat history
                st.session_state.chat_history.append({"role": "assistant", "content": reply})
                # Rerun the app to update the chat display (only on success,
                # so an error above stays visible).
                st.rerun()

    # Footer
    add_vertical_space(2)
    st.markdown("---")
    col1, col2 = st.columns(2)
    with col1:
        st.caption("Powered by OpenAI GPT-4 and Pinecone")
    with col2:
        st.caption("© 2023 Your Company Name")

elif page == "Documents":
    st.title("Document Selection")

    city_code_input = st.text_input("Enter city code:", key="city_code_input")
    load_documents_button = st.button("Load Documents", key="load_documents_button")

    if load_documents_button and city_code_input:
        with st.spinner("Fetching PDFs..."):
            pdfs = fetch_pdfs(city_code_input)
            if pdfs:
                st.session_state.available_pdfs = pdfs
                st.success(f"Found {len(pdfs)} PDFs")
            else:
                st.error("No PDFs found")

    if 'available_pdfs' in st.session_state:
        st.write(f"Total PDFs: {len(st.session_state.available_pdfs)}")

        # Create a DataFrame from the available PDFs
        df = pd.DataFrame(st.session_state.available_pdfs)

        # Select and rename only the specified columns
        df = df[['municipality', 'abbreviation', 'doc_title', 'file_title', 'file_href', 'enactment_date', 'prio']]
        df = df.rename(columns={
            "municipality": "Municipality",
            "abbreviation": "Abbreviation",
            "doc_title": "Doc Title",
            "file_title": "File Title",
            "file_href": "File Href",
            "enactment_date": "Enactment Date",
            "prio": "Prio"
        })

        # Add a checkbox column to the DataFrame at the beginning
        df.insert(0, "Select", False)

        # Configure grid options
        gb = GridOptionsBuilder.from_dataframe(df)
        gb.configure_default_column(enablePivot=True, enableValue=True, enableRowGroup=True)
        gb.configure_column("Select", header_name="Select", cellRenderer='checkboxRenderer')
        gb.configure_column("File Href", cellRenderer='linkRenderer')
        gb.configure_selection(selection_mode="multiple", use_checkbox=True)
        gb.configure_side_bar()
        gridOptions = gb.build()

        # Display the AgGrid
        grid_response = AgGrid(
            df,
            gridOptions=gridOptions,
            enable_enterprise_modules=True,
            update_mode=GridUpdateMode.MODEL_CHANGED,
            data_return_mode=DataReturnMode.FILTERED_AND_SORTED,
            fit_columns_on_grid_load=False,
        )

        # Get the selected rows
        selected_rows = grid_response['selected_rows']

        # Debug: Print the structure of selected_rows
        st.write("Debug - Selected Rows Structure:", selected_rows)

        if st.button("Process Selected PDFs"):
            # NOTE(review): selected_rows appears to be None in some
            # st_aggrid versions when nothing is selected — hence the guard.
            if selected_rows is not None and len(selected_rows) > 0:
                # Convert selected_rows to a DataFrame
                st.session_state.selected_pdfs = pd.DataFrame(selected_rows)

                # Reuse this session's assistant instead of creating a new
                # one on every click.
                if st.session_state.assistant_id is None:
                    create_assistant()

                with st.spinner("Processing PDFs and creating/updating assistant..."):
                    file_ids = []

                    for _, pdf in st.session_state.selected_pdfs.iterrows():
                        # Debug: Print each pdf item
                        st.write("Debug - PDF item:", pdf)

                        file_href = pdf['File Href']
                        doc_title = pdf['Doc Title']

                        file_name = download_pdf(file_href)
                        if not file_name:
                            st.warning(f"Failed to download {doc_title}. Skipping this file.")
                            continue

                        file_id = upload_file_to_openai(f"./{file_name}")
                        if file_id:
                            file_ids.append(file_id)
                        else:
                            st.warning(f"Failed to upload {doc_title}. Skipping this file.")

                    st.session_state.file_ids = file_ids

                if file_ids:
                    st.success("PDFs processed successfully. You can now chat on the Home page.")
                else:
                    st.error("None of the selected PDFs could be processed.")
            else:
                st.warning("Select at least one PDF.")

    # NOTE(review): switch_page targets pages of a multipage app, while this
    # file picks its page via the sidebar selectbox — confirm a "Home" page is
    # registered, otherwise this call cannot succeed.
    if st.button("Go to Home"):
        switch_page("Home")

elif page == "Admin":
    st.title("Admin Panel")
    st.header("Vector Stores Information")

    vector_stores = get_vector_stores()
    if isinstance(vector_stores, str):
        # get_vector_stores reports failures as a message string.
        st.error(vector_stores)
    else:
        json_vector_stores = json.dumps([vs.model_dump() for vs in vector_stores])
        st.write(json_vector_stores)

    # Add a button to go back to the main page
    if st.button("Back to Home"):
        switch_page("Home")