"""Vision Bot: a Streamlit chat UI backed by Google Gemini via LangChain.

The user may upload an image in the sidebar and then ask questions in the
chat box. Each prompt is sent to the model together with the currently
uploaded image (if any), and the exchange is appended to the chat history
kept in ``st.session_state``.
"""
import os
import base64
from io import BytesIO
from datetime import datetime

import streamlit as st
from PIL import Image
from dotenv import load_dotenv
from langchain.memory import ConversationSummaryBufferMemory
from langchain_core.messages import HumanMessage
from langchain_google_genai import ChatGoogleGenerativeAI

load_dotenv()
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

# Define title and layout (must be the first Streamlit call in the script).
st.set_page_config(page_title="Vision Bot", layout="wide")

# SECURITY: the API key must come from the environment (or the .env file read
# by load_dotenv above), never from source control. A key was previously
# hard-coded at this spot; treat it as compromised and rotate it.
if not os.getenv("GOOGLE_API_KEY"):
    st.error(
        "GOOGLE_API_KEY is not set. Add it to your environment or to a .env file."
    )
    st.stop()

st.title("Vision Bot")

IMAGE_SAVE_FOLDER = "./uploaded_images"
# exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
os.makedirs(IMAGE_SAVE_FOLDER, exist_ok=True)

st.markdown(
    """ """,
    unsafe_allow_html=True,
)

# Initialize session states
if "messages" not in st.session_state:
    st.session_state.messages = []
if "llm" not in st.session_state:
    # Build the model client once per session instead of on every rerun.
    st.session_state.llm = ChatGoogleGenerativeAI(
        model="gemini-1.5-flash",
        max_tokens=4000,
    )
if "rag_memory" not in st.session_state:
    st.session_state.rag_memory = ConversationSummaryBufferMemory(
        llm=st.session_state.llm, max_token_limit=5000
    )
if "current_image" not in st.session_state:
    st.session_state.current_image = None
if "last_displayed_image" not in st.session_state:
    st.session_state.last_displayed_image = None

llm = st.session_state.llm


def _save_upload(uploaded, raw_bytes):
    """Write the uploaded bytes into IMAGE_SAVE_FOLDER and return the path.

    The file is named ``<YYYYMMDD>_<original name>``. Only the base name of
    the client-supplied filename is used, so path separators in it cannot
    direct the write outside IMAGE_SAVE_FOLDER.
    """
    safe_name = os.path.basename(uploaded.name.replace("\\", "/"))
    current_date = datetime.now().strftime("%Y%m%d")
    image_path = os.path.join(IMAGE_SAVE_FOLDER, f"{current_date}_{safe_name}")
    with open(image_path, "wb") as image_file:
        image_file.write(raw_bytes)
    return image_path


def _build_message(prompt, uploaded):
    """Return the HumanMessage for *prompt*, attaching *uploaded* if present.

    With an image, the message carries a text part plus a base64 data URL
    labelled with the upload's own MIME type (falling back to image/jpeg when
    the upload reports none). Without an image, it is a plain text message.
    """
    if uploaded is None:
        return HumanMessage(content=prompt)

    # Use the bytes exactly as uploaded: no lossy re-encode, no disk read-back.
    raw_bytes = uploaded.getvalue()
    _save_upload(uploaded, raw_bytes)
    encoded_string = base64.b64encode(raw_bytes).decode()
    mime_type = uploaded.type or "image/jpeg"
    return HumanMessage(
        content=[
            {"type": "text", "text": prompt},
            {
                "type": "image_url",
                "image_url": {"url": f"data:{mime_type};base64,{encoded_string}"},
            },
        ]
    )


container = st.container()

# NOTE(review): the original indentation was lost; the uploader and the
# new-image preview are assumed to belong to the sidebar -- confirm.
with st.sidebar:
    st.markdown(
        """

Vision Bot

This is Vision Bot where you can ask any question regarding any image. It can perform various tasks such as:

Image Captioning
Answering text-related queries inside the image
OCR (Optical Character Recognition)
Image Analysis & Description

""",
        unsafe_allow_html=True,
    )

    # Upload image
    uploaded_image = st.file_uploader(
        "Upload an image",
        type=["jpg", "jpeg", "png", "webp"],
        key="image_uploader",
    )

    # Check if a new image is uploaded
    if uploaded_image and uploaded_image != st.session_state.current_image:
        st.session_state.current_image = uploaded_image
        # Show a fixed-width preview of the new upload
        st.image(uploaded_image, caption="Newly Uploaded Image", width=300)
        # Add a system message to mark the new image in the conversation
        st.session_state.messages.append({
            "role": "system",
            "content": f"New image uploaded: {uploaded_image.name}",
            "image": uploaded_image,
        })

# Display messages
for message in st.session_state.messages:
    with container.chat_message(message["role"]):
        if message["role"] == "system" and "image" in message:
            # Display image in chat history with fixed size
            st.image(message["image"], width=300)
        st.write(message["content"])

# Take prompt
if prompt := st.chat_input("Enter your query here..."):
    with container.chat_message("user"):
        st.write(prompt)
    # Save user input in session state
    st.session_state.messages.append({"role": "user", "content": prompt})

    # Send image and text to the model, or only text if no image is uploaded
    chat = _build_message(prompt, st.session_state.current_image)

    # Get AI response
    ai_msg = llm.invoke([chat]).content
    with container.chat_message("assistant"):
        st.write(ai_msg)

    # Save the conversation context in memory. The memory is only written
    # here; nothing in this script reads it back into the prompt.
    st.session_state.rag_memory.save_context({'input': prompt}, {'output': ai_msg})
    # Append the assistant's message to the session state
    st.session_state.messages.append({"role": "assistant", "content": ai_msg})