import os 
import base64
from io import BytesIO
from PIL import Image
import streamlit as st
from langchain.memory import ConversationSummaryBufferMemory
from langchain_google_genai import ChatGoogleGenerativeAI
from datetime import datetime
from langchain_core.messages import HumanMessage
from dotenv import load_dotenv
load_dotenv()

os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
# Define title and layout
st.set_page_config(page_title="Vision Bot", layout="wide")
# The Gemini API key is read from the environment (a GOOGLE_API_KEY entry in .env,
# loaded by load_dotenv above) rather than being hard-coded in source.
if not os.getenv("GOOGLE_API_KEY"):
    st.error("GOOGLE_API_KEY is not set. Add it to your environment or .env file.")
    st.stop()
st.title("Vision Bot")


llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    max_tokens=4000
)
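# Note: gemini-1.5-flash is a multimodal Gemini model, so this one client can handle
# both the text-only and the image+text prompts built further below.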

IMAGE_SAVE_FOLDER = "./uploaded_images"
os.makedirs(IMAGE_SAVE_FOLDER, exist_ok=True)

st.markdown(
    """
<style>
    .sidebar-content {
        background-color: #f1f3f6;
        padding: 20px;
        border-radius: 10px;
        text-align: left;
        box-shadow: 0px 0px 10px rgba(0,0,0,0.1);
    }
    .st-emotion-cache-janbn0 {
        flex-direction: row-reverse;
        text-align: right;
    }
    .uploaded-image {
        border: 2px solid #D1D1D1;
        border-radius: 8px;
        margin-top: 10px;
    }
</style>
""",
    unsafe_allow_html=True,
)
# Initialize session states
if "messages" not in st.session_state:
    st.session_state.messages = []
if "llm" not in st.session_state:
    st.session_state.llm = llm
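# ConversationSummaryBufferMemory keeps recent turns verbatim and summarizes older ones
# once max_token_limit is exceeded; it is filled via save_context() after each answer.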
if "rag_memory" not in st.session_state:
    st.session_state.rag_memory = ConversationSummaryBufferMemory(llm=st.session_state.llm, max_token_limit=5000)
if "current_image" not in st.session_state:
    st.session_state.current_image = None
if "last_displayed_image" not in st.session_state:
    st.session_state.last_displayed_image = None

container = st.container()
with st.sidebar:
    st.markdown(
        """
        <div class="sidebar-content">
        <h2>Vision Bot</h2>
        <p>This is Vision Bot where you can ask any question regarding any image. It can perform various tasks such as:</p>
        <ul>
            <li><b>Image Captioning</b></li>
            <li><b>Answering text-related queries inside the image</b></li>
            <li><b>OCR (Optical Character Recognition)</b></li>
            <li><b>Image Analysis & Description</b></li>
        </ul>
        </div>
        """,
        unsafe_allow_html=True,
    )
# Upload image
uploaded_image = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png","webp"], key="image_uploader")

# Check if a new image is uploaded
if uploaded_image and uploaded_image != st.session_state.current_image:
    st.session_state.current_image = uploaded_image
    
    # Show the newly uploaded image at a reduced, fixed width
    st.image(uploaded_image, caption="Newly Uploaded Image", width=300)
    
    # Add a system message to mark the new image in the conversation
    st.session_state.messages.append({
        "role": "system", 
        "content": f"New image uploaded: {uploaded_image.name}",
        "image": uploaded_image
    })

# Display messages
for message in st.session_state.messages:
    with container.chat_message(message["role"]):
        if message["role"] == "system" and "image" in message:
            # Re-display the image in chat history at the same fixed width
            st.image(message["image"], width=300)
        st.write(message["content"])

# Take prompt
if prompt := st.chat_input("Enter your query here..."):
    with container.chat_message("user"):
        st.write(prompt)

    # Save user input in session state
    st.session_state.messages.append({"role": "user", "content": prompt})

    if st.session_state.current_image:
        # Save uploaded image to disk
        image = Image.open(st.session_state.current_image)
        current_date = datetime.now().strftime("%Y%m%d")
        image_name = f"{current_date}_{st.session_state.current_image.name}"
        image_path = os.path.join(IMAGE_SAVE_FOLDER, image_name)
        image.save(image_path)

        # Encode the image as base64; take the MIME type from the upload instead of
        # assuming JPEG, since PNG and WebP uploads are also accepted
        mime_type = st.session_state.current_image.type or "image/jpeg"
        with open(image_path, "rb") as image_file:
            encoded_string = base64.b64encode(image_file.read()).decode()

        # Send image and text to the model
        chat = HumanMessage(
            content=[
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded_string}"}},
            ]
        )
    else:
        # Send only text to the model if no image is uploaded
        chat = HumanMessage(content=prompt)

    # Get AI response
    ai_msg = st.session_state.llm.invoke([chat]).content
    with container.chat_message("assistant"): 
        st.write(ai_msg)

    # Save the conversation context in memory
    st.session_state.rag_memory.save_context({'input': prompt}, {'output': ai_msg})
    
    # Append the assistant's message to the session state
    st.session_state.messages.append({"role": "assistant", "content": ai_msg})
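
# ---------------------------------------------------------------------------
# Possible extension (sketch only, not wired into the flow above): the summary
# stored in rag_memory is written after each turn but never read back, so every
# question is answered without prior conversational context. One way to include
# it would be to prepend the summarized history to the text part of the prompt
# before invoking the model, e.g.:
#
#   history = st.session_state.rag_memory.load_memory_variables({})["history"]
#   contextual_prompt = f"Conversation so far:\n{history}\n\nUser question: {prompt}"
#
# and then use `contextual_prompt` in place of `prompt` when building the
# HumanMessage. The exact key name ("history") and formatting depend on the
# LangChain version in use, so treat this as an illustrative sketch.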