Create app.py
app.py
ADDED
@@ -0,0 +1,83 @@
import gradio as gr
import requests
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq
from io import BytesIO
import replicate
from llama_index.llms.palm import PaLM
from llama_index import ServiceContext, VectorStoreIndex, Document
from llama_index.memory import ChatMemoryBuffer
import os

# Get a brief image caption from Kosmos-2 running on Replicate
def get_image_caption(image_data):
    input_data = {
        "image": image_data,
        "description_type": "Brief"
    }
    output = replicate.run(
        "lucataco/kosmos-2:3e7b211c29c092f4bcc8853922cc986baa52efe255876b80cac2c2fbb4aff805",
        input=input_data
    )
    # Split the output on the blank line and keep only the leading description
    text_description = output.split('\n\n')[0]
    return text_description

# Build a LlamaIndex chat engine grounded in the image description
def create_chat_engine(img_desc, api_key):
    llm = PaLM(api_key=api_key)
    service_context = ServiceContext.from_defaults(llm=llm, embed_model="local")
    doc = Document(text=img_desc)
    index = VectorStoreIndex.from_documents([doc], service_context=service_context)
    chatmemory = ChatMemoryBuffer.from_defaults(token_limit=1500)

    chat_engine = index.as_chat_engine(
        chat_mode="context",
        system_prompt=(
            "You are a chatbot, able to have normal interactions, as well as talk. "
            "You always answer in great detail and are polite. Your responses are always descriptive. "
            f"Your job is to talk about an image the user has uploaded. Image description: {img_desc}."
        ),
        verbose=True,
        memory=chatmemory
    )
    return chat_engine

# Handle image upload and chat interaction
def process_image_and_chat(image_file, user_input):
    if image_file is None:
        return "Please upload an image."

    image_data = BytesIO(image_file.read())
    img_desc = get_image_caption(image_data)
    chat_engine = create_chat_engine(img_desc, os.environ["GOOGLE_API_KEY"])

    if user_input:
        try:
            response = chat_engine.chat(user_input)
            return str(response)
        except Exception as e:
            return f'An error occurred: {str(e)}'
    else:
        return "Ask me anything about the uploaded image."

# Define the Gradio interface
image_input = gr.inputs.Image(type="file")
text_input = gr.inputs.Textbox(label="Ask me about the image:")
output_text = gr.outputs.Textbox(label="Response")

iface = gr.Interface(
    fn=process_image_and_chat,
    inputs=[image_input, text_input],
    outputs=output_text,
    title="My version of ChatGPT vision",
    description="You can upload an image and start chatting with the LLM about the image",
    allow_flagging="never"
)

# Set Replicate and Google API keys
os.environ['REPLICATE_API_TOKEN'] = 'your_replicate_api_token'  # Replace with your actual key
os.environ["GOOGLE_API_KEY"] = 'your_google_api_key'  # Replace with your actual key

# Launch the app
iface.launch()
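Side note (not part of the committed app.py): the Kosmos-2 captioning step can be exercised on its own before wiring up the Gradio interface. A minimal sketch, assuming the replicate client is installed, a real REPLICATE_API_TOKEN is available, and a local image named example.jpg exists; the model id is copied from the file above:

# Standalone smoke test for the captioning call used in app.py.
# Assumptions: replicate is installed, the token below is replaced with a real one,
# and example.jpg is a local image file.
import os
import replicate

os.environ.setdefault("REPLICATE_API_TOKEN", "your_replicate_api_token")  # replace before running

with open("example.jpg", "rb") as image_file:
    output = replicate.run(
        "lucataco/kosmos-2:3e7b211c29c092f4bcc8853922cc986baa52efe255876b80cac2c2fbb4aff805",
        input={"image": image_file, "description_type": "Brief"},
    )

# The brief description comes first, separated from the grounding data by a blank line,
# which is why app.py splits on '\n\n' and keeps the first part.
print(output.split("\n\n")[0])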