Added bot_comms and reverted chimera, which is now multimodal_and_generation()
app.py CHANGED
@@ -7,6 +7,10 @@ import accelerate
 import spaces
 from PIL import Image
 import threading
+from openai import OpenAI
+import os
+
+API_KEY = os.getenv('OPEN_AI_API_KEYS')
 
 DESCRIPTION = '''
 <div>
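The new imports wire an OpenAI client into the Space, with the key read from the OPEN_AI_API_KEYS secret. A minimal sketch of that setup with a fail-fast guard (the guard is illustrative, not part of the commit):

import os
from openai import OpenAI

API_KEY = os.getenv('OPEN_AI_API_KEYS')
if API_KEY is None:
    # os.getenv returns None when the secret is unset; without this check the
    # client only fails at the first request with an authentication error.
    raise RuntimeError("Set the OPEN_AI_API_KEYS environment variable.")
client = OpenAI(api_key=API_KEY)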
@@ -47,8 +51,7 @@ refiner = DiffusionPipeline.from_pretrained(
 refiner.to('cuda')
 
 # All Installed. Let's instance them in the function
-
-def chimera(message, history):
+def multimodal_and_generation(message, history):
     """
     Receives input from gradio from the prompt but also
     if any images were passed that i also placed for formatting
@@ -70,22 +73,17 @@ def chimera(message, history):
     prompt = f"<|start_header_id|>user<|end_header_id|>\n\n<image>\n{message['text']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
 
     if image_path is None:
-        image = base(
-            prompt=prompt,
-            num_inference_steps=40,
-            denoising_end=0.8,
-            output_type="latent",
-        ).images
-        image = refiner(
-            prompt=prompt,
-            num_inference_steps=40,
-            denoising_start=0.8,
-            image=image
-        ).images[0]
-        return image
+        # GPT Generation
+        client = OpenAI(api_key=API_KEY)
+        stream = client.chat.completions.create(
+            model="gpt-3.5-turbo",
+            messages=[{"role": "system", "content": "You are a helpful assistant called 'chimera'."},
+                      {"role": "user", "content": message}],
+            stream=True,
+        )
+        return stream
 
     else:
-
         # Time to instance the llava
         image = Image.open(image_path)
         inputs = processor(prompt, image, return_tensors='pt').to(0, torch.float16)
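When no image is attached, multimodal_and_generation now returns the raw OpenAI stream instead of yielding text. With the openai>=1.0 client, each item in that stream is a ChatCompletionChunk object, not a string, so a consumer has to unpack the delta; a minimal sketch (stream_to_text is a hypothetical helper, not in the commit):

def stream_to_text(stream):
    # Each chunk carries the next token(s) in choices[0].delta.content,
    # which can be None (e.g. on the final chunk), so guard before appending.
    buffer = ""
    for chunk in stream:
        delta = chunk.choices[0].delta.content
        if delta:
            buffer += delta
            yield buffer  # running text, so gradio can render it incrementally

Note that the bot_comms loop added below does buffer += text on the chunks directly, which only works if the stream yields plain strings; a helper like this is what that loop would need for the GPT path.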
@@ -95,12 +93,85 @@ def chimera(message, history):
     thread = threading.Thread(target=llava_model.generate, kwargs=generation_kwargs)
     thread.start()
 
-    buffer = ""
-    for new_text in streamer:
-        # find <|eot_id|> and remove it from the new_text
-        if "<|eot_id|>" in new_text:
-            new_text = new_text.split("<|eot_id|>")[0]
-        buffer += new_text
+    # buffer = ""
+    # for new_text in streamer:
+    #     # find <|eot_id|> and remove it from the new_text
+    #     if "<|eot_id|>" in new_text:
+    #         new_text = new_text.split("<|eot_id|>")[0]
+    #     buffer += new_text
+    #     generated_text_no_prompt = buffer
+    #     yield generated_text_no_prompt
+    return streamer
+
+def diffusing(prompt):
+    """
+    Uses stable diffusion on the prompt and
+    returns the image.
+    """
+    image = base(
+        prompt=prompt,
+        num_inference_steps=40,
+        denoising_end=0.8,
+        output_type="latent",
+    ).images
+    image = refiner(
+        prompt=prompt,
+        num_inference_steps=40,
+        denoising_start=0.8,
+        image=image
+    ).images[0]
+    return image
+
+def check_cuda_availability():
+    if torch.cuda.is_available():
+        result = f"GPU: {torch.cuda.get_device_name(0)}"
+        return result
+    else:
+        return "No CUDA device found."
+
+mode = ""
+
+@spaces.GPU(duration=120)
+def bot_comms(message,
+              history):
+    """
+    Communication between gradio and the models.
+    """
+    global mode
+
+    if message == "check cuda":
+        result = check_cuda_availability()
+        yield result
+        return
+
+    if message == "imagery":
+        mode = message
+        yield "Imagery On! Type your prompt to make the image 🖼️"
+        return
+
+    if message == "chatting":
+        mode = message
+        yield "Imagery Off. Ask me any questions. ✍️"
+        return
+
+    if mode == "imagery":
+        image = diffusing(
+            message=message,
+            history=history,
+        )
+        return image
+
+    buffer = ""
+    if mode == "chatting" or mode == "":
+        stream = multimodal_and_generation(
+            message=message,
+            history=history,
+        )
+        for text in stream:
+            # find <|eot_id|> and remove it from the text
+            if "<|eot_id|>" in text:
+                text = text.split("<|eot_id|>")[0]
+            buffer += text
     generated_text_no_prompt = buffer
     yield generated_text_no_prompt
 
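As committed, the imagery branch passes message and history to diffusing(), which only accepts prompt, and then uses a plain return inside a generator, so the image never reaches the chat. A corrected sketch of that branch, assuming the multimodal payload shape {"text": ..., "files": [...]} and that the installed gradio version renders a gr.Image yielded from a ChatInterface callback:

def imagery_reply(message):
    # Corrected imagery branch: diffusing() takes a single text prompt, so
    # extract the text field from the multimodal dict (assumed shape).
    image = diffusing(prompt=message["text"])
    # Yield rather than return: bot_comms is a generator, and a value
    # returned from inside it never surfaces to gradio.
    yield gr.Image(image)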
@@ -110,7 +181,7 @@ chat_input = gr.MultimodalTextbox(interactive=True, file_types=["images"], place
 with gr.Blocks(fill_height=True) as demo:
     gr.Markdown(DESCRIPTION)
     gr.ChatInterface(
-        fn=chimera,
+        fn=bot_comms,
         chatbot=chatbot,
         fill_height=True,
         multimodal=True,
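With multimodal=True, gr.ChatInterface passes fn a dict from the MultimodalTextbox rather than a plain string, so the equality checks in bot_comms ("check cuda", "imagery", "chatting") compare a dict against a string and never match. A small normalizing helper (hypothetical, assuming the standard {"text": ..., "files": [...]} payload):

def normalize_message(message):
    # gr.MultimodalTextbox submits {"text": str, "files": [paths]};
    # command matching should run on the extracted text field only.
    if isinstance(message, dict):
        return message.get("text", ""), message.get("files", [])
    return str(message), []

text, files = normalize_message({"text": "check cuda", "files": []})
assert text == "check cuda" and files == []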