import torch
from transformers import TextIteratorStreamer, AutoProcessor, LlavaForConditionalGeneration
from diffusers import DiffusionPipeline
import gradio as gr
import accelerate
import spaces
from PIL import Image
import threading
from openai import OpenAI
import os
import asyncio
from typing import Any

API_KEY = os.getenv('OPEN_AI_API_KEYS')

DESCRIPTION = '''
<div>
<h1 style="text-align: center;">Chimera 🐪</h1>
<p>Chimera pairs the Stable Diffusion XL image generator from <a href="https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0"><b>stabilityai/stable-diffusion-xl-base-1.0</b></a> with the multimodal chat model <a href="https://huggingface.co/xtuner/llava-llama-3-8b-v1_1-transformers"><b>xtuner/llava-llama-3-8b-v1_1-transformers</b></a>.</p>
</div>
'''

# LLaVA: llava-llama-3-8b multimodal model for image + text chat
llava_model = LlavaForConditionalGeneration.from_pretrained(
    "xtuner/llava-llama-3-8b-v1_1-transformers",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)
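# low_cpu_mem_usage relies on the accelerate package imported above.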

llava_model.to("cuda:0")
processor = AutoProcessor.from_pretrained("xtuner/llava-llama-3-8b-v1_1-transformers")
# 128009 is the Llama-3 <|eot_id|> token; set it so generation stops at the end of the assistant turn.
llava_model.generation_config.eos_token_id = 128009

# Stable Diffusion XL: base pipeline (refiner loaded below)
base = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16,
    variant="fp16",
    use_safetensors=True,
)
base.to('cuda')

refiner = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-refiner-1.0",  # the refiner checkpoint, not a second copy of the base
    text_encoder_2=base.text_encoder_2,
    vae=base.vae,
    torch_dtype=torch.float16,
    use_safetensors=True,
    variant="fp16",
)
refiner.to('cuda')
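
# LLaVA, the SDXL base, and the refiner all stay resident in fp16 on one GPU;
# sharing text_encoder_2 and the VAE between base and refiner saves memory.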

def multimodal_and_generation(message, history):
    """Route text-only prompts to OpenAI and prompts with an image to LLaVA.

    Both branches return a plain synchronous iterator of text chunks.
    """
    # The plain-textbox UI passes a string; the multimodal textbox passes a dict.
    if isinstance(message, str):
        message = {"text": message, "files": []}
    image_path = None
    if message["files"]:
        if isinstance(message["files"][-1], dict):
            image_path = message["files"][-1]["path"]
        else:
            image_path = message["files"][-1]
    else:
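        # Otherwise fall back to the most recent image in the conversation history.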
        for hist in history:
            if isinstance(hist[0], tuple):
                image_path = hist[0][0]

    if image_path is None:
        input_prompt = message["text"]
        client = OpenAI(api_key=API_KEY)
        stream = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant called 'chimera'."},
                {"role": "user", "content": input_prompt}
            ],
            stream=True,
        )
        return stream
    else:
        prompt = f"user\n\n<image>\n{message['text']}assistant\n\n"
        image = Image.open(image_path)
        inputs = processor(prompt, image, return_tensors='pt').to(0, torch.float16)
        streamer = TextIteratorStreamer(processor.tokenizer, **{"skip_special_tokens": False, "skip_prompt": True})
        generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024, do_sample=False)

        thread = threading.Thread(target=llava_model.generate, kwargs=generation_kwargs)
        thread.start()

        return streamer

def diffusing(prompt):
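    # SDXL ensemble of experts: the base model runs the first 80% of the noise
    # schedule and emits latents, which the refiner denoises for the final 20%.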
    image = base(
        prompt=prompt,
        num_inference_steps=40,
        denoising_end=0.8,
        output_type="latent",
    ).images
    image = refiner(
        prompt=prompt,
        num_inference_steps=40,
        denoising_start=0.8,
        image=image
    ).images[0]
    return image
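
# Example usage (hypothetical prompt): diffusing("a watercolor fox") returns a PIL image.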

def check_cuda_availability():
    if torch.cuda.is_available():
        result = f"GPU: {torch.cuda.get_device_name(0)}"
        return result
    else:
        return "No CUDA device found."

mode = ""

# @spaces.GPU(duration=120)
# async def bot_comms(message, history):
#     global mode

#     if message == "check cuda":
#         result = check_cuda_availability()
#         yield result
#         return

#     if message == "imagery":
#         mode = message
#         yield "Imagery On! Type your prompt to make the image πŸ–ΌοΈ"
#         return

#     if message == "chatting":
#         mode = message
#         yield "Imagery Off. Ask me any questions. β˜„οΈ"
#         return

#     if mode == "imagery":
#         print("On imagery\n\n")
#         image = diffusing(
#             prompt=message,
#         )
#         yield image
#         return

#     if mode == "chatting" or mode == "":
#         print("On chatting or no mode.\n\n")
#         stream = multimodal_and_generation(
#             message=message,
#             history=history,
#         )
#         gpt_outputs = []
#         async for chunk in stream:
#             if chunk.choices[0].delta.content is not None:
#                 text = chunk.choices[0].delta.content
#                 gpt_outputs.append(text)
#                 yield "".join(gpt_outputs)

@spaces.GPU(duration=120)
async def bot_comms_async(message, history):
    global mode

    if message == "check cuda":
        result = check_cuda_availability()
        return [result]

    if message == "imagery":
        mode = message
        return ["Imagery On! Type your prompt to make the image πŸ–ΌοΈ"]

    if message == "chatting":
        mode = message
        return ["Imagery Off. Ask me any questions. β˜„οΈ"]

    if mode == "imagery":
        print("On imagery\n\n")
        image = diffusing(prompt=message)
        return [image]

    if mode == "chatting" or mode == "":
        print("On chatting or no mode.\n\n")
        stream = multimodal_and_generation(message=message, history=history)
        outputs = []
        # Both return values are plain synchronous iterators: the OpenAI stream
        # yields chunk objects, the LLaVA TextIteratorStreamer yields strings.
        for chunk in stream:
            if isinstance(chunk, str):
                outputs.append(chunk)
            elif chunk.choices[0].delta.content is not None:
                outputs.append(chunk.choices[0].delta.content)
        return ["".join(outputs)]

def bot_comms(message: str, history: Any):
    return asyncio.run(bot_comms_async(message, history))
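
# asyncio.run creates a fresh event loop per call, which is fine from Gradio's
# worker threads as long as no other loop is already running in that thread.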

# Gradio UI

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    with gr.Row():
        submit = gr.Button("Submit")

    def user(message, history):
        # Clear the textbox and append the user turn with an empty bot slot.
        return "", history + [[message, None]]

    def bot_response(history):
        # The textbox was cleared by user(), so read the message back out of history.
        message = history[-1][0]
        history[-1][1] = bot_comms(message, history)[0]
        return history

    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot_response, chatbot, chatbot
    )
    submit.click(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot_response, chatbot, chatbot
    )

if __name__ == "__main__":
    demo.launch(share=True)

# chatbot = gr.Chatbot(height=600, label="Chimera AI")
# chat_input = gr.MultimodalTextbox(interactive=True, file_types=["images"], placeholder="Enter your question or upload an image.", show_label=False)
# with gr.Blocks(fill_height=True) as demo:
#     gr.Markdown(DESCRIPTION)
#     gr.ChatInterface(
#         fn=bot_comms,
#         chatbot=chatbot,
#         fill_height=True,
#         multimodal=True,
#         textbox=chat_input,
#     )

# if __name__ == "__main__":
#     demo.launch()