Spaces:
Running
on
Zero
Running
on
Zero
File size: 4,188 Bytes
fda8dae 5781b89 07d11bb 5781b89 1322687 5781b89 6d0cb8a 12e7969 5781b89 60e7a28 1322687 bac7d5d 5781b89 5c4fa84 1250026 b3a8054 1250026 5781b89 fda8dae 5781b89 b2d3b41 5781b89 6b26249 5781b89 07d11bb 5781b89 25d4410 1250026 5781b89 1322687 5781b89 07d11bb 5781b89 07d11bb 5781b89 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 |
import spaces
import torch
import re
import gradio as gr
from threading import Thread
from transformers import TextIteratorStreamer, AutoTokenizer, AutoModelForCausalLM
from PIL import ImageDraw
from torchvision.transforms.v2 import Resize
import subprocess
import os

# Best-effort install of flash-attn at startup. The extra variable is merged
# into the inherited environment instead of replacing it: passing a bare dict
# as `env` would strip PATH, HOME, proxy settings, etc. from the pip process.
# NOTE(review): FLASH_ATTENTION_SKIP_CUDA_BUILD presumably tells the flash-attn
# build to skip compiling CUDA kernels -- confirm against the flash-attn docs.
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
    check=False,  # a failed install is not fatal here, matching prior behaviour
)

# The same model id and revision are used for both the tokenizer and the model.
model_id = "vikhyatk/moondream2"
revision = "2024-05-20"
tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
moondream = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    revision=revision,
    torch_dtype=torch.bfloat16,
    device_map={"": "cuda"},
    attn_implementation="flash_attention_2",
)
moondream.eval()

# Load the control vectors on CPU first, then move each one to the GPU in
# bfloat16 so it matches the dtype the model was loaded with.
# NOTE(review): torch.load unpickles the file; only ship a control_vectors.pt
# you trust, and consider weights_only=True if the file is a plain list of
# tensors -- confirm before changing.
control_vectors = torch.load("control_vectors.pt", map_location="cpu")
control_vectors = [t.to("cuda", dtype=torch.bfloat16) for t in control_vectors]
class LayerWrapper(torch.nn.Module):
    """Wrap a layer and shift its first output by a scaled control vector.

    The wrapped layer is called with whatever arguments the wrapper receives.
    Its result is indexed like a sequence: element 0 has
    ``scale * control_vectors`` added to it, and every remaining element is
    passed through unchanged.

    Args:
        og_layer: The original layer (any callable) to delegate to.
        control_vectors: Value added (after scaling) to the layer's first
            output; it must be broadcastable against that output.
        scale: Multiplier applied to ``control_vectors``. Defaults to 4.2.
    """

    def __init__(self, og_layer, control_vectors, scale=4.2):
        super().__init__()
        self.scale = scale
        self.control_vectors = control_vectors
        self.og_layer = og_layer

    def forward(self, *args, **kwargs):
        """Run the wrapped layer and return its outputs with element 0 steered."""
        outputs = self.og_layer(*args, **kwargs)
        steered = outputs[0] + self.scale * self.control_vectors
        # Rebuild a tuple so the remaining outputs keep their positions.
        return (steered, *outputs[1:])
# Replace every transformer layer with a LayerWrapper that adds that layer's
# control vector (scaled by 4.2) to its first output. Layers and vectors are
# paired positionally; zip stops at the shorter of the two sequences.
_layer_vector_pairs = zip(moondream.text_model.transformer.h, control_vectors)
moondream.text_model.transformer.h = torch.nn.ModuleList(
    [LayerWrapper(block, steering, 4.2) for block, steering in _layer_vector_pairs]
)
@spaces.GPU(duration=10)
def answer_question(img, prompt):
    """Stream the model's answer to *prompt* about *img*.

    Generation runs in a background thread through
    ``moondream.answer_question``; the text chunks it pushes into the
    streamer are accumulated here and re-yielded so the caller can show a
    progressively growing response.

    Args:
        img: The image to ask about, or ``None`` when nothing was uploaded.
        prompt: The question text.

    Yields:
        The response accumulated so far, with surrounding whitespace
        stripped. A single empty string is yielded when *img* is ``None``.
    """
    if img is None:
        # Nothing to encode: return an empty response instead of passing
        # None to the image encoder.
        yield ""
        return

    image_embeds = moondream.encode_image(img)
    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
    generation_kwargs = {
        "image_embeds": image_embeds,
        "question": prompt,
        "tokenizer": tokenizer,
        "streamer": streamer,
        "repetition_penalty": 1.2,
        "temperature": 0.1,
        "do_sample": True,
        "length_penalty": 1.2,
    }
    # Generate on a worker thread so this generator can consume the streamer
    # while tokens are still being produced.
    thread = Thread(target=moondream.answer_question, kwargs=generation_kwargs)
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer.strip()
def extract_floats(text):
    """Return the first bracketed list of four decimal numbers found in *text*.

    Only numbers written with digits on both sides of a decimal point are
    recognised (optionally negative); whitespace around the numbers and
    commas is allowed.

    Returns:
        A list of four floats, or ``None`` when *text* contains no such list.
    """
    found = re.search(
        r"\[\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*\]",
        text,
    )
    if found is None:
        return None
    # Each capture group holds one number as text.
    return list(map(float, found.groups()))
def extract_bbox(text):
    """Return the bounding box found in *text* as an ``(x1, y1, x2, y2)`` tuple.

    The box is whatever ``extract_floats`` finds in *text*; the text is
    scanned once.

    Returns:
        A 4-tuple of floats, or ``None`` when no box is present.
    """
    coords = extract_floats(text)
    if coords is None:
        return None
    x1, y1, x2, y2 = coords
    return (x1, y1, x2, y2)
def process_answer(img, answer):
    """Build the update for the annotated-image component from a model answer.

    When *answer* contains a bounding box, the image is resized with
    ``Resize(768)``, the box coordinates are scaled by the resized image's
    width and height (i.e. they are treated as fractions of the image size),
    and a red rectangle is drawn on the resized image.

    Args:
        img: The image the answer refers to, or ``None``.
        answer: The model's response text.

    Returns:
        A ``gr.update`` that shows the annotated image, or one that hides the
        component when there is no box in *answer* or no image to draw on.
    """
    # Parse the answer once and reuse the result.
    bbox = extract_bbox(answer)
    if bbox is None or img is None:
        return gr.update(visible=False, value=None)

    x1, y1, x2, y2 = bbox
    draw_image = Resize(768)(img)
    width, height = draw_image.size
    pixel_box = (
        int(x1 * width),
        int(y1 * height),
        int(x2 * width),
        int(y2 * height),
    )
    ImageDraw.Draw(draw_image).rectangle(pixel_box, outline="red", width=3)
    return gr.update(visible=True, value=draw_image)
# Header text rendered at the top of the page.
_INTRO_MARKDOWN = """
# 🌜 contemplative moondream
a demo of [moondream](http://moondream.ai) steered to discuss the meaning of life using [activation vectors](https://github.com/vikhyat/moondream/blob/main/notebooks/RepEng.ipynb)
"""

with gr.Blocks() as demo:
    gr.Markdown(_INTRO_MARKDOWN)

    # First row: the question box and the submit button.
    with gr.Row():
        prompt = gr.Textbox(label="Input", value="Describe this image.", scale=4)
        submit = gr.Button("Submit")

    # Second row: the uploaded image next to the response and the annotated
    # image, which stays hidden until process_answer reveals it.
    with gr.Row():
        img = gr.Image(type="pil", label="Upload an Image")
        with gr.Column():
            output = gr.Markdown(label="Response")
            ann = gr.Image(visible=False, label="Annotated Image")

    # Clicking the button and pressing Enter in the textbox both run the
    # same streaming handler.
    question_inputs = [img, prompt]
    for trigger in (submit.click, prompt.submit):
        trigger(answer_question, question_inputs, output)

    # Re-evaluate the annotation whenever the response text changes.
    output.change(process_answer, [img, output], ann, show_progress=False)

demo.queue().launch()
|