# Molmo-4bit / app.py
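# Describes uploaded images with the 4-bit quantized Molmo-7B-O model via a Gradio UI.
# Likely runtime dependencies (an assumption inferred from the imports and config below,
# not pinned by this file): gradio, torch, transformers, accelerate, bitsandbytes, pillow;
# the Molmo remote code may additionally need einops and torchvision.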
import gradio as gr
from transformers import (
    AutoModelForCausalLM,
    AutoProcessor,
    GenerationConfig,
    BitsAndBytesConfig,
)
from PIL import Image
import torch
# Configuration for 4-bit quantization via bitsandbytes
# (GPU placement is handled separately by device_map="auto" below)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
)
# Model repository
repo_name = "cyan2k/molmo-7B-O-bnb-4bit"
# Load the processor and model
processor = AutoProcessor.from_pretrained(repo_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    repo_name,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config,
)
# device_map="auto" already places the quantized weights on the available GPU;
# .to() is not supported for bitsandbytes 4-bit models, so just record the model's device.
device = model.device
def describe_images(images):
    if not images:
        return "Please upload at least one image."
    descriptions = []
    for image in images:
        # gr.File may yield a filepath string or a tempfile-like object with a .name attribute
        path = image if isinstance(image, str) else getattr(image, "name", image)
        pil_image = Image.open(path).convert("RGB")
        # Preprocess the image together with the prompt
        inputs = processor.process(
            images=[pil_image],
            text="Describe this image in great detail.",
        )
        # Move inputs to the model's device and add a batch dimension
        inputs = {k: v.to(device).unsqueeze(0) for k, v in inputs.items()}
        # Generate output
        with torch.no_grad():
            output = model.generate_from_batch(
                inputs,
                GenerationConfig(max_new_tokens=200, stop_strings=["<|endoftext|>"]),
                tokenizer=processor.tokenizer,
            )
        # Decode only the newly generated tokens
        generated_tokens = output[0, inputs["input_ids"].size(1):]
        generated_text = processor.tokenizer.decode(
            generated_tokens, skip_special_tokens=True
        )
        descriptions.append(generated_text.strip())
    return "\n\n".join(descriptions)
# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("<h3><center>Image Description Generator</center></h3>")
    with gr.Row():
        image_input = gr.File(
            file_types=["image"], label="Upload Image(s)", file_count="multiple"
        )
    generate_button = gr.Button("Generate Descriptions")
    output_text = gr.Textbox(label="Descriptions", lines=15)
    generate_button.click(describe_images, inputs=image_input, outputs=output_text)

demo.launch()