import gradio as gr
import torch
from PIL import Image
from transformers import (
    AutoModelForCausalLM,
    AutoProcessor,
    BitsAndBytesConfig,
    GenerationConfig,
)

# 4-bit quantization config (the checkpoint below is already bnb-quantized,
# so this mainly makes the loading path explicit)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
)

# Model repository
repo_name = "cyan2k/molmo-7B-O-bnb-4bit"

# Load the processor and model
processor = AutoProcessor.from_pretrained(repo_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    repo_name,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config,
)

# device_map="auto" already places the quantized weights; calling .to() on a
# 4-bit bitsandbytes model raises an error, so just record the device for
# moving inputs later.
device = model.device


def describe_images(files):
    """Generate a detailed description for each uploaded image."""
    descriptions = []
    for file in files:
        # Gradio's File component yields file paths (or, in older Gradio
        # versions, tempfile objects with a .name path attribute)
        path = file if isinstance(file, str) else file.name
        image = Image.open(path).convert("RGB")

        # Preprocess the image and prompt
        inputs = processor.process(
            images=[image],
            text="Describe this image in great detail.",
        )

        # Move inputs to the model's device and add a batch dimension
        inputs = {k: v.to(device).unsqueeze(0) for k, v in inputs.items()}

        # Generate output
        with torch.no_grad():
            output = model.generate_from_batch(
                inputs,
                GenerationConfig(max_new_tokens=200, stop_strings=["<|endoftext|>"]),
                tokenizer=processor.tokenizer,
            )

        # Decode only the newly generated tokens, skipping the prompt
        generated_tokens = output[0, inputs["input_ids"].size(1):]
        generated_text = processor.tokenizer.decode(
            generated_tokens, skip_special_tokens=True
        )
        descriptions.append(generated_text.strip())

    return "\n\n".join(descriptions)


# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Image Description Generator")
    with gr.Row():
        # gr.File takes file_count="multiple" (not multiple=True) to accept
        # several uploads at once
        image_input = gr.File(
            file_types=["image"], label="Upload Image(s)", file_count="multiple"
        )
        generate_button = gr.Button("Generate Descriptions")
    output_text = gr.Textbox(label="Descriptions", lines=15)

    generate_button.click(describe_images, inputs=image_input, outputs=output_text)

demo.launch()