import gradio as gr
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
from PIL import Image
import requests
from io import BytesIO
import spaces  # Import spaces for ZeroGPU support

# Checkpoint to serve; the repo name suggests a bitsandbytes 4-bit build of Molmo-7B-O.
repo_name = "cyan2k/molmo-7B-O-bnb-4bit"

# Keyword arguments shared by the processor and the model loader.
arguments = {
    "device_map": "auto",  # Device will be set automatically
    "torch_dtype": "auto",  # Use appropriate precision
    "trust_remote_code": True,  # Allow loading remote code
}

# Prompt used when the user leaves the question box empty.
DEFAULT_PROMPT = (
    "Describe this image in great detail without missing any piece of information"
)

# Load the processor at import time (this part doesn't need GPU yet).
processor = AutoProcessor.from_pretrained(repo_name, **arguments)


@spaces.GPU  # This ensures the function gets GPU access when needed
def describe_image(image, question):
    """Generate a text description of ``image``, optionally guided by ``question``.

    Args:
        image: The uploaded picture (the UI delivers it as a PIL image), or
            ``None`` when nothing was uploaded.
        question: Free-text question about the image. When empty, ``None`` or
            whitespace-only, ``DEFAULT_PROMPT`` is used instead.

    Returns:
        The decoded text generated by the model, without the prompt tokens
        and without special tokens.

    Raises:
        gr.Error: If no image was provided, so the UI shows a readable message
            instead of an opaque processor traceback.
    """
    if image is None:
        raise gr.Error("Please upload an image before generating a description.")

    prompt = question.strip() if question else ""

    # NOTE(review): the model is (re)loaded on every call, which is slow.
    # It is kept inside the GPU-decorated function on purpose; confirm whether
    # a cached instance would survive between ZeroGPU calls before hoisting it.
    # No explicit ``.to('cuda')``: ``device_map="auto"`` already places the
    # weights, and moving a dispatched/quantized model afterwards is
    # unsupported on some transformers/bitsandbytes versions.
    model = AutoModelForCausalLM.from_pretrained(repo_name, **arguments)

    # Process the uploaded image along with the user's question.
    inputs = processor.process(images=[image], text=prompt or DEFAULT_PROMPT)

    # Move inputs to wherever the model was placed and add a batch dimension.
    inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}

    # Generate output using the model.
    output = model.generate_from_batch(
        inputs,
        GenerationConfig(max_new_tokens=1024, stop_strings="<|endoftext|>"),
        tokenizer=processor.tokenizer,
    )

    # Keep only the newly generated tokens (drop the echoed prompt), then decode.
    generated_tokens = output[0, inputs["input_ids"].size(1):]
    return processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)


def gradio_app():
    """Build the Gradio Blocks UI around ``describe_image`` and launch it."""
    with gr.Blocks() as demo:
        gr.Markdown("# Long Image Description with Molmo-7B 4 bit quantized\n### Note: This model size has been reduced by six times without much of loss in Performance.\n### Upload an image and ask a question about it!")

        with gr.Row():
            image_input = gr.Image(type="pil", label="Upload an Image")
            question_input = gr.Textbox(
                placeholder="Ask a question about the image (e.g., 'What is happening in this image?')",
                label="Question (Optional)",
            )

        output_text = gr.Textbox(label="Image Description", interactive=False)

        # Submit button to generate the description
        submit_btn = gr.Button("Generate Description")

        # Callback to run when submit button is clicked
        submit_btn.click(
            fn=describe_image,
            inputs=[image_input, question_input],
            outputs=output_text,
        )

    # Launch the Gradio interface
    demo.launch()


# Launch the Gradio app
gradio_app()