import requests
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor

# Hugging Face Hub ID of the merged multimodal model (a local path also works)
model_id = "miike-ai/r1-11b-vision"

# Load the model and processor
model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)
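
# Optional sketch (an assumption, not part of the original card): on limited
# VRAM the same checkpoint can be loaded in 4-bit via bitsandbytes instead of
# bf16, replacing the from_pretrained call above:
#
#   from transformers import BitsAndBytesConfig
#   model = MllamaForConditionalGeneration.from_pretrained(
#       model_id,
#       quantization_config=BitsAndBytesConfig(load_in_4bit=True),
#       device_map="auto",
#   )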

def multimodal_inference(text_prompt, image_path=None):
    """
    Runs a single inference on the multimodal model.
    Args:
        text_prompt (str): The user prompt for text-based input.
        image_path (str, optional): Path or URL to an image (if any).
    Returns:
        str: Model-generated response.
    """

    # Prepare user message
    user_message = {"role": "user", "content": [{"type": "text", "text": text_prompt}]}

    # Load image if provided
    image = None
    if image_path:
        try:
            if image_path.startswith("http"):  # Remote URL
                resp = requests.get(image_path, stream=True, timeout=30)
                resp.raise_for_status()  # Surface HTTP errors instead of a confusing PIL failure
                image = Image.open(resp.raw).convert("RGB")
            else:  # Local file
                image = Image.open(image_path).convert("RGB")

            print(f"📷 Loaded image: {image.size}")  # Debugging
            user_message["content"].insert(0, {"type": "image"})  # Add image token to message
        except Exception as e:
            print(f"⚠️ Error loading image: {e}")
            return None

    # Format input for the model
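    # Note: ProcessorMixin.apply_chat_template defaults to tokenize=False, so
    # input_text is a prompt string; the Llama 3.2 Vision chat template renders
    # an <|image|> placeholder wherever the message content includes an image.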
    input_text = processor.apply_chat_template([user_message], add_generation_prompt=True)

    # Convert input to model tensors (the chat template already adds the BOS
    # token, so add_special_tokens is False to avoid duplicating it)
    if image is not None:
        inputs = processor(images=[image], text=[input_text], add_special_tokens=False, return_tensors="pt").to(model.device)
    else:
        inputs = processor(text=[input_text], add_special_tokens=False, return_tensors="pt").to(model.device)

    # Generate response
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=256)
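    # generate() decodes greedily unless the checkpoint's generation_config
    # overrides it; pass do_sample=True (with temperature/top_p) to sample.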

    # Decode only the newly generated tokens, skipping the echoed prompt
    response_text = processor.decode(
        output[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True
    )

    return response_text

# Example usage
text_input = "What is in this image?"
image_path = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"  # Can be a URL or local file path

response = multimodal_inference(text_input, image_path)
print("\n🧠 Assistant:", response)

Model size: 10.7B params (Safetensors, BF16)