```python
import requests
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor

# Hub repo id (or local path) of the merged multimodal model
model_id = "miike-ai/r1-11b-vision"

# Load the model and processor
model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)


def multimodal_inference(text_prompt, image_path=None):
    """
    Runs a single inference on the multimodal model.

    Args:
        text_prompt (str): The user prompt for text-based input.
        image_path (str, optional): Path or URL to an image (if any).

    Returns:
        str: Model-generated response.
    """
    # Prepare the user message
    user_message = {"role": "user", "content": [{"type": "text", "text": text_prompt}]}

    # Load the image if one was provided
    image = None
    if image_path:
        try:
            if image_path.startswith("http"):  # Handle URLs
                image = Image.open(requests.get(image_path, stream=True).raw)
            else:  # Handle a local file
                image = Image.open(image_path)
            print(f"📷 Loaded image: {image.size}")  # Debugging
            user_message["content"].insert(0, {"type": "image"})  # Add the image token to the message
        except Exception as e:
            print(f"⚠️ Error loading image: {e}")
            return None

    # Format the input for the model
    input_text = processor.apply_chat_template([user_message], add_generation_prompt=True)

    # Convert the input to model tensors
    if image is not None:
        inputs = processor(images=[image], text=[input_text], add_special_tokens=True, return_tensors="pt").to(model.device)
    else:
        inputs = processor(text=[input_text], add_special_tokens=True, return_tensors="pt").to(model.device)

    # Generate a response
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=256)

    # Decode the model output
    response_text = processor.decode(output[0], skip_special_tokens=True)
    return response_text


# Example usage
text_input = "What is in this image?"
image_path = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"  # Can be a URL or a local file path
response = multimodal_inference(text_input, image_path)
print("\n🧠 Assistant:", response)
```
Base model: meta-llama/Llama-3.2-11B-Vision-Instruct
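Since the base model is the 11B-parameter Llama 3.2 Vision Instruct checkpoint, the bfloat16 weights alone occupy roughly 22 GB of GPU memory. For memory-constrained setups, one option (a sketch, not part of the original card, and assuming `bitsandbytes` is installed) is to load the merged model with 4-bit quantization:

```python
import torch
from transformers import MllamaForConditionalGeneration, AutoProcessor, BitsAndBytesConfig

model_id = "miike-ai/r1-11b-vision"

# 4-bit NF4 quantization; an optional variant for smaller GPUs, not the card's original setup.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)
```

The rest of the inference code above works unchanged, since `model.device` and the processor behave the same way with a quantized model.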