# Qwen2-VL-2B / app.py
from fastapi import FastAPI, Body, HTTPException
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
import torch
from typing import Any, Dict, List
import base64
import requests
from PIL import Image
from io import BytesIO
from qwen_vl_utils import process_vision_info
app = FastAPI()
# Load the model and the processor
model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", torch_dtype="auto", device_map="auto")
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
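
# Note: device_map="auto" falls back to CPU when no GPU is present. A minimal
# sketch for pinning this explicitly (assumption: CPU-only Space hardware):
#
#   model = Qwen2VLForConditionalGeneration.from_pretrained(
#       "Qwen/Qwen2-VL-2B-Instruct", torch_dtype=torch.float32, device_map="cpu"
#   )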
# Standalone helper for loading images; qwen_vl_utils.process_vision_info
# performs the same resolution internally for items in the chat messages.
def process_image(image_data: str) -> Image.Image:
    """Load an image from a URL, a base64 data URI, or a local file path."""
    if image_data.startswith(("http://", "https://")):
        response = requests.get(image_data)
        response.raise_for_status()
        img = Image.open(BytesIO(response.content))
    elif image_data.startswith("data:image"):
        img_data = base64.b64decode(image_data.split(",")[1])
        img = Image.open(BytesIO(img_data))
    else:
        img = Image.open(image_data)
    return img
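
# Usage example (illustrative values):
#   img = process_image("https://example.com/photo.jpg")
#   img = process_image("data:image/png;base64,<payload>")
#   img = process_image("local/path/photo.png")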
@app.post("/predict")
async def predict(messages: List[Dict[str, Any]] = Body(...)):
    """Generate a model response for the given chat messages."""
    # Validate message contents up front so malformed payloads return a 400
    # instead of an unhandled server error.
    for message in messages:
        content = message.get("content")
        if isinstance(content, str):
            continue
        if isinstance(content, list):
            for item in content:
                if item.get("type") not in ("text", "image", "video"):
                    raise HTTPException(status_code=400, detail=f"Invalid content item: {item}")
        else:
            raise HTTPException(status_code=400, detail=f"Invalid content: {content}")

    # Build the prompt once from the full conversation; the chat template
    # expects the whole message list, not individual strings.
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # qwen_vl_utils resolves the visual references (URLs, base64 data URIs,
    # local paths) in the messages into model-ready image and video inputs.
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(model.device)
    # Generate, then strip the prompt tokens from each output sequence so only
    # the newly generated text is decoded.
    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_texts = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    return {"response": output_texts}
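
# Local entry point: a minimal sketch, assuming uvicorn is installed and that
# port 7860 (the Hugging Face Spaces default) is appropriate; adjust as needed.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)

# Example request (illustrative client, kept as a comment so the module stays
# importable; the message format follows the Qwen2-VL chat convention):
#
#   import requests
#   payload = [{
#       "role": "user",
#       "content": [
#           {"type": "image", "image": "https://example.com/demo.jpg"},
#           {"type": "text", "text": "Describe this image."},
#       ],
#   }]
#   r = requests.post("http://localhost:7860/predict", json=payload)
#   print(r.json()["response"])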