# Qwen2-VL-2B / app.py
from fastapi import FastAPI, Body, HTTPException
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from typing import List, Dict, Union
import base64
import requests
from PIL import Image
from io import BytesIO

app = FastAPI()
# Load the model once at startup; device_map="auto" places it on GPU when available.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct", torch_dtype="auto", device_map="auto"
)

# Bound the per-image pixel budget the processor will resize each image to.
min_pixels = 256 * 28 * 28
max_pixels = 1280 * 28 * 28
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels
)
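# For reference, the bounds above work out to roughly:
#   min_pixels = 200_704   (~0.2 MP) -> about 256 visual tokens per image
#   max_pixels = 1_003_520 (~1.0 MP) -> about 1280 visual tokens per image
# (28x28 pixels per visual token, per the Qwen2-VL model card.)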
def process_image(image_data: str) -> Image.Image:
    """Load an image from a URL, a base64 data URI, or a local file path."""
    if image_data.startswith("http://") or image_data.startswith("https://"):
        response = requests.get(image_data, timeout=30)
        response.raise_for_status()
        img = Image.open(BytesIO(response.content))
    elif image_data.startswith("data:image"):
        img_data = base64.b64decode(image_data.split(",")[1])
        img = Image.open(BytesIO(img_data))
    else:
        img = Image.open(image_data)
    return img.convert("RGB")  # normalize mode so the processor gets 3-channel input
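# Illustrative calls (URLs/paths below are placeholders, not part of the app):
#   process_image("https://example.com/cat.png")           # remote URL
#   process_image("data:image/png;base64,iVBORw0KGgo...")  # base64 data URI
#   process_image("./samples/cat.png")                     # local file path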
@app.post("/predict")
async def predict(messages: List[Dict[str, Union[str, List[Dict[str, str]]]]] = Body(...)):
    # Collect the images referenced in the conversation. Text stays inside
    # `messages` and is rendered by the chat template further down.
    image_inputs = []
    for message in messages:
        content = message.get("content")
        if isinstance(content, str):
            continue  # plain-text message, nothing to extract here
        if isinstance(content, list):
            for item in content:
                if not (isinstance(item, dict) and item.get("type") in ("text", "image")):
                    raise HTTPException(status_code=400, detail=f"Invalid format for item: {item}")
                if item["type"] == "image":
                    image_inputs.append(process_image(item["image"]))
        else:
            raise HTTPException(status_code=400, detail=f"Invalid format for content: {content}")
    if not image_inputs:
        raise HTTPException(status_code=400, detail="No image provided for processing.")
    print(f"Processed images: {image_inputs}")

    # Render the whole conversation through the chat template once, so role
    # markers and image placeholders line up with the collected images.
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image_inputs,  # flat list, one entry per image placeholder in the prompt
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to(model.device)  # keep tensors on the same device as the model
    generated_ids = model.generate(**inputs, max_new_tokens=128)
    # Strip the prompt tokens so only the newly generated text is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_texts = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    return {"response": output_texts}
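# Example request (illustrative; host and port depend on your deployment):
#   curl -X POST http://localhost:7860/predict \
#     -H "Content-Type: application/json" \
#     -d '[{"role": "user", "content": [
#           {"type": "image", "image": "https://example.com/cat.png"},
#           {"type": "text", "text": "Describe this image."}]}]'

if __name__ == "__main__":
    # Local entry point; assumes uvicorn is installed (FastAPI's usual server).
    # Port 7860 follows the Hugging Face Spaces convention.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)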