from fastapi import FastAPI, Body
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
import torch
from typing import List, Dict, Union
import base64
import requests
from PIL import Image
from io import BytesIO
from qwen_vl_utils import process_vision_info

app = FastAPI()

# Load the model and the processor
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct", torch_dtype="auto", device_map="auto"
)
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")


def process_image(image_data: str) -> Image.Image:
    """Load an image from a URL, a base64 data URI, or a local path."""
    if image_data.startswith(("http://", "https://")):
        response = requests.get(image_data)
        response.raise_for_status()
        img = Image.open(BytesIO(response.content))
    elif image_data.startswith("data:image"):
        img_data = base64.b64decode(image_data.split(",")[1])
        img = Image.open(BytesIO(img_data))
    else:
        img = Image.open(image_data)
    return img


@app.post("/predict")
async def predict(messages: List[Dict[str, Union[str, List[Dict[str, str]]]]] = Body(...)):
    """Generate a model response for the given chat messages."""
    # Validate the messages and pre-load any images, so that URLs,
    # base64 data URIs and local paths are all accepted.
    for message in messages:
        content = message.get("content")
        if isinstance(content, str):
            continue  # plain-text message, nothing to pre-load
        if isinstance(content, list):
            for item in content:
                if item.get("type") == "text":
                    continue
                elif item.get("type") == "image":
                    item["image"] = process_image(item["image"])
                else:
                    raise ValueError(f"Invalid item format: {item}")
        else:
            raise ValueError(f"Invalid content format: {content}")

    # Build the chat prompt from the whole conversation; apply_chat_template
    # expects the full message list, not individual strings.
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Extract image and video inputs with qwen_vl_utils
    image_inputs, video_inputs = process_vision_info(messages)

    # Prepare the model inputs and move them to the model's device
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(model.device)

    # Generate the response and strip the prompt tokens from the output
    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_texts = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )

    return {"response": output_texts}
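
# A minimal usage sketch (assumptions: uvicorn is installed, the file is saved
# as main.py, and the host, port and image URL below are illustrative
# placeholders, not part of the original script):
#
#     uvicorn main:app --host 0.0.0.0 --port 8000
#
# Example request from a Python client:
#
#     import requests
#     payload = [
#         {
#             "role": "user",
#             "content": [
#                 {"type": "image", "image": "https://example.com/demo.jpg"},
#                 {"type": "text", "text": "Describe this image."},
#             ],
#         }
#     ]
#     print(requests.post("http://localhost:8000/predict", json=payload).json())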