from fastapi import FastAPI, Body
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
import torch
from typing import List, Dict, Union
import base64
import requests
from PIL import Image
from io import BytesIO

app = FastAPI()

# Load the model and processor once at startup.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct", torch_dtype="auto", device_map="auto"
)

# Bound the number of visual tokens per image to keep memory usage predictable.
min_pixels = 256 * 28 * 28
max_pixels = 1280 * 28 * 28
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels
)


def process_image(image_data: str) -> Image.Image:
    """Load an image from an HTTP(S) URL, a base64 data URI, or a local file path."""
    if image_data.startswith("http://") or image_data.startswith("https://"):
        response = requests.get(image_data)
        response.raise_for_status()  # Fail fast on HTTP errors
        img = Image.open(BytesIO(response.content))
    elif image_data.startswith("data:image"):
        img_data = base64.b64decode(image_data.split(",")[1])
        img = Image.open(BytesIO(img_data))
    else:
        # Assume a local file path
        img = Image.open(image_data)
    return img


@app.post("/predict")
async def predict(messages: List[Dict[str, Union[str, List[Dict[str, str]]]]] = Body(...)):
    # Apply the chat template to the whole conversation once; it inserts the
    # vision placeholder tokens that the processor later matches with the images.
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Collect the images referenced in the messages, in order of appearance.
    image_inputs = []
    for message in messages:
        content = message.get("content")
        if isinstance(content, str):
            continue  # Text-only message; already covered by the chat template
        elif isinstance(content, list):
            for item in content:
                if isinstance(item, dict) and "type" in item:
                    if item["type"] == "image":
                        image_inputs.append(process_image(item["image"]))
                else:
                    raise ValueError(f"Invalid format for item: {item}")
        else:
            raise ValueError(f"Invalid format for content: {content}")

    inputs = processor(
        text=[text],
        images=image_inputs if image_inputs else None,
        videos=None,  # This endpoint does not handle video input
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to(model.device)  # Follows device_map="auto" (CPU or GPU)

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    # Strip the prompt tokens so only the newly generated text is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_texts = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    return {"response": output_texts}
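
For reference, a minimal client sketch for the /predict endpoint. It assumes the app is served locally (for example with `uvicorn main:app` on the default port 8000), and the image URL is a placeholder; the message structure follows the Qwen2-VL chat format that the endpoint's chat template expects.

import requests

# Hypothetical values: adjust the host/port and the image URL to your setup.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "https://example.com/sample.jpg"},
            {"type": "text", "text": "Describe this image."},
        ],
    }
]

# The endpoint takes the list of messages as the raw JSON body.
resp = requests.post("http://localhost:8000/predict", json=messages)
resp.raise_for_status()
print(resp.json()["response"])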