import gradio as gr
from transformers import AutoProcessor, AutoModelForImageTextToText, TextIteratorStreamer
from threading import Thread
import re
import time
import torch
import spaces
import subprocess

# Install FlashAttention at startup; the env var skips compiling the CUDA kernels from source.
subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)

from io import BytesIO

# Load the SmolVLM2 processor and model on the first GPU with FlashAttention 2 and bfloat16.
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct")
model = AutoModelForImageTextToText.from_pretrained(
    "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
    _attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
).to("cuda:0")


@spaces.GPU
def model_inference(input_dict, history, max_tokens):
    text = input_dict["text"]
    images = []
    user_content = []
    media_queue = []

    if history == []:
        text = input_dict["text"].strip()

        # Sort uploaded files into image and video entries, preserving upload order.
        for file in input_dict.get("files", []):
            if file.endswith((".png", ".jpg", ".jpeg", ".gif", ".bmp")):
                media_queue.append({"type": "image", "path": file})
            elif file.endswith((".mp4", ".mov", ".avi", ".mkv", ".flv")):
                media_queue.append({"type": "video", "path": file})

        if "<image>" in text or "<video>" in text: