qwen-vl / app.py
artificialguybr's picture
Update app.py
32f13d0
raw
history blame
2.45 kB
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from PIL import Image
import re
import requests
from io import BytesIO
# Checkpoint identifier shared by the tokenizer and the model so the two
# cannot drift apart.
MODEL_ID = "Qwen/Qwen-VL-Chat-Int4"

# Both loaders are called with trust_remote_code=True, as in the original
# script. NOTE(review): this presumably lets the checkpoint's own Python code
# run locally (it is where the from_list_format / draw_bbox helpers used
# below would come from) — confirm and consider pinning a revision.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    trust_remote_code=True,
)
# Switch to evaluation mode; eval() returns the module itself.
model = model.eval()
def generate_predictions(image_input, text_input, with_grounding):
    """Run the Qwen-VL chat model on one image/prompt pair.

    Args:
        image_input: Image supplied by the Gradio image component. Expected to
            be a NumPy array; ``None`` when no image was provided.
        text_input: Prompt text sent to the model together with the image.
        with_grounding: ``"Yes"`` appends ``" with grounding"`` to the prompt;
            any other value leaves the prompt unchanged.

    Returns:
        ``(image_with_boxes, frontend_response)``. The first item is whatever
        ``tokenizer.draw_bbox_on_latest_picture`` produced, reloaded as a PIL
        image when it is truthy; the second is the decoded model output with
        picture markers, ``<...>`` tags, the temp image path and the prompt
        stripped out. When no image was provided the result is
        ``(None, <message asking for an image>)``.
    """
    # Submitting the form without an image yields None; report that in the
    # text output instead of failing with a TypeError further down.
    if image_input is None:
        return None, "Please upload an image before submitting."

    # NOTE(review): these fixed paths are shared by every request, so
    # concurrent calls can overwrite each other's files.
    user_image_path = "/tmp/user_input_test_image.jpg"

    # The previous `255 - (image_input * 255).astype('uint8')` wrapped around
    # on uint8 input (every channel became (x - 1) mod 256, so black turned
    # white) and inverted float input. Save the pixels unchanged instead.
    pixels = image_input
    if pixels.dtype != "uint8":
        # NOTE(review): Gradio is expected to deliver uint8 arrays; float
        # arrays are assumed to be normalised to [0, 1] when their maximum
        # does not exceed 1 — confirm if other callers exist.
        if pixels.dtype.kind == "f" and pixels.max() <= 1.0:
            pixels = pixels * 255
        pixels = pixels.clip(0, 255).astype("uint8")
    # JPEG cannot store an alpha channel, so normalise the mode before saving.
    Image.fromarray(pixels).convert("RGB").save(user_image_path)

    if with_grounding == "Yes":
        text_input += " with grounding"

    # Build the multimodal query in the list format the tokenizer provides.
    query = tokenizer.from_list_format([
        {'image': user_image_path},
        {'text': text_input},
    ])
    inputs = tokenizer(query, return_tensors='pt')
    inputs = inputs.to(model.device)

    pred = model.generate(**inputs)
    # Special tokens are kept because the box-drawing helper below is handed
    # the full, unfiltered response.
    full_response = tokenizer.decode(pred.cpu()[0], skip_special_tokens=False)

    # Text shown to the user: drop picture markers, tags, the temp image path
    # and the echoed prompt.
    frontend_response = re.sub(
        r'Picture \d+:|<.*?>|\/tmp\/.*\.jpg', '', full_response
    ).replace(text_input, '').strip()
    print("Generated Caption:", frontend_response)  # Debugging line

    image_with_boxes = tokenizer.draw_bbox_on_latest_picture(full_response)
    if image_with_boxes:
        # Round-trip through a file so the value handed to Gradio is a PIL
        # image, which is what the output component is configured for.
        temp_path = "/tmp/image_with_boxes.jpg"
        image_with_boxes.save(temp_path)
        image_with_boxes = Image.open(temp_path)

    return image_with_boxes, frontend_response
# Markdown text passed to the interface as its description.
demo_description = """
## Qwen-VL: A Multimodal Large Vision Language Model by Alibaba Cloud
**Space by [@Artificialguybr](https://twitter.com/artificialguybr)**
### Key Features:
- **Strong Performance**: Surpasses existing LVLMs on multiple English benchmarks including Zero-shot Captioning and VQA.
- **Multi-lingual Support**: Supports English, Chinese, and multi-lingual conversation.
- **High Resolution**: Utilizes 448*448 resolution for fine-grained recognition and understanding.
"""

# Input widgets, in the same order as generate_predictions' parameters:
# image, prompt text, grounding toggle.
input_components = [
    gr.inputs.Image(label="Image Input"),
    gr.inputs.Textbox(default="Generate a caption for that image:", label="Prompt"),
    gr.inputs.Radio(["No", "Yes"], label="With Grounding", default="No"),
]

# Output widgets, in the same order as generate_predictions' return tuple:
# image, then generated text.
output_components = [
    gr.outputs.Image(type='pil', label="Image"),
    gr.outputs.Textbox(label="Generated"),
]

iface = gr.Interface(
    fn=generate_predictions,
    inputs=input_components,
    outputs=output_components,
    title="Qwen-VL Demonstration",
    description=demo_description,
)

# Start the web UI as soon as the script is executed.
iface.launch()