artificialguybr committed on
Commit 32f13d0
1 Parent(s): 101c1f1

Update app.py

Files changed (1)
  1. app.py +10 -15
app.py CHANGED
@@ -2,21 +2,20 @@ import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 from PIL import Image
-import re  # Importing the regular expressions module
+import re
 import requests
 from io import BytesIO
 
-# Load the Qwen-VL model and the tokenizer
 tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-VL-Chat-Int4", trust_remote_code=True)
 model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-VL-Chat-Int4", device_map="auto", trust_remote_code=True).eval()
 
-def generate_predictions(image_input, text_input):
-    # Invert the image to fix the negative
+def generate_predictions(image_input, text_input, with_grounding):
     user_image_path = "/tmp/user_input_test_image.jpg"
     Image.fromarray((255 - (image_input * 255).astype('uint8'))).save(user_image_path)
 
-
-    # Prepare the inputs
+    if with_grounding == "Yes":
+        text_input += " with grounding"
+
     query = tokenizer.from_list_format([
         {'image': user_image_path},
         {'text': text_input},
@@ -24,34 +23,30 @@ def generate_predictions(image_input, text_input):
     inputs = tokenizer(query, return_tensors='pt')
     inputs = inputs.to(model.device)
 
-    # Generate the caption
     pred = model.generate(**inputs)
     full_response = tokenizer.decode(pred.cpu()[0], skip_special_tokens=False)
 
-    # Remove the input text and other unwanted parts from the full response
     frontend_response = re.sub(r'Picture \d+:|<.*?>|\/tmp\/.*\.jpg', '', full_response).replace(text_input, '').strip()
+    print("Generated Caption:", frontend_response)  # Debugging line
 
-    # Draw bounding boxes, if any
     image_with_boxes = tokenizer.draw_bbox_on_latest_picture(full_response)
 
-    # Save and reload the image to ensure it is a PIL image
     if image_with_boxes:
         temp_path = "/tmp/image_with_boxes.jpg"
         image_with_boxes.save(temp_path)
         image_with_boxes = Image.open(temp_path)
 
-    return image_with_boxes, frontend_response  # Returning the formatted response to the frontend
+    return image_with_boxes, frontend_response
 
-# Create the Gradio interface
-# Create Gradio interface
 iface = gr.Interface(
     fn=generate_predictions,
     inputs=[
         gr.inputs.Image(label="Image Input"),
-        gr.inputs.Textbox(default="Generate a caption for that image with grounding:", label="Prompt")
+        gr.inputs.Textbox(default="Generate a caption for that image:", label="Prompt"),
+        gr.inputs.Radio(["No", "Yes"], label="With Grounding", default="No")
     ],
     outputs=[
-        gr.outputs.Image(type='pil', label="Image"),  # Explicitly set type to 'pil'
+        gr.outputs.Image(type='pil', label="Image"),
         gr.outputs.Textbox(label="Generated")
     ],
     title="Qwen-VL Demonstration",
@@ -65,4 +60,4 @@ iface = gr.Interface(
     - **High Resolution**: Utilizes 448*448 resolution for fine-grained recognition and understanding.
     """,
 )
-iface.launch()
+iface.launch()
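
A minimal sketch of what the response-cleanup regex does, run on a hypothetical raw decode; the "Picture 1:" prefix and the <ref>/<box> tags follow the grounded output format Qwen-VL-Chat describes, but this exact string is illustrative, not captured from the app:

import re

# Hypothetical raw model output (illustrative only, not a real decode).
full_response = (
    "Picture 1: <img>/tmp/user_input_test_image.jpg</img>\n"
    "Generate a caption for that image: with grounding "
    "<ref>a dog</ref><box>(120,80),(540,460)</box><|endoftext|>"
)
text_input = "Generate a caption for that image: with grounding"

# Same cleanup as app.py: strip "Picture N:" labels, anything in <...>,
# and the /tmp/*.jpg path, then drop the echoed prompt text.
frontend_response = re.sub(
    r'Picture \d+:|<.*?>|\/tmp\/.*\.jpg', '', full_response
).replace(text_input, '').strip()
print(frontend_response)  # prints: a dog(120,80),(540,460)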
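For quick testing outside the UI, a hedged usage sketch of the new three-argument signature, assuming app.py's model and tokenizer have already loaded; the synthetic float array matches the [0,1] scale that the app's `image_input * 255` line assumes:

import numpy as np

# Synthetic 448x448 RGB input; generate_predictions inverts and saves it
# to /tmp before building the query. "Yes" exercises the new grounding path.
dummy_image = np.zeros((448, 448, 3))
image, caption = generate_predictions(
    dummy_image, "Generate a caption for that image:", "Yes"
)
print(caption)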