sitammeur committed on
Commit
341d44c
1 Parent(s): 737c8c2

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +84 -0
app.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Importing the requirements
2
+ import gradio as gr
3
+ import torch
4
+ from PIL import Image
5
+ from transformers import AutoModel, AutoTokenizer
6
+ import spaces
7
+
8
# Device for the model (loaded in fp16, so a CUDA GPU is required)
device = "cuda"

# Hub id of the vision-language model; kept in one place so the model and
# tokenizer are guaranteed to come from the same checkpoint.
model_id = "openbmb/MiniCPM-Llama3-V-2_5"

# Load the model and tokenizer
model = AutoModel.from_pretrained(
    model_id, trust_remote_code=True, torch_dtype=torch.float16
)
# Use the shared `device` constant instead of re-hardcoding the "cuda" literal
model = model.to(device=device)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
# Inference only: disable dropout and other training-time behavior
model.eval()
20
+
21
+
22
@spaces.GPU
def answer_question(image, question):
    """
    Generate an answer to a question about the provided image.

    Args:
        image (PIL.Image.Image): The input image. Gradio supplies a PIL
            image here (the input component is ``gr.Image(type="pil")``),
            not a file path.
        question (str): The question text.

    Returns:
        str: The generated answer to the question.
    """
    # Single-turn conversation in the message format the model expects
    msgs = [{"role": "user", "content": question}]

    # Generate the answer. With stream=True the model yields text chunks;
    # sampling=True + temperature=0.7 gives mildly varied generations.
    res = model.chat(
        image=image,
        msgs=msgs,
        tokenizer=tokenizer,
        sampling=True,
        temperature=0.7,
        stream=True,
    )

    # Concatenate the streamed chunks into the final answer string
    return "".join(res)
50
+
51
+
52
# ---------------------------------------------------------------------------
# Gradio UI wiring
# ---------------------------------------------------------------------------

# Output widget: the model's predicted answer
answer = gr.Textbox(label="Predicted answer")

# Input widgets: an image upload plus a free-form question box
image = gr.Image(type="pil", label="Image")
question = gr.Textbox(label="Question")

# Clickable example (image, question) pairs shown under the demo
examples = [
    ["cat.jpg", "How many cats are there?"],
    ["dog.jpg", "What color is the dog?"],
    ["bird.jpg", "What is the bird doing?"],
]

# Page metadata rendered around the interface
title = "Visual Question Answering"
description = "Gradio Demo for the MiniCPM Llama3 Vision Language Understanding and Generation model. This model can answer questions about images in natural language. To use it, simply upload your image and type a question and click 'submit', or click one of the examples to load them. Read more at the links below."
article = "<p style='text-align: center'><a href='https://github.com/OpenBMB/MiniCPM-V' target='_blank'>Model GitHub Repo</a> | <a href='https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5' target='_blank'>Model Page</a></p>"

# Assemble the demo and start the (blocking) Gradio server
interface = gr.Interface(
    fn=answer_question,
    inputs=[image, question],
    outputs=answer,
    examples=examples,
    title=title,
    description=description,
    article=article,
    theme="Soft",
    allow_flagging="never",
)
interface.launch(debug=False)