File size: 1,359 Bytes
6b714ed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import streamlit as st
import torch
from PIL import Image
from transformers import ViltProcessor, ViltForQuestionAnswering

@st.cache_resource
def load_model():
    """Load the ViLT VQA processor and model, cached across Streamlit reruns.

    Streamlit re-executes the script on every widget interaction, so an
    uncached loader would re-instantiate the checkpoint each time the user
    edits the question. ``st.cache_resource`` keeps a single shared instance
    and hands the same objects back on subsequent calls.

    Returns:
        A ``(processor, model)`` tuple built from the
        ``dandelin/vilt-b32-finetuned-vqa`` checkpoint.
    """
    # Both objects must come from the same checkpoint; name it once.
    checkpoint = "dandelin/vilt-b32-finetuned-vqa"
    processor = ViltProcessor.from_pretrained(checkpoint)
    model = ViltForQuestionAnswering.from_pretrained(checkpoint)
    return processor, model

def predict(image, text, processor, model):
    """Answer a natural-language question about an image.

    Args:
        image: Image handed to ``processor`` (a PIL image in this app).
        text: The question to ask about ``image``.
        processor: Callable that encodes ``(image, text)`` into a mapping of
            model inputs; it is called with ``return_tensors="pt"``.
        model: Callable model that accepts the encoded inputs as keyword
            arguments. Its output must expose ``logits`` and its
            ``config.id2label`` must map class indices to answer strings.

    Returns:
        The ``model.config.id2label`` entry for the highest-scoring class.
    """
    encoding = processor(image, text, return_tensors="pt")
    # Inference only: disable autograd so no computation graph is built or
    # retained for the forward pass.
    with torch.no_grad():
        logits = model(**encoding).logits
    best_idx = logits.argmax(-1).item()
    return model.config.id2label[best_idx]

def main():
    """Render the VQA page: upload an image, ask a question, show the answer.

    Nothing is computed until both an image and a non-empty question have
    been provided. The image and the question/answer pair are then shown
    side by side in two columns.
    """
    st.title("VQA")
    st.write("Upload an image and input a question to get an answer.")

    uploaded_image = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])
    if uploaded_image is None:
        return

    try:
        # Accepted PNGs may be RGBA, palette or grayscale; normalise to
        # 3-channel RGB before the image reaches the model's processor.
        image = Image.open(uploaded_image).convert("RGB")
    except OSError:
        # Pillow signals unreadable/corrupt image data with OSError subclasses.
        st.error("Could not read the uploaded file as an image.")
        return

    question = st.text_input("Question about the image:")
    if not question:
        return

    processor, model = load_model()
    answer = predict(image, question, processor, model)

    col1, col2 = st.columns(2)
    with col1:
        # NOTE(review): newer Streamlit releases deprecate use_column_width in
        # favour of use_container_width; switch once the pinned version allows.
        st.image(image, caption='Uploaded Image.', use_column_width=True)
    with col2:
        st.write(f"**Question:** {question}")
        st.write(f"**Answer:** {answer}")

# Entry point: build the page only when this file is executed as the main
# module, not when it is imported from elsewhere.
if __name__ == "__main__":
    main()