Spaces:

flax-community
/

koclip

Build error

File size: 2,168 Bytes

f1d50b1
e4b9c8b
8ff0261
5dce03a
e4b9c8b
f1d50b1
 
 
 
 
e4b9c8b
f1d50b1
e4b9c8b
2cf3514
 
42c971d
 
2cf3514
 
e4b9c8b
 
b3a4deb
e4b9c8b
 
 
 
 
 
8ff0261
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e4b9c8b

import streamlit as st
import numpy as np
import jax
import jax.numpy as jnp
from PIL import Image

from utils import load_model


def app(model_name):
    """Render the KoCLIP zero-shot image classification page.

    The page lets the user upload an image and type comma-separated candidate
    captions. When the query button is pressed, the image and captions are run
    through the model and every caption is listed with its softmax score,
    highest score first.

    Args:
        model_name: Name of the checkpoint; it is loaded via
            ``load_model(f"koclip/{model_name}")``.

    Returns:
        None. All output is written to the Streamlit page.
    """
    model, processor = load_model(f"koclip/{model_name}")

    st.title("Zero-shot Image Classification")
    st.markdown(
        """
        This demonstration explores capability of KoCLIP in the field of Zero-Shot Prediction. This demo takes a set of image and captions from, and predicts the most likely label among the different captions given. 
KoCLIP is a retraining of OpenAI's CLIP model using 82,783 images from MSCOCO dataset and Korean caption annotations. Korean translation of caption annotations were obtained from AI Hub. Base model koclip uses klue/roberta as text encoder and openai/clip-vit-base-patch32 as image encoder. Larger model koclip-large uses klue/roberta as text encoder and bigger google/vit-large-patch16-224 as image encoder.
    """
    )

    query = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])
    captions = st.text_input("사용하실 캡션을 쉼표 단위로 구분해서 적어주세요", value="고양이,강아지,느티나무...")

    # Nothing to compute until the user explicitly asks for a prediction.
    if not st.button("질문 (Query)"):
        return

    if query is None:
        st.error("Please upload an image query.")
        return

    # Trim whitespace around each caption and drop empty entries so that
    # input such as "a, b," does not produce " b" or "" as candidate labels.
    labels = [label.strip() for label in captions.split(",")]
    labels = [label for label in labels if label]
    if not labels:
        st.error("Please enter at least one caption.")
        return

    image = Image.open(query)
    st.image(image)

    # Uploaded PNGs may be RGBA, palette or grayscale; convert to RGB so the
    # processor always receives a three-channel image.
    inputs = processor(
        text=labels,
        images=image.convert("RGB"),
        return_tensors="jax",
        padding=True,
    )
    # Move axis 1 to the end (presumably NCHW -> NHWC, the layout the model
    # expects -- confirm against load_model / the model definition).
    inputs["pixel_values"] = jnp.transpose(
        inputs["pixel_values"], axes=[0, 2, 3, 1]
    )
    outputs = model(**inputs)
    probs = jax.nn.softmax(outputs.logits_per_image, axis=1)

    # Only one image is submitted, so row 0 holds the score of every caption.
    # Convert to NumPy once so sorting compares plain host scalars.
    scores = np.asarray(probs[0])
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
    for idx, prob in ranked:
        st.text(f"Score: `{prob}`, {labels[idx]}")