Fangrui Liu committed
Commit • 3f1124e
1 Parent(s): e66a418
init repo

Files changed:
- README.md +1 -1
- TestSet.py +120 -0
- app.py +393 -0
- box_utils.py +133 -0
- card_model.py +94 -0
- classifier.py +121 -0
- query_model.py +108 -0
README.md CHANGED
@@ -6,7 +6,7 @@ colorTo: purple
 sdk: streamlit
 sdk_version: 1.10.0
 app_file: app.py
-pinned:
+pinned: true
 license: lgpl-3.0
 ---

TestSet.py ADDED
@@ -0,0 +1,120 @@
import time
import requests
from io import BytesIO
from os import path
from torch.utils.data import Dataset
from PIL import Image


class TestImageSetOnline(Dataset):
    """ Test image set with the Hugging Face CLIP preprocess interface

    Args:
        Dataset (torch.utils.data.Dataset):
    """

    def __init__(self, processor, image_list, timeout_base=0.5, timeout_mul=2):
        """
        Args:
            processor (CLIP preprocessor): process data into a CLIP-digestible format
            image_list (pandas.DataFrame): pandas.DataFrame that contains image metadata
            timeout_base (float, optional): initial timeout parameter. Defaults to 0.5.
            timeout_mul (int, optional): multiplier on timeout every time a request fails. Defaults to 2.
        """
        self.image_list = image_list
        self.processor = processor
        self.timeout_base = timeout_base
        self.timeout = self.timeout_base
        self.timeout_mul = timeout_mul

    def __getitem__(self, index):
        row = self.image_list[index]
        url = str(row['coco_url'])
        _id = str(row['id'])
        txt, img = None, None
        flag = True
        while flag:
            try:
                # Get images online
                response = requests.get(url)
                img = Image.open(BytesIO(response.content))
                img_s = img.size
                if img.mode in ['L', 'CMYK', 'RGBA']:
                    # L is grayscale, CMYK uses alternative color channels
                    img = img.convert('RGB')
                # Preprocess image
                ret = self.processor(text=txt, images=img, return_tensors='pt')
                img = ret['pixel_values'][0]
                # If success, then there will be no need to run this again
                flag = False
                # Relax the timeout param
                if self.timeout > self.timeout_base:
                    self.timeout /= self.timeout_mul
            except Exception as e:
                print(f"{_id} {url}: {str(e)}")
                if type(e) is KeyboardInterrupt:
                    raise e
                time.sleep(self.timeout)
                # Increase the timeout param and turn into a new request
                self.timeout *= self.timeout_mul
        return _id, url, img, img_s

    def get(self, url):
        _id = url
        txt, img = None, None
        flag = True
        while flag:
            try:
                # Get images online
                response = requests.get(url)
                img = Image.open(BytesIO(response.content))
                img_s = img.size
                if img.mode in ['L', 'CMYK', 'RGBA']:
                    # L is grayscale, CMYK uses alternative color channels
                    img = img.convert('RGB')
                # Preprocess image
                ret = self.processor(text=txt, images=img, return_tensors='pt')
                img = ret['pixel_values'][0]
                # If success, then there will be no need to run this again
                flag = False
                # Relax the timeout param
                if self.timeout > self.timeout_base:
                    self.timeout /= self.timeout_mul
            except Exception as e:
                print(f"{_id} {url}: {str(e)}")
                if type(e) is KeyboardInterrupt:
                    raise e
                time.sleep(self.timeout)
                # Increase the timeout param and turn into a new request
                self.timeout *= self.timeout_mul
        return _id, url, img, img_s

    def __len__(self,):
        return len(self.image_list)

    def __add__(self, other):
        self.image_list += other.image_list
        return self


class TestImageSet(TestImageSetOnline):
    def __init__(self, droot, processor, image_list, timeout_base=0.5, timeout_mul=2):
        super().__init__(processor, image_list, timeout_base, timeout_mul)
        self.droot = droot

    def __getitem__(self, index):
        row = self.image_list[index]
        url = str(row['coco_url'])
        _id = '_'.join([url.split('/')[-2], str(row['id'])])
        txt, img = None, None
        # Read images from local disk instead of downloading
        img = Image.open(path.join(self.droot,
                                   url.split('http://images.cocodataset.org/')[1]))
        img_s = img.size
        if img.mode in ['L', 'CMYK', 'RGBA']:
            # L is grayscale, CMYK uses alternative color channels
            img = img.convert('RGB')
        # Preprocess image
        ret = self.processor(text=txt, images=img, return_tensors='pt')
        img = ret['pixel_values'][0]
        return _id, url, img, img_s
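
A minimal usage sketch for the dataset above (illustrative only, not in this commit): it assumes metadata with 'id' and 'coco_url' fields and a CLIP processor. Note that __getitem__ indexes rows directly, so a pandas.DataFrame would need to be converted to a list of row dicts first; the record below is a placeholder.

# Illustrative sketch, not part of the commit.
import pandas as pd
from torch.utils.data import DataLoader
from transformers import CLIPProcessor
from TestSet import TestImageSetOnline

processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
# assumed metadata layout: one record per image with 'id' and 'coco_url' (placeholder URL)
records = pd.DataFrame([
    {"id": 9, "coco_url": "http://images.cocodataset.org/val2017/000000000009.jpg"},
]).to_dict("records")

dataset = TestImageSetOnline(processor, records)
loader = DataLoader(dataset, batch_size=1, num_workers=0)
for _id, url, pixel_values, size in loader:
    # pixel_values is the CLIP-preprocessed image tensor, size the original (w, h)
    print(_id, url, pixel_values.shape, size)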
app.py ADDED
@@ -0,0 +1,393 @@
from time import time
import aiohttp
from io import BytesIO
import torch
import streamlit as st
import streamlit.components.v1 as components
import numpy as np
import logging
from os import environ
from transformers import OwlViTProcessor, OwlViTForObjectDetection

from myscaledb import Client
from classifier import Classifier, prompt2vec, tune, SplitLayer
from query_model import simple_query, topk_obj_query, rev_query
from card_model import card, obj_card, style
from box_utils import postprocess

environ['TOKENIZERS_PARALLELISM'] = 'true'

OBJ_DB_NAME = "mqdb_demo.coco_owl_vit_b_32_objects"
IMG_DB_NAME = "mqdb_demo.coco_owl_vit_b_32_images"
MODEL_ID = 'google/owlvit-base-patch32'
DIMS = 512

qtime = 0


def build_model(name="google/owlvit-base-patch32"):
    """Model builder function

    Args:
        name (str, optional): Name for HuggingFace OwlViT model. Defaults to "google/owlvit-base-patch32".

    Returns:
        (model, processor): OwlViT model and its processor for both image and text
    """
    device = 'cpu'
    if torch.cuda.is_available():
        device = 'cuda'
    model = OwlViTForObjectDetection.from_pretrained(name).to(device)
    processor = OwlViTProcessor.from_pretrained(name)
    return model, processor


@st.experimental_singleton(show_spinner=False)
def init_owlvit():
    """ Initialize OwlViT Model

    Returns:
        model, processor
    """
    model, processor = build_model(MODEL_ID)
    return model, processor


@st.experimental_singleton(show_spinner=False)
def init_db():
    """ Initialize the Database Connection

    Returns:
        meta_field: Meta field that records if an image is viewed or not
        client: Database connection object
    """
    meta = []
    client = Client(
        url=st.secrets["DB_URL"], user=st.secrets["USER"], password=st.secrets["PASSWD"])
    # We can check if the connection is alive
    assert client.is_alive()
    return meta, client


def refresh_index():
    """ Clean the session
    """
    del st.session_state["meta"]
    st.session_state.meta = []
    st.session_state.query_num = 0
    logging.info(f"Refresh for '{st.session_state.meta}'")
    # Need to clear singleton function with streamlit API
    init_db.clear()
    # refresh session states
    st.session_state.meta, st.session_state.index = init_db()
    if 'clf' in st.session_state:
        del st.session_state.clf
    if 'xq' in st.session_state:
        del st.session_state.xq
    if 'topk_img_id' in st.session_state:
        del st.session_state.topk_img_id


def query(xq, exclude_list=None):
    """ Query matches w.r.t. a given vector

    In this part, we retrieve A LOT OF data from the server,
    including TopK boxes and their embeddings, and the counterpart non-TopK boxes in TopK images.

    Args:
        xq (numpy.ndarray or list of floats): Query vector

    Returns:
        matches: list of Records objects. Keys refer to selected columns, grouped by image.
            Excludes the user's view list.
        img_matches: list of Records objects containing other non-TopK but hit objects among TopK images.
        side_matches: list of Records objects containing REAL TopK objects, disregarding the user's view history.
    """
    attempt = 0
    xq = xq / np.linalg.norm(xq, axis=-1, ord=2, keepdims=True)
    status_bar = [st.empty(), st.empty()]
    status_bar[0].write("Retrieving Another TopK Images...")
    pbar = status_bar[1].progress(0)
    while attempt < 3:
        try:
            matches = topk_obj_query(
                st.session_state.index, xq, IMG_DB_NAME, OBJ_DB_NAME,
                exclude_list=exclude_list, topk=5000)
            img_ids = [r['img_id'] for r in matches]
            if 'topk_img_id' not in st.session_state:
                st.session_state.topk_img_id = img_ids
            status_bar[0].write("Retrieving TopK Images...")
            pbar.progress(25)
            o_matches = rev_query(
                st.session_state.index, xq, st.session_state.topk_img_id,
                IMG_DB_NAME, OBJ_DB_NAME, thresh=0.1)
            status_bar[0].write("Retrieving TopK Objects...")
            pbar.progress(50)
            side_matches = simple_query(st.session_state.index, xq, IMG_DB_NAME, OBJ_DB_NAME,
                                        thresh=-1, topk=5000)
            status_bar[0].write(
                "Retrieving Non-TopK in Another TopK Images...")
            pbar.progress(75)
            if len(img_ids) > 0:
                img_matches = rev_query(
                    st.session_state.index, xq, img_ids, IMG_DB_NAME, OBJ_DB_NAME,
                    thresh=0.1)
            else:
                img_matches = []
            status_bar[0].write("DONE!")
            pbar.progress(100)
            break
        except Exception as e:
            # force reload if we have trouble on connections or something else
            logging.warning(str(e))
            st.session_state.meta, st.session_state.index = init_db()
            attempt += 1
            matches = []
    _ = [s.empty() for s in status_bar]
    if len(matches) == 0:
        logging.error(f"No matches found for '{OBJ_DB_NAME}'")
    return matches, img_matches, side_matches, o_matches


@st.experimental_singleton(show_spinner=False)
def init_random_query():
    """Initialize a random query vector

    Returns:
        xq: a random vector
    """
    xq = np.random.rand(1, DIMS)
    xq /= np.linalg.norm(xq, keepdims=True, axis=-1)
    return xq


def submit(meta):
    """ Tune the model w.r.t. the given scores from the user.
    """
    # Only update the meta if the train button is pressed
    st.session_state.meta.extend(meta)
    st.session_state.step += 1
    matches = st.session_state.matched_boxes
    X, y = list(zip(*((v[-1],
                       st.session_state.text_prompts.index(
                           st.session_state[f"label-{i}"])) for i, v in matches.items())))
    st.session_state.xq = tune(st.session_state.clf,
                               X, y, iters=int(st.session_state.iters))
    st.session_state.matches, \
        st.session_state.img_matches, \
        st.session_state.side_matches, \
        st.session_state.o_matches = query(
            st.session_state.xq, st.session_state.meta)


# st.set_page_config(layout="wide")
# To hack the streamlit style we define our own style.
# Boxes are drawn in SVGs.
st.write(style(), unsafe_allow_html=True)

with st.spinner("Connecting DB..."):
    st.session_state.meta, st.session_state.index = init_db()

with st.spinner("Loading Models..."):
    # Initialize model
    model, tokenizer = init_owlvit()

# If it's a fresh start... (query not set)
if 'xq' not in st.session_state:
    with st.container():
        st.title('Object Detection Safari')
        start = [st.empty() for _ in range(8)]
        start[0].info("""
            We extracted boxes from **287,104** images in the COCO Dataset, including its train / val / test /
            unlabeled images, collecting **165,371,904 boxes** which are then filtered with common prompts.
            You can search with almost any words or phrases you can think of. Please enjoy your journey of
            an adventure to COCO.
            """)
        prompt = start[1].text_input(
            "Prompt:", value="", placeholder="Examples: football, billboard, stop sign, watermark ...",)
        with start[2].container():
            st.write(
                'You can search with multiple keywords. Please separate them with commas but with no space.')
            st.write('For example: `cat,dog,tree`')
            st.markdown('''
                <p style="color:gray;"> Don\'t know what to search? Try <b>Random</b>!</p>
                ''',
                unsafe_allow_html=True)

        upld_model = start[4].file_uploader(
            "Or you can upload your previous run!", type='onnx')
        upld_btn = start[5].button(
            "Use Loaded Weights", disabled=upld_model is None, on_click=refresh_index)

        with start[3]:
            col = st.columns(8)
            has_no_prompt = (len(prompt) == 0 and upld_model is None)
            prompt_xq = col[6].button("Prompt", disabled=len(
                prompt) == 0, on_click=refresh_index)
            random_xq = col[7].button(
                "Random", disabled=not has_no_prompt, on_click=refresh_index)
        matches = []
        img_matches = []
        if random_xq:
            xq = init_random_query()
            st.session_state.xq = xq
            prompt = 'unknown'
            st.session_state.text_prompts = prompt.split(',') + ['none']
            _ = [elem.empty() for elem in start]
            t0 = time()
            matches, img_matches, side_matches, o_matches = query(
                st.session_state.xq, st.session_state.meta)
            t1 = time()
            qtime = (t1-t0) * 1000
        elif prompt_xq or upld_btn:
            if upld_model is not None:
                import onnx
                from onnx import numpy_helper
                _model = onnx.load(upld_model)
                st.session_state.text_prompts = [
                    node.name for node in _model.graph.output] + ['none']
                weights = _model.graph.initializer
                xq = numpy_helper.to_array(weights[0]).T
                assert xq.shape[0] == len(
                    st.session_state.text_prompts)-1 and xq.shape[1] == DIMS
                st.session_state.xq = xq
                _ = [elem.empty() for elem in start]
            else:
                logging.info(f"Input prompt is {prompt}")
                st.session_state.text_prompts = prompt.split(',') + ['none']
                input_ids, xq = prompt2vec(
                    st.session_state.text_prompts[:-1], model, tokenizer)
                st.session_state.xq = xq
                _ = [elem.empty() for elem in start]
            t0 = time()
            st.session_state.matches, \
                st.session_state.img_matches, \
                st.session_state.side_matches, \
                st.session_state.o_matches = query(
                    st.session_state.xq, st.session_state.meta)
            t1 = time()
            qtime = (t1-t0) * 1000

# If it's not a fresh start (query is set)
if 'xq' in st.session_state:
    o_matches = st.session_state.o_matches
    side_matches = st.session_state.side_matches
    img_matches = st.session_state.img_matches
    matches = st.session_state.matches
    # initialize classifier
    if 'clf' not in st.session_state:
        st.session_state.clf = Classifier(st.session_state.xq)
        st.session_state.step = 0
    if qtime > 0:
        st.info("Query done in {0:.2f} ms and returned {1:d} images with {2:d} boxes".format(
            qtime, len(matches), sum([len(m["box_id"]) + len(im["box_id"]) for m, im in zip(matches, img_matches)])))

    # export the model into executable ONNX
    st.session_state.dnld_model = BytesIO()
    torch.onnx.export(torch.nn.Sequential(st.session_state.clf.model, SplitLayer()),
                      torch.zeros([1, len(st.session_state.xq[0])]),
                      st.session_state.dnld_model,
                      input_names=['input'],
                      output_names=st.session_state.text_prompts[:-1])

    dnld_nam = st.text_input('Download Name:',
                             f'{("_".join([i.replace(" ", "-") for i in st.session_state.text_prompts[:-1]]) if "text_prompts" in st.session_state else "model")}.onnx',
                             max_chars=50)
    dnld_btn = st.download_button('Download your classifier!',
                                  st.session_state.dnld_model,
                                  dnld_nam)
    # build up a sidebar to display REAL TopK in DB
    # this will change during the user's finetune, but sometimes it can lead to bad results
    side_bar_len = min(240 // len(st.session_state.text_prompts), 120)
    with st.sidebar:
        with st.expander("Top-K Images"):
            with st.container():
                boxes_w_img, _ = postprocess(o_matches, st.session_state.text_prompts,
                                             None)
                boxes_w_img = sorted(
                    boxes_w_img, key=lambda x: x[4], reverse=True)
                for img_id, img_url, img_w, img_h, img_score, boxes in boxes_w_img:
                    args = img_url, img_w, img_h, boxes
                    st.write(card(*args), unsafe_allow_html=True)

        with st.expander("Top-K Objects", expanded=True):
            side_cols = st.columns(
                len(st.session_state.text_prompts[:-1]))
            for _cols, m in zip(side_cols, side_matches):
                with _cols.container():
                    for cx, cy, w, h, logit, img_url, img_w, img_h \
                            in zip(m['cx'], m['cy'], m['w'], m['h'], m['logit'],
                                   m['img_url'], m['img_w'], m['img_h']):
                        st.write("{:s}: {:.4f}".format(
                            st.session_state.text_prompts[m['label']], logit))
                        _html = obj_card(
                            img_url, img_w, img_h, cx, cy, w, h, dst_len=side_bar_len)
                        components.html(
                            _html, side_bar_len, side_bar_len)
    with st.container():
        # Here let the user interact with batch labeling
        with st.form("batch", clear_on_submit=False):
            col = st.columns([1, 9])

            # If there is nothing to show
            if len(matches) <= 0:
                st.warning(
                    'Oops! We didn\'t find anything relevant to your query! Please try another one :/')
            else:
                st.session_state.iters = st.slider(
                    "Number of Iterations to Update", min_value=0, max_value=10, step=1, value=2)
            # No matter what happened the user wants a way back
            col[1].form_submit_button(
                "Choose a new prompt", on_click=refresh_index)

            # If there are things to show
            if len(matches) > 0:
                with st.container():
                    prompt_labels = st.session_state.text_prompts

                    # Post-process boxes according to their score and intersection
                    boxes_w_img, meta = postprocess(matches, st.session_state.text_prompts,
                                                    img_matches)

                    # Sort the result according to their relevancy
                    boxes_w_img = sorted(
                        boxes_w_img, key=lambda x: x[4], reverse=True)

                    st.session_state.matched_boxes = {}
                    # For each image in the retrieved images, DISPLAY
                    for img_id, img_url, img_w, img_h, img_score, boxes in boxes_w_img:

                        # prepare inputs for training
                        st.session_state.matched_boxes.update(
                            {b[0]: b for b in boxes})
                        args = img_url, img_w, img_h, boxes

                        # display boxes
                        with st.expander("{:s}: {:.4f}".format(img_id, img_score), expanded=True):
                            ind_b = 0
                            # 4 columns: (img, obj, obj, obj)
                            img_row = st.columns([4, 2, 2, 2])
                            img_row[0].write(
                                card(*args), unsafe_allow_html=True)
                            # crop objects out of the original image
                            for b in boxes:
                                _id, cx, cy, w, h, label, logit, is_selected, _ = b
                                with img_row[1 + ind_b % 3].container():
                                    st.write(
                                        "{:s}: {:.4f}".format(label, logit))
                                    # quite hacky: with streamlit components API
                                    _html = \
                                        obj_card(img_url, img_w, img_h,
                                                 *b[1:5], dst_len=120)
                                    components.html(_html, 120, 120)
                                    # the user will choose the right label of the given object
                                    st.selectbox(
                                        "Class",
                                        prompt_labels,
                                        index=prompt_labels.index(label),
                                        key=f"label-{_id}")
                                ind_b += 1
            col[0].form_submit_button(
                "Train!", on_click=lambda: submit(meta))
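
For reference, a short sketch of how a classifier exported by the app can be inspected offline; it simply mirrors the onnx.load branch in the upload path above. The file name is a placeholder, and this snippet is illustrative only, not part of the commit.

# Illustrative sketch, not part of the commit: read back an exported classifier.
import onnx
from onnx import numpy_helper

_model = onnx.load("my_classifier.onnx")                       # placeholder file name
prompts = [node.name for node in _model.graph.output]           # one output per prompt
weights = numpy_helper.to_array(_model.graph.initializer[0]).T  # (num_prompts, DIMS) query vectors
print(prompts, weights.shape)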
box_utils.py ADDED
@@ -0,0 +1,133 @@
import numpy as np


def cxywh2xywh(cx, cy, w, h):
    """ CxCyWH format to XYWH format conversion
    """
    x = cx - w / 2
    y = cy - h / 2
    return x, y, w, h


def cxywh2ltrb(cx, cy, w, h):
    """CxCyWH format to LeftTopRightBottom format
    """
    l = cx - w / 2
    t = cy - h / 2
    r = cx + w / 2
    b = cy + h / 2
    return l, t, r, b


def iou(ba, bb):
    """Calculate Intersection-Over-Union

    Args:
        ba (tuple): LTRB format box with its area
        bb (tuple): LTRB format box with its area

    Returns:
        IoU with the same length as the given boxes
    """
    a_l, a_t, a_r, a_b, sa = ba
    b_l, b_t, b_r, b_b, sb = bb

    x1 = np.maximum(a_l, b_l)
    y1 = np.maximum(a_t, b_t)
    x2 = np.minimum(a_r, b_r)
    y2 = np.minimum(a_b, b_b)
    w = np.maximum(0, x2 - x1)
    h = np.maximum(0, y2 - y1)
    intersec = w * h
    iou = (intersec) / (sa + sb - intersec)
    return iou.squeeze()


def nms(cx, cy, w, h, s, iou_thresh=0.3):
    """Bounding box Non-maximum Suppression

    Args:
        cx, cy, w, h, s: CxCyWH format boxes with scores
        iou_thresh (float, optional): IoU threshold. Defaults to 0.3.

    Returns:
        res: indexes of the selected boxes
    """
    l, t, r, b = cxywh2ltrb(cx, cy, w, h)
    areas = w * h
    res = []
    sort_ind = np.argsort(s, axis=-1)[::-1]
    while sort_ind.shape[0] > 0:
        i = sort_ind[0]
        res.append(i)

        _iou = iou((l[i], t[i], r[i], b[i], areas[i]),
                   (l[sort_ind[1:]], t[sort_ind[1:]],
                    r[sort_ind[1:]], b[sort_ind[1:]], areas[sort_ind[1:]]))
        sel_ind = np.where(_iou <= iou_thresh)[0]
        sort_ind = sort_ind[sel_ind + 1]
    return res


def filter_nonpos(boxes, agnostic_ratio=0.5, class_ratio=0.7):
    """filter out insignificant boxes

    Args:
        boxes (list of records): returned query to be filtered
    """
    ret = []
    labelwise = {}
    for _id, cx, cy, w, h, label, logit, is_selected, _ in boxes:
        if label not in labelwise:
            labelwise[label] = []
        labelwise[label].append(logit)
    labelwise = {l: max(s) for l, s in labelwise.items()}
    agnostic = max([v for _, v in labelwise.items()])
    for b in boxes:
        _id, cx, cy, w, h, label, logit, is_selected, _ = b
        if logit > class_ratio * labelwise[label] \
                and logit > agnostic_ratio * agnostic:
            ret.append(b)
    return ret


def postprocess(matches, prompt_labels, img_matches=None):
    meta = []
    boxes_w_img = []
    matches_ = {m['img_id']: m for m in matches}
    if img_matches is not None:
        img_matches_ = {m['img_id']: m for m in img_matches}
    for k in matches_.keys():
        m = matches_[k]
        boxes = []
        boxes += list(map(list, zip(m['box_id'], m['cx'], m['cy'], m['w'], m['h'],
                                    [prompt_labels[int(l)]
                                     for l in m['label']],
                                    m['logit'], [1] *
                                    len(m['box_id']),
                                    list(np.array(m['cls_emb'])))))
        if img_matches is not None:
            img_m = img_matches_[k]
            # also include the non-TopK hits; these boxes do not participate in training
            boxes += [i for i in map(list, zip(img_m['box_id'], img_m['cx'], img_m['cy'], img_m['w'], img_m['h'],
                                               [prompt_labels[int(
                                                   l)] for l in img_m['label']], img_m['logit'],
                                               [0] * len(img_m['box_id']), list(np.array(img_m['cls_emb']))))
                      if i[0] not in [b[0] for b in boxes]]
        # update record metadata after query
        for b in boxes:
            meta.append(b[0])

        # remove some non-significant boxes
        boxes = filter_nonpos(
            boxes, agnostic_ratio=0.4, class_ratio=0.7)

        # doing non-maximum suppression
        cx, cy, w, h, s = list(map(lambda x: np.array(x),
                                   list(zip(*[(*b[1:5], b[6]) for b in boxes]))))
        ind = nms(cx, cy, w, h, s, 0.3)
        boxes = [boxes[i] for i in ind]
        img_score = img_m['img_score'] if img_matches is not None else m['img_score']
        boxes_w_img.append(
            (m["img_id"], m["img_url"], m["img_w"], m["img_h"], img_score, boxes))
    return boxes_w_img, meta
card_model.py ADDED
@@ -0,0 +1,94 @@
import base64
from box_utils import cxywh2ltrb, cxywh2xywh


def style():
    """ Style string for card models
    """
    return """
<link
  rel="stylesheet"
  href="https://fonts.googleapis.com/css?family=Roboto:300,400,500,700&display=swap"
/>
<style>
    .img-overlay-wrap {
        position: relative;
        display: inline-block;
    }
    .img-overlay-wrap {
        position: relative;
        display: inline-block;
        /* <= shrinks container to image size */
        transition: transform 150ms ease-in-out;
    }
    .img-overlay-wrap img {
        /* <= optional, for responsiveness */
        display: block;
        max-width: 100%;
        height: auto;
    }
    .img-overlay-wrap svg {
        position: absolute;
        top: 0;
        left: 0;
    }
</style>
"""


def card(img_url, img_w, img_h, boxes):
    """ This is a hack to streamlit
    Solution thanks to: https://discuss.streamlit.io/t/display-svg/172/5
    Converting SVG to Base64 and display with <img> tag.
    Also we used the
    """
    _boxes = ""
    for _id, cx, cy, w, h, label, logit, is_selected, _ in boxes:
        x, y, w, h = cxywh2xywh(cx, cy, w, h)
        x = round(img_w * x)
        y = round(img_h * y)
        w = round(img_w * w)
        h = round(img_h * h)
        logit = "%.3f" % logit
        _boxes += f'''
            <text fill="white" font-size="20" x="{x}" y="{y}" style="fill:white;opacity:0.7">{label}: {logit}</text>
            <rect x="{x}" y="{y}" width="{w}" height="{h}" style="fill:none;stroke:{"red" if is_selected else "green"};
            stroke-width:4;opacity:0.5" />
        '''
    _svg = f'''
    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 {img_w} {img_h}">
    {_boxes}
    </svg>
    '''
    _svg = r'<img style="position:absolute;top:0;left:0;" src="data:image/svg+xml;base64,%s"/>' % \
        base64.b64encode(_svg.encode('utf-8')).decode('utf-8')
    _img_d = f'''
    <div class="img-overlay-wrap" width="{img_w}" height="{img_h}">
        <img width="{img_w}" height="{img_h}" src="{img_url}">
        {_svg}
    </div>
    '''
    return _img_d


def obj_card(img_url, img_w, img_h, cx, cy, w, h, *args, dst_len=100):
    """object card for displaying cropped object

    Args:
        Retrieved image and object info

    Returns:
        _obj_html: html string to display object
    """
    w = img_w * w
    h = img_h * h
    s = max(w, h)
    x = round(img_w * cx - s / 2)
    y = round(img_h * cy - s / 2)
    scale = dst_len / s
    _obj_html = f'''
    <div style="transform-origin:0 0;transform:scale({scale});">
        <img src="{img_url}" style="margin:{-y}px 0px 0px {-x}px;">
    </div>
    '''
    return _obj_html
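
A quick sketch of how these helpers are consumed (the app does the same thing): card returns an HTML string that Streamlit renders with unsafe_allow_html, while obj_card is intended for components.html. The box tuple layout follows the postprocess output; the URL and image dimensions below are placeholders, and the snippet is illustrative only.

# Illustrative rendering sketch, not part of the commit.
import streamlit as st
import streamlit.components.v1 as components
from card_model import style, card, obj_card

st.write(style(), unsafe_allow_html=True)

# (box_id, cx, cy, w, h, label, logit, is_selected, cls_emb), coordinates normalized to [0, 1]
boxes = [("box-0", 0.5, 0.5, 0.4, 0.3, "dog", 0.87, 1, None)]
img_url, img_w, img_h = "http://images.cocodataset.org/val2017/000000000009.jpg", 640, 480

st.write(card(img_url, img_w, img_h, boxes), unsafe_allow_html=True)
components.html(obj_card(img_url, img_w, img_h, 0.5, 0.5, 0.4, 0.3, dst_len=120), 120, 120)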
classifier.py ADDED
@@ -0,0 +1,121 @@
import torch


def extract_text_feature(prompt, model, processor, device='cpu'):
    """Extract text features

    Args:
        prompt: a single text query
        model: OwlViT model
        processor: OwlViT processor
        device (str, optional): device to run. Defaults to 'cpu'.
    """
    device = 'cpu'
    if torch.cuda.is_available():
        device = 'cuda'
    with torch.no_grad():
        input_ids = torch.as_tensor(processor(text=prompt)[
            'input_ids']).to(device)
        print(input_ids.device)
        text_outputs = model.owlvit.text_model(
            input_ids=input_ids,
            attention_mask=None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None,
        )
        text_embeds = text_outputs[1]
        text_embeds = model.owlvit.text_projection(text_embeds)
        text_embeds /= text_embeds.norm(p=2, dim=-1, keepdim=True) + 1e-6
        query_embeds = text_embeds
    return input_ids, query_embeds


def prompt2vec(prompt: str, model, processor):
    """ Convert a prompt into a computational vector

    Args:
        prompt (str): Text to be tokenized

    Returns:
        xq: vector from the tokenizer, representing the original prompt
    """
    # inputs = tokenizer(prompt, return_tensors='pt')
    # out = clip.get_text_features(**inputs)
    input_ids, xq = extract_text_feature(prompt, model, processor)
    input_ids = input_ids.detach().cpu().numpy()
    xq = xq.detach().cpu().numpy()
    return input_ids, xq


def tune(clf, X, y, iters=2):
    """ Train the Zero-shot Classifier

    Args:
        X (numpy.ndarray): Input vectors (retrieved vectors)
        y (list of floats or numpy.ndarray): Scores given by the user
        iters (int, optional): iterations of updates to be run
    """
    assert len(X) == len(y)
    # train the classifier
    clf.fit(X, y, iters=iters)
    # extract new vector
    return clf.get_weights()


class Classifier:
    """Multi-Class Zero-shot Classifier
    This Classifier acts as a proxy for the user's reaction to the probed images.
    The proxy replaces the original query vector generated from the prompt and finally
    gives the user a satisfying retrieval result.

    This is commonly seen in recommendation systems: the classifier recommends more
    precise results as it accumulates the user's activity.

    This is a multiclass classifier. For N queries it assigns the queries to the first N classes
    and the last one takes the negative.
    """

    def __init__(self, xq: list):
        init_weight = torch.Tensor(xq)
        self.num_class = xq.shape[0]
        DIMS = xq.shape[1]
        # note that the bias is ignored, as we only focus on the inner product result
        self.model = torch.nn.Linear(DIMS, self.num_class, bias=False)
        # convert initial query `xq` to tensor parameter to init weights
        self.model.weight = torch.nn.Parameter(init_weight)
        # init loss and optimizer
        self.loss = torch.nn.BCEWithLogitsLoss()
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=0.1)

    def fit(self, X: list, y: list, iters: int = 5):
        # convert X and y to tensor
        X = torch.Tensor(X)
        X /= torch.norm(X, p=2, dim=-1, keepdim=True)
        y = torch.Tensor(y).long()
        # Generate labels for binary classification and ignore out-of-range labels
        non_ind = y >= self.num_class
        y = torch.nn.functional.one_hot(y % self.num_class, num_classes=self.num_class).float()
        y[non_ind] = 0
        for i in range(iters):
            # zero gradients
            self.optimizer.zero_grad()
            # Normalize the weight before inference
            # This will constrain the gradient or you will have an explosion on the query vector
            self.model.weight.data /= torch.norm(self.model.weight.data, p=2, dim=-1, keepdim=True)
            # forward pass
            out = self.model(X)
            # compute loss
            loss = self.loss(out, y)
            # backward pass
            loss.backward()
            # update weights
            self.optimizer.step()

    def get_weights(self):
        xq = self.model.weight.detach().numpy()
        return xq


class SplitLayer(torch.nn.Module):
    def forward(self, x):
        return torch.split(x, 1, dim=-1)
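
A quick sketch of the fit loop with random data (illustrative only, not in this commit). It assumes two prompts, i.e. two query vectors of dimension 512, and labels where index 2 plays the 'none' class and therefore gets an all-zero target.

# Illustrative training sketch, not part of the commit.
import numpy as np
from classifier import Classifier, tune

DIMS = 512
xq = np.random.rand(2, DIMS).astype("float32")   # one query vector per prompt
X = np.random.rand(6, DIMS).astype("float32")    # embeddings of user-labeled boxes
y = [0, 0, 1, 1, 2, 2]                           # 2 == 'none' -> all-zero target

new_xq = tune(Classifier(xq), X, y, iters=2)
print(new_xq.shape)  # (2, 512): updated query vectors, one per prompt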
query_model.py ADDED
@@ -0,0 +1,108 @@
import logging


def topk_obj_query(client, xq, IMG_DB_NAME, OBJ_DB_NAME,
                   exclude_list=[], topk=10):
    xq_s = [
        f"[{', '.join([str(float(fnum)) for fnum in _xq.tolist() + [1]])}]" for _xq in xq]
    exclude_list_str = ','.join([f'\'{i}\'' for i in exclude_list])
    _cond = (f"WHERE obj_id NOT IN ({exclude_list_str})" if len(
        exclude_list) > 0 else "")
    _subq_str = []
    _img_score_subq = []
    for _l, _xq in enumerate(xq_s):
        _img_score_subq.append(
            f"arrayReduce('maxIf', logit, arrayMap(x->x={_l}, label))")
        _subq_str.append(f"""
        SELECT img_id, img_url, img_w, img_h, 1/(1+exp(-arraySum(arrayMap((x,y)->x*y, prelogit, {_xq})))) AS pred_logit,
            obj_id, box_cx, box_cy, box_w, box_h, class_embedding, {_l} AS l
        FROM {OBJ_DB_NAME}
        JOIN {IMG_DB_NAME}
        ON {IMG_DB_NAME}.img_id = {OBJ_DB_NAME}.img_id
        PREWHERE obj_id IN (
            SELECT obj_id FROM (
                SELECT obj_id, distance('topK={topk}', 'nprobe=32')(prelogit, {_xq}) AS dist FROM {OBJ_DB_NAME}
                ORDER BY dist DESC
            ) {_cond} LIMIT 10
        )
        """)
    _subq_str = ' UNION ALL '.join(_subq_str)
    _img_score_q = ','.join(_img_score_subq)
    _img_score_q = f"arraySum(arrayFilter(x->NOT isNaN(x), array({_img_score_q}))) AS img_score"
    q_str = f"""
    SELECT img_id, img_url, img_w, img_h, groupArray(obj_id) AS box_id,
        groupArray(box_cx) AS cx, groupArray(box_cy) AS cy, groupArray(box_w) AS w, groupArray(box_h) AS h,
        groupArray(pred_logit) AS logit, groupArray(l) as label, groupArray(class_embedding) AS cls_emb,
        {_img_score_q}
    FROM
    ({_subq_str})
    GROUP BY img_id, img_url, img_w, img_h ORDER BY img_score DESC
    """
    xc = client.fetch(q_str)
    return xc


def rev_query(client, xq, img_ids, IMG_DB_NAME, OBJ_DB_NAME, thresh=0.08):
    xq_s = [
        f"[{', '.join([str(float(fnum)) for fnum in _xq.tolist() + [1]])}]" for _xq in xq]
    image_list = ','.join([f'\'{i}\'' for i in img_ids])
    _thresh = f"WHERE pred_logit > {thresh}" if thresh > 0 else ""
    _subq_str = []
    _img_score_subq = []
    for _l, _xq in enumerate(xq_s):
        _img_score_subq.append(
            f"arrayReduce('maxIf', logit, arrayMap(x->x={_l}, label))")
        _subq_str.append(f"""
        SELECT {OBJ_DB_NAME}.img_id AS img_id, img_url, img_w, img_h,
            (1 / (1 + exp(-(arraySum(arrayMap((x,y)->x*y, prelogit, {_xq})))))) AS pred_logit,
            obj_id, box_cx, box_cy, box_w, box_h, class_embedding, {_l} AS l
        FROM {OBJ_DB_NAME}
        JOIN {IMG_DB_NAME}
        ON {IMG_DB_NAME}.img_id = {OBJ_DB_NAME}.img_id
        PREWHERE img_id IN ({image_list})
        {_thresh}
        """)
    _subq_str = ' UNION ALL '.join(_subq_str)
    _img_score_q = ','.join(_img_score_subq)
    _img_score_q = f"arraySum(arrayFilter(x->NOT isNaN(x), array({_img_score_q}))) AS img_score"
    q_str = f"""
    SELECT img_id, groupArray(obj_id) AS box_id, img_url, img_w, img_h,
        groupArray(box_cx) AS cx, groupArray(box_cy) AS cy, groupArray(box_w) AS w, groupArray(box_h) AS h,
        groupArray(pred_logit) AS logit, groupArray(l) as label, groupArray(class_embedding) AS cls_emb,
        {_img_score_q}
    FROM
    ({_subq_str})
    GROUP BY img_id, img_url, img_w, img_h ORDER BY img_score DESC
    """
    xc = client.fetch(q_str)
    return xc


def simple_query(client, xq, IMG_DB_NAME, OBJ_DB_NAME, thresh=0.08, topk=10):
    xq_s = [
        f"[{', '.join([str(float(fnum)) for fnum in _xq.tolist() + [1]])}]" for _xq in xq]
    res = []
    subq_str = []
    _thresh = f"WHERE pred_logit > {thresh}" if thresh > 0 else ""
    for _l, _xq in enumerate(xq_s):
        subq_str.append(
            f"""
            SELECT {OBJ_DB_NAME}.img_id AS img_id, img_url, img_w, img_h, prelogit,
                obj_id, box_cx, box_cy, box_w, box_h, {_l} AS l, distance('topK={topk}', 'nprobe=32')(prelogit, {_xq}) AS dist
            FROM {OBJ_DB_NAME}
            JOIN {IMG_DB_NAME}
            ON {IMG_DB_NAME}.img_id = {OBJ_DB_NAME}.img_id
            {_thresh} LIMIT 10
            """)
    subq_str = " UNION ALL ".join(subq_str)
    q_str = f"""
    SELECT groupArray(img_url) AS img_url, groupArray(img_w) AS img_w, groupArray(img_h) AS img_h,
        groupArray(box_cx) AS cx, groupArray(box_cy) AS cy, groupArray(box_w) AS w, groupArray(box_h) AS h,
        l AS label, groupArray(dist) as d,
        groupArray(1 / (1 + exp(-dist))) AS logit FROM (
        {subq_str}
    )
    GROUP BY l
    """
    res = client.fetch(q_str)
    return res
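
These helpers only build MyScale SQL strings and run them through client.fetch, so a call-site sketch looks like the one in app.py. The snippet below is illustrative only, not part of the commit: the connection URL and credentials are placeholders, and xq is assumed to be a matrix of L2-normalized row vectors, one per prompt.

# Illustrative call-site sketch, not part of the commit; mirrors app.py.
import numpy as np
from myscaledb import Client
from query_model import topk_obj_query, rev_query, simple_query

client = Client(url="https://<host>:<port>", user="<user>", password="<password>")  # placeholders

xq = np.random.rand(2, 512)
xq /= np.linalg.norm(xq, axis=-1, keepdims=True)

IMG_DB_NAME = "mqdb_demo.coco_owl_vit_b_32_images"
OBJ_DB_NAME = "mqdb_demo.coco_owl_vit_b_32_objects"

matches = topk_obj_query(client, xq, IMG_DB_NAME, OBJ_DB_NAME, exclude_list=[], topk=100)
img_ids = [r['img_id'] for r in matches]
others = rev_query(client, xq, img_ids, IMG_DB_NAME, OBJ_DB_NAME, thresh=0.1)
top_objects = simple_query(client, xq, IMG_DB_NAME, OBJ_DB_NAME, thresh=-1, topk=100)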