Spaces:
Runtime error
Runtime error
MaureenZOU
committed on
Commit
·
e972e1f
1
Parent(s):
a74dbcb
init
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +85 -0
- .gitignore +103 -0
- README.md +5 -4
- __init__.py +0 -0
- app.py +120 -0
- configs/xdecoder/svlp_focalt_lang.yaml +110 -0
- images/apples.jpg +0 -0
- images/coco/000.jpg +3 -0
- images/coco/001.jpg +3 -0
- images/coco/002.jpg +3 -0
- images/coco/003.jpg +3 -0
- images/coco/004.jpg +3 -0
- images/coco/005.jpg +3 -0
- images/coco/006.jpg +3 -0
- images/coco/007.jpg +3 -0
- images/coco/008.jpg +3 -0
- images/coco/009.jpg +3 -0
- images/coco/010.jpg +3 -0
- images/coco/011.jpg +3 -0
- images/coco/012.jpg +3 -0
- images/coco/013.jpg +3 -0
- images/coco/014.jpg +3 -0
- images/coco/015.jpg +3 -0
- images/coco/016.jpg +3 -0
- images/coco/017.jpg +3 -0
- images/coco/018.jpg +3 -0
- images/coco/019.jpg +3 -0
- images/coco/020.jpg +3 -0
- images/coco/021.jpg +3 -0
- images/coco/022.jpg +3 -0
- images/coco/023.jpg +3 -0
- images/coco/024.jpg +3 -0
- images/coco/025.jpg +3 -0
- images/coco/026.jpg +3 -0
- images/coco/027.jpg +3 -0
- images/coco/028.jpg +3 -0
- images/coco/029.jpg +3 -0
- images/coco/030.jpg +3 -0
- images/coco/031.jpg +3 -0
- images/coco/032.jpg +3 -0
- images/coco/033.jpg +3 -0
- images/coco/034.jpg +3 -0
- images/coco/035.jpg +3 -0
- images/coco/036.jpg +3 -0
- images/coco/037.jpg +3 -0
- images/coco/038.jpg +3 -0
- images/coco/039.jpg +3 -0
- images/coco/040.jpg +3 -0
- images/coco/041.jpg +3 -0
- images/coco/042.jpg +3 -0
.gitattributes
CHANGED
@@ -32,3 +32,88 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
35 |
+
images/animals.png filter=lfs diff=lfs merge=lfs -text
|
36 |
+
images/region_retrieval.png filter=lfs diff=lfs merge=lfs -text
|
37 |
+
xdecoder_focalt_last_novg.pt filter=lfs diff=lfs merge=lfs -text
|
38 |
+
xdecoder_focalt_last.pt filter=lfs diff=lfs merge=lfs -text
|
39 |
+
v_emb.da filter=lfs diff=lfs merge=lfs -text
|
40 |
+
images/coco/077.jpg filter=lfs diff=lfs merge=lfs -text
|
41 |
+
images/coco/071.jpg filter=lfs diff=lfs merge=lfs -text
|
42 |
+
images/coco/022.jpg filter=lfs diff=lfs merge=lfs -text
|
43 |
+
images/coco/026.jpg filter=lfs diff=lfs merge=lfs -text
|
44 |
+
images/coco/036.jpg filter=lfs diff=lfs merge=lfs -text
|
45 |
+
images/coco/039.jpg filter=lfs diff=lfs merge=lfs -text
|
46 |
+
images/coco/052.jpg filter=lfs diff=lfs merge=lfs -text
|
47 |
+
images/coco/057.jpg filter=lfs diff=lfs merge=lfs -text
|
48 |
+
images/coco/061.jpg filter=lfs diff=lfs merge=lfs -text
|
49 |
+
images/coco/017.jpg filter=lfs diff=lfs merge=lfs -text
|
50 |
+
images/coco/021.jpg filter=lfs diff=lfs merge=lfs -text
|
51 |
+
images/coco/030.jpg filter=lfs diff=lfs merge=lfs -text
|
52 |
+
images/coco/056.jpg filter=lfs diff=lfs merge=lfs -text
|
53 |
+
images/coco/064.jpg filter=lfs diff=lfs merge=lfs -text
|
54 |
+
images/coco/072.jpg filter=lfs diff=lfs merge=lfs -text
|
55 |
+
images/coco/014.jpg filter=lfs diff=lfs merge=lfs -text
|
56 |
+
images/coco/025.jpg filter=lfs diff=lfs merge=lfs -text
|
57 |
+
images/coco/027.jpg filter=lfs diff=lfs merge=lfs -text
|
58 |
+
images/coco/038.jpg filter=lfs diff=lfs merge=lfs -text
|
59 |
+
images/coco/044.jpg filter=lfs diff=lfs merge=lfs -text
|
60 |
+
images/coco/049.jpg filter=lfs diff=lfs merge=lfs -text
|
61 |
+
images/coco/053.jpg filter=lfs diff=lfs merge=lfs -text
|
62 |
+
images/coco/078.jpg filter=lfs diff=lfs merge=lfs -text
|
63 |
+
images/coco/002.jpg filter=lfs diff=lfs merge=lfs -text
|
64 |
+
images/coco/005.jpg filter=lfs diff=lfs merge=lfs -text
|
65 |
+
images/coco/007.jpg filter=lfs diff=lfs merge=lfs -text
|
66 |
+
images/coco/008.jpg filter=lfs diff=lfs merge=lfs -text
|
67 |
+
images/coco/011.jpg filter=lfs diff=lfs merge=lfs -text
|
68 |
+
images/coco/013.jpg filter=lfs diff=lfs merge=lfs -text
|
69 |
+
images/coco/020.jpg filter=lfs diff=lfs merge=lfs -text
|
70 |
+
images/coco/034.jpg filter=lfs diff=lfs merge=lfs -text
|
71 |
+
images/coco/000.jpg filter=lfs diff=lfs merge=lfs -text
|
72 |
+
images/coco/066.jpg filter=lfs diff=lfs merge=lfs -text
|
73 |
+
images/coco/074.jpg filter=lfs diff=lfs merge=lfs -text
|
74 |
+
images/coco/065.jpg filter=lfs diff=lfs merge=lfs -text
|
75 |
+
images/coco/023.jpg filter=lfs diff=lfs merge=lfs -text
|
76 |
+
images/coco/024.jpg filter=lfs diff=lfs merge=lfs -text
|
77 |
+
images/coco/033.jpg filter=lfs diff=lfs merge=lfs -text
|
78 |
+
images/coco/040.jpg filter=lfs diff=lfs merge=lfs -text
|
79 |
+
images/coco/041.jpg filter=lfs diff=lfs merge=lfs -text
|
80 |
+
images/coco/046.jpg filter=lfs diff=lfs merge=lfs -text
|
81 |
+
images/coco/060.jpg filter=lfs diff=lfs merge=lfs -text
|
82 |
+
images/coco/003.jpg filter=lfs diff=lfs merge=lfs -text
|
83 |
+
images/coco/058.jpg filter=lfs diff=lfs merge=lfs -text
|
84 |
+
images/coco/073.jpg filter=lfs diff=lfs merge=lfs -text
|
85 |
+
images/coco/042.jpg filter=lfs diff=lfs merge=lfs -text
|
86 |
+
images/coco/015.jpg filter=lfs diff=lfs merge=lfs -text
|
87 |
+
images/coco/016.jpg filter=lfs diff=lfs merge=lfs -text
|
88 |
+
images/coco/018.jpg filter=lfs diff=lfs merge=lfs -text
|
89 |
+
images/coco/051.jpg filter=lfs diff=lfs merge=lfs -text
|
90 |
+
images/coco/054.jpg filter=lfs diff=lfs merge=lfs -text
|
91 |
+
images/coco/063.jpg filter=lfs diff=lfs merge=lfs -text
|
92 |
+
images/coco/010.jpg filter=lfs diff=lfs merge=lfs -text
|
93 |
+
images/coco/050.jpg filter=lfs diff=lfs merge=lfs -text
|
94 |
+
images/coco/070.jpg filter=lfs diff=lfs merge=lfs -text
|
95 |
+
images/coco/037.jpg filter=lfs diff=lfs merge=lfs -text
|
96 |
+
images/coco/031.jpg filter=lfs diff=lfs merge=lfs -text
|
97 |
+
images/coco/062.jpg filter=lfs diff=lfs merge=lfs -text
|
98 |
+
images/coco/067.jpg filter=lfs diff=lfs merge=lfs -text
|
99 |
+
images/coco/069.jpg filter=lfs diff=lfs merge=lfs -text
|
100 |
+
images/coco/029.jpg filter=lfs diff=lfs merge=lfs -text
|
101 |
+
images/coco/012.jpg filter=lfs diff=lfs merge=lfs -text
|
102 |
+
images/coco/068.jpg filter=lfs diff=lfs merge=lfs -text
|
103 |
+
images/coco/075.jpg filter=lfs diff=lfs merge=lfs -text
|
104 |
+
images/coco/006.jpg filter=lfs diff=lfs merge=lfs -text
|
105 |
+
images/coco/035.jpg filter=lfs diff=lfs merge=lfs -text
|
106 |
+
images/coco/001.jpg filter=lfs diff=lfs merge=lfs -text
|
107 |
+
images/coco/055.jpg filter=lfs diff=lfs merge=lfs -text
|
108 |
+
images/coco/048.jpg filter=lfs diff=lfs merge=lfs -text
|
109 |
+
images/coco/019.jpg filter=lfs diff=lfs merge=lfs -text
|
110 |
+
images/coco/028.jpg filter=lfs diff=lfs merge=lfs -text
|
111 |
+
images/coco/043.jpg filter=lfs diff=lfs merge=lfs -text
|
112 |
+
images/coco/059.jpg filter=lfs diff=lfs merge=lfs -text
|
113 |
+
images/coco/079.jpg filter=lfs diff=lfs merge=lfs -text
|
114 |
+
images/coco/004.jpg filter=lfs diff=lfs merge=lfs -text
|
115 |
+
images/coco/032.jpg filter=lfs diff=lfs merge=lfs -text
|
116 |
+
images/coco/045.jpg filter=lfs diff=lfs merge=lfs -text
|
117 |
+
images/coco/047.jpg filter=lfs diff=lfs merge=lfs -text
|
118 |
+
images/coco/076.jpg filter=lfs diff=lfs merge=lfs -text
|
119 |
+
images/coco/009.jpg filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# IntelliJ project files
|
2 |
+
.idea
|
3 |
+
*.iml
|
4 |
+
out
|
5 |
+
gen
|
6 |
+
|
7 |
+
### Vim template
|
8 |
+
[._]*.s[a-w][a-z]
|
9 |
+
[._]s[a-w][a-z]
|
10 |
+
*.un~
|
11 |
+
Session.vim
|
12 |
+
.netrwhist
|
13 |
+
*~
|
14 |
+
|
15 |
+
### IPythonNotebook template
|
16 |
+
# Temporary data
|
17 |
+
.ipynb_checkpoints/
|
18 |
+
|
19 |
+
### Python template
|
20 |
+
# Byte-compiled / optimized / DLL files
|
21 |
+
__pycache__/
|
22 |
+
*.py[cod]
|
23 |
+
*$py.class
|
24 |
+
|
25 |
+
# C extensions
|
26 |
+
*.so
|
27 |
+
|
28 |
+
# Distribution / packaging
|
29 |
+
.Python
|
30 |
+
env/
|
31 |
+
build/
|
32 |
+
develop-eggs/
|
33 |
+
dist/
|
34 |
+
downloads/
|
35 |
+
eggs/
|
36 |
+
.eggs/
|
37 |
+
#lib/
|
38 |
+
#lib64/
|
39 |
+
parts/
|
40 |
+
sdist/
|
41 |
+
var/
|
42 |
+
*.egg-info/
|
43 |
+
.installed.cfg
|
44 |
+
*.egg
|
45 |
+
|
46 |
+
# PyInstaller
|
47 |
+
# Usually these files are written by a python script from a template
|
48 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
49 |
+
*.manifest
|
50 |
+
*.spec
|
51 |
+
|
52 |
+
# Installer logs
|
53 |
+
pip-log.txt
|
54 |
+
pip-delete-this-directory.txt
|
55 |
+
|
56 |
+
# Unit test / coverage reports
|
57 |
+
htmlcov/
|
58 |
+
.tox/
|
59 |
+
.coverage
|
60 |
+
.coverage.*
|
61 |
+
.cache
|
62 |
+
nosetests.xml
|
63 |
+
coverage.xml
|
64 |
+
*,cover
|
65 |
+
|
66 |
+
# Translations
|
67 |
+
*.mo
|
68 |
+
*.pot
|
69 |
+
|
70 |
+
# Django stuff:
|
71 |
+
*.log
|
72 |
+
|
73 |
+
# Sphinx documentation
|
74 |
+
docs/_build/
|
75 |
+
|
76 |
+
# PyBuilder
|
77 |
+
target/
|
78 |
+
|
79 |
+
*.ipynb
|
80 |
+
*.params
|
81 |
+
# *.json
|
82 |
+
.vscode/
|
83 |
+
*.code-workspace/
|
84 |
+
|
85 |
+
lib/pycocotools/_mask.c
|
86 |
+
lib/nms/cpu_nms.c
|
87 |
+
|
88 |
+
OUTPUT
|
89 |
+
OUTPUT/*
|
90 |
+
models/*
|
91 |
+
DATASET
|
92 |
+
DATASET/*
|
93 |
+
external/
|
94 |
+
MODELS
|
95 |
+
MODELS/*
|
96 |
+
gradio_cached_examples/*
|
97 |
+
|
98 |
+
kill.sh
|
99 |
+
|
100 |
+
draws/
|
101 |
+
plot/
|
102 |
+
|
103 |
+
*venv/*
|
README.md
CHANGED
@@ -1,12 +1,13 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
emoji:
|
4 |
colorFrom: purple
|
5 |
-
colorTo:
|
6 |
sdk: gradio
|
7 |
-
sdk_version: 3.
|
8 |
app_file: app.py
|
9 |
pinned: false
|
|
|
10 |
---
|
11 |
|
12 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
+
title: X Decoder
|
3 |
+
emoji: 📈
|
4 |
colorFrom: purple
|
5 |
+
colorTo: gray
|
6 |
sdk: gradio
|
7 |
+
sdk_version: 3.14.0
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
+
license: afl-3.0
|
11 |
---
|
12 |
|
13 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
__init__.py
ADDED
File without changes
|
app.py
ADDED
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# --------------------------------------------------------
|
2 |
+
# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
|
3 |
+
# Copyright (c) 2022 Microsoft
|
4 |
+
# Licensed under The MIT License [see LICENSE for details]
|
5 |
+
# Written by Xueyan Zou ([email protected]), Jianwei Yang ([email protected])
|
6 |
+
# --------------------------------------------------------
|
7 |
+
|
8 |
+
import os
import subprocess
import sys

# Install the detectron2 fork at startup, before the project imports below run.
# Invoke pip through the running interpreter (sys.executable) so the package is
# installed into this environment rather than whichever "python" is first on
# PATH, and pass an argument list so no shell parsing is involved.
_pip_result = subprocess.run(
    [
        sys.executable, "-m", "pip", "install",
        "git+https://github.com/MaureenZOU/detectron2-xyz.git",
    ],
    check=False,  # best-effort, as before: do not abort start-up here
)
if _pip_result.returncode != 0:
    # Surface the failure instead of ignoring the exit status.
    print(
        "WARNING: pip install of detectron2-xyz exited with status {}".format(
            _pip_result.returncode
        ),
        file=sys.stderr,
    )
|
10 |
+
|
11 |
+
import gradio as gr
|
12 |
+
import torch
|
13 |
+
import argparse
|
14 |
+
|
15 |
+
from xdecoder.BaseModel import BaseModel
|
16 |
+
from xdecoder import build_model
|
17 |
+
from utils.distributed import init_distributed
|
18 |
+
from utils.arguments import load_opt_from_config_files
|
19 |
+
|
20 |
+
from tasks import *
|
21 |
+
|
22 |
+
def parse_option():
    """Parse the demo's command-line options.

    Returns:
        argparse.Namespace whose ``conf_files`` attribute holds the path to
        the config file (``configs/xdecoder/svlp_focalt_lang.yaml`` unless
        ``--conf_files`` is given).
    """
    cli = argparse.ArgumentParser('X-Decoder All-in-One Demo', add_help=False)
    cli.add_argument(
        '--conf_files',
        default="configs/xdecoder/svlp_focalt_lang.yaml",
        metavar="FILE",
        help='path to config file',
    )
    return cli.parse_args()
|
28 |
+
|
29 |
+
# ---------------------------------------------------------------------------
# build args
# ---------------------------------------------------------------------------
# Parse the CLI, load the options from the config file, then pass them through
# init_distributed, which returns the options object used from here on.
args = parse_option()
opt = load_opt_from_config_files(args.conf_files)
opt = init_distributed(opt)

# META DATA
# Checkpoint files are expected in the current working directory.
pretrained_pth_last = "xdecoder_focalt_last.pt"
pretrained_pth_novg = "xdecoder_focalt_last_novg.pt"

_WEIGHTS_URL = "https://projects4jw.blob.core.windows.net/x-decoder/release/{}"

for _ckpt in (pretrained_pth_last, pretrained_pth_novg):
    if os.path.exists(_ckpt):
        continue
    # wget saves under the URL's basename, which equals _ckpt.
    if os.system("wget {}".format(_WEIGHTS_URL.format(_ckpt))) != 0:
        # Drop any partial file so the next start retries the download
        # instead of treating a truncated checkpoint as present.
        if os.path.exists(_ckpt):
            os.remove(_ckpt)
        raise RuntimeError("Failed to download checkpoint: {}".format(_ckpt))
|
45 |
+
|
46 |
+
|
47 |
+
# ---------------------------------------------------------------------------
# build model
# ---------------------------------------------------------------------------
# Two independent model instances are built from the same options: one loaded
# from the "last" checkpoint and one from the "novg" checkpoint. Both are put
# in eval mode and moved to CUDA.
model_last = BaseModel(opt, build_model(opt))
model_last = model_last.from_pretrained(pretrained_pth_last).eval().cuda()

model_cap = BaseModel(opt, build_model(opt))
model_cap = model_cap.from_pretrained(pretrained_pth_novg).eval().cuda()

# One text-embedding call per model, without gradient tracking.
# NOTE(review): presumably this initialises default text embeddings inside the
# language encoder before the first request -- confirm against lang_encoder.
with torch.no_grad():
    for _loaded in (model_last, model_cap):
        _loaded.model.sem_seg_head.predictor.lang_encoder.get_text_embeddings(["background", "background"], is_eval=True)
|
56 |
+
|
57 |
+
'''
|
58 |
+
inference model
|
59 |
+
'''
|
60 |
+
|
61 |
+
@torch.no_grad()
def inference(image, task, *args, **kwargs):
    """Run the selected demo task on ``image`` and return that task's result.

    Args:
        image: Input image; must provide ``convert("RGB")`` (the interface is
            configured to pass a PIL image).
        task: Name of the task to run; must match one of the task names
            handled below exactly.
        *args: Extra positional arguments forwarded unchanged to the task
            function.
        **kwargs: Extra keyword arguments forwarded unchanged to the task
            function.

    Returns:
        Whatever the selected task function returns.

    Raises:
        ValueError: If ``task`` is not a supported task name. Previously this
            case fell through and silently returned ``None``.
    """
    # Pick the task function and the model (or pair of models) it is given.
    if task == 'Referring Inpainting':
        run_task, model = referring_inpainting, model_last
    elif task == 'Referring Segmentation':
        run_task, model = referring_segmentation, model_last
    elif task == 'Open Vocabulary Semantic Segmentation':
        run_task, model = open_semseg, model_last
    elif task == 'Open Vocabulary Panoptic Segmentation':
        run_task, model = open_panoseg, model_last
    elif task == 'Open Vocabulary Instance Segmentation':
        run_task, model = open_instseg, model_last
    elif task == 'Image Captioning':
        run_task, model = image_captioning, model_cap
    elif task == 'Referring Captioning (Beta)':
        # Takes both models, in this order.
        run_task, model = referring_captioning, [model_last, model_cap]
    elif task == 'Text Retrieval':
        run_task, model = text_retrieval, model_cap
    elif task == 'Image/Region Retrieval (Only Support Exampled 80 images)':
        # Takes both models; note the order is the reverse of the above.
        run_task, model = region_retrieval, [model_cap, model_last]
    else:
        raise ValueError("Unsupported task: {!r}".format(task))

    image = image.convert("RGB")
    # Run the task under CUDA float16 autocast.
    with torch.autocast(device_type='cuda', dtype=torch.float16):
        return run_task(model, image, *args, **kwargs)
|
83 |
+
|
84 |
+
'''
|
85 |
+
launch app
|
86 |
+
'''
|
87 |
+
# Page metadata shown by the Gradio interface.
title = "X-Decoder All-in-One Demo"
description = "<p style='text-align: center'> <a href='' target='_blank'>Project Page</a> | <a href='' target='_blank'>Paper</a> | <a href='https://github.com/microsoft/X-Decoder' target='_blank'>Github Repo</a> | <a href='' target='_blank'>Video</a> </p>"
article = "The Demo is Run on X-Decoder (Focal-T)."

# Input components, in the order their values are passed to the inference
# function: image, task name, then three free-text boxes.
inputs = [
    gr.inputs.Image(type='pil'),
    gr.inputs.Radio(
        choices=[
            "Referring Segmentation",
            'Open Vocabulary Semantic Segmentation',
            'Open Vocabulary Instance Segmentation',
            "Open Vocabulary Panoptic Segmentation",
            "Image Captioning",
            "Text Retrieval",
            "Referring Inpainting",
            "Referring Captioning (Beta)",
            "Image/Region Retrieval (Only Support Exampled 80 images)",
        ],
        type="value",
        # The default must be one of the choices above; the previous value
        # "OpenVocab Semantic Segmentation" matched none of them.
        default="Open Vocabulary Semantic Segmentation",
        label="Task",
    ),
    gr.Textbox(label="xdecoder_text"),
    gr.Textbox(label="inpainting_text"),
    gr.Textbox(label="task_description"),
]
|
92 |
+
# Output components: an image, a text box, and a second image.
demo_outputs = [
    gr.outputs.Image(type="pil", label="segmentation results"),
    gr.Textbox(label="text restuls"),
    gr.outputs.Image(type="pil", label="inpainting results"),
]

# One example row per task. Each row has five values, one per input component:
# image path, task name, and the three text-box values.
demo_examples = [
    ["./images/fruit.jpg", "Referring Segmentation", "The larger watermelon.,The front white flower.,White tea pot.,Flower bunch.,white vase.,The peach on the left.,The brown knife.", '', 'Format: s,s,s'],
    ["./images/animals.png", "Open Vocabulary Semantic Segmentation", "zebra,antelope,giraffe,ostrich,sky,water,grass,sand,tree", '', 'Format: x,x,x'],
    ["./images/street.jpg", "Open Vocabulary Panoptic Segmentation", "stuff:building,sky,street,tree,rock,sidewalk;thing:car,person,traffic light", '', 'Format: stuff:x,x,x;thing:y,y,y'],
    ["./images/owls.jpeg", "Open Vocabulary Instance Segmentation", "owl", '', 'Format: y,y,y'],
    ["./images/mountain.jpeg", "Image Captioning", "", '', ''],
    ["./images/rose.webp", "Text Retrieval", "lily,rose,peoney,tulip", '', 'Format: s,s,s'],
    ["./images/region_retrieval.png", "Image/Region Retrieval (Only Support Exampled 80 images)", "The tangerine on the plate.", '', 'Please describe the object in a detailed way.'],
    ["./images/landscape.jpg", "Referring Captioning (Beta)", "cloud", '', 'Please fill in a noun/noun phrase. (may start with a/the)'],
    ["./images/apples.jpg", "Referring Inpainting", "a yellow apple", 'a pear', 'x-decoder + ldm (inference takes ~40s.)'],
]

# Assemble the interface around the inference function and start serving it.
demo = gr.Interface(
    fn=inference,
    inputs=inputs,
    outputs=demo_outputs,
    examples=demo_examples,
    title=title,
    description=description,
    article=article,
    allow_flagging='never',
    cache_examples=True,
)
demo.launch(share=True)
|
configs/xdecoder/svlp_focalt_lang.yaml
ADDED
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# --------------------------------------------------------
|
2 |
+
# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
|
3 |
+
# Copyright (c) 2022 Microsoft
|
4 |
+
# Licensed under The MIT License [see LICENSE for details]
|
5 |
+
# Written by Xueyan Zou ([email protected])
|
6 |
+
# --------------------------------------------------------
|
7 |
+
|
8 |
+
##################
|
9 |
+
# Task settings
|
10 |
+
##################
|
11 |
+
VERBOSE: true
|
12 |
+
MODEL:
|
13 |
+
NAME: xdecoder_model
|
14 |
+
HEAD: xdecoder_head
|
15 |
+
DIM_PROJ: 512
|
16 |
+
BACKBONE_DIM: 768
|
17 |
+
TEXT:
|
18 |
+
ARCH: vlpencoder
|
19 |
+
NAME: transformer
|
20 |
+
TOKENIZER: clip
|
21 |
+
CONTEXT_LENGTH: 77 # 77
|
22 |
+
WIDTH: 512
|
23 |
+
HEADS: 8
|
24 |
+
LAYERS: 12 # 6
|
25 |
+
AUTOGRESSIVE: True
|
26 |
+
BACKBONE:
|
27 |
+
NAME: focal_dw
|
28 |
+
PRETRAINED: ''
|
29 |
+
LOAD_PRETRAINED: false
|
30 |
+
FOCAL:
|
31 |
+
PRETRAIN_IMG_SIZE: 224
|
32 |
+
PATCH_SIZE: 4
|
33 |
+
EMBED_DIM: 96
|
34 |
+
DEPTHS: [2, 2, 6, 2]
|
35 |
+
FOCAL_LEVELS: [3, 3, 3, 3]
|
36 |
+
FOCAL_WINDOWS: [3, 3, 3, 3]
|
37 |
+
DROP_PATH_RATE: 0.3
|
38 |
+
MLP_RATIO: 4.0
|
39 |
+
DROP_RATE: 0.0
|
40 |
+
PATCH_NORM: True
|
41 |
+
USE_CONV_EMBED: True
|
42 |
+
SCALING_MODULATOR: True
|
43 |
+
USE_CHECKPOINT: False
|
44 |
+
USE_POSTLN: true
|
45 |
+
USE_POSTLN_IN_MODULATION: false
|
46 |
+
USE_LAYERSCALE: True
|
47 |
+
OUT_FEATURES: ["res2", "res3", "res4", "res5"]
|
48 |
+
OUT_INDICES: [0, 1, 2, 3]
|
49 |
+
ENCODER:
|
50 |
+
NAME: transformer_encoder_fpn
|
51 |
+
IGNORE_VALUE: 255
|
52 |
+
NUM_CLASSES: 133
|
53 |
+
LOSS_WEIGHT: 1.0
|
54 |
+
CONVS_DIM: 512
|
55 |
+
MASK_DIM: 512
|
56 |
+
NORM: "GN"
|
57 |
+
IN_FEATURES: ["res2", "res3", "res4", "res5"]
|
58 |
+
DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
|
59 |
+
COMMON_STRIDE: 4
|
60 |
+
TRANSFORMER_ENC_LAYERS: 6
|
61 |
+
DECODER:
|
62 |
+
NAME: xdecoder
|
63 |
+
TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
|
64 |
+
MASK: True
|
65 |
+
GROUNDING:
|
66 |
+
ENABLED: True
|
67 |
+
MAX_LEN: 5
|
68 |
+
TEXT_WEIGHT: 2.0
|
69 |
+
CLASS_WEIGHT: 0.5
|
70 |
+
DETECTION: False
|
71 |
+
CAPTION:
|
72 |
+
ENABLED: True
|
73 |
+
PHRASE_PROB: 0.0
|
74 |
+
SIM_THRES: 0.95
|
75 |
+
CAPTIONING:
|
76 |
+
ENABLED: True
|
77 |
+
STEP: 50
|
78 |
+
RETRIEVAL:
|
79 |
+
ENABLED: True
|
80 |
+
DIM_IMG: 768
|
81 |
+
ENSEMBLE: True
|
82 |
+
HIDDEN_DIM: 512
|
83 |
+
NUM_OBJECT_QUERIES: 101
|
84 |
+
NHEADS: 8
|
85 |
+
DROPOUT: 0.0
|
86 |
+
DIM_FEEDFORWARD: 2048
|
87 |
+
PRE_NORM: False
|
88 |
+
ENFORCE_INPUT_PROJ: False
|
89 |
+
SIZE_DIVISIBILITY: 32
|
90 |
+
TRAIN_NUM_POINTS: 12544
|
91 |
+
OVERSAMPLE_RATIO: 3.0
|
92 |
+
IMPORTANCE_SAMPLE_RATIO: 0.75
|
93 |
+
DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query
|
94 |
+
TOP_GROUNDING_LAYERS: 3
|
95 |
+
TOP_CAPTION_LAYERS: 3
|
96 |
+
TOP_CAPTIONING_LAYERS: 3
|
97 |
+
TOP_RETRIEVAL_LAYERS: 3
|
98 |
+
TOP_OPENIMAGE_LAYERS: 10
|
99 |
+
TEST:
|
100 |
+
SEMANTIC_ON: True
|
101 |
+
INSTANCE_ON: True
|
102 |
+
PANOPTIC_ON: True
|
103 |
+
OVERLAP_THRESHOLD: 0.8
|
104 |
+
OBJECT_MASK_THRESHOLD: 0.4
|
105 |
+
SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE: false
|
106 |
+
DETECTIONS_PER_IMAGE: 100
|
107 |
+
|
108 |
+
INPUT:
|
109 |
+
PIXEL_MEAN: [123.675, 116.280, 103.530]
|
110 |
+
PIXEL_STD: [58.395, 57.120, 57.375]
|
images/apples.jpg
ADDED
images/coco/000.jpg
ADDED
Git LFS Details
|
images/coco/001.jpg
ADDED
Git LFS Details
|
images/coco/002.jpg
ADDED
Git LFS Details
|
images/coco/003.jpg
ADDED
Git LFS Details
|
images/coco/004.jpg
ADDED
Git LFS Details
|
images/coco/005.jpg
ADDED
Git LFS Details
|
images/coco/006.jpg
ADDED
Git LFS Details
|
images/coco/007.jpg
ADDED
Git LFS Details
|
images/coco/008.jpg
ADDED
Git LFS Details
|
images/coco/009.jpg
ADDED
Git LFS Details
|
images/coco/010.jpg
ADDED
Git LFS Details
|
images/coco/011.jpg
ADDED
Git LFS Details
|
images/coco/012.jpg
ADDED
Git LFS Details
|
images/coco/013.jpg
ADDED
Git LFS Details
|
images/coco/014.jpg
ADDED
Git LFS Details
|
images/coco/015.jpg
ADDED
Git LFS Details
|
images/coco/016.jpg
ADDED
Git LFS Details
|
images/coco/017.jpg
ADDED
Git LFS Details
|
images/coco/018.jpg
ADDED
Git LFS Details
|
images/coco/019.jpg
ADDED
Git LFS Details
|
images/coco/020.jpg
ADDED
Git LFS Details
|
images/coco/021.jpg
ADDED
Git LFS Details
|
images/coco/022.jpg
ADDED
Git LFS Details
|
images/coco/023.jpg
ADDED
Git LFS Details
|
images/coco/024.jpg
ADDED
Git LFS Details
|
images/coco/025.jpg
ADDED
Git LFS Details
|
images/coco/026.jpg
ADDED
Git LFS Details
|
images/coco/027.jpg
ADDED
Git LFS Details
|
images/coco/028.jpg
ADDED
Git LFS Details
|
images/coco/029.jpg
ADDED
Git LFS Details
|
images/coco/030.jpg
ADDED
Git LFS Details
|
images/coco/031.jpg
ADDED
Git LFS Details
|
images/coco/032.jpg
ADDED
Git LFS Details
|
images/coco/033.jpg
ADDED
Git LFS Details
|
images/coco/034.jpg
ADDED
Git LFS Details
|
images/coco/035.jpg
ADDED
Git LFS Details
|
images/coco/036.jpg
ADDED
Git LFS Details
|
images/coco/037.jpg
ADDED
Git LFS Details
|
images/coco/038.jpg
ADDED
Git LFS Details
|
images/coco/039.jpg
ADDED
Git LFS Details
|
images/coco/040.jpg
ADDED
Git LFS Details
|
images/coco/041.jpg
ADDED
Git LFS Details
|
images/coco/042.jpg
ADDED
Git LFS Details
|