Upload 7 files
Browse files- Base-RCNN-FPN.yml +69 -0
- README.md +12 -0
- app.py +77 -0
- cascade_dit_base.yml +20 -0
- packages.txt +1 -0
- publaynet_example.jpeg +0 -0
- requirements.txt +10 -0
Base-RCNN-FPN.yml
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MODEL:
|
2 |
+
MASK_ON: True
|
3 |
+
META_ARCHITECTURE: "GeneralizedRCNN"
|
4 |
+
PIXEL_MEAN: [123.675, 116.280, 103.530]
|
5 |
+
PIXEL_STD: [58.395, 57.120, 57.375]
|
6 |
+
BACKBONE:
|
7 |
+
NAME: "build_vit_fpn_backbone"
|
8 |
+
VIT:
|
9 |
+
OUT_FEATURES: ["layer3", "layer5", "layer7", "layer11"]
|
10 |
+
DROP_PATH: 0.1
|
11 |
+
IMG_SIZE: [224,224]
|
12 |
+
POS_TYPE: "abs"
|
13 |
+
FPN:
|
14 |
+
IN_FEATURES: ["layer3", "layer5", "layer7", "layer11"]
|
15 |
+
ANCHOR_GENERATOR:
|
16 |
+
SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map
|
17 |
+
ASPECT_RATIOS: [[0.5, 1.0, 2.0]] # Three aspect ratios (same for all in feature maps)
|
18 |
+
RPN:
|
19 |
+
IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
|
20 |
+
PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level
|
21 |
+
PRE_NMS_TOPK_TEST: 1000 # Per FPN level
|
22 |
+
# Detectron1 uses 2000 proposals per-batch,
|
23 |
+
# (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
|
24 |
+
# which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
|
25 |
+
POST_NMS_TOPK_TRAIN: 1000
|
26 |
+
POST_NMS_TOPK_TEST: 1000
|
27 |
+
ROI_HEADS:
|
28 |
+
NAME: "StandardROIHeads"
|
29 |
+
IN_FEATURES: ["p2", "p3", "p4", "p5"]
|
30 |
+
NUM_CLASSES: 5
|
31 |
+
ROI_BOX_HEAD:
|
32 |
+
NAME: "FastRCNNConvFCHead"
|
33 |
+
NUM_FC: 2
|
34 |
+
POOLER_RESOLUTION: 7
|
35 |
+
ROI_MASK_HEAD:
|
36 |
+
NAME: "MaskRCNNConvUpsampleHead"
|
37 |
+
NUM_CONV: 4
|
38 |
+
POOLER_RESOLUTION: 14
|
39 |
+
DATASETS:
|
40 |
+
TRAIN: ("publaynet_train",)
|
41 |
+
TEST: ("publaynet_val",)
|
42 |
+
SOLVER:
|
43 |
+
LR_SCHEDULER_NAME: "WarmupCosineLR"
|
44 |
+
AMP:
|
45 |
+
ENABLED: True
|
46 |
+
OPTIMIZER: "ADAMW"
|
47 |
+
BACKBONE_MULTIPLIER: 1.0
|
48 |
+
CLIP_GRADIENTS:
|
49 |
+
ENABLED: True
|
50 |
+
CLIP_TYPE: "full_model"
|
51 |
+
CLIP_VALUE: 1.0
|
52 |
+
NORM_TYPE: 2.0
|
53 |
+
WARMUP_FACTOR: 0.01
|
54 |
+
BASE_LR: 0.0004
|
55 |
+
WEIGHT_DECAY: 0.05
|
56 |
+
IMS_PER_BATCH: 32
|
57 |
+
INPUT:
|
58 |
+
CROP:
|
59 |
+
ENABLED: True
|
60 |
+
TYPE: "absolute_range"
|
61 |
+
SIZE: (384, 600)
|
62 |
+
MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800)
|
63 |
+
FORMAT: "RGB"
|
64 |
+
DATALOADER:
|
65 |
+
FILTER_EMPTY_ANNOTATIONS: False
|
66 |
+
VERSION: 2
|
67 |
+
AUG:
|
68 |
+
DETR: True
|
69 |
+
SEED: 42
|
README.md
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: DiT Document Layout Analysis
|
3 |
+
emoji: 👀
|
4 |
+
colorFrom: purple
|
5 |
+
colorTo: red
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 5.5.0
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
---
|
11 |
+
|
12 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces#reference
|
app.py
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os

# Shell commands that prepare the runtime environment, listed in the order
# in which they are executed.
_SETUP_COMMANDS = (
    # Fetch the detectron2 sources and install them in editable mode.
    'git clone https://github.com/facebookresearch/detectron2.git',
    'pip install -e detectron2',
    # Fetch the unilm repository.
    "git clone https://github.com/microsoft/unilm.git",
    # Rewrite `from collections import Iterable` to the `collections.abc`
    # form inside unilm's table-evaluation module (the bare `collections`
    # alias no longer exists on Python 3.10+).
    "sed -i 's/from collections import Iterable/from collections.abc import Iterable/' unilm/dit/object_detection/ditod/table_evaluation/data_structure.py",
    # Download the fine-tuned checkpoint into the working directory.
    # NOTE(review): nothing in the visible code reads this local file --
    # confirm whether this download is still needed.
    "curl -LJ -o publaynet_dit-b_cascade.pth 'https://layoutlm.blob.core.windows.net/dit/dit-fts/publaynet_dit-b_cascade.pth?sv=2022-11-02&ss=b&srt=o&sp=r&se=2033-06-08T16:48:15Z&st=2023-06-08T08:48:15Z&spr=https&sig=a9VXrihTzbWyVfaIDlIT1Z0FoR1073VB0RLQUMuudD4%3D'",
)

# Each command runs through the shell; exit statuses are not inspected, so a
# failing step does not stop the ones after it.
for _setup_command in _SETUP_COMMANDS:
    os.system(_setup_command)
|
7 |
+
|
8 |
+
import sys
|
9 |
+
sys.path.append("unilm")
|
10 |
+
sys.path.append("detectron2")
|
11 |
+
|
12 |
+
import cv2
|
13 |
+
|
14 |
+
from unilm.dit.object_detection.ditod import add_vit_config
|
15 |
+
|
16 |
+
import torch
|
17 |
+
|
18 |
+
from detectron2.config import CfgNode as CN
|
19 |
+
from detectron2.config import get_cfg
|
20 |
+
from detectron2.utils.visualizer import ColorMode, Visualizer
|
21 |
+
from detectron2.data import MetadataCatalog
|
22 |
+
from detectron2.engine import DefaultPredictor
|
23 |
+
|
24 |
+
from huggingface_hub import hf_hub_download
|
25 |
+
|
26 |
+
import gradio as gr
|
27 |
+
|
28 |
+
|
29 |
+
# Configuration: start from detectron2's defaults, let `add_vit_config`
# extend the config node, then merge the settings from the local YAML file.
cfg = get_cfg()
add_vit_config(cfg)
cfg.merge_from_file("cascade_dit_base.yml")

# Weights: fetch the checkpoint from the Hugging Face Hub and store the
# returned path in the config.
filepath = hf_hub_download(
    repo_id="Sebas6k/DiT_weights",
    filename="publaynet_dit-b_cascade.pth",
    repo_type="model",
)
cfg.MODEL.WEIGHTS = filepath

# Device: use CUDA when torch reports it as available, otherwise the CPU.
if torch.cuda.is_available():
    cfg.MODEL.DEVICE = "cuda"
else:
    cfg.MODEL.DEVICE = "cpu"

# Model: a single predictor instance built from the finished config.
predictor = DefaultPredictor(cfg)
|
43 |
+
|
44 |
+
|
45 |
+
def analyze_image(img):
    """Detect layout regions in a document image and draw them onto it.

    Args:
        img: Image array of shape (height, width, channels) as supplied by
            the Gradio ``Image(type="numpy")`` input, which delivers RGB
            channel order; ``None`` when no image was provided.

    Returns:
        The input image with the predicted instances drawn on it, as an RGB
        array, or ``None`` when ``img`` is ``None``.
    """
    if img is None:
        # No image was submitted; return an empty output instead of failing
        # on `None.astype`.
        return None

    rgb = img.astype("float32")

    # Attach the class names used for labelling to the dataset metadata.
    dataset_name = cfg.DATASETS.TEST[0]
    md = MetadataCatalog.get(dataset_name)
    if dataset_name == 'icdar2019_test':
        md.set(thing_classes=["table"])
    else:
        md.set(thing_classes=["text", "title", "list", "table", "figure"])

    # DefaultPredictor documents its input as BGR and performs the conversion
    # to cfg.INPUT.FORMAT itself, so the RGB image from Gradio has to be
    # flipped to BGR before it is handed over.
    output = predictor(rgb[:, :, ::-1])["instances"]

    # Visualizer takes an RGB image and get_image() returns RGB as well, so
    # no channel flipping is needed on either side of the drawing step.
    v = Visualizer(rgb,
                   md,
                   scale=1.0,
                   instance_mode=ColorMode.SEGMENTATION)
    result = v.draw_instance_predictions(output.to("cpu"))
    return result.get_image()
|
62 |
+
|
63 |
+
# Texts, example input and styling for the demo page.
title = "Document Layout Analysis"
description = "Demo"
article = ""
examples = [['publaynet_example.jpeg']]
css = ".output-image, .input-image, .image-preview {height: 600px !important}"

# One image in, one annotated image out; both are exchanged as numpy arrays.
iface = gr.Interface(
    fn=analyze_image,
    inputs=gr.Image(type="numpy", label="document image"),
    outputs=gr.Image(type="numpy", label="annotated document"),
    title=title,
    description=description,
    examples=examples,
    article=article,
    css=css,
)

# With the Gradio 5 SDK the first positional argument of queue() is
# `status_update_rate`, so the former `queue(5)` configured a status interval
# rather than concurrency. Name the concurrency limit explicitly.
iface.queue(default_concurrency_limit=5).launch()
|
cascade_dit_base.yml
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_BASE_: "Base-RCNN-FPN.yml"
|
2 |
+
MODEL:
|
3 |
+
PIXEL_MEAN: [ 127.5, 127.5, 127.5 ]
|
4 |
+
PIXEL_STD: [ 127.5, 127.5, 127.5 ]
|
5 |
+
WEIGHTS: "https://layoutlm.blob.core.windows.net/dit/dit-pts/dit-base-224-p16-500k-62d53a.pth"
|
6 |
+
VIT:
|
7 |
+
NAME: "dit_base_patch16"
|
8 |
+
ROI_HEADS:
|
9 |
+
NAME: CascadeROIHeads
|
10 |
+
ROI_BOX_HEAD:
|
11 |
+
CLS_AGNOSTIC_BBOX_REG: True
|
12 |
+
RPN:
|
13 |
+
POST_NMS_TOPK_TRAIN: 2000
|
14 |
+
SOLVER:
|
15 |
+
WARMUP_ITERS: 1000
|
16 |
+
IMS_PER_BATCH: 16
|
17 |
+
MAX_ITER: 60000
|
18 |
+
CHECKPOINT_PERIOD: 2000
|
19 |
+
TEST:
|
20 |
+
EVAL_PERIOD: 2000
|
packages.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
python3-opencv
|
publaynet_example.jpeg
ADDED
requirements.txt
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
pyyaml==5.1
|
2 |
+
torch==1.11.0
|
3 |
+
torchvision==0.12.0
|
4 |
+
|
5 |
+
gradio
|
6 |
+
numpy<2
|
7 |
+
scipy
|
8 |
+
shapely
|
9 |
+
timm
|
10 |
+
opencv-python
|