ivan-wald committed on
Commit
80ceed3
1 Parent(s): 8e5a190

Upload 7 files

Browse files
Files changed (7) hide show
  1. Base-RCNN-FPN.yml +69 -0
  2. README.md +12 -0
  3. app.py +77 -0
  4. cascade_dit_base.yml +20 -0
  5. packages.txt +1 -0
  6. publaynet_example.jpeg +0 -0
  7. requirements.txt +10 -0
Base-RCNN-FPN.yml ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MODEL:
2
+ MASK_ON: True
3
+ META_ARCHITECTURE: "GeneralizedRCNN"
4
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
5
+ PIXEL_STD: [58.395, 57.120, 57.375]
6
+ BACKBONE:
7
+ NAME: "build_vit_fpn_backbone"
8
+ VIT:
9
+ OUT_FEATURES: ["layer3", "layer5", "layer7", "layer11"]
10
+ DROP_PATH: 0.1
11
+ IMG_SIZE: [224,224]
12
+ POS_TYPE: "abs"
13
+ FPN:
14
+ IN_FEATURES: ["layer3", "layer5", "layer7", "layer11"]
15
+ ANCHOR_GENERATOR:
16
+ SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map
17
+ ASPECT_RATIOS: [[0.5, 1.0, 2.0]] # Three aspect ratios (same for all in feature maps)
18
+ RPN:
19
+ IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
20
+ PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level
21
+ PRE_NMS_TOPK_TEST: 1000 # Per FPN level
22
+ # Detectron1 uses 2000 proposals per-batch,
23
+ # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
24
+ # which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
25
+ POST_NMS_TOPK_TRAIN: 1000
26
+ POST_NMS_TOPK_TEST: 1000
27
+ ROI_HEADS:
28
+ NAME: "StandardROIHeads"
29
+ IN_FEATURES: ["p2", "p3", "p4", "p5"]
30
+ NUM_CLASSES: 5
31
+ ROI_BOX_HEAD:
32
+ NAME: "FastRCNNConvFCHead"
33
+ NUM_FC: 2
34
+ POOLER_RESOLUTION: 7
35
+ ROI_MASK_HEAD:
36
+ NAME: "MaskRCNNConvUpsampleHead"
37
+ NUM_CONV: 4
38
+ POOLER_RESOLUTION: 14
39
+ DATASETS:
40
+ TRAIN: ("publaynet_train",)
41
+ TEST: ("publaynet_val",)
42
+ SOLVER:
43
+ LR_SCHEDULER_NAME: "WarmupCosineLR"
44
+ AMP:
45
+ ENABLED: True
46
+ OPTIMIZER: "ADAMW"
47
+ BACKBONE_MULTIPLIER: 1.0
48
+ CLIP_GRADIENTS:
49
+ ENABLED: True
50
+ CLIP_TYPE: "full_model"
51
+ CLIP_VALUE: 1.0
52
+ NORM_TYPE: 2.0
53
+ WARMUP_FACTOR: 0.01
54
+ BASE_LR: 0.0004
55
+ WEIGHT_DECAY: 0.05
56
+ IMS_PER_BATCH: 32
57
+ INPUT:
58
+ CROP:
59
+ ENABLED: True
60
+ TYPE: "absolute_range"
61
+ SIZE: (384, 600)
62
+ MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800)
63
+ FORMAT: "RGB"
64
+ DATALOADER:
65
+ FILTER_EMPTY_ANNOTATIONS: False
66
+ VERSION: 2
67
+ AUG:
68
+ DETR: True
69
+ SEED: 42
README.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Dit Document Layout Analysis
3
+ emoji: 👀
4
+ colorFrom: purple
5
+ colorTo: red
6
+ sdk: gradio
7
+ sdk_version: 5.5.0
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces#reference
app.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import subprocess


def _run(cmd):
    """Run *cmd* (an argument list, executed without a shell) and report failures.

    Setup stays best-effort: a failing step is reported but does not abort
    start-up, so later steps still get their chance to run.

    Returns the process exit status, or -1 if the program could not be started.
    """
    try:
        status = subprocess.run(cmd, check=False).returncode
    except OSError as exc:
        print(f"[setup] could not start {cmd[0]!r}: {exc}", flush=True)
        return -1
    if status != 0:
        print(f"[setup] {' '.join(cmd)} exited with status {status}", flush=True)
    return status


# Fetch and install detectron2 from source. The clone is skipped when the
# checkout already exists (git refuses to clone into a non-empty directory).
if not os.path.isdir("detectron2"):
    _run(["git", "clone", "https://github.com/facebookresearch/detectron2.git"])
_run(["pip", "install", "-e", "detectron2"])

# Fetch the unilm repository, which contains the DiT object-detection code.
if not os.path.isdir("unilm"):
    _run(["git", "clone", "https://github.com/microsoft/unilm.git"])

# `collections.Iterable` was removed in Python 3.10; rewrite the import in the
# cloned sources so the module can still be imported. The substitution is
# idempotent, so re-running it on an already patched file is harmless.
_run([
    "sed",
    "-i",
    "s/from collections import Iterable/from collections.abc import Iterable/",
    "unilm/dit/object_detection/ditod/table_evaluation/data_structure.py",
])

# NOTE(review): this checkpoint does not appear to be used anywhere in this
# script -- cfg.MODEL.WEIGHTS is later set from hf_hub_download(). Confirm and
# consider dropping the download (and the embedded SAS token) entirely.
if not os.path.exists("publaynet_dit-b_cascade.pth"):
    _run([
        "curl",
        "-LJ",
        "-o",
        "publaynet_dit-b_cascade.pth",
        "https://layoutlm.blob.core.windows.net/dit/dit-fts/publaynet_dit-b_cascade.pth?sv=2022-11-02&ss=b&srt=o&sp=r&se=2033-06-08T16:48:15Z&st=2023-06-08T08:48:15Z&spr=https&sig=a9VXrihTzbWyVfaIDlIT1Z0FoR1073VB0RLQUMuudD4%3D",
    ])
7
+
8
+ import sys
9
+ sys.path.append("unilm")
10
+ sys.path.append("detectron2")
11
+
12
+ import cv2
13
+
14
+ from unilm.dit.object_detection.ditod import add_vit_config
15
+
16
+ import torch
17
+
18
+ from detectron2.config import CfgNode as CN
19
+ from detectron2.config import get_cfg
20
+ from detectron2.utils.visualizer import ColorMode, Visualizer
21
+ from detectron2.data import MetadataCatalog
22
+ from detectron2.engine import DefaultPredictor
23
+
24
+ from huggingface_hub import hf_hub_download
25
+
26
+ import gradio as gr
27
+
28
+
29
# Step 1: start from the detectron2 defaults, apply add_vit_config, then
# overlay the settings from the cascade DiT YAML file.
cfg = get_cfg()
add_vit_config(cfg)
cfg.merge_from_file("cascade_dit_base.yml")

# Step 2: download the fine-tuned checkpoint from the Hugging Face Hub and
# point the config at the local copy.
filepath = hf_hub_download(
    repo_id="Sebas6k/DiT_weights",
    filename="publaynet_dit-b_cascade.pth",
    repo_type="model",
)
cfg.MODEL.WEIGHTS = filepath

# Step 3: run on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    cfg.MODEL.DEVICE = "cuda"
else:
    cfg.MODEL.DEVICE = "cpu"

# Step 4: build the predictor once at import time so every request reuses it.
predictor = DefaultPredictor(cfg)
43
+
44
+
45
def analyze_image(img):
    """Detect layout regions in a document image and draw them on it.

    Args:
        img: Image array delivered by the Gradio input component
            (``gr.Image(type="numpy")``), i.e. channels in RGB order.
            ``None`` when the user submitted without providing an image.

    Returns:
        The annotated image as an array in RGB order, suitable for the
        Gradio output component.

    Raises:
        gr.Error: If no image was provided.
    """
    if img is None:
        # Surface a readable message in the UI instead of an AttributeError.
        raise gr.Error("Please provide a document image.")

    rgb = img.astype("float32")

    dataset_name = cfg.DATASETS.TEST[0]
    md = MetadataCatalog.get(dataset_name)
    if dataset_name == 'icdar2019_test':
        md.set(thing_classes=["table"])
    else:
        md.set(thing_classes=["text","title","list","table","figure"])

    # DefaultPredictor takes images in BGR order (it converts internally
    # according to cfg.INPUT.FORMAT), whereas Gradio hands over RGB, so the
    # channels are flipped for inference only.
    output = predictor(rgb[:, :, ::-1])["instances"]

    # Visualizer works in RGB on both input and output, so the Gradio array
    # is used as is and no channel flip is needed on the way out.
    v = Visualizer(rgb,
                   md,
                   scale=1.0,
                   instance_mode=ColorMode.SEGMENTATION)
    result = v.draw_instance_predictions(output.to("cpu"))
    return result.get_image()
62
+
63
# Static texts and assets shown in the demo page.
title = "Document Layout Analysis"
description = "Demo"
article = ""
examples = [
    ['publaynet_example.jpeg'],
]
css = ".output-image, .input-image, .image-preview {height: 600px !important}"

# Input and output are both exchanged with analyze_image as numpy arrays.
input_image = gr.Image(type="numpy", label="document image")
output_image = gr.Image(type="numpy", label="annotated document")

iface = gr.Interface(
    fn=analyze_image,
    inputs=input_image,
    outputs=output_image,
    title=title,
    description=description,
    examples=examples,
    article=article,
    css=css,
)

# NOTE(review): the positional 5 binds to the first parameter of queue();
# which parameter that is depends on the installed Gradio version -- confirm
# it still expresses the intended setting and prefer an explicit keyword.
iface.queue(5).launch()
cascade_dit_base.yml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _BASE_: "Base-RCNN-FPN.yml"
2
+ MODEL:
3
+ PIXEL_MEAN: [ 127.5, 127.5, 127.5 ]
4
+ PIXEL_STD: [ 127.5, 127.5, 127.5 ]
5
+ WEIGHTS: "https://layoutlm.blob.core.windows.net/dit/dit-pts/dit-base-224-p16-500k-62d53a.pth"
6
+ VIT:
7
+ NAME: "dit_base_patch16"
8
+ ROI_HEADS:
9
+ NAME: CascadeROIHeads
10
+ ROI_BOX_HEAD:
11
+ CLS_AGNOSTIC_BBOX_REG: True
12
+ RPN:
13
+ POST_NMS_TOPK_TRAIN: 2000
14
+ SOLVER:
15
+ WARMUP_ITERS: 1000
16
+ IMS_PER_BATCH: 16
17
+ MAX_ITER: 60000
18
+ CHECKPOINT_PERIOD: 2000
19
+ TEST:
20
+ EVAL_PERIOD: 2000
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ python3-opencv
publaynet_example.jpeg ADDED
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ pyyaml==5.1
2
+ torch==1.11.0
3
+ torchvision==0.12.0
4
+
5
+ gradio
6
+ numpy<2
7
+ scipy
8
+ shapely
9
+ timm
10
+ opencv-python