from collections import defaultdict

import gradio as gr
import torch
from huggingface_hub import snapshot_download
from PIL import Image, ImageDraw, ImageFont
from transformers import AutoProcessor, LayoutLMv3ForTokenClassification
from ultralytics import RTDETR
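
# This Space chains two models: an RT-DETR detector that finds Arabic document
# layout regions, and a LayoutLMv3 token-classification model that predicts the
# reading order of the detected boxes before they are drawn back onto the image.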
# Set quant to True to load the 4-bit quantized reading-order checkpoint instead
quant = False
if quant:
    finetuned_fully = LayoutLMv3ForTokenClassification.from_pretrained(
        "omarelsayeed/ArabicLayoutReader_4bit", load_in_4bit=True
    )
else:
    finetuned_fully = LayoutLMv3ForTokenClassification.from_pretrained("omarelsayeed/YARAB_FOK_ELDE2A")


processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", 
                                          apply_ocr=False)

MAX_LEN = 70
# RoBERTa/LayoutLMv3 special-token ids used to build the reading-order inputs
CLS_TOKEN_ID = 0
UNK_TOKEN_ID = 3
EOS_TOKEN_ID = 2
def boxes2inputs(boxes):
    bbox = [[0, 0, 0, 0]] + boxes + [[0, 0, 0, 0]]
    input_ids = [CLS_TOKEN_ID] + [UNK_TOKEN_ID] * len(boxes) + [EOS_TOKEN_ID]
    attention_mask = [1] + [1] * len(boxes) + [1]
    return {
        "bbox": torch.tensor([bbox]),
        "attention_mask": torch.tensor([attention_mask]),
        "input_ids": torch.tensor([input_ids]),
    }
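
# Example: two (already normalized) boxes yield a 4-token sequence
# (<s>, two placeholder tokens, </s>) with matching bbox/attention tensors:
#   boxes2inputs([[10, 20, 200, 60], [10, 80, 200, 140]])
#   -> input_ids [[0, 3, 3, 2]], bbox shape (1, 4, 4), attention_mask [[1, 1, 1, 1]]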
def parse_logits(logits: torch.Tensor, length):
    """
    parse logits to orders

    :param logits: logits from model
    :param length: input length
    :return: orders
    """
    logits = logits[1 : length + 1, :length]
    orders = logits.argsort(descending=False).tolist()
    ret = [o.pop() for o in orders]
    while True:
        order_to_idxes = defaultdict(list)
        for idx, order in enumerate(ret):
            order_to_idxes[order].append(idx)
        # filter idxes len > 1
        order_to_idxes = {k: v for k, v in order_to_idxes.items() if len(v) > 1}
        if not order_to_idxes:
            break
        # filter
        for order, idxes in order_to_idxes.items():
            # find original logits of idxes
            idxes_to_logit = {}
            for idx in idxes:
                idxes_to_logit[idx] = logits[idx, order]
            idxes_to_logit = sorted(
                idxes_to_logit.items(), key=lambda x: x[1], reverse=True
            )
            # keep the highest logit as order, set others to next candidate
            for idx, _ in idxes_to_logit[1:]:
                ret[idx] = orders[idx].pop()
    return ret
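
# Conceptually: each box first takes the order with its highest logit; if two
# boxes claim the same order, the one with the lower logit falls back to its
# next-best candidate, and this repeats until every box has a unique order.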

    
def prepare_inputs(inputs, model):
    # Move all tensors to the model's device (and dtype for floating-point tensors)
    ret = {}
    for k, v in inputs.items():
        v = v.to(model.device)
        if torch.is_floating_point(v):
            v = v.to(model.dtype)
        ret[k] = v
    return ret
    
    
def get_orders(image_path, boxes):
    # image_path is unused here; the reading-order model only consumes the normalized boxes
    inputs = boxes2inputs(boxes)
    inputs = prepare_inputs(inputs, finetuned_fully)
    with torch.no_grad():  # inference only, no gradients needed
        logits = finetuned_fully(**inputs).logits.cpu().squeeze(0)
    predictions = parse_logits(logits, len(boxes))
    return predictions

model_dir = snapshot_download("omarelsayeed/DETR-ARABIC-DOCUMENT-LAYOUT-ANALYSIS") + "/rtdetr_1024_crops.pt"
model = RTDETR(model_dir)


def detect_layout(img, conf_threshold, iou_threshold):
    """Detects document layout regions with the RT-DETR model using adjustable confidence and IoU thresholds."""
    results = model.predict(
        source=img,
        conf=conf_threshold,
        iou=iou_threshold,
        show_labels=True,
        show_conf=True,
        imgsz=1024,
        agnostic_nms=True,
        max_det=34,
        nms=True
    )[0]
    bboxes = results.boxes.xyxy.cpu().tolist()
    classes = results.boxes.cls.cpu().tolist()
    mapping = {
        0: 'CheckBox',
        1: 'List',
        2: 'P',
        3: 'abandon',
        4: 'figure',
        5: 'gridless_table',
        6: 'handwritten_signature',
        7: 'qr_code',
        8: 'table',
        9: 'title',
    }
    classes = [mapping[int(i)] for i in classes]
    return bboxes, classes
    

def draw_bboxes_on_image(image_path, bboxes, classes, reading_order):
    # Define a color map for each class name
    class_colors = {
        'CheckBox': 'orange',
        'List': 'blue',
        'P': 'green',
        'abandon': 'purple',
        'figure': 'cyan',
        'gridless_table': 'yellow',
        'handwritten_signature': 'magenta',
        'qr_code': 'red',
        'table': 'brown',
        'title': 'pink'
    }

    # The input is already a PIL Image, so no file loading is needed
    image = image_path
    
    # Prepare to draw on the image
    draw = ImageDraw.Draw(image)
    
    # Try loading a default font, if it fails, use a basic font
    try:
        font = ImageFont.truetype("arial.ttf", 20)
        title_font = ImageFont.truetype("arial.ttf", 30)  # Larger font for titles
    except IOError:
        font = ImageFont.load_default(size = 30)
        title_font = font  # Use the same font for title if custom font fails

    # Loop through the bounding boxes and corresponding labels
    for i in range(len(bboxes)):
        x1, y1, x2, y2 = bboxes[i]
        class_name = classes[i]
        order = reading_order[i]
        
        # Get the color for the class
        color = class_colors[class_name]
        
        # If it's a title, make the bounding box thicker and text larger
        if class_name == 'title':
            box_thickness = 4  # Thicker box for title
            label_font = title_font  # Larger font for title
        else:
            box_thickness = 2  # Default box thickness
            label_font = font  # Default font for other classes
        
        # Draw the rectangle with the class color and box thickness
        draw.rectangle([x1, y1, x2, y2], outline=color, width=box_thickness)
        
        # Label the box with the class and order
        label = f"{class_name}-{order}"
        
        # Calculate text size using textbbox() to get the bounding box of the text
        bbox = draw.textbbox((x1, y1 - 20), label, font=label_font)
        label_width = bbox[2] - bbox[0]
        label_height = bbox[3] - bbox[1]
        
        # Draw the text above the box
        draw.text((x1, y1 - label_height), label, fill="black", font=label_font)
    
    # Return the modified image as a PIL image object
    return image



def scale_and_normalize_boxes(bboxes, old_width=1024, old_height=1024, new_width=640, new_height=640, normalize_width=1000, normalize_height=1000):
    """
    Scales and normalizes bounding boxes from original dimensions to new dimensions.

    Args:
        bboxes (list of lists): List of bounding boxes in [x_min, y_min, x_max, y_max] format.
        old_width (int or float): Width of the original image.
        old_height (int or float): Height of the original image.
        new_width (int or float): Width of the scaled image.
        new_height (int or float): Height of the scaled image.
        normalize_width (int or float): Width of the normalization range (e.g., target resolution width).
        normalize_height (int or float): Height of the normalization range (e.g., target resolution height).

    Returns:
        list of lists: Scaled and normalized bounding boxes in [x_min, y_min, x_max, y_max] format.
    """
    scale_x = new_width / old_width
    scale_y = new_height / old_height
    
    def scale_and_normalize_single(bbox):
        # Extract coordinates
        x_min, y_min, x_max, y_max = bbox
        
        # Scale to new dimensions
        x_min *= scale_x
        y_min *= scale_y
        x_max *= scale_x
        y_max *= scale_y
        
        # Normalize to the target range
        x_min = int(normalize_width * (x_min / new_width))
        y_min = int(normalize_height * (y_min / new_height))
        x_max = int(normalize_width * (x_max / new_width))
        y_max = int(normalize_height * (y_max / new_height))
        
        return [x_min, y_min, x_max, y_max]
    
    # Process all bounding boxes
    return [scale_and_normalize_single(bbox) for bbox in bboxes]
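
# Worked example with the defaults (1024 -> 640 -> 0-1000 range):
#   scale_and_normalize_boxes([[512, 512, 1024, 1024]]) -> [[500, 500, 1000, 1000]]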




def is_inside(box1, box2):
    # Check if box1 is inside box2
    return box1[0] >= box2[0] and box1[1] >= box2[1] and box1[2] <= box2[2] and box1[3] <= box2[3]

def is_overlap(box1, box2):
    # Check if box1 overlaps with box2
    x1, y1, x2, y2 = box1
    x3, y3, x4, y4 = box2

    # No overlap if one box is to the left, right, above, or below the other box
    return not (x2 <= x3 or x4 <= x1 or y2 <= y3 or y4 <= y1)

def remove_overlapping_and_inside_boxes(boxes, classes):
    to_remove = []
    
    for i, box1 in enumerate(boxes):
        for j, box2 in enumerate(boxes):
            if i != j:
                if is_inside(box1, box2):
                    # Mark the smaller (inside) box for removal
                    to_remove.append(i)
                elif is_inside(box2, box1):
                    # Mark the smaller (inside) box for removal
                    to_remove.append(j)
                elif is_overlap(box1, box2):
                    # If the boxes overlap, mark the smaller one for removal
                    if (box2[2] - box2[0]) * (box2[3] - box2[1]) < (box1[2] - box1[0]) * (box1[3] - box1[1]):
                        to_remove.append(j)
                    else:
                        to_remove.append(i)

    # Remove duplicates and sort by the index to keep original boxes
    to_remove = sorted(set(to_remove), reverse=True)

    # Remove the boxes and their corresponding classes from the list
    for idx in to_remove:
        del boxes[idx]
        del classes[idx]

    return boxes, classes
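
# Example: the inner box is dropped along with its class label:
#   remove_overlapping_and_inside_boxes([[0, 0, 100, 100], [10, 10, 50, 50]], ["P", "P"])
#   -> ([[0, 0, 100, 100]], ["P"])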

def process_r1(r1):
    # Heuristic post-processing of the predicted orders: the element predicted
    # last is moved up to order 1, and every other order (except 0) is pushed back by one.
    if len(r1) == 2:
        return r1
    max_index = r1.index(max(r1))
    one_index = r1.index(1)
    # Give the previously-last element order 1
    r1[max_index] = 1
    # Push every remaining order (except 0 and the new 1) back by one
    r1 = [x + 1 if x not in (0, 1) else x for x in r1]
    # The element that originally had order 1 now takes order 2
    r1[one_index] += 1
    return r1
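
# Example: the element predicted last is promoted to order 1:
#   process_r1([2, 0, 3, 1]) -> [3, 0, 1, 2]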
    

def full_predictions(IMAGE_PATH, conf_threshold, iou_threshold):
    IMAGE_PATH = IMAGE_PATH.resize((1024,1024))
    bboxes, classes = detect_layout(IMAGE_PATH, conf_threshold, iou_threshold)
    bboxes, classes = remove_overlapping_and_inside_boxes(bboxes, classes)
    orders = get_orders(IMAGE_PATH, scale_and_normalize_boxes(bboxes))
    orders = process_r1(orders)
    final_image = draw_bboxes_on_image(IMAGE_PATH, bboxes, classes, orders)
    return final_image
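
# Example local usage (hypothetical file name):
#   img = Image.open("sample_page.png")
#   annotated = full_predictions(img, conf_threshold=0.25, iou_threshold=0.45)
#   annotated.save("sample_page_annotated.png")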



iface = gr.Interface(
    fn=full_predictions,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Slider(minimum=0, maximum=1, value=0.25, label="Confidence threshold"),
        gr.Slider(minimum=0, maximum=1, value=0.45, label="IoU threshold"),
    ],
    outputs=gr.Image(type="pil", label="Result"),
    title="Ultralytics Gradio",
    description="Upload images for inference. The Ultralytics YOLO11n model is used by default.",
    examples=[
        ["kashida.png", 0.2, 0.45],
        ["image.jpg", 0.2, 0.45],
        ["Screenshot 2024-11-06 130230.png" , 0.25 , 0.45]
    ],
    theme=gr.themes.Default()
)

if __name__ == "__main__":
    iface.launch()