RT-Detr-ArabicLayoutAnalysisR

Sleeping

App Files Files Community

omarelsayeed committed on Nov 26, 2024

Commit

6e73f0b

verified ·

1 Parent(s): e059e1e

Update app.py

Browse files

Files changed (1) hide show

app.py +64 -13

app.py CHANGED Viewed

@@ -3,14 +3,67 @@ import gradio as gr
 from huggingface_hub import snapshot_download
 from PIL import Image
 from PIL import Image, ImageDraw, ImageFont
-from surya.ordering import batch_ordering
-from surya.model.ordering.processor import load_processor
-from surya.model.ordering.model import load_model
 model_dir = snapshot_download("omarelsayeed/DETR-ARABIC-DOCUMENT-LAYOUT-ANALYSIS") + "/rtdetr_1024_crops.pt"
 model = RTDETR(model_dir)
-order_model = load_model()
-processor = load_processor()
 def detect_layout(img, conf_threshold, iou_threshold):
     """Predicts objects in an image using a YOLO11 model with adjustable confidence and IOU thresholds."""
@@ -40,9 +93,6 @@ def detect_layout(img, conf_threshold, iou_threshold):
     classes = [mapping[i] for i in classes]
     return bboxes , classes
-def get_orders(image_path , boxes):
-  order_predictions = batch_ordering([image_path], [boxes], order_model, processor)
-  return [i.position for i in order_predictions[0].bboxes]
 def draw_bboxes_on_image(image_path, bboxes, classes, reading_order):
     # Define a color map for each class name
@@ -149,11 +199,12 @@ def remove_overlapping_and_inside_boxes(boxes, classes):
     return boxes, classes
 def full_predictions(IMAGE_PATH, conf_threshold, iou_threshold):
-  bboxes , classes = detect_layout(IMAGE_PATH ,conf_threshold, iou_threshold)
-  bboxes , classes = remove_overlapping_and_inside_boxes(bboxes,classes)
-  orders = get_orders(IMAGE_PATH , bboxes)
-  final_image = draw_bboxes_on_image(IMAGE_PATH , bboxes , classes , orders)
-  return final_image
 iface = gr.Interface(
     fn=full_predictions,

 from huggingface_hub import snapshot_download
 from PIL import Image
 from PIL import Image, ImageDraw, ImageFont
+from collections import defaultdict
+from typing import List, Dict
+import torch
+from transformers import LayoutLMv3ForTokenClassification
+# Load the LayoutLMv3 model
+layout_model = LayoutLMv3ForTokenClassification.from_pretrained("omarelsayeed/LayoutReader80Small")
# NOTE(review): MAX_LEN is not referenced by any code visible in this diff.
MAX_LEN = 100
# Special token ids that boxes2inputs() places at the start of the sequence,
# at every box position, and at the end of the sequence respectively.
CLS_TOKEN_ID = 0
UNK_TOKEN_ID = 3
EOS_TOKEN_ID = 2


def boxes2inputs(boxes: List[List[int]]) -> Dict[str, torch.Tensor]:
    """Build a batch-of-one model input from a sequence of layout boxes.

    The sequence is ``[CLS] box_1 ... box_n [EOS]``: every box is represented
    by ``UNK_TOKEN_ID`` in ``input_ids`` and by its four coordinates in
    ``bbox``; the two special tokens get an all-zero box.

    Args:
        boxes: Any sequence of 4-element boxes (lists, tuples, array rows).
            Coordinates are coerced with ``int()``, so fractional parts are
            truncated.  NOTE(review): the coordinate range expected by the
            model is not visible here -- confirm against the model card.

    Returns:
        A dict with ``"bbox"`` of shape ``(1, n + 2, 4)`` and
        ``"attention_mask"`` / ``"input_ids"`` of shape ``(1, n + 2)``,
        all ``torch.long``.
    """
    # Materialise plain nested lists first: concatenating a list with a tuple
    # raises TypeError and concatenating with a NumPy array broadcasts instead
    # of appending, which would silently lose the padding rows.
    rows = [[int(coord) for coord in box] for box in boxes]

    bbox = [[0, 0, 0, 0]] + rows + [[0, 0, 0, 0]]
    input_ids = [CLS_TOKEN_ID] + [UNK_TOKEN_ID] * len(rows) + [EOS_TOKEN_ID]
    # Every position (special tokens included) is attended to.
    attention_mask = [1] * len(input_ids)

    # Pin the dtype so the tensors are integer-typed whatever the input held.
    return {
        "bbox": torch.tensor([bbox], dtype=torch.long),
        "attention_mask": torch.tensor([attention_mask], dtype=torch.long),
        "input_ids": torch.tensor([input_ids], dtype=torch.long),
    }
def parse_logits(logits: torch.Tensor, length: int) -> List[int]:
    """Turn per-box order logits into a conflict-free reading order.

    Only ``logits[1 : length + 1, :length]`` is used: row 0 is skipped
    (presumably the leading special token -- confirm against the caller) and
    only the first ``length`` order slots are considered.  Each box first
    takes its highest-scoring slot; whenever several boxes claim the same
    slot, the one with the highest logit keeps it and the others fall back to
    their next-best slot.  This repeats until every box has a distinct slot.

    Args:
        logits: 2-D tensor of scores, one row per sequence position and one
            column per candidate order slot.
        length: Number of boxes to order.

    Returns:
        A list of ``length`` distinct ints; element ``i`` is the order slot
        assigned to box ``i``.

    Raises:
        ValueError: If ``logits`` does not provide ``length`` rows (after the
            skipped first row) and ``length`` columns.  Without this check
            the fallback loop runs out of candidates and fails with an
            opaque ``IndexError``.
    """
    if length <= 0:
        return []

    scores = logits[1: length + 1, :length]
    if scores.dim() != 2 or scores.shape[0] != length or scores.shape[1] != length:
        raise ValueError(
            f"logits of shape {tuple(logits.shape)} cannot order {length} boxes: "
            f"need at least {length + 1} rows and {length} columns"
        )

    # Per box: candidate slots sorted worst -> best, so .pop() yields the best
    # slot that box has not tried yet.
    candidates = scores.argsort(descending=False).tolist()
    # Plain floats avoid re-indexing the tensor for every comparison below.
    values = scores.tolist()

    ret = [slots.pop() for slots in candidates]
    while True:
        claims = defaultdict(list)
        for idx, order in enumerate(ret):
            claims[order].append(idx)

        # Keep only the slots claimed by more than one box.
        conflicts = {order: idxes for order, idxes in claims.items() if len(idxes) > 1}
        if not conflicts:
            break

        for order, idxes in conflicts.items():
            # Highest logit first; that box keeps the slot, the rest move on.
            ranked = sorted(idxes, key=lambda idx: values[idx][order], reverse=True)
            for idx in ranked[1:]:
                ret[idx] = candidates[idx].pop()
    return ret
def get_orders(image_path, boxes):
    """Predict the reading order of ``boxes`` with the layout model.

    Args:
        image_path: Unused; kept so the signature stays compatible with
            existing callers.
        boxes: Sequence of 4-element boxes, passed to ``boxes2inputs``.

    Returns:
        A list with one order index per box, as produced by ``parse_logits``.
        Empty when there are no boxes.
    """
    # Nothing to order: skip the forward pass entirely.
    if len(boxes) == 0:
        return []

    inputs = boxes2inputs(boxes)
    # Tensors must live on the same device as the model weights.
    inputs = {name: tensor.to(layout_model.device) for name, tensor in inputs.items()}

    # Inference only: no_grad avoids building an autograd graph for the pass.
    with torch.no_grad():
        logits = layout_model(**inputs).logits.cpu().squeeze(0)

    return parse_logits(logits, len(boxes))
 model_dir = snapshot_download("omarelsayeed/DETR-ARABIC-DOCUMENT-LAYOUT-ANALYSIS") + "/rtdetr_1024_crops.pt"
 model = RTDETR(model_dir)
 def detect_layout(img, conf_threshold, iou_threshold):
     """Predicts objects in an image using a YOLO11 model with adjustable confidence and IOU thresholds."""
     classes = [mapping[i] for i in classes]
     return bboxes , classes
 def draw_bboxes_on_image(image_path, bboxes, classes, reading_order):
     # Define a color map for each class name
     return boxes, classes
 def full_predictions(IMAGE_PATH, conf_threshold, iou_threshold):
     """Run the whole pipeline on one image and return the annotated result.

     Steps: detect layout boxes, drop overlapping / nested boxes, predict the
     reading order of the remaining boxes, then draw boxes, classes and order
     onto the image.
     """
     raw_boxes, raw_classes = detect_layout(IMAGE_PATH, conf_threshold, iou_threshold)
     kept_boxes, kept_classes = remove_overlapping_and_inside_boxes(raw_boxes, raw_classes)
     reading_order = get_orders(IMAGE_PATH, kept_boxes)
     return draw_bboxes_on_image(IMAGE_PATH, kept_boxes, kept_classes, reading_order)
 iface = gr.Interface(
     fn=full_predictions,