omarelsayeed committed
Commit
1ebac6f
1 Parent(s): 86dc437

Update app.py

Files changed (1):
  app.py +30 -94
app.py CHANGED
@@ -1,7 +1,6 @@
  from ultralytics import RTDETR
  import gradio as gr
  from huggingface_hub import snapshot_download
- from PIL import Image
  from PIL import Image, ImageDraw, ImageFont
  import numpy as np
  import random
@@ -9,99 +8,36 @@ from collections import defaultdict
  from typing import List, Dict
  import torch
  from transformers import LayoutLMv3ForTokenClassification
-
- # Load the LayoutLMv3 model
- layout_model = LayoutLMv3ForTokenClassification.from_pretrained("hantian/layoutreader")
-
- MAX_LEN = 100
- CLS_TOKEN_ID = 0
- UNK_TOKEN_ID = 3
- EOS_TOKEN_ID = 2
-
-
- def boxes2inputs(boxes: List[List[int]]) -> Dict[str, torch.Tensor]:
-     bbox = [[0, 0, 0, 0]] + boxes + [[0, 0, 0, 0]]
-     input_ids = [CLS_TOKEN_ID] + [UNK_TOKEN_ID] * len(boxes) + [EOS_TOKEN_ID]
-     attention_mask = [1] + [1] * len(boxes) + [1]
-     return {
-         "bbox": torch.tensor([bbox]),
-         "attention_mask": torch.tensor([attention_mask]),
-         "input_ids": torch.tensor([input_ids]),
-     }
-
- def parse_logits(logits: torch.Tensor, length: int) -> List[int]:
-     """
-     Parse logits to determine the reading order.
-     """
-     logits = logits[1: length + 1, :length]
-     orders = logits.argsort(descending=False).tolist()
-     ret = [o.pop() for o in orders]
-     while True:
-         order_to_idxes = defaultdict(list)
-         for idx, order in enumerate(ret):
-             order_to_idxes[order].append(idx)
-         # Keep only orders claimed by more than one box
-         order_to_idxes = {k: v for k, v in order_to_idxes.items() if len(v) > 1}
-         if not order_to_idxes:
-             break
-         # Resolve conflicts: the box with the highest logit keeps the order;
-         # the rest fall back to their next-best candidate
-         for order, idxes in order_to_idxes.items():
-             idxes_to_logit = {idx: logits[idx, order] for idx in idxes}
-             idxes_to_logit = sorted(idxes_to_logit.items(), key=lambda x: x[1], reverse=True)
-             for idx, _ in idxes_to_logit[1:]:
-                 ret[idx] = orders[idx].pop()
-
-     return ret
-
- def get_orders(_, bounding_boxes):
-     """
-     Detects reading order for Arabic text layout, given bounding boxes in xyxy format.
-
-     Args:
-     - bounding_boxes: List of tuples (x1, y1, x2, y2), where (x1, y1) is the
-       top-left corner and (x2, y2) is the bottom-right corner of the bounding box.
-
-     Returns:
-     - A list of indices representing the reading order.
-     """
-     # Convert to a numpy array for easier processing
-     bounding_boxes = [tuple(b) for b in bounding_boxes]
-     boxes = np.array(bounding_boxes)
-
-     # Sort by vertical position (y1) first, then by horizontal position (x1)
-     sorted_indices = np.lexsort((boxes[:, 0], boxes[:, 1]))
-
-     # Group boxes into rows, with a tolerance on the y coordinate
-     rows = []
-     tolerance = 10  # Tolerance for grouping elements into rows
-     for idx in sorted_indices:
-         placed = False
-         for row in rows:
-             # The box belongs to an existing row if its y1 is within tolerance
-             if abs(row[-1][1] - boxes[idx][1]) < tolerance:
-                 row.append(boxes[idx])
-                 placed = True
-                 break
-         if not placed:
-             rows.append([boxes[idx]])
-
-     # Within each row, sort by x1 descending (right-to-left for Arabic)
-     reading_order = []
-     for row in rows:
-         row.sort(key=lambda b: -b[0])
-         reading_order.extend(row)
-
-     # Return the indices of the bounding boxes in reading order
-     return [bounding_boxes.index(tuple(box)) for box in reading_order]
-
-
- # def get_orders(image_path, boxes):
- #     b = scale_and_normalize_boxes(boxes)
- #     inputs = boxes2inputs(b)
- #     inputs = {k: v.to(layout_model.device) for k, v in inputs.items()}  # Move inputs to model device
- #     logits = layout_model(**inputs).logits.cpu().squeeze(0)  # Perform inference and get logits
- #     orders = parse_logits(logits, len(b))
- #     return orders
+ from transformers import AutoProcessor
+ from transformers import AutoModelForTokenClassification
+
+ # Reading-order model: a LayoutLMv3 token classifier whose predicted class for
+ # each box token is that box's rank in the reading order
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ reading_order_model = AutoModelForTokenClassification.from_pretrained("omarelsayeed/yea_yea").to(device)
+ processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
+
+ def predict_reading_order(boxes, image_path):
+     # The model orders boxes, not words, so feed one placeholder token per box
+     words = ["<unk>"] * len(boxes)
+     # The processor expects a PIL image; open it if a path was passed in
+     if isinstance(image_path, str):
+         image_path = Image.open(image_path).convert("RGB")
+     encoding = processor(image_path, text=words, boxes=boxes,
+                          return_tensors="pt", return_offsets_mapping=True)
+     encoding.pop("offset_mapping")
+     for k, v in encoding.items():
+         encoding[k] = v.to(device)
+     outputs = reading_order_model(**encoding)
+     predictions = outputs.logits.argmax(-1).squeeze().tolist()
+     # Drop the predictions for the special tokens at either end
+     predictions = predictions[1:-1]
+     return predictions
+
+ def get_orders(image_path, boxes):
+     b = scale_and_normalize_boxes(boxes)
+     orders = predict_reading_order(b, image_path)
+     return orders


  model_dir = snapshot_download("omarelsayeed/DETR-ARABIC-DOCUMENT-LAYOUT-ANALYSIS") + "/rtdetr_1024_crops.pt"
@@ -203,7 +139,7 @@ def draw_bboxes_on_image(image_path, bboxes, classes, reading_order):



- def scale_and_normalize_boxes(bboxes, old_width=1024, old_height=1024, new_width=640, new_height=640, normalize_width=1000, normalize_height=1000):
+ def scale_and_normalize_boxes(bboxes, old_width=1024, old_height=1024, new_width=595.303955, new_height=841.889771, normalize_width=1000, normalize_height=1000):
      """
      Scales and normalizes bounding boxes from original dimensions to new dimensions.

 
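With this commit the hand-rolled right-to-left row heuristic is gone: each detected box is fed to a fine-tuned LayoutLMv3 token classifier as a single <unk> placeholder, and the class predicted for that token is the box's position in the reading order. Below is a minimal end-to-end sketch of how the new get_orders path would be called; the RT-DETR checkpoint name comes from the diff itself, while the example image path, the imgsz=1024 inference size, and the final rank-then-sort step are assumptions.

from ultralytics import RTDETR
from huggingface_hub import snapshot_download

IMAGE_PATH = "page.jpg"  # assumed example input

# Layout detection with the RT-DETR checkpoint used in app.py
weights = snapshot_download("omarelsayeed/DETR-ARABIC-DOCUMENT-LAYOUT-ANALYSIS") + "/rtdetr_1024_crops.pt"
detector = RTDETR(weights)
result = detector(IMAGE_PATH, imgsz=1024)[0]
boxes = result.boxes.xyxy.cpu().numpy().astype(int).tolist()  # [[x1, y1, x2, y2], ...]

# orders[i] is the predicted reading rank of boxes[i] (an assumption based on
# how predict_reading_order uses the logits)
orders = get_orders(IMAGE_PATH, boxes)
ordered_boxes = [box for _, box in sorted(zip(orders, boxes))]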
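The other change is quieter: scale_and_normalize_boxes now targets 595.303955 × 841.889771 instead of 640 × 640, which is essentially an A4 page in PDF points, presumably matching the coordinate space the reading-order model was fine-tuned on. The function body sits outside the diff, so the following is only a sketch of what the signature and docstring imply; the arithmetic inside is an assumption.

def scale_and_normalize_boxes(bboxes, old_width=1024, old_height=1024,
                              new_width=595.303955, new_height=841.889771,
                              normalize_width=1000, normalize_height=1000):
    """Rescale xyxy boxes from old to new dimensions, then normalize to 0-1000."""
    scale_x = new_width / old_width
    scale_y = new_height / old_height
    normalized = []
    for x1, y1, x2, y2 in bboxes:
        # First rescale into the new page size...
        sx1, sy1, sx2, sy2 = x1 * scale_x, y1 * scale_y, x2 * scale_x, y2 * scale_y
        # ...then map into the 0-1000 box space LayoutLMv3 expects
        normalized.append([
            int(sx1 / new_width * normalize_width),
            int(sy1 / new_height * normalize_height),
            int(sx2 / new_width * normalize_width),
            int(sy2 / new_height * normalize_height),
        ])
    return normalized

Worked through: x1 = 512 in the 1024-wide crop scales to 512 × (595.303955 / 1024) ≈ 297.7 points, which normalizes to 297.7 / 595.303955 × 1000 ≈ 500. Under this formulation the new page size cancels out, so the A4 defaults would only matter if the real implementation rounds or clamps in point space before normalizing.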