Good job!
Really useful experiments. Please note that the .pt format is considered unsafe. A converter is available here: https://huggingface.co/spaces/safetensors/convert.
I tested with https://huggingface.co/Aryn/deformable-detr-DocLayNet. It uses transformers, and the pre-trained weights are in .safetensors, which is important from a cybersecurity viewpoint.
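For comparison, that model loads with the standard transformers object-detection calls. A minimal sketch (the image path and the 0.5 threshold are just placeholders):
from PIL import Image
import torch
from transformers import AutoImageProcessor, DeformableDetrForObjectDetection
# Both the processor and the .safetensors weights come straight from the Hub:
processor = AutoImageProcessor.from_pretrained("Aryn/deformable-detr-DocLayNet")
model = DeformableDetrForObjectDetection.from_pretrained("Aryn/deformable-detr-DocLayNet")
image = Image.open("page.png").convert("RGB")  # placeholder path
inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
# Convert raw outputs to boxes/labels/scores in pixel coordinates:
results = processor.post_process_object_detection(
    outputs, threshold=0.5, target_sizes=torch.tensor([image.size[::-1]])
)[0]
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    print(model.config.id2label[label.item()], round(score.item(), 2), box.tolist())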
Personally, while Ultralytics has developed further from the base YOLOv4, which is good, I don't like their obsessive approach: renaming everything after their brand, overcomplicating the code, configuring everything with YAML, hiding details, providing their own API, and other bloat that makes it harder to understand what's going on.
I used a Docker container to convert your trained .pt to .safetensors. Here is the code:
import torch
from safetensors.torch import save_file
# Ultralytics .pt files contain custom model objects and metadata, so the library itself is needed to load them correctly:
from ultralytics import YOLO
# model = torch.load('/data/hantian_yolov11l-doclaynet.pt')  # a plain torch.load is not enough on its own
# Load the Ultralytics .pt file
model = YOLO('/data/hantian_yolov11l-doclaynet.pt')
# Extract the state_dict (model weights)
state_dict = model.model.state_dict()
# Save state_dict as a safetensors file
save_file(state_dict, '/data/hantian_yolov11l-doclaynet.safetensors')
The script above (saved as convert.py in ~/conversion, mounted as /data) was run in a temporary Docker container:
docker run -it --rm -v ~/conversion:/data ultralytics/ultralytics:latest bash -c "pip install safetensors && python /data/convert.py"
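To double-check the conversion, you can reload both files in the same container and compare the tensors. A small sketch (run the same way as convert.py):
import torch
from ultralytics import YOLO
from safetensors.torch import load_file
# Load the weights the same way convert.py did, plus the converted file:
original = YOLO('/data/hantian_yolov11l-doclaynet.pt').model.state_dict()
converted = load_file('/data/hantian_yolov11l-doclaynet.safetensors')
# Keys and values should match exactly:
assert set(original) == set(converted), 'key mismatch'
assert all(torch.equal(original[k], converted[k]) for k in original), 'tensor mismatch'
print('conversion verified:', len(converted), 'tensors')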
The Ultralytics documentation was not very helpful. Here is how I tried to load the converted weights:
from ultralytics.nn.tasks import DetectionModel  # the underlying detection model class
from ultralytics.utils import yaml_load  # helper function for loading YAML configs
# Load the model config:
cfg = yaml_load('/home/user/miniconda3/envs/py310/lib/python3.10/site-packages/ultralytics/cfg/models/11/yolo11.yaml')
# Modify number of classes to match yours:
print('default nc', cfg['nc']) # 80 by default
cfg['nc'] = 11 # We hack: override class count BEFORE building
print('new nc', cfg['nc'])
# Build model with the correct number of classes
n_channels = 3 # RGB
# Patch the scale, which is not present in cfg. We use Hantian's 'l' (large) size:
cfg['scale'] = 'l'
model = DetectionModel(cfg, ch=n_channels, nc=cfg['nc']) # Initialize an empty YOLO model (same architecture as original)
# Load the safetensors weights
from safetensors.torch import load_file
state_dict = load_file('/home/user/conversion/hantian_yolov11l-doclaynet.safetensors')
# Below we FAIL
model.model.load_state_dict(state_dict, strict=False)
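For reference, the failure is most likely a key-prefix mismatch: the state_dict was saved from the DetectionModel level (keys like 'model.0.conv.weight'), while model.model is the inner nn.Sequential, which expects keys without that prefix. A quick way to see it, and the load that should succeed instead (a sketch):
# Compare the key prefixes on both sides of the failing call:
print(next(iter(state_dict)))                # e.g. 'model.0.conv.weight'
print(next(iter(model.model.state_dict())))  # e.g. '0.conv.weight' -- no 'model.' prefix
# Loading at the DetectionModel level should therefore work:
model.load_state_dict(state_dict, strict=True)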
Yes, Ultralytics is not clear enough. Thanks for sharing!
Today I can report success with the safetensors format; here is the inference code:
# Import libraries
import cv2  # for reading images and drawing bounding boxes
from ultralytics import YOLO
from ultralytics.nn.tasks import DetectionModel  # the underlying detection model class
from ultralytics.utils import yaml_load  # needed so we can override the class count in the config
# Load the safetensors weights
from safetensors.torch import load_file
state_dict = load_file('/home/user/conversion/hantian_yolov11l-doclaynet.safetensors')
# Build the ultralytics yolo model
# Load the model config. Ultralytics likes YAML, which is prone to whitespace errors; we need to find its real path:
cfg = yaml_load('/home/user/miniconda3/envs/py310ocr/lib/python3.10/site-packages/ultralytics/cfg/models/11/yolo11.yaml')
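# The hard-coded site-packages path above can also be derived from the installed package
# (a sketch, assuming a standard pip install of ultralytics):
from pathlib import Path
import ultralytics
print(Path(ultralytics.__file__).parent / 'cfg' / 'models' / '11' / 'yolo11.yaml')  # should point to the same file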
cfg['nc'] = 11 # hack: override class count BEFORE building
n_channels = 3 # RGB
# Patch the scale, which is not present in cfg. We use Hantian's 'l' (large) size:
cfg['scale'] = 'l'
detection_model = DetectionModel(cfg, ch=n_channels, nc=cfg['nc']) # Initialize an empty YOLO model (same architecture as original)
# Apply weights:
detection_model.load_state_dict(state_dict, strict=True)
device="cuda:0"
# Initialize an empty YOLO wrapper
Yolo_model = YOLO("yolo11l.yaml", task='detect')  # build a new model from the YAML config
Yolo_model.model = detection_model # Replace with our model, it is no more empty
Yolo_model.model.to(device) # Ensure it’s on the correct device
# Define inference constants - our classes with colors
ENTITIES_COLORS = {
"Caption": (191, 100, 21),
"Footnote": (2, 62, 115),
"Formula": (140, 80, 58),
"List-item": (168, 181, 69),
"Page-footer": (2, 69, 84),
"Page-header": (83, 115, 106),
"Picture": (255, 72, 88),
"Section-header": (0, 204, 192),
"Table": (116, 127, 127),
"Text": (0, 153, 221),
"Title": (196, 51, 2)
}
# Set class names (must match the order in training dataset)
class_names = list(ENTITIES_COLORS.keys()) # ["Caption", "Footnote", ..., "Title"]
Yolo_model.model.names = {i: name for i, name in enumerate(class_names)}
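# Optional sanity check: the name list must have exactly as many entries as the overridden class count.
assert len(class_names) == cfg['nc'], 'number of class names must equal nc (11 for DocLayNet)'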
# Load an image with cv2
image_path = "/home/user/LLMs/ocr/data/book1/page_0011_org.jpg" # Replace with your image path
image = cv2.imread(image_path) # Load image in BGR format
# Make predictions on the cv2-loaded image
results = Yolo_model.predict(source=image, conf=0.25, iou=0.45)
# Process the results and draw bounding boxes with class-specific colors (example for IPython notebook)
image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
import matplotlib.pyplot as plt
for result in results:
    boxes = result.boxes  # Get detection boxes
    for box in boxes:
        x, y, w, h = box.xywh[0]  # Box coordinates (center x, center y, width, height)
        x, y, w, h = int(x), int(y), int(w), int(h)  # Convert to integers
        conf = box.conf.item()  # Confidence score
        cls = int(box.cls.item())  # Class ID
        label = f"{Yolo_model.model.names[cls]} {conf:.2f}"
        color = ENTITIES_COLORS[Yolo_model.model.names[cls]]  # Get the color for this class
        top_left = (x - w // 2, y - h // 2)
        bottom_right = (x + w // 2, y + h // 2)  # Class-specific colored box
        cv2.rectangle(image_rgb, top_left, bottom_right, color, 2)
        cv2.putText(image_rgb, label, (top_left[0], top_left[1] - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, color, 2)  # Matching text color
plt.figure(figsize=(10, 10))
plt.imshow(image_rgb)
plt.axis('off')
plt.show()
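If you only need a quick annotated image without the custom colors, the results object also has a built-in plot() helper; a short sketch (the output filename is just an example):
# Optional shortcut: let Ultralytics draw its default annotations and write them to disk.
annotated_bgr = results[0].plot()  # BGR numpy array with boxes and labels drawn
cv2.imwrite("page_0011_annotated.jpg", annotated_bgr)  # example output path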
So I proved it can be done.