import os

import cv2
import gradio as gr
import numpy as np
import supervision as sv
import torch
from inference.models import YOLOWorld

from efficientvit.models.efficientvit.sam import EfficientViTSamPredictor
from efficientvit.sam_model_zoo import create_sam_model

MARKDOWN = """
# YOLO-World + EfficientViT-SAM

Powered by Roboflow [Inference](https://github.com/roboflow/inference),
[Supervision](https://github.com/roboflow/supervision),
[YOLO-World](https://github.com/AILab-CVC/YOLO-World), and
[EfficientViT-SAM](https://github.com/mit-han-lab/efficientvit).
"""

# Load models. Other available YOLO-World checkpoints:
# yolo_world = YOLOWorld(model_id="yolo_world/s")
# yolo_world = YOLOWorld(model_id="yolo_world/m")
yolo_world = YOLOWorld(model_id="yolo_world/l")
# yolo_world = YOLOWorld(model_id="yolo_world/x")
# yolo_world = YOLOWorld(model_id="yolo_world/v2-s")
# yolo_world = YOLOWorld(model_id="yolo_world/v2-m")
# yolo_world = YOLOWorld(model_id="yolo_world/v2-l")
# yolo_world = YOLOWorld(model_id="yolo_world/v2-x")

device = "cuda" if torch.cuda.is_available() else "cpu"
sam = EfficientViTSamPredictor(
    create_sam_model(name="xl1", weight_url="./weights/xl1.pt").to(device).eval()
)

# Load annotators.
BOUNDING_BOX_ANNOTATOR = sv.BoundingBoxAnnotator()
MASK_ANNOTATOR = sv.MaskAnnotator()
LABEL_ANNOTATOR = sv.LabelAnnotator()


def detect(
    image: np.ndarray,
    query: str,
    confidence_threshold: float,
    nms_threshold: float,
) -> np.ndarray:
    # Preparation: parse the comma-separated query into category names.
    categories = [category.strip() for category in query.split(",")]
    yolo_world.set_classes(categories)

    # Object detection: open-vocabulary inference, then class-agnostic NMS.
    results = yolo_world.infer(image, confidence=confidence_threshold)
    detections = sv.Detections.from_inference(results).with_nms(
        class_agnostic=True, threshold=nms_threshold
    )

    # Segmentation: prompt EfficientViT-SAM with each detected box.
    sam.set_image(image, image_format="RGB")
    masks = []
    for xyxy in detections.xyxy:
        # multimask_output=False returns a single (1, H, W) mask per box.
        mask, _, _ = sam.predict(box=xyxy, multimask_output=False)
        masks.append(mask.squeeze())
    detections.mask = np.array(masks)

    # Annotation: draw masks, boxes, and labels on a BGR copy of the image.
    output_image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    labels = [
        f"{categories[class_id]}: {confidence:.3f}"
        for class_id, confidence in zip(detections.class_id, detections.confidence)
    ]
    output_image = MASK_ANNOTATOR.annotate(output_image, detections)
    output_image = BOUNDING_BOX_ANNOTATOR.annotate(output_image, detections)
    output_image = LABEL_ANNOTATOR.annotate(output_image, detections, labels=labels)
    return cv2.cvtColor(output_image, cv2.COLOR_BGR2RGB)


with gr.Blocks() as demo:
    gr.Markdown(MARKDOWN)
    with gr.Row():
        with gr.Column():
            input_image_component = gr.Image(type="numpy", label="Input Image")
            image_categories_text_component = gr.Textbox(
                label="Categories",
                placeholder="Enter one or more category names separated by commas (,)",
            )
            with gr.Accordion("YOLO-World", open=False):
                confidence_threshold_component = gr.Slider(
                    minimum=0,
                    maximum=1,
                    value=0.005,
                    step=0.01,
                    label="Confidence Threshold",
                )
                iou_threshold_component = gr.Slider(
                    minimum=0,
                    maximum=1,
                    value=0.5,
                    step=0.01,
                    label="NMS Threshold",
                )
        yolo_world_output_image_component = gr.Image(type="numpy", label="Output Image")
    submit_button_component = gr.Button(value="Submit", scale=1, variant="primary")

    gr.Examples(
        fn=detect,
        examples=[
            [
                os.path.join(os.path.dirname(__file__), "examples/livingroom.jpg"),
                "table, lamp, dog, sofa, plant, clock, carpet, frame on the wall",
                0.05,
                0.5,
            ],
            [
                os.path.join(os.path.dirname(__file__), "examples/cat_and_dogs.jpg"),
                "cat, dog",
                0.2,
                0.5,
            ],
        ],
        inputs=[
            input_image_component,
            image_categories_text_component,
            confidence_threshold_component,
            iou_threshold_component,
        ],
        outputs=yolo_world_output_image_component,
    )

    submit_button_component.click(
        fn=detect,
        inputs=[
            input_image_component,
            image_categories_text_component,
            confidence_threshold_component,
            iou_threshold_component,
        ],
        outputs=yolo_world_output_image_component,
    )

demo.launch(debug=False, show_error=True)
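
# --- Programmatic usage (sketch) ---------------------------------------------
# A minimal sketch of calling detect() directly, e.g. from a separate script,
# kept as comments so it does not run when the demo is launched. The image path
# is one of the bundled examples; the category string and thresholds are
# illustrative assumptions. Note that cv2.imread returns BGR, while detect()
# expects RGB input (it converts back to BGR internally before annotating).
#
#   bgr = cv2.imread("examples/livingroom.jpg")
#   rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
#   annotated_rgb = detect(rgb, "table, lamp, sofa", 0.05, 0.5)
#   cv2.imwrite("annotated.jpg", cv2.cvtColor(annotated_rgb, cv2.COLOR_RGB2BGR))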