import cv2
import numpy as np

# Files describing the SSD detector, in the order (model, configuration).
# NOTE(review): a ".prototxt" name is typical of Caffe deployments, yet this
# file is handed to a TensorFlow loader further down — confirm it really is
# the TensorFlow graph description that belongs to the .pb file.
model_path, config_path = (
    "ssd_mobilenet_v2_coco.pb",  # pre-trained SSD model
    "deploy.prototxt",           # deploy prototxt file
)

# COCO class labels, indexed directly by the class id the detector reports.
# The COCO label map numbers its categories 1..90 but leaves ten of those ids
# unassigned (12, 26, 29, 30, 45, 66, 68, 69, 71, 83); each gap is filled with
# a 'none' placeholder so that CLASSES[class_id] stays aligned. Index 0 is the
# background class. The table must therefore contain exactly 91 entries.
CLASSES = [
    # 0-9
    'background', 'person', 'bicycle', 'car', 'motorcycle',
    'airplane', 'bus', 'train', 'truck', 'boat',
    # 10-19
    'traffic light', 'fire hydrant', 'none', 'stop sign', 'parking meter',
    'bench', 'bird', 'cat', 'dog', 'horse',
    # 20-29
    'sheep', 'cow', 'elephant', 'bear', 'zebra',
    'giraffe', 'none', 'backpack', 'umbrella', 'none',
    # 30-39
    'none', 'handbag', 'tie', 'suitcase', 'frisbee',
    'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat',
    # 40-49
    'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle',
    'none', 'wine glass', 'cup', 'fork', 'knife',
    # 50-59
    'spoon', 'bowl', 'banana', 'apple', 'sandwich',
    'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
    # 60-69
    'donut', 'cake', 'chair', 'couch', 'potted plant',
    'bed', 'none', 'dining table', 'none', 'none',
    # 70-79
    'toilet', 'none', 'tv', 'laptop', 'mouse',
    'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
    # 80-90
    'toaster', 'sink', 'refrigerator', 'none', 'book',
    'clock', 'vase', 'scissors', 'teddy bear', 'hair drier',
    'toothbrush',
]

# Initialize the OpenCV DNN network once at import time from the files named
# by model_path and config_path; the resulting `net` is reused for every frame.
# NOTE(review): config_path is 'deploy.prototxt', a name usually seen with
# Caffe models, while this is the TensorFlow loader — confirm the file is the
# TensorFlow graph description that matches the .pb weights.
net = cv2.dnn.readNetFromTensorflow(model_path, config_path)

# Function to process the video frame and detect objects
def detect_objects_in_frame(frame, confidence_threshold=0.5):
    """Run the SSD network on ``frame`` and draw each confident detection on it.

    Args:
        frame: Image array (e.g. as returned by ``cv2.VideoCapture.read``).
            Its first two shape entries are taken as (height, width). The
            array is annotated in place.
        confidence_threshold: A detection is drawn only when its score is
            strictly greater than this value. Defaults to 0.5.

    Returns:
        The same ``frame`` object, with a green box and a
        ``"<class name>: <score>"`` label drawn for every kept detection.
    """
    height, width = frame.shape[:2]

    # Resize to the 300x300 network input, subtract the per-channel mean and
    # swap BGR -> RGB.
    # NOTE(review): mean 127.5 combined with scale 1.0 leaves pixel values in
    # roughly [-127.5, 127.5]; confirm this matches the preprocessing the
    # exported model expects (OpenCV's sample settings for TensorFlow SSD
    # models use a zero mean).
    blob = cv2.dnn.blobFromImage(frame, 1.0, (300, 300), (127.5, 127.5, 127.5), swapRB=True, crop=False)

    net.setInput(blob)
    detections = net.forward()

    # Each row of detections[0, 0] is read as
    # [unused, class_id, confidence, left, top, right, bottom], with the box
    # corners expressed as fractions of the frame size.
    for detection in detections[0, 0]:
        confidence = detection[2]
        if not confidence > confidence_threshold:  # also skips NaN scores
            continue

        class_id = int(detection[1])
        left = int(detection[3] * width)
        top = int(detection[4] * height)
        right = int(detection[5] * width)
        bottom = int(detection[6] * height)

        # An id outside the label table must not crash the video loop (too
        # large) or silently pick a label from the end of the list (negative).
        if 0 <= class_id < len(CLASSES):
            class_name = CLASSES[class_id]
        else:
            class_name = f"id {class_id}"
        label = f"{class_name}: {confidence:.2f}"

        # Put the label above the box, or just inside it when the box touches
        # the top of the frame and the text would otherwise be cut off.
        label_y = top - 10 if top - 10 >= 10 else top + 15

        cv2.rectangle(frame, (left, top), (right, bottom), (0, 255, 0), 2)
        cv2.putText(frame, label, (left, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

    return frame

# Capture video from a file or camera (0 for the default camera)
cap = cv2.VideoCapture(0)  # Use 0 for webcam or provide a path to a video file

# Check if the video capture is initialized correctly
if not cap.isOpened():
    print("Error: Could not open video stream.")
    # `exit()` is a helper injected by the `site` module for interactive use
    # and would report success; raise SystemExit with a failing status instead.
    raise SystemExit(1)

try:
    while True:
        # Read a new frame from the video feed
        ret, frame = cap.read()

        if not ret:
            print("Error: Failed to read frame from video stream.")
            break

        # Detect objects in the current frame
        output_frame = detect_objects_in_frame(frame)

        # Display the resulting frame
        cv2.imshow("Detected Objects in Video", output_frame)

        # Break the loop if the user presses the 'Esc' key
        if cv2.waitKey(1) & 0xFF == 27:  # 27 is the keycode for 'Esc'
            break
finally:
    # Always hand the capture device back and close the preview window, even
    # when the loop ends through an exception or Ctrl+C.
    cap.release()
    cv2.destroyAllWindows()