File size: 3,802 Bytes
04f9904
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import os
import shutil
import tempfile

import cv2
import torch
from facenet_pytorch import MTCNN
from PIL import Image
from torchvision import transforms

def process_emotion(input_video):
    """Annotate each detected face in a video with a predicted emotion label.

    Args:
        input_video: An object exposing a ``.name`` attribute that holds the
            path to a video file readable by OpenCV (e.g. an upload/tempfile
            handle).

    Returns:
        str: The path to the annotated output video (``output_video.mp4``) on
        success, or a human-readable error message string on failure.

    Note:
        Only frames containing at least one detected face are written to the
        output, so the result may be shorter than the input.
    """
    temp_dir = None
    try:
        # Build a ResNet-18 backbone and replace the classifier head with a
        # 7-way linear layer for the emotion classes listed below.
        model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=True)
        num_ftrs = model.fc.in_features
        model.fc = torch.nn.Linear(num_ftrs, 7)  # 7 classes for emotions

        # NOTE(review): no emotion-specific checkpoint is loaded here — the
        # replacement fc layer keeps its random initialization, so the
        # predictions below are not meaningful until a trained state_dict is
        # loaded. TODO: load the emotion weights.
        model.eval()

        # Class-index -> label mapping for the 7-way head.
        emotions = ["Angry", "Disgust", "Fear", "Happy", "Sad", "Surprise", "Neutral"]

        # MTCNN face detector; detect() returns one bounding box per face,
        # or None when no face is found.
        mtcnn = MTCNN()

        cap = cv2.VideoCapture(input_video.name)
        if not cap.isOpened():
            return "Error: Could not open video capture."

        # Preserve the source frame rate in the output; fall back to 30 fps
        # when the container does not report one (the original hard-coded 30,
        # which sped up / slowed down any non-30fps input).
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps or fps <= 0:
            fps = 30

        # Scratch directory for the annotated frames; removed in `finally`.
        temp_dir = tempfile.mkdtemp()
        frame_paths = []

        # Standard ImageNet preprocessing to match the ResNet-18 backbone.
        preprocess = transforms.Compose([
            transforms.Resize(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

        frame_count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            if frame is None:
                print("Error: Empty frame.")
                continue

            frame_count += 1

            # MTCNN expects RGB; OpenCV frames are BGR.
            pil_frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            face_boxes, _ = mtcnn.detect(pil_frame)

            if face_boxes is None:
                print(f"No face detected in frame {frame_count}.")
                continue

            frame_h, frame_w = frame.shape[:2]
            annotated = False
            for box in face_boxes:
                x1, y1, x2, y2 = box.astype(int)
                # Clamp the box to the frame: MTCNN can return coordinates
                # slightly outside the image, and negative indices would
                # silently crop the wrong region (or yield an empty ROI that
                # crashes Image.fromarray).
                x1, y1 = max(x1, 0), max(y1, 0)
                x2, y2 = min(x2, frame_w), min(y2, frame_h)
                if x2 <= x1 or y2 <= y1:
                    continue  # degenerate box — nothing to classify

                face_roi = frame[y1:y2, x1:x2]
                pil_face = Image.fromarray(cv2.cvtColor(face_roi, cv2.COLOR_BGR2RGB))

                input_batch = preprocess(pil_face).unsqueeze(0)
                with torch.no_grad():
                    output = model(input_batch)
                emotion_label = emotions[output.argmax().item()]

                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                cv2.putText(frame, emotion_label, (x1, y1 - 10),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
                annotated = True

            # Write the frame ONCE after all boxes are drawn. The original
            # wrote it inside the per-box loop, saving duplicates of
            # partially-annotated frames and repeating them in the output.
            if annotated:
                frame_path = os.path.join(temp_dir, f"{frame_count:04d}.png")
                cv2.imwrite(frame_path, frame)
                frame_paths.append(frame_path)

        cap.release()
        cv2.destroyAllWindows()

        if not frame_paths:
            return "No faces detected in the video."

        # Stitch the annotated frames back into a video.
        output_path = "output_video.mp4"
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')

        # Get dimensions from the first frame.
        sample_frame = cv2.imread(frame_paths[0])
        if sample_frame is None:
            return "Error: Unable to read sample frame."
        frame_height, frame_width, _ = sample_frame.shape

        out = cv2.VideoWriter(output_path, fourcc, fps, (frame_width, frame_height))
        for frame_path in frame_paths:
            frame = cv2.imread(frame_path)
            if frame is not None:
                out.write(frame)
            else:
                print(f"Warning: Unable to read frame {frame_path}")
        out.release()

        return output_path

    except Exception as e:
        return f"An error occurred: {str(e)}"
    finally:
        # Remove the scratch directory and any frames left in it — the
        # original leaked the mkdtemp directory on every call.
        if temp_dir is not None:
            shutil.rmtree(temp_dir, ignore_errors=True)