import gradio as gr
import os
import cv2
import face_recognition
from fastai.vision.all import load_learner
import time
import base64
from deepface import DeepFace
import torchaudio
import moviepy.editor as mp
from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline

# import pathlib
# temp = pathlib.PosixPath
# pathlib.PosixPath = pathlib.WindowsPath

# Face-detector backends supported by DeepFace
backends = [
    'opencv',
    'ssd',
    'dlib',
    'mtcnn',
    'retinaface',
    'mediapipe'
]

# Text-based emotion and sentiment classifiers
emotion_pipeline = pipeline(
    "text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    return_all_scores=True,
)
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

# fastai classifier that labels a face crop as 'on_camera' or 'off_camera'
model = load_learner("gaze-recognizer-v3.pkl")


def analyze_emotion(text):
    return emotion_pipeline(text)


def analyze_sentiment(text):
    return sentiment_pipeline(text)


def getTranscription(path):
    # Extract the audio track from the video and write it to a local file
    clip = mp.VideoFileClip(path)
    clip.audio.write_audiofile(r"audio.wav")

    # Resample the audio to the 16 kHz expected by Whisper
    waveform, sample_rate = torchaudio.load("audio.wav")
    resampler = torchaudio.transforms.Resample(sample_rate, 16000)
    waveform = resampler(waveform)[0]

    processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
    whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
    whisper_model.config.forced_decoder_ids = None

    # Note: Whisper's feature extractor pads/truncates the input to a 30-second window
    input_features = processor(
        waveform.squeeze(dim=0), sampling_rate=16000, return_tensors="pt"
    ).input_features
    predicted_ids = whisper_model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

    return transcription[0]


def process_frame(frame):
    # Convert the frame to grayscale for face detection and gaze classification
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    # Find all the faces in the frame (HOG detector by default; the CNN variant is commented out below)
    face_locations = face_recognition.face_locations(gray)
    # face_locations = face_recognition.face_locations(gray, number_of_times_to_upsample=0, model="cnn")

    if len(face_locations) > 0:
        # Crop each detected face from the frame (drawing rectangles is left commented out)
        for top, right, bottom, left in face_locations:
            # cv2.rectangle(frame, (left, top), (right, bottom), (0, 0, 255), 2)
            face_image = gray[top:bottom, left:right]
            color_image = frame[top:bottom, left:right]

            # Resize the face crop to the size expected by the gaze model
            resized_face_image = cv2.resize(face_image, (128, 128))

            try:
                emotion = DeepFace.analyze(
                    color_image,
                    actions=['emotion'],
                    detector_backend=backends[2],  # 2, 3, 4 work
                    enforce_detection=False,
                )
                print(emotion)
            except Exception:
                emotion = 0

            # Predict the gaze class of the resized face image using the model
            result = model.predict(resized_face_image)
            print(result[0])
            return result[0], emotion

    # No face found in this frame
    return None, 0


def video_processing(video_file, encoded_video):
    angry = 0
    disgust = 0
    fear = 0
    happy = 0
    sad = 0
    surprise = 0
    neutral = 0
    emotion_count = 0

    # If a base64-encoded video was passed in, decode it to a temporary file
    if encoded_video != "":
        decoded_file_data = base64.b64decode(encoded_video)
        with open("temp_video.mp4", "wb") as f:
            f.write(decoded_file_data)
        video_file = "temp_video.mp4"

    start_time = time.time()

    transcription = getTranscription(video_file)
    print(transcription)
    text_emotion = analyze_emotion(transcription)
    print(text_emotion)
    text_sentiment = analyze_sentiment(transcription)
    print(text_sentiment)

    video_capture = cv2.VideoCapture(video_file)
    on_camera = 0
    off_camera = 0
    total = 0

    while True:
        # Skip ahead roughly three seconds (assuming ~24 fps) and keep the last frame read
        for i in range(24 * 3):
            ret, frame = video_capture.read()
            if not ret:
                break

        # If there are no more frames, break out of the loop
        if not ret:
            break

        result, emotion = process_frame(frame)
        print(emotion)

        if result:
            if result == 'on_camera':
                on_camera += 1
            elif result == 'off_camera':
                off_camera += 1
            total += 1

        if emotion != 0:
            # print(emotion[0]['emotion'])
            angry += emotion[0]['emotion']['angry']
            disgust += emotion[0]['emotion']['disgust']
            fear += emotion[0]['emotion']['fear']
            happy += emotion[0]['emotion']['happy']
            sad += emotion[0]['emotion']['sad']
            surprise += emotion[0]['emotion']['surprise']
            neutral += emotion[0]['emotion']['neutral']
            emotion_count += 1

    try:
        gaze_percentage = on_camera / total * 100
    except Exception as e:
        print(f"An error occurred while processing the video: {e}")
        gaze_percentage = f'no face detected Total = {total}, on_camera = {on_camera}, off_camera = {off_camera}'
    print(f'Total = {total}, on_camera = {on_camera}, off_camera = {off_camera}')
    # print(f'focus percentage = {on_camera / total * 100}')

    # Release the video capture object and close all windows
    video_capture.release()
    cv2.destroyAllWindows()
    end_time = time.time()
    print(f'Time taken: {end_time - start_time}')
    if os.path.exists("temp_video.mp4"):
        os.remove("temp_video.mp4")

    print(gaze_percentage)

    # Average the per-frame DeepFace scores; guard against division by zero
    # when no emotion was detected in any sampled frame
    if emotion_count > 0:
        angry = angry / emotion_count
        disgust = disgust / emotion_count
        fear = fear / emotion_count
        happy = happy / emotion_count
        sad = sad / emotion_count
        surprise = surprise / emotion_count
        neutral = neutral / emotion_count

    emotion = {
        'angry': angry,
        'disgust': disgust,
        'fear': fear,
        'happy': happy,
        'sad': sad,
        'surprise': surprise,
        'neutral': neutral
    }

    final_result_dict = {
        "gaze_percentage": gaze_percentage,
        "face_emotion": emotion,
        "text_emotion": text_emotion,
        "transcription": transcription,
        "text_sentiment": text_sentiment
    }
    # angry = 'total anger percentage' + str(angry)
    # disgust = 'total disgust percentage' + str(disgust)
    # fear = 'total fear percentage' + str(fear)
    # happy = 'total happy percentage' + str(happy)
    # sad = 'total sad percentage' + str(sad)
    # surprise = 'total surprise percentage' + str(surprise)
    # neutral = 'total neutral percentage' + str(neutral)

    print(f'total anger percentage = {angry}')
    print(f'total disgust percentage = {disgust}')
    print(f'total fear percentage = {fear}')
    print(f'total happy percentage = {happy}')
    print(f'total sad percentage = {sad}')
    print(f'total surprise percentage = {surprise}')
    print(f'total neutral percentage = {neutral}')

    final_result = (
        "Gaze = " + str(gaze_percentage)
        + "\nFace Emotion = " + str(emotion)
        + "\nText Emotion = " + str(text_emotion)
        + "\nText transcription = " + str(transcription)
        + "\nText sentiment = " + str(text_sentiment)
    )

    return final_result_dict


demo = gr.Interface(fn=video_processing, inputs=["video", "text"], outputs="json")

if __name__ == "__main__":
    demo.launch()
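
# Minimal sketch of calling video_processing directly with a base64-encoded clip,
# which is how the second ("text") input is consumed above. "sample_interview.mp4"
# is a hypothetical placeholder filename, not part of this repository:
#
#   with open("sample_interview.mp4", "rb") as f:
#       encoded = base64.b64encode(f.read()).decode("utf-8")
#   result = video_processing("", encoded)
#   print(result)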