Spaces:

abrar-adnan
/

speech-analyzer

Running

App Files Files Community

abrar-adnan committed on Mar 15, 2023

Commit

378962d

1 Parent(s): 7dfcf08

Upload app.py

Browse files

Files changed (1) hide show

app.py +57 -45

app.py CHANGED Viewed

@@ -43,9 +43,6 @@ def getTranscription(path):
     # Insert Local Audio File Path
     clip.audio.write_audiofile(r"audio.wav")
-    waveform, sample_rate = torchaudio.load("audio.wav")
-    waveform, sample_rate
     waveform, sample_rate = torchaudio.load("audio.wav")
     resampler = torchaudio.transforms.Resample(sample_rate, 16000)
     waveform = resampler(waveform)[0]
@@ -61,6 +58,37 @@ def getTranscription(path):
     return transcription[0]
 def video_processing(video_file, encoded_video):
     angry = 0
     disgust = 0
@@ -104,48 +132,25 @@ def video_processing(video_file, encoded_video):
         # If there are no more frames, break out of the loop
         if not ret:
             break
-        # Convert the frame to RGB color (face_recognition uses RGB)
-        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
-        # Find all the faces in the frame using a pre-trained convolutional neural network.
-        face_locations = face_recognition.face_locations(gray)
-        #face_locations = face_recognition.face_locations(gray, number_of_times_to_upsample=0, model="cnn")
-        if len(face_locations) > 0:
-            # Show the original frame with face rectangles drawn around the faces
-            for top, right, bottom, left in face_locations:
-                # cv2.rectangle(frame, (left, top), (right, bottom), (0, 0, 255), 2)
-                face_image = gray[top:bottom, left:right]
-                color_image = frame[top:bottom, left:right]
-                # Resize the face image to the desired size
-                resized_face_image = cv2.resize(face_image, (128,128))
-                try:
-                    emotion = DeepFace.analyze(color_image,actions=['emotion'],detector_backend = backends[2],enforce_detection = False)# 2,3, 4 works
-                    emotion_count += 1
-                except Exception as e:
-                    pass
-                print(emotion[0]['emotion'])
-                angry += emotion[0]['emotion']['angry']
-                disgust += emotion[0]['emotion']['disgust']
-                fear += emotion[0]['emotion']['fear']
-                happy += emotion[0]['emotion']['happy']
-                sad += emotion[0]['emotion']['sad']
-                surprise += emotion[0]['emotion']['surprise']
-                neutral += emotion[0]['emotion']['neutral']
-                # Predict the class of the resized face image using the model
-                result = model.predict(resized_face_image)
-                print(result[0])
-                if(result[0] == 'on_camera'): on_camera = on_camera + 1
-                elif(result[0] == 'off_camera'): off_camera = off_camera + 1
-                total = total + 1
     try:
         # your processing code here
@@ -179,7 +184,14 @@ def video_processing(video_file, encoded_video):
         'sad': sad,
         'surprise': surprise,
         'neutral': neutral
-     },
     # angry = 'total anger percentage' + str(angry)
     # disgust = 'total disgust percentage' + str(disgust)
@@ -196,7 +208,7 @@ def video_processing(video_file, encoded_video):
     print(f'total surprise percentage = {surprise}')
     print(f'total neutral percentage = {neutral}')
     final_result = "Gaze = "+str(gaze_percentage)+"\nFace Emotion = "+str(emotion)+"\nText Emotion = "+str(text_emotion)+"\nText transcription = "+str(transcription)+"\nText sentiment = "+str(text_sentiment)
-    return final_result
 demo = gr.Interface(fn=video_processing,

     # Insert Local Audio File Path
     clip.audio.write_audiofile(r"audio.wav")
     waveform, sample_rate = torchaudio.load("audio.wav")
     resampler = torchaudio.transforms.Resample(sample_rate, 16000)
     waveform = resampler(waveform)[0]
     return transcription[0]
+def process_frame(frame):
+    # Convert the frame to RGB color (face_recognition uses RGB)
+        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
+        # Find all the faces in the frame using a pre-trained convolutional neural network.
+        face_locations = face_recognition.face_locations(gray)
+        #face_locations = face_recognition.face_locations(gray, number_of_times_to_upsample=0, model="cnn")
+        if len(face_locations) > 0:
+            # Show the original frame with face rectangles drawn around the faces
+            for top, right, bottom, left in face_locations:
+                # cv2.rectangle(frame, (left, top), (right, bottom), (0, 0, 255), 2)
+                face_image = gray[top:bottom, left:right]
+                color_image = frame[top:bottom, left:right]
+                # Resize the face image to the desired size
+                resized_face_image = cv2.resize(face_image, (128,128))
+                try:
+                    emotion = DeepFace.analyze(color_image,actions=['emotion'],detector_backend = backends[2],enforce_detection = False)# 2,3, 4 works
+                    emotion_count += 1
+                except Exception as e:
+                    emotion = 0
+                    pass
+                # Predict the class of the resized face image using the model
+                result = model.predict(resized_face_image)
+                print(result[0])
+                return result[0], emotion
 def video_processing(video_file, encoded_video):
     angry = 0
     disgust = 0
         # If there are no more frames, break out of the loop
         if not ret:
             break
+        result, emotion = process_frame(frame)
+        if result:
+            if result == 'on_camera':
+                on_camera += 1
+            elif result == 'off_camera':
+                off_camera += 1
+            total += 1
+        if emotion != 0:
+            print(emotion[0]['emotion'])
+            angry += emotion[0]['emotion']['angry']
+            disgust += emotion[0]['emotion']['disgust']
+            fear += emotion[0]['emotion']['fear']
+            happy += emotion[0]['emotion']['happy']
+            sad += emotion[0]['emotion']['sad']
+            surprise += emotion[0]['emotion']['surprise']
+            neutral += emotion[0]['emotion']['neutral']
     try:
         # your processing code here
         'sad': sad,
         'surprise': surprise,
         'neutral': neutral
+     },
+    final_result_dict = {
+        "gaze_percentage" : gaze_percentage,
+        "face_emotion" : emotion,
+        "text_emotion" : text_emotion,
+        "transcription" : transcription,
+        "text_sentiment" : text_sentiment
+    }
     # angry = 'total anger percentage' + str(angry)
     # disgust = 'total disgust percentage' + str(disgust)
     print(f'total surprise percentage = {surprise}')
     print(f'total neutral percentage = {neutral}')
     final_result = "Gaze = "+str(gaze_percentage)+"\nFace Emotion = "+str(emotion)+"\nText Emotion = "+str(text_emotion)+"\nText transcription = "+str(transcription)+"\nText sentiment = "+str(text_sentiment)
+    return final_result_dict
 demo = gr.Interface(fn=video_processing,