Spaces:

Navsatitagain
/

empath-ai

Running

App Files Files Community

Navsatitagain commited on 20 days ago

Commit

204bcef

verified ·

1 Parent(s): cab3375

Update app.py

Browse files

Files changed (1) hide show

app.py +60 -6

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import gradio as gr
 from transformers import pipeline
 from PIL import Image
 TEXT_MODEL  = "j-hartmann/emotion-english-distilroberta-base"
 IMAGE_MODEL = "trpakov/vit-face-expression"
@@ -11,10 +12,11 @@ image_pipe = pipeline("image-classification", model=IMAGE_MODEL, top_k=None)
 audio_pipe = pipeline("audio-classification", model=AUDIO_MODEL, top_k=None)
 def _as_label_dict(preds):
     preds_sorted = sorted(preds, key=lambda p: p["score"], reverse=True)
     return {p["label"]: float(round(p["score"], 4)) for p in preds_sorted}
 def analyze_text(text: str):
     if not text or not text.strip():
         return {"(enter some text)": 1.0}
@@ -34,15 +36,56 @@ def analyze_face(img):
 def analyze_voice(audio_path):
     if audio_path is None:
         return {"(no audio)": 1.0}
-    preds = audio_pipe(audio_path)
     return _as_label_dict(preds)
 with gr.Blocks(title="Empath AI — Multimodal Emotion Detection") as demo:
     gr.Markdown(
         """
-        # Empath AI — Emotion Detection (Text • Face • Voice)
-        Grant permission when the browser asks for **camera/microphone**.
-        Nothing is stored; analysis happens in memory and the scores are shown back to you.
         """
     )
@@ -65,4 +108,15 @@ with gr.Blocks(title="Empath AI — Multimodal Emotion Detection") as demo:
         a_out = gr.Label(num_top_classes=3)
         a_btn.click(analyze_voice, inputs=a_in, outputs=a_out)
-demo.launch()

 import gradio as gr
 from transformers import pipeline
 from PIL import Image
+import imageio
 TEXT_MODEL  = "j-hartmann/emotion-english-distilroberta-base"
 IMAGE_MODEL = "trpakov/vit-face-expression"
 audio_pipe = pipeline("audio-classification", model=AUDIO_MODEL, top_k=None)
 def _as_label_dict(preds):
+    """Convert HF predictions to {label: score} sorted desc."""
     preds_sorted = sorted(preds, key=lambda p: p["score"], reverse=True)
     return {p["label"]: float(round(p["score"], 4)) for p in preds_sorted}
+# ---------- Text ----------
 def analyze_text(text: str):
     if not text or not text.strip():
         return {"(enter some text)": 1.0}
 def analyze_voice(audio_path):
     if audio_path is None:
         return {"(no audio)": 1.0}
+    preds = audio_pipe(audio_path)
     return _as_label_dict(preds)
+def analyze_video(video_path, sample_fps=2, max_frames=120):
+    """
+    Read the video, sample ~sample_fps frames/second (up to max_frames),
+    run face-expression model on each, and return the average scores.
+    """
+    if video_path is None:
+        return {"(no video)": 1.0}, "No file provided."
+    try:
+        reader = imageio.get_reader(video_path)
+        meta = reader.get_meta_data()
+        fps = int(meta.get("fps", 25))
+        step = max(int(round(fps / max(1, sample_fps))), 1)
+        totals = {}
+        used = 0
+        for i, frame in enumerate(reader):
+            if i % step != 0:
+                continue
+            if used >= max_frames:
+                break
+            pil = Image.fromarray(frame)
+            preds = image_pipe(pil)
+            for p in preds:
+                label = p["label"]
+                totals[label] = totals.get(label, 0.0) + float(p["score"])
+            used += 1
+        if used == 0:
+            return {"(no frames sampled)": 1.0}, "Could not sample frames; try a shorter/different video."
+        avg = {k: round(v / used, 4) for k, v in totals.items()}
+        avg_sorted = dict(sorted(avg.items(), key=lambda x: x[1], reverse=True))
+        info = f"Frames analyzed: {used}  •  Sampling ≈{sample_fps} fps  •  Max frames: {max_frames}"
+        return avg_sorted, info
+    except Exception as e:
+        return {"(error)": 1.0}, f"Video read error: {e}"
 with gr.Blocks(title="Empath AI — Multimodal Emotion Detection") as demo:
     gr.Markdown(
         """
+        # Empath AI — Emotion Detection (Text • Face • Voice • Video)
+        - Allow **camera** and **microphone** permissions when prompted.
+        - Keep videos **short (≤15s)** for faster results.
+        - No data is stored; analysis happens in memory and results are shown back to you.
         """
     )
         a_out = gr.Label(num_top_classes=3)
         a_btn.click(analyze_voice, inputs=a_in, outputs=a_out)
+    with gr.Tab("Video (Record or Upload)"):
+        # Gradio will show a camera-record button and an upload option.
+        v_in  = gr.Video(sources=["webcam", "upload"], label="Record or upload a short video (≤15s)", height=280)
+        with gr.Row():
+            fps = gr.Slider(1, 5, value=2, step=1, label="Sampling FPS (frames analyzed per second)")
+            maxf = gr.Slider(30, 240, value=120, step=10, label="Max Frames to Analyze")
+        v_btn = gr.Button("Analyze Video", variant="primary")
+        v_out = gr.Label(num_top_classes=3, label="Average Emotion (video)")
+        v_info = gr.Markdown()
+        v_btn.click(analyze_video, inputs=[v_in, fps, maxf], outputs=[v_out, v_info])
+demo.launch()