import pprint

import gradio as gr
import librosa
import plotly.graph_objects as go
import spaces
import torch
from loguru import logger
from transformers import AutoFeatureExtractor
from transformers.modeling_outputs import SequenceClassifierOutput

from model import EmotionModel

repo_id = "my_model"
device = "cuda" if torch.cuda.is_available() else "cpu"
logger.info(f"device: {device}")
model = EmotionModel.from_pretrained(repo_id, device_map=device)
model.eval()
processor = AutoFeatureExtractor.from_pretrained(repo_id)

label_map = {
    "Angry": "😠 怒り",
    "Disgusted": "😒 嫌悪",
    "Embarrassed": "😳 戸惑い",
    "Fearful": "😨 恐怖",
    "Happy": "😊 幸せ",
    "Sad": "😢 悲しみ",
    "Surprised": "😲 驚き",
    "Neutral": "😐 中立",
    "Sexual1": "🥰 NSFW1",
    "Sexual2": "🍭 NSFW2",
}


@spaces.GPU
def pipe(filename: str) -> tuple[dict[str, float], go.Figure]:
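    """Predict emotion scores for a single audio file.

    Returns a label -> probability dict for gr.Label and a horizontal
    bar chart of the raw logits for gr.Plot.
    """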
    audio, sr = librosa.load(filename, sr=16000)
    duration = librosa.get_duration(y=audio, sr=sr)
    logger.info(f"filename: {filename}, duration: {duration}")
    if duration > 30.0:
        return (
            {f"Error: 音声ファイルの長さが長すぎます: {duration}秒": 0.0},
            go.Figure(),
        )
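    # Extract model inputs and move them to the same device as the model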
    inputs = processor(audio, sampling_rate=sr, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs: SequenceClassifierOutput = model(**inputs)
    logits = outputs.logits  # shape: (batch_size, num_labels)
    # Move the logits to the CPU and drop the batch dimension
    logits = logits[0].cpu().numpy()
    labels = [label_map[label] for label in model.config.id2label.values()]
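    # Sort ascending so the largest logit is drawn at the top of the horizontal bar chart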
    sorted_pairs = sorted(zip(logits, labels), key=lambda x: x[0])
    sorted_logits, sorted_labels = zip(*sorted_pairs)
    logger.info(f"Result:\n{pprint.pformat(sorted_pairs)}")
    probabilities = outputs.logits.softmax(dim=-1)
    scores_dict = {label: prob.item() for label, prob in zip(labels, probabilities[0])}
    fig = go.Figure([go.Bar(x=sorted_logits, y=sorted_labels, orientation="h")])
    return scores_dict, fig
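
# Minimal local smoke test (a sketch; "sample.wav" is a hypothetical path):
#   scores, fig = pipe("sample.wav")
#   print(max(scores, key=scores.get))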


md = """
# 音声からの感情認識 ver 0.1

- 音声ファイルから感情を予測して、確率とlogits (softmax前の値) を表示します
- 30秒を超える音声ファイルは受け付けません
"""

with gr.Blocks() as app:
    gr.Markdown(md)
    audio = gr.Audio(type="filepath")
    btn = gr.Button("感情を予測")
    with gr.Row():
        result = gr.Label(label="結果")
        plot = gr.Plot(label="Logits")

    btn.click(pipe, inputs=audio, outputs=[result, plot])

if __name__ == "__main__":
    app.launch()