File size: 2,163 Bytes
c165076
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f21fcbc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c165076
 
 
 
 
 
 
 
f21fcbc
 
 
 
 
c165076
f21fcbc
17c370b
 
 
c165076
d2ef383
c165076
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import gradio
import torchaudio
from fastai.vision.all import *
from fastai.learner import load_learner
from torchvision.utils import save_image
from huggingface_hub import hf_hub_download


# Fetch the exported learner from the Hugging Face Hub, then load it.
# NOTE(review): "model.pkl" is presumably a pickle; loading one executes code
# from the file, so this should only ever point at a trusted repository.
_model_file = hf_hub_download(
    "kurianbenoy/music_genre_classification_baseline",
    "model.pkl",
)
model = load_learner(_model_file)

# Directory whose entries are offered as clickable examples in the UI.
EXAMPLES_PATH = Path("./examples")

# Class vocabulary of the loaded learner; used to name the predicted scores.
labels = model.dls.vocab

# One "<examples dir>/<file name>" string per entry found in EXAMPLES_PATH.
_example_files = [
    f"{EXAMPLES_PATH}/{entry.name}" for entry in EXAMPLES_PATH.iterdir()
]

# Keyword arguments forwarded to gradio.Interface.
interface_options = dict(
    title="Music Genre Classification",
    description="A simple baseline model for classifying music genres with fast.ai on [Kaggle competition data](https://www.kaggle.com/competitions/kaggle-pog-series-s01e02/data)",
    examples=_example_files,
    interpretation="default",
    layout="horizontal",
    theme="default",
)

# FFT/window size and hop length used when computing the mel spectrogram.
N_FFT = 2048
HOP_LEN = 1024


def create_spectrogram(filename):
    """Load an audio file and return its mel spectrogram in dB, scaled to [0, 1].

    The waveform is read with ``torchaudio.load``, converted to a 224-band
    mel power spectrogram (window/FFT size ``N_FFT``, hop ``HOP_LEN``),
    averaged over axis 0, converted to decibels and min-max normalized.

    Args:
        filename: Path (or other source accepted by ``torchaudio.load``) of
            the audio to analyze.

    Returns:
        The normalized spectrogram tensor. If the dB spectrogram is constant
        (e.g. completely silent audio) the result is all zeros rather than
        NaN.
    """
    audio, sr = torchaudio.load(filename)

    # The transform depends on the file's own sample rate, so it is built
    # per call instead of once at module level.
    mel_transform = torchaudio.transforms.MelSpectrogram(
        sample_rate=sr,
        n_fft=N_FFT,
        win_length=N_FFT,
        hop_length=HOP_LEN,
        center=True,
        pad_mode="reflect",
        power=2.0,
        norm="slaney",
        onesided=True,
        n_mels=224,
        mel_scale="htk",
    )

    # Collapse axis 0 of the transform output (presumably the channel axis
    # returned by torchaudio.load) into a single spectrogram.
    specgram = mel_transform(audio).mean(axis=0)
    specgram = torchaudio.transforms.AmplitudeToDB()(specgram)

    # Min-max normalize. After the shift the minimum is 0, so a constant
    # spectrogram has a peak of 0; dividing by it would turn every value
    # into NaN, hence the guard.
    specgram = specgram - specgram.min()
    peak = specgram.max()
    if peak > 0:
        specgram = specgram / peak

    return specgram


def create_image(filename, dest="temp.png"):
    """Render the spectrogram of an audio file to an image file.

    Args:
        filename: Audio file to convert; passed to ``create_spectrogram``.
        dest: Where to write the image. Defaults to ``"temp.png"`` in the
            current working directory, which is the location existing
            callers read from.

    Returns:
        The ``Path`` the image was written to.
    """
    specgram = create_spectrogram(filename)
    dest = Path(dest)
    # Pass a plain string so the call is identical to the previous
    # hard-coded "temp.png" when the default is used.
    save_image(specgram, str(dest))
    return dest


def predict(img):
    """Classify a spectrogram image and return a score for every label.

    Args:
        img: Anything ``PILImage.create`` accepts (the pipeline passes a
            file path).

    Returns:
        A dict mapping each entry of the module-level ``labels`` to the
        corresponding model probability as a Python float.
    """
    image = PILImage.create(img)
    # model.predict returns three values; only the probabilities are needed.
    _, _, probabilities = model.predict(image)
    return {
        label: float(probabilities[idx]) for idx, label in enumerate(labels)
    }


def end2endpipeline(filename):
    """Run the full audio-to-genre pipeline for one audio file.

    Args:
        filename: Path of the audio file to classify.

    Returns:
        The label-to-probability dict produced by ``predict``.
    """
    # Step 1: write the spectrogram image for this audio to disk.
    create_image(filename)
    # Step 2: classify the image that was just written.
    genre_scores = predict("temp.png")
    return genre_scores


# Input component: audio captured from the microphone, handed to the
# prediction function as a file path.
audio_input = gradio.inputs.Audio(
    source="microphone",
    type="filepath",
    label="Record/ Drop audio",
)

# Output component: label widget limited to the five highest-scoring classes.
label_output = gradio.outputs.Label(num_top_classes=5)

demo = gradio.Interface(
    fn=end2endpipeline,
    inputs=audio_input,
    outputs=label_output,
    **interface_options,
)

# Keyword arguments forwarded to demo.launch.
launch_options = dict(
    enable_queue=True,
    share=False,
)

demo.launch(**launch_options)