# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# This file is copied & modified from UNESCO/MMS/blob/main/asr.py
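
"""Gradio demo for Meta Audiobox Aesthetics.

Predicts four aesthetic scores (Production Quality, Production Complexity,
Content Enjoyment, Content Usefulness) for recorded or uploaded audio and
plots them as a bar chart.
"""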

import gradio as gr
import numpy as np
import torch
import torchaudio
import json
from audiobox_aesthetics import infer as aes_infer
import plotly.graph_objects as go

EXAMPLES = ["assets/bach.wav"]

# Load the aesthetics predictor (None selects the default pretrained checkpoint).
aes_predictor = aes_infer.initialize_model(None)


def predict_aesthetics(audio_data=None):
    """Score microphone or uploaded audio and return a bar plot of the results."""
    if audio_data is None:
        raise gr.Error("Empty audio input.")

    if isinstance(audio_data, tuple):
        # Microphone input arrives as (sample_rate, int16 numpy array).
        sr, audio_samples = audio_data
        # Convert int16 PCM to float32 in [-1, 1].
        audio_samples = (audio_samples / 32768.0).astype(np.float32)
        audio_samples = torch.tensor(audio_samples)
        if audio_samples.ndim == 1:
            audio_samples = audio_samples[:, None]

        assert audio_samples.ndim == 2
        # Transpose to (channels, samples), matching torchaudio's layout.
        audio_samples = audio_samples.t()
    else:
        # File upload arrives as a path string.
        if not isinstance(audio_data, str):
            raise gr.Error(f"Invalid audio input type: {type(audio_data)}")
        audio_samples, sr = torchaudio.load(audio_data)

    # The predictor returns a JSON string of axis scores; parse it into a dict.
    scores = json.loads(
        aes_predictor.forward([{"path": audio_samples, "sample_rate": sr}])[0]
    )

    # Reorder the axes to match the figures in the paper.
    scores = {
        "Production Quality": scores["PQ"],
        "Production Complexity": scores["PC"],
        "Content Enjoyment": scores["CE"],
        "Content Usefulness": scores["CU"],
    }

    # Create a Plotly bar plot
    fig = go.Figure()
    colors = ["#b1d8ff", "#fee2f5", "#cefac4", "#d2d3ff"]
    values = list(scores.values())
    keys = list(scores.keys())

    fig.add_trace(
        go.Bar(
            x=keys,
            y=values,
            text=[f"{v:.2f}" for v in values],  # Format text to 2 decimal places
            textposition="outside",  # Position text outside the bars
            marker=dict(color=colors),
        )
    )

    # Set the range for the y-axis
    fig.update_layout(
        yaxis=dict(range=[0, 10]),
        xaxis_title="Metrics",
        yaxis_title="Scores",
    )
    return fig
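
# Note: forward() also accepts file-path inputs, e.g.
# aes_predictor.forward([{"path": "assets/bach.wav"}]) (per the
# audiobox_aesthetics README); each result is a JSON string with the
# PQ, PC, CE, and CU axis scores parsed above.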


main_interface = gr.Interface(
    fn=predict_aesthetics,
    inputs=[
        gr.Audio(),
    ],
    outputs=gr.Plot(),
    examples=EXAMPLES,
    title="Audiobox Aesthetics Prediction Demo",
    description="Record audio with the microphone or upload an audio file.",
    article="",
    allow_flagging="never",
)

disclaimer = """
## Disclaimer
"""

with gr.Blocks() as demo:
    gr.HTML(
        """
            <div style="text-align: center;">
            <h1>
                Meta Audiobox Aesthetics: Unified Automatic Quality Assessment for Speech, Music, and Sound
            </h1>
            </div>
           """
    )

    gr.Markdown(
        "<p align='center' style='font-size: 20px;'>See our <a href='https://arxiv.org/abs/2502.05139'>paper</a>, Github <a href='https://github.com/facebookresearch/audiobox-aesthetics'>repo</a> and HuggingFace <a href='https://huggingface.co/facebook/audiobox-aesthetics'>repo</a> </p>"
    )
    gr.HTML(
        """<center><a href="https://huggingface.co/spaces/facebook/audiobox-aesthetics?duplicate=true"  style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank"><img style="margin-bottom: 0em;display: inline;margin-top: -.25em;" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> for more control and no queue.</center>"""
    )

    main_interface.render()

    gr.HTML(
        """
            <div class="footer" style="text-align:center">
                <p>
                    Model by <a href="https://ai.facebook.com" style="text-decoration: underline;" target="_blank">Meta AI</a> - Gradio Demo by 🤗 Hugging Face
                </p>
            </div>
           """
    )


if __name__ == "__main__":
    demo.queue()
    demo.launch()