# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# This file is copied & modified from UNESCO/MMS/blob/main/asr.py
import json

import gradio as gr
import numpy as np
import plotly.graph_objects as go
import torch
import torchaudio

from audiobox_aesthetics import infer as aes_infer

ASR_EXAMPLES = ["assets/bach.wav"]

# Initialize the aesthetics predictor once at startup (None selects the default checkpoint).
aes_predictor = aes_infer.initialize_model(None)
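
# Rough sketch of the predictor call used in transcribe() below: forward()
# takes a list of dicts and returns one JSON string per input containing the
# four axis scores (CE, CU, PC, PQ). Each dict holds a waveform tensor plus
# its sample rate, as below; per the upstream repo it may also hold a file
# path instead, e.g. aes_predictor.forward([{"path": "assets/bach.wav"}]).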
def transcribe(audio_data=None):
    # Despite its name (inherited from the MMS ASR demo this file was adapted
    # from), this function predicts the four aesthetic scores for the given
    # audio and returns them as a Plotly bar chart.
    if not audio_data:
        return "<<ERROR: Empty Audio Input>>"

    if isinstance(audio_data, tuple):
        # Microphone input: gradio passes (sample_rate, int16 samples).
        sr, audio_samples = audio_data
        audio_samples = (audio_samples / 32768.0).astype(np.float32)
        audio_samples = torch.tensor(audio_samples)
        if audio_samples.ndim == 1:
            audio_samples = audio_samples[:, None]
        assert audio_samples.ndim == 2
        audio_samples = audio_samples.t()
    else:
        # File upload: gradio passes a filepath string.
        if not isinstance(audio_data, str):
            return "<<ERROR: Invalid Audio Input Instance: {}>>".format(
                type(audio_data)
            )
        audio_samples, sr = torchaudio.load(audio_data)

    # The predictor returns a JSON string with the raw axis keys.
    transcription = json.loads(
        aes_predictor.forward([{"path": audio_samples, "sample_rate": sr}])[0]
    )
    # Reorder and rename the axes to match the figures in the paper.
    transcription = {
        "Production Quality": transcription["PQ"],
        "Production Complexity": transcription["PC"],
        "Content Enjoyment": transcription["CE"],
        "Content Usefulness": transcription["CU"],
    }

    # Create a Plotly bar plot of the four scores.
    fig = go.Figure()
    colors = ["#b1d8ff", "#fee2f5", "#cefac4", "#d2d3ff"]
    values = list(transcription.values())
    keys = list(transcription.keys())
    fig.add_trace(
        go.Bar(
            x=keys,
            y=values,
            text=[f"{v:.2f}" for v in values],  # format text to 2 decimal places
            textposition="outside",  # position text outside the bars
            marker=dict(color=colors),
        )
    )
    # Fix the y-axis to the full score range.
    fig.update_layout(
        yaxis=dict(range=[0, 10]),
        xaxis_title="Metrics",
        yaxis_title="Scores",
    )
    return fig
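
# Quick local sanity check (a sketch; assumes the bundled example file from
# ASR_EXAMPLES exists next to this script):
#   fig = transcribe("assets/bach.wav")
#   fig.show()  # opens the four-bar aesthetics chart
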
main_interface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(),
    ],
    outputs=gr.Plot(),
    examples=ASR_EXAMPLES,
    title="Audiobox Aesthetics Demo Prediction",
    description="Record some audio with the microphone or upload an audio file.",
    article="",
    allow_flagging="never",
)
disclaimer = """
## Disclaimer
"""
with gr.Blocks() as demo:
    gr.HTML(
        """
        <div style="text-align: center;">
            <h1>
                Meta Audiobox Aesthetics: Unified Automatic Quality Assessment for Speech, Music, and Sound
            </h1>
        </div>
        """
    )
    gr.Markdown(
        "<p align='center' style='font-size: 20px;'>See our <a href='https://arxiv.org/abs/2502.05139'>paper</a>, GitHub <a href='https://github.com/facebookresearch/audiobox-aesthetics'>repo</a>, and Hugging Face <a href='https://huggingface.co/facebook/audiobox-aesthetics'>repo</a>.</p>"
    )
    gr.HTML(
        """<center><a href="https://huggingface.co/spaces/facebook/audiobox-aesthetics?duplicate=true" style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank"><img style="margin-bottom: 0em;display: inline;margin-top: -.25em;" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> for more control and no queue.</center>"""
    )
    main_interface.render()
    gr.HTML(
        """
        <div class="footer" style="text-align:center">
            <p>
                Model by <a href="https://ai.facebook.com" style="text-decoration: underline;" target="_blank">Meta AI</a> - Gradio Demo by 🤗 Hugging Face
            </p>
        </div>
        """
    )
    # with gr.Row():
    #     gr.Markdown(disclaimer)
if __name__ == "__main__":
    demo.queue()
    demo.launch()
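
# To run the demo outside the hosted Space (a sketch; assumes gradio,
# torch/torchaudio, plotly, and the audiobox_aesthetics package are installed):
#   python <this file>   # gradio prints a local URL to open in a browser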