import os
import argparse

import gradio as gr
from timeit import default_timer as timer
import torch
import numpy as np
import pandas as pd
from huggingface_hub import hf_hub_download

from model.bart import BartCaptionModel
from utils.audio_utils import load_audio, STR_CH_FIRST
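
# Download the fine-tuned checkpoint and the example audio clips on first run.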
if not os.path.isfile("transfer.pth"):
    torch.hub.download_url_to_file('https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/transfer.pth', 'transfer.pth')
    torch.hub.download_url_to_file('https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/electronic.mp3', 'electronic.mp3')
    torch.hub.download_url_to_file('https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/orchestra.wav', 'orchestra.wav')

device = "cuda:0" if torch.cuda.is_available() else "cpu"

example_list = ['electronic.mp3', 'orchestra.wav']
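
# Build the BART-based captioning model, load the transfer checkpoint,
# and move the model to the GPU when one is available.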
model = BartCaptionModel(max_length=128)
pretrained_object = torch.load('./transfer.pth', map_location='cpu')
state_dict = pretrained_object['state_dict']
model.load_state_dict(state_dict)
if torch.cuda.is_available():
    torch.cuda.set_device(device)
    model = model.cuda(device)
model.eval()
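

# Load an audio clip and split it into non-overlapping 10-second chunks at 16 kHz.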
def get_audio(audio_path, duration=10, target_sr=16000):
    n_samples = int(duration * target_sr)
    audio, sr = load_audio(
        path=audio_path,
        ch_format=STR_CH_FIRST,
        sample_rate=target_sr,
        downmix_to_mono=True,
    )
    if len(audio.shape) == 2:
        audio = audio.mean(0)
    input_size = int(n_samples)
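    # Zero-pad clips shorter than a single 10-second chunk.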
    if audio.shape[-1] < input_size:
        pad = np.zeros(input_size)
        pad[: audio.shape[-1]] = audio
        audio = pad
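    # Stack the full 10-second chunks into a (num_chunks, n_samples) float32 tensor.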
    ceil = int(audio.shape[-1] // n_samples)
    audio = torch.from_numpy(np.stack(np.split(audio[:ceil * n_samples], ceil)).astype('float32'))
    return audio
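

# Caption each 10-second chunk with beam search and join the captions into a
# single time-stamped string for the output textbox.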
def captioning(audio_path):
    audio_tensor = get_audio(audio_path=audio_path)
    if torch.cuda.is_available():
        audio_tensor = audio_tensor.to(device)
    with torch.no_grad():
        output = model.generate(
            samples=audio_tensor,
            num_beams=5,
        )
    inference = ""
    number_of_chunks = range(audio_tensor.shape[0])
    for chunk, text in zip(number_of_chunks, output):
        time = f"[{chunk * 10}:00-{(chunk + 1) * 10}:00]"
        inference += f"{time}\n{text} \n \n"
    return inference
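

# Text shown in the Gradio interface.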
title = "Interactive demo: Music Captioning π€π΅" |
|
description = """ |
|
<p style='text-align: center'> LP-MusicCaps: LLM-Based Pseudo Music Captioning</p> |
|
<p style='text-align: center'> SeungHeon Doh, Keunwoo Choi, Jongpil Lee, Juhan Nam, ISMIR 2023</p> |
|
<p style='text-align: center'> <a href='#' target='_blank'>ArXiv</a> | <a href='https://github.com/seungheondoh/lp-music-caps' target='_blank'>Github</a> | <a href='https://github.com/seungheondoh/lp-music-caps' target='_blank'>LP-MusicCaps-Dataset</a> </p> |
|
<p style='text-align: center'> To use it, simply upload your audio and click 'submit', or click one of the examples to load them. Read more at the links below. </p> |
|
""" |
|
article = "<p style='text-align: center'><a href='https://github.com/seungheondoh/lp-music-caps' target='_blank'>LP-MusicCaps Github</a> | <a href='#' target='_blank'>LP-MusicCaps Paper</a></p>" |
|
|
|
|
|
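
# Audio-in, text-out Gradio demo wired to the captioning function.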
demo = gr.Interface(fn=captioning,
                    inputs=gr.Audio(type="filepath"),
                    outputs=[
                        gr.Textbox(label="Caption generated by LP-MusicCaps Transfer Model"),
                    ],
                    examples=example_list,
                    title=title,
                    description=description,
                    article=article,
                    cache_examples=False,
                    )
demo.launch()