import os
import argparse
import gradio as gr
from timeit import default_timer as timer
import torch
import numpy as np
import pandas as pd
from huggingface_hub import hf_hub_download
from model.bart import BartCaptionModel
from utils.audio_utils import load_audio, STR_CH_FIRST
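
# Fetch the fine-tuned checkpoint and the example audio clips from the Hugging Face Hub on first run.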
if not os.path.isfile("transfer.pth"):
    torch.hub.download_url_to_file('https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/transfer.pth', 'transfer.pth')
    torch.hub.download_url_to_file('https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/folk.wav', 'folk.wav')
    torch.hub.download_url_to_file('https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/electronic.mp3', 'electronic.mp3')
    torch.hub.download_url_to_file('https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/orchestra.wav', 'orchestra.wav')
device = "cuda:0" if torch.cuda.is_available() else "cpu"
example_list = ['folk.wav', 'electronic.mp3', 'orchestra.wav']
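# Build the BART caption model and restore the transfer-learning weights stored under 'state_dict' in transfer.pth.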
model = BartCaptionModel(max_length=128)
pretrained_object = torch.load('./transfer.pth', map_location='cpu')
state_dict = pretrained_object['state_dict']
model.load_state_dict(state_dict)
if torch.cuda.is_available():
    torch.cuda.set_device(device)
    model = model.cuda(device)
model.eval()
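
# get_audio: load a clip, resample it to 16 kHz mono, zero-pad anything shorter than `duration`
# seconds, and return it split into (n_chunks, duration * target_sr) float32 chunks.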
def get_audio(audio_path, duration=10, target_sr=16000):
    n_samples = int(duration * target_sr)
    audio, sr = load_audio(
        path=audio_path,
        ch_format=STR_CH_FIRST,
        sample_rate=target_sr,
        downmix_to_mono=True,
    )
    if len(audio.shape) == 2:
        audio = audio.mean(0)  # downmix to mono (mean over the channel axis)
    input_size = int(n_samples)
    if audio.shape[-1] < input_size:  # pad short clips with silence up to one chunk
        pad = np.zeros(input_size)
        pad[: audio.shape[-1]] = audio
        audio = pad
    ceil = int(audio.shape[-1] // n_samples)
    audio = torch.from_numpy(np.stack(np.split(audio[:ceil * n_samples], ceil)).astype('float32'))
    return audio
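
# captioning: run beam-search generation over each 10-second chunk and concatenate the
# captions, each prefixed with its chunk's time range.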
def captioning(audio_path):
    audio_tensor = get_audio(audio_path=audio_path)
    if torch.cuda.is_available():
        audio_tensor = audio_tensor.to(device)
    with torch.no_grad():
        output = model.generate(
            samples=audio_tensor,
            num_beams=5,
        )
    inference = ""
    number_of_chunks = range(audio_tensor.shape[0])
    for chunk, text in zip(number_of_chunks, output):
        time = f"[{chunk * 10}:00-{(chunk + 1) * 10}:00]"
        inference += f"{time}\n{text} \n \n"
    return inference
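
# Example usage (assuming the script runs from the demo directory, so the downloaded
# 'folk.wav' is present):
#   print(captioning("folk.wav"))
# This prints one caption per 10-second chunk, e.g. "[0:00-10:00]\n<caption>".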
title = "Interactive demo: Music Captioning 🤗🎵"
description = """
<p style='text-align: center'> LP-MusicCaps: LLM-Based Pseudo Music Captioning</p>
<p style='text-align: center'> SeungHeon Doh, Keunwoo Choi, Jongpil Lee, Juhan Nam, ISMIR 2023</p>
<p style='text-align: center'> <a href='https://arxiv.org/abs/2307.16372' target='_blank'>ArXiv</a> | <a href='https://github.com/seungheondoh/lp-music-caps' target='_blank'>Codes</a> | <a href='https://huggingface.co/datasets/seungheondoh/LP-MusicCaps-MC' target='_blank'>Dataset</a> </p>
<p style='text-align: center'> To use it, simply upload your audio and click 'submit', or click one of the examples to load them. Read more at the links below. </p>
<p style='text-align: center'> If you run into any errors, please check the demo code: <a href='https://github.com/seungheondoh/lp-music-caps/blob/main/demo/app.py' target='_blank'>Demo</a>. </p>
"""
article = "<p style='text-align: center'><a href='https://seungheondoh.github.io/' target='_blank'>Author Info</a> | <a href='https://github.com/seungheondoh' target='_blank'>Github</a></p>"
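
# Gradio UI: a single audio input (passed to `captioning` as a file path) mapped to a
# text box showing the generated captions; the downloaded clips serve as examples.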
demo = gr.Interface(fn=captioning,
                    inputs=gr.Audio(type="filepath"),
                    outputs=[
                        gr.Textbox(label="Caption generated by LP-MusicCaps Transfer Model"),
                    ],
                    examples=example_list,
                    title=title,
                    description=description,
                    article=article,
                    cache_examples=False
                    )
demo.launch()
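# Note: when running locally rather than on Hugging Face Spaces, `demo.launch(share=True)`
# can be used instead to get a temporary public link (standard Gradio option).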