{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "b447e2c4",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"sys.path.insert(0, os.path.dirname(os.path.abspath(\"\")))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "c2fc0e7a",
"metadata": {},
"outputs": [],
"source": [
"import random\n",
"from src.mel import Mel\n",
"from PIL import ImageOps, Image\n",
"from IPython.display import Audio\n",
"from diffusers import DDPMPipeline\n",
"from datasets import load_from_disk"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "a3d45c36",
"metadata": {},
"outputs": [],
"source": [
"mel = Mel(x_res=64, y_res=64, hop_length=1024)"
]
},
{
"cell_type": "markdown",
"id": "011fb5a1",
"metadata": {},
"source": [
"### Run model inference to generate Mel spectrogram"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "b809fed5",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "a7fe83b5914a437e99cf1838cb47f2b5",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/1000 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"model_id = \"../ddpm-ema-audio-64\"\n",
"ddpm = DDPMPipeline.from_pretrained(model_id) # you can replace DDPMPipeline with DDIMPipeline or PNDMPipeline for faster inference\n",
"images = ddpm(output_type=\"numpy\")[\"sample\"]\n",
"images = (images * 255).round().astype(\"uint8\").transpose(0, 3, 1, 2)"
]
},
{
"cell_type": "markdown",
"id": "7230c280",
"metadata": {},
"source": [
"### Transform Mel spectrogram to audio"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "5f8a149d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"audio = mel.image_to_audio(Image.fromarray(images[0][0]))\n",
"Audio(data=audio, rate=mel.get_sample_rate())"
]
},
{
"cell_type": "markdown",
"id": "ef54cef3",
"metadata": {},
"source": [
"### Compare results with random sample from training set"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "269ee816",
"metadata": {},
"outputs": [],
"source": [
"ds = load_from_disk('../data-64')"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "492e2334",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"audio = mel.image_to_audio(random.choice(ds['train'])['image'])\n",
"Audio(data=audio, rate=mel.get_sample_rate())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a8ae1a19",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "huggingface",
"language": "python",
"name": "huggingface"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 5
}