diff --git "a/notebooks/test-model.ipynb" "b/notebooks/test-model.ipynb"
new file mode 100644
--- /dev/null
+++ "b/notebooks/test-model.ipynb"
@@ -0,0 +1,204 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "fd262b00",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import sys\n",
+ "sys.path.insert(0, os.path.dirname(os.path.abspath(\"\")))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "id": "d2253762",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import random\n",
+ "from src.mel import Mel\n",
+ "from PIL import ImageOps, Image\n",
+ "from IPython.display import Audio\n",
+ "from datasets import load_from_disk\n",
+ "from diffusers import DDPMPipeline, DDIMPipeline, PNDMPipeline"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "293dd2c7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "mel = Mel(x_res=64, y_res=64, hop_length=1024)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5bdb2648",
+ "metadata": {},
+ "source": [
+ "### Run model inference to generate Mel spectrogram"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "aac92f90",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "9fa5515ab1984c45bf459e9dfa12c3b9",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ " 0%| | 0/1000 [00:00, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "model_id = \"../ddpm-ema-audio-64\"\n",
+ "ddpm = DDPMPipeline.from_pretrained(model_id) # you can replace DDPMPipeline with DDIMPipeline or PNDMPipeline for faster inference\n",
+ "image = ddpm()[\"sample\"][0]"
+ ]
+ },
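+ {
+ "cell_type": "markdown",
+ "id": "ddim-sketch-note",
+ "metadata": {},
+ "source": [
+ "### Faster sampling with DDIM (optional sketch)\n",
+ "\n",
+ "A minimal sketch of the alternative mentioned in the comment above: load the same checkpoint with `DDIMPipeline` and request fewer denoising steps. The `num_inference_steps` argument and the `[\"sample\"]` output key are assumed to match the installed diffusers version; adjust if your API differs."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ddim-sketch-code",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# optional: same checkpoint with a DDIM sampler and fewer denoising steps for faster generation\n",
+ "# num_inference_steps=50 is an assumed, typical value, not taken from the original notebook\n",
+ "ddim = DDIMPipeline.from_pretrained(model_id)\n",
+ "image = ddim(num_inference_steps=50)[\"sample\"][0]  # re-run the cells below to listen to this sample"
+ ]
+ },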
+ {
+ "cell_type": "markdown",
+ "id": "df6c533b",
+ "metadata": {},
+ "source": [
+ "### Transform Mel spectrogram to audio"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "37c24f43",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "audio = mel.image_to_audio(ImageOps.grayscale(image))\n",
+ "Audio(data=audio, rate=mel.get_sample_rate())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "10805113",
+ "metadata": {},
+ "source": [
+ "### Compare results with random sample from training set"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "id": "7a366813",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ds = load_from_disk('../data-64')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "id": "55a29505",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 38,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "audio = mel.image_to_audio(random.choice(ds['train'])['image'])\n",
+ "Audio(data=audio, rate=mel.get_sample_rate())"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "huggingface",
+ "language": "python",
+ "name": "huggingface"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.13"
+ },
+ "toc": {
+ "base_numbering": 1,
+ "nav_menu": {},
+ "number_sections": true,
+ "sideBar": true,
+ "skip_h1_title": false,
+ "title_cell": "Table of Contents",
+ "title_sidebar": "Contents",
+ "toc_cell": false,
+ "toc_position": {},
+ "toc_section_display": true,
+ "toc_window_display": false
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}