File size: 6,362 Bytes
a03c9b4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 |
"""preprocess_maestro.py"""
import os
import glob
import re
import json
from typing import Dict, List, Tuple
import numpy as np
from utils.audio import get_audio_file_info
from utils.midi import midi2note, note_event2midi
from utils.note2event import note2note_event, note_event2event
from utils.event2note import event2note_event
from utils.note_event_dataclasses import Note, NoteEvent
from utils.utils import note_event2token2note_event_sanity_check
from utils.utils import assert_note_events_almost_equal
def create_note_event_and_note_from_midi(mid_file: str,
                                         id: str,
                                         ignore_pedal: bool = False) -> Tuple[Dict, Dict]:
    """Parse one MIDI file into a notes dict and a note-events dict.

    Args:
        mid_file: path of the MIDI file handed to `midi2note`.
        id: identifier stored under the 'maps_id' key of both returned dicts.
        ignore_pedal: forwarded unchanged to `midi2note`.

    Returns:
        notes (dict): notes and metadata.
        note_events (dict): note events and metadata.
    """
    # Fixed parsing options: every track is forced to program 0 (always piano)
    # and no channel is treated as drums.
    midi2note_options = dict(
        binary_velocity=True,
        ch_9_as_drum=False,
        force_all_drum=False,
        force_all_program_to=0,
        trim_overlap=True,
        fix_offset=True,
        quantize=True,
        verbose=0,
        minimum_offset_sec=0.01,
        drum_offset_sec=0.01,
        ignore_pedal=ignore_pedal,
    )
    parsed_notes, midi_duration_sec = midi2note(mid_file, **midi2note_options)

    # Both payloads report the parsed duration padded by 0.01 s.
    # NOTE(review): the padding presumably mirrors minimum_offset_sec -- confirm.
    padded_duration_sec = midi_duration_sec + 0.01

    # NOTE(review): the id key is named 'maps_id' even though this module
    # handles MAESTRO; kept as-is because it is part of the saved data format.
    notes_payload = {
        'maps_id': id,
        'program': [0],
        'is_drum': [0],
        'duration_sec': padded_duration_sec,
        'notes': parsed_notes,
    }
    note_events_payload = {
        'maps_id': id,
        'program': [0],
        'is_drum': [0],
        'duration_sec': padded_duration_sec,
        'note_events': note2note_event(parsed_notes),
    }
    return notes_payload, note_events_payload
def note_event2event_sanity_check(note_events: List[NoteEvent]):
    """Round-trip note events through the event representation and compare.

    The note events are converted with `note_event2event`, converted back with
    `event2note_event`, and the result is checked against the input with
    `assert_note_events_almost_equal`.

    Args:
        note_events: the note events to verify.
    """
    encoded_events = note_event2event(note_events, None)
    # Only the first of the three returned values is needed here.
    roundtrip_note_events, _, _ = event2note_event(encoded_events)
    assert_note_events_almost_equal(note_events, roundtrip_note_events)
def preprocess_maestro16k(data_home: os.PathLike,
                          dataset_name='maestro',
                          ignore_pedal=False,
                          sanity_check=False) -> None:
    """Build the MAESTRO index files and the per-track note / note-event files.

    Args:
        data_home: root data directory (str or path-like). It must contain
            '{dataset_name}_yourmt3_16k/maestro-v3.0.0.json' together with the
            audio and MIDI files referenced by that metadata. This argument is
            required: it used to "default" to the `os.PathLike` class itself
            (a type hint written as a default value), which is never a valid
            path.
        dataset_name: prefix of the dataset directory and of the index files.
        ignore_pedal: forwarded to `create_note_event_and_note_from_midi`.
        sanity_check: if True, run `note_event2token2note_event_sanity_check`
            on every track before its files are saved.

    Side effects:
        - Renames '*.midi' files to '*.mid' in place (only when the '.mid'
          file does not exist yet).
        - Saves '*_notes.npy' and '*_note_events.npy' next to each MIDI file.
        - Writes the index JSON files into '{data_home}/yourmt3_indexes'.

    Splits:
        - train: 962 files
        - validation: 137 files
        - test: 177 files
        - all: 1276 files

    Writes:
        - {dataset_name}_{split}_file_list.json: a dictionary with the following keys:
        {
            index:
            {
                'maestro_id': maestro_id,
                'n_frames': (int),
                'mix_audio_file': 'path/to/mix.wav',
                'notes_file': 'path/to/notes.npy',
                'note_events_file': 'path/to/note_events.npy',
                'midi_file': 'path/to/midi.mid',
                'program': List[int],
                'is_drum': List[int], # 0 or 1
            }
        }
    """
    # Directory and file paths
    base_dir = os.path.join(data_home, dataset_name + '_yourmt3_16k')
    output_index_dir = os.path.join(data_home, 'yourmt3_indexes')
    os.makedirs(output_index_dir, exist_ok=True)

    # Get metadata. The encoding is explicit so that parsing does not depend
    # on the platform's locale.
    metadata_file = os.path.join(base_dir, 'maestro-v3.0.0.json')
    with open(metadata_file, 'r', encoding='utf-8') as f:
        _metadata = json.load(f)

    # The metadata JSON is column-oriented: each key maps str(index) -> value.
    ids_all = list(range(len(_metadata['canonical_composer'])))
    assert len(ids_all) == 1276, f'expected 1276 tracks, found {len(ids_all)}'
    metadata = {
        i: {
            key: _metadata[key][str(i)]
            for key in ('split', 'midi_filename', 'audio_filename', 'duration')
        } for i in ids_all
    }

    # Collect ids per split and prepend base_dir to filenames
    ids = {'all': ids_all, 'train': [], 'validation': [], 'test': []}
    for i in ids_all:
        m = metadata[i]
        ids[m['split']].append(i)

        midi_path = os.path.join(base_dir, m['midi_filename'])
        m['audio_filename'] = os.path.join(base_dir, m['audio_filename'])

        # Rename '.midi' to '.mid'. Only the extension is inspected, so a
        # directory name that happens to contain '.mid' is never rewritten.
        midi_root, midi_ext = os.path.splitext(midi_path)
        if midi_ext == '.midi':
            renamed_path = midi_root + '.mid'
            if not os.path.exists(renamed_path):
                os.rename(midi_path, renamed_path)
            midi_path = renamed_path
        m['midi_filename'] = midi_path

        # File sanity check
        assert os.path.exists(midi_path) and midi_path.endswith('.mid'), (
            f'missing or unexpected MIDI file: {midi_path}')
        assert os.path.exists(m['audio_filename']) and '.wav' in m['audio_filename'], (
            f"missing or unexpected audio file: {m['audio_filename']}")

    assert len(ids['train']) == 962, f"train split has {len(ids['train'])} files"
    assert len(ids['validation']) == 137, (
        f"validation split has {len(ids['validation'])} files")
    assert len(ids['test']) == 177, f"test split has {len(ids['test'])} files"

    # Create 'all' filelist, and process MIDI
    file_list = {}
    for i in ids['all']:
        m = metadata[i]
        mix_audio_file = m['audio_filename']
        midi_file = m['midi_filename']

        fs, n_frames, n_channels = get_audio_file_info(mix_audio_file)
        assert fs == 16000 and n_channels == 1, (
            f'{mix_audio_file}: expected 16 kHz mono, got fs={fs}, channels={n_channels}')
        # Use the shorter of the metadata duration and the actual audio length.
        n_frames = min(int(m['duration'] * 16000), n_frames)
        # NOTE(review): 32001 presumably guarantees more than two seconds of
        # audio at 16 kHz -- confirm against the training segment length.
        assert n_frames > 32001, f'{mix_audio_file}: too short ({n_frames} frames)'

        # Derive output names from the MIDI path with its extension removed.
        midi_root = os.path.splitext(midi_file)[0]
        notes_file = midi_root + '_notes.npy'
        note_events_file = midi_root + '_note_events.npy'

        file_list[i] = {
            'maestro_id': i,
            'n_frames': n_frames,
            'mix_audio_file': mix_audio_file,
            'notes_file': notes_file,
            'note_events_file': note_events_file,
            'midi_file': midi_file,
            'program': [0],
            'is_drum': [0],
        }

        # Process MIDI
        notes, note_events = create_note_event_and_note_from_midi(
            mid_file=midi_file, id=i, ignore_pedal=ignore_pedal)

        if sanity_check:
            print(f'Sanity check for {i}: {midi_file}')
            note_event2token2note_event_sanity_check(note_events['note_events'], notes['notes'])

        # The payloads are dicts, so they are stored as pickled object arrays.
        np.save(notes_file, notes, allow_pickle=True, fix_imports=False)
        print(f'Created {notes_file}')
        np.save(note_events_file, note_events, allow_pickle=True, fix_imports=False)
        print(f'Created {note_events_file}')

    # Save index: entries of each split are re-numbered from 0.
    for split in ['all', 'train', 'validation', 'test']:
        fl = {index: file_list[maestro_id] for index, maestro_id in enumerate(ids[split])}
        output_index_file = os.path.join(output_index_dir, f'{dataset_name}_{split}_file_list.json')
        with open(output_index_file, 'w', encoding='utf-8') as f:
            json.dump(fl, f, indent=4)
        print(f'Created {output_index_file}')
|