File size: 6,905 Bytes
a03c9b4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 |
"""preprocess_egmd.py"""
import os
import csv
import glob
import re
import json
from typing import Dict, List, Tuple
import numpy as np
from utils.audio import get_audio_file_info
from utils.midi import midi2note, note_event2midi
from utils.note2event import note2note_event, note_event2event
from utils.event2note import event2note_event
from utils.note_event_dataclasses import Note, NoteEvent
from utils.utils import note_event2token2note_event_sanity_check
# from utils.utils import assert_note_events_almost_equal
def create_note_event_and_note_from_midi(mid_file: str, id: str) -> Tuple[Dict, Dict]:
"""Extracts note or note_event and metadata from midi:
Returns:
notes (dict): note events and metadata.
note_events (dict): note events and metadata.
"""
notes, dur_sec = midi2note(
mid_file,
binary_velocity=True,
ch_9_as_drum=True,
force_all_drum=True,
trim_overlap=True,
fix_offset=True,
quantize=True,
verbose=0,
minimum_offset_sec=0.01,
drum_offset_sec=0.01,
ignore_pedal=True)
return { # notes
'egmd_id': id,
'program': [128],
'is_drum': [1],
'duration_sec': dur_sec,
'notes': notes,
}, { # note_events
'maps_id': id,
'program': [128],
'is_drum': [1],
'duration_sec': dur_sec,
'note_events': note2note_event(notes),
}
def preprocess_egmd16k(data_home: os.PathLike, dataset_name='egmd') -> None:
"""
Splits:
- train: 35217 files
- validation: 5031 files
- test: 5289 files
- test_reduced: 246 files that contain '_5.midi' or '_10.midi' in the filename
Writes:
- {dataset_name}_{split}_file_list.json: a dictionary with the following keys:
{
index:
{
'egmd_id': egmd_id, # filename wihout extension
'n_frames': (int),
'mix_audio_file': 'path/to/mix.wav',
'notes_file': 'path/to/notes.npy',
'note_events_file': 'path/to/note_events.npy',
'midi_file': 'path/to/midi.mid',
'program': List[int],
'is_drum': List[int], # 0 or 1
}
}
"""
# Directory and file paths
base_dir = os.path.join(data_home, dataset_name + '_yourmt3_16k')
output_index_dir = os.path.join(data_home, 'yourmt3_indexes')
os.makedirs(output_index_dir, exist_ok=True)
# Load csv file and create a dictionary
csv_file = os.path.join(base_dir, 'e-gmd-v1.0.0.csv')
with open(csv_file, 'r') as f:
csv_dict_reader = csv.DictReader(f)
egmd_dict_list_all = list(csv_dict_reader)
assert len(egmd_dict_list_all) == 45537
# Process MIDI files
for d in egmd_dict_list_all:
emgd_id = d['midi_filename'].split('.')[0]
midi_file = os.path.join(base_dir, d['midi_filename'])
notes, note_events = create_note_event_and_note_from_midi(midi_file, emgd_id)
# Write notes and note_events
notes_file = midi_file.replace('.midi', '_notes.npy')
note_events_file = midi_file.replace('.midi', '_note_events.npy')
np.save(notes_file, notes, allow_pickle=True, fix_imports=False)
print(f"Created {notes_file}")
np.save(note_events_file, note_events, allow_pickle=True, fix_imports=False)
print(f"Created {note_events_file}")
# rewrite 120 bpm quantized midi file
quantized_midi_file = midi_file.replace('.midi', '_quantized_120bpm.mid')
note_event2midi(note_events['note_events'], quantized_midi_file)
print(f'Wrote {quantized_midi_file}')
# Process audio files
pass
# Create index files
for split in ['train', 'validation', 'test']:
file_list = {}
i = 0
for d in egmd_dict_list_all:
if d['split'] == split:
egmd_id = d['midi_filename'].split('.')[0]
mix_audio_file = os.path.join(base_dir, d['audio_filename'])
n_frames = get_audio_file_info(mix_audio_file)[1]
midi_file = os.path.join(base_dir, d['midi_filename'])
notes_file = midi_file.replace('.midi', '_notes.npy')
note_events_file = midi_file.replace('.midi', '_note_events.npy')
# check file existence
assert os.path.exists(mix_audio_file)
assert os.path.exists(midi_file)
assert os.path.exists(notes_file)
assert os.path.exists(note_events_file)
# create file list
file_list[i] = {
'egmd_id': egmd_id,
'n_frames': n_frames,
'mix_audio_file': mix_audio_file,
'notes_file': notes_file,
'note_events_file': note_events_file,
'midi_file': midi_file,
'program': [128],
'is_drum': [1],
}
i += 1
else:
pass
# Write file list
output_file = os.path.join(output_index_dir, f'{dataset_name}_{split}_file_list.json')
with open(output_file, 'w') as f:
json.dump(file_list, f, indent=4)
print(f'Wrote {output_file}')
if split == 'train':
assert len(file_list) == 35217
elif split == 'validation':
assert len(file_list) == 5031
elif split == 'test':
assert len(file_list) == 5289
# Create reduced test index file
split = 'test_reduced'
file_list = {}
i = 0
for d in egmd_dict_list_all:
if d['split'] == 'test':
midi_file = os.path.join(base_dir, d['midi_filename'])
if '_5.midi' in midi_file or '_10.midi' in midi_file:
egmd_id = d['midi_filename'].split('.')[0]
mix_audio_file = os.path.join(base_dir, d['audio_filename'])
n_frames = get_audio_file_info(mix_audio_file)[1]
notes_file = midi_file.replace('.midi', '_notes.npy')
note_events_file = midi_file.replace('.midi', '_note_events.npy')
file_list[i] = {
'egmd_id': egmd_id,
'n_frames': n_frames,
'mix_audio_file': mix_audio_file,
'notes_file': notes_file,
'note_events_file': note_events_file,
'midi_file': midi_file,
'program': [128],
'is_drum': [1],
}
i += 1
output_file = os.path.join(output_index_dir, f'{dataset_name}_{split}_file_list.json')
with open(output_file, 'w') as f:
json.dump(file_list, f, indent=4)
print(f'Wrote {output_file}')
assert len(file_list) == 246
|