tspsram commited on
Commit
35410df
1 Parent(s): 08ab31c

Upload 19 files

Browse files
.gitattributes CHANGED
@@ -1,35 +1,36 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.mid filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
105
+ __pypackages__/
106
+
107
+ # Celery stuff
108
+ celerybeat-schedule
109
+ celerybeat.pid
110
+
111
+ # SageMath parsed files
112
+ *.sage.py
113
+
114
+ # Environments
115
+ .env
116
+ .venv
117
+ env/
118
+ venv/
119
+ ENV/
120
+ env.bak/
121
+ venv.bak/
122
+
123
+ # Spyder project settings
124
+ .spyderproject
125
+ .spyproject
126
+
127
+ # Rope project settings
128
+ .ropeproject
129
+
130
+ # mkdocs documentation
131
+ /site
132
+
133
+ # mypy
134
+ .mypy_cache/
135
+ .dmypy.json
136
+ dmypy.json
137
+
138
+ # Pyre type checker
139
+ .pyre/
140
+
141
+ # pytype static type analyzer
142
+ .pytype/
143
+
144
+ # Cython debug symbols
145
+ cython_debug/
146
+
147
+ # PyCharm
148
+ # JetBrains specific template is maintainted in a separate JetBrains.gitignore that can
149
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
150
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
151
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
152
+ .idea/
153
+ output.mid
154
+ /outputs/
Dockerfile ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM nvidia/cuda:11.6.1-cudnn8-devel-ubuntu20.04
2
+
3
+ ARG DEBIAN_FRONTEND=noninteractive
4
+
5
+ ENV PYTHONUNBUFFERED=1
6
+
7
+ RUN apt-get update && apt-get install --no-install-recommends -y \
8
+ build-essential \
9
+ python3.9 \
10
+ python3-pip \
11
+ git \
12
+ ffmpeg \
13
+ fluidsynth \
14
+ && apt-get clean && rm -rf /var/lib/apt/lists/*
15
+
16
+ WORKDIR /code
17
+
18
+ COPY ./requirements.txt /code/requirements.txt
19
+
20
+ # Set up a new user named "user" with user ID 1000
21
+ RUN useradd -m -u 1000 user
22
+ # Switch to the "user" user
23
+ USER user
24
+ # Set home to the user's home directory
25
+ ENV HOME=/home/user \
26
+ PATH=/home/user/.local/bin:$PATH \
27
+ PYTHONPATH=$HOME/app \
28
+ PYTHONUNBUFFERED=1 \
29
+ GRADIO_ALLOW_FLAGGING=never \
30
+ GRADIO_NUM_PORTS=1 \
31
+ GRADIO_SERVER_NAME=0.0.0.0 \
32
+ GRADIO_THEME=huggingface \
33
+ SYSTEM=spaces
34
+
35
+ RUN pip3 install --no-cache-dir --upgrade -r /code/requirements.txt
36
+
37
+ # Set the working directory to the user's home directory
38
+ WORKDIR $HOME/app
39
+
40
+ # Copy the current directory contents into the container at $HOME/app setting the owner to the user
41
+ COPY --chown=user . $HOME/app
42
+
43
+ CMD ["python3", "app.py"]
MIDI.py ADDED
@@ -0,0 +1,1735 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #! /usr/bin/python3
2
+ # unsupported 20091104 ...
3
+ # ['set_sequence_number', dtime, sequence]
4
+ # ['raw_data', dtime, raw]
5
+
6
+ # 20150914 jimbo1qaz MIDI.py str/bytes bug report
7
+ # I found a MIDI file which had Shift-JIS titles. When midi.py decodes it as
8
+ # latin-1, it produces a string which cannot even be accessed without raising
9
+ # a UnicodeDecodeError. Maybe, when converting raw byte strings from MIDI,
10
+ # you should keep them as bytes, not improperly decode them. However, this
11
+ # would change the API. (ie: text = a "string" ? of 0 or more bytes). It
12
+ # could break compatiblity, but there's not much else you can do to fix the bug
13
+ # https://en.wikipedia.org/wiki/Shift_JIS
14
+
15
+ r'''
16
+ This module offers functions: concatenate_scores(), grep(),
17
+ merge_scores(), mix_scores(), midi2opus(), midi2score(), opus2midi(),
18
+ opus2score(), play_score(), score2midi(), score2opus(), score2stats(),
19
+ score_type(), segment(), timeshift() and to_millisecs(),
20
+ where "midi" means the MIDI-file bytes (as can be put in a .mid file,
21
+ or piped into aplaymidi), and "opus" and "score" are list-structures
22
+ as inspired by Sean Burke's MIDI-Perl CPAN module.
23
+
24
+ Warning: Version 6.4 is not necessarily backward-compatible with
25
+ previous versions, in that text-data is now bytes, not strings.
26
+ This reflects the fact that many MIDI files have text data in
27
+ encodings other that ISO-8859-1, for example in Shift-JIS.
28
+
29
+ Download MIDI.py from http://www.pjb.com.au/midi/free/MIDI.py
30
+ and put it in your PYTHONPATH. MIDI.py depends on Python3.
31
+
32
+ There is also a call-compatible translation into Lua of this
33
+ module: see http://www.pjb.com.au/comp/lua/MIDI.html
34
+
35
+ The "opus" is a direct translation of the midi-file-events, where
36
+ the times are delta-times, in ticks, since the previous event.
37
+
38
+ The "score" is more human-centric; it uses absolute times, and
39
+ combines the separate note_on and note_off events into one "note"
40
+ event, with a duration:
41
+ ['note', start_time, duration, channel, note, velocity] # in a "score"
42
+
43
+ EVENTS (in an "opus" structure)
44
+ ['note_off', dtime, channel, note, velocity] # in an "opus"
45
+ ['note_on', dtime, channel, note, velocity] # in an "opus"
46
+ ['key_after_touch', dtime, channel, note, velocity]
47
+ ['control_change', dtime, channel, controller(0-127), value(0-127)]
48
+ ['patch_change', dtime, channel, patch]
49
+ ['channel_after_touch', dtime, channel, velocity]
50
+ ['pitch_wheel_change', dtime, channel, pitch_wheel]
51
+ ['text_event', dtime, text]
52
+ ['copyright_text_event', dtime, text]
53
+ ['track_name', dtime, text]
54
+ ['instrument_name', dtime, text]
55
+ ['lyric', dtime, text]
56
+ ['marker', dtime, text]
57
+ ['cue_point', dtime, text]
58
+ ['text_event_08', dtime, text]
59
+ ['text_event_09', dtime, text]
60
+ ['text_event_0a', dtime, text]
61
+ ['text_event_0b', dtime, text]
62
+ ['text_event_0c', dtime, text]
63
+ ['text_event_0d', dtime, text]
64
+ ['text_event_0e', dtime, text]
65
+ ['text_event_0f', dtime, text]
66
+ ['end_track', dtime]
67
+ ['set_tempo', dtime, tempo]
68
+ ['smpte_offset', dtime, hr, mn, se, fr, ff]
69
+ ['time_signature', dtime, nn, dd, cc, bb]
70
+ ['key_signature', dtime, sf, mi]
71
+ ['sequencer_specific', dtime, raw]
72
+ ['raw_meta_event', dtime, command(0-255), raw]
73
+ ['sysex_f0', dtime, raw]
74
+ ['sysex_f7', dtime, raw]
75
+ ['song_position', dtime, song_pos]
76
+ ['song_select', dtime, song_number]
77
+ ['tune_request', dtime]
78
+
79
+ DATA TYPES
80
+ channel = a value 0 to 15
81
+ controller = 0 to 127 (see http://www.pjb.com.au/muscript/gm.html#cc )
82
+ dtime = time measured in "ticks", 0 to 268435455
83
+ velocity = a value 0 (soft) to 127 (loud)
84
+ note = a value 0 to 127 (middle-C is 60)
85
+ patch = 0 to 127 (see http://www.pjb.com.au/muscript/gm.html )
86
+ pitch_wheel = a value -8192 to 8191 (0x1FFF)
87
+ raw = bytes, of length 0 or more (for sysex events see below)
88
+ sequence_number = a value 0 to 65,535 (0xFFFF)
89
+ song_pos = a value 0 to 16,383 (0x3FFF)
90
+ song_number = a value 0 to 127
91
+ tempo = microseconds per crochet (quarter-note), 0 to 16777215
92
+ text = bytes, of length 0 or more
93
+ ticks = the number of ticks per crochet (quarter-note)
94
+
95
+ In sysex_f0 events, the raw data must not start with a \xF0 byte,
96
+ since this gets added automatically;
97
+ but it must end with an explicit \xF7 byte!
98
+ In the very unlikely case that you ever need to split sysex data
99
+ into one sysex_f0 followed by one or more sysex_f7s, then only the
100
+ last of those sysex_f7 events must end with the explicit \xF7 byte
101
+ (again, the raw data of individual sysex_f7 events must not start
102
+ with any \xF7 byte, since this gets added automatically).
103
+
104
+ Since version 6.4, text data is in bytes, not in a ISO-8859-1 string.
105
+
106
+
107
+ GOING THROUGH A SCORE WITHIN A PYTHON PROGRAM
108
+ channels = {2,3,5,8,13}
109
+ itrack = 1 # skip 1st element which is ticks
110
+ while itrack < len(score):
111
+ for event in score[itrack]:
112
+ if event[0] == 'note': # for example,
113
+ pass # do something to all notes
114
+ # or, to work on events in only particular channels...
115
+ channel_index = MIDI.Event2channelindex.get(event[0], False)
116
+ if channel_index and (event[channel_index] in channels):
117
+ pass # do something to channels 2,3,5,8 and 13
118
+ itrack += 1
119
+
120
+ '''
121
+
122
+ import sys, struct, copy
123
+ # sys.stdout = os.fdopen(sys.stdout.fileno(), 'wb')
124
+ Version = '6.7'
125
+ VersionDate = '20201120'
126
+ # 20201120 6.7 call to bytest() removed, and protect _unshift_ber_int
127
+ # 20160702 6.6 to_millisecs() now handles set_tempo across multiple Tracks
128
+ # 20150921 6.5 segment restores controllers as well as patch and tempo
129
+ # 20150914 6.4 text data is bytes or bytearray, not ISO-8859-1 strings
130
+ # 20150628 6.3 absent any set_tempo, default is 120bpm (see MIDI file spec 1.1)
131
+ # 20150101 6.2 all text events can be 8-bit; let user get the right encoding
132
+ # 20141231 6.1 fix _some_text_event; sequencer_specific data can be 8-bit
133
+ # 20141230 6.0 synth_specific data can be 8-bit
134
+ # 20120504 5.9 add the contents of mid_opus_tracks()
135
+ # 20120208 5.8 fix num_notes_by_channel() ; should be a dict
136
+ # 20120129 5.7 _encode handles empty tracks; score2stats num_notes_by_channel
137
+ # 20111111 5.6 fix patch 45 and 46 in Number2patch, should be Harp
138
+ # 20110129 5.5 add mix_opus_tracks() and event2alsaseq()
139
+ # 20110126 5.4 "previous message repeated N times" to save space on stderr
140
+ # 20110125 5.2 opus2score terminates unended notes at the end of the track
141
+ # 20110124 5.1 the warnings in midi2opus display track_num
142
+ # 21110122 5.0 if garbage, midi2opus returns the opus so far
143
+ # 21110119 4.9 non-ascii chars stripped out of the text_events
144
+ # 21110110 4.8 note_on with velocity=0 treated as a note-off
145
+ # 21110108 4.6 unknown F-series event correctly eats just one byte
146
+ # 21011010 4.2 segment() uses start_time, end_time named params
147
+ # 21011005 4.1 timeshift() must not pad the set_tempo command
148
+ # 21011003 4.0 pitch2note_event must be chapitch2note_event
149
+ # 21010918 3.9 set_sequence_number supported, FWIW
150
+ # 20100913 3.7 many small bugfixes; passes all tests
151
+ # 20100910 3.6 concatenate_scores enforce ticks=1000, just like merge_scores
152
+ # 20100908 3.5 minor bugs fixed in score2stats
153
+ # 20091104 3.4 tune_request now supported
154
+ # 20091104 3.3 fixed bug in decoding song_position and song_select
155
+ # 20091104 3.2 unsupported: set_sequence_number tune_request raw_data
156
+ # 20091101 3.1 document how to traverse a score within Python
157
+ # 20091021 3.0 fixed bug in score2stats detecting GM-mode = 0
158
+ # 20091020 2.9 score2stats reports GM-mode and bank msb,lsb events
159
+ # 20091019 2.8 in merge_scores, channel 9 must remain channel 9 (in GM)
160
+ # 20091018 2.7 handles empty tracks gracefully
161
+ # 20091015 2.6 grep() selects channels
162
+ # 20091010 2.5 merge_scores reassigns channels to avoid conflicts
163
+ # 20091010 2.4 fixed bug in to_millisecs which now only does opusses
164
+ # 20091010 2.3 score2stats returns channels & patch_changes, by_track & total
165
+ # 20091010 2.2 score2stats() returns also pitches and percussion dicts
166
+ # 20091010 2.1 bugs: >= not > in segment, to notice patch_change at time 0
167
+ # 20091010 2.0 bugs: spurious pop(0) ( in _decode sysex
168
+ # 20091008 1.9 bugs: ISO decoding in sysex; str( not int( in note-off warning
169
+ # 20091008 1.8 add concatenate_scores()
170
+ # 20091006 1.7 score2stats() measures nticks and ticks_per_quarter
171
+ # 20091004 1.6 first mix_scores() and merge_scores()
172
+ # 20090424 1.5 timeshift() bugfix: earliest only sees events after from_time
173
+ # 20090330 1.4 timeshift() has also a from_time argument
174
+ # 20090322 1.3 timeshift() has also a start_time argument
175
+ # 20090319 1.2 add segment() and timeshift()
176
+ # 20090301 1.1 add to_millisecs()
177
+
178
+ _previous_warning = '' # 5.4
179
+ _previous_times = 0 # 5.4
180
+ _no_warning = True
181
+ #------------------------------- Encoding stuff --------------------------
182
+
183
+ def opus2midi(opus=[]):
184
+ r'''The argument is a list: the first item in the list is the "ticks"
185
+ parameter, the others are the tracks. Each track is a list
186
+ of midi-events, and each event is itself a list; see above.
187
+ opus2midi() returns a bytestring of the MIDI, which can then be
188
+ written either to a file opened in binary mode (mode='wb'),
189
+ or to stdout by means of: sys.stdout.buffer.write()
190
+
191
+ my_opus = [
192
+ 96,
193
+ [ # track 0:
194
+ ['patch_change', 0, 1, 8], # and these are the events...
195
+ ['note_on', 5, 1, 25, 96],
196
+ ['note_off', 96, 1, 25, 0],
197
+ ['note_on', 0, 1, 29, 96],
198
+ ['note_off', 96, 1, 29, 0],
199
+ ], # end of track 0
200
+ ]
201
+ my_midi = opus2midi(my_opus)
202
+ sys.stdout.buffer.write(my_midi)
203
+ '''
204
+ if len(opus) < 2:
205
+ opus=[1000, [],]
206
+ tracks = copy.deepcopy(opus)
207
+ ticks = int(tracks.pop(0))
208
+ ntracks = len(tracks)
209
+ if ntracks == 1:
210
+ format = 0
211
+ else:
212
+ format = 1
213
+
214
+ my_midi = b"MThd\x00\x00\x00\x06"+struct.pack('>HHH',format,ntracks,ticks)
215
+ for track in tracks:
216
+ events = _encode(track)
217
+ my_midi += b'MTrk' + struct.pack('>I',len(events)) + events
218
+ _clean_up_warnings()
219
+ return my_midi
220
+
221
+
222
+ def score2opus(score=None):
223
+ r'''
224
+ The argument is a list: the first item in the list is the "ticks"
225
+ parameter, the others are the tracks. Each track is a list
226
+ of score-events, and each event is itself a list. A score-event
227
+ is similar to an opus-event (see above), except that in a score:
228
+ 1) the times are expressed as an absolute number of ticks
229
+ from the track's start time
230
+ 2) the pairs of 'note_on' and 'note_off' events in an "opus"
231
+ are abstracted into a single 'note' event in a "score":
232
+ ['note', start_time, duration, channel, pitch, velocity]
233
+ score2opus() returns a list specifying the equivalent "opus".
234
+
235
+ my_score = [
236
+ 96,
237
+ [ # track 0:
238
+ ['patch_change', 0, 1, 8],
239
+ ['note', 5, 96, 1, 25, 96],
240
+ ['note', 101, 96, 1, 29, 96]
241
+ ], # end of track 0
242
+ ]
243
+ my_opus = score2opus(my_score)
244
+ '''
245
+ if len(score) < 2:
246
+ score=[1000, [],]
247
+ tracks = copy.deepcopy(score)
248
+ ticks = int(tracks.pop(0))
249
+ opus_tracks = []
250
+ for scoretrack in tracks:
251
+ time2events = dict([])
252
+ for scoreevent in scoretrack:
253
+ if scoreevent[0] == 'note':
254
+ note_on_event = ['note_on',scoreevent[1],
255
+ scoreevent[3],scoreevent[4],scoreevent[5]]
256
+ note_off_event = ['note_off',scoreevent[1]+scoreevent[2],
257
+ scoreevent[3],scoreevent[4],scoreevent[5]]
258
+ if time2events.get(note_on_event[1]):
259
+ time2events[note_on_event[1]].append(note_on_event)
260
+ else:
261
+ time2events[note_on_event[1]] = [note_on_event,]
262
+ if time2events.get(note_off_event[1]):
263
+ time2events[note_off_event[1]].append(note_off_event)
264
+ else:
265
+ time2events[note_off_event[1]] = [note_off_event,]
266
+ continue
267
+ if time2events.get(scoreevent[1]):
268
+ time2events[scoreevent[1]].append(scoreevent)
269
+ else:
270
+ time2events[scoreevent[1]] = [scoreevent,]
271
+
272
+ sorted_times = [] # list of keys
273
+ for k in time2events.keys():
274
+ sorted_times.append(k)
275
+ sorted_times.sort()
276
+
277
+ sorted_events = [] # once-flattened list of values sorted by key
278
+ for time in sorted_times:
279
+ sorted_events.extend(time2events[time])
280
+
281
+ abs_time = 0
282
+ for event in sorted_events: # convert abs times => delta times
283
+ delta_time = event[1] - abs_time
284
+ abs_time = event[1]
285
+ event[1] = delta_time
286
+ opus_tracks.append(sorted_events)
287
+ opus_tracks.insert(0,ticks)
288
+ _clean_up_warnings()
289
+ return opus_tracks
290
+
291
+ def score2midi(score=None):
292
+ r'''
293
+ Translates a "score" into MIDI, using score2opus() then opus2midi()
294
+ '''
295
+ return opus2midi(score2opus(score))
296
+
297
+ #--------------------------- Decoding stuff ------------------------
298
+
299
+ def midi2opus(midi=b''):
300
+ r'''Translates MIDI into a "opus". For a description of the
301
+ "opus" format, see opus2midi()
302
+ '''
303
+ my_midi=bytearray(midi)
304
+ if len(my_midi) < 4:
305
+ _clean_up_warnings()
306
+ return [1000,[],]
307
+ id = bytes(my_midi[0:4])
308
+ if id != b'MThd':
309
+ _warn("midi2opus: midi starts with "+str(id)+" instead of 'MThd'")
310
+ _clean_up_warnings()
311
+ return [1000,[],]
312
+ [length, format, tracks_expected, ticks] = struct.unpack(
313
+ '>IHHH', bytes(my_midi[4:14]))
314
+ if length != 6:
315
+ _warn("midi2opus: midi header length was "+str(length)+" instead of 6")
316
+ _clean_up_warnings()
317
+ return [1000,[],]
318
+ my_opus = [ticks,]
319
+ my_midi = my_midi[14:]
320
+ track_num = 1 # 5.1
321
+ while len(my_midi) >= 8:
322
+ track_type = bytes(my_midi[0:4])
323
+ if track_type != b'MTrk':
324
+ _warn('midi2opus: Warning: track #'+str(track_num)+' type is '+str(track_type)+" instead of b'MTrk'")
325
+ [track_length] = struct.unpack('>I', my_midi[4:8])
326
+ my_midi = my_midi[8:]
327
+ if track_length > len(my_midi):
328
+ _warn('midi2opus: track #'+str(track_num)+' length '+str(track_length)+' is too large')
329
+ _clean_up_warnings()
330
+ return my_opus # 5.0
331
+ my_midi_track = my_midi[0:track_length]
332
+ my_track = _decode(my_midi_track)
333
+ my_opus.append(my_track)
334
+ my_midi = my_midi[track_length:]
335
+ track_num += 1 # 5.1
336
+ _clean_up_warnings()
337
+ return my_opus
338
+
339
+ def opus2score(opus=[]):
340
+ r'''For a description of the "opus" and "score" formats,
341
+ see opus2midi() and score2opus().
342
+ '''
343
+ if len(opus) < 2:
344
+ _clean_up_warnings()
345
+ return [1000,[],]
346
+ tracks = copy.deepcopy(opus) # couple of slices probably quicker...
347
+ ticks = int(tracks.pop(0))
348
+ score = [ticks,]
349
+ for opus_track in tracks:
350
+ ticks_so_far = 0
351
+ score_track = []
352
+ chapitch2note_on_events = dict([]) # 4.0
353
+ for opus_event in opus_track:
354
+ ticks_so_far += opus_event[1]
355
+ if opus_event[0] == 'note_off' or (opus_event[0] == 'note_on' and opus_event[4] == 0): # 4.8
356
+ cha = opus_event[2]
357
+ pitch = opus_event[3]
358
+ key = cha*128 + pitch
359
+ if chapitch2note_on_events.get(key):
360
+ new_event = chapitch2note_on_events[key].pop(0)
361
+ new_event[2] = ticks_so_far - new_event[1]
362
+ score_track.append(new_event)
363
+ elif pitch > 127:
364
+ pass #_warn('opus2score: note_off with no note_on, bad pitch='+str(pitch))
365
+ else:
366
+ pass #_warn('opus2score: note_off with no note_on cha='+str(cha)+' pitch='+str(pitch))
367
+ elif opus_event[0] == 'note_on':
368
+ cha = opus_event[2]
369
+ pitch = opus_event[3]
370
+ key = cha*128 + pitch
371
+ new_event = ['note',ticks_so_far,0,cha,pitch, opus_event[4]]
372
+ if chapitch2note_on_events.get(key):
373
+ chapitch2note_on_events[key].append(new_event)
374
+ else:
375
+ chapitch2note_on_events[key] = [new_event,]
376
+ else:
377
+ opus_event[1] = ticks_so_far
378
+ score_track.append(opus_event)
379
+ # check for unterminated notes (Oisín) -- 5.2
380
+ for chapitch in chapitch2note_on_events:
381
+ note_on_events = chapitch2note_on_events[chapitch]
382
+ for new_e in note_on_events:
383
+ new_e[2] = ticks_so_far - new_e[1]
384
+ score_track.append(new_e)
385
+ pass #_warn("opus2score: note_on with no note_off cha="+str(new_e[3])+' pitch='+str(new_e[4])+'; adding note_off at end')
386
+ score.append(score_track)
387
+ _clean_up_warnings()
388
+ return score
389
+
390
+ def midi2score(midi=b''):
391
+ r'''
392
+ Translates MIDI into a "score", using midi2opus() then opus2score()
393
+ '''
394
+ return opus2score(midi2opus(midi))
395
+
396
+ def midi2ms_score(midi=b''):
397
+ r'''
398
+ Translates MIDI into a "score" with one beat per second and one
399
+ tick per millisecond, using midi2opus() then to_millisecs()
400
+ then opus2score()
401
+ '''
402
+ return opus2score(to_millisecs(midi2opus(midi)))
403
+
404
+ #------------------------ Other Transformations ---------------------
405
+
406
+ def to_millisecs(old_opus=None):
407
+ r'''Recallibrates all the times in an "opus" to use one beat
408
+ per second and one tick per millisecond. This makes it
409
+ hard to retrieve any information about beats or barlines,
410
+ but it does make it easy to mix different scores together.
411
+ '''
412
+ if old_opus == None:
413
+ return [1000,[],]
414
+ try:
415
+ old_tpq = int(old_opus[0])
416
+ except IndexError: # 5.0
417
+ _warn('to_millisecs: the opus '+str(type(old_opus))+' has no elements')
418
+ return [1000,[],]
419
+ new_opus = [1000,]
420
+ # 6.7 first go through building a table of set_tempos by absolute-tick
421
+ ticks2tempo = {}
422
+ itrack = 1
423
+ while itrack < len(old_opus):
424
+ ticks_so_far = 0
425
+ for old_event in old_opus[itrack]:
426
+ if old_event[0] == 'note':
427
+ raise TypeError('to_millisecs needs an opus, not a score')
428
+ ticks_so_far += old_event[1]
429
+ if old_event[0] == 'set_tempo':
430
+ ticks2tempo[ticks_so_far] = old_event[2]
431
+ itrack += 1
432
+ # then get the sorted-array of their keys
433
+ tempo_ticks = [] # list of keys
434
+ for k in ticks2tempo.keys():
435
+ tempo_ticks.append(k)
436
+ tempo_ticks.sort()
437
+ # then go through converting to millisec, testing if the next
438
+ # set_tempo lies before the next track-event, and using it if so.
439
+ itrack = 1
440
+ while itrack < len(old_opus):
441
+ ms_per_old_tick = 500.0 / old_tpq # float: will round later 6.3
442
+ i_tempo_ticks = 0
443
+ ticks_so_far = 0
444
+ ms_so_far = 0.0
445
+ previous_ms_so_far = 0.0
446
+ new_track = [['set_tempo',0,1000000],] # new "crochet" is 1 sec
447
+ for old_event in old_opus[itrack]:
448
+ # detect if ticks2tempo has something before this event
449
+ # 20160702 if ticks2tempo is at the same time, leave it
450
+ event_delta_ticks = old_event[1]
451
+ if (i_tempo_ticks < len(tempo_ticks) and
452
+ tempo_ticks[i_tempo_ticks] < (ticks_so_far + old_event[1])):
453
+ delta_ticks = tempo_ticks[i_tempo_ticks] - ticks_so_far
454
+ ms_so_far += (ms_per_old_tick * delta_ticks)
455
+ ticks_so_far = tempo_ticks[i_tempo_ticks]
456
+ ms_per_old_tick = ticks2tempo[ticks_so_far] / (1000.0*old_tpq)
457
+ i_tempo_ticks += 1
458
+ event_delta_ticks -= delta_ticks
459
+ new_event = copy.deepcopy(old_event) # now handle the new event
460
+ ms_so_far += (ms_per_old_tick * old_event[1])
461
+ new_event[1] = round(ms_so_far - previous_ms_so_far)
462
+ if old_event[0] != 'set_tempo':
463
+ previous_ms_so_far = ms_so_far
464
+ new_track.append(new_event)
465
+ ticks_so_far += event_delta_ticks
466
+ new_opus.append(new_track)
467
+ itrack += 1
468
+ _clean_up_warnings()
469
+ return new_opus
470
+
471
def event2alsaseq(event=None):   # 5.5
    r'''Convert one event to the representation used by the "alsaseq"
    module, http://pp.com.mx/python/alsaseq
    The type of track (opus or score) is meant to be autodetected.
    Currently unimplemented: always returns None.
    '''
    return None
477
+
478
def grep(score=None, channels=None):
    r'''Returns a "score" containing only the channels specified
    '''
    if score == None:
        return [1000,[],]
    new_score = [score[0],]    # preserve the ticks parameter
    if channels == None:
        return new_score
    wanted = set(channels)     # defend against tuples and lists
    global Event2channelindex
    for track in score[1:]:
        kept_events = []
        for event in track:
            # index of the channel field for this event type, or False
            # for channel-less (meta) events, which are always kept
            chan_at = Event2channelindex.get(event[0], False)
            if (not chan_at) or (event[chan_at] in wanted):
                kept_events.append(event)
        new_score.append(kept_events)
    return new_score
501
+
502
def play_score(score=None):
    r'''Converts the "score" to midi, and feeds it into 'aplaymidi -'
    '''
    if score == None:
        return
    import subprocess
    # stream the encoded MIDI straight into aplaymidi's stdin
    pipe = subprocess.Popen(['aplaymidi','-'], stdin=subprocess.PIPE)
    if score_type(score) == 'opus':
        midi_bytes = opus2midi(score)
    else:
        midi_bytes = score2midi(score)
    pipe.stdin.write(midi_bytes)
    pipe.stdin.close()
514
+
515
def timeshift(score=None, shift=None, start_time=None, from_time=0, tracks={0,1,2,3,4,5,6,7,8,10,12,13,14,15}):
    r'''Returns a "score" shifted in time by "shift" ticks, or shifted
    so that the first event starts at "start_time" ticks.

    If "from_time" is specified, only those events in the score
    that begin after it are shifted. If "start_time" is less than
    "from_time" (or "shift" is negative), then the intermediate
    notes are deleted, though patch-change events are preserved.

    If "tracks" are specified, then only those tracks get shifted.
    "tracks" can be a list, tuple or set; it gets converted to set
    internally.

    It is deprecated to specify both "shift" and "start_time".
    If this does happen, timeshift() will print a warning to
    stderr and ignore the "shift" argument.

    If "shift" is negative and sufficiently large that it would
    leave some event with a negative tick-value, then the score
    is shifted so that the first event occurs at time 0. This
    also occurs if "start_time" is negative, and is also the
    default if neither "shift" nor "start_time" are specified.
    '''
    # NOTE(review): the default "tracks" set omits channels... i.e. track
    # indices 9 and 11; omitting 11 looks like a typo (cf. segment(),
    # whose default includes 11) — confirm.  The mutable default set is
    # safe here because it is only read, then rebound below.
    #_warn('tracks='+str(tracks))
    if score == None or len(score) < 2:
        return [1000, [],]
    new_score = [score[0],]      # preserve the ticks parameter
    my_type = score_type(score)
    if my_type == '':
        return new_score
    if my_type == 'opus':
        _warn("timeshift: opus format is not supported\n")
        # _clean_up_scores() 6.2; doesn't exist! what was it supposed to do?
        return new_score
    # "shift" and "start_time" are mutually exclusive; start_time wins
    if not (shift == None) and not (start_time == None):
        _warn("timeshift: shift and start_time specified: ignoring shift\n")
        shift = None
    if shift == None:
        if (start_time == None) or (start_time < 0):
            start_time = 0
        # shift = start_time - from_time

    i = 1   # ignore first element (ticks)
    tracks = set(tracks)   # defend against tuples and lists
    earliest = 1000000000
    # First pass: find the earliest to-be-shifted event, needed either to
    # realise "start_time" or to clamp a too-negative "shift" at tick 0.
    if not (start_time == None) or shift < 0:   # first find the earliest event
        while i < len(score):
            if len(tracks) and not ((i-1) in tracks):
                i += 1
                continue
            for event in score[i]:
                if event[1] < from_time:
                    continue   # just inspect the to_be_shifted events
                if event[1] < earliest:
                    earliest = event[1]
            i += 1
        if earliest > 999999999:   # no events found: nothing to anchor on
            earliest = 0
        if shift == None:
            shift = start_time - earliest
        elif (earliest + shift) < 0:
            # clamp: never leave an event with a negative tick-value
            start_time = 0
            shift = 0 - earliest

    # Second pass: copy tracks, shifting the selected events
    i = 1   # ignore first element (ticks)
    while i < len(score):
        if len(tracks) == 0 or not ((i-1) in tracks):   # 3.8
            # unselected tracks are passed through by reference, unshifted
            new_score.append(score[i])
            i += 1
            continue
        new_track = []
        for event in score[i]:
            new_event = list(event)   # shallow copy so the input survives
            #if new_event[1] == 0 and shift > 0 and new_event[0] != 'note':
            #    pass
            #elif new_event[1] >= from_time:
            if new_event[1] >= from_time:
                # 4.1 must not rightshift set_tempo
                if new_event[0] != 'set_tempo' or shift<0:
                    new_event[1] += shift
            elif (shift < 0) and (new_event[1] >= (from_time+shift)):
                # event falls into the deleted gap: drop it
                continue
            new_track.append(new_event)
        if len(new_track) > 0:
            new_score.append(new_track)
        i += 1
    _clean_up_warnings()
    return new_score
603
+
604
def segment(score=None, start_time=None, end_time=None, start=0, end=100000000,
    tracks={0,1,2,3,4,5,6,7,8,10,11,12,13,14,15}):
    r'''Returns a "score" which is a segment of the one supplied
    as the argument, beginning at "start_time" ticks and ending
    at "end_time" ticks (or at the end if "end_time" is not supplied).
    If the set "tracks" is specified, only those tracks will
    be returned.
    '''
    # NOTE(review): the default "tracks" set omits 9 — presumably so the
    # GM percussion track is excluded by default; confirm.  The mutable
    # default is safe: it is only read, then rebound below.
    if score == None or len(score) < 2:
        return [1000, [],]
    if start_time == None:   # as of 4.2 start_time is recommended
        start_time = start   # start is legacy usage
    if end_time == None:     # likewise
        end_time = end
    new_score = [score[0],]  # preserve the ticks parameter
    my_type = score_type(score)
    if my_type == '':
        return new_score
    if my_type == 'opus':
        # more difficult (disconnecting note_on's from their note_off's)...
        _warn("segment: opus format is not supported\n")
        _clean_up_warnings()
        return new_score
    i = 1   # ignore first element (ticks); we count in ticks anyway
    tracks = set(tracks)   # defend against tuples and lists
    while i < len(score):
        if len(tracks) and not ((i-1) in tracks):
            i += 1
            continue
        new_track = []
        # Per-channel "sticky state" in force at start_time, so the
        # segment can be re-asserted at its beginning:
        channel2cc_num  = {}     # most recent controller change before start
        channel2cc_val  = {}
        channel2cc_time = {}
        channel2patch_num  = {}  # keep most recent patch change before start
        channel2patch_time = {}
        set_tempo_num  = 500000  # most recent tempo change before start 6.3
        set_tempo_time = 0
        earliest_note_time = end_time   # NOTE(review): computed but unused
        for event in score[i]:
            if event[0] == 'control_change':   # 6.5
                cc_time = channel2cc_time.get(event[2]) or 0
                if (event[1] <= start_time) and (event[1] >= cc_time):
                    channel2cc_num[event[2]]  = event[3]
                    channel2cc_val[event[2]]  = event[4]
                    channel2cc_time[event[2]] = event[1]
            elif event[0] == 'patch_change':
                patch_time = channel2patch_time.get(event[2]) or 0
                if (event[1]<=start_time) and (event[1] >= patch_time):   # 2.0
                    channel2patch_num[event[2]]  = event[3]
                    channel2patch_time[event[2]] = event[1]
            elif event[0] == 'set_tempo':
                if (event[1]<=start_time) and (event[1]>=set_tempo_time):   #6.4
                    set_tempo_num  = event[2]
                    set_tempo_time = event[1]
            # keep every event that lies inside the window
            if (event[1] >= start_time) and (event[1] <= end_time):
                new_track.append(event)
                if (event[0] == 'note') and (event[1] < earliest_note_time):
                    earliest_note_time = event[1]
        if len(new_track) > 0:
            # re-assert the sticky state at the start of the segment
            new_track.append(['set_tempo', start_time, set_tempo_num])
            for c in channel2patch_num:
                new_track.append(['patch_change',start_time,c,channel2patch_num[c]],)
            for c in channel2cc_num:   # 6.5
                new_track.append(['control_change',start_time,c,channel2cc_num[c],channel2cc_val[c]])
            new_score.append(new_track)
        i += 1
    _clean_up_warnings()
    return new_score
672
+
673
def score_type(opus_or_score=None):
    r'''Returns a string, either 'opus' or 'score' or ''
    '''
    # Reject non-lists and lists too short to hold ticks + one track.
    if (opus_or_score == None
            or 'list' not in str(type(opus_or_score))
            or len(opus_or_score) < 2):
        return ''
    # The first 'note' or 'note_on' event we meet decides the answer.
    for track in opus_or_score[1:]:   # skip first element (ticks)
        for event in track:
            if event[0] == 'note':
                return 'score'
            elif event[0] == 'note_on':
                return 'opus'
    return ''
687
+
688
def concatenate_scores(scores):
    r'''Concatenates a list of scores into one score.
    If the scores differ in their "ticks" parameter,
    they will all get converted to millisecond-tick format.
    '''
    # The deepcopies guard against the input scores being references to
    # the same object, e.g. when invoked by midisox's repeat().
    input_scores = _consistentise_ticks(scores)   # 3.7
    output_score = copy.deepcopy(input_scores[0])
    for input_score in input_scores[1:]:
        # each appended score starts where the accumulated one ends
        delta_ticks = score2stats(output_score)['nticks']
        for itrack in range(1, len(input_score)):
            if itrack >= len(output_score):   # new output track if doesn't exist
                output_score.append([])
            for event in input_score[itrack]:
                shifted_event = copy.deepcopy(event)
                shifted_event[1] += delta_ticks
                output_score[itrack].append(shifted_event)
    return output_score
709
+
710
def merge_scores(scores):
    r'''Merges a list of scores into one score. A merged score comprises
    all of the tracks from all of the input scores; un-merging is possible
    by selecting just some of the tracks. If the scores differ in their
    "ticks" parameter, they will all get converted to millisecond-tick
    format. merge_scores attempts to resolve channel-conflicts,
    but there are of course only 15 available channels...
    '''
    input_scores = _consistentise_ticks(scores)   # 3.6
    output_score = [1000]
    channels_so_far = set()   # channels already claimed by earlier scores
    all_channels = {0,1,2,3,4,5,6,7,8,10,11,12,13,14,15}
    global Event2channelindex
    for input_score in input_scores:
        new_channels = set(score2stats(input_score).get('channels_total', []))
        new_channels.discard(9)   # 2.8 cha9 must remain cha9 (in GM)
        # Any channel used both before and now is a conflict: move this
        # score's events on it to an unused channel.
        for channel in channels_so_far & new_channels:
            # consistently choose lowest available, to ease testing
            free_channels = list(all_channels - (channels_so_far|new_channels))
            if len(free_channels) > 0:
                free_channels.sort()
                free_channel = free_channels[0]
            else:
                # NOTE(review): if no channel is free, free_channel stays
                # None and earlier iterations may already have written
                # None into events — confirm intended behaviour.
                free_channel = None
                break
            itrack = 1
            while itrack < len(input_score):
                for input_event in input_score[itrack]:
                    channel_index=Event2channelindex.get(input_event[0],False)
                    if channel_index and input_event[channel_index]==channel:
                        # mutates the (deep-copied) input event in place
                        input_event[channel_index] = free_channel
                itrack += 1
            channels_so_far.add(free_channel)

        channels_so_far |= new_channels
        output_score.extend(input_score[1:])   # tracks are kept, never merged
    return output_score
747
+
748
+ def _ticks(event):
749
+ return event[1]
750
def mix_opus_tracks(input_tracks):   # 5.5
    r'''Mixes an array of tracks into one track. A mixed track
    cannot be un-mixed. It is assumed that the tracks share the same
    ticks parameter and the same tempo.
    Mixing score-tracks is trivial (just insert all events into one array).
    Mixing opus-tracks is only slightly harder, but it's common enough
    that a dedicated function is useful.
    '''
    # Convert each opus-track to score form (absolute-ish 'note' events),
    # pool the events, sort by time, then convert back to opus form.
    pooled = [1000, []]
    for opus_track in input_tracks:   # 5.8
        pooled[1].extend(opus2score([1000, opus_track])[1])
    pooled[1].sort(key=_ticks)
    return score2opus(pooled)[1]
766
+
767
def mix_scores(scores):
    r'''Mixes a list of scores into one one-track score.
    A mixed score cannot be un-mixed. Hopefully the scores
    have no undesirable channel-conflicts between them.
    If the scores differ in their "ticks" parameter,
    they will all get converted to millisecond-tick format.
    '''
    mixed = [1000, []]   # single output track
    for consistent_score in _consistentise_ticks(scores):   # 3.6
        for track in consistent_score[1:]:
            mixed[1].extend(track)
    return mixed
780
+
781
def score2stats(opus_or_score=None):
    r'''Returns a dict of some basic stats about the score, like
    bank_select (list of tuples (msb,lsb)),
    channels_by_track (list of lists), channels_total (set),
    general_midi_mode (list),
    ntracks, nticks, patch_changes_by_track (list of dicts),
    num_notes_by_channel (list of numbers),
    patch_changes_total (set),
    percussion (dict histogram of channel 9 events),
    pitches (dict histogram of pitches on channels other than 9),
    pitch_range_by_track (list, by track, of two-member-tuples),
    pitch_range_sum (sum over tracks of the pitch_ranges),
    '''
    # Accumulators; filled in one pass over every track's events.
    bank_select_msb = -1
    bank_select_lsb = -1
    bank_select = []
    channels_by_track = []
    channels_total = set([])
    general_midi_mode = []
    num_notes_by_channel = dict([])
    patches_used_by_track = []   # NOTE(review): initialised but never filled
    patches_used_total = set([]) # NOTE(review): initialised but never filled
    patch_changes_by_track = []
    patch_changes_total = set([])
    percussion = dict([])   # histogram of channel 9 "pitches"
    pitches = dict([])      # histogram of pitch-occurrences channels 0-8,10-15
    pitch_range_sum = 0     # sum over tracks of the pitch-ranges
    pitch_range_by_track = []
    is_a_score = True       # flips False on the first 'note_on' seen
    if opus_or_score == None:
        # empty-input stats, same keys as the real return value
        return {'bank_select':[], 'channels_by_track':[], 'channels_total':[],
            'general_midi_mode':[], 'ntracks':0, 'nticks':0,
            'num_notes_by_channel':dict([]),
            'patch_changes_by_track':[], 'patch_changes_total':[],
            'percussion':{}, 'pitches':{}, 'pitch_range_by_track':[],
            'ticks_per_quarter':0, 'pitch_range_sum':0}
    ticks_per_quarter = opus_or_score[0]
    i = 1   # ignore first element, which is ticks
    nticks = 0
    while i < len(opus_or_score):
        highest_pitch = 0
        lowest_pitch = 128
        channels_this_track = set([])
        patch_changes_this_track = dict({})
        for event in opus_or_score[i]:
            if event[0] == 'note':
                # score-style event: [time, duration, channel, pitch, velocity]
                num_notes_by_channel[event[3]] = num_notes_by_channel.get(event[3],0) + 1
                if event[3] == 9:
                    percussion[event[4]] = percussion.get(event[4],0) + 1
                else:
                    pitches[event[4]] = pitches.get(event[4],0) + 1
                    if event[4] > highest_pitch:
                        highest_pitch = event[4]
                    if event[4] < lowest_pitch:
                        lowest_pitch = event[4]
                channels_this_track.add(event[3])
                channels_total.add(event[3])
                finish_time = event[1] + event[2]
                if finish_time > nticks:
                    nticks = finish_time
            elif event[0] == 'note_off' or (event[0] == 'note_on' and event[4] == 0):   # 4.8
                # note-end in opus form (velocity-0 note_on counts too)
                finish_time = event[1]
                if finish_time > nticks:
                    nticks = finish_time
            elif event[0] == 'note_on':
                # opus-style event: [dtime, channel, pitch, velocity]
                is_a_score = False
                num_notes_by_channel[event[2]] = num_notes_by_channel.get(event[2],0) + 1
                if event[2] == 9:
                    percussion[event[3]] = percussion.get(event[3],0) + 1
                else:
                    pitches[event[3]] = pitches.get(event[3],0) + 1
                    if event[3] > highest_pitch:
                        highest_pitch = event[3]
                    if event[3] < lowest_pitch:
                        lowest_pitch = event[3]
                channels_this_track.add(event[2])
                channels_total.add(event[2])
            elif event[0] == 'patch_change':
                patch_changes_this_track[event[2]] = event[3]
                patch_changes_total.add(event[3])
            elif event[0] == 'control_change':
                if event[3] == 0:    # bank select MSB
                    bank_select_msb = event[4]
                elif event[3] == 32: # bank select LSB
                    bank_select_lsb = event[4]
                # a complete (msb,lsb) pair has been seen: record it
                if bank_select_msb >= 0 and bank_select_lsb >= 0:
                    bank_select.append((bank_select_msb,bank_select_lsb))
                    bank_select_msb = -1
                    bank_select_lsb = -1
            elif event[0] == 'sysex_f0':
                # recognise the GM-mode on/off sysex messages
                if _sysex2midimode.get(event[2], -1) >= 0:
                    general_midi_mode.append(_sysex2midimode.get(event[2]))
            # nticks: absolute times in a score, delta-times in an opus
            if is_a_score:
                if event[1] > nticks:
                    nticks = event[1]
            else:
                nticks += event[1]
        if lowest_pitch == 128:   # no pitched notes in this track
            lowest_pitch = 0
        channels_by_track.append(channels_this_track)
        patch_changes_by_track.append(patch_changes_this_track)
        pitch_range_by_track.append((lowest_pitch,highest_pitch))
        pitch_range_sum += (highest_pitch-lowest_pitch)
        i += 1

    return {'bank_select':bank_select,
            'channels_by_track':channels_by_track,
            'channels_total':channels_total,
            'general_midi_mode':general_midi_mode,
            'ntracks':len(opus_or_score)-1,
            'nticks':nticks,
            'num_notes_by_channel':num_notes_by_channel,
            'patch_changes_by_track':patch_changes_by_track,
            'patch_changes_total':patch_changes_total,
            'percussion':percussion,
            'pitches':pitches,
            'pitch_range_by_track':pitch_range_by_track,
            'pitch_range_sum':pitch_range_sum,
            'ticks_per_quarter':ticks_per_quarter}
900
+
901
#----------------------------- Event stuff --------------------------

# Sysex payloads that switch General-MIDI mode: on (1), off (0), GM2 (2).
# NOTE(review): the keys are str, but sysex event data may be bytes in
# Python 3, in which case lookups here would never match — verify against
# what _decode() stores in sysex_f0 events.
_sysex2midimode = {
    "\x7E\x7F\x09\x01\xF7": 1,
    "\x7E\x7F\x09\x02\xF7": 0,
    "\x7E\x7F\x09\x03\xF7": 2,
}

# Some public-access tuples:
# Channel (voice) events:
MIDI_events = tuple('''note_off note_on key_after_touch
control_change patch_change channel_after_touch
pitch_wheel_change'''.split())

# Meta-events carrying text payloads:
Text_events = tuple('''text_event copyright_text_event
track_name instrument_name lyric marker cue_point text_event_08
text_event_09 text_event_0a text_event_0b text_event_0c
text_event_0d text_event_0e text_event_0f'''.split())

# Meta-events with non-text payloads:
Nontext_meta_events = tuple('''end_track set_tempo
smpte_offset time_signature key_signature sequencer_specific
raw_meta_event sysex_f0 sysex_f7 song_position song_select
tune_request'''.split())
# unsupported: raw_data

# Actually, 'tune_request' is an F-series event, not strictly a meta-event...
Meta_events = Text_events + Nontext_meta_events
All_events  = MIDI_events + Meta_events

# And three dictionaries:
Number2patch = {   # General MIDI patch numbers:
0:'Acoustic Grand',
1:'Bright Acoustic',
2:'Electric Grand',
3:'Honky-Tonk',
4:'Electric Piano 1',
5:'Electric Piano 2',
6:'Harpsichord',
7:'Clav',
8:'Celesta',
9:'Glockenspiel',
10:'Music Box',
11:'Vibraphone',
12:'Marimba',
13:'Xylophone',
14:'Tubular Bells',
15:'Dulcimer',
16:'Drawbar Organ',
17:'Percussive Organ',
18:'Rock Organ',
19:'Church Organ',
20:'Reed Organ',
21:'Accordion',
22:'Harmonica',
23:'Tango Accordion',
24:'Acoustic Guitar(nylon)',
25:'Acoustic Guitar(steel)',
26:'Electric Guitar(jazz)',
27:'Electric Guitar(clean)',
28:'Electric Guitar(muted)',
29:'Overdriven Guitar',
30:'Distortion Guitar',
31:'Guitar Harmonics',
32:'Acoustic Bass',
33:'Electric Bass(finger)',
34:'Electric Bass(pick)',
35:'Fretless Bass',
36:'Slap Bass 1',
37:'Slap Bass 2',
38:'Synth Bass 1',
39:'Synth Bass 2',
40:'Violin',
41:'Viola',
42:'Cello',
43:'Contrabass',
44:'Tremolo Strings',
45:'Pizzicato Strings',
46:'Orchestral Harp',
47:'Timpani',
48:'String Ensemble 1',
49:'String Ensemble 2',
50:'SynthStrings 1',
51:'SynthStrings 2',
52:'Choir Aahs',
53:'Voice Oohs',
54:'Synth Voice',
55:'Orchestra Hit',
56:'Trumpet',
57:'Trombone',
58:'Tuba',
59:'Muted Trumpet',
60:'French Horn',
61:'Brass Section',
62:'SynthBrass 1',
63:'SynthBrass 2',
64:'Soprano Sax',
65:'Alto Sax',
66:'Tenor Sax',
67:'Baritone Sax',
68:'Oboe',
69:'English Horn',
70:'Bassoon',
71:'Clarinet',
72:'Piccolo',
73:'Flute',
74:'Recorder',
75:'Pan Flute',
76:'Blown Bottle',
77:'Skakuhachi',
78:'Whistle',
79:'Ocarina',
80:'Lead 1 (square)',
81:'Lead 2 (sawtooth)',
82:'Lead 3 (calliope)',
83:'Lead 4 (chiff)',
84:'Lead 5 (charang)',
85:'Lead 6 (voice)',
86:'Lead 7 (fifths)',
87:'Lead 8 (bass+lead)',
88:'Pad 1 (new age)',
89:'Pad 2 (warm)',
90:'Pad 3 (polysynth)',
91:'Pad 4 (choir)',
92:'Pad 5 (bowed)',
93:'Pad 6 (metallic)',
94:'Pad 7 (halo)',
95:'Pad 8 (sweep)',
96:'FX 1 (rain)',
97:'FX 2 (soundtrack)',
98:'FX 3 (crystal)',
99:'FX 4 (atmosphere)',
100:'FX 5 (brightness)',
101:'FX 6 (goblins)',
102:'FX 7 (echoes)',
103:'FX 8 (sci-fi)',
104:'Sitar',
105:'Banjo',
106:'Shamisen',
107:'Koto',
108:'Kalimba',
109:'Bagpipe',
110:'Fiddle',
111:'Shanai',
112:'Tinkle Bell',
113:'Agogo',
114:'Steel Drums',
115:'Woodblock',
116:'Taiko Drum',
117:'Melodic Tom',
118:'Synth Drum',
119:'Reverse Cymbal',
120:'Guitar Fret Noise',
121:'Breath Noise',
122:'Seashore',
123:'Bird Tweet',
124:'Telephone Ring',
125:'Helicopter',
126:'Applause',
127:'Gunshot',
}
Notenum2percussion = {   # General MIDI Percussion (on Channel 9):
35:'Acoustic Bass Drum',
36:'Bass Drum 1',
37:'Side Stick',
38:'Acoustic Snare',
39:'Hand Clap',
40:'Electric Snare',
41:'Low Floor Tom',
42:'Closed Hi-Hat',
43:'High Floor Tom',
44:'Pedal Hi-Hat',
45:'Low Tom',
46:'Open Hi-Hat',
47:'Low-Mid Tom',
48:'Hi-Mid Tom',
49:'Crash Cymbal 1',
50:'High Tom',
51:'Ride Cymbal 1',
52:'Chinese Cymbal',
53:'Ride Bell',
54:'Tambourine',
55:'Splash Cymbal',
56:'Cowbell',
57:'Crash Cymbal 2',
58:'Vibraslap',
59:'Ride Cymbal 2',
60:'Hi Bongo',
61:'Low Bongo',
62:'Mute Hi Conga',
63:'Open Hi Conga',
64:'Low Conga',
65:'High Timbale',
66:'Low Timbale',
67:'High Agogo',
68:'Low Agogo',
69:'Cabasa',
70:'Maracas',
71:'Short Whistle',
72:'Long Whistle',
73:'Short Guiro',
74:'Long Guiro',
75:'Claves',
76:'Hi Wood Block',
77:'Low Wood Block',
78:'Mute Cuica',
79:'Open Cuica',
80:'Mute Triangle',
81:'Open Triangle',
}

# For each channel event type, the index of its channel field.
Event2channelindex = { 'note':3, 'note_off':2, 'note_on':2,
 'key_after_touch':2, 'control_change':2, 'patch_change':2,
 'channel_after_touch':2, 'pitch_wheel_change':2
}
1114
+
1115
+ ################################################################
1116
+ # The code below this line is full of frightening things, all to
1117
+ # do with the actual encoding and decoding of binary MIDI data.
1118
+
1119
+ def _twobytes2int(byte_a):
1120
+ r'''decode a 16 bit quantity from two bytes,'''
1121
+ return (byte_a[1] | (byte_a[0] << 8))
1122
+
1123
+ def _int2twobytes(int_16bit):
1124
+ r'''encode a 16 bit quantity into two bytes,'''
1125
+ return bytes([(int_16bit>>8) & 0xFF, int_16bit & 0xFF])
1126
+
1127
+ def _read_14_bit(byte_a):
1128
+ r'''decode a 14 bit quantity from two bytes,'''
1129
+ return (byte_a[0] | (byte_a[1] << 7))
1130
+
1131
+ def _write_14_bit(int_14bit):
1132
+ r'''encode a 14 bit quantity into two bytes,'''
1133
+ return bytes([int_14bit & 0x7F, (int_14bit>>7) & 0x7F])
1134
+
1135
+ def _ber_compressed_int(integer):
1136
+ r'''BER compressed integer (not an ASN.1 BER, see perlpacktut for
1137
+ details). Its bytes represent an unsigned integer in base 128,
1138
+ most significant digit first, with as few digits as possible.
1139
+ Bit eight (the high bit) is set on each byte except the last.
1140
+ '''
1141
+ ber = bytearray(b'')
1142
+ seven_bits = 0x7F & integer
1143
+ ber.insert(0, seven_bits) # XXX surely should convert to a char ?
1144
+ integer >>= 7
1145
+ while integer > 0:
1146
+ seven_bits = 0x7F & integer
1147
+ ber.insert(0, 0x80|seven_bits) # XXX surely should convert to a char ?
1148
+ integer >>= 7
1149
+ return ber
1150
+
1151
def _unshift_ber_int(ba):
    r'''Given a bytearray, returns a tuple of (the ber-integer at the
    start, and the remainder of the bytearray).
    '''
    # NB: consumes bytes from "ba" in place.
    if not len(ba):   # 6.7
        _warn('_unshift_ber_int: no integer found')
        return ((0, b""))
    integer = 0
    while True:
        byte = ba.pop(0)
        integer += (byte & 0x7F)       # low 7 bits are the digit
        if not (byte & 0x80):          # high bit clear: last digit
            return ((integer, ba))
        if not len(ba):                # ran out mid-integer: malformed
            _warn('_unshift_ber_int: no end-of-integer found')
            return ((0, ba))
        integer <<= 7                  # make room for the next digit
1169
+
1170
def _clean_up_warnings():   # 5.4
    # Call this before returning from any publicly callable function
    # whenever a warning might have been printed by it, or by any
    # private function it called: it flushes the repeat-counter that
    # _warn() maintains, and resets the dedup state.
    if _no_warning:
        return
    global _previous_times, _previous_warning
    if _previous_times > 1:
        # 6.7: sys.stderr.write rather than print(file=...), which once
        # triggered a pylint syntax-error false positive
        sys.stderr.write(' previous message repeated {0} times\n'.format(_previous_times))
    elif _previous_times > 0:
        sys.stderr.write(' previous message repeated\n')
    _previous_times = 0
    _previous_warning = ''
1187
+
1188
def _warn(s=''):
    # Print a warning to stderr, deduplicating consecutive repeats:
    # a repeated message only bumps a counter, which
    # _clean_up_warnings() later reports.
    if _no_warning:
        return
    global _previous_times, _previous_warning
    if s == _previous_warning:   # 5.4
        _previous_times += 1
    else:
        _clean_up_warnings()
        sys.stderr.write(str(s)+"\n")
        _previous_warning = s
1199
+
1200
def _some_text_event(which_kind=0x01, text=b'some_text'):
    # Encode a text-style meta-event: 0xFF, the kind byte, a
    # ber-compressed length, then the payload bytes.
    # 6.4: accept str for back-compatibility, encoding it as Latin-1.
    if str(type(text)).find("'str'") >= 0:
        data = bytes(text, encoding='ISO-8859-1')
    else:
        data = bytes(text)
    header = b'\xFF' + bytes((which_kind,))
    return header + _ber_compressed_int(len(data)) + data
1206
+
1207
def _consistentise_ticks(scores):   # 3.6
    # used by mix_scores, merge_scores, concatenate_scores
    # Returns deep copies; if the scores disagree on their ticks
    # parameter, every one is converted to millisecond-tick format.
    if len(scores) == 1:
        return copy.deepcopy(scores)
    reference_ticks = scores[0][0]
    if all(score[0] == reference_ticks for score in scores[1:]):
        return copy.deepcopy(scores)
    return [opus2score(to_millisecs(score2opus(score))) for score in scores]
1228
+
1229
+
1230
+ ###########################################################################
1231
+
1232
+ def _decode(trackdata=b'', exclude=None, include=None,
1233
+ event_callback=None, exclusive_event_callback=None, no_eot_magic=False):
1234
+ r'''Decodes MIDI track data into an opus-style list of events.
1235
+ The options:
1236
+ 'exclude' is a list of event types which will be ignored SHOULD BE A SET
1237
+ 'include' (and no exclude), makes exclude a list
1238
+ of all possible events, /minus/ what include specifies
1239
+ 'event_callback' is a coderef
1240
+ 'exclusive_event_callback' is a coderef
1241
+ '''
1242
+ trackdata = bytearray(trackdata)
1243
+ if exclude == None:
1244
+ exclude = []
1245
+ if include == None:
1246
+ include = []
1247
+ if include and not exclude:
1248
+ exclude = All_events
1249
+ include = set(include)
1250
+ exclude = set(exclude)
1251
+
1252
+ # Pointer = 0; not used here; we eat through the bytearray instead.
1253
+ event_code = -1; # used for running status
1254
+ event_count = 0;
1255
+ events = []
1256
+
1257
+ while(len(trackdata)):
1258
+ # loop while there's anything to analyze ...
1259
+ eot = False # When True, the event registrar aborts this loop
1260
+ event_count += 1
1261
+
1262
+ E = []
1263
+ # E for events - we'll feed it to the event registrar at the end.
1264
+
1265
+ # Slice off the delta time code, and analyze it
1266
+ [time, remainder] = _unshift_ber_int(trackdata)
1267
+
1268
+ # Now let's see what we can make of the command
1269
+ first_byte = trackdata.pop(0) & 0xFF
1270
+
1271
+ if (first_byte < 0xF0): # It's a MIDI event
1272
+ if (first_byte & 0x80):
1273
+ event_code = first_byte
1274
+ else:
1275
+ # It wants running status; use last event_code value
1276
+ trackdata.insert(0, first_byte)
1277
+ if (event_code == -1):
1278
+ _warn("Running status not set; Aborting track.")
1279
+ return []
1280
+
1281
+ command = event_code & 0xF0
1282
+ channel = event_code & 0x0F
1283
+
1284
+ if (command == 0xF6): # 0-byte argument
1285
+ pass
1286
+ elif (command == 0xC0 or command == 0xD0): # 1-byte argument
1287
+ parameter = trackdata.pop(0) # could be B
1288
+ else: # 2-byte argument could be BB or 14-bit
1289
+ parameter = (trackdata.pop(0), trackdata.pop(0))
1290
+
1291
+ #################################################################
1292
+ # MIDI events
1293
+
1294
+ if (command == 0x80):
1295
+ if 'note_off' in exclude:
1296
+ continue
1297
+ E = ['note_off', time, channel, parameter[0], parameter[1]]
1298
+ elif (command == 0x90):
1299
+ if 'note_on' in exclude:
1300
+ continue
1301
+ E = ['note_on', time, channel, parameter[0], parameter[1]]
1302
+ elif (command == 0xA0):
1303
+ if 'key_after_touch' in exclude:
1304
+ continue
1305
+ E = ['key_after_touch',time,channel,parameter[0],parameter[1]]
1306
+ elif (command == 0xB0):
1307
+ if 'control_change' in exclude:
1308
+ continue
1309
+ E = ['control_change',time,channel,parameter[0],parameter[1]]
1310
+ elif (command == 0xC0):
1311
+ if 'patch_change' in exclude:
1312
+ continue
1313
+ E = ['patch_change', time, channel, parameter]
1314
+ elif (command == 0xD0):
1315
+ if 'channel_after_touch' in exclude:
1316
+ continue
1317
+ E = ['channel_after_touch', time, channel, parameter]
1318
+ elif (command == 0xE0):
1319
+ if 'pitch_wheel_change' in exclude:
1320
+ continue
1321
+ E = ['pitch_wheel_change', time, channel,
1322
+ _read_14_bit(parameter)-0x2000]
1323
+ else:
1324
+ _warn("Shouldn't get here; command="+hex(command))
1325
+
1326
+ elif (first_byte == 0xFF): # It's a Meta-Event! ##################
1327
+ #[command, length, remainder] =
1328
+ # unpack("xCwa*", substr(trackdata, $Pointer, 6));
1329
+ #Pointer += 6 - len(remainder);
1330
+ # # Move past JUST the length-encoded.
1331
+ command = trackdata.pop(0) & 0xFF
1332
+ [length, trackdata] = _unshift_ber_int(trackdata)
1333
+ if (command == 0x00):
1334
+ if (length == 2):
1335
+ E = ['set_sequence_number',time,_twobytes2int(trackdata)]
1336
+ else:
1337
+ _warn('set_sequence_number: length must be 2, not '+str(length))
1338
+ E = ['set_sequence_number', time, 0]
1339
+
1340
+ elif command >= 0x01 and command <= 0x0f: # Text events
1341
+ # 6.2 take it in bytes; let the user get the right encoding.
1342
+ # text_str = trackdata[0:length].decode('ascii','ignore')
1343
+ # text_str = trackdata[0:length].decode('ISO-8859-1')
1344
+ # 6.4 take it in bytes; let the user get the right encoding.
1345
+ text_data = bytes(trackdata[0:length]) # 6.4
1346
+ # Defined text events
1347
+ if (command == 0x01):
1348
+ E = ['text_event', time, text_data]
1349
+ elif (command == 0x02):
1350
+ E = ['copyright_text_event', time, text_data]
1351
+ elif (command == 0x03):
1352
+ E = ['track_name', time, text_data]
1353
+ elif (command == 0x04):
1354
+ E = ['instrument_name', time, text_data]
1355
+ elif (command == 0x05):
1356
+ E = ['lyric', time, text_data]
1357
+ elif (command == 0x06):
1358
+ E = ['marker', time, text_data]
1359
+ elif (command == 0x07):
1360
+ E = ['cue_point', time, text_data]
1361
+ # Reserved but apparently unassigned text events
1362
+ elif (command == 0x08):
1363
+ E = ['text_event_08', time, text_data]
1364
+ elif (command == 0x09):
1365
+ E = ['text_event_09', time, text_data]
1366
+ elif (command == 0x0a):
1367
+ E = ['text_event_0a', time, text_data]
1368
+ elif (command == 0x0b):
1369
+ E = ['text_event_0b', time, text_data]
1370
+ elif (command == 0x0c):
1371
+ E = ['text_event_0c', time, text_data]
1372
+ elif (command == 0x0d):
1373
+ E = ['text_event_0d', time, text_data]
1374
+ elif (command == 0x0e):
1375
+ E = ['text_event_0e', time, text_data]
1376
+ elif (command == 0x0f):
1377
+ E = ['text_event_0f', time, text_data]
1378
+
1379
+ # Now the sticky events -------------------------------------
1380
+ elif (command == 0x2F):
1381
+ E = ['end_track', time]
1382
+ # The code for handling this, oddly, comes LATER,
1383
+ # in the event registrar.
1384
+ elif (command == 0x51): # DTime, Microseconds/Crochet
1385
+ if length != 3:
1386
+ _warn('set_tempo event, but length='+str(length))
1387
+ E = ['set_tempo', time,
1388
+ struct.unpack(">I", b'\x00'+trackdata[0:3])[0]]
1389
+ elif (command == 0x54):
1390
+ if length != 5: # DTime, HR, MN, SE, FR, FF
1391
+ _warn('smpte_offset event, but length='+str(length))
1392
+ E = ['smpte_offset',time] + list(struct.unpack(">BBBBB",trackdata[0:5]))
1393
+ elif (command == 0x58):
1394
+ if length != 4: # DTime, NN, DD, CC, BB
1395
+ _warn('time_signature event, but length='+str(length))
1396
+ E = ['time_signature', time]+list(trackdata[0:4])
1397
+ elif (command == 0x59):
1398
+ if length != 2: # DTime, SF(signed), MI
1399
+ _warn('key_signature event, but length='+str(length))
1400
+ E = ['key_signature',time] + list(struct.unpack(">bB",trackdata[0:2]))
1401
+ elif (command == 0x7F): # 6.4
1402
+ E = ['sequencer_specific',time, bytes(trackdata[0:length])]
1403
+ else:
1404
+ E = ['raw_meta_event', time, command,
1405
+ bytes(trackdata[0:length])] # 6.0
1406
+ #"[uninterpretable meta-event command of length length]"
1407
+ # DTime, Command, Binary Data
1408
+ # It's uninterpretable; record it as raw_data.
1409
+
1410
+ # Pointer += length; # Now move Pointer
1411
+ trackdata = trackdata[length:]
1412
+
1413
+ ######################################################################
1414
+ elif (first_byte == 0xF0 or first_byte == 0xF7):
1415
+ # Note that sysexes in MIDI /files/ are different than sysexes
1416
+ # in MIDI transmissions!! The vast majority of system exclusive
1417
+ # messages will just use the F0 format. For instance, the
1418
+ # transmitted message F0 43 12 00 07 F7 would be stored in a
1419
+ # MIDI file as F0 05 43 12 00 07 F7. As mentioned above, it is
1420
+ # required to include the F7 at the end so that the reader of the
1421
+ # MIDI file knows that it has read the entire message. (But the F7
1422
+ # is omitted if this is a non-final block in a multiblock sysex;
1423
+ # but the F7 (if there) is counted in the message's declared
1424
+ # length, so we don't have to think about it anyway.)
1425
+ #command = trackdata.pop(0)
1426
+ [length, trackdata] = _unshift_ber_int(trackdata)
1427
+ if first_byte == 0xF0:
1428
+ # 20091008 added ISO-8859-1 to get an 8-bit str
1429
+ # 6.4 return bytes instead
1430
+ E = ['sysex_f0', time, bytes(trackdata[0:length])]
1431
+ else:
1432
+ E = ['sysex_f7', time, bytes(trackdata[0:length])]
1433
+ trackdata = trackdata[length:]
1434
+
1435
+ ######################################################################
1436
+ # Now, the MIDI file spec says:
1437
+ # <track data> = <MTrk event>+
1438
+ # <MTrk event> = <delta-time> <event>
1439
+ # <event> = <MIDI event> | <sysex event> | <meta-event>
1440
+ # I know that, on the wire, <MIDI event> can include note_on,
1441
+ # note_off, and all the other 8x to Ex events, AND Fx events
1442
+ # other than F0, F7, and FF -- namely, <song position msg>,
1443
+ # <song select msg>, and <tune request>.
1444
+ #
1445
+ # Whether these can occur in MIDI files is not clear specified
1446
+ # from the MIDI file spec. So, I'm going to assume that
1447
+ # they CAN, in practice, occur. I don't know whether it's
1448
+ # proper for you to actually emit these into a MIDI file.
1449
+
1450
+ elif (first_byte == 0xF2): # DTime, Beats
1451
+ # <song position msg> ::= F2 <data pair>
1452
+ E = ['song_position', time, _read_14_bit(trackdata[:2])]
1453
+ trackdata = trackdata[2:]
1454
+
1455
+ elif (first_byte == 0xF3): # <song select msg> ::= F3 <data singlet>
1456
+ # E = ['song_select', time, struct.unpack('>B',trackdata.pop(0))[0]]
1457
+ E = ['song_select', time, trackdata[0]]
1458
+ trackdata = trackdata[1:]
1459
+ # DTime, Thing (what?! song number? whatever ...)
1460
+
1461
+ elif (first_byte == 0xF6): # DTime
1462
+ E = ['tune_request', time]
1463
+ # What would a tune request be doing in a MIDI /file/?
1464
+
1465
+ #########################################################
1466
+ # ADD MORE META-EVENTS HERE. TODO:
1467
+ # f1 -- MTC Quarter Frame Message. One data byte follows
1468
+ # the Status; it's the time code value, from 0 to 127.
1469
+ # f8 -- MIDI clock. no data.
1470
+ # fa -- MIDI start. no data.
1471
+ # fb -- MIDI continue. no data.
1472
+ # fc -- MIDI stop. no data.
1473
+ # fe -- Active sense. no data.
1474
+ # f4 f5 f9 fd -- unallocated
1475
+
1476
+ r'''
1477
+ elif (first_byte > 0xF0) { # Some unknown kinda F-series event ####
1478
+ # Here we only produce a one-byte piece of raw data.
1479
+ # But the encoder for 'raw_data' accepts any length of it.
1480
+ E = [ 'raw_data',
1481
+ time, substr(trackdata,Pointer,1) ]
1482
+ # DTime and the Data (in this case, the one Event-byte)
1483
+ ++Pointer; # itself
1484
+
1485
+ '''
1486
+ elif first_byte > 0xF0: # Some unknown F-series event
1487
+ # Here we only produce a one-byte piece of raw data.
1488
+ # E = ['raw_data', time, bytest(trackdata[0])] # 6.4
1489
+ E = ['raw_data', time, trackdata[0]] # 6.4 6.7
1490
+ trackdata = trackdata[1:]
1491
+ else: # Fallthru.
1492
+ _warn("Aborting track. Command-byte first_byte="+hex(first_byte))
1493
+ break
1494
+ # End of the big if-group
1495
+
1496
+
1497
+ ######################################################################
1498
+ # THE EVENT REGISTRAR...
1499
+ if E and (E[0] == 'end_track'):
1500
+ # This is the code for exceptional handling of the EOT event.
1501
+ eot = True
1502
+ if not no_eot_magic:
1503
+ if E[1] > 0: # a null text-event to carry the delta-time
1504
+ E = ['text_event', E[1], '']
1505
+ else:
1506
+ E = [] # EOT with a delta-time of 0; ignore it.
1507
+
1508
+ if E and not (E[0] in exclude):
1509
+ #if ( $exclusive_event_callback ):
1510
+ # &{ $exclusive_event_callback }( @E );
1511
+ #else:
1512
+ # &{ $event_callback }( @E ) if $event_callback;
1513
+ events.append(E)
1514
+ if eot:
1515
+ break
1516
+
1517
+ # End of the big "Event" while-block
1518
+
1519
+ return events
1520
+
1521
+
1522
+ ###########################################################################
1523
def _encode(events_lol, unknown_callback=None, never_add_eot=False,
      no_eot_magic=False, no_running_status=False):
    '''Encode an event structure (a list of events, each of the form
    [name, dtime, ...parameters]) into MIDI track data (bytes).

    Unless never_add_eot is set, an 'end_track' event is guaranteed at the
    end: a track-final 0-length text_event is converted into one (keeping
    its delta-time) unless no_eot_magic is set.  Channel events use MIDI
    running status unless no_running_status is set.  Unrecognised events
    are passed to unknown_callback if given, otherwise warned about and
    skipped.
    '''
    data = []  # the chunks of byte-data; joined at the end

    # Work on a deep copy so the end_track magic below can't corrupt the
    # caller's structure.
    events = copy.deepcopy(events_lol)

    if not never_add_eot:
        # One way or another, tack on an 'end_track'.
        if events:
            last = events[-1]
            if not (last[0] == 'end_track'):  # no end_track already
                if (last[0] == 'text_event' and len(last[2]) == 0):
                    # 0-length text event at track-end.
                    if no_eot_magic:
                        # Exceptional case: don't mess with track-final
                        # 0-length text_events; just peg on an end_track.
                        events.append(['end_track', 0])
                    else:
                        # NORMAL CASE: replace with an end_track,
                        # leaving the delta-time unchanged.
                        last[0] = 'end_track'
                else:
                    # last event was neither a 0-length text_event
                    # nor an end_track
                    events.append(['end_track', 0])
        else:  # an eventless track!
            events = [['end_track', 0], ]

    # All text-style meta-events share one wire encoding and differ only in
    # the command byte; dispatch through a table instead of a 15-branch
    # elif chain.
    text_meta_commands = {
        'text_event': 0x01, 'copyright_text_event': 0x02,
        'track_name': 0x03, 'instrument_name': 0x04, 'lyric': 0x05,
        'marker': 0x06, 'cue_point': 0x07,
        'text_event_08': 0x08, 'text_event_09': 0x09,
        'text_event_0a': 0x0A, 'text_event_0b': 0x0B,
        'text_event_0c': 0x0C, 'text_event_0d': 0x0D,
        'text_event_0e': 0x0E, 'text_event_0f': 0x0F,
    }

    # Status byte of the previous channel event, for running status;
    # -1 means "no usable previous status".
    last_status = -1

    for event_r in events:
        E = copy.deepcopy(event_r)
        # otherwise the shifting'd corrupt the original
        if not E:
            continue

        event = E.pop(0)
        if not event:  # empty event name
            continue

        dtime = int(E.pop(0))

        event_data = ''

        if (  # MIDI channel events -- eligible for running status
                event == 'note_on'
                or event == 'note_off'
                or event == 'control_change'
                or event == 'key_after_touch'
                or event == 'patch_change'
                or event == 'channel_after_touch'
                or event == 'pitch_wheel_change'):

            # This block is where we spend most of the time.  Gotta be tight.
            if (event == 'note_off'):
                status = 0x80 | (int(E[0]) & 0x0F)
                parameters = struct.pack('>BB', int(E[1]) & 0x7F,
                                         int(E[2]) & 0x7F)
            elif (event == 'note_on'):
                status = 0x90 | (int(E[0]) & 0x0F)
                parameters = struct.pack('>BB', int(E[1]) & 0x7F,
                                         int(E[2]) & 0x7F)
            elif (event == 'key_after_touch'):
                status = 0xA0 | (int(E[0]) & 0x0F)
                parameters = struct.pack('>BB', int(E[1]) & 0x7F,
                                         int(E[2]) & 0x7F)
            elif (event == 'control_change'):
                status = 0xB0 | (int(E[0]) & 0x0F)
                parameters = struct.pack('>BB', int(E[1]) & 0xFF,
                                         int(E[2]) & 0xFF)
            elif (event == 'patch_change'):
                status = 0xC0 | (int(E[0]) & 0x0F)
                parameters = struct.pack('>B', int(E[1]) & 0xFF)
            elif (event == 'channel_after_touch'):
                status = 0xD0 | (int(E[0]) & 0x0F)
                parameters = struct.pack('>B', int(E[1]) & 0xFF)
            elif (event == 'pitch_wheel_change'):
                status = 0xE0 | (int(E[0]) & 0x0F)
                parameters = _write_14_bit(int(E[1]) + 0x2000)
            else:
                # Unreachable: the outer condition already restricted the
                # event names, but keep the guard for safety.
                _warn("BADASS FREAKOUT ERROR 31415!")

            # Delta-time is a BER compressed integer: base-128 digits,
            # most significant first, high bit set on all but the last.
            data.append(_ber_compressed_int(dtime))
            if (status != last_status) or no_running_status:
                data.append(struct.pack('>B', status))
            data.append(parameters)

            last_status = status
            continue
        else:
            # Not a channel event: running status can't span it.
            last_status = -1

            if event == 'raw_meta_event':
                event_data = _some_text_event(int(E[0]), E[1])
            elif (event == 'set_sequence_number'):  # 3.9
                event_data = b'\xFF\x00\x02' + _int2twobytes(E[0])
            elif event in text_meta_commands:
                event_data = _some_text_event(text_meta_commands[event], E[0])
            elif (event == 'end_track'):
                event_data = b"\xFF\x2F\x00"
            elif (event == 'set_tempo'):
                # 3-byte big-endian microseconds-per-quarter-note.
                event_data = b'\xFF\x51\x03' + struct.pack('>I', E[0])[1:]
            elif (event == 'smpte_offset'):
                event_data = struct.pack(">BBBbBBBB", 0xFF, 0x54, 0x05,
                                         E[0], E[1], E[2], E[3], E[4])
            elif (event == 'time_signature'):
                event_data = struct.pack(">BBBbBBB", 0xFF, 0x58, 0x04,
                                         E[0], E[1], E[2], E[3])
            elif (event == 'key_signature'):
                # sf is signed (flats negative), mi is 0=major 1=minor.
                event_data = struct.pack(">BBBbB", 0xFF, 0x59, 0x02,
                                         E[0], E[1])
            elif (event == 'sequencer_specific'):
                event_data = _some_text_event(0x7F, E[0])
            # End of Meta-events

            # Other Things...
            elif (event == 'sysex_f0'):
                event_data = (bytearray(b'\xF0')
                              + _ber_compressed_int(len(E[0]))
                              + bytearray(E[0]))
            elif (event == 'sysex_f7'):
                event_data = (bytearray(b'\xF7')
                              + _ber_compressed_int(len(E[0]))
                              + bytearray(E[0]))
            elif (event == 'song_position'):
                event_data = b"\xF2" + _write_14_bit(E[0])
            elif (event == 'song_select'):
                event_data = struct.pack('>BB', 0xF3, E[0])
            elif (event == 'tune_request'):
                event_data = b"\xF6"
            elif (event == 'raw_data'):
                _warn("_encode: raw_data event not supported")
                continue
            # End of Other Stuff

            else:
                # The Big Fallthru
                if unknown_callback:
                    # push(@data, &{ $unknown_callback }( @$event_r ))
                    pass
                else:
                    _warn("Unknown event: " + str(event))
                    # To suppress the complaint here, just set
                    # 'unknown_callback' => sub { return () }
                continue

        if isinstance(event_data, str):
            # Legacy str payload: force to 8-bit bytes.
            event_data = bytearray(event_data.encode('Latin1', 'ignore'))
        if len(event_data):  # how could event_data be empty
            data.append(_ber_compressed_int(dtime) + event_data)

    return b''.join(data)
1735
+
README.md CHANGED
@@ -1,12 +1,13 @@
1
- ---
2
- title: Music
3
- emoji: 📊
4
- colorFrom: yellow
5
- colorTo: blue
6
- sdk: gradio
7
- sdk_version: 5.0.2
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
1
+ ---
2
+ title: Midi Music Generator
3
+ emoji: 🎼🎶
4
+ colorFrom: red
5
+ colorTo: indigo
6
+ sdk: gradio
7
+ sdk_version: 5.0.1
8
+ app_file: app_onnx.py
9
+ pinned: true
10
+ license: apache-2.0
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,534 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spaces
2
+ import random
3
+ import argparse
4
+ import glob
5
+ import json
6
+ import os
7
+ import time
8
+ from concurrent.futures import ThreadPoolExecutor
9
+
10
+ import gradio as gr
11
+ import numpy as np
12
+ import torch
13
+ import torch.nn.functional as F
14
+ import tqdm
15
+ from huggingface_hub import hf_hub_download
16
+ from transformers import DynamicCache
17
+
18
+ import MIDI
19
+ from midi_model import MIDIModel, MIDIModelConfig
20
+ from midi_synthesizer import MidiSynthesizer
21
+
22
# Largest value a random seed may take (fits in a signed 32-bit int).
MAX_SEED = np.iinfo(np.int32).max
# SYSTEM=spaces is presumably set by the Hugging Face Spaces runtime — TODO confirm.
in_space = os.getenv("SYSTEM") == "spaces"
24
+
25
+
26
@torch.inference_mode()
def generate(model: MIDIModel, prompt=None, batch_size=1, max_len=512, temp=1.0, top_p=0.98, top_k=20,
             disable_patch_change=False, disable_control_change=False, disable_channels=None, generator=None):
    """Autoregressively generate MIDI token sequences, yielding one event row per step.

    Each yielded value is an int64 ndarray of shape (batch_size, max_token_seq):
    the token row just generated for every batch element.  Generation stops
    after max_len rows or once every batch element has emitted EOS.

    Args:
        model: the MIDI transformer; provides .tokenizer, .forward,
            .forward_token and .sample_top_p_k.
        prompt: optional ndarray of prompt tokens — 2-D (rows, tokens) or
            3-D (batch, rows, tokens); broadcast across the batch when its
            leading dim is 1.  None starts from a bare BOS row.
        batch_size: number of sequences generated in parallel.
        temp / top_p / top_k: sampling hyperparameters.
        disable_patch_change / disable_control_change: mask those event types
            out of the first-token choice.
        disable_channels: iterable of channel numbers to forbid.
        generator: optional torch.Generator for reproducible sampling.

    Raises:
        ValueError: if prompt has an incompatible shape.
    """
    tokenizer = model.tokenizer
    # Translate channel numbers into their token ids once, up front.
    if disable_channels is not None:
        disable_channels = [tokenizer.parameter_ids["channel"][c] for c in disable_channels]
    else:
        disable_channels = []
    max_token_seq = tokenizer.max_token_seq
    if prompt is None:
        # Start from a single BOS-padded row, replicated across the batch.
        input_tensor = torch.full((1, max_token_seq), tokenizer.pad_id, dtype=torch.long, device=model.device)
        input_tensor[0, 0] = tokenizer.bos_id  # bos
        input_tensor = input_tensor.unsqueeze(0)
        input_tensor = torch.cat([input_tensor] * batch_size, dim=0)
    else:
        # Normalise prompt to shape (batch_size, rows, tokens).
        if len(prompt.shape) == 2:
            prompt = prompt[None, :]
            prompt = np.repeat(prompt, repeats=batch_size, axis=0)
        elif prompt.shape[0] == 1:
            prompt = np.repeat(prompt, repeats=batch_size, axis=0)
        elif len(prompt.shape) != 3 or prompt.shape[0] != batch_size:
            raise ValueError(f"invalid shape for prompt, {prompt.shape}")
        # Clip or pad each row to exactly max_token_seq tokens.
        prompt = prompt[..., :max_token_seq]
        if prompt.shape[-1] < max_token_seq:
            prompt = np.pad(prompt, ((0, 0), (0, 0), (0, max_token_seq - prompt.shape[-1])),
                            mode="constant", constant_values=tokenizer.pad_id)
        input_tensor = torch.from_numpy(prompt).to(dtype=torch.long, device=model.device)
    cur_len = input_tensor.shape[1]
    bar = tqdm.tqdm(desc="generating", total=max_len - cur_len)
    cache1 = DynamicCache()  # KV cache for the outer (row-level) model
    past_len = 0             # rows already fed to the cached outer model
    with bar:
        while cur_len < max_len:
            end = [False] * batch_size
            # Only feed the rows not yet seen by cache1; take the last hidden state.
            hidden = model.forward(input_tensor[:, past_len:], cache=cache1)[:, -1]
            next_token_seq = None
            event_names = [""] * batch_size
            cache2 = DynamicCache()  # fresh KV cache for the inner token-level decoder
            for i in range(max_token_seq):
                # Build a 0/1 vocabulary mask constraining what each batch
                # element may sample at intra-row position i.
                mask = torch.zeros((batch_size, tokenizer.vocab_size), dtype=torch.int64, device=model.device)
                for b in range(batch_size):
                    if end[b]:
                        # Finished sequences can only emit padding.
                        mask[b, tokenizer.pad_id] = 1
                        continue
                    if i == 0:
                        # First token of a row picks the event type (or EOS).
                        mask_ids = list(tokenizer.event_ids.values()) + [tokenizer.eos_id]
                        if disable_patch_change:
                            mask_ids.remove(tokenizer.event_ids["patch_change"])
                        if disable_control_change:
                            mask_ids.remove(tokenizer.event_ids["control_change"])
                        mask[b, mask_ids] = 1
                    else:
                        # Subsequent tokens fill in the event's parameters.
                        param_names = tokenizer.events[event_names[b]]
                        if i > len(param_names):
                            mask[b, tokenizer.pad_id] = 1
                            continue
                        param_name = param_names[i - 1]
                        mask_ids = tokenizer.parameter_ids[param_name]
                        if param_name == "channel":
                            mask_ids = [i for i in mask_ids if i not in disable_channels]
                        mask[b, mask_ids] = 1
                mask = mask.unsqueeze(1)
                x = next_token_seq
                if i != 0:
                    # After the first token only the newest token is fed;
                    # cache2 carries the earlier intra-row context.
                    hidden = None
                    x = x[:, -1:]
                logits = model.forward_token(hidden, x, cache=cache2)[:, -1:]
                # Masked softmax then nucleus/top-k sampling.
                scores = torch.softmax(logits / temp, dim=-1) * mask
                samples = model.sample_top_p_k(scores, top_p, top_k, generator=generator)
                if i == 0:
                    next_token_seq = samples
                    for b in range(batch_size):
                        if end[b]:
                            continue
                        eid = samples[b].item()
                        if eid == tokenizer.eos_id:
                            end[b] = True
                        else:
                            event_names[b] = tokenizer.id_events[eid]
                else:
                    next_token_seq = torch.cat([next_token_seq, samples], dim=1)
                    # Stop early once every live sequence has all its parameters.
                    if all([len(tokenizer.events[event_names[b]]) == i for b in range(batch_size) if not end[b]]):
                        break
            # Pad the freshly generated row out to the fixed row width.
            if next_token_seq.shape[1] < max_token_seq:
                next_token_seq = F.pad(next_token_seq, (0, max_token_seq - next_token_seq.shape[1]),
                                       "constant", value=tokenizer.pad_id)
            next_token_seq = next_token_seq.unsqueeze(1)
            input_tensor = torch.cat([input_tensor, next_token_seq], dim=1)
            past_len = cur_len
            cur_len += 1
            bar.update(1)
            yield next_token_seq[:, 0].cpu().numpy()
            if all(end):
                break
120
+
121
+
122
def create_msg(name, data):
    """Wrap a message name and payload in the dict shape the JS frontend expects."""
    return dict(name=name, data=data)
124
+
125
+
126
def send_msgs(msgs):
    """Serialize a list of frontend messages (see create_msg) to a JSON string."""
    return json.dumps(msgs)
128
+
129
+
130
def get_duration(model_name, tab, mid_seq, continuation_state, continuation_select, instruments, drum_kit, bpm,
                 time_sig, key_sig, mid, midi_events, reduce_cc_st, remap_track_channel, add_default_instr,
                 remove_empty_channels, seed, seed_rand, gen_events, temp, top_p, top_k, allow_cc):
    """Estimate the GPU-seconds budget for a run.

    Only model_name and gen_events matter; the remaining parameters exist so
    the signature mirrors run() (the @spaces.GPU duration hook receives the
    same arguments).  Large models are assumed to generate ~14 events/s,
    others ~23 events/s, plus a 5-second safety margin.
    """
    events_per_second = 14 if "large" in model_name else 23
    return gen_events // events_per_second + 5
137
+
138
+
139
@spaces.GPU(duration=get_duration)
def run(model_name, tab, mid_seq, continuation_state, continuation_select, instruments, drum_kit, bpm, time_sig,
        key_sig, mid, midi_events, reduce_cc_st, remap_track_channel, add_default_instr, remove_empty_channels,
        seed, seed_rand, gen_events, temp, top_p, top_k, allow_cc):
    """Top-level generation entry point driven by the Gradio UI.

    Builds a prompt according to the active tab (0 = from scratch with
    instrument/tempo/key settings, 1 = from an uploaded MIDI file,
    2 = continuation of the previous result), then streams tokens from
    generate(), yielding (mid_seq, continuation_state, seed, msgs_json)
    tuples so the frontend can update progressively.

    Relies on module globals: models, opt, OUTPUT_BATCH_SIZE, MAX_SEED,
    patch2number, drum_kits2number.
    """
    model = models[model_name]
    model.to(device=opt.device)
    tokenizer = model.tokenizer
    bpm = int(bpm)
    # Parse the "N/D" time signature; denominator is stored as log2 (2->1, 4->2, 8->3).
    if time_sig == "auto":
        time_sig = None
        time_sig_nn = 4
        time_sig_dd = 2
    else:
        time_sig_nn, time_sig_dd = time_sig.split('/')
        time_sig_nn = int(time_sig_nn)
        time_sig_dd = {2: 1, 4: 2, 8: 3}[int(time_sig_dd)]
    # Decode the key-signature dropdown index into (sharps/flats, major/minor).
    if key_sig == 0:
        key_sig = None
        key_sig_sf = 0
        key_sig_mi = 0
    else:
        key_sig = (key_sig - 1)
        key_sig_sf = key_sig // 2 - 7  # -7 flats .. +7 sharps
        key_sig_mi = key_sig % 2       # 0 = major, 1 = minor
    gen_events = int(gen_events)
    max_len = gen_events
    if seed_rand:
        seed = random.randint(0, MAX_SEED)
    generator = torch.Generator(opt.device).manual_seed(seed)
    disable_patch_change = False
    disable_channels = None
    if tab == 0:
        # Tab 0: build a fresh prompt from the UI settings.
        i = 0
        mid = [[tokenizer.bos_id] + [tokenizer.pad_id] * (tokenizer.max_token_seq - 1)]
        if tokenizer.version == "v2":
            # time/key signature events exist only in the v2 tokenizer.
            if time_sig is not None:
                mid.append(tokenizer.event2tokens(["time_signature", 0, 0, 0, time_sig_nn - 1, time_sig_dd - 1]))
            if key_sig is not None:
                mid.append(tokenizer.event2tokens(["key_signature", 0, 0, 0, key_sig_sf + 7, key_sig_mi]))
        if bpm != 0:
            mid.append(tokenizer.event2tokens(["set_tempo", 0, 0, 0, bpm]))
        patches = {}
        if instruments is None:
            instruments = []
        for instr in instruments:
            patches[i] = patch2number[instr]
            # Skip channel 9, which is reserved for drums.
            i = (i + 1) if i != 8 else 10
        if drum_kit != "None":
            patches[9] = drum_kits2number[drum_kit]
        for i, (c, p) in enumerate(patches.items()):
            mid.append(tokenizer.event2tokens(["patch_change", 0, 0, i + 1, c, p]))
        mid = np.asarray([mid] * OUTPUT_BATCH_SIZE, dtype=np.int64)
        mid_seq = mid.tolist()
        if len(instruments) > 0:
            # Lock the chosen instrumentation: no further patch changes,
            # and only the configured channels may be used.
            disable_patch_change = True
            disable_channels = [i for i in range(16) if i not in patches]
    elif tab == 1 and mid is not None:
        # Tab 1: tokenize an uploaded MIDI file as the prompt.
        eps = 4 if reduce_cc_st else 0
        mid = tokenizer.tokenize(MIDI.midi2score(mid), cc_eps=eps, tempo_eps=eps,
                                 remap_track_channel=remap_track_channel,
                                 add_default_instr=add_default_instr,
                                 remove_empty_channels=remove_empty_channels)
        mid = mid[:int(midi_events)]
        mid = np.asarray([mid] * OUTPUT_BATCH_SIZE, dtype=np.int64)
        mid_seq = mid.tolist()
    elif tab == 2 and mid_seq is not None:
        # Tab 2: continue a previous result.
        mid = np.asarray(mid_seq, dtype=np.int64)
        if continuation_select > 0:
            # Continue from one selected batch element; remember the full
            # previous state so undo_continuation can restore it.
            continuation_state.append(mid_seq)
            mid = np.repeat(mid[continuation_select - 1:continuation_select], repeats=OUTPUT_BATCH_SIZE, axis=0)
            mid_seq = mid.tolist()
        else:
            # Continue all; remember only the current length for undo.
            continuation_state.append(mid.shape[1])
    else:
        # Fallback: empty prompt, reset continuation history.
        continuation_state = [0]
        mid = [[tokenizer.bos_id] + [tokenizer.pad_id] * (tokenizer.max_token_seq - 1)]
        mid = np.asarray([mid] * OUTPUT_BATCH_SIZE, dtype=np.int64)
        mid_seq = mid.tolist()

    if mid is not None:
        max_len += mid.shape[1]

    init_msgs = [create_msg("progress", [0, gen_events])]
    if not (tab == 2 and continuation_select == 0):
        # Redraw the visualizers with the prompt before generation starts.
        for i in range(OUTPUT_BATCH_SIZE):
            events = [tokenizer.tokens2event(tokens) for tokens in mid_seq[i]]
            init_msgs += [create_msg("visualizer_clear", [i, tokenizer.version]),
                          create_msg("visualizer_append", [i, events])]
    yield mid_seq, continuation_state, seed, send_msgs(init_msgs)
    midi_generator = generate(model, mid, batch_size=OUTPUT_BATCH_SIZE, max_len=max_len, temp=temp,
                              top_p=top_p, top_k=top_k, disable_patch_change=disable_patch_change,
                              disable_control_change=not allow_cc, disable_channels=disable_channels,
                              generator=generator)
    events = [list() for i in range(OUTPUT_BATCH_SIZE)]
    t = time.time() + 1
    for i, token_seqs in enumerate(midi_generator):
        token_seqs = token_seqs.tolist()
        for j in range(OUTPUT_BATCH_SIZE):
            token_seq = token_seqs[j]
            mid_seq[j].append(token_seq)
            events[j].append(tokenizer.tokens2event(token_seq))
        # Throttle UI updates to at most ~2/s.
        if time.time() - t > 0.5:
            msgs = [create_msg("progress", [i + 1, gen_events])]
            for j in range(OUTPUT_BATCH_SIZE):
                msgs += [create_msg("visualizer_append", [j, events[j]])]
                events[j] = list()
            yield mid_seq, continuation_state, seed, send_msgs(msgs)
            t = time.time()
    yield mid_seq, continuation_state, seed, send_msgs([])
248
+
249
+
250
def finish_run(model_name, mid_seq):
    """Detokenize the finished batch, write each sequence to outputs/outputN.mid
    and return the file paths plus the final visualizer messages."""
    if mid_seq is None:
        placeholders = [None] * OUTPUT_BATCH_SIZE
        return *placeholders, []
    tokenizer = models[model_name].tokenizer
    if not os.path.exists("outputs"):
        os.mkdir("outputs")
    file_paths = []
    end_msgs = [create_msg("progress", [0, 0])]
    for idx in range(OUTPUT_BATCH_SIZE):
        shown_events = [tokenizer.tokens2event(tokens) for tokens in mid_seq[idx]]
        score = tokenizer.detokenize(mid_seq[idx])
        path = f"outputs/output{idx + 1}.mid"
        with open(path, 'wb') as midi_file:
            midi_file.write(MIDI.score2midi(score))
        file_paths.append(path)
        end_msgs += [create_msg("visualizer_clear", [idx, tokenizer.version]),
                     create_msg("visualizer_append", [idx, shown_events]),
                     create_msg("visualizer_end", idx)]
    return *file_paths, send_msgs(end_msgs)
269
+
270
+
271
def synthesis_task(mid):
    # Render one detokenized score to audio via the shared synthesizer.
    # Submitted to the thread pool by render_audio().
    return synthesizer.synthesis(MIDI.score2opus(mid))
273
+
274
def render_audio(model_name, mid_seq, should_render_audio):
    """Synthesize each generated sequence to audio in parallel.

    Returns one (sample_rate, waveform) pair per batch element — unwrapped
    to a single value when OUTPUT_BATCH_SIZE == 1 — or None placeholders
    when rendering is disabled or nothing has been generated yet.
    """
    if (not should_render_audio) or mid_seq is None:
        outputs = [None] * OUTPUT_BATCH_SIZE
        # Fix: keep the return arity consistent with the rendering path
        # below — a single value, not a 1-tuple, when the batch size is 1.
        if OUTPUT_BATCH_SIZE == 1:
            return outputs[0]
        return tuple(outputs)
    tokenizer = models[model_name].tokenizer
    outputs = []
    if not os.path.exists("outputs"):
        os.mkdir("outputs")
    # Fan the synthesis jobs out to the thread pool, then collect in order.
    audio_futures = []
    for i in range(OUTPUT_BATCH_SIZE):
        mid = tokenizer.detokenize(mid_seq[i])
        audio_future = thread_pool.submit(synthesis_task, mid)
        audio_futures.append(audio_future)
    for future in audio_futures:
        outputs.append((44100, future.result()))  # 44100 Hz sample rate
    if OUTPUT_BATCH_SIZE == 1:
        return outputs[0]
    return tuple(outputs)
292
+
293
+
294
def undo_continuation(model_name, mid_seq, continuation_state):
    """Revert the most recent continuation step and redraw the visualizers."""
    if mid_seq is None or len(continuation_state) < 2:
        return mid_seq, continuation_state, send_msgs([])
    tokenizer = models[model_name].tokenizer
    checkpoint = continuation_state[-1]
    # A list checkpoint is a full saved sequence set; an int checkpoint is
    # the sequence length to truncate back to.
    if isinstance(checkpoint, list):
        mid_seq = checkpoint
    else:
        mid_seq = [seq[:checkpoint] for seq in mid_seq]
    continuation_state = continuation_state[:-1]
    redraw_msgs = [create_msg("progress", [0, 0])]
    for idx in range(OUTPUT_BATCH_SIZE):
        shown_events = [tokenizer.tokens2event(tokens) for tokens in mid_seq[idx]]
        redraw_msgs += [create_msg("visualizer_clear", [idx, tokenizer.version]),
                        create_msg("visualizer_append", [idx, shown_events]),
                        create_msg("visualizer_end", idx)]
    return mid_seq, continuation_state, send_msgs(redraw_msgs)
310
+
311
+
312
def load_javascript(dir="javascript"):
    """Inject every .js file under *dir* into the pages Gradio serves.

    Monkey-patches gr.routes.templates.TemplateResponse so each rendered
    page gets the collected <script> tags spliced in just before </head>.
    The JS constant MIDI_OUTPUT_BATCH_SIZE is rewritten to match the
    server-side OUTPUT_BATCH_SIZE.
    """
    scripts_list = glob.glob(f"{dir}/*.js")
    javascript = ""
    for path in scripts_list:
        with open(path, "r", encoding="utf8") as jsfile:
            js_content = jsfile.read()
            # Keep the frontend batch size in sync with the server.
            js_content = js_content.replace("const MIDI_OUTPUT_BATCH_SIZE=4;",
                                            f"const MIDI_OUTPUT_BATCH_SIZE={OUTPUT_BATCH_SIZE};")
            javascript += f"\n<!-- {path} --><script>{js_content}</script>"
    template_response_ori = gr.routes.templates.TemplateResponse

    def template_response(*args, **kwargs):
        # Render the page normally, then splice our scripts into <head>.
        res = template_response_ori(*args, **kwargs)
        res.body = res.body.replace(
            b'</head>', f'{javascript}</head>'.encode("utf8"))
        res.init_headers()  # body changed, so headers must be recomputed
        return res

    gr.routes.templates.TemplateResponse = template_response
331
+
332
+
333
def hf_hub_download_retry(repo_id, filename):
    """Download a file from the Hugging Face Hub, retrying up to 30 times.

    Returns the local path reported by hf_hub_download; if every attempt
    fails, the last exception is re-raised.
    """
    # Fix: log the actual filename (the message previously printed the
    # literal placeholder "(unknown)").
    print(f"downloading {repo_id} {filename}")
    retry = 0
    err = None
    while retry < 30:
        try:
            return hf_hub_download(repo_id=repo_id, filename=filename)
        except Exception as e:
            err = e
            retry += 1
    if err:
        raise err
345
+
346
+
347
# Drum-kit program numbers -> human-readable names (-1 means "no drums").
number2drum_kits = {-1: "None", 0: "Standard", 8: "Room", 16: "Power", 24: "Electric", 25: "TR-808", 32: "Jazz",
                    40: "Blush", 48: "Orchestra"}
# Inverse lookups: instrument / drum-kit name -> MIDI program number.
patch2number = {v: k for k, v in MIDI.Number2patch.items()}
drum_kits2number = {v: k for k, v in number2drum_kits.items()}
# Key-signature dropdown choices; run() maps the selected index to the MIDI
# sf (flats/sharps) and mi (major/minor) values.
key_signatures = ['C♭', 'A♭m', 'G♭', 'E♭m', 'D♭', 'B♭m', 'A♭', 'Fm', 'E♭', 'Cm', 'B♭', 'Gm', 'F', 'Dm',
                  'C', 'Am', 'G', 'Em', 'D', 'Bm', 'A', 'F♯m', 'E', 'C♯m', 'B', 'G♯m', 'F♯', 'D♯m', 'C♯', 'A♯m']
353
+
354
if __name__ == "__main__":
    # ---- CLI options -------------------------------------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
    parser.add_argument("--port", type=int, default=7860, help="gradio server port")
    parser.add_argument("--device", type=str, default="cuda", help="device to run model")
    parser.add_argument("--batch", type=int, default=8, help="batch size")
    parser.add_argument("--max-gen", type=int, default=1024, help="max")
    opt = parser.parse_args()
    OUTPUT_BATCH_SIZE = opt.batch

    # ---- shared resources: soundfont, synth, worker pool -------------------
    soundfont_path = hf_hub_download_retry(repo_id="skytnt/midi-model", filename="soundfont.sf2")
    thread_pool = ThreadPoolExecutor(max_workers=OUTPUT_BATCH_SIZE)
    synthesizer = MidiSynthesizer(soundfont_path)

    # Display name -> [hub repo id, {lora display name -> lora repo id}]
    models_info = {
        "generic pretrain model (tv2o-medium) by skytnt": [
            "skytnt/midi-model-tv2o-medium", {
                "jpop": "skytnt/midi-model-tv2om-jpop-lora",
                "touhou": "skytnt/midi-model-tv2om-touhou-lora"
            }
        ],
        "generic pretrain model (tv2o-large) by asigalov61": [
            "asigalov61/Music-Llama", {}
        ],
        "generic pretrain model (tv2o-medium) by asigalov61": [
            "asigalov61/Music-Llama-Medium", {}
        ],
        "generic pretrain model (tv1-medium) by skytnt": [
            "skytnt/midi-model", {}
        ]
    }
    models = {}
    if opt.device == "cuda":
        # deterministic kernels + TF32/SDP fast paths for inference
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
        torch.backends.cuda.enable_mem_efficient_sdp(True)
        torch.backends.cuda.enable_flash_sdp(True)

    # Load each base model, then a fresh copy per LoRA with weights merged in.
    # Models are kept on CPU in fp32; presumably moved to the target device
    # at generation time — TODO confirm against run()'s implementation.
    for name, (repo_id, loras) in models_info.items():
        model = MIDIModel.from_pretrained(repo_id)
        model.to(device="cpu", dtype=torch.float32)
        models[name] = model
        for lora_name, lora_repo in loras.items():
            model = MIDIModel.from_pretrained(repo_id)
            print(f"loading lora {lora_repo} for {name}")
            model = model.load_merge_lora(lora_repo)
            model.to(device="cpu", dtype=torch.float32)
            models[f"{name} with {lora_name} lora"] = model

    # ---- gradio UI ---------------------------------------------------------
    load_javascript()
    app = gr.Blocks(theme=gr.themes.Soft())
    with app:
        gr.Markdown("<h1 style='text-align: center; margin-bottom: 1rem'>Midi Composer</h1>")
        gr.Markdown("![Visitors](https://api.visitorbadge.io/api/visitors?path=skytnt.midi-composer&style=flat)\n\n"
                    "Midi event transformer for symbolic music generation\n\n"
                    "Demo for [SkyTNT/midi-model](https://github.com/SkyTNT/midi-model)\n\n"
                    "[Open In Colab]"
                    "(https://colab.research.google.com/github/SkyTNT/midi-model/blob/main/demo.ipynb)"
                    " or [download windows app](https://github.com/SkyTNT/midi-model/releases)"
                    " for unlimited generation\n\n"
                    "**Update v1.3**: MIDITokenizerV2 and new MidiVisualizer\n\n"
                    "The current **best** model: generic pretrain model (tv2o-medium) by skytnt"
                    )
        # Hidden textbox: server pushes JSON message batches here; the JS
        # callback fans them out to the visualizer scripts.
        js_msg = gr.Textbox(elem_id="msg_receiver", visible=False)
        js_msg.change(None, [js_msg], [], js="""
            (msg_json) =>{
                let msgs = JSON.parse(msg_json);
                executeCallbacks(msgReceiveCallbacks, msgs);
                return [];
            }
            """)
        input_model = gr.Dropdown(label="select model", choices=list(models.keys()),
                                  type="value", value=list(models.keys())[0])
        # Which prompt tab is active: 0=custom, 1=midi file, 2=last output.
        tab_select = gr.State(value=0)
        with gr.Tabs():
            with gr.TabItem("custom prompt") as tab1:
                input_instruments = gr.Dropdown(label="🪗instruments (auto if empty)", choices=list(patch2number.keys()),
                                                multiselect=True, max_choices=15, type="value")
                input_drum_kit = gr.Dropdown(label="🥁drum kit", choices=list(drum_kits2number.keys()), type="value",
                                             value="None")
                input_bpm = gr.Slider(label="BPM (beats per minute, auto if 0)", minimum=0, maximum=255,
                                      step=1,
                                      value=0)
                input_time_sig = gr.Radio(label="time signature (only for tv2 models)",
                                          value="auto",
                                          choices=["auto", "4/4", "2/4", "3/4", "6/4", "7/4",
                                                   "2/2", "3/2", "4/2", "3/8", "5/8", "6/8", "7/8", "9/8", "12/8"]
                                          )
                input_key_sig = gr.Radio(label="key signature (only for tv2 models)",
                                         value="auto",
                                         choices=["auto"] + key_signatures,
                                         type="index"
                                         )
                example1 = gr.Examples([
                    [[], "None"],
                    [["Acoustic Grand"], "None"],
                    [['Acoustic Grand', 'SynthStrings 2', 'SynthStrings 1', 'Pizzicato Strings',
                      'Pad 2 (warm)', 'Tremolo Strings', 'String Ensemble 1'], "Orchestra"],
                    [['Trumpet', 'Oboe', 'Trombone', 'String Ensemble 1', 'Clarinet',
                      'French Horn', 'Pad 4 (choir)', 'Bassoon', 'Flute'], "None"],
                    [['Flute', 'French Horn', 'Clarinet', 'String Ensemble 2', 'English Horn', 'Bassoon',
                      'Oboe', 'Pizzicato Strings'], "Orchestra"],
                    [['Electric Piano 2', 'Lead 5 (charang)', 'Electric Bass(pick)', 'Lead 2 (sawtooth)',
                      'Pad 1 (new age)', 'Orchestra Hit', 'Cello', 'Electric Guitar(clean)'], "Standard"],
                    [["Electric Guitar(clean)", "Electric Guitar(muted)", "Overdriven Guitar", "Distortion Guitar",
                      "Electric Bass(finger)"], "Standard"]
                ], [input_instruments, input_drum_kit])
            with gr.TabItem("midi prompt") as tab2:
                input_midi = gr.File(label="input midi", file_types=[".midi", ".mid"], type="binary")
                input_midi_events = gr.Slider(label="use first n midi events as prompt", minimum=1, maximum=512,
                                              step=1,
                                              value=128)
                input_reduce_cc_st = gr.Checkbox(label="reduce control_change and set_tempo events", value=True)
                input_remap_track_channel = gr.Checkbox(
                    label="remap tracks and channels so each track has only one channel and in order", value=True)
                input_add_default_instr = gr.Checkbox(
                    label="add a default instrument to channels that don't have an instrument", value=True)
                input_remove_empty_channels = gr.Checkbox(label="remove channels without notes", value=False)
                example2 = gr.Examples([[file, 128] for file in glob.glob("example/*.mid")],
                                       [input_midi, input_midi_events])
            with gr.TabItem("last output prompt") as tab3:
                gr.Markdown("Continue generating on the last output.")
                input_continuation_select = gr.Radio(label="select output to continue generating", value="all",
                                                     choices=["all"] + [f"output{i + 1}" for i in
                                                                        range(OUTPUT_BATCH_SIZE)],
                                                     type="index"
                                                     )
                undo_btn = gr.Button("undo the last continuation")

        tab1.select(lambda: 0, None, tab_select, queue=False)
        tab2.select(lambda: 1, None, tab_select, queue=False)
        tab3.select(lambda: 2, None, tab_select, queue=False)
        input_seed = gr.Slider(label="seed", minimum=0, maximum=2 ** 31 - 1,
                               step=1, value=0)
        input_seed_rand = gr.Checkbox(label="random seed", value=True)
        input_gen_events = gr.Slider(label="generate max n midi events", minimum=1, maximum=opt.max_gen,
                                     step=1, value=opt.max_gen // 2)
        with gr.Accordion("options", open=False):
            input_temp = gr.Slider(label="temperature", minimum=0.1, maximum=1.2, step=0.01, value=1)
            input_top_p = gr.Slider(label="top p", minimum=0.1, maximum=1, step=0.01, value=0.95)
            input_top_k = gr.Slider(label="top k", minimum=1, maximum=128, step=1, value=20)
            input_allow_cc = gr.Checkbox(label="allow midi cc event", value=True)
            input_render_audio = gr.Checkbox(label="render audio after generation", value=True)
            example3 = gr.Examples([[1, 0.94, 128], [1, 0.98, 20], [1, 0.98, 12]],
                                   [input_temp, input_top_p, input_top_k])
        run_btn = gr.Button("generate", variant="primary")
        # stop_btn = gr.Button("stop and output")
        output_midi_seq = gr.State()
        output_continuation_state = gr.State([0])
        midi_outputs = []
        audio_outputs = []
        # One visualizer/audio/file slot per batch element.
        with gr.Tabs(elem_id="output_tabs"):
            for i in range(OUTPUT_BATCH_SIZE):
                with gr.TabItem(f"output {i + 1}") as tab1:
                    output_midi_visualizer = gr.HTML(elem_id=f"midi_visualizer_container_{i}")
                    output_audio = gr.Audio(label="output audio", format="mp3", elem_id=f"midi_audio_{i}")
                    output_midi = gr.File(label="output midi", file_types=[".mid"])
                    midi_outputs.append(output_midi)
                    audio_outputs.append(output_audio)
        # Pipeline: run (streams tokens) -> finish_run (writes .mid files)
        # -> render_audio (synthesizes waveforms).
        run_event = run_btn.click(run, [input_model, tab_select, output_midi_seq, output_continuation_state,
                                        input_continuation_select, input_instruments, input_drum_kit, input_bpm,
                                        input_time_sig, input_key_sig, input_midi, input_midi_events,
                                        input_reduce_cc_st, input_remap_track_channel,
                                        input_add_default_instr, input_remove_empty_channels,
                                        input_seed, input_seed_rand, input_gen_events, input_temp, input_top_p,
                                        input_top_k, input_allow_cc],
                                  [output_midi_seq, output_continuation_state, input_seed, js_msg],
                                  concurrency_limit=10, queue=True)
        finish_run_event = run_event.then(fn=finish_run,
                                          inputs=[input_model, output_midi_seq],
                                          outputs=midi_outputs + [js_msg],
                                          queue=False)
        finish_run_event.then(fn=render_audio,
                              inputs=[input_model, output_midi_seq, input_render_audio],
                              outputs=audio_outputs,
                              queue=False)
        # stop_btn.click(None, [], [], cancels=run_event,
        #                queue=False)
        undo_btn.click(undo_continuation, [input_model, output_midi_seq, output_continuation_state],
                       [output_midi_seq, output_continuation_state, js_msg], queue=False)
    app.queue().launch(server_port=opt.port, share=opt.share, inbrowser=True, ssr_mode=False)
    thread_pool.shutdown()
app_onnx.py ADDED
@@ -0,0 +1,626 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spaces
2
+ import random
3
+ import argparse
4
+ import glob
5
+ import json
6
+ import os
7
+ import time
8
+ from concurrent.futures import ThreadPoolExecutor
9
+
10
+ import gradio as gr
11
+ import numpy as np
12
+ import onnxruntime as rt
13
+ import tqdm
14
+ from huggingface_hub import hf_hub_download
15
+
16
+ import MIDI
17
+ from midi_synthesizer import MidiSynthesizer
18
+ from midi_tokenizer import MIDITokenizer
19
+
20
MAX_SEED = np.iinfo(np.int32).max  # upper bound for user-supplied RNG seeds
in_space = os.getenv("SYSTEM") == "spaces"  # True when running on HF Spaces
22
+
23
+
24
def softmax(x, axis):
    """Numerically stable softmax of `x` along `axis`."""
    # Shift by the max so exp() never overflows; result is unchanged.
    shifted = x - np.amax(x, axis=axis, keepdims=True)
    exps = np.exp(shifted)
    return exps / exps.sum(axis=axis, keepdims=True)
28
+
29
+
30
def sample_top_p_k(probs, p, k, generator=None):
    """Sample token indices from `probs` (last axis) with nucleus (top-p)
    and top-k filtering.

    Probabilities outside the nucleus or beyond rank k are zeroed, the rest
    renormalized, and one index is drawn per leading-axis row using
    `generator` (defaults to the global numpy RNG).
    """
    rng = np.random if generator is None else generator
    # Sort descending and remember the original indices.
    order = np.argsort(-probs, axis=-1)
    ranked = np.take_along_axis(probs, order, -1)
    # Nucleus mask: drop entries whose preceding cumulative mass exceeds p.
    cumulative = np.cumsum(ranked, axis=-1)
    ranked[cumulative - ranked > p] = 0.0
    # Top-k mask: keep only the first k ranked entries.
    keep = np.zeros(ranked.shape[-1])
    keep[:k] = 1
    ranked = ranked * keep
    ranked /= np.sum(ranked, axis=-1, keepdims=True)
    # Draw one original-vocabulary index per row.
    width = ranked.shape[-1]
    flat_probs = ranked.reshape(-1, width)
    flat_idx = order.reshape(-1, width)
    drawn = np.stack([rng.choice(ids, p=pv) for pv, ids in zip(flat_probs, flat_idx)])
    return drawn.reshape(*ranked.shape[:-1])
48
+
49
+
50
def apply_io_binding(model: rt.InferenceSession, inputs, outputs, batch_size, past_len, cur_len):
    """Bind input/output OrtValues for one ONNX Runtime inference step.

    KV-cache plumbing: each model input named ``past_key_values*`` is fed the
    matching ``present*`` output left over from the previous step when one
    exists, otherwise a freshly allocated buffer of sequence length
    `past_len`.  Each ``present*`` output gets a new buffer of length
    `cur_len`.  `inputs` and `outputs` are mutated in place so the caller can
    reuse the dicts across steps.  Buffer placement uses the module-level
    `device`.
    """
    io_binding = model.io_binding()
    for input_ in model.get_inputs():
        name = input_.name
        if name.startswith("past_key_values"):
            present_name = name.replace("past_key_values", "present")
            if present_name in outputs:
                # Reuse last step's KV cache as this step's "past".
                v = outputs[present_name]
            else:
                v = rt.OrtValue.ortvalue_from_shape_and_type(
                    (batch_size, input_.shape[1], past_len, input_.shape[3]),
                    element_type=np.float32,
                    device_type=device)
            inputs[name] = v
        else:
            v = inputs[name]
        io_binding.bind_ortvalue_input(name, v)

    for output in model.get_outputs():
        name = output.name
        if name.startswith("present"):
            # Allocate the KV-cache output for this step.
            v = rt.OrtValue.ortvalue_from_shape_and_type(
                (batch_size, output.shape[1], cur_len, output.shape[3]),
                element_type=np.float32,
                device_type=device)
            outputs[name] = v
        else:
            v = outputs[name]
        io_binding.bind_ortvalue_output(name, v)
    return io_binding
80
+
81
def generate(model, prompt=None, batch_size=1, max_len=512, temp=1.0, top_p=0.98, top_k=20,
             disable_patch_change=False, disable_control_change=False, disable_channels=None, generator=None):
    """Autoregressively generate MIDI event token rows.

    `model` is a triple [base ONNX session, token-head ONNX session,
    tokenizer].  Yields one ``(batch_size, max_token_seq)`` int64 array per
    generated event until `max_len` events or all sequences emit EOS.

    `prompt` may be None (BOS-only start), a 2-D ``(events, tokens)`` array
    broadcast to the batch, or a full 3-D ``(batch, events, tokens)`` array.
    Channel/patch/CC restrictions are applied through per-step logit masks.
    """
    tokenizer = model[2]
    if disable_channels is not None:
        # Translate channel numbers to their token ids once, up front.
        disable_channels = [tokenizer.parameter_ids["channel"][c] for c in disable_channels]
    else:
        disable_channels = []
    if generator is None:
        generator = np.random
    max_token_seq = tokenizer.max_token_seq
    if prompt is None:
        # Start every sequence from a single BOS event row.
        input_tensor = np.full((1, max_token_seq), tokenizer.pad_id, dtype=np.int64)
        input_tensor[0, 0] = tokenizer.bos_id  # bos
        input_tensor = input_tensor[None, :, :]
        input_tensor = np.repeat(input_tensor, repeats=batch_size, axis=0)
    else:
        # Normalize the prompt to shape (batch_size, events, max_token_seq).
        if len(prompt.shape) == 2:
            prompt = prompt[None, :]
            prompt = np.repeat(prompt, repeats=batch_size, axis=0)
        elif prompt.shape[0] == 1:
            prompt = np.repeat(prompt, repeats=batch_size, axis=0)
        elif len(prompt.shape) != 3 or prompt.shape[0] != batch_size:
            raise ValueError(f"invalid shape for prompt, {prompt.shape}")
        prompt = prompt[..., :max_token_seq]
        if prompt.shape[-1] < max_token_seq:
            prompt = np.pad(prompt, ((0, 0), (0, 0), (0, max_token_seq - prompt.shape[-1])),
                            mode="constant", constant_values=tokenizer.pad_id)
        input_tensor = prompt
    cur_len = input_tensor.shape[1]
    bar = tqdm.tqdm(desc="generating", total=max_len - cur_len)
    model0_inputs = {}
    model0_outputs = {}
    emb_size = 1024  # fallback; overwritten from the model's "hidden" output below
    for output in model[0].get_outputs():
        if output.name == "hidden":
            emb_size = output.shape[2]
    past_len = 0
    with bar:
        while cur_len < max_len:
            end = [False] * batch_size
            # Run the base model on the yet-unseen event rows only (KV cache
            # holds everything before past_len).
            model0_inputs["x"] = rt.OrtValue.ortvalue_from_numpy(input_tensor[:, past_len:], device_type=device)
            model0_outputs["hidden"] = rt.OrtValue.ortvalue_from_shape_and_type(
                (batch_size, cur_len - past_len, emb_size),
                element_type=np.float32,
                device_type=device)
            io_binding = apply_io_binding(model[0], model0_inputs, model0_outputs, batch_size, past_len, cur_len)
            io_binding.synchronize_inputs()
            model[0].run_with_iobinding(io_binding)
            io_binding.synchronize_outputs()

            # Decode one event row token-by-token with the token-head model.
            hidden = model0_outputs["hidden"].numpy()[:, -1:]
            next_token_seq = np.zeros((batch_size, 0), dtype=np.int64)
            event_names = [""] * batch_size
            model1_inputs = {"hidden": rt.OrtValue.ortvalue_from_numpy(hidden, device_type=device)}
            model1_outputs = {}
            for i in range(max_token_seq):
                # Build the per-sample legality mask for token position i.
                mask = np.zeros((batch_size, tokenizer.vocab_size), dtype=np.int64)
                for b in range(batch_size):
                    if end[b]:
                        mask[b, tokenizer.pad_id] = 1
                        continue
                    if i == 0:
                        # Position 0 chooses the event type (or EOS).
                        mask_ids = list(tokenizer.event_ids.values()) + [tokenizer.eos_id]
                        if disable_patch_change:
                            mask_ids.remove(tokenizer.event_ids["patch_change"])
                        if disable_control_change:
                            mask_ids.remove(tokenizer.event_ids["control_change"])
                        mask[b, mask_ids] = 1
                    else:
                        # Later positions choose that event's parameters.
                        param_names = tokenizer.events[event_names[b]]
                        if i > len(param_names):
                            mask[b, tokenizer.pad_id] = 1
                            continue
                        param_name = param_names[i - 1]
                        mask_ids = tokenizer.parameter_ids[param_name]
                        if param_name == "channel":
                            mask_ids = [i for i in mask_ids if i not in disable_channels]
                        mask[b, mask_ids] = 1
                mask = mask[:, None, :]
                x = next_token_seq
                if i != 0:
                    # cached: after the first token, feed only the newest token
                    # (and an empty hidden) — the token head's KV cache has the rest.
                    if i == 1:
                        hidden = np.zeros((batch_size, 0, emb_size), dtype=np.float32)
                        model1_inputs["hidden"] = rt.OrtValue.ortvalue_from_numpy(hidden, device_type=device)
                    x = x[:, -1:]
                model1_inputs["x"] = rt.OrtValue.ortvalue_from_numpy(x, device_type=device)
                model1_outputs["y"] = rt.OrtValue.ortvalue_from_shape_and_type(
                    (batch_size, 1, tokenizer.vocab_size),
                    element_type=np.float32,
                    device_type=device
                )
                io_binding = apply_io_binding(model[1], model1_inputs, model1_outputs, batch_size, i, i + 1)
                io_binding.synchronize_inputs()
                model[1].run_with_iobinding(io_binding)
                io_binding.synchronize_outputs()
                logits = model1_outputs["y"].numpy()
                # Mask illegal tokens, then nucleus/top-k sample.
                scores = softmax(logits / temp, -1) * mask
                samples = sample_top_p_k(scores, top_p, top_k, generator)
                if i == 0:
                    next_token_seq = samples
                    for b in range(batch_size):
                        if end[b]:
                            continue
                        eid = samples[b].item()
                        if eid == tokenizer.eos_id:
                            end[b] = True
                        else:
                            event_names[b] = tokenizer.id_events[eid]
                else:
                    next_token_seq = np.concatenate([next_token_seq, samples], axis=1)
                    # Stop early once every live sample has all its parameters.
                    if all([len(tokenizer.events[event_names[b]]) == i for b in range(batch_size) if not end[b]]):
                        break
            # Pad the event row to the fixed width and append it.
            if next_token_seq.shape[1] < max_token_seq:
                next_token_seq = np.pad(next_token_seq,
                                        ((0, 0), (0, max_token_seq - next_token_seq.shape[-1])),
                                        mode="constant", constant_values=tokenizer.pad_id)
            next_token_seq = next_token_seq[:, None, :]
            input_tensor = np.concatenate([input_tensor, next_token_seq], axis=1)
            past_len = cur_len
            cur_len += 1
            bar.update(1)
            yield next_token_seq[:, 0]
            if all(end):
                break
206
+
207
+
208
def create_msg(name, data):
    """Build one message dict for the front-end visualizer protocol."""
    return dict(name=name, data=data)
210
+
211
+
212
def send_msgs(msgs):
    """Serialize a message list to the JSON string the js_msg textbox expects."""
    return json.dumps(msgs)
214
+
215
+
216
def get_duration(model_name, tab, mid_seq, continuation_state, continuation_select, instruments, drum_kit, bpm,
                 time_sig, key_sig, mid, midi_events, reduce_cc_st, remap_track_channel, add_default_instr,
                 remove_empty_channels, seed, seed_rand, gen_events, temp, top_p, top_k, allow_cc):
    """Estimate the GPU-seconds budget for one `run` call (used by @spaces.GPU).

    Assumes ~30 events/s for medium models and ~23 events/s for large ones,
    plus a 5-second setup margin.  Only `model_name` and `gen_events` matter;
    the other parameters mirror `run`'s signature as spaces requires.
    """
    events_per_second = 23 if "large" in model_name else 30
    return gen_events // events_per_second + 5
223
+
224
+
225
@spaces.GPU(duration=get_duration)
def run(model_name, tab, mid_seq, continuation_state, continuation_select, instruments, drum_kit, bpm, time_sig,
        key_sig, mid, midi_events, reduce_cc_st, remap_track_channel, add_default_instr, remove_empty_channels,
        seed, seed_rand, gen_events, temp, top_p, top_k, allow_cc):
    """Gradio handler: build a prompt from the active tab and stream generation.

    `tab` selects the prompt source: 0 = custom (instruments/bpm/signatures),
    1 = uploaded MIDI file, 2 = continue from the last output.  Yields
    ``(mid_seq, continuation_state, seed, js_msg_json)`` tuples so the UI can
    update the visualizers while tokens stream in.
    """
    # Fresh ONNX sessions per call; model paths come from the global registry.
    model = models[model_name]
    model_base = rt.InferenceSession(model[0], providers=providers)
    model_token = rt.InferenceSession(model[1], providers=providers)
    tokenizer = model[2]
    model = [model_base, model_token, tokenizer]
    bpm = int(bpm)
    # Translate the UI time signature into the tokenizer's (nn, dd) encoding;
    # dd is the log2 of the denominator per the SMF convention.
    if time_sig == "auto":
        time_sig = None
        time_sig_nn = 4
        time_sig_dd = 2
    else:
        time_sig_nn, time_sig_dd = time_sig.split('/')
        time_sig_nn = int(time_sig_nn)
        time_sig_dd = {2: 1, 4: 2, 8: 3}[int(time_sig_dd)]
    # key_sig arrives as a radio *index*: 0 = "auto"; otherwise map the
    # circle-of-fifths position to (sharps/flats count, major/minor flag).
    if key_sig == 0:
        key_sig = None
        key_sig_sf = 0
        key_sig_mi = 0
    else:
        key_sig = (key_sig - 1)
        key_sig_sf = key_sig // 2 - 7
        key_sig_mi = key_sig % 2
    gen_events = int(gen_events)
    max_len = gen_events
    if seed_rand:
        seed = random.randint(0, MAX_SEED)
    generator = np.random.RandomState(seed)
    disable_patch_change = False
    disable_channels = None
    if tab == 0:
        # Custom prompt: BOS + optional meta events + patch assignments.
        i = 0
        mid = [[tokenizer.bos_id] + [tokenizer.pad_id] * (tokenizer.max_token_seq - 1)]
        if tokenizer.version == "v2":
            if time_sig is not None:
                mid.append(tokenizer.event2tokens(["time_signature", 0, 0, 0, time_sig_nn - 1, time_sig_dd - 1]))
            if key_sig is not None:
                mid.append(tokenizer.event2tokens(["key_signature", 0, 0, 0, key_sig_sf + 7, key_sig_mi]))
        if bpm != 0:
            mid.append(tokenizer.event2tokens(["set_tempo", 0, 0, 0, bpm]))
        patches = {}
        if instruments is None:
            instruments = []
        for instr in instruments:
            patches[i] = patch2number[instr]
            # Skip channel 9, which is reserved for drums in General MIDI.
            i = (i + 1) if i != 8 else 10
        if drum_kit != "None":
            patches[9] = drum_kits2number[drum_kit]
        for i, (c, p) in enumerate(patches.items()):
            mid.append(tokenizer.event2tokens(["patch_change", 0, 0, i + 1, c, p]))
        mid = np.asarray([mid] * OUTPUT_BATCH_SIZE, dtype=np.int64)
        mid_seq = mid.tolist()
        if len(instruments) > 0:
            # Pin the instrumentation: no new patches, no unused channels.
            disable_patch_change = True
            disable_channels = [i for i in range(16) if i not in patches]
    elif tab == 1 and mid is not None:
        # MIDI-file prompt: tokenize the upload and keep the first n events.
        eps = 4 if reduce_cc_st else 0
        mid = tokenizer.tokenize(MIDI.midi2score(mid), cc_eps=eps, tempo_eps=eps,
                                 remap_track_channel=remap_track_channel,
                                 add_default_instr=add_default_instr,
                                 remove_empty_channels=remove_empty_channels)
        mid = mid[:int(midi_events)]
        mid = np.asarray([mid] * OUTPUT_BATCH_SIZE, dtype=np.int64)
        mid_seq = mid.tolist()
    elif tab == 2 and mid_seq is not None:
        # Continuation: either branch from one chosen output (snapshot the
        # whole mid_seq for undo) or extend all outputs (record the length).
        mid = np.asarray(mid_seq, dtype=np.int64)
        if continuation_select > 0:
            continuation_state.append(mid_seq)
            mid = np.repeat(mid[continuation_select - 1:continuation_select], repeats=OUTPUT_BATCH_SIZE, axis=0)
            mid_seq = mid.tolist()
        else:
            continuation_state.append(mid.shape[1])
    else:
        # Fallback: empty BOS-only prompt with reset continuation history.
        continuation_state = [0]
        mid = [[tokenizer.bos_id] + [tokenizer.pad_id] * (tokenizer.max_token_seq - 1)]
        mid = np.asarray([mid] * OUTPUT_BATCH_SIZE, dtype=np.int64)
        mid_seq = mid.tolist()

    if mid is not None:
        max_len += mid.shape[1]

    init_msgs = [create_msg("progress", [0, gen_events])]
    # When extending all outputs in place the visualizers already show the
    # prompt, so skip the redraw.
    if not (tab == 2 and continuation_select == 0):
        for i in range(OUTPUT_BATCH_SIZE):
            events = [tokenizer.tokens2event(tokens) for tokens in mid_seq[i]]
            init_msgs += [create_msg("visualizer_clear", [i, tokenizer.version]),
                          create_msg("visualizer_append", [i, events])]
    yield mid_seq, continuation_state, seed, send_msgs(init_msgs)
    midi_generator = generate(model, mid, batch_size=OUTPUT_BATCH_SIZE, max_len=max_len, temp=temp,
                              top_p=top_p, top_k=top_k, disable_patch_change=disable_patch_change,
                              disable_control_change=not allow_cc, disable_channels=disable_channels,
                              generator=generator)
    events = [list() for i in range(OUTPUT_BATCH_SIZE)]
    t = time.time() + 1
    for i, token_seqs in enumerate(midi_generator):
        token_seqs = token_seqs.tolist()
        for j in range(OUTPUT_BATCH_SIZE):
            token_seq = token_seqs[j]
            mid_seq[j].append(token_seq)
            events[j].append(tokenizer.tokens2event(token_seq))
        # Throttle UI updates to roughly twice per second.
        if time.time() - t > 0.5:
            msgs = [create_msg("progress", [i + 1, gen_events])]
            for j in range(OUTPUT_BATCH_SIZE):
                msgs += [create_msg("visualizer_append", [j, events[j]])]
                events[j] = list()
            yield mid_seq, continuation_state, seed, send_msgs(msgs)
            t = time.time()
    yield mid_seq, continuation_state, seed, send_msgs([])
336
+
337
+
338
def finish_run(model_name, mid_seq):
    """Write each generated sequence to ``outputs/outputN.mid`` and finalize
    the visualizers.

    Returns one file path (or None) per output slot, followed by the JSON
    message string for the js_msg textbox.
    """
    if mid_seq is None:
        outputs = [None] * OUTPUT_BATCH_SIZE
        # Fix: js_msg expects a JSON string (send_msgs), not a raw list —
        # every other return path in the app serializes with send_msgs.
        return *outputs, send_msgs([])
    tokenizer = models[model_name][2]
    outputs = []
    end_msgs = [create_msg("progress", [0, 0])]
    if not os.path.exists("outputs"):
        os.mkdir("outputs")
    for i in range(OUTPUT_BATCH_SIZE):
        events = [tokenizer.tokens2event(tokens) for tokens in mid_seq[i]]
        mid = tokenizer.detokenize(mid_seq[i])
        with open(f"outputs/output{i + 1}.mid", 'wb') as f:
            f.write(MIDI.score2midi(mid))
        outputs.append(f"outputs/output{i + 1}.mid")
        end_msgs += [create_msg("visualizer_clear", [i, tokenizer.version]),
                     create_msg("visualizer_append", [i, events]),
                     create_msg("visualizer_end", i)]
    return *outputs, send_msgs(end_msgs)
357
+
358
+
359
def synthesis_task(mid):
    """Render one tokenizer score to audio with the shared synthesizer
    (runs on the module-level thread pool)."""
    return synthesizer.synthesis(MIDI.score2opus(mid))
361
+
362
def render_audio(model_name, mid_seq, should_render_audio):
    """Synthesize audio for every generated sequence in parallel.

    Returns one ``(sample_rate, waveform)`` tuple per output slot (a single
    tuple when OUTPUT_BATCH_SIZE == 1), or None per slot when rendering is
    disabled or there is nothing to render.
    """
    if (not should_render_audio) or mid_seq is None:
        outputs = [None] * OUTPUT_BATCH_SIZE
        return tuple(outputs)
    tokenizer = models[model_name][2]
    outputs = []
    if not os.path.exists("outputs"):
        os.mkdir("outputs")
    audio_futures = []
    for i in range(OUTPUT_BATCH_SIZE):
        mid = tokenizer.detokenize(mid_seq[i])
        # Fan synthesis out to the thread pool; results collected in order.
        audio_future = thread_pool.submit(synthesis_task, mid)
        audio_futures.append(audio_future)
    for future in audio_futures:
        outputs.append((44100, future.result()))  # 44.1 kHz sample rate
    if OUTPUT_BATCH_SIZE == 1:
        return outputs[0]
    return tuple(outputs)
380
+
381
+
382
def undo_continuation(model_name, mid_seq, continuation_state):
    """Revert the last continuation step and redraw the visualizers.

    Entries in `continuation_state` are either a full saved `mid_seq`
    snapshot (list — pushed when branching from one output) or the sequence
    length before the continuation (int — pushed when extending all outputs);
    the last entry determines how the rollback is applied.
    """
    if mid_seq is None or len(continuation_state) < 2:
        # Nothing to undo (state always keeps its initial sentinel entry).
        return mid_seq, continuation_state, send_msgs([])
    tokenizer = models[model_name][2]
    if isinstance(continuation_state[-1], list):
        mid_seq = continuation_state[-1]  # restore the full snapshot
    else:
        # Truncate every sequence back to the recorded length.
        mid_seq = [ms[:continuation_state[-1]] for ms in mid_seq]
    continuation_state = continuation_state[:-1]
    end_msgs = [create_msg("progress", [0, 0])]
    for i in range(OUTPUT_BATCH_SIZE):
        events = [tokenizer.tokens2event(tokens) for tokens in mid_seq[i]]
        end_msgs += [create_msg("visualizer_clear", [i, tokenizer.version]),
                     create_msg("visualizer_append", [i, events]),
                     create_msg("visualizer_end", i)]
    return mid_seq, continuation_state, send_msgs(end_msgs)
398
+
399
+
400
def load_javascript(dir="javascript"):
    """Inline every ``javascript/*.js`` file into gradio's served HTML.

    Each script has its hard-coded JS-side batch size rewritten to match the
    module-level OUTPUT_BATCH_SIZE, then gr.routes.templates.TemplateResponse
    is monkey-patched so every rendered page gets the scripts injected just
    before ``</head>``.
    """
    scripts_list = glob.glob(f"{dir}/*.js")
    javascript = ""
    for path in scripts_list:
        with open(path, "r", encoding="utf8") as jsfile:
            js_content = jsfile.read()
            # keep the front-end's output-slot count in sync with the server
            js_content = js_content.replace("const MIDI_OUTPUT_BATCH_SIZE=4;",
                                            f"const MIDI_OUTPUT_BATCH_SIZE={OUTPUT_BATCH_SIZE};")
            javascript += f"\n<!-- {path} --><script>{js_content}</script>"
    template_response_ori = gr.routes.templates.TemplateResponse

    def template_response(*args, **kwargs):
        # Wrap the original response class: splice the scripts into <head>
        # and recompute Content-Length via init_headers().
        res = template_response_ori(*args, **kwargs)
        res.body = res.body.replace(
            b'</head>', f'{javascript}</head>'.encode("utf8"))
        res.init_headers()
        return res

    gr.routes.templates.TemplateResponse = template_response
419
+
420
+
421
def hf_hub_download_retry(repo_id, filename):
    """Download ``filename`` from Hub repo ``repo_id``, retrying up to 30 times.

    Returns the local file path on success; re-raises the last download
    error when every attempt fails.
    """
    # Fix: the log line previously printed the literal "(unknown)" instead of
    # the file actually being fetched.
    print(f"downloading {filename} from {repo_id}")
    err = None
    for _ in range(30):
        try:
            return hf_hub_download(repo_id=repo_id, filename=filename)
        except Exception as e:
            err = e
    # The loop only falls through when all 30 attempts raised, so err is set.
    raise err
433
+
434
+
435
def get_tokenizer(repo_id):
    """Build the MIDITokenizer described by a Hub repo's config.json.

    Downloads the config (with retry), then instantiates the tokenizer with
    the recorded version and optimise_midi flag.
    """
    # "config.json" had a redundant f-string prefix with no placeholders.
    config_path = hf_hub_download_retry(repo_id=repo_id, filename="config.json")
    with open(config_path, "r") as f:
        config = json.load(f)
    tokenizer = MIDITokenizer(config["tokenizer"]["version"])
    tokenizer.set_optimise_midi(config["tokenizer"]["optimise_midi"])
    return tokenizer
442
+
443
+
444
# GM drum-kit program numbers keyed by display name (-1 = no drum track).
number2drum_kits = {-1: "None", 0: "Standard", 8: "Room", 16: "Power", 24: "Electric", 25: "TR-808", 32: "Jazz",
                    40: "Blush", 48: "Orchestra"}
# Reverse lookups: instrument / drum-kit display name -> GM number.
patch2number = {v: k for k, v in MIDI.Number2patch.items()}
drum_kits2number = {v: k for k, v in number2drum_kits.items()}
# Key signatures ordered flats-to-sharps; list index maps to the tokenizer's
# key_signature event parameters (see the index arithmetic in run()).
key_signatures = ['C♭', 'A♭m', 'G♭', 'E♭m', 'D♭', 'B♭m', 'A♭', 'Fm', 'E♭', 'Cm', 'B♭', 'Gm', 'F', 'Dm',
                  'C', 'Am', 'G', 'Em', 'D', 'Bm', 'A', 'F♯m', 'E', 'C♯m', 'B', 'G♯m', 'F♯', 'D♯m', 'C♯', 'A♯m']
450
+
451
+ if __name__ == "__main__":
452
+ parser = argparse.ArgumentParser()
453
+ parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
454
+ parser.add_argument("--port", type=int, default=7860, help="gradio server port")
455
+ parser.add_argument("--device", type=str, default="cuda", help="device to run model")
456
+ parser.add_argument("--batch", type=int, default=8, help="batch size")
457
+ parser.add_argument("--max-gen", type=int, default=1024, help="max")
458
+ opt = parser.parse_args()
459
+ OUTPUT_BATCH_SIZE = opt.batch
460
+ soundfont_path = hf_hub_download_retry(repo_id="skytnt/midi-model", filename="soundfont.sf2")
461
+ thread_pool = ThreadPoolExecutor(max_workers=OUTPUT_BATCH_SIZE)
462
+ synthesizer = MidiSynthesizer(soundfont_path)
463
+ models_info = {
464
+ "generic pretrain model (tv2o-medium) by skytnt": [
465
+ "skytnt/midi-model-tv2o-medium", "", {
466
+ "jpop": "skytnt/midi-model-tv2om-jpop-lora",
467
+ "touhou": "skytnt/midi-model-tv2om-touhou-lora"
468
+ }
469
+ ],
470
+ "generic pretrain model (tv2o-large) by asigalov61": [
471
+ "asigalov61/Music-Llama", "", {}
472
+ ],
473
+ "generic pretrain model (tv2o-medium) by asigalov61": [
474
+ "asigalov61/Music-Llama-Medium", "", {}
475
+ ],
476
+ "generic pretrain model (tv1-medium) by skytnt": [
477
+ "skytnt/midi-model", "", {}
478
+ ]
479
+ }
480
+ models = {}
481
+ providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
482
+ device = "cuda"
483
+
484
+ for name, (repo_id, path, loras) in models_info.items():
485
+ model_base_path = hf_hub_download_retry(repo_id=repo_id, filename=f"{path}onnx/model_base.onnx")
486
+ model_token_path = hf_hub_download_retry(repo_id=repo_id, filename=f"{path}onnx/model_token.onnx")
487
+ tokenizer = get_tokenizer(repo_id)
488
+ models[name] = [model_base_path, model_token_path, tokenizer]
489
+ for lora_name, lora_repo in loras.items():
490
+ model_base_path = hf_hub_download_retry(repo_id=lora_repo, filename=f"onnx/model_base.onnx")
491
+ model_token_path = hf_hub_download_retry(repo_id=lora_repo, filename=f"onnx/model_token.onnx")
492
+ models[f"{name} with {lora_name} lora"] = [model_base_path, model_token_path, tokenizer]
493
+
494
+ load_javascript()
495
+ app = gr.Blocks(theme=gr.themes.Soft())
496
+ with app:
497
+ gr.Markdown("<h1 style='text-align: center; margin-bottom: 1rem'>Midi Composer</h1>")
498
+ gr.Markdown("![Visitors](https://api.visitorbadge.io/api/visitors?path=skytnt.midi-composer&style=flat)\n\n"
499
+ "Midi event transformer for symbolic music generation\n\n"
500
+ "Demo for [SkyTNT/midi-model](https://github.com/SkyTNT/midi-model)\n\n"
501
+ "[Open In Colab]"
502
+ "(https://colab.research.google.com/github/SkyTNT/midi-model/blob/main/demo.ipynb)"
503
+ " or [download windows app](https://github.com/SkyTNT/midi-model/releases)"
504
+ " for unlimited generation\n\n"
505
+ "**Update v1.3**: MIDITokenizerV2 and new MidiVisualizer\n\n"
506
+ "The current **best** model: generic pretrain model (tv2o-medium) by skytnt"
507
+ )
508
+ js_msg = gr.Textbox(elem_id="msg_receiver", visible=False)
509
+ js_msg.change(None, [js_msg], [], js="""
510
+ (msg_json) =>{
511
+ let msgs = JSON.parse(msg_json);
512
+ executeCallbacks(msgReceiveCallbacks, msgs);
513
+ return [];
514
+ }
515
+ """)
516
+ input_model = gr.Dropdown(label="select model", choices=list(models.keys()),
517
+ type="value", value=list(models.keys())[0])
518
+ tab_select = gr.State(value=0)
519
+ with gr.Tabs():
520
+ with gr.TabItem("custom prompt") as tab1:
521
+ input_instruments = gr.Dropdown(label="🪗instruments (auto if empty)", choices=list(patch2number.keys()),
522
+ multiselect=True, max_choices=15, type="value")
523
+ input_drum_kit = gr.Dropdown(label="🥁drum kit", choices=list(drum_kits2number.keys()), type="value",
524
+ value="None")
525
+ input_bpm = gr.Slider(label="BPM (beats per minute, auto if 0)", minimum=0, maximum=255,
526
+ step=1,
527
+ value=0)
528
+ input_time_sig = gr.Radio(label="time signature (only for tv2 models)",
529
+ value="auto",
530
+ choices=["auto", "4/4", "2/4", "3/4", "6/4", "7/4",
531
+ "2/2", "3/2", "4/2", "3/8", "5/8", "6/8", "7/8", "9/8", "12/8"]
532
+ )
533
+ input_key_sig = gr.Radio(label="key signature (only for tv2 models)",
534
+ value="auto",
535
+ choices=["auto"] + key_signatures,
536
+ type="index"
537
+ )
538
+ example1 = gr.Examples([
539
+ [[], "None"],
540
+ [["Acoustic Grand"], "None"],
541
+ [['Acoustic Grand', 'SynthStrings 2', 'SynthStrings 1', 'Pizzicato Strings',
542
+ 'Pad 2 (warm)', 'Tremolo Strings', 'String Ensemble 1'], "Orchestra"],
543
+ [['Trumpet', 'Oboe', 'Trombone', 'String Ensemble 1', 'Clarinet',
544
+ 'French Horn', 'Pad 4 (choir)', 'Bassoon', 'Flute'], "None"],
545
+ [['Flute', 'French Horn', 'Clarinet', 'String Ensemble 2', 'English Horn', 'Bassoon',
546
+ 'Oboe', 'Pizzicato Strings'], "Orchestra"],
547
+ [['Electric Piano 2', 'Lead 5 (charang)', 'Electric Bass(pick)', 'Lead 2 (sawtooth)',
548
+ 'Pad 1 (new age)', 'Orchestra Hit', 'Cello', 'Electric Guitar(clean)'], "Standard"],
549
+ [["Electric Guitar(clean)", "Electric Guitar(muted)", "Overdriven Guitar", "Distortion Guitar",
550
+ "Electric Bass(finger)"], "Standard"]
551
+ ], [input_instruments, input_drum_kit])
552
+ with gr.TabItem("midi prompt") as tab2:
553
+ input_midi = gr.File(label="input midi", file_types=[".midi", ".mid"], type="binary")
554
+ input_midi_events = gr.Slider(label="use first n midi events as prompt", minimum=1, maximum=512,
555
+ step=1,
556
+ value=128)
557
+ input_reduce_cc_st = gr.Checkbox(label="reduce control_change and set_tempo events", value=True)
558
+ input_remap_track_channel = gr.Checkbox(
559
+ label="remap tracks and channels so each track has only one channel and in order", value=True)
560
+ input_add_default_instr = gr.Checkbox(
561
+ label="add a default instrument to channels that don't have an instrument", value=True)
562
+ input_remove_empty_channels = gr.Checkbox(label="remove channels without notes", value=False)
563
+ example2 = gr.Examples([[file, 128] for file in glob.glob("example/*.mid")],
564
+ [input_midi, input_midi_events])
565
+ with gr.TabItem("last output prompt") as tab3:
566
+ gr.Markdown("Continue generating on the last output.")
567
+ input_continuation_select = gr.Radio(label="select output to continue generating", value="all",
568
+ choices=["all"] + [f"output{i + 1}" for i in
569
+ range(OUTPUT_BATCH_SIZE)],
570
+ type="index"
571
+ )
572
+ undo_btn = gr.Button("undo the last continuation")
573
+
574
+ tab1.select(lambda: 0, None, tab_select, queue=False)
575
+ tab2.select(lambda: 1, None, tab_select, queue=False)
576
+ tab3.select(lambda: 2, None, tab_select, queue=False)
577
+ input_seed = gr.Slider(label="seed", minimum=0, maximum=2 ** 31 - 1,
578
+ step=1, value=0)
579
+ input_seed_rand = gr.Checkbox(label="random seed", value=True)
580
+ input_gen_events = gr.Slider(label="generate max n midi events", minimum=1, maximum=opt.max_gen,
581
+ step=1, value=opt.max_gen // 2)
582
+ with gr.Accordion("options", open=False):
583
+ input_temp = gr.Slider(label="temperature", minimum=0.1, maximum=1.2, step=0.01, value=1)
584
+ input_top_p = gr.Slider(label="top p", minimum=0.1, maximum=1, step=0.01, value=0.95)
585
+ input_top_k = gr.Slider(label="top k", minimum=1, maximum=128, step=1, value=20)
586
+ input_allow_cc = gr.Checkbox(label="allow midi cc event", value=True)
587
+ input_render_audio = gr.Checkbox(label="render audio after generation", value=True)
588
+ example3 = gr.Examples([[1, 0.94, 128], [1, 0.98, 20], [1, 0.98, 12]],
589
+ [input_temp, input_top_p, input_top_k])
590
+ run_btn = gr.Button("generate", variant="primary")
591
+ # stop_btn = gr.Button("stop and output")
592
+ output_midi_seq = gr.State()
593
+ output_continuation_state = gr.State([0])
594
+ midi_outputs = []
595
+ audio_outputs = []
596
+ with gr.Tabs(elem_id="output_tabs"):
597
+ for i in range(OUTPUT_BATCH_SIZE):
598
+ with gr.TabItem(f"output {i + 1}") as tab1:
599
+ output_midi_visualizer = gr.HTML(elem_id=f"midi_visualizer_container_{i}")
600
+ output_audio = gr.Audio(label="output audio", format="mp3", elem_id=f"midi_audio_{i}")
601
+ output_midi = gr.File(label="output midi", file_types=[".mid"])
602
+ midi_outputs.append(output_midi)
603
+ audio_outputs.append(output_audio)
604
+ run_event = run_btn.click(run, [input_model, tab_select, output_midi_seq, output_continuation_state,
605
+ input_continuation_select, input_instruments, input_drum_kit, input_bpm,
606
+ input_time_sig, input_key_sig, input_midi, input_midi_events,
607
+ input_reduce_cc_st, input_remap_track_channel,
608
+ input_add_default_instr, input_remove_empty_channels,
609
+ input_seed, input_seed_rand, input_gen_events, input_temp, input_top_p,
610
+ input_top_k, input_allow_cc],
611
+ [output_midi_seq, output_continuation_state, input_seed, js_msg],
612
+ concurrency_limit=10, queue=True)
613
+ finish_run_event = run_event.then(fn=finish_run,
614
+ inputs=[input_model, output_midi_seq],
615
+ outputs=midi_outputs + [js_msg],
616
+ queue=False)
617
+ finish_run_event.then(fn=render_audio,
618
+ inputs=[input_model, output_midi_seq, input_render_audio],
619
+ outputs=audio_outputs,
620
+ queue=False)
621
+ # stop_btn.click(None, [], [], cancels=run_event,
622
+ # queue=False)
623
+ undo_btn.click(undo_continuation, [input_model, output_midi_seq, output_continuation_state],
624
+ [output_midi_seq, output_continuation_state, js_msg], queue=False)
625
+ app.queue().launch(server_port=opt.port, share=opt.share, inbrowser=True, ssr_mode=False)
626
+ thread_pool.shutdown()
example/Bach--Fugue-in-D-Minor.mid ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1398121eb86a33e73f90ec84be71dac6abc0ddf11372ea7cdd9e01586938a56b
3
+ size 7720
example/Beethoven--Symphony-No5-in-C-Minor-Fate-Opus-67.mid ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28ff6fdcd644e781d36411bf40ab7a1f4849adddbcd1040eaec22751c5ca99d2
3
+ size 87090
example/Chopin--Nocturne No. 9 in B Major, Opus 32 No.1, Andante Sostenuto.mid ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a236e647ad9f5d0af680d3ca19d3b60f334c4bde6b4f86310f63405245c476e
3
+ size 13484
example/Mozart--Requiem, No.1..mid ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa49bf4633401e16777fe47f6f53a494c2166f5101af6dafc60114932a59b9bd
3
+ size 14695
example/castle_in_the_sky.mid ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa14aec6f1be15c4fddd0decc6d9152204f160d4e07e05d8d1dc9f209c309ff7
3
+ size 7957
example/eva-残酷な天使のテーゼ.mid ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e513487543d7e27ec5dc30f027302d2a3b5a3aaf9af554def1e5cd6a7a8d355a
3
+ size 17671
javascript/app.js ADDED
@@ -0,0 +1,732 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
// Number of parallel output slots rendered by the app (one visualizer each).
const MIDI_OUTPUT_BATCH_SIZE=4;
//Do not change MIDI_OUTPUT_BATCH_SIZE. It will be automatically replaced.
// NOTE(review): presumably the Python server rewrites this literal to match its
// OUTPUT_BATCH_SIZE before serving the page — confirm against load_javascript().
3
+
/**
 * querySelector that transparently descends into open shadow roots.
 * @param {string} selector - CSS selector to match.
 * @returns {Element|null} The first matching element, or null if none found.
 */
function deepQuerySelector(selector) {
    /**
     * Depth-first search for `selector` under `root`, recursing into any
     * open shadowRoot encountered along the way.
     * @param {Element|Document|ShadowRoot} root - Search root.
     * @param {string} selector - CSS selector to match.
     * @returns {Element|null} Match, or null.
     */
    function deepSearch(root, selector) {
        // Fast path: a direct match in the current tree.
        const direct = root.querySelector(selector);
        if (direct) {
            return direct;
        }
        // Otherwise walk every descendant and recurse into its shadow DOM.
        const candidates = root.querySelectorAll('*');
        for (const host of candidates) {
            if (!host.shadowRoot) {
                continue;
            }
            const found = deepSearch(host.shadowRoot, selector);
            if (found) {
                return found;
            }
        }
        // Nothing matched anywhere below this root.
        return null;
    }

    return deepSearch(this, selector);
}

// Expose the helper on both elements and documents so call sites can use
// `someNode.deepQuerySelector(...)` uniformly.
Element.prototype.deepQuerySelector = deepQuerySelector;
Document.prototype.deepQuerySelector = deepQuerySelector;
46
+
// Return the root node to query Gradio widgets under: the <gradio-app>
// shadow root when the page uses one, otherwise the plain document.
function gradioApp() {
    const apps = document.getElementsByTagName('gradio-app');
    const shadow = apps.length ? apps[0].shadowRoot : null;
    return shadow || document;
}
52
+
// Registries of listeners fired on Gradio UI mutations and on messages
// pushed from the Python side. Declared with `var` (was an implicit global
// assignment, which throws under strict mode / ES modules); in a classic
// top-level script `var` creates the same global binding.
var uiUpdateCallbacks = []
var msgReceiveCallbacks = []

// Register a callback invoked with the MutationRecord list on every UI update.
function onUiUpdate(callback){
    uiUpdateCallbacks.push(callback)
}

// Register a callback invoked with each message batch received from Python.
function onMsgReceive(callback){
    msgReceiveCallbacks.push(callback)
}

// Invoke one callback, logging (not propagating) any exception so a single
// faulty listener cannot break the others.
function runCallback(x, m){
    try {
        x(m)
    } catch (e) {
        (console.error || console.log).call(console, e.message, e);
    }
}

// Invoke every callback in `queue` with argument `m`.
function executeCallbacks(queue, m) {
    queue.forEach(function(x){runCallback(x, m)})
}
74
+
// Once the page is ready, watch the whole Gradio tree and fan every DOM
// mutation batch out to the registered UI-update callbacks.
document.addEventListener("DOMContentLoaded", () => {
    const observer = new MutationObserver((mutations) => {
        executeCallbacks(uiUpdateCallbacks, mutations);
    });
    observer.observe(gradioApp(), {childList: true, subtree: true});
});
81
+
// Convert an HSV color (all components in [0, 1]) to 8-bit RGB.
// Standard sector-based conversion: the hue circle is split into six
// sectors and (v, p, q, t) are permuted according to the sector index.
function HSVtoRGB(h, s, v) {
    const sector = Math.floor(h * 6);
    const f = h * 6 - sector;
    const p = v * (1 - s);
    const q = v * (1 - f * s);
    const t = v * (1 - (1 - f) * s);
    // Row k of this table is the (r, g, b) permutation for sector k.
    const bySector = [
        [v, t, p],
        [q, v, p],
        [p, v, t],
        [p, q, v],
        [t, p, v],
        [v, p, q]
    ];
    const [r, g, b] = bySector[sector % 6];
    return {
        r: Math.round(r * 255),
        g: Math.round(g * 255),
        b: Math.round(b * 255)
    };
}
103
+
// Heuristic mobile detection via the user-agent string; used to shrink the
// piano roll and hide the track list on small screens.
function isMobile(){
    const mobilePattern = /(iPhone|iPad|iPod|iOS|Android|Windows Phone)/i;
    return mobilePattern.test(navigator.userAgent);
}
107
+
// General MIDI program number (array index 0-127) -> instrument display name.
const number2patch = ['Acoustic Grand', 'Bright Acoustic', 'Electric Grand', 'Honky-Tonk', 'Electric Piano 1', 'Electric Piano 2', 'Harpsichord', 'Clav', 'Celesta', 'Glockenspiel', 'Music Box', 'Vibraphone', 'Marimba', 'Xylophone', 'Tubular Bells', 'Dulcimer', 'Drawbar Organ', 'Percussive Organ', 'Rock Organ', 'Church Organ', 'Reed Organ', 'Accordion', 'Harmonica', 'Tango Accordion', 'Acoustic Guitar(nylon)', 'Acoustic Guitar(steel)', 'Electric Guitar(jazz)', 'Electric Guitar(clean)', 'Electric Guitar(muted)', 'Overdriven Guitar', 'Distortion Guitar', 'Guitar Harmonics', 'Acoustic Bass', 'Electric Bass(finger)', 'Electric Bass(pick)', 'Fretless Bass', 'Slap Bass 1', 'Slap Bass 2', 'Synth Bass 1', 'Synth Bass 2', 'Violin', 'Viola', 'Cello', 'Contrabass', 'Tremolo Strings', 'Pizzicato Strings', 'Orchestral Harp', 'Timpani', 'String Ensemble 1', 'String Ensemble 2', 'SynthStrings 1', 'SynthStrings 2', 'Choir Aahs', 'Voice Oohs', 'Synth Voice', 'Orchestra Hit', 'Trumpet', 'Trombone', 'Tuba', 'Muted Trumpet', 'French Horn', 'Brass Section', 'SynthBrass 1', 'SynthBrass 2', 'Soprano Sax', 'Alto Sax', 'Tenor Sax', 'Baritone Sax', 'Oboe', 'English Horn', 'Bassoon', 'Clarinet', 'Piccolo', 'Flute', 'Recorder', 'Pan Flute', 'Blown Bottle', 'Skakuhachi', 'Whistle', 'Ocarina', 'Lead 1 (square)', 'Lead 2 (sawtooth)', 'Lead 3 (calliope)', 'Lead 4 (chiff)', 'Lead 5 (charang)', 'Lead 6 (voice)', 'Lead 7 (fifths)', 'Lead 8 (bass+lead)', 'Pad 1 (new age)', 'Pad 2 (warm)', 'Pad 3 (polysynth)', 'Pad 4 (choir)', 'Pad 5 (bowed)', 'Pad 6 (metallic)', 'Pad 7 (halo)', 'Pad 8 (sweep)', 'FX 1 (rain)', 'FX 2 (soundtrack)', 'FX 3 (crystal)', 'FX 4 (atmosphere)', 'FX 5 (brightness)', 'FX 6 (goblins)', 'FX 7 (echoes)', 'FX 8 (sci-fi)', 'Sitar', 'Banjo', 'Shamisen', 'Koto', 'Kalimba', 'Bagpipe', 'Fiddle', 'Shanai', 'Tinkle Bell', 'Agogo', 'Steel Drums', 'Woodblock', 'Taiko Drum', 'Melodic Tom', 'Synth Drum', 'Reverse Cymbal', 'Guitar Fret Noise', 'Breath Noise', 'Seashore', 'Bird Tweet', 'Telephone Ring', 'Helicopter', 'Applause', 'Gunshot']
// GM drum-kit program number (on channel 9) -> kit display name.
const number2drum_kits = {0: "Standard", 8: "Room", 16: "Power", 24: "Electric", 25: "TR-808", 32: "Jazz", 40: "Blush", 48: "Orchestra"}
110
+
// Custom element <midi-visualizer>: an SVG piano-roll renderer for tokenized
// MIDI events streamed from the Python server. Tracks are keyed by
// (track, channel), each with a toggleable SVG group; a green time line tracks
// playback position driven by the bound <audio> element / waveform cursor.
// NOTE(review): `timePreBeat` appears to mean "ticks per beat" (presumably a
// typo for timePerBeat) — time values are beats * timePreBeat + offset.
class MidiVisualizer extends HTMLElement{
    constructor() {
        super();
        this.midiEvents = [];      // all appended events, time-normalized in ticks
        this.activeNotes = [];     // note rects currently highlighted during playback
        this.midiTimes = [];       // piecewise (ms, tick, tempo) map built by finishAppendMidiEvent
        this.trackMap = new Map()  // (track*16+channel) -> track record
        this.patches = [];         // per-channel [tick, program] history for instrument labels
        for (let i=0;i<16;i++){
            this.patches.push([[0,0]])
        }
        this.container = null;
        this.trackList = null
        this.pianoRoll = null;
        this.svg = null;
        this.timeLine = null;
        // Pixel geometry of the roll; shrunk below for mobile screens.
        this.config = {
            noteHeight : 4,
            beatWidth: 32
        }
        if (isMobile()){
            this.config.noteHeight = 1;
            this.config.beatWidth = 16;
        }
        this.timePreBeat = 16
        this.svgWidth = 0;
        this.t1 = 0;               // running beat accumulator while appending events
        this.totalTimeMs = 0
        this.playTime = 0          // current playback position in ticks
        this.playTimeMs = 0
        this.lastUpdateTime = 0
        this.colorMap = new Map();
        this.playing = false;
        this.timer = null;
        this.version = "v2"        // tokenizer event layout ("v1" or "v2"), set via messages
        this.init();
    }

    // Build the shadow-DOM UI: track list with All/None toggles on the left,
    // scrollable SVG piano roll with the playback time line on the right.
    init(){
        this.innerHTML=''
        const shadow = this.attachShadow({mode: 'open'});
        const style = document.createElement("style");
        style.textContent = ".note.active {stroke: black;stroke-width: 0.75;stroke-opacity: 0.75;}";
        const container = document.createElement('div');
        container.style.display="flex";
        container.style.height=`${this.config.noteHeight*128 + 25}px`;
        const trackListContainer = document.createElement('div');
        trackListContainer.style.width = "260px";
        trackListContainer.style.minWidth = "260px";
        trackListContainer.style.height = "100%";
        trackListContainer.style.display="flex";
        trackListContainer.style.flexDirection="column";
        const trackList = document.createElement('div');
        trackList.style.width = "100%";
        trackList.style.height = "100%";
        trackList.style.overflowY= "scroll";
        trackList.style.display="flex";
        trackList.style.flexDirection="column";
        trackList.style.flexGrow="1";
        const trackControls = document.createElement('div');
        trackControls.style.display="flex";
        trackControls.style.flexDirection="row";
        trackControls.style.width = "100%";
        trackControls.style.height = "50px";
        trackControls.style.minHeight = "50px";
        const allTrackBtn = document.createElement('button');
        allTrackBtn.textContent = "All";
        allTrackBtn.style.width = "50%";
        allTrackBtn.style.height = "100%";
        allTrackBtn.style.backgroundColor = "rgba(200, 200, 200, 0.3)";
        allTrackBtn.style.color = 'inherit';
        allTrackBtn.style.border = "none";
        allTrackBtn.style.cursor = 'pointer';
        let self = this;
        allTrackBtn.onclick = function (){
            self.trackMap.forEach((track, id) => {
                track.setChecked(true);
            })
        };
        const noneTrackBtn = document.createElement('button');
        noneTrackBtn.textContent = "None";
        noneTrackBtn.style.width = "50%";
        noneTrackBtn.style.height = "100%";
        noneTrackBtn.style.backgroundColor = "rgba(200, 200, 200, 0.3)";
        noneTrackBtn.style.color = 'inherit';
        noneTrackBtn.style.border = "none";
        noneTrackBtn.style.cursor = 'pointer';
        noneTrackBtn.onclick = function (){
            self.trackMap.forEach((track, id) => {
                track.setChecked(false);
            });
        };
        const pianoRoll = document.createElement('div');
        pianoRoll.style.overflowX= "scroll";
        pianoRoll.style.flexGrow="1";
        const svg = document.createElementNS('http://www.w3.org/2000/svg', 'svg');
        svg.style.height = `${this.config.noteHeight*128}px`;
        svg.style.width = `${this.svgWidth}px`;
        const timeLine = document.createElementNS('http://www.w3.org/2000/svg', 'line');
        timeLine.style.stroke = "green"
        timeLine.style.strokeWidth = "2";

        if (isMobile()){
            trackListContainer.style.display = "none";
            timeLine.style.strokeWidth = "1";
        }
        shadow.appendChild(style)
        shadow.appendChild(container);
        container.appendChild(trackListContainer);
        trackListContainer.appendChild(trackList);
        trackListContainer.appendChild(trackControls);
        trackControls.appendChild(allTrackBtn);
        trackControls.appendChild(noneTrackBtn);
        container.appendChild(pianoRoll);
        pianoRoll.appendChild(svg);
        svg.appendChild(timeLine)
        this.container = container;
        this.trackList = trackList;
        this.pianoRoll = pianoRoll;
        this.svg = svg;
        this.timeLine= timeLine;
        // Pre-compute one distinct hue per possible track color slot.
        for(let i = 0; i < 128 ; i++){
            this.colorMap.set(i, HSVtoRGB(i / 128, 1, 1))
        }
        this.setPlayTime(0);
    }

    // Register a new visual track record and its list-item row.
    addTrack(id, tr, cl, name, color){
        const track = {id, tr, cl, name, color, empty: true,
            lastCC: new Map(),
            instrument: cl===9?"Standard Drum":"Acoustic Grand",
            svg: document.createElementNS('http://www.w3.org/2000/svg', 'g'),
            ccPaths: new Map()
        }
        this.svg.appendChild(track.svg)
        const trackItem = this.createTrackItem(track);
        this.trackList.appendChild(trackItem);
        this.trackMap.set(id, track);
        return track;
    }

    // Fetch (or lazily create) the track record for (track, channel).
    getTrack(tr, cl){
        const id = tr * 16 + cl
        let track = this.trackMap.get(id)
        if (!!track){
            return track
        }
        // 53 is coprime with 128, so successive tracks get well-spread hues.
        let color = this.colorMap.get((this.trackMap.size*53)%128)
        return this.addTrack(id, tr, cl, `Track ${tr}, Channel ${cl}`, color)
    }

    // Build the sidebar row for a track: color bar, name/instrument label,
    // and a visibility checkbox; wires setChecked/setEmpty/updateInstrument
    // helpers onto the track record.
    createTrackItem(track) {
        const trackItem = document.createElement('div');
        trackItem.style.display = 'flex';
        trackItem.style.alignItems = 'center';
        trackItem.style.width = '100%';
        trackItem.style.position = 'relative';

        const colorBar = document.createElement('div');
        colorBar.style.width = '5%';
        colorBar.style.height = '100%';
        colorBar.style.position = 'absolute';
        colorBar.style.left = '0';
        colorBar.style.top = '0';
        let color = track.color;
        colorBar.style.backgroundColor = `rgb(${color.r}, ${color.g}, ${color.b})`;
        trackItem.appendChild(colorBar);

        const content = document.createElement('div');
        content.style.paddingLeft = '30px';
        content.style.flexGrow = '1';
        content.style.color = "grey"
        content.innerHTML = `<p>${track.name}<br>${track.instrument}</p>`;
        trackItem.appendChild(content);
        track.updateInstrument = function (instrument){
            track.instrument = instrument;
            content.innerHTML = `<p>${track.name}<br>${track.instrument}</p>`;
        }
        // Grey out tracks that have no notes yet.
        track.setEmpty = function (empty){
            if (empty!==track.empty){
                content.style.color = empty?"grey":"inherit";
            }
        }

        const toggleSwitch = document.createElement('input');
        toggleSwitch.type = 'checkbox';
        toggleSwitch.checked = true;
        toggleSwitch.style.marginLeft = 'auto';
        toggleSwitch.style.marginRight = '10px';
        toggleSwitch.style.width = '20px';
        toggleSwitch.style.height = '20px';
        toggleSwitch.style.cursor = 'pointer';

        toggleSwitch.onchange = function () {
            track.svg.setAttribute('visibility',toggleSwitch.checked? "visible" : "hidden")
        };
        track.setChecked = function (checked){
            toggleSwitch.checked = checked;
            track.svg.setAttribute('visibility',toggleSwitch.checked? "visible" : "hidden")
        }
        trackItem.appendChild(toggleSwitch);
        return trackItem;
    }

    // Reset all state and clear the SVG (the time line is re-attached).
    clearMidiEvents(){
        this.pause()
        this.midiEvents = [];
        this.activeNotes = [];
        this.midiTimes = [];
        this.trackMap = new Map()
        this.patches = [];
        for (let i=0;i<16;i++){
            this.patches.push([[0,0]])
        }
        this.t1 = 0
        this.setPlayTime(0);
        this.totalTimeMs = 0;
        this.playTimeMs = 0
        this.lastUpdateTime = 0
        this.trackList.innerHTML = ''
        this.svgWidth = 0
        this.svg.innerHTML = ''
        this.svg.style.width = `${this.svgWidth}px`;
        this.svg.appendChild(this.timeLine)
    }

    // Ingest one tokenized event [name, beat_delta, tick_offset, ...payload],
    // normalize its time to absolute ticks, and draw notes / patch changes /
    // control changes incrementally as they stream in.
    // NOTE(review): payload field order differs between tokenizer versions —
    // v1 notes are [duration, channel, pitch, velocity], v2 notes are
    // [channel, pitch, velocity, duration].
    appendMidiEvent(midiEvent){
        if(midiEvent instanceof Array && midiEvent.length > 0){

            this.t1 += midiEvent[1]
            let t = this.t1*this.timePreBeat + midiEvent[2]
            midiEvent = [midiEvent[0], t].concat(midiEvent.slice(3))
            if(midiEvent[0] === "note"){
                let track = midiEvent[2]
                let duration = 0
                let channel = 0
                let pitch = 0
                let velocity = 0
                if(this.version === "v1"){
                    duration = midiEvent[3]
                    channel = midiEvent[4]
                    pitch = midiEvent[5]
                    velocity = midiEvent[6]
                }else if (this.version === "v2"){
                    channel = midiEvent[3]
                    pitch = midiEvent[4]
                    velocity = midiEvent[5]
                    duration = midiEvent[6]
                }
                let vis_track = this.getTrack(track, channel);
                vis_track.setEmpty(false);
                let x = (t/this.timePreBeat)*this.config.beatWidth
                let y = (127 - pitch)*this.config.noteHeight
                let w = (duration/this.timePreBeat)*this.config.beatWidth
                let h = this.config.noteHeight
                this.svgWidth = Math.ceil(Math.max(x + w, this.svgWidth))
                let opacity = Math.min(1, velocity/127 + 0.1).toFixed(2)
                let rect = this.drawNote(vis_track, x,y,w,h, opacity)
                midiEvent.push(rect);  // keep the rect so playback can highlight it
                this.setPlayTime(t);
                // Follow the newest note while streaming.
                this.pianoRoll.scrollTo(this.svgWidth - this.pianoRoll.offsetWidth, this.pianoRoll.scrollTop)
            }else if(midiEvent[0] === "patch_change"){
                let track = midiEvent[2];
                let channel = midiEvent[3];
                this.patches[channel].push([t, midiEvent[4]]);
                this.patches[channel].sort((a, b) => a[0] - b[0]);
                this.getTrack(track, channel);
            }else if(midiEvent[0] === "control_change"){
                let track = midiEvent[2];
                let channel = midiEvent[3];
                let controller = midiEvent[4];
                let value = midiEvent[5];
                let vis_track = this.getTrack(track, channel);
                this.drawCC(vis_track, t, controller, value);
                this.setPlayTime(t);
            }
            this.midiEvents.push(midiEvent);
            this.svg.style.width = `${this.svgWidth}px`;
        }

    }

    // Draw one note rectangle into the track's SVG group; returns the rect.
    drawNote(track, x, y, w, h, opacity) {
        if (!track.svg) {
            return null;
        }
        const rect = document.createElementNS('http://www.w3.org/2000/svg', 'rect');
        rect.classList.add('note');
        const color = track.color;
        rect.setAttribute('fill', `rgba(${color.r}, ${color.g}, ${color.b}, ${opacity})`);
        // Round values to the nearest integer to avoid partially filled pixels.
        rect.setAttribute('x', `${Math.round(x)}`);
        rect.setAttribute('y', `${Math.round(y)}`);
        rect.setAttribute('width', `${Math.round(w)}`);
        rect.setAttribute('height', `${Math.round(h)}`);
        track.svg.appendChild(rect);
        return rect
    }

    // Extend (or start) the step-line path for one CC controller on a track.
    // Paths start hidden and become visible only once the value changes.
    drawCC(track, t, controller, value){
        if (!track.svg) {
            return null;
        }
        let path = track.ccPaths.get(controller);
        let x = (t/this.timePreBeat)*this.config.beatWidth
        let y = (127 - value)*this.config.noteHeight
        if (!path){
            path = document.createElementNS('http://www.w3.org/2000/svg', 'path');
            path.setAttribute('visibility',"hidden");
            path.setAttribute('fill', "transparent");
            const color = track.color;
            path.setAttribute('stroke', `rgba(${color.r}, ${color.g}, ${color.b}, 0.6)`);
            path.setAttribute('stroke-width', "1");
            path.setAttribute('d',
                t===0?`M ${x} ${y}`:`M 0 ${127*this.config.noteHeight} H ${x} V ${y}`);
            track.svg.appendChild(path);
            track.ccPaths.set(controller, path);
            track.lastCC.set(controller, value);
            return path;
        }
        let lastVal = track.lastCC.get(controller);
        if(lastVal !== value){
            path.removeAttribute('visibility');
        }
        let d = path.getAttribute("d");
        d += `H ${x} V ${y}`
        path.setAttribute('d', d);
        return path
    }

    // Called once streaming ends: sort events by time, build the tick<->ms
    // tempo map, compute total duration, and close off all CC paths.
    finishAppendMidiEvent(){
        this.pause()
        let midiEvents = this.midiEvents.sort((a, b)=>a[1]-b[1])
        let tempo = (60 / 120) * 10 ** 3  // default 120 BPM, in ms per beat
        let ms = 0
        let lastT = 0
        this.midiTimes.push({ms:ms, t: 0, tempo: tempo})
        midiEvents.forEach((midiEvent)=>{
            let t = midiEvent[1]
            ms += ((t- lastT) / this.timePreBeat) * tempo
            if(midiEvent[0]==="set_tempo"){
                tempo = (60 / midiEvent[3]) * 10 ** 3
                this.midiTimes.push({ms:ms, t: t, tempo: tempo})
            }
            if(midiEvent[0]==="note"){
                // NOTE(review): uses midiEvent[3] as duration, which matches the
                // v1 layout only; for v2 the duration is at index 6 — confirm.
                this.totalTimeMs = Math.max(this.totalTimeMs, ms + (midiEvent[3]/ this.timePreBeat)*tempo)
            }else{
                this.totalTimeMs = Math.max(this.totalTimeMs, ms);
            }
            lastT = t;
        })
        let x = (lastT/this.timePreBeat)*this.config.beatWidth;
        this.trackMap.forEach((track, id)=>{
            track.ccPaths.forEach((path, controller)=>{
                let d = path.getAttribute("d");
                d += `H ${x}`
                path.setAttribute('d', d);
            })
        })
    }

    // Move the time line to tick `t`, keep it centered in view, refresh each
    // track's instrument label from the patch history, and (rate-limited to
    // one pass per 50ms while playing) highlight the notes sounding at `t`.
    setPlayTime(t){
        this.playTime = t
        let x = Math.round((t/this.timePreBeat)*this.config.beatWidth)
        this.timeLine.setAttribute('x1', `${x}`);
        this.timeLine.setAttribute('y1', '0');
        this.timeLine.setAttribute('x2', `${x}`);
        this.timeLine.setAttribute('y2', `${this.config.noteHeight*128}`);

        this.pianoRoll.scrollTo(Math.max(0, x - this.pianoRoll.offsetWidth/2), this.pianoRoll.scrollTop)

        this.trackMap.forEach((track, id)=>{
            let instrument = track.instrument
            let cl = track.cl;
            let patches = this.patches[cl]
            let p = 0
            // Last patch change at or before t wins.
            for (let i = 0; i < patches.length ; i++){
                let tp = patches[i]
                if (t < tp[0])
                    break
                p = tp[1]
            }
            if (cl === 9){
                let drumKit = number2drum_kits[`${p}`];
                if (!!drumKit)
                    instrument = drumKit + " Drum";
            }else{
                instrument = number2patch[p]
            }
            if (instrument !== track.instrument)
                track.updateInstrument(instrument)
        });

        let dt = Date.now() - this.lastUpdateTime; // limit the update rate of ActiveNotes
        if(this.playing && dt > 50){
            let activeNotes = []
            this.removeActiveNotes(this.activeNotes)
            this.midiEvents.forEach((midiEvent)=>{
                if(midiEvent[0] === "note"){
                    let time = midiEvent[1]
                    let duration = this.version==="v1"? midiEvent[3]:midiEvent[6]
                    let note = midiEvent[midiEvent.length - 1]
                    if(time <=this.playTime && time+duration>= this.playTime){
                        activeNotes.push(note)
                    }
                }
            });
            this.addActiveNotes(activeNotes)
            this.lastUpdateTime = Date.now();
        }

    }

    // Convert a playback position in ms to ticks via the tempo map, then
    // delegate to setPlayTime.
    setPlayTimeMs(ms){
        this.playTimeMs = ms
        let playTime = 0
        for(let i =0;i<this.midiTimes.length;i++){
            let midiTime = this.midiTimes[i]
            if(midiTime.ms>=ms){
                break;
            }
            playTime = midiTime.t + (ms-midiTime.ms) * this.timePreBeat / midiTime.tempo
        }
        this.setPlayTime(playTime)
    }

    // Highlight the given note rects and remember them as active.
    addActiveNotes(notes){
        notes.forEach((note)=>{
            this.activeNotes.push(note)
            note.classList.add('active');
        });
    }

    // Un-highlight the given note rects and drop them from the active list.
    removeActiveNotes(notes){
        notes.forEach((note)=>{
            let idx = this.activeNotes.indexOf(note)
            if(idx>-1)
                this.activeNotes.splice(idx, 1);
            note.classList.remove('active');
        });
    }

    play(){
        this.playing = true;
    }

    pause(){
        this.removeActiveNotes(this.activeNotes)
        this.playing = false;
    }


    // Sync play/pause state with an <audio> element and take the authoritative
    // total duration from its metadata.
    bindAudioPlayer(audio){
        this.pause()
        audio.addEventListener("play", (event)=>{
            this.play()
        })
        audio.addEventListener("pause", (event)=>{
            this.pause()
        })
        audio.addEventListener("loadedmetadata", (event)=>{
            //I don't know why the calculated totalTimeMs is different from audio.duration*10**3
            this.totalTimeMs = audio.duration*10**3;
        })
    }

    // Follow the Gradio waveform cursor: its `left: NN%` style encodes the
    // playback progress, which we convert to ms and feed into setPlayTimeMs.
    bindWaveformCursor(cursor){
        let self = this;
        const callback = function(mutationsList, observer) {
            for(let mutation of mutationsList) {
                if (mutation.type === 'attributes' && mutation.attributeName === 'style') {
                    let progress = parseFloat(mutation.target.style.left.slice(0,-1))*0.01;
                    if(!isNaN(progress)){
                        self.setPlayTimeMs(progress*self.totalTimeMs);
                    }
                }
            }
        };
        const observer = new MutationObserver(callback);
        observer.observe(cursor, {
            attributes: true,
            attributeFilter: ['style']
        });
    }
}
595
+ }
596
+
597
+ customElements.define('midi-visualizer', MidiVisualizer);
598
+
599
+ (()=>{
600
+ function midi_visualizer_setup(idx, midi_visualizer){
601
+ let midi_visualizer_container_inited = null
602
+ let midi_audio_audio_inited = null;
603
+ let midi_audio_cursor_inited = null;
604
+ onUiUpdate((m)=>{
605
+ let app = gradioApp()
606
+ let midi_visualizer_container = app.querySelector(`#midi_visualizer_container_${idx}`);
607
+ if(!!midi_visualizer_container && midi_visualizer_container_inited!== midi_visualizer_container){
608
+ midi_visualizer_container.appendChild(midi_visualizer)
609
+ midi_visualizer_container_inited = midi_visualizer_container;
610
+ }
611
+ let midi_audio = app.querySelector(`#midi_audio_${idx}`);
612
+ if (!!midi_audio){
613
+ let midi_audio_cursor = midi_audio.deepQuerySelector(".cursor");
614
+ if(!!midi_audio_cursor && midi_audio_cursor_inited!==midi_audio_cursor){
615
+ midi_visualizer.bindWaveformCursor(midi_audio_cursor)
616
+ midi_audio_cursor_inited = midi_audio_cursor
617
+ }
618
+ let midi_audio_waveform = midi_audio.deepQuerySelector("#waveform");
619
+ if(!!midi_audio_waveform){
620
+ let midi_audio_audio = midi_audio_waveform.deepQuerySelector("audio");
621
+ if(!!midi_audio_audio && midi_audio_audio_inited!==midi_audio_audio){
622
+ midi_visualizer.bindAudioPlayer(midi_audio_audio)
623
+ midi_audio_audio_inited = midi_audio_audio
624
+ }
625
+ }
626
+ }
627
+ });
628
+ }
629
+
630
+ let midi_visualizers = []
631
+ for (let i = 0; i < MIDI_OUTPUT_BATCH_SIZE ; i++){
632
+ let midi_visualizer = document.createElement('midi-visualizer');
633
+ midi_visualizers.push(midi_visualizer);
634
+ midi_visualizer_setup(i, midi_visualizer)
635
+ }
636
+
637
+ let hasProgressBar = false;
638
+ let output_tabs_inited = null;
639
+ onUiUpdate((m)=>{
640
+ let app = gradioApp()
641
+ let output_tabs = app.querySelector("#output_tabs");
642
+ if(!!output_tabs && output_tabs_inited!== output_tabs){
643
+ output_tabs_inited = output_tabs;
644
+ }
645
+ });
646
+
647
+ function createProgressBar(progressbarContainer){
648
+ let parentProgressbar = progressbarContainer.parentNode;
649
+ let divProgress = document.createElement('div');
650
+ divProgress.className='progressDiv';
651
+ let rect = progressbarContainer.getBoundingClientRect();
652
+ divProgress.style.width = rect.width + "px";
653
+ divProgress.style.background = "#b4c0cc";
654
+ divProgress.style.borderRadius = "8px";
655
+ let divInner = document.createElement('div');
656
+ divInner.className='progress';
657
+ divInner.style.color = "white";
658
+ divInner.style.background = "#0060df";
659
+ divInner.style.textAlign = "right";
660
+ divInner.style.fontWeight = "bold";
661
+ divInner.style.borderRadius = "8px";
662
+ divInner.style.height = "20px";
663
+ divInner.style.lineHeight = "20px";
664
+ divInner.style.paddingRight = "8px"
665
+ divInner.style.width = "0%";
666
+ divProgress.appendChild(divInner);
667
+ parentProgressbar.insertBefore(divProgress, progressbarContainer);
668
+ hasProgressBar = true;
669
+ }
670
+
671
+ function removeProgressBar(progressbarContainer){
672
+ let parentProgressbar = progressbarContainer.parentNode;
673
+ let divProgress = parentProgressbar.querySelector(".progressDiv");
674
+ parentProgressbar.removeChild(divProgress);
675
+ hasProgressBar = false;
676
+ }
677
+
678
+ function setProgressBar(progress, total){
679
+ if (!hasProgressBar)
680
+ createProgressBar(output_tabs_inited)
681
+ if (hasProgressBar && total === 0){
682
+ removeProgressBar(output_tabs_inited)
683
+ return
684
+ }
685
+ let parentProgressbar = output_tabs_inited.parentNode;
686
+ // let divProgress = parentProgressbar.querySelector(".progressDiv");
687
+ let divInner = parentProgressbar.querySelector(".progress");
688
+ if(total===0)
689
+ total = 1;
690
+ divInner.style.width = `${(progress/total)*100}%`;
691
+ divInner.textContent = `${progress}/${total}`;
692
+ }
693
+
694
+ onMsgReceive((msgs)=>{
695
+ for(let msg of msgs){
696
+ if(msg instanceof Array){
697
+ msg.forEach((o)=>{handleMsg(o)});
698
+ }else{
699
+ handleMsg(msg);
700
+ }
701
+ }
702
+ })
703
+ function handleMsg(msg){
704
+ let idx;
705
+ switch (msg.name) {
706
+ case "visualizer_clear":
707
+ idx = msg.data[0];
708
+ let ver = msg.data[1];
709
+ midi_visualizers[idx].clearMidiEvents(false);
710
+ midi_visualizers[idx].version = ver;
711
+ break;
712
+ case "visualizer_append":
713
+ idx = msg.data[0];
714
+ let events = msg.data[1];
715
+ events.forEach( value => {
716
+ midi_visualizers[idx].appendMidiEvent(value);
717
+ })
718
+ break;
719
+ case "visualizer_end":
720
+ idx = msg.data;
721
+ midi_visualizers[idx].finishAppendMidiEvent()
722
+ midi_visualizers[idx].setPlayTime(0);
723
+ break;
724
+ case "progress":
725
+ let progress = msg.data[0]
726
+ let total = msg.data[1]
727
+ setProgressBar(progress, total)
728
+ break;
729
+ default:
730
+ }
731
+ }
732
+ })();
midi_model.py ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from typing import Union, Dict, Any
3
+
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+ import tqdm
9
+ from peft import PeftConfig, LoraModel, load_peft_weights, set_peft_model_state_dict
10
+ from transformers import LlamaModel, LlamaConfig, DynamicCache, PretrainedConfig, PreTrainedModel
11
+
12
+ from midi_tokenizer import MIDITokenizerV1, MIDITokenizerV2, MIDITokenizer
13
+
14
+ config_name_list = ["tv1-medium", "tv2-medium", "tv2o-medium", "tv2-large", "tv2o-large"]
15
+
16
+
17
class MIDIModelConfig(PretrainedConfig):
    """HF config bundling the MIDI tokenizer with the two LLaMA sub-network
    configs: ``net_config`` for the event-sequence model and
    ``net_token_config`` for the smaller per-event token model.
    """
    model_type = "midi_model"

    def __init__(self,
                 tokenizer: Union[MIDITokenizerV1, MIDITokenizerV2, Dict]=None,
                 net_config: Union[LlamaConfig, Dict]=None,
                 net_token_config: Union[LlamaConfig, Dict]=None,
                 **kwargs):
        """Each argument may be a live object or its dict form (as produced by
        ``to_dict`` when reloading via ``from_pretrained``); dicts are rebuilt
        into objects, missing values get defaults.
        """
        super().__init__(**kwargs)
        if tokenizer:
            if isinstance(tokenizer, dict):
                # Rebuild the tokenizer from its serialized form.
                self.tokenizer = MIDITokenizer(tokenizer["version"])
                self.tokenizer.set_optimise_midi(tokenizer["optimise_midi"])
            else:
                self.tokenizer = tokenizer
        else:
            self.tokenizer = MIDITokenizer()
        if net_config:
            if isinstance(net_config, dict):
                self.net_config = LlamaConfig(**net_config)
            else:
                self.net_config = net_config
        else:
            self.net_config = LlamaConfig()
        if net_token_config:
            if isinstance(net_token_config, dict):
                self.net_token_config = LlamaConfig(**net_token_config)
            else:
                self.net_token_config = net_token_config
        else:
            self.net_token_config = LlamaConfig()
        # Embedding width shared by the token net and the model's lm_head.
        self.n_embd = self.net_token_config.hidden_size

    def to_dict(self) -> Dict[str, Any]:
        """Serialize; the live tokenizer object is replaced by its dict form."""
        d = super().to_dict()
        d["tokenizer"] = self.tokenizer.to_dict()
        return d

    def __str__(self):
        # NOTE(review): only the two net configs are rendered here; the
        # tokenizer is intentionally omitted from the printable summary.
        d = {
            "net": self.net_config.to_json_string(use_diff=False),
            "net_token": self.net_token_config.to_json_string(use_diff=False)
        }
        return json.dumps(d, indent=4)

    @staticmethod
    def get_config(tokenizer_ver="v2", optimise_midi=True, n_layer=12, n_head=16, n_embd=1024, n_inner=4096):
        """Build a config pair: a full-size event net and a token net scaled
        down to a quarter of its depth/heads/FFN width.
        """
        tokenizer = MIDITokenizer(tokenizer_ver)
        tokenizer.set_optimise_midi(optimise_midi)
        net_config = LlamaConfig(vocab_size=tokenizer.vocab_size,
                                 hidden_size=n_embd, num_attention_heads=n_head,
                                 num_hidden_layers=n_layer, intermediate_size=n_inner,
                                 pad_token_id=tokenizer.pad_id, max_position_embeddings=4096,
                                 use_cache=False)
        net_token_config = LlamaConfig(vocab_size=tokenizer.vocab_size,
                                       hidden_size=n_embd, num_attention_heads=n_head // 4,
                                       num_hidden_layers=n_layer // 4, intermediate_size=n_inner // 4,
                                       pad_token_id=tokenizer.pad_id, max_position_embeddings=4096,
                                       use_cache=False)
        return MIDIModelConfig(tokenizer, net_config, net_token_config)

    @staticmethod
    def from_name(name="tv2o-medium"):
        """Parse a preset name like ``tv2o-medium``: ``t<version>[o]-<size>``,
        where the trailing ``o`` enables MIDI optimisation.

        :raises ValueError: on unknown tokenizer version or model size.
        """
        tv, size = name.split("-")
        tv = tv[1:]  # drop the leading "t"
        if tv[-1] == "o":
            o = True
            tv = tv[:-1]
        else:
            o = False
        if tv not in ["v1", "v2"]:
            raise ValueError(f"Unknown tokenizer version {tv}")
        if size == "medium":
            return MIDIModelConfig.get_config(tokenizer_ver=tv, optimise_midi=o,
                                              n_layer=12, n_head=16, n_embd=1024, n_inner=4096)
        elif size == "large":
            return MIDIModelConfig.get_config(tokenizer_ver=tv, optimise_midi=o,
                                              n_layer=24, n_head=16, n_embd=1024, n_inner=4096)
        else:
            raise ValueError(f"Unknown model size {size}")
97
+
98
+
99
class MIDIModel(PreTrainedModel):
    """Two-stage MIDI language model.

    ``net`` runs over the event sequence (each event is the sum of its token
    embeddings); ``net_token`` then autoregressively expands one event hidden
    state into the event's parameter tokens; ``lm_head`` projects to the
    tokenizer vocabulary.
    """
    config_class = MIDIModelConfig

    def __init__(self, config: MIDIModelConfig, *args, **kwargs):
        super(MIDIModel, self).__init__(config, *args, **kwargs)
        self.tokenizer = config.tokenizer
        self.net = LlamaModel(config.net_config)
        self.net_token = LlamaModel(config.net_token_config)
        # Shared head for both stages; bias-free projection to vocab logits.
        self.lm_head = nn.Linear(config.n_embd, self.tokenizer.vocab_size, bias=False)

    def load_merge_lora(self, model_id):
        """Load a LoRA adapter from ``model_id`` and merge it into the base
        weights, returning the merged (adapter-free) model."""
        peft_config = PeftConfig.from_pretrained(model_id)
        model = LoraModel(self, peft_config, adapter_name="default")
        adapter_state_dict = load_peft_weights(model_id, device=str(self.device))
        set_peft_model_state_dict(self, adapter_state_dict, "default")
        return model.merge_and_unload()

    def forward_token(self, hidden_state=None, x=None, cache=None):
        """Run the token-level net over an event hidden state and/or already
        sampled parameter tokens.

        :param hidden_state: (batch_size, n_embd) event state from `forward`,
            or None on cached steps (the cache already holds it)
        :param x: (batch_size, token_sequence_length) sampled tokens, or None
        :param cache: Cache used during incremental decoding
        :return: logits, (batch_size, 1 + token_sequence_length, vocab_size)
        """
        if hidden_state is not None:
            #if you use cache, you don't need to pass in hidden_state
            hidden_state = hidden_state.unsqueeze(1)  # (batch_size, 1, n_embd)
        if x is not None:
            x = self.net_token.embed_tokens(x)
            if hidden_state is not None:
                # Prepend the event state before the token embeddings.
                x = torch.cat([hidden_state, x], dim=1)
            hidden_state = x
        hidden_state = self.net_token.forward(inputs_embeds=hidden_state,
                                              past_key_values=cache,
                                              use_cache=cache is not None).last_hidden_state
        return self.lm_head(hidden_state)

    def forward(self, x, cache = None):
        """
        :param x: (batch_size, midi_sequence_length, token_sequence_length)
        :param cache: Cache
        :return: hidden (batch_size, midi_sequence_length, n_embd)
        """

        # merge token sequence: one embedding per event = sum of its token
        # embeddings along the token axis.
        x = self.net.embed_tokens(x)
        x = x.sum(dim=-2)
        x = self.net.forward(inputs_embeds=x,
                             past_key_values=cache,
                             use_cache=cache is not None)
        return x.last_hidden_state

    def sample_top_p_k(self, probs, p, k, generator=None):
        """Nucleus (top-p) + top-k sampling over the last dimension of
        ``probs``; returns the sampled indices with that dimension removed."""
        probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
        probs_sum = torch.cumsum(probs_sort, dim=-1)
        # Drop everything outside the smallest prefix with cumulative mass p.
        mask = probs_sum - probs_sort > p
        probs_sort[mask] = 0.0
        # Keep only the k highest-probability entries.
        mask = torch.zeros(probs_sort.shape[-1], device=probs_sort.device)
        mask[:k] = 1
        probs_sort = probs_sort * mask
        probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
        shape = probs_sort.shape
        next_token = torch.multinomial(probs_sort.reshape(-1, shape[-1]),
                                       num_samples=1, generator=generator).reshape(*shape[:-1], 1)
        # Map positions in the sorted order back to vocabulary ids.
        next_token = torch.gather(probs_idx, -1, next_token).reshape(*shape[:-1])
        return next_token

    @torch.inference_mode()
    def generate(self, prompt=None, batch_size=1, max_len=512, temp=1.0, top_p=0.98, top_k=20, generator=None):
        """Autoregressively generate a batch of token-level MIDI sequences.

        :param prompt: optional numpy prompt of shape (seq, tokens) or
            (batch, seq, tokens); broadcast to ``batch_size`` when needed
        :param batch_size: number of sequences generated in parallel
        :param max_len: maximum event-sequence length (including the prompt)
        :param temp: softmax temperature
        :param top_p: nucleus sampling threshold
        :param top_k: top-k sampling cutoff
        :param generator: optional torch.Generator for reproducibility
        :return: numpy array (batch_size, seq_len, max_token_seq)
        """
        tokenizer = self.tokenizer
        max_token_seq = tokenizer.max_token_seq
        if prompt is None:
            # Start every sequence from a lone BOS event.
            input_tensor = torch.full((1, max_token_seq), tokenizer.pad_id, dtype=torch.long, device=self.device)
            input_tensor[0, 0] = tokenizer.bos_id  # bos
            input_tensor = input_tensor.unsqueeze(0)
            input_tensor = torch.cat([input_tensor] * batch_size, dim=0)
        else:
            if len(prompt.shape) == 2:
                prompt = prompt[None, :]
                prompt = np.repeat(prompt, repeats=batch_size, axis=0)
            elif prompt.shape[0] == 1:
                prompt = np.repeat(prompt, repeats=batch_size, axis=0)
            elif len(prompt.shape) != 3 or prompt.shape[0] != batch_size:
                raise ValueError(f"invalid shape for prompt, {prompt.shape}")
            # Clip/pad the token axis to exactly max_token_seq.
            prompt = prompt[..., :max_token_seq]
            if prompt.shape[-1] < max_token_seq:
                prompt = np.pad(prompt, ((0, 0), (0, 0), (0, max_token_seq - prompt.shape[-1])),
                                mode="constant", constant_values=tokenizer.pad_id)
            input_tensor = torch.from_numpy(prompt).to(dtype=torch.long, device=self.device)

        cur_len = input_tensor.shape[1]
        bar = tqdm.tqdm(desc="generating", total=max_len - cur_len)
        cache1 = DynamicCache()  # KV cache for the event-level net
        past_len = 0
        with bar:
            while cur_len < max_len:
                end = [False] * batch_size
                # Only feed events not yet in cache1; keep the last state.
                hidden = self.forward(input_tensor[:, past_len:], cache=cache1)[:, -1]
                next_token_seq = None
                event_names = [""] * batch_size
                cache2 = DynamicCache()  # fresh token-level cache per event
                for i in range(max_token_seq):
                    # Constrain sampling to tokens that are grammatical at
                    # position i of the current event.
                    mask = torch.zeros((batch_size, tokenizer.vocab_size), dtype=torch.int64, device=self.device)
                    for b in range(batch_size):
                        if end[b]:
                            # Finished sequences may only emit padding.
                            mask[b, tokenizer.pad_id] = 1
                            continue
                        if i == 0:
                            # First token: an event name or EOS.
                            mask[b, list(tokenizer.event_ids.values()) + [tokenizer.eos_id]] = 1
                        else:
                            param_names = tokenizer.events[event_names[b]]
                            if i > len(param_names):
                                mask[b, tokenizer.pad_id] = 1
                                continue
                            # Token i must be the (i-1)-th parameter's value.
                            mask[b, tokenizer.parameter_ids[param_names[i - 1]]] = 1
                    mask = mask.unsqueeze(1)
                    x = next_token_seq
                    if i != 0:
                        # cached: cache2 holds the prefix, feed only the
                        # newest token and drop the hidden state input.
                        hidden = None
                        x = x[:, -1:]
                    logits = self.forward_token(hidden, x, cache=cache2)[:, -1:]
                    scores = torch.softmax(logits / temp, dim=-1) * mask
                    samples = self.sample_top_p_k(scores, top_p, top_k, generator=generator)
                    if i == 0:
                        next_token_seq = samples
                        for b in range(batch_size):
                            if end[b]:
                                continue
                            eid = samples[b].item()
                            if eid == tokenizer.eos_id:
                                end[b] = True
                            else:
                                event_names[b] = tokenizer.id_events[eid]
                    else:
                        next_token_seq = torch.cat([next_token_seq, samples], dim=1)
                        # Stop early once every live sequence has emitted all
                        # parameters of its event.
                        if all([len(tokenizer.events[event_names[b]]) == i for b in range(batch_size) if not end[b]]):
                            break

                if next_token_seq.shape[1] < max_token_seq:
                    next_token_seq = F.pad(next_token_seq, (0, max_token_seq - next_token_seq.shape[1]),
                                           "constant", value=tokenizer.pad_id)
                next_token_seq = next_token_seq.unsqueeze(1)
                input_tensor = torch.cat([input_tensor, next_token_seq], dim=1)
                past_len = cur_len
                cur_len += 1
                bar.update(1)

                if all(end):
                    break
        return input_tensor.cpu().numpy()
midi_synthesizer.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from threading import Lock
2
+
3
+ import fluidsynth
4
+ import numpy as np
5
+
6
+
7
class MidiSynthesizer:
    """Render a MIDI "opus" (tracks of delta-time events) to int16 stereo PCM
    using FluidSynth.

    A pool of ``fluidsynth.Synth`` devices is kept so several threads can
    synthesize concurrently; each pool entry is ``[synth, soundfont_id, in_use]``.
    """

    def __init__(self, soundfont_path, sample_rate=44100):
        """
        :param soundfont_path: path to the SoundFont (.sf2) file
        :param sample_rate: output sample rate in Hz
        """
        self.soundfont_path = soundfont_path
        self.sample_rate = sample_rate
        fl = fluidsynth.Synth(samplerate=float(sample_rate))
        sfid = fl.sfload(soundfont_path)
        # Pool entries: [synth, soundfont_id, in_use].
        self.devices = [[fl, sfid, False]]
        self.file_lock = Lock()

    def get_fluidsynth(self):
        """Claim an idle device from the pool, creating one if all are busy.

        The whole scan-and-claim runs under the lock: previously the in_use
        flag was checked and set outside the lock, so two threads could claim
        the same device concurrently.
        """
        with self.file_lock:
            for device in self.devices:
                if not device[2]:
                    device[2] = True
                    return device
            fl = fluidsynth.Synth(samplerate=float(self.sample_rate))
            sfid = fl.sfload(self.soundfont_path)
            device = [fl, sfid, True]
            self.devices.append(device)
            return device

    def release_fluidsynth(self, device):
        """Reset a device and mark it idle again."""
        device[0].system_reset()
        # Drain 5 seconds of audio so release/reverb tails cannot bleed into
        # the next synthesis that reuses this device.
        device[0].get_samples(self.sample_rate * 5)  # wait for silence
        device[2] = False

    def synthesis(self, midi_opus):
        """Render ``midi_opus`` to audio.

        :param midi_opus: ``[ticks_per_beat, track1, track2, ...]`` where each
            track is a list of events with delta-time ticks at index 1
        :return: (num_samples, 2) int16 numpy array, peak-normalised
        """
        ticks_per_beat = midi_opus[0]
        # Flatten all tracks into one list with absolute tick times.
        event_list = []
        for track_idx, track in enumerate(midi_opus[1:]):
            abs_t = 0
            for event in track:
                abs_t += event[1]
                event_new = [*event]
                event_new[1] = abs_t
                event_list.append(event_new)
        event_list = sorted(event_list, key=lambda e: e[1])

        tempo = int((60 / 120) * 10 ** 6)  # default 120 bpm, in us per beat
        ss = np.empty((0, 2), dtype=np.int16)
        device = self.get_fluidsynth()
        fl, sfid = device[:-1]
        last_t = 0
        for c in range(16):
            # Bank 128 is the percussion bank; channel 9 is the GM drum channel.
            fl.program_select(c, sfid, 128 if c == 9 else 0, 0)
        for event in event_list:
            name = event[0]
            # Number of samples elapsed since the previous event at the
            # current tempo.
            sample_len = int(((event[1] / ticks_per_beat) * tempo / (10 ** 6)) * self.sample_rate)
            sample_len -= int(((last_t / ticks_per_beat) * tempo / (10 ** 6)) * self.sample_rate)
            last_t = event[1]
            if sample_len > 0:
                sample = fl.get_samples(sample_len).reshape(sample_len, 2)
                ss = np.concatenate([ss, sample])
            if name == "set_tempo":
                tempo = event[2]
            elif name == "patch_change":
                c, p = event[2:4]
                fl.program_select(c, sfid, 128 if c == 9 else 0, p)
            elif name == "control_change":
                c, cc, v = event[2:5]
                fl.cc(c, cc, v)
            elif name == "note_on" and event[4] > 0:
                # note_on layout: [name, t, channel, pitch, velocity].
                # Fixed: the velocity is event[4]; the original tested
                # event[3] (the pitch), so pitch-0 note_ons were dropped and
                # velocity-0 note_ons were played.
                c, p, v = event[2:5]
                fl.noteon(c, p, v)
            elif name == "note_off" or (name == "note_on" and event[4] == 0):
                # A note_on with velocity 0 is a note_off by MIDI convention.
                c, p = event[2:4]
                fl.noteoff(c, p)

        self.release_fluidsynth(device)
        if ss.shape[0] > 0:
            # Peak-normalise to full int16 range.
            max_val = np.abs(ss).max()
            if max_val != 0:
                ss = (ss / max_val) * np.iinfo(np.int16).max
        ss = ss.astype(np.int16)
        return ss
midi_tokenizer.py ADDED
@@ -0,0 +1,1196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ from typing import Dict, Any
3
+
4
+ import PIL.Image
5
+ import numpy as np
6
+
7
+
8
class MIDITokenizerV1:
    """Version-1 MIDI tokenizer.

    Converts between the score representation ``[ticks_per_beat, track, ...]``
    and fixed-width token rows: each event becomes
    ``[event_id, param_token, ..., pad, ...]`` of length ``max_token_seq``.
    Time is quantized to 16 steps per beat and split into a coarse delta
    ("time1", per-beat) and a fine offset ("time2", within-beat).
    """

    def __init__(self):
        self.version = "v1"
        self.optimise_midi = False
        self.vocab_size = 0

        def allocate_ids(size):
            # Reserve `size` consecutive ids at the end of the vocabulary.
            ids = [self.vocab_size + i for i in range(size)]
            self.vocab_size += size
            return ids

        self.pad_id = allocate_ids(1)[0]
        self.bos_id = allocate_ids(1)[0]
        self.eos_id = allocate_ids(1)[0]
        # Event name -> ordered list of its parameter names.
        self.events = {
            "note": ["time1", "time2", "track", "duration", "channel", "pitch", "velocity"],
            "patch_change": ["time1", "time2", "track", "channel", "patch"],
            "control_change": ["time1", "time2", "track", "channel", "controller", "value"],
            "set_tempo": ["time1", "time2", "track", "bpm"],
        }
        # Parameter name -> number of distinct values (= id-range size).
        self.event_parameters = {
            "time1": 128, "time2": 16, "duration": 2048, "track": 128, "channel": 16, "pitch": 128, "velocity": 128,
            "patch": 128, "controller": 128, "value": 128, "bpm": 256
        }
        self.event_ids = {e: allocate_ids(1)[0] for e in self.events.keys()}
        self.id_events = {i: e for e, i in self.event_ids.items()}
        self.parameter_ids = {p: allocate_ids(s) for p, s in self.event_parameters.items()}
        # Longest event (name token + its parameters) fixes the row width.
        self.max_token_seq = max([len(ps) for ps in self.events.values()]) + 1

    def to_dict(self) -> Dict[str, Any]:
        """Serializable description of this tokenizer (used by the model config)."""
        d = {
            "version":self.version,
            "optimise_midi":self.optimise_midi,
            "vocab_size": self.vocab_size,
            "events": self.events,
            "event_parameters": self.event_parameters,
            "max_token_seq": self.max_token_seq,
            "pad_id": self.pad_id,
            "bos_id": self.bos_id,
            "eos_id": self.eos_id,
        }
        return d

    def set_optimise_midi(self, optimise_midi=True):
        # Default for tokenize()'s remap/add-instrument/remove-empty options.
        self.optimise_midi = optimise_midi

    @staticmethod
    def tempo2bpm(tempo):
        tempo = tempo / 10 ** 6  # us to s
        bpm = 60 / tempo
        return bpm

    @staticmethod
    def bpm2tempo(bpm):
        if bpm == 0:
            bpm = 1
        tempo = int((60 / bpm) * 10 ** 6)
        return tempo

    def tokenize(self, midi_score, add_bos_eos=True, cc_eps=4, tempo_eps=4,
                 remap_track_channel=None, add_default_instr=None, remove_empty_channels=None):
        """Tokenize a score into a list of fixed-width token rows.

        :param midi_score: ``[ticks_per_beat, track, ...]`` score data
        :param add_bos_eos: wrap the result in BOS/EOS rows
        :param cc_eps: drop control changes closer than this to the last value
        :param tempo_eps: drop tempo changes closer than this (in bpm)
        :param remap_track_channel: compact track/channel numbering
            (defaults to ``self.optimise_midi``)
        :param add_default_instr: add patch 0 on channels with no patch_change
        :param remove_empty_channels: drop cc/patch events on note-less channels
        """
        if remap_track_channel is None:  # set default value
            remap_track_channel = self.optimise_midi
        if add_default_instr is None:
            add_default_instr = self.optimise_midi
        if remove_empty_channels is None:
            remove_empty_channels = self.optimise_midi

        ticks_per_beat = midi_score[0]
        event_list = {}  # key -> event; the key dedupes identical events
        track_idx_map = {i: dict() for i in range(16)}  # channel -> {old track -> new track}
        track_idx_dict = {}  # channel -> first track that plays notes on it
        channels = []
        patch_channels = []
        empty_channels = [True] * 16  # True until a note is seen on the channel
        channel_note_tracks = {i: list() for i in range(16)}  # channel -> tracks with notes
        for track_idx, track in enumerate(midi_score[1:129]):
            last_notes = {}
            patch_dict = {}
            control_dict = {}
            last_tempo = 0
            for event in track:
                if event[0] not in self.events:
                    continue
                c = -1
                t = round(16 * event[1] / ticks_per_beat)  # quantization
                # Split absolute step t into (time1, time2) and insert track.
                new_event = [event[0], t // 16, t % 16, track_idx] + event[2:]
                if event[0] == "note":
                    c = event[3]
                    if c > 15 or c < 0:
                        continue
                    empty_channels[c] = False
                    track_idx_dict.setdefault(c, track_idx)
                    note_tracks = channel_note_tracks[c]
                    if track_idx not in note_tracks:
                        note_tracks.append(track_idx)
                    # Quantize duration, at least one step.
                    new_event[4] = max(1, round(16 * new_event[4] / ticks_per_beat))
                elif event[0] == "set_tempo":
                    if new_event[4] == 0:  # invalid tempo
                        continue
                    bpm = int(self.tempo2bpm(new_event[4]))
                    new_event[4] = min(bpm, 255)
                # Dedup key: notes ignore velocity (last component), others
                # ignore only their last parameter.
                if event[0] == "note":
                    key = tuple(new_event[:4] + new_event[5:-1])
                else:
                    key = tuple(new_event[:-1])
                if event[0] == "patch_change":
                    c, p = event[2:]
                    if c > 15 or c < 0:
                        continue
                    last_p = patch_dict.setdefault(c, None)
                    if last_p == p:  # drop no-op patch changes
                        continue
                    patch_dict[c] = p
                    if c not in patch_channels:
                        patch_channels.append(c)
                elif event[0] == "control_change":
                    c, cc, v = event[2:]
                    if c > 15 or c < 0:
                        continue
                    last_v = control_dict.setdefault((c, cc), 0)
                    if abs(last_v - v) < cc_eps:  # drop tiny cc jitters
                        continue
                    control_dict[(c, cc)] = v
                elif event[0] == "set_tempo":
                    tempo = new_event[-1]
                    if abs(last_tempo - tempo) < tempo_eps:
                        continue
                    last_tempo = tempo

                if c != -1:
                    if c not in channels:
                        channels.append(c)
                    tr_map = track_idx_map[c]
                    if track_idx not in tr_map:
                        tr_map[track_idx] = 0

                if event[0] == "note":  # to eliminate note overlap due to quantization
                    cp = tuple(new_event[5:7])
                    if cp in last_notes:
                        last_note_key, last_note = last_notes[cp]
                        last_t = last_note[1] * 16 + last_note[2]
                        # Shorten the previous same-channel/pitch note so it
                        # ends no later than this one starts.
                        last_note[4] = max(0, min(last_note[4], t - last_t))
                        if last_note[4] == 0:
                            event_list.pop(last_note_key)
                    last_notes[cp] = (key, new_event)
                event_list[key] = new_event
        event_list = list(event_list.values())

        # Keep only channels that appeared yet never played a note.
        empty_channels = [c for c in channels if empty_channels[c]]

        if remap_track_channel:
            patch_channels = []
            channels_count = 0
            # Channel 9 (drums) keeps its number; others are renumbered
            # compactly, skipping 9.
            channels_map = {9: 9} if 9 in channels else {}
            if remove_empty_channels:
                # Sort empty channels to the end so they get the highest numbers.
                channels = sorted(channels, key=lambda x: 1 if x in empty_channels else 0)
            for c in channels:
                if c == 9:
                    continue
                channels_map[c] = channels_count
                channels_count += 1
                if channels_count == 9:
                    channels_count = 10
            channels = list(channels_map.values())

            track_count = 0
            # Renumber tracks in the order of the remapped channels.
            track_idx_map_order = [k for k, v in sorted(list(channels_map.items()), key=lambda x: x[1])]
            for c in track_idx_map_order:  # tracks not to remove
                if remove_empty_channels and c in empty_channels:
                    continue
                tr_map = track_idx_map[c]
                for track_idx in tr_map:
                    note_tracks = channel_note_tracks[c]
                    if len(note_tracks) != 0 and track_idx not in note_tracks:
                        continue
                    track_count += 1
                    tr_map[track_idx] = track_count
            for c in track_idx_map_order:  # tracks to remove
                if not (remove_empty_channels and c in empty_channels):
                    continue
                tr_map = track_idx_map[c]
                for track_idx in tr_map:
                    note_tracks = channel_note_tracks[c]
                    if not (len(note_tracks) != 0 and track_idx not in note_tracks):
                        continue
                    track_count += 1
                    tr_map[track_idx] = track_count

            empty_channels = [channels_map[c] for c in empty_channels]
            track_idx_dict = {}
            for event in event_list:
                name = event[0]
                track_idx = event[3]
                if name == "note":
                    c = event[5]
                    event[5] = channels_map[c]
                    event[3] = track_idx_map[c][track_idx]
                    track_idx_dict.setdefault(event[5], event[3])
                    # setdefault, so the track_idx is first of the channel
                elif name == "set_tempo":
                    event[3] = 0  # tempo always lives on track 0
                elif name == "control_change" or name == "patch_change":
                    c = event[4]
                    event[4] = channels_map[c]
                    tr_map = track_idx_map[c]
                    # move the event to first track of the channel if its original track is empty
                    note_tracks = channel_note_tracks[c]
                    if len(note_tracks) != 0 and track_idx not in note_tracks:
                        track_idx = channel_note_tracks[c][0]
                    new_track_idx = tr_map[track_idx]
                    event[3] = new_track_idx
                    if name == "patch_change" and event[4] not in patch_channels:
                        patch_channels.append(event[4])

        if add_default_instr:
            # Give every sounding channel without a patch_change the default
            # instrument (patch 0) at time zero.
            for c in channels:
                if c not in patch_channels and c in track_idx_dict:
                    event_list.append(["patch_change", 0, 0, track_idx_dict[c], c, 0])

        # Stable global ordering: by time, then track, then event kind.
        events_name_order = {"set_tempo": 0, "patch_change": 1, "control_change": 2, "note": 3}
        events_order = lambda e: e[1:4] + [events_name_order[e[0]]]
        event_list = sorted(event_list, key=events_order)

        setup_events = {}
        notes_in_setup = False
        for i, event in enumerate(event_list):  # optimise setup
            # Collapse the leading "setup" run (patches, ccs, tempo, and any
            # simultaneous opening notes) onto time zero, deduped per target.
            new_event = [*event]
            if event[0] != "note":
                new_event[1] = 0
                new_event[2] = 0
            has_next = False
            has_pre = False
            if i < len(event_list) - 1:
                next_event = event_list[i + 1]
                has_next = event[1] + event[2] == next_event[1] + next_event[2]
            if notes_in_setup and i > 0:
                pre_event = event_list[i - 1]
                has_pre = event[1] + event[2] == pre_event[1] + pre_event[2]
            if (event[0] == "note" and not has_next) or (notes_in_setup and not has_pre):
                event_list = sorted(setup_events.values(), key=events_order) + event_list[i:]
                break
            else:
                if event[0] == "note":
                    notes_in_setup = True
                    key = tuple([event[0]] + event[3:-2])
                else:
                    key = tuple([event[0]] + event[3:-1])
                setup_events[key] = new_event

        # Emit token rows; time1 becomes a delta between consecutive events.
        last_t1 = 0
        midi_seq = []
        for event in event_list:
            if remove_empty_channels and event[0] in ["control_change", "patch_change"] and event[4] in empty_channels:
                continue
            cur_t1 = event[1]
            event[1] = event[1] - last_t1
            tokens = self.event2tokens(event)
            if not tokens:
                continue
            midi_seq.append(tokens)
            last_t1 = cur_t1

        if add_bos_eos:
            bos = [self.bos_id] + [self.pad_id] * (self.max_token_seq - 1)
            eos = [self.eos_id] + [self.pad_id] * (self.max_token_seq - 1)
            midi_seq = [bos] + midi_seq + [eos]
        return midi_seq

    def event2tokens(self, event):
        """Encode one event (name + parameter values) into a padded token row;
        returns [] if any parameter is out of range."""
        name = event[0]
        params = event[1:]
        if not all([0 <= params[i] < self.event_parameters[p] for i, p in enumerate(self.events[name])]):
            return []
        tokens = [self.event_ids[name]] + [self.parameter_ids[p][params[i]]
                                           for i, p in enumerate(self.events[name])]
        tokens += [self.pad_id] * (self.max_token_seq - len(tokens))
        return tokens

    def tokens2event(self, tokens):
        """Decode one token row back to [name, param, ...]; returns [] for
        malformed or out-of-range rows."""
        if tokens[0] not in self.id_events:
            return []
        name = self.id_events[tokens[0]]
        if len(tokens) <= len(self.events[name]):
            return []
        params = tokens[1:]
        # Shift each token back to its parameter's zero-based value.
        params = [params[i] - self.parameter_ids[p][0] for i, p in enumerate(self.events[name])]
        if not all([0 <= params[i] < self.event_parameters[p] for i, p in enumerate(self.events[name])]):
            return []
        event = [name] + params
        return event

    def detokenize(self, midi_seq):
        """Convert a token sequence back into score form
        ``[ticks_per_beat, track, ...]`` at a fixed 480 ticks per beat."""
        ticks_per_beat = 480
        tracks_dict = {}
        t1 = 0  # running absolute coarse time (time1 deltas accumulated)
        for tokens in midi_seq:
            if tokens[0] in self.id_events:
                event = self.tokens2event(tokens)
                if not event:
                    continue
                name = event[0]
                if name == "set_tempo":
                    event[4] = self.bpm2tempo(event[4])
                if event[0] == "note":
                    # Duration back from 16th-steps to ticks.
                    event[4] = int(event[4] * ticks_per_beat / 16)
                t1 += event[1]
                t = t1 * 16 + event[2]
                t = int(t * ticks_per_beat / 16)
                track_idx = event[3]
                if track_idx not in tracks_dict:
                    tracks_dict[track_idx] = []
                tracks_dict[track_idx].append([event[0], t] + event[4:])
        tracks = [tr for idx, tr in sorted(list(tracks_dict.items()), key=lambda it: it[0])]

        for i in range(len(tracks)):  # to eliminate note overlap
            track = tracks[i]
            track = sorted(track, key=lambda e: e[1])
            last_note_t = {}
            zero_len_notes = []
            # Walk backwards so each note can be clipped against the next
            # note with the same (channel, pitch).
            for e in reversed(track):
                if e[0] == "note":
                    t, d, c, p = e[1:5]
                    key = (c, p)
                    if key in last_note_t:
                        d = min(d, max(last_note_t[key] - t, 0))
                    last_note_t[key] = t
                    e[2] = d
                    if d == 0:
                        zero_len_notes.append(e)
            for e in zero_len_notes:
                track.remove(e)
            tracks[i] = track
        return [ticks_per_beat, *tracks]

    def midi2img(self, midi_score):
        """Render a score as a piano-roll PIL image; each (track, channel)
        pair gets a random color, pitch on the vertical axis."""
        ticks_per_beat = midi_score[0]
        notes = []
        max_time = 1
        track_num = len(midi_score[1:])
        for track_idx, track in enumerate(midi_score[1:]):
            for event in track:
                t = round(16 * event[1] / ticks_per_beat)
                if event[0] == "note":
                    d = max(1, round(16 * event[2] / ticks_per_beat))
                    c, p = event[3:5]
                    max_time = max(max_time, t + d + 1)
                    notes.append((track_idx, c, p, t, d))
        img = np.zeros((128, max_time, 3), dtype=np.uint8)
        colors = {(i, j): np.random.randint(50, 256, 3) for i in range(track_num) for j in range(16)}
        for note in notes:
            tr, c, p, t, d = note
            img[p, t: t + d] = colors[(tr, c)]
        # Flip vertically so higher pitches appear at the top.
        img = PIL.Image.fromarray(np.flip(img, 0))
        return img

    def augment(self, midi_seq, max_pitch_shift=4, max_vel_shift=10, max_cc_val_shift=10, max_bpm_shift=10,
                max_track_shift=0, max_channel_shift=16):
        """Randomly transpose/offset a token sequence for data augmentation.

        One shift per attribute is drawn and applied to the whole sequence.
        If a pitch shift would leave the 0..127 range, the ORIGINAL sequence
        is returned unchanged.
        """
        pitch_shift = random.randint(-max_pitch_shift, max_pitch_shift)
        vel_shift = random.randint(-max_vel_shift, max_vel_shift)
        cc_val_shift = random.randint(-max_cc_val_shift, max_cc_val_shift)
        bpm_shift = random.randint(-max_bpm_shift, max_bpm_shift)
        track_shift = random.randint(0, max_track_shift)
        channel_shift = random.randint(0, max_channel_shift)
        midi_seq_new = []
        for tokens in midi_seq:
            tokens_new = [*tokens]
            if tokens[0] in self.id_events:
                name = self.id_events[tokens[0]]
                for i, pn in enumerate(self.events[name]):
                    if pn == "track":
                        tr = tokens[1 + i] - self.parameter_ids[pn][0]
                        tr += track_shift
                        tr = tr % self.event_parameters[pn]
                        tokens_new[1 + i] = self.parameter_ids[pn][tr]
                    elif pn == "channel":
                        c = tokens[1 + i] - self.parameter_ids[pn][0]
                        c0 = c
                        c += channel_shift
                        c = c % self.event_parameters[pn]
                        # Keep the drum channel (9) fixed; a channel that
                        # would land on 9 is moved past it.
                        if c0 == 9:
                            c = 9
                        elif c == 9:
                            c = (9 + channel_shift) % self.event_parameters[pn]
                        tokens_new[1 + i] = self.parameter_ids[pn][c]

                if name == "note":
                    c = tokens[5] - self.parameter_ids["channel"][0]
                    p = tokens[6] - self.parameter_ids["pitch"][0]
                    v = tokens[7] - self.parameter_ids["velocity"][0]
                    if c != 9:  # no shift for drums
                        p += pitch_shift
                    if not 0 <= p < 128:
                        return midi_seq
                    v += vel_shift
                    v = max(1, min(127, v))
                    tokens_new[6] = self.parameter_ids["pitch"][p]
                    tokens_new[7] = self.parameter_ids["velocity"][v]
                elif name == "control_change":
                    cc = tokens[5] - self.parameter_ids["controller"][0]
                    val = tokens[6] - self.parameter_ids["value"][0]
                    # Only shift continuous controllers (mod wheel, breath,
                    # volume, expression).
                    if cc in [1, 2, 7, 11]:
                        val += cc_val_shift
                        val = max(1, min(127, val))
                        tokens_new[6] = self.parameter_ids["value"][val]
                elif name == "set_tempo":
                    bpm = tokens[4] - self.parameter_ids["bpm"][0]
                    bpm += bpm_shift
                    bpm = max(1, min(255, bpm))
                    tokens_new[4] = self.parameter_ids["bpm"][bpm]
            midi_seq_new.append(tokens_new)
        return midi_seq_new

    def check_quality(self, midi_seq, alignment_min=0.3, tonality_min=0.8, piano_max=0.7, notes_bandwidth_min=3,
                      notes_density_max=50, notes_density_min=2.5, total_notes_max=20000, total_notes_min=256,
                      note_window_size=16):
        """Heuristic dataset-quality filter.

        :return: (passed, reasons) where ``reasons`` lists every failed check:
            total_min/total_max/drum_only, then alignment, tonality,
            bandwidth, density, piano.
        """
        total_notes = 0
        channels = []
        time_hist = [0] * 16  # histogram of within-beat (time2) positions
        note_windows = {}  # coarse-time window -> pitches (non-drum)
        notes_sametime = []  # (end_time, pitch) of currently sounding notes
        notes_density_list = []
        tonality_list = []
        notes_bandwidth_list = []
        instruments = {}
        piano_channels = []
        abs_t1 = 0
        last_t = 0
        for tsi, tokens in enumerate(midi_seq):
            event = self.tokens2event(tokens)
            if not event:
                continue
            t1, t2, tr = event[1:4]
            abs_t1 += t1
            t = abs_t1 * 16 + t2
            c = None
            if event[0] == "note":
                d, c, p, v = event[4:]
                total_notes += 1
                time_hist[t2] += 1
                if c != 9:  # ignore drum channel
                    if c not in instruments:
                        # No patch seen yet: default instrument is piano (0).
                        instruments[c] = 0
                        if c not in piano_channels:
                            piano_channels.append(c)
                    note_windows.setdefault(abs_t1 // note_window_size, []).append(p)
                if last_t != t:
                    # Drop notes that ended before this time step, then
                    # measure the pitch spread of what still sounds.
                    notes_sametime = [(et, p_) for et, p_ in notes_sametime if et > last_t]
                    notes_sametime_p = [p_ for _, p_ in notes_sametime]
                    if len(notes_sametime) > 0:
                        notes_bandwidth_list.append(max(notes_sametime_p) - min(notes_sametime_p))
                notes_sametime.append((t + d - 1, p))
            elif event[0] == "patch_change":
                c, p = event[4:]
                instruments[c] = p
                if p == 0 and c not in piano_channels:
                    piano_channels.append(c)
            if c is not None and c not in channels:
                channels.append(c)
            last_t = t
        reasons = []
        if total_notes < total_notes_min:
            reasons.append("total_min")
        if total_notes > total_notes_max:
            reasons.append("total_max")
        if len(note_windows) == 0 and total_notes > 0:
            reasons.append("drum_only")
        if reasons:
            return False, reasons
        time_hist = sorted(time_hist, reverse=True)
        # Fraction of notes on the two most common within-beat positions.
        alignment = sum(time_hist[:2]) / total_notes
        for notes in note_windows.values():
            key_hist = [0] * 12
            for p in notes:
                key_hist[p % 12] += 1
            key_hist = sorted(key_hist, reverse=True)
            # Fraction of notes in the 7 most common pitch classes (~one key).
            tonality_list.append(sum(key_hist[:7]) / len(notes))
            notes_density_list.append(len(notes) / note_window_size)
        tonality_list = sorted(tonality_list)
        tonality = sum(tonality_list) / len(tonality_list)
        notes_bandwidth = sum(notes_bandwidth_list) / len(notes_bandwidth_list) if notes_bandwidth_list else 0
        notes_density = max(notes_density_list) if notes_density_list else 0
        piano_ratio = len(piano_channels) / len(channels)
        if len(channels) <= 3:  # ignore piano threshold if it is a piano solo midi
            piano_max = 1
        if alignment < alignment_min:  # check whether the notes align to the bars (because some midi files are recorded)
            reasons.append("alignment")
        if tonality < tonality_min:  # check whether the music is tonal
            reasons.append("tonality")
        if notes_bandwidth < notes_bandwidth_min:  # check whether music is melodic line only
            reasons.append("bandwidth")
        if not notes_density_min < notes_density < notes_density_max:
            reasons.append("density")
        if piano_ratio > piano_max:  # check whether most instruments is piano (because some midi files don't have instruments assigned correctly)
            reasons.append("piano")
        return not reasons, reasons
504
+
505
+
506
class MIDITokenizerV2:
    """Tokenizer (vocabulary version "v2") mapping MIDI events to token rows.

    Token-id layout: ids 0-2 are pad/bos/eos, then one id per event type,
    then one contiguous id block per event parameter. A token row is
    [event_id, parameter_ids..., pad...] of length ``max_token_seq``.
    """

    def __init__(self):
        self.version = "v2"
        # When True, tokenize() defaults to remapping tracks/channels, adding
        # default instruments and removing empty channels.
        self.optimise_midi = False
        self.vocab_size = 0

        def allocate_ids(size):
            # Reserve `size` consecutive ids and grow the vocabulary.
            ids = [self.vocab_size + i for i in range(size)]
            self.vocab_size += size
            return ids

        self.pad_id = allocate_ids(1)[0]
        self.bos_id = allocate_ids(1)[0]
        self.eos_id = allocate_ids(1)[0]
        # Parameter order per event type; (time1, time2, track) always lead.
        self.events = {
            "note": ["time1", "time2", "track", "channel", "pitch", "velocity", "duration"],
            "patch_change": ["time1", "time2", "track", "channel", "patch"],
            "control_change": ["time1", "time2", "track", "channel", "controller", "value"],
            "set_tempo": ["time1", "time2", "track", "bpm"],
            "time_signature": ["time1", "time2", "track", "nn", "dd"],
            "key_signature": ["time1", "time2", "track", "sf", "mi"],
        }
        # Value range (id-block size) for each parameter.
        self.event_parameters = {
            "time1": 128, "time2": 16, "duration": 2048, "track": 128, "channel": 16, "pitch": 128, "velocity": 128,
            "patch": 128, "controller": 128, "value": 128, "bpm": 384, "nn": 16, "dd": 4, "sf": 15, "mi": 2
        }
        # NOTE: the allocation order below fixes the vocabulary layout;
        # changing it changes every token id.
        self.event_ids = {e: allocate_ids(1)[0] for e in self.events.keys()}
        self.id_events = {i: e for e, i in self.event_ids.items()}
        self.parameter_ids = {p: allocate_ids(s) for p, s in self.event_parameters.items()}
        # Longest parameter list ("note") plus the leading event-id token.
        self.max_token_seq = max([len(ps) for ps in self.events.values()]) + 1
536
+
537
+ def to_dict(self) -> Dict[str, Any]:
538
+ d = {
539
+ "version":self.version,
540
+ "optimise_midi":self.optimise_midi,
541
+ "vocab_size": self.vocab_size,
542
+ "events": self.events,
543
+ "event_parameters": self.event_parameters,
544
+ "max_token_seq": self.max_token_seq,
545
+ "pad_id": self.pad_id,
546
+ "bos_id": self.bos_id,
547
+ "eos_id": self.eos_id,
548
+ }
549
+ return d
550
+
551
+ def set_optimise_midi(self, optimise_midi=True):
552
+ self.optimise_midi = optimise_midi
553
+
554
+ @staticmethod
555
+ def tempo2bpm(tempo):
556
+ tempo = tempo / 10 ** 6 # us to s
557
+ bpm = 60 / tempo
558
+ return bpm
559
+
560
+ @staticmethod
561
+ def bpm2tempo(bpm):
562
+ if bpm == 0:
563
+ bpm = 1
564
+ tempo = int((60 / bpm) * 10 ** 6)
565
+ return tempo
566
+
567
+ @staticmethod
568
+ def sf2key(sf):
569
+ # sf in key_signature to key.
570
+ # key represents the sequence from C note to B note (12 in total)
571
+ return (sf * 7) % 12
572
+
573
+ @staticmethod
574
+ def key2sf(k, mi):
575
+ # key to sf
576
+ sf = (k * 7) % 12
577
+ if sf > 6 or (mi == 1 and sf >= 5):
578
+ sf -= 12
579
+ return sf
580
+
581
+ @staticmethod
582
+ def detect_key_signature(key_hist, threshold=0.7):
583
+ if len(key_hist) != 12:
584
+ return None
585
+ if sum(key_hist) == 0:
586
+ return None
587
+ p = sum(sorted(key_hist, reverse=True)[:7]) / sum(key_hist)
588
+ if p < threshold:
589
+ return None
590
+ keys = [x[1] for x in sorted(zip(key_hist, range(len(key_hist))), reverse=True, key=lambda x: x[0])[:7]]
591
+ keys = sorted(keys)
592
+ semitones = []
593
+ for i in range(len(keys)):
594
+ dis = keys[i] - keys[i - 1]
595
+ if dis == 1 or dis == -11:
596
+ semitones.append(keys[i])
597
+ if len(semitones) != 2:
598
+ return None
599
+ semitones_dis = semitones[1] - semitones[0]
600
+ if semitones_dis == 5:
601
+ root_key = semitones[0]
602
+ elif semitones_dis == 7:
603
+ root_key = semitones[1]
604
+ else:
605
+ return None
606
+ return root_key
607
+
608
    def tokenize(self, midi_score, add_bos_eos=True, cc_eps=4, tempo_eps=4,
                 remap_track_channel=None, add_default_instr=None, remove_empty_channels=None):
        """Convert a midi_score ([ticks_per_beat, track0, track1, ...]) into token rows.

        Times are quantized to 16 steps per beat and split into
        (time1, time2) = (beat, sixteenth-within-beat); time1 is
        delta-encoded between rows at the very end. Near-duplicate
        control_change / set_tempo events (within cc_eps / tempo_eps) are
        dropped and same-pitch note overlaps caused by quantization are
        clipped. When the optimisation options are enabled (they default to
        ``self.optimise_midi``) tracks and channels are remapped to a compact
        range, a default patch 0 is added to channels without one, and events
        on note-less channels are removed. If the score has no key signature
        (or only the default sf=0 one), one is detected from the pitch
        histogram.

        Returns a list of fixed-length token rows, optionally wrapped in
        BOS/EOS rows.
        """
        if remap_track_channel is None:  # set default value
            remap_track_channel = self.optimise_midi
        if add_default_instr is None:
            add_default_instr = self.optimise_midi
        if remove_empty_channels is None:
            remove_empty_channels = self.optimise_midi

        # --- pass 1: quantize, validate and de-duplicate events per track ---
        ticks_per_beat = midi_score[0]
        event_list = {}
        track_idx_map = {i: dict() for i in range(16)}  # channel -> {orig track -> new track}
        track_idx_dict = {}  # channel -> first track that plays it
        channels = []
        patch_channels = []
        empty_channels = [True] * 16  # True until a note is seen on the channel
        channel_note_tracks = {i: list() for i in range(16)}  # channel -> tracks with notes
        note_key_hist = [0]*12  # pitch-class histogram (non-drum) for key detection
        key_sigs = []
        track_to_channels = {}
        for track_idx, track in enumerate(midi_score[1:129]):
            last_notes = {}
            patch_dict = {}
            control_dict = {}
            last_bpm = 0
            track_channels = []
            track_to_channels.setdefault(track_idx, track_channels)
            for event in track:
                if event[0] not in self.events:
                    continue
                name = event[0]
                c = -1
                t = round(16 * event[1] / ticks_per_beat)  # quantization
                new_event = [name, t // 16, t % 16, track_idx]
                if name == "note":
                    d, c, p, v = event[2:]
                    if not (0 <= c <= 15):
                        continue
                    d = max(1, round(16 * d / ticks_per_beat))
                    new_event += [c, p, v, d]
                    empty_channels[c] = False
                    track_idx_dict.setdefault(c, track_idx)
                    note_tracks = channel_note_tracks[c]
                    if track_idx not in note_tracks:
                        note_tracks.append(track_idx)
                    if c != 9:
                        note_key_hist[p%12] += 1
                    if c not in track_channels:
                        track_channels.append(c)
                elif name == "patch_change":
                    c, p = event[2:]
                    if not (0 <= c <= 15):
                        continue
                    new_event += [c, p]
                    last_p = patch_dict.setdefault(c, None)
                    if last_p == p:  # drop repeats of the same patch
                        continue
                    patch_dict[c] = p
                    if c not in patch_channels:
                        patch_channels.append(c)
                elif name == "control_change":
                    c, cc, v = event[2:]
                    if not (0 <= c <= 15):
                        continue
                    new_event += [c, cc, v]
                    last_v = control_dict.setdefault((c, cc), 0)
                    if abs(last_v - v) < cc_eps:  # drop near-duplicate values
                        continue
                    control_dict[(c, cc)] = v
                elif name == "set_tempo":
                    tempo = event[2]
                    if tempo == 0:  # invalid tempo
                        continue
                    bpm = min(int(self.tempo2bpm(tempo)), 383)
                    new_event += [bpm]
                    if abs(last_bpm - bpm) < tempo_eps:  # drop near-duplicate tempi
                        continue
                    last_bpm = bpm
                elif name == "time_signature":
                    nn, dd = event[2:4]
                    if not (1 <= nn <= 16 and 1 <= dd <= 4):  # invalid
                        continue
                    nn -= 1  # make it start from 0
                    dd -= 1
                    new_event += [nn, dd]
                elif name == "key_signature":
                    sf, mi = event[2:]
                    if not (-7 <= sf <= 7 and 0 <= mi <= 1):  # invalid
                        continue
                    sf += 7  # store as non-negative token value
                    new_event += [sf, mi]
                    key_sigs.append(new_event)

                # Dedup key ignores the trailing value fields so a later event
                # at the same position overwrites the earlier one.
                if name in ["note", "time_signature", "key_signature"]:
                    key = tuple(new_event[:-2])
                else:
                    key = tuple(new_event[:-1])

                if c != -1:
                    if c not in channels:
                        channels.append(c)
                    tr_map = track_idx_map[c]
                    if track_idx not in tr_map:
                        tr_map[track_idx] = 0

                if event[0] == "note":  # to eliminate note overlap due to quantization
                    cp = tuple(new_event[4:6])  # channel pitch
                    if cp in last_notes:
                        last_note_key, last_note = last_notes[cp]
                        last_t = last_note[1] * 16 + last_note[2]
                        last_note[-1] = max(0, min(last_note[-1], t - last_t))  # modify duration
                        if last_note[-1] == 0:
                            event_list.pop(last_note_key)
                    last_notes[cp] = (key, new_event)
                event_list[key] = new_event
        event_list = list(event_list.values())

        # Channels that were declared (patch/cc) but never play a note.
        empty_channels = [c for c in channels if empty_channels[c]]

        # --- optional pass: remap tracks/channels to a compact contiguous range ---
        if remap_track_channel:
            patch_channels = []
            channels_count = 0
            channels_map = {9: 9} if 9 in channels else {}  # drums stay on channel 9
            if remove_empty_channels:
                # Push empty channels to the end so they get the highest ids.
                channels = sorted(channels, key=lambda x: 1 if x in empty_channels else 0)
            for c in channels:
                if c == 9:
                    continue
                channels_map[c] = channels_count
                channels_count += 1
                if channels_count == 9:  # skip the reserved drum channel
                    channels_count = 10
            channels = list(channels_map.values())

            track_count = 0
            track_idx_map_order = [k for k, v in sorted(list(channels_map.items()), key=lambda x: x[1])]
            for c in track_idx_map_order:  # tracks not to remove
                if remove_empty_channels and c in empty_channels:
                    continue
                tr_map = track_idx_map[c]
                for track_idx in tr_map:
                    note_tracks = channel_note_tracks[c]
                    if len(note_tracks) != 0 and track_idx not in note_tracks:
                        continue
                    track_count += 1
                    tr_map[track_idx] = track_count
            for c in track_idx_map_order:  # tracks to remove
                if not (remove_empty_channels and c in empty_channels):
                    continue
                tr_map = track_idx_map[c]
                for track_idx in tr_map:
                    note_tracks = channel_note_tracks[c]
                    if not (len(note_tracks) != 0 and track_idx not in note_tracks):
                        continue
                    track_count += 1
                    tr_map[track_idx] = track_count

            empty_channels = [channels_map[c] for c in empty_channels]
            track_idx_dict = {}
            key_sigs = []
            key_signature_to_add = []
            key_signature_to_remove = []
            for event in event_list:
                name = event[0]
                track_idx = event[3]
                if name == "note":
                    c = event[4]
                    event[4] = channels_map[c]  # channel
                    event[3] = track_idx_map[c][track_idx]  # track
                    track_idx_dict.setdefault(event[4], event[3])
                    # setdefault, so the track_idx is first of the channel
                elif name in ["set_tempo", "time_signature"]:
                    event[3] = 0  # set track 0 for meta events
                elif name == "key_signature":
                    # Duplicate the key signature onto every remapped track
                    # that inherited events from its original track.
                    new_channel_track_idxs = []
                    for c, tr_map in track_idx_map.items():
                        if track_idx in tr_map:
                            new_track_idx = tr_map[track_idx]
                            c = channels_map[c]
                            new_channel_track_idx = (c, new_track_idx)
                            if new_track_idx == 0:
                                continue
                            if new_channel_track_idx not in new_channel_track_idxs:
                                new_channel_track_idxs.append(new_channel_track_idx)

                    if len(new_channel_track_idxs) == 0:
                        if event[3] == 0:  # keep key_signature on track 0 (meta)
                            key_sigs.append(event)
                            continue
                        event[3] = -1  # avoid remove same event
                        key_signature_to_remove.append(event)  # empty track
                        continue
                    c, nt = new_channel_track_idxs[0]
                    event[3] = nt
                    key_sigs.append(event)
                    if c == 9:
                        event[4] = 7  # sf=0
                    for c, nt in new_channel_track_idxs[1:]:
                        new_event = [*event]
                        new_event[3] = nt
                        if c == 9:
                            new_event[4] = 7  # sf=0
                        key_sigs.append(new_event)
                        key_signature_to_add.append(new_event)
                elif name == "control_change" or name == "patch_change":
                    c = event[4]
                    event[4] = channels_map[c]  # channel
                    tr_map = track_idx_map[c]
                    # move the event to first track of the channel if its original track is empty
                    note_tracks = channel_note_tracks[c]
                    if len(note_tracks) != 0 and track_idx not in note_tracks:
                        track_idx = channel_note_tracks[c][0]
                    new_track_idx = tr_map[track_idx]
                    event[3] = new_track_idx
                    if name == "patch_change" and event[4] not in patch_channels:
                        patch_channels.append(event[4])
            for key_sig in key_signature_to_remove:
                event_list.remove(key_sig)
            event_list += key_signature_to_add
            # Rebuild the track -> channels index in the remapped numbering.
            track_to_channels ={}
            for c, tr_map in track_idx_map.items():
                if c not in channels_map:
                    continue
                c = channels_map[c]
                for _, track_idx in tr_map.items():
                    track_to_channels.setdefault(track_idx, [])
                    cs = track_to_channels[track_idx]
                    if c not in cs:
                        cs.append(c)

        # Give every sounding channel an explicit instrument (patch 0).
        if add_default_instr:
            for c in channels:
                if c not in patch_channels and c in track_idx_dict:
                    event_list.append(["patch_change", 0, 0, track_idx_dict[c], c, 0])

        if len(key_sigs) == 0 or all([key_sig[4]==7 for key_sig in key_sigs]):
            # detect key signature or fix the default key signature
            root_key = self.detect_key_signature(note_key_hist)
            if root_key is not None:
                sf = self.key2sf(root_key, 0)
                # print("detect_key_signature",sf)
                if len(key_sigs) == 0:
                    for tr, cs in track_to_channels.items():
                        if remap_track_channel and tr == 0:
                            continue
                        # Drum-only tracks keep sf=0; others get the detected key.
                        new_event = ["key_signature", 0, 0, tr, (0 if (len(cs) == 1 and cs[0] == 9) else sf) + 7, 0]
                        event_list.append(new_event)
                else:
                    for key_sig in key_sigs:
                        tr = key_sig[3]
                        if tr in track_to_channels:
                            cs = track_to_channels[tr]
                            if len(cs) == 1 and cs[0] == 9:
                                continue
                        key_sig[4] = sf + 7
                        key_sig[5] = 0
            else:
                # remove default key signature
                for key_sig in key_sigs:
                    event_list.remove(key_sig)

        # Stable ordering: by time, then track, then meta events before notes.
        events_name_order = ["time_signature", "key_signature", "set_tempo", "patch_change", "control_change", "note"]
        events_name_order = {name: i for i, name in enumerate(events_name_order)}
        events_order = lambda e: e[1:4] + [events_name_order[e[0]]]
        event_list = sorted(event_list, key=events_order)

        # Collapse the leading setup region (everything before the first real
        # note onset gap) to time 0, keeping only the last value per key.
        setup_events = {}
        notes_in_setup = False
        for i, event in enumerate(event_list):  # optimise setup
            new_event = [*event]  # make copy of event
            if event[0] not in ["note", "time_signature"]:
                new_event[1] = 0
                new_event[2] = 0
            has_next = False
            has_pre = False
            if i < len(event_list) - 1:
                next_event = event_list[i + 1]
                has_next = event[1] + event[2] == next_event[1] + next_event[2]
            if notes_in_setup and i > 0:
                pre_event = event_list[i - 1]
                has_pre = event[1] + event[2] == pre_event[1] + pre_event[2]
            if (event[0] == "note" and not has_next) or (notes_in_setup and not has_pre):
                event_list = sorted(setup_events.values(), key=events_order) + event_list[i:]
                break
            else:
                if event[0] == "note":
                    notes_in_setup = True
                if event[0] in ["note", "time_signature", "key_signature"]:
                    key = tuple([event[0]]+event[3:-2])
                else:
                    key = tuple([event[0]]+event[3:-1])
                setup_events[key] = new_event

        # Delta-encode time1 and emit the token rows.
        last_t1 = 0
        midi_seq = []
        for event in event_list:
            if remove_empty_channels and event[0] in ["control_change", "patch_change"] and event[4] in empty_channels:
                continue
            cur_t1 = event[1]
            event[1] = event[1] - last_t1
            tokens = self.event2tokens(event)
            if not tokens:
                continue
            midi_seq.append(tokens)
            last_t1 = cur_t1

        if add_bos_eos:
            bos = [self.bos_id] + [self.pad_id] * (self.max_token_seq - 1)
            eos = [self.eos_id] + [self.pad_id] * (self.max_token_seq - 1)
            midi_seq = [bos] + midi_seq + [eos]
        return midi_seq
919
+
920
+ def event2tokens(self, event):
921
+ name = event[0]
922
+ params = event[1:]
923
+ if not all([0 <= params[i] < self.event_parameters[p] for i, p in enumerate(self.events[name])]):
924
+ return []
925
+ tokens = [self.event_ids[name]] + [self.parameter_ids[p][params[i]]
926
+ for i, p in enumerate(self.events[name])]
927
+ tokens += [self.pad_id] * (self.max_token_seq - len(tokens))
928
+ return tokens
929
+
930
+ def tokens2event(self, tokens):
931
+ if tokens[0] not in self.id_events:
932
+ return []
933
+ name = self.id_events[tokens[0]]
934
+ if len(tokens) <= len(self.events[name]):
935
+ return []
936
+ params = tokens[1:]
937
+ params = [params[i] - self.parameter_ids[p][0] for i, p in enumerate(self.events[name])]
938
+ if not all([0 <= params[i] < self.event_parameters[p] for i, p in enumerate(self.events[name])]):
939
+ return []
940
+ event = [name] + params
941
+ return event
942
+
943
    def detokenize(self, midi_seq):
        """Invert tokenize(): rebuild a midi_score ([ticks_per_beat, track0, ...]).

        Emits a fixed resolution of 480 ticks per beat. time1 is accumulated
        (it is delta-encoded in the token rows), events are grouped into
        tracks by their track index, and overlapping same-(channel, pitch)
        notes within a track are clipped; notes clipped to zero length are
        dropped. Invalid rows are skipped.
        """
        ticks_per_beat = 480
        tracks_dict = {}
        t1 = 0  # running absolute beat (time1 is delta-encoded)
        for tokens in midi_seq:
            if tokens[0] in self.id_events:
                event = self.tokens2event(tokens)
                if not event:
                    continue
                name = event[0]
                t1 += event[1]
                t = t1 * 16 + event[2]
                t = int(t * ticks_per_beat / 16)  # sixteenth steps -> ticks
                track_idx = event[3]
                event_new = [name, t]
                if name == "note":
                    c, p, v, d = event[4:]
                    d = int(d * ticks_per_beat / 16)
                    event_new += [d, c, p, v]
                elif name == "control_change" or name == "patch_change":
                    event_new += event[4:]
                elif name == "set_tempo":
                    event_new += [self.bpm2tempo(event[4])]
                elif name == "time_signature":
                    nn, dd = event[4:]
                    nn += 1  # undo the 0-based storage offset
                    dd += 1
                    event_new += [nn, dd, 24, 8]  # usually cc, bb = 24, 8
                elif name == "key_signature":
                    sf, mi = event[4:]
                    sf -= 7  # undo the +7 storage offset
                    event_new += [sf, mi]
                else:  # should not go here
                    continue
                if track_idx not in tracks_dict:
                    tracks_dict[track_idx] = []
                tracks_dict[track_idx].append(event_new)
        tracks = [tr for idx, tr in sorted(list(tracks_dict.items()), key=lambda it: it[0])]

        for i in range(len(tracks)):  # to eliminate note overlap
            track = tracks[i]
            track = sorted(track, key=lambda e: e[1])
            last_note_t = {}  # (channel, pitch) -> onset of the next-later note
            zero_len_notes = []
            # Walk backwards so each note sees the onset of its successor.
            for e in reversed(track):
                if e[0] == "note":
                    t, d, c, p = e[1:5]
                    key = (c, p)
                    if key in last_note_t:
                        d = min(d, max(last_note_t[key] - t, 0))
                    last_note_t[key] = t
                    e[2] = d
                    if d == 0:
                        zero_len_notes.append(e)
            for e in zero_len_notes:
                track.remove(e)
            tracks[i] = track
        return [ticks_per_beat, *tracks]
1001
+
1002
+ def midi2img(self, midi_score):
1003
+ ticks_per_beat = midi_score[0]
1004
+ notes = []
1005
+ max_time = 1
1006
+ track_num = len(midi_score[1:])
1007
+ for track_idx, track in enumerate(midi_score[1:]):
1008
+ for event in track:
1009
+ t = round(16 * event[1] / ticks_per_beat)
1010
+ if event[0] == "note":
1011
+ d = max(1, round(16 * event[2] / ticks_per_beat))
1012
+ c, p = event[3:5]
1013
+ max_time = max(max_time, t + d + 1)
1014
+ notes.append((track_idx, c, p, t, d))
1015
+ img = np.zeros((128, max_time, 3), dtype=np.uint8)
1016
+ colors = {(i, j): np.random.randint(50, 256, 3) for i in range(track_num) for j in range(16)}
1017
+ for note in notes:
1018
+ tr, c, p, t, d = note
1019
+ img[p, t: t + d] = colors[(tr, c)]
1020
+ img = PIL.Image.fromarray(np.flip(img, 0))
1021
+ return img
1022
+
1023
    def augment(self, midi_seq, max_pitch_shift=4, max_vel_shift=10, max_cc_val_shift=10, max_bpm_shift=10,
                max_track_shift=0, max_channel_shift=16):
        """Randomly transpose/perturb a token sequence for data augmentation.

        Draws one random shift each for pitch, velocity, cc value, bpm, track
        and channel, then applies them to every row. The drum channel 9 is
        kept fixed: its pitches are not shifted and the channel rotation maps
        around it. Key signatures are transposed to follow the pitch shift,
        except on tracks that play only drums, which are reset to sf=0.
        If any shifted pitch would leave 0..127 the ORIGINAL sequence is
        returned unchanged.
        """
        pitch_shift = random.randint(-max_pitch_shift, max_pitch_shift)
        vel_shift = random.randint(-max_vel_shift, max_vel_shift)
        cc_val_shift = random.randint(-max_cc_val_shift, max_cc_val_shift)
        bpm_shift = random.randint(-max_bpm_shift, max_bpm_shift)
        track_shift = random.randint(0, max_track_shift)
        channel_shift = random.randint(0, max_channel_shift)
        midi_seq_new = []
        key_signature_tokens = []
        track_to_channels = {}  # track -> channels it plays (for the drum fix-up below)
        for tokens in midi_seq:
            tokens_new = [*tokens]
            if tokens[0] in self.id_events:
                name = self.id_events[tokens[0]]
                # Rotate track/channel ids for every event type that has them.
                for i, pn in enumerate(self.events[name]):
                    if pn == "track":
                        tr = tokens[1 + i] - self.parameter_ids[pn][0]
                        tr += track_shift
                        tr = tr % self.event_parameters[pn]
                        tokens_new[1 + i] = self.parameter_ids[pn][tr]
                    elif pn == "channel":
                        c = tokens[1 + i] - self.parameter_ids[pn][0]
                        c0 = c
                        c += channel_shift
                        c = c % self.event_parameters[pn]
                        # Keep drums on 9; whichever channel would land on 9
                        # takes the slot drums would have moved to.
                        if c0 == 9:
                            c = 9
                        elif c == 9:
                            c = (9 + channel_shift) % self.event_parameters[pn]
                        tokens_new[1 + i] = self.parameter_ids[pn][c]

                if name == "note":
                    tr = tokens[3] - self.parameter_ids["track"][0]
                    c = tokens[4] - self.parameter_ids["channel"][0]
                    p = tokens[5] - self.parameter_ids["pitch"][0]
                    v = tokens[6] - self.parameter_ids["velocity"][0]
                    if c != 9:  # no shift for drums
                        p += pitch_shift
                    if not 0 <= p < 128:
                        # Transposition left the MIDI range: abort augmentation.
                        return midi_seq
                    v += vel_shift
                    v = max(1, min(127, v))
                    tokens_new[5] = self.parameter_ids["pitch"][p]
                    tokens_new[6] = self.parameter_ids["velocity"][v]
                    track_to_channels.setdefault(tr, [])
                    cs = track_to_channels[tr]
                    if c not in cs:
                        cs.append(c)
                elif name == "control_change":
                    cc = tokens[5] - self.parameter_ids["controller"][0]
                    val = tokens[6] - self.parameter_ids["value"][0]
                    if cc in [1, 2, 7, 11]:
                        val += cc_val_shift
                        val = max(1, min(127, val))
                        tokens_new[6] = self.parameter_ids["value"][val]
                elif name == "set_tempo":
                    bpm = tokens[4] - self.parameter_ids["bpm"][0]
                    bpm += bpm_shift
                    bpm = max(1, min(383, bpm))
                    tokens_new[4] = self.parameter_ids["bpm"][bpm]
                elif name == "key_signature":
                    # Transpose the key to follow the pitch shift.
                    sf = tokens[4] - self.parameter_ids["sf"][0]
                    mi = tokens[5] - self.parameter_ids["mi"][0]
                    sf -= 7
                    k = self.sf2key(sf)
                    k = (k + pitch_shift) % 12
                    sf = self.key2sf(k, mi)
                    sf += 7
                    tokens_new[4] = self.parameter_ids["sf"][sf]
                    tokens_new[5] = self.parameter_ids["mi"][mi]
                    key_signature_tokens.append(tokens_new)
            midi_seq_new.append(tokens_new)
        # Drum-only tracks must not carry a transposed key signature.
        for tokens in key_signature_tokens:
            tr = tokens[3] - self.parameter_ids["track"][0]
            if tr in track_to_channels:
                cs = track_to_channels[tr]
                if len(cs) == 1 and cs[0] == 9:
                    tokens[4] = self.parameter_ids["sf"][7]  # sf=0
        return midi_seq_new
1103
+
1104
+ def check_quality(self, midi_seq, alignment_min=0.3, tonality_min=0.8, piano_max=0.7, notes_bandwidth_min=3,
1105
+ notes_density_max=50, notes_density_min=2.5, total_notes_max=20000, total_notes_min=256,
1106
+ note_window_size=16):
1107
+ total_notes = 0
1108
+ channels = []
1109
+ time_hist = [0] * 16
1110
+ note_windows = {}
1111
+ notes_sametime = []
1112
+ notes_density_list = []
1113
+ tonality_list = []
1114
+ notes_bandwidth_list = []
1115
+ instruments = {}
1116
+ piano_channels = []
1117
+ abs_t1 = 0
1118
+ last_t = 0
1119
+ for tsi, tokens in enumerate(midi_seq):
1120
+ event = self.tokens2event(tokens)
1121
+ if not event:
1122
+ continue
1123
+ t1, t2, tr = event[1:4]
1124
+ abs_t1 += t1
1125
+ t = abs_t1 * 16 + t2
1126
+ c = None
1127
+ if event[0] == "note":
1128
+ c, p, v, d = event[4:]
1129
+ total_notes += 1
1130
+ time_hist[t2] += 1
1131
+ if c != 9: # ignore drum channel
1132
+ if c not in instruments:
1133
+ instruments[c] = 0
1134
+ if c not in piano_channels:
1135
+ piano_channels.append(c)
1136
+ note_windows.setdefault(abs_t1 // note_window_size, []).append(p)
1137
+ if last_t != t:
1138
+ notes_sametime = [(et, p_) for et, p_ in notes_sametime if et > last_t]
1139
+ notes_sametime_p = [p_ for _, p_ in notes_sametime]
1140
+ if len(notes_sametime) > 0:
1141
+ notes_bandwidth_list.append(max(notes_sametime_p) - min(notes_sametime_p))
1142
+ notes_sametime.append((t + d - 1, p))
1143
+ elif event[0] == "patch_change":
1144
+ c, p = event[4:]
1145
+ instruments[c] = p
1146
+ if p == 0 and c not in piano_channels:
1147
+ piano_channels.append(c)
1148
+ if c is not None and c not in channels:
1149
+ channels.append(c)
1150
+ last_t = t
1151
+ reasons = []
1152
+ if total_notes < total_notes_min:
1153
+ reasons.append("total_min")
1154
+ if total_notes > total_notes_max:
1155
+ reasons.append("total_max")
1156
+ if len(note_windows) == 0 and total_notes > 0:
1157
+ reasons.append("drum_only")
1158
+ if reasons:
1159
+ return False, reasons
1160
+ time_hist = sorted(time_hist, reverse=True)
1161
+ alignment = sum(time_hist[:2]) / total_notes
1162
+ for notes in note_windows.values():
1163
+ key_hist = [0] * 12
1164
+ for p in notes:
1165
+ key_hist[p % 12] += 1
1166
+ key_hist = sorted(key_hist, reverse=True)
1167
+ tonality_list.append(sum(key_hist[:7]) / len(notes))
1168
+ notes_density_list.append(len(notes) / note_window_size)
1169
+ tonality_list = sorted(tonality_list)
1170
+ tonality = sum(tonality_list) / len(tonality_list)
1171
+ notes_bandwidth = sum(notes_bandwidth_list) / len(notes_bandwidth_list) if notes_bandwidth_list else 0
1172
+ notes_density = max(notes_density_list) if notes_density_list else 0
1173
+ piano_ratio = len(piano_channels) / len(channels)
1174
+ if len(channels) <= 3: # ignore piano threshold if it is a piano solo midi
1175
+ piano_max = 1
1176
+ if alignment < alignment_min: # check weather the notes align to the bars (because some midi files are recorded)
1177
+ reasons.append("alignment")
1178
+ if tonality < tonality_min: # check whether the music is tonal
1179
+ reasons.append("tonality")
1180
+ if notes_bandwidth < notes_bandwidth_min: # check whether music is melodic line only
1181
+ reasons.append("bandwidth")
1182
+ if not notes_density_min < notes_density < notes_density_max:
1183
+ reasons.append("density")
1184
+ if piano_ratio > piano_max: # check whether most instruments is piano (because some midi files don't have instruments assigned correctly)
1185
+ reasons.append("piano")
1186
+ return not reasons, reasons
1187
+
1188
+
1189
class MIDITokenizer:
    """Factory returning the concrete tokenizer for a requested vocabulary version."""

    def __new__(cls, version="v2"):
        """Instantiate MIDITokenizerV1 or MIDITokenizerV2 for *version*.

        Raises:
            ValueError: if *version* is neither "v1" nor "v2".
        """
        if version == "v1":
            return MIDITokenizerV1()
        if version == "v2":
            return MIDITokenizerV2()
        raise ValueError(f"Unsupported version: {version}")
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ fluidsynth
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ --extra-index-url https://download.pytorch.org/whl/cu124
2
+ Pillow
3
+ numpy
4
+ torch
5
+ onnxruntime-gpu
6
+ peft>=0.13.0
7
+ transformers>=4.36
8
+ gradio==5.0.1
9
+ pyfluidsynth
10
+ tqdm
11
+ huggingface_hub