diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..b04f3e5617302cb35aa02393780f65d7c4d4bc9d
Binary files /dev/null and b/.DS_Store differ
diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..2fa032c07353b4dca90dd55f53d43e8087014b77 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tensorboardX/screenshots/image.png filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..ce50ed25eb708b38b8b10cddcb7bea11cbc1e253
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,133 @@
+# Global
+.DS_Store
+.idea
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..b3f30b059b68c1b65843acc1f632875fb9c6918c
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,22 @@
+Noncommercial Use License
+
+Software Copyright (c) 2020 OpenAI
+
+We don’t claim ownership of the content you create with Jukebox.
+We only ask that you use Jukebox responsibly and clearly indicate your content was created using OpenAI’s Jukebox.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+documentation files (the "Software"), to deal in the Software, including without limitation the rights to use, copy,
+modify, merge, publish, distribute, and/or sublicense copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following conditions:
+
+No portion of the Software, nor any content created with the Software, may be used for commercial purposes.
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+The above copyright notice and this permission notice need not be included with content created by the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+WARRANTIES OF MERCHANTABILITY,FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
\ No newline at end of file
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000000000000000000000000000000000000..960b1a02d9957e2aeaaf9b8d8cf147cb5d04c660
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,2 @@
+recursive-include jukebox *.py
+recursive-include jukebox *.txt
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..bd87d6d075b89998d32e1734f6f2965ff308f530
--- /dev/null
+++ b/README.md
@@ -0,0 +1,284 @@
+**Status:** Archive (code is provided as-is, no updates expected)
+
+# Jukebox
+Code for "Jukebox: A Generative Model for Music"
+
+[Paper](https://arxiv.org/abs/2005.00341) 
+[Blog](https://openai.com/blog/jukebox) 
+[Explorer](http://jukebox.openai.com/) 
+[Colab](https://colab.research.google.com/github/openai/jukebox/blob/master/jukebox/Interacting_with_Jukebox.ipynb) 
+
+# Install
+Install the conda package manager from https://docs.conda.io/en/latest/miniconda.html    
+    
+``` 
+# Required: Sampling
+conda create --name jukebox python=3.7.5
+conda activate jukebox
+conda install mpi4py=3.0.3 # if this fails, try: pip install mpi4py==3.0.3
+conda install pytorch=1.4 torchvision=0.5 cudatoolkit=10.0 -c pytorch
+git clone https://github.com/openai/jukebox.git
+cd jukebox
+pip install -r requirements.txt
+pip install -e .
+
+# Required: Training
+conda install av=7.0.01 -c conda-forge 
+pip install ./tensorboardX
+ 
+# Optional: Apex for faster training with fused_adam
+conda install pytorch=1.1 torchvision=0.3 cudatoolkit=10.0 -c pytorch
+pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex
+```
+
+# Sampling
+## Sampling from scratch
+To sample normally, run the following command. Model can be `5b`, `5b_lyrics`, `1b_lyrics`
+``` 
+python jukebox/sample.py --model=5b_lyrics --name=sample_5b --levels=3 --sample_length_in_seconds=20 \
+--total_sample_length_in_seconds=180 --sr=44100 --n_samples=6 --hop_fraction=0.5,0.5,0.125
+```
+``` 
+python jukebox/sample.py --model=1b_lyrics --name=sample_1b --levels=3 --sample_length_in_seconds=20 \
+--total_sample_length_in_seconds=180 --sr=44100 --n_samples=16 --hop_fraction=0.5,0.5,0.125
+```
+The above generates the first `sample_length_in_seconds` seconds of audio from a song of total length `total_sample_length_in_seconds`.
+To use multiple GPUs, launch the above scripts as `mpiexec -n {ngpus} python jukebox/sample.py ...` so that they run across `{ngpus}` GPUs.
+
+The samples decoded from each level are stored in `{name}/level_{level}`. 
+You can also view the samples as an html with the aligned lyrics under `{name}/level_{level}/index.html`. 
+Run `python -m http.server` and open the html through the server to see the lyrics animate as the song plays.  
+A summary of all sampling data including zs, x, labels and sampling_kwargs is stored in `{name}/level_{level}/data.pth.tar`.
+
+The hps are for a V100 GPU with 16 GB GPU memory. The `1b_lyrics`, `5b`, and `5b_lyrics` top-level priors take up 
+3.8 GB, 10.3 GB, and 11.5 GB, respectively. The peak memory usage to store transformer key, value cache is about 400 MB 
+for `1b_lyrics` and 1 GB for `5b_lyrics` per sample. If you are having trouble with CUDA OOM issues, try `1b_lyrics` or 
+decrease `max_batch_size` in sample.py, and `--n_samples` in the script call.
+
+On a V100, it takes about 3 hrs to fully sample 20 seconds of music. Since this is a long time, it is recommended to use `n_samples > 1` so you can generate as many samples as possible in parallel. The 1B lyrics and upsamplers can process 16 samples at a time, while 5B can fit only up to 3. Since the vast majority of time is spent on upsampling, we recommend using a multiple of 3 less than 16 like `--n_samples 15` for `5b_lyrics`. This will make the top-level generate samples in groups of three while upsampling is done in one pass.
+
+To continue sampling from already generated codes for a longer duration, you can run
+```
+python jukebox/sample.py --model=5b_lyrics --name=sample_5b --levels=3 --mode=continue \
+--codes_file=sample_5b/level_0/data.pth.tar --sample_length_in_seconds=40 --total_sample_length_in_seconds=180 \
+--sr=44100 --n_samples=6 --hop_fraction=0.5,0.5,0.125
+```
+Here, we take the 20 seconds samples saved from the first sampling run at `sample_5b/level_0/data.pth.tar` and continue by adding 20 more seconds. 
+
+You could also continue directly from the level 2 saved outputs; just pass `--codes_file=sample_5b/level_2/data.pth.tar`.
+ Note that this will upsample the full 40-second song at the end.
+
+If you stopped sampling at only the first level and want to upsample the saved codes, you can run
+```
+python jukebox/sample.py --model=5b_lyrics --name=sample_5b --levels=3 --mode=upsample \
+--codes_file=sample_5b/level_2/data.pth.tar --sample_length_in_seconds=20 --total_sample_length_in_seconds=180 \
+--sr=44100 --n_samples=6 --hop_fraction=0.5,0.5,0.125
+```
+Here, we take the 20 seconds samples saved from the first sampling run at `sample_5b/level_2/data.pth.tar` and upsample the lower two levels.
+
+## Prompt with your own music
+If you want to prompt the model with your own creative piece or any other music, first save them as wave files and run
+```
+python jukebox/sample.py --model=5b_lyrics --name=sample_5b_prompted --levels=3 --mode=primed \
+--audio_file=path/to/recording.wav,awesome-mix.wav,fav-song.wav,etc.wav --prompt_length_in_seconds=12 \
+--sample_length_in_seconds=20 --total_sample_length_in_seconds=180 --sr=44100 --n_samples=6 --hop_fraction=0.5,0.5,0.125
+```
+This will load the four files, tile them to fill up to `n_samples` batch size, and prime the model with the first `prompt_length_in_seconds` seconds.
+
+# Training
+## VQVAE
+To train a small vqvae, run
+```
+mpiexec -n {ngpus} python jukebox/train.py --hps=small_vqvae --name=small_vqvae --sample_length=262144 --bs=4 \
+--audio_files_dir={audio_files_dir} --labels=False --train --aug_shift --aug_blend
+```
+Here, `{audio_files_dir}` is the directory in which you can put the audio files for your dataset, and `{ngpus}` is number of GPU's you want to use to train. 
+The above trains a two-level VQ-VAE with `downs_t = (5,3)`, and `strides_t = (2, 2)` meaning we downsample the audio by `2**5 = 32` to get the first level of codes, and `2**8 = 256` to get the second level codes.  
+Checkpoints are stored in the `logs` folder. You can monitor the training by running Tensorboard
+```
+tensorboard --logdir logs
+```
+    
+## Prior
+### Train prior or upsamplers
+Once the VQ-VAE is trained, we can restore it from its saved checkpoint and train priors on the learnt codes. 
+To train the top-level prior, we can run
+
+```
+mpiexec -n {ngpus} python jukebox/train.py --hps=small_vqvae,small_prior,all_fp16,cpu_ema --name=small_prior \
+--sample_length=2097152 --bs=4 --audio_files_dir={audio_files_dir} --labels=False --train --test --aug_shift --aug_blend \
+--restore_vqvae=logs/small_vqvae/checkpoint_latest.pth.tar --prior --levels=2 --level=1 --weight_decay=0.01 --save_iters=1000
+```
+
+To train the upsampler, we can run
+```
+mpiexec -n {ngpus} python jukebox/train.py --hps=small_vqvae,small_upsampler,all_fp16,cpu_ema --name=small_upsampler \
+--sample_length=262144 --bs=4 --audio_files_dir={audio_files_dir} --labels=False --train --test --aug_shift --aug_blend \
+--restore_vqvae=logs/small_vqvae/checkpoint_latest.pth.tar --prior --levels=2 --level=0 --weight_decay=0.01 --save_iters=1000
+```
+We pass `sample_length = n_ctx * downsample_of_level` so that after downsampling the tokens match the n_ctx of the prior hps. 
+Here, `n_ctx = 8192` and `downsamples = (32, 256)`, giving `sample_lengths = (8192 * 32, 8192 * 256) = (262144, 2097152)` respectively for the bottom and top level.
+
+### Learning rate annealing
+To get the best sample quality anneal the learning rate to 0 near the end of training. To do so, continue training from the latest 
+checkpoint and run with
+```
+--restore_prior="path/to/checkpoint" --lr_use_linear_decay --lr_start_linear_decay={already_trained_steps} --lr_decay={decay_steps_as_needed}
+```
+
+### Reuse pre-trained VQ-VAE and train top-level prior on new dataset from scratch.
+#### Train without labels
+Our pre-trained VQ-VAE can produce compressed codes for a wide variety of genres of music, and the pre-trained upsamplers 
+can upsample them back to audio that sound very similar to the original audio.
+To re-use these for a new dataset of your choice, you can retrain just the top-level prior.
+
+To train top-level on a new dataset, run
+```
+mpiexec -n {ngpus} python jukebox/train.py --hps=vqvae,small_prior,all_fp16,cpu_ema --name=pretrained_vqvae_small_prior \
+--sample_length=1048576 --bs=4 --aug_shift --aug_blend --audio_files_dir={audio_files_dir} \
+--labels=False --train --test --prior --levels=3 --level=2 --weight_decay=0.01 --save_iters=1000
+```
+Training the `small_prior` with a batch size of 2, 4, and 8 requires 6.7 GB, 9.3 GB, and 15.8 GB of GPU memory, respectively. A few days to a week of training typically yields reasonable samples when the dataset is homogeneous (e.g. all piano pieces, songs of the same style, etc).
+
+Near the end of training, follow [this](#learning-rate-annealing) to anneal the learning rate to 0
+
+#### Sample from new model
+You can then run sample.py with the top-level of our models replaced by your new model. To do so,
+- Add an entry `my_model=("vqvae", "upsampler_level_0", "upsampler_level_1", "small_prior")` in `MODELS` in `make_models.py`. 
+- Update the `small_prior` dictionary in `hparams.py` to include `restore_prior='path/to/checkpoint'`. If you
+changed any hps directly in the command line script (e.g. `heads`), make sure to update them in the dictionary too so 
+that `make_models` restores our checkpoint correctly.
+- Run sample.py as outlined in the sampling section, but now with `--model=my_model` 
+
+For example, let's say we trained `small_vqvae`, `small_prior`, and `small_upsampler` under `/path/to/jukebox/logs`. In `make_models.py`, we are going to declare a tuple of the new models as `my_model`.
+```
+MODELS = {
+    '5b': ("vqvae", "upsampler_level_0", "upsampler_level_1", "prior_5b"),
+    '5b_lyrics': ("vqvae", "upsampler_level_0", "upsampler_level_1", "prior_5b_lyrics"),
+    '1b_lyrics': ("vqvae", "upsampler_level_0", "upsampler_level_1", "prior_1b_lyrics"),
+    'my_model': ("my_small_vqvae", "my_small_upsampler", "my_small_prior"),
+}
+```
+
+Next, in `hparams.py`, we add them to the registry with the corresponding `restore_`paths and any other command line options used during training. Another important note is that for top-level priors with lyric conditioning, we have to locate a self-attention layer that shows alignment between the lyric and music tokens. Look for layers where `prior.prior.transformer._attn_mods[layer].attn_func` is either 6 or 7. If your model is starting to sing along lyrics, it means some layer, head pair has learned alignment. Congrats!
+```
+my_small_vqvae = Hyperparams(
+    restore_vqvae='/path/to/jukebox/logs/small_vqvae/checkpoint_some_step.pth.tar',
+)
+my_small_vqvae.update(small_vqvae)
+HPARAMS_REGISTRY["my_small_vqvae"] = my_small_vqvae
+
+my_small_prior = Hyperparams(
+    restore_prior='/path/to/jukebox/logs/small_prior/checkpoint_latest.pth.tar',
+    level=1,
+    labels=False,
+    # TODO For the two lines below, if `--labels` was used and the model is
+    # trained with lyrics, find and enter the layer, head pair that has learned
+    # alignment.
+    alignment_layer=47,
+    alignment_head=0,
+)
+my_small_prior.update(small_prior)
+HPARAMS_REGISTRY["my_small_prior"] = my_small_prior
+
+my_small_upsampler = Hyperparams(
+    restore_prior='/path/to/jukebox/logs/small_upsampler/checkpoint_latest.pth.tar',
+    level=0,
+    labels=False,
+)
+my_small_upsampler.update(small_upsampler)
+HPARAMS_REGISTRY["my_small_upsampler"] = my_small_upsampler
+```
+
+#### Train with labels 
+To train with your own metadata for your audio files, implement `get_metadata` in `data/files_dataset.py` to return the 
+`artist`, `genre` and `lyrics` for a given audio file. For now, you can pass `''` for lyrics to not use any lyrics.
+
+For training with labels, we'll use `small_labelled_prior` in `hparams.py`, and we set `labels=True,labels_v3=True`. 
+We use 2 kinds of labels information:
+- Artist/Genre: 
+  - For each file, we return an artist_id and a list of genre_ids. The reason we have a list and not a single genre_id 
+  is that in v2, we split genres like `blues_rock` into a bag of words `[blues, rock]`, and we pass at most 
+  `max_bow_genre_size` of those; in `v3` we consider it as a single word and just set `max_bow_genre_size=1`.
+  - Update the `v3_artist_ids` and `v3_genre_ids` to use ids from your new dataset. 
+  - In `small_labelled_prior`, set the hps `y_bins = (number_of_genres, number_of_artists)` and `max_bow_genre_size=1`. 
+- Timing: 
+  - For each chunk of audio, we return the `total_length` of the song, the `offset` the current audio chunk is at and 
+  the `sample_length` of the audio chunk. We have three timing embeddings: total_length, our current position, and our 
+  current position as a fraction of the total length, and we divide the range of these values into `t_bins` discrete bins. 
+  - In `small_labelled_prior`, set the hps `min_duration` and `max_duration` to be the shortest/longest duration of audio 
+  files you want for your dataset, and `t_bins` for how many bins you want to discretize timing information into. Note 
+  `min_duration * sr` needs to be at least `sample_length` to have an audio chunk in it.
+
+After these modifications, to train a top-level with labels, run
+```
+mpiexec -n {ngpus} python jukebox/train.py --hps=vqvae,small_labelled_prior,all_fp16,cpu_ema --name=pretrained_vqvae_small_prior_labels \
+--sample_length=1048576 --bs=4 --aug_shift --aug_blend --audio_files_dir={audio_files_dir} \
+--labels=True --train --test --prior --levels=3 --level=2 --weight_decay=0.01 --save_iters=1000
+```
+
+For sampling, follow same instructions as [above](#sample-from-new-model) but use `small_labelled_prior` instead of `small_prior`.  
+
+#### Train with lyrics
+To train in addition with lyrics, update `get_metadata` in `data/files_dataset.py` to return `lyrics` too.
+For training with lyrics, we'll use `small_single_enc_dec_prior` in `hparams.py`. 
+- Lyrics: 
+  - For each file, we linearly align the lyric characters to the audio, find the position in lyric that corresponds to 
+  the midpoint of our audio chunk, and pass a window of `n_tokens` lyric characters centred around that. 
+  - In `small_single_enc_dec_prior`, set the hps `use_tokens=True` and `n_tokens` to be the number of lyric characters 
+  to use for an audio chunk. Set it according to the `sample_length` you're training on so that it's large enough that 
+  the lyrics for an audio chunk are almost always found inside a window of that size.
+  - If you use a non-English vocabulary, update `text_processor.py` with your new vocab and set
+  `n_vocab = number of characters in vocabulary` accordingly in `small_single_enc_dec_prior`. In v2, we had `n_vocab=80`, 
+  and in v3 we missed `+`, so there `n_vocab=79`. 
+
+After these modifications, to train a top-level with labels and lyrics, run
+```
+mpiexec -n {ngpus} python jukebox/train.py --hps=vqvae,small_single_enc_dec_prior,all_fp16,cpu_ema --name=pretrained_vqvae_small_single_enc_dec_prior_labels \
+--sample_length=786432 --bs=4 --aug_shift --aug_blend --audio_files_dir={audio_files_dir} \
+--labels=True --train --test --prior --levels=3 --level=2 --weight_decay=0.01 --save_iters=1000
+```
+To simplify hps choices, here we used a `single_enc_dec` model like the `1b_lyrics` model that combines both encoder and 
+decoder of the transformer into a single model. We do so by merging the lyric vocab and vq-vae vocab into a single 
+larger vocab, and flattening the lyric tokens and the vq-vae codes into a single sequence of length `n_ctx + n_tokens`. 
+This uses `attn_order=12` which includes `prime_attention` layers with keys/values from lyrics and queries from audio. 
+If you instead want to use a model with the usual encoder-decoder style transformer, use `small_sep_enc_dec_prior`.
+
+For sampling, follow same instructions as [above](#sample-from-new-model) but use `small_single_enc_dec_prior` instead of 
+`small_prior`. To also get the alignment between lyrics and samples in the saved html, you'll need to set `alignment_layer` 
+and `alignment_head` in `small_single_enc_dec_prior`. To find which layer/head is best to use, run a forward pass on a training example,
+save the attention weight tensors for all prime_attention layers, and pick the (layer, head) which has the best linear alignment 
+pattern between the lyrics keys and music queries. 
+
+### Fine-tune pre-trained top-level prior to new style(s)
+Previously, we showed how to train a small top-level prior from scratch. Assuming you have a GPU with at least 15 GB of memory and support for fp16, you could fine-tune from our pre-trained 1B top-level prior. Here are the steps:
+
+- Support `--labels=True` by implementing `get_metadata` in `jukebox/data/files_dataset.py` for your dataset.
+- Add new entries in `jukebox/data/ids`. We recommend replacing existing mappings (e.g. rename `"unknown"`, etc with styles of your choice). This uses the pre-trained style vectors as initialization and could potentially save some compute.
+
+After these modifications, run 
+```
+mpiexec -n {ngpus} python jukebox/train.py --hps=vqvae,prior_1b_lyrics,all_fp16,cpu_ema --name=finetuned \
+--sample_length=1048576 --bs=1 --aug_shift --aug_blend --audio_files_dir={audio_files_dir} \
+--labels=True --train --test --prior --levels=3 --level=2 --weight_decay=0.01 --save_iters=1000
+```
+To get the best sample quality, it is recommended to anneal the learning rate in the end. Training the 5B top-level requires GPipe which is not supported in this release.
+
+# Citation
+
+Please cite using the following bibtex entry:
+
+```
+@article{dhariwal2020jukebox,
+  title={Jukebox: A Generative Model for Music},
+  author={Dhariwal, Prafulla and Jun, Heewoo and Payne, Christine and Kim, Jong Wook and Radford, Alec and Sutskever, Ilya},
+  journal={arXiv preprint arXiv:2005.00341},
+  year={2020}
+}
+```
+
+# License 
+[Noncommercial Use License](./LICENSE) 
+
+It covers both released code and weights. 
+
diff --git a/apex/.gitignore b/apex/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..58d3520d60eb0e7d0f2420b48cc45c77d8fac0a8
--- /dev/null
+++ b/apex/.gitignore
@@ -0,0 +1,5 @@
+apex.egg-info
+dist
+build
+docs/build
+*~
\ No newline at end of file
diff --git a/apex/.nojekyll b/apex/.nojekyll
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/apex/LICENSE b/apex/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..3d1e9454ff796daeba01a60f5131491d02f026fa
--- /dev/null
+++ b/apex/LICENSE
@@ -0,0 +1,11 @@
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
diff --git a/apex/README.md b/apex/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..dd145218316f6f78eaec91bb582c92c653518098
--- /dev/null
+++ b/apex/README.md
@@ -0,0 +1,99 @@
+# Introduction
+
+This repository holds NVIDIA-maintained utilities to streamline 
+mixed precision and distributed training in Pytorch. 
+Some of the code here will be included in upstream Pytorch eventually.
+The intention of Apex is to make up-to-date utilities available to 
+users as quickly as possible.
+
+## Full API Documentation: [https://nvidia.github.io/apex](https://nvidia.github.io/apex)
+
+# Contents
+
+## 1. Amp:  Automatic Mixed Precision
+
+`apex.amp` is a tool to enable mixed precision training by changing only 3 lines of your script.
+Users can easily experiment with different pure and mixed precision training modes by supplying
+different flags to `amp.initialize`.
+
+[Webinar introducing Amp](https://info.nvidia.com/webinar-mixed-precision-with-pytorch-reg-page.html)
+(The flag `cast_batchnorm` has been renamed to `keep_batchnorm_fp32`).
+
+[API Documentation](https://nvidia.github.io/apex/amp.html)
+
+[Comprehensive Imagenet example](https://github.com/NVIDIA/apex/tree/master/examples/imagenet)
+
+[DCGAN example coming soon...](https://github.com/NVIDIA/apex/tree/master/examples/dcgan)
+
+[Moving to the new Amp API](https://nvidia.github.io/apex/amp.html#transition-guide-for-old-api-users) (for users of the deprecated "Amp" and "FP16_Optimizer" APIs)
+
+## 2. Distributed Training
+
+`apex.parallel.DistributedDataParallel` is a module wrapper, similar to 
+`torch.nn.parallel.DistributedDataParallel`.  It enables convenient multiprocess distributed training,
+optimized for NVIDIA's NCCL communication library.
+
+[API Documentation](https://nvidia.github.io/apex/parallel.html)
+
+[Python Source](https://github.com/NVIDIA/apex/tree/master/apex/parallel)
+
+[Example/Walkthrough](https://github.com/NVIDIA/apex/tree/master/examples/simple/distributed)
+
+The [Imagenet example](https://github.com/NVIDIA/apex/tree/master/examples/imagenet)
+shows use of `apex.parallel.DistributedDataParallel` along with `apex.amp`.
+
+### Synchronized Batch Normalization
+
+`apex.parallel.SyncBatchNorm` extends `torch.nn.modules.batchnorm._BatchNorm` to
+support synchronized BN.
+It allreduces stats across processes during multiprocess (DistributedDataParallel) training.
+Synchronous BN has been used in cases where only a small
+local minibatch can fit on each GPU.
+Allreduced stats increase the effective batch size for the BN layer to the
+global batch size across all processes (which, technically, is the correct
+formulation).
+Synchronous BN has been observed to improve converged accuracy in some of our research models.
+
+# Requirements
+
+Python 3
+
+CUDA 9 or newer
+
+PyTorch 0.4 or newer.  The CUDA and C++ extensions require pytorch 1.0 or newer.
+
+We recommend the latest stable release, obtainable from
+[https://pytorch.org/](https://pytorch.org/).  We also test against the latest master branch, obtainable from [https://github.com/pytorch/pytorch](https://github.com/pytorch/pytorch).
+
+It's often convenient to use Apex in Docker containers.  Compatible options include:
+* [NVIDIA Pytorch containers from NGC](https://ngc.nvidia.com/catalog/containers/nvidia%2Fpytorch), which come with Apex preinstalled.  To use the latest Amp API, you may need to `pip uninstall apex` then reinstall Apex using the **Quick Start** commands below.
+* [official Pytorch -devel Dockerfiles](https://hub.docker.com/r/pytorch/pytorch/tags), e.g. `docker pull pytorch/pytorch:nightly-devel-cuda10.0-cudnn7`, in which you can install Apex using the **Quick Start** commands.
+
+See the [Docker example folder](https://github.com/NVIDIA/apex/tree/master/examples/docker) for details.
+
+# Quick Start
+
+### Linux
+
+For performance and full functionality, we recommend installing Apex with
+CUDA and C++ extensions via
+```
+$ git clone https://github.com/NVIDIA/apex
+$ cd apex
+$ pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
+```
+
+Apex also supports a Python-only build (required with Pytorch 0.4) via
+```
+$ pip install -v --no-cache-dir .
+```
+A Python-only build omits:
+- Fused kernels required to use `apex.optimizers.FusedAdam`.
+- Fused kernels required to use `apex.normalization.FusedLayerNorm`.
+- Fused kernels that improve the performance and numerical stability of `apex.parallel.SyncBatchNorm`.
+- Fused kernels that improve the performance of `apex.parallel.DistributedDataParallel` and `apex.amp`.
+`DistributedDataParallel`, `amp`, and `SyncBatchNorm` will still be usable, but they may be slower.
+
+### Windows support
+Windows support is experimental, and Linux is recommended.  `pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .` may work if you were able to build Pytorch from source
+on your system.  `pip install -v --no-cache-dir .` (without CUDA/C++ extensions) is more likely to work.  If you installed Pytorch in a Conda environment, make sure to install Apex in that same environment.
diff --git a/apex/apex.patch b/apex/apex.patch
new file mode 100644
index 0000000000000000000000000000000000000000..5120c9a74942278654c7303951f61e3b3c9dbe2b
--- /dev/null
+++ b/apex/apex.patch
@@ -0,0 +1,42 @@
+diff --git a/csrc/fused_adam_cuda_kernel.cu b/csrc/fused_adam_cuda_kernel.cu
+index 34f7aa2..95581d1 100644
+--- a/csrc/fused_adam_cuda_kernel.cu
++++ b/csrc/fused_adam_cuda_kernel.cu
+@@ -19,8 +19,8 @@ typedef enum{
+ 
+ template <typename T, typename GRAD_T>
+ __global__ void adam_cuda_kernel(
+-        T* __restrict__ p,
+-        GRAD_T* __restrict__ p_copy, // For mixed precision training, pass NULL if not needed
++        GRAD_T* __restrict__ p,
++        T* __restrict__ p_copy, // For mixed precision training, pass NULL if not needed
+         T* __restrict__ m,
+         T* __restrict__ v,
+         const GRAD_T * __restrict__ g,
+@@ -50,7 +50,7 @@ __global__ void adam_cuda_kernel(
+                 else // Mode 1
+                     denom = sqrtf(v[j]) + eps;
+                 float update = (m[j]/denom) + (decay*p[j]);
+-                p[j] = p[j] - (step_size*update);
++                p[j] = (GRAD_T) (p[j] - (step_size*update));
+                 if (p_copy != NULL) p_copy[j] = (GRAD_T) p[j];
+         }
+ }
+@@ -93,14 +93,14 @@ void fused_adam_cuda(
+ 
+         if (g.scalar_type() == at::ScalarType::Half) {
+ //all other values should be fp32 for half gradients
+-            AT_ASSERTM(p.scalar_type() == at::ScalarType::Float, "expected parameter to be of float type");
++//            AT_ASSERTM(p.scalar_type() == at::ScalarType::Float, "expected parameter to be of float type");
+ //dispatch is done on the gradient type
+             using namespace at; // prevents "toString is undefined" errors
+             DISPATCH_FLOAT_AND_HALF(g.scalar_type(), 0, "adam_cuda_kernel", 
+                 using accscalar_t = at::acc_type<scalar_t_0, true>;
+                 adam_cuda_kernel<accscalar_t, scalar_t_0><<<blocks,threadsPerBlock, 0, stream>>>(
+-                        p.data<accscalar_t>(),
+-                        p_copy.numel() ? p_copy.data<scalar_t_0>() : NULL,
++                        p.data<scalar_t_0>(),
++                        NULL, //don't output p_copy for fp32, it's wasted write
+                         m.data<accscalar_t>(),
+                         v.data<accscalar_t>(),
+                         g.data<scalar_t_0>(),
diff --git a/apex/apex/RNN/README.md b/apex/apex/RNN/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..9e86fd8fc1eecb066b10814fea5967acbb67fe7b
--- /dev/null
+++ b/apex/apex/RNN/README.md
@@ -0,0 +1 @@
+Under construction...
diff --git a/apex/apex/RNN/RNNBackend.py b/apex/apex/RNN/RNNBackend.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9d4937efa583710c2eddb617da10bd824db9371
--- /dev/null
+++ b/apex/apex/RNN/RNNBackend.py
@@ -0,0 +1,365 @@
+import torch
+import torch.nn as nn
+from torch.autograd import Variable
+
+import torch.nn.functional as F
+
+import math
+
+
+def is_iterable(maybe_iterable):
+    return isinstance(maybe_iterable, (list, tuple))  # only list/tuple instances count as iterable here
+
+
+def flatten_list(tens_list):
+    """
+    Join a list/tuple of equally-shaped tensors into one tensor with a new leading dim; return anything else untouched.
+    """
+    if is_iterable(tens_list):
+        joined = torch.cat(tens_list, dim=0)
+        return joined.view(len(tens_list), *tens_list[0].size())
+    return tens_list
+
+    
+#These modules always assume input is NOT batch_first: dim 0 is sequence steps, dim 1 is batch
+class bidirectionalRNN(nn.Module):
+    """
+    Pairs two stackedRNNs: self.fwd reads the sequence in order, self.bckwrd (built from inputRNN.new_like()) reads it reversed.
+    """
+    def __init__(self, inputRNN, num_layers=1, dropout = 0):
+        super(bidirectionalRNN, self).__init__()
+        self.dropout = dropout
+        self.fwd = stackedRNN(inputRNN, num_layers=num_layers, dropout = dropout)
+        self.bckwrd = stackedRNN(inputRNN.new_like(), num_layers=num_layers, dropout = dropout)
+        self.rnns = nn.ModuleList([self.fwd, self.bckwrd])
+        
+    #collect hidden option will return all hidden/cell states from entire RNN
+    def forward(self, input, collect_hidden=False):
+        """
+        Run both directions over input and return (output, hiddens).
+        output joins both directions' outputs on the last dim; hiddens is a tuple with one entry per hidden state, joined the same way.
+        NOTE(review): torch.cat needs every per-direction hidden entry to be a tensor - confirm for collect_hidden=True.
+        """
+
+        fwd_out, fwd_hiddens = self.fwd(input, collect_hidden = collect_hidden)
+        bckwrd_out, bckwrd_hiddens = self.bckwrd(input, reverse=True, collect_hidden = collect_hidden)
+        
+        output = torch.cat( [fwd_out, bckwrd_out], -1 )
+        hiddens = tuple( torch.cat(hidden, -1) for hidden in zip( fwd_hiddens, bckwrd_hiddens) )
+
+        return output, hiddens
+
+    def reset_parameters(self):
+        """
+        Call reset_parameters() on the forward and the backward stack.
+        """
+        for rnn in self.rnns:
+            rnn.reset_parameters()
+        
+    def init_hidden(self, bsz):
+        """
+        Call init_hidden(bsz) on the forward and the backward stack.
+        """
+        for rnn in self.rnns:
+            rnn.init_hidden(bsz)
+
+    def detach_hidden(self):
+        """
+        Call detach_hidden() on the forward and the backward stack (stackedRNN has no detachHidden method).
+        """
+        for rnn in self.rnns:
+            rnn.detach_hidden()
+        
+    def reset_hidden(self, bsz):
+        """
+        Call reset_hidden(bsz) on the forward and the backward stack.
+        """
+        for rnn in self.rnns:
+            rnn.reset_hidden(bsz)
+
+    def init_inference(self, bsz):    
+        """
+        Call init_inference(bsz) on the forward and the backward stack.
+        """
+        for rnn in self.rnns:
+            rnn.init_inference(bsz)
+
+   
+#assumes hidden_state[0] of inputRNN is output hidden state
+#constructor either takes an RNNCell or list of RNN layers
+class stackedRNN(nn.Module):
+    """
+    Stack of RNN layers stepped one sequence position at a time; each layer's hidden_state[0] feeds the next layer.
+    """
+    def __init__(self, inputRNN, num_layers=1, dropout=0):
+        super(stackedRNN, self).__init__()
+        #dropout is only stored; nothing visible in this class applies it
+        self.dropout = dropout
+        
+        if isinstance(inputRNN, RNNCell):
+            self.rnns = [inputRNN]
+            for i in range(num_layers-1):
+                self.rnns.append(inputRNN.new_like(inputRNN.output_size))  #deeper layers take the previous output_size as input
+        elif isinstance(inputRNN, list):
+            assert len(inputRNN) == num_layers, "RNN list length must be equal to num_layers"
+            self.rnns=inputRNN
+        else:
+            raise RuntimeError("inputRNN must be an RNNCell or a list of RNN layers")
+        
+        self.nLayers = len(self.rnns)
+        
+        self.rnns = nn.ModuleList(self.rnns)
+
+
+    def forward(self, input, collect_hidden=False, reverse=False):
+        """
+        Step every layer over input, a Tensor([sequence steps][batch size][features]).
+        reverse=True walks the sequence steps from last to first.
+        Returns (output, hiddens):
+          output is the top layer's hidden_state[0], Tensor([sequence steps][batch size][features])
+          If collect hidden, hiddens is list(
+              [n_hidden_states][sequence steps] Tensor([layer][batch size][features])
+          )
+          If not collect hidden, hiddens is list(
+              [n_hidden_states] Tensor([layer][batch size][features]) )
+        """
+        seq_len = input.size(0)
+        #input is indexed along dim 0 below, one sequence step per cell call
+        inp_iter = reversed(range(seq_len)) if reverse else range(seq_len)
+
+        hidden_states = [[] for i in range(self.nLayers)]
+        outputs = []
+
+        for seq in inp_iter:
+            for layer in range(self.nLayers):
+
+                if layer == 0:
+                    prev_out = input[seq]
+                    
+                outs = self.rnns[layer](prev_out)
+
+                #keep every step when collecting, otherwise only sequence position seq_len-1
+                #NOTE(review): with reverse=True that position is the first step processed - confirm intended
+                if collect_hidden or seq == seq_len-1:
+                    hidden_states[layer].append(outs)
+                    
+                prev_out = outs[0]
+
+            outputs.append(prev_out)
+
+        if reverse:
+            outputs = list(reversed(outputs))
+        #At this point outputs is in format:
+        #  list( [seq_length] x Tensor([bsz][features]) )
+        #(already restored to sequence order when reverse was set)
+        #flatten_list converts it to:
+        #  Tensor([seq_length][bsz][features])
+        #
+        output = flatten_list(outputs)
+
+        #hidden_states at this point is in format:
+        #  list( [layer][seq_length][hidden_states] x Tensor([bsz][features]) )
+        #need to convert it to:
+        #  For not collect hidden:
+        #    list( [hidden_states] x Tensor([layer][bsz][features]) )
+        #  For collect hidden:
+        #    list( [hidden_states][seq_length] x Tensor([layer][bsz][features]) )
+        #Without collect_hidden only one step was stored per layer, so the
+        #seq_length axis has length 1 from here on.
+        if not collect_hidden:
+            seq_len = 1
+        n_hid = self.rnns[0].n_hidden_states
+        new_hidden = [ [ [ None for k in range(self.nLayers)] for j in range(seq_len) ] for i in range(n_hid) ]
+
+
+        for i in range(n_hid):
+            for j in range(seq_len):
+                for k in range(self.nLayers):
+                    new_hidden[i][j][k] = hidden_states[k][j][i]
+
+        hidden_states = new_hidden
+        #Now in format list( [hidden_states][seq_length][layer] x Tensor([bsz][features]) )
+        #Reverse seq_length if reverse
+        if reverse:
+            hidden_states = list( list(reversed(list(entry))) for entry in hidden_states)
+
+        #flatten layer dimension into tensor
+        hiddens = list( list(
+            flatten_list(seq) for seq in hidden )
+                        for hidden in hidden_states )
+        
+        #Now in format list( [hidden_states][seq_length] x Tensor([layer][bsz][features]) )
+        #Remove seq_length dimension if not collect_hidden (return the flattened hiddens, not the nested lists)
+        if not collect_hidden:
+            hiddens = list( entry[0] for entry in hiddens)
+        return output, hiddens
+    
+    def reset_parameters(self):
+        """
+        Call reset_parameters() on every layer.
+        """
+        for rnn in self.rnns:
+            rnn.reset_parameters()
+        
+    def init_hidden(self, bsz):
+        """
+        Call init_hidden(bsz) on every layer.
+        """
+        for rnn in self.rnns:
+            rnn.init_hidden(bsz)
+
+    def detach_hidden(self):
+        """
+        Call detach_hidden() on every layer.
+        """
+        for rnn in self.rnns:
+            rnn.detach_hidden()
+        
+    def reset_hidden(self, bsz):
+        """
+        Call reset_hidden(bsz) on every layer.
+        """
+        for rnn in self.rnns:
+            rnn.reset_hidden(bsz)
+
+    def init_inference(self, bsz):    
+        """ 
+        Call init_inference(bsz) on every layer. NOTE(review): RNNCell in this file defines no init_inference - confirm the layers provide it.
+        """
+        for rnn in self.rnns:
+            rnn.init_inference(bsz)
+
+class RNNCell(nn.Module):
+    """
+    Generic one-step recurrent cell: owns the weights and the running hidden state, and delegates the math to ``cell``.
+    gate_multiplier is related to the architecture you're working with
+    For LSTM-like it will be 4 and GRU-like will be 3.
+    Always assumes input is NOT batch_first.
+    Output size that's not hidden size will use output projection
+    Hidden_states is number of hidden states that are needed for cell
+    if one will go directly to cell as tensor, if more will go as list
+    """
+    def __init__(self, gate_multiplier, input_size, hidden_size, cell, n_hidden_states = 2, bias = False, output_size = None):
+        super(RNNCell, self).__init__()
+
+        self.gate_multiplier = gate_multiplier
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        self.cell = cell
+        self.bias = bias
+        self.output_size = output_size
+        if output_size is None:
+            self.output_size = hidden_size
+
+        self.gate_size = gate_multiplier * self.hidden_size
+        self.n_hidden_states = n_hidden_states
+
+        self.w_ih = nn.Parameter(torch.Tensor(self.gate_size, self.input_size))
+        self.w_hh = nn.Parameter(torch.Tensor(self.gate_size, self.output_size))  #sized for hidden[0], which carries output_size features
+
+        #Check if there's recurrent projection
+        if(self.output_size != self.hidden_size):
+            self.w_ho = nn.Parameter(torch.Tensor(self.output_size, self.hidden_size))  #applied to hidden[0] in forward()
+
+        self.b_ih = self.b_hh = None
+        if self.bias:
+            self.b_ih = nn.Parameter(torch.Tensor(self.gate_size))
+            self.b_hh = nn.Parameter(torch.Tensor(self.gate_size))
+            
+        #hidden states for forward
+        self.hidden = [ None for states in range(self.n_hidden_states)]
+
+        self.reset_parameters()
+
+    def new_like(self, new_input_size=None):
+        """
+        Build a fresh cell of the same type and configuration; new_input_size (default: this cell's input_size) replaces input_size.
+        """
+        if new_input_size is None:
+            new_input_size = self.input_size
+            
+        return type(self)(self.gate_multiplier,
+                       new_input_size,
+                       self.hidden_size,
+                       self.cell,
+                       self.n_hidden_states,
+                       self.bias,
+                       self.output_size)
+
+    
+    #Every parameter (weights and biases) is drawn uniformly; gain is not used here (a Xavier variant is stashed below)
+    def reset_parameters(self, gain=1):
+        """
+        Fill every parameter in place with uniform values in [-stdev, stdev], where stdev = 1/sqrt(hidden_size).
+        """
+        stdev = 1.0 / math.sqrt(self.hidden_size)
+        for param in self.parameters():
+            param.data.uniform_(-stdev, stdev)
+    '''
+    Xavier reset:
+    def reset_parameters(self, gain=1):
+        stdv = 1.0 / math.sqrt(self.gate_size)
+
+        for param in self.parameters():
+            if (param.dim() > 1):
+                torch.nn.init.xavier_normal(param, gain)
+            else:
+                param.data.uniform_(-stdv, stdv)
+    '''
+    def init_hidden(self, bsz):
+        """
+        Allocate a zeroed hidden state of batch size bsz wherever one is missing or its dim 0 differs from bsz.
+        """
+        for param in self.parameters():  #borrow any parameter so new tensors can be made with its .data.new()
+            if param is not None:
+                a_param = param
+                break
+
+        for i, _ in enumerate(self.hidden):
+            if(self.hidden[i] is None or self.hidden[i].data.size()[0] != bsz):
+
+                if i==0:
+                    hidden_size = self.output_size  #hidden[0] is the (possibly projected) output
+                else:
+                    hidden_size = self.hidden_size
+
+                tens = a_param.data.new(bsz, hidden_size).zero_()
+                self.hidden[i] = Variable(tens, requires_grad=False)
+            
+        
+    def reset_hidden(self, bsz):
+        """
+        Drop every stored hidden state, then re-create them zeroed for batch size bsz via init_hidden.
+        """
+        for i, _ in enumerate(self.hidden):
+            self.hidden[i] = None
+        self.init_hidden(bsz)
+
+    def detach_hidden(self):
+        """
+        Replace each hidden state with its detach()ed version; raises RuntimeError if any state is still None.
+        """
+        for i, _ in enumerate(self.hidden):
+            if self.hidden[i] is None:
+                raise RuntimeError("Must initialize hidden state before you can detach it")
+        for i, _ in enumerate(self.hidden):
+            self.hidden[i] = self.hidden[i].detach()
+        
+    def forward(self, input):
+        """
+        Run one step on input (batch on dim 0), store the new hidden states and return them as a tuple.
+        if not inited or bsz has changed this will create hidden states
+        """
+        self.init_hidden(input.size()[0])
+
+        hidden_state = self.hidden[0] if self.n_hidden_states == 1 else self.hidden  #single state goes in as a tensor, several as a list
+        self.hidden = self.cell(input, hidden_state, self.w_ih, self.w_hh, b_ih=self.b_ih, b_hh=self.b_hh)
+        if(self.n_hidden_states > 1):
+            self.hidden = list(self.hidden)
+        else:
+            self.hidden=[self.hidden]
+
+        if self.output_size != self.hidden_size:
+            self.hidden[0] = F.linear(self.hidden[0], self.w_ho)  #output projection hidden_size -> output_size
+
+        return tuple(self.hidden)
diff --git a/apex/apex/RNN/__init__.py b/apex/apex/RNN/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d70674666969bb874cd7c0af9193510c6c26f0c2
--- /dev/null
+++ b/apex/apex/RNN/__init__.py
@@ -0,0 +1,3 @@
+from .models import LSTM, GRU, ReLU, Tanh, mLSTM
+
+__all__ = ['models']
diff --git a/apex/apex/RNN/cells.py b/apex/apex/RNN/cells.py
new file mode 100644
index 0000000000000000000000000000000000000000..32b61a1be18a9788f83865cbb15fa32e1aaa0fc1
--- /dev/null
+++ b/apex/apex/RNN/cells.py
@@ -0,0 +1,84 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .RNNBackend import RNNCell
+
+from torch.nn._functions.thnn import rnnFusedPointwise as fusedBackend
+
+import math 
+
+
+class mLSTMRNNCell(RNNCell):
+    """
+    RNNCell set up for mLSTMCell: gate multiplier 4, two hidden states, plus the extra weights w_mih and w_mhh.
+    """
+
+    def __init__(self, input_size, hidden_size, bias = False, output_size = None):
+        gate_multiplier = 4
+        super(mLSTMRNNCell, self).__init__(gate_multiplier, input_size, hidden_size, mLSTMCell, n_hidden_states = 2, bias = bias, output_size = output_size)
+
+        self.w_mih = nn.Parameter(torch.Tensor(self.output_size, self.input_size))
+        self.w_mhh = nn.Parameter(torch.Tensor(self.output_size, self.output_size))
+
+        self.reset_parameters()  #run again: the two weights above did not exist when RNNCell.__init__ reset the parameters
+
+    def forward(self, input):
+        """
+        Run one mLSTM step on input (batch on dim 0), store the new hidden states and return them as a tuple.
+        """
+        #if not inited or bsz has changed this will create hidden states
+        self.init_hidden(input.size()[0])
+
+        hidden_state = self.hidden[0] if self.n_hidden_states == 1 else self.hidden
+
+        self.hidden = list(
+                           self.cell(input, hidden_state, self.w_ih, self.w_hh, self.w_mih, self.w_mhh,
+                           b_ih=self.b_ih, b_hh=self.b_hh)
+        )
+        
+        if self.output_size != self.hidden_size:
+            self.hidden[0] = F.linear(self.hidden[0], self.w_ho)  #output projection hidden_size -> output_size
+        return tuple(self.hidden)
+
+
+    def new_like(self, new_input_size=None):  #same as RNNCell.new_like, but with this class's shorter constructor signature
+        if new_input_size is None:
+            new_input_size = self.input_size
+        
+        return type(self)(
+            new_input_size,
+            self.hidden_size,
+            self.bias,
+            self.output_size)
+
+def mLSTMCell(input, hidden, w_ih, w_hh, w_mih, w_mhh, b_ih=None, b_hh=None):
+    """
+    One multiplicative-LSTM step: hidden is (hx, cx), the result is (hy, cy).
+    m = linear(input, w_mih) * linear(hx, w_mhh) replaces hx in the recurrent gate term.
+    """
+    #m is needed by both the CUDA and the generic path, so compute it once
+    m = F.linear(input, w_mih) * F.linear(hidden[0], w_mhh)
+
+    if input.is_cuda:
+        #biases are handed to the fused LSTM backend instead of being added here
+        igates = F.linear(input, w_ih)
+        hgates = F.linear(m, w_hh)
+
+        state = fusedBackend.LSTMFused.apply
+        return state(igates, hgates, hidden[1], b_ih, b_hh)
+
+    _, cx = hidden
+    gates = F.linear(input, w_ih, b_ih) + F.linear(m, w_hh, b_hh)
+
+    ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)
+    #torch.sigmoid/torch.tanh replace the deprecated F.sigmoid/F.tanh
+    ingate = torch.sigmoid(ingate)
+    forgetgate = torch.sigmoid(forgetgate)
+    cellgate = torch.tanh(cellgate)
+    outgate = torch.sigmoid(outgate)
+    cy = (forgetgate * cx) + (ingate * cellgate)
+    hy = outgate * torch.tanh(cy)
+
+    return hy, cy
+                                                                            
diff --git a/apex/apex/RNN/models.py b/apex/apex/RNN/models.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd7adce04781a305749643b5caec7cae854a73ad
--- /dev/null
+++ b/apex/apex/RNN/models.py
@@ -0,0 +1,54 @@
+import torch
+
+from torch.nn._functions.rnn import LSTMCell, RNNReLUCell, RNNTanhCell, GRUCell
+
+from .RNNBackend import bidirectionalRNN, stackedRNN, RNNCell
+from .cells import mLSTMRNNCell, mLSTMCell
+
+def toRNNBackend(inputRNN, num_layers, bidirectional=False, dropout = 0):
+    """
+    Wrap inputRNN in a bidirectionalRNN when bidirectional is truthy, otherwise in a stackedRNN.
+    """
+
+    # both wrappers share the (inputRNN, num_layers, dropout=...) call shape,
+    # so only the class has to be chosen
+    backend_cls = bidirectionalRNN if bidirectional else stackedRNN
+    return backend_cls(inputRNN, num_layers, dropout = dropout)
+
+
+def LSTM(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None):
+    """
+    Build a num_layers backend around an RNNCell using LSTMCell (gate multiplier 4, two hidden states).
+    batch_first is accepted but not used by this function.
+    """
+    return toRNNBackend(RNNCell(4, input_size, hidden_size, LSTMCell, 2, bias, output_size), num_layers, bidirectional, dropout=dropout)
+
+def GRU(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None):
+    """
+    Build a num_layers backend around an RNNCell using GRUCell (gate multiplier 3, one hidden state).
+    batch_first is accepted but not used by this function.
+    """
+    return toRNNBackend(RNNCell(3, input_size, hidden_size, GRUCell, 1, bias, output_size), num_layers, bidirectional, dropout=dropout)
+
+def ReLU(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None):
+    """
+    Build a num_layers backend around an RNNCell using RNNReLUCell (gate multiplier 1, one hidden state).
+    batch_first is accepted but not used by this function.
+    """
+    return toRNNBackend(RNNCell(1, input_size, hidden_size, RNNReLUCell, 1, bias, output_size), num_layers, bidirectional, dropout=dropout)
+
+def Tanh(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None):
+    """
+    Build a num_layers backend around an RNNCell using RNNTanhCell (gate multiplier 1, one hidden state).
+    batch_first is accepted but not used by this function.
+    """
+    return toRNNBackend(RNNCell(1, input_size, hidden_size, RNNTanhCell, 1, bias, output_size), num_layers, bidirectional, dropout=dropout)
+        
+def mLSTM(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None):
+    """
+    Build a num_layers backend around an mLSTMRNNCell.
+    batch_first is accepted but not used by this function.
+    """
+    return toRNNBackend(mLSTMRNNCell(input_size, hidden_size, bias=bias, output_size=output_size), num_layers, bidirectional, dropout=dropout)
+
+
diff --git a/apex/apex/__init__.py b/apex/apex/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d37717944b3be2895a77cab25d7a71c1edc3d36
--- /dev/null
+++ b/apex/apex/__init__.py
@@ -0,0 +1,13 @@
+from . import parallel
+from . import amp
+from . import fp16_utils
+
+# For optimizers and normalization there is no Python fallback.
+# Absence of cuda backend is a hard error.
+# I would like the errors from importing fused_adam_cuda or fused_layer_norm_cuda
+# to be triggered lazily, because if someone has installed with --cpp_ext and --cuda_ext
+# so they expect those backends to be available, but for some reason they actually aren't
+# available (for example because they built improperly in a way that isn't revealed until
+# load time) the error message is timely and visible.
+from . import optimizers
+from . import normalization
diff --git a/apex/apex/amp/README.md b/apex/apex/amp/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..a87b5010e374dd3c0c4b794cf9057a8b5a984924
--- /dev/null
+++ b/apex/apex/amp/README.md
@@ -0,0 +1,72 @@
+# amp: Automatic Mixed Precision
+
+## Annotating User Functions
+
+Nearly all PyTorch user code needs nothing more than the two steps
+above to use amp. After all, custom layers are built out of simpler
+PyTorch components, and amp already can see those.
+
+However, any custom C++ or CUDA code is outside of amp's (default)
+view of things. For example, suppose I implemented a new recurrent
+cell called a "forgetful recurrent unit" that calls directly into a
+CUDA backend:
+
+```python
+from backend import FRUBackend
+
+def fru(input, hidden, weight, bias):
+    # call to CUDA code
+    FRUBackend(input, hidden, weight, bias)
+```
+
+In this case, it is possible to get a runtime type mismatch. For
+example, you might have `input` in fp16, and `weight` in fp32, and amp
+doesn't have the visibility to insert an appropriate cast.
+
+amp exposes two ways to handle "invisible" backend code: function
+annotations and explicit registration.
+
+#### Function annotation
+
+The first way to handle backend code is a set of function annotations:
+
+- `@amp.half_function`
+- `@amp.float_function`
+- `@amp.promote_function`
+
+These correspond to:
+
+- Cast all arguments to fp16
+- Cast all arguments to fp32
+- If there are any type mismatches, cast everything to the widest type
+
+In our example, we believe that the FRU unit is fp16-safe and will get
+performance gains from casting its arguments to fp16, so we write:
+
+```python
+@amp.half_function
+def fru(input, hidden, weight, bias):
+    #...
+```
+
+#### Explicit registration
+
+The other way to handle backend code is with explicit function
+registration:
+
+- `amp.register_half_function(module, function_name)`
+- `amp.register_float_function(module, function_name)`
+- `amp.register_promote_function(module, function_name)`
+
+When using this API, `module` is the containing class or module for
+the function, and `function_name` is the _string_ name of the
+function. Note that the function must be registered before the call to
+`amp.initialize()`.
+
+For our FRU unit, we can register the backend function directly:
+
+```python
+import backend
+
+amp.register_half_function(backend, 'FRUBackend')
+```
diff --git a/apex/apex/amp/__init__.py b/apex/apex/amp/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..16fa551fbaca527585f2f02a0e0bf601a21a9141
--- /dev/null
+++ b/apex/apex/amp/__init__.py
@@ -0,0 +1,5 @@
+from .amp import init, half_function, float_function, promote_function,\
+    register_half_function, register_float_function, register_promote_function
+from .handle import scale_loss, disable_casts
+from .frontend import initialize
+from ._amp_state import master_params, _amp_state
diff --git a/apex/apex/amp/__version__.py b/apex/apex/amp/__version__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a83701b29dea1314ed7de69bdbc0a59f2f6badd
--- /dev/null
+++ b/apex/apex/amp/__version__.py
@@ -0,0 +1,2 @@
+VERSION = (0, 1, 0)
+__version__ = '.'.join(map(str, VERSION))
diff --git a/apex/apex/amp/_amp_state.py b/apex/apex/amp/_amp_state.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd37b0f7f02303e7b30771110336d7ec60302275
--- /dev/null
+++ b/apex/apex/amp/_amp_state.py
@@ -0,0 +1,70 @@
+# This is a "header object" that allows different amp modules to communicate.
+# I'm a C++ guy, not a python guy.  I decided this approach because it seemed most C++-like.  
+# But apparently it's ok:
+# http://effbot.org/pyfaq/how-do-i-share-global-variables-across-modules.htm
+import os
+import torch
+
+TORCH_MAJOR = int(torch.__version__.split('.')[0])
+TORCH_MINOR = int(torch.__version__.split('.')[1])
+
+if TORCH_MAJOR == 0:
+    import collections.abc as container_abcs
+else:
+    from torch._six import container_abcs
+
+
+class AmpState(object):
+    def __init__(self):
+        self.verbosity = 1                          # maybe_print() stays silent unless this is > 0
+        self.hard_override = False                  # True makes warn_or_err() print instead of raise
+        self.allow_incoming_model_not_fp32 = False  # True skips check_params_fp32() in _initialize()
+
+
+# Attribute stash.  Could also just stash things as global module attributes.
+_amp_state = AmpState()
+
+
+def warn_or_err(msg):
+    # Raise msg as a RuntimeError unless _amp_state.hard_override downgrades it to a printed warning.
+    if not _amp_state.hard_override:
+        raise RuntimeError(msg)
+        # I'm not sure if allowing hard_override is a good idea.
+        # + "  If you're sure you know what you're doing, supply " +
+        #                    "hard_override=True to amp.initialize.")
+    print("Warning:  " + msg)
+
+
+distributed = False
+if 'WORLD_SIZE' in os.environ:
+    distributed = int(os.environ['WORLD_SIZE']) > 1
+
+
+def maybe_print(msg, rank0=False):
+    """
+    Print msg when _amp_state.verbosity > 0.
+    With rank0=True in a distributed run (WORLD_SIZE > 1), only rank 0 prints.
+    """
+    if not _amp_state.verbosity > 0:
+        return
+    # get_rank() is only consulted when rank0 is requested and the run is distributed
+    if not (rank0 and distributed) or torch.distributed.get_rank() == 0:
+        print(msg)
+
+
+# def iter_params(param_groups):
+#     for group in param_groups:
+#         for p in group['params']:
+#             yield p
+
+
+def master_params(optimizer):
+    """
+    Generator that walks ``optimizer.param_groups`` in order and yields each entry of every group's ``'params'``.
+
+    Args:
+        optimizer: An optimizer previously returned from ``amp.initialize``.
+    """
+    for param_group in optimizer.param_groups:
+        for param in param_group['params']:
+            yield param
diff --git a/apex/apex/amp/_initialize.py b/apex/apex/amp/_initialize.py
new file mode 100644
index 0000000000000000000000000000000000000000..2193e31280356f23e5d18c94d9080cb9c7750658
--- /dev/null
+++ b/apex/apex/amp/_initialize.py
@@ -0,0 +1,268 @@
+import torch
+from torch._six import string_classes
+import functools
+import numpy as np
+import warnings
+from ._amp_state import _amp_state, warn_or_err, container_abcs
+from .handle import disable_casts
+from .scaler import LossScaler
+from ._process_optimizer import _process_optimizer
+from apex.fp16_utils import convert_network
+from ..fp16_utils import FP16_Optimizer as FP16_Optimizer_general
+from ..optimizers import FP16_Optimizer as FP16_Optimizer_for_fused
+from ..optimizers import FusedAdam
+from ..parallel import DistributedDataParallel as apex_DDP
+from ..parallel.LARC import LARC
+
+
+def to_type(dtype, t):
+    """
+    Cast t to dtype: floating-point tensors and non-tensor objects go through t.to(dtype), other tensors are returned unchanged.
+    """
+    if not isinstance(t, torch.Tensor):
+        # Trust the user's custom batch type, that's all I can do here.
+        return t.to(dtype)
+    if not t.is_cuda:
+        # This should not be a hard error, since it may be legitimate.
+        warnings.warn("An input tensor was not cuda.")
+    # GANs require this.
+    # if t.requires_grad:
+    #     warn_or_err("input data requires grad.  Since input data is not a model parameter,\n"
+    #         "its gradients will not be properly allreduced by DDP.")
+    return t.to(dtype) if t.is_floating_point() else t
+
+
+# Modified from torch.optim.optimizer.py.  This is a bit more general than casted_args in utils.py.
+def applier(value, fn):  # Apply fn to tensors and .to()-capable objects, recursing through mappings and iterables
+    if isinstance(value, torch.Tensor):
+        return fn(value)
+    elif isinstance(value, string_classes):  # strings pass through; tested before the Iterable branch below
+        return value
+    elif isinstance(value, np.ndarray):  # numpy arrays pass through untouched
+        return value
+    elif hasattr(value, "to"): # Allow handling of custom batch classes
+        return fn(value)
+    elif isinstance(value, container_abcs.Mapping):  # result is a plain dict; keys are processed as well as values
+        return {applier(k, fn) : applier(v, fn) for k, v in value.items()}
+    elif isinstance(value, container_abcs.Iterable):  # rebuilt by calling type(value) on a generator of processed items
+        return type(value)(applier(v, fn) for v in value)
+    else:
+        # Do I want this to fire off even if someone chooses to pass something ordinary like
+        # an int or float?  May be more annoying than it's worth.
+        # print("Warning:  unrecognized type in applier.  If your input data is a custom class, "
+        #     "provide it with a .to(dtype) method which converts its floating-point Tensors to dtype. "
+        #     "Amp will check for your custom to() and invoke it to cast the batch's "
+        #     "floating-point Tensors to the appropriate type. "
+        #     "Also, if your data is a custom class, it is your responsibility to ensure that "
+        #     "any Tensors you want to be cuda are already cuda."
+        return value
+
+
+def check_models(models):
+    # Reject models that already sit inside a data-parallel wrapper.
+    # Every wrapper type is tested for each model;
+    # when several match, the last one listed names the error.
+    wrappers = ((torch.nn.parallel.DistributedDataParallel, "torch.nn.parallel.DistributedDataParallel"),
+                (apex_DDP, "apex.parallel.DistributedDataParallel"),
+                (torch.nn.parallel.DataParallel, "torch.nn.parallel.DataParallel"))
+    for model in models:
+        found = [label for wrapper, label in wrappers if isinstance(model, wrapper)]
+        if found:
+            raise RuntimeError("Incoming model is an instance of {}. ".format(found[-1]) +
+                "Parallel wrappers should only be applied to the model(s) AFTER \n"
+                "the model(s) have been returned from amp.initialize.")
+
+
+def check_params_fp32(models):  # warn_or_err() on any floating-point param/buffer that is Half or not on a CUDA device
+    for model in models:
+        for name, param in model.named_parameters():
+            if param.is_floating_point():  # non-floating-point params are not checked
+                if 'Half' in param.type():
+                    warn_or_err("Found param {} with type {}, expected torch.cuda.FloatTensor.\n"
+                        "When using amp.initialize, you do not need to call .half() on your model\n"
+                        "before passing it, no matter what optimization level you choose.".format(
+                        name, param.type()))
+                elif not param.is_cuda:
+                    warn_or_err("Found param {} with type {}, expected torch.cuda.FloatTensor.\n"
+                        "When using amp.initialize, you need to provide a model with parameters\n"
+                        "located on a CUDA device before passing it no matter what optimization level\n"
+                        "you chose. Use model.to('cuda') to use the default device.".format(
+                        name, param.type()))
+
+        # Backward compatibility for PyTorch 0.4
+        if hasattr(model, 'named_buffers'):
+            buf_iter = model.named_buffers()
+        else:
+            buf_iter = model._buffers  # fallback iterates this mapping's keys, so obj below is a name, not a pair
+        for obj in buf_iter:
+            if type(obj)==tuple:  # (name, buffer) pair as produced by named_buffers()
+                name, buf = obj
+            else:
+                name, buf = obj, buf_iter[obj]
+            if buf.is_floating_point():  # same two checks as for parameters above
+                if 'Half' in buf.type():
+                    warn_or_err("Found buffer {} with type {}, expected torch.cuda.FloatTensor.\n"
+                        "When using amp.initialize, you do not need to call .half() on your model\n"
+                        "before passing it, no matter what optimization level you choose.".format(
+                        name, buf.type()))
+                elif not buf.is_cuda:
+                    warn_or_err("Found buffer {} with type {}, expected torch.cuda.FloatTensor.\n"
+                        "When using amp.initialize, you need to provide a model with buffers\n"
+                        "located on a CUDA device before passing it no matter what optimization level\n"
+                        "you chose. Use model.to('cuda') to use the default device.".format(
+                        name, buf.type()))
+
+
+def check_optimizers(optimizers):
+    # Reject optimizers the caller has already wrapped in an FP16_Optimizer.
+    # Both wrapper types are tested; if both match, the second one names the error.
+    wrapped = ((FP16_Optimizer_general, "apex.fp16_utils.FP16_Optimizer"),
+               (FP16_Optimizer_for_fused, "apex.optimizers.FP16_Optimizer"))
+    for optim in optimizers:
+        found = [label for wrapper, label in wrapped if isinstance(optim, wrapper)]
+        if found:
+            raise RuntimeError("An incoming optimizer is an instance of {}. ".format(found[-1]) +
+                               "The optimizer(s) passed to amp.initialize() must be bare \n"
+                               "instances of either ordinary Pytorch optimizers, or Apex fused \n"
+                               "optimizers (currently just FusedAdam, but FusedSGD will be added \n"
+                               "soon).  You should not manually wrap your optimizer in either \n"
+                               "apex.fp16_utils.FP16_Optimizer or apex.optimizers.FP16_Optimizer. \n"
+                               "amp.initialize will take care of that for you (if necessary) based \n"
+                               "on the specified opt_level (and optional overridden properties).")
+
+
+def wrap_fused_adam(optimizer, properties):
+    # Check the one configuration FusedAdam supports, then wrap optimizer in apex.optimizers.FP16_Optimizer.
+    msg = ('Currently, the usage of FusedAdam is restricted to '
+           'amp.initialize(..., opt_level="O2", keep_batchnorm_fp32=False, '
+           'loss_scale=float or "dynamic").  We are working on enabling more general usage.')
+
+    assert properties.master_weights is True, msg
+    assert properties.cast_model_type is torch.float16, msg
+    keep_bn_fp32 = properties.keep_batchnorm_fp32
+    assert keep_bn_fp32 is False or keep_bn_fp32 is None, msg
+
+    if properties.loss_scale == "dynamic":
+        return FP16_Optimizer_for_fused(optimizer, dynamic_loss_scale=True)
+    return FP16_Optimizer_for_fused(optimizer, static_loss_scale=properties.loss_scale)
+
+
+def _initialize(models, optimizers, properties, num_losses=1, cast_model_outputs=None):
+    from apex.parallel import DistributedDataParallel as apex_DDP
+    from .amp import init as amp_init
+
+    optimizers_was_list = False
+    if isinstance(optimizers, torch.optim.Optimizer) or isinstance(optimizers, LARC):
+        optimizers = [optimizers]
+    elif optimizers is None:
+        optimizers = []
+    elif isinstance(optimizers, list):
+        optimizers_was_list = True
+        check_optimizers(optimizers)
+    else:
+        check_optimizers([optimizers])
+        raise TypeError("optimizers must be either a single optimizer or a list of optimizers.")
+
+    if isinstance(models, torch.nn.Module):
+        models_was_list = False
+        models = [models]
+    elif isinstance(models, list):
+        models_was_list = True
+    else:
+        raise TypeError("models must be either a single model or a list of models.")
+
+    check_models(models)
+
+    if not _amp_state.allow_incoming_model_not_fp32:
+        check_params_fp32(models)
+
+
+    # In the future, when FP16_Optimizer can be deprecated and master weights can
+    # become an attribute, remember to stash master weights before casting the model.
+
+    if properties.cast_model_type:
+        if properties.keep_batchnorm_fp32:
+            for model in models:
+                convert_network(model, properties.cast_model_type)
+        else:
+            for model in models:
+                model.to(properties.cast_model_type)
+
+        input_caster = functools.partial(to_type, properties.cast_model_type)
+        if cast_model_outputs is not None:
+            output_caster = functools.partial(to_type, cast_model_outputs)
+        else:
+            output_caster = functools.partial(to_type, torch.float32)
+
+        for model in models:
+            # Patch the forward method to cast incoming data to the correct type, and
+            # outgoing data to float32, so "the user never needs to call .half()."
+            # I like writing things explicitly more than decorators.
+            def patch_forward(old_fwd):
+                def new_fwd(*args, **kwargs):
+                    output = old_fwd(*applier(args, input_caster),
+                                     **applier(kwargs, input_caster))
+                    return applier(output, output_caster)
+                return new_fwd
+
+            model.forward = patch_forward(model.forward)
+
+        # State dict trick to recast any preexisting per-param state tensors 
+        for optimizer in optimizers:
+            optimizer.load_state_dict(optimizer.state_dict())
+    elif cast_model_outputs is not None:
+        output_caster = functools.partial(to_type, cast_model_outputs)
+
+        for model in models:
+            def patch_forward(old_fwd):
+                def new_fwd(*args, **kwargs):
+                    output = old_fwd(*args, **kwargs)
+                    return applier(output, output_caster)
+                return new_fwd
+
+            model.forward = patch_forward(model.forward)
+
+    for i, optimizer in enumerate(optimizers):
+        # Still need to special case this for the first pass
+        if isinstance(optimizer, FusedAdam):
+            optimizers[i] = wrap_fused_adam(optimizer, properties)
+        else:
+            optimizers[i] = _process_optimizer(optimizer, properties)
+
+    _amp_state.loss_scalers = []
+    for _ in range(num_losses):
+        _amp_state.loss_scalers.append(LossScaler(properties.loss_scale,
+                                                  min_loss_scale=_amp_state.min_loss_scale,
+                                                  max_loss_scale=_amp_state.max_loss_scale))
+
+    if properties.patch_torch_functions:
+        # handle is unused here. It's accessible later through a global value anyway.
+        handle = amp_init(loss_scale=properties.loss_scale, verbose=(_amp_state.verbosity == 2))
+        for optimizer in optimizers:
+            # Disable Amp casting for the optimizer step, because it should only be
+            # applied to FP32 master params anyway.
+            def patch_step(old_step):
+                def new_step(*args, **kwargs):
+                    with disable_casts():
+                        output = old_step(*args, **kwargs)
+                    return output
+                return new_step
+
+            optimizer.step = patch_step(optimizer.step)
+
+    if optimizers_was_list:
+        if models_was_list:
+            return models, optimizers
+        else:
+            return models[0], optimizers
+    else:
+        if models_was_list:
+            if len(optimizers) == 0:
+                return models
+            else:
+                return models, optimizers[0]
+        else:
+            if len(optimizers) == 0:
+                return models[0]
+            else:
+                return models[0], optimizers[0]
diff --git a/apex/apex/amp/_process_optimizer.py b/apex/apex/amp/_process_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..0da861730a80f5d55c97b8fc7022fa2fba556da8
--- /dev/null
+++ b/apex/apex/amp/_process_optimizer.py
@@ -0,0 +1,411 @@
+import types
+from ..fp16_utils import master_params_to_model_params
+from ..multi_tensor_apply import multi_tensor_applier
+from ._amp_state import maybe_print
+import torch
+
+
+class AmpOptimizerState(object):
+    def __init__(self):
+        pass
+
+
+def lazy_init_with_master_weights(self):
+        stash = self._amp_stash
+        stash.fp16_groups = []
+        stash.fp32_from_fp16_groups = []
+        stash.fp32_from_fp32_groups = []
+        for i, param_group in enumerate(self.param_groups):
+            # maybe_print("FP16_Optimizer processing param group {}:".format(i))
+            fp16_params_this_group = []
+            fp32_params_this_group = []
+            fp32_from_fp16_params_this_group = []
+            for i, param in enumerate(param_group['params']):
+                if param.requires_grad:
+                    if param.type() == 'torch.cuda.HalfTensor':
+                        # maybe_print("FP16_Optimizer received torch.cuda.HalfTensor with {}"
+                        #             .format(param.size()))
+                        fp16_params_this_group.append(param)
+                        master_param = param.detach().clone().float()
+                        master_param.requires_grad = True
+                        param_group['params'][i] = master_param
+                        fp32_from_fp16_params_this_group.append(master_param)
+                        # Reset existing state dict key to the new master param.
+                        # We still need to recast per-param state tensors, if any, to FP32.
+                        if param in self.state:
+                           self.state[master_param] = self.state.pop(param)
+                    elif param.type() == 'torch.cuda.FloatTensor':
+                        # maybe_print("FP16_Optimizer received torch.cuda.FloatTensor with {}"
+                        #             .format(param.size()))
+                        fp32_params_this_group.append(param)
+                        param_group['params'][i] = param
+                    else:
+                        raise TypeError("Optimizer's parameters must be either "
+                                        "torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
+                                        "Received {}".format(param.type()))
+
+            stash.fp16_groups.append(fp16_params_this_group)
+            stash.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group)
+            stash.fp32_from_fp32_groups.append(fp32_params_this_group)
+
+        stash.all_fp16_params = []
+        for group in stash.fp16_groups:
+            stash.all_fp16_params += group
+
+        stash.all_fp32_from_fp16_params = []
+        for group in stash.fp32_from_fp16_groups:
+            stash.all_fp32_from_fp16_params += group
+
+        stash.all_fp32_from_fp32_params = []
+        for group in stash.fp32_from_fp32_groups:
+            stash.all_fp32_from_fp32_params += group
+
+        # stash.all_fp32_from_fp16_grad_stash = [None for _ in stash.all_fp32_from_fp16_params]
+        stash.all_fp32_from_fp32_grad_stash = [None for _ in stash.all_fp32_from_fp32_params]
+
+        for param in stash.all_fp32_from_fp16_params:
+            param.grad = None
+
+        for param in stash.all_fp32_from_fp32_params:
+            param.grad = None
+
+        # Leverage state_dict() and load_state_dict() to recast preexisting per-param state tensors
+        self.load_state_dict(self.state_dict())
+
+
+def prepare_backward_with_master_weights(self):
+    """Run lazy init if still pending, then clear/stash grads before backward."""
+    amp_stash = self._amp_stash
+
+    if not amp_stash.lazy_init_called:
+        self._lazy_init_maybe_master_weights()
+        amp_stash.lazy_init_called = True
+
+    # FP16 model grads are simply dropped
+    # (set up to leverage grad copy elision).
+    for fp16_param in amp_stash.all_fp16_params:
+        fp16_param.grad = None
+
+    # FP32 model grads are parked in the stash first, then dropped likewise.
+    fp32_params = amp_stash.all_fp32_from_fp32_params
+    grad_stash = amp_stash.all_fp32_from_fp32_grad_stash
+    for slot, fp32_param in enumerate(fp32_params):
+        grad_stash[slot], fp32_param.grad = fp32_param.grad, None
+
+
+def post_backward_with_master_weights(self, scaler):
+    stash = self._amp_stash
+
+    # This is a lot of python overhead...
+    fp16_grads_needing_unscale = []
+    new_fp32_grads = []
+    fp16_grads_needing_unscale_with_stash = []
+    preexisting_fp32_grads = []
+    for fp16_param, fp32_param in zip(stash.all_fp16_params,
+                                      stash.all_fp32_from_fp16_params):
+        if fp16_param.grad is None and fp32_param.grad is not None:
+            continue
+        elif fp16_param.grad is not None and fp32_param.grad is None:
+            fp32_param.grad = torch.empty_like(fp32_param)
+            fp16_grads_needing_unscale.append(fp16_param.grad)
+            new_fp32_grads.append(fp32_param.grad)
+        elif fp16_param.grad is not None and fp32_param.grad is not None:
+            fp16_grads_needing_unscale_with_stash.append(fp16_param.grad)
+            preexisting_fp32_grads.append(fp32_param.grad)
+        else: # fp16_param.grad is None and fp32_param.grad is None:
+            continue
+
+    if len(fp16_grads_needing_unscale) > 0:
+        scaler.unscale(
+            fp16_grads_needing_unscale,
+            new_fp32_grads,
+            scaler.loss_scale(),
+            models_are_masters=False)
+
+    if len(fp16_grads_needing_unscale_with_stash) > 0:
+        scaler.unscale_with_stashed(
+            fp16_grads_needing_unscale_with_stash,
+            preexisting_fp32_grads,
+            preexisting_fp32_grads)
+
+    # fp32 params can be treated as they would be in the "no_master_weights" case.
+    grads_needing_unscale = []
+    grads_needing_unscale_with_stash = []
+    stashed = []
+    for param, stashed_grad in zip(stash.all_fp32_from_fp32_params,
+                                   stash.all_fp32_from_fp32_grad_stash):
+        if param.grad is None and stashed_grad is not None:
+            param.grad = stashed_grad
+        elif param.grad is not None and stashed_grad is None:
+            grads_needing_unscale.append(param.grad)
+        elif param.grad is not None and stashed_grad is not None:
+            grads_needing_unscale_with_stash.append(param.grad)
+            stashed.append(stashed_grad)
+        else: # param.grad is None and stashed_grad is None:
+            continue
+
+    if len(grads_needing_unscale) > 0:
+        scaler.unscale(
+            grads_needing_unscale,
+            grads_needing_unscale,
+            scaler.loss_scale(),
+            models_are_masters=True)
+
+    if len(grads_needing_unscale_with_stash) > 0:
+        scaler.unscale_with_stashed(
+            grads_needing_unscale_with_stash,
+            stashed,
+            grads_needing_unscale_with_stash)
+
+    # Clear the stash.
+    for i in range(len(stash.all_fp32_from_fp32_grad_stash)):
+        stash.all_fp32_from_fp32_grad_stash[i] = None
+
+
+def lazy_init_no_master_weights(self):
+    """Bucket every optimizer param by tensor type and size the grad stashes."""
+    stash = self._amp_stash
+    stash.all_fp16_params = []
+    stash.all_fp32_params = []
+    buckets = {'torch.cuda.HalfTensor': stash.all_fp16_params,
+               'torch.cuda.FloatTensor': stash.all_fp32_params}
+    for param_group in self.param_groups:
+        for param in param_group['params']:
+            if param.type() not in buckets:
+                raise TypeError("Optimizer's parameters must be either "
+                                "torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
+                                "Received {}".format(param.type()))
+            buckets[param.type()].append(param)
+
+    stash.all_fp16_grad_stash = [None] * len(stash.all_fp16_params)
+    stash.all_fp32_grad_stash = [None] * len(stash.all_fp32_params)
+
+
+def prepare_backward_no_master_weights(self):
+    """Run lazy init if still pending, then stash and clear every param grad."""
+    amp_stash = self._amp_stash
+    if not amp_stash.lazy_init_called:
+        self._lazy_init_maybe_master_weights()
+        amp_stash.lazy_init_called = True
+
+    # Same treatment for both dtypes: park the current grad in the matching
+    # stash slot, then reset it to None (set up to leverage grad copy elision).
+    param_stash_pairs = (
+        (amp_stash.all_fp16_params, amp_stash.all_fp16_grad_stash),
+        (amp_stash.all_fp32_params, amp_stash.all_fp32_grad_stash),
+    )
+    for params, grad_stash in param_stash_pairs:
+        for slot, param in enumerate(params):
+            grad_stash[slot], param.grad = param.grad, None
+
+
+def post_backward_no_master_weights(self, scaler):
+    stash = self._amp_stash
+
+    split_types = ((stash.all_fp16_params, stash.all_fp16_grad_stash),
+             (stash.all_fp32_params, stash.all_fp32_grad_stash))
+
+    for params, stashed_grads in split_types:
+        # This is a lot of python overhead...
+        grads_needing_unscale = []
+        grads_needing_unscale_with_stash = []
+        stashed = []
+        for param, stashed_grad in zip(params, stashed_grads):
+            if param.grad is None and stashed_grad is not None:
+                param.grad = stashed_grad
+            elif param.grad is not None and stashed_grad is None:
+                grads_needing_unscale.append(param.grad)
+            elif param.grad is not None and stashed_grad is not None:
+                grads_needing_unscale_with_stash.append(param.grad)
+                stashed.append(stashed_grad)
+            else: # param.grad is None and stashed_grad is None
+                continue
+
+        if len(grads_needing_unscale) > 0:
+            scaler.unscale(
+                grads_needing_unscale,
+                grads_needing_unscale,
+                scaler.loss_scale(),
+                models_are_masters=True)
+
+        if len(grads_needing_unscale_with_stash) > 0:
+            scaler.unscale_with_stashed(
+                grads_needing_unscale_with_stash,
+                stashed,
+                grads_needing_unscale_with_stash)
+
+        # Clear the stash.
+        for i in range(len(stashed_grads)):
+            stashed_grads[i] = None
+
+
+def _master_params_to_model_params(self):
+    """Feed the FP32-from-FP16 masters and their FP16 model params to the copy backend."""
+    amp_stash = self._amp_stash
+    if not multi_tensor_applier.available:
+        group_pairs = zip(amp_stash.fp16_groups, amp_stash.fp32_from_fp16_groups)
+        for fp16_group, fp32_from_fp16_group in group_pairs:
+            master_params_to_model_params(fp16_group, fp32_from_fp16_group)
+        return
+    if amp_stash.all_fp16_params:  # nothing to hand over without FP16 params
+        tensor_lists = [amp_stash.all_fp32_from_fp16_params, amp_stash.all_fp16_params]
+        multi_tensor_applier(amp_stash.multi_tensor_scale, amp_stash.dummy_overflow_buf,
+                             tensor_lists, 1.0)
+
+
+def _process_optimizer(optimizer, properties):
+    if hasattr(optimizer, "_amp_stash"):
+        raise RuntimeError("A given optimizer should only be passed through amp.initialize once.")
+    else:
+        optimizer._amp_stash = AmpOptimizerState()
+
+    optimizer._amp_stash.lazy_init_called = False
+    optimizer._amp_stash.already_patched = False
+    optimizer._amp_stash.params_have_scaled_gradients = False
+
+    for name in ("_lazy_init_maybe_master_weights",
+                 "_master_params_to_model_params",
+                 "_prepare_amp_backward",
+                 "_post_amp_backward"):
+        if hasattr(optimizer, name):
+            raise RuntimeError("Incoming optimizer already has {} defined.".format(name))
+
+    # TODO:  Centralize exposure and import error checking for the C backend.
+    if multi_tensor_applier.available:
+        import amp_C
+        optimizer._amp_stash.multi_tensor_scale = amp_C.multi_tensor_scale
+        optimizer._amp_stash.dummy_overflow_buf = torch.cuda.IntTensor([0]);
+
+    if properties.master_weights:
+        optimizer._lazy_init_maybe_master_weights = types.MethodType(
+            lazy_init_with_master_weights, optimizer)
+
+        optimizer._master_params_to_model_params = types.MethodType(
+            _master_params_to_model_params, optimizer)
+
+        old_step = optimizer.step
+        def new_step(self, closure=None):
+            if closure is not None:
+                raise RuntimeError("Currently, Amp does not support closure use with optimizers.")
+            retval = old_step()
+            self._master_params_to_model_params()
+            # Clear the master grads that wouldn't be zeroed by model.zero_grad()
+            for param in self._amp_stash.all_fp32_from_fp16_params:
+                param.grad = None
+            return retval
+        optimizer.step = types.MethodType(new_step, optimizer)
+
+        old_zero_grad = optimizer.zero_grad
+        def new_zero_grad(self):
+            stash = self._amp_stash
+            if not stash.lazy_init_called:
+                self._lazy_init_maybe_master_weights()
+                stash.lazy_init_called = True
+            # Zero the model grads.
+            for param in stash.all_fp16_params:
+                if param.grad is not None:
+                    param.grad.detach_()
+                    param.grad.zero_()
+            for param in stash.all_fp32_from_fp32_params:
+                if param.grad is not None:
+                    param.grad.detach_()
+                    param.grad.zero_()
+            # Clear the master grads that are independent of model grads
+            for param in self._amp_stash.all_fp32_from_fp16_params:
+                param.grad = None
+        optimizer.zero_grad = types.MethodType(new_zero_grad, optimizer)
+
+        optimizer._prepare_amp_backward = types.MethodType(
+            prepare_backward_with_master_weights, optimizer)
+
+        optimizer._post_amp_backward = types.MethodType(
+            post_backward_with_master_weights, optimizer)
+    else:
+        optimizer._lazy_init_maybe_master_weights = types.MethodType(
+            lazy_init_no_master_weights, optimizer)
+
+        optimizer._prepare_amp_backward = types.MethodType(
+            prepare_backward_no_master_weights, optimizer)
+
+        optimizer._post_amp_backward = types.MethodType(
+            post_backward_no_master_weights, optimizer)
+
+    old_add_param_group = optimizer.add_param_group
+
+    def new_add_param_group(self, new_group):
+        stash = self._amp_stash
+
+        if not stash.lazy_init_called:
+            self._lazy_init_maybe_master_weights()
+            stash.lazy_init_called = True
+
+        assert isinstance(new_group, dict), "param group must be a dict"
+
+        new_params = new_group['params']
+        if isinstance(new_params, torch.Tensor):
+            new_group['params'] = [new_params]
+        elif isinstance(new_params, set):
+            raise TypeError('optimizer parameters need to be organized in ordered collections, but '
+                            'the ordering of tensors in sets will change between runs. Please use a list instead.')
+        else:
+            new_group['params'] = list(new_params)
+
+        if properties.master_weights:
+            # Mutate new_group in-place to use FP32 master params
+            fp16_params_this_group = []
+            fp32_params_this_group = []
+            fp32_from_fp16_params_this_group = []
+            for i, param in enumerate(new_group['params']):
+                if param.requires_grad:
+                    if param.type() == 'torch.cuda.HalfTensor':
+                        fp16_params_this_group.append(param)
+                        master_param = param.detach().clone().float()
+                        master_param.requires_grad = True
+                        new_group['params'][i] = master_param
+                        fp32_from_fp16_params_this_group.append(master_param)
+                    elif param.type() == 'torch.cuda.FloatTensor':
+                        fp32_params_this_group.append(param)
+                        new_group['params'][i] = param
+                    else:
+                        raise TypeError("Optimizer's parameters must be either "
+                                        "torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
+                                        "Received {}".format(param.type()))
+
+            stash.fp16_groups.append(fp16_params_this_group)
+            stash.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group)
+            stash.fp32_from_fp32_groups.append(fp32_params_this_group)
+
+            stash.all_fp16_params += fp16_params_this_group
+            stash.all_fp32_from_fp16_params += fp32_from_fp16_params_this_group
+            stash.all_fp32_from_fp32_params += fp32_params_this_group
+
+            # stash.all_fp32_from_fp16_grad_stash = [None for _ in stash.all_fp32_from_fp16_params]
+            stash.all_fp32_from_fp32_grad_stash += [None for _ in fp32_params_this_group]
+
+            # It should be ok to let params be added with existing .grad attributes.
+            # for param in fp16_params_this_group:
+            #     param.grad = None
+
+            # for param in fp32_from_fp16_params_this_group:
+            #     param.grad = None
+
+            # for param in stash.fp32_params_this_group:
+            #     param.grad = None
+        else:
+            for param in new_group['params']:
+                if param.type() == 'torch.cuda.HalfTensor':
+                    stash.all_fp16_params.append(param)
+                    stash.all_fp16_grad_stash.append(None)
+                elif param.type() == 'torch.cuda.FloatTensor':
+                    stash.all_fp32_params.append(param)
+                    stash.all_fp32_grad_stash.append(None)
+                else:
+                    raise TypeError("Optimizer's parameters must be either "
+                                    "torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
+                                    "Received {}".format(param.type()))
+
+        old_add_param_group(new_group)
+
+    optimizer.add_param_group = types.MethodType(new_add_param_group, optimizer)
+
+    return optimizer
diff --git a/apex/apex/amp/amp.py b/apex/apex/amp/amp.py
new file mode 100644
index 0000000000000000000000000000000000000000..1eed72d07b63730ca1906da44cdc569cb9425433
--- /dev/null
+++ b/apex/apex/amp/amp.py
@@ -0,0 +1,177 @@
+from . import compat, rnn_compat, utils, wrap
+from .handle import AmpHandle, NoOpHandle
+from .lists import functional_overrides, torch_overrides, tensor_overrides
+from ._amp_state import _amp_state
+from .frontend import *
+
+import functools
+import itertools
+
+import torch
+
+
+_DECORATOR_HANDLE = None
+_USER_CAST_REGISTRY = set()
+_USER_PROMOTE_REGISTRY = set()
+
+
+def _decorator_helper(orig_fn, cast_fn, wrap_fn):
+    @functools.wraps(orig_fn)  # keep orig_fn's __name__/__doc__ on the decorated function
+    def wrapper(*args, **kwargs):
+        handle = _DECORATOR_HANDLE
+        if handle is None or not handle.is_active():  # no active handle: call through untouched
+            return orig_fn(*args, **kwargs)
+        inner_cast_fn = utils.verbosify(cast_fn, orig_fn.__name__, handle.verbose)
+        return wrap_fn(orig_fn, inner_cast_fn, handle)(*args, **kwargs)
+    return wrapper
+
+
+# Decorator form
+def half_function(fn):
+    wrap_fn = functools.partial(wrap.make_cast_wrapper, try_caching=True)
+    return _decorator_helper(fn, utils.maybe_half, wrap_fn)
+
+
+def float_function(fn):
+    wrap_fn = functools.partial(wrap.make_cast_wrapper, try_caching=False)
+    return _decorator_helper(fn, utils.maybe_float, wrap_fn)
+
+
+def promote_function(fn):
+    wrap_fn = functools.partial(wrap.make_promote_wrapper)
+    return _decorator_helper(fn, utils.maybe_float, wrap_fn)
+
+
+# Registry form
+def register_half_function(module, name):
+    if not hasattr(module, name):
+        raise ValueError('No function named {} in module {}.'.format(
+            name, module))
+    _USER_CAST_REGISTRY.add((module, name, utils.maybe_half))
+
+
+def register_float_function(module, name):
+    if not hasattr(module, name):
+        raise ValueError('No function named {} in module {}.'.format(
+            name, module))
+    _USER_CAST_REGISTRY.add((module, name, utils.maybe_float))
+
+
+def register_promote_function(module, name):
+    if not hasattr(module, name):
+        raise ValueError('No function named {} in module {}.'.format(
+            name, module))
+    _USER_PROMOTE_REGISTRY.add((module, name))
+
+
+# Top-level function to insert _all_ the hooks.
+def init(enabled=True, loss_scale="dynamic", enable_caching=True, verbose=False, allow_banned=False):
+    global _DECORATOR_HANDLE
+
+    if not enabled:
+        handle = NoOpHandle()
+        _DECORATOR_HANDLE = handle
+        return handle
+
+    handle = AmpHandle(loss_scale, enable_caching, verbose)
+
+    # 0) Force-{fp16, fp32} for user-annotated functions
+    for mod, fn, cast_fn in _USER_CAST_REGISTRY:
+        try_caching = (cast_fn == utils.maybe_half)
+        wrap.cached_cast(mod, fn, cast_fn, handle,
+                         try_caching, verbose)
+    _USER_CAST_REGISTRY.clear()
+
+    # 0.5) Force-promote for user-annotated functions
+    for mod, fn in _USER_PROMOTE_REGISTRY:
+        wrap.promote(mod, fn, handle, verbose)
+    _USER_PROMOTE_REGISTRY.clear()
+
+    # 1) Force-{fp16, fp32} on white- / black-list functions
+    override_modules = [functional_overrides,
+                        torch_overrides,
+                        tensor_overrides]
+    cast_table = [('FP16_FUNCS', utils.maybe_half),
+                  ('FP32_FUNCS', utils.maybe_float)]
+    for module, (list_name, cast_fn) in itertools.product(override_modules,
+                                                          cast_table):
+        for fn in getattr(module, list_name):
+            try_caching = (cast_fn == utils.maybe_half)
+            wrap.cached_cast(module.MODULE, fn, cast_fn, handle,
+                             try_caching, verbose)
+
+    # 1.5) Pre-0.4, put the blacklist methods on HalfTensor and whitelist
+    #      methods on FloatTensor, since they're distinct types.
+    if compat.tensor_is_float_tensor():
+        for fn in tensor_overrides.FP16_FUNCS:
+            wrap.cached_cast(torch.cuda.FloatTensor, fn, utils.maybe_half,
+                             handle, try_caching=True, verbose=verbose)
+        for fn in tensor_overrides.FP32_FUNCS:
+            wrap.cached_cast(torch.cuda.HalfTensor, fn, utils.maybe_float,
+                             handle, try_caching=False, verbose=verbose)
+
+    # 2) Enable type-promotion on multi-arg functions and methods.
+    #    NB: special handling for sequence fns (e.g. `torch.cat`).
+    promote_modules = [torch_overrides, tensor_overrides]
+    promote_table = [('CASTS', wrap.promote),
+                     ('SEQUENCE_CASTS', wrap.sequence_promote)]
+    for promote_mod, (list_name, promote_fn) in itertools.product(promote_modules,
+                                                                  promote_table):
+        for fn in getattr(promote_mod, list_name):
+            promote_fn(promote_mod.MODULE, fn, handle, verbose)
+
+    # 2.5) Pre-0.4, add blacklist methods directly to HalfTensor and FloatTensor types
+    if compat.tensor_is_float_tensor():
+        for cls, (list_name, promote_fn) in itertools.product([torch.cuda.FloatTensor,
+                                                               torch.cuda.HalfTensor],
+                                                              promote_table):
+            for fn in getattr(tensor_overrides, list_name):
+                promote_fn(cls, fn, handle, verbose)
+
+    # 3) For any in-place version of a blacklist function, error if any input is fp16.
+    #    NB: this is overly conservative.
+    for fn in utils.as_inplace(torch_overrides.FP32_FUNCS):
+        wrap.err_if_any_half(torch_overrides.MODULE, fn, handle)
+
+    # 3.5) For any in-place blacklist method, error if called on fp16 tensor
+    for fn in utils.as_inplace(tensor_overrides.FP32_FUNCS):
+        wrap.err_if_arg0_half(tensor_overrides.MODULE, fn, handle, verbose)
+        if compat.tensor_is_float_tensor():
+            wrap.err_if_arg0_half(torch.cuda.HalfTensor, fn, handle, verbose)
+
+    # 4) For other in-place methods, match the type of self tensor
+    for fn in utils.as_inplace(itertools.chain(
+            tensor_overrides.FP16_FUNCS,
+            tensor_overrides.CASTS)):
+        wrap.promote_match_arg0(tensor_overrides.MODULE, fn, handle, verbose)
+        if compat.tensor_is_float_tensor():
+            wrap.promote_match_arg0(torch.cuda.HalfTensor, fn, handle, verbose)
+            wrap.promote_match_arg0(torch.cuda.FloatTensor, fn, handle, verbose)
+
+    # 5) RNNs + RNN cells are whitelisted specially
+    if rnn_compat.has_old_rnns():
+        wrap.rnn_cast(torch.nn.backends.thnn.backend, 'RNN', handle, verbose)
+    if not rnn_compat.has_old_rnns():
+        # Patch in our own indirection of `_VF` in modules/rnn s.t. it is mutable.
+        torch.nn.modules.rnn._VF = rnn_compat.VariableFunctionsShim()
+        # Wrap all the rnns
+        for x in rnn_compat.RNN_NAMES:
+            wrap.new_rnn_cast(x.upper(), handle, verbose)
+
+    # Wrap all the RNN cells
+    rnn_compat.whitelist_rnn_cells(handle, verbose)
+
+    # 6) Place error+print message on banned functions.
+    #    Or, if allow_banned, then cast to FP32.
+    for fn, err_msg in functional_overrides.BANNED_FUNCS:
+        if allow_banned:
+            wrap.cached_cast(functional_overrides.MODULE, fn, utils.maybe_float,
+                             handle, try_caching=True, verbose=verbose)
+        else:
+            wrap.err_if_any_half(functional_overrides.MODULE, fn, handle, err_msg)
+
+    _DECORATOR_HANDLE = handle
+
+    _amp_state.handle = handle
+
+    return handle
diff --git a/apex/apex/amp/compat.py b/apex/apex/amp/compat.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a4edc2228c65a81b873a87d7abef89b6a968d22
--- /dev/null
+++ b/apex/apex/amp/compat.py
@@ -0,0 +1,42 @@
+import torch
+
+# True for post-0.4, when Variables/Tensors merged.
+def variable_is_tensor():
+    v = torch.autograd.Variable()
+    return isinstance(v, torch.Tensor)
+
+def tensor_is_variable():
+    x = torch.Tensor()
+    return type(x) == torch.autograd.Variable
+
+# False for post-0.4
+def tensor_is_float_tensor():
+    x = torch.Tensor()
+    return type(x) == torch.FloatTensor
+
+# Akin to `torch.is_tensor`, but returns True for Variable
+# objects in pre-0.4.
+def is_tensor_like(x):
+    return torch.is_tensor(x) or isinstance(x, torch.autograd.Variable)
+
+# Wraps `torch.is_floating_point` if present, otherwise checks
+# the suffix of `x.type()`.
+def is_floating_point(x):
+    """Report whether `x` is floating point; False if `x` has no `.type()`."""
+    if hasattr(torch, 'is_floating_point'):
+        return torch.is_floating_point(x)
+    float_suffixes = ('FloatTensor', 'HalfTensor', 'DoubleTensor')
+    try:
+        # Fallback for torch builds lacking `torch.is_floating_point`.
+        return x.type().endswith(float_suffixes)
+    except AttributeError:
+        return False
+
+def scalar_python_val(x):
+    """Return `x.item()` when available, else element 0 of `x`."""
+    if hasattr(x, 'item'):
+        return x.item()
+    # No `.item()` available: index element 0, going through `.data`
+    # when `x` is an autograd Variable.
+    source = x.data if isinstance(x, torch.autograd.Variable) else x
+    return source[0]
diff --git a/apex/apex/amp/frontend.py b/apex/apex/amp/frontend.py
new file mode 100644
index 0000000000000000000000000000000000000000..e3cc792cb3be7eedb1c5dbf7c92e9628b3013c10
--- /dev/null
+++ b/apex/apex/amp/frontend.py
@@ -0,0 +1,399 @@
+import torch
+from ._initialize import _initialize
+from ._amp_state import _amp_state, warn_or_err, maybe_print
+
+
+class Properties(object):
+    """
+    This class has two purposes: to establish a set of default properties,
+    and to route setting of these attributes through __setattr__ so that (in theory)
+    they can be checked for consistency with other existing args.
+    """
+    def __init__(self):
+        self.options = {  # stored via object.__setattr__ because "options" does not exist yet
+            "enabled" : False,
+            "opt_level" : None,
+            "cast_model_type" : None,
+            "patch_torch_functions" : False,
+            "keep_batchnorm_fp32" : None,
+            "master_weights" : None,
+            "loss_scale" : 1.0,
+            # Reserved for future functionality
+            # "fused_optimizer" : False,
+            # "enable_ddp_interop" : False,
+            }
+
+    """
+    This function allows updating several options at a time without routing through
+    __setattr__ checks, to avoid "you can't get there from here" scenarios.
+    Currently not intended to be exposed; users are expected to select an opt_level
+    and apply consistent modifications.
+    """
+    def _update_options_dict(self, new_options):  # new_options: dict of option name -> value
+        for k, v in new_options.items():  # keys before an unknown one are already applied when ValueError is raised
+            if k in self.options:
+                self.options[k] = v
+            else:
+                raise ValueError("Tried to set unexpected option {}".format(k))
+    """
+    The members of "options" are not direct attributes of self, so access attempts
+    will roll down to __getattr__.  This borrows from the logic in torch.nn.Module.
+    """
+    def __getattr__(self, name):
+        if "options" in self.__dict__:
+            options = self.__dict__["options"]
+            if name in options:
+                return options[name]
+        raise AttributeError("'{}' object has no attribute '{}'".format(
+            type(self).__name__, name))
+
+    def __setattr__(self, name, value):
+        if "options" in self.__dict__:
+            if name in self.options:  # NOTE: names that are not known options are silently ignored here
+                # print("setting {} {}".format(name, value))
+                if name == "cast_model_type":
+                    if self.opt_level == "O1" and value is not None:
+                        if value is not False:
+                            if value is not torch.float32:
+                                warn_or_err("O1 inserts casts around Torch functions rather than "
+                                            "model weights, so with O1, the model weights themselves "
+                                            "should remain FP32. If you wish to cast the model to a "
+                                            "different type, use opt_level='O2' or 'O3'. " +
+                                            "cast_model_type was {}".format(value))
+                    self.options[name] = value
+                elif name == "patch_torch_functions":
+                    if self.opt_level != "O1" and value:
+                        warn_or_err("Currently, patch_torch_functions=True should only be set by "
+                                    "selecting opt_level='O1'.")
+                    self.options[name] = value
+                elif name == "keep_batchnorm_fp32":
+                    if self.opt_level == "O1" and value is not None:
+                        warn_or_err("With opt_level O1, batchnorm functions are automatically patched "
+                                    "to run in FP32, so keep_batchnorm_fp32 should be None." +
+                                    " keep_batchnorm_fp32 was {}".format(value))
+                    if value == "False":  # the strings "True"/"False" are normalized to booleans
+                        self.options[name] = False
+                    elif value == "True":
+                        self.options[name] = True
+                    else:
+                        assert (value is True or value is False or value is None),\
+                            "keep_batchnorm_fp32 must be a boolean, the string 'True' or 'False', "\
+                            "or None, found keep_batchnorm_fp32={}".format(value)
+                        self.options[name] = value
+                elif name == "master_weights":
+                    if self.opt_level == "O1" and value is not None:
+                        warn_or_err("It doesn't make sense to use master_weights with O1. "
+                                    "With O1, your model weights themselves should be FP32.")
+                    self.options[name] = value
+                elif name == "loss_scale":
+                    if value == "dynamic":
+                        self.options[name] = value
+                    else:
+                        self.options[name] = float(value)  # numeric strings such as "128.0" are accepted
+                else:
+                    self.options[name] = value
+        else:
+            super(Properties, self).__setattr__(name, value)
+
+
+""" O0-O3 are convenience wrappers to establish defaults for typically used mixed precision options. """
+
+class O3:  # preset selected by initialize(opt_level="O3") through the opt_levels table
+    brief = "O3:  Pure FP16 training."
+    more = "Calls .half() on your model, converting the entire model to FP16.\n"\
+        "A casting operation is also inserted to cast incoming Tensors to FP16,\n"\
+        "so you don't need to change your data pipeline.\n"\
+        "This mode is useful for establishing a performance ceiling.\n"\
+        "It's also possible training may 'just work' in this mode.\n"\
+        "If not, try other optimization levels."
+
+    def __call__(self, properties):  # writes the O3 defaults onto `properties` and returns it
+        properties.enabled = True
+        properties.opt_level = "O3"  # assigned first: Properties.__setattr__ checks later options against opt_level
+        properties.cast_model_type = torch.float16
+        properties.patch_torch_functions = False
+        properties.keep_batchnorm_fp32 = False
+        properties.master_weights = False
+        properties.loss_scale = 1.0
+        # properties.fused_optimizer = False
+        # properties.enable_ddp_interop = False
+        return properties # modified in place so this isn't really necessary
+
+
+class O2:  # preset selected by initialize(opt_level="O2") through the opt_levels table
+    brief = "O2:  FP16 training with FP32 batchnorm and FP32 master weights.\n"
+    more = "Calls .half() on your model, converting the entire model (except for batchnorms)\n"\
+        "to FP16.  Batchnorms are retained in FP32 for additional stability.\n"\
+        "The forward pass is patched to cast incoming Tensors to FP16, so you don't need to change\n"\
+        "your data pipeline.\n"\
+        "O2 creates FP32 master weights outside the model and patches any optimizers to update\n"\
+        "these master weights, then copy the master weights into the FP16 model weights.\n"\
+        "Master weights can also improve convergence and stability."
+
+    def __call__(self, properties):  # writes the O2 defaults onto `properties` and returns it
+        properties.enabled = True
+        properties.opt_level = "O2"  # assigned first: Properties.__setattr__ checks later options against opt_level
+        properties.cast_model_type = torch.float16
+        properties.patch_torch_functions = False
+        properties.keep_batchnorm_fp32 = True
+        properties.master_weights = True
+        properties.loss_scale = "dynamic"
+        # properties.fused_optimizer = False
+        # properties.enable_ddp_interop = False
+        return properties # modified in place so this isn't really necessary
+
+
+class O1:  # preset selected by initialize(opt_level="O1") through the opt_levels table
+    brief = "O1:  Insert automatic casts around Pytorch functions and Tensor methods.\n"
+    more = "The type of your model's weights is not altered.  However, internally,\n"\
+        "Pytorch functions are patched to cast any Tensor Core-friendly ops to FP16 for speed,\n"\
+        "while operations that might benefit from the additional stability of FP32 are patched\n"\
+        "to cast their inputs to fp32.\n"\
+        "O1 is the safest way to try mixed precision training, and is recommended when\n"\
+        "trying mixed precision training for the first time."
+
+    def __call__(self, properties):  # writes the O1 defaults onto `properties` and returns it
+        properties.enabled = True
+        properties.opt_level = "O1"  # assigned first: Properties.__setattr__ checks later options against opt_level
+        properties.cast_model_type = None  # None passes the O1 check in __setattr__ without a warning
+        properties.patch_torch_functions = True
+        properties.keep_batchnorm_fp32 = None
+        properties.master_weights = None
+        properties.loss_scale = "dynamic"
+        # properties.fused_optimizer = False
+        # properties.enable_ddp_interop = False
+        return properties # modified in place so this isn't really necessary
+
+
+class O0:  # preset selected by initialize(opt_level="O0") through the opt_levels table
+    brief = "O0:  Pure FP32 training.\n"
+    more = "Your models are checked to make sure parameters are FP32, but otherwise the\n"\
+        "types of weights and internal Pytorch operations are not altered.  This mode disables any\n"\
+        "FP16 arithmetic, although other optimizations like DDP interop may still be requested.\n"
+
+    def __call__(self, properties):  # writes the O0 defaults onto `properties` and returns it
+        properties.enabled = True
+        properties.opt_level = "O0"  # assigned first: Properties.__setattr__ checks later options against opt_level
+        properties.cast_model_type = torch.float32
+        properties.patch_torch_functions = False
+        properties.keep_batchnorm_fp32 = None
+        properties.master_weights = False
+        properties.loss_scale = 1.0
+        # properties.fused_optimizer = False
+        # properties.enable_ddp_interop = False
+        return properties # modified in place so this isn't really necessary
+
+
+opt_levels = {"O3": O3(),  # opt_level string -> callable that applies that level's defaults to a Properties
+              "O2": O2(),
+              "O1": O1(),
+              "O0": O0()}  # initialize() rejects any opt_level that is not a key of this dict
+
+
+# allow user to directly pass Properties struct as well?
+def initialize(
+    models,
+    optimizers=None,
+    enabled=True,
+    opt_level="O1",
+    cast_model_type=None,
+    patch_torch_functions=None,
+    keep_batchnorm_fp32=None,
+    master_weights=None,
+    loss_scale=None,
+    cast_model_outputs=None,
+    num_losses=1,
+    verbosity=1,
+    min_loss_scale=None,
+    max_loss_scale=2.**24
+    ):
+    """
+    Initialize your models, optimizers, and the Torch tensor and functional namespace according to the
+    chosen ``opt_level`` and overridden properties, if any.
+
+    ``amp.initialize`` should be called **after** you have finished
+    constructing your model(s) and
+    optimizer(s), but **before** you send your model through any DistributedDataParallel wrapper.
+    See `Distributed training`_ in the Imagenet example.
+
+    Currently, ``amp.initialize`` should only be called **once**,
+    although it can process an arbitrary number of
+    models and optimizers (see the corresponding `Advanced Amp Usage topic`_).
+    If you think your use case requires ``amp.initialize`` to be called more than once,
+    `let us know`_.
+
+    Any property keyword argument that is not ``None`` will be interpreted as a manual override.
+
+    To prevent having to rewrite anything else in your script, name the returned models/optimizers
+    to replace the passed models/optimizers, as in the code sample below.
+
+    Args:
+        models (torch.nn.Module or list of torch.nn.Modules):  Models to modify/cast.
+        optimizers (optional, torch.optim.Optimizer or list of torch.optim.Optimizers):  Optimizers to modify/cast.
+            REQUIRED for training, optional for inference.
+        enabled (bool, optional, default=True):  If False, renders all Amp calls no-ops, so your script
+            should run as if Amp were not present.
+        opt_level (str, optional, default="O1"):  Pure or mixed precision optimization level.  Accepted values are
+            "O0", "O1", "O2", and "O3", explained in detail above.
+        cast_model_type (``torch.dtype``, optional, default=None):  Optional property override, see
+            above.
+        patch_torch_functions (bool, optional, default=None):  Optional property override.
+        keep_batchnorm_fp32 (bool or str, optional, default=None):  Optional property override.  If
+            passed as a string, must be the string "True" or "False".
+        master_weights (bool, optional, default=None):  Optional property override.
+        loss_scale (float or str, optional, default=None):  Optional property override.  If passed as a string,
+            must be a string representing a number, e.g., "128.0", or the string "dynamic".
+        cast_model_outputs (torch.dtype, optional, default=None):  Option to ensure that the outputs
+            of your model(s) are always cast to a particular type regardless of ``opt_level``.
+        num_losses (int, optional, default=1):  Option to tell Amp in advance how many losses/backward
+            passes you plan to use.  When used in conjunction with the ``loss_id`` argument to
+            ``amp.scale_loss``, enables Amp to use a different loss scale per loss/backward pass,
+            which can improve stability.  See "Multiple models/optimizers/losses"
+            under `Advanced Amp Usage`_ for examples.  If ``num_losses`` is left to 1, Amp will still
+            support multiple losses/backward passes, but use a single global loss scale
+            for all of them.
+        verbosity (int, default=1):  Set to 0 to suppress Amp-related output.
+        min_loss_scale (float, default=None):  Sets a floor for the loss scale values that can be chosen by dynamic
+            loss scaling.  The default value of None means that no floor is imposed.
+            If dynamic loss scaling is not used, `min_loss_scale` is ignored.
+        max_loss_scale (float, default=2.**24):  Sets a ceiling for the loss scale values that can be chosen by
+            dynamic loss scaling.  If dynamic loss scaling is not used, `max_loss_scale` is ignored.
+
+    Returns:
+        Model(s) and optimizer(s) modified according to the ``opt_level``.
+        If either the ``models`` or ``optimizers`` args were lists, the corresponding return value will
+        also be a list.
+
+    Permissible invocations::
+
+        model, optim = amp.initialize(model, optim,...)
+        model, [optim1, optim2] = amp.initialize(model, [optim1, optim2],...)
+        [model1, model2], optim = amp.initialize([model1, model2], optim,...)
+        [model1, model2], [optim1, optim2] = amp.initialize([model1, model2], [optim1, optim2],...)
+
+        # This is not an exhaustive list of the cross product of options that are possible,
+        # just a set of examples.
+        model, optim = amp.initialize(model, optim, opt_level="O0")
+        model, optim = amp.initialize(model, optim, opt_level="O0", loss_scale="dynamic"|128.0|"128.0")
+
+        model, optim = amp.initialize(model, optim, opt_level="O1") # uses loss_scale="dynamic" default
+        model, optim = amp.initialize(model, optim, opt_level="O1", loss_scale=128.0|"128.0")
+
+        model, optim = amp.initialize(model, optim, opt_level="O2") # uses loss_scale="dynamic" default
+        model, optim = amp.initialize(model, optim, opt_level="O2", loss_scale=128.0|"128.0")
+        model, optim = amp.initialize(model, optim, opt_level="O2", keep_batchnorm_fp32=True|False|"True"|"False")
+
+        model, optim = amp.initialize(model, optim, opt_level="O3") # uses loss_scale=1.0 default
+        model, optim = amp.initialize(model, optim, opt_level="O3", loss_scale="dynamic"|128.0|"128.0")
+        model, optim = amp.initialize(model, optim, opt_level="O3", keep_batchnorm_fp32=True|False|"True"|"False")
+
+    The `Imagenet example`_ demonstrates live use of various opt_levels and overrides.
+
+    .. _`Distributed training`:
+        https://github.com/NVIDIA/apex/tree/master/examples/imagenet#distributed-training
+
+    .. _`Imagenet example`:
+        https://github.com/NVIDIA/apex/tree/master/examples/imagenet
+
+    .. _`Advanced Amp Usage`:
+        https://nvidia.github.io/apex/advanced.html
+
+    .. _`Advanced Amp Usage topic`:
+        https://nvidia.github.io/apex/advanced.html#multiple-models-optimizers-losses
+
+    .. _`let us know`:
+        https://github.com/NVIDIA/apex/issues
+    """
+    _amp_state.opt_properties = Properties()  # fresh defaults (enabled=False); also what the disabled path leaves behind
+    _amp_state.verbosity = verbosity
+
+    if not enabled:  # Amp disabled: hand the inputs back untouched
+        if optimizers is None:
+            return models  # a bare value, not a tuple, when no optimizers were passed
+        else:
+            return models, optimizers
+
+    if not torch.backends.cudnn.enabled:
+        raise RuntimeError(
+            "Amp requires torch.backends.cudnn.enabled = True")
+
+    if opt_level not in opt_levels:
+        raise RuntimeError(
+            "Unexpected optimization level {}. ".format(opt_level) +
+            "Options are 'O0', 'O1', 'O2', 'O3'.  Note that in `O0`, `O1`, etc., the prefix O is the letter O, " +
+            "not the number zero.")
+    else:
+        _amp_state.opt_properties = opt_levels[opt_level](_amp_state.opt_properties)  # apply the level's defaults
+        maybe_print("Selected optimization level {}".format(opt_levels[opt_level].brief), True)
+        maybe_print("Defaults for this optimization level are:", True)
+        for k, v in _amp_state.opt_properties.options.items():
+            maybe_print("{:22} : {}".format(k, v), True)
+
+    _amp_state.min_loss_scale = min_loss_scale
+    _amp_state.max_loss_scale = max_loss_scale
+
+    maybe_print("Processing user overrides (additional kwargs that are not None)...", True)
+    # I chose to have the keyword arguments listed directly in the argument list,
+    # instead of **kwargs, so I can't use kwargs.items() here.
+    if enabled is not None:  # each assignment below goes through Properties.__setattr__ and its checks
+        _amp_state.opt_properties.enabled = enabled
+    if opt_level is not None:
+        _amp_state.opt_properties.opt_level = opt_level
+    if cast_model_type is not None:
+        _amp_state.opt_properties.cast_model_type = cast_model_type
+    if patch_torch_functions is not None:
+        _amp_state.opt_properties.patch_torch_functions = patch_torch_functions
+    if keep_batchnorm_fp32 is not None:
+        _amp_state.opt_properties.keep_batchnorm_fp32 = keep_batchnorm_fp32
+    if master_weights is not None:
+        _amp_state.opt_properties.master_weights = master_weights
+    if loss_scale is not None:
+        _amp_state.opt_properties.loss_scale = loss_scale
+
+    maybe_print("After processing overrides, optimization options are:", True)
+    for k, v in _amp_state.opt_properties.options.items():
+        maybe_print("{:22} : {}".format(k, v), True)
+
+    return _initialize(models, optimizers, _amp_state.opt_properties, num_losses, cast_model_outputs)
+
+
+# TODO:  is this necessary/useful?
+# def check_option_consistency(enabled=True,
+#                              opt_level=None,
+#                              cast_model_type=None,
+#                              patch_torch_functions=None,
+#                              keep_batchnorm_fp32=None,
+#                              master_weights=None,
+#                              loss_scale=None,
+#                              enable_ddp_interop=None,
+#                              hard_override=False):
+#     """
+#     Utility function that enables users to quickly check if the option combination they intend
+#     to use is permitted.  ``check_option_consistency`` does not require models or optimizers
+#     to be constructed, and can be called at any point in the script.  ``check_option_consistency``
+#     is totally self-contained; it does not set any amp global state or affect anything outside
+#     of itself.
+#     """
+#
+#     if not enabled:
+#         return
+#
+#     if opt_level not in opt_levels:
+#         raise RuntimeError("Unexpected optimization level.  Options are 'O0', 'O1', 'O2', 'O3'.")
+#     else:
+#         opt_properties = opt_levels[opt_level](Properties())
+#         print("Selected optimization level {}", opt_levels[opt_level].brief)
+#         print("Defaults for this optimization level are:")
+#         for k, v in opt_properties.options:
+#             print("{:22} : {}".format(k, v))
+#
+#     print("Processing user overrides (additional kwargs that are not None)...")
+#     for k, v in kwargs:
+#         if k not in _amp_state.opt_properties.options:
+#             raise RuntimeError("Unexpected kwarg {}".format(k))
+#         if v is not None:
+#             setattr(opt_properties, k, v)
+#
+#     print("After processing overrides, optimization options are:")
+#     for k, v in opt_properties.options:
+#         print("{:22} : {}".format(k, v))
diff --git a/apex/apex/amp/handle.py b/apex/apex/amp/handle.py
new file mode 100644
index 0000000000000000000000000000000000000000..957ad4d940ba3ac4ce2bc634873996ab878454e2
--- /dev/null
+++ b/apex/apex/amp/handle.py
@@ -0,0 +1,280 @@
+import contextlib
+import warnings
+import torch
+
+from . import utils
+from .opt import OptimWrapper
+from .scaler import LossScaler
+from ._amp_state import _amp_state, master_params, maybe_print
+from ..fp16_utils import FP16_Optimizer as FP16_Optimizer_general
+from ..optimizers import FP16_Optimizer as FP16_Optimizer_for_fused
+from ..parallel.LARC import LARC
+
+
+# There's no reason to expose the notion of a "handle". Everything can happen through amp.* calls.
+@contextlib.contextmanager
+def scale_loss(loss,
+               optimizers,
+               loss_id=0,
+               model=None,
+               delay_unscale=False,
+               delay_overflow_check=False):  # True: loss_scaler.update_scale() is not called on exit
+    """
+    On context manager entrance, creates ``scaled_loss = (loss.float())*current loss scale``.
+    ``scaled_loss`` is yielded so that the user can call ``scaled_loss.backward()``::
+
+        with amp.scale_loss(loss, optimizer) as scaled_loss:
+            scaled_loss.backward()
+
+    On context manager exit (if ``delay_unscale=False``), the gradients are checked for infs/NaNs
+    and unscaled, so that ``optimizer.step()`` can be called.
+
+    .. note::
+        If Amp is using explicit FP32 master params (which is the default for ``opt_level=O2``, and
+        can also be manually enabled by supplying ``master_weights=True`` to ``amp.initialize``)
+        any FP16 gradients are copied to FP32 master gradients before being unscaled.
+        ``optimizer.step()`` will then apply the unscaled master gradients to the master params.
+
+    .. warning::
+        If Amp is using explicit FP32 master params, only the FP32 master gradients will be
+        unscaled.  The direct ``.grad`` attributes of any FP16
+        model params will remain scaled after context manager exit.
+        This subtlety affects gradient clipping.  See "Gradient clipping" under
+        `Advanced Amp Usage`_ for best practices.
+
+    Args:
+        loss(Tensor):  Typically a scalar Tensor. The ``scaled_loss`` that the context
+            manager yields is simply ``loss.float()*loss_scale``, so in principle
+            ``loss`` could have more than one element, as long as you call
+            ``backward()`` on ``scaled_loss`` appropriately within the context manager body.
+        optimizers:  All optimizer(s) for which the current backward pass is creating gradients.
+            Must be an optimizer or list of optimizers returned from an earlier call
+            to ``amp.initialize``.  For example use with multiple optimizers, see
+            "Multiple models/optimizers/losses" under `Advanced Amp Usage`_.
+        loss_id(int, optional, default=0):  When used in conjunction with the ``num_losses`` argument
+            to ``amp.initialize``, enables Amp to use a different loss scale per loss.  ``loss_id``
+            must be an integer between 0 and ``num_losses`` that tells Amp which loss is
+            being used for the current backward pass.  See "Multiple models/optimizers/losses"
+            under `Advanced Amp Usage`_ for examples.  If ``loss_id`` is left unspecified, Amp
+            will use the default global loss scaler for this backward pass.
+        model(torch.nn.Module, optional, default=None):  Currently unused, reserved to enable future
+            optimizations.
+        delay_unscale(bool, optional, default=False):  ``delay_unscale`` is never necessary, and
+            the default value of ``False`` is strongly recommended.
+            If ``True``, Amp will not unscale the gradients or perform model->master
+            gradient copies on context manager exit.
+            ``delay_unscale=True`` is a minor ninja performance optimization and can result
+            in weird gotchas (especially with multiple models/optimizers/losses),
+            so only use it if you know what you're doing.
+            "Gradient accumulation across iterations" under `Advanced Amp Usage`_
+            illustrates a situation where this CAN (but does not need to) be used.
+
+    .. warning::
+        If ``delay_unscale`` is ``True`` for a given backward pass, ``optimizer.step()`` cannot be
+        called yet after context manager exit, and must wait for another, later backward context
+        manager invocation with ``delay_unscale`` left to False.
+
+    .. _`Advanced Amp Usage`:
+        https://nvidia.github.io/apex/advanced.html
+    """
+    if not hasattr(_amp_state, "opt_properties"):
+        raise RuntimeError("Invoked 'with amp.scale_loss`, but internal Amp state has not been initialized.  "
+                           "model, optimizer = amp.initialize(model, optimizer, opt_level=...) must be called "
+                           "before `with amp.scale_loss`.")
+
+    if not _amp_state.opt_properties.enabled:  # Amp disabled: yield the loss unchanged and do nothing on exit
+        yield loss
+        return
+
+    if isinstance(optimizers, torch.optim.Optimizer) or isinstance(optimizers, LARC):
+        optimizers = [optimizers]
+
+    # this is what happens when i have to support tools from different sources under the same API...
+    # TODO:  Rewrite FusedAdam to use multi-tensor apply and the same loss scaler.
+    if isinstance(optimizers, FP16_Optimizer_for_fused):
+        loss_scaler, loss_scale = None, optimizers.cur_scale  # bind loss_scaler so the checks below cannot hit an unbound name
+    else:
+        loss_scaler = _amp_state.loss_scalers[loss_id]
+        loss_scale = loss_scaler.loss_scale()
+
+    if ((not _amp_state.opt_properties.master_weights)
+        and loss_scaler is not None and (not loss_scaler.dynamic)
+        and loss_scale == 1.0):
+        yield loss.float()
+        # Needing to drop the cache here as well is an ugly gotcha.
+        # But for now I think it's necessary to short-circuit.
+        # Probably ok to skip this if not delay_unscale
+        if _amp_state.opt_properties.patch_torch_functions:
+            _amp_state.handle._clear_cache()
+        return
+
+    if not delay_unscale:
+        if isinstance(optimizers, list):
+            for optimizer in optimizers:
+                if not optimizer._amp_stash.params_have_scaled_gradients:
+                    optimizer._prepare_amp_backward()
+
+    yield (loss.float())*loss_scale
+
+    if delay_unscale:
+        for optimizer in optimizers:
+            optimizer._amp_stash.params_have_scaled_gradients = True
+    else:
+        # FusedAdam and FusedSGD will take care of unscaling as part of their step() methods.
+        if not isinstance(optimizers, FP16_Optimizer_for_fused):
+            loss_scaler.clear_overflow_state()
+            for optimizer in optimizers:
+                optimizer._post_amp_backward(loss_scaler)
+                optimizer._amp_stash.params_have_scaled_gradients = False
+            # For future fused optimizers that enable sync-free dynamic loss scaling,
+            # should_skip will always be False.
+            should_skip = False if delay_overflow_check else loss_scaler.update_scale()
+            if should_skip:
+                for optimizer in optimizers:
+                    if not optimizer._amp_stash.already_patched:
+                        # Close on loss_scaler and loss_id as well, to be safe.  Probably not
+                        # necessary because amp.scale_loss is already creating a temporary scope.
+                        def patch_step(opt, loss_scaler, loss_id):
+                            opt_step = opt.step
+                            def skip_step(closure=None):  # one-shot replacement: restores the real step() when called
+                                if closure is not None:
+                                    raise RuntimeError("Currently, Amp does not support closure use with optimizers.")
+                                maybe_print(("Gradient overflow.  Skipping step, loss scaler " +
+                                             "{} reducing loss scale to {}").format(loss_id,
+                                             loss_scaler.loss_scale()))
+                                if hasattr(opt._amp_stash, "all_fp32_from_fp16_params"):
+                                    # Clear the master grads that wouldn't be zeroed by model.zero_grad()
+                                    for param in opt._amp_stash.all_fp32_from_fp16_params:
+                                        param.grad = None
+                                opt.step = opt_step
+                                opt._amp_stash.already_patched = False
+                            return skip_step
+                        optimizer.step = patch_step(optimizer, loss_scaler, loss_id)
+                        optimizer._amp_stash.already_patched = True
+
+    # Probably ok to skip this if not delay_unscale
+    if _amp_state.opt_properties.patch_torch_functions:
+        _amp_state.handle._clear_cache()
+
+
+# Free-function version of AmpHandle.disable_casts; another step toward removing the "AmpHandle" concept.
+@contextlib.contextmanager
+def disable_casts():
+    _amp_state.handle._is_active = False
+    try:
+        yield
+    finally:  # runs even when the with-body raises, so casts are never left switched off
+        _amp_state.handle._is_active = True
+
+class AmpHandle(object):
+    """State holder for the old handle-based Amp API.
+
+    Tracks whether casting is active, a cache dictionary, a default
+    ``LossScaler`` and the (mod, fn, func) triples recorded through
+    ``_save_func`` for ``_deactivate`` to hand to ``utils.set_func``.
+    """
+
+    def __init__(self, loss_scale="dynamic", enable_caching=True, verbose=False):
+        self._enable_caching = enable_caching
+        self._verbose = verbose
+        self._cache = dict()
+        self._default_scaler = LossScaler(loss_scale)
+        self._is_active = True
+        self._all_wrappers = []
+
+    def is_active(self):
+        """Return True unless casts are currently disabled."""
+        return self._is_active
+
+    @contextlib.contextmanager
+    def _disable_casts(self):
+        """Mark the handle inactive for the duration of the ``with`` body."""
+        self._is_active = False
+        try:
+            yield
+        finally:
+            # Restore even if the body raised, so casts are not left disabled.
+            self._is_active = True
+
+    def wrap_optimizer(self, optimizer, num_loss=1):
+        """Return an ``OptimWrapper`` for ``optimizer``; the default scaler is dropped."""
+        self._default_scaler = None
+        return OptimWrapper(optimizer, self, num_loss)
+
+    @contextlib.contextmanager
+    def scale_loss(self, loss, optimizer):
+        """Always raise: the handle-based ``scale_loss`` API is no longer supported.
+
+        Raises:
+            RuntimeError: on entering the context manager.
+        """
+        raise RuntimeError("The old Amp API is no longer supported.  Please move to the new API, "
+            "documented here:  https://nvidia.github.io/apex/amp.html.  Transition guide:  "
+            "https://nvidia.github.io/apex/amp.html#transition-guide-for-old-api-users")
+        # Unreachable; kept so this stays a generator function and the error
+        # is still raised on entering the ``with`` block rather than at call time.
+        yield loss
+
+    def _clear_cache(self):
+        """Empty the cache dictionary."""
+        self._cache.clear()
+
+    # Experimental support for saving / restoring uncasted versions of functions
+    def _save_func(self, mod, fn, func):
+        """Record a (mod, fn, func) triple for ``_deactivate`` to process."""
+        self._all_wrappers.append((mod, fn, func))
+
+    def _deactivate(self):
+        """Call ``utils.set_func`` for every saved triple, then forget them."""
+        for mod, fn, func in self._all_wrappers:
+            utils.set_func(mod, fn, func)
+        self._all_wrappers = []
+
+    @property
+    def has_cache(self):
+        """Whether caching was enabled at construction."""
+        return self._enable_caching
+
+    @property
+    def cache(self):
+        """The cache dictionary itself."""
+        return self._cache
+
+    def remove_cache(self, param):
+        """Drop ``param`` from the cache if caching is on and it is present."""
+        if self.has_cache and param in self.cache:
+            del self.cache[param]
+
+    @property
+    def verbose(self):
+        """The ``verbose`` flag given at construction."""
+        return self._verbose
+
+class NoOpHandle(object):  # inert stand-in exposing the same method names as AmpHandle
+    def is_active(self):
+        return False  # a no-op handle never casts
+
+    @contextlib.contextmanager
+    def _disable_casts(self):
+        yield  # nothing to switch off or restore
+
+    def wrap_optimizer(self, optimizer, num_loss=1):
+        return OptimWrapper(optimizer, self, num_loss)
+
+    @contextlib.contextmanager
+    def scale_loss(self, loss, optimizer):
+        yield loss  # loss is passed through without scaling
+
+    @property
+    def has_cache(self):
+        return False
+
+    @property
+    def verbose(self):
+        return False
+
+    def _clear_cache(self):
+        """Do nothing; there is no cache."""
+
+    def _deactivate(self):
+        """Do nothing; no functions were saved."""
diff --git a/apex/apex/amp/lists/__init__.py b/apex/apex/amp/lists/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/apex/apex/amp/lists/functional_overrides.py b/apex/apex/amp/lists/functional_overrides.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ef4a31a989dff68b37ecaed256ac86b346782da
--- /dev/null
+++ b/apex/apex/amp/lists/functional_overrides.py
@@ -0,0 +1,77 @@
+
+# TODO: think about the following two. They do weird things.
+# - torch.nn.utils.clip_grad (but it should always be fp32 anyway)
+# - torch.nn.utils.weight_norm
+
+# Notes:
+# F.instance_norm uses batch_norm internally. Which correctly handles
+#   fp16 in/out with fp32 weights. So we shouldn't do anything for
+#   either of these.
+# F.normalize calls `input.norm()` internally, so it's redundant, but
+#   kept here in case impl. changes.
+# F.cosine_similarity is same: calls `x.norm()` internally.
+
+import torch.nn.functional
+
+MODULE = torch.nn.functional  # namespace the function names below belong to
+
+FP16_FUNCS = [  # presumably the functions amp runs in half precision; the consumer is not in this file
+    'conv1d',
+    'conv2d',
+    'conv3d',
+    'conv_transpose1d',
+    'conv_transpose2d',
+    'conv_transpose3d',
+    'conv_tbc', # Undocumented / maybe new?
+    'linear',
+]
+
+FP32_FUNCS = [  # presumably the functions amp keeps in single precision; confirm against the consumer
+
+    # Interpolation/Upsampling
+    'interpolate',
+
+    # Pointwise
+    'softplus',
+    'softmin',
+    'log_softmax',
+    'softmax',
+
+    # Normalization
+    'layer_norm',
+    'group_norm',
+    'local_response_norm',
+    'normalize',
+    'cosine_similarity',
+
+    # Loss functions
+    # TODO: which of these can be fp16?
+    'poisson_nll_loss',
+    'cosine_embedding_loss',
+    'cross_entropy',
+    'hinge_embedding_loss',
+    'kl_div',
+    'l1_loss',
+    'mse_loss',
+    'margin_ranking_loss',
+    'multilabel_margin_loss',
+    'multilabel_soft_margin_loss',
+    'multi_margin_loss',
+    'nll_loss',
+    'binary_cross_entropy_with_logits',
+    'smooth_l1_loss',
+    'soft_margin_loss',
+    'triplet_margin_loss'
+]
+
+BANNED_FUNCS = [  # (function name, explanatory message) pairs
+    ('binary_cross_entropy',
+     ("\namp does not work out-of-the-box with `F.binary_cross_entropy` or `torch.nn.BCELoss.` "
+      "It requires that the output of the previous function be already a FloatTensor. \n\n"
+      "Most models have a Sigmoid right before BCELoss. In that case, you can use\n"
+      "    torch.nn.BCEWithLogitsLoss\nto combine Sigmoid+BCELoss into a single layer "
+      "that is compatible with amp.\nAnother option is to add\n"
+      "    amp.register_float_function(torch, 'sigmoid')\nbefore calling `amp.init()`.\n"
+      "If you _really_ know what you are doing, you can disable this warning by passing "
+      "allow_banned=True to `amp.init()`."))
+]
diff --git a/apex/apex/amp/lists/tensor_overrides.py b/apex/apex/amp/lists/tensor_overrides.py
new file mode 100644
index 0000000000000000000000000000000000000000..dde84a288bc795d19e2dd099f1013ffdfeb88a18
--- /dev/null
+++ b/apex/apex/amp/lists/tensor_overrides.py
@@ -0,0 +1,63 @@
+from .. import compat
+from . import torch_overrides
+
+import importlib
+
+import torch
+
+if compat.variable_is_tensor() and not compat.tensor_is_variable():  # compat probes are defined elsewhere
+    MODULE = torch.Tensor
+else:
+    MODULE = torch.autograd.Variable
+
+
+FP16_FUNCS = [  # method names defined on MODULE; extended from torch_overrides at the bottom of the file
+    '__matmul__',
+]
+
+FP32_FUNCS = [
+    '__ipow__',
+    '__pow__',
+    '__rpow__',
+
+    # Cast to fp32 before transfer to CPU
+    'cpu',
+]
+
+CASTS = [
+    '__add__',
+    '__div__',
+    '__eq__',
+    '__ge__',
+    '__gt__',
+    '__iadd__',
+    '__idiv__',
+    '__imul__',
+    '__isub__',
+    '__itruediv__',
+    '__le__',
+    '__lt__',
+    '__mul__',
+    '__ne__',
+    '__radd__',
+    '__rdiv__',
+    '__rmul__',
+    '__rsub__',
+    '__rtruediv__',
+    '__sub__',
+    '__truediv__',
+]
+
+# None of these, but here to make code cleaner.
+SEQUENCE_CASTS = []
+
+# We need to grab all the methods from torch_overrides and add them to
+# the Tensor lists as well, as almost all methods are duplicated
+# between `torch` and `torch.Tensor` (and check with `hasattr`,
+# because a few random ones aren't defined on Tensor)
+_self_mod = importlib.import_module(__name__)  # this module object, so its lists can be fetched by name
+for attrname in ['FP16_FUNCS', 'FP32_FUNCS', 'CASTS', 'SEQUENCE_CASTS']:
+    lst = getattr(_self_mod, attrname)  # the list defined above, extended in place
+    for fn in getattr(torch_overrides, attrname):
+        if hasattr(MODULE, fn):
+            lst.append(fn)
diff --git a/apex/apex/amp/lists/torch_overrides.py b/apex/apex/amp/lists/torch_overrides.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1e0cf146ed615909c884563313271d0acc902df
--- /dev/null
+++ b/apex/apex/amp/lists/torch_overrides.py
@@ -0,0 +1,103 @@
+import torch
+
+from .. import utils
+
+MODULE = torch  # namespace the function names below belong to
+
+FP16_FUNCS = [  # extended with the bmm family further down when the CUDA version allows it
+    # Low level functions wrapped by torch.nn layers.
+    # The wrapper layers contain the weights which are then passed in as a parameter
+    # to these functions.
+    'conv1d',
+    'conv2d',
+    'conv3d',
+    'conv_transpose1d',
+    'conv_transpose2d',
+    'conv_transpose3d',
+    'conv_tbc',
+    'prelu',
+
+    # BLAS
+    'addmm',
+    'addmv',
+    'addr',
+    'matmul',
+    'mm',
+    'mv',
+]
+
+FP32_FUNCS = [  # receives the bmm family instead when the CUDA version check below fails
+    # Pointwise
+    'acos',
+    'asin',
+    'cosh',
+    'erfinv',
+    'exp',
+    'expm1',
+    'log',
+    'log10',
+    'log2',
+    'reciprocal',
+    'rsqrt',
+    'sinh',
+    'tan',
+
+    # Other math
+    'pow',
+
+    # Reduction
+    'cumprod',
+    'cumsum',
+    'dist',
+    'mean',
+    'norm',
+    'prod',
+    'std',
+    'sum',
+    'var',
+
+    # Misc
+    'renorm'
+]
+
+# Before CUDA 9.1, batched matmul was missing fast FP16 kernels, so the bmm
+# functions go on the fp16 list only for CUDA >= 9.1 (else fp32). Compare just
+# (major, minor): a two-part version such as (9, 1) sorts below (9, 1, 0).
+_bmms = ['addbmm',
+         'baddbmm',
+         'bmm']
+if utils.get_cuda_version()[:2] >= (9, 1):
+    FP16_FUNCS.extend(_bmms)
+else:
+    FP32_FUNCS.extend(_bmms)
+
+# Multi-tensor fns that may need type promotion
+CASTS = [  # names only; how the promotion is applied is decided by the code that consumes this list
+    # Multi-tensor math
+    'addcdiv',
+    'addcmul',
+    'atan2',
+    'cross',
+    'bilinear',
+
+    # Element-wise _or_ tensor-wise math
+    'add',
+    'div',
+    'mul',
+
+    # Comparison
+    'eq',
+    'equal',
+    'ge',
+    'gt',
+    'le',
+    'lt',
+    'ne'
+]
+
+# Functions that take sequence arguments. We need to inspect the whole
+# sequence and cast to the widest type.
+SEQUENCE_CASTS = [  # both take a sequence of tensors as their first argument
+    'cat',
+    'stack'
+]
diff --git a/apex/apex/amp/opt.py b/apex/apex/amp/opt.py
new file mode 100644
index 0000000000000000000000000000000000000000..baf311684de179e3026f15bae99d7aae9e266ce6
--- /dev/null
+++ b/apex/apex/amp/opt.py
@@ -0,0 +1,103 @@
+import contextlib
+import warnings
+
+from .scaler import LossScaler, master_params
+from ._amp_state import maybe_print
+
+import numpy as np
+
+class OptimWrapper(object):
+    """Optimizer proxy that gives each of `num_loss` losses its own dynamic LossScaler."""
+    def __init__(self, optimizer, amp_handle, num_loss):
+        self._optimizer = optimizer
+        self._amp_handle = amp_handle
+        self._num_loss = num_loss
+        self._loss_idx = 0  # which loss the next scale_loss() call belongs to; reset by step()
+        self._skip_next = [False] * num_loss  # per-loss overflow flags, consumed by step()
+        self._loss_scaler = [LossScaler('dynamic') for _ in range(num_loss)]
+
+    @contextlib.contextmanager
+    def scale_loss(self, loss):  # yields the scaled loss; grads are unscaled once the with-body ends
+        if not self._amp_handle.is_active():
+            yield loss
+            return
+
+        # With several losses per optimizer, set aside the grads accumulated so
+        # far: once they are all mixed together, this particular loss's grads
+        # could no longer be unscaled on their own.
+        cached_grads = []
+        if self._loss_idx > 0:
+            for p in master_params(self._optimizer):
+                if p.grad is not None:
+                    cached_grads.append(p.grad.data.detach().clone())
+                else:
+                    cached_grads.append(None)
+            self._optimizer.zero_grad()
+
+        loss_scale = self._cur_loss_scaler().loss_scale()
+        yield loss * loss_scale
+
+        self._cur_loss_scaler().clear_overflow_state()
+        self._cur_loss_scaler().unscale(
+            master_params(self._optimizer),
+            master_params(self._optimizer),
+            loss_scale)
+        self._skip_next[self._loss_idx] = self._cur_loss_scaler().update_scale()
+        self._loss_idx += 1
+
+        if cached_grads:  # add the set-aside grads back on top of the freshly unscaled ones
+            for p, cached_grad in zip(master_params(self._optimizer), cached_grads):
+                if cached_grad is not None:
+                    p.grad.data.add_(cached_grad)
+
+    def _cur_loss_scaler(self):
+        assert 0 <= self._loss_idx < self._num_loss
+        return self._loss_scaler[self._loss_idx]
+
+    def step(self, closure=None):
+        if not self._amp_handle.is_active():
+            return self._optimizer.step(closure=closure)
+
+        self._loss_idx = 0
+
+        for group in self._optimizer.param_groups:
+            for p in group['params']:
+                self._amp_handle.remove_cache(p)
+
+        if closure is not None:
+            raise NotImplementedError(
+                'The `closure` argument is unsupported by the amp ' +
+                'optimizer wrapper.')
+        if any(self._skip_next):  # an overflow in any loss skips the whole update
+            maybe_print('Gradient overflow, skipping update')
+            self._skip_next = [False] * self._num_loss
+        else:
+            return self._optimizer.step(closure=closure)
+
+    # Forward any attribute lookups (only reached when normal lookup fails)
+    def __getattr__(self, attr):
+        if attr == '_optimizer':  # not set yet: fail cleanly instead of recursing without bound
+            raise AttributeError(attr)
+        return getattr(self._optimizer, attr)
+
+    # Forward all torch.optim.Optimizer methods
+    def __getstate__(self):
+        return self._optimizer.__getstate__()
+
+    def __setstate__(self, state):  # must accept the state it forwards
+        return self._optimizer.__setstate__(state)
+
+    def __repr__(self):
+        return self._optimizer.__repr__()
+
+    def state_dict(self):
+        return self._optimizer.state_dict()
+
+    def load_state_dict(self, state_dict):
+        return self._optimizer.load_state_dict(state_dict)
+
+    def zero_grad(self):
+        return self._optimizer.zero_grad()
+
+    def add_param_group(self, param_group):
+        return self._optimizer.add_param_group(param_group)
diff --git a/apex/apex/amp/rnn_compat.py b/apex/apex/amp/rnn_compat.py
new file mode 100644
index 0000000000000000000000000000000000000000..d062ae26587031430b215d66f6062177b9881630
--- /dev/null
+++ b/apex/apex/amp/rnn_compat.py
@@ -0,0 +1,53 @@
+from . import utils, wrap
+
+import torch
+_VF = torch._C._VariableFunctions
+RNN_NAMES = ['rnn_relu', 'rnn_tanh', 'gru', 'lstm']
+
+def _gen_VF_wrapper(name):  # returns a function that forwards its arguments to _VF.<name>
+    def wrapper(*args, **kwargs):
+        return getattr(_VF, name)(*args, **kwargs)  # looked up on _VF at call time, not when the wrapper is made
+    return wrapper
+
+# Some python magic to generate an object that has the rnn cell functions
+# defined on it, all of which call into corresponding _VF version.
+# Intended to patch torch.nn.modules.rnn._VF (aka, the ref named "_VF"
+# imported at module scope within torch.nn.modules.rnn).  This should
+# not affect third-party importers of _VF.py.
+class VariableFunctionsShim(object):
+    """Object carrying one forwarding wrapper per RNN function and per `_cell` variant."""
+    def __init__(self):
+        # Name-major order with '' before '_cell', exactly as two nested loops would visit them.
+        for fn_name in (name + suffix for name in RNN_NAMES for suffix in ['', '_cell']):
+            setattr(self, fn_name, _gen_VF_wrapper(fn_name))
+
+def has_old_rnns():  # True when this torch still exposes LSTMCell on the legacy thnn backend
+    try:
+        torch.nn.backends.thnn.backend.LSTMCell  # attribute probe only; the value is not used
+        return True
+    except AttributeError:  # some link of the attribute chain is missing on this torch version
+        return False
+
+def whitelist_rnn_cells(handle, verbose):
+    """Install cached `utils.maybe_half` casts on the RNN cell functions of this torch version."""
+    legacy = has_old_rnns()
+    if legacy:  # module and function names differ between the old and new RNN code
+        cell_fns = ['RNNReLUCell', 'RNNTanhCell', 'LSTMCell', 'GRUCell']
+        owner = torch.nn.backends.thnn.backend
+    else:
+        cell_fns = [base + '_cell' for base in RNN_NAMES]
+        owner = torch.nn.modules.rnn._VF
+        assert isinstance(owner, VariableFunctionsShim)
+
+    # Insert casts on cell functions
+    for cell_fn in cell_fns:
+        wrap.cached_cast(owner, cell_fn, utils.maybe_half, handle,
+                         try_caching=True, verbose=verbose)
+
+    if legacy:
+        # Fused gru / lstm `backward` calls Tensor.sum() (blacklist) internally,
+        # which leaves grad_input with the wrong type, so casts are disabled there.
+        # TODO: where else is this a problem?
+        for rnn_type in ['GRUFused', 'LSTMFused']:
+            fused = getattr(torch.nn._functions.thnn.rnnFusedPointwise, rnn_type)
+            wrap.disable_casts(fused, 'backward', handle)
diff --git a/apex/apex/amp/scaler.py b/apex/apex/amp/scaler.py
new file mode 100644
index 0000000000000000000000000000000000000000..9fc257a89aa23275a998d29ece5bacf4720f0ea6
--- /dev/null
+++ b/apex/apex/amp/scaler.py
@@ -0,0 +1,210 @@
+import torch
+from ..multi_tensor_apply import multi_tensor_applier
+from ._amp_state import _amp_state, master_params, maybe_print
+from itertools import product
+
+def scale_check_overflow_python(model_grad, master_grad, scale, check_overflow=False):
+    """Write `model_grad * scale` into `master_grad`; return True (writing nothing) on inf/NaN."""
+    if check_overflow:
+        # One inf/NaN element makes the fp32 sum inf/NaN; NaN is the only value unequal to itself.
+        total = float(model_grad.float().sum())
+        if total in (float('inf'), -float('inf')) or total != total:
+            return True
+    if master_grad is not model_grad:  # copy_ probably internally short-circuits this
+        master_grad.copy_(model_grad)
+    if scale != 1.0:
+        master_grad.mul_(scale)
+    return False
+
+def axpby_check_overflow_python(model_grad, stashed_grad, master_grad, scale, check_overflow=False):
+    """Accumulate `scale * model_grad` into `stashed_grad` and point `master_grad` at the result.
+
+    Returns True, modifying nothing, when `check_overflow` is set and the grad sum is inf/NaN.
+    """
+    if check_overflow:
+        total = float(model_grad.float().sum())
+        if total in (float('inf'), -float('inf')) or total != total:
+            return True
+    assert stashed_grad.dtype == master_grad.dtype
+    addend = model_grad.to(master_grad.dtype)
+    stashed_grad.add_(scale, addend)
+    master_grad.data = stashed_grad.data  # rebinds master's data to the stash rather than using copy_
+    return False
+
+class LossScaler(object):
+    """Hold a loss scale and unscale gradients, tracking overflow for dynamic scaling.
+
+    With ``loss_scale="dynamic"`` the scale starts at ``init_scale``; ``update_scale``
+    divides it by ``scale_factor`` after an overflow (not below ``min_loss_scale`` when
+    that is set) and multiplies it by ``scale_factor`` after ``scale_window`` consecutive
+    overflow-free updates (not above ``max_loss_scale``).  Any other value is a fixed scale.
+    """
+    warned_no_fused_kernel = False
+    warned_unscaling_non_fp32_grad = False
+    has_fused_kernel = False
+
+    def __init__(self,
+                 loss_scale,
+                 init_scale=2.**16,
+                 scale_factor=2.,
+                 scale_window=2000,
+                 min_loss_scale=None,
+                 max_loss_scale=2.**24):
+        if loss_scale == "dynamic":
+            self.dynamic = True
+            self._loss_scale = init_scale
+        else:
+            self.dynamic = False
+            self._loss_scale = loss_scale
+        self._scale_factor = scale_factor  # growth/backoff ratio applied by update_scale()
+        self._max_loss_scale = max_loss_scale
+        self._min_loss_scale = min_loss_scale
+        self._scale_seq_len = scale_window
+        self._unskipped = 0
+        self._has_overflow = False
+        self._overflow_buf = torch.cuda.IntTensor([0])
+        if multi_tensor_applier.available:
+            import amp_C
+            LossScaler.has_fused_kernel = multi_tensor_applier.available
+            LossScaler.multi_tensor_scale_cuda = amp_C.multi_tensor_scale
+            LossScaler.multi_tensor_axpby_cuda = amp_C.multi_tensor_axpby
+        else:
+            if not LossScaler.warned_no_fused_kernel:
+                maybe_print(
+                    "Warning:  multi_tensor_applier fused unscale kernel is unavailable, "
+                    "possibly because apex was installed without --cuda_ext --cpp_ext. "
+                    "Using Python fallback.  Original ImportError was: " +
+                    repr(multi_tensor_applier.import_err),
+                    True)
+            LossScaler.has_fused_kernel = False
+            LossScaler.warned_no_fused_kernel = True
+
+    def loss_scale(self):
+        return self._loss_scale
+
+    def unscale_python(self, model_grads, master_grads, scale):
+        for model, master in zip(model_grads, master_grads):
+            if model is not None:
+                if not LossScaler.warned_unscaling_non_fp32_grad:
+                    if master.dtype != torch.float32:
+                        maybe_print(
+                            "Attempting to unscale a grad with type {} ".format(master.type()) +
+                            "Unscaling non-fp32 grads may indicate an error. "
+                            "When using Amp, you don't need to call .half() on your model.")
+                        LossScaler.warned_unscaling_non_fp32_grad = True
+                self._has_overflow = scale_check_overflow_python(model,
+                                                                 master,
+                                                                 1./scale,
+                                                                 self.dynamic)
+                if self._has_overflow and self.dynamic:
+                    break
+
+    # unused_scale keeps some of the old API alive for hopefully a short time.
+    def unscale(self, model_grads, master_grads, unused_scale, models_are_masters=False):
+        if self._has_overflow:
+            return
+
+        scale = self._loss_scale
+
+        if scale == 1.0 and models_are_masters and not self.dynamic:
+            return
+
+        if LossScaler.has_fused_kernel:
+            # The kernel is handed self._overflow_buf; update_scale() later reads
+            # that buffer back to learn whether an overflow occurred.
+            multi_tensor_applier(LossScaler.multi_tensor_scale_cuda,
+                                 self._overflow_buf,
+                                 [model_grads, master_grads],
+                                 1./scale)
+        else:
+            self.unscale_python(model_grads, master_grads, scale)
+
+        # Overflow readback is deferred to update_scale(): with the fused kernel
+        # that costs a single D2H memcopy and sync per update.
+
+    def unscale_with_stashed_python(self,
+                                    model_grads,
+                                    stashed_master_grads,
+                                    master_grads,
+                                    scale):
+        for model, stashed, master in zip(model_grads, stashed_master_grads, master_grads):
+            if model is None and stashed is None:
+                continue
+            else:
+                # NOTE(review): a None model grad with a non-None stash still reaches the axpby helper - confirm.
+                if not LossScaler.warned_unscaling_non_fp32_grad:
+                    if master.dtype != torch.float32:
+                        maybe_print(
+                            "Attempting to unscale a grad with type {} ".format(master.type()) +
+                            "Unscaling non-fp32 grads may indicate an error. "
+                            "When using Amp, you don't need to call .half() on your model.")
+                        LossScaler.warned_unscaling_non_fp32_grad = True
+                self._has_overflow = axpby_check_overflow_python(model,
+                                                                 stashed,
+                                                                 master,
+                                                                 1./scale,
+                                                                 self.dynamic)
+                if self._has_overflow and self.dynamic:
+                    break
+
+    def unscale_with_stashed(self,
+                             model_grads,
+                             stashed_master_grads,
+                             master_grads):
+        if self._has_overflow:
+            return
+
+        scale = self._loss_scale
+
+        if LossScaler.has_fused_kernel:
+            if (not LossScaler.warned_unscaling_non_fp32_grad
+                and master_grads[0].dtype == torch.float16):
+                print("Warning:  unscaling grads that are not FP32. "
+                      "Unscaling non-fp32 grads may indicate an error. "
+                      "When using Amp, you don't need to call .half() on your model.")
+                # Setting this to True unconditionally allows the possibility of an escape
+                # if never-before-seen non-fp32 grads are created in some later iteration.
+                LossScaler.warned_unscaling_non_fp32_grad = True
+            multi_tensor_applier(LossScaler.multi_tensor_axpby_cuda,
+                                 self._overflow_buf,
+                                 [model_grads, stashed_master_grads, master_grads],
+                                 1./scale,
+                                 1.0,
+                                 0) # check only arg 0, aka the incoming model grads, for infs
+        else:
+            self.unscale_with_stashed_python(model_grads,
+                                             stashed_master_grads,
+                                             master_grads,
+                                             scale)
+
+        # Overflow readback is deferred to update_scale(): with the fused kernel
+        # that costs a single D2H memcopy and sync per update.
+
+    def clear_overflow_state(self):
+        self._has_overflow = False
+        if self.has_fused_kernel:
+            self._overflow_buf.zero_()
+
+    # Separate so unscale() can be called more than once before updating.
+    def update_scale(self):
+        """Adjust the dynamic scale from the recorded overflow state; return True to skip the step."""
+        # If the fused kernel is available, we only need one D2H memcopy and sync.
+        if LossScaler.has_fused_kernel and self.dynamic and not self._has_overflow:
+            self._has_overflow = self._overflow_buf.item()
+
+        if self._has_overflow and self.dynamic:
+            should_skip = True
+            if(self._min_loss_scale):
+                self._loss_scale = max(self._min_loss_scale, self._loss_scale/self._scale_factor)
+            else:
+                self._loss_scale = self._loss_scale/self._scale_factor
+            self._unskipped = 0
+        else:
+            should_skip = False
+            self._unskipped += 1
+
+        if self._unskipped == self._scale_seq_len and self.dynamic:
+            self._loss_scale = min(self._max_loss_scale, self._loss_scale*self._scale_factor)
+            self._unskipped = 0
+
+        return should_skip
diff --git a/apex/apex/amp/utils.py b/apex/apex/amp/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..d43b4685bb62dca2d02d3a812c954e5783ba1fe6
--- /dev/null
+++ b/apex/apex/amp/utils.py
@@ -0,0 +1,213 @@
+from . import compat
+
+import functools
+import itertools
+
+import torch
+
+def get_cuda_version():  # CUDA version as an int tuple, e.g. '10.2' -> (10, 2); (0,) when torch was built without CUDA
+    return tuple(int(x) for x in (torch.version.cuda or '0').split('.'))
+
+def is_fp_tensor(x):
+    """Return whether `x` passes both compat checks (tensor-like and floating point).
+
+    A list/tuple qualifies when every member does, recursively; an empty one yields True.
+    """
+    if is_nested(x):
+        return all(is_fp_tensor(member) for member in x)
+    return compat.is_tensor_like(x) and compat.is_floating_point(x)
+
+def is_nested(x):  # True for tuples and lists, including their subclasses
+    return isinstance(x, (tuple, list))
+
+def should_cache(x):
+    """Return True if `x` is a Parameter whose type string is 'FloatTensor'.
+
+    A list/tuple qualifies when every member does, recursively; an empty one yields True.
+    """
+    if is_nested(x):
+        return all(should_cache(member) for member in x)
+    is_param = isinstance(x, torch.nn.parameter.Parameter)
+    return is_param and type_string(x) == 'FloatTensor'
+
+def collect_fp_tensor_types(args, kwargs):
+    """Return the set of type strings of all fp tensors among `args` and the values of `kwargs`."""
+    found = set()
+    def gather(item):  # walk nested lists/tuples down to the individual tensors
+        if is_nested(item):
+            for member in item:
+                gather(member)
+        else:
+            found.add(type_string(item))
+
+    for candidate in itertools.chain(args, kwargs.values()):
+        if is_fp_tensor(candidate):
+            gather(candidate)
+    return found
+
+def type_string(x):  # last dot-separated component of x.type(), e.g. 'torch.cuda.HalfTensor' -> 'HalfTensor'
+    return x.type().rpartition('.')[2]
+
+def maybe_half(x, name='', verbose=False):
+    """Cast a CUDA tensor (or each member of a list/tuple) to half; CPU and half inputs pass through."""
+    if is_nested(x):
+        # Members are cast with the default name/verbose, so no message is printed for them.
+        return type(x)([maybe_half(member) for member in x])
+    if x.is_cuda and type_string(x) != 'HalfTensor':
+        if verbose:
+            print('Float->Half ({})'.format(name))
+        return x.half()
+    return x
+
+def maybe_float(x, name='', verbose=False):
+    """Cast a CUDA tensor (or each member of a list/tuple) to float; CPU and float inputs pass through."""
+    if is_nested(x):
+        # Members are cast with the default name/verbose, so no message is printed for them.
+        return type(x)([maybe_float(member) for member in x])
+    if x.is_cuda and type_string(x) != 'FloatTensor':
+        if verbose:
+            print('Half->Float ({})'.format(name))
+        return x.float()
+    return x
+
+# NB: returns the casted `args` as a new list; mutates `kwargs` in-place
+def casted_args(cast_fn, args, kwargs):
+    """Apply `cast_fn` to every argument that `is_fp_tensor` accepts.
+
+    Positional arguments are processed first, then keyword values in
+    iteration order; everything else is passed through unchanged.
+    """
+    new_args = [cast_fn(arg) if is_fp_tensor(arg) else arg for arg in args]
+    for key in kwargs:
+        candidate = kwargs[key]
+        if is_fp_tensor(candidate):
+            kwargs[key] = cast_fn(candidate)
+    return new_args
+
+def cached_cast(cast_fn, x, cache):
+    """Return `cast_fn(x)`, reusing the entry stored under `x` in `cache` while it is still valid.
+
+    Lists/tuples are handled member by member and rebuilt with the same container type.
+    """
+    if is_nested(x):
+        # The recursive call must forward cast_fn and cache (it used to pass only the member).
+        return type(x)([cached_cast(cast_fn, y, cache) for y in x])
+    if x in cache:
+        cached_x = cache[x]
+        if x.requires_grad and cached_x.requires_grad:
+            # Make sure x is actually cached_x's autograd parent.
+            if cached_x.grad_fn.next_functions[1][0].variable is not x:
+                raise RuntimeError("x and cache[x] both require grad, but x is not "
+                                   "cache[x]'s parent.  This is likely an error.")
+        # A cast cached during eval can have requires_grad=False.  If it were
+        # reused on the next training iteration it would not have x as its
+        # autograd parent, so the entry is invalidated (and the cast redone)
+        # whenever x.requires_grad and cached_x.requires_grad disagree.
+        #
+        # Under torch.no_grad() that check would drop the entry on every call
+        # (cached_x is always created with requires_grad=False while x keeps
+        # requires_grad=True), making the cache useless during eval.  So with
+        # grad mode disabled the check is skipped and the cached value is used
+        # as is: eval does not need an autograd link between x and cached_x.
+        if torch.is_grad_enabled() and x.requires_grad != cached_x.requires_grad:
+            del cache[x]
+        else:
+            return cached_x
+
+    casted_x = cast_fn(x)
+    cache[x] = casted_x
+    return casted_x
+
+def verbosify(cast_fn, fn_name, verbose):
+    """Return `cast_fn` itself, or a partial binding `name=fn_name` and `verbose` when `verbose` is truthy."""
+    if not verbose:
+        return cast_fn
+    return functools.partial(cast_fn, name=fn_name, verbose=verbose)
+
+def as_inplace(fns):  # generator: yields every name in `fns` with '_' appended
+    for name in fns:
+        yield name + '_'
+
+def has_func(mod, fn):
+    """Tell whether `mod` provides `fn`: FunctionBackend registry key, dict key, or attribute."""
+    if isinstance(mod, torch.nn.backends.backend.FunctionBackend):
+        return fn in mod.function_classes
+    if isinstance(mod, dict):
+        return fn in mod
+    return hasattr(mod, fn)
+
+def get_func(mod, fn):
+    """Fetch `fn` from `mod`: FunctionBackend registry entry, dict item, or plain attribute."""
+    if isinstance(mod, torch.nn.backends.backend.FunctionBackend):
+        return mod.function_classes[fn]
+    if isinstance(mod, dict):
+        return mod[fn]
+    return getattr(mod, fn)
+
+def set_func(mod, fn, new_fn):  # store new_fn under the name fn on mod, whatever kind of container mod is
+    if isinstance(mod, torch.nn.backends.backend.FunctionBackend):
+        mod.function_classes[fn] = new_fn  # backend objects keep their functions in a mapping
+    elif isinstance(mod, dict):
+        mod[fn] = new_fn
+    else:
+        setattr(mod, fn, new_fn)  # modules, classes and any other attribute holder
+
+def set_func_save(handle, mod, fn, new_fn):
+    # Hand the currently installed function to the handle first, then swap in new_fn.
+    handle._save_func(mod, fn, get_func(mod, fn))
+    set_func(mod, fn, new_fn)
+
+# A couple problems get solved here:
+# - The flat_weight buffer is disconnected from autograd graph,
+#   so the fp16 weights need to be derived from the input weights
+#   to this forward call, not the flat buffer.
+# - The ordering of weights in the flat buffer is...idiosyncratic.
+# The first problem is solved with a combination of set_ (to set up
+# the correct storage) and copy_ (so the fp16 weight derives from the
+# fp32 one in autograd).
+# Second is solved by doing ptr arithmetic on the fp32 weights
+# to derive the correct offset.
+#
+# TODO: maybe this should actually use
+# `torch._cudnn_rnn_flatten_weight`? But then I need to call
+# on first iter and cache the right offsets. Ugh.
+def synthesize_flattened_rnn_weights(fp32_weights,
+                                     fp16_flat_tensor,
+                                     rnn_fn='',
+                                     verbose=False):
+    fp16_weights = []  # result mirrors the nesting of fp32_weights
+    fp32_base_ptr = fp32_weights[0][0].data_ptr()  # address of the first weight; offsets are taken relative to it
+    for layer_weights in fp32_weights:
+        fp16_layer_weights = []
+        for w_fp32 in layer_weights:
+            w_fp16 = w_fp32.new().half()  # empty half tensor; set_ below points it at the flat fp16 storage
+            offset = (w_fp32.data_ptr() - fp32_base_ptr) // w_fp32.element_size()  # distance in elements, not bytes
+            w_fp16.set_(fp16_flat_tensor.storage(),
+                        offset,
+                        w_fp32.shape)
+            w_fp16.copy_(w_fp32)  # fills the view with the fp32 values
+            if verbose:
+                print('Float->Half ({})'.format(rnn_fn))
+            fp16_layer_weights.append(w_fp16)
+        fp16_weights.append(fp16_layer_weights)
+    return fp16_weights
+
+# Roughly same as above, just the `fp32_weights` aren't nested.
+# Code kept separate for readability.
+def new_synthesize_flattened_rnn_weights(fp32_weights,
+                                         fp16_flat_tensor,
+                                         rnn_fn='',
+                                         verbose=False):
+    fp16_weights = []  # flat list, one fp16 view per entry of fp32_weights
+    fp32_base_ptr = fp32_weights[0].data_ptr()  # address of the first weight; offsets are taken relative to it
+    for w_fp32 in fp32_weights:
+        w_fp16 = w_fp32.new().half()  # empty half tensor; set_ below points it at the flat fp16 storage
+        offset = (w_fp32.data_ptr() - fp32_base_ptr) // w_fp32.element_size()  # distance in elements, not bytes
+        w_fp16.set_(fp16_flat_tensor.storage(),
+                    offset,
+                    w_fp32.shape)
+        w_fp16.copy_(w_fp32)  # fills the view with the fp32 values
+        if verbose:
+            print('Float->Half ({})'.format(rnn_fn))
+        fp16_weights.append(w_fp16)
+    return fp16_weights
diff --git a/apex/apex/amp/wrap.py b/apex/apex/amp/wrap.py
new file mode 100644
index 0000000000000000000000000000000000000000..559d0558d9a6605f92d37b15d3563f056ad7e318
--- /dev/null
+++ b/apex/apex/amp/wrap.py
@@ -0,0 +1,276 @@
+from . import compat
+from . import utils
+from ._amp_state import _amp_state
+from . import rnn_compat
+
+import functools
+
+import torch
+
+def make_cast_wrapper(orig_fn, cast_fn, handle,
+                      try_caching=False):
+    """Return a wrapper around ``orig_fn`` that casts its arguments first.
+
+    While ``handle.is_active()`` is false the wrapper is a pure pass-through.
+    Otherwise the arguments are run through ``utils.casted_args`` with
+    ``cast_fn`` before ``orig_fn`` is called.  When ``try_caching`` is set and
+    the handle has a cache, arguments accepted by ``utils.should_cache`` are
+    first replaced by ``utils.cached_cast(cast_fn, arg, handle.cache)``.
+    """
+    @functools.wraps(orig_fn)
+    def wrapper(*args, **kwargs):
+        if not handle.is_active():
+            return orig_fn(*args, **kwargs)
+
+        if try_caching and handle.has_cache:
+            args = [utils.cached_cast(cast_fn, arg, handle.cache)
+                    if utils.should_cache(arg) else arg
+                    for arg in args]
+            for key, value in kwargs.items():
+                if utils.should_cache(value):
+                    kwargs[key] = utils.cached_cast(cast_fn, value, handle.cache)
+        new_args = utils.casted_args(cast_fn, args, kwargs)
+        return orig_fn(*new_args, **kwargs)
+    return wrapper
+
+def cached_cast(mod, fn, cast_fn, handle,
+                try_caching=False, verbose=False):
+    """Replace ``mod.fn`` with a casting wrapper, if ``mod`` has such a function.
+
+    ``cast_fn`` is passed through ``utils.verbosify`` and then used to build
+    the wrapper with ``make_cast_wrapper``; the wrapper is installed through
+    ``utils.set_func_save``.  Nothing happens when ``utils.has_func(mod, fn)``
+    is false.
+    """
+    if utils.has_func(mod, fn):
+        original = utils.get_func(mod, fn)
+        verbose_cast = utils.verbosify(cast_fn, fn, verbose)
+        utils.set_func_save(
+            handle, mod, fn,
+            make_cast_wrapper(original, verbose_cast, handle, try_caching))
+
+# The `handle` arg is unused, but it keeps the signature parallel to
+# `make_cast_wrapper`.  Annoyingly, make_promote_wrapper still uses the global
+# handle.  Once everyone is on the new API and I am free to get rid of handle,
+# I can clean this up.
+def make_promote_wrapper(orig_fn, cast_fn, handle=None):
+    """Return a wrapper that casts arguments when Half and Float are mixed.
+
+    The wrapper consults the global ``_amp_state.handle`` (the ``handle``
+    parameter is not used).  It passes straight through to ``orig_fn`` when
+    the handle is inactive or when ``utils.collect_fp_tensor_types`` reports
+    at most one type.  If exactly ``HalfTensor`` and ``FloatTensor`` are
+    present, the arguments go through ``utils.casted_args`` with ``cast_fn``.
+
+    Raises (from the wrapper):
+        NotImplementedError: for any other combination of types.
+    """
+    @functools.wraps(orig_fn)
+    def wrapper(*args, **kwargs):
+        if not _amp_state.handle.is_active():
+            return orig_fn(*args, **kwargs)
+
+        types = utils.collect_fp_tensor_types(args, kwargs)
+        if len(types) <= 1:
+            return orig_fn(*args, **kwargs)
+
+        if types != {'HalfTensor', 'FloatTensor'}:
+            raise NotImplementedError(
+                'Do not know how to handle these types to promote: {}'
+                .format(types))
+
+        new_args = utils.casted_args(cast_fn, args, kwargs)
+        return orig_fn(*new_args, **kwargs)
+    return wrapper
+
+def promote(mod, fn, handle, verbose=False):
+    """Install a promote wrapper (built on ``utils.maybe_float``) over ``mod.fn``."""
+    target = utils.get_func(mod, fn)
+    float_cast = utils.verbosify(utils.maybe_float, fn, verbose)
+    utils.set_func_save(handle, mod, fn,
+                        make_promote_wrapper(target, float_cast))
+
+def sequence_promote(mod, fn, handle, verbose=False):
+    """Wrap ``mod.fn``, whose first argument is a sequence, with type promotion.
+
+    When the global handle is active and the sequence holds exactly the types
+    ``HalfTensor`` and ``FloatTensor`` (per ``utils.type_string``), the
+    sequence is passed through ``utils.casted_args`` with the verbosified
+    ``utils.maybe_float``.  In every other case the call is forwarded
+    unchanged.
+    """
+    orig_fn = utils.get_func(mod, fn)
+    maybe_float = utils.verbosify(utils.maybe_float, fn, verbose)
+
+    @functools.wraps(orig_fn)
+    def wrapper(seq, *args, **kwargs):
+        if _amp_state.handle.is_active():
+            types = {utils.type_string(x) for x in seq}
+            if types == {'HalfTensor', 'FloatTensor'}:
+                cast_seq = utils.casted_args(maybe_float, seq, {})
+                return orig_fn(cast_seq, *args, **kwargs)
+        # Reached when the handle is inactive, when there is at most one
+        # type, or for any other mix of types.
+        # TODO: other mixed-type cases aren't due to amp.
+        #       Just pass through?
+        return orig_fn(seq, *args, **kwargs)
+    utils.set_func_save(handle, mod, fn, wrapper)
+
+def promote_match_arg0(mod, fn, handle, verbose=False):
+    """Wrap ``mod.fn`` so the remaining arguments are cast to match ``arg0``.
+
+    Does nothing if ``mod`` has no function ``fn``.  The installed wrapper
+    asserts that ``arg0`` is tensor-like, and while the global handle is
+    active it chooses ``utils.maybe_half`` or ``utils.maybe_float`` according
+    to ``utils.type_string(arg0)``.  Any other type of ``arg0`` results in a
+    plain pass-through.
+    """
+    if not utils.has_func(mod, fn):
+        return
+
+    orig_fn = utils.get_func(mod, fn)
+
+    @functools.wraps(orig_fn)
+    def wrapper(arg0, *args, **kwargs):
+        assert compat.is_tensor_like(arg0)
+        if not _amp_state.handle.is_active():
+            return orig_fn(arg0, *args, **kwargs)
+
+        cast_by_type = {
+            'HalfTensor': utils.maybe_half,
+            'FloatTensor': utils.maybe_float,
+        }
+        chosen = cast_by_type.get(utils.type_string(arg0))
+        if chosen is None:
+            return orig_fn(arg0, *args, **kwargs)
+
+        cast_fn = utils.verbosify(chosen, fn, verbose)
+        new_args = utils.casted_args(cast_fn, args, kwargs)
+        return orig_fn(arg0, *new_args, **kwargs)
+    utils.set_func_save(handle, mod, fn, wrapper)
+
+def err_if_any_half(mod, fn, handle, custom_err_msg=None):
+    """Wrap ``mod.fn`` so that it refuses any ``HalfTensor`` argument.
+
+    Does nothing if ``mod`` has no function ``fn``.  The installed wrapper
+    raises ``NotImplementedError`` when ``utils.collect_fp_tensor_types``
+    reports ``'HalfTensor'`` among the arguments; the message is
+    ``custom_err_msg`` if that is truthy, otherwise a default that names
+    ``fn``.  In all other cases the call is forwarded unchanged.
+    """
+    if not utils.has_func(mod, fn):
+        return
+
+    orig_fn = utils.get_func(mod, fn)
+
+    @functools.wraps(orig_fn)
+    def wrapper(*args, **kwargs):
+        if 'HalfTensor' not in utils.collect_fp_tensor_types(args, kwargs):
+            return orig_fn(*args, **kwargs)
+        if custom_err_msg:
+            raise NotImplementedError(custom_err_msg)
+        raise NotImplementedError(
+            'Cannot call in-place function {} with fp16 arguments.'.format(fn))
+    utils.set_func_save(handle, mod, fn, wrapper)
+
+def err_if_arg0_half(mod, fn, handle, verbose=False):
+    """Wrap ``mod.fn`` so that it refuses a ``HalfTensor`` first argument.
+
+    Does nothing if ``mod`` has no function ``fn``.  The installed wrapper
+    asserts that ``arg0`` is tensor-like and raises ``NotImplementedError``
+    when ``utils.type_string(arg0)`` is ``'HalfTensor'``.  Otherwise the
+    remaining arguments are passed through ``utils.casted_args`` with the
+    verbosified ``utils.maybe_float`` before ``orig_fn`` is called.
+    """
+    if not utils.has_func(mod, fn):
+        return
+
+    orig_fn = utils.get_func(mod, fn)
+
+    @functools.wraps(orig_fn)
+    def wrapper(arg0, *args, **kwargs):
+        assert compat.is_tensor_like(arg0)
+        if utils.type_string(arg0) == 'HalfTensor':
+            raise NotImplementedError(
+                'Cannot call in-place method {} on fp16 Tensors.'.format(fn))
+        float_cast = utils.verbosify(utils.maybe_float, fn, verbose)
+        new_args = utils.casted_args(float_cast, args, kwargs)
+        return orig_fn(arg0, *new_args, **kwargs)
+    utils.set_func_save(handle, mod, fn, wrapper)
+
+# Current RNN approach:
+# - Wrap top-level `RNN` function in thnn backend
+# - Will call into either CudnnRNN or AutogradRNN
+#  - Each of these are factory functions that return a per-iter
+#    `forward` function
+# - We interpose on the factory function to:
+#   1) Interpose on the actual forward function and put in casts
+#   2) Insert an fp16 `flat_weight` if necessary
+def rnn_cast(backend, fn, handle, verbose=False):
+    """Replace the RNN factory ``backend.fn`` with a version that casts to fp16.
+
+    The factory wrapper (``rnn_wrapper``) swaps a ``flat_weight`` kwarg, if
+    one is given, for an fp16 buffer of the same shape, then calls the
+    original factory.  The ``forward`` function the factory returns is itself
+    wrapped (``fwd_wrapper``) so that inputs, weights and hiddens go through
+    the verbosified ``utils.maybe_half`` (or, for weights when a flat buffer
+    exists, through ``utils.synthesize_flattened_rnn_weights``) before the
+    real ``forward`` runs.
+
+    Args:
+        backend: Object holding the factory function.
+        fn: Name of the factory function on ``backend``.
+        handle: Passed to ``utils.set_func_save`` when installing the wrapper.
+        verbose: Forwarded to ``utils.verbosify`` and to the weight synthesis.
+    """
+    orig_rnn = utils.get_func(backend, fn)
+    # NOTE(review): unlike the other wrappers in this module, neither closure
+    # below checks ``handle.is_active()`` -- confirm this is intended.
+    @functools.wraps(orig_rnn)
+    def rnn_wrapper(*args, **kwargs):
+        flat_weight = kwargs.get('flat_weight')
+        if flat_weight is not None:
+            # We replace `flat_weight` with an uninitialized fp16
+            # Tensor. The "actual" weight tensors (provided in `forward`),
+            # will then be set up as ptrs into the buffer and have the
+            # corresponding fp32 values copied in.
+            # We need to call `copy` on the "actual" weights so that the
+            # autograd graph correctly backprops from the wgrads computed
+            # inside cuDNN (on fp16 weights) into the fp32 weights.
+            assert utils.type_string(flat_weight) == 'FloatTensor'
+            if compat.tensor_is_float_tensor() or compat.tensor_is_variable():
+                # Pre-0.4. A little slower, since it zeros out memory.
+                flat_weight_fp16 = flat_weight.new().half().resize_(flat_weight.shape)
+            else:
+                flat_weight_fp16 = torch.empty_like(flat_weight,
+                                                    dtype=torch.float16)
+            kwargs['flat_weight'] = flat_weight_fp16
+        else:
+            # No flat buffer supplied: weights are cast one by one in forward.
+            flat_weight_fp16 = None
+
+        forward = orig_rnn(*args, **kwargs)
+        @functools.wraps(forward)
+        def fwd_wrapper(*fargs, **fkwargs):
+            # Positional args are (inputs, weights, hiddens[, batch_sizes]).
+            assert len(fargs) == 3 or len(fargs) == 4
+            inputs, weights, hiddens = fargs[:3]
+            assert utils.is_fp_tensor(inputs)
+            assert isinstance(weights, list)
+            cast_fn = utils.verbosify(utils.maybe_half,
+                                      fn,
+                                      verbose)
+            new_args = []
+
+            # 0) Inputs
+            new_args.append(cast_fn(inputs))
+
+            # 1) Weights
+            if flat_weight_fp16 is not None:
+                # Build fp16 weights that live inside the flat fp16 buffer
+                # handed to the factory above.
+                fp16_weights = utils.synthesize_flattened_rnn_weights(
+                    weights, flat_weight_fp16, fn, verbose)
+            else:
+                fp16_weights = [[cast_fn(w) for w in layer]
+                                for layer in weights]
+            new_args.append(fp16_weights)
+
+            # 2) Hiddens: either a tuple (for LSTM) or single tensor
+            if isinstance(hiddens, tuple):
+                new_args.append(tuple(cast_fn(x) for x in hiddens))
+            elif utils.is_fp_tensor(hiddens):
+                new_args.append(cast_fn(hiddens))
+            else:
+                # Hiddens can, in principle, be `None` -- pass through
+                new_args.append(hiddens)
+
+            # 3) Batch sizes (0.4 or later only)
+            if len(fargs) == 4:
+                new_args.append(fargs[3])
+
+            return forward(*new_args, **fkwargs)
+        return fwd_wrapper
+    utils.set_func_save(handle, backend, fn, rnn_wrapper)
+
+def new_rnn_cast(fn, handle, verbose=False):
+    """Patch the RNN backend function ``fn`` so it runs with fp16 arguments.
+
+    The installed wrapper expects exactly nine positional arguments and no
+    keyword arguments.  While the global handle is active, the parameter list
+    is replaced by fp16 tensors that share one freshly allocated flat fp16
+    buffer (via ``utils.new_synthesize_flattened_rnn_weights``).  Every other
+    argument accepted by ``utils.is_fp_tensor`` goes through the verbosified
+    ``utils.maybe_half``; the rest are forwarded untouched.
+    """
+    # Forward+backward compatibility around https://github.com/pytorch/pytorch/pull/15744
+    # For rnn backend calls that route through _rnn_impls, we must patch the ref
+    # that _rnn_impls stashed.  For rnn backend calls that directly invoke
+    # _VF.<backend>, e.g. _VF.lstm, we can patch onto VariableFunctionsShim,
+    # which in turn has patched the ref named "_VF" in torch.nn.modules.rnn.
+    rnn_module = torch.nn.modules.rnn
+    if utils.has_func(rnn_module._rnn_impls, fn):
+        mod = rnn_module._rnn_impls
+    else:
+        mod = rnn_module._VF
+        assert isinstance(mod, rnn_compat.VariableFunctionsShim)
+        fn = fn.lower()
+    orig_fn = utils.get_func(mod, fn)
+    cast_fn = utils.verbosify(utils.maybe_half, fn, verbose)
+
+    @functools.wraps(orig_fn)
+    def wrapper(*args, **kwargs):
+        # Exact call signature from modules/rnn.py
+        assert len(args) == 9
+        assert len(kwargs) == 0
+
+        if not _amp_state.handle.is_active():
+            return orig_fn(*args, **kwargs)
+
+        # A bool at position 6 marks the non-PackedSequence call, where the
+        # params sit at index 2; in the PackedSequence call they sit at 3.
+        params_idx = 2 if isinstance(args[6], bool) else 3
+
+        def _convert(position, arg):
+            if position == params_idx:
+                num_params = sum(x.numel() for x in arg)
+                fp16_weight_buf = args[0].new_empty((num_params,),
+                                                    dtype=torch.half)
+                return utils.new_synthesize_flattened_rnn_weights(
+                    arg, fp16_weight_buf, fn, verbose)
+            if utils.is_fp_tensor(arg):
+                return cast_fn(arg)
+            return arg
+
+        new_args = [_convert(i, arg) for i, arg in enumerate(args)]
+        return orig_fn(*new_args)
+    utils.set_func_save(handle, mod, fn, wrapper)
+
+def disable_casts(mod, fn, handle):
+    """Wrap ``mod.fn`` so it always runs inside ``handle._disable_casts()``.
+
+    Does nothing if ``mod`` has no function ``fn``.
+    """
+    if utils.has_func(mod, fn):
+        target = utils.get_func(mod, fn)
+
+        @functools.wraps(target)
+        def wrapper(*args, **kwargs):
+            with handle._disable_casts():
+                result = target(*args, **kwargs)
+            return result
+        utils.set_func_save(handle, mod, fn, wrapper)
diff --git a/apex/apex/fp16_utils/README.md b/apex/apex/fp16_utils/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..941de17947f8ea75de248f3a8e1445cef7a114b2
--- /dev/null
+++ b/apex/apex/fp16_utils/README.md
@@ -0,0 +1,16 @@
+fp16_optimizer.py contains `FP16_Optimizer`, a Python class designed to wrap an existing PyTorch optimizer and automatically enable master parameters and loss scaling in a manner transparent to the user.  To use `FP16_Optimizer`, only two lines of one's Python model need to change.
+
+#### [FP16_Optimizer API documentation](https://nvidia.github.io/apex/fp16_utils.html#automatic-management-of-master-params-loss-scaling)
+
+#### [Simple examples with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/FP16_Optimizer_simple)
+
+#### [Imagenet with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/imagenet)
+
+#### [word_language_model with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/word_language_model)
+
+
+fp16util.py contains a number of utilities to manually manage master parameters and loss scaling, if the user chooses.  
+
+#### [Manual management documentation](https://nvidia.github.io/apex/fp16_utils.html#manual-master-parameter-management)
+
+The [Imagenet with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/imagenet) and [word_language_model with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/word_language_model) directories also contain `main.py` files that demonstrate manual management of master parameters and static loss scaling.  These examples illustrate what sort of operations `FP16_Optimizer` is performing automatically.
diff --git a/apex/apex/fp16_utils/__init__.py b/apex/apex/fp16_utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7bb1f537efe1b580b1d556974a117e169e9b0c0
--- /dev/null
+++ b/apex/apex/fp16_utils/__init__.py
@@ -0,0 +1,16 @@
+from .fp16util import (
+    BN_convert_float,
+    network_to_half,
+    prep_param_lists,
+    model_grads_to_master_grads,
+    master_params_to_model_params,
+    tofp16,
+    to_python_float,
+    clip_grad_norm,
+    convert_module,
+    convert_network,
+    FP16Model,
+)
+
+from .fp16_optimizer import FP16_Optimizer
+from .loss_scaler import LossScaler, DynamicLossScaler
diff --git a/apex/apex/fp16_utils/fp16_optimizer.py b/apex/apex/fp16_utils/fp16_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..885460311221ed40823bec7126227366218176f6
--- /dev/null
+++ b/apex/apex/fp16_utils/fp16_optimizer.py
@@ -0,0 +1,643 @@
+import torch
+from torch import nn
+from torch.autograd import Variable
+from torch.nn.parameter import Parameter
+from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
+
+from ..amp._amp_state import _amp_state, maybe_print
+from ..amp.scaler import LossScaler
+from ..multi_tensor_apply import multi_tensor_applier
+from .fp16util import model_grads_to_master_grads, master_params_to_model_params, clip_grad_norm
+
+# TODO:  Update overflow check + downscale to use Carl's fused kernel.
+class FP16_Optimizer(object):
+    """
+    :class:`FP16_Optimizer` is designed to wrap an existing PyTorch optimizer, 
+    and manage static or dynamic loss scaling and master weights in a manner transparent to the user.
+    For standard use, only two lines must be changed:  creating the :class:`FP16_Optimizer` instance,
+    and changing the call to ``backward``.
+
+    Example::
+
+        model = torch.nn.Linear(D_in, D_out).cuda().half()
+        optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
+        # Name the FP16_Optimizer instance to replace the existing optimizer
+        # (recommended but not required):
+        optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0)
+        ...
+        # loss.backward() becomes:
+        optimizer.backward(loss)
+        ...
+
+    Example with dynamic loss scaling::
+
+        ...
+        optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
+                                   # optional arg to control dynamic loss scaling behavior
+                                   # dynamic_loss_args={'scale_window' : 500})
+                                   # Usually, dynamic_loss_args is not necessary. 
+
+    Args:
+        init_optimizer (torch.optim.optimizer):  Existing optimizer created with the parameters to optimize.  Internally, :class:`FP16_Optimizer` replaces the passed optimizer's fp16 parameters, if any, with fp32 master parameters copied from the original ones.  :class:`FP16_Optimizer` also stores references to the original fp16 parameters, and updates these fp16 parameters from the master fp32 copy at the end of each :attr:`step`.  
+        static_loss_scale (float, optional, default=1.0):  Loss scale used internally to scale gradients computed by the model.  Any fp16 gradients will be copied to fp32, then downscaled before being applied to the fp32 master params, so ``static_loss_scale`` should not affect learning rate.
+        dynamic_loss_scale (bool, optional, default=False):  Use dynamic loss scaling.  If True, this will override any ``static_loss_scale`` option.
+        dynamic_loss_args (dict, optional, default=None):  Dict of kwargs that will be forwarded to the internal :class:`LossScaler` instance's constructor.  Keys of this dict must match kwargs accepted by :class:`LossScaler`'s constructor.  If ``dynamic_loss_args`` is unspecified, :class:`LossScaler`'s defaults will be used.
+        verbose (bool, optional, default=True):  By default, FP16_Optimizer's constructor prints out the parameters and parameter groups it is ingesting, as a sanity check.  If this becomes annoying (e.g. for large models), it can be disabled by passing ``verbose=False``.  ``verbose=False`` will not disable printing when the loss scale is readjusted during dynamic loss scaling.
+
+    ``init_optimizer`` is expected to have been constructed in the ordinary way.  
+    It is recommended (although not required) that the newly constructed :class:`FP16_Optimizer` instance be 
+    named to replace ``init_optimizer``, for two reasons:  
+    First, it means that references to the same name
+    later in the file will not have to change.  
+    Second, :class:`FP16_Optimizer` reserves the right (as an implementation detail) to 
+    modify ``init_optimizer``.  If you do choose a unique name for the new
+    :class:`FP16_Optimizer` instance, you should only work with this new instance,
+    because the preexisting optimizer might no longer behave as expected.
+
+    ``init_optimizer`` may be any Pytorch optimizer. 
+    It may contain a mixture of fp16 and fp32 parameters organized into any number of 
+    ``param_groups`` with different hyperparameters.  The :class:`FP16_Optimizer` constructor will 
+    ingest these ``param_groups`` and remember them. 
+
+    Calls to ::
+
+        loss.backward() 
+
+    must be replaced with ::
+
+        optimizer.backward(loss)  
+
+    because :class:`FP16_Optimizer` requires ownership of the backward pass to implement 
+    loss scaling and copies to master gradients.
+
+    .. note::
+        Loss scaling, either static or dynamic, is orthogonal to learning rate, because gradients
+        are downscaled before being applied.  This means that adjusting the loss scale, or using
+        dynamic loss scaling, should not require retuning the learning rate or any other 
+        hyperparameters.
+
+
+    **Advanced options**
+
+    **Closures**:  :class:`FP16_Optimizer` can wrap a Pytorch optimizer that receives a closure.
+    See docstring for :attr:`step`.
+
+    **Gradient clipping**:  Use :attr:`clip_master_grads`.
+    
+    **Multiple losses**:  If your model accumulates gradients from multiple losses,
+    this can be made more efficient by supplying ``update_master_grads=False``
+    to :attr:`backward`.  See docstring for :attr:`backward`.
+
+    **Manually adjusting loss scale**:  The current loss scale can be retrieved or set via ::
+
+        print(optimizer.loss_scale)
+        optimizer.loss_scale = new_loss_scale
+
+    For static loss scaling, manually adjusting the loss scale over time is a reasonable
+    thing to do.  During later epochs, gradients may become smaller, and a 
+    higher loss scale may be required, analogous to scheduling the learning rate.  Dynamic loss
+    scaling is more subtle (see :class:`DynamicLossScaler`) and in this case, manually adjusting 
+    the loss scale is not recommended.
+
+    **Multi_GPU training**:  If the wrapped ``init_optimizer`` was created from a model wrapped in
+    Pytorch DistributedDataParallel or Apex DistributedDataParallel, :class:`FP16_Optimizer` 
+    should still work as intended.
+    """
+
+    def __init__(self,
+                 init_optimizer,
+                 static_loss_scale=1.0,
+                 dynamic_loss_scale=False,
+                 dynamic_loss_args=None,
+                 verbose=True):
+        """Wrap ``init_optimizer``, creating fp32 master params and a loss scaler.
+
+        Every ``torch.cuda.HalfTensor`` parameter that requires grad is
+        replaced, inside ``init_optimizer.param_groups``, by a detached fp32
+        clone (the "master" param).  Any existing optimizer state is re-keyed
+        to the master param.  ``torch.cuda.FloatTensor`` parameters are left
+        in place.  Parameters that do not require grad are ignored.
+
+        Args:
+            init_optimizer: Optimizer to wrap; its ``param_groups`` are
+                modified in place.
+            static_loss_scale: Loss scale used when ``dynamic_loss_scale`` is
+                false.
+            dynamic_loss_scale: If true, build a dynamic ``LossScaler``.
+            dynamic_loss_args: Optional kwargs forwarded to the dynamic
+                ``LossScaler``.
+            verbose: If true, print each ingested param group and parameter.
+
+        Raises:
+            SystemError: if CUDA is not available.
+            TypeError: if a parameter requiring grad is neither a
+                ``torch.cuda.FloatTensor`` nor a ``torch.cuda.HalfTensor``.
+        """
+        # ``is_available`` must be *called*: the bare function object is
+        # always truthy, which would leave this guard unreachable.
+        if not torch.cuda.is_available():
+            raise SystemError("Cannot use fp16 without CUDA.")
+
+        self.verbose = verbose
+
+        self.optimizer = init_optimizer
+        # init_state_dict sets up an alternative way to cast per-param state tensors.
+        # Stashing here in case https://github.com/pytorch/pytorch/issues/7733 makes it necessary.
+        # init_state_dict = init_optimizer.state_dict()
+
+        self.fp16_groups = []
+        self.fp32_from_fp16_groups = []
+        self.fp32_from_fp32_groups = []
+        for group_idx, param_group in enumerate(self.optimizer.param_groups):
+            self.maybe_print("FP16_Optimizer processing param group {}:".format(group_idx))
+            fp16_params_this_group = []
+            fp32_params_this_group = []
+            fp32_from_fp16_params_this_group = []
+            # A separate index name keeps the group index from being shadowed.
+            for param_idx, param in enumerate(param_group['params']):
+                if not param.requires_grad:
+                    continue
+                if param.type() == 'torch.cuda.HalfTensor':
+                    self.maybe_print("FP16_Optimizer received torch.cuda.HalfTensor with {}"
+                                     .format(param.size()))
+                    fp16_params_this_group.append(param)
+                    master_param = param.detach().clone().float()
+                    master_param.requires_grad = True
+                    # The wrapped optimizer now steps the fp32 master copy.
+                    param_group['params'][param_idx] = master_param
+                    fp32_from_fp16_params_this_group.append(master_param)
+                    # Reset existing state dict key to the new master param.
+                    # We still need to recast per-param state tensors, if any, to FP32.
+                    if param in self.optimizer.state:
+                        self.optimizer.state[master_param] = self.optimizer.state.pop(param)
+                elif param.type() == 'torch.cuda.FloatTensor':
+                    self.maybe_print("FP16_Optimizer received torch.cuda.FloatTensor with {}"
+                                     .format(param.size()))
+                    fp32_params_this_group.append(param)
+                else:
+                    raise TypeError("Wrapped parameters must be either "
+                                    "torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
+                                    "Received {}".format(param.type()))
+
+            self.fp16_groups.append(fp16_params_this_group)
+            self.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group)
+            self.fp32_from_fp32_groups.append(fp32_params_this_group)
+
+        # Flattened views of the per-group lists.
+        self.all_fp16_params = [p for group in self.fp16_groups for p in group]
+        self.all_fp32_from_fp16_params = [
+            p for group in self.fp32_from_fp16_groups for p in group]
+        self.all_fp32_from_fp32_params = [
+            p for group in self.fp32_from_fp32_groups for p in group]
+
+        # Leverage state_dict() and load_state_dict() to recast preexisting per-param state tensors
+        self.optimizer.load_state_dict(self.optimizer.state_dict())
+        # alternative way to cast per-param state tensors:
+        # self.optimizer.load_state_dict(init_state_dict)
+
+        if dynamic_loss_scale:
+            self.dynamic_loss_scale = True
+            if dynamic_loss_args is not None:
+                self.loss_scaler = LossScaler("dynamic", **dynamic_loss_args)
+            else:
+                self.loss_scaler = LossScaler("dynamic")
+        else:
+            self.dynamic_loss_scale = False
+            self.loss_scaler = LossScaler(static_loss_scale)
+
+        self.overflow = False
+        self.first_closure_call_this_step = True
+
+        self.clip_grad_norm = clip_grad_norm
+
+        # TODO:  Centralize exposure and import error checking for the C backend.
+        if multi_tensor_applier.available:
+            import amp_C
+            self.multi_tensor_scale = amp_C.multi_tensor_scale
+            self._dummy_overflow_buf = torch.cuda.IntTensor([0])
+
+    # Having self.maybe_print distinct from _amp_state.maybe_print is another artifact
+    # of having to support FP16_Optimizer separately, for the time being.
+    def maybe_print(self, msg):
+        """Print ``msg`` only when this instance was created with ``verbose`` set."""
+        if not self.verbose:
+            return
+        print(msg)
+            
+    def __getstate__(self):
+        """Refuse pickling; callers are directed to ``state_dict()`` instead."""
+        reason = "FP16_Optimizer should be serialized using state_dict()."
+        raise RuntimeError(reason)
+
+    def __setstate__(self, state):
+        """Refuse unpickling; callers are directed to ``load_state_dict()`` instead."""
+        reason = "FP16_Optimizer should be deserialized using load_state_dict()."
+        raise RuntimeError(reason)
+
+    def zero_grad(self, set_grads_to_None=False):
+        """
+        Zero fp32 and fp16 parameter grads.
+
+        With ``set_grads_to_None=True`` every ``.grad`` is set to ``None``;
+        otherwise each existing ``.grad`` is detached and zeroed in place.
+        """
+        def _reset(p):
+            if set_grads_to_None:
+                p.grad = None
+            elif p.grad is not None:
+                p.grad.detach_()  # as in torch.optim.optimizer.zero_grad()
+                p.grad.zero_()
+
+        # In principle, only the .grad attributes of the model params need to be zeroed,
+        # because gradients are copied into the FP32 master params.  However, we zero
+        # all gradients owned by the optimizer, just to be safe:
+        for group in self.optimizer.param_groups:
+            for p in group['params']:
+                _reset(p)
+
+        # Zero fp16 gradients owned by the model:
+        for fp16_group in self.fp16_groups:
+            for p in fp16_group:
+                _reset(p)
+
+    # Should not be used anymore.
+    # def _check_overflow(self):
+    #     params = []
+    #     for group in self.fp16_groups:
+    #         for param in group:
+    #             params.append(param)
+    #     for group in self.fp32_from_fp32_groups:
+    #         for param in group:
+    #             params.append(param)
+    #     self.overflow = self.loss_scaler.has_overflow(params)
+
+    # def _update_scale(self, has_overflow=False):
+    #     self.loss_scaler.update_scale(has_overflow)
+
+    def _master_params_to_model_params(self):
+        """Push the fp32 master params into the fp16 model params.
+
+        If ``multi_tensor_applier`` is available and there are fp16 params,
+        one ``multi_tensor_scale`` call with scale 1.0 is issued over the
+        flattened fp32-master and fp16 lists.  Otherwise
+        ``master_params_to_model_params`` is called once per param group.
+        """
+        if not multi_tensor_applier.available:
+            for model_group, master_group in zip(self.fp16_groups,
+                                                 self.fp32_from_fp16_groups):
+                master_params_to_model_params(model_group, master_group)
+            return
+
+        if len(self.all_fp16_params) > 0:
+            multi_tensor_applier(
+                self.multi_tensor_scale,
+                self._dummy_overflow_buf,
+                [self.all_fp32_from_fp16_params, self.all_fp16_params],
+                1.0)
+
+    # To consider:  Integrate distributed with this wrapper by registering a hook on each variable
+    # that does the overflow check, gradient copy + downscale, and fp32 allreduce in a different stream.
+    # def _model_grads_to_master_grads(self):
+    #     for fp16_group, fp32_from_fp16_group in zip(self.fp16_groups, self.fp32_from_fp16_groups):
+    #         model_grads_to_master_grads(fp16_group, fp32_from_fp16_group)
+
+    # def _downscale_master(self):
+    #     if self.loss_scale != 1.0:
+    #         for group in self.optimizer.param_groups:
+    #             for param in group['params']:
+    #                 if param.grad is not None:
+    #                     param.grad.data.mul_(1./self.loss_scale)
+
+    def clip_master_grads(self, max_norm, norm_type=2):
+        """
+        Clips the gradients of every parameter held by the wrapped optimizer
+        using ``self.clip_grad_norm``.
+
+        Args:
+            max_norm (float or int): max norm of the gradients
+            norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
+                infinity norm.
+
+        Returns:
+            Whatever ``self.clip_grad_norm`` returns for the collected parameters.
+
+        .. warning::
+            Returns -1, without clipping, if ``self.overflow`` is ``True``.
+        """
+        if self.overflow:
+            return -1
+
+        fp32_params = [param
+                       for param_group in self.optimizer.param_groups
+                       for param in param_group['params']]
+        return self.clip_grad_norm(fp32_params, max_norm, norm_type)
+
+    def state_dict(self):
+        """
+        Returns a dict describing the current state of this :class:`FP16_Optimizer`.
+        It holds the loss scaler, the overflow and closure flags, the wrapped
+        optimizer's own ``state_dict()``, and the fp32 master param groups.
+        Example::
+
+            checkpoint = {}
+            checkpoint['model'] = model.state_dict()
+            checkpoint['optimizer'] = optimizer.state_dict()
+            torch.save(checkpoint, "saved.pth")
+        """
+        return {
+            'loss_scaler': self.loss_scaler,
+            'dynamic_loss_scale': self.dynamic_loss_scale,
+            'overflow': self.overflow,
+            'first_closure_call_this_step': self.first_closure_call_this_step,
+            'optimizer_state_dict': self.optimizer.state_dict(),
+            'fp32_from_fp16': self.fp32_from_fp16_groups,
+        }
+
+    def load_state_dict(self, state_dict):
+        """
+        Restores this instance from a dict produced by an earlier ``state_dict()`` call.
+        If ``fp16_optimizer_instance`` was constructed from some ``init_optimizer``,
+        whose parameters in turn came from ``model``, it is expected that the user
+        will call ``model.load_state_dict()`` before
+        ``fp16_optimizer_instance.load_state_dict()`` is called.
+
+        Example::
+
+            model = torch.nn.Linear(D_in, D_out).cuda().half()
+            optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
+            optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0)
+            ...
+            checkpoint = torch.load("saved.pth")
+            model.load_state_dict(checkpoint['model'])
+            optimizer.load_state_dict(checkpoint['optimizer'])
+        """
+        # I think it should actually be ok to reload the optimizer before the model.
+        for attr in ('loss_scaler',
+                     'dynamic_loss_scale',
+                     'overflow',
+                     'first_closure_call_this_step'):
+            setattr(self, attr, state_dict[attr])
+        self.optimizer.load_state_dict(state_dict['optimizer_state_dict'])
+        # The wrapped optimizer's hyperparameters, internal buffers and references
+        # to the model's fp32 params are now current, but the fp32 master copies
+        # of the model's fp16 params are still stale.  They could either be
+        #   1: refreshed from the model's fp16 params (less storage, but loses
+        #      precision), or
+        #   2: saved and restored separately.
+        # Option 2 is what is done here.
+        #
+        # Pytorch Optimizer.load_state_dict casts saved buffers (e.g. momentum) to
+        # the type and device of their associated parameters, since those buffers
+        # might not exist yet in the current optimizer instance.  Here, provided
+        # this FP16_Optimizer was constructed the same way as the one that was
+        # saved, the same master params already exist, so copy_() from the saved
+        # master params is sufficient.
+        saved_groups = state_dict['fp32_from_fp16']
+        for live_group, stored_group in zip(self.fp32_from_fp16_groups, saved_groups):
+            for live, stored in zip(live_group, stored_group):
+                live.data.copy_(stored.data)
+
+    def step(self, closure=None): # could add clip option.
+        """
+        If no closure is supplied, :attr:`step` should be called after
+        ``fp16_optimizer_obj.backward(loss)``.
+        :attr:`step` updates the fp32 master copy of parameters using the optimizer supplied to
+        :class:`FP16_Optimizer`'s constructor, then copies the updated fp32 params into the fp16 params
+        originally referenced by :class:`FP16_Optimizer`'s constructor, so the user may immediately run
+        another forward pass using their model.  If an overflow is flagged, the update is skipped.
+
+        If a closure is supplied, :attr:`step` may be called without a prior call to
+        :attr:`backward(loss)`.
+        This control flow is identical to `ordinary Pytorch optimizer use`_ with closures.
+        However, the user should take care that any ``loss.backward()`` call within the closure
+        has been replaced by ``fp16_optimizer_obj.backward(loss)``.
+
+        Args:
+           closure (optional):  Closure that will be supplied to the underlying optimizer originally passed to :class:`FP16_Optimizer`'s constructor.  closure should call :attr:`zero_grad()` on the :class:`FP16_Optimizer` object, compute the loss, call :attr:`backward(loss)`, and return the loss.
+
+        Example with closure::
+
+            # optimizer is assumed to be an FP16_Optimizer object, previously constructed from an
+            # existing pytorch optimizer.
+            for input, target in dataset:
+                def closure():
+                    optimizer.zero_grad()
+                    output = model(input)
+                    loss = loss_fn(output, target)
+                    # loss.backward() becomes:
+                    optimizer.backward(loss)
+                    return loss
+                optimizer.step(closure)
+
+        .. warning::
+            Currently, calling :attr:`step` with a closure is not compatible with dynamic loss scaling.
+
+        .. _`ordinary Pytorch optimizer use`:
+            http://pytorch.org/docs/master/optim.html#optimizer-step-closure
+        """
+
+        current_scale = self.loss_scaler.loss_scale()
+        # To consider (original author): do this in step() or update_master_grads?  Both work;
+        # Amp updates the scale when its backward context manager exits.
+        # self._update_scale(self.overflow)
+
+        if self.overflow:
+            # maybe_print (from _amp_state) is used on purpose here rather than self.print.
+            overflow_msg = ("Gradient overflow.  Skipping step, reducing "
+                            "loss scale to {}")
+            maybe_print(overflow_msg.format(self.loss_scaler.loss_scale()))
+            return
+
+        # A closure goes through the wrapper that refreshes the fp16 params between calls;
+        # without one, the wrapped optimizer is stepped directly.
+        if closure is None:
+            step_result = self.optimizer.step()
+        else:
+            step_result = self._step_with_closure(closure)
+
+        self._master_params_to_model_params()
+
+        return step_result
+
+    def _step_with_closure(self, closure):
+        """Run self.optimizer.step with ``closure`` wrapped to keep the fp16 params in sync."""
+        def wrapped_closure():
+            # helpful for debugging
+            # print("Calling wrapped_closure, first_closure_call_this_step = {}"
+            #       .format(self.first_closure_call_this_step))
+            if not self.first_closure_call_this_step:
+                # self.optimizer.step() may invoke the closure several times and update the
+                # fp32 params between invocations.  It knows nothing about the fp16 params,
+                # so they have to be refreshed by hand before the closure runs again.
+                self._master_params_to_model_params()
+            else:
+                # The fp16 params are expected to be fresh on entry to self.step(), so the
+                # first invocation inside self.optimizer.step() needs no refresh.
+                self.first_closure_call_this_step = False
+            # The API contract is that the user hands ownership of backward() to this class
+            # by replacing every loss.backward() with optimizer.backward(loss).
+            # That holds whether or not backward() is called from inside a closure.
+            # Provided the user's closure calls optimizer.backward(loss), running closure()
+            # leaves fresh gradients on the fp32 master params for the optimizer to use,
+            # so all that remains here is to call it and hand back the loss.
+            closure_loss = closure()
+            # Re-run the closure while the overflow flag is set; the flag is expected to be
+            # refreshed by the backward() call inside the closure.
+            while self.overflow:
+                current_scale = self.loss_scaler.loss_scale()
+                # self._update_scale(self.overflow) # now done at the end of backward
+                overflow_msg = "OVERFLOW within closure! Skipping step, reducing loss scale to {}"
+                print(overflow_msg.format(self.loss_scaler.loss_scale()))
+                closure_loss = closure()
+            return closure_loss
+
+        step_result = self.optimizer.step(wrapped_closure)
+
+        # Re-arm the flag so the next step() treats its first closure call as fresh.
+        self.first_closure_call_this_step = True
+
+        return step_result
+
+    def backward(self, loss, update_master_grads=True, retain_graph=False):
+        """
+        Conceptually, :attr:`backward` carries out these steps:
+
+        1. fp32_loss = loss.float() (see first Note below)
+        2. scaled_loss = fp32_loss*loss_scale
+        3. scaled_loss.backward(), which accumulates scaled gradients into the ``.grad`` attributes of the model's leaves (which may be fp16, fp32, or a mixture, depending how your model was defined).
+        4. fp16 grads are then copied to the master params' ``.grad`` attributes (see second Note), which are guaranteed to be fp32.
+        5. Finally, master grads are divided by loss_scale.
+
+        In this way, after :attr:`backward`, the master params have fresh gradients,
+        and :attr:`step` may be called.
+
+        .. note::
+            :attr:`backward` internally converts the loss to fp32 before applying the loss scale.
+            This provides some additional safety against overflow if the user has supplied an
+            fp16 loss value.
+            However, for maximum overflow safety, the user should
+            compute the loss criterion (MSE, cross entropy, etc) in fp32 before supplying it to
+            :attr:`backward`.
+
+        .. warning::
+            The gradients found in a model's leaves after the call to
+            :attr:`backward` should not be regarded as valid in general,
+            because it's possible
+            they have been scaled (and in the case of dynamic loss scaling,
+            the scale factor may change over time).
+            If the user wants to inspect gradients after a call to :attr:`backward`,
+            only the master gradients should be regarded as valid.  These can be retrieved via
+            :attr:`inspect_master_grad_data()`.
+
+        Args:
+            loss:  The loss output by the user's model.  loss may be either float or half (but see first Note above).
+            update_master_grads (bool, optional, default=True):  Option to copy fp16 grads to fp32 grads on this call.  By setting this to False, the user can delay the copy, which is useful to eliminate redundant fp16->fp32 grad copies if :attr:`backward` is being called on multiple losses in one iteration.  If set to False, the user becomes responsible for calling :attr:`update_master_grads` before calling :attr:`step`.
+            retain_graph (bool, optional, default=False):  Forwards the usual ``retain_graph=True`` option to the internal call to ``loss.backward``.  If ``retain_graph`` is being used to accumulate gradient values from multiple backward passes before calling ``optimizer.step``, passing ``update_master_grads=False`` is also recommended (see Example below).
+
+        Example::
+
+            # Ordinary operation:
+            optimizer.backward(loss)
+
+            # Naive operation with multiple losses (technically valid, but less efficient):
+            # fp32 grads will be correct after the second call,  but
+            # the first call incurs an unnecessary fp16->fp32 grad copy.
+            optimizer.backward(loss1)
+            optimizer.backward(loss2)
+
+            # More efficient way to handle multiple losses:
+            # The fp16->fp32 grad copy is delayed until fp16 grads from all
+            # losses have been accumulated.
+            optimizer.backward(loss1, update_master_grads=False)
+            optimizer.backward(loss2, update_master_grads=False)
+            optimizer.update_master_grads()
+        """
+        # To consider:  retry the backward pass with retain_graph=True until a workable loss
+        # scale is found, then run a final dummy pass with retain_graph=False to tear down the
+        # graph.  That would avoid discarding the iteration, but probably not be more efficient.
+        fp32_loss = loss.float()
+        scaled_loss = fp32_loss*self.loss_scaler.loss_scale()
+        scaled_loss.backward(retain_graph=retain_graph)
+        if update_master_grads:
+            self.update_master_grads()
+
+    def update_master_grads(self):
+        # torch.cuda.nvtx.range_push("update_master_grads")
+        """
+        Copy the ``.grad`` attribute from stored references to fp16 parameters to 
+        the ``.grad`` attribute of the fp32 master parameters that are directly 
+        updated by the optimizer.  :attr:`update_master_grads` only needs to be called if
+        ``fp16_optimizer_obj.backward`` was called with ``update_master_grads=False``.
+        """
+        # Superseded implementation, kept for reference:
+        #     if self.dynamic_loss_scale: self._check_overflow(); if self.overflow: return
+        #     self._model_grads_to_master_grads()
+        #     self._downscale_master()
+        # Current path: hand both grad lists to loss_scaler.unscale (described by the original
+        # author as the one-shot multi-tensor apply kernel).
+        self.loss_scaler.clear_overflow_state()
+        if len(self.all_fp16_params) > 0:
+            # Pair each fp16 model param with its fp32 master param and collect the grads of
+            # every pair whose model param actually has a gradient.
+            # Original author's caveat: this was written only as an incremental way to make
+            # some tests pass until the tests are refactored; FP16_Optimizer should not be
+            # used by anyone.
+            model_grads = []
+            master_grads = []
+            for model_param, master_param in zip(self.all_fp16_params,
+                                                 self.all_fp32_from_fp16_params):
+                if model_param.grad is not None:
+                    model_grads.append(model_param.grad)
+                    if master_param.grad is None:
+                        master_param.grad = torch.empty_like(master_param)
+                    master_grads.append(master_param.grad)
+            self.loss_scaler.unscale(
+                model_grads,
+                master_grads,
+                self.loss_scaler.loss_scale())
+            # NOTE(review): unscale presumably writes model_grads divided by the loss scale
+            # into master_grads -- confirm against the loss scaler implementation.
+        if len(self.all_fp32_from_fp32_params) > 0:
+            model_grads = []
+            master_grads = []
+            for model_param, master_param in zip(self.all_fp32_from_fp32_params,
+                                                 self.all_fp32_from_fp32_params):
+                if model_param.grad is not None:
+                    model_grads.append(model_param.grad)
+                    master_grads.append(master_param.grad)
+            # The list above is zipped with itself, so model_grads and master_grads hold the
+            # very same grad tensors for this group.
+            self.loss_scaler.unscale(
+                model_grads,
+                master_grads,
+                self.loss_scaler.loss_scale())
+            # update_scale() below is called once, after both groups; its return value becomes
+            # the overflow flag that step() and inspect_master_grad_data() consult.
+        # quit()
+        self.overflow = self.loss_scaler.update_scale()
+        # torch.cuda.nvtx.range_pop()
+
+
+    def inspect_master_grad_data(self):
+        """
+        When running with :class:`FP16_Optimizer`, the ``.grad`` attributes of a model's
+        fp16 leaves should not be regarded as truthful, because they might be scaled.
+        After a call to :attr:`fp16_optimizer_obj.backward(loss)`, if no overflow was encountered,
+        the fp32 master params' ``.grad`` attributes will contain valid gradients properly
+        divided by the loss scale.  However, because :class:`FP16_Optimizer` flattens some
+        parameters, accessing them may be nonintuitive.  :attr:`inspect_master_grad_data`
+        allows those gradients to be viewed with shapes corresponding to their associated
+        model leaves.
+
+        Returns:
+            List of lists (one list for each parameter group).  The list for each parameter group
+            is a list of the ``.grad.data`` attributes of the fp32 master params belonging to that
+            group, with None standing in for any master param that has no gradient.
+
+            While the optimizer is in an overflow state, a warning is printed and None is
+            returned instead.
+        """
+        if self.overflow:
+            print("Warning:  calling FP16_Optimizer.inspect_master_grad_data while in an overflow state.  "
+                  "Gradients are currently invalid (may be inf, nan, or stale).  Returning None.")
+            return None
+        # The optimizer owns only references to master params, so walking its param_groups
+        # visits exactly the master params, group by group.
+        return [
+            [
+                master.grad.data if master.grad is not None else None
+                for master in group['params']
+            ]
+            for group in self.optimizer.param_groups
+        ]
+
+
+    # Expose the loss scale as "fp16_optimizer_instance.loss_scale" (readable and writable).
+    def _get_loss_scale(self):
+        return self.loss_scaler.loss_scale()  # loss_scale is called as a method on this scaler
+
+    def _set_loss_scale(self, value):
+        self.loss_scaler._loss_scale = value  # writes the scaler's private field directly
+
+    loss_scale = property(_get_loss_scale, _set_loss_scale)
+
+    # Expose the wrapped optimizer's state as "fp16_optimizer_instance.state".
+    def _get_state(self):
+        return self.optimizer.state  # same object the wrapped optimizer holds, not a copy
+
+    def _set_state(self, value):
+        self.optimizer.state = value  # rebinds the attribute on the wrapped optimizer
+
+    state = property(_get_state, _set_state)
+
+    # Expose the wrapped optimizer's param_groups as "fp16_optimizer_instance.param_groups"
+    # (for example, to adjust the learning rate)
+    def _get_param_groups(self):
+        return self.optimizer.param_groups  # same list the wrapped optimizer holds, not a copy
+
+    def _set_param_groups(self, value):
+        self.optimizer.param_groups = value  # rebinds the attribute on the wrapped optimizer
+
+    param_groups = property(_get_param_groups, _set_param_groups)
+
diff --git a/apex/apex/fp16_utils/fp16util.py b/apex/apex/fp16_utils/fp16util.py
new file mode 100644
index 0000000000000000000000000000000000000000..dcdc3447a4cf70b7379de6c7fe4585175f94f0b8
--- /dev/null
+++ b/apex/apex/fp16_utils/fp16util.py
@@ -0,0 +1,187 @@
+import torch
+import torch.nn as nn
+from torch.autograd import Variable
+from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
+
+
+class tofp16(nn.Module):
+    """
+    Module whose forward pass casts its input to half precision::
+
+        def forward(self, input):
+            return input.half()
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, input):
+        return input.half()
+
+
+def BN_convert_float(module):
+    """
+    Legacy helper for network_to_half(): recursively casts affine batchnorm layers to float.
+    """
+    is_affine_bn = isinstance(module, torch.nn.modules.batchnorm._BatchNorm) and module.affine is True
+    if is_affine_bn:
+        module.float()
+    for submodule in module.children():
+        BN_convert_float(submodule)
+    # The module passed in is returned itself, so calls can be nested in expressions.
+    return module
+
+
+def network_to_half(network):
+    """
+    Convert model to half precision in a batchnorm-safe way (legacy; FP16Model is recommended).
+    """
+    input_cast = tofp16()
+    half_network = BN_convert_float(network.half())
+    return nn.Sequential(input_cast, half_network)
+
+
+def convert_module(module, dtype):
+    """
+    Cast the floating-point parameters, gradients and buffers owned directly by module to dtype.
+    """
+    for param in module.parameters(recurse=False):
+        if param is None:
+            continue
+        if param.data.dtype.is_floating_point:
+            param.data = param.data.to(dtype=dtype)
+        if param._grad is not None and param._grad.data.dtype.is_floating_point:
+            param._grad.data = param._grad.data.to(dtype=dtype)
+    for module_buffer in module.buffers(recurse=False):
+        if module_buffer is not None and module_buffer.data.dtype.is_floating_point:
+            module_buffer.data = module_buffer.data.to(dtype=dtype)
+
+
+def convert_network(network, dtype):
+    """
+    Cast every submodule of network to dtype, leaving affine batchnorm layers untouched.
+    """
+    for module in network.modules():
+        is_affine_bn = isinstance(module, torch.nn.modules.batchnorm._BatchNorm) and module.affine is True
+        if not is_affine_bn:
+            convert_module(module, dtype)
+            if isinstance(module, (torch.nn.RNNBase, torch.nn.modules.rnn.RNNBase)):
+                module.flatten_parameters()
+    return network
+
+
+class FP16Model(nn.Module):
+    """
+    Wrap a network converted to half precision (batchnorm-safe) and feed it half inputs.
+    """
+
+    def __init__(self, network):
+        super().__init__()
+        self.network = convert_network(network, dtype=torch.half)
+
+    def forward(self, *inputs):
+        half_inputs = [tensor.half() for tensor in inputs]
+        return self.network(*half_inputs)
+
+
+def backwards_debug_hook(grad):  # debug hook: always raises, whatever gradient it is given
+    raise RuntimeError("master_params recieved a gradient in the backward pass!")
+
+def prep_param_lists(model, flat_master=False):
+    """
+    Creates a list of FP32 master parameters for a given model, as in
+    `Training Neural Networks with Mixed Precision:  Real Examples`_.
+
+    Args:
+        model (torch.nn.Module): Existing Pytorch model
+        flat_master (bool, optional, default=False):  Flatten the master parameters into a single tensor, as a performance optimization.
+    Returns:
+        A tuple (``model_params``, ``master_params``). ``model_params`` is a list of the model's parameters that require grad, for later use with :func:`model_grads_to_master_grads` and :func:`master_params_to_model_params`.  ``master_params`` is a list of FP32 master parameters.  If ``flat_master=True``, ``master_params`` will be a list with one element.
+
+    Example::
+
+        model_params, master_params = prep_param_lists(model)
+
+    .. warning::
+        Currently, if ``flat_master=True``, all the model's parameters must be the same type.  If the model has parameters of different types, use ``flat_master=False``, or use :class:`FP16_Optimizer`.
+
+    .. _`Training Neural Networks with Mixed Precision:  Real Examples`:
+        http://on-demand.gputechconf.com/gtc/2018/video/S81012/
+    """
+    model_params = [param for param in model.parameters() if param.requires_grad]
+
+    if flat_master:
+        # Give the user some more useful error messages
+        try:
+            # flatten_dense_tensors returns a contiguous flat array.
+            # http://pytorch.org/docs/master/_modules/torch/_utils.html
+            master_params = _flatten_dense_tensors([param.data for param in model_params]).float()
+        except Exception:
+            print("Error in prep_param_lists:  model may contain a mixture of parameters "
+                      "of different types.  Use flat_master=False, or use FP16_Optimizer.")
+            raise
+        master_params = torch.nn.Parameter(master_params)
+        master_params.requires_grad = True
+        # master_params.register_hook(backwards_debug_hook)
+        if master_params.grad is None:
+            master_params.grad = master_params.new(*master_params.size())
+        return model_params, [master_params]
+    else:
+        master_params = [param.clone().float().detach() for param in model_params]
+        for param in master_params:
+            param.requires_grad = True
+        return model_params, master_params
+
+
+def model_grads_to_master_grads(model_params, master_params, flat_master=False):
+    """
+    Copy the gradients held by the model parameters into the master parameters' gradients.
+
+    Args:
+        model_params:  List of model parameters created by :func:`prep_param_lists`.
+        master_params:  List of FP32 master parameters created by :func:`prep_param_lists`.  If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`model_grads_to_master_grads`.
+    """
+    if flat_master:
+        # The flattening may incur one more deep copy than is necessary.
+        flat_model_grads = _flatten_dense_tensors([p.grad.data for p in model_params])
+        master_params[0].grad.data.copy_(flat_model_grads)
+        return
+    for model, master in zip(model_params, master_params):
+        if model.grad is None:
+            master.grad = None
+            continue
+        if master.grad is None:
+            master.grad = Variable(master.data.new(*master.data.size()))
+        master.grad.data.copy_(model.grad.data)
+
+
+def master_params_to_model_params(model_params, master_params, flat_master=False):
+    """
+    Copy the values of the master parameters into the model parameters.
+
+    Args:
+        model_params:  List of model parameters created by :func:`prep_param_lists`.
+        master_params:  List of FP32 master parameters created by :func:`prep_param_lists`.  If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`master_params_to_model_params`.
+    """
+    if flat_master:
+        # Split the single flat master tensor into pieces sized like the model params.
+        sources = _unflatten_dense_tensors(master_params[0].data, model_params)
+    else:
+        sources = (master.data for master in master_params)
+    for model, source in zip(model_params, sources):
+        model.data.copy_(source)
+
+# Backward compatibility fixes
+
+def to_python_float(t):
+    # Objects without .item() are indexed at position 0 instead.
+    if not hasattr(t, 'item'):
+        return t[0]
+    return t.item()
+
+TORCH_MAJOR = int(torch.__version__.split('.')[0])
+TORCH_MINOR = int(torch.__version__.split('.')[1])
+# Pick the clipping utility by version: 0.x up to 0.4 uses clip_grad_norm, otherwise clip_grad_norm_.
+clip_grad_norm = (
+    torch.nn.utils.clip_grad_norm if TORCH_MAJOR == 0 and TORCH_MINOR <= 4
+    else torch.nn.utils.clip_grad_norm_)
diff --git a/apex/apex/fp16_utils/loss_scaler.py b/apex/apex/fp16_utils/loss_scaler.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9f32fe01fb1a52f2436d10dc353fcf68256eb0c
--- /dev/null
+++ b/apex/apex/fp16_utils/loss_scaler.py
@@ -0,0 +1,186 @@
+import torch
+
+# item() is a recent addition, so this helps with backward compatibility.
+def to_python_float(t):
+    """Return ``t.item()`` when ``t`` has an ``item`` attribute, otherwise ``t[0]``."""
+    if hasattr(t, 'item'):
+        return t.item()
+    return t[0]
+
+class LossScaler:
+    """
+    Class that manages a static loss scale.  This class is intended to interact with
+    :class:`FP16_Optimizer`, and should not be directly manipulated by the user.
+
+    Use of :class:`LossScaler` is enabled via the ``static_loss_scale`` argument to
+    :class:`FP16_Optimizer`'s constructor.
+
+    Args:
+        scale (float, optional, default=1):  The loss scale.
+    """
+
+    def __init__(self, scale=1):
+        self.cur_scale = scale
+
+    # `params` is a list / generator of torch.Variable; a static scale never reports overflow
+    def has_overflow(self, params):
+        return False
+
+    @staticmethod  # `x` is a torch.Tensor; static so class-level and instance-level calls both work
+    def _has_inf_or_nan(x):
+        return False
+
+    def update_scale(self, overflow):
+        pass  # the scale is static, so there is nothing to adjust
+
+    @property
+    def loss_scale(self):
+        return self.cur_scale
+
+    def scale_gradient(self, module, grad_in, grad_out):
+        return tuple(self.loss_scale * g for g in grad_in)  # only grad_in is scaled
+
+    def backward(self, loss, retain_graph=False):
+        scaled_loss = loss*self.loss_scale
+        scaled_loss.backward(retain_graph=retain_graph)
+
+class DynamicLossScaler:
+    """
+    Class that manages dynamic loss scaling.  It is recommended to use :class:`DynamicLossScaler`
+    indirectly, by supplying ``dynamic_loss_scale=True`` to the constructor of
+    :class:`FP16_Optimizer`.  However, it's important to understand how :class:`DynamicLossScaler`
+    operates, because the default options can be changed using
+    the ``dynamic_loss_args`` argument to :class:`FP16_Optimizer`'s constructor.
+
+    Loss scaling is designed to combat the problem of underflowing gradients encountered at long
+    times when training fp16 networks.  Dynamic loss scaling begins by attempting a very high loss
+    scale.  Ironically, this may result in OVERflowing gradients.  If overflowing gradients are
+    encountered, :class:`DynamicLossScaler` informs :class:`FP16_Optimizer` that an overflow has
+    occurred.
+    :class:`FP16_Optimizer` then skips the update step for this particular iteration/minibatch,
+    and :class:`DynamicLossScaler` adjusts the loss scale to a lower value.
+    If a certain number of iterations occur without overflowing gradients detected,
+    :class:`DynamicLossScaler` increases the loss scale once more.
+    In this way :class:`DynamicLossScaler` attempts to "ride the edge" of
+    always using the highest loss scale possible without incurring overflow.
+
+    Args:
+        init_scale (float, optional, default=2**32):  Initial loss scale attempted by :class:`DynamicLossScaler.`
+        scale_factor (float, optional, default=2.0):  Factor used when adjusting the loss scale. If an overflow is encountered, the loss scale is readjusted to loss scale/``scale_factor``, but never below 1.  If ``scale_window`` consecutive iterations take place without an overflow, the loss scale is readjusted to loss_scale*``scale_factor``.
+        scale_window (int, optional, default=1000):  Number of consecutive iterations without an overflow to wait before increasing the loss scale.
+    """
+
+    def __init__(self,
+                 init_scale=2**32,
+                 scale_factor=2.,
+                 scale_window=1000):
+        self.cur_scale = init_scale
+        self.cur_iter = 0
+        self.last_overflow_iter = -1
+        self.scale_factor = scale_factor
+        self.scale_window = scale_window
+
+    # `params` is a list / generator of torch.Variable; returns True on the first bad gradient
+    def has_overflow(self, params):
+        for p in params:
+            if p.grad is not None and DynamicLossScaler._has_inf_or_nan(p.grad.data):
+                return True
+
+        return False
+
+    @staticmethod  # `x` is a torch.Tensor; static so class-level and instance-level calls both work
+    def _has_inf_or_nan(x):
+        try:
+            # if x is half, the .float() incurs an additional deep copy, but it's necessary if
+            # Pytorch's .sum() creates a one-element tensor of the same type as x
+            # (which is true for some recent version of pytorch).
+            cpu_sum = float(x.float().sum())
+            # More efficient version that can be used if .sum() returns a Python scalar
+            # cpu_sum = float(x.sum())
+        except RuntimeError as instance:
+            # Only an overflow during conversion counts as "has inf/nan"; any other
+            # RuntimeError must propagate.  str() is used so that an exception carrying
+            # no args, or a non-string first arg, cannot fail inside this handler.
+            if "value cannot be converted" not in str(instance):
+                raise
+            return True
+        else:
+            if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
+                return True
+            return False
+
+    # `overflow` is boolean indicating whether the gradient overflowed
+    def update_scale(self, overflow):
+        if overflow:
+            # Shrink the scale, but never let it drop below 1.
+            self.cur_scale = max(self.cur_scale/self.scale_factor, 1)
+            self.last_overflow_iter = self.cur_iter
+        else:
+            if (self.cur_iter - self.last_overflow_iter) % self.scale_window == 0:
+                self.cur_scale *= self.scale_factor
+        self.cur_iter += 1
+
+    @property
+    def loss_scale(self):
+        return self.cur_scale
+
+    def scale_gradient(self, module, grad_in, grad_out):
+        return tuple(self.loss_scale * g for g in grad_in)
+
+    def backward(self, loss, retain_graph=False):
+        scaled_loss = loss*self.loss_scale
+        scaled_loss.backward(retain_graph=retain_graph)
+        
+##############################################################        
+# Example usage below here -- assuming it's in a separate file
+##############################################################
+"""
+TO-DO separate out into an example.
+if __name__ == "__main__":
+    import torch
+    from torch.autograd import Variable
+    from dynamic_loss_scaler import DynamicLossScaler
+
+    # N is batch size; D_in is input dimension;
+    # H is hidden dimension; D_out is output dimension.
+    N, D_in, H, D_out = 64, 1000, 100, 10
+
+    # Create random Tensors to hold inputs and outputs, and wrap them in Variables.
+    x = Variable(torch.randn(N, D_in), requires_grad=False)
+    y = Variable(torch.randn(N, D_out), requires_grad=False)
+
+    w1 = Variable(torch.randn(D_in, H), requires_grad=True)
+    w2 = Variable(torch.randn(H, D_out), requires_grad=True)
+    parameters = [w1, w2]
+
+    learning_rate = 1e-6
+    optimizer = torch.optim.SGD(parameters, lr=learning_rate)
+    loss_scaler = DynamicLossScaler()
+
+    for t in range(500):
+        y_pred = x.mm(w1).clamp(min=0).mm(w2)
+        loss = (y_pred - y).pow(2).sum() * loss_scaler.loss_scale
+        print('Iter {} loss scale: {}'.format(t, loss_scaler.loss_scale))
+        print('Iter {} scaled loss: {}'.format(t, loss.data[0]))
+        print('Iter {} unscaled loss: {}'.format(t, loss.data[0] / loss_scaler.loss_scale))
+
+        # Run backprop
+        optimizer.zero_grad()
+        loss.backward()
+        
+        # Check for overflow
+        has_overflow = loss_scaler.has_overflow(parameters)
+        
+        # If no overflow, unscale grad and update as usual
+        if not has_overflow:
+            for param in parameters:
+                param.grad.data.mul_(1. / loss_scaler.loss_scale)
+            optimizer.step()
+        # Otherwise, don't do anything -- ie, skip iteration
+        else:
+            print('OVERFLOW!')
+
+        # Update loss scale for next iteration
+        loss_scaler.update_scale(has_overflow)
+
+"""
diff --git a/apex/apex/multi_tensor_apply/__init__.py b/apex/apex/multi_tensor_apply/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a80e3c545276c30c9c6c7ab74739c11ff1deeb2
--- /dev/null
+++ b/apex/apex/multi_tensor_apply/__init__.py
@@ -0,0 +1,4 @@
+from .multi_tensor_apply import MultiTensorApply
+
+multi_tensor_applier = MultiTensorApply(2048*32)  # module-level instance; 2048*32 is its chunk_size
+
diff --git a/apex/apex/multi_tensor_apply/multi_tensor_apply.py b/apex/apex/multi_tensor_apply/multi_tensor_apply.py
new file mode 100644
index 0000000000000000000000000000000000000000..346c6e50fb31ea28e16bdb7ef8115da2324c3359
--- /dev/null
+++ b/apex/apex/multi_tensor_apply/multi_tensor_apply.py
@@ -0,0 +1,30 @@
+import torch
+
+class MultiTensorApply(object):
+    available = False
+    warned = False
+
+    def __init__(self, chunk_size):
+        try:
+            import amp_C
+            MultiTensorApply.available = True
+            self.chunk_size = chunk_size
+        except ImportError as err:
+            MultiTensorApply.available = False
+            MultiTensorApply.import_err = err
+
+    def check_avail(self):
+        if MultiTensorApply.available == False:
+            raise RuntimeError(
+                "Attempted to call MultiTensorApply method, but MultiTensorApply "
+                "is not available, possibly because Apex was installed without "
+                "--cpp_ext --cuda_ext.  Original import error message:",
+                MultiTensorApply.import_err)
+
+    def __call__(self, op, noop_flag_buffer, tensor_lists, *args):
+        self.check_avail()
+
+        return op(self.chunk_size,
+                  noop_flag_buffer,
+                  tensor_lists,
+                  *args)
diff --git a/apex/apex/normalization/__init__.py b/apex/apex/normalization/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b798883b3c24964c54def1caea0021b8ecf6ef32
--- /dev/null
+++ b/apex/apex/normalization/__init__.py
@@ -0,0 +1 @@
+from .fused_layer_norm import FusedLayerNorm
diff --git a/apex/apex/normalization/fused_layer_norm.py b/apex/apex/normalization/fused_layer_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..3202112bed8f3a5aef342165684905ec5b2ae333
--- /dev/null
+++ b/apex/apex/normalization/fused_layer_norm.py
@@ -0,0 +1,160 @@
+import math
+import torch
+import numbers
+from torch.nn.parameter import Parameter
+from torch.nn import init
+from torch.nn import functional as F
+import importlib
+
+class FusedLayerNormAffineFunction(torch.autograd.Function):
+  def __init__(self, normalized_shape, eps=1e-6):
+    global fused_layer_norm_cuda
+    fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda")
+
+    self.normalized_shape = normalized_shape
+    self.eps = eps
+
+  def forward(self, input, weight, bias):
+    input_ = input.contiguous()
+    weight_ = weight.contiguous()
+    bias_ = bias.contiguous()
+    output, mean, invvar = fused_layer_norm_cuda.forward_affine(
+        input_, self.normalized_shape, weight_, bias_, self.eps)
+    self.save_for_backward(input_, weight_, bias_, mean, invvar)
+    return output
+
+  def backward(self, grad_output):
+    input_, weight_, bias_, mean, invvar = self.saved_tensors
+    grad_input = grad_weight = grad_bias = None
+    grad_input, grad_weight, grad_bias = fused_layer_norm_cuda.backward_affine(
+        grad_output.contiguous(), mean, invvar,
+        input_, self.normalized_shape, 
+        weight_, bias_, self.eps)
+    return grad_input, grad_weight, grad_bias;
+    
+class FusedLayerNormFunction(torch.autograd.Function):
+  def __init__(self, normalized_shape, eps=1e-6):
+    global fused_layer_norm_cuda
+    fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda")
+    self.normalized_shape = normalized_shape
+    self.eps = eps
+
+  def forward(self, input):
+    input_ = input.contiguous()
+    output, mean, invvar = fused_layer_norm_cuda.forward(
+        input_, self.normalized_shape, self.eps)
+    self.save_for_backward(input_, mean, invvar)
+    return output
+
+  def backward(self, grad_output):
+    input_, mean, invvar = self.saved_tensors
+    grad_input = None
+    grad_input = fused_layer_norm_cuda.backward(
+        grad_output.contiguous(), mean, invvar,
+        input_, self.normalized_shape,
+        self.eps)
+    return grad_input
+
+def fused_layer_norm_affine(input, normalized_shape, weight, bias, eps=1e-6):
+    return FusedLayerNormAffineFunction(normalized_shape,eps)(input, weight, bias)  # construct the Function, then call it
+
+def fused_layer_norm(input, normalized_shape, eps=1e-6):
+    return FusedLayerNormFunction(normalized_shape,eps)(input)  # construct the Function, then call it
+
+class FusedLayerNorm(torch.nn.Module):
+    r"""Applies Layer Normalization over a mini-batch of inputs as described in
+    the paper `Layer Normalization`_ .
+
+    Currently only runs on cuda() tensors.
+
+    .. math::
+        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta
+
+    The mean and standard-deviation are calculated separately over the last
+    certain number of dimensions, which have to be of the shape specified by
+    :attr:`normalized_shape`.
+    :math:`\gamma` and :math:`\beta` are learnable affine transform parameters of
+    :attr:`normalized_shape` if :attr:`elementwise_affine` is ``True``.
+
+    .. note::
+        Unlike Batch Normalization and Instance Normalization, which applies
+        scalar scale and bias for each entire channel/plane with the
+        :attr:`affine` option, Layer Normalization applies per-element scale and
+        bias with :attr:`elementwise_affine`.
+
+    This layer uses statistics computed from input data in both training and
+    evaluation modes.
+
+    Args:
+        normalized_shape (int or list or torch.Size): input shape from an expected input
+            of size
+
+            .. math::
+                [* \times \text{normalized}\_\text{shape}[0] \times \text{normalized}\_\text{shape}[1]
+                    \times \ldots \times \text{normalized}\_\text{shape}[-1]]
+
+            If a single integer is used, it is treated as a singleton list, and this module will
+            normalize over the last dimension which is expected to be of that specific size.
+        eps: a value added to the denominator for numerical stability. Default: 1e-5
+        elementwise_affine: a boolean value that when set to ``True``, this module
+            has learnable per-element affine parameters initialized to ones (for weights)
+            and zeros (for biases). Default: ``True``.
+
+    Shape:
+        - Input: :math:`(N, *)`
+        - Output: :math:`(N, *)` (same shape as input)
+
+    Examples::
+
+        >>> input = torch.randn(20, 5, 10, 10)
+        >>> # With Learnable Parameters
+        >>> m = apex.normalization.FusedLayerNorm(input.size()[1:])
+        >>> # Without Learnable Parameters
+        >>> m = apex.normalization.FusedLayerNorm(input.size()[1:], elementwise_affine=False)
+        >>> # Normalize over last two dimensions
+        >>> m = apex.normalization.FusedLayerNorm([10, 10])
+        >>> # Normalize over last dimension of size 10
+        >>> m = apex.normalization.FusedLayerNorm(10)
+        >>> # Activating the module
+        >>> output = m(input)
+
+    .. _`Layer Normalization`: https://arxiv.org/abs/1607.06450
+    """
+    def __init__(self, normalized_shape, eps=1e-5, elementwise_affine=True):
+        super(FusedLayerNorm, self).__init__()
+
+        global fused_layer_norm_cuda
+        fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda")
+
+        if isinstance(normalized_shape, numbers.Integral):
+            normalized_shape = (normalized_shape,)
+        self.normalized_shape = torch.Size(normalized_shape)
+        self.eps = eps
+        self.elementwise_affine = elementwise_affine
+        if self.elementwise_affine:
+            self.weight = Parameter(torch.Tensor(*normalized_shape))
+            self.bias = Parameter(torch.Tensor(*normalized_shape))
+        else:
+            self.register_parameter('weight', None)
+            self.register_parameter('bias', None)
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        if self.elementwise_affine:  # without affine there is nothing to initialize
+            init.ones_(self.weight)  # scale starts at one
+            init.zeros_(self.bias)  # shift starts at zero
+
+    def forward(self, input):
+        if not input.is_cuda:
+            return  F.layer_norm(
+                input, self.normalized_shape, self.weight, self.bias, self.eps)
+        if self.elementwise_affine:
+          return FusedLayerNormAffineFunction(self.normalized_shape,self.eps)(
+              input, self.weight, self.bias)
+        else:
+          return FusedLayerNormFunction(self.normalized_shape,self.eps)(
+              input)
+
+    def extra_repr(self):
+        return '{normalized_shape}, eps={eps}, ' \
+            'elementwise_affine={elementwise_affine}'.format(**self.__dict__)
diff --git a/apex/apex/optimizers/__init__.py b/apex/apex/optimizers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d3248f7c4f41c3a918d5e0bae0ccd387bcf42ae
--- /dev/null
+++ b/apex/apex/optimizers/__init__.py
@@ -0,0 +1,2 @@
+from .fused_adam import FusedAdam
+from .fp16_optimizer import FP16_Optimizer
diff --git a/apex/apex/optimizers/fp16_optimizer.py b/apex/apex/optimizers/fp16_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..de85b1a1d8173d791644d702da6d18555022a2dc
--- /dev/null
+++ b/apex/apex/optimizers/fp16_optimizer.py
@@ -0,0 +1,274 @@
+import torch
+from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
+
+class FP16_Optimizer(object):
+    """
+    :class:`FP16_Optimizer` A cutdown version of apex.fp16_utils.FP16_Optimizer.
+    Designed only to wrap apex.optimizers.FusedAdam.
+    Refer to apex.fp16_utils documents for more information.
+
+    Example::
+
+        model = torch.nn.Linear(D_in, D_out).cuda().half()
+        optimizer = apex.optimizers.FusedAdam(model.parameters())
+        # Name the FP16_Optimizer instance to replace the existing optimizer
+        # (recommended but not required):
+        optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0)
+        ...
+        # loss.backward() becomes:
+        optimizer.backward(loss)
+        ...
+
+    Example with dynamic loss scaling::
+
+        ...
+        optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
+                                   # optional arg to control dynamic loss scaling behavior
+                                   # dynamic_loss_args={'scale_window' : 500})
+                                   # Usually, dynamic_loss_args is not necessary.
+    """
+
+    def __init__(self,
+                 init_optimizer,
+                 static_loss_scale=1.0,
+                 dynamic_loss_scale=False,
+                 dynamic_loss_args=None,
+                 verbose=True):
+
+        # The fused optimizer does all the work. We need this layer for two reason:
+        # 1. maintain same user API from apex.fp16_utils
+        # 2. keep common stuff here in case we need to add new fused optimizer later
+
+        # differences from apex.fp16_utils:
+        # - assume all model params in fp16
+        # - assume all params requires grad
+        # - flat by groups, not keeping state. TODO: remove state explicitly?
+        # - master gard and unflat master weight never exist. TODO: a way to save out unflat master?
+        if not torch.cuda.is_available:
+            raise SystemError("Cannot use fp16 without CUDA.")
+        self.optimizer = init_optimizer
+
+        # param flattened by groups
+        self.fp16_groups = []
+        self.fp16_groups_flat = []
+        self.fp32_groups_flat = []
+
+        # loop to deal with groups
+        for i, param_group in enumerate(self.optimizer.param_groups):
+            # push this group to list before modify
+            self.fp16_groups.append(param_group['params'])
+            # init fp16 weight buffer, flattened
+            self.fp16_groups_flat.append(_flatten_dense_tensors([p.clone().detach() for p in self.fp16_groups[i]]))
+            # set model fp16 weight to slices of flattened buffer
+            updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i], self.fp16_groups[i])
+            for p,q in zip(self.fp16_groups[i], updated_params):
+                p.data = q.data
+            # init master weight, flattened
+            self.fp32_groups_flat.append(self.fp16_groups_flat[i].clone().float().detach())
+            # modify optimizer of have flat master weight
+            self.fp32_groups_flat[i].requires_grad = True # keep this in case internal optimizer uses it
+            param_group['params'] = [self.fp32_groups_flat[i]]
+
+        # we may have a way of fusing dynamic scale. Do not support for now
+        if dynamic_loss_scale:
+            if dynamic_loss_args is not None:
+                raise SystemError("Do not support dynamic loss scale args for now.")
+            self.dynamic_loss_scale = True
+            self.cur_scale = 2**16
+            self.cur_iter = 0
+            self.last_overflow_iter = -1
+            self.scale_factor = 2
+            self.scale_window = 1000
+        else:
+            self.dynamic_loss_scale = False
+            self.cur_iter = 0
+            self.cur_scale = static_loss_scale
+        self.verbose = verbose
+
+    def zero_grad(self, set_grads_to_None=True):
+        """
+        Zero FP16 parameter grads.
+        """
+        # FP32 grad should never exist.
+        # For speed, set model fp16 grad to None by default
+        for group in self.fp16_groups:
+            for p in group:
+                if set_grads_to_None:
+                    p.grad = None
+                else:
+                    if p.grad is not None:
+                        p.grad.detach_()
+                        p.grad.zero_()
+
+    def _compute_grad_norm(self, fp16_grads_flat, norm_type=2):
+        """
+        Compute fp16 grad norm for later clipping(fused with update).
+        Internal accumulated in fp32.
+        Also fused in NaN check. Possibly other reduction needed for grad.
+
+        Args:
+            fp16_grads_flat (tensor): fp16 grad flattened
+            norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
+                infinity norm.
+
+        Returns:
+            Total norm of the current fp16 gradients (viewed as a single vector).
+            Returns -1 if the most recently computed fp16 gradients overflowed
+        """
+        # TODO: Not most efficient with copy to cpu and sync
+        # only support 2-norm now
+        # for torch version <= 1.0.1, torch.norm with dtype will fail and fall back to cast
+        try:
+            norm = float(torch.norm(fp16_grads_flat, 2.0, dtype=torch.float32))
+        except TypeError as err:
+            norm = float(torch.norm(fp16_grads_flat.float(), 2.0))
+        if norm == float('inf') or norm == -float('inf') or norm != norm:
+            return -1
+        else:
+            return norm
+
+    def step(self, closure=None):
+        """
+        Not supporting closure.
+        """
+        # First compute norm for all group so we know if there is overflow
+        grads_groups_flat = []
+        norm_groups = []
+        skip = False
+        for i, group in enumerate(self.fp16_groups):
+            grads_groups_flat.append(_flatten_dense_tensors([p.grad for p in group]))
+            norm_groups.append(self._compute_grad_norm(grads_groups_flat[i]))
+            if norm_groups[i] == -1: #TODO: early break
+                skip = True
+
+        if skip:
+            self._update_scale(skip)
+            return
+
+        # norm is in fact norm*cur_scale
+        self.optimizer.step(grads=[[g] for g in grads_groups_flat],
+                            output_params=[[p] for p in self.fp16_groups_flat],
+                            scale=self.cur_scale,
+                            grad_norms=norm_groups)
+
+        # TODO: we probably don't need this? just to be safe
+        for i in range(len(norm_groups)):
+            updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i], self.fp16_groups[i])
+            for p,q in zip(self.fp16_groups[i], updated_params):
+                p.data = q.data
+
+        self._update_scale(False)
+        return
+
+    def backward(self, loss):
+        """
+        :attr:`backward` performs the following steps:
+
+        1. fp32_loss = loss.float()
+        2. scaled_loss = fp32_loss*loss_scale
+        3. scaled_loss.backward(), which accumulates scaled gradients into the ``.grad`` attributes of the model's fp16 leaves
+        """
+        scaled_loss = (loss.float()) * self.cur_scale
+        scaled_loss.backward()
+
+    def _update_scale(self, skip):
+        if self.dynamic_loss_scale:
+            if skip:
+                if self.verbose:
+                    print("\nGrad overflow on iteration", self.cur_iter)
+                    print("Using dynamic loss scale of", self.cur_scale)
+                self.cur_scale = max(self.cur_scale/self.scale_factor, 1)
+                self.last_overflow_iter = self.cur_iter
+            else:
+                if (self.cur_iter - self.last_overflow_iter) % self.scale_window == 0:
+                    self.cur_scale *= self.scale_factor
+        else:
+            if skip:
+                print("\nGrad overflow on iteration", self.cur_iter)
+                print("Using static loss scale of", self.cur_scale)
+        self.cur_iter +=1
+        return
+
+    # Promote state so it can be retrieved or set via "fp16_optimizer_instance.state"
+    def _get_state(self):
+        return self.optimizer.state  # read straight from the wrapped optimizer
+
+    def _set_state(self, value):
+        self.optimizer.state = value  # written straight to the wrapped optimizer
+
+    state = property(_get_state, _set_state)
+
+    # Promote param_groups so it can be retrieved or set via "fp16_optimizer_instance.param_groups"
+    # (for example, to adjust the learning rate)
+    def _get_param_groups(self):
+        return self.optimizer.param_groups  # read straight from the wrapped optimizer
+
+    def _set_param_groups(self, value):
+        self.optimizer.param_groups = value  # written straight to the wrapped optimizer
+
+    param_groups = property(_get_param_groups, _set_param_groups)
+
+    def state_dict(self):
+        """
+        Returns a dict containing the current state of this :class:`FP16_Optimizer` instance.
+        This dict contains attributes of :class:`FP16_Optimizer`, as well as the state_dict
+        of the contained Pytorch optimizer.
+        Example::
+            checkpoint = {}
+            checkpoint['model'] = model.state_dict()
+            checkpoint['optimizer'] = optimizer.state_dict()
+            torch.save(checkpoint, "saved.pth")
+        """
+        state_dict = {}
+        state_dict['dynamic_loss_scale'] = self.dynamic_loss_scale
+        state_dict['cur_scale'] = self.cur_scale
+        state_dict['cur_iter'] = self.cur_iter
+        if state_dict['dynamic_loss_scale']:
+            state_dict['last_overflow_iter'] = self.last_overflow_iter
+            state_dict['scale_factor'] = self.scale_factor
+            state_dict['scale_window'] = self.scale_window
+        state_dict['optimizer_state_dict'] = self.optimizer.state_dict()
+        state_dict['fp32_groups_flat'] = self.fp32_groups_flat
+        return state_dict
+
+    def load_state_dict(self, state_dict):
+        """
+        Loads a state_dict created by an earlier call to state_dict().
+        If ``fp16_optimizer_instance`` was constructed from some ``init_optimizer``,
+        whose parameters in turn came from ``model``, it is expected that the user
+        will call ``model.load_state_dict()`` before
+        ``fp16_optimizer_instance.load_state_dict()`` is called.
+        Example::
+            model = torch.nn.Linear(D_in, D_out).cuda().half()
+            optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
+            optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0)
+            ...
+            checkpoint = torch.load("saved.pth")
+            model.load_state_dict(checkpoint['model'])
+            optimizer.load_state_dict(checkpoint['optimizer'])
+        """
+        # I think it should actually be ok to reload the optimizer before the model.
+        self.dynamic_loss_scale = state_dict['dynamic_loss_scale']
+        self.cur_scale = state_dict['cur_scale']
+        self.cur_iter = state_dict['cur_iter']
+        if state_dict['dynamic_loss_scale']:
+            self.last_overflow_iter = state_dict['last_overflow_iter']
+            self.scale_factor = state_dict['scale_factor']
+            self.scale_window = state_dict['scale_window']
+        self.optimizer.load_state_dict(state_dict['optimizer_state_dict'])
+        # At this point, the optimizer's references to the model's fp32 parameters are up to date.
+        # The optimizer's hyperparameters and internal buffers are also up to date.
+        # However, the fp32 master copies of the model's fp16 params stored by the optimizer are still
+        # out of date.  There are two options.
+        # 1:  Refresh the master params from the model's fp16 params.
+        # This requires less storage but incurs precision loss.
+        # 2:  Save and restore the fp32 master copies separately.
+        # We choose option 2.
+        #
+        # Pytorch Optimizer.load_state_dict casts saved buffers (e.g. momentum) to the type and device
+        # of their associated parameters, because it's possible those buffers might not exist yet in
+        # the current optimizer instance.  In our case, as long as the current FP16_Optimizer has been
+        # constructed in the same way as the one whose state_dict we are loading, the same master params
+        # are guaranteed to exist, so we can just copy_() from the saved master params.
+        for current, saved in zip(self.fp32_groups_flat, state_dict['fp32_groups_flat']):
+            current.data.copy_(saved.data)
diff --git a/apex/apex/optimizers/fused_adam.py b/apex/apex/optimizers/fused_adam.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ebcdd92362ee5bdd040b848268c382fa7c2b92e
--- /dev/null
+++ b/apex/apex/optimizers/fused_adam.py
@@ -0,0 +1,147 @@
+import types
+import torch
+import importlib
+
+class FusedAdam(torch.optim.Optimizer):
+
+    """Implements Adam algorithm. Currently GPU-only.  Requires Apex to be installed via
+    ``python setup.py install --cuda_ext --cpp_ext``.
+
+    It has been proposed in `Adam: A Method for Stochastic Optimization`_.
+
+    Arguments:
+        params (iterable): iterable of parameters to optimize or dicts defining
+            parameter groups.
+        lr (float, optional): learning rate. (default: 1e-3)
+        betas (Tuple[float, float], optional): coefficients used for computing
+            running averages of gradient and its square. (default: (0.9, 0.999))
+        eps (float, optional): term added to the denominator to improve
+            numerical stability. (default: 1e-8)
+        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
+        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
+            algorithm from the paper `On the Convergence of Adam and Beyond`_
+            (default: False) NOT SUPPORTED in FusedAdam!
+        eps_inside_sqrt (boolean, optional): in the 'update parameters' step,
+            adds eps to the bias-corrected second moment estimate before
+            evaluating square root instead of adding it to the square root of
+            second moment estimate as in the original paper. (default: False)
+
+    .. _Adam\: A Method for Stochastic Optimization:
+        https://arxiv.org/abs/1412.6980
+    .. _On the Convergence of Adam and Beyond:
+        https://openreview.net/forum?id=ryQu7f-RZ
+    """
+
+    def __init__(self, params,
+                 lr=1e-3, bias_correction = True,
+                 betas=(0.9, 0.999), eps=1e-8, eps_inside_sqrt = False,
+                 weight_decay=0., max_grad_norm=0., amsgrad=False):
+        global fused_adam_cuda
+        fused_adam_cuda = importlib.import_module("fused_adam_cuda")
+
+        if amsgrad:
+            raise RuntimeError('FusedAdam does not support the AMSGrad variant.')
+        defaults = dict(lr=lr, bias_correction=bias_correction,
+                        betas=betas, eps=eps, weight_decay=weight_decay,
+                        max_grad_norm=max_grad_norm)
+        super(FusedAdam, self).__init__(params, defaults)
+        self.eps_mode = 0 if  eps_inside_sqrt else 1
+
+    def step(self, closure=None, grads=None, output_params=None, scale=1., grad_norms=None):
+        """Performs a single optimization step.
+
+        Arguments:
+            closure (callable, optional): A closure that reevaluates the model
+                and returns the loss.
+            grads (list of tensors, optional): weight gradient to use for the
+                optimizer update. If gradients have type torch.half, parameters
+                are expected to be in type torch.float. (default: None)
+            output params (list of tensors, optional): A reduced precision copy
+                of the updated weights written out in addition to the regular
+                updated weights. Have to be of same type as gradients. (default: None)
+            scale (float, optional): factor to divide gradient tensor values
+                by before applying to weights. (default: 1)
+        """
+        loss = None
+        if closure is not None:
+            loss = closure()
+
+        if grads is None:
+            grads_group = [None]*len(self.param_groups)
+        # backward compatibility
+        # assuming a list/generator of parameter means single group
+        elif isinstance(grads, types.GeneratorType):
+            grads_group = [grads]
+        elif type(grads[0])!=list:
+            grads_group = [grads]
+        else:
+            grads_group = grads
+
+        if output_params is None:
+            output_params_group = [None]*len(self.param_groups)
+        elif isinstance(output_params, types.GeneratorType):
+            output_params_group = [output_params]
+        elif type(output_params[0])!=list:
+            output_params_group = [output_params]
+        else:
+            output_params_group = output_params
+
+        if grad_norms is None:
+            grad_norms = [None]*len(self.param_groups)
+
+        for group, grads_this_group, output_params_this_group, grad_norm in zip(self.param_groups, grads_group, output_params_group, grad_norms):
+            if grads_this_group is None:
+               grads_this_group = [None]*len(group['params'])
+            if output_params_this_group is None:
+               output_params_this_group = [None]*len(group['params'])
+
+            # compute combined scale factor for this group
+            combined_scale = scale
+            if group['max_grad_norm'] > 0:
+                # norm is in fact norm*scale
+                clip = ((grad_norm / scale) + 1e-6) / group['max_grad_norm']
+                if clip > 1:
+                    combined_scale = clip * scale
+
+            bias_correction = 1 if group['bias_correction'] else 0
+
+            for p, grad, output_param in zip(group['params'], grads_this_group, output_params_this_group):
+                #note: p.grad should not ever be set for correct operation of mixed precision optimizer that sometimes sends None gradients
+                if p.grad is None and grad is None:
+                    continue
+                if grad is None:
+                    grad = p.grad.data
+                if grad.is_sparse:
+                    raise RuntimeError('FusedAdam does not support sparse gradients, please consider SparseAdam instead')
+
+                state = self.state[p]
+
+                # State initialization
+                if len(state) == 0:
+                    state['step'] = 0
+                    # Exponential moving average of gradient values
+                    state['exp_avg'] = torch.zeros_like(p.data)
+                    # Exponential moving average of squared gradient values
+                    state['exp_avg_sq'] = torch.zeros_like(p.data)
+
+                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
+                beta1, beta2 = group['betas']
+
+                state['step'] += 1
+
+                out_p = torch.tensor([], dtype = torch.float) if output_param is None else output_param
+                fused_adam_cuda.adam(p.data,
+                                     out_p,
+                                     exp_avg,
+                                     exp_avg_sq,
+                                     grad,
+                                     group['lr'],
+                                     beta1,
+                                     beta2,
+                                     group['eps'],
+                                     combined_scale,
+                                     state['step'],
+                                     self.eps_mode,
+                                     bias_correction,
+                                     group['weight_decay'])
+        return loss
diff --git a/apex/apex/parallel/LARC.py b/apex/apex/parallel/LARC.py
new file mode 100644
index 0000000000000000000000000000000000000000..4031eac70c49bae978e1bb5d0a8f8a4abaa83a2f
--- /dev/null
+++ b/apex/apex/parallel/LARC.py
@@ -0,0 +1,97 @@
+import torch
+from torch import nn
+from torch.autograd import Variable
+from torch.nn.parameter import Parameter
+
+class LARC(object):
+    """
+    :class:`LARC` is a pytorch implementation of both the scaling and clipping variants of LARC,
+    in which the ratio between gradient and parameter magnitudes is used to calculate an adaptive 
+    local learning rate for each individual parameter. The algorithm is designed to improve
+    convergence of large batch training.
+     
+    See https://arxiv.org/abs/1708.03888 for calculation of the local learning rate.
+
+    In practice it modifies the gradients of parameters as a proxy for modifying the learning rate
+    of the parameters. This design allows it to be used as a wrapper around any torch.optim Optimizer.
+
+    ```
+    model = ...
+    optim = torch.optim.Adam(model.parameters(), lr=...)
+    optim = LARC(optim)
+    ```
+
+    It can even be used in conjunction with apex.fp16_utils.FP16_optimizer.
+
+    ```
+    model = ...
+    optim = torch.optim.Adam(model.parameters(), lr=...)
+    optim = LARC(optim)
+    optim = apex.fp16_utils.FP16_Optimizer(optim)
+    ```
+
+    Args:
+        optimizer: Pytorch optimizer to wrap and modify learning rate for.
+        trust_coefficient: Trust coefficient for calculating the lr. See https://arxiv.org/abs/1708.03888
+        clip: Decides between clipping or scaling mode of LARC. If `clip=True` the learning rate is set to `min(optimizer_lr, local_lr)` for each parameter. If `clip=False` the learning rate is set to `local_lr*optimizer_lr`.
+        eps: epsilon kludge to help with numerical stability while calculating adaptive_lr
+    """
+
+    def __init__(self, optimizer, trust_coefficient=0.02, clip=True, eps=1e-8):
+        self.param_groups = optimizer.param_groups
+        self.optim = optimizer
+        self.trust_coefficient = trust_coefficient
+        self.eps = eps
+        self.clip = clip
+
+    def __getstate__(self):
+        return self.optim.__getstate__()
+
+    def __setstate__(self, state):
+        self.optim.__setstate__(state)
+
+    def __repr__(self):
+        return self.optim.__repr__()
+
+    def state_dict(self):
+        return self.optim.state_dict()
+
+    def load_state_dict(self, state_dict):
+        self.optim.load_state_dict(state_dict)
+
+    def zero_grad(self):
+        self.optim.zero_grad()
+
+    def add_param_group(self, param_group):
+        self.optim.add_param_group( param_group)
+
+    def step(self):
+        with torch.no_grad():
+            weight_decays = []
+            for group in self.optim.param_groups:
+                # absorb weight decay control from optimizer
+                weight_decay = group['weight_decay'] if 'weight_decay' in group else 0
+                weight_decays.append(weight_decay)
+                group['weight_decay'] = 0
+                for p in group['params']:
+                    if p.grad is None:
+                        continue
+                    param_norm = torch.norm(p.data)
+                    grad_norm = torch.norm(p.grad.data)
+
+                    if param_norm != 0 and grad_norm != 0:
+                        # calculate adaptive lr + weight decay
+                        adaptive_lr = self.trust_coefficient * (param_norm) / (grad_norm + param_norm * weight_decay + self.eps)
+
+                        # clip learning rate for LARC
+                        if self.clip:
+                            # calculation of adaptive_lr so that when multiplied by lr it equals `min(adaptive_lr, lr)`
+                            adaptive_lr = min(adaptive_lr/group['lr'], 1)
+
+                        p.grad.data += weight_decay * p.data
+                        p.grad.data *= adaptive_lr
+
+        self.optim.step()
+        # return weight decay control to optimizer
+        for i, group in enumerate(self.optim.param_groups):
+            group['weight_decay'] = weight_decays[i]
diff --git a/apex/apex/parallel/README.md b/apex/apex/parallel/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e7910d82f3954969b8abe4e8120142f1c579a22f
--- /dev/null
+++ b/apex/apex/parallel/README.md
@@ -0,0 +1,66 @@
+## Distributed Data Parallel
+
+distributed.py contains the source code for `apex.parallel.DistributedDataParallel`, a module wrapper that enables multi-process multi-GPU data parallel training optimized for NVIDIA's NCCL communication library.
+
+`apex.parallel.DistributedDataParallel` achieves high performance by overlapping communication with
+computation in the backward pass and bucketing smaller transfers to reduce the total number of
+transfers required.
+
+multiproc.py contains the source code for `apex.parallel.multiproc`, a launch utility that places one process on each of the node's available GPUs.
+
+#### [API Documentation](https://nvidia.github.io/apex/parallel.html)
+
+#### [Example/Walkthrough](https://github.com/NVIDIA/apex/tree/master/examples/distributed)
+
+#### [Imagenet example with Mixed Precision](https://github.com/NVIDIA/apex/tree/master/examples/imagenet)
+
+#### [Simple example with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/FP16_Optimizer_simple/distributed_apex)
+
+### Synchronized Batch Normalization
+
+`apex.parallel.SyncBatchNorm` has an API similar to `torch.nn.BatchNorm*N*d`.
+It reduces stats on the first (channel) dimension of the Tensor and accepts
+arbitrary spatial dimensions.
+
+#### Installation
+
+Apex provides two sync BN implementations:
+
+1. There is the Python-only implementation, which is the default implementation
+when installing with `python setup.py install`.
+It uses PyTorch primitive operations and the distributed communication package
+`torch.distributed`.
+
+   - _The Python-only implementation requires the input tensor to be of the same
+data type as the layer_
+
+2. We also provide an implementation with kernels through a CUDA/C++ extension with
+improved performance. We are experimenting with Welford and Kahan for reduction
+hoping to get better accuracy.
+   To use the kernel implementation, users need to install Apex with the CUDA extension
+enabled: `python setup.py install --cuda_ext`.
+
+   - _The custom kernel implementation supports fp16 input with an fp32 layer, as cudnn does.
+This is required to run the imagenet example in fp16._
+
+   - _Currently the kernel implementation only supports GPU._
+
+#### HowTo
+
+1. Users can use `apex.parallel.SyncBatchNorm` by building their module with
+the layer explicitly.
+
+```
+import apex
+input_t = torch.randn(3, 5, 20).cuda()
+sbn = apex.parallel.SyncBatchNorm(5).cuda()
+output_t = sbn(input_t)
+```
+
+2. Users can also take a constructed `torch.nn.Module` and replace all its `torch.nn.BatchNorm*N*d` modules with `apex.parallel.SyncBatchNorm` through the utility function `apex.parallel.convert_syncbn_model`.
+
+```
+# model is an instance of torch.nn.Module
+import apex
+sync_bn_model = apex.parallel.convert_syncbn_model(model)
+```
diff --git a/apex/apex/parallel/__init__.py b/apex/apex/parallel/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..610c3d8881bd54e0bb7f7bfba6b6ab2aad24ab54
--- /dev/null
+++ b/apex/apex/parallel/__init__.py
@@ -0,0 +1,92 @@
+import torch
+
+if hasattr(torch.distributed, 'ReduceOp'):
+    ReduceOp = torch.distributed.ReduceOp
+elif hasattr(torch.distributed, 'reduce_op'):
+    ReduceOp = torch.distributed.reduce_op
+else:
+    ReduceOp = torch.distributed.deprecated.reduce_op
+
+from .distributed import DistributedDataParallel, Reducer
+# This is tricky because I'd like SyncBatchNorm to be exposed the same way
+# for both the cuda-enabled and python-fallback versions, and I don't want
+# to suppress the error information.
+try:
+    import syncbn
+    from .optimized_sync_batchnorm import SyncBatchNorm
+except ImportError as err:
+    from .sync_batchnorm import SyncBatchNorm
+    SyncBatchNorm.syncbn_import_error = err
+
+def convert_syncbn_model(module, process_group=None, channel_last=False):
+    '''
+    Recursively traverse module and its children to replace all instances of
+    ``torch.nn.modules.batchnorm._BatchNorm`` with :class:`apex.parallel.SyncBatchNorm`.
+
+    All ``torch.nn.BatchNorm*N*d`` wrap around ``torch.nn.modules.batchnorm._BatchNorm``,
+    so this function lets you easily switch to use sync BN.
+
+    Args:
+        module (torch.nn.Module): input module
+        process_group (optional): passed on to every :class:`SyncBatchNorm` that is created
+        channel_last (bool, default=False): passed on to every :class:`SyncBatchNorm` that is created
+
+    Example::
+
+        >>> # model is an instance of torch.nn.Module
+        >>> import apex
+        >>> sync_bn_model = apex.parallel.convert_syncbn_model(model)
+    '''
+    mod = module
+    if isinstance(module, torch.nn.modules.batchnorm._BatchNorm):
+        mod = SyncBatchNorm(module.num_features, module.eps, module.momentum, module.affine, module.track_running_stats, process_group, channel_last=channel_last)
+        mod.running_mean = module.running_mean
+        mod.running_var = module.running_var
+        if module.affine:
+            mod.weight.data = module.weight.data.clone().detach()
+            mod.bias.data = module.bias.data.clone().detach()
+    for name, child in module.named_children():
+        mod.add_module(name, convert_syncbn_model(child,
+                                                  process_group=process_group,
+                                                  channel_last=channel_last))
+    # A batchnorm input yields a new SyncBatchNorm; any other module is returned itself, children replaced in place.
+    return mod
+
+def create_syncbn_process_group(group_size):
+    '''
+    Creates process groups to be used for syncbn of a given ``group_size`` and returns
+    the process group that the current GPU participates in.
+
+    ``group_size`` must divide the total number of GPUs (world_size).
+
+    ``group_size`` of 0 would be considered as =world_size. In this case ``None`` will be returned.
+
+    ``group_size`` of 1 would be equivalent to using non-sync bn, but will still carry the overhead.
+
+    Args:
+        group_size (int): number of GPUs to collaborate for sync bn
+
+    Example::
+
+        >>> # group_size must evenly divide the world size
+        >>> import apex
+        >>> group = apex.parallel.create_syncbn_process_group(group_size)
+    '''
+
+    if group_size==0:
+        return None
+
+    world_size = torch.distributed.get_world_size()
+    assert(world_size >= group_size)
+    assert(world_size % group_size == 0)
+
+    group=None
+    for group_num in (range(world_size//group_size)):
+        group_ids = range(group_num*group_size, (group_num+1)*group_size)
+        cur_group = torch.distributed.new_group(ranks=group_ids)
+        if (torch.distributed.get_rank()//group_size == group_num):
+            group = cur_group
+            # cannot break out and return here: every process must go through creation of all subgroups
+
+    assert(group is not None)
+    return group
diff --git a/apex/apex/parallel/distributed.py b/apex/apex/parallel/distributed.py
new file mode 100644
index 0000000000000000000000000000000000000000..78d0a50eae7dda8612ed6d2e71344bb4d0d99d05
--- /dev/null
+++ b/apex/apex/parallel/distributed.py
@@ -0,0 +1,512 @@
+import torch
+import torch.distributed as dist
+from torch.nn.modules import Module
+from torch.autograd import Variable
+from collections import OrderedDict
+from itertools import chain
+import copy
+import importlib
+from ..multi_tensor_apply import multi_tensor_applier
+
+imported_flatten_impl = False
+
+def import_flatten_impl():
+    global flatten_impl, unflatten_impl, imported_flatten_impl
+    try:
+        import apex_C
+        flatten_impl = apex_C.flatten
+        unflatten_impl = apex_C.unflatten
+    except ImportError:
+        print("Warning:  apex was installed without --cpp_ext.  Falling back to Python flatten and unflatten.")
+        flatten_impl = torch._utils._flatten_dense_tensors
+        unflatten_impl = torch._utils._unflatten_dense_tensors
+    imported_flatten_impl = True
+
+def flatten(bucket):
+    if not imported_flatten_impl:
+        import_flatten_impl()
+    return flatten_impl(bucket)
+
+def unflatten(coalesced, bucket):
+    if not imported_flatten_impl:
+        import_flatten_impl()
+    return unflatten_impl(coalesced, bucket)
+
+# apply_flat_dist_call requires that tensors in 'bucket' are all the same type.
+def apply_flat_dist_call(bucket, call, extra_args=None):
+    """Flatten ``bucket``, run ``call`` on the flat tensor, then copy the result back.
+
+    ``extra_args``, when given, follow the flat tensor in the call.  If ``call`` is
+    ``dist.all_reduce``, the flat tensor is divided by the world size before the copy.
+    """
+    flat = flatten(bucket)
+    call(flat, *(() if extra_args is None else extra_args))
+
+    if call is dist.all_reduce:
+        flat /= dist.get_world_size()
+
+    for dst, src in zip(bucket, unflatten(flat, bucket)):
+        dst.copy_(src)
+
+def split_half_float_double(tensors):
+    """Bucket CUDA tensors as [half, float, double], leaving out empty buckets.
+
+    Tensors whose ``type()`` is none of those three do not appear in the result.
+    """
+    type_names = ("torch.cuda.HalfTensor", "torch.cuda.FloatTensor", "torch.cuda.DoubleTensor")
+    per_type = ([t for t in tensors if t.type() == name] for name in type_names)
+    return [group for group in per_type if group]
+
+def split_by_type(tensors):
+    """Return an ``OrderedDict`` mapping each ``type()`` string to the list of tensors
+    of that type, with keys in first-seen order."""
+
+    grouped = OrderedDict()
+    for t in tensors:
+        grouped.setdefault(t.type(), []).append(t)
+    return grouped
+
+# flat_dist_call organizes 'tensors' by type.
+def flat_dist_call(tensors, call, extra_args=None):
+    """Apply ``call`` (plus ``extra_args``) to ``tensors``, one flattened bucket per
+    tensor type; grouping is done by ``split_by_type``."""
+
+    for same_type in split_by_type(tensors).values():
+        apply_flat_dist_call(same_type, call, extra_args)
+
+            
+def extract_tensors(maybe_tensor, tensor_list):  # depth-first: append every nested tensor to tensor_list
+    if torch.is_tensor(maybe_tensor):
+        tensor_list.append(maybe_tensor)
+    elif not isinstance(maybe_tensor, str):  # a str iterates to 1-char strs forever: skip it
+        try:
+            for element in maybe_tensor:
+                extract_tensors(element, tensor_list)
+        except TypeError:
+            pass  # not iterable: a non-tensor leaf, nothing to collect
+
+        
+class Reducer(object):
+    """
+    :class:`apex.parallel.Reducer` is a simple class that helps allreduce a module's parameters
+    across processes.  :class:`Reducer` is intended to give the user additional control:
+    Unlike :class:`DistributedDataParallel`, :class:`Reducer` will not automatically allreduce
+    parameters during ``backward()``.
+    Instead, :class:`Reducer` waits for the user to call ``<reducer_instance>.reduce()`` manually.
+    This enables, for example, delaying the allreduce to be carried out every 
+    several iterations instead of every single iteration.
+
+    Like :class:`DistributedDataParallel`, :class:`Reducer` averages any tensors it allreduces 
+    over the number of participating processes.
+
+    :class:`Reducer` is designed to work with the upstream launch utility script 
+    ``torch.distributed.launch`` with ``--nproc_per_node <= number of gpus per node``.
+    When used with this launcher, :class:`Reducer` assumes 1:1 mapping of processes to GPUs.
+    It also assumes that your script calls ``torch.cuda.set_device(args.rank)`` before creating the model.
+
+    Args:
+        module_or_grads_list: Either a network definition (module) being run in multi-gpu/distributed mode, or an iterable of gradients to be reduced.  If a module is passed in, the Reducer constructor will sync the parameters across processes (broadcasting from rank 0) to make sure they're all initialized with the same values.  If a list of gradients (that came from some module) is passed in, the user is responsible for manually syncing that module's parameters at the beginning of training.
+    """
+    
+    def __init__(self, module_or_grads_list):
+        if isinstance(module_or_grads_list, Module):
+            self.module = module_or_grads_list
+            flat_dist_call([param.data for param in self.module.parameters()], dist.broadcast, (0,) )
+
+        else:
+            self.module = None
+            self.grads = []
+            extract_tensors(module_or_grads_list, self.grads)
+            
+    def reduce(self):
+        """Allreduce and average the module's existing grads, or the stored grads list."""
+        # Test `is None`, not truthiness: a container module with len() == 0 is falsy.
+        grads = self.grads if self.module is None else [
+            param.grad.data for param in self.module.parameters() if param.grad is not None]
+        flat_dist_call(grads, dist.all_reduce)
+            
+            
+class DistributedDataParallel(Module):
+    """
+    :class:`apex.parallel.DistributedDataParallel` is a module wrapper that enables
+    easy multiprocess distributed data parallel training, similar to ``torch.nn.parallel.DistributedDataParallel``.  Parameters are broadcast across participating processes on initialization, and gradients are
+    allreduced and averaged over processes during ``backward()``.
+
+    :class:`DistributedDataParallel` is optimized for use with NCCL.  It achieves high performance by 
+    overlapping communication with computation during ``backward()`` and bucketing smaller gradient
+    transfers to reduce the total number of transfers required.
+
+    :class:`DistributedDataParallel` is designed to work with the upstream launch utility script 
+    ``torch.distributed.launch`` with ``--nproc_per_node <= number of gpus per node``.
+    When used with this launcher, :class:`DistributedDataParallel` assumes 1:1 mapping of processes to GPUs.
+    It also assumes that your script calls ``torch.cuda.set_device(args.rank)`` before creating the model.
+
+    https://github.com/NVIDIA/apex/tree/master/examples/simple/distributed shows detailed usage.
+    https://github.com/NVIDIA/apex/tree/master/examples/imagenet shows another example
+    that combines :class:`DistributedDataParallel` with mixed precision training.
+
+    Args:
+        module: Network definition to be run in multi-gpu/distributed mode.
+        message_size (int, default=1e7): Minimum number of elements in a communication bucket.
+        delay_allreduce (bool, default=False):  Delay all communication to the end of the backward pass.  This disables overlapping communication with computation.
+        allreduce_trigger_params (list, optional, default=None):  If supplied, should contain a list of parameters drawn from the model.  Allreduces will be kicked off whenever one of these parameters receives its gradient (as opposed to when a bucket of size message_size is full).  At the end of backward(), a cleanup allreduce to catch any remaining gradients will also be performed automatically.  If allreduce_trigger_params is supplied, the message_size argument will be ignored.
+        allreduce_always_fp32 (bool, default=False):  Convert any FP16 gradients to FP32 before allreducing.  This can improve stability for widely scaled-out runs.
+        gradient_average (bool, default=True):  Option to toggle whether or not DDP averages the allreduced gradients over processes.  For proper scaling, the default value of True is recommended.
+        gradient_predivide_factor (float, default=1.0):  Allows perfoming the average of gradients over processes partially before and partially after the allreduce.  Before allreduce:  ``grads.mul_(1.0/gradient_predivide_factor)``.  After allreduce:  ``grads.mul_(gradient_predivide_factor/world size)``.  This can reduce the stress on the dynamic range of FP16 allreduces for widely scaled-out runs.
+
+    .. warning::
+        If ``gradient_average=False``, the pre-allreduce division (``grads.mul_(1.0/gradient_predivide_factor)``) will still be applied, but the post-allreduce gradient averaging (``grads.mul_(gradient_predivide_factor/world size)``) will be omitted.
+
+    """
+
+    def __init__(self, 
+                 module, 
+                 message_size=10000000, 
+                 delay_allreduce=False, 
+                 shared_param=None,
+                 allreduce_trigger_params=None,
+                 retain_allreduce_buffers=False,
+                 allreduce_always_fp32=False,
+                 gradient_average=True,
+                 gradient_predivide_factor=1.0):
+        super(DistributedDataParallel, self).__init__()
+
+        # Backward/forward compatibility around 
+        # https://github.com/pytorch/pytorch/commit/540ef9b1fc5506369a48491af8a285a686689b36 and
+        # https://github.com/pytorch/pytorch/commit/044d00516ccd6572c0d6ab6d54587155b02a3b86
+        if hasattr(dist, "get_backend"):
+            self._backend = dist.get_backend()
+            if hasattr(dist, "DistBackend"):
+                self.backend_enum_holder = dist.DistBackend
+            else:
+                self.backend_enum_holder = dist.Backend
+        else:
+            self._backend = dist._backend 
+            self.backend_enum_holder = dist.dist_backend
+
+        self.warn_on_half = True if self._backend == self.backend_enum_holder.GLOO else False
+
+        if shared_param is not None:
+            raise ValueError("shared_param is no longer supported as an option.  It was misleadingly named from the start.  It turns out overlapping communication with computation should work fine with shared parameters.  If you still wish to delay communication to the end of the backward pass, use delay_allreduce=True|False instead.") 
+
+        self.world_size = float(dist.get_world_size())
+
+        self.retain_allreduce_buffers = retain_allreduce_buffers
+        self.allreduce_always_fp32 = allreduce_always_fp32
+        self.gradient_average = gradient_average
+        self.gradient_predivide_factor = gradient_predivide_factor
+
+        self.custom_allreduce_triggers = False
+        if allreduce_trigger_params is not None:
+            if delay_allreduce:
+                raise ValueError("Setting allreduce_trigger_params is only valid if delay_allreduce=False.")  
+            self.custom_allreduce_triggers = True
+            self.allreduce_trigger_params = set([id(param) for param in allreduce_trigger_params])
+
+        self.delay_allreduce = delay_allreduce
+        self.message_size = message_size
+
+        self.reduction_stream = torch.cuda.Stream()
+        self.reduction_event = torch.cuda.Event(enable_timing=False, blocking=False) 
+        
+        self.module = module
+
+        self._disable_allreduce = False
+        
+        if self._backend == self.backend_enum_holder.NCCL:
+            for param in self.module.parameters():
+                assert param.is_cuda, "NCCL backend only supports model parameters to be on GPU."
+
+        self.active_params = []
+
+        self.param_type_to_tmp_i = {"torch.cuda.HalfTensor" : 0, 
+                                    "torch.cuda.FloatTensor" : 1,
+                                    "torch.cuda.DoubleTensor" : 2}
+
+        if multi_tensor_applier.available:
+            # TODO:  I really need to centralize the C++ backed imports
+            import amp_C
+            self.multi_tensor_scale = amp_C.multi_tensor_scale
+            self._overflow_buf = torch.cuda.IntTensor([0])
+
+        self.create_hooks()
+
+        flat_dist_call([param.data for param in self.module.parameters()], dist.broadcast, (0,) )
+
+
+    def __setstate__(self, state):
+        super(DistributedDataParallel, self).__setstate__(state)
+        self.reduction_stream = torch.cuda.Stream()
+        self.reduction_event = torch.cuda.Event(enable_timing=False, blocking=False) 
+
+
+    def __getstate__(self):
+        # Return a shallow copy of __dict__; __setstate__ recreates the stream and the event.
+        attrs = copy.copy(self.__dict__)
+        if self._backend != self.backend_enum_holder.NCCL:
+            del attrs['reduction_stream'], attrs['reduction_event']
+        return attrs
+
+    def enable_allreduce(self):
+        self._disable_allreduce = False
+
+    def disable_allreduce(self):
+        self._disable_allreduce = True
+      
+    # Broadcast rank 0's bucket structure across all processes, and have all processes 
+    # regenerate their bucket structures to match. 
+    def sync_bucket_structure(self):
+        # Append leftover buckets
+        for tmp_bucket in self.tmp_buckets:
+            if len(tmp_bucket) > 0:
+                self.active_i_buckets.append(tmp_bucket)
+
+        self.num_buckets = len(self.active_i_buckets)
+        self.bucket_sizes = [len(bucket) for bucket in self.active_i_buckets]
+
+        info_tensor = torch.cuda.IntTensor([self.num_buckets] + 
+                                           self.bucket_sizes + 
+                                           list(chain(*self.active_i_buckets)))
+
+        dist.broadcast(info_tensor, 0)
+
+        info = [int(entry) for entry in info_tensor]
+
+        self.num_buckets = info[0]
+        self.bucket_sizes = info[1:self.num_buckets + 1] 
+        self.buckets = [[None for _ in range(self.bucket_sizes[i])] 
+                        for i in range(self.num_buckets)] 
+        # Technically, active_i_buckets' work is done.  But the information is still useful to
+        # keep around.  Therefore, refresh active_i_buckets based on rank 0 as well.
+        self.active_i_buckets = [[None for _ in range(self.bucket_sizes[i])] 
+                                 for i in range(self.num_buckets)] 
+        
+        flattened_buckets = info[self.num_buckets + 1:]
+        flat_i = 0
+        for bucket_idx in range(self.num_buckets): 
+            for bucket_loc in range(self.bucket_sizes[bucket_idx]):
+                param_i = flattened_buckets[flat_i]
+                self.active_i_buckets[bucket_idx][bucket_loc] = param_i 
+                self.param_id_to_bucket[id(self.active_params[param_i])] = (bucket_idx, bucket_loc)
+                flat_i += 1 
+        
+        
+    def create_hooks(self):
+        # Fallback hook that's only called at the end of backward.
+        # Used if you deliberately want to delay allreduces to the end, or to refresh the 
+        # bucket structure that will be used to overlap communication with computation in later
+        # iterations.
+        def allreduce_params():
+            # Bucket record refresh
+            if not self.delay_allreduce:
+                if self.needs_refresh:
+                    self.sync_bucket_structure()
+
+                    self.needs_refresh = False
+
+            self.allreduce_fallback()
+
+
+        def overlapping_backward_epilogue():
+            # Make the current stream wait for the work already queued on reduction_stream.
+            self.reduction_stream.record_event(self.reduction_event)
+            torch.cuda.current_stream().wait_event(self.reduction_event)
+
+            # Sanity checks that all the buckets were kicked off
+            if self.next_bucket != self.num_buckets:
+                raise RuntimeError(("In epilogue, next_bucket ({}) != num_buckets ({}).  "
+                                    "This probably indicates some buckets were not allreduced."
+                                    ).format(self.next_bucket, self.num_buckets))
+            for actual, expected in zip(self.buckets_ready_size, self.bucket_sizes):
+                if actual != expected:
+                    raise RuntimeError("Some param buckets were not allreduced.")
+           
+
+        self.grad_accs = []
+        for param in self.module.parameters():
+            if param.requires_grad:
+                def wrapper(param):
+                    param_tmp = param.expand_as(param)
+                    grad_acc = param_tmp.grad_fn.next_functions[0][0]
+
+                    def allreduce_hook(*unused):
+                        if not self._disable_allreduce:
+                            if self.delay_allreduce or self.needs_refresh:
+                                # TODO:  How do we want to handle multiple backward passes between
+                                # each forward, e.g., backward passes with retain_graph=True?
+                                # needs_refresh and callback_queued are both vulnerable states.
+                                if not self.delay_allreduce and self.needs_refresh:
+                                    # Use the backward pass to build the bucket structure on the fly.
+                                    active_i = self.param_id_to_active_i[id(param)]
+
+                                    # Float, half, and double tensors are grouped into buckets separately.
+                                    current_type = self.param_type_to_tmp_i[param.type()]
+  
+                                    self.tmp_buckets[current_type].append(active_i)                          
+
+                                    ship_tmp_bucket = False
+                                    if self.custom_allreduce_triggers:
+                                        if id(param) in self.allreduce_trigger_params:
+                                            ship_tmp_bucket = True
+                                    else:
+                                        self.tmp_numels[current_type] += param.numel()
+                                        if self.tmp_numels[current_type] >= self.message_size:
+                                            ship_tmp_bucket = True
+
+                                    # To consider:  If custom_allreduce_triggers are in use, ship all
+                                    # tmp_buckets, not just tmp_buckets[current_type].
+                                    if ship_tmp_bucket:
+                                        self.active_i_buckets.append(self.tmp_buckets[current_type])
+                                        self.tmp_buckets[current_type] = []
+                                        self.tmp_numels[current_type] = 0
+                                
+                                if not self.callback_queued:
+                                    Variable._execution_engine.queue_callback(allreduce_params)
+                                    self.callback_queued = True
+                            else:
+                                if not self.callback_queued:
+                                    Variable._execution_engine.queue_callback(overlapping_backward_epilogue)
+                                    self.callback_queued = True 
+
+                                self.comm_ready_buckets(param)
+                        
+                    grad_acc.register_hook(allreduce_hook)
+                    self.grad_accs.append(grad_acc)
+
+                wrapper(param)
+
+    def allreduce_bucket(self, bucket):
+        tensor = flatten(bucket)
+
+        tensor_to_allreduce = tensor 
+
+        if self.allreduce_always_fp32:
+            tensor_to_allreduce = tensor.float() 
+
+        if self.gradient_predivide_factor != 1.0:
+            tensor_to_allreduce.mul_(1./self.gradient_predivide_factor)
+
+        dist.all_reduce(tensor_to_allreduce)
+
+        if self.gradient_average:
+            if self.gradient_predivide_factor != self.world_size:
+                tensor_to_allreduce.mul_(self.gradient_predivide_factor/self.world_size)
+
+        if self.allreduce_always_fp32 and tensor is not tensor_to_allreduce:
+            tensor.copy_(tensor_to_allreduce)
+ 
+        return tensor
+    
+
+    def allreduce_maybe_retain(self, bucket, bucket_idx=-1):
+        allreduced = self.allreduce_bucket(bucket)
+        if self.retain_allreduce_buffers:
+            if self.allreduce_buffers[bucket_idx] is not None:
+                raise RuntimeError("The backward pass is attempting to replace an already-filled "
+                                   "allreduce buffer.  This is almost certainly an error.")
+            self.allreduce_buffers[bucket_idx] = allreduced
+        else:
+            if multi_tensor_applier.available:
+                multi_tensor_applier(
+                    self.multi_tensor_scale,
+                    self._overflow_buf,
+                    [unflatten(allreduced, bucket), bucket],
+                    1.0)
+            else:
+                for buf, synced in zip(bucket, unflatten(allreduced, bucket)):
+                    buf.copy_(synced)
+
+
+    def allreduce_fallback(self):
+        """Allreduce all existing gradients of ``self.module``, one bucket per half/float/double type."""
+        grads = [param.grad.data for param in self.module.parameters() if param.grad is not None]
+        split_buckets = split_half_float_double(grads)
+
+        # If retain_allreduce_buffers is True and delay_allreduce is False,
+        # this will only be done during the first backward pass, ignored by the
+        # training script, and overwritten in the next forward pass.  So it's harmless.
+        if self.retain_allreduce_buffers:
+            self.allreduce_buffers = [None for _ in range(len(split_buckets))]
+
+        for i, bucket in enumerate(split_buckets):
+            self.allreduce_maybe_retain(bucket, i)  # returns nothing; results land in the grads or the buffers
+
+
+    def comm_ready_buckets(self, param):
+        # Need to do this in every hook for compatibility with Ruberry's streaming backward PR.
+        # self.reduction_stream.wait_stream(torch.cuda.current_stream())
+
+        bucket_idx, bucket_loc = self.param_id_to_bucket[id(param)]
+
+        if self.buckets[bucket_idx][bucket_loc] is not None:
+            raise RuntimeError("The backward pass is attempting to replace an already-filled "
+                               "bucket slot.  This is almost certainly an error.")
+
+        self.buckets[bucket_idx][bucket_loc] = param.grad.data
+        self.buckets_ready_size[bucket_idx] += 1
+
+        if self.buckets_ready_size[bucket_idx] == self.bucket_sizes[bucket_idx]:
+            if bucket_idx == self.next_bucket:
+                torch.cuda.current_stream().record_event(self.reduction_event)
+                self.reduction_stream.wait_event(self.reduction_event)
+                with torch.cuda.stream(self.reduction_stream):
+                    self.allreduce_maybe_retain(self.buckets[bucket_idx], bucket_idx)
+
+                    self.next_bucket += 1
+
+                    # Reversing upstream's logic here, because we constructed our buckets based on
+                    # the order things were received during backward.
+                    if len(self.ready_buckets_not_reduced) > 0:
+                        sorted_todo = sorted(self.ready_buckets_not_reduced)
+                        for i in sorted_todo:
+                            # Nothing can be reduced now
+                            if i > self.next_bucket:
+                                break
+                            elif i == self.next_bucket:
+                                self.allreduce_maybe_retain(self.buckets[i], i)
+                                self.ready_buckets_not_reduced.remove(i)
+                                self.next_bucket += 1 
+                            else:
+                                raise ValueError("i should always be >= next_bucket")
+            else:
+                self.ready_buckets_not_reduced.add(bucket_idx)
+
+        
+    def forward(self, *inputs, **kwargs):
+        result = self.module(*inputs, **kwargs)
+       
+        if not self._disable_allreduce:
+            if not self.delay_allreduce:
+                param_list = [param for param in self.module.parameters() if param.requires_grad]
+
+                # Conditions under which to refresh self.record
+                # Forward has the authority to set needs_refresh to True, but only allreduce_params
+                # in backward has the authority to set needs_refresh to False.
+                # Parentheses are not necessary for correct order of operations, but make the intent clearer.
+                if ((not self.active_params) or 
+                    (len(param_list) != len(self.active_params)) or
+                    any([param1 is not param2 for param1, param2 in zip(param_list, self.active_params)])):
+                    self.needs_refresh = True
+
+                if self.needs_refresh:
+                    self.active_i_buckets = []
+                    self.buckets = []
+                    self.tmp_buckets = [[], [], []] # [running half, float, double buckets]
+                    self.tmp_numels = [0, 0, 0]
+                    self.bucket_sizes = []
+                    self.param_id_to_active_i = {id(param) : i for i, param in enumerate(param_list)}  
+                    self.param_id_to_bucket = {}
+                else:
+                    self.buckets = [[None for _ in range(self.bucket_sizes[i])] 
+                                   for i in range(self.num_buckets)] 
+                    self.buckets_ready_size = [0 for i in range(self.num_buckets)]
+                    if(self.retain_allreduce_buffers):
+                        self.allreduce_buffers = [None for _ in range(self.num_buckets)]
+                    self.next_bucket = 0
+                    self.ready_buckets_not_reduced = set()
+            
+                self.active_params = param_list
+
+            self.callback_queued = False
+        
+        return result
diff --git a/apex/apex/parallel/multiproc.py b/apex/apex/parallel/multiproc.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff743df20b992e5a1ea95de5dd5b62785800247d
--- /dev/null
+++ b/apex/apex/parallel/multiproc.py
@@ -0,0 +1,35 @@
+import torch
+import sys
+import subprocess
+
+def docstring_hack():
+    """Do nothing; this function exists only to carry the module's usage notes.
+
+    This file launches a set of processes locally for multi-GPU runs.
+
+    Usage::
+
+        python -m apex.parallel.multiproc main.py ...
+    """
+    return None
+
+# Everything after this script's own name is forwarded to the worker processes.
+argslist = list(sys.argv)[1:]
+world_size = torch.cuda.device_count()
+
+# An explicit --world-size wins; otherwise advertise one worker per visible GPU
+# and append the flag so that every worker receives it.
+if '--world-size' in argslist:
+    world_size = int(argslist[argslist.index('--world-size')+1])
+else:
+    argslist.append('--world-size')
+    argslist.append(str(world_size))
+
+workers = []
+log_files = []
+
+try:
+    for i in range(world_size):
+        # The same argument list is reused for each worker; only the value
+        # following --rank is rewritten.
+        if '--rank' in argslist:
+            argslist[argslist.index('--rank')+1] = str(i)
+        else:
+            argslist.append('--rank')
+            argslist.append(str(i))
+        # Rank 0 inherits this process's stdout; every other rank writes to
+        # GPU_<rank>.log in the current working directory.
+        stdout = None
+        if i != 0:
+            stdout = open("GPU_"+str(i)+".log", "w")
+            log_files.append(stdout)
+        print(argslist)
+        p = subprocess.Popen([str(sys.executable)]+argslist, stdout=stdout)
+        workers.append(p)
+
+    for p in workers:
+        p.wait()
+finally:
+    # Each child received its own copy of the descriptor when it was spawned,
+    # so closing the launcher's handles does not affect the workers' logging.
+    # Without this the launcher leaked one open file per non-zero rank.
+    for log_file in log_files:
+        log_file.close()
diff --git a/apex/apex/parallel/optimized_sync_batchnorm.py b/apex/apex/parallel/optimized_sync_batchnorm.py
new file mode 100644
index 0000000000000000000000000000000000000000..046132e0234abcda36a1e102c7173fbd6d517a16
--- /dev/null
+++ b/apex/apex/parallel/optimized_sync_batchnorm.py
@@ -0,0 +1,84 @@
+import torch
+from torch.nn.modules.batchnorm import _BatchNorm
+from torch.nn import functional as F
+
+import syncbn
+from .optimized_sync_batchnorm_kernel import SyncBatchnormFunction
+
+
+class SyncBatchNorm(_BatchNorm):
+    """
+    Synchronized batch normalization module extended from `torch.nn.BatchNormNd`
+    with the added reduction of statistics across multiple processes.
+    :class:`apex.parallel.SyncBatchNorm` is designed to work with
+    `DistributedDataParallel`.
+
+    When running in training mode, the layer reduces stats across all processes
+    to increase the effective batch size for the normalization layer. This is
+    useful in applications where the batch size on a given process is small
+    enough to diminish the converged accuracy of the model. The model uses the
+    collective communication package from `torch.distributed`.
+
+    When running in evaluation mode with tracked running statistics and a
+    channel-first layout, the layer falls back to
+    `torch.nn.functional.batch_norm`.
+
+    Args:
+        num_features: :math:`C` from an expected input of size
+            :math:`(N, C, L)` or :math:`L` from input of size :math:`(N, L)`
+        eps: a value added to the denominator for numerical stability.
+            Default: 1e-5
+        momentum: the value used for the running_mean and running_var
+            computation. Can be set to ``None`` for cumulative moving average
+            (i.e. simple average). Default: 0.1
+        affine: a boolean value that when set to ``True``, this module has
+            learnable affine parameters. Default: ``True``
+        track_running_stats: a boolean value that when set to ``True``, this
+            module tracks the running mean and variance, and when set to ``False``,
+            this module does not track such statistics and always uses batch
+            statistics in both training and eval modes. Default: ``True``
+        process_group: the process group within which the statistics of the
+            mini-batch are synchronized. ``None`` selects the default process
+            group
+        channel_last: a boolean value that when set to ``True``, this module
+            takes the last dimension of the input tensor to be the channel
+            dimension. Default: False
+
+    Examples::
+        >>> # channel first tensor
+        >>> sbn = apex.parallel.SyncBatchNorm(100).cuda()
+        >>> inp = torch.randn(10, 100, 14, 14).cuda()
+        >>> out = sbn(inp)
+        >>> inp = torch.randn(3, 100, 20).cuda()
+        >>> out = sbn(inp)
+        >>> # channel last tensor
+        >>> sbn = apex.parallel.SyncBatchNorm(100, channel_last=True).cuda()
+        >>> inp = torch.randn(10, 14, 14, 100).cuda()
+        >>> out = sbn(inp)
+    """
+
+    def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, track_running_stats=True, process_group=None, channel_last=False):
+        super(SyncBatchNorm, self).__init__(
+            num_features,
+            eps=eps,
+            momentum=momentum,
+            affine=affine,
+            track_running_stats=track_running_stats)
+        self.channel_last = channel_last
+        self.process_group = process_group
+
+    def _specify_process_group(self, process_group):
+        """Replace the process group used for synchronizing statistics."""
+        self.process_group = process_group
+
+    def _specify_channel_last(self, channel_last):
+        """Replace the flag that marks the last dimension as the channel dimension."""
+        self.channel_last = channel_last
+
+    def forward(self, input):
+        """Normalize ``input``, using the fused kernel unless native inference applies."""
+        # 2-D input is always treated as channel-last for efficient memory access.
+        channel_last = True if input.dim() == 2 else self.channel_last
+
+        native_inference = (not self.training) and self.track_running_stats and (not channel_last)
+        if native_inference:
+            # fall back to the PyTorch implementation for inference
+            return F.batch_norm(input, self.running_mean, self.running_var, self.weight, self.bias, False, 0.0, self.eps)
+
+        if self.training and self.track_running_stats:
+            self.num_batches_tracked += 1
+            if self.momentum is None:
+                # cumulative moving average
+                exponential_average_factor = 1.0 / float(self.num_batches_tracked)
+            else:
+                exponential_average_factor = self.momentum
+        else:
+            exponential_average_factor = 0.0
+
+        use_batch_stats = self.training or not self.track_running_stats
+        return SyncBatchnormFunction.apply(input, self.weight, self.bias, self.running_mean, self.running_var, self.eps, use_batch_stats, exponential_average_factor, self.process_group, channel_last)
diff --git a/apex/apex/parallel/optimized_sync_batchnorm_kernel.py b/apex/apex/parallel/optimized_sync_batchnorm_kernel.py
new file mode 100644
index 0000000000000000000000000000000000000000..5969de06a5d2e7c820d8f135bdf04e576b2895b1
--- /dev/null
+++ b/apex/apex/parallel/optimized_sync_batchnorm_kernel.py
@@ -0,0 +1,110 @@
+import torch
+from torch.autograd.function import Function
+
+import syncbn
+from apex.parallel import ReduceOp
+
+class SyncBatchnormFunction(Function):
+    """Autograd function used by the extension-backed SyncBatchNorm module.
+
+    The numerical kernels live in the compiled ``syncbn`` extension, which is
+    not visible from this file; comments about what those kernels compute are
+    therefore inferred from their names and call sites and are marked as such.
+    """
+
+    @staticmethod
+    def forward(ctx, input, weight, bias, running_mean, running_variance, eps, track_running_stats = True, momentum = 1.0, process_group = None, channel_last = False):
+        """Normalize ``input`` and, when requested, refresh the running buffers.
+
+        Args:
+            ctx: autograd context; receives the tensors and settings that
+                ``backward`` needs.
+            input: tensor to normalize (made contiguous before use).
+            weight, bias: forwarded unchanged to the ``syncbn`` forward kernel.
+            running_mean, running_variance: running buffers. They are updated
+                in place (through ``.data``) when ``track_running_stats`` is
+                truthy and are used as the normalization statistics otherwise.
+            eps: value added to the variance before taking the square root.
+            track_running_stats: when truthy, statistics are computed from
+                ``input`` (and combined across ranks if ``torch.distributed``
+                is initialized) and the running buffers are updated.
+            momentum: weight of the new statistics in the running-buffer update.
+            process_group: group used for the collectives; a falsy value
+                selects ``torch.distributed.group.WORLD``.
+            channel_last: selects the ``*_c_last`` kernels and makes the last
+                dimension the channel dimension when counting elements.
+
+        Returns:
+            The tensor produced by the ``syncbn`` batchnorm forward kernel.
+
+        Raises:
+            ValueError: if there is a single value per channel and fewer than
+                two participating processes.
+        """
+        torch.cuda.nvtx.range_push("sync_BN_fw")
+        input = input.contiguous()
+        # Stays 0 unless the distributed branch below runs.
+        world_size = 0
+
+        mean = None
+        var_biased = None
+        inv_std = None
+        var = None
+        out = None
+        count = None
+        if track_running_stats:
+            # count = number of elements per channel on this process.
+            # The welford_* kernels presumably return the per-channel mean and
+            # biased variance of the local batch - TODO confirm against the extension.
+            if channel_last:
+                count = int(input.numel()/input.size(-1))
+                mean, var_biased = syncbn.welford_mean_var_c_last(input)
+            else:
+                count = int(input.numel()/input.size(1))
+                mean, var_biased = syncbn.welford_mean_var(input)
+
+            if torch.distributed.is_initialized():
+                if not process_group:
+                    process_group = torch.distributed.group.WORLD
+                world_size = torch.distributed.get_world_size(process_group)
+                # One row per rank; the narrow() views let all_gather write
+                # straight into the rows of the preallocated buffers.
+                mean_all = torch.empty(world_size, mean.size(0), dtype=mean.dtype, device=mean.device)
+                var_all = torch.empty(world_size, var_biased.size(0), dtype=var_biased.dtype, device=var_biased.device)
+                mean_l = [mean_all.narrow(0, i, 1) for i in range(world_size)]
+                var_l = [var_all.narrow(0, i, 1) for i in range(world_size)]
+                torch.distributed.all_gather(mean_l, mean, process_group)
+                torch.distributed.all_gather(var_l, var_biased, process_group)
+                # Presumably merges the per-rank statistics into global ones;
+                # note that the local `count` is what gets passed - TODO confirm
+                # that the kernel assumes equal counts on every rank.
+                mean, var, inv_std = syncbn.welford_parallel(mean_all, var_all, count, eps)
+                # TODO(Jie): should do fp32 math instead!
+            else:
+                inv_std = 1.0 / torch.sqrt(var_biased + eps)
+                # Unbiased estimate for the running buffer. With count == 1
+                # this divides by zero; the ValueError below is raised only
+                # afterwards.
+                var = var_biased * (count) / (count-1) 
+
+            # NOTE(review): this raise leaves the "sync_BN_fw" NVTX range pushed.
+            if count == 1 and world_size < 2:
+                raise ValueError('Expected more than 1 value per channel when training, got input size{}'.format(input.size()))
+
+            # Match half-precision running buffers before blending.
+            r_m_inc = mean if running_mean.dtype != torch.float16 else mean.half()
+            r_v_inc = var if running_variance.dtype != torch.float16 else var.half()
+            running_mean.data = running_mean.data * (1-momentum) + momentum*r_m_inc
+            running_variance.data = running_variance.data * (1-momentum) + momentum*r_v_inc
+        else:
+            # Normalize with the stored running statistics.
+            mean = running_mean.data
+            inv_std = 1.0 / torch.sqrt(running_variance.data + eps)
+
+        ctx.save_for_backward(input, weight, mean, inv_std)
+        ctx.process_group = process_group
+        ctx.channel_last = channel_last
+        ctx.world_size = world_size
+
+        if channel_last:
+            out = syncbn.batchnorm_forward_c_last(input, mean, inv_std, weight, bias)
+        else:
+            out = syncbn.batchnorm_forward(input, mean, inv_std, weight, bias)
+
+        torch.cuda.nvtx.range_pop()
+        return out
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        """Compute gradients for ``input``, ``weight`` and ``bias``.
+
+        Returns a 10-tuple matching the ten non-context arguments of
+        ``forward``; only the first three entries can be non-``None``.
+        """
+        grad_output = grad_output.contiguous()
+        torch.cuda.nvtx.range_push("sync_BN_bw")
+        # mini batch mean & var are calculated by forward path.
+        # mu = 1./N*np.sum(h, axis = 0)
+        # var = 1./N*np.sum((h-mu)**2, axis = 0)
+        saved_input, weight, mean, inv_std = ctx.saved_tensors
+        process_group = ctx.process_group
+        channel_last = ctx.channel_last
+        world_size = ctx.world_size
+        grad_input = grad_weight = grad_bias = None
+
+        # TODO(jie): why do I have to clone here? life time of grad_output?
+        # The reduce kernels return the local gradient statistics together with
+        # the weight/bias gradients (semantics inferred from the result names).
+        if channel_last:
+            mean_dy, mean_dy_xmu, grad_weight, grad_bias = syncbn.reduce_bn_c_last(grad_output, saved_input, mean, inv_std, weight)
+        else:
+            mean_dy, mean_dy_xmu, grad_weight, grad_bias = syncbn.reduce_bn(grad_output, saved_input, mean, inv_std, weight)
+
+        # calculate grad_input
+        if ctx.needs_input_grad[0]:
+
+            if torch.distributed.is_initialized():
+                # Average the per-rank statistics: sum across ranks, then
+                # divide by the world size recorded in forward.
+                # NOTE(review): world_size is 0 if forward ran with a falsy
+                # track_running_stats, and process_group may then still be
+                # None - confirm whether that path can reach this branch.
+                torch.distributed.all_reduce(
+                    mean_dy, ReduceOp.SUM, process_group)
+                mean_dy = mean_dy / world_size
+                torch.distributed.all_reduce(
+                    mean_dy_xmu, ReduceOp.SUM, process_group)
+                mean_dy_xmu = mean_dy_xmu / world_size
+            if channel_last:
+                grad_input = syncbn.batchnorm_backward_c_last(grad_output, saved_input, mean, inv_std, weight, mean_dy, mean_dy_xmu)
+            else:
+                grad_input = syncbn.batchnorm_backward(grad_output, saved_input, mean, inv_std, weight, mean_dy, mean_dy_xmu)
+
+        # Drop gradients that are not needed or that have no parameter to receive them.
+        if weight is None or not ctx.needs_input_grad[1]:
+            grad_weight = None
+
+        if weight is None or not ctx.needs_input_grad[2]:
+            grad_bias = None
+
+        torch.cuda.nvtx.range_pop()
+        return grad_input, grad_weight, grad_bias, None, None, None, None, None, None, None
diff --git a/apex/apex/parallel/sync_batchnorm.py b/apex/apex/parallel/sync_batchnorm.py
new file mode 100644
index 0000000000000000000000000000000000000000..245d8a112ef3c0621fa8c604a06da0b6e85f2930
--- /dev/null
+++ b/apex/apex/parallel/sync_batchnorm.py
@@ -0,0 +1,131 @@
+import torch
+from torch.nn.modules.batchnorm import _BatchNorm
+from torch.nn import functional as F
+
+from .sync_batchnorm_kernel import SyncBatchnormFunction
+from apex.parallel import ReduceOp
+
+
+class SyncBatchNorm(_BatchNorm):
+    """
+    Synchronized batch normalization module extended from ``torch.nn.BatchNormNd``
+    with the added reduction of statistics across multiple processes.
+    :class:`apex.parallel.SyncBatchNorm` is designed to work with
+    ``DistributedDataParallel``.
+
+    When running in training mode, the layer reduces stats across all processes
+    to increase the effective batch size for the normalization layer. This is
+    useful in applications where the batch size on a given process is small
+    enough to diminish the converged accuracy of the model. The model uses the
+    collective communication package from ``torch.distributed``.
+
+    When running in evaluation mode with tracked running statistics, the layer
+    falls back to ``torch.nn.functional.batch_norm``.
+
+    Args:
+        num_features: :math:`C` from an expected input of size
+            :math:`(N, C, L)` or :math:`L` from input of size :math:`(N, L)`
+        eps: a value added to the denominator for numerical stability.
+            Default: 1e-5
+        momentum: the value used for the running_mean and running_var
+            computation. Can be set to ``None`` for cumulative moving average
+            (i.e. simple average). Default: 0.1
+        affine: a boolean value that when set to ``True``, this module has
+            learnable affine parameters. Default: ``True``
+        track_running_stats: a boolean value that when set to ``True``, this
+            module tracks the running mean and variance, and when set to ``False``,
+            this module does not track such statistics and always uses batch
+            statistics in both training and eval modes. Default: ``True``
+        process_group: the process group within which the statistics of the
+            mini-batch are synchronized. ``None`` selects the default process
+            group
+        channel_last: accepted for signature compatibility only; a truthy
+            value raises ``AttributeError`` in this implementation.
+
+    Example::
+
+        >>> sbn = apex.parallel.SyncBatchNorm(100).cuda()
+        >>> inp = torch.randn(10, 100, 14, 14).cuda()
+        >>> out = sbn(inp)
+        >>> inp = torch.randn(3, 100, 20).cuda()
+        >>> out = sbn(inp)
+    """
+
+    # Class-level flag so the fallback warning is printed once per process.
+    warned = False
+
+    def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, track_running_stats=True, process_group=None, channel_last=False):
+        if channel_last:
+            raise AttributeError("channel_last is not supported by primitive SyncBatchNorm implementation. Try install apex with `--cuda_ext` if channel_last is desired.")
+
+        if not SyncBatchNorm.warned:
+            # `syncbn_import_error` is not defined anywhere in this file, so it
+            # is read from the class with a default. Going through `self`
+            # before Module.__init__ has run would fail inside
+            # Module.__getattr__ whenever the attribute is absent.
+            import_error = getattr(type(self), "syncbn_import_error", None)
+            print("Warning:  using Python fallback for SyncBatchNorm, possibly because apex was installed without --cuda_ext.  The exception raised when attempting to import the cuda backend was: ", import_error)
+            SyncBatchNorm.warned = True
+
+        super(SyncBatchNorm, self).__init__(num_features, eps=eps, momentum=momentum, affine=affine, track_running_stats=track_running_stats)
+        self.process_group = process_group
+
+    def _specify_process_group(self, process_group):
+        """Replace the process group used for synchronizing statistics."""
+        self.process_group = process_group
+
+    def forward(self, input):
+        """Normalize ``input`` and return it in the caller's original dtype.
+
+        In evaluation mode with tracked running statistics the native
+        ``F.batch_norm`` is used. Otherwise per-channel statistics are computed
+        here (and averaged across the process group when ``torch.distributed``
+        is initialized), the running buffers are updated if present, and the
+        normalization is delegated to ``SyncBatchnormFunction``.
+        """
+        torch.cuda.nvtx.range_push("sync_bn_fw_with_mean_var")
+        mean = None
+        var = None
+        out = None
+
+        # Cast the input to the layer's dtype when they differ. The caller's
+        # dtype is recorded BEFORE the conversion so the result can be cast
+        # back at the end; recording it afterwards made that cast a no-op.
+        original_dtype = input.dtype
+        layer_dtype = None
+        if self.running_mean is not None:
+            layer_dtype = self.running_mean.dtype
+        elif self.weight is not None:
+            layer_dtype = self.weight.dtype
+        needs_cast = layer_dtype is not None and layer_dtype != original_dtype
+        if needs_cast:
+            input = input.to(layer_dtype)
+
+        if not self.training and self.track_running_stats:
+            # fall back to pytorch implementation for inference
+            torch.cuda.nvtx.range_pop()
+            out = F.batch_norm(input, self.running_mean, self.running_var, self.weight, self.bias, False, 0.0, self.eps)
+        else:
+            process_group = self.process_group
+            world_size = 1
+            if not self.process_group:
+                process_group = torch.distributed.group.WORLD
+            # The counter is None when running statistics are not tracked.
+            if self.num_batches_tracked is not None:
+                self.num_batches_tracked += 1
+            # momentum=None selects a cumulative moving average, as documented.
+            momentum = self.momentum
+            if momentum is None and self.num_batches_tracked is not None:
+                momentum = 1.0 / float(self.num_batches_tracked)
+            with torch.no_grad():
+                # Flatten to (C, everything else) so that per-channel
+                # statistics are reductions along dimension 1.
+                channel_first_input = input.transpose(0, 1).contiguous()
+                squashed_input_tensor_view = channel_first_input.view(
+                    channel_first_input.size(0), -1)
+                # total number of data points for each variance entry. Used to calculate unbiased variance estimate
+                m = None
+                local_m = float(squashed_input_tensor_view.size()[1])
+                local_mean = torch.mean(squashed_input_tensor_view, 1)
+                local_sqr_mean = torch.pow(
+                    squashed_input_tensor_view, 2).mean(1)
+                if torch.distributed.is_initialized():
+                    # Average E[x] and E[x^2] over the ranks (sum, then divide).
+                    world_size = torch.distributed.get_world_size(process_group)
+                    torch.distributed.all_reduce(
+                        local_mean, ReduceOp.SUM, process_group)
+                    mean = local_mean / world_size
+                    torch.distributed.all_reduce(
+                        local_sqr_mean, ReduceOp.SUM, process_group)
+                    sqr_mean = local_sqr_mean / world_size
+                    m = local_m * world_size
+                else:
+                    m = local_m
+                    mean = local_mean
+                    sqr_mean = local_sqr_mean
+                # var(x) = E (( x - mean_x ) ** 2)
+                #        = 1 / N * sum ( x - mean_x ) ** 2
+                #        = 1 / N * sum (x**2) - mean_x**2
+                var = sqr_mean - mean.pow(2)
+
+                if self.running_mean is not None:
+                    self.running_mean = momentum * mean + \
+                        (1 - momentum) * self.running_mean
+                if self.running_var is not None:
+                    # as noted by the paper, we used unbiased variance estimate of the mini-batch
+                    # Var[x] = m / (m-1) * Eb (sample_variance)
+                    self.running_var = m / \
+                        (m-1) * momentum * var + \
+                        (1 - momentum) * self.running_var
+            torch.cuda.nvtx.range_pop()
+            out = SyncBatchnormFunction.apply(input, self.weight, self.bias, mean, var, self.eps, process_group, world_size)
+
+        # Hand the result back in the dtype the caller supplied.
+        if needs_cast:
+            out = out.to(original_dtype)
+        return out
diff --git a/apex/apex/parallel/sync_batchnorm_kernel.py b/apex/apex/parallel/sync_batchnorm_kernel.py
new file mode 100644
index 0000000000000000000000000000000000000000..e407a63da3020cd154708f270d9bf890158413a5
--- /dev/null
+++ b/apex/apex/parallel/sync_batchnorm_kernel.py
@@ -0,0 +1,87 @@
+import torch
+from torch.autograd.function import Function
+
+from apex.parallel import ReduceOp
+
+
+class SyncBatchnormFunction(Function):
+    """Pure-PyTorch autograd function that normalizes with supplied statistics.
+
+    ``forward`` applies ``(x - mean) / sqrt(var + eps)`` followed by the
+    optional affine transform; ``backward`` averages the gradient statistics
+    over the process group when ``torch.distributed`` is initialized.
+    """
+
+    @staticmethod
+    def forward(ctx, input, weight, bias, running_mean, running_variance, eps, process_group, world_size):
+        """Normalize ``input`` with ``running_mean`` / ``running_variance``."""
+        torch.cuda.nvtx.range_push("sync_BN_fw")
+        # Work in channel-last layout so the per-channel vectors broadcast
+        # against inputs of any rank.
+        x_c_last = input.transpose(1, -1).contiguous().clone()
+
+        ctx.eps = eps
+        ctx.process_group = process_group
+        ctx.world_size = world_size
+        ctx.save_for_backward(x_c_last, weight, bias,
+                              running_mean, running_variance)
+
+        std = torch.sqrt(running_variance + eps)
+        normalized = (x_c_last - running_mean) / std
+
+        if weight is not None:
+            normalized = normalized * weight
+        if bias is not None:
+            normalized = normalized + bias
+
+        torch.cuda.nvtx.range_pop()
+        # Restore the caller's layout.
+        return normalized.transpose(1, -1).contiguous().clone()
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        """Return gradients for ``input``, ``weight`` and ``bias`` (others ``None``)."""
+        torch.cuda.nvtx.range_push("sync_BN_bw")
+        # mini batch mean & var are calculated by forward path.
+        # mu = 1./N*np.sum(h, axis = 0)
+        # var = 1./N*np.sum((h-mu)**2, axis = 0)
+        x_c_last, weight, bias, running_mean, running_variance = ctx.saved_tensors
+
+        eps = ctx.eps
+        process_group = ctx.process_group
+        world_size = ctx.world_size
+        num_features = running_mean.size()[0]
+
+        # Channel-last gradient, plus a 2-D (rows, channels) view of it so that
+        # per-channel reductions are reductions along dimension 0.
+        torch.cuda.nvtx.range_push("carilli field")
+        dy_c_last = grad_output.transpose(1, -1).contiguous()
+        dy_rows = dy_c_last.view(-1, num_features).contiguous()
+        torch.cuda.nvtx.range_pop()
+
+        # Shared intermediates of the formulas below.
+        x_centered = x_c_last - running_mean
+        var_eps = running_variance + eps
+        std = torch.sqrt(var_eps)
+
+        grad_input = None
+        if ctx.needs_input_grad[0]:
+            # dh = gamma * (var + eps)**(-1. / 2.) * (dy - np.mean(dy, axis=0)
+            #     - (h - mu) * (var + eps)**(-1.0) * np.mean(dy * (h - mu), axis=0))
+            mean_dy = dy_rows.mean(0)
+            mean_dy_xmu = (dy_c_last * x_centered).view(-1, num_features).mean(0)
+            if torch.distributed.is_initialized():
+                # Average both statistics over the ranks (sum, then divide).
+                torch.distributed.all_reduce(
+                    mean_dy, ReduceOp.SUM, process_group)
+                mean_dy = mean_dy / world_size
+                torch.distributed.all_reduce(
+                    mean_dy_xmu, ReduceOp.SUM, process_group)
+                mean_dy_xmu = mean_dy_xmu / world_size
+            dx_c_last = (dy_c_last - mean_dy - x_centered / var_eps * mean_dy_xmu) / std
+            if weight is not None:
+                dx_c_last.mul_(weight)
+            grad_input = dx_c_last.transpose(1, -1).contiguous()
+
+        grad_weight = None
+        if weight is not None and ctx.needs_input_grad[1]:
+            # dgamma = np.sum((h - mu) * (var + eps)**(-1. / 2.) * dy, axis=0)
+            grad_weight = (x_centered / std * dy_c_last).view(-1, num_features).sum(0)
+
+        grad_bias = None
+        if bias is not None and ctx.needs_input_grad[2]:
+            # dbeta = np.sum(dy, axis=0)
+            grad_bias = dy_rows.sum(0)
+
+        torch.cuda.nvtx.range_pop()
+        return grad_input, grad_weight, grad_bias, None, None, None, None, None
diff --git a/apex/apex/reparameterization/README.md b/apex/apex/reparameterization/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..9e86fd8fc1eecb066b10814fea5967acbb67fe7b
--- /dev/null
+++ b/apex/apex/reparameterization/README.md
@@ -0,0 +1 @@
+Under construction...
diff --git a/apex/apex/reparameterization/__init__.py b/apex/apex/reparameterization/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9519e7c981f282ad97b15230e9f7d746a1d0e7b
--- /dev/null
+++ b/apex/apex/reparameterization/__init__.py
@@ -0,0 +1,127 @@
+from .weight_norm import WeightNorm
+from .reparameterization import Reparameterization
+
+def apply_weight_norm(module, name='', dim=0, hook_child=True):
+    r"""
+    Applies weight normalization to a parameter in the given module.
+    If no parameter is provided, applies weight normalization to all
+    parameters in model (except 1-d vectors and scalars).
+
+    .. math::
+         \mathbf{w} = g \dfrac{\mathbf{v}}{\|\mathbf{v}\|}
+
+    Weight normalization is a reparameterization that decouples the magnitude
+    of a weight tensor from its direction. This replaces the parameter specified
+    by `name` (e.g. "weight") with two parameters: one specifying the magnitude
+    (e.g. "weight_g") and one specifying the direction (e.g. "weight_v").
+    Weight normalization is implemented via a hook that recomputes the weight
+    tensor from the magnitude and direction before every :meth:`~Module.forward`
+    call.
+
+    By default, with `dim=0`, the norm is computed independently per output
+    channel/plane. To compute a norm over the entire weight tensor, use
+    `dim=None`.
+
+    See https://arxiv.org/abs/1602.07868
+
+    Args:
+        module (nn.Module): containing module
+        name (str, optional): name of weight parameter
+        dim (int, optional): dimension over which to compute the norm
+        hook_child (boolean, optional): adds reparameterization hook to direct parent of the
+            parameters. If False, it's added to `module` instead. Default: True
+
+    Returns:
+        The original module with the weight norm hook
+
+    Example::
+
+        >>> m = apply_weight_norm(nn.Linear(20, 40), name='weight')
+        Linear (20 -> 40)
+        >>> m.weight_g.size()
+        torch.Size([40, 1])
+        >>> m.weight_v.size()
+        torch.Size([40, 20])
+
+    """
+    # The docstring is a raw string because the math markup contains
+    # backslashes (\m, \d, \|) that are invalid escape sequences in a normal
+    # string literal and trigger a warning at compile time.
+    return apply_reparameterization(module, reparameterization=WeightNorm, hook_child=hook_child,
+                                    name=name, dim=dim)
+
+def remove_weight_norm(module, name='', remove_all=False):
+    """
+    Strips the weight normalization reparameterization of a parameter from a
+    module. When no parameter name is supplied, every weight norm
+    parameterization is removed.
+
+    Args:
+        module (nn.Module): containing module
+        name (str, optional): name of weight parameter
+        remove_all (bool, optional): forwarded unchanged to
+            ``remove_reparameterization``. Default: False
+
+    Returns:
+        Whatever ``remove_reparameterization`` returns for these arguments.
+
+    Example:
+        >>> m = apply_weight_norm(nn.Linear(20, 40))
+        >>> remove_weight_norm(m)
+    """
+    return remove_reparameterization(
+        module,
+        name=name,
+        remove_all=remove_all,
+        reparameterization=WeightNorm)
+
+def apply_reparameterization(module, reparameterization=None, name='', dim=0, hook_child=True):
+    """
+    Applies the given weight reparameterization (for example weight
+    normalization) to one parameter of ``module``. With an empty ``name`` the
+    function calls itself once for every key of ``module.state_dict()``.
+
+    Args:
+        module (nn.Module): containing module
+        reparameterization (Reparameterization): reparameterization class to apply
+        name (str, optional): name of weight parameter
+        dim (int, optional): dimension over which to perform reparameterization op
+        hook_child (boolean, optional): adds reparameterization hook to direct parent of the
+            parameters. If False, it's added to `module` instead. Default: True
+
+    Returns:
+        The original module with the reparameterization hook
+
+    Example::
+
+        >>> m = apply_reparameterization(nn.Linear(20, 40), WeightNorm)
+        Linear (20 -> 40)
+
+    """
+    assert reparameterization is not None
+
+    if name == '':
+        # No explicit target: visit every state_dict entry. The keys are
+        # snapshotted first because applying a reparameterization changes the
+        # module's registered parameters.
+        for entry in list(module.state_dict().keys()):
+            apply_reparameterization(module, reparameterization, entry, dim, hook_child)
+        return module
+
+    Reparameterization.apply(module, name, dim, reparameterization, hook_child)
+    return module
+
+def remove_reparameterization(module, reparameterization=Reparameterization,
+                                name='', remove_all=False):
+    """
+    Removes the given reparameterization of a parameter from a module.
+    If no parameter is supplied then all reparameterizations are removed from
+    ``module`` and from every submodule.
+
+    Args:
+        module (nn.Module): containing module
+        reparameterization (Reparameterization): reparameterization class to remove
+        name (str, optional): name of weight parameter
+        remove_all (bool, optional): if True, remove all reparameterizations of
+            the given type that are hooked on ``module`` itself (submodules are
+            not visited). Default: False
+
+    Returns:
+        ``module``, on every path that does not raise.
+
+    Raises:
+        ValueError: if ``name`` is given, ``remove_all`` is False and no
+            matching hook is registered on ``module``.
+
+    Example:
+        >>> m = apply_reparameterization(nn.Linear(20, 40),WeightNorm)
+        >>> remove_reparameterization(m)
+    """
+    if name == '' and not remove_all:
+        # No specific target: strip every hook of this type from the whole
+        # tree. module.modules() already yields `module` itself, so the root
+        # does not need to be prepended. The list is materialized up front
+        # because removal rewrites attributes of the modules being visited.
+        for m in list(module.modules()):
+            remove_reparameterization(m, reparameterization=reparameterization, remove_all=True)
+        return module
+
+    # Collect the keys first: the hook dict must not change size while it is
+    # being iterated.
+    to_remove = [k for k, hook in module._forward_pre_hooks.items()
+                 if isinstance(hook, reparameterization) and (hook.name == name or remove_all)]
+    for k in to_remove:
+        module._forward_pre_hooks[k].remove(module)
+        del module._forward_pre_hooks[k]
+
+    if not to_remove and not remove_all:
+        raise ValueError("reparameterization of '{}' not found in {}"
+                         .format(name, module))
+    # Previously this path returned None when remove_all=True matched nothing.
+    return module
diff --git a/apex/apex/reparameterization/reparameterization.py b/apex/apex/reparameterization/reparameterization.py
new file mode 100644
index 0000000000000000000000000000000000000000..24b11ba3925bd767c7539ce144747e783b313d90
--- /dev/null
+++ b/apex/apex/reparameterization/reparameterization.py
@@ -0,0 +1,151 @@
+import torch
+from torch.nn.parameter import Parameter
+import sys
+class Reparameterization(object):
+    """
+    Class interface for performing weight reparameterizations
+    Arguments:
+        name (str): name of weight parameter
+        dim (int): dimension over which to compute the norm
+        module (nn.Module): parent module to which param `name` is registered to
+        retain_forward (bool, optional): stored on the instance; no method of
+            this base class consults it. Default: True
+    Attributes:
+        reparameterization_names (list, str): contains names of all parameters
+            needed to compute reparameterization.
+        backward_hook_key (int): torch.utils.hooks.RemovableHandle.id for hook used in module backward pass.
+    """
+
+    def __init__(self, name, dim, module, retain_forward=True):
+        self.name = name
+        self.dim = dim
+        # True once the weight has been recomputed since the last backward pass.
+        self.evaluated = False
+        self.retain_forward = retain_forward
+        self.reparameterization_names = []
+        self.backward_hook_key = None
+        self.module = module
+
+    def compute_weight(self, module=None, name=None):
+        """
+        Computes reparameterized weight value to assign value to module attribute
+        with name `name`.
+        See WeightNorm class for example.
+        Arguments:
+            module (nn.Module): module with weight we'd like to reparameterize
+            name (str, optional): name of the weight attribute
+        Returns:
+            w (Tensor): Tensor object containing value of reparameterized weight
+        """
+        raise NotImplementedError
+
+    def reparameterize(self, name, weight, dim):
+        """
+        Creates Parameters to be used for reparameterization and creates names
+        for the module attributes these Parameters will correspond to.
+        The parameters will be registered according to the names provided.
+        See WeightNorm class for example.
+        Arguments:
+            name (str): name of weight parameter
+            weight (Tensor): current value of the weight being reparameterized
+            dim (int): dimension over which to compute parameterization
+        Returns:
+            names (list, str): names of Parameters to be used for reparameterization
+            params (list, Parameter): Parameters to be used for reparameterization
+        """
+        raise NotImplementedError
+
+    @staticmethod
+    def apply(module, name, dim, reparameterization=None, hook_child=True):
+        """
+        Applies reparametrization to module's `name` parameter and modifies instance attributes as appropriate.
+        `hook_child` adds reparameterization hook to direct parent of the parameters. If False, it's added to `module` instead.
+
+        Returns the hook object that was installed, or None when the target
+        is skipped (unresolvable name, embedding module, tensor with at most
+        one dimension, or an attribute that is not a registered parameter).
+        """
+        if reparameterization is None:
+            reparameterization = Reparameterization
+        module2use, name2use = Reparameterization.get_module_and_name(module, name)
+        # does not work on sparse
+        if name2use is None or isinstance(module2use, (torch.nn.Embedding, torch.nn.EmbeddingBag)):
+            return
+
+        weight = getattr(module2use, name2use)
+        if weight.dim() <= 1:
+            return
+        # Names taken from state_dict() include buffers. Those are not in
+        # `_parameters`, so the deletion below used to raise KeyError for any
+        # buffer with more than one dimension.
+        if name2use not in module2use._parameters:
+            return
+
+        if hook_child:
+            fn = reparameterization(name2use, dim, module2use)
+        else:
+            fn = reparameterization(name, dim, module)
+
+        # remove weight from parameter list
+        del module2use._parameters[name2use]
+
+        # add parameters of reparameterization of parameter to module
+        names, params = fn.reparameterize(name2use, weight, dim)
+        for n, p in zip(names, params):
+            module2use.register_parameter(n, p)
+
+        # add parameters to reparameterization so they can be removed later
+        fn.reparameterization_names = names
+
+        # Placeholder attribute; the forward pre-hook fills it in.
+        setattr(module2use, name2use, None)
+
+        hook_module = module2use if hook_child else module
+        # recompute weight before every forward()
+        hook_module.register_forward_pre_hook(fn)
+
+        # mark the weight as stale during backward
+        handle = hook_module.register_backward_hook(fn.backward_hook)
+        # get hook key so we can delete it later
+        fn.backward_hook_key = handle.id
+
+        return fn
+
+    @staticmethod
+    def get_module_and_name(module, name):
+        """
+        Resolves a (possibly dotted) `name` to the module that owns the
+        attribute and the attribute's own name.
+
+        Returns (None, None) for an empty `name`.
+        """
+        parts = name.split('.')
+        if parts == ['']:
+            return None, None
+        # Walk every component except the last; the last one is the attribute.
+        owner = module
+        for attr in parts[:-1]:
+            owner = getattr(owner, attr)
+        return owner, parts[-1]
+
+    def get_params(self, module):
+        """gets params of reparameterization based on known attribute names"""
+        return [getattr(module, n) for n in self.reparameterization_names]
+
+    def remove(self, module):
+        """removes reparameterization and backward hook (does not remove forward hook)"""
+        module2use, name2use = Reparameterization.get_module_and_name(module, self.name)
+        for p in self.get_params(module2use):
+            p.requires_grad = False
+        # Materialize the weight one last time, then swap the helper
+        # parameters for a single plain Parameter under the original name.
+        weight = self.compute_weight(module2use, name2use)
+        delattr(module2use, name2use)
+        for n in self.reparameterization_names:
+            del module2use._parameters[n]
+        module2use.register_parameter(name2use, Parameter(weight.data))
+        del module._backward_hooks[self.backward_hook_key]
+
+    def __call__(self, module, inputs):
+        """callable hook for forward pass"""
+        module2use, name2use = Reparameterization.get_module_and_name(module, self.name)
+        _w = getattr(module2use, name2use)
+        # Recompute only when the cached weight is stale or missing.
+        if not self.evaluated or _w is None:
+            setattr(module2use, name2use, self.compute_weight(module2use, name2use))
+            self.evaluated = True
+
+    def backward_hook(self, module, grad_input, grad_output):
+        """callable hook for backward pass"""
+        # Flag the cached weight as stale so the next forward recomputes it.
+        self.evaluated = False
diff --git a/apex/apex/reparameterization/weight_norm.py b/apex/apex/reparameterization/weight_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..489cf691a6a08eeaf26f67e4adfcd4a3f1d8df92
--- /dev/null
+++ b/apex/apex/reparameterization/weight_norm.py
@@ -0,0 +1,78 @@
+import torch
+from torch.nn.parameter import Parameter
+from ..fp16_utils import Fused_Weight_Norm
+import time
+
+from .reparameterization import Reparameterization
+
+def _norm(p, dim):
+    """Computes the norm over all dimensions except dim"""
+    if dim is None:
+        return p.norm()
+    elif dim == 0:
+        output_size = (p.size(0),) + (1,) * (p.dim() - 1)
+        return p.contiguous().view(p.size(0), -1).norm(dim=1).view(*output_size)
+    elif dim == p.dim() - 1:
+        output_size = (1,) * (p.dim() - 1) + (p.size(-1),)
+        return p.contiguous().view(-1, p.size(-1)).norm(dim=0).view(*output_size)
+    return _norm(p.transpose(0, dim), 0).transpose(0, dim)
+
+HALF_TYPES = (torch.cuda.HalfTensor, torch.HalfTensor)
+
+class WeightNorm(Reparameterization):
+    """
+    Weight normalization is a reparameterization that decouples the magnitude
+    of a weight tensor from its direction. This replaces the parameter specified
+    by `name` (e.g. "weight") with two parameters: one specifying the magnitude
+    (e.g. "weight_g") and one specifying the direction (e.g. "weight_v").
+    Weight normalization is implemented via a hook that recomputes the weight
+    tensor from the magnitude and direction before every :meth:`~Module.forward`
+    call.
+
+    .. math::
+         \mathbf{w} = g \dfrac{\mathbf{v}}{\|\mathbf{v}\|}
+
+    By default, with `dim=0`, the norm is computed independently per output
+    channel/plane. To compute a norm over the entire weight tensor, use
+    `dim=None`.
+    """
+    def compute_weight(self, module=None, name=None):
+        """
+        Computes weight normalized weight value to assign value to module attribute
+        with name `name`.
+        Arguments:
+            module (nn.Module): module with weight we'd like to reparameterize
+        Returns:
+            w (Tensor): Tensor object containing value of reparameterized weight
+        """
+        if module is None:
+            module = self.module
+        if name is None:
+            name = self.name
+        module, name = Reparameterization.get_module_and_name(module, name)
+        g = getattr(module, name + '_g')
+        v = getattr(module, name + '_v')
+
+        fused_weight_norm = Fused_Weight_Norm.apply
+        v = v.contiguous()
+        w = fused_weight_norm(v, g, self.dim)
+
+        return w
+
+    def reparameterize(self, name, weight, dim):
+        """
+        Creates Parameters v and gto be used for weight normalization
+        and creates names that for attributes for the module these Parameters
+        will correspond to. The parameters will be registered according to the names
+        provided.
+        Arguments:
+            module (nn.Module): module with weight we'd like to reparameterize
+            name (str, optional): name of weight parameter
+            dim (int, optional): dimension over which to compute parameterization
+        Returns:
+            names (list, str): names of Parameters to be used for reparameterization
+            params (list, Parameter): Parameters to be used for reparameterization
+        """
+        names = [name + '_g', name + '_v']
+        params = [Parameter(_norm(weight, dim).data), Parameter(weight.data)]
+        return names, params
diff --git a/apex/csrc/amp_C_frontend.cpp b/apex/csrc/amp_C_frontend.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e4ee32edb4d642f5285190fd3f8a0a5793fc1d60
--- /dev/null
+++ b/apex/csrc/amp_C_frontend.cpp
@@ -0,0 +1,54 @@
+#include <torch/extension.h>
+
+void multi_tensor_scale_cuda(
+  int chunk_size,
+  at::Tensor noop_flag,
+  std::vector<std::vector<at::Tensor>> tensor_lists,
+  float scale);
+
+void multi_tensor_axpby_cuda(
+  int chunk_size,
+  at::Tensor noop_flag,
+  std::vector<std::vector<at::Tensor>> tensor_lists,
+  float a,
+  float b,
+  int arg_to_check);
+
+std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_cuda(
+  int chunk_size,
+  at::Tensor noop_flag,
+  std::vector<std::vector<at::Tensor>> tensor_lists,
+  at::optional<bool> per_tensor_python);
+
+void multi_tensor_lamb_stage1_cuda(
+  int chunk_size,
+  at::Tensor noop_flag,
+  std::vector<std::vector<at::Tensor>> tensor_lists,
+  at::Tensor per_tensor_decay,
+  const int step,
+  const float beta1,
+  const float beta2,
+  const float epsilon,
+  const float global_grad_norm,
+  const float max_global_grad_norm);
+
+void multi_tensor_lamb_stage2_cuda(
+  int chunk_size,
+  at::Tensor noop_flag,
+  std::vector<std::vector<at::Tensor>> tensor_lists,
+  at::Tensor per_tensor_param_norm,
+  at::Tensor per_tensor_update_norm,
+  const float step_size);
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {  // Python bindings for the *_cuda functions declared above
+  m.def("multi_tensor_scale", &multi_tensor_scale_cuda,
+        "Fused overflow check + scale for a list of contiguous tensors");
+  m.def("multi_tensor_axpby", &multi_tensor_axpby_cuda,
+        "out = a*x + b*y for a list of contiguous tensors");
+  m.def("multi_tensor_l2norm", &multi_tensor_l2norm_cuda,
+        "Computes L2 norm for a list of contiguous tensors");
+  m.def("multi_tensor_lamb_stage1_cuda", &multi_tensor_lamb_stage1_cuda,  // Python name keeps the _cuda suffix, unlike the three above
+        "Computes update part of LAMB optimizer");
+  m.def("multi_tensor_lamb_stage2_cuda", &multi_tensor_lamb_stage2_cuda,  // same here
+        "Completes application of gradient to parameters for LAMB optimizer");
+}
diff --git a/apex/csrc/flatten_unflatten.cpp b/apex/csrc/flatten_unflatten.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d49ce759f1fa805d9335aca6577b2460d773a1e7
--- /dev/null
+++ b/apex/csrc/flatten_unflatten.cpp
@@ -0,0 +1,18 @@
+#include <torch/extension.h>
+#include <torch/csrc/utils/tensor_flatten.h>
+// https://github.com/pytorch/pytorch/blob/master/torch/csrc/utils/tensor_flatten.h
+
+at::Tensor flatten(std::vector<at::Tensor> tensors)  // thin wrapper: forwards to torch::utils::flatten_dense_tensors
+{
+  return torch::utils::flatten_dense_tensors(tensors);
+}
+
+std::vector<at::Tensor> unflatten(at::Tensor flat, std::vector<at::Tensor> tensors)  // thin wrapper: forwards to torch::utils::unflatten_dense_tensors
+{
+  return torch::utils::unflatten_dense_tensors(flat, tensors);
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {  // exposes the two wrappers above to Python under the same names
+  m.def("flatten", &flatten, "Flatten dense tensors");
+  m.def("unflatten", &unflatten, "Unflatten dense tensors");
+}
diff --git a/apex/csrc/fused_adam_cuda.cpp b/apex/csrc/fused_adam_cuda.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7382f8f4c9d4cc8f3b76d31b065c82d0ccfc3f2a
--- /dev/null
+++ b/apex/csrc/fused_adam_cuda.cpp
@@ -0,0 +1,28 @@
+#include <torch/extension.h>
+
+// CUDA forward declaration
+void fused_adam_cuda(at::Tensor & p, at::Tensor & p_copy, at::Tensor & m, at::Tensor & v, at::Tensor & g, float lr, float beta1, float beta2, float eps, float grad_scale, int step, int mode, int bias_correction, float decay);
+
+#define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
+
+// C++ interface: validates the tensors (CUDA, contiguous, matching numel), then forwards every argument to fused_adam_cuda
+void adam(at::Tensor & p, at::Tensor & p_copy, at::Tensor & m, at::Tensor & v, at::Tensor & g, float lr, float beta1, float beta2, float eps, float grad_scale, int step, int mode, int bias_correction, float decay) {
+        CHECK_INPUT(p);
+        if (p_copy.numel() > 0) { CHECK_INPUT(p_copy); }  // braces needed: CHECK_INPUT expands to two statements
+        CHECK_INPUT(m);
+        CHECK_INPUT(v);
+        CHECK_INPUT(g);
+        int64_t num_elem = p.numel();
+        AT_ASSERTM(m.numel() == num_elem, "number of elements in m and p tensors should be equal");
+        AT_ASSERTM(v.numel() == num_elem, "number of elements in v and p tensors should be equal");
+        AT_ASSERTM(g.numel() == num_elem, "number of elements in g and p tensors should be equal");
+        AT_ASSERTM(p_copy.numel() == num_elem || p_copy.numel() == 0, "number of elements in p_copy and p tensors should be equal, or p_copy should be empty");
+
+        fused_adam_cuda(p, p_copy, m, v, g, lr, beta1, beta2, eps, grad_scale, step, mode, bias_correction, decay);
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {  // exposes the checked adam() wrapper above to Python as `adam`
+        m.def("adam", &adam, "Adam optimized CUDA implementation.");
+}
diff --git a/apex/csrc/fused_adam_cuda_kernel.cu b/apex/csrc/fused_adam_cuda_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..95581d1e0030fa5d62cee2f94a8ddcef813a2e90
--- /dev/null
+++ b/apex/csrc/fused_adam_cuda_kernel.cu
@@ -0,0 +1,137 @@
+#include "ATen/ATen.h"
+#include "ATen/cuda/CUDAContext.h"
+#include "ATen/cuda/detail/IndexUtils.cuh"
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <stdio.h>
+#include <cmath>
+#include "ATen/TensorUtils.h"
+#include "ATen/Type.h"
+#include "ATen/AccumulateType.h"
+#include <THC/THCGeneral.h>
+
+#include "type_shim.h"
+
+typedef enum{
+    ADAM_MODE_0   =0, // eps under square root
+    ADAM_MODE_1   =1  // eps outside square root
+} adamMode_t;
+
+template <typename T, typename GRAD_T>
+__global__ void adam_cuda_kernel(
+        GRAD_T* __restrict__ p,        // parameters, read and updated in place as GRAD_T
+        T* __restrict__ p_copy, // For mixed precision training, pass NULL if not needed
+        T* __restrict__ m,             // running average of the gradient, updated in place
+        T* __restrict__ v,             // running average of the squared gradient, updated in place
+        const GRAD_T * __restrict__ g, // gradients (read only)
+        const float b1,
+        const float b2,
+        const float eps,
+        const float grad_scale,        // each gradient is divided by this before use
+        const float step_size,
+        const size_t tsize,            // number of elements to process
+        adamMode_t mode,               // selects where eps enters the denominator (see adamMode_t)
+        const float decay)             // coefficient of the p[j] term added to the update
+{
+        //Flatten (block, thread) into one global index; 2D grids and 2D blocks are assumed
+        const int blockId = gridDim.x * blockIdx.y + blockIdx.x;
+        const int threadsPerBlock = blockDim.x * blockDim.y;
+        const int threadIdInBlock = threadIdx.y * blockDim.x + threadIdx.x;
+        const int i = (blockId * threadsPerBlock + threadIdInBlock);
+        const int totThreads = gridDim.x*gridDim.y*threadsPerBlock;
+
+        for (int j = i; j < tsize; j+=totThreads) {  // each thread strides by the total number of launched threads
+                T scaled_grad = g[j]/grad_scale;
+                m[j] = b1*m[j] + (1-b1)*scaled_grad;
+                v[j] = b2*v[j] + (1-b2)*scaled_grad*scaled_grad;
+                float denom;
+                if (mode == ADAM_MODE_0)
+                    denom = sqrtf(v[j] + eps);
+                else // Mode 1
+                    denom = sqrtf(v[j]) + eps;
+                float update = (m[j]/denom) + (decay*p[j]);
+                p[j] = (GRAD_T) (p[j] - (step_size*update));
+                if (p_copy != NULL) p_copy[j] = (GRAD_T) p[j];  // optional mirror of the freshly updated parameter
+        }
+}
+
+void fused_adam_cuda(  // host launcher: computes the step size, then dispatches adam_cuda_kernel on g's scalar type
+        at::Tensor & p,
+        at::Tensor & p_copy,  // NOTE(review): never forwarded to the kernel; both launches below pass NULL
+        at::Tensor & m,
+        at::Tensor & v,
+        at::Tensor & g,
+        float lr,
+        float beta1,
+        float beta2,
+        float eps,
+        float grad_scale,
+        int step,
+        int mode,
+        int bias_correction,
+        float decay)
+{
+//        using namespace at;
+
+        //Get tensor size
+        int tsize = p.numel();
+        //Determine #threads and #blocks: one thread per element, 512 threads per block
+        const int threadsPerBlock = 512;
+        const dim3 blocks((tsize+threadsPerBlock-1)/threadsPerBlock);
+        AT_ASSERTM(at::cuda::detail::canUse32BitIndexMath(p), "parameter tensor is too large to be indexed with int32");
+        //Step size: lr, rescaled by sqrt(1-beta2^step)/(1-beta1^step) when bias_correction == 1
+        float step_size = 0;
+        if (bias_correction == 1) {
+            const float bias_correction1 = 1 - std::pow(beta1, step);
+            const float bias_correction2 = 1 - std::pow(beta2, step);
+            step_size = lr * std::sqrt(bias_correction2)/bias_correction1;
+        }
+        else {
+            step_size = lr;
+        }
+        cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+        if (g.scalar_type() == at::ScalarType::Half) {
+//the float-parameter check below is disabled: in this branch p is read with the gradient's scalar type
+//            AT_ASSERTM(p.scalar_type() == at::ScalarType::Float, "expected parameter to be of float type");
+//dispatch is done on the gradient type; m and v are read as its accumulate type (accscalar_t)
+            using namespace at; // prevents "toString is undefined" errors
+            DISPATCH_FLOAT_AND_HALF(g.scalar_type(), 0, "adam_cuda_kernel", 
+                using accscalar_t = at::acc_type<scalar_t_0, true>;
+                adam_cuda_kernel<accscalar_t, scalar_t_0><<<blocks,threadsPerBlock, 0, stream>>>(
+                        p.data<scalar_t_0>(),
+                        NULL, //p_copy is not written in this branch either: NULL is passed regardless of the argument
+                        m.data<accscalar_t>(),
+                        v.data<accscalar_t>(),
+                        g.data<scalar_t_0>(),
+                        beta1,
+                        beta2,
+                        eps,
+                        grad_scale,
+                        step_size,
+                        tsize,
+                        (adamMode_t) mode,
+                        decay);
+                )
+      } else {
+            using namespace at;
+            DISPATCH_DOUBLE_AND_FLOAT(g.scalar_type(), 0, "adam_cuda_kernel",
+                adam_cuda_kernel<scalar_t_0, scalar_t_0><<<blocks,threadsPerBlock, 0, stream>>>(
+                        p.data<scalar_t_0>(),
+                        NULL, //don't output p_copy for fp32, it's wasted write
+                        m.data<scalar_t_0>(),
+                        v.data<scalar_t_0>(),
+                        g.data<scalar_t_0>(),
+                        beta1,
+                        beta2,
+                        eps,
+                        grad_scale,
+                        step_size,
+                        tsize,
+                        (adamMode_t) mode,
+                        decay);
+            );
+      }
+      THCudaCheck(cudaGetLastError());  // surfaces kernel launch errors
+
+}
diff --git a/apex/csrc/layer_norm_cuda.cpp b/apex/csrc/layer_norm_cuda.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b512808c2f773a0701d16dd0cb41eef19ef29660
--- /dev/null
+++ b/apex/csrc/layer_norm_cuda.cpp
@@ -0,0 +1,240 @@
+#include <torch/extension.h>
+#include <vector>
+#include <cassert>
+
+namespace {  // file-local argument validation helpers
+void compute_n1_n2(  // n2 = product of normalized_shape; n1 = product of the input dims that precede it
+    at::Tensor input,
+    #ifdef VERSION_GE_1_1
+    at::IntArrayRef normalized_shape,
+    #else
+    at::IntList normalized_shape,
+    #endif
+    int& n1,
+    int& n2)
+{
+    int idiff = input.ndimension() - normalized_shape.size();  // number of leading (non-normalized) dims
+    n2 = 1;
+    for (int i = 0;  i < (int)normalized_shape.size();  ++i) {
+	    assert( input.sizes()[i+idiff] == normalized_shape[i] );  // compiled out when NDEBUG is defined
+	    n2 *= normalized_shape[i];
+    }
+    n1 = 1;
+    for (int i = 0;  i < idiff;  ++i) {
+	    n1 *= input.sizes()[i];
+    }
+}
+
+void check_args(  // gamma/beta may be undefined; when defined their sizes must equal normalized_shape
+    #ifdef VERSION_GE_1_1
+    at::IntArrayRef normalized_shape,
+    #else
+    at::IntList normalized_shape,
+    #endif
+    at::Tensor gamma,
+    at::Tensor beta
+    )
+{
+    AT_CHECK(!gamma.defined() || gamma.sizes().equals(normalized_shape));
+    AT_CHECK(!beta.defined() || beta.sizes().equals(normalized_shape));
+}
+
+void check_args(  // validates input against normalized_shape (throws std::runtime_error), then fills n1 and n2
+    at::Tensor input,
+    #ifdef VERSION_GE_1_1
+    at::IntArrayRef normalized_shape,
+    #else
+    at::IntList normalized_shape,
+    #endif
+    int& n1,
+    int& n2
+    )
+{
+    int64_t normalized_ndim = normalized_shape.size();
+
+    if (normalized_ndim < 1) {
+      std::stringstream ss;
+      ss << "Expected normalized_shape to be at least 1-dimensional, i.e., "
+         << "containing at least one element, but got normalized_shape="
+         << normalized_shape;
+      throw std::runtime_error(ss.str());
+    }
+
+    auto input_shape = input.sizes();
+    auto input_ndim = input.dim();
+
+    if (input_ndim < normalized_ndim ||
+        !input_shape.slice(input_ndim - normalized_ndim).equals(normalized_shape)) {  // trailing dims must match exactly
+      std::stringstream ss;
+      ss << "Given normalized_shape=" << normalized_shape
+         << ", expected input with shape [*";
+      for (auto size : normalized_shape) {
+        ss << ", " << size;
+      }
+      ss << "], but got input of size" << input_shape;
+      throw std::runtime_error(ss.str());
+    }
+
+    compute_n1_n2(input,normalized_shape,n1,n2);
+}
+
+
+void check_args(  // combined overload: runs the input check first, then the gamma/beta check
+    at::Tensor input,
+    #ifdef VERSION_GE_1_1
+    at::IntArrayRef normalized_shape,
+    #else
+    at::IntList normalized_shape,
+    #endif
+    at::Tensor gamma,
+    at::Tensor beta,
+    int& n1,
+    int& n2
+    )
+{
+    check_args(input,normalized_shape,n1,n2);
+    check_args(normalized_shape,gamma,beta);
+}
+}
+
+void cuda_layer_norm(
+    at::Tensor* output,
+    at::Tensor* mean,
+    at::Tensor* invvar,
+    at::Tensor* input,
+    int n1,
+    int n2,
+    #ifdef VERSION_GE_1_1
+    at::IntArrayRef normalized_shape,
+    #else
+    at::IntList normalized_shape,
+    #endif
+    at::Tensor* gamma,
+    at::Tensor* beta,
+    double epsilon);
+
+#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
+
+std::vector<at::Tensor> layer_norm(  // forward without gamma/beta; returns {output, mean, invvar}
+    at::Tensor input,
+    #ifdef VERSION_GE_1_1
+    at::IntArrayRef normalized_shape,
+    #else
+    at::IntList normalized_shape,
+    #endif
+    double epsilon) {
+  CHECK_INPUT(input);
+  int n1,n2;
+  check_args(input,normalized_shape,n1,n2);
+  const bool half_input = input.scalar_type()==at::ScalarType::Half;  // statistics are kept in Float for Half inputs
+  at::Tensor output = at::empty_like(input);
+  at::Tensor mean = at::empty({n1}, input.options().dtype(half_input ? at::ScalarType::Float : input.scalar_type()));
+  at::Tensor invvar = at::empty_like(mean);
+  cuda_layer_norm(&output,&mean,&invvar,&input,n1,n2,normalized_shape,NULL,NULL,epsilon);
+  return {output, mean, invvar};
+}
+std::vector<at::Tensor> layer_norm_affine(  // forward with gamma/beta; returns {output, mean, invvar}
+    at::Tensor input,
+    #ifdef VERSION_GE_1_1
+    at::IntArrayRef normalized_shape,
+    #else
+    at::IntList normalized_shape,
+    #endif
+    at::Tensor gamma,
+    at::Tensor beta,
+    double epsilon) {
+  CHECK_INPUT(input);
+  CHECK_INPUT(gamma);
+  CHECK_INPUT(beta);
+  int n1,n2;
+  check_args(input,normalized_shape,gamma,beta,n1,n2);
+  const bool half_input = input.scalar_type()==at::ScalarType::Half;  // statistics are kept in Float for Half inputs
+  at::Tensor output = at::empty_like(input);
+  at::Tensor mean = at::empty({n1}, input.options().dtype(half_input ? at::ScalarType::Float : input.scalar_type()));
+  at::Tensor invvar = at::empty_like(mean);
+  cuda_layer_norm(&output,&mean,&invvar,&input,n1,n2,normalized_shape,&gamma,&beta,epsilon);
+  return {output, mean, invvar};
+}
+
+void cuda_layer_norm_gradient(
+    at::Tensor* dout,
+    at::Tensor* mean,
+    at::Tensor* invvar,
+    at::Tensor* input,
+    int n1,
+    int n2,
+    #ifdef VERSION_GE_1_1
+    at::IntArrayRef normalized_shape,
+    #else
+    at::IntList normalized_shape,
+    #endif
+    at::Tensor* gamma,
+    at::Tensor* beta,
+    double epsilon,
+    at::Tensor* grad_input,
+    at::Tensor* grad_gamma,
+    at::Tensor* grad_beta
+    );
+
+at::Tensor layer_norm_gradient(  // backward without gamma/beta; returns grad_input only
+    at::Tensor dout,
+    at::Tensor mean,
+    at::Tensor invvar,
+    at::Tensor input,
+    #ifdef VERSION_GE_1_1
+    at::IntArrayRef normalized_shape,
+    #else
+    at::IntList normalized_shape,
+    #endif
+    double epsilon) {
+  CHECK_INPUT(dout);
+  CHECK_INPUT(mean);
+  CHECK_INPUT(invvar);
+  CHECK_INPUT(input);
+  int n1,n2;
+  check_args(input,normalized_shape,n1,n2);  // validates the shape and fills n1, n2
+  at::Tensor grad_input = at::empty_like(input);
+  cuda_layer_norm_gradient(&dout,&mean,&invvar,&input,n1,n2,
+      normalized_shape,NULL,NULL,epsilon,  // no gamma/beta
+      &grad_input,NULL,NULL);  // no grad_gamma/grad_beta outputs requested
+  return grad_input;
+}
+std::vector<at::Tensor> layer_norm_gradient_affine(  // backward with gamma/beta; returns {grad_input, grad_gamma, grad_beta}
+    at::Tensor dout,
+    at::Tensor mean,
+    at::Tensor invvar,
+    at::Tensor input,
+    #ifdef VERSION_GE_1_1
+    at::IntArrayRef normalized_shape,
+    #else
+    at::IntList normalized_shape,
+    #endif
+    at::Tensor gamma,
+    at::Tensor beta,
+    double epsilon) {
+  CHECK_INPUT(dout);
+  CHECK_INPUT(mean);
+  CHECK_INPUT(invvar);
+  CHECK_INPUT(input);
+  CHECK_INPUT(gamma);
+  CHECK_INPUT(beta);
+  int n1,n2;
+  check_args(input,normalized_shape,gamma,beta,n1,n2);  // validates shapes and fills n1, n2
+  at::Tensor grad_input = at::empty_like(input);
+  at::Tensor grad_gamma = at::empty_like(gamma);
+  at::Tensor grad_beta = at::empty_like(beta);
+  cuda_layer_norm_gradient(&dout,&mean,&invvar,&input,n1,n2,
+      normalized_shape,&gamma,&beta,epsilon,
+      &grad_input,&grad_gamma,&grad_beta);  // the three output tensors allocated above
+  return {grad_input, grad_gamma, grad_beta};
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {  // Python names: forward/backward, each with an *_affine variant taking gamma/beta
+  m.def("forward_affine", &layer_norm_affine, "LayerNorm forward (CUDA)");
+  m.def("forward", &layer_norm, "LayerNorm forward (CUDA)");
+  m.def("backward_affine", &layer_norm_gradient_affine, "LayerNorm backward (CUDA)");
+  m.def("backward", &layer_norm_gradient, "LayerNorm backward (CUDA)");
+}
+
diff --git a/apex/csrc/layer_norm_cuda_kernel.cu b/apex/csrc/layer_norm_cuda_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..889c891d17a6f97a89b6c451356ddc3fabdd2227
--- /dev/null
+++ b/apex/csrc/layer_norm_cuda_kernel.cu
@@ -0,0 +1,805 @@
+#include "ATen/ATen.h"
+#include "ATen/AccumulateType.h"
+#include "ATen/cuda/CUDAContext.h"
+#include <THC/THCDeviceUtils.cuh>
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include "type_shim.h"
+
+template<typename U> __device__
+void cuWelfordOnlineSum(
+  const U curr,
+  U& mu,
+  U& sigma2,
+  U& count)
+{
+  // One online-update step: folds the sample `curr` into the running
+  // (mean, sum of squared deviations, count) triple held in mu/sigma2/count.
+  count += U(1);
+  const U dev_before = curr - mu;      // deviation from the mean before the update
+  mu += dev_before / count;
+  sigma2 += dev_before * (curr - mu);  // (curr - old mean) * (curr - new mean)
+}
+
+template<typename U> __device__
+void cuChanOnlineSum(  // merges the partial statistics (muB, sigma2B, countB) into (mu, sigma2, count)
+  const U muB,
+  const U sigma2B,
+  const U countB,
+  U& mu,
+  U& sigma2,
+  U& count)
+{
+  U delta = muB - mu;
+  U nA = count;
+  U nB = countB;
+  count = count + countB;
+  U nX = count;  // combined sample count
+  if (nX > U(0)) {
+    nA = nA / nX;  // from here nA and nB are the fractions of samples on each side
+    nB = nB / nX;
+    mu = nA*mu + nB*muB;  // count-weighted mean of the two partial means
+    sigma2 = sigma2 + sigma2B + delta * delta * nA * nB * nX;
+  } else {  // both sides empty: avoid dividing by zero and reset the statistics
+    mu = U(0);
+    sigma2 = U(0);
+  }
+}
+
+template<typename T, typename U> __device__
+void cuWelfordMuSigma2(
+  const T* __restrict__ vals,
+  const int n1,
+  const int n2,
+  U& mu,
+  U& sigma2,
+  U* buf) 
+{
+  // Assumptions:
+  // 1) blockDim.x == warpSize
+  // 2) Tensor is contiguous
+  // 3) 2*blockDim.y*sizeof(U)+blockDim.y*sizeof(int) shared memory available (counts are stored as U in ibuf below).
+  //
+  // compute variance and mean over n2; on return sigma2 has been divided by n2
+  U count = U(0);
+  mu= U(0);
+  sigma2 = U(0);
+  int i1 = blockIdx.y;
+  if (i1 < n1) {
+    // every thread of the block works on row i1 (= blockIdx.y):
+    // each thread first accumulates its own strided slice of the row
+    // with the standard Welford algorithm
+    const int numx = blockDim.x * blockDim.y;
+    const int thrx = threadIdx.x + threadIdx.y * blockDim.x;
+    const T* lvals = vals + i1*n2;
+    int l = 4*thrx;
+    for (;  l+3 < n2;  l+=4*numx) {  // main loop: 4 consecutive elements per thread per step
+      for (int k = 0;  k < 4;  ++k) {
+        U curr = static_cast<U>(lvals[l+k]);
+        cuWelfordOnlineSum<U>(curr,mu,sigma2,count);
+      }
+    }
+    for (;  l < n2;  ++l) {  // remainder when fewer than 4 elements are left from l
+      U curr = static_cast<U>(lvals[l]);
+      cuWelfordOnlineSum<U>(curr,mu,sigma2,count);
+    }
+    // intra-warp reductions
+    for (int l = 0;  l <= 4;  ++l) {
+      int srcLaneB = (threadIdx.x+(1<<l))&31;  // partner lane at offset 1,2,4,8,16, wrapped to 32 lanes
+      U muB = WARP_SHFL(mu, srcLaneB);
+      U countB = WARP_SHFL(count, srcLaneB);
+      U sigma2B = WARP_SHFL(sigma2, srcLaneB);
+      cuChanOnlineSum<U>(muB,sigma2B,countB,mu,sigma2,count);
+    }
+    // threadIdx.x == 0 has correct values for each warp
+    // inter-warp reductions
+    if (blockDim.y > 1) {
+      U* ubuf = (U*)buf;
+      U* ibuf = (U*)(ubuf + blockDim.y);  // counts live right after the (mu, sigma2) pairs
+      for (int offset = blockDim.y/2;  offset > 0;  offset /= 2) {
+        // upper half of warps write to shared
+        if (threadIdx.x == 0 && threadIdx.y >= offset && threadIdx.y < 2*offset) {
+          const int wrt_y = threadIdx.y - offset;
+          ubuf[2*wrt_y] = mu;
+          ubuf[2*wrt_y+1] = sigma2;
+          ibuf[wrt_y] = count;
+        }
+        __syncthreads();
+        // lower half merges
+        if (threadIdx.x == 0 && threadIdx.y < offset) {
+          U muB = ubuf[2*threadIdx.y];
+          U sigma2B = ubuf[2*threadIdx.y+1];
+          U countB = ibuf[threadIdx.y];
+          cuChanOnlineSum<U>(muB,sigma2B,countB,mu,sigma2,count);
+        }
+        __syncthreads();
+      }
+      // threadIdx.x = 0 && threadIdx.y == 0 only thread that has correct values
+      if (threadIdx.x == 0 && threadIdx.y == 0) {
+        ubuf[0] = mu;
+        ubuf[1] = sigma2;
+      }
+      __syncthreads();
+      mu = ubuf[0];  // every thread of the block reads the final result back
+      sigma2 = ubuf[1]/U(n2);
+      // don't care about final value of count, we know count == n2
+    } else {
+      mu = WARP_SHFL(mu, 0);
+      sigma2 = WARP_SHFL(sigma2/U(n2), 0);
+    }
+  }
+}
+
+template<> __device__
+void cuWelfordMuSigma2(
+  const at::Half* __restrict__ vals,
+  const int n1,
+  const int n2,
+  float& mu,
+  float& sigma2,
+  float* buf) 
+{
+  // Specialization for at::Half input with float statistics. Assumptions:
+  // 1) blockDim.x == warpSize
+  // 2) Tensor is contiguous
+  // 3) 2*blockDim.y*sizeof(U)+blockDim.y*sizeof(int) shared memory available (counts are stored as float in ibuf below).
+  //
+  // compute variance and mean over n2; on return sigma2 has been divided by n2
+  float count = 0.0f;
+  mu= float(0);
+  sigma2 = float(0);
+  int i1 = blockIdx.y;
+  if (i1 < n1) {
+    // every thread of the block works on row i1 (= blockIdx.y):
+    // each thread first accumulates its own strided slice of the row
+    // with the standard Welford algorithm
+    const int numx = blockDim.x * blockDim.y;
+    const int thrx = threadIdx.x + threadIdx.y * blockDim.x;
+    const at::Half* lvals = vals + i1*n2;
+    int l = 8*thrx;
+    if ((((size_t)lvals)&3) != 0) {
+      // row start is not 4-byte aligned (16 bit alignment only)
+      // first thread consumes first point
+      if (thrx == 0) {
+        float curr = static_cast<float>(lvals[0]);
+        cuWelfordOnlineSum(curr,mu,sigma2,count);
+      }
+      ++l;  // every thread skips that element so the paired loads below start 4-byte aligned
+    }
+    // at this point, lvals[l] are 32 bit aligned for all threads.
+    for (;  l+7 < n2;  l+=8*numx) {
+      for (int k = 0;  k < 8;  k+=2) {
+        float2 curr = __half22float2(*((__half2*)(lvals+l+k)));  // loads two halves at once
+        cuWelfordOnlineSum(curr.x,mu,sigma2,count);
+	cuWelfordOnlineSum(curr.y,mu,sigma2,count);
+      }
+    }
+    for (;  l < n2;  ++l) {  // remainder, one element at a time
+      float curr = static_cast<float>(lvals[l]);
+      cuWelfordOnlineSum(curr,mu,sigma2,count);
+    }
+    // intra-warp reductions
+    for (int l = 0;  l <= 4;  ++l) {
+      int srcLaneB = (threadIdx.x+(1<<l))&31;  // partner lane at offset 1,2,4,8,16, wrapped to 32 lanes
+      float muB = WARP_SHFL(mu, srcLaneB);
+      float countB = WARP_SHFL(count, srcLaneB);
+      float sigma2B = WARP_SHFL(sigma2, srcLaneB);
+      cuChanOnlineSum(muB,sigma2B,countB,mu,sigma2,count);
+    }
+    // threadIdx.x == 0 has correct values for each warp
+    // inter-warp reductions
+    if (blockDim.y > 1) {
+      float* ubuf = (float*)buf;
+      float* ibuf = (float*)(ubuf + blockDim.y);  // counts live right after the (mu, sigma2) pairs
+      for (int offset = blockDim.y/2;  offset > 0;  offset /= 2) {
+        // upper half of warps write to shared
+        if (threadIdx.x == 0 && threadIdx.y >= offset && threadIdx.y < 2*offset) {
+          const int wrt_y = threadIdx.y - offset;
+          ubuf[2*wrt_y] = mu;
+          ubuf[2*wrt_y+1] = sigma2;
+          ibuf[wrt_y] = count;
+        }
+        __syncthreads();
+        // lower half merges
+        if (threadIdx.x == 0 && threadIdx.y < offset) {
+          float muB = ubuf[2*threadIdx.y];
+          float sigma2B = ubuf[2*threadIdx.y+1];
+          float countB = ibuf[threadIdx.y];
+          cuChanOnlineSum(muB,sigma2B,countB,mu,sigma2,count);
+        }
+        __syncthreads();
+      }
+      // threadIdx.x = 0 && threadIdx.y == 0 only thread that has correct values
+      if (threadIdx.x == 0 && threadIdx.y == 0) {
+        ubuf[0] = mu;
+        ubuf[1] = sigma2;
+      }
+      __syncthreads();
+      mu = ubuf[0];  // every thread of the block reads the final result back
+      sigma2 = ubuf[1]/float(n2);
+      // don't care about final value of count, we know count == n2
+    } else {
+      mu = WARP_SHFL(mu, 0);
+      sigma2 = WARP_SHFL(sigma2/float(n2), 0);
+    }
+  }
+}
+
+template<typename U> U rsqrt(U v) {  // generic fallback: reciprocal square root via sqrt
+  return U(1) / sqrt(v);
+}
+template<> float rsqrt(float v) {  // float: uses rsqrtf
+  return rsqrtf(v);
+}
+template<> double rsqrt(double v) {  // NOTE(review): presumably binds to a non-template rsqrt(double) from the toolkit; otherwise this calls itself - confirm
+  return rsqrt(v);
+}
+
+namespace {
+// The un-specialized struct is only declared below, never defined, so SharedMemory<T>
+// can only be used through the float/double specializations. Previous definition, for reference:
+//  template <typename T>
+//  struct SharedMemory
+//  {
+//      // Ensure that we won't compile any un-specialized types
+//      __device__ T *getPointer()
+//      {
+//          extern __device__ void error(void);
+//          error();
+//          return NULL;
+//      }
+//  };
+// https://github.com/NVIDIA/apex/issues/246
+template <typename T>
+struct SharedMemory;
+
+template <>
+struct SharedMemory <float>
+{
+    __device__ float *getPointer()  // returns the extern __shared__ array viewed as float
+    {
+        extern __shared__ float s_float[];
+        return s_float;
+    }
+};
+
+template <>
+struct SharedMemory <double>
+{
+    __device__ double *getPointer()  // returns the extern __shared__ array viewed as double
+    {
+        extern __shared__ double s_double[];
+        return s_double;
+    }
+};
+}
+
+template<typename T, typename U> __global__
+void cuApplyLayerNorm(
+  T* __restrict__ output_vals,
+  U* __restrict__ mean,
+  U* __restrict__ invvar,
+  const T* __restrict__ vals,
+  const int n1,
+  const int n2,
+  const U epsilon,
+  const T* __restrict__ gamma,
+  const T* __restrict__ beta
+  ) 
+{
+  // Normalizes row i1 = blockIdx.y of vals (n2 elements) and records its mean / inverse std. Assumptions:
+  // 1) blockDim.x == warpSize
+  // 2) Tensors are contiguous
+  //
+  int i1 = blockIdx.y;
+  if (i1 < n1) {
+    SharedMemory<U> shared;
+    U* buf = shared.getPointer();
+    U mu,sigma2;
+    cuWelfordMuSigma2(vals,n1,n2,mu,sigma2,buf);  // fills mu and sigma2 (already divided by n2) for this row
+    const T* lvals = vals + i1*n2;
+    T* ovals = output_vals + i1*n2;
+    U c_invvar = rsqrt(sigma2 + epsilon);
+    const int numx = blockDim.x * blockDim.y;
+    const int thrx = threadIdx.x + threadIdx.y * blockDim.x;
+    if (gamma != NULL && beta != NULL) {  // the affine transform is applied only when both are given
+      for (int i = thrx;  i < n2;  i+=numx) {
+        U curr = static_cast<U>(lvals[i]);
+        ovals[i] = gamma[i] * static_cast<T>(c_invvar * (curr - mu)) + beta[i];
+      }
+    } else {
+      for (int i = thrx;  i < n2;  i+=numx) {
+        U curr = static_cast<U>(lvals[i]);
+        ovals[i] = static_cast<T>(c_invvar * (curr - mu));
+      }
+    }
+    if (threadIdx.x == 0 && threadIdx.y == 0) {  // one thread per block stores the row statistics
+      mean[i1] = mu;
+      invvar[i1] = c_invvar;
+    }
+  }
+}
+
+template<typename T, typename U> __device__
+void cuLoadWriteStridedInputs(  // overwrites blockDim.y entries of warp_buf1/warp_buf2 for row i1_block+thr_load_row_off
+    const int i1_block,
+    const int thr_load_row_off,
+    const int thr_load_col_off,
+    const int i2_off,
+    const int row_stride,
+    U* warp_buf1,
+    U* warp_buf2,
+    const T* input,
+    const T* dout,
+    const int i1_end,
+    const int n2,
+    const U* __restrict__ mean,
+    const U* __restrict__ invvar
+    )
+{
+  int i1 = i1_block+thr_load_row_off;
+  if (i1 < i1_end) {
+    U curr_mean = mean[i1];
+    U curr_invvar = invvar[i1];
+    for (int k = 0;  k < blockDim.y;  ++k) {
+      int i2 = i2_off + k;
+      int load_idx = i1*n2+i2;
+      int write_idx = thr_load_row_off*row_stride+thr_load_col_off+k;
+      if (i2<n2) {
+        U curr_input = static_cast<U>(input[load_idx]);
+	U curr_dout = static_cast<U>(dout[load_idx]);
+	warp_buf1[write_idx] = curr_dout;  // dout term
+	warp_buf2[write_idx] = curr_dout * (curr_input - curr_mean) * curr_invvar;  // dout times the normalized input
+      } else {  // column outside the row: store zeros
+        warp_buf1[write_idx] = U(0);
+        warp_buf2[write_idx] = U(0);
+      }
+    }
+  } else {  // row outside [.., i1_end): store zeros for all blockDim.y entries
+    for (int k = 0;  k < blockDim.y;  ++k) {
+      int write_idx = thr_load_row_off*row_stride+thr_load_col_off+k;
+      warp_buf1[write_idx] = U(0);
+      warp_buf2[write_idx] = U(0);
+    }
+  }
+}
+
+template<typename T, typename U> __device__
+void cuLoadAddStridedInputs(  // same indexing as cuLoadWriteStridedInputs, but accumulates (+=) and skips out-of-range entries
+    const int i1_block,
+    const int thr_load_row_off,
+    const int thr_load_col_off,
+    const int i2_off,
+    const int row_stride,
+    U* warp_buf1,
+    U* warp_buf2,
+    const T* input,
+    const T* dout,
+    const int i1_end,
+    const int n2,
+    const U* __restrict__ mean,
+    const U* __restrict__ invvar
+    )
+{
+  int i1 = i1_block+thr_load_row_off;
+  if (i1 < i1_end) {  // rows at or past i1_end contribute nothing
+    U curr_mean = mean[i1];
+    U curr_invvar = invvar[i1];
+    for (int k = 0;  k < blockDim.y;  ++k) {
+      int i2 = i2_off + k;
+      int load_idx = i1*n2+i2;
+      int write_idx = thr_load_row_off*row_stride+thr_load_col_off+k;
+      if (i2<n2) {
+        U curr_input = static_cast<U>(input[load_idx]);
+	U curr_dout = static_cast<U>(dout[load_idx]);
+	warp_buf1[write_idx] += curr_dout;  // dout term
+	warp_buf2[write_idx] += curr_dout * (curr_input - curr_mean) * curr_invvar;  // dout times the normalized input
+      }
+    }
+  }
+}
+
+// Computes per-block partial sums for the gamma and beta gradients.
+// For the rows [i1_beg, i1_end) assigned to blockIdx.y and each column i2 covered
+// by blockIdx.x, the kernel writes (with [i1,i2] meaning flat index i1*n2+i2):
+//   part_grad_beta [blockIdx.y*n2 + i2] = sum_i1 dout[i1,i2]
+//   part_grad_gamma[blockIdx.y*n2 + i2] = sum_i1 dout[i1,i2] * (input[i1,i2] - mean[i1]) * invvar[i1]
+// Parameters:
+//   dout, input      n1*n2 elements each, indexed as i1*n2+i2
+//   mean, invvar     per-row statistics, indexed by i1
+//   epsilon          accepted but not used in this kernel
+//   part_grad_*      outputs, gridDim.y rows of n2 elements
+// Requires dynamic shared memory for two buffers of
+// blockDim.y*blockDim.y*(blockDim.x+1) elements of U.
+// NOTE(review): the column-offset mask below looks like it assumes blockDim.x is a
+// power of two, and the final combine reads row 1, i.e. blockDim.y >= 2 -- confirm
+// against the launch configuration.
+template<typename T, typename U> __global__
+void cuComputePartGradGammaBeta(
+    const T* __restrict__ dout,
+    const T* __restrict__ input,
+    const int n1,
+    const int n2,
+    const U* __restrict__ mean,
+    const U* __restrict__ invvar,
+    U epsilon,
+    U* part_grad_gamma,
+    U* part_grad_beta)
+{
+    // Rows are processed in segments of blockDim.y*blockDim.y rows; the segments
+    // are divided evenly (rounding up) among the gridDim.y blocks.
+    const int numsegs_n1 = (n1+blockDim.y*blockDim.y-1) / (blockDim.y*blockDim.y);
+    const int segs_per_block = (numsegs_n1 + gridDim.y - 1) / gridDim.y;
+    const int i1_beg = blockIdx.y * segs_per_block * blockDim.y*blockDim.y;
+    const int i1_beg_plus_one = (blockIdx.y+1) * segs_per_block * blockDim.y*blockDim.y;
+    // Clamp the end of this block's row range to n1.
+    const int i1_end = i1_beg_plus_one < n1 ? i1_beg_plus_one : n1;
+    // One extra element per shared-memory row (presumably padding against bank conflicts).
+    const int row_stride = blockDim.x+1;
+    // Each thread owns blockDim.y consecutive columns of one row of the tile.
+    const int thr_load_col_off = (threadIdx.x*blockDim.y)&(blockDim.x-1);
+    const int thr_load_row_off = (threadIdx.x*blockDim.y)/blockDim.x + threadIdx.y*blockDim.y;
+    const int i2_off = blockIdx.x * blockDim.x + thr_load_col_off;
+    SharedMemory<U> shared;
+    U* buf = shared.getPointer(); // buf has at least blockDim.x * blockDim.y * blockDim.y + (blockDim.y - 1)*(blockDim.x/blockDim.y) elements
+    // warp_buf1 accumulates dout (beta), warp_buf2 accumulates the normalised product (gamma).
+    U* warp_buf1 = (U*)buf;
+    U* warp_buf2 = warp_buf1 + blockDim.y * blockDim.y * row_stride;
+    // compute partial sums from strided inputs
+    // do this to increase number of loads in flight
+    // The first segment initialises the buffers; later segments add into them.
+    cuLoadWriteStridedInputs(i1_beg,thr_load_row_off,thr_load_col_off,i2_off,row_stride,warp_buf1,warp_buf2,input,dout,i1_end,n2,mean,invvar);
+    for (int i1_block = i1_beg+blockDim.y*blockDim.y;  i1_block < i1_end;  i1_block+=blockDim.y*blockDim.y) {
+      cuLoadAddStridedInputs(i1_block,thr_load_row_off,thr_load_col_off,i2_off,row_stride,warp_buf1,warp_buf2,input,dout,i1_end,n2,mean,invvar);
+    }
+    __syncthreads();
+    // inter-warp reductions
+    // sum within each warp
+    // Thread (x, y) sums column x over rows y, y+blockDim.y, y+2*blockDim.y, ...
+    U acc1 = U(0);
+    U acc2 = U(0);
+    for (int k = 0;  k < blockDim.y;  ++k) {
+      int row1 = threadIdx.y + k*blockDim.y;
+      int idx1 = row1*row_stride + threadIdx.x;
+      acc1 += warp_buf1[idx1];
+      acc2 += warp_buf2[idx1];
+    }
+    // Row y, column x is read only by thread (x, y) above, so it can be overwritten
+    // here without an intervening barrier.
+    warp_buf1[threadIdx.y*row_stride+threadIdx.x] = acc1;
+    warp_buf2[threadIdx.y*row_stride+threadIdx.x] = acc2;
+    __syncthreads();
+    // sum all warps
+    // Tree reduction over the first blockDim.y rows; stops with two rows left.
+    for (int offset = blockDim.y/2;  offset > 1;  offset /= 2) {
+      if (threadIdx.y < offset) {
+        int row1 = threadIdx.y;
+	int row2 = threadIdx.y + offset;
+	int idx1 = row1*row_stride + threadIdx.x;
+	int idx2 = row2*row_stride + threadIdx.x;
+	warp_buf1[idx1] += warp_buf1[idx2];
+	warp_buf2[idx1] += warp_buf2[idx2];
+      }
+      __syncthreads();
+    }
+    // Final step: threads with threadIdx.y == 0 add rows 0 and 1 and write the
+    // result for their column, skipping columns beyond n2.
+    int i2 = blockIdx.x * blockDim.x + threadIdx.x;
+    if (threadIdx.y == 0 && i2 < n2) {
+      int row1 = threadIdx.y;
+      int row2 = threadIdx.y + 1;
+      int idx1 = row1*row_stride + threadIdx.x;
+      int idx2 = row2*row_stride + threadIdx.x;
+      part_grad_beta[blockIdx.y*n2+i2] = warp_buf1[idx1] + warp_buf1[idx2];
+      part_grad_gamma[blockIdx.y*n2+i2] = warp_buf2[idx1] + warp_buf2[idx2];
+    }
+}
+
+// Reduces the partial gamma/beta gradients over their part_size rows and writes
+// the final per-column gradients.
+// Parameters:
+//   part_grad_gamma, part_grad_beta  part_size rows of n2 partial sums each
+//   part_size                        number of partial rows
+//   n1                               accepted but not used in this kernel
+//   n2                               number of columns
+//   grad_gamma, grad_beta            outputs, n2 elements each
+// Requires dynamic shared memory for blockDim.x*blockDim.y elements of U
+// (two halves of blockDim.x*blockDim.y/2, one for gamma and one for beta).
+// NOTE(review): num_warp_reductions uses integer division, so this appears to
+// assume part_size is a multiple of blockDim.y -- confirm with the caller.
+// NOTE(review): the __syncthreads() calls sit inside the `i2 < n2` branch, so in a
+// block that straddles n2 not every thread reaches them -- confirm this is intended.
+template<typename T, typename U> __global__
+void cuComputeGradGammaBeta(
+    const U* part_grad_gamma,
+    const U* part_grad_beta,
+    const int part_size,
+    const int n1,
+    const int n2,
+    T* grad_gamma,
+    T* grad_beta)
+{
+    // sum partial gradients for gamma and beta
+    SharedMemory<U> shared;
+    U* buf = shared.getPointer(); 
+    // Column handled by this thread.
+    int i2 = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i2 < n2) {
+      // each warp does sequential reductions until reduced part_size is num_warps
+      int num_warp_reductions = part_size / blockDim.y;
+      U sum_gamma = U(0);
+      U sum_beta = U(0);
+      // Each threadIdx.y starts at its own group of num_warp_reductions consecutive rows.
+      const U* part_grad_gamma_ptr = part_grad_gamma + threadIdx.y * num_warp_reductions * n2 + i2;
+      const U* part_grad_beta_ptr = part_grad_beta + threadIdx.y * num_warp_reductions * n2 + i2;
+      for (int warp_offset = 0;  warp_offset < num_warp_reductions;  ++warp_offset) {
+        sum_gamma += part_grad_gamma_ptr[warp_offset*n2];
+        sum_beta += part_grad_beta_ptr[warp_offset*n2];
+      }
+      // inter-warp reductions
+      // Offset of the beta half of the shared buffer.
+      const int nbsize3 = blockDim.x * blockDim.y / 2;
+      for (int offset = blockDim.y/2;  offset >= 1;  offset /= 2) {
+        // top half write to shared memory
+        if (threadIdx.y >= offset && threadIdx.y < 2*offset) {
+          const int write_idx = (threadIdx.y - offset) * blockDim.x + threadIdx.x;
+          buf[write_idx] = sum_gamma;
+          buf[write_idx+nbsize3] = sum_beta;
+        }
+        __syncthreads();
+        // bottom half sums
+        if (threadIdx.y < offset) {
+          const int read_idx = threadIdx.y * blockDim.x + threadIdx.x;
+          sum_gamma += buf[read_idx];
+          sum_beta += buf[read_idx+nbsize3];
+        }
+        // Barrier before the next round overwrites the same shared slots.
+        __syncthreads();
+      }
+      // write out fully summed gradients
+      // The accumulators of type U are converted to T by the assignment.
+      if (threadIdx.y == 0) {
+        grad_gamma[i2] = sum_gamma;
+        grad_beta[i2] = sum_beta;
+      }
+    }
+}
+
+// Computes the layer-norm gradient with respect to the input, one row per block
+// (row i1 = blockIdx.y). With g = gamma[l] (or 1 when gamma is NULL):
+//   sum_loss1 = sum_l dout[l] * g
+//   sum_loss2 = sum_l dout[l] * g * (input[l] - mean) * invvar
+//   grad_input[l] = (invvar / n2) *
+//       (n2 * dout[l] * g - sum_loss1 - (input[l] - mean) * invvar * sum_loss2)
+// Parameters:
+//   dout, input   n1*n2 elements each, row i1 starts at i1*n2
+//   mean, invvar  per-row statistics
+//   epsilon       accepted but not used in this kernel
+//   gamma         optional per-column scale, may be NULL
+//   grad_input    output, same layout as input
+// When blockDim.y > 1 the kernel needs dynamic shared memory for
+// blockDim.x*blockDim.y elements of U.
+// NOTE(review): the shuffle-based reduction over blockDim.x presumably relies on
+// all threads sharing a threadIdx.y forming one warp (blockDim.x == 32) -- confirm.
+template<typename T, typename U> __global__
+void cuComputeGradInput(
+    const T* __restrict__ dout,
+    const T* __restrict__ input,
+    const int n1,
+    const int n2,
+    const U* __restrict__ mean,
+    const U* __restrict__ invvar,
+    U epsilon,
+    const T* gamma,
+    T* grad_input)
+{
+  int i1 = blockIdx.y;
+  if (i1 < n1) {
+    U sum_loss1 = U(0);
+    U sum_loss2 = U(0);
+    const U c_mean = mean[i1];
+    const U c_invvar = invvar[i1];
+    const T* k_input = input + i1*n2;
+    const T* k_dout = dout + i1*n2;
+    // Flatten the 2-D block into a linear thread index.
+    const int numx = blockDim.x * blockDim.y;
+    const int thrx = threadIdx.x + threadIdx.y * blockDim.x;
+    if (gamma != NULL) {
+      // Main loop: each thread takes groups of 4 consecutive elements.
+      int l = 4*thrx;
+      for (;  l+3 < n2;  l+=4*numx) {
+        for (int k = 0;  k < 4;  ++k) {
+          const U c_h = static_cast<U>(k_input[l+k]);
+          const U c_loss = static_cast<U>(k_dout[l+k]);
+          sum_loss1 += c_loss * gamma[l+k];
+          sum_loss2 += c_loss * gamma[l+k] * (c_h - c_mean) * c_invvar;
+        }
+      }
+      // Tail: l is a multiple of 4, so at most one thread exits the main loop with
+      // l < n2; that thread handles the last n2 % 4 elements.
+      for (;  l < n2;  ++l) {
+        const U c_h = static_cast<U>(k_input[l]);
+        const U c_loss = static_cast<U>(k_dout[l]);
+        sum_loss1 += c_loss * gamma[l];
+        sum_loss2 += c_loss * gamma[l] * (c_h - c_mean) * c_invvar;
+      }
+    } else {
+      // Same accumulation without the gamma factor.
+      int l = 4*thrx;
+      for (;  l+3 < n2;  l+=4*numx) {
+        for (int k = 0;  k < 4;  ++k) {
+          const U c_h = static_cast<U>(k_input[l+k]);
+          const U c_loss = static_cast<U>(k_dout[l+k]);
+          sum_loss1 += c_loss;
+          sum_loss2 += c_loss * (c_h - c_mean) * c_invvar;
+        }
+      }
+      for (;  l < n2;  ++l) {
+        const U c_h = static_cast<U>(k_input[l]);
+        const U c_loss = static_cast<U>(k_dout[l]);
+        sum_loss1 += c_loss;
+        sum_loss2 += c_loss * (c_h - c_mean) * c_invvar;
+      }
+    }
+    // intra-warp reductions
+    // XOR-shuffle exchange over threadIdx.x with masks blockDim.x/2, ..., 1.
+    for (int mask = blockDim.x/2;  mask > 0;  mask /= 2) {
+      sum_loss1 += WARP_SHFL_XOR(sum_loss1, mask);
+      sum_loss2 += WARP_SHFL_XOR(sum_loss2, mask);
+    }
+    // inter-warp reductions
+    if (blockDim.y > 1) {
+      SharedMemory<U> shared;
+      U* buf = shared.getPointer(); 
+      // The two sums are stored interleaved: buf[2*i] and buf[2*i+1].
+      for (int offset = blockDim.y/2;  offset > 0;  offset /= 2) {
+        // upper half of warps write to shared
+        if (threadIdx.y >= offset && threadIdx.y < 2*offset) {
+          const int wrt_i = (threadIdx.y - offset) * blockDim.x + threadIdx.x;
+          buf[2*wrt_i] = sum_loss1;
+          buf[2*wrt_i+1] = sum_loss2;
+        }
+        __syncthreads();
+        // lower half merges
+        if (threadIdx.y < offset) {
+          const int read_i = threadIdx.y * blockDim.x + threadIdx.x;
+          sum_loss1 += buf[2*read_i];
+          sum_loss2 += buf[2*read_i+1];
+        }
+        __syncthreads();
+      }
+      // Broadcast: threads with threadIdx.y == 0 hold the totals and publish them
+      // so the remaining threads can pick them up.
+      if (threadIdx.y == 0) {
+        buf[2*threadIdx.x] = sum_loss1;
+        buf[2*threadIdx.x+1] = sum_loss2;
+      }
+      __syncthreads();
+      if (threadIdx.y !=0) {
+        sum_loss1 = buf[2*threadIdx.x];
+        sum_loss2 = buf[2*threadIdx.x+1];
+      } 
+    }
+    // all threads now have the two sums over l
+    U fH = (U)n2;
+    // Common factor invvar / n2 applied to every element.
+    U term1 = (U(1) / fH) * c_invvar;
+    T* k_grad_input = grad_input + i1*n2;
+    if (gamma != NULL) {
+      for (int l = thrx;  l < n2;  l+=numx) {
+        const U c_h = static_cast<U>(k_input[l]);
+        const U c_loss = static_cast<U>(k_dout[l]);
+        U f_grad_input = fH * c_loss * gamma[l];
+        f_grad_input -= sum_loss1;
+        f_grad_input -= (c_h - c_mean) * c_invvar * sum_loss2;
+        f_grad_input *= term1;
+        k_grad_input[l] = static_cast<T>(f_grad_input);
+      }
+    } else {
+      for (int l = thrx;  l < n2;  l+=numx) {
+        const U c_h = static_cast<U>(k_input[l]);
+        const U c_loss = static_cast<U>(k_dout[l]);
+        U f_grad_input = fH * c_loss;
+        f_grad_input -= sum_loss1;
+        f_grad_input -= (c_h - c_mean) * c_invvar * sum_loss2;
+        f_grad_input *= term1;
+        k_grad_input[l] = static_cast<T>(f_grad_input);
+      }
+    }
+  }
+}
+
+// Host-side launcher for cuApplyLayerNorm on the current CUDA stream.
+// Uses a 32x4 thread block and a grid of n1 blocks along y. gamma and beta are
+// forwarded unchanged (callers may pass NULL); epsilon is converted to U.
+template<typename T, typename U> 
+void HostApplyLayerNorm(
+    T* output,
+    U* mean,
+    U* invvar,
+    const T* input,
+    int n1,
+    int n2,
+    double epsilon,
+    const T* gamma,
+    const T* beta
+    )
+{
+    const dim3 block_dim(32,4,1);
+    const dim3 grid_dim(1,n1,1);
+
+    // Dynamic shared memory is only requested when the block has more than one
+    // row of threads: threads.y + threads.y/2 elements of U.
+    int shared_bytes = 0;
+    if (block_dim.y > 1) {
+        shared_bytes = block_dim.y*sizeof(U)+(block_dim.y/2)*sizeof(U);
+    }
+
+    auto stream = at::cuda::getCurrentCUDAStream().stream();
+    cuApplyLayerNorm<<<grid_dim, block_dim, shared_bytes, stream>>>(
+        output,
+        mean,
+        invvar,
+        input,
+        n1,n2,
+        U(epsilon),
+        gamma,beta);
+}
+
+// Type-dispatching entry point for the layer-norm forward pass.
+// Dispatches on input's scalar type (double, float or half) and forwards raw data
+// pointers to HostApplyLayerNorm. gamma and beta may be NULL, in which case NULL
+// is passed through. normalized_shape is accepted but not used here.
+void cuda_layer_norm(
+    at::Tensor* output,
+    at::Tensor* mean,
+    at::Tensor* invvar,
+    at::Tensor* input,
+    int n1,
+    int n2,
+    #ifdef VERSION_GE_1_1
+    at::IntArrayRef normalized_shape,
+    #else
+    at::IntList normalized_shape,
+    #endif
+    at::Tensor* gamma,
+    at::Tensor* beta,
+    double epsilon)
+{
+    using namespace at;
+    DISPATCH_DOUBLE_FLOAT_AND_HALF(input->scalar_type(), 0, "layer_norm_cuda_kernel",
+        // mean and invvar are stored in the accumulation type of the input type.
+        using accscalar_t = at::acc_type<scalar_t_0, true>;
+        // Optional affine parameters: stay NULL when the tensor is absent.
+        const scalar_t_0* gamma_ptr = NULL;
+        const scalar_t_0* beta_ptr = NULL;
+        if (gamma != NULL) { gamma_ptr = gamma->data<scalar_t_0>(); }
+        if (beta != NULL) { beta_ptr = beta->data<scalar_t_0>(); }
+        HostApplyLayerNorm(
+            output->data<scalar_t_0>(),
+            mean->data<accscalar_t>(),
+            invvar->data<accscalar_t>(),
+            input->data<scalar_t_0>(),
+            n1,n2,
+            epsilon,
+            gamma_ptr,
+            beta_ptr);
+      )
+}
+
+// Host-side driver for the layer-norm backward pass on the current CUDA stream.
+//
+// When both gamma and beta are non-NULL it first computes grad_gamma and
+// grad_beta in two steps:
+//   1. cuComputePartGradGammaBeta reduces the n1 rows into part_size partial rows;
+//   2. cuComputeGradGammaBeta reduces those partial rows into the final gradients.
+// It then always launches cuComputeGradInput to produce grad_input.
+//
+// Parameters:
+//   dout            incoming gradient, n1*n2 elements
+//   mean, invvar    per-row statistics saved by the forward pass
+//   input           forward input tensor (also supplies the options used for the
+//                   temporary partial-gradient tensors)
+//   n1, n2          number of rows / columns
+//   gamma, beta     optional affine parameters; may be NULL
+//   epsilon         converted to U and forwarded to the kernels
+//   grad_input      output, n1*n2 elements
+//   grad_gamma/beta outputs, only written when gamma and beta are both non-NULL
+template<typename T, typename U> 
+void HostLayerNormGradient(
+    const T* dout,
+    const U* mean,
+    const U* invvar,
+    at::Tensor* input,
+    int n1,
+    int n2,
+    const T* gamma,
+    const T* beta,
+    double epsilon,
+    T* grad_input,
+    T* grad_gamma,
+    T* grad_beta
+    )
+{
+    auto stream = at::cuda::getCurrentCUDAStream().stream();
+
+    if (gamma != NULL && beta != NULL) {
+      // compute grad_gamma(j) and grad_beta(j)
+      const int part_size = 16;
+      const dim3 threads2(32,4,1);
+      const dim3 blocks2((n2+threads2.x-1)/threads2.x,part_size,1);
+      // Shared memory: two padded (threads2.x+1)-wide buffers of threads2.y^2 rows,
+      // or one element per thread, whichever is larger.
+      const int nshared2_a = 2 * sizeof(U) * threads2.y * threads2.y * (threads2.x + 1);
+      const int nshared2_b = threads2.x * threads2.y * sizeof(U);
+      const int nshared2 = nshared2_a > nshared2_b ? nshared2_a : nshared2_b;
+      // Partial sums are kept in float when the input is half, otherwise in the input type.
+      at::Tensor part_grad_gamma = at::empty({part_size,n2}, input->options().dtype(input->scalar_type()==at::ScalarType::Half ? at::ScalarType::Float : input->scalar_type()));
+      at::Tensor part_grad_beta = at::empty_like(part_grad_gamma);
+      cuComputePartGradGammaBeta<<<blocks2, threads2, nshared2, stream>>>(
+                      dout,
+                      input->data<T>(),
+                      n1,n2,
+                      mean,
+                      invvar,
+                      U(epsilon),
+                      part_grad_gamma.data<U>(),
+                      part_grad_beta.data<U>());
+
+      const dim3 threads3(32,8,1);
+      // The grid must cover n2 columns using the block width of the kernel that is
+      // actually launched here (threads3), not the width of the previous launch.
+      const dim3 blocks3((n2+threads3.x-1)/threads3.x,1,1);
+      const int nshared3 = threads3.x * threads3.y * sizeof(U);
+      cuComputeGradGammaBeta<<<blocks3, threads3, nshared3, stream>>>(
+                      part_grad_gamma.data<U>(),
+                      part_grad_beta.data<U>(),
+                      part_size,
+                      n1,n2,
+                      grad_gamma,
+                      grad_beta);
+    }
+
+    // compute grad_input
+    const dim3 threads1(32,4,1);
+    const dim3 blocks1(1,n1,1);
+    // One element of U per thread, needed only for the cross-row reduction.
+    int nshared =
+            threads1.y > 1 ?
+            threads1.y*threads1.x*sizeof(U) :
+            0;
+    cuComputeGradInput<<<blocks1, threads1, nshared, stream>>>(
+            dout,
+            input->data<T>(),
+            n1,n2,
+            mean,
+            invvar,
+            U(epsilon),
+            gamma,
+            grad_input);
+}
+
+// Type-dispatching entry point for the layer-norm backward pass.
+// Dispatches on input's scalar type (float or half) and forwards raw data pointers
+// to HostLayerNormGradient.
+//
+// gamma, beta, grad_gamma and grad_beta are optional: a NULL tensor pointer is
+// forwarded as a NULL data pointer instead of being dereferenced, matching the
+// forward wrapper and the NULL checks performed in HostLayerNormGradient.
+// normalized_shape is accepted but not used here.
+void cuda_layer_norm_gradient(
+    at::Tensor* dout,
+    at::Tensor* mean,
+    at::Tensor* invvar,
+    at::Tensor* input,
+    int n1,
+    int n2,
+    #ifdef VERSION_GE_1_1
+    at::IntArrayRef normalized_shape,
+    #else
+    at::IntList normalized_shape,
+    #endif
+    at::Tensor* gamma,
+    at::Tensor* beta,
+    double epsilon,
+    at::Tensor* grad_input,
+    at::Tensor* grad_gamma,
+    at::Tensor* grad_beta)
+{
+    using namespace at;
+    DISPATCH_FLOAT_AND_HALF(input->scalar_type(), 0, "cuComputeGradInput",
+        // mean and invvar are stored in the accumulation type of the input type.
+        using accscalar_t = at::acc_type<scalar_t_0, true>;
+        HostLayerNormGradient(
+            dout->data<scalar_t_0>(),
+            mean->data<accscalar_t>(),
+            invvar->data<accscalar_t>(),
+            input,
+            n1,n2,
+            gamma != NULL ? gamma->data<scalar_t_0>() : NULL,
+            beta != NULL ? beta->data<scalar_t_0>() : NULL,
+            epsilon,
+            grad_input->data<scalar_t_0>(),
+            grad_gamma != NULL ? grad_gamma->data<scalar_t_0>() : NULL,
+            grad_beta != NULL ? grad_beta->data<scalar_t_0>() : NULL);
+      )
+}
diff --git a/apex/csrc/multi_tensor_apply.cuh b/apex/csrc/multi_tensor_apply.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..0e63031594c6b0618489349fea0c428b66071c77
--- /dev/null
+++ b/apex/csrc/multi_tensor_apply.cuh
@@ -0,0 +1,125 @@
+#include <ATen/ATen.h>
+#include <ATen/AccumulateType.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/Exceptions.h>
+
+#include <assert.h>
+
+// #include <iostream>
+
+// This header is the one-stop shop for all your multi-tensor apply needs.
+
+
+// TODO:  Kernel arg size limit may be <4KB for some other cards (ie Jetson)
+// Per-launch capacity limits, indexed by (depth - 1), where depth is the number
+// of tensor lists processed together. They size the fixed arrays inside
+// TensorListMetadata, which is handed to the kernel as a by-value argument.
+//   depth_to_max_tensors: maximum number of tensors described in one launch.
+//   depth_to_max_blocks:  maximum number of blocks (tensor chunks) in one launch.
+constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30};
+constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320};
+
+// Fixed-size description of the work for one kernel launch over n parallel
+// tensor lists. Filled on the host by multi_tensor_apply and passed to the
+// kernel by value.
+template<int n> struct TensorListMetadata
+{
+  // addresses[d][i]: data pointer of the tensor in slot i of list d.
+  void* addresses[n][depth_to_max_tensors[n-1]];
+  // sizes[i]: element count of the tensor in slot i.
+  int sizes[depth_to_max_tensors[n-1]];
+  // block_to_tensor[b]: slot index of the tensor that block b works on.
+  // Stored as unsigned char, so slot indices must stay below 256.
+  unsigned char block_to_tensor[depth_to_max_blocks[n-1]];
+  // block_to_chunk[b]: index of the chunk within that tensor handled by block b.
+  int block_to_chunk[depth_to_max_blocks[n-1]]; // I fear this needs to be a full int.
+  // Index, in the caller's tensor list, of the tensor stored in slot 0.
+  int start_tensor_this_launch;
+};
+
+
+// Generic kernel entry point: every thread simply invokes the user-supplied
+// functor with the chunk size, the no-op flag, the launch metadata and any extra
+// arguments. All per-block work selection happens inside the functor.
+//   chunk_size  number of elements per chunk
+//   noop_flag   pointer to an int flag, passed through to the functor
+//   tl          launch metadata (received by value)
+//   callable    functor providing operator()(chunk_size, noop_flag, tl, args...)
+//   args        extra arguments forwarded to the functor
+template<typename T, typename U, typename... ArgTypes>
+__global__ void multi_tensor_apply_kernel(
+    int chunk_size,
+    volatile int* noop_flag,
+    T tl,
+    U callable,
+    ArgTypes... args)
+{
+  // Hand the chunk information to the user-supplied functor to process however it likes.
+  callable(chunk_size, noop_flag, tl, args...); 
+}
+
+// Applies `callable` to every chunk of every tensor in `tensor_lists`, batching
+// as many tensors/chunks as fit into one TensorListMetadata per kernel launch.
+//
+// Template parameter:
+//   depth         number of parallel tensor lists (must equal tensor_lists.size())
+// Parameters:
+//   block_size    threads per block for each launch
+//   chunk_size    number of elements handled by one block
+//   noop_flag     int tensor whose data pointer is passed to the functor
+//   tensor_lists  `depth` lists of equal length; tensors at the same position must
+//                 have the same number of elements, be contiguous and live on CUDA
+//   callable      device functor invoked as callable(chunk_size, noop_flag, tl, args...)
+//   args          extra arguments forwarded to the functor
+//
+// Each tensor is split into ceil(numel / chunk_size) chunks, one block per chunk.
+// A launch is issued whenever the tensor table or the block table is full, or
+// when the last chunk of the last tensor has been queued. Zero-element tensors
+// still occupy a slot in the tensor table (so start_tensor_this_launch + slot
+// keeps matching the caller's tensor index) but contribute no blocks.
+template<int depth, typename T, typename... ArgTypes>
+void multi_tensor_apply(
+  int block_size,
+  int chunk_size,
+  const at::Tensor& noop_flag,
+  const std::vector<std::vector<at::Tensor>>& tensor_lists,
+  T callable,
+  ArgTypes... args)
+{
+  AT_CHECK(tensor_lists.size() == depth, "tensor_lists.size() != depth");
+  int len0 = tensor_lists[0].size();
+  AT_CHECK(len0 > 0, "tensor_lists[0].size() is not > 0");
+
+  for(int l = 0; l < tensor_lists.size(); l++) // No range-based for because I need indices
+  {
+    AT_CHECK(tensor_lists[l].size() == len0, "Size mismatch among tensor lists");
+    for(int t = 0; t < tensor_lists[l].size(); t++)
+    {
+      // TODO:  Print which tensor fails.
+      AT_CHECK(tensor_lists[l][t].is_contiguous(), "A tensor was not contiguous.");
+      AT_CHECK(tensor_lists[l][t].is_cuda(), "A tensor was not cuda.");
+      AT_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(), "Size mismatch");
+    }
+  }
+
+  int ntensors = tensor_lists[0].size();
+
+  TensorListMetadata<depth> tl;
+
+  auto stream = at::cuda::getCurrentCUDAStream();
+
+  tl.start_tensor_this_launch = 0;
+  int loc_block_info = 0;   // blocks queued for the next launch
+  int loc_tensor_info = 0;  // tensor slots used for the next launch
+  for(int t = 0; t < ntensors; t++)
+  {
+    // Register tensor t in the next free slot.
+    tl.sizes[loc_tensor_info] = tensor_lists[0][t].numel();
+    for(int d = 0; d < depth; d++)
+      tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr();
+    loc_tensor_info++;
+
+    int chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1)/chunk_size;
+
+    if(chunks_this_tensor == 0)
+    {
+      // A zero-element tensor never enters the chunk loop below, so the flush
+      // conditions evaluated there would be skipped. Handle them here: flush when
+      // the tensor table is full (otherwise the next tensor would be written past
+      // the end of tl.sizes / tl.addresses) or when this is the last tensor
+      // (otherwise blocks queued by earlier tensors would never be launched).
+      bool tensors_full = (loc_tensor_info == depth_to_max_tensors[depth-1]);
+      bool last_tensor = (t == ntensors - 1);
+      if(tensors_full || last_tensor)
+      {
+        // A launch with zero blocks is an invalid configuration, so only launch
+        // when something is actually queued.
+        if(loc_block_info > 0)
+        {
+          multi_tensor_apply_kernel<<<loc_block_info, block_size, 0, stream>>>(
+            chunk_size,
+            noop_flag.data<int>(),
+            tl,
+            callable,
+            args...);
+
+          AT_CUDA_CHECK(cudaGetLastError());
+        }
+
+        loc_block_info = 0;
+        loc_tensor_info = 0;
+        tl.start_tensor_this_launch = t + 1;
+      }
+      continue;
+    }
+
+    for(int chunk = 0; chunk < chunks_this_tensor; chunk++)
+    {
+      // Queue one block for this chunk.
+      tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
+      tl.block_to_chunk[loc_block_info] = chunk;
+      loc_block_info++;
+
+      bool tensors_full = (loc_tensor_info == depth_to_max_tensors[depth-1] &&
+                           chunk == chunks_this_tensor - 1);
+      bool blocks_full = (loc_block_info == depth_to_max_blocks[depth-1]);
+      bool last_chunk = (t == ntensors - 1 && chunk == chunks_this_tensor - 1);
+      if(tensors_full || blocks_full || last_chunk)
+      {
+        multi_tensor_apply_kernel<<<loc_block_info, block_size, 0, stream>>>(
+          chunk_size,
+          noop_flag.data<int>(),
+          tl,
+          callable,
+          args...);
+
+        AT_CUDA_CHECK(cudaGetLastError());
+
+        // Reset.  The control flow possibilities here make my brain hurt.
+        loc_block_info = 0;
+        if(chunk == chunks_this_tensor - 1)
+        {
+          // The current tensor is finished: start the next launch with an empty table.
+          loc_tensor_info = 0;
+          tl.start_tensor_this_launch = t + 1;
+        }
+        else
+        {
+          // The current tensor still has chunks left: carry it over into slot 0
+          // of the next launch.
+          tl.sizes[0] = tl.sizes[loc_tensor_info-1];
+          for(int d = 0; d < depth; d++)
+            tl.addresses[d][0] = tl.addresses[d][loc_tensor_info-1];
+          loc_tensor_info = 1;
+          tl.start_tensor_this_launch = t;
+        }
+      }
+    }
+  }
+}
diff --git a/apex/csrc/multi_tensor_axpby_kernel.cu b/apex/csrc/multi_tensor_axpby_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0fccabffd03a18e60513b69248fc243738f45ba5
--- /dev/null
+++ b/apex/csrc/multi_tensor_axpby_kernel.cu
@@ -0,0 +1,118 @@
+#include <ATen/ATen.h>
+#include <ATen/AccumulateType.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/Exceptions.h>
+// Another possibility:
+// #include <torch/all.h>
+
+#include <assert.h>
+
+#include "type_shim.h"
+#include "multi_tensor_apply.cuh"
+
+#define BLOCK_SIZE 512
+#define ILP 4
+
+// Device functor for multi_tensor_apply computing out = a*x + b*y elementwise on
+// one chunk of one tensor (lists 0, 1 and 2 hold x, y and out respectively).
+// Inputs are converted to float for the arithmetic and the result is cast to out_t.
+// arg_to_check selects which inputs are tested for non-finite values:
+//   -1: both x and y,  0: x only,  1: y only; any other value disables the test.
+// When a tested value is not finite, *noop_gmem is set to 1.
+template<typename x_t, typename y_t, typename out_t>
+struct AxpbyFunctor
+{
+   __device__ __forceinline__ void operator()(
+    int chunk_size,
+    volatile int* noop_gmem,
+    TensorListMetadata<3>& tl,
+    float a,
+    float b,
+    int arg_to_check)
+  {
+    // The flag is deliberately not read for an early exit, so that infs/nans
+    // still propagate into the output.
+
+    const int tensor_loc = tl.block_to_tensor[blockIdx.x];
+    const int chunk_idx = tl.block_to_chunk[blockIdx.x];
+    const int chunk_start = chunk_idx*chunk_size;
+
+    // Pointers to the first element of this block's chunk.
+    x_t* x = (x_t*)tl.addresses[0][tensor_loc] + chunk_start;
+    y_t* y = (y_t*)tl.addresses[1][tensor_loc] + chunk_start;
+    out_t* out = (out_t*)tl.addresses[2][tensor_loc] + chunk_start;
+
+    // Elements remaining in the tensor from the start of this chunk.
+    const int n = tl.sizes[tensor_loc] - chunk_start;
+
+    // Non-divergent exit condition for __syncthreads, not necessary here
+    float x_vals[ILP];
+    float y_vals[ILP];
+    for(int base = 0;
+        base < n && base < chunk_size;
+        base += blockDim.x*ILP)
+    {
+      // Phase 1: load up to ILP elements per thread; out-of-range lanes hold 0.
+      #pragma unroll
+      for(int ii = 0; ii < ILP; ii++)
+      {
+        const int i = base + threadIdx.x + ii*blockDim.x;
+        const bool in_range = (i < n && i < chunk_size);
+        x_vals[ii] = in_range ? static_cast<float>(x[i]) : 0.f;
+        y_vals[ii] = in_range ? static_cast<float>(y[i]) : 0.f;
+      }
+
+      // Phase 2: compute, store and check.
+      // see note in multi_tensor_scale_kernel.cu
+      #pragma unroll
+      for(int ii = 0; ii < ILP; ii++)
+      {
+        const int i = base + threadIdx.x + ii*blockDim.x;
+        if(!(i < n && i < chunk_size))
+          continue;
+
+        out[i] = static_cast<out_t>(a*x_vals[ii] + b*y_vals[ii]);
+
+        bool finite = true;
+        switch(arg_to_check)
+        {
+          case -1:
+            finite = (isfinite(x_vals[ii]) && isfinite(y_vals[ii]));
+            break;
+          case 0:
+            finite = isfinite(x_vals[ii]);
+            break;
+          case 1:
+            finite = isfinite(y_vals[ii]);
+            break;
+          default:
+            break;
+        }
+        if(!finite)
+          *noop_gmem = 1; // Blindly fire off a write.  These will race but that's ok.
+      }
+    }
+  }
+};
+
+// Host entry point for out = a*x + b*y over lists of tensors.
+// tensor_lists holds three lists: x (index 0), y (index 1) and out (index 2).
+// The element type of each list is taken from its first tensor and dispatched
+// independently over float and half. arg_to_check is forwarded to AxpbyFunctor
+// and selects which inputs are tested for non-finite values.
+void multi_tensor_axpby_cuda(
+  int chunk_size,
+  at::Tensor noop_flag,
+  std::vector<std::vector<at::Tensor>> tensor_lists,
+  float a,
+  float b,
+  int arg_to_check)
+{
+  using namespace at;
+  // If build times suffer, think about where to put this dispatch,
+  // and what logic should be moved out of multi_tensor_apply.
+
+  const auto x_type = tensor_lists[0][0].scalar_type();
+
+  DISPATCH_FLOAT_AND_HALF(x_type, 0, "multi_tensor_axpby_cuda",
+    DISPATCH_FLOAT_AND_HALF(tensor_lists[1][0].scalar_type(), 1, "multi_tensor_axpby_cuda",
+      DISPATCH_FLOAT_AND_HALF(tensor_lists[2][0].scalar_type(), 2, "multi_tensor_axpby_cuda",
+           using functor_t = AxpbyFunctor<scalar_t_0, scalar_t_1, scalar_t_2>;
+           multi_tensor_apply<3>(
+             BLOCK_SIZE,
+             chunk_size,
+             noop_flag,
+             tensor_lists,
+             functor_t(),
+             a,
+             b,
+             arg_to_check); )))
+
+  // Surface any launch error left pending by the kernels above.
+  AT_CUDA_CHECK(cudaGetLastError());
+
+  // AT_CUDA_CHECK(cudaDeviceSynchronize());
+}
diff --git a/apex/csrc/multi_tensor_l2norm_kernel.cu b/apex/csrc/multi_tensor_l2norm_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5277c0c932e8e5553ac088831a3828e97c65e247
--- /dev/null
+++ b/apex/csrc/multi_tensor_l2norm_kernel.cu
@@ -0,0 +1,180 @@
+#include <ATen/ATen.h>
+#include <ATen/AccumulateType.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/Exceptions.h>
+// Another possibility:
+// #include <torch/all.h>
+
+#include <assert.h>
+
+#include "type_shim.h"
+#include "multi_tensor_apply.cuh"
+
+#define BLOCK_SIZE 512
+#define ILP 4
+
+// Device functor for multi_tensor_apply that accumulates the sum of squares of
+// one chunk of one tensor.
+//   output             per-block accumulator, indexed by blockIdx.x
+//   output_per_tensor  per-(tensor, chunk) results, only written when per_tensor
+//   max_chunks_per_tensor  row stride of output_per_tensor
+// If the block's sum is not finite, *noop_gmem is set to 1.
+template<typename x_t>
+struct L2NormFunctor
+{
+  __device__ __forceinline__ void operator()(
+    int chunk_size,
+    volatile int* noop_gmem,
+    TensorListMetadata<1>& tl,
+    float* output,
+    float* output_per_tensor,
+    bool per_tensor,
+    int max_chunks_per_tensor)
+  {
+    // I'd like this kernel to propagate infs/nans.
+    // if(*noop_gmem == 1)
+    //   return;
+
+    // Look up which tensor and which chunk this block is responsible for.
+    int tensor_loc = tl.block_to_tensor[blockIdx.x];
+    int chunk_idx = tl.block_to_chunk[blockIdx.x];
+    int n = tl.sizes[tensor_loc];
+
+    x_t* x = (x_t*)tl.addresses[0][tensor_loc];
+    x += chunk_idx*chunk_size;
+
+    // n now counts the elements remaining from the start of this chunk.
+    n -= chunk_idx*chunk_size;
+
+    // Scratch space for the block reduction (presumably sized for up to 512
+    // threads per block -- confirm against reduce_block_into_lanes).
+    __shared__ float s_vals[512];
+
+    float vals[ILP]; // = {0}; // this probably works too but I want to be sure...
+    for(int i = 0; i < ILP; i++)
+      vals[i] = 0.f;
+
+    // Each thread accumulates squares into ILP independent partial sums.
+    for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x*ILP)
+    {
+      #pragma unroll
+      for(int ii = 0; ii < ILP; ii++)
+      {
+        int i = i_start + threadIdx.x + ii*blockDim.x;
+        if(i < n && i < chunk_size)
+        {
+          float next = static_cast<float>(x[i]);
+          vals[ii] += next*next;
+        }
+      }
+    }
+
+    float val = 0.f;
+    for(int i = 0; i < ILP; i++)
+        val += vals[i];
+
+    // reduce_block_into_lanes is defined elsewhere; the result is only consumed
+    // by thread 0 below (presumably the block-wide sum -- confirm).
+    float final = reduce_block_into_lanes(s_vals, val);
+
+    if(threadIdx.x == 0)
+    {
+      if(!isfinite(final))
+        *noop_gmem = 1; // Blindly fire off a write.  These will race but that's ok.
+      // Accumulate across launches: the same blockIdx.x slot is added to each time.
+      output[blockIdx.x] += final;
+      if(per_tensor)
+        output_per_tensor[(tl.start_tensor_this_launch + tensor_loc)*max_chunks_per_tensor + chunk_idx] = final;
+    }
+  }
+};
+
+
+// Final reduction step for the multi-tensor L2 norm.
+//   output             per-block partial sums of squares; the first 320 entries are read
+//   output_per_tensor  per-(tensor, chunk) sums, max_chunks_per_tensor per tensor
+//   ret                receives sqrt of the total sum (written by block 0)
+//   ret_per_tensor     receives sqrt of each tensor's sum, indexed by blockIdx.x
+//   per_tensor         enables the per-tensor part
+// Block 0 always produces *ret; when per_tensor is set, every block additionally
+// reduces the row of output_per_tensor selected by its blockIdx.x.
+__global__ void cleanup(
+  float* output,
+  float* output_per_tensor,
+  float* ret,
+  float* ret_per_tensor,
+  bool per_tensor,
+  int max_chunks_per_tensor)
+{
+  // Scratch space shared by both reductions below.
+  __shared__ float vals[512];
+
+  if(blockIdx.x == 0)
+  {
+    // Threads beyond the 320 partial sums contribute zero.
+    float val = 0;
+    if(threadIdx.x < 320)
+      val = output[threadIdx.x];
+
+    // reduce_block_into_lanes is defined elsewhere; only thread 0 uses the result.
+    float final = reduce_block_into_lanes(vals, val);
+
+    if(threadIdx.x == 0)
+      *ret = sqrt(final);
+  }
+
+  if(per_tensor)
+  {
+    float* output_this_tensor = output_per_tensor + blockIdx.x*max_chunks_per_tensor;
+
+    // Threads stride over this tensor's chunk sums in steps of blockDim.x.
+    float val = 0;
+    for(int i = threadIdx.x; i < max_chunks_per_tensor; i += blockDim.x)
+      val += output_this_tensor[i];
+
+    float final = reduce_block_into_lanes(vals, val);
+
+    if(threadIdx.x == 0)
+      ret_per_tensor[blockIdx.x] = sqrt(final);
+  }
+}
+
+
+// Computes the L2 norm over all tensors in tensor_lists[0] and, optionally, the
+// L2 norm of each tensor individually.
+//
+// Parameters:
+//   chunk_size         number of elements handled by one block
+//   noop_flag          int tensor; set to 1 by the kernel when a non-finite
+//                      partial sum is encountered
+//   tensor_lists       a single list of tensors (depth 1), dispatched on the
+//                      scalar type of its first tensor (float or half)
+//   per_tensor_python  when present and true, per-tensor norms are also computed
+//
+// Returns a tuple (ret, ret_per_tensor): ret is a 1-element float tensor with the
+// overall norm; ret_per_tensor has one float per tensor when per-tensor norms
+// were requested and is empty otherwise.
+std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_cuda(
+  int chunk_size,
+  at::Tensor noop_flag,
+  std::vector<std::vector<at::Tensor>> tensor_lists,
+  at::optional<bool> per_tensor_python)
+{
+  bool per_tensor = per_tensor_python.has_value() ? per_tensor_python.value() : false;
+
+  auto float_options = tensor_lists[0][0].options().dtype(at::kFloat);
+  // One accumulator per block index; 320 matches the value read by the cleanup kernel.
+  auto output = at::zeros({320}, float_options);
+
+  at::Tensor output_per_tensor;
+  at::Tensor ret_per_tensor;
+
+  int ntensors = tensor_lists[0].size();
+  int max_chunks_per_tensor = -1;
+
+  if(per_tensor)
+  {
+    // The per-tensor scratch buffer is a dense ntensors x max_chunks table, so
+    // find the largest chunk count over all tensors first.
+    for(int t = 0; t < ntensors; t++)
+    {
+      int max_chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1)/chunk_size;
+      if(max_chunks_this_tensor > max_chunks_per_tensor)
+        max_chunks_per_tensor = max_chunks_this_tensor;
+    }
+    output_per_tensor = at::zeros({ntensors*max_chunks_per_tensor}, float_options);
+    ret_per_tensor = at::empty({ntensors}, float_options);
+  }
+  else
+  {
+    ret_per_tensor = at::empty({0}, float_options);
+  }
+
+  DISPATCH_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), 0, "multi_tensor_l2norm_cuda",
+    multi_tensor_apply<1>(
+      BLOCK_SIZE,
+      chunk_size,
+      noop_flag,
+      tensor_lists,
+      L2NormFunctor<scalar_t_0>(),
+      output.data<float>(),
+      per_tensor ? output_per_tensor.data<float>() : nullptr,
+      per_tensor,
+      max_chunks_per_tensor);)
+
+  AT_CUDA_CHECK(cudaGetLastError());
+
+  // AT_CUDA_CHECK(cudaDeviceSynchronize());
+
+  // This involves one more small kernel launch, but its cost is negligible end to end.
+  // I could get rid of it by hacking the functor + multi tensor harness with persistence
+  // logic, but keeping it simple for now
+  auto ret = at::empty({1}, output.options());
+  auto stream = at::cuda::getCurrentCUDAStream();
+  // One block per tensor when per-tensor norms are requested, otherwise a single block.
+  cleanup<<<per_tensor ? ntensors : 1, 512, 0, stream>>>(
+    output.data<float>(),
+    per_tensor ? output_per_tensor.data<float>() : nullptr,
+    ret.data<float>(),
+    per_tensor ? ret_per_tensor.data<float>() : nullptr,
+    per_tensor,
+    max_chunks_per_tensor);
+
+  // Report a failed cleanup launch here rather than at some later, unrelated CUDA call.
+  AT_CUDA_CHECK(cudaGetLastError());
+
+  return std::tuple<at::Tensor, at::Tensor>(ret, ret_per_tensor);
+}
diff --git a/apex/csrc/multi_tensor_lamb_stage_1.cu b/apex/csrc/multi_tensor_lamb_stage_1.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2a80d1d44178525957bb1a625a0747349c0098fa
--- /dev/null
+++ b/apex/csrc/multi_tensor_lamb_stage_1.cu
@@ -0,0 +1,149 @@
+#include <ATen/ATen.h>
+#include <ATen/AccumulateType.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/Exceptions.h>
+// Another possibility:
+// #include <torch/all.h>
+
+#include <assert.h>
+
+#include "type_shim.h"
+#include "multi_tensor_apply.cuh"
+
+#define BLOCK_SIZE 512
+#define ILP 4
+
+// Step 1 computes the 'update' value of regular Adam optimizer.
+// Device functor for multi_tensor_apply operating on five tensor lists:
+//   0: gradient g (GRAD_T), 1: parameter p, 2: first moment m,
+//   3: second moment v, 4: update output (all T).
+// For every element of the block's chunk it computes
+//   scaled_grad = g / clipped_global_grad_norm
+//   m = beta1 * m + (1 - beta1) * scaled_grad
+//   v = beta2 * v + (1 - beta2) * scaled_grad^2
+//   update = (m / beta1_correction) / (sqrt(v / beta2_correction) + epsilon) + decay * p
+// and writes update, m and v back; p itself is not modified.
+// per_tensor_decay is indexed by the tensor's position in the caller's list.
+template<typename GRAD_T, typename T>
+struct LAMBStage1Functor
+{
+   __device__ __forceinline__ void operator()(
+    int chunk_size,
+    volatile int* noop_gmem,
+    TensorListMetadata<5>& tl,
+    const float* per_tensor_decay,
+    const float beta1,
+    const float beta2,
+    const float beta1_correction,
+    const float beta2_correction,
+    const float epsilon,
+    const float clipped_global_grad_norm)
+  {
+    // I'd like this kernel to propagate infs/nans.
+    // if(*noop_gmem == 1)
+    //   return;
+
+    int tensor_loc = tl.block_to_tensor[blockIdx.x];
+    // Position of this tensor in the caller's list (slot index plus launch offset).
+    int tensor_num = tl.start_tensor_this_launch + tensor_loc;
+    int chunk_idx = tl.block_to_chunk[blockIdx.x];
+    int n = tl.sizes[tensor_loc];
+
+    float decay = per_tensor_decay[tensor_num];
+
+    // Advance every pointer to the start of this block's chunk.
+    GRAD_T* g = (GRAD_T*)tl.addresses[0][tensor_loc];
+    g += chunk_idx*chunk_size;
+
+    T* p = (T*)tl.addresses[1][tensor_loc];
+    p += chunk_idx*chunk_size;
+
+    T* m = (T*)tl.addresses[2][tensor_loc];
+    m += chunk_idx*chunk_size;
+
+    T* v = (T*)tl.addresses[3][tensor_loc];
+    v += chunk_idx*chunk_size;
+
+    T* update = (T*)tl.addresses[4][tensor_loc];
+    update += chunk_idx*chunk_size;
+
+    // n now counts the elements remaining from the start of this chunk.
+    n -= chunk_idx*chunk_size;
+
+    // see note in multi_tensor_scale_kernel.cu
+    for(int i_start = 0;
+            i_start < n && i_start < chunk_size;
+            i_start += blockDim.x*ILP)
+    {
+      GRAD_T r_g[ILP];
+      T r_p[ILP];
+      T r_m[ILP];
+      T r_v[ILP];
+      // Phase 1: load up to ILP elements per thread. Out-of-range lanes are
+      // zero-filled so that phase 2 operates on defined values; their results
+      // are never stored.
+#pragma unroll
+      for(int ii = 0; ii < ILP; ii++)
+      {
+        int i = i_start + threadIdx.x + ii*blockDim.x;
+        if(i < n && i < chunk_size)
+        {
+          r_g[ii] = g[i];
+          r_p[ii] = p[i];
+          r_m[ii] = m[i];
+          r_v[ii] = v[i];
+        } else {
+          r_g[ii] = GRAD_T(0);
+          r_p[ii] = T(0);
+          r_m[ii] = T(0);
+          r_v[ii] = T(0);
+        }
+      }
+      // Phase 2: moment updates and the update value, computed in registers.
+      // r_p is reused to hold the resulting update.
+#pragma unroll
+      for(int ii = 0; ii < ILP; ii++)
+      {
+        T scaled_grad = r_g[ii] / clipped_global_grad_norm;
+        r_m[ii] = r_m[ii] * beta1 + (1-beta1) * scaled_grad;
+        r_v[ii] = r_v[ii] * beta2 + (1-beta2) * scaled_grad * scaled_grad;
+        T next_m_unbiased = r_m[ii] / beta1_correction;
+        T next_v_unbiased = r_v[ii] / beta2_correction;
+        T denom = std::sqrt(next_v_unbiased) + epsilon;
+        r_p[ii] = (next_m_unbiased/denom) + (decay*r_p[ii]);
+      }
+      // Phase 3: store results for in-range elements only.
+#pragma unroll
+      for(int ii = 0; ii < ILP; ii++)
+      {
+        int i = i_start + threadIdx.x + ii*blockDim.x;
+        if(i < n && i < chunk_size)
+        {
+          update[i] = r_p[ii];
+          m[i] = r_m[ii];
+          v[i] = r_v[ii];
+        }
+      }
+    }
+  }
+};
+
+// Host entry point for LAMB stage 1.
+// tensor_lists holds five lists (gradients, parameters, first moments, second
+// moments, update outputs). The gradient type is dispatched on the first tensor
+// of list 0 and the type of the remaining lists on the first tensor of list 1,
+// each over float and half.
+// Derived scalars passed to the functor:
+//   clipped_global_grad_norm  global_grad_norm / max_global_grad_norm when the
+//                             global norm exceeds the maximum, otherwise 1
+//   beta{1,2}_correction      1 - beta^(step + 1)
+void multi_tensor_lamb_stage1_cuda(
+  int chunk_size,
+  at::Tensor noop_flag,
+  std::vector<std::vector<at::Tensor>> tensor_lists,
+  at::Tensor per_tensor_decay,
+  const int step,
+  const float beta1,
+  const float beta2,
+  const float epsilon,
+  const float global_grad_norm,
+  const float max_global_grad_norm)
+{
+  using namespace at;
+
+  // Divisor applied to every gradient; stays at 1 unless clipping is needed.
+  float clipped_global_grad_norm = 1.0f;
+  if (global_grad_norm > max_global_grad_norm)
+  {
+    clipped_global_grad_norm = global_grad_norm / max_global_grad_norm;
+  }
+
+  // Bias-correction denominators for the step about to be taken.
+  const float next_step = float(step+1);
+  const float beta1_correction = 1.0f - std::pow(beta1, next_step);
+  const float beta2_correction = 1.0f - std::pow(beta2, next_step);
+
+  DISPATCH_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), 0, "lamb_stage_1",
+    DISPATCH_FLOAT_AND_HALF(tensor_lists[1][0].scalar_type(), 1, "lamb_stage_1",
+      using functor_t = LAMBStage1Functor<scalar_t_0, scalar_t_1>;
+      multi_tensor_apply<5>(
+        BLOCK_SIZE,
+        chunk_size,
+        noop_flag,
+        tensor_lists,
+        functor_t(),
+        per_tensor_decay.data<float>(),
+        beta1,
+        beta2,
+        beta1_correction,
+        beta2_correction,
+        epsilon,
+        clipped_global_grad_norm); ))
+
+  // Surface any launch error left pending by the kernels above.
+  AT_CUDA_CHECK(cudaGetLastError());
+
+  // AT_CUDA_CHECK(cudaDeviceSynchronize());
+}
diff --git a/apex/csrc/multi_tensor_lamb_stage_2.cu b/apex/csrc/multi_tensor_lamb_stage_2.cu
new file mode 100644
index 0000000000000000000000000000000000000000..29067528dbd21b5d3d95a0f8c29348829fa22f4a
--- /dev/null
+++ b/apex/csrc/multi_tensor_lamb_stage_2.cu
@@ -0,0 +1,108 @@
+#include <ATen/ATen.h>
+#include <ATen/AccumulateType.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/Exceptions.h>
+// Another possibility:
+// #include <torch/all.h>
+
+#include <assert.h>
+
+#include "type_shim.h"
+#include "multi_tensor_apply.cuh"
+
+#define BLOCK_SIZE 512
+#define ILP 4
+
+// Step 2 reads in 'update' value and per-tensor param_norm and update_norm.
+// It computes new parameter value.
+// Functor run by multi_tensor_apply for LAMB stage 2.
+// For the chunk assigned to this block it applies, element by element,
+//     p = p - ratio * update
+// where ratio = learning_rate * (param_norm / update_norm) when both per-tensor
+// norms are non-zero, and plain learning_rate otherwise.
+//   chunk_size              chunk length used to offset into each tensor
+//   noop_gmem               unused here (the early-exit is deliberately disabled)
+//   tl                      addresses[0] = parameters (written), addresses[1] = updates (read)
+//   per_tensor_param_norm   indexed by the global tensor number
+//   per_tensor_update_norm  indexed by the global tensor number
+//   learning_rate           scalar step size
+template<typename T>
+struct LAMBStage2Functor
+{
+   __device__ __forceinline__ void operator()(
+    int chunk_size,
+    volatile int* noop_gmem,
+    TensorListMetadata<2>& tl,
+    const float* per_tensor_param_norm,
+    const float* per_tensor_update_norm,
+    const float learning_rate)
+  {
+    // I'd like this kernel to propagate infs/nans.
+    // if(*noop_gmem == 1)
+    //   return;
+
+    int tensor_loc = tl.block_to_tensor[blockIdx.x];
+    // Norm arrays cover every tensor, so index them with the global tensor number.
+    int tensor_num = tl.start_tensor_this_launch + tensor_loc;
+    int chunk_idx = tl.block_to_chunk[blockIdx.x];
+    int n = tl.sizes[tensor_loc];
+
+    float param_norm = per_tensor_param_norm[tensor_num];
+    float update_norm = per_tensor_update_norm[tensor_num];
+    // Fall back to the raw learning rate when either norm is zero (avoids 0/0 and x/0).
+    T ratio = (update_norm != 0.0f && param_norm != 0.0f) ? learning_rate * (param_norm / update_norm) : learning_rate;
+
+    T* p = (T*)tl.addresses[0][tensor_loc];
+    p += chunk_idx*chunk_size;
+
+    T* update = (T*)tl.addresses[1][tensor_loc];
+    update += chunk_idx*chunk_size;
+
+    // Elements remaining from the start of this chunk to the end of the tensor.
+    n -= chunk_idx*chunk_size;
+
+    for(int i_start = 0;
+            i_start < n && i_start < chunk_size;
+            i_start += blockDim.x*ILP)
+    {
+      T r_p[ILP];
+      T r_update[ILP];
+#pragma unroll
+      for(int ii = 0; ii < ILP; ii++)
+      {
+        // Zero-fill first so that out-of-range slots hold a defined value when
+        // the arithmetic loop below touches them (their results are never stored).
+        r_p[ii] = static_cast<T>(0.0f);
+        r_update[ii] = static_cast<T>(0.0f);
+        int i = i_start + threadIdx.x + ii*blockDim.x;
+        if(i < n && i < chunk_size)
+        {
+          r_p[ii] = p[i];
+          r_update[ii] = update[i];
+        }
+      }
+#pragma unroll
+      for(int ii = 0; ii < ILP; ii++)
+      {
+        r_p[ii] = r_p[ii] - (ratio*r_update[ii]);
+      }
+#pragma unroll
+      for(int ii = 0; ii < ILP; ii++)
+      {
+        int i = i_start + threadIdx.x + ii*blockDim.x;
+        if(i < n && i < chunk_size)
+        {
+          p[i] = r_p[ii];
+        }
+      }
+    }
+  }
+};
+
+// Host-side launcher for LAMB stage 2.
+// Dispatches on the dtype of tensor_lists[0][0] (float or half) and runs
+// LAMBStage2Functor over the two tensor lists through multi_tensor_apply<2>.
+//   chunk_size              chunk length forwarded to the functor
+//   noop_flag               forwarded to multi_tensor_apply
+//   tensor_lists            two lists of tensors handed to the functor
+//   per_tensor_param_norm   read through data<float>(), so it must hold floats
+//   per_tensor_update_norm  read through data<float>(), so it must hold floats
+//   learning_rate           scalar forwarded to the functor
+void multi_tensor_lamb_stage2_cuda(
+  int chunk_size,
+  at::Tensor noop_flag,
+  std::vector<std::vector<at::Tensor>> tensor_lists,
+  at::Tensor per_tensor_param_norm,
+  at::Tensor per_tensor_update_norm,
+  const float learning_rate)
+{
+  using namespace at;
+
+  // Only the dtype of the first list is dispatched on; the functor casts both
+  // lists to that same element type.
+  DISPATCH_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), 0, "lamb_stage_2",
+      multi_tensor_apply<2>(
+        BLOCK_SIZE,
+        chunk_size,
+        noop_flag,
+        tensor_lists,
+        LAMBStage2Functor<scalar_t_0>(),
+        per_tensor_param_norm.data<float>(),
+        per_tensor_update_norm.data<float>(),
+        learning_rate); )
+
+  // Surface any error left by the launch above.
+  AT_CUDA_CHECK(cudaGetLastError());
+
+  // AT_CUDA_CHECK(cudaDeviceSynchronize());
+}
diff --git a/apex/csrc/multi_tensor_scale_kernel.cu b/apex/csrc/multi_tensor_scale_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..3425042aa8b5abde378b873f19f57611e1917c58
--- /dev/null
+++ b/apex/csrc/multi_tensor_scale_kernel.cu
@@ -0,0 +1,101 @@
+#include <ATen/ATen.h>
+#include <ATen/AccumulateType.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/Exceptions.h>
+// Another possibility:
+// #include <torch/all.h>
+
+#include <assert.h>
+// Stringstream is a big hammer, but I want to rely on operator<< for dtype.
+#include <sstream>
+
+#include "type_shim.h"
+#include "multi_tensor_apply.cuh"
+
+#define BLOCK_SIZE 512
+#define ILP 4
+
+// Functor run by multi_tensor_apply: for the chunk assigned to this block,
+// writes out[i] = in[i] * scale. Values are loaded as float, scaled in float
+// and cast to out_t on store. If any loaded value is not finite, *noop_gmem is
+// set to 1 (the scaled value is still written).
+//   chunk_size  chunk length used to offset into each tensor
+//   noop_gmem   flag written when a non-finite input is seen
+//   tl          addresses[0] = inputs (in_t), addresses[1] = outputs (out_t)
+//   scale       multiplier applied to every element
+template<typename in_t, typename out_t>
+struct ScaleFunctor
+{
+   __device__ __forceinline__ void operator()(
+    int chunk_size,
+    volatile int* noop_gmem,
+    TensorListMetadata<2>& tl,
+    float scale)
+  {
+    // I'd like this kernel to propagate infs/nans.
+    // if(*noop_gmem == 1)
+    //   return;
+
+    int tensor_loc = tl.block_to_tensor[blockIdx.x];
+    int chunk_idx = tl.block_to_chunk[blockIdx.x];
+    int n = tl.sizes[tensor_loc];
+
+    in_t* in = (in_t*)tl.addresses[0][tensor_loc];
+    in += chunk_idx*chunk_size;
+   
+    out_t* out = (out_t*)tl.addresses[1][tensor_loc];
+    out += chunk_idx*chunk_size;
+
+    // Elements remaining from the start of this chunk to the end of the tensor.
+    n -= chunk_idx*chunk_size;
+
+    // Non-divergent exit condition for __syncthreads, not necessary here
+    float incoming_vals[ILP];
+    for(int i_start = 0;
+        i_start < n && i_start < chunk_size;
+        i_start += blockDim.x*ILP)
+    {
+      #pragma unroll
+      for(int ii = 0; ii < ILP; ii++)
+      {
+        // Out-of-range slots stay at 0 so they are always defined.
+        incoming_vals[ii] = 0;
+        int i = i_start + threadIdx.x + ii*blockDim.x;
+        if(i < n && i < chunk_size)
+          incoming_vals[ii] = static_cast<float>(in[i]);
+      }
+
+      // note for clarification to future michael:
+      // From a pure memory dependency perspective, there's likely no point unrolling
+      // the write loop, since writes just fire off once their LDGs arrive.
+      // Put another way, the STGs are dependent on the LDGs, but not on each other.
+      // There is still compute ILP benefit from unrolling the loop though.
+      #pragma unroll
+      for(int ii = 0; ii < ILP; ii++)
+      {
+        int i = i_start + threadIdx.x + ii*blockDim.x;
+        if(i < n && i < chunk_size)
+        {
+          out[i] = static_cast<out_t>(incoming_vals[ii]*scale);
+          // The check is on the unscaled input value, not on the product.
+          if(!isfinite(incoming_vals[ii]))
+            *noop_gmem = 1; // Blindly fire off a write.  These will race but that's ok.
+        }
+      }
+    }
+  }
+};
+
+// Host-side launcher for the multi-tensor scale operation.
+// Dispatches on the dtypes of tensor_lists[0][0] (input) and tensor_lists[1][0]
+// (output), each float or half, and runs ScaleFunctor through multi_tensor_apply<2>.
+//   chunk_size    chunk length forwarded to the functor
+//   noop_flag     forwarded to multi_tensor_apply; the functor sets it on non-finite input
+//   tensor_lists  [0] = inputs, [1] = outputs
+//   scale         multiplier applied to every element
+void multi_tensor_scale_cuda(
+  int chunk_size,
+  at::Tensor noop_flag,
+  std::vector<std::vector<at::Tensor>> tensor_lists,
+  float scale)
+{
+  using namespace at;
+  // The output (downscaled) type is always float.
+  // NOTE(review): the nested dispatch below also instantiates half outputs, so
+  // the remark above looks stale -- confirm against callers.
+  // If build times suffer, think about where to put this dispatch,
+  // and what logic should be moved out of multi_tensor_apply.
+
+  DISPATCH_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), 0, "multi_tensor_scale_cuda",
+    DISPATCH_FLOAT_AND_HALF(tensor_lists[1][0].scalar_type(), 1, "multi_tensor_scale_cuda",
+      multi_tensor_apply<2>(
+        BLOCK_SIZE,
+        chunk_size,
+        noop_flag,
+        tensor_lists,
+        ScaleFunctor<scalar_t_0, scalar_t_1>(),
+        scale); ))
+  // Surface any error left by the launch above.
+  AT_CUDA_CHECK(cudaGetLastError());
+
+  // AT_CUDA_CHECK(cudaDeviceSynchronize());
+}
diff --git a/apex/csrc/syncbn.cpp b/apex/csrc/syncbn.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2aae4020c6836551f875ab947df3e82b7d3e0ff5
--- /dev/null
+++ b/apex/csrc/syncbn.cpp
@@ -0,0 +1,95 @@
+#include <torch/extension.h>
+#include <ATen/ATen.h>
+
+#include <vector>
+
+// returns {mean,biased_var}
+// implemented using welford 
+std::vector<at::Tensor> welford_mean_var_CUDA(const at::Tensor input);
+
+// reduces array of mean/var across processes
+// returns global {mean,inv_std,biased_var}
+// implemented using welford 
+std::vector<at::Tensor> welford_parallel_CUDA(const at::Tensor mean_feature_nodes,
+                                              const at::Tensor var_biased_feature_nodes,
+                                              int numel,
+                                              const float eps);
+
+// elementwise BN operation, returns output
+// input/weight/shift should have identical data type;
+// mean/inv_std have promoted data type (dtype==fp16?fp32:dtype)
+at::Tensor batchnorm_forward_CUDA(const at::Tensor input,
+                                  const at::Tensor mean,
+                                  const at::Tensor inv_std,
+                                  const at::optional<at::Tensor> weight,
+                                  const at::optional<at::Tensor> shift);
+
+// backward BN operation, returns {mean_dy, mean_dy_xmu, grad_weight, grad_bias}
+// grad_output/input should have identical data type;
+// mean/inv_std have promoted data type (dtype==fp16?fp32:dtype)
+// implemented using kahan summation
+std::vector<at::Tensor> reduce_bn_CUDA(const at::Tensor grad_output,
+                                           const at::Tensor input,
+                                           const at::Tensor mean,
+                                           const at::Tensor inv_std,
+                                           const at::optional<at::Tensor> weight);
+
+// elementwise backward BN operation, returns grad_input
+// grad_output/input/weight precision could be fp16/fp32;
+// mean/inv_std/mean_dy/mean_dy_xmu precision is fp32
+at::Tensor batchnorm_backward_CUDA(const at::Tensor grad_output,
+                                   const at::Tensor input,
+                                   const at::Tensor mean,
+                                   const at::Tensor inv_std,
+                                   const at::optional<at::Tensor> weight,
+                                   const at::Tensor mean_dy,
+                                   const at::Tensor mean_dy_xmu);
+
+// returns {mean, biased_var}
+// implemented using welford 
+// expect data to be in n+c format (channel last) and applies CUDNN_BATCHNORM_SPATIAL
+std::vector<at::Tensor> welford_mean_var_c_last_CUDA(const at::Tensor input);
+
+// elementwise BN operation, returns output
+// input/weight/shift should have identical data type;
+// mean/inv_std have promoted data type (dtype==fp16?fp32:dtype)
+// expect data to be in n+c format (channel last) and applies CUDNN_BATCHNORM_SPATIAL
+at::Tensor batchnorm_forward_c_last_CUDA(const at::Tensor input,
+                                         const at::Tensor mean,
+                                         const at::Tensor inv_std,
+                                         const at::optional<at::Tensor> weight,
+                                         const at::optional<at::Tensor> shift);
+
+// backward BN operation, returns {mean_dy, mean_dy_xmu, grad_weight, grad_bias}
+// grad_output/input should have identical data type;
+// mean/inv_std have promoted data type (dtype==fp16?fp32:dtype)
+// expect data to be in n+c format (channel last) and applies CUDNN_BATCHNORM_SPATIAL
+std::vector<at::Tensor> reduce_bn_c_last_CUDA(const at::Tensor grad_output,
+                                              const at::Tensor input,
+                                              const at::Tensor mean,
+                                              const at::Tensor inv_std,
+                                              const at::optional<at::Tensor> weight);
+
+// elementwise backward BN operation, returns grad_input
+// grad_output/input/weight precision could be fp16/fp32;
+// mean/inv_std/mean_dy/mean_dy_xmu precision is fp32
+// expect data to be in n+c format (channel last) and applies CUDNN_BATCHNORM_SPATIAL
+at::Tensor batchnorm_backward_c_last_CUDA(const at::Tensor grad_output,
+                                          const at::Tensor input,
+                                          const at::Tensor mean,
+                                          const at::Tensor inv_std,
+                                          const at::optional<at::Tensor> weight,
+                                          const at::Tensor mean_dy,
+                                          const at::Tensor mean_dy_xmu);
+
+// Python bindings: each entry exposes one of the *_CUDA functions declared
+// above under a shorter Python-visible name, with a one-line docstring.
+// The "_c_last" entries bind the variants declared above for channel-last data.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("welford_mean_var", &welford_mean_var_CUDA, "welford mean variance");
+  m.def("welford_parallel", &welford_parallel_CUDA, "welford parallel reduce mean variance");
+  m.def("batchnorm_forward", &batchnorm_forward_CUDA, "batchnorm forward");
+  m.def("reduce_bn", &reduce_bn_CUDA, "batchnorm backward reduce grad sum and bias/weight grad");
+  m.def("batchnorm_backward", &batchnorm_backward_CUDA, "batchnorm backward dgrad");
+  m.def("welford_mean_var_c_last", &welford_mean_var_c_last_CUDA, "welford mean variance nhwc");
+  m.def("batchnorm_forward_c_last", &batchnorm_forward_c_last_CUDA, "batchnorm forward nhwc");
+  m.def("reduce_bn_c_last", &reduce_bn_c_last_CUDA, "batchnorm backwards reduce grad sum and bias/weight grad nhwc");
+  m.def("batchnorm_backward_c_last", &batchnorm_backward_c_last_CUDA, "batchnorm backward dgrad nhwc");
+}
diff --git a/apex/csrc/type_shim.h b/apex/csrc/type_shim.h
new file mode 100644
index 0000000000000000000000000000000000000000..0f07ac0c45ab3dfdb667d5cd319c61be46b67cea
--- /dev/null
+++ b/apex/csrc/type_shim.h
@@ -0,0 +1,130 @@
+#include <ATen/ATen.h>
+
+// Forward/backward compatibility hack around
+// https://github.com/pytorch/pytorch/commit/3aeb78079bcd68282fe9117088e138b77318e288
+// pending more future-proof guidance from upstream.
+// struct TypeShim
+// {
+//   const at::Type& payload;
+//   TypeShim(const at::Type& type) : payload(type) {}
+//   // Enable trivial conversion to a const at::Type& for pre-3aeb78
+//   operator const at::Type&(){ return payload; };
+//   // Enable dispatch switch statements to take *this directly for  post-3aeb78
+//   //operator at::ScalarType(){ return payload.; };
+// };
+
+// Runs the statement(s) in __VA_ARGS__ with the type alias scalar_t_<LEVEL>
+// bound to float or at::Half, selected by the at::ScalarType value TYPE.
+// Any other TYPE raises AT_ERROR with the stringized NAME and the type name.
+// LEVEL is pasted into the alias name so that dispatches can be nested
+// (scalar_t_0, scalar_t_1, ...). TYPE is expanded twice (switch and error
+// message), so it should be free of side effects.
+#define DISPATCH_FLOAT_AND_HALF(TYPE, LEVEL, NAME, ...) \
+  switch(TYPE) \
+  { \
+    case at::ScalarType::Float: \
+    { \
+      using scalar_t_##LEVEL = float; \
+      __VA_ARGS__; \
+      break; \
+    } \
+    case at::ScalarType::Half: \
+    { \
+      using scalar_t_##LEVEL = at::Half; \
+      __VA_ARGS__; \
+      break; \
+    } \
+    default: \
+      AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'");  \
+  }
+
+
+// Runs the statement(s) in __VA_ARGS__ with the type alias scalar_t_<LEVEL>
+// bound to double, float or at::Half, selected by the at::ScalarType value TYPE.
+// Any other TYPE raises AT_ERROR with the stringized NAME and the type name.
+// TYPE is expanded twice (switch and error message), so it should be free of
+// side effects.
+#define DISPATCH_DOUBLE_FLOAT_AND_HALF(TYPE, LEVEL, NAME, ...) \
+  switch(TYPE) \
+  { \
+    case at::ScalarType::Double: \
+    { \
+      using scalar_t_##LEVEL = double; \
+      __VA_ARGS__; \
+      break; \
+    } \
+    case at::ScalarType::Float: \
+    { \
+      using scalar_t_##LEVEL = float; \
+      __VA_ARGS__; \
+      break; \
+    } \
+    case at::ScalarType::Half: \
+    { \
+      using scalar_t_##LEVEL = at::Half; \
+      __VA_ARGS__; \
+      break; \
+    } \
+    default: \
+      AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'");  \
+  }
+
+
+  // Runs the statement(s) in __VA_ARGS__ with the type alias scalar_t_<LEVEL>
+  // bound to double or float, selected by the at::ScalarType value TYPE.
+  // Any other TYPE (including Half) raises AT_ERROR with the stringized NAME
+  // and the type name. TYPE is expanded twice, so it should be free of side effects.
+  #define DISPATCH_DOUBLE_AND_FLOAT(TYPE, LEVEL, NAME, ...) \
+  switch(TYPE) \
+  { \
+    case at::ScalarType::Double: \
+    { \
+      using scalar_t_##LEVEL = double; \
+      __VA_ARGS__; \
+      break; \
+    } \
+    case at::ScalarType::Float: \
+    { \
+      using scalar_t_##LEVEL = float; \
+      __VA_ARGS__; \
+      break; \
+    } \
+    default: \
+      AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'");  \
+  }
+
+
+// Sums `val` across the threads of a block, leaving `lanes` partial sums.
+//   x             scratch buffer indexed by the flattened thread id; it is
+//                 written at x[tid] for every thread when blockSize >= 64
+//                 (presumably shared memory -- confirm at call sites)
+//   val           this thread's contribution
+//   lanes         number of partial results to keep (1 = a single total)
+//   share_result  when true, the first `lanes` threads store their result in
+//                 x[tid] and the block synchronizes so all threads can read it
+// Returns the reduced value in threads with tid < lanes. `final` is only
+// assigned for tid < 32, so other threads return an unspecified value.
+// Every thread of the block must call this function (it uses __syncthreads).
+template<typename T>
+__device__ __forceinline__ T reduce_block_into_lanes
+  (T *x,
+   T val,
+   int lanes=1,
+   bool share_result=false) // lanes is intended to be <= 32.
+{
+  int tid = threadIdx.x + threadIdx.y*blockDim.x;
+  int blockSize = blockDim.x*blockDim.y; // blockSize is intended to be a multiple of 32.
+
+  // Blocks of a single warp skip the scratch buffer and go straight to shuffles.
+  if(blockSize >= 64)
+  {
+    x[tid] = val;
+    __syncthreads();
+  }
+
+  // Tree reduction in the scratch buffer down to 64 live entries.
+  #pragma unroll
+  for(int i = (blockSize >> 1); i >= 64; i >>= 1)
+  {
+    if(tid < i)
+      x[tid] = x[tid] + x[tid+i];
+    __syncthreads();
+  }
+
+  T final;
+
+  if(tid < 32)
+  {
+    // Fold the remaining 64 entries into 32 registers, one per lane.
+    if(blockSize >= 64)
+      final = x[tid] + x[tid+32];
+    else
+      final = val;
+    // __SYNCWARP();
+
+    // Shuffle reduction; stopping at `lanes` leaves that many partial sums.
+    #pragma unroll
+    for(int i = 16; i >= lanes; i >>= 1)
+      final = final + __shfl_down_sync(0xffffffff, final, i);
+  }
+
+  if(share_result)
+  {
+    if(tid < lanes)
+      x[tid] = final; // EpilogueOp
+    // Make sure the smem result is visible to all warps.
+    __syncthreads();
+  }
+
+  return final;
+}
diff --git a/apex/csrc/welford.cu b/apex/csrc/welford.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ad2fc961d3de3b15fc823c8e86e258453746af55
--- /dev/null
+++ b/apex/csrc/welford.cu
@@ -0,0 +1,1352 @@
+#include <iostream>
+#include <ATen/ATen.h>
+#include <ATen/AccumulateType.h>
+#include <ATen/cuda/CUDAContext.h>
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <vector>
+
+#include "type_shim.h"
+
+
+// Returns the largest power of two strictly below n when n is itself a power
+// of two, otherwise the largest power of two not exceeding n
+// (e.g. 8 -> 4, 9 -> 8, 1 -> 0).
+__device__ __forceinline__ int lastpow2(int n)
+{
+  // Value of the highest set bit of n.
+  const int top_bit = 1 << (31 - __clz(n));
+  return (top_bit == n) ? (top_bit >> 1) : top_bit;
+}
+
+// Host helper: rounds n up to a power of two (n itself when it already is one).
+// n == 0 wraps around and yields 0.
+__host__ __forceinline__ int h_next_pow2(unsigned int n) {
+    // Smear the highest set bit of n-1 into every lower bit, then add one.
+    unsigned int filled = n - 1;
+    for (int shift = 1; shift <= 16; shift <<= 1) {
+        filled |= (filled >> shift);
+    }
+    return filled + 1;
+}
+
+// Host helper: rounds n down to a power of two, i.e. returns the value of its
+// highest set bit. n == 0 yields 0.
+__host__ __forceinline__ int h_last_pow2(unsigned int n) {
+    // Smear the highest set bit into every lower bit ...
+    unsigned int filled = n;
+    for (int shift = 1; shift <= 16; shift <<= 1) {
+        filled |= (filled >> shift);
+    }
+    // ... then drop everything except that top bit.
+    return filled - (filled >> 1);
+}
+
+
+#define WARP_SIZE 32
+
+// Sums `val` over the 32 lanes of a warp with shuffle-down steps.
+// Lane 0 ends up with the full sum; the other lanes hold partial sums.
+// All 32 lanes must participate (full 0xffffffff mask).
+template<typename T>
+__device__ __forceinline__ T warp_reduce_sum(T val)
+{
+  #pragma unroll
+  for(int lane_delta = WARP_SIZE/2; lane_delta > 0; lane_delta >>= 1)
+  {
+    const T from_upper_lane = __shfl_down_sync(0xffffffff, val, lane_delta);
+    val = val + from_upper_lane;
+  }
+  return val;
+}
+
+// Sums `val` across all threads of a block.
+//   x    scratch buffer with at least blockSize / WARP_SIZE entries
+//        (one slot per warp)
+//   val  this thread's contribution
+// The complete sum is only meaningful in thread 0 (the callers in this file
+// read it from thread_id == 0). Every thread of the block must call this
+// function because it contains a __syncthreads().
+template<typename T>
+__device__ __forceinline__ T reduce_block(T *x, T val)
+{
+  int tid = threadIdx.y*blockDim.x + threadIdx.x;
+  int blockSize = blockDim.x * blockDim.y;
+
+  if (blockSize > 32) {
+    // Stage 1: each warp reduces on its own and lane 0 publishes the result.
+    val = warp_reduce_sum(val);
+    if (tid % WARP_SIZE == 0)
+      x[tid/WARP_SIZE] = val;
+
+    __syncthreads();
+
+    // Stage 2 input: the first blockSize/WARP_SIZE threads pick up one
+    // per-warp partial each; everyone else contributes zero.
+    val = (tid < blockSize / WARP_SIZE? x[tid%WARP_SIZE] : T(0));
+  }
+
+  // Final reduction is done by the first warp only.
+  if(tid/WARP_SIZE==0) val = warp_reduce_sum(val);
+
+  return val;
+}
+
+#define ELEMENTS_PER_ITER 4 // enables concurrency within each thread to hide latency
+#define ELEMENTS_PER_THREAD 16
+#define OPTIMAL_TILE_W 32
+#define MAX_H_BLOCK 128
+#define MAX_BLOCK_SIZE 512
+
+// Ceiling division of x by y, with the quotient then rounded DOWN to a power
+// of two via h_last_pow2. Despite the name this is not a plain round-up divide.
+__host__ int div_ru(int x, int y) {
+  const int ceil_quotient = 1 + (x-1)/y;
+  return h_last_pow2(ceil_quotient);
+}
+
+// Chooses a 2-D launch configuration for a reduction over `reduction` rows of
+// a tensor whose fastest dimension has `stride` elements.
+//   reduction  length of the reduced dimension (mapped to y)
+//   stride     length of the non-reduced dimension (mapped to x)
+//   block      out: threads per block, at most MAX_BLOCK_SIZE in total
+//   grid       out: blocks per grid, grid.y capped at MAX_H_BLOCK
+//   coop_flag  when set, grid.y values below 8 collapse to 1
+__host__ void flexible_launch_configs(
+      const int reduction,
+      const int stride,
+      dim3 &block,
+      dim3 &grid,
+      const bool coop_flag = false) {
+  const int stride_pow2 = h_last_pow2(stride);
+
+  // Start from a tile of at most OPTIMAL_TILE_W threads along x and give the
+  // rest of the block budget to y.
+  int threads_x = std::min(stride_pow2, OPTIMAL_TILE_W);
+  const int threads_y = std::min(h_last_pow2(div_ru(reduction , ELEMENTS_PER_THREAD)),
+                                 MAX_BLOCK_SIZE / threads_x);
+  // If y could not use its share, let x grow into the unused budget.
+  if (threads_x * threads_y != MAX_BLOCK_SIZE) {
+    threads_x = std::min(stride_pow2, MAX_BLOCK_SIZE / threads_y);
+  }
+
+  const int blocks_x = div_ru(stride, threads_x);
+  int blocks_y = std::min(div_ru(reduction, threads_y * ELEMENTS_PER_THREAD), MAX_H_BLOCK);
+  // it's not worth having a grid reduction if the reduction dimension is not big enough
+  if (coop_flag && blocks_y < 8) {
+    blocks_y = 1;
+  }
+
+  block.x = threads_x;
+  block.y = threads_y;
+  block.z = 1;
+
+  grid.x = blocks_x;
+  grid.y = blocks_y;
+  grid.z = 1;
+}
+
+// Merges a second set of running statistics (num_new, mean_new, m2n_new) into
+// (count, mean, m2n) in place.
+//   count / num_new   number of samples behind each set
+//   mean / mean_new   running means
+//   m2n / m2n_new     running sums of squared deviations from the mean
+// After the call, count, mean and m2n describe the union of both sets.
+template<typename T, typename C>
+__device__ __forceinline__ void welford_merge_element(C& count,
+                                                      T& mean,
+                                                      T& m2n,
+                                                      const C& num_new,
+                                                      const T& mean_new,
+                                                      const T& m2n_new) {
+      // max(1, ...) keeps the factor finite when both sets are empty.
+      T factor = T(1.0) / max(1, (count + num_new));
+      T delta0 = mean - mean_new;
+      // Count-weighted average of the two means.
+      mean = (mean_new * num_new + mean * count) * factor;
+      // Combined M2 = M2_a + M2_b + delta^2 * n_a * n_b / (n_a + n_b).
+      m2n += m2n_new + delta0 * delta0 * num_new * count * factor;
+      count += num_new;
+}
+
+// Merges the (mean, m2n, num) statistics of the 32 lanes of a warp in place
+// using shuffle-down steps and welford_merge_element.
+// Lane 0 ends up with the statistics of the whole warp; other lanes hold
+// partial merges. All 32 lanes must participate (full 0xffffffff mask).
+template<typename T>
+__device__ __forceinline__ void warp_reduce_mean_m2n(T &mean, T &m2n, int &num)
+{
+  #pragma unroll
+  for(int i = WARP_SIZE/2; i > 0; i >>= 1) {
+    // Fetch the triple held by the lane i positions above this one.
+    auto num_new = __shfl_down_sync(0xffffffff, num, i);
+    auto mean_new = __shfl_down_sync(0xffffffff, mean, i);
+    auto m2n_new = __shfl_down_sync(0xffffffff, m2n, i);
+    welford_merge_element(num, mean, m2n, num_new, mean_new, m2n_new);
+  }
+}
+
+// Merges the (mean, m2n, num) statistics of every thread in a block.
+//   x           scratch for per-warp results, two entries per warp
+//               (mean at x[2*w], m2n at x[2*w+1])
+//   count       scratch for per-warp sample counts, one entry per warp
+//   mean, m2n, num   in: this thread's statistics; out: merged result,
+//               meaningful in thread 0 only
+//   block_size  total number of threads in the block
+//   thread_id   flattened thread index within the block
+// Every thread of the block must call this when block_size > 32 because of
+// the __syncthreads() inside.
+template <typename T>
+__device__ void welford_reduce_mean_m2n(
+      T* __restrict__ x,
+      int* __restrict__ count,
+      T &mean,
+      T &m2n,
+      int &num,
+      int block_size,
+      int thread_id)
+{
+  int lane = thread_id % WARP_SIZE;
+  int wid = thread_id / WARP_SIZE;
+
+  if (block_size > 32) {
+    // Stage 1: reduce inside each warp and let lane 0 publish the result.
+    warp_reduce_mean_m2n(mean, m2n, num);
+    if (lane == 0) {
+      x[wid*2] = mean;
+      x[wid*2+1] = m2n;
+      count[wid] = num;
+    }
+    __syncthreads();
+
+    // Stage 2 input: lanes of warp 0 pick up one per-warp result each; lanes
+    // beyond the number of warps contribute an empty set (all zeros).
+    if (wid == 0) {
+      mean = (thread_id < block_size / WARP_SIZE)? x[lane*2] : T(0);
+      m2n = (thread_id < block_size / WARP_SIZE)? x[lane*2+1] : T(0);
+      num = (thread_id < block_size / WARP_SIZE)? count[lane] : int(0);
+    }
+  }
+
+  // Final merge across the lanes of warp 0.
+  if (wid==0) warp_reduce_mean_m2n(mean, m2n, num);
+
+  return;
+}
+
+// Returns the spatial size of an NC+ tensor: the product of all dimensions
+// after the first two (batch and channel).
+// A tensor with no spatial dimensions, e.g. a 2-D (N, C) input, has spatial
+// size 1 instead of raising an out-of-range dimension error.
+__host__ int get_tensor_spatial_size(const at::Tensor& input)
+{
+  // Start from the empty product so inputs with fewer than three dimensions
+  // are handled; for 3-D and larger inputs the result is unchanged.
+  int64_t space_size = 1;
+  for (int64_t i = 2; i < input.ndimension(); i++) {
+    space_size *= input.size(i);
+  }
+  // The interface returns int; callers pass the value on as an int kernel argument.
+  return static_cast<int>(space_size);
+}
+
+// Scalar type to accumulate in for `input`: Half is widened to Float, every
+// other scalar type is returned unchanged.
+__host__ at::ScalarType promote_scalartype(const at::Tensor& input)
+{
+  const at::ScalarType input_type = input.scalar_type();
+  if (input_type == at::ScalarType::Half) {
+    return at::ScalarType::Float;
+  }
+  return input_type;
+}
+
+// Size in bytes of one element of `input`. With accumulation == true the size
+// of the promoted accumulation type is returned instead (see promote_scalartype).
+__host__ size_t get_element_data_size(const at::Tensor& input, bool accumulation = false)
+{
+  if (accumulation) {
+    return at::elementSize(promote_scalartype(input));
+  }
+  return at::elementSize(input.scalar_type());
+}
+
+// Merges running statistics (count, mean, m2n) along threadIdx.y, i.e. across
+// the rows of a 2-D thread block, keeping each threadIdx.x column separate.
+//   count, mean, m2n   in: this thread's statistics; out: threads with
+//                      threadIdx.y == 0 hold the merged result of their column
+//   shmem_count / shmem_mean / shmem_m2n
+//                      scratch arrays with one slot per thread of the block
+// Every thread of the block must call this (it uses __syncthreads()).
+// NOTE(review): the halving loop covers every row only when blockDim.y is a
+// power of two -- confirm the launch configuration guarantees that.
+template<typename T, typename C>
+__device__ __forceinline__ void welford_merge_block_vertical(C& count,
+                                                             T& mean,
+                                                             T& m2n,
+                                                             C* shmem_count,
+                                                             T* shmem_mean,
+                                                             T* shmem_m2n) {
+  // write to shared memory
+  auto address_base = threadIdx.x + threadIdx.y * blockDim.x;
+  shmem_mean[address_base] = mean;
+  shmem_m2n[address_base] = m2n;
+  shmem_count[address_base] = count;
+
+#pragma unroll
+  for (int offset = blockDim.y/2; offset > 0; offset >>= 1) {
+    // Make the previous round's writes visible before they are read.
+    __syncthreads();
+    if (threadIdx.y < offset && threadIdx.y + offset < blockDim.y) {
+      // Partner row sits `offset` rows below in the same column.
+      auto address = address_base + offset * blockDim.x;
+      // read shared memory back to register for reduction
+      auto num_new = shmem_count[address];
+      auto mean_new = shmem_mean[address];
+      auto m2n_new = shmem_m2n[address];
+
+      welford_merge_element(count, mean, m2n, num_new, mean_new, m2n_new);
+
+      // last write is not necessary
+      shmem_mean[address_base] = mean;
+      shmem_m2n[address_base] = m2n;
+      shmem_count[address_base] = count;
+    }
+  }
+}
+
+// Sums the pair (sum_dy, sum_dy_xmu) along threadIdx.y, i.e. across the rows
+// of a 2-D thread block, keeping each threadIdx.x column separate.
+//   sum_dy, sum_dy_xmu   in: this thread's partial sums; out: threads with
+//                        threadIdx.y == 0 hold the totals of their column
+//   shmem_sum_dy / shmem_sum_dy_xmu
+//                        scratch arrays with one slot per thread of the block
+// Every thread of the block must call this (it uses __syncthreads()).
+// NOTE(review): the halving loop covers every row only when blockDim.y is a
+// power of two -- confirm the launch configuration guarantees that.
+template<typename T>
+__device__ __forceinline__ void merge_block_vertical(T& sum_dy,
+                                                     T& sum_dy_xmu,
+                                                     T* shmem_sum_dy,
+                                                     T* shmem_sum_dy_xmu) {
+  // write to shared memory
+  auto address_base = threadIdx.x + threadIdx.y * blockDim.x;
+  shmem_sum_dy[address_base] = sum_dy;
+  shmem_sum_dy_xmu[address_base] = sum_dy_xmu;
+
+#pragma unroll
+  for (int offset = blockDim.y/2; offset > 0; offset >>= 1) {
+    // Make the previous round's writes visible before they are read.
+    __syncthreads();
+    if (threadIdx.y < offset && threadIdx.y + offset < blockDim.y) {
+      // Partner row sits `offset` rows below in the same column.
+      auto address = address_base + offset * blockDim.x;
+
+      sum_dy += shmem_sum_dy[address];
+      sum_dy_xmu += shmem_sum_dy_xmu[address];
+
+      // last write is not necessary
+      shmem_sum_dy[address_base] = sum_dy;
+      shmem_sum_dy_xmu[address_base] = sum_dy_xmu;
+    }
+  }
+}
+
+
+// welford kernel calculating mean/biased_variance/unbiased_variance
+// Computes the mean and biased variance for one feature per block.
+// blockIdx.x selects the feature; threads walk the batch along y and the
+// spatial positions along x, each keeping its own running statistics, which
+// are then merged across the block.
+//   input           read at blockIdx.x*ss + batch_id*ss*fs + offset
+//   out_mean        out: mean, written at index blockIdx.x
+//   out_var_biased  out: m2n / count, written at index blockIdx.x
+//   bs, fs, ss      loop bounds / strides; presumably batch size, feature
+//                   count and spatial size -- confirm against the launcher
+template <typename scalar_t, typename accscalar_t, typename outscalar_t>
+__global__ void welford_kernel(
+      const scalar_t* __restrict__ input,
+      outscalar_t* __restrict__ out_mean,
+      outscalar_t* __restrict__ out_var_biased,
+      const int bs,
+      const int fs,
+      const int ss) {
+  int block_size = blockDim.x * blockDim.y;
+  int count = 0;
+  accscalar_t x_mean = accscalar_t(0);
+  accscalar_t m_2_n = accscalar_t(0);
+
+  int thread_id = threadIdx.y*blockDim.x + threadIdx.x;
+
+  for (int batch_id = threadIdx.y; batch_id < bs; batch_id += blockDim.y) {
+    int input_base = blockIdx.x*ss + batch_id*ss*fs;
+    // sequential welford
+    for (int offset = threadIdx.x; offset < ss ; offset += blockDim.x) {
+      count++;
+      auto x_n = static_cast<accscalar_t>(input[offset+input_base]);
+      auto d = x_n - x_mean;
+      x_mean += d / count;
+      // Uses the already-updated mean, as the Welford update requires.
+      m_2_n += d * (x_n - x_mean);
+    }
+  }
+
+  // Scratch layout: s_mem[0..31] holds per-warp counts, the remainder is
+  // reinterpreted as accscalar_t and holds per-warp (mean, m2n) pairs.
+  static __shared__ int s_mem[160];
+  accscalar_t* s_mem_ac = (accscalar_t*) &s_mem[32];
+
+  welford_reduce_mean_m2n<accscalar_t>(s_mem_ac, s_mem, x_mean, m_2_n, count, block_size, thread_id);
+
+  // Only thread 0 holds the block-wide result.
+  if (thread_id == 0) {
+    out_mean[blockIdx.x] = static_cast<outscalar_t>(x_mean);
+    out_var_biased[blockIdx.x] = static_cast<outscalar_t>(m_2_n/count);
+  }
+}
+
+// elementwise BN kernel
+template <typename scalar_t, typename accscalar_t, typename layerscalar_t>
+__global__ void batchnorm_forward_kernel(
+      const scalar_t* __restrict__ input,
+      const accscalar_t* __restrict__ mean,
+      const accscalar_t* __restrict__ inv_std,
+      const layerscalar_t* __restrict__ weight,
+      const layerscalar_t* __restrict__ shift,
+      scalar_t* __restrict__ out,
+      const int ss,
+      const int bs) {
+  auto m_c = mean[blockIdx.x];
+  auto inv_std_c = inv_std[blockIdx.x];
+  auto w_c = weight == NULL ? accscalar_t(1.0) : static_cast<accscalar_t>(weight[blockIdx.x]);
+  auto s_c = shift == NULL ? accscalar_t(0.0) : static_cast<accscalar_t>(shift[blockIdx.x]);
+
+  for (int batch_offset = blockIdx.y*blockDim.y + threadIdx.y; batch_offset < bs; batch_offset += gridDim.y*blockDim.y) {
+    int address_base = blockIdx.x*ss + batch_offset*gridDim.x*ss;
+    for (int offset = threadIdx.x + blockIdx.z*blockDim.x; offset < ss ; offset+= gridDim.z*blockDim.x) {
+      out[address_base+offset] = static_cast<scalar_t>(w_c * (static_cast<accscalar_t>(input[address_base+offset]) - m_c ) * inv_std_c + s_c);
+    }
+  }
+}
+
+// Backward BN kernel, calculates grad_bias, grad_weight as well as intermediate
+// results to calculating grad_input.
+// Breaking the grad_input to two step to support sync BN, which requires all
+// reduce of the intermediate results across processes.
+// One block per feature (blockIdx.x). Accumulates, over the batch (y) and
+// spatial (x) loops, the two sums
+//     sum_dy     = sum(grad_output)
+//     sum_dy_xmu = sum(grad_output * (input - mean))
+// with compensated (Kahan) summation per thread, then reduces them across
+// the block. Thread 0 writes:
+//     grad_bias   = sum_dy                   (skipped when NULL)
+//     grad_weight = sum_dy_xmu * inv_std     (skipped when NULL)
+//     mean_dy     = sum_dy / (bs * ss)
+//     mean_dy_xmu = sum_dy_xmu / (bs * ss)
+// bs, fs, ss are loop bounds / strides; presumably batch size, feature count
+// and spatial size -- confirm against the launcher.
+template <typename scalar_t, typename accscalar_t, typename layerscalar_t>
+__global__ void reduce_bn_kernel(
+      const scalar_t* __restrict__ input,
+      const scalar_t* __restrict__ grad_output,
+      const accscalar_t* __restrict__ mean,
+      const accscalar_t* __restrict__ inv_std,
+      accscalar_t* __restrict__ mean_dy,
+      accscalar_t* __restrict__ mean_dy_xmu,
+      layerscalar_t* __restrict__ grad_weight,
+      layerscalar_t* __restrict__ grad_bias,
+      const int bs,
+      const int fs,
+      const int ss) {
+  // Scratch for reduce_block; reinterpreted as accscalar_t below.
+  static __shared__ int s_mem[64];
+  int total_item_num = bs * ss;
+
+  int thread_id = threadIdx.y*blockDim.x + threadIdx.x;
+
+  auto r_mean = mean[blockIdx.x];
+  auto factor = inv_std[blockIdx.x];
+
+  // Kahan sum
+  accscalar_t sum_dy = 0.0;
+  accscalar_t sum_dy_xmu = 0.0;
+  // Running compensation terms for the two sums.
+  accscalar_t sum_dy_c = 0.0;
+  accscalar_t sum_dy_xmu_c = 0.0;
+  for (int batch_id = threadIdx.y; batch_id < bs; batch_id += blockDim.y) {
+    int input_base = blockIdx.x*ss + batch_id*ss*fs;
+    for (int offset = threadIdx.x; offset < ss ; offset += blockDim.x) {
+      auto e_grad = static_cast<accscalar_t>(grad_output[offset+input_base]);
+      auto e_input = static_cast<accscalar_t>(input[offset+input_base]);
+      // calculating sum_dy
+      auto sum_dy_y = e_grad - sum_dy_c;
+      auto sum_dy_t = sum_dy + sum_dy_y;
+      sum_dy_c = (sum_dy_t - sum_dy) - sum_dy_y;
+      sum_dy = sum_dy_t;
+
+      // calculating sum_dy_xmu
+      auto sum_dy_xmu_y = e_grad * (e_input - r_mean) - sum_dy_xmu_c;
+      auto sum_dy_xmu_t = sum_dy_xmu + sum_dy_xmu_y;
+      sum_dy_xmu_c = (sum_dy_xmu_t - sum_dy_xmu) - sum_dy_xmu_y;
+      sum_dy_xmu = sum_dy_xmu_t;
+    }
+  }
+
+  sum_dy = reduce_block((accscalar_t*)s_mem, sum_dy);
+  // The scratch buffer is reused, so wait until the first reduction has
+  // finished reading it.
+  __syncthreads();
+  sum_dy_xmu = reduce_block((accscalar_t*)s_mem, sum_dy_xmu);
+
+  // Only thread 0 holds the block-wide sums.
+  if (thread_id == 0) {
+    if (grad_bias != NULL) {
+      grad_bias[blockIdx.x] = static_cast<layerscalar_t>(sum_dy);
+    }
+    if (grad_weight != NULL) {
+      grad_weight[blockIdx.x] = static_cast<layerscalar_t>(sum_dy_xmu * factor);
+    }
+    mean_dy[blockIdx.x] = sum_dy / total_item_num;
+    mean_dy_xmu[blockIdx.x] = sum_dy_xmu / total_item_num;
+  }
+}
+
+// elementwise backward BN kernel
+// Computes, for every element,
+//     grad_input = (grad_output - mean_dy
+//                   - (input - mean) * inv_std^2 * mean_dy_xmu) * weight * inv_std
+// blockIdx.x selects the feature (gridDim.x is used as the feature count in
+// the address computation); batch entries are spread over y and spatial
+// positions over x/z of the launch.
+//   mean, inv_std, mean_dy, mean_dy_xmu   per-feature values, indexed by blockIdx.x
+//   weight                                per-feature scale; NULL means 1
+//   ss, bs                                inner and outer loop bounds; presumably
+//                                         spatial size and batch size -- confirm
+template <typename scalar_t, typename accscalar_t, typename layerscalar_t>
+__global__ void batchnorm_backward_kernel(
+      const scalar_t* __restrict__ grad_output,
+      const scalar_t* __restrict__ input,
+      const accscalar_t* __restrict__ mean,
+      const accscalar_t* __restrict__ inv_std,
+      const layerscalar_t* __restrict__ weight,
+      const accscalar_t* __restrict__ mean_dy,
+      const accscalar_t* __restrict__ mean_dy_xmu,
+      scalar_t* __restrict__ grad_input,
+      const int ss,
+      const int bs) {
+  auto m_c = static_cast<accscalar_t>(mean[blockIdx.x]);
+  auto m_dy_c = static_cast<accscalar_t>(mean_dy[blockIdx.x]);
+  auto factor_1_c = inv_std[blockIdx.x];
+  // factor_2_c = weight * inv_std (computed before factor_1_c is overwritten).
+  auto factor_2_c = (weight == NULL ? accscalar_t(1.0) : static_cast<accscalar_t>(weight[blockIdx.x])) * factor_1_c;
+  // factor_1_c = inv_std^2 * mean_dy_xmu.
+  factor_1_c = factor_1_c * factor_1_c * mean_dy_xmu[blockIdx.x];
+
+  for (int batch_offset = blockIdx.y*blockDim.y+threadIdx.y; batch_offset < bs; batch_offset += gridDim.y*blockDim.y) {
+    int address_base = blockIdx.x*ss + batch_offset*gridDim.x*ss;
+    for (int offset = threadIdx.x + blockIdx.z*blockDim.x; offset < ss ; offset+= gridDim.z*blockDim.x) {
+      grad_input[address_base+offset] = (static_cast<accscalar_t>(grad_output[address_base+offset]) - m_dy_c - (static_cast<accscalar_t>(input[address_base+offset]) - m_c) * factor_1_c) * factor_2_c;
+    }
+  }
+}
+
+// welford kernel for c last tensor calculating mean/biased_variance
+//
+// Computes a per-channel mean and biased variance over the m dimension of an
+// (m, c) tensor addressed as input[m * stride + c].
+//
+// Thread mapping: the x index (block and thread) selects the channel c, the y
+// index selects rows along m; each thread walks m in steps of
+// blockDim.y * gridDim.y and keeps PARALLEL_LOADS independent accumulators.
+//
+// Parameters:
+//   input          - source values, read at m * stride + c
+//   out_mean       - per-channel mean, written at index c
+//   out_var_biased - per-channel m2 / count, written at index c
+//   staging_data   - scratch, touched only when gridDim.y > 1; laid out as
+//                    stride * gridDim.y means, then stride * gridDim.y m2n
+//                    values, then stride * gridDim.y int counts
+//   semaphores     - one counter per blockIdx.x, touched only when
+//                    gridDim.y > 1; the last-block test below relies on the
+//                    counter starting at zero, and this kernel never resets it
+//   reduction_size - extent of the m dimension
+//   stride         - extent of the c dimension
+//
+// welford_merge_element / welford_merge_block_vertical are defined elsewhere
+// in this file; presumably they merge (count, mean, m2n) partials pairwise and
+// along threadIdx.y of the block, respectively - TODO confirm against them.
+template
+   <typename scalar_t,
+    typename accscalar_t,
+    typename outscalar_t,
+    int PARALLEL_LOADS>
+__global__ void
+welford_kernel_c_last(
+      const scalar_t* __restrict__ input,
+      outscalar_t* __restrict__ out_mean,
+      outscalar_t* __restrict__ out_var_biased,
+      volatile accscalar_t* staging_data,
+      int* semaphores,
+      const int reduction_size,
+      const int stride) {
+  // hide latency with concurrency
+  accscalar_t x_mean[PARALLEL_LOADS];
+  accscalar_t m_2_n[PARALLEL_LOADS];
+  int count[PARALLEL_LOADS];
+
+#pragma unroll
+  for (int i = 0; i < PARALLEL_LOADS; i++) {
+    x_mean[i] = accscalar_t(0);
+    m_2_n[i] = accscalar_t(0);
+    // count is an int array; the accscalar_t zero is converted on assignment
+    count[i] = accscalar_t(0);
+  }
+  // tensor dimension (m,c)
+
+  // loop along m dimension
+  int inner_loop_stride = blockDim.y * gridDim.y;
+
+  // offset along m dimension
+  int m_offset = blockIdx.y * blockDim.y + threadIdx.y;
+  int c_offset = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // outer iterations needed so that PARALLEL_LOADS rows per iteration cover
+  // reduction_size rows (a ceiling division for reduction_size >= 1)
+  int loop_count = 1 + (reduction_size - 1) / (inner_loop_stride * PARALLEL_LOADS);
+  int address_base = m_offset * stride + c_offset;
+  int address_increment = inner_loop_stride * stride;
+
+  for (int i = 0; i < loop_count; i++) {
+    accscalar_t x_math[PARALLEL_LOADS];
+    accscalar_t x_count_inv[PARALLEL_LOADS];
+    accscalar_t is_valid[PARALLEL_LOADS];
+
+    // load multiple data in
+#pragma unroll
+    for (int j = 0; j < PARALLEL_LOADS; j++) {
+      if (c_offset < stride && m_offset < reduction_size) {
+        x_math[j] = input[address_base];
+        count[j]++;
+        x_count_inv[j] = accscalar_t(1) / count[j];
+        is_valid[j] = accscalar_t(1);
+      } else {
+        // out-of-range slot: the zero weights make the update below a no-op
+        x_math[j] = accscalar_t(0);
+        x_count_inv[j] = accscalar_t(0);
+        is_valid[j] = accscalar_t(0);
+      }
+      m_offset += inner_loop_stride;
+      address_base += address_increment;
+    }
+
+    // calculate mean/m2n with welford
+#pragma unroll
+    for (int j = 0; j < PARALLEL_LOADS; j++) {
+      accscalar_t delta0 = x_math[j] - x_mean[j];
+      x_mean[j] += delta0 * x_count_inv[j];
+      accscalar_t delta1 = x_math[j] - x_mean[j];
+      m_2_n[j] += delta0 * delta1 * is_valid[j];
+    }
+  }
+
+  // thread reduction to accumulate mean/m_2_n/count between PARALLEL_LOADS
+#pragma unroll
+  for (int j = 1; j < PARALLEL_LOADS; j++) {
+    welford_merge_element(count[0], x_mean[0], m_2_n[0], count[j], x_mean[j], m_2_n[j]);
+  }
+
+  // release x_mean / m_2_n
+  auto mean_th = x_mean[0];
+  auto m2_th = m_2_n[0];
+  auto count_th = count[0];
+
+  // block-wise reduction with shared memory (since reduction cannot be done within a warp)
+  static __shared__ accscalar_t shmem_mean[MAX_BLOCK_SIZE];
+  static __shared__ accscalar_t shmem_m2n[MAX_BLOCK_SIZE];
+  static __shared__ int shmem_count[MAX_BLOCK_SIZE];
+
+  welford_merge_block_vertical(count_th, mean_th, m2_th, shmem_count, shmem_mean, shmem_m2n);
+
+  // grid reduction if needed (coop launch used at the first place)
+  if (gridDim.y > 1) {
+    // carve the three staging regions out of the single scratch buffer
+    volatile accscalar_t* staging_mean = staging_data;
+    volatile accscalar_t* staging_m2n = &staging_data[stride*gridDim.y];
+    volatile int* staging_count = reinterpret_cast<volatile int*>(&staging_m2n[stride*gridDim.y]);
+
+    address_base = c_offset + blockIdx.y * stride;
+    // write data to staging_data;
+    if (threadIdx.y == 0 && c_offset < stride) {
+      staging_mean[address_base] = mean_th;
+      staging_m2n[address_base] = m2_th;
+      staging_count[address_base] = count_th;
+    }
+
+    __threadfence();
+    __syncthreads(); // ensuring writes to staging_ is visible to all blocks
+
+    __shared__ bool is_last_block_done;
+    // mark block done
+    if (threadIdx.x == 0 && threadIdx.y == 0) {
+      // the block that observes gridDim.y - 1 earlier arrivals is the last one
+      int old = atomicAdd(&semaphores[blockIdx.x], 1);
+      is_last_block_done = (old == (gridDim.y-1));
+    }
+
+    __syncthreads();
+
+    // check that all data is now available in global memory
+    if (is_last_block_done) {
+      count_th = 0;
+      mean_th = accscalar_t(0.0);
+      m2_th = accscalar_t(0.0);
+
+      // each thread folds the staged partials of every blockDim.y-th block row
+      for (int y = threadIdx.y; y < gridDim.y; y += blockDim.y) {
+        address_base = c_offset + y * stride;
+        int num_new = c_offset < stride ? staging_count[address_base] : 0;
+        accscalar_t mean_new = c_offset < stride ? staging_mean[address_base] : accscalar_t(0.0);
+        accscalar_t m2n_new = c_offset < stride ? staging_m2n[address_base] : accscalar_t(0.0);
+
+        welford_merge_element(count_th, mean_th, m2_th, num_new, mean_new, m2n_new);
+      }
+
+      welford_merge_block_vertical(count_th, mean_th, m2_th, shmem_count, shmem_mean, shmem_m2n);
+      if (threadIdx.y == 0 && c_offset < stride) {
+        out_mean[c_offset] = static_cast<outscalar_t>(mean_th);
+        out_var_biased[c_offset] = static_cast<outscalar_t>(m2_th / count_th);
+      }
+    }
+  } else {
+    // single block along y: the block-level result is already final
+    if (blockIdx.y == 0 && threadIdx.y == 0 && c_offset < stride) {
+      out_mean[c_offset] = static_cast<outscalar_t>(mean_th);
+      out_var_biased[c_offset] = static_cast<outscalar_t>(m2_th / count_th);
+    }
+  }
+}
+
+// Parallel Welford kernel: folds the per-process (mean, biased variance)
+// pairs of every feature into a single mean, an unbiased variance and an
+// inverse standard deviation.
+//
+// mean / var_biased are read at rank * feature_size + feature for
+// rank in [0, world_size); every rank contributes `numel` samples.
+// Features are distributed over threads with a grid-stride loop.
+template <typename scalar_t>
+__global__ void welford_kernel_parallel(
+      const scalar_t* __restrict__ mean,
+      const scalar_t* __restrict__ var_biased,
+      scalar_t* __restrict__ out_mean,
+      scalar_t* __restrict__ out_var,
+      scalar_t* __restrict__ inv_std,
+      const int world_size,
+      const int feature_size,
+      const float eps,
+      const int numel) {
+  int feature = blockIdx.x * blockDim.x + threadIdx.x;
+
+  while (feature < feature_size) {
+    // running merge of the statistics contributed by each rank
+    scalar_t merged_mean = 0;
+    scalar_t merged_m2n = 0;
+    int merged_count = 0;
+
+    for (int rank = 0, src = feature; rank < world_size; ++rank, src += feature_size) {
+      // biased variance times sample count restores the sum of squared deviations
+      welford_merge_element(merged_count, merged_mean, merged_m2n, numel, mean[src], var_biased[src]*numel);
+    }
+
+    out_mean[feature] = merged_mean;
+    out_var[feature] = merged_m2n/ (merged_count - 1);
+    inv_std[feature] = scalar_t(1) / sqrt(merged_m2n/merged_count + eps);
+
+    feature += gridDim.x * blockDim.x;
+  }
+}
+
+// elementwise BN kernel
+//
+// Writes out = weight * (input - mean) * inv_std + shift for an (m, c) tensor
+// addressed as m * stride + c. A NULL weight is treated as 1 and a NULL shift
+// as 0.
+//
+// Thread mapping: the x index (block and thread) selects the channel c, the y
+// index selects rows along m; each thread walks m in steps of
+// blockDim.y * gridDim.y.
+//
+// Parameters:
+//   input / out    - tensors addressed as m * stride + c
+//   mean, inv_std  - per-channel statistics, read at index c
+//   weight, shift  - optional per-channel affine parameters, read at index c
+//   reduction_size - extent of the m dimension
+//   stride         - extent of the c dimension
+template <
+    typename scalar_t,
+    typename accscalar_t,
+    typename layerscalar_t,
+    int PARALLEL_LOADS>
+__global__ void batchnorm_forward_c_last_kernel(
+      const scalar_t* __restrict__ input,
+      const accscalar_t* __restrict__ mean,
+      const accscalar_t* __restrict__ inv_std,
+      const layerscalar_t* __restrict__ weight,
+      const layerscalar_t* __restrict__ shift,
+      scalar_t* __restrict__ out,
+      const int reduction_size,
+      const int stride) {
+  // tensor dimension (m,c)
+  // loop along m dimension
+  int inner_loop_stride = blockDim.y * gridDim.y;
+
+  // offset along m dimension
+  int m_offset = blockIdx.y * blockDim.y + threadIdx.y;
+  int c_offset = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // Threads that fall outside the tensor never write anything. Leave before
+  // touching the per-channel arrays: indexing them with c_offset >= stride
+  // would read past their end. There is no block-level synchronisation in
+  // this kernel, so an early exit is safe.
+  if (c_offset >= stride || m_offset >= reduction_size) {
+    return;
+  }
+
+  auto m_c = mean[c_offset];
+  auto inv_std_c = static_cast<accscalar_t>(inv_std[c_offset]);
+  auto w_c = weight == NULL ? accscalar_t(1.0) : static_cast<accscalar_t>(weight[c_offset]);
+  auto s_c = shift == NULL ? accscalar_t(0.0) : static_cast<accscalar_t>(shift[c_offset]);
+
+  // outer iterations needed so that PARALLEL_LOADS rows per iteration cover
+  // reduction_size rows
+  int loop_count = 1 + (reduction_size - 1) / (inner_loop_stride * PARALLEL_LOADS);
+  int address_base = m_offset * stride + c_offset;
+  int address_increment = inner_loop_stride * stride;
+
+  for (int i = 0; i < loop_count; i++) {
+#pragma unroll
+    for (int j = 0; j < PARALLEL_LOADS; j++) {
+      // c_offset is known to be in range here; only the row needs checking
+      if (m_offset < reduction_size) {
+        out[address_base] = static_cast<scalar_t>(
+            w_c * (static_cast<accscalar_t>(input[address_base]) - m_c ) * inv_std_c + s_c
+          );
+      }
+      m_offset += inner_loop_stride;
+      address_base += address_increment;
+    }
+  }
+}
+
+// batchnorm backward kernel for c last tensor
+//
+// Reduces, per channel c of an (m, c) tensor addressed as m * stride + c:
+//   sum_dy     = sum over m of grad_output
+//   sum_dy_xmu = sum over m of grad_output * (input - mean[c])
+// and writes
+//   grad_bias[c]   = sum_dy                    (skipped when grad_bias is NULL)
+//   grad_weight[c] = sum_dy_xmu * inv_std[c]   (skipped when grad_weight is NULL)
+//   mean_dy[c]     = sum_dy / reduction_size
+//   mean_dy_xmu[c] = sum_dy_xmu / reduction_size
+//
+// Thread mapping: the x index (block and thread) selects the channel c, the y
+// index selects rows along m; each thread walks m in steps of
+// blockDim.y * gridDim.y and keeps PARALLEL_LOADS independent partial sums.
+//
+// staging_data and semaphores are touched only when gridDim.y > 1.
+// staging_data holds stride * gridDim.y sum_dy partials followed by
+// stride * gridDim.y sum_dy_xmu partials; semaphores holds one counter per
+// blockIdx.x. The last-block test relies on the counter starting at zero, and
+// this kernel never resets it.
+//
+// merge_block_vertical is defined elsewhere in this file; presumably it sums
+// the two partials along threadIdx.y of the block - TODO confirm against it.
+template
+   <typename scalar_t,
+    typename accscalar_t,
+    typename layerscalar_t,
+    int PARALLEL_LOADS>
+__global__ void reduce_bn_c_last_kernel(
+      const scalar_t* __restrict__ input,
+      const scalar_t* __restrict__ grad_output,
+      const accscalar_t* __restrict__ mean,
+      const accscalar_t* __restrict__ inv_std,
+      accscalar_t* __restrict__ mean_dy,
+      accscalar_t* __restrict__ mean_dy_xmu,
+      layerscalar_t* __restrict__ grad_weight,
+      layerscalar_t* __restrict__ grad_bias,
+      volatile accscalar_t* staging_data,
+      int* semaphores,
+      const int reduction_size,
+      const int stride) {
+
+  // hide latency with concurrency
+  accscalar_t sum_dy[PARALLEL_LOADS];
+  accscalar_t sum_dy_xmu[PARALLEL_LOADS];
+
+#pragma unroll
+  for (int i = 0; i < PARALLEL_LOADS; i++) {
+    sum_dy[i] = accscalar_t(0);
+    sum_dy_xmu[i] = accscalar_t(0);
+  }
+  // tensor dimension (m,c)
+
+  // loop along m dimension
+  int inner_loop_stride = blockDim.y * gridDim.y;
+
+  // offset along m dimension
+  int m_offset = blockIdx.y * blockDim.y + threadIdx.y;
+  int c_offset = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // outer iterations needed so that PARALLEL_LOADS rows per iteration cover
+  // reduction_size rows
+  int loop_count = 1 + (reduction_size - 1) / (inner_loop_stride * PARALLEL_LOADS);
+  int address_base = m_offset * stride + c_offset;
+  int address_increment = inner_loop_stride * stride;
+
+  // Threads with c_offset >= stride must stay alive for the __syncthreads()
+  // calls below, so they cannot return early; guard the per-channel loads
+  // instead so they do not read past the end of mean / inv_std. The
+  // substituted zeros are never observable: such threads only accumulate
+  // zero gradients and never reach the guarded output writes.
+  const bool c_in_range = c_offset < stride;
+  const accscalar_t r_mean = c_in_range ? mean[c_offset] : accscalar_t(0);
+  const accscalar_t factor = c_in_range ? inv_std[c_offset] : accscalar_t(0);
+
+  for (int i = 0; i < loop_count; i++) {
+    accscalar_t x_input[PARALLEL_LOADS];
+    accscalar_t x_grad_output[PARALLEL_LOADS];
+
+    // load multiple data in
+#pragma unroll
+    for (int j = 0; j < PARALLEL_LOADS; j++) {
+      if (c_in_range && m_offset < reduction_size) {
+        x_input[j] = input[address_base];
+        x_grad_output[j] = grad_output[address_base];
+      } else {
+        // out-of-range slot contributes zero to both sums
+        x_input[j] = accscalar_t(0);
+        x_grad_output[j] = accscalar_t(0);
+      }
+      m_offset += inner_loop_stride;
+      address_base += address_increment;
+    }
+
+    // calculate sum_dy / sum_dy_xmu
+#pragma unroll
+    for (int j = 0; j < PARALLEL_LOADS; j++) {
+      sum_dy[j] += x_grad_output[j];
+      sum_dy_xmu[j] += x_grad_output[j] * (x_input[j] - r_mean);
+    }
+  }
+
+  // thread reduction to accumulate sum_dy / sum_dy_xmu between PARALLEL_LOADS
+#pragma unroll
+  for (int j = 1; j < PARALLEL_LOADS; j++) {
+    sum_dy[0] += sum_dy[j];
+    sum_dy_xmu[0] += sum_dy_xmu[j];
+  }
+
+  // release array of registers
+  auto sum_dy_th = sum_dy[0];
+  auto sum_dy_xmu_th = sum_dy_xmu[0];
+
+  // block-wise reduction with shared memory (since reduction cannot be done within a warp)
+  static __shared__ accscalar_t shmem_sum_dy[MAX_BLOCK_SIZE];
+  static __shared__ accscalar_t shmem_sum_dy_xmu[MAX_BLOCK_SIZE];
+
+  merge_block_vertical(sum_dy_th, sum_dy_xmu_th, shmem_sum_dy, shmem_sum_dy_xmu);
+
+  // grid reduction if needed (coop launch used at the first place)
+  if (gridDim.y > 1) {
+    // carve the two staging regions out of the single scratch buffer
+    volatile accscalar_t* staging_sum_dy = staging_data;
+    volatile accscalar_t* staging_sum_dy_xmu = &staging_data[stride*gridDim.y];
+
+    address_base = c_offset + blockIdx.y * stride;
+    // write data to staging_data;
+    if (threadIdx.y == 0 && c_in_range) {
+      staging_sum_dy[address_base] = sum_dy_th;
+      staging_sum_dy_xmu[address_base] = sum_dy_xmu_th;
+    }
+
+    __threadfence();
+    __syncthreads(); // ensuring writes to staging_ is visible to all blocks
+
+    __shared__ bool is_last_block_done;
+    // mark block done
+    if (threadIdx.x == 0 && threadIdx.y == 0) {
+      // the block that observes gridDim.y - 1 earlier arrivals is the last one
+      int old = atomicAdd(&semaphores[blockIdx.x], 1);
+      is_last_block_done = (old == (gridDim.y-1));
+    }
+
+    __syncthreads();
+
+    // check that all data is now available in global memory
+    if (is_last_block_done) {
+      sum_dy_th = accscalar_t(0.0);
+      sum_dy_xmu_th = accscalar_t(0.0);
+
+      // each thread folds the staged partials of every blockDim.y-th block row
+      for (int y = threadIdx.y; y < gridDim.y; y += blockDim.y) {
+        address_base = c_offset + y * stride;
+        sum_dy_th += (c_in_range ? staging_sum_dy[address_base] : accscalar_t(0.0));
+        sum_dy_xmu_th += (c_in_range ? staging_sum_dy_xmu[address_base] : accscalar_t(0.0));
+      }
+
+      merge_block_vertical(sum_dy_th, sum_dy_xmu_th, shmem_sum_dy, shmem_sum_dy_xmu);
+      if (threadIdx.y == 0 && c_in_range) {
+        if (grad_bias != NULL) {
+          grad_bias[c_offset] = static_cast<layerscalar_t>(sum_dy_th);
+        }
+        if (grad_weight != NULL) {
+          grad_weight[c_offset] = static_cast<layerscalar_t>(sum_dy_xmu_th * factor);
+        }
+        mean_dy[c_offset] = sum_dy_th / reduction_size;
+        mean_dy_xmu[c_offset] = sum_dy_xmu_th / reduction_size;
+      }
+    }
+  } else {
+    // single block along y: the block-level result is already final
+    if (blockIdx.y == 0 && threadIdx.y == 0 && c_in_range) {
+      if (grad_bias != NULL) {
+        grad_bias[c_offset] = static_cast<layerscalar_t>(sum_dy_th);
+      }
+      if (grad_weight != NULL) {
+        grad_weight[c_offset] = static_cast<layerscalar_t>(sum_dy_xmu_th * factor);
+      }
+      mean_dy[c_offset] = sum_dy_th / reduction_size;
+      mean_dy_xmu[c_offset] = sum_dy_xmu_th / reduction_size;
+    }
+  }
+}
+
+// elementwise BN kernel
+//
+// Writes, for an (m, c) tensor addressed as m * stride + c:
+//   grad_input = (grad_output - mean_dy[c]
+//                 - (input - mean[c]) * inv_std[c]^2 * mean_dy_xmu[c])
+//                * weight[c] * inv_std[c]
+// A NULL weight is treated as 1.
+//
+// Thread mapping: the x index (block and thread) selects the channel c, the y
+// index selects rows along m; each thread walks m in steps of
+// blockDim.y * gridDim.y.
+//
+// Parameters:
+//   grad_output, input, grad_input - tensors addressed as m * stride + c
+//   mean, inv_std, mean_dy, mean_dy_xmu - per-channel values, read at index c
+//   weight         - optional per-channel scale, read at index c
+//   reduction_size - extent of the m dimension
+//   stride         - extent of the c dimension
+template <
+    typename scalar_t,
+    typename accscalar_t,
+    typename layerscalar_t,
+    int PARALLEL_LOADS>
+__global__ void batchnorm_backward_c_last_kernel(
+      const scalar_t* __restrict__ grad_output,
+      const scalar_t* __restrict__ input,
+      const accscalar_t* __restrict__ mean,
+      const accscalar_t* __restrict__ inv_std,
+      const layerscalar_t* __restrict__ weight,
+      const accscalar_t* __restrict__ mean_dy,
+      const accscalar_t* __restrict__ mean_dy_xmu,
+      scalar_t* __restrict__ grad_input,
+      const int reduction_size,
+      const int stride) {
+  // tensor dimension (m,c)
+  // loop along m dimension
+  int inner_loop_stride = blockDim.y * gridDim.y;
+
+  // offset along m dimension
+  int m_offset = blockIdx.y * blockDim.y + threadIdx.y;
+  int c_offset = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // Threads that fall outside the tensor never write anything. Leave before
+  // touching the per-channel arrays: indexing them with c_offset >= stride
+  // would read past their end. There is no block-level synchronisation in
+  // this kernel, so an early exit is safe.
+  if (c_offset >= stride || m_offset >= reduction_size) {
+    return;
+  }
+
+  auto m_c = mean[c_offset];
+  auto m_dy_c = mean_dy[c_offset];
+  auto factor_1_c = inv_std[c_offset];
+  // factor_2_c = weight * inv_std, computed before factor_1_c is overwritten
+  auto factor_2_c = (weight == NULL? accscalar_t(1.0) : static_cast<accscalar_t>(weight[c_offset])) * factor_1_c;
+  // factor_1_c becomes inv_std^2 * mean_dy_xmu
+  factor_1_c = factor_1_c * factor_1_c * mean_dy_xmu[c_offset];
+
+  // outer iterations needed so that PARALLEL_LOADS rows per iteration cover
+  // reduction_size rows
+  int loop_count = 1 + (reduction_size - 1) / (inner_loop_stride * PARALLEL_LOADS);
+  int address_base = m_offset * stride + c_offset;
+  int address_increment = inner_loop_stride * stride;
+
+  for (int i = 0; i < loop_count; i++) {
+#pragma unroll
+    for (int j = 0; j < PARALLEL_LOADS; j++) {
+      // c_offset is known to be in range here; only the row needs checking
+      if (m_offset < reduction_size) {
+        grad_input[address_base] = static_cast<scalar_t>(
+            (static_cast<accscalar_t>(grad_output[address_base]) - m_dy_c -
+            (static_cast<accscalar_t>(input[address_base]) - m_c) * factor_1_c)
+            * factor_2_c);
+      }
+      m_offset += inner_loop_stride;
+      address_base += address_increment;
+    }
+  }
+}
+
+// Host launcher for welford_kernel (defined elsewhere in this file).
+//
+// Allocates two 1-D tensors of length input.size(1), in the promoted scalar
+// type of `input`, and launches one block per feature on the current CUDA
+// stream to fill them.
+//
+// Returns {out_mean, out_var_biased}.
+std::vector<at::Tensor> welford_mean_var_CUDA(const at::Tensor input) {
+  const auto n_batch = input.size(0);
+  const auto n_feature = input.size(1);
+
+  auto n_spatial = get_tensor_spatial_size(input);
+  auto stat_type = promote_scalartype(input);
+
+  // result buffers, one entry per feature
+  at::Tensor out_var_biased = at::empty({n_feature}, input.options().dtype(stat_type));
+  at::Tensor out_mean = at::empty({n_feature}, input.options().dtype(stat_type));
+
+  // block shape: y spans the batch, x spans the spatial extent
+  int threads_y = min(h_last_pow2(n_batch), int(MAX_BLOCK_SIZE / 32));
+  int threads_x = max(1, min(MAX_BLOCK_SIZE / threads_y, h_last_pow2(n_spatial)));
+  const dim3 threads(threads_x, threads_y);
+  const dim3 blocks(n_feature);
+
+  auto cuda_stream = at::cuda::getCurrentCUDAStream();
+
+  {
+    using namespace at;
+    DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "welford_mean_var_kernel",
+      using accscalar_t = at::acc_type<scalar_t_0, true>;
+      welford_kernel<scalar_t_0, accscalar_t, accscalar_t><<<blocks, threads, 0, cuda_stream>>>(
+          input.data<scalar_t_0>(),
+          out_mean.data<accscalar_t>(),
+          out_var_biased.data<accscalar_t>(),
+          n_batch,
+          n_feature,
+          n_spatial);
+    );
+  }
+
+  return {out_mean, out_var_biased};
+}
+
+// Host launcher for batchnorm_forward_kernel (defined elsewhere in this file).
+//
+// Allocates an output shaped like `input` and launches the kernel on the
+// current CUDA stream. Two instantiations are supported:
+//   * half input with float weight: affine parameters are passed as
+//     accscalar_t pointers;
+//   * otherwise weight (when given) must have the same scalar type as input.
+// A missing weight / shift is forwarded to the kernel as a null pointer.
+//
+// Returns the normalised tensor.
+at::Tensor batchnorm_forward_CUDA(
+    const at::Tensor input,
+    const at::Tensor mean,
+    const at::Tensor inv_std,
+    const at::optional<at::Tensor> weight,
+    const at::optional<at::Tensor> shift) {
+  const auto n_batch = input.size(0);
+  const auto n_feature = input.size(1);
+  at::Tensor out = at::empty_like(input);
+
+  auto n_spatial = get_tensor_spatial_size(input);
+
+  // launch geometry: x/z cover the spatial extent, y covers the batch,
+  // and there is one block column per feature
+  int threads_x = max(32, min(MAX_BLOCK_SIZE, h_last_pow2(n_spatial)/4));
+  int threads_y = max(1, min(MAX_BLOCK_SIZE/threads_x, h_last_pow2(n_batch)/4));
+  const dim3 threads(threads_x, threads_y);
+  int blocks_z = max(1, min(65535, h_last_pow2(n_spatial)/4/threads_x));
+  int blocks_y = max(1, min(65535, h_last_pow2(n_batch)/threads_y));
+  const dim3 blocks(n_feature, blocks_y, blocks_z);
+  auto cuda_stream = at::cuda::getCurrentCUDAStream();
+
+  const bool mixed_precision = input.scalar_type() == at::ScalarType::Half
+      && weight.has_value()
+      && weight.value().scalar_type() == at::ScalarType::Float;
+
+  if (mixed_precision) {
+    using namespace at;
+    DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "batchnorm_forward",
+      using accscalar_t = at::acc_type<scalar_t_0, true>;
+      batchnorm_forward_kernel<scalar_t_0, accscalar_t, accscalar_t><<<blocks, threads, 0, cuda_stream>>>(
+          input.data<scalar_t_0>(),
+          mean.data<accscalar_t>(),
+          inv_std.data<accscalar_t>(),
+          weight.has_value() ? weight.value().data<accscalar_t>() : nullptr,
+          shift.has_value() ? shift.value().data<accscalar_t>() : nullptr,
+          out.data<scalar_t_0>(),
+          n_spatial,
+          n_batch);
+    );
+  } else {
+    // any other combination requires weight to match the input type
+    if (weight.has_value()) {
+      AT_CHECK(input.scalar_type() == weight.value().scalar_type(),
+          "input.scalar_type() is not supported with weight.scalar_type()");
+    }
+    using namespace at;
+    DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "batchnorm_forward",
+      using accscalar_t = at::acc_type<scalar_t_0, true>;
+      batchnorm_forward_kernel<scalar_t_0, accscalar_t, scalar_t_0><<<blocks, threads, 0, cuda_stream>>>(
+          input.data<scalar_t_0>(),
+          mean.data<accscalar_t>(),
+          inv_std.data<accscalar_t>(),
+          weight.has_value() ? weight.value().data<scalar_t_0>() : nullptr,
+          shift.has_value() ? shift.value().data<scalar_t_0>() : nullptr,
+          out.data<scalar_t_0>(),
+          n_spatial,
+          n_batch);
+    );
+  }
+  return out;
+}
+
+// Host launcher for reduce_bn_kernel (defined elsewhere in this file).
+//
+// Allocates mean_dy / mean_dy_xmu (length input.size(1), options of `mean`)
+// and, when `weight` is given, grad_weight / grad_bias with the options of
+// `weight`. Without a weight, grad_weight / grad_bias are empty (size 0)
+// placeholders and the kernel receives null pointers for them.
+// One block per feature is launched on the current CUDA stream.
+//
+// Returns {mean_dy, mean_dy_xmu, grad_weight, grad_bias}.
+std::vector<at::Tensor> reduce_bn_CUDA(
+    const at::Tensor grad_output,
+    const at::Tensor input,
+    const at::Tensor mean,
+    const at::Tensor inv_std,
+    const at::optional<at::Tensor> weight) {
+  const auto n_batch = input.size(0);
+  const auto n_feature = input.size(1);
+
+  // result is not used below; the call is kept as in the original code
+  auto scalar_type = promote_scalartype(input);
+
+  at::Tensor mean_dy = at::empty({n_feature}, mean.options());
+  at::Tensor mean_dy_xmu = at::empty({n_feature}, mean.options());
+
+  at::Tensor grad_weight;
+  at::Tensor grad_bias;
+  if (!weight.has_value()) {
+    // zero-sized placeholders so that defined tensors can be returned
+    grad_weight = at::empty({0}, mean.options());
+    grad_bias = at::empty({0}, mean.options());
+  } else {
+    grad_weight = at::empty({n_feature}, weight.value().options());
+    grad_bias = at::empty({n_feature}, weight.value().options());
+  }
+
+  auto n_spatial = get_tensor_spatial_size(input);
+
+  // block shape: y spans the batch, x spans the spatial extent
+  int threads_y = min(h_last_pow2(n_batch), int(MAX_BLOCK_SIZE/ 32));
+  int threads_x = max(1, min(MAX_BLOCK_SIZE/ threads_y, h_last_pow2(n_spatial)));
+  const dim3 threads(threads_x, threads_y);
+  const dim3 blocks(n_feature);
+  auto cuda_stream = at::cuda::getCurrentCUDAStream();
+
+  const bool mixed_precision = input.scalar_type() == at::ScalarType::Half
+      && weight.has_value()
+      && weight.value().scalar_type() == at::ScalarType::Float;
+
+  if (mixed_precision) {
+    using namespace at;
+    DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "batchnorm_backward_reduce",
+      using accscalar_t = at::acc_type<scalar_t_0, true>;
+      reduce_bn_kernel<scalar_t_0, accscalar_t, accscalar_t><<<blocks, threads, 0, cuda_stream>>>(
+          input.data<scalar_t_0>(),
+          grad_output.data<scalar_t_0>(),
+          mean.data<accscalar_t>(),
+          inv_std.data<accscalar_t>(),
+          mean_dy.data<accscalar_t>(),
+          mean_dy_xmu.data<accscalar_t>(),
+          weight.has_value() ? grad_weight.data<accscalar_t>() : nullptr,
+          weight.has_value() ? grad_bias.data<accscalar_t>() : nullptr,
+          n_batch,
+          n_feature,
+          n_spatial);
+    );
+  } else {
+    // any other combination requires weight to match the input type
+    if (weight.has_value()) {
+        AT_CHECK(input.scalar_type() == weight.value().scalar_type(),
+            "input.scalar_type() is not supported with weight.scalar_type()");
+    }
+    using namespace at;
+    DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "batchnorm_backward_reduce",
+      using accscalar_t = at::acc_type<scalar_t_0, true>;
+      reduce_bn_kernel<scalar_t_0, accscalar_t, scalar_t_0><<<blocks, threads, 0, cuda_stream>>>(
+          input.data<scalar_t_0>(),
+          grad_output.data<scalar_t_0>(),
+          mean.data<accscalar_t>(),
+          inv_std.data<accscalar_t>(),
+          mean_dy.data<accscalar_t>(),
+          mean_dy_xmu.data<accscalar_t>(),
+          weight.has_value() ? grad_weight.data<scalar_t_0>() : nullptr,
+          weight.has_value() ? grad_bias.data<scalar_t_0>() : nullptr,
+          n_batch,
+          n_feature,
+          n_spatial);
+    );
+  }
+
+  return {mean_dy, mean_dy_xmu, grad_weight, grad_bias};
+}
+
+// Host launcher for batchnorm_backward_kernel (defined elsewhere in this
+// file).
+//
+// Allocates grad_input shaped like `input` and launches the kernel on the
+// current CUDA stream. Two instantiations are supported:
+//   * half input with float weight: weight is passed as an accscalar_t
+//     pointer;
+//   * otherwise weight (when given) must have the same scalar type as input.
+// A missing weight is forwarded to the kernel as a null pointer.
+//
+// Returns grad_input.
+at::Tensor batchnorm_backward_CUDA(
+    const at::Tensor grad_output,
+    const at::Tensor input,
+    const at::Tensor mean,
+    const at::Tensor inv_std,
+    const at::optional<at::Tensor> weight,
+    const at::Tensor mean_dy,
+    const at::Tensor mean_dy_xmu) {
+  const auto n_batch = input.size(0);
+  const auto n_feature = input.size(1);
+
+  at::Tensor grad_input = at::empty_like(input);
+
+  auto n_spatial = get_tensor_spatial_size(input);
+
+  // launch geometry: x/z cover the spatial extent, y covers the batch,
+  // and there is one block column per feature
+  int threads_x = max(32, min(MAX_BLOCK_SIZE, h_last_pow2(n_spatial)/4));
+  int threads_y = max(1, min(MAX_BLOCK_SIZE/threads_x, h_last_pow2(n_batch)/4));
+  const dim3 threads(threads_x, threads_y);
+  int blocks_z = max(1, min(65535, h_last_pow2(n_spatial)/4/threads_x));
+  int blocks_y = max(1, min(65535, h_last_pow2(n_batch)/threads_y));
+  const dim3 blocks(n_feature, blocks_y, blocks_z);
+
+  auto cuda_stream = at::cuda::getCurrentCUDAStream();
+
+  const bool mixed_precision = input.scalar_type() == at::ScalarType::Half
+      && weight.has_value()
+      && weight.value().scalar_type() == at::ScalarType::Float;
+
+  if (mixed_precision) {
+    using namespace at;
+    DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "batchnorm_backward",
+      using accscalar_t = at::acc_type<scalar_t_0, true>;
+      batchnorm_backward_kernel<scalar_t_0, accscalar_t, accscalar_t><<<blocks, threads, 0, cuda_stream>>>(
+          grad_output.data<scalar_t_0>(),
+          input.data<scalar_t_0>(),
+          mean.data<accscalar_t>(),
+          inv_std.data<accscalar_t>(),
+          weight.has_value() ? weight.value().data<accscalar_t>() : nullptr,
+          mean_dy.data<accscalar_t>(),
+          mean_dy_xmu.data<accscalar_t>(),
+          grad_input.data<scalar_t_0>(),
+          n_spatial,
+          n_batch);
+    );
+  } else {
+    // any other combination requires weight to match the input type
+    if (weight.has_value()) {
+      AT_CHECK(input.scalar_type() == weight.value().scalar_type(),
+          "input.scalar_type() is not supported with weight.scalar_type()");
+    }
+    using namespace at;
+    DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "batchnorm_backward",
+      using accscalar_t = at::acc_type<scalar_t_0, true>;
+      batchnorm_backward_kernel<scalar_t_0, accscalar_t, scalar_t_0><<<blocks, threads, 0, cuda_stream>>>(
+          grad_output.data<scalar_t_0>(),
+          input.data<scalar_t_0>(),
+          mean.data<accscalar_t>(),
+          inv_std.data<accscalar_t>(),
+          weight.has_value() ? weight.value().data<scalar_t_0>() : nullptr,
+          mean_dy.data<accscalar_t>(),
+          mean_dy_xmu.data<accscalar_t>(),
+          grad_input.data<scalar_t_0>(),
+          n_spatial,
+          n_batch);
+    );
+  }
+
+  return grad_input;
+}
+
+// Host launcher for welford_kernel_parallel.
+//
+// mean_feature_nodes is indexed as (world_size, feature_size); var_biased is
+// passed to the kernel alongside it and supplies the options for the three
+// outputs. `numel` is the per-process sample count and `eps` the value added
+// under the square root when computing inv_std.
+//
+// Returns {out_mean, out_var, inv_std}, each of length feature_size.
+std::vector<at::Tensor> welford_parallel_CUDA(const at::Tensor mean_feature_nodes,
+                                              const at::Tensor var_biased,
+                                              int numel,
+                                              const float eps) {
+  const auto n_process = mean_feature_nodes.size(0);
+  const auto n_feature = mean_feature_nodes.size(1);
+
+  at::Tensor out_var = at::empty({n_feature}, var_biased.options());
+  at::Tensor inv_std = at::empty_like(out_var);
+  at::Tensor out_mean = at::empty_like(out_var);
+
+  // TODO(jie): tile this for memory coalescing!
+  const int threads = std::min(h_last_pow2(n_feature), MAX_BLOCK_SIZE);
+  const int blocks = std::max<int>(1, n_feature / threads);
+
+  auto cuda_stream = at::cuda::getCurrentCUDAStream();
+
+  {
+    using namespace at;
+    DISPATCH_FLOAT_AND_HALF(mean_feature_nodes.scalar_type(), 0, "welford_parallel_kernel",
+      welford_kernel_parallel<scalar_t_0><<<blocks, threads, 0, cuda_stream>>>(
+          mean_feature_nodes.data<scalar_t_0>(),
+          var_biased.data<scalar_t_0>(),
+          out_mean.data<scalar_t_0>(),
+          out_var.data<scalar_t_0>(),
+          inv_std.data<scalar_t_0>(),
+          n_process,
+          n_feature,
+          eps,
+          numel);
+    );
+  }
+
+  return {out_mean, out_var, inv_std};
+}
+
+// Host launcher for welford_kernel_c_last.
+//
+// Treats `input` as (reduction_size, stride) where stride is the size of the
+// last dimension and reduction_size is numel / stride. Outputs have length
+// stride and use the promoted scalar type of `input`.
+//
+// When the launch configuration uses more than one block along y, a staging
+// buffer of 4 * stride * grid.y elements and a zero-initialised int counter
+// per grid.x block are allocated for the kernel's cross-block reduction;
+// otherwise the kernel receives null pointers for both.
+//
+// Returns {out_mean, out_var_biased}.
+std::vector<at::Tensor> welford_mean_var_c_last_CUDA(const at::Tensor input) {
+  const auto channels = input.size(input.ndimension()-1);
+  const auto rows = input.numel() / channels;
+
+  auto stat_type = promote_scalartype(input);
+  auto stat_options = input.options().dtype(stat_type);
+
+  at::Tensor out_var_biased = at::empty({channels}, stat_options);
+  at::Tensor out_mean = at::empty({channels}, stat_options);
+
+  dim3 threads;
+  dim3 blocks;
+  flexible_launch_configs(rows, channels, threads, blocks, true);
+
+  // scratch for the cross-block reduction, only needed with several y blocks
+  const bool needs_grid_reduce = blocks.y > 1;
+  at::Tensor staging_data;
+  at::Tensor semaphores;
+  if (needs_grid_reduce) {
+    staging_data = at::empty({4*channels*blocks.y}, stat_options);
+    semaphores = at::zeros({blocks.x}, input.options().dtype(at::kInt));
+  }
+
+  auto cuda_stream = at::cuda::getCurrentCUDAStream();
+
+  {
+    using namespace at;
+    DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "welford_mean_var_c_last",
+      using accscalar_t = at::acc_type<scalar_t_0, true>;
+      accscalar_t* staging_data_ptr = needs_grid_reduce ? staging_data.data<accscalar_t>() : nullptr;
+      int* semaphores_ptr = needs_grid_reduce ? semaphores.data<int>() : nullptr;
+      welford_kernel_c_last<scalar_t_0, accscalar_t, accscalar_t, ELEMENTS_PER_ITER>
+          <<<blocks, threads, 0, cuda_stream>>>(
+          input.data<scalar_t_0>(),
+          out_mean.data<accscalar_t>(),
+          out_var_biased.data<accscalar_t>(),
+          staging_data_ptr,
+          semaphores_ptr,
+          rows,
+          channels);
+    );
+  }
+
+  return {out_mean, out_var_biased};
+}
+
+// Host launcher for batchnorm_forward_c_last_kernel.
+//
+// Treats `input` as (reduction_size, stride) where stride is the size of the
+// last dimension and reduction_size is numel / stride, allocates an output
+// shaped like `input` and launches the kernel on the current CUDA stream.
+// Two instantiations are supported:
+//   * half input with float weight: affine parameters are passed as
+//     accscalar_t pointers;
+//   * otherwise weight (when given) must have the same scalar type as input.
+// A missing weight / shift is forwarded to the kernel as a null pointer.
+//
+// Returns the normalised tensor.
+at::Tensor batchnorm_forward_c_last_CUDA(
+    const at::Tensor input,
+    const at::Tensor mean,
+    const at::Tensor inv_std,
+    const at::optional<at::Tensor> weight,
+    const at::optional<at::Tensor> shift) {
+  const auto channels = input.size(input.ndimension()-1);
+  const auto rows = input.numel() / channels;
+
+  at::Tensor out = at::empty_like(input);
+
+  dim3 threads;
+  dim3 blocks;
+  flexible_launch_configs(rows, channels, threads, blocks);
+
+  auto cuda_stream = at::cuda::getCurrentCUDAStream();
+
+  const bool mixed_precision = input.scalar_type() == at::ScalarType::Half
+      && weight.has_value()
+      && weight.value().scalar_type() == at::ScalarType::Float;
+
+  if (mixed_precision) {
+    using namespace at;
+    DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "batchnorm_forward",
+      using accscalar_t = at::acc_type<scalar_t_0, true>;
+      batchnorm_forward_c_last_kernel<scalar_t_0, accscalar_t, accscalar_t, ELEMENTS_PER_ITER>
+          <<<blocks, threads, 0, cuda_stream>>>(
+          input.data<scalar_t_0>(),
+          mean.data<accscalar_t>(),
+          inv_std.data<accscalar_t>(),
+          weight.has_value() ? weight.value().data<accscalar_t>() : nullptr,
+          shift.has_value() ? shift.value().data<accscalar_t>() : nullptr,
+          out.data<scalar_t_0>(),
+          rows,
+          channels);
+    );
+  } else {
+    // any other combination requires weight to match the input type
+    if (weight.has_value()) {
+      AT_CHECK(input.scalar_type() == weight.value().scalar_type(),
+          "input.scalar_type() is not supported with weight.scalar_type()");
+    }
+    using namespace at;
+    DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "batchnorm_forward",
+      using accscalar_t = at::acc_type<scalar_t_0, true>;
+      batchnorm_forward_c_last_kernel<scalar_t_0, accscalar_t, scalar_t_0, ELEMENTS_PER_ITER>
+          <<<blocks, threads, 0, cuda_stream>>>(
+          input.data<scalar_t_0>(),
+          mean.data<accscalar_t>(),
+          inv_std.data<accscalar_t>(),
+          weight.has_value() ? weight.value().data<scalar_t_0>() : nullptr,
+          shift.has_value() ? shift.value().data<scalar_t_0>() : nullptr,
+          out.data<scalar_t_0>(),
+          rows,
+          channels);
+    );
+  }
+  return out;
+}
+
+// Host launcher for reduce_bn_c_last_kernel.
+//
+// Treats `input` as (reduction_size, stride) where stride is the size of the
+// last dimension and reduction_size is numel / stride. Allocates mean_dy /
+// mean_dy_xmu (length stride, options of `mean`) and, when `weight` is given,
+// grad_weight / grad_bias with the options of `weight`. Without a weight,
+// grad_weight / grad_bias are empty (size 0) placeholders and the kernel
+// receives null pointers for them.
+//
+// When the launch configuration uses more than one block along y, a staging
+// buffer of 2 * stride * grid.y elements and a zero-initialised int counter
+// per grid.x block are allocated for the kernel's cross-block reduction;
+// otherwise the kernel receives null pointers for both.
+//
+// Returns {mean_dy, mean_dy_xmu, grad_weight, grad_bias}.
+std::vector<at::Tensor> reduce_bn_c_last_CUDA(
+    const at::Tensor grad_output,
+    const at::Tensor input,
+    const at::Tensor mean,
+    const at::Tensor inv_std,
+    const at::optional<at::Tensor> weight) {
+  const auto channels = input.size(input.ndimension()-1);
+  const auto rows = input.numel() / channels;
+
+  at::Tensor mean_dy = at::empty({channels}, mean.options());
+  at::Tensor mean_dy_xmu = at::empty({channels}, mean.options());
+
+  at::Tensor grad_weight;
+  at::Tensor grad_bias;
+  if (!weight.has_value()) {
+    // an uninitialized at::Tensor cannot be returned, so use empty ones
+    grad_weight = at::empty({0}, mean.options());
+    grad_bias = at::empty({0}, mean.options());
+  } else {
+    grad_weight = at::empty({channels}, weight.value().options());
+    grad_bias = at::empty({channels}, weight.value().options());
+  }
+
+  dim3 threads;
+  dim3 blocks;
+  flexible_launch_configs(rows, channels, threads, blocks, true);
+
+  // scratch for the cross-block reduction, only needed with several y blocks
+  const bool needs_grid_reduce = blocks.y > 1;
+  at::Tensor staging_data;
+  at::Tensor semaphores;
+  if (needs_grid_reduce) {
+    staging_data = at::empty({2*channels*blocks.y}, mean.options());
+    semaphores = at::zeros({blocks.x}, input.options().dtype(at::kInt));
+  }
+  auto cuda_stream = at::cuda::getCurrentCUDAStream();
+
+  const bool mixed_precision = input.scalar_type() == at::ScalarType::Half
+      && weight.has_value()
+      && weight.value().scalar_type() == at::ScalarType::Float;
+
+  if (mixed_precision) {
+    using namespace at;
+    DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "batchnorm_backward_reduce",
+      using accscalar_t = at::acc_type<scalar_t_0, true>;
+      accscalar_t* staging_data_ptr = needs_grid_reduce ? staging_data.data<accscalar_t>() : nullptr;
+      int* semaphores_ptr = needs_grid_reduce ? semaphores.data<int>() : nullptr;
+      reduce_bn_c_last_kernel<scalar_t_0, accscalar_t, accscalar_t, ELEMENTS_PER_ITER>
+          <<<blocks, threads, 0, cuda_stream>>>(
+          input.data<scalar_t_0>(),
+          grad_output.data<scalar_t_0>(),
+          mean.data<accscalar_t>(),
+          inv_std.data<accscalar_t>(),
+          mean_dy.data<accscalar_t>(),
+          mean_dy_xmu.data<accscalar_t>(),
+          weight.has_value() ? grad_weight.data<accscalar_t>() : nullptr,
+          weight.has_value() ? grad_bias.data<accscalar_t>() : nullptr,
+          staging_data_ptr,
+          semaphores_ptr,
+          rows,
+          channels);
+    );
+  } else {
+    // any other combination requires weight to match the input type
+    if (weight.has_value()) {
+      AT_CHECK(input.scalar_type() == weight.value().scalar_type(),
+          "input.scalar_type() is not supported with weight.scalar_type()");
+    }
+    using namespace at;
+    DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "batchnorm_backward_reduce",
+      using accscalar_t = at::acc_type<scalar_t_0, true>;
+      accscalar_t* staging_data_ptr = needs_grid_reduce ? staging_data.data<accscalar_t>() : nullptr;
+      int* semaphores_ptr = needs_grid_reduce ? semaphores.data<int>() : nullptr;
+      reduce_bn_c_last_kernel<scalar_t_0, accscalar_t, scalar_t_0, ELEMENTS_PER_ITER>
+          <<<blocks, threads, 0, cuda_stream>>>(
+          input.data<scalar_t_0>(),
+          grad_output.data<scalar_t_0>(),
+          mean.data<accscalar_t>(),
+          inv_std.data<accscalar_t>(),
+          mean_dy.data<accscalar_t>(),
+          mean_dy_xmu.data<accscalar_t>(),
+          weight.has_value() ? grad_weight.data<scalar_t_0>() : nullptr,
+          weight.has_value() ? grad_bias.data<scalar_t_0>() : nullptr,
+          staging_data_ptr,
+          semaphores_ptr,
+          rows,
+          channels);
+    );
+  }
+
+  return {mean_dy, mean_dy_xmu, grad_weight, grad_bias};
+}
+
+// Host launcher for batchnorm_backward_c_last_kernel.
+//
+// Treats `input` as (reduction_size, stride) where stride is the size of the
+// last dimension and reduction_size is numel / stride, allocates grad_input
+// shaped like `input` and launches the kernel on the current CUDA stream.
+// Two instantiations are supported:
+//   * half input with float weight: weight is passed as an accscalar_t
+//     pointer;
+//   * otherwise weight (when given) must have the same scalar type as input.
+// A missing weight is forwarded to the kernel as a null pointer.
+//
+// The dispatch label is "batchnorm_backward" so that an unsupported-dtype
+// error names this operation rather than the forward pass.
+//
+// Returns grad_input.
+at::Tensor batchnorm_backward_c_last_CUDA(
+    const at::Tensor grad_output,
+    const at::Tensor input,
+    const at::Tensor mean,
+    const at::Tensor inv_std,
+    const at::optional<at::Tensor> weight,
+    const at::Tensor mean_dy,
+    const at::Tensor mean_dy_xmu) {
+  const auto stride = input.size(input.ndimension()-1);
+  const auto reduction_size = input.numel() / stride;
+
+  at::Tensor grad_input = at::empty_like(input);
+
+  dim3 block;
+  dim3 grid;
+  flexible_launch_configs(reduction_size, stride, block, grid);
+
+  auto stream = at::cuda::getCurrentCUDAStream();
+
+  if (input.scalar_type() == at::ScalarType::Half
+      && weight.has_value() && weight.value().scalar_type() == at::ScalarType::Float) {
+    using namespace at;
+    DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "batchnorm_backward",
+      using accscalar_t = at::acc_type<scalar_t_0, true>;
+      batchnorm_backward_c_last_kernel<scalar_t_0, accscalar_t, accscalar_t, ELEMENTS_PER_ITER>
+          <<<grid, block, 0, stream>>>(
+          grad_output.data<scalar_t_0>(),
+          input.data<scalar_t_0>(),
+          mean.data<accscalar_t>(),
+          inv_std.data<accscalar_t>(),
+          weight.has_value() ? weight.value().data<accscalar_t>() : NULL,
+          mean_dy.data<accscalar_t>(),
+          mean_dy_xmu.data<accscalar_t>(),
+          grad_input.data<scalar_t_0>(),
+          reduction_size,
+          stride);
+    );
+  } else {
+    // any other combination requires weight to match the input type
+    if (weight.has_value()) {
+      AT_CHECK(input.scalar_type() == weight.value().scalar_type(),
+          "input.scalar_type() is not supported with weight.scalar_type()");
+    }
+    using namespace at;
+    DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "batchnorm_backward",
+      using accscalar_t = at::acc_type<scalar_t_0, true>;
+      batchnorm_backward_c_last_kernel<scalar_t_0, accscalar_t, scalar_t_0, ELEMENTS_PER_ITER>
+          <<<grid, block, 0, stream>>>(
+          grad_output.data<scalar_t_0>(),
+          input.data<scalar_t_0>(),
+          mean.data<accscalar_t>(),
+          inv_std.data<accscalar_t>(),
+          weight.has_value() ? weight.value().data<scalar_t_0>() : NULL,
+          mean_dy.data<accscalar_t>(),
+          mean_dy_xmu.data<accscalar_t>(),
+          grad_input.data<scalar_t_0>(),
+          reduction_size,
+          stride);
+    );
+  }
+
+  return grad_input;
+}
diff --git a/apex/docs/Makefile b/apex/docs/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..86cc24948dc35ad46b0b574003f71e234a30325a
--- /dev/null
+++ b/apex/docs/Makefile
@@ -0,0 +1,41 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+SPHINXPROJ    = NVIDIAAPEX
+SOURCEDIR     = source
+BUILDDIR      = build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+# Publish the HTML docs to the gh-pages branch: check out gh-pages, pull
+# master's files for this directory into the working tree, build the HTML,
+# move build/html/* one directory up, then commit and push.  The final
+# "git checkout master" runs whether or not the commit/push succeeded.
+# The recipe uses the literal "build" and "source" paths, i.e. the default
+# BUILDDIR and SOURCEDIR set above.  The nested build goes through $(MAKE).
+gh-pages:
+	git checkout gh-pages
+	rm -rf build
+	rm -rf source
+	git checkout master -- .
+	$(MAKE) html
+	rm -rf ../_modules ../_sources ../_static
+	mv -fv build/html/* ../
+	rm -rf build
+	git add -A
+	git commit -m "Generated gh-pages for `git log master -1 --pretty=short --abbrev-commit`" && git push origin gh-pages ; git checkout master
+
+# help and gh-pages name actions rather than files, so mark them phony; a
+# file that happens to be called "gh-pages" must not make the target look
+# up to date.
+.PHONY: help Makefile gh-pages
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/apex/docs/source/_static/css/pytorch_theme.css b/apex/docs/source/_static/css/pytorch_theme.css
new file mode 100644
index 0000000000000000000000000000000000000000..45e984c905662d6d8497e78c46dbfbc7b1b100c5
--- /dev/null
+++ b/apex/docs/source/_static/css/pytorch_theme.css
@@ -0,0 +1,118 @@
+body {
+    font-family: "Lato","proxima-nova","Helvetica Neue",Arial,sans-serif;
+}
+
+/* Default header fonts are ugly */
+h1, h2, .rst-content .toctree-wrapper p.caption, h3, h4, h5, h6, legend, p.caption {
+    font-family: "Lato","proxima-nova","Helvetica Neue",Arial,sans-serif;
+}
+
+/* Use white for docs background */
+.wy-side-nav-search {
+    background-color: #fff;
+}
+
+.wy-nav-content-wrap, .wy-menu li.current > a  {
+    background-color: #fff;
+}
+
+@media screen and (min-width: 1400px) {
+    .wy-nav-content-wrap {
+        background-color: rgba(0, 0, 0, 0.0470588);
+    }
+
+    .wy-nav-content {
+        background-color: #fff;
+    }
+}
+
+/* Fixes for mobile */
+.wy-nav-top {
+    background-color: #fff;
+    background-image: url('../img/apex.jpg');
+    background-repeat: no-repeat;
+    background-position: center;
+    padding: 0;
+    margin: 0.4045em 0.809em;
+    color: #333;
+}
+
+.wy-nav-top > a {
+    display: none;
+}
+
+@media screen and (max-width: 768px) {
+    .wy-side-nav-search>a img.logo {
+        height: 60px;
+    }
+}
+
+/* This is needed to ensure that logo above search scales properly */
+.wy-side-nav-search a {
+    display: block;
+}
+
+/* This ensures that multiple constructors will remain in separate lines. */
+.rst-content dl:not(.docutils) dt {
+    display: table;
+}
+
+/* Use our red for literals (it's very similar to the original color) */
+.rst-content tt.literal, .rst-content code.literal {
+    color: #F05732;
+}
+
+.rst-content tt.xref, a .rst-content tt,
+.rst-content code.xref, a .rst-content code {
+    color: #404040;
+}
+
+/* Change link colors (except for the menu) */
+
+a {
+    color: #F05732;
+}
+
+a:hover {
+    color: #F05732;
+}
+
+
+a:visited {
+    color: #D44D2C;
+}
+
+.wy-menu a {
+    color: #b3b3b3;
+}
+
+.wy-menu a:hover {
+    color: #b3b3b3;
+}
+
+/* Default footer text is quite big */
+footer {
+    font-size: 80%;
+}
+
+footer .rst-footer-buttons {
+    font-size: 125%; /* revert footer settings - 1/80% = 125% */
+}
+
+footer p {
+    font-size: 100%;
+}
+
+/* For hidden headers that appear in TOC tree */
+/* see http://stackoverflow.com/a/32363545/3343043 */
+.rst-content .hidden-section {
+    display: none;
+}
+
+nav .hidden-section {
+    display: inherit;
+}
+
+.wy-side-nav-search>div.version {
+    color: #000;
+}
diff --git a/apex/docs/source/_static/img/nv-pytorch2.png b/apex/docs/source/_static/img/nv-pytorch2.png
new file mode 100644
index 0000000000000000000000000000000000000000..981268c60032b463b387dd3c9f56ebda9929a266
Binary files /dev/null and b/apex/docs/source/_static/img/nv-pytorch2.png differ
diff --git a/apex/docs/source/_templates/layout.html b/apex/docs/source/_templates/layout.html
new file mode 100644
index 0000000000000000000000000000000000000000..63dfed9c94218f6fa646724a1842f44772213050
--- /dev/null
+++ b/apex/docs/source/_templates/layout.html
@@ -0,0 +1,51 @@
+{% extends "!layout.html" %}
+  {% block sidebartitle %} {{ super() }}
+
+  <style>
+    /* Sidebar header (and topbar for mobile) */
+    .wy-side-nav-search, .wy-nav-top {
+      background: #76b900;
+    }
+
+    .wy-side-nav-search a:link, .wy-nav-top a:link {
+      color: #fff;
+    }
+    .wy-side-nav-search a:visited, .wy-nav-top a:visited {
+      color: #fff;
+    }
+    .wy-side-nav-search a:hover, .wy-nav-top a:hover {
+      color: #fff;
+    }
+
+    .wy-menu-vertical a:link, .wy-menu-vertical a:visited {
+      color: #d9d9d9
+    }
+
+    .wy-menu-vertical a:active {
+      background-color: #76b900
+    }
+
+    .wy-side-nav-search>div.version {
+      color: rgba(0, 0, 0, 0.3)
+    }
+  </style>
+  {% endblock %}
+
+  {% block footer %} {{ super() }}
+
+  <style>
+  a:link, a:visited {
+    color: #76b900;
+  }
+
+  a:hover {
+    color: #8c0;
+  }
+
+  .rst-content dl:not(.docutils) dt {
+    background: rgba(118, 185, 0, 0.1);
+    color: rgba(59,93,0,1);
+    border-top: solid 3px rgba(59,93,0,1);
+  }
+  </style>
+  {% endblock %}
diff --git a/apex/docs/source/advanced.rst b/apex/docs/source/advanced.rst
new file mode 100644
index 0000000000000000000000000000000000000000..37c3ebb65e5e1c1a4c5cfc7f7888d591b028da37
--- /dev/null
+++ b/apex/docs/source/advanced.rst
@@ -0,0 +1,217 @@
+.. role:: hidden
+    :class: hidden-section
+
+Advanced Amp Usage
+===================================
+
+GANs
+----
+
+GANs are an interesting synthesis of several topics below.  A `comprehensive example`_
+is under construction.
+
+.. _`comprehensive example`:
+    https://github.com/NVIDIA/apex/tree/master/examples/dcgan
+
+Gradient clipping
+-----------------
+Amp calls the params owned directly by the optimizer's ``param_groups`` the "master params."
+
+These master params may be fully or partially distinct from ``model.parameters()``.
+For example, with `opt_level="O2"`_, ``amp.initialize`` casts most model params to FP16,
+creates an FP32 master param outside the model for each newly-FP16 model param,
+and updates the optimizer's ``param_groups`` to point to these FP32 params.
+
+The master params owned by the optimizer's ``param_groups`` may also fully coincide with the
+model params, which is typically true for ``opt_level``\s ``O0``, ``O1``, and ``O3``.
+
+In all cases, correct practice is to clip the gradients of the params that are guaranteed to be
+owned **by the optimizer's** ``param_groups``, instead of those retrieved via ``model.parameters()``.
+
+Also, if Amp uses loss scaling, gradients must be clipped after they have been unscaled
+(which occurs during exit from the ``amp.scale_loss`` context manager).
+
+The following pattern should be correct for any ``opt_level``::
+
+    with amp.scale_loss(loss, optimizer) as scaled_loss:
+        scaled_loss.backward()
+        # Gradients are unscaled during context manager exit.
+    # Now it's safe to clip.  Replace
+    # torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
+    # with
+    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_norm)
+    # or
+    torch.nn.utils.clip_grad_value_(amp.master_params(optimizer), max_)
+
+Note the use of the utility function ``amp.master_params(optimizer)``,
+which returns a generator-expression that iterates over the
+params in the optimizer's ``param_groups``.
+
+Also note that ``clip_grad_norm_(amp.master_params(optimizer), max_norm)`` is invoked
+*instead of*, not *in addition to*, ``clip_grad_norm_(model.parameters(), max_norm)``.
+
+.. _`opt_level="O2"`:
+    https://nvidia.github.io/apex/amp.html#o2-fast-mixed-precision
+
+Custom/user-defined autograd functions
+--------------------------------------
+
+The old Amp API for `registering user functions`_ is still considered correct.  Functions must
+be registered before calling ``amp.initialize``.
+
+.. _`registering user functions`:
+    https://github.com/NVIDIA/apex/tree/master/apex/amp#annotating-user-functions
+
+Forcing particular layers/functions to a desired type
+-----------------------------------------------------
+
+I'm still working on a generalizable exposure for this that won't require user-side code divergence
+across different ``opt-level``\ s.
+
+Multiple models/optimizers/losses
+---------------------------------
+
+Initialization with multiple models/optimizers
+**********************************************
+
+``amp.initialize``'s optimizer argument may be a single optimizer or a list of optimizers,
+as long as the output you accept has the same type.
+Similarly, the ``model`` argument may be a single model or a list of models, as long as the accepted
+output matches.  The following calls are all legal::
+
+    model, optim = amp.initialize(model, optim,...)
+    model, [optim0, optim1] = amp.initialize(model, [optim0, optim1],...)
+    [model0, model1], optim = amp.initialize([model0, model1], optim,...)
+    [model0, model1], [optim0, optim1] = amp.initialize([model0, model1], [optim0, optim1],...)
+
+Backward passes with multiple optimizers
+****************************************
+
+Whenever you invoke a backward pass, the ``amp.scale_loss`` context manager must receive
+**all the optimizers that own any params for which the current backward pass is creating gradients.**
+This is true even if each optimizer owns only some, but not all, of the params that are about to
+receive gradients.
+
+If, for a given backward pass, there's only one optimizer whose params are about to receive gradients,
+you may pass that optimizer directly to ``amp.scale_loss``.  Otherwise, you must pass the
+list of optimizers whose params are about to receive gradients::
+
+    # loss0 accumulates gradients only into params owned by optim0:
+    with amp.scale_loss(loss0, optim0) as scaled_loss:
+        scaled_loss.backward()
+
+    # loss1 accumulates gradients only into params owned by optim1:
+    with amp.scale_loss(loss1, optim1) as scaled_loss:
+        scaled_loss.backward()
+
+    # loss2 accumulates gradients into some params owned by optim0
+    # and some params owned by optim1
+    with amp.scale_loss(loss2, [optim0, optim1]) as scaled_loss:
+        scaled_loss.backward()
+
+Optionally have Amp use a different loss scaler per-loss
+********************************************************
+
+By default, Amp maintains a single global loss scaler that will be used for all backward passes
+(all invocations of ``with amp.scale_loss(...)``).  No additional arguments to ``amp.initialize``
+or ``amp.scale_loss`` are required to use the global loss scaler.  The code snippets above with
+multiple optimizers/backward passes use the single global loss scaler under the hood,
+and they should "just work."
+
+However, you can optionally tell Amp to maintain a loss scaler per-loss, which gives Amp increased
+numerical flexibility.  This is accomplished by supplying the ``num_losses`` argument to
+``amp.initialize`` (which tells Amp how many backward passes you plan to invoke, and therefore
+how many loss scalers Amp should create), then supplying the ``loss_id`` argument to each of your
+backward passes (which tells Amp the loss scaler to use for this particular backward pass)::
+
+    model, [optim0, optim1] = amp.initialize(model, [optim0, optim1], ..., num_losses=3)
+
+    with amp.scale_loss(loss0, optim0, loss_id=0) as scaled_loss:
+        scaled_loss.backward()
+
+    with amp.scale_loss(loss1, optim1, loss_id=1) as scaled_loss:
+        scaled_loss.backward()
+
+    with amp.scale_loss(loss2, [optim0, optim1], loss_id=2) as scaled_loss:
+        scaled_loss.backward()
+
+``num_losses`` and ``loss_id``\ s should be specified purely based on the set of
+losses/backward passes.  The use of multiple optimizers, or association of single or
+multiple optimizers with each backward pass, is unrelated.
+
+Gradient accumulation across iterations
+---------------------------------------
+
+The following should "just work," and properly accommodate multiple models/optimizers/losses, as well as
+gradient clipping via the `instructions above`_::
+
+    if iter%iters_to_accumulate == 0:
+        # Every iters_to_accumulate iterations, unscale and step
+        with amp.scale_loss(loss, optimizer) as scaled_loss:
+            scaled_loss.backward()
+        # Gradient clipping if desired:
+        # torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_norm)
+        optimizer.step()
+        optimizer.zero_grad()
+    else:
+        # Otherwise, accumulate gradients, don't unscale or step.
+        with amp.scale_loss(loss, optimizer) as scaled_loss:
+            scaled_loss.backward()
+
+As a minor performance optimization, you can pass ``delay_unscale=True``
+to ``amp.scale_loss`` until you're ready to ``step()``.  You should only attempt ``delay_unscale=True``
+if you're sure you know what you're doing, because the interaction with gradient clipping and
+multiple models/optimizers/losses can become tricky::
+
+    if iter%iters_to_accumulate == 0:
+        # Every iters_to_accumulate iterations, unscale and step
+        with amp.scale_loss(loss, optimizer) as scaled_loss:
+            scaled_loss.backward()
+        optimizer.step()
+        optimizer.zero_grad()
+    else:
+        # Otherwise, accumulate gradients, don't unscale or step.
+        with amp.scale_loss(loss, optimizer, delay_unscale=True) as scaled_loss:
+            scaled_loss.backward()
+
+.. _`instructions above`:
+    https://nvidia.github.io/apex/advanced.html#gradient-clipping
+
+Custom data batch types
+-----------------------
+
+The intention of Amp is that you never need to cast your input data manually, regardless of
+``opt_level``.  Amp accomplishes this by patching any models' ``forward`` methods to cast
+incoming data appropriately for the ``opt_level``.  But to cast incoming data,
+Amp needs to know how.  The patched ``forward`` will recognize and cast floating-point Tensors
+(non-floating-point Tensors like IntTensors are not touched) and
+Python containers of floating-point Tensors.  However, if you wrap your Tensors in a custom class,
+the casting logic doesn't know how to drill
+through the tough custom shell to access and cast the juicy Tensor meat within.  You need to tell
+Amp how to cast your custom batch class, by assigning it a ``to`` method that accepts a ``torch.dtype``
+(e.g., ``torch.float16`` or ``torch.float32``) and returns an instance of the custom batch cast to
+``dtype``.  The patched ``forward`` checks for the presence of your ``to`` method, and will
+invoke it with the correct type for the ``opt_level``.
+
+Example::
+
+    class CustomData(object):
+        def __init__(self):
+            self.tensor = torch.cuda.FloatTensor([1,2,3])
+
+        def to(self, dtype):
+            self.tensor = self.tensor.to(dtype)
+            return self
+
+.. warning::
+
+    Amp also forwards numpy ndarrays without casting them.  If you send input data as a raw, unwrapped
+    ndarray, then later use it to create a Tensor within your ``model.forward``, this Tensor's type will
+    not depend on the ``opt_level``, and may or may not be correct.  Users are encouraged to pass
+    castable data inputs (Tensors, collections of Tensors, or custom classes with a ``to`` method)
+    wherever possible.
+
+.. note::
+
+    Amp does not call ``.cuda()`` on any Tensors for you.  Amp assumes that your original script
+    is already set up to move Tensors from the host to the device as needed.
diff --git a/apex/docs/source/amp.rst b/apex/docs/source/amp.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e2c6bb87cbaf71006ec89b21ce7aaaeeba971a49
--- /dev/null
+++ b/apex/docs/source/amp.rst
@@ -0,0 +1,250 @@
+.. role:: hidden
+    :class: hidden-section
+
+apex.amp
+===================================
+
+This page documents the updated API for Amp (Automatic Mixed Precision),
+a tool to enable Tensor Core-accelerated training in only 3 lines of Python.
+
+A `runnable, comprehensive Imagenet example`_ demonstrating good practices can be found
+on the Github page.
+
+GANs are a tricky case that many people have requested.  A `comprehensive DCGAN example`_
+is under construction.
+
+If you already implemented Amp based on the instructions below, but it isn't behaving as expected,
+please review `Advanced Amp Usage`_ to see if any topics match your use case.  If that doesn't help,
+`file an issue`_.
+
+.. _`file an issue`:
+    https://github.com/NVIDIA/apex/issues
+
+``opt_level``\ s and Properties
+-------------------------------
+
+Amp allows users to easily experiment with different pure and mixed precision modes.
+Commonly-used default modes are chosen by
+selecting an "optimization level" or ``opt_level``; each ``opt_level`` establishes a set of
+properties that govern Amp's implementation of pure or mixed precision training.
+Finer-grained control of how a given ``opt_level`` behaves can be achieved by passing values for
+particular properties directly to ``amp.initialize``.  These manually specified values
+override the defaults established by the ``opt_level``.
+
+Example::
+
+        # Declare model and optimizer as usual, with default (FP32) precision
+        model = torch.nn.Linear(D_in, D_out).cuda()
+        optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
+
+        # Allow Amp to perform casts as required by the opt_level
+        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
+        ...
+        # loss.backward() becomes:
+        with amp.scale_loss(loss, optimizer) as scaled_loss:
+            scaled_loss.backward()
+        ...
+
+Users **should not** manually cast their model or data to ``.half()``, regardless of what ``opt_level``
+or properties are chosen.  Amp intends that users start with an existing default (FP32) script,
+add the three lines corresponding to the Amp API, and begin training with mixed precision.
+Amp can also be disabled, in which case the original script will behave exactly as it used to.
+In this way, there's no risk in adhering to the Amp API, and a lot of potential performance benefit.
+
+.. note::
+    Because it's never necessary to manually cast your model (aside from the call to ``amp.initialize``)
+    or input data, a script that adheres to the new API
+    can switch between different ``opt-level``\ s without having to make any other changes.
+
+.. _`runnable, comprehensive Imagenet example`:
+    https://github.com/NVIDIA/apex/tree/master/examples/imagenet
+
+.. _`comprehensive DCGAN example`:
+    https://github.com/NVIDIA/apex/tree/master/examples/dcgan
+
+.. _`Advanced Amp Usage`:
+    https://nvidia.github.io/apex/advanced.html
+
+Properties
+**********
+
+Currently, the under-the-hood properties that govern pure or mixed precision training are the following:
+
+- ``cast_model_type``:  Casts your model's parameters and buffers to the desired type.
+- ``patch_torch_functions``: Patch all Torch functions and Tensor methods to perform Tensor Core-friendly ops like GEMMs and convolutions in FP16, and any ops that benefit from FP32 precision in FP32.
+- ``keep_batchnorm_fp32``:  To enhance precision and enable cudnn batchnorm (which improves performance), it's often beneficial to keep batchnorm weights in FP32 even if the rest of the model is FP16.
+- ``master_weights``:  Maintain FP32 master weights to accompany any FP16 model weights.  FP32 master weights are stepped by the optimizer to enhance precision and capture small gradients.
+- ``loss_scale``:  If ``loss_scale`` is a float value, use this value as the static (fixed) loss scale.  If ``loss_scale`` is the string ``"dynamic"``, adaptively adjust the loss scale over time.  Dynamic loss scale adjustments are performed by Amp automatically.
+
+Again, you often don't need to specify these properties by hand.  Instead, select an ``opt_level``,
+which will set them up for you.  After selecting an ``opt_level``, you can optionally pass property
+kwargs as manual overrides.
+
+If you attempt to override a property that does not make sense for the selected ``opt_level``,
+Amp will raise an error with an explanation.  For example, selecting ``opt_level="O1"`` combined with
+the override ``master_weights=True`` does not make sense.  ``O1`` inserts casts
+around Torch functions rather than model weights.  Data, activations, and weights are recast
+out-of-place on the fly as they flow through patched functions.  Therefore, the model weights themselves
+can (and should) remain FP32, and there is no need to maintain separate FP32 master weights.
+
+``opt_level``\ s
+****************
+
+Recognized ``opt_level``\ s are ``"O0"``, ``"O1"``, ``"O2"``, and ``"O3"``.
+
+``O0`` and ``O3`` are not true mixed precision, but they are useful for establishing accuracy and
+speed baselines, respectively.
+
+``O1`` and ``O2`` are different implementations of mixed precision.  Try both, and see
+what gives the best speedup and accuracy for your model.
+
+``O0``:  FP32 training
+^^^^^^^^^^^^^^^^^^^^^^
+Your incoming model should be FP32 already, so this is likely a no-op.
+``O0`` can be useful to establish an accuracy baseline.
+
+| Default properties set by ``O0``:
+| ``cast_model_type=torch.float32``
+| ``patch_torch_functions=False``
+| ``keep_batchnorm_fp32=None`` (effectively, "not applicable," everything is FP32)
+| ``master_weights=False``
+| ``loss_scale=1.0``
+|
+|
+
+``O1``:  Mixed Precision (recommended for typical use)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Patch all Torch functions and Tensor methods to cast their inputs according to a whitelist-blacklist
+model.  Whitelist ops (for example, Tensor Core-friendly ops like GEMMs and convolutions) are performed
+in FP16.  Blacklist ops that benefit from FP32 precision (for example, softmax)
+are performed in FP32.  ``O1`` also uses dynamic loss scaling, unless overridden.
+
+| Default properties set by ``O1``:
+| ``cast_model_type=None`` (not applicable)
+| ``patch_torch_functions=True``
+| ``keep_batchnorm_fp32=None`` (again, not applicable, all model weights remain FP32)
+| ``master_weights=None`` (not applicable, model weights remain FP32)
+| ``loss_scale="dynamic"``
+|
+|
+
+``O2``:  "Almost FP16" Mixed Precision
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+``O2`` casts the model weights to FP16,
+patches the model's ``forward`` method to cast input
+data to FP16, keeps batchnorms in FP32, maintains FP32 master weights,
+updates the optimizer's ``param_groups`` so that the ``optimizer.step()``
+acts directly on the FP32 weights (followed by FP32 master weight->FP16 model weight
+copies if necessary),
+and implements dynamic loss scaling (unless overridden).
+Unlike ``O1``, ``O2`` does not patch Torch functions or Tensor methods.
+
+| Default properties set by ``O2``:
+| ``cast_model_type=torch.float16``
+| ``patch_torch_functions=False``
+| ``keep_batchnorm_fp32=True``
+| ``master_weights=True``
+| ``loss_scale="dynamic"``
+|
+|
+
+``O3``:  FP16 training
+^^^^^^^^^^^^^^^^^^^^^^
+``O3`` may not achieve the stability of the true mixed precision options ``O1`` and ``O2``.
+However, it can be useful to establish a speed baseline for your model, against which
+the performance of ``O1`` and ``O2`` can be compared.  If your model uses batch normalization,
+to establish "speed of light" you can try ``O3`` with the additional property override
+``keep_batchnorm_fp32=True`` (which enables cudnn batchnorm, as stated earlier).
+
+| Default properties set by ``O3``:
+| ``cast_model_type=torch.float16``
+| ``patch_torch_functions=False``
+| ``keep_batchnorm_fp32=False``
+| ``master_weights=False``
+| ``loss_scale=1.0``
+|
+|
+
+Unified API
+-----------
+
+.. automodule:: apex.amp
+.. currentmodule:: apex.amp
+
+.. autofunction:: initialize
+
+.. autofunction:: scale_loss
+
+.. autofunction:: master_params
+
+Advanced use cases
+------------------
+
+The unified Amp API supports gradient accumulation across iterations,
+multiple backward passes per iteration, multiple models/optimizers,
+custom/user-defined autograd functions, and custom data batch classes.  Gradient clipping and GANs also
+require special treatment, but this treatment does not need to change
+for different ``opt_level``\ s.  Further details can be found here:
+
+.. toctree::
+   :maxdepth: 1
+
+   advanced
+
+Transition guide for old API users
+----------------------------------
+
+We strongly encourage moving to the new Amp API, because it's more versatile, easier to use, and future-proof.  The original :class:`FP16_Optimizer` and the old "Amp" API are deprecated, and subject to removal at any time.
+
+For users of the old "Amp" API
+******************************
+
+In the new API, ``opt-level O1`` performs the same patching of the Torch namespace as the old thing
+called "Amp."
+However, the new API allows static or dynamic loss scaling, while the old API only allowed dynamic loss scaling.
+
+In the new API, the old call to ``amp_handle = amp.init()``, and the returned ``amp_handle``, are no
+longer exposed or necessary.  The new ``amp.initialize()`` does the duty of ``amp.init()`` (and more).
+Therefore, any existing calls to ``amp_handle = amp.init()`` should be deleted.
+
+The functions formerly exposed through ``amp_handle`` are now free
+functions accessible through the ``amp`` module.
+
+The backward context manager must be changed accordingly::
+
+    # old API
+    with amp_handle.scale_loss(loss, optimizer) as scaled_loss:
+        scaled_loss.backward()
+    ->
+    # new API
+    with amp.scale_loss(loss, optimizer) as scaled_loss:
+        scaled_loss.backward()
+
+For now, the deprecated "Amp" API documentation can still be found on the Github README:  https://github.com/NVIDIA/apex/tree/master/apex/amp.  The old API calls that `annotate user functions`_ to run
+with a particular precision are still honored by the new API.
+
+.. _`annotate user functions`:
+    https://github.com/NVIDIA/apex/tree/master/apex/amp#annotating-user-functions
+
+
+For users of the old FP16_Optimizer
+***********************************
+
+``opt-level O2`` is equivalent to :class:`FP16_Optimizer` with ``dynamic_loss_scale=True``.
+Once again, the backward pass must be changed to the unified version::
+
+    optimizer.backward(loss)
+    ->
+    with amp.scale_loss(loss, optimizer) as scaled_loss:
+        scaled_loss.backward()
+
+One annoying aspect of FP16_Optimizer was that the user had to manually convert their model to half
+(either by calling ``.half()`` on it, or using a function or module wrapper from
+``apex.fp16_utils``), and also manually call ``.half()`` on input data.  **Neither of these is
+necessary in the new API.  No matter what --opt-level
+you choose, you can and should simply build your model and pass input data in the default FP32 format.**
+The new Amp API will perform the right conversions during
+``model, optimizer = amp.initialize(model, optimizer, opt_level=....)`` based on the ``--opt-level``
+and any overridden flags.  Floating point input data may be FP32 or FP16, but you may as well just
+let it be FP32, because the ``model`` returned by ``amp.initialize`` will have its ``forward``
+method patched to cast the input data appropriately.
diff --git a/apex/docs/source/conf.py b/apex/docs/source/conf.py
new file mode 100644
index 0000000000000000000000000000000000000000..4477a28e1352c362d8cd66e25409f803c60a8069
--- /dev/null
+++ b/apex/docs/source/conf.py
@@ -0,0 +1,248 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# PyTorch documentation build configuration file, created by
+# sphinx-quickstart on Fri Dec 23 13:31:47 2016.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+import os
+import sys
+sys.path.insert(0, os.path.abspath('.'))
+# sys.path.insert(0, os.path.abspath('../../apex/parallel/'))
+import apex
+# import multiproc
+import sphinx_rtd_theme
+
+
+# -- General configuration ------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#
+# needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+    'sphinx.ext.autodoc',
+    'sphinx.ext.autosummary',
+    'sphinx.ext.doctest',
+    'sphinx.ext.intersphinx',
+    'sphinx.ext.todo',
+    'sphinx.ext.coverage',
+    'sphinx.ext.mathjax',
+    'sphinx.ext.napoleon',
+    'sphinx.ext.viewcode',
+    'sphinx.ext.extlinks',
+]
+
+napoleon_use_ivar = True
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+#
+# source_suffix = ['.rst', '.md']
+source_suffix = '.rst'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = 'Apex'
+copyright = '2018'
+author = 'Christian Sarofeen, Natalia Gimelshein, Michael Carilli, Raul Puri'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+# TODO: change to [:2] at v1.0
+# version = 'master (' + torch.__version__ + ' )'
+version = '0.1'
+# The full version, including alpha/beta/rc tags.
+# TODO: verify this works as expected
+release = '0.1.0'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = None
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path
+exclude_patterns = []
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# If true, `todo` and `todoList` produce output, else they produce nothing.
+todo_include_todos = True
+
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+html_theme = 'sphinx_rtd_theme'
+html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further.  For a list of options available for each theme, see the
+# documentation.
+#
+html_theme_options = {
+    'collapse_navigation': False,
+    'display_version': True,
+    'logo_only': True,
+}
+
+# html_logo = '_static/img/nv-pytorch2.png'
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# html_style_path = 'css/pytorch_theme.css'
+html_context = {
+    'css_files': [
+        'https://fonts.googleapis.com/css?family=Lato',
+        '_static/css/pytorch_theme.css'
+    ],
+}
+
+
+# -- Options for HTMLHelp output ---------------------------------------------
+
+# Output file base name for HTML help builder.
+# NOTE(review): the name still says "PyTorch" although every other builder
+# below uses "Apex" -- presumably left over from the template; confirm.
+htmlhelp_basename = 'PyTorchdoc'
+
+
+# -- Options for LaTeX output ------------------------------------------------
+
+# Every entry is commented out, so the LaTeX builder runs with its defaults.
+latex_elements = {
+    # The paper size ('letterpaper' or 'a4paper').
+    #
+    # 'papersize': 'letterpaper',
+
+    # The font size ('10pt', '11pt' or '12pt').
+    #
+    # 'pointsize': '10pt',
+
+    # Additional stuff for the LaTeX preamble.
+    #
+    # 'preamble': '',
+
+    # Latex figure (float) alignment
+    #
+    # 'figure_align': 'htbp',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+#  author, documentclass [howto, manual, or own class]).
+# `master_doc` is not defined in this section; it has to come from earlier in
+# the file.
+latex_documents = [
+    (master_doc, 'apex.tex', 'Apex Documentation',
+     'Torch Contributors', 'manual'),
+]
+
+
+# -- Options for manual page output ------------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+# `author` is likewise expected to be defined earlier in the file.
+man_pages = [
+    (master_doc, 'Apex', 'Apex Documentation',
+     [author], 1)
+]
+
+
+# -- Options for Texinfo output ----------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+# NOTE(review): the description is still the sphinx-quickstart placeholder
+# text; replace it with a real one-line summary of Apex.
+texinfo_documents = [
+    (master_doc, 'Apex', 'Apex Documentation',
+     author, 'Apex', 'One line description of project.',
+     'Miscellaneous'),
+]
+
+
+# Intersphinx targets: external projects whose object inventories are used to
+# resolve cross-references.  The 'python' key is the prefix that
+# `patched_make_field` below puts in front of builtin type names.
+# Both entries use the canonical HTTPS documentation roots so the inventory
+# download is neither sent in clear text nor dependent on a redirect from a
+# retired host.
+intersphinx_mapping = {
+    'python': ('https://docs.python.org/3/', None),
+    'numpy': ('https://numpy.org/doc/stable/', None),
+}
+
+# -- A patch that prevents Sphinx from cross-referencing ivar tags -------
+# See http://stackoverflow.com/a/41184353/3343043
+
+from docutils import nodes
+from sphinx.util.docfields import TypedField
+from sphinx import addnodes
+
+
+def patched_make_field(self, types, domain, items, **kw):
+    """Build a typed-field node without cross-referencing the argument names.
+
+    Replacement for ``TypedField.make_field``.  The field argument name is
+    emitted as plain strong literal text instead of being passed through
+    ``make_xrefs``, so no cross-reference is generated for it.
+
+    Args:
+        types: mapping from a field argument name to its list of type nodes.
+            Each entry is popped when it is consumed.
+        domain: domain name, forwarded to ``make_xrefs``.
+        items: sequence of ``(fieldarg, content)`` pairs.
+        **kw: extra keyword arguments, forwarded unchanged to ``make_xrefs``.
+            This catches ``env=None`` needed by newer Sphinx while staying
+            compatible with older releases.
+
+    Returns:
+        A ``nodes.field`` made of the field name and the field body.
+    """
+    # Function-scope import keeps this monkey patch self-contained.
+    import re
+
+    # Prefix a builtin type name with the intersphinx key only when the name
+    # stands alone.  Plain substring replacement also rewrote names that
+    # merely contain one of these words ('dtype' -> 'dpython:type') and
+    # dotted names ('torch.float' -> 'torch.python:float').
+    builtin_name = re.compile(r'(?<![\w.])(int|long|float|type)(?![\w.])')
+
+    def handle_item(fieldarg, content):
+        par = nodes.paragraph()
+        par += addnodes.literal_strong('', fieldarg)  # Patch: this line added
+        # par.extend(self.make_xrefs(self.rolename, domain, fieldarg,
+        #                           addnodes.literal_strong))
+        if fieldarg in types:
+            par += nodes.Text(' (')
+            # NOTE: using .pop() here to prevent a single type node to be
+            # inserted twice into the doctree, which leads to
+            # inconsistencies later when references are resolved
+            fieldtype = types.pop(fieldarg)
+            if len(fieldtype) == 1 and isinstance(fieldtype[0], nodes.Text):
+                typename = u''.join(n.astext() for n in fieldtype)
+                typename = builtin_name.sub(r'python:\1', typename)
+                par.extend(self.make_xrefs(self.typerolename, domain, typename,
+                                           addnodes.literal_emphasis, **kw))
+            else:
+                par += fieldtype
+            par += nodes.Text(')')
+        par += nodes.Text(' -- ')
+        par += content
+        return par
+
+    fieldname = nodes.field_name('', self.label)
+    if len(items) == 1 and self.can_collapse:
+        # A single collapsible item is rendered inline, without a list.
+        fieldarg, content = items[0]
+        bodynode = handle_item(fieldarg, content)
+    else:
+        bodynode = self.list_type()
+        for fieldarg, content in items:
+            bodynode += nodes.list_item('', handle_item(fieldarg, content))
+    fieldbody = nodes.field_body('', bodynode)
+    return nodes.field('', fieldname, fieldbody)
+
+# Install the patch on Sphinx's TypedField.
+TypedField.make_field = patched_make_field
diff --git a/apex/docs/source/fp16_utils.rst b/apex/docs/source/fp16_utils.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b6b3da5f8db9f20b65e2f0f000dde7c59df4af04
--- /dev/null
+++ b/apex/docs/source/fp16_utils.rst
@@ -0,0 +1,59 @@
+.. role:: hidden
+    :class: hidden-section
+
+apex.fp16_utils
+===================================
+
+This submodule contains utilities designed to streamline the mixed precision training recipe 
+presented by NVIDIA `on Parallel Forall`_ and in GTC 2018 Sessions 
+`Training Neural Networks with Mixed Precision: Theory and Practice`_ and 
+`Training Neural Networks with Mixed Precision: Real Examples`_.
+For PyTorch users, the *Real Examples* session in particular is recommended.
+
+Full runnable Python scripts demonstrating ``apex.fp16_utils`` 
+can be found on the GitHub page:
+
+| `Simple FP16_Optimizer demos`_
+|
+| `Distributed Mixed Precision Training with imagenet`_
+|
+| `Mixed Precision Training with word_language_model`_
+|
+|
+
+.. _`on Parallel Forall`:
+    https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/
+.. _`Training Neural Networks with Mixed Precision: Theory and Practice`:
+    http://on-demand.gputechconf.com/gtc/2018/video/S8923/
+.. _`Training Neural Networks with Mixed Precision: Real Examples`:
+    http://on-demand.gputechconf.com/gtc/2018/video/S81012/
+.. _`Simple FP16_Optimizer demos`:
+    https://github.com/NVIDIA/apex/tree/master/examples/FP16_Optimizer_simple
+.. _`Distributed Mixed Precision Training with imagenet`:
+    https://github.com/NVIDIA/apex/tree/master/examples/imagenet
+.. _`Mixed Precision Training with word_language_model`:
+    https://github.com/NVIDIA/apex/tree/master/examples/word_language_model
+
+.. automodule:: apex.fp16_utils
+.. currentmodule:: apex.fp16_utils
+
+Automatic management of master params + loss scaling
+----------------------------------------------------
+
+.. autoclass:: FP16_Optimizer
+    :members:
+
+.. autoclass:: LossScaler
+    :members:
+
+.. autoclass:: DynamicLossScaler
+    :members:
+
+Manual master parameter management
+----------------------------------
+
+.. autofunction:: prep_param_lists
+
+.. autofunction:: master_params_to_model_params
+
+.. autofunction:: model_grads_to_master_grads
diff --git a/apex/docs/source/index.rst b/apex/docs/source/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..cd43c86400891631201d95b3e2359623a65c43e2
--- /dev/null
+++ b/apex/docs/source/index.rst
@@ -0,0 +1,52 @@
+.. PyTorch documentation master file, created by
+   sphinx-quickstart on Fri Dec 23 13:31:47 2016.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+:github_url: https://github.com/nvidia/apex
+
+Apex (A PyTorch Extension)
+===================================
+
+This site contains the API documentation for Apex (https://github.com/nvidia/apex),
+a PyTorch extension with NVIDIA-maintained utilities to streamline mixed precision and distributed training.  Some of the code here will be included in upstream PyTorch eventually. The intention of Apex is to make up-to-date utilities available to users as quickly as possible.
+
+Installation instructions can be found here:  https://github.com/NVIDIA/apex#quick-start.
+
+.. toctree::
+   :maxdepth: 1
+   :caption: AMP:  Automatic Mixed Precision
+
+   amp
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Distributed Training
+
+   parallel
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Fused Optimizers
+
+   optimizers
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Fused Layer Norm
+
+   layernorm
+
+..   .. toctree::
+     :maxdepth: 1
+     :caption: Deprecated mixed precision API
+     fp16_utils
+
+..   reparameterization
+..   RNN
+   
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
diff --git a/apex/docs/source/layernorm.rst b/apex/docs/source/layernorm.rst
new file mode 100644
index 0000000000000000000000000000000000000000..36dcb845b16ca943de1f214496e953e26c2f934c
--- /dev/null
+++ b/apex/docs/source/layernorm.rst
@@ -0,0 +1,14 @@
+.. role:: hidden
+    :class: hidden-section
+
+apex.normalization.fused_layer_norm
+===================================
+
+.. automodule:: apex.normalization
+.. currentmodule:: apex.normalization
+
+.. FusedLayerNorm
+   --------------
+
+.. autoclass:: FusedLayerNorm
+    :members:
diff --git a/apex/docs/source/optimizers.rst b/apex/docs/source/optimizers.rst
new file mode 100644
index 0000000000000000000000000000000000000000..8dbfcb86c8d27cad58182b10858831d40e5c3afb
--- /dev/null
+++ b/apex/docs/source/optimizers.rst
@@ -0,0 +1,14 @@
+.. role:: hidden
+    :class: hidden-section
+
+apex.optimizers
+===================================
+
+.. automodule:: apex.optimizers
+.. currentmodule:: apex.optimizers
+
+.. FusedAdam
+   ----------
+
+.. autoclass:: FusedAdam
+    :members:
diff --git a/apex/docs/source/parallel.rst b/apex/docs/source/parallel.rst
new file mode 100644
index 0000000000000000000000000000000000000000..73759eeb90a0d21367fa961495f852ea0c752ab4
--- /dev/null
+++ b/apex/docs/source/parallel.rst
@@ -0,0 +1,25 @@
+.. role:: hidden
+    :class: hidden-section
+
+apex.parallel
+===================================
+
+.. automodule:: apex.parallel
+.. currentmodule:: apex.parallel
+
+.. DistributedDataParallel
+   ----------
+
+.. autoclass:: DistributedDataParallel
+    :members:
+
+.. autoclass:: Reducer
+    :members:
+
+.. autoclass:: SyncBatchNorm
+    :members:
+
+Utility functions
+----------------------------------
+
+.. autofunction:: convert_syncbn_model
diff --git a/apex/examples/README.md b/apex/examples/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..6cb9231813ee4cd36cc6e94ab238dcd2b3ea77ed
--- /dev/null
+++ b/apex/examples/README.md
@@ -0,0 +1,4 @@
+This directory contains examples illustrating Apex mixed precision and distributed tools.
+
+**Note for users of the pre-unification API**:
+`deprecated_api` contains examples illustrating the old (pre-unified) APIs.  These APIs will be removed soon, and users are strongly encouraged to switch.  The separate mixed precision tools called `Amp` and `FP16_Optimizer` in the old API are exposed via different flags/optimization levels in the new API.
diff --git a/apex/examples/dcgan/README.md b/apex/examples/dcgan/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..9e86fd8fc1eecb066b10814fea5967acbb67fe7b
--- /dev/null
+++ b/apex/examples/dcgan/README.md
@@ -0,0 +1 @@
+Under construction...
diff --git a/apex/examples/docker/Dockerfile b/apex/examples/docker/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..37bed888cdbe8ccc9c95765059a00d26cf2b411a
--- /dev/null
+++ b/apex/examples/docker/Dockerfile
@@ -0,0 +1,16 @@
+# Builds an image that installs Apex from source (with the C++ and CUDA
+# extensions enabled) on top of an existing image.
+#
+# Base image must at least have pytorch and CUDA installed.
+ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:19.03-py3
+FROM $BASE_IMAGE
+# An ARG declared before FROM is not visible to later instructions;
+# re-declare it (without a value) so the echo below can expand it.
+ARG BASE_IMAGE
+RUN echo "Installing Apex on top of ${BASE_IMAGE}"
+# make sure we don't overwrite some existing directory called "apex"
+WORKDIR /tmp/unique_for_apex
+# uninstall Apex if present, twice to make absolutely sure :)
+# `|| :` keeps the build going when there is nothing to uninstall
+# (`:` is the shell no-op that always succeeds).
+RUN pip uninstall -y apex || :
+RUN pip uninstall -y apex || :
+# SHA is something the user can touch to force recreation of this Docker layer,
+# and therefore force cloning of the latest version of Apex
+RUN SHA=ToUcHMe git clone https://github.com/NVIDIA/apex.git
+WORKDIR /tmp/unique_for_apex/apex
+# Install from the cloned checkout, requesting both extension builds.
+RUN pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
+# Leave the final working directory at /workspace.
+WORKDIR /workspace
diff --git a/apex/examples/docker/README.md b/apex/examples/docker/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..ec0f4c0de9f93be250b74c87614479412f1a3bd0
--- /dev/null
+++ b/apex/examples/docker/README.md
@@ -0,0 +1,40 @@
+## Option 1:  Create a new container with Apex
+
+**Dockerfile** installs the latest Apex on top of an existing image.  Run
+```
+docker build -t new_image_with_apex .
+```
+By default, **Dockerfile** uses NVIDIA's Pytorch container as the base image,
+which requires an NVIDIA GPU Cloud (NGC) account.  If you don't have an NGC account, you can sign up for free by following the instructions [here](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html#generating-api-key).
+
+Alternatively, you can supply your own base image via the `BASE_IMAGE` build-arg.
+`BASE_IMAGE` must have Pytorch and Cuda installed.  For example, any
+`-devel` image for Pytorch 1.0 and later from the
+[official Pytorch Dockerhub](https://hub.docker.com/r/pytorch/pytorch) may be used:
+```
+docker build --build-arg BASE_IMAGE=pytorch/pytorch:nightly-devel-cuda10.0-cudnn7 -t new_image_with_apex .
+```
+
+If you want to rebuild your image, and force the latest Apex to be cloned and installed, make any small change to the `SHA` variable in **Dockerfile**.
+
+**Warning:**
+Currently, the non-`-devel` images on the PyTorch Dockerhub do not contain the CUDA compiler `nvcc`.  Therefore,
+images whose names do not contain `-devel` are not eligible candidates for `BASE_IMAGE`.
+
+### Running your Apex container
+
+Like any Cuda-enabled Pytorch container, a container with Apex should be run via [nvidia-docker](https://github.com/NVIDIA/nvidia-docker), for example:
+```
+docker run --runtime=nvidia -it --rm --ipc=host new_image_with_apex
+```
+
+## Option 2:  Install Apex in a running container
+
+Instead of building a new container, you can also `git clone https://github.com/NVIDIA/apex.git` on bare metal and mount the Apex repo into your container at launch by running, for example,
+```
+docker run --runtime=nvidia -it --rm --ipc=host -v /bare/metal/apex:/apex/in/container <base image>
+```
+then go to /apex/in/container within the running container and
+```
+pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
+```
diff --git a/apex/examples/imagenet/README.md b/apex/examples/imagenet/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..ddeac3198d2565e3a4c46eb9dbe6fdb05afe9d20
--- /dev/null
+++ b/apex/examples/imagenet/README.md
@@ -0,0 +1,175 @@
+# Mixed Precision ImageNet Training in PyTorch
+
+`main_amp.py` is based on [https://github.com/pytorch/examples/tree/master/imagenet](https://github.com/pytorch/examples/tree/master/imagenet).
+It implements Automatic Mixed Precision (Amp) training of popular model architectures, such as ResNet, AlexNet, and VGG, on the ImageNet dataset.  Command-line flags forwarded to `amp.initialize` are used to easily manipulate and switch between various pure and mixed precision "optimization levels" or `opt_level`s.  For a detailed explanation of `opt_level`s, see the [updated API guide](https://nvidia.github.io/apex/amp.html).
+
+Three lines enable Amp:
+```
+# Added after model and optimizer construction
+model, optimizer = amp.initialize(model, optimizer, flags...)
+...
+# loss.backward() changed to:
+with amp.scale_loss(loss, optimizer) as scaled_loss:
+    scaled_loss.backward()
+```
+
+With the new Amp API **you never need to explicitly convert your model, or the input data, to half().**
+
+## Requirements
+
+- Download the ImageNet dataset and move validation images to labeled subfolders
+    - The following script may be helpful: https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh
+
+## Training
+
+To train a model, create softlinks to the ImageNet dataset, then run `main_amp.py` with the desired model architecture, as shown in `Example commands` below.
+
+The default learning rate schedule is set for ResNet50.  `main_amp.py` script rescales the learning rate according to the global batch size (number of distributed processes \* per-process minibatch size).
+
+## Example commands
+
+**Note:**  batch size `--b 224` assumes your GPUs have >=16GB of onboard memory.  You may be able to increase this to 256, but that's cutting it close, so it may run out of memory with some PyTorch versions.
+
+**Note:**  All of the following use 4 dataloader subprocesses (`--workers 4`) to reduce potential
+CPU data loading bottlenecks.
+
+**Note:**  `--opt-level` `O1` and `O2` both use dynamic loss scaling by default unless manually overridden.
+`--opt-level` `O0` and `O3` (the "pure" training modes) do not use loss scaling by default.
+`O0` and `O3` can be told to use loss scaling via manual overrides, but using loss scaling with `O0`
+(pure FP32 training) does not really make sense, and will trigger a warning.
+
+Softlink training and validation datasets into the current directory:
+```
+$ ln -sf /data/imagenet/train-jpeg/ train
+$ ln -sf /data/imagenet/val-jpeg/ val
+```
+
+### Summary
+
+Amp allows easy experimentation with various pure and mixed precision options.
+```
+$ python main_amp.py -a resnet50 --b 128 --workers 4 --opt-level O0 ./
+$ python main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O3 ./
+$ python main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O3 --keep-batchnorm-fp32 True ./
+$ python main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O1 ./
+$ python main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O1 --loss-scale 128.0 ./
+$ python -m torch.distributed.launch --nproc_per_node=2 main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O1 ./
+$ python main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O2 ./
+$ python main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O2 --loss-scale 128.0 ./
+$ python -m torch.distributed.launch --nproc_per_node=2 main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O2 ./
+```
+Options are explained below.  Again, the [updated API guide](https://nvidia.github.io/apex/amp.html) provides more detail.
+
+#### `--opt-level O0` (FP32 training) and `O3` (FP16 training)
+
+"Pure FP32" training:
+```
+$ python main_amp.py -a resnet50 --b 128 --workers 4 --opt-level O0 ./
+```
+"Pure FP16" training:
+```
+$ python main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O3 ./
+```
+FP16 training with FP32 batchnorm:
+```
+$ python main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O3 --keep-batchnorm-fp32 True ./
+```
+Keeping the batchnorms in FP32 improves stability and allows Pytorch
+to use cudnn batchnorms, which significantly increases speed in Resnet50.
+
+The `O3` options might not converge, because they are not true mixed precision.
+However, they can be useful to establish "speed of light" performance for
+your model, which provides a baseline for comparison with `O1` and `O2`.
+For Resnet50 in particular, `--opt-level O3 --keep-batchnorm-fp32 True` establishes
+the "speed of light."  (Without `--keep-batchnorm-fp32`, it's slower, because it does
+not use cudnn batchnorm.)
+
+#### `--opt-level O1` ("conservative mixed precision")
+
+`O1` patches Torch functions to cast inputs according to a whitelist-blacklist model.
+FP16-friendly (Tensor Core) ops like gemms and convolutions run in FP16, while ops
+that benefit from FP32, like batchnorm and softmax, run in FP32.
+Also, dynamic loss scaling is used by default.
+```
+$ python main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O1 ./
+```
+`O1` overridden to use static loss scaling:
+```
+$ python main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O1 --loss-scale 128.0 ./
+```
+Distributed training with 2 processes (1 GPU per process, see **Distributed training** below
+for more detail)
+```
+$ python -m torch.distributed.launch --nproc_per_node=2 main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O1 ./
+```
+For best performance, set `--nproc_per_node` equal to the total number of GPUs on the node
+to use all available resources.
+
+#### `--opt-level O2` ("fast mixed precision")
+
+`O2` casts the model to FP16, keeps batchnorms in FP32,
+maintains master weights in FP32, and implements
+dynamic loss scaling by default. (Unlike --opt-level O1, --opt-level O2
+does not patch Torch functions.)
+```
+$ python main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O2 ./
+```
+"Fast mixed precision" overridden to use static loss scaling:
+```
+$ python main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O2 --loss-scale 128.0 ./
+```
+Distributed training with 2 processes (1 GPU per process)
+```
+$ python -m torch.distributed.launch --nproc_per_node=2 main_amp.py -a resnet50 --b 224 --workers 4 --opt-level O2 ./
+```
+
+## Distributed training
+
+`main_amp.py` optionally uses `apex.parallel.DistributedDataParallel` (DDP) for multiprocess training with one GPU per process.
+```
+model = apex.parallel.DistributedDataParallel(model)
+```
+is a drop-in replacement for
+```
+model = torch.nn.parallel.DistributedDataParallel(model,
+                                                  device_ids=[args.local_rank],
+                                                  output_device=args.local_rank)
+```
+(Because Torch DDP permits multiple GPUs per process, it requires you to
+manually specify the device to run on and the output device.
+Apex DDP uses only the current device by default.)
+
+The choice of DDP wrapper (Torch or Apex) is orthogonal to the use of Amp and other Apex tools.  It is safe to use `apex.amp` with either `torch.nn.parallel.DistributedDataParallel` or `apex.parallel.DistributedDataParallel`.  In the future, I may add some features that permit optional tighter integration between `Amp` and `apex.parallel.DistributedDataParallel` for marginal performance benefits, but currently, there's no compelling reason to use Apex DDP versus Torch DDP for most models.
+
+To use DDP with `apex.amp`, the only gotcha is that
+```
+model, optimizer = amp.initialize(model, optimizer, flags...)
+```
+must precede
+```
+model = DDP(model)
+```
+If DDP wrapping occurs before `amp.initialize`, `amp.initialize` will raise an error.
+
+With both Apex DDP and Torch DDP, you must also call `torch.cuda.set_device(args.local_rank)` within
+each process prior to initializing your model or any other tensors.
+More information can be found in the docs for the
+Pytorch multiprocess launcher module [torch.distributed.launch](https://pytorch.org/docs/stable/distributed.html#launch-utility).
+
+`main_amp.py` is written to interact with 
+[torch.distributed.launch](https://pytorch.org/docs/master/distributed.html#launch-utility),
+which spawns multiprocess jobs using the following syntax:
+```
+python -m torch.distributed.launch --nproc_per_node=NUM_GPUS main_amp.py args...
+```
+`NUM_GPUS` should be less than or equal to the number of visible GPU devices on the node.  The use of `torch.distributed.launch` is unrelated to the choice of DDP wrapper.  It is safe to use either apex DDP or torch DDP with `torch.distributed.launch`.
+
+Optionally, one can run imagenet with synchronized batch normalization across processes by adding
+`--sync_bn` to the `args...`
+
+## Deterministic training (for debugging purposes)
+
+Running with the `--deterministic` flag should produce bitwise identical outputs run-to-run,
+regardless of what other options are used (see [Pytorch docs on reproducibility](https://pytorch.org/docs/stable/notes/randomness.html)).
+Since `--deterministic` disables `torch.backends.cudnn.benchmark`, `--deterministic` may
+cause a modest performance decrease.
diff --git a/apex/examples/imagenet/main_amp.py b/apex/examples/imagenet/main_amp.py
new file mode 100644
index 0000000000000000000000000000000000000000..bddf24827e9b55a1991877ebc31b973abef3ce83
--- /dev/null
+++ b/apex/examples/imagenet/main_amp.py
@@ -0,0 +1,522 @@
+import argparse
+import os
+import shutil
+import time
+
+import torch
+import torch.nn as nn
+import torch.nn.parallel
+import torch.backends.cudnn as cudnn
+import torch.distributed as dist
+import torch.optim
+import torch.utils.data
+import torch.utils.data.distributed
+import torchvision.transforms as transforms
+import torchvision.datasets as datasets
+import torchvision.models as models
+
+import numpy as np
+
+try:
+    from apex.parallel import DistributedDataParallel as DDP
+    from apex.fp16_utils import *
+    from apex import amp, optimizers
+    from apex.multi_tensor_apply import multi_tensor_applier
+except ImportError:
+    raise ImportError("Please install apex from https://www.github.com/nvidia/apex to run this example.")
+
+model_names = sorted(name for name in models.__dict__
+                     if name.islower() and not name.startswith("__")
+                     and callable(models.__dict__[name]))
+
+parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
+parser.add_argument('data', metavar='DIR',
+                    help='path to dataset')
+parser.add_argument('--arch', '-a', metavar='ARCH', default='resnet18',
+                    choices=model_names,
+                    help='model architecture: ' +
+                    ' | '.join(model_names) +
+                    ' (default: resnet18)')
+parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
+                    help='number of data loading workers (default: 4)')
+parser.add_argument('--epochs', default=90, type=int, metavar='N',
+                    help='number of total epochs to run')
+parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
+                    help='manual epoch number (useful on restarts)')
+parser.add_argument('-b', '--batch-size', default=256, type=int,
+                    metavar='N', help='mini-batch size per process (default: 256)')
+parser.add_argument('--lr', '--learning-rate', default=0.1, type=float,
+                    metavar='LR', help='Initial learning rate.  Will be scaled by <global batch size>/256: args.lr = args.lr*float(args.batch_size*args.world_size)/256.  A warmup schedule will also be applied over the first 5 epochs.')
+parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
+                    help='momentum')
+parser.add_argument('--weight-decay', '--wd', default=1e-4, type=float,
+                    metavar='W', help='weight decay (default: 1e-4)')
+parser.add_argument('--print-freq', '-p', default=10, type=int,
+                    metavar='N', help='print frequency (default: 10)')
+parser.add_argument('--resume', default='', type=str, metavar='PATH',
+                    help='path to latest checkpoint (default: none)')
+parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
+                    help='evaluate model on validation set')
+parser.add_argument('--pretrained', dest='pretrained', action='store_true',
+                    help='use pre-trained model')
+
+parser.add_argument('--prof', dest='prof', action='store_true',
+                    help='Only run 10 iterations for profiling.')
+parser.add_argument('--deterministic', action='store_true')
+
+parser.add_argument("--local_rank", default=0, type=int)
+parser.add_argument('--sync_bn', action='store_true',
+                    help='enabling apex sync BN.')
+
+parser.add_argument('--opt-level', type=str)
+parser.add_argument('--keep-batchnorm-fp32', type=str, default=None)
+parser.add_argument('--loss-scale', type=str, default=None)
+
+cudnn.benchmark = True
+
+def fast_collate(batch):
+    imgs = [img[0] for img in batch]
+    targets = torch.tensor([target[1] for target in batch], dtype=torch.int64)
+    w = imgs[0].size[0]
+    h = imgs[0].size[1]
+    tensor = torch.zeros( (len(imgs), 3, h, w), dtype=torch.uint8 )
+    for i, img in enumerate(imgs):
+        nump_array = np.asarray(img, dtype=np.uint8)
+        if(nump_array.ndim < 3):
+            nump_array = np.expand_dims(nump_array, axis=-1)
+        nump_array = np.rollaxis(nump_array, 2)
+
+        tensor[i] += torch.from_numpy(nump_array)
+        
+    return tensor, targets
+
+# Best top-1 validation precision seen so far; main() reads and updates it as a global.
+best_prec1 = 0
+# Options are parsed at import time; the functions below read `args` as a module global.
+args = parser.parse_args()
+
+# Echo the Amp-related options (and the Python type of two of them) for the log.
+print("opt_level = {}".format(args.opt_level))
+print("keep_batchnorm_fp32 = {}".format(args.keep_batchnorm_fp32), type(args.keep_batchnorm_fp32))
+print("loss_scale = {}".format(args.loss_scale), type(args.loss_scale))
+
+print("\nCUDNN VERSION: {}\n".format(torch.backends.cudnn.version()))
+
+if args.deterministic:
+    # Override the cudnn.benchmark = True default set above, request deterministic
+    # cudnn behaviour, and seed torch with this process's local rank.
+    cudnn.benchmark = False
+    cudnn.deterministic = True
+    torch.manual_seed(args.local_rank)
+    torch.set_printoptions(precision=10)
+
+def main():
+    global best_prec1, args
+
+    args.distributed = False
+    if 'WORLD_SIZE' in os.environ:
+        args.distributed = int(os.environ['WORLD_SIZE']) > 1
+
+    args.gpu = 0
+    args.world_size = 1
+
+    if args.distributed:
+        args.gpu = args.local_rank
+        torch.cuda.set_device(args.gpu)
+        torch.distributed.init_process_group(backend='nccl',
+                                             init_method='env://')
+        args.world_size = torch.distributed.get_world_size()
+
+    assert torch.backends.cudnn.enabled, "Amp requires cudnn backend to be enabled."
+
+    # create model
+    if args.pretrained:
+        print("=> using pre-trained model '{}'".format(args.arch))
+        model = models.__dict__[args.arch](pretrained=True)
+    else:
+        print("=> creating model '{}'".format(args.arch))
+        model = models.__dict__[args.arch]()
+
+    if args.sync_bn:
+        import apex
+        print("using apex synced BN")
+        model = apex.parallel.convert_syncbn_model(model)
+
+    model = model.cuda()
+
+    # Scale learning rate based on global batch size
+    args.lr = args.lr*float(args.batch_size*args.world_size)/256. 
+    optimizer = torch.optim.SGD(model.parameters(), args.lr,
+                                momentum=args.momentum,
+                                weight_decay=args.weight_decay)
+
+    # Initialize Amp.  Amp accepts either values or strings for the optional override arguments,
+    # for convenient interoperation with argparse.
+    model, optimizer = amp.initialize(model, optimizer,
+                                      opt_level=args.opt_level,
+                                      keep_batchnorm_fp32=args.keep_batchnorm_fp32,
+                                      loss_scale=args.loss_scale
+                                      )
+
+    # For distributed training, wrap the model with apex.parallel.DistributedDataParallel.
+    # This must be done AFTER the call to amp.initialize.  If model = DDP(model) is called
+    # before model, ... = amp.initialize(model, ...), the call to amp.initialize may alter
+    # the types of model's parameters in a way that disrupts or destroys DDP's allreduce hooks.
+    if args.distributed:
+        # By default, apex.parallel.DistributedDataParallel overlaps communication with 
+        # computation in the backward pass.
+        # model = DDP(model)
+        # delay_allreduce delays all communication to the end of the backward pass.
+        model = DDP(model, delay_allreduce=True)
+
+    # define loss function (criterion) and optimizer
+    criterion = nn.CrossEntropyLoss().cuda()
+
+    # Optionally resume from a checkpoint
+    if args.resume:
+        # Use a local scope to avoid dangling references
+        def resume():
+            if os.path.isfile(args.resume):
+                print("=> loading checkpoint '{}'".format(args.resume))
+                checkpoint = torch.load(args.resume, map_location = lambda storage, loc: storage.cuda(args.gpu))
+                args.start_epoch = checkpoint['epoch']
+                best_prec1 = checkpoint['best_prec1']
+                model.load_state_dict(checkpoint['state_dict'])
+                optimizer.load_state_dict(checkpoint['optimizer'])
+                print("=> loaded checkpoint '{}' (epoch {})"
+                      .format(args.resume, checkpoint['epoch']))
+            else:
+                print("=> no checkpoint found at '{}'".format(args.resume))
+        resume()
+
+    # Data loading code
+    traindir = os.path.join(args.data, 'train')
+    valdir = os.path.join(args.data, 'val')
+
+    if(args.arch == "inception_v3"):
+        raise RuntimeError("Currently, inception_v3 is not supported by this example.")
+        # crop_size = 299
+        # val_size = 320 # I chose this value arbitrarily, we can adjust.
+    else:
+        crop_size = 224
+        val_size = 256
+
+    train_dataset = datasets.ImageFolder(
+        traindir,
+        transforms.Compose([
+            transforms.RandomResizedCrop(crop_size),
+            transforms.RandomHorizontalFlip(),
+            # transforms.ToTensor(), Too slow
+            # normalize,
+        ]))
+    val_dataset = datasets.ImageFolder(valdir, transforms.Compose([
+            transforms.Resize(val_size),
+            transforms.CenterCrop(crop_size),
+        ]))
+
+    train_sampler = None
+    val_sampler = None
+    if args.distributed:
+        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
+        val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset)
+
+    train_loader = torch.utils.data.DataLoader(
+        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
+        num_workers=args.workers, pin_memory=True, sampler=train_sampler, collate_fn=fast_collate)
+
+    val_loader = torch.utils.data.DataLoader(
+        val_dataset,
+        batch_size=args.batch_size, shuffle=False,
+        num_workers=args.workers, pin_memory=True,
+        sampler=val_sampler,
+        collate_fn=fast_collate)
+
+    if args.evaluate:
+        validate(val_loader, model, criterion)
+        return
+
+    for epoch in range(args.start_epoch, args.epochs):
+        if args.distributed:
+            train_sampler.set_epoch(epoch)
+
+        # train for one epoch
+        train(train_loader, model, criterion, optimizer, epoch)
+        if args.prof:
+            break
+        # evaluate on validation set
+        prec1 = validate(val_loader, model, criterion)
+
+        # remember best prec@1 and save checkpoint
+        if args.local_rank == 0:
+            is_best = prec1 > best_prec1
+            best_prec1 = max(prec1, best_prec1)
+            save_checkpoint({
+                'epoch': epoch + 1,
+                'arch': args.arch,
+                'state_dict': model.state_dict(),
+                'best_prec1': best_prec1,
+                'optimizer' : optimizer.state_dict(),
+            }, is_best)
+
+class data_prefetcher():
+    def __init__(self, loader):
+        self.loader = iter(loader)
+        self.stream = torch.cuda.Stream()
+        self.mean = torch.tensor([0.485 * 255, 0.456 * 255, 0.406 * 255]).cuda().view(1,3,1,1)
+        self.std = torch.tensor([0.229 * 255, 0.224 * 255, 0.225 * 255]).cuda().view(1,3,1,1)
+        # With Amp, it isn't necessary to manually convert data to half.
+        # if args.fp16:
+        #     self.mean = self.mean.half()
+        #     self.std = self.std.half()
+        self.preload()
+
+    def preload(self):
+        try:
+            self.next_input, self.next_target = next(self.loader)
+        except StopIteration:
+            self.next_input = None
+            self.next_target = None
+            return
+        # if record_stream() doesn't work, another option is to make sure device inputs are created
+        # on the main stream.
+        # self.next_input_gpu = torch.empty_like(self.next_input, device='cuda')
+        # self.next_target_gpu = torch.empty_like(self.next_target, device='cuda')
+        # Need to make sure the memory allocated for next_* is not still in use by the main stream
+        # at the time we start copying to next_*:
+        # self.stream.wait_stream(torch.cuda.current_stream())
+        with torch.cuda.stream(self.stream):
+            self.next_input = self.next_input.cuda(non_blocking=True)
+            self.next_target = self.next_target.cuda(non_blocking=True)
+            # more code for the alternative if record_stream() doesn't work:
+            # copy_ will record the use of the pinned source tensor in this side stream.
+            # self.next_input_gpu.copy_(self.next_input, non_blocking=True)
+            # self.next_target_gpu.copy_(self.next_target, non_blocking=True)
+            # self.next_input = self.next_input_gpu
+            # self.next_target = self.next_target_gpu
+
+            # With Amp, it isn't necessary to manually convert data to half.
+            # if args.fp16:
+            #     self.next_input = self.next_input.half()
+            # else:
+            self.next_input = self.next_input.float()
+            self.next_input = self.next_input.sub_(self.mean).div_(self.std)
+            
+    def next(self):
+        torch.cuda.current_stream().wait_stream(self.stream)
+        input = self.next_input
+        target = self.next_target
+        input.record_stream(torch.cuda.current_stream())
+        target.record_stream(torch.cuda.current_stream())
+        self.preload()
+        return input, target
+
+
+def train(train_loader, model, criterion, optimizer, epoch):
+    batch_time = AverageMeter()
+    losses = AverageMeter()
+    top1 = AverageMeter()
+    top5 = AverageMeter()
+
+    # switch to train mode
+    model.train()
+    end = time.time()
+
+    prefetcher = data_prefetcher(train_loader)
+    input, target = prefetcher.next()
+    i = 0
+    while input is not None:
+        i += 1
+
+        adjust_learning_rate(optimizer, epoch, i, len(train_loader))
+
+        if args.prof:
+            if i > 10:
+                break
+
+        # compute output
+        if args.prof: torch.cuda.nvtx.range_push("forward")
+        output = model(input)
+        if args.prof: torch.cuda.nvtx.range_pop()
+        loss = criterion(output, target)
+
+        # compute gradient and do SGD step
+        optimizer.zero_grad()
+
+        if args.prof: torch.cuda.nvtx.range_push("backward")
+        with amp.scale_loss(loss, optimizer) as scaled_loss:
+            scaled_loss.backward()
+        if args.prof: torch.cuda.nvtx.range_pop()
+
+        # for param in model.parameters():
+        #     print(param.data.double().sum().item(), param.grad.data.double().sum().item())
+
+        if args.prof: torch.cuda.nvtx.range_push("step")
+        optimizer.step()
+        if args.prof: torch.cuda.nvtx.range_pop()
+
+        if i%args.print_freq == 0:
+            # Every print_freq iterations, check the loss, accuracy, and speed.
+            # For best performance, it doesn't make sense to print these metrics every
+            # iteration, since they incur an allreduce and some host<->device syncs.
+
+            # Measure accuracy
+            prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
+   
+            # Average loss and accuracy across processes for logging 
+            if args.distributed:
+                reduced_loss = reduce_tensor(loss.data)
+                prec1 = reduce_tensor(prec1)
+                prec5 = reduce_tensor(prec5)
+            else:
+                reduced_loss = loss.data
+   
+            # to_python_float incurs a host<->device sync
+            losses.update(to_python_float(reduced_loss), input.size(0))
+            top1.update(to_python_float(prec1), input.size(0))
+            top5.update(to_python_float(prec5), input.size(0))
+    
+            torch.cuda.synchronize()
+            batch_time.update((time.time() - end)/args.print_freq)
+            end = time.time()
+
+            if args.local_rank == 0:
+                print('Epoch: [{0}][{1}/{2}]\t'
+                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
+                      'Speed {3:.3f} ({4:.3f})\t'
+                      'Loss {loss.val:.10f} ({loss.avg:.4f})\t'
+                      'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
+                      'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
+                       epoch, i, len(train_loader),
+                       args.world_size*args.batch_size/batch_time.val,
+                       args.world_size*args.batch_size/batch_time.avg,
+                       batch_time=batch_time,
+                       loss=losses, top1=top1, top5=top5))
+
+        input, target = prefetcher.next()
+
+
+def validate(val_loader, model, criterion):
+    batch_time = AverageMeter()
+    losses = AverageMeter()
+    top1 = AverageMeter()
+    top5 = AverageMeter()
+
+    # switch to evaluate mode
+    model.eval()
+
+    end = time.time()
+
+    prefetcher = data_prefetcher(val_loader)
+    input, target = prefetcher.next()
+    i = 0
+    while input is not None:
+        i += 1
+
+        # compute output
+        with torch.no_grad():
+            output = model(input)
+            loss = criterion(output, target)
+
+        # measure accuracy and record loss
+        prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
+
+        if args.distributed:
+            reduced_loss = reduce_tensor(loss.data)
+            prec1 = reduce_tensor(prec1)
+            prec5 = reduce_tensor(prec5)
+        else:
+            reduced_loss = loss.data
+
+        losses.update(to_python_float(reduced_loss), input.size(0))
+        top1.update(to_python_float(prec1), input.size(0))
+        top5.update(to_python_float(prec5), input.size(0))
+
+        # measure elapsed time
+        batch_time.update(time.time() - end)
+        end = time.time()
+
+        # TODO:  Change timings to mirror train().
+        if args.local_rank == 0 and i % args.print_freq == 0:
+            print('Test: [{0}/{1}]\t'
+                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
+                  'Speed {2:.3f} ({3:.3f})\t'
+                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
+                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
+                  'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
+                   i, len(val_loader),
+                   args.world_size * args.batch_size / batch_time.val,
+                   args.world_size * args.batch_size / batch_time.avg,
+                   batch_time=batch_time, loss=losses,
+                   top1=top1, top5=top5))
+
+        input, target = prefetcher.next()
+
+    print(' * Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f}'
+          .format(top1=top1, top5=top5))
+
+    return top1.avg
+
+
+def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
+    torch.save(state, filename)
+    if is_best:
+        shutil.copyfile(filename, 'model_best.pth.tar')
+
+
+class AverageMeter(object):
+    """Computes and stores the average and current value"""
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        self.val = 0
+        self.avg = 0
+        self.sum = 0
+        self.count = 0
+
+    def update(self, val, n=1):
+        self.val = val
+        self.sum += val * n
+        self.count += n
+        self.avg = self.sum / self.count
+
+
+def adjust_learning_rate(optimizer, epoch, step, len_epoch):
+    """LR schedule that should yield 76% converged accuracy with batch size 256"""
+    factor = epoch // 30
+
+    if epoch >= 80:
+        factor = factor + 1
+
+    lr = args.lr*(0.1**factor)
+
+    """Warmup"""
+    if epoch < 5:
+        lr = lr*float(1 + step + epoch*len_epoch)/(5.*len_epoch)
+
+    # if(args.local_rank == 0):
+    #     print("epoch = {}, step = {}, lr = {}".format(epoch, step, lr))
+
+    for param_group in optimizer.param_groups:
+        param_group['lr'] = lr
+
+
+def accuracy(output, target, topk=(1,)):
+    """Computes the precision@k for the specified values of k"""
+    maxk = max(topk)
+    batch_size = target.size(0)
+
+    _, pred = output.topk(maxk, 1, True, True)
+    pred = pred.t()
+    correct = pred.eq(target.view(1, -1).expand_as(pred))
+
+    res = []
+    for k in topk:
+        correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
+        res.append(correct_k.mul_(100.0 / batch_size))
+    return res
+
+
+def reduce_tensor(tensor):
+    rt = tensor.clone()
+    dist.all_reduce(rt, op=dist.reduce_op.SUM)
+    rt /= args.world_size
+    return rt
+
+if __name__ == '__main__':
+    main()
diff --git a/apex/examples/simple/distributed/README.md b/apex/examples/simple/distributed/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..0d939cbbf67234835b1c9afdc84ae42357fee34e
--- /dev/null
+++ b/apex/examples/simple/distributed/README.md
@@ -0,0 +1,13 @@
+**distributed_data_parallel.py** and **run.sh** show an example using Amp with
+[apex.parallel.DistributedDataParallel](https://nvidia.github.io/apex/parallel.html) or
+[torch.nn.parallel.DistributedDataParallel](https://pytorch.org/docs/stable/nn.html#distributeddataparallel)
+and the PyTorch multiprocess launcher script,
+[torch.distributed.launch](https://pytorch.org/docs/master/distributed.html#launch-utility).
+The use of `Amp` with DistributedDataParallel does not need to change from ordinary 
+single-process use.  The only gotcha is that wrapping your model with `DistributedDataParallel` must
+come after the call to `amp.initialize`.  Test via
+```bash
+bash run.sh
+```
+
+**This is intended purely as an instructional example, not a performance showcase.**
diff --git a/apex/examples/simple/distributed/distributed_data_parallel.py b/apex/examples/simple/distributed/distributed_data_parallel.py
new file mode 100644
index 0000000000000000000000000000000000000000..b364405df89b7568efb59e640b40e557d402b277
--- /dev/null
+++ b/apex/examples/simple/distributed/distributed_data_parallel.py
@@ -0,0 +1,65 @@
+"""Minimal example of Amp combined with DistributedDataParallel.
+
+Fits a single Linear layer to fixed random data for 500 iterations.  Runs as an
+ordinary single-process script, or as one of several processes when started
+through torch.distributed.launch (detected via the WORLD_SIZE environment
+variable).
+"""
+import torch
+import argparse
+import os
+from apex import amp
+# FOR DISTRIBUTED: (can also use torch.nn.parallel.DistributedDataParallel instead)
+from apex.parallel import DistributedDataParallel
+
+parser = argparse.ArgumentParser()
+# FOR DISTRIBUTED:  Parse for the local_rank argument, which will be supplied
+# automatically by torch.distributed.launch.
+parser.add_argument("--local_rank", default=0, type=int)
+args = parser.parse_args()
+
+# FOR DISTRIBUTED:  If we are running under torch.distributed.launch,
+# the 'WORLD_SIZE' environment variable will also be set automatically.
+args.distributed = False
+if 'WORLD_SIZE' in os.environ:
+    args.distributed = int(os.environ['WORLD_SIZE']) > 1
+
+if args.distributed:
+    # FOR DISTRIBUTED:  Set the device according to local_rank.
+    torch.cuda.set_device(args.local_rank)
+
+    # FOR DISTRIBUTED:  Initialize the backend.  torch.distributed.launch will provide
+    # environment variables, and requires that you use init_method=`env://`.
+    torch.distributed.init_process_group(backend='nccl',
+                                         init_method='env://')
+
+torch.backends.cudnn.benchmark = True
+
+# Batch size, input feature count and output feature count of the toy problem.
+N, D_in, D_out = 64, 1024, 16
+
+# Each process receives its own batch of "fake input data" and "fake target data."
+# The "training loop" in each process just uses this fake batch over and over.
+# https://github.com/NVIDIA/apex/tree/master/examples/imagenet provides a more realistic
+# example of distributed data sampling for both training and validation.
+x = torch.randn(N, D_in, device='cuda')
+y = torch.randn(N, D_out, device='cuda')
+
+model = torch.nn.Linear(D_in, D_out).cuda()
+optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
+
+# Amp is initialized on the bare model; any DDP wrapping happens afterwards (below).
+model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
+
+if args.distributed:
+    # FOR DISTRIBUTED:  After amp.initialize, wrap the model with
+    # apex.parallel.DistributedDataParallel.
+    model = DistributedDataParallel(model)
+    # torch.nn.parallel.DistributedDataParallel is also fine, with some added args:
+    # model = torch.nn.parallel.DistributedDataParallel(model,
+    #                                                   device_ids=[args.local_rank],
+    #                                                   output_device=args.local_rank)
+
+loss_fn = torch.nn.MSELoss()
+
+for t in range(500):
+    optimizer.zero_grad()
+    y_pred = model(x)
+    loss = loss_fn(y_pred, y)
+    # Backpropagate through the loss object yielded by amp.scale_loss, not `loss` itself.
+    with amp.scale_loss(loss, optimizer) as scaled_loss:
+        scaled_loss.backward()
+    optimizer.step()
+
+# Only the process with local_rank 0 reports, so the result is printed once per node.
+if args.local_rank == 0:
+    print("final loss = ", loss)
diff --git a/apex/examples/simple/distributed/run.sh b/apex/examples/simple/distributed/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..7a2d85f0a464bb880c9aaad5ae0ab32fddb11635
--- /dev/null
+++ b/apex/examples/simple/distributed/run.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+python -m torch.distributed.launch --nproc_per_node=2 distributed_data_parallel.py
diff --git a/apex/setup.py b/apex/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..3deb46d8eac99ba52c4fb2b2bcd945f3a2c11811
--- /dev/null
+++ b/apex/setup.py
@@ -0,0 +1,120 @@
+import torch
+from setuptools import setup, find_packages
+import subprocess
+
+import sys
+
+# A missing GPU is only a warning: the build may be a cross-compile.
+if not torch.cuda.is_available():
+    print("\nWarning: Torch did not find available GPUs on this system.\n",
+          "If your intention is to cross-compile, this is not an error.\n")
+
+# Major and minor components of the installed torch version, as integers.
+print("torch.__version__  = ", torch.__version__)
+TORCH_MAJOR = int(torch.__version__.split('.')[0])
+TORCH_MINOR = int(torch.__version__.split('.')[1])
+
+if TORCH_MAJOR == 0 and TORCH_MINOR < 4:
+      raise RuntimeError("Apex requires Pytorch 0.4 or newer.\n" +
+                         "The latest stable release can be obtained from https://pytorch.org/")
+
+# Filled in below depending on the --cpp_ext / --cuda_ext flags, then passed to setup().
+cmdclass = {}
+ext_modules = []
+
+# Either extension flag needs torch's BuildExtension, which this script only
+# uses with a torch major version of at least 1.
+if "--cpp_ext" in sys.argv or "--cuda_ext" in sys.argv:
+    if TORCH_MAJOR == 0:
+        raise RuntimeError("--cpp_ext requires Pytorch 1.0 or later, "
+                           "found torch.__version__ = {}".format(torch.__version__))
+    from torch.utils.cpp_extension import BuildExtension
+    cmdclass['build_ext'] = BuildExtension
+
+if "--cpp_ext" in sys.argv:
+    from torch.utils.cpp_extension import CppExtension
+    # The custom flag is removed from sys.argv before setup() runs
+    # (presumably so setuptools does not reject it - confirm).
+    sys.argv.remove("--cpp_ext")
+    ext_modules.append(
+        CppExtension('apex_C',
+                     ['csrc/flatten_unflatten.cpp',]))
+
+def check_cuda_torch_binary_vs_bare_metal(cuda_dir):
+    raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True)
+    output = raw_output.split()
+    release_idx = output.index("release") + 1
+    release = output[release_idx].split(".")
+    bare_metal_major = release[0]
+    bare_metal_minor = release[1][0]
+    torch_binary_major = torch.version.cuda.split(".")[0]
+    torch_binary_minor = torch.version.cuda.split(".")[1]
+
+    print("\nCompiling cuda extensions with")
+    print(raw_output + "from " + cuda_dir + "/bin\n")
+
+    if (bare_metal_major != torch_binary_major) or (bare_metal_minor != torch_binary_minor):
+        raise RuntimeError("Cuda extensions are being compiled with a version of Cuda that does " +
+                           "not match the version used to compile Pytorch binaries.  " +
+                           "Pytorch binaries were compiled with Cuda {}.\n".format(torch.version.cuda) +
+                           "In some cases, a minor-version mismatch will not cause later errors:  " +
+                           "https://github.com/NVIDIA/apex/pull/323#discussion_r287021798.  "
+                           "You can try commenting out this check (at your own risk).")
+
+# Register the CUDA extensions only when --cuda_ext was passed on the command line.
+if "--cuda_ext" in sys.argv:
+    from torch.utils.cpp_extension import CUDAExtension
+    # The custom flag is removed from sys.argv before setup() runs
+    # (presumably so setuptools does not reject it - confirm).
+    sys.argv.remove("--cuda_ext")
+
+    if torch.utils.cpp_extension.CUDA_HOME is None:
+        raise RuntimeError("--cuda_ext was requested, but nvcc was not found.  Are you sure your environment has nvcc available?  If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")
+    else:
+        check_cuda_torch_binary_vs_bare_metal(torch.utils.cpp_extension.CUDA_HOME)
+
+        # Set up macros for forward/backward compatibility hack around
+        # https://github.com/pytorch/pytorch/commit/4404762d7dd955383acee92e6f06b48144a0742e
+        # The define is added for torch versions newer than 1.0 and is passed
+        # only to the fused_layer_norm_cuda extension below.
+        version_ge_1_1 = []
+        if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0):
+            version_ge_1_1 = ['-DVERSION_GE_1_1']
+
+        ext_modules.append(
+            CUDAExtension(name='amp_C',
+                          sources=['csrc/amp_C_frontend.cpp',
+                                   'csrc/multi_tensor_scale_kernel.cu',
+                                   'csrc/multi_tensor_axpby_kernel.cu',
+                                   'csrc/multi_tensor_l2norm_kernel.cu',
+                                   'csrc/multi_tensor_lamb_stage_1.cu',
+                                   'csrc/multi_tensor_lamb_stage_2.cu'],
+                          extra_compile_args={'cxx': ['-O3'],
+                                              'nvcc':['-lineinfo',
+                                                      '-O3',
+                                                      # '--resource-usage',
+                                                      '--use_fast_math']}))
+        ext_modules.append(
+            CUDAExtension(name='fused_adam_cuda',
+                          sources=['csrc/fused_adam_cuda.cpp',
+                                   'csrc/fused_adam_cuda_kernel.cu'],
+                          extra_compile_args={'cxx': ['-O3',],
+                                              'nvcc':['-O3',
+                                                      '--use_fast_math']}))
+        ext_modules.append(
+            CUDAExtension(name='syncbn',
+                          sources=['csrc/syncbn.cpp',
+                                   'csrc/welford.cu']))
+        ext_modules.append(
+            CUDAExtension(name='fused_layer_norm_cuda',
+                          sources=['csrc/layer_norm_cuda.cpp',
+                                   'csrc/layer_norm_cuda_kernel.cu'],
+                          extra_compile_args={'cxx': ['-O3'] + version_ge_1_1,
+                                              'nvcc':['-maxrregcount=50',
+                                                      '-O3',
+                                                      '--use_fast_math'] + version_ge_1_1}))
+
+# ext_modules and cmdclass stay empty unless an extension flag was given above.
+setup(
+    name='apex',
+    version='0.1',
+    packages=find_packages(exclude=('build',
+                                    'csrc',
+                                    'include',
+                                    'tests',
+                                    'dist',
+                                    'docs',
+                                    'tests',
+                                    'examples',
+                                    'apex.egg-info',)),
+    description='PyTorch Extensions written by NVIDIA',
+    ext_modules=ext_modules,
+    cmdclass=cmdclass,
+)
diff --git a/apex/tests/L0/run_amp/__init__.py b/apex/tests/L0/run_amp/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/apex/tests/L0/run_amp/test_add_param_group.py b/apex/tests/L0/run_amp/test_add_param_group.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3e90c4330eff32c529f2293af34324691ef6674
--- /dev/null
+++ b/apex/tests/L0/run_amp/test_add_param_group.py
@@ -0,0 +1,148 @@
+import unittest
+
+import functools as ft
+import itertools as it
+
+from apex import amp
+from apex.amp import _amp_state
+import torch
+from torch import nn
+import torch.nn.functional as F
+from torch.nn import Parameter
+
+from utils import common_init, HALF, FLOAT,\
+    ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT
+
+class MyModel(torch.nn.Module):
+    def __init__(self, unique):
+        super(MyModel, self).__init__()
+        self.weight0 = Parameter(unique +
+            torch.arange(2, device='cuda', dtype=torch.float32))
+        self.weight1 = Parameter(1. + unique + torch.arange(2, device='cuda', dtype=torch.float16))
+
+    @staticmethod
+    def ops(input, weight0, weight1):
+        return ((input*(weight0.float()))*(weight1.float())).sum()
+
+    def forward(self, input):
+        return self.ops(input, self.weight0, self.weight1)
+
+
+# Abandon all hope, ye who enter here.
+
+
+class TestAddParamGroup(unittest.TestCase):
+    def setUp(self):
+        self.x = torch.ones((2), device='cuda', dtype=torch.float32)
+        common_init(self)
+
+    def tearDown(self):
+        pass
+
+    def zero_grad(self, models, optimizer, how_to_zero):
+        if how_to_zero == "none":
+            for model in models:
+                for param in model.parameters():
+                    param.grad = None
+        elif how_to_zero == "model":
+            for model in models:
+                model.zero_grad()
+        elif how_to_zero == "optimizer":
+            optimizer.zero_grad()
+
+    def test_add_param_group(self):
+        for opt_level in ("O0", "O1", "O2", "O3"):
+          for zero_before_add in (True, False):
+            for try_accumulation in (True, False):
+              model0 = MyModel(1)
+              model1 = MyModel(2)
+
+              optimizer = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25}],
+                                          momentum=0.125)
+
+              optimizer.zero_grad()
+              loss = model0(self.x)
+              loss.backward()
+              optimizer.step()
+
+              if zero_before_add:
+                  optimizer.zero_grad()
+              optimizer.add_param_group({'params' : model1.parameters(), 'lr' : 0.5})
+              if not zero_before_add:
+                  optimizer.zero_grad()
+
+              loss = model0(self.x) + model1(self.x)
+              loss.backward(retain_graph=try_accumulation)
+              if try_accumulation:
+                  loss.backward()
+              optimizer.step()
+
+              # Once more to make sure the new params pick up momentums properly
+              optimizer.zero_grad()
+              loss = model0(self.x) + model1(self.x)
+              loss.backward(retain_graph=try_accumulation)
+              if try_accumulation:
+                  loss.backward()
+              optimizer.step()
+
+              reference_params = [param.data.clone() for param in model0.parameters()] + \
+                                 [param.data.clone() for param in model1.parameters()]
+
+              for how_to_zero in "none", "model", "optimizer":
+                  model0 = MyModel(1)
+                  model1 = MyModel(2)
+
+                  optimizer = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25}],
+                                              momentum=0.125)
+
+                  _amp_state.allow_incoming_model_not_fp32 = True
+                  [model0, model1], optimizer = amp.initialize([model0, model1],
+                      optimizer,
+                      opt_level=opt_level,
+                      verbosity=0,
+                      cast_model_type=False)
+                  _amp_state.allow_incoming_model_not_fp32 = False
+
+                  _amp_state.loss_scalers[0]._loss_scale = 4.0
+
+                  self.zero_grad([model0, model1], optimizer, how_to_zero)
+                  loss = model0(self.x)
+                  with amp.scale_loss(loss, optimizer) as scaled_loss:
+                      scaled_loss.backward()
+                  optimizer.step()
+
+                  if zero_before_add:
+                      self.zero_grad([model0, model1], optimizer, how_to_zero)
+                  optimizer.add_param_group({'params' : model1.parameters(), 'lr' : 0.5})
+                  if not zero_before_add:
+                      self.zero_grad([model0, model1], optimizer, how_to_zero)
+
+                  loss = model0(self.x) + model1(self.x)
+                  with amp.scale_loss(loss, optimizer) as scaled_loss:
+                      scaled_loss.backward(retain_graph=try_accumulation)
+                  if try_accumulation:
+                      with amp.scale_loss(loss, optimizer) as scaled_loss:
+                          scaled_loss.backward()
+                  optimizer.step()
+
+                  # Once more to make sure the new params pick up momentums properly
+                  self.zero_grad([model0, model1], optimizer, how_to_zero)
+                  loss = model0(self.x) + model1(self.x)
+                  with amp.scale_loss(loss, optimizer) as scaled_loss:
+                      scaled_loss.backward(retain_graph=try_accumulation)
+                  if try_accumulation:
+                      with amp.scale_loss(loss, optimizer) as scaled_loss:
+                          scaled_loss.backward()
+                  optimizer.step()
+
+                  final_params = [param.data.clone() for param in model0.parameters()] + \
+                                 [param.data.clone() for param in model1.parameters()]
+
+                  for reference, final in zip(reference_params, final_params):
+                      self.assertTrue(torch.allclose(reference.to(final.dtype), final),
+                                      "opt_level = {}, how_to_zero = {}, zero_before_add = {}".format(
+                                      opt_level, how_to_zero, zero_before_add))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/apex/tests/L0/run_amp/test_basic_casts.py b/apex/tests/L0/run_amp/test_basic_casts.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ab3fd87cc55c7c3852a90594b744128d97d7f7c
--- /dev/null
+++ b/apex/tests/L0/run_amp/test_basic_casts.py
@@ -0,0 +1,163 @@
+import unittest
+
+import functools as ft
+import itertools as it
+
+from apex import amp
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+from utils import common_init, HALF, FLOAT,\
+    ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT
+
+def run_layer_test(test_case, fns, expected, input_shape, test_backward=True):
+    for fn, typ in it.product(fns, expected.keys()):
+        x = torch.randn(input_shape, dtype=typ).requires_grad_()
+        y = fn(x)
+        test_case.assertEqual(y.type(), expected[typ])
+        if test_backward:
+            y.float().sum().backward()
+            test_case.assertEqual(x.grad.type(), MATCH_INPUT[typ])
+
+class TestBasicCasts(unittest.TestCase):
+    def setUp(self):
+        self.handle = amp.init(enabled=True)
+        common_init(self)
+
+    def tearDown(self):
+        self.handle._deactivate()
+
+    def test_linear_is_half(self):
+        m = nn.Linear(self.h, self.h)
+        f = ft.partial(F.linear, weight=m.weight, bias=m.bias)
+        run_layer_test(self, [m, f], ALWAYS_HALF, (self.b, self.h))
+
+    def test_conv2d_is_half(self):
+        m = nn.Conv2d(self.c, self.c, self.k)
+        f = ft.partial(F.conv2d, weight=m.weight, bias=m.bias)
+        run_layer_test(self, [m, f], ALWAYS_HALF, (self.b, self.c, self.h, self.h))
+
+    def test_softmax_is_float(self):
+        m = nn.Softmax(dim=1)
+        f = ft.partial(F.softmax, dim=1)
+        run_layer_test(self, [m, f], ALWAYS_FLOAT, (self.b, self.h))
+
+    def test_group_norm_is_float(self):
+        m = nn.GroupNorm(num_groups=4, num_channels=self.c)
+        run_layer_test(self, [m], ALWAYS_FLOAT, (self.b, self.c, self.h, self.h))
+
+    def test_mse_loss_is_float(self):
+        shape = (self.b, self.h)
+        target = torch.randn(shape)
+        mod = nn.MSELoss()
+        m = lambda x: mod(x, target)
+        f = ft.partial(F.mse_loss, target=target)
+        run_layer_test(self, [m], ALWAYS_FLOAT, shape)
+
+    def test_relu_is_match(self):
+        run_layer_test(self, [nn.ReLU(), F.relu], MATCH_INPUT, (self.b, self.h))
+
+    def test_batch_norm_is_match(self):
+        m = nn.BatchNorm2d(num_features=self.c)
+        f = ft.partial(F.batch_norm, running_mean=m.running_mean, running_var=m.running_var,
+                       weight=m.weight, bias=m.bias, training=True)
+        run_layer_test(self, [m], MATCH_INPUT, (self.b, self.c, self.h, self.h))
+
+        # Test forward-only for BN inference
+        m.eval()
+        f = ft.partial(F.batch_norm, running_mean=m.running_mean, running_var=m.running_var,
+                       weight=m.weight, bias=m.bias, training=False)
+        run_layer_test(self, [m, f], MATCH_INPUT, (self.b, self.c, self.h, self.h),
+                            test_backward=False)
+
+class TestBannedMethods(unittest.TestCase):
+    def setUp(self):
+        self.handle = amp.init(enabled=True)
+        common_init(self)
+
+    def tearDown(self):
+        self.handle._deactivate()
+
+    def bce_common(self, assertion):
+        shape = (self.b, self.h)
+        target = torch.rand(shape)
+        mod = nn.BCELoss()
+        m = lambda x: mod(x, target)
+        f = ft.partial(F.binary_cross_entropy, target=target)
+        for fn in [m, f]:
+            x = torch.rand(shape, dtype=torch.half)
+            assertion(fn, x)
+
+    def test_bce_raises_by_default(self):
+        assertion = lambda fn, x: self.assertRaises(NotImplementedError, fn, x)
+        self.bce_common(assertion)
+
+    def test_bce_is_float_with_allow_banned(self):
+        self.handle._deactivate()
+        self.handle = amp.init(enabled=True, allow_banned=True)
+        assertion = lambda fn, x: self.assertEqual(fn(x).type(), FLOAT)
+        self.bce_common(assertion)
+
+class TestTensorCasts(unittest.TestCase):
+    def setUp(self):
+        self.handle = amp.init(enabled=True)
+        common_init(self)
+
+    def tearDown(self):
+        self.handle._deactivate()
+
+    def test_matmul_method_is_half(self):
+        other = torch.randn(self.h, self.h)
+        lhs = lambda x: x.matmul(other)
+        rhs = lambda x: other.matmul(x)
+        run_layer_test(self, [lhs, rhs], ALWAYS_HALF, (self.h, self.h))
+
+    def test_matmul_op_is_half(self):
+        other = torch.randn(self.h, self.h)
+        lhs = lambda x: x @ other
+        rhs = lambda x: other @ x
+        run_layer_test(self, [lhs, rhs], ALWAYS_HALF, (self.h, self.h))
+
+    def test_pow_method_is_float(self):
+        fn = lambda x: x.pow(2.)
+        run_layer_test(self, [fn], ALWAYS_FLOAT, (self.b, self.h))
+
+    def test_pow_op_is_float(self):
+        fn = lambda x: x ** 2.
+        run_layer_test(self, [fn], ALWAYS_FLOAT, (self.b, self.h))
+
+    def test_cpu_is_float(self):
+        fn = lambda x: x.cpu()
+        always_cpu_float = {torch.float: 'torch.FloatTensor',
+                            torch.half: 'torch.FloatTensor'}
+        run_layer_test(self, [fn], always_cpu_float, (self.b, self.h))
+
+    def test_sum_is_float(self):
+        fn = lambda x: x.sum()
+        run_layer_test(self, [fn], ALWAYS_FLOAT, (self.b, self.h))
+
+class TestDisabledCasts(unittest.TestCase):
+    def setUp(self):
+        self.handle = amp.init(enabled=False)
+        common_init(self)
+
+    def test_disabled_linear(self):
+        m = nn.Linear(self.h, self.h)
+        f = ft.partial(F.linear, weight=m.weight, bias=m.bias)
+        input_shape = (self.b, self.h)
+
+        for fn in [m, f]:
+            x = torch.randn(input_shape, dtype=torch.float).requires_grad_()
+            y = fn(x)
+            self.assertEqual(y.type(), FLOAT)
+            y.sum().backward()
+            self.assertEqual(x.grad.type(), FLOAT)
+
+            x = torch.randn(input_shape, dtype=torch.half).requires_grad_()
+            self.assertRaises(RuntimeError, fn, x)
+
+    # TODO: maybe more tests on disabled casting?
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/apex/tests/L0/run_amp/test_cache.py b/apex/tests/L0/run_amp/test_cache.py
new file mode 100644
index 0000000000000000000000000000000000000000..b58d2665f7418992ca25e6da2b113a61c47ff9fb
--- /dev/null
+++ b/apex/tests/L0/run_amp/test_cache.py
@@ -0,0 +1,137 @@
+import unittest
+
+import functools as ft
+import itertools as it
+
+from apex import amp
+from apex.amp import _amp_state
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+from utils import common_init, HALF, FLOAT,\
+    ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT
+
+def get_reference_grad(i, w, ops):
+    # Creating new tensors ensures, among other things, that the new tensors are not in the cache.
+    # In fact, they are guaranteed not to use the cache because they are not torch.nn.Parameters.
+    fp32_i = i.detach().clone().float()
+    fp32_w = w.detach().clone().float().requires_grad_()
+    loss = ops(fp32_i, fp32_w)
+    loss.backward()
+    return fp32_w.grad
+
+class WhitelistModule(torch.nn.Module):
+    def __init__(self, dtype):
+        super(WhitelistModule, self).__init__()
+        self.weight = torch.nn.Parameter(torch.arange(8*8, device='cuda', dtype=dtype).view(8,8))
+
+    @staticmethod
+    def ops(input, weight):
+        return (input.mm(weight)).mm(weight).sum()
+
+    def forward(self, input):
+        return self.ops(input, self.weight)
+
+
+class BlacklistModule(torch.nn.Module):
+    def __init__(self, dtype):
+        super(BlacklistModule, self).__init__()
+        self.weight = torch.nn.Parameter(torch.arange(2*8, device='cuda', dtype=dtype).view(2,8))
+
+    @staticmethod
+    def ops(input, weight):
+        return (input + torch.pow(weight, 2) + torch.pow(weight, 2)).sum()
+
+    def forward(self, input):
+        return self.ops(input, self.weight)
+
+
+class PromoteModule(torch.nn.Module):
+    def __init__(self, dtype):
+        super(PromoteModule, self).__init__()
+        self.weight = torch.nn.Parameter(torch.arange(2*8, device='cuda', dtype=dtype).view(2,8))
+
+    @staticmethod
+    def ops(input, weight):
+        return ((input*weight)*weight).sum()
+
+    def forward(self, input):
+        return self.ops(input, self.weight)
+
+class TestCache(unittest.TestCase):
+    def setUp(self):
+        self.x = torch.ones((2, 8), device='cuda', dtype=torch.float32)
+        common_init(self)
+
+    def tearDown(self):
+        pass
+
+    def train_eval_train_test(self, module, t):
+        model = module(t).cuda()
+        optimizer = torch.optim.SGD(model.parameters(), lr=1.0)
+
+        _amp_state.allow_incoming_model_not_fp32 = True
+        model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)
+        _amp_state.allow_incoming_model_not_fp32 = False
+        
+        def training_step():
+            for param in model.parameters():
+                param.grad = None
+        
+            loss = model(self.x).sum()
+            _amp_state.loss_scalers[0]._loss_scale = 4.0
+            with amp.scale_loss(loss, optimizer) as scaled_loss:
+                scaled_loss.backward()
+        
+            self.assertEqual(len([p.grad for p in model.parameters() if p.grad is not None]), 1)
+            self.assertEqual(model.weight.grad.type(), model.weight.type())
+        
+            reference_grad = get_reference_grad(self.x, model.weight, model.ops)
+        
+            # Currently there's no difference in the allclose calls, so no need for branching,
+            # but I'm keeping this in case we want different tolerances for fp16 and fp32 checks. 
+            if model.weight.grad.type() == "torch.cuda.HalfTensor":
+                self.assertTrue(torch.allclose(model.weight.grad.float(), reference_grad))
+            elif model.weight.grad.type() == "torch.cuda.FloatTensor":
+                self.assertTrue(torch.allclose(model.weight.grad.float(), reference_grad))
+            else:
+                raise RuntimeError("model.weight.grad.type = {}".format(model.weight.grad.type()))
+
+            model.weight.data -= 1.
+        
+        # Simulates first epoch
+        training_step()
+        
+        # Simulates eval
+        with torch.no_grad():
+            loss = model(self.x).sum()
+        
+        # Simulates resuming training after eval
+        training_step()
+
+        _amp_state.handle._deactivate()
+   
+    # I could easily have these as a set of for loops in a single test,
+    # instead of going for granularity.
+    def test_whitelist_module_fp16_weight(self):
+        self.train_eval_train_test(WhitelistModule, torch.float16)
+
+    def test_whitelist_module_fp32_weight(self):
+        self.train_eval_train_test(WhitelistModule, torch.float32)
+
+    def test_blacklist_module_fp16_weight(self):
+        self.train_eval_train_test(BlacklistModule, torch.float16)
+
+    def test_blacklist_module_fp32_weight(self):
+        self.train_eval_train_test(BlacklistModule, torch.float32)
+
+    def test_promote_module_fp16_weight(self):
+        self.train_eval_train_test(PromoteModule, torch.float16)
+
+    def test_promote_module_fp32_weight(self):
+        self.train_eval_train_test(PromoteModule, torch.float32)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/apex/tests/L0/run_amp/test_multi_tensor_axpby.py b/apex/tests/L0/run_amp/test_multi_tensor_axpby.py
new file mode 100644
index 0000000000000000000000000000000000000000..5aa89858fc5d19edb6c76f035e9c0ee197002442
--- /dev/null
+++ b/apex/tests/L0/run_amp/test_multi_tensor_axpby.py
@@ -0,0 +1,128 @@
+import unittest
+
+import functools as ft
+import itertools as it
+
+from apex import amp
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+from utils import common_init, HALF, FLOAT,\
+    ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT
+
+try:
+  import amp_C
+  from amp_C import multi_tensor_axpby
+  from apex.multi_tensor_apply import MultiTensorApply
+  disabled = False
+except ImportError as err:
+  print("amp_C fused kernels unavailable, disabling TestMultiTensorApply.  ImportError was ", err)
+  disabled = True
+
+
+class TestMultiTensorAxpby(unittest.TestCase):
+
+    def setUp(self):
+        common_init(self)
+
+        self.a = 2.0
+        self.b = 8.0
+        self.xval = 4.0
+        self.yval = 16.0
+        self.overflow_buf = torch.cuda.IntTensor(1).zero_()
+        self.ref = torch.cuda.FloatTensor([136.0])
+
+    def tearDown(self):
+        pass
+
+    # The tensor creation here is written for convenience, not speed.
+    def axpby(self, sizea, sizeb, applier, repeat_tensors,
+              x_type, y_type, out_type, inplace=False):
+        self.overflow_buf.zero_()
+        t1 = torch.cuda.FloatTensor(sizea).fill_(1.0)
+        t2 = torch.cuda.FloatTensor(sizeb).fill_(1.0)
+
+        y_list = []
+        for i in range(repeat_tensors):
+            y_list += [t1.clone().to(y_type)*self.yval, t2.clone().to(y_type)*self.yval]
+
+        x_list = [x.clone().to(x_type)*(self.xval/self.yval) for x in y_list]
+
+        if inplace:
+            out_list = y_list
+        else:
+            out_list = [out.clone().to(out_type)*3.0 for out in y_list]
+
+        applier(multi_tensor_axpby, self.overflow_buf, [x_list, y_list, out_list], self.a, self.b, -1)
+
+        self.assertTrue(all([torch.allclose(out, self.ref.to(out_type)) for out in out_list]),
+                        msg="{} {} {} {} {} {} {}".format(sizea, sizeb, repeat_tensors,
+                        x_type, y_type, out_type, inplace))
+        self.assertTrue(self.overflow_buf.item() == 0,
+                        msg="{} {} {} {} {} {} {}".format(sizea, sizeb, repeat_tensors,
+                        x_type, y_type, out_type, inplace))
+
+    # def find_inf(self, sizea, sizeb, applier, repeat_tensors, in_type, out_type, t, ind, val, inplace=False):
+    #     self.overflow_buf.zero_()
+    #     a = torch.cuda.FloatTensor(sizea).fill_(self.scale)
+    #     b = torch.cuda.FloatTensor(sizeb).fill_(self.scale)
+
+    #     out_list = []
+    #     for i in range(repeat_tensors):
+    #         out_list += [a.clone().to(out_type), b.clone().to(out_type)]
+
+    #     if inplace:
+    #         in_list = out_list
+    #     else:
+    #         in_list = [out.clone().to(in_type) for out in out_list]
+
+    #     applier(multi_tensor_scale, self.overflow_buf, [in_list, out_list], 1./self.scale)
+
+    #     self.overflow_buf.zero_()
+    #     in_list[t][ind] = val
+    #     applier(multi_tensor_scale, self.overflow_buf, [in_list, out_list], 1./self.scale)
+    #     self.assertTrue(self.overflow_buf.item())
+
+    @unittest.skipIf(disabled, "amp_C is unavailable")
+    def test_fuzz(self):
+        input_size_pairs = (
+            (7777*77, 555*555),
+            (777, 555),
+            (555, 2048*32+1),
+            (2048*32+1, 555),
+            (555, 2048*32),
+            (2048*32, 555),
+            (33333, 555),
+            (555, 33333))
+        appliers = (
+            MultiTensorApply(2048*32),
+            MultiTensorApply(333),
+            MultiTensorApply(33333))
+        repeat_tensors = (
+            1,
+            55)
+
+        for sizea, sizeb in input_size_pairs:
+          for applier in appliers:
+            for repeat in repeat_tensors:
+              for x_type in (torch.float32, torch.float16):
+                for y_type in (torch.float32, torch.float16):
+                  for out_type in (torch.float32, torch.float16):
+                    for inplace in (True, False):
+                      if inplace is True and (y_type is not out_type):
+                        continue
+                      else:
+                        self.axpby(sizea, sizeb, applier, repeat,
+                                   x_type, y_type, out_type, inplace=inplace)
+                      # self.find_inf(sizea, sizeb, applier, repeat, in_type, out_type,
+                      #               0, 0, float('nan'), inplace=inplace)
+                      # self.find_inf(sizea, sizeb, applier, repeat, in_type, out_type,
+                      #               2*repeat-1, sizeb-1, float('inf'), inplace=inplace)
+                      # self.find_inf(sizea, sizeb, applier, repeat, in_type, out_type,
+                      #              2*(repeat//2), sizea//2, float('inf'), inplace=inplace)
+
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/apex/tests/L0/run_amp/test_multi_tensor_l2norm.py b/apex/tests/L0/run_amp/test_multi_tensor_l2norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed3cbd1956c99b68a0a873f5c8dcc4bb27d55e7f
--- /dev/null
+++ b/apex/tests/L0/run_amp/test_multi_tensor_l2norm.py
@@ -0,0 +1,87 @@
+import unittest
+
+import functools as ft
+import itertools as it
+
+from apex import amp
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+from utils import common_init, HALF, FLOAT,\
+    ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT
+
+try:
+  import amp_C
+  from amp_C import multi_tensor_l2norm
+  from apex.multi_tensor_apply import MultiTensorApply
+  disabled = False
+except ImportError as err:
+  print("amp_C fused kernels unavailable, disabling TestMultiTensorApply.  ImportError was ", err)
+  disabled = True
+
+
+class TestMultiTensorL2Norm(unittest.TestCase):
+
+    def setUp(self):
+        common_init(self)
+        self.val = 4.0
+        self.overflow_buf = torch.cuda.IntTensor(1).zero_()
+
+    def tearDown(self):
+        pass
+
+    # The tensor creation here is written for convenience, not speed.
+    def l2norm(self, sizea, sizeb, applier, repeat_tensors, in_type, per_tensor):
+        """Compare the fused L2 norm of repeated constant tensors with torch's own norm()."""
+        self.overflow_buf.zero_()
+        a = torch.cuda.FloatTensor(sizea).fill_(self.val)
+        b = torch.cuda.FloatTensor(sizeb).fill_(self.val)
+        in_list = []
+        for i in range(repeat_tensors):
+            in_list += [a.clone().to(in_type), b.clone().to(in_type)]
+
+        # Pass the caller's per_tensor flag through instead of always requesting
+        # per-tensor norms, so that both settings of the flag are actually tested.
+        # NOTE(review): assumes the op returns a 2-tuple when the flag is False too -- confirm.
+        norm, norm_per_tensor = applier(multi_tensor_l2norm, self.overflow_buf, [in_list], per_tensor)
+        if per_tensor:
+            normab = torch.cat((a.norm().view(1), b.norm().view(1)))
+            norm_per_tensor = norm_per_tensor.view(-1, 2)
+
+        reference = torch.cuda.FloatTensor((sizea + sizeb)*repeat_tensors).fill_(self.val).norm()
+        self.assertTrue(torch.allclose(norm, reference))
+        if per_tensor:
+            self.assertTrue(torch.allclose(norm_per_tensor, normab))
+        self.assertTrue(self.overflow_buf.item() == 0)
+
+    @unittest.skipIf(disabled, "amp_C is unavailable")
+    def test_fuzz(self):
+        input_size_pairs = (
+            (7777*77, 555*555),
+            (777, 555),
+            (555, 2048*32+1),
+            (2048*32+1, 555),
+            (555, 2048*32),
+            (2048*32, 555),
+            (33333, 555),
+            (555, 33333))
+        appliers = (
+            MultiTensorApply(2048*32), 
+            MultiTensorApply(333),
+            MultiTensorApply(33333))
+        repeat_tensors = (
+            1,
+            55)
+
+        for sizea, sizeb in input_size_pairs:
+          for applier in appliers:
+            for repeat in repeat_tensors:
+              for in_type in (torch.float32, torch.float16):
+                for per_tensor in (False, True):
+                  self.l2norm(sizea, sizeb, applier, repeat, in_type, per_tensor)
+
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/apex/tests/L0/run_amp/test_multi_tensor_scale.py b/apex/tests/L0/run_amp/test_multi_tensor_scale.py
new file mode 100644
index 0000000000000000000000000000000000000000..22da2490c6c40b67bf6d4ff7f18f1b1b640d266b
--- /dev/null
+++ b/apex/tests/L0/run_amp/test_multi_tensor_scale.py
@@ -0,0 +1,126 @@
+import unittest
+
+import functools as ft
+import itertools as it
+
+from apex import amp
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+from utils import common_init, HALF, FLOAT,\
+    ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT
+
+try:
+  import amp_C
+  from amp_C import multi_tensor_scale 
+  from apex.multi_tensor_apply import MultiTensorApply
+  disabled = False
+except ImportError as err:
+  print("amp_C fused kernels unavailable, disabling TestMultiTensorApply.  ImportError was ", err)
+  disabled = True
+
+
+class TestMultiTensorScale(unittest.TestCase):
+
+    def setUp(self):
+        common_init(self)
+        self.scale = 4.0
+        self.overflow_buf = torch.cuda.IntTensor(1).zero_()
+        self.ref = torch.cuda.FloatTensor([1.0])
+
+    def tearDown(self):
+        pass
+
+    # The tensor creation here is written for convenience, not speed.
+    def downscale(self, sizea, sizeb, applier, repeat_tensors, in_type, out_type, inplace=False):
+        self.overflow_buf.zero_()
+        a = torch.cuda.FloatTensor(sizea).fill_(self.scale)
+        b = torch.cuda.FloatTensor(sizeb).fill_(self.scale)
+
+        out_list = []
+        for i in range(repeat_tensors):
+            out_list += [a.clone().to(out_type), b.clone().to(out_type)]
+
+        if inplace:
+            in_list = out_list
+        else:
+            in_list = [out.clone().to(in_type) for out in out_list]
+
+        applier(multi_tensor_scale, self.overflow_buf, [in_list, out_list], 1./self.scale)
+
+        self.assertTrue(all([torch.allclose(out, self.ref.to(out_type)) for out in out_list]))
+        self.assertTrue(self.overflow_buf.item() == 0)
+ 
+    def find_inf(self, sizea, sizeb, applier, repeat_tensors, in_type, out_type, t, ind, val, inplace=False):
+        self.overflow_buf.zero_()
+        a = torch.cuda.FloatTensor(sizea).fill_(self.scale)
+        b = torch.cuda.FloatTensor(sizeb).fill_(self.scale)
+
+        out_list = []
+        for i in range(repeat_tensors):
+            out_list += [a.clone().to(out_type), b.clone().to(out_type)]
+
+        if inplace:
+            in_list = out_list
+        else:
+            in_list = [out.clone().to(in_type) for out in out_list]
+
+        applier(multi_tensor_scale, self.overflow_buf, [in_list, out_list], 1./self.scale)
+
+        self.overflow_buf.zero_()
+        in_list[t][ind] = val
+        applier(multi_tensor_scale, self.overflow_buf, [in_list, out_list], 1./self.scale)
+        self.assertTrue(self.overflow_buf.item())
+
+    # Currently, the fused kernel gives a hard error if you attempt to downscale
+    # into fp16 output, which imo is the desired behavior.  Maybe someday we
+    # will learn otherwise.
+    # @unittest.skipIf(disabled, "amp_C is unavailable")
+    # def test_fp16_to_fp16(self):
+    #     self.downscale(self.fp16, self.fp16, self.fp16_ref)
+    # 
+    # @unittest.skipIf(disabled, "amp_C is unavailable")
+    # def test_fp32_to_fp16(self):
+    #     self.downscale(self.fp32, self.fp16, self.fp16_ref)
+
+    @unittest.skipIf(disabled, "amp_C is unavailable")
+    def test_fuzz(self):
+        input_size_pairs = (
+            (7777*77, 555*555),
+            (777, 555),
+            (555, 2048*32+1),
+            (2048*32+1, 555),
+            (555, 2048*32),
+            (2048*32, 555),
+            (33333, 555),
+            (555, 33333))
+        appliers = (
+            MultiTensorApply(2048*32), 
+            MultiTensorApply(333),
+            MultiTensorApply(33333))
+        repeat_tensors = (
+            1,
+            55)
+
+        for sizea, sizeb in input_size_pairs:
+          for applier in appliers:
+            for repeat in repeat_tensors:
+              for in_type in (torch.float32, torch.float16):
+                for out_type in (torch.float32, torch.float16):
+                  for inplace in (True, False):
+                    if inplace is True and (out_type is not in_type):
+                      continue
+                    else:
+                      self.downscale(sizea, sizeb, applier, repeat, in_type, out_type, inplace=inplace)
+                      self.find_inf(sizea, sizeb, applier, repeat, in_type, out_type,
+                                    0, 0, float('nan'), inplace=inplace)
+                      self.find_inf(sizea, sizeb, applier, repeat, in_type, out_type,
+                                    2*repeat-1, sizeb-1, float('inf'), inplace=inplace)
+                      self.find_inf(sizea, sizeb, applier, repeat, in_type, out_type,
+                                   2*(repeat//2), sizea//2, float('inf'), inplace=inplace)
+
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/apex/tests/L0/run_amp/test_multiple_models_optimizers_losses.py b/apex/tests/L0/run_amp/test_multiple_models_optimizers_losses.py
new file mode 100644
index 0000000000000000000000000000000000000000..068c845375f5f909abced39ec84f4f92e1c2a6fc
--- /dev/null
+++ b/apex/tests/L0/run_amp/test_multiple_models_optimizers_losses.py
@@ -0,0 +1,762 @@
+import unittest
+
+import functools as ft
+import itertools as it
+
+from apex import amp
+from apex.amp import _amp_state
+import torch
+from torch import nn
+import torch.nn.functional as F
+from torch.nn import Parameter
+
+from utils import common_init, HALF, FLOAT,\
+    ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT
+
+class MyModel(torch.nn.Module):
+    """Toy CUDA module with an fp32 `weight0` and an fp16 `weight1`, each of length 2."""
+    def __init__(self, unique):
+        super(MyModel, self).__init__()
+        self.weight0 = Parameter(unique + torch.arange(2, device='cuda', dtype=torch.float32))
+        self.weight1 = Parameter(1. + unique + torch.arange(2, device='cuda', dtype=torch.float16))
+
+    @staticmethod
+    def ops(input, weight0, weight1):
+        return (input * weight0.float() * weight1.float()).sum()
+
+    def forward(self, input):
+        return self.ops(input, self.weight0, self.weight1)
+
+# Abandon all hope, ye who enter here.
+
+# This is hands down the ugliest code I have ever written, but it succeeds in testing
+# multiple models/optimizers/losses fairly thoroughly.  Many of the different test cases
+# require slightly divergent code in a way that seems near-impossible to genericize into a simple
+# cross product or nested loops.
+
+class TestMultipleModelsOptimizersLosses(unittest.TestCase):
+    def setUp(self):
+        self.x = torch.ones(2, dtype=torch.float32, device='cuda')  # length-2 fp32 input fed to the models
+        common_init(self)
+
+    def tearDown(self):
+        """Intentionally a no-op: no per-test cleanup is performed."""
+
+    def test_2models2losses1optimizer(self):
+        model0 = MyModel(1)
+        model1 = MyModel(2)
+        # Reference run without amp: one SGD optimizer with a parameter group per model.
+        optimizer = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25},
+                                     {'params' : model1.parameters(), 'lr' : 0.5}],
+                                    momentum=0.125)
+
+        reference_grads = []
+        for i in range(2):
+            optimizer.zero_grad()
+            loss0 = model0(self.x)
+            loss1 = model1(self.x)
+            loss0.backward()
+            loss1.backward()
+            # Snapshot the gradients of both models for this iteration.
+            reference_grads.append([param.grad.data.clone() for param in model0.parameters()] +
+                                   [param.grad.data.clone() for param in model1.parameters()])
+
+            optimizer.step()
+
+        final_params = [param.data.clone() for param in model0.parameters()] + \
+                       [param.data.clone() for param in model1.parameters()]
+        # Repeat the run under amp for every opt level, grad-zeroing method and scaler setup.
+        for opt_level in ("O0", "O1", "O2", "O3"):
+          for how_to_zero in ("none", "model", "optimizer"):
+            for use_multiple_loss_scalers in (True, False):
+              if opt_level == "O1" or opt_level == "O2":
+                  inject_inf_iters = (-1, 0, 1)
+              else:
+                  inject_inf_iters = (-1,)
+              # inject_inf is the iteration that gets an inf gradient; -1 injects nothing.
+              for inject_inf in inject_inf_iters:
+                if inject_inf >= 0:
+                   inject_inf_locs = ("fp16", "fp32")
+                   which_backwards = (0, 1)
+                else:
+                   inject_inf_locs = ("fdsa",)
+                   which_backwards = (None,)
+
+                for inject_inf_loc in inject_inf_locs:
+                  for which_backward in which_backwards:
+                      if use_multiple_loss_scalers:
+                          num_losses = 2
+                          loss_ids = [0, 1]
+                      else:
+                          num_losses = 1
+                          loss_ids = [0, 0]
+                      # A run with an injected inf gets a third iteration, leaving two to compare.
+                      if inject_inf >= 0:
+                          iters = 3
+                      else:
+                          iters = 2
+
+                      model0 = MyModel(1)
+                      model1 = MyModel(2)
+
+                      models = [model0, model1]
+
+                      optimizer = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25},
+                                                   {'params' : model1.parameters(), 'lr' : 0.5}],
+                                                  momentum=0.125)
+                      # NOTE(review): flag presumably needed because MyModel.weight1 is fp16 -- confirm.
+                      _amp_state.allow_incoming_model_not_fp32 = True
+                      [model0, model1], optimizer = amp.initialize(
+                          [model0, model1],
+                          optimizer,
+                          opt_level=opt_level,
+                          verbosity=0,
+                          cast_model_type=False,
+                          num_losses=num_losses)
+                      _amp_state.allow_incoming_model_not_fp32 = False
+                      # Force fixed loss scales: 4.0 for scaler 0 and, if present, 16.0 for scaler 1.
+                      _amp_state.loss_scalers[0]._loss_scale = 4.0
+                      if use_multiple_loss_scalers:
+                          _amp_state.loss_scalers[1]._loss_scale = 16.0
+
+                      unskipped = 0  # number of iterations already compared with the reference
+                      for i in range(iters):
+                          if how_to_zero == "none":
+                              for model in models:
+                                  for param in model.parameters():
+                                      param.grad = None
+                          elif how_to_zero == "model":
+                              for model in models:
+                                  model.zero_grad()
+                          else:
+                              optimizer.zero_grad()
+
+                          loss0 = model0(self.x)
+                          loss1 = model1(self.x)
+                          # After the selected backward, overwrite one gradient element with inf.
+                          with amp.scale_loss(loss0, optimizer, loss_id=loss_ids[0]) as scaled_loss:
+                              scaled_loss.backward()
+                              if i == inject_inf and which_backward == 0:
+                                  if inject_inf_loc == "fp32":
+                                      model0.weight0.grad[0] = float('inf')
+                                  elif inject_inf_loc == "fp16":
+                                      model0.weight1.grad[0] = float('inf')
+                          with amp.scale_loss(loss1, optimizer, loss_id=loss_ids[1]) as scaled_loss:
+                              scaled_loss.backward()
+                              if i == inject_inf and which_backward == 1:
+                                  if inject_inf_loc == "fp32":
+                                      model1.weight0.grad[0] = float('inf')
+                                  elif inject_inf_loc == "fp16":
+                                      model1.weight1.grad[0] = float('inf')
+                          # Iterations without an injected inf must match the reference gradients.
+                          if i != inject_inf:
+                              for param, reference_grad in zip(amp.master_params(optimizer),
+                                                               reference_grads[unskipped]):
+                                  self.assertTrue(torch.allclose(param.grad.float(), reference_grad.float()))
+                              unskipped += 1
+                          optimizer.step()
+                      # Final model params must match the reference run and the master params.
+                      model_params = [p for p in model0.parameters()] + [p for p in model1.parameters()]
+                      for model, master, reference in zip(
+                              model_params,
+                              amp.master_params(optimizer),
+                              final_params):
+                          self.assertTrue(torch.allclose(model, reference))
+                          self.assertTrue(torch.allclose(model, master.to(model.dtype)))
+
+                      if opt_level == "O1":
+                          _amp_state.handle._deactivate()
+
+    def test_3models2losses1optimizer(self):
+        """Check amp against a non-amp reference run: three models, two losses, one optimizer."""
+        model0 = MyModel(1)
+        model1 = MyModel(2)
+        model2 = MyModel(3)
+
+        optimizer = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25},
+                                     {'params' : model1.parameters(), 'lr' : 0.5},
+                                     {'params' : model2.parameters(), 'lr' : 0.125}],
+                                     momentum=0.125)
+
+        reference_grads = []
+        for i in range(2):
+            optimizer.zero_grad()
+            loss0 = model0(self.x) + model2(self.x)
+            loss1 = model1(self.x) + model2(self.x)
+            loss0.backward()
+            loss1.backward()
+
+            reference_grads.append([param.grad.data.clone() for param in model0.parameters()] +
+                                   [param.grad.data.clone() for param in model1.parameters()] +
+                                   [param.grad.data.clone() for param in model2.parameters()])
+
+            optimizer.step()
+
+        # Parameters after the two reference iterations; every amp run below must reproduce them.
+        final_params = [param.data.clone() for param in model0.parameters()] + \
+                       [param.data.clone() for param in model1.parameters()] + \
+                       [param.data.clone() for param in model2.parameters()]
+        # Repeat the run under amp for every opt level, grad-zeroing method and scaler setup.
+        for opt_level in ("O0", "O1", "O2", "O3"):
+          for how_to_zero in ("none", "model", "optimizer"):
+            for use_multiple_loss_scalers in (True, False):
+              if opt_level == "O1" or opt_level == "O2":
+                  inject_inf_iters = (-1, 0, 1)
+              else:
+                  inject_inf_iters = (-1,)
+              # inject_inf is the iteration that gets an inf gradient; -1 injects nothing.
+              for inject_inf in inject_inf_iters:
+                if inject_inf >= 0:
+                   inject_inf_locs = ("fp16", "fp32")
+                   which_backwards = (0, 1)
+                else:
+                   inject_inf_locs = ("fdsa",)
+                   which_backwards = (None,)
+
+                for inject_inf_loc in inject_inf_locs:
+                  for which_backward in which_backwards:
+                    if use_multiple_loss_scalers:
+                        num_losses = 2
+                        loss_ids = [0, 1]
+                    else:
+                        num_losses = 1
+                        loss_ids = [0, 0]
+                    # which_models lists the models that contribute to the loss being poisoned.
+                    if inject_inf >= 0:
+                        iters = 3
+                        if which_backward == 0:
+                            which_models = (0, 2)
+                        elif which_backward == 1:
+                            which_models = (1, 2)
+                    else:
+                        iters = 2
+                        which_models = (None,)
+
+                    for which_model in which_models:
+                        model0 = MyModel(1)
+                        model1 = MyModel(2)
+                        model2 = MyModel(3)
+
+                        models = [model0, model1, model2]
+
+                        optimizer = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25},
+                                                     {'params' : model1.parameters(), 'lr' : 0.5},
+                                                     {'params' : model2.parameters(), 'lr' : 0.125}],
+                                                     momentum=0.125)
+                        # NOTE(review): flag presumably needed because MyModel.weight1 is fp16 -- confirm.
+                        _amp_state.allow_incoming_model_not_fp32 = True
+                        [model0, model1, model2], optimizer = amp.initialize(
+                            [model0, model1, model2],
+                            optimizer,
+                            opt_level=opt_level,
+                            verbosity=0,
+                            cast_model_type=False,
+                            num_losses=num_losses)
+                        _amp_state.allow_incoming_model_not_fp32 = False
+                        # Force fixed loss scales: 4.0 for scaler 0 and, if present, 16.0 for scaler 1.
+                        _amp_state.loss_scalers[0]._loss_scale = 4.0
+                        if use_multiple_loss_scalers:
+                            _amp_state.loss_scalers[1]._loss_scale = 16.0
+
+                        unskipped = 0  # number of iterations already compared with the reference
+                        for i in range(iters):
+                            if how_to_zero == "none":
+                                for model in models:
+                                    for param in model.parameters():
+                                        param.grad = None
+                            elif how_to_zero == "model":
+                                for model in models:
+                                    model.zero_grad()
+                            else:
+                                optimizer.zero_grad()
+
+                            # print("opt_level {} i {} inject_inf {} which_backward {} inject_inf_loc {} which_model {} use_multiple_loss_scalers {}".format(opt_level, i, inject_inf, which_backward, inject_inf_loc, which_model, use_multiple_loss_scalers))
+
+                            loss0 = model0(self.x) + model2(self.x)
+                            loss1 = model1(self.x) + model2(self.x)
+
+                            with amp.scale_loss(loss0, optimizer, loss_id=loss_ids[0]) as scaled_loss:
+                                scaled_loss.backward()
+                                if i == inject_inf and which_backward == 0:
+                                    if which_model == 0:
+                                        inj_model = model0
+                                    elif which_model == 2:
+                                        inj_model = model2
+                                    else:
+                                        raise RuntimeError("{} invalid for loss 0".format(which_model))
+                                    if inject_inf_loc == "fp32":
+                                        inj_model.weight0.grad[0] = float('inf')
+                                    elif inject_inf_loc == "fp16":
+                                        inj_model.weight1.grad[0] = float('inf')
+                            with amp.scale_loss(loss1, optimizer, loss_id=loss_ids[1]) as scaled_loss:
+                                scaled_loss.backward()
+                                if i == inject_inf and which_backward == 1:
+                                    if which_model == 1:
+                                        inj_model = model1
+                                    elif which_model == 2:
+                                        inj_model = model2
+                                    else:
+                                        raise RuntimeError("{} invalid for loss 1 ".format(which_model))
+                                    if inject_inf_loc == "fp32":
+                                        inj_model.weight0.grad[0] = float('inf')
+                                    elif inject_inf_loc == "fp16":
+                                        inj_model.weight1.grad[0] = float('inf')
+                            # Iterations without an injected inf must match the reference gradients.
+                            if i != inject_inf:
+                                for param, reference_grad in zip(amp.master_params(optimizer),
+                                                                 reference_grads[unskipped]):
+                                    self.assertTrue(torch.allclose(param.grad.float(), reference_grad.float()))
+                                unskipped += 1
+
+                            optimizer.step()
+                        # Final model params must match the reference run and the master params.
+                        model_params = [p for p in model0.parameters()] + \
+                                       [p for p in model1.parameters()] + \
+                                       [p for p in model2.parameters()]
+                        for model, master, reference in zip(
+                                model_params,
+                                amp.master_params(optimizer),
+                                final_params):
+                            self.assertTrue(torch.allclose(model, reference))
+                            self.assertTrue(torch.allclose(model, master.to(model.dtype)))
+
+                        if opt_level == "O1":
+                            _amp_state.handle._deactivate()
+
+    def test_2models2losses2optimizers(self):
+        model0 = MyModel(1)
+        model1 = MyModel(2)
+        # Reference optimizers without amp: one SGD instance per model.
+        optimizer0 = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25}],
+                                      momentum=0.125)
+        optimizer1 = torch.optim.SGD([{'params' : model1.parameters(), 'lr' : 0.5}],
+                                      momentum=0.25)
+
+        # Don't do it like this:  reference_grads = [[]]*5
+        # because then it creates a list of 5 references to the same "[]" and appending
+        # to any of them effectively makes you append to all of them, which multiplies
+        # the resulting size of reference_grads by 5x and needless to say makes the test fail.
+        reference_grads = [[], [], [], [], []]
+        final_params = [None, None, None, None, None]
+        for i in range(2):
+            optimizer0.zero_grad()
+            optimizer1.zero_grad()
+            loss0 = model0(self.x)
+            loss1 = model1(self.x)
+            loss0.backward()
+            loss1.backward()
+
+            reference_grads[0].append([param.grad.data.clone() for param in model0.parameters()] +
+                                   [param.grad.data.clone() for param in model1.parameters()])
+
+            optimizer0.step()
+            optimizer1.step()
+
+        final_params[0] = [param.data.clone() for param in model0.parameters()] + \
+                          [param.data.clone() for param in model1.parameters()]
+        # Map a skipped (iteration, backward) pair to reference slot 1-4; anything else is slot 0.
+        def what_got_skipped(which_iter, which_backward):
+            if which_iter == 0 and which_backward == 0:
+                return 1
+            if which_iter == 0 and which_backward == 1:
+                return 2
+            if which_iter == 1 and which_backward == 0:
+                return 3
+            if which_iter == 1 and which_backward == 1:
+                return 4
+            return 0
+        # Build one non-amp reference run per skipped (iteration, backward) combination.
+        for which_iter in (0,1):
+            for which_backward in (0,1):
+                model0 = MyModel(1)
+                model1 = MyModel(2)
+
+                optimizer0 = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25}],
+                                              momentum=0.125)
+                optimizer1 = torch.optim.SGD([{'params' : model1.parameters(), 'lr' : 0.5}],
+                                              momentum=0.25)
+
+                for i in range(3):
+                    optimizer0.zero_grad()
+                    optimizer1.zero_grad()
+                    loss0 = model0(self.x)
+                    loss1 = model1(self.x)
+                    loss0.backward()
+                    loss1.backward()
+
+                    if i != which_iter:
+                        reference_grads[what_got_skipped(which_iter, which_backward)].append(
+                            [param.grad.data.clone() for param in model0.parameters()] +
+                            [param.grad.data.clone() for param in model1.parameters()])
+                    # On the skipped iteration only the other loss's optimizer takes a step.
+                    if i == which_iter:
+                        if which_backward == 0:
+                            optimizer1.step()
+                        else:
+                            optimizer0.step()
+                    else:
+                        optimizer0.step()
+                        optimizer1.step()
+
+                final_params[what_got_skipped(which_iter, which_backward)] = \
+                    [param.data.clone() for param in model0.parameters()] + \
+                    [param.data.clone() for param in model1.parameters()]
+        # Repeat the run under amp for every opt level, grad-zeroing method and scaler setup.
+        for opt_level in ("O0", "O1", "O2", "O3"):
+          for how_to_zero in ("none", "model", "optimizer"):
+            for use_multiple_loss_scalers in (True, False):
+              if opt_level == "O1" or opt_level == "O2":
+                  inject_inf_iters = (-1, 0, 1)
+              else:
+                  inject_inf_iters = (-1,)
+              # inject_inf is the iteration that gets an inf gradient; -1 injects nothing.
+              for inject_inf in inject_inf_iters:
+                if inject_inf >= 0:
+                   inject_inf_locs = ("fp16", "fp32")
+                   which_backwards = (0, 1)
+                else:
+                   inject_inf_locs = ("fdsa",)
+                   which_backwards = (None,)
+
+                for inject_inf_loc in inject_inf_locs:
+                  for which_backward in which_backwards:
+                      if use_multiple_loss_scalers:
+                          num_losses = 2
+                          loss_ids = [0, 1]
+                      else:
+                          num_losses = 1
+                          loss_ids = [0, 0]
+                      # A run with an injected inf gets a third iteration, leaving two to compare.
+                      if inject_inf >= 0:
+                          iters = 3
+                      else:
+                          iters = 2
+
+                      model0 = MyModel(1)
+                      model1 = MyModel(2)
+
+                      models = [model0, model1]
+
+                      optimizer0 = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25}],
+                                                    momentum=0.125)
+                      optimizer1 = torch.optim.SGD([{'params' : model1.parameters(), 'lr' : 0.5}],
+                                                    momentum=0.25)
+                      # NOTE(review): flag presumably needed because MyModel.weight1 is fp16 -- confirm.
+                      _amp_state.allow_incoming_model_not_fp32 = True
+                      [model0, model1], [optimizer0, optimizer1] = amp.initialize(
+                          [model0, model1],
+                          [optimizer0, optimizer1],
+                          opt_level=opt_level,
+                          verbosity=0,
+                          cast_model_type=False,
+                          num_losses=num_losses)
+                      _amp_state.allow_incoming_model_not_fp32 = False
+                      # Force fixed loss scales: 4.0 for scaler 0 and, if present, 16.0 for scaler 1.
+                      _amp_state.loss_scalers[0]._loss_scale = 4.0
+                      if use_multiple_loss_scalers:
+                          _amp_state.loss_scalers[1]._loss_scale = 16.0
+
+                      unskipped = 0  # number of iterations already compared with the reference
+                      for i in range(iters):
+                          if how_to_zero == "none":
+                              for model in models:
+                                  for param in model.parameters():
+                                      param.grad = None
+                          elif how_to_zero == "model":
+                              for model in models:
+                                  model.zero_grad()
+                          else:
+                              optimizer0.zero_grad()
+                              optimizer1.zero_grad()
+
+                          loss0 = model0(self.x)
+                          loss1 = model1(self.x)
+                          # After the selected backward, overwrite one gradient element with inf.
+                          with amp.scale_loss(loss0, optimizer0, loss_id=loss_ids[0]) as scaled_loss:
+                              scaled_loss.backward()
+                              if i == inject_inf and which_backward == 0:
+                                  if inject_inf_loc == "fp32":
+                                      model0.weight0.grad[0] = float('inf')
+                                  elif inject_inf_loc == "fp16":
+                                      model0.weight1.grad[0] = float('inf')
+                          with amp.scale_loss(loss1, optimizer1, loss_id=loss_ids[1]) as scaled_loss:
+                              scaled_loss.backward()
+                              if i == inject_inf and which_backward == 1:
+                                  if inject_inf_loc == "fp32":
+                                      model1.weight0.grad[0] = float('inf')
+                                  elif inject_inf_loc == "fp16":
+                                      model1.weight1.grad[0] = float('inf')
+
+                          # print("opt_level {} i {} inject_inf {} which_backward {} inject_inf_loc {} use_multiple_loss_scalers {}".format(opt_level, i, inject_inf, which_backward, inject_inf_loc, use_multiple_loss_scalers))
+                          # Compare with the reference run that skipped the same (iteration, backward).
+                          if i != inject_inf:
+                              master_params = list(amp.master_params(optimizer0)) + \
+                                              list(amp.master_params(optimizer1))
+                              for param, reference_grad in zip(master_params,
+                                      reference_grads[what_got_skipped(inject_inf, which_backward)][unskipped]):
+                                  self.assertTrue(torch.allclose(param.grad.float(), reference_grad.float()))
+                              unskipped += 1
+
+                          optimizer0.step()
+                          optimizer1.step()
+                      # Final model params must match the matching reference run and the master params.
+                      model_params = [p for p in model0.parameters()] + [p for p in model1.parameters()]
+                      master_params = [p for p in amp.master_params(optimizer0)] + \
+                                      [p for p in amp.master_params(optimizer1)]
+                      for model, master, reference in zip(
+                              model_params,
+                              master_params,
+                              final_params[what_got_skipped(inject_inf, which_backward)]):
+                          self.assertTrue(torch.allclose(model, reference))
+                          self.assertTrue(torch.allclose(model, master.to(model.dtype)))
+
+                      if opt_level == "O1":
+                          _amp_state.handle._deactivate()
+
+    def test_3models2losses2optimizers(self):
+        model0 = MyModel(1)
+        model1 = MyModel(2)
+        model2 = MyModel(3)
+
+        optimizer0 = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25},
+                                      {'params' : model1.parameters(), 'lr' : 1.0}],
+                                     momentum=0.5)
+        optimizer1 = torch.optim.SGD([{'params' : model2.parameters(), 'lr' : 0.5}],
+                                     momentum=0.25)
+
+        # Again, can't do this:  reference_grads = [[]]*9
+        reference_grads = [[], [], [], [], [], [], [], [], []]
+        final_params = [None, None, None, None, None, None, None, None, None]
+        for i in range(2):
+            optimizer0.zero_grad()
+            optimizer1.zero_grad()
+            loss0 = model0(self.x) + model1(self.x)
+            loss1 = model2(self.x) + model1(self.x)
+            loss0.backward()
+            loss1.backward()
+
+            reference_grads[0].append([param.grad.data.clone() for param in model0.parameters()] +
+                                   [param.grad.data.clone() for param in model1.parameters()])
+
+            optimizer0.step()
+            optimizer1.step()
+
+        final_params[0] = \
+            [param.data.clone() for param in model0.parameters()] + \
+            [param.data.clone() for param in model1.parameters()] + \
+            [param.data.clone() for param in model2.parameters()]
+
+        def what_got_skipped(which_iter, which_backward, which_model):
+            if which_iter == 0:
+                if which_backward == 0:
+                    if which_model == 0:
+                        return 1
+                    if which_model == 1:
+                        return 2
+                if which_backward == 1:
+                    if which_model == 2:
+                        return 3
+                    if which_model == 1:
+                        return 4
+            if which_iter == 1:
+                if which_backward == 0:
+                    if which_model == 0:
+                        return 5
+                    if which_model == 1:
+                        return 6
+                if which_backward == 1:
+                    if which_model == 2:
+                        return 7
+                    if which_model == 1:
+                        return 8
+            return 0
+
+        for which_iter in (0,1):
+            for which_backward in (0,1):
+                if which_backward == 0:
+                    which_models = (0,1)
+                if which_backward == 1:
+                    which_models = (2,1)
+                for which_model in which_models:
+
+                    model0 = MyModel(1)
+                    model1 = MyModel(2)
+                    model2 = MyModel(3)
+
+                    optimizer0 = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25},
+                                                  {'params' : model1.parameters(), 'lr' : 1.0}],
+                                                 momentum=0.5)
+                    optimizer1 = torch.optim.SGD([{'params' : model2.parameters(), 'lr' : 0.5}],
+                                                 momentum=0.25)
+
+                    for i in range(3):
+                        optimizer0.zero_grad()
+                        optimizer1.zero_grad()
+                        loss0 = model0(self.x) + model1(self.x)
+                        loss1 = model2(self.x) + model1(self.x)
+                        loss0.backward()
+                        loss1.backward()
+
+                        if i != which_iter:
+                            reference_grads[what_got_skipped(which_iter,
+                                    which_backward, which_model)].append(
+                                [param.grad.data.clone() for param in model0.parameters()] +
+                                [param.grad.data.clone() for param in model1.parameters()])
+
+                        if i == which_iter:
+                            if which_backward == 0:
+                                # if which_model == 0:
+                                    optimizer1.step()
+                                # if which_model == 1:
+                                #     optimizer1.step()
+                            if which_backward == 1:
+                                # if which_model == 2:
+                                #     optimizer0.step()
+                                # if which_model == 1:
+                                    continue
+                        else:
+                            optimizer0.step()
+                            optimizer1.step()
+
+                    final_params[what_got_skipped(which_iter, which_backward, which_model)] = \
+                        [param.data.clone() for param in model0.parameters()] + \
+                        [param.data.clone() for param in model1.parameters()] + \
+                        [param.data.clone() for param in model2.parameters()]
+
+        for opt_level in ("O0", "O1", "O2", "O3"):
+          for how_to_zero in ("none", "model", "optimizer"):
+            for use_multiple_loss_scalers in (True, False):
+              if opt_level == "O1" or opt_level == "O2":
+                  inject_inf_iters = (-1, 0, 1)
+              else:
+                  inject_inf_iters = (-1,)
+
+              for inject_inf in inject_inf_iters:
+                if inject_inf >= 0:
+                   inject_inf_locs = ("fp16", "fp32")
+                   which_backwards = (0, 1)
+                else:
+                   inject_inf_locs = ("fdsa",)
+                   which_backwards = (None,)
+
+                for inject_inf_loc in inject_inf_locs:
+                  for which_backward in which_backwards:
+                    if use_multiple_loss_scalers:
+                        num_losses = 2
+                        loss_ids = [0, 1]
+                    else:
+                        num_losses = 1
+                        loss_ids = [0, 0]
+
+                    if inject_inf >= 0:
+                        iters = 3
+                        if which_backward == 0:
+                            which_models = (0, 1)
+                        elif which_backward == 1:
+                            which_models = (2, 1)
+                    else:
+                        iters = 2
+                        which_models = (None,)
+
+                    for which_model in which_models:
+                        model0 = MyModel(1)
+                        model1 = MyModel(2)
+                        model2 = MyModel(3)
+
+                        models = [model0, model1, model2]
+
+                        optimizer0 = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25},
+                                                      {'params' : model1.parameters(), 'lr' : 1.0}],
+                                                     momentum=0.5)
+                        optimizer1 = torch.optim.SGD([{'params' : model2.parameters(), 'lr' : 0.5}],
+                                                     momentum=0.25)
+
+                        _amp_state.allow_incoming_model_not_fp32 = True
+                        [model0, model1, model2], [optimizer0, optimizer1] = amp.initialize(
+                            [model0, model1, model2],
+                            [optimizer0, optimizer1],
+                            opt_level=opt_level,
+                            verbosity=0,
+                            cast_model_type=False,
+                            num_losses=num_losses)
+                        _amp_state.allow_incoming_model_not_fp32 = False
+
+                        _amp_state.loss_scalers[0]._loss_scale = 4.0
+                        if use_multiple_loss_scalers:
+                            _amp_state.loss_scalers[1]._loss_scale = 16.0
+
+                        unskipped = 0
+                        for i in range(iters):
+                            if how_to_zero == "none":
+                                for model in models:
+                                    for param in model.parameters():
+                                        param.grad = None
+                            elif how_to_zero == "model":
+                                for model in models:
+                                    model.zero_grad()
+                            else:
+                                optimizer0.zero_grad()
+                                optimizer1.zero_grad()
+
+                            loss0 = model0(self.x) + model1(self.x)
+                            loss1 = model2(self.x) + model1(self.x)
+
+                            with amp.scale_loss(loss0, optimizer0, loss_id=loss_ids[0]) as scaled_loss:
+                                scaled_loss.backward()
+                                if i == inject_inf and which_backward == 0:
+                                    if which_model == 0:
+                                        inj_model = model0
+                                    elif which_model == 1:
+                                        inj_model = model1
+                                    else:
+                                        raise RuntimeError(which_model + " invalid for loss 0")
+                                    if inject_inf_loc == "fp32":
+                                        inj_model.weight0.grad[0] = float('inf')
+                                    elif inject_inf_loc == "fp16":
+                                        inj_model.weight1.grad[0] = float('inf')
+                            with amp.scale_loss(loss1, [optimizer0, optimizer1], loss_id=loss_ids[1]) as scaled_loss:
+                                scaled_loss.backward()
+                                if i == inject_inf and which_backward == 1:
+                                    if which_model == 2:
+                                        inj_model = model2
+                                    elif which_model == 1:
+                                        inj_model = model1
+                                    else:
+                                        raise RuntimeError(which_model + " invalid for loss 1 ")
+                                    if inject_inf_loc == "fp32":
+                                        inj_model.weight0.grad[0] = float('inf')
+                                    elif inject_inf_loc == "fp16":
+                                        inj_model.weight1.grad[0] = float('inf')
+
+                            if i != inject_inf:
+                                master_params = list(amp.master_params(optimizer0)) + \
+                                                list(amp.master_params(optimizer1))
+                                for param, reference_grad in zip(master_params,
+                                      reference_grads[what_got_skipped(inject_inf,
+                                          which_backward, which_model)][unskipped]):
+                                    self.assertTrue(torch.allclose(param.grad.float(), reference_grad.float()))
+                                unskipped += 1
+
+                            optimizer0.step()
+                            optimizer1.step()
+
+                        model_params = [p for p in model0.parameters()] + \
+                                       [p for p in model1.parameters()] + \
+                                       [p for p in model2.parameters()]
+                        master_params = [p for p in amp.master_params(optimizer0)] + \
+                                        [p for p in amp.master_params(optimizer1)]
+
+                        # print("opt_level {} i {} inject_inf {} which_backward {} inject_inf_loc {} use_multiple_loss_scalers {} which_model {}".format(opt_level, i, inject_inf, which_backward, inject_inf_loc, use_multiple_loss_scalers, which_model))
+
+                        for model, master, reference in zip(
+                                model_params,
+                                master_params,
+                                final_params[what_got_skipped(inject_inf, which_backward, which_model)]):
+                            self.assertTrue(torch.allclose(model, reference))
+                            self.assertTrue(torch.allclose(model, master.to(model.dtype)))
+
+                        if opt_level == "O1":
+                            _amp_state.handle._deactivate()
+
+if __name__ == '__main__':
+    unittest.main()  # run this file's tests when it is executed directly
diff --git a/apex/tests/L0/run_amp/test_promotion.py b/apex/tests/L0/run_amp/test_promotion.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5ef30c1273a2910aa3b6e7467fa8e4fbd96dbc5
--- /dev/null
+++ b/apex/tests/L0/run_amp/test_promotion.py
@@ -0,0 +1,75 @@
+import unittest
+
+import itertools as it
+
+from apex import amp
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+from utils import common_init, HALF, FLOAT, DTYPES
+
+class TestPromotion(unittest.TestCase):  # output-dtype checks for ops executed while amp.init is active
+    def setUp(self):
+        self.handle = amp.init(enabled=True)  # handle is kept so tearDown can deactivate it
+        common_init(self)  # sets self.b etc. and makes torch.cuda.FloatTensor the default tensor type
+
+    def tearDown(self):
+        self.handle._deactivate()
+
+    def run_binary_promote_test(self, fns, input_shape, x_inplace=False):  # shared driver for the binary-op tests below
+        type_pairs = it.product(DTYPES, DTYPES)  # every (xtype, ytype) combination of the dtypes in DTYPES
+        for fn, (xtype, ytype) in it.product(fns, type_pairs):  # each fn is run with each dtype pair
+            x = torch.randn(input_shape, dtype=xtype).requires_grad_()
+            x_leaf = x  # keep the leaf tensor; its .grad is checked after backward
+            if x_inplace:
+                # We need a non-leaf to call in place on
+                x = x.clone()
+            y = torch.randn(input_shape, dtype=ytype)
+            out = fn(x, y)
+            if x_inplace:
+                # In place: always match xtype
+                self.assertEqual(out.type(), x.type())
+            else:
+                # Out of place: match widest type
+                if xtype == torch.float or ytype == torch.float:
+                    self.assertEqual(out.type(), FLOAT)
+                else:
+                    self.assertEqual(out.type(), HALF)
+            out.float().sum().backward()  # cast to fp32 and reduce to a scalar before backward
+            self.assertEqual(x_leaf.grad.dtype, xtype)  # the leaf's grad must come back in the leaf's own dtype
+
+    def test_atan2_matches_widest(self):
+        fns = [lambda x, y : torch.atan2(x, y),
+               lambda x, y : x.atan2(y)]  # function form and method form of the same op
+        self.run_binary_promote_test(fns, (self.b,))
+
+    def test_mul_matches_widest(self):
+        fns = [lambda x, y : torch.mul(x, y),
+               lambda x, y: x.mul(y)]  # function form and method form of the same op
+        self.run_binary_promote_test(fns, (self.b,))
+
+    def test_cat_matches_widest(self):
+        shape = self.b
+        ys = [torch.randn(shape, dtype=torch.half) for _ in range(5)]  # five half tensors shared by both cat calls
+        x_float = torch.randn(shape)
+        out = torch.cat(ys + [x_float])  # one float input among halves -> FLOAT expected
+        self.assertEqual(out.type(), FLOAT)
+        x_half = torch.randn(shape, dtype=torch.half)
+        out = torch.cat(ys + [x_half])  # all-half inputs -> HALF expected
+        self.assertEqual(out.type(), HALF)
+
+    def test_inplace_exp_is_error_for_half(self):
+        xs = torch.randn(self.b)
+        xs.exp_()  # in-place exp on a float tensor succeeds and stays FLOAT
+        self.assertEqual(xs.type(), FLOAT)
+        xs = torch.randn(self.b, dtype=torch.half)
+        with self.assertRaises(NotImplementedError):  # the same in-place op on a half tensor must raise
+            xs.exp_()
+
+    def test_inplace_add_matches_self(self):
+        fn = lambda x, y: x.add_(y)
+        self.run_binary_promote_test([fn], (self.b,), x_inplace=True)  # x_inplace=True: result must keep x's type
+
+if __name__ == '__main__':
+    unittest.main()  # run this file's tests when it is executed directly
diff --git a/apex/tests/L0/run_amp/test_rnn.py b/apex/tests/L0/run_amp/test_rnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..c49a5f00345ea131b017580754b8c9b224c591c8
--- /dev/null
+++ b/apex/tests/L0/run_amp/test_rnn.py
@@ -0,0 +1,116 @@
+import unittest
+
+from apex import amp
+import random
+import torch
+from torch import nn
+
+from utils import common_init, HALF
+
+class TestRnnCells(unittest.TestCase):  # checks that RNN cell outputs are half tensors while amp.init is active
+    def setUp(self):
+        self.handle = amp.init(enabled=True)  # handle is kept so tearDown can deactivate it
+        common_init(self)  # sets self.b / self.h / self.t and the CUDA default tensor type
+
+    def tearDown(self):
+        self.handle._deactivate()
+
+    def run_cell_test(self, cell, state_tuple=False):  # state_tuple=True when the cell's state is a pair of tensors
+        shape = (self.b, self.h)
+        for typ in [torch.float, torch.half]:  # run once with fp32 inputs and once with fp16 inputs
+            xs = [torch.randn(shape, dtype=typ).requires_grad_()
+                  for _ in range(self.t)]  # one input tensor per step
+            hidden_fn = lambda: torch.zeros(shape, dtype=typ)  # fresh zero state in the input dtype
+            if state_tuple:
+                hidden = (hidden_fn(), hidden_fn())
+            else:
+                hidden = hidden_fn()
+            outputs = []
+            for i in range(self.t):
+                hidden = cell(xs[i], hidden)  # the cell's return value is fed back as the next state
+                if state_tuple:
+                    output = hidden[0]  # first element of the state pair is taken as the step output
+                else:
+                    output = hidden
+                outputs.append(output)
+            for y in outputs:
+                self.assertEqual(y.type(), HALF)  # HALF is expected for both input dtypes
+            outputs[-1].float().sum().backward()  # backprop from the last step's output only
+            for i, x in enumerate(xs):
+                self.assertEqual(x.grad.dtype, x.dtype)  # each input's grad must be in that input's dtype
+
+    def test_rnn_cell_is_half(self):
+        cell = nn.RNNCell(self.h, self.h)
+        self.run_cell_test(cell)
+
+    def test_gru_cell_is_half(self):
+        cell = nn.GRUCell(self.h, self.h)
+        self.run_cell_test(cell)
+
+    def test_lstm_cell_is_half(self):
+        cell = nn.LSTMCell(self.h, self.h)
+        self.run_cell_test(cell, state_tuple=True)  # LSTMCell is driven with a two-tensor state
+
+class TestRnns(unittest.TestCase):  # checks that full RNN/GRU/LSTM module outputs are half tensors while amp.init is active
+    def setUp(self):
+        self.handle = amp.init(enabled=True)  # handle is kept so tearDown can deactivate it
+        common_init(self)  # sets self.b / self.h / self.t and the CUDA default tensor type
+
+    def tearDown(self):
+        self.handle._deactivate()
+
+    def run_rnn_test(self, rnn, layers, bidir, state_tuple=False):  # state_tuple=True when the initial state is a pair of tensors
+        for typ in [torch.float, torch.half]:  # run once with fp32 input and once with fp16 input
+            x = torch.randn((self.t, self.b, self.h), dtype=typ).requires_grad_()
+            hidden_fn = lambda: torch.zeros((layers + (layers * bidir),  # first dim is `layers`, doubled when bidir is True
+                                             self.b, self.h), dtype=typ)
+            if state_tuple:
+                hidden = (hidden_fn(), hidden_fn())
+            else:
+                hidden = hidden_fn()
+            output, _ = rnn(x, hidden)  # the returned final state is ignored
+            self.assertEqual(output.type(), HALF)  # HALF is expected for both input dtypes
+            output[-1, :, :].float().sum().backward()  # backprop from the last index along dim 0
+            self.assertEqual(x.grad.dtype, x.dtype)  # input grad must be in the input's own dtype
+
+    def test_rnn_is_half(self):
+        configs = [(1, False), (2, False), (2, True)]  # (num_layers, bidirectional)
+        for layers, bidir in configs:
+            rnn = nn.RNN(input_size=self.h, hidden_size=self.h, num_layers=layers,
+                         nonlinearity='relu', bidirectional=bidir)
+            self.run_rnn_test(rnn, layers, bidir)
+
+    def test_gru_is_half(self):
+        configs = [(1, False), (2, False), (2, True)]  # (num_layers, bidirectional)
+        for layers, bidir in configs:
+            rnn = nn.GRU(input_size=self.h, hidden_size=self.h, num_layers=layers,
+                         bidirectional=bidir)
+            self.run_rnn_test(rnn, layers, bidir)
+
+    def test_lstm_is_half(self):
+        configs = [(1, False), (2, False), (2, True)]  # (num_layers, bidirectional)
+        for layers, bidir in configs:
+            rnn = nn.LSTM(input_size=self.h, hidden_size=self.h, num_layers=layers,
+                         bidirectional=bidir)
+            self.run_rnn_test(rnn, layers, bidir, state_tuple=True)
+
+    def test_rnn_packed_sequence(self):
+        num_layers = 2
+        rnn = nn.RNN(input_size=self.h, hidden_size=self.h, num_layers=num_layers)
+        for typ in [torch.float, torch.half]:
+            x = torch.randn((self.t, self.b, self.h), dtype=typ).requires_grad_()
+            lens = sorted([random.randint(self.t // 2, self.t) for _ in range(self.b)],
+                          reverse=True)  # per-sequence lengths in descending order
+            # `pack_padded_sequence` breaks if default tensor type is non-CPU
+            torch.set_default_tensor_type(torch.FloatTensor)
+            lens = torch.tensor(lens, dtype=torch.int64, device=torch.device('cpu'))
+            packed_seq = nn.utils.rnn.pack_padded_sequence(x, lens)
+            torch.set_default_tensor_type(torch.cuda.FloatTensor)  # restore the CUDA default; NOTE(review): not restored if the pack call above raises
+            hidden = torch.zeros((num_layers, self.b, self.h), dtype=typ)
+            output, _ = rnn(packed_seq, hidden)
+            self.assertEqual(output.data.type(), HALF)  # output is read through its .data tensor
+            output.data.float().sum().backward()
+            self.assertEqual(x.grad.dtype, x.dtype)  # input grad must be in the input's own dtype
+
+if __name__ == '__main__':
+    unittest.main()  # run this file's tests when it is executed directly
diff --git a/apex/tests/L0/run_amp/utils.py b/apex/tests/L0/run_amp/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..7aa20c36918f28967ade3d94921a565327c366fe
--- /dev/null
+++ b/apex/tests/L0/run_amp/utils.py
@@ -0,0 +1,21 @@
+import torch
+
+HALF = 'torch.cuda.HalfTensor'  # type string the tests compare against Tensor.type()
+FLOAT = 'torch.cuda.FloatTensor'  # type string the tests compare against Tensor.type()
+
+DTYPES = [torch.half, torch.float]  # dtypes the promotion tests iterate over
+
+ALWAYS_HALF = {torch.float: HALF,  # dtype -> type string; presumably input dtype -> expected output type (users are outside this view)
+               torch.half: HALF}
+ALWAYS_FLOAT = {torch.float: FLOAT,  # both dtypes map to FLOAT
+                torch.half: FLOAT}
+MATCH_INPUT = {torch.float: FLOAT,  # each dtype maps to its own type string
+               torch.half: HALF}
+
+def common_init(test_case):  # attaches shared size attributes to test_case and switches the default tensor type to CUDA
+    test_case.h = 64  # used as input_size / hidden_size in test_rnn.py
+    test_case.b = 16  # batch dimension in test_rnn.py; 1-D tensor length in test_promotion.py
+    test_case.c = 16  # not read by the tests visible in this part of the patch
+    test_case.k = 3  # not read by the tests visible in this part of the patch
+    test_case.t = 10  # number of steps / leading dimension in test_rnn.py
+    torch.set_default_tensor_type(torch.cuda.FloatTensor)  # global side effect: tensors created afterwards default to CUDA fp32
diff --git a/apex/tests/L0/run_fp16util/__init__.py b/apex/tests/L0/run_fp16util/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/apex/tests/L0/run_fp16util/test_fp16util.py b/apex/tests/L0/run_fp16util/test_fp16util.py
new file mode 100644
index 0000000000000000000000000000000000000000..eecddbc01d7e5b72e93f1cdba6ad037c0c2aae54
--- /dev/null
+++ b/apex/tests/L0/run_fp16util/test_fp16util.py
@@ -0,0 +1,75 @@
+import unittest
+
+import torch
+import torch.nn as nn
+
+from apex.fp16_utils import FP16Model
+
+
+class DummyBlock(nn.Module):  # small BatchNorm2d -> Conv2d block used as a fixture by TestFP16Model
+    def __init__(self):
+        super(DummyBlock, self).__init__()
+
+        self.conv = nn.Conv2d(10, 10, 2)
+        self.bn = nn.BatchNorm2d(10, affine=True)  # affine BN: this module is listed as exempted in TestFP16Model
+
+    def forward(self, x):
+        return self.conv(self.bn(x))  # batch norm first, then convolution
+
+
+class DummyNet(nn.Module):  # conv -> non-affine BN -> two DummyBlocks; fixture for TestFP16Model
+    def __init__(self):
+        super(DummyNet, self).__init__()
+
+        self.conv1 = nn.Conv2d(3, 10, 2)
+        self.bn1 = nn.BatchNorm2d(10, affine=False)  # affine=False: the only BN here that is not in the test's exempted list
+        self.db1 = DummyBlock()
+        self.db2 = DummyBlock()
+
+    def forward(self, x):
+        out = x
+        out = self.conv1(out)
+        out = self.bn1(out)
+        out = self.db1(out)
+        out = self.db2(out)
+        return out
+
+
+class DummyNetWrapper(nn.Module):  # affine BN in front of a DummyNet; the model TestFP16Model wraps in FP16Model
+    def __init__(self):
+        super(DummyNetWrapper, self).__init__()
+
+        self.bn = nn.BatchNorm2d(3, affine=True)  # affine BN: listed as exempted in TestFP16Model
+        self.dn = DummyNet()
+
+    def forward(self, x):
+        return self.dn(self.bn(x))  # batch norm first, then the wrapped network
+
+
+class TestFP16Model(unittest.TestCase):
+    def setUp(self):
+        self.N = 64
+        self.C_in = 3
+        self.H_in = 16
+        self.W_in = 32
+        self.in_tensor = torch.randn((self.N, self.C_in, self.H_in, self.W_in)).cuda()
+        self.orig_model = DummyNetWrapper().cuda()
+        self.fp16_model = FP16Model(self.orig_model)
+
+    def test_params_and_buffers(self):
+        exempted_modules = [
+            self.fp16_model.network.bn,
+            self.fp16_model.network.dn.db1.bn,
+            self.fp16_model.network.dn.db2.bn,
+        ]
+        for m in self.fp16_model.modules():
+            expected_dtype = torch.float if (m in exempted_modules) else torch.half
+            for p in m.parameters(recurse=False):
+                assert p.dtype == expected_dtype
+            for b in m.buffers(recurse=False):
+                assert b.dtype in (expected_dtype, torch.int64)
+
+    def test_output_is_half(self):
+        out_tensor = self.fp16_model(self.in_tensor)
+        assert out_tensor.dtype == torch.half
+
diff --git a/apex/tests/L0/run_fused_layer_norm/test_fused_layer_norm.py b/apex/tests/L0/run_fused_layer_norm/test_fused_layer_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..241b2490b2c3ee8bf3cb9f7f8a6b175d8e1cad1c
--- /dev/null
+++ b/apex/tests/L0/run_fused_layer_norm/test_fused_layer_norm.py
@@ -0,0 +1,41 @@
+import unittest
+import os
+import random
+
+import torch
+import apex
+
+        
+class TestFusedLayerNorm(unittest.TestCase):
+    def setUp(self):
+        self.module = apex.normalization.FusedLayerNorm(normalized_shape=[32, 64], elementwise_affine=False)
+        self.input_ = torch.randn(16, 32, 64)
+        torch.cuda.manual_seed(42)
+        
+    def forward_cpu(self, input_):
+        self.module.cpu()
+        return self.module(input_.cpu())
+    
+    def forward_cuda(self, input_):
+        self.module.cuda()
+        return self.module(input_.cuda())
+    
+    def test_forward_cuda(self):
+        out_ = self.forward_cuda(self.input_)
+        assert out_.is_cuda == True
+        
+    def test_forward_cpu(self):
+        out_ = self.forward_cpu(self.input_)
+        assert out_.is_cuda == False
+        
+    def test_same_output(self):
+        out_cpu = self.forward_cpu(self.input_)
+        out_cuda = self.forward_cuda(self.input_)
+        torch.testing.assert_allclose(out_cpu, out_cuda.cpu())
+        
+        
+class TestFusedLayerNormElemWise(TestFusedLayerNorm):  # inherits the parent's test methods; only setUp differs
+    def setUp(self):
+        self.module = apex.normalization.FusedLayerNorm(normalized_shape=[32, 64], elementwise_affine=True)  # same as parent except elementwise_affine=True
+        self.input_ = torch.randn(16, 32, 64)
+        torch.cuda.manual_seed(42)
\ No newline at end of file
diff --git a/apex/tests/L0/run_mixed_adam/__init__.py b/apex/tests/L0/run_mixed_adam/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/apex/tests/L0/run_mixed_adam/test_fp16_optimizer.py b/apex/tests/L0/run_mixed_adam/test_fp16_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..c083f16e2937f29b3b786eb5ce5eeeb76d260ee7
--- /dev/null
+++ b/apex/tests/L0/run_mixed_adam/test_fp16_optimizer.py
@@ -0,0 +1,134 @@
+import unittest
+import torch
+import apex
+
+class TestFP16Optimizer(unittest.TestCase):  # compares FusedAdam + apex.optimizers.FP16_Optimizer against torch Adam + apex.fp16_utils.FP16_Optimizer
+    def setUp(self, max_abs_diff=1e-3, max_rel_diff=1, iters=7):
+        self.max_abs_diff = max_abs_diff  # tolerance on the largest absolute parameter difference
+        self.max_rel_diff = max_rel_diff  # tolerance on the largest relative parameter difference
+        self.iters = iters  # optimizer steps per test
+        torch.cuda.manual_seed(13337)
+
+        N, D_in, D_out = 64, 1024, 16
+        self.N = N
+        self.D_in = D_in
+        self.D_out = D_out
+        self.x = torch.randn((N, D_in), dtype=torch.float16, device='cuda')
+        self.ref_model = torch.nn.Linear(D_in, D_out).cuda().half()
+        self.tst_model = torch.nn.Linear(D_in, D_out).cuda().half()
+        for p,q in zip(self.tst_model.parameters(), self.ref_model.parameters()):
+            p.data.copy_(q.data)  # both models start from identical parameter values
+
+    def get_max_diff(self, ref_param, tst_param):  # returns (max absolute diff, max relative diff) over all parameter pairs
+        max_abs_diff = max_rel_diff = 0
+        for p_ref, p_tst in zip(ref_param, tst_param):
+            max_abs_diff_p = (p_ref - p_tst).abs().max().item()
+            max_rel_diff_p = ((p_ref - p_tst) / p_ref).abs().max().item()  # relative to p_ref; NOTE(review): zero entries in p_ref give inf/nan here
+
+            if max_abs_diff_p > max_abs_diff:  max_abs_diff = max_abs_diff_p
+            if max_rel_diff_p > max_rel_diff:  max_rel_diff = max_rel_diff_p
+
+        return max_abs_diff, max_rel_diff
+
+    def test_fp16_optimizer(self):
+
+        ref_optim = torch.optim.Adam(self.ref_model.parameters())
+        ref_optim = apex.fp16_utils.FP16_Optimizer(ref_optim, verbose=False)  # reference: torch Adam wrapped by the fp16_utils optimizer
+
+        tst_optim = apex.optimizers.FusedAdam(self.tst_model.parameters())
+        tst_optim = apex.optimizers.FP16_Optimizer(tst_optim)  # under test: FusedAdam wrapped by apex.optimizers.FP16_Optimizer
+
+        for i in range(self.iters):
+            ref_loss = self.ref_model(self.x).sum()
+            ref_optim.backward(ref_loss)
+            ref_optim.step()
+
+            tst_loss = self.tst_model(self.x).sum()
+            tst_optim.backward(tst_loss)
+            tst_optim.step()
+
+            max_abs_diff, max_rel_diff = self.get_max_diff(self.ref_model.parameters(), self.tst_model.parameters())
+            self.assertLessEqual(max_abs_diff, self.max_abs_diff)  # checked after every step, not only at the end
+            self.assertLessEqual(max_rel_diff, self.max_rel_diff)
+
+
+    def test_loss_scaling(self):
+
+        ref_optim = torch.optim.Adam(self.ref_model.parameters())
+        ref_optim = apex.fp16_utils.FP16_Optimizer(ref_optim, static_loss_scale=128.0, verbose=False)
+
+        tst_optim = apex.optimizers.FusedAdam(self.tst_model.parameters())
+        tst_optim = apex.optimizers.FP16_Optimizer(tst_optim, static_loss_scale=128.0)  # same static loss scale on both sides
+
+        for i in range(self.iters):
+            ref_loss = self.ref_model(self.x).sum()
+            ref_optim.backward(ref_loss)
+            ref_optim.step()
+
+            tst_loss = self.tst_model(self.x).sum()
+            tst_optim.backward(tst_loss)
+            tst_optim.step()
+
+            max_abs_diff, max_rel_diff = self.get_max_diff(self.ref_model.parameters(), self.tst_model.parameters())
+            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
+            self.assertLessEqual(max_rel_diff, self.max_rel_diff)
+
+    def test_parameter_groups(self):
+
+        ref_groups = [{'params': [self.ref_model.weight]},{'params': [self.ref_model.bias]}]  # weight and bias in separate groups
+        ref_optim = torch.optim.Adam(ref_groups)
+        ref_optim = apex.fp16_utils.FP16_Optimizer(ref_optim, verbose=False)
+
+        tst_groups = [{'params': [self.tst_model.weight]},{'params': [self.tst_model.bias]}]  # same grouping for the optimizer under test
+        tst_optim = apex.optimizers.FusedAdam(tst_groups)
+        tst_optim = apex.optimizers.FP16_Optimizer(tst_optim)
+
+        for i in range(self.iters):
+            ref_loss = self.ref_model(self.x).sum()
+            ref_optim.backward(ref_loss)
+            ref_optim.step()
+
+            tst_loss = self.tst_model(self.x).sum()
+            tst_optim.backward(tst_loss)
+            tst_optim.step()
+
+            max_abs_diff, max_rel_diff = self.get_max_diff(self.ref_model.parameters(), self.tst_model.parameters())
+            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
+            self.assertLessEqual(max_rel_diff, self.max_rel_diff)
+
+    def test_grad_clip(self):
+        ref_optim = torch.optim.Adam(self.ref_model.parameters())
+        ref_optim = apex.fp16_utils.FP16_Optimizer(ref_optim, verbose=False)
+
+        tst_optim = apex.optimizers.FusedAdam(self.tst_model.parameters(), max_grad_norm=0.01)  # clipping is configured on the optimizer under test
+        tst_optim = apex.optimizers.FP16_Optimizer(tst_optim)
+
+        for i in range(self.iters):
+            ref_loss = self.ref_model(self.x).sum()
+            ref_optim.backward(ref_loss)
+            ref_optim.clip_master_grads(0.01)  # the reference clips explicitly with the same 0.01 value
+            ref_optim.step()
+
+            tst_loss = self.tst_model(self.x).sum()
+            tst_optim.backward(tst_loss)
+            tst_optim.step()
+
+            max_abs_diff, max_rel_diff = self.get_max_diff(self.ref_model.parameters(), self.tst_model.parameters())
+            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
+            self.assertLessEqual(max_rel_diff, self.max_rel_diff)
+
+    @unittest.skip('Not support grad being None')
+    def test_grad_None(self):
+        self.fail()  # placeholder body; the skip decorator keeps it from running
+
+    @unittest.skip('Not support same weight decay as pytorch')
+    def test_weight_decay(self):
+        self.fail()  # placeholder body; the skip decorator keeps it from running
+
+    @unittest.skip('Not support empty parameter groups')
+    def test_group_empty(self):
+        self.fail()  # placeholder body; the skip decorator keeps it from running
+
+if __name__ == '__main__':
+    script_path = os.path.dirname(os.path.realpath(__file__))
+    unittest.main()
diff --git a/apex/tests/L0/run_mixed_adam/test_mixed_adam.py b/apex/tests/L0/run_mixed_adam/test_mixed_adam.py
new file mode 100644
index 0000000000000000000000000000000000000000..020c16b1b88e97cc69e27674380e8b89513dc3d5
--- /dev/null
+++ b/apex/tests/L0/run_mixed_adam/test_mixed_adam.py
@@ -0,0 +1,179 @@
+import unittest
+import os
+import random
+
+import torch
+import apex
+
+class TestFusedAdam(unittest.TestCase):  # compares apex.optimizers.FusedAdam against torch.optim.Adam step by step
+    def setUp(self, max_abs_diff=1e-3, max_rel_diff=1, iters=7):
+        self.max_abs_diff = max_abs_diff  # tolerance on the largest absolute parameter difference
+        self.max_rel_diff = max_rel_diff  # tolerance on the largest relative parameter difference
+        self.iters = iters  # optimizer steps per test
+        torch.cuda.manual_seed(9876)
+
+    def tearDown(self):
+        pass
+
+    def gen_param_optim(self, tensors, adam_option):  # returns (ref_param, tst_param, ref_optim, tst_optim)
+        ref_param = []
+        tst_param = []
+        for tensor in tensors:
+            ref_param.append(torch.nn.Parameter(tensor.clone()))  # independent copies so the two optimizers never share storage
+            tst_param.append(torch.nn.Parameter(tensor.clone()))
+
+        ref_optim = torch.optim.Adam(ref_param, **adam_option)
+        tst_optim = apex.optimizers.FusedAdam(tst_param, **adam_option)  # same options passed to both optimizers
+       
+        return (ref_param, tst_param, ref_optim, tst_optim)
+
+    def gen_grad(self, ref_param, tst_param):  # assigns one random grad per parameter pair
+        for p_ref, p_tst in zip(ref_param, tst_param):
+            p_ref.grad = torch.rand_like(p_ref)
+            p_tst.grad = p_ref.grad  # both parameters reference the same grad tensor object
+
+    def gen_mixed_grad(self, ref_param, tst_param, scale=1.0):  # sets only p_ref.grad; returns the half grads for tst_optim.step(grads=...)
+        half_grads = []
+        for p_ref, p_tst in zip(ref_param, tst_param):
+            half_grads.append(torch.rand_like(p_ref).half())
+            p_ref.grad = half_grads[-1].float() / scale  # reference gets the fp32 copy divided by scale
+        return half_grads
+
+    def get_max_diff(self, ref_param, tst_param):  # returns (max absolute diff, max relative diff) over all parameter pairs
+        max_abs_diff = max_rel_diff = 0
+        for p_ref, p_tst in zip(ref_param, tst_param):
+            max_abs_diff_p = (p_ref - p_tst).abs().max().item()
+            max_rel_diff_p = ((p_ref - p_tst) / p_ref).abs().max().item()  # relative to p_ref; NOTE(review): zero entries in p_ref give inf/nan here
+
+            if max_abs_diff_p > max_abs_diff:  max_abs_diff = max_abs_diff_p
+            if max_rel_diff_p > max_rel_diff:  max_rel_diff = max_rel_diff_p
+
+        return max_abs_diff, max_rel_diff
+
+    def gen_single_type_test(self, param_type=torch.float):  # shared body for test_double and test_float
+        nelem = 278011
+        adam_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08,
+            'weight_decay':0, 'amsgrad':False}
+
+        tensor = torch.rand(nelem, dtype=param_type, device='cuda')
+        ref_param, tst_param, ref_optim, tst_optim = \
+            self.gen_param_optim([tensor], adam_option)
+
+        for i in range(self.iters):
+            self.gen_grad(ref_param, tst_param)
+            ref_optim.step()
+            tst_optim.step()
+            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
+
+            self.assertLessEqual(max_abs_diff, self.max_abs_diff)  # checked after every step
+            self.assertLessEqual(max_rel_diff, self.max_rel_diff)
+
+    def test_double(self):
+        self.gen_single_type_test(param_type=torch.double)
+
+    def test_float(self):
+        self.gen_single_type_test(param_type=torch.float)
+
+    def test_half(self):
+        nelem = 278011
+        adam_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08,
+            'weight_decay':0, 'amsgrad':False}
+
+        tensor = torch.rand(nelem, dtype=torch.float, device='cuda')  # parameters are fp32; only the grads are half
+        ref_param, tst_param, ref_optim, tst_optim = \
+            self.gen_param_optim([tensor], adam_option)
+
+        for i in range(self.iters):
+            half_grads = self.gen_mixed_grad(ref_param, tst_param)
+            ref_optim.step()
+            tst_optim.step(grads=half_grads)  # half grads are passed to step() explicitly
+            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
+
+            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
+            self.assertLessEqual(max_rel_diff, self.max_rel_diff)
+
+    def test_multi_params(self):
+        sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]  # mixed 1-D / 2-D shapes, including a single element
+        adam_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08,
+            'weight_decay':0, 'amsgrad':False}
+
+        tensors = []
+        for size in sizes:
+            tensors.append(torch.rand(size, dtype=torch.float, device='cuda'))
+        ref_param, tst_param, ref_optim, tst_optim = \
+            self.gen_param_optim(tensors, adam_option)
+
+        for i in range(self.iters):
+            half_grads = self.gen_mixed_grad(ref_param, tst_param)
+            ref_optim.step()
+            tst_optim.step(grads=half_grads)
+            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
+            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
+            self.assertLessEqual(max_rel_diff, self.max_rel_diff)
+
+    def test_scale(self):
+        nelem = 278011
+        adam_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08,
+            'weight_decay':0, 'amsgrad':False}
+
+        tensor = torch.rand(nelem, dtype=torch.float, device='cuda')
+        ref_param, tst_param, ref_optim, tst_optim = \
+            self.gen_param_optim([tensor], adam_option)
+
+        for i in range(self.iters):
+            scale = random.random() * 1000  # fresh scale in [0, 1000) each step; NOTE(review): `random` is not seeded in setUp
+            half_grads = self.gen_mixed_grad(ref_param, tst_param, scale)  # reference grad is already divided by scale
+            ref_optim.step()
+            tst_optim.step(grads=half_grads, scale=scale)  # the optimizer under test receives the raw half grads plus the scale
+            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
+
+            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
+            self.assertLessEqual(max_rel_diff, self.max_rel_diff)
+
+    def test_fp16_output(self):
+        nelem = 278011
+        adam_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08,
+            'weight_decay':0, 'amsgrad':False}
+
+        tensor = torch.rand(nelem, dtype=torch.float, device='cuda')
+        ref_param, tst_param, ref_optim, tst_optim = \
+            self.gen_param_optim([tensor], adam_option)
+
+        fp16_param = torch.nn.Parameter(tensor.clone().half())  # half copy passed to step() as output_params
+
+        for i in range(self.iters):
+            half_grads = self.gen_mixed_grad(ref_param, tst_param)
+            ref_optim.step()
+            tst_optim.step(grads=half_grads, output_params=[fp16_param])
+
+            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)  # first check: reference vs tested fp32 params
+            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
+            self.assertLessEqual(max_rel_diff, self.max_rel_diff)
+
+            max_abs_diff, max_rel_diff = self.get_max_diff(tst_param, \
+                [fp16_param.float()])  # second check: tested fp32 params vs the half output copy
+            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
+            self.assertLessEqual(max_rel_diff, self.max_rel_diff)
+
+    def test_adam_option(self):
+        nelem = 1
+        adam_option = {'lr':0.01, 'betas':(0.6, 0.9), 'eps':3e-06,
+            'weight_decay':0, 'amsgrad':False}  # lr / betas / eps differ from the values used in the other tests
+
+        tensor = torch.rand(nelem, dtype=torch.float, device='cuda')
+        ref_param, tst_param, ref_optim, tst_optim = \
+            self.gen_param_optim([tensor], adam_option)
+
+        for i in range(self.iters):
+            self.gen_grad(ref_param, tst_param)
+            ref_optim.step()
+            tst_optim.step()
+            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
+
+            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
+            self.assertLessEqual(max_rel_diff, self.max_rel_diff)
+
+
+if __name__ == '__main__':
+    script_path = os.path.dirname(os.path.realpath(__file__))  # NOTE(review): assigned but not used anywhere in the visible file
+    unittest.main()  # run this file's tests when it is executed directly
diff --git a/apex/tests/L0/run_test.py b/apex/tests/L0/run_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..93c7529b1c63d09b0145d5370ae5fc4e7e6ea623
--- /dev/null
+++ b/apex/tests/L0/run_test.py
@@ -0,0 +1,20 @@
+"""Discover and run the unittest suites in each L0 test directory."""
+import unittest
+import sys
+
+test_dirs = ["run_amp", "run_fp16util", "run_mixed_adam", "run_fused_layer_norm"]
+
+runner = unittest.TextTestRunner(verbosity=2)
+
+# Becomes 1 as soon as any directory's suite is not successful.
+errcode = 0
+
+for test_dir in test_dirs:
+    # test_dir is a relative path, so discovery depends on the current working directory.
+    suite = unittest.TestLoader().discover(test_dir)
+    print("\nExecuting tests from " + test_dir)
+    outcome = runner.run(suite)
+    errcode = errcode if outcome.wasSuccessful() else 1
+
+# Report the aggregate outcome to the caller through the process exit status.
+sys.exit(errcode)
diff --git a/apex/tests/L1/common/compare.py b/apex/tests/L1/common/compare.py
new file mode 100644
index 0000000000000000000000000000000000000000..74374d412da530a8ae578deb3efe9e1e5407cb89
--- /dev/null
+++ b/apex/tests/L1/common/compare.py
@@ -0,0 +1,64 @@
+"""Compare the loss/speed logs saved by extension and Python-only runs."""
+import argparse
+import torch
+
+parser = argparse.ArgumentParser(description='Compare')
+parser.add_argument('--opt-level', type=str)
+parser.add_argument('--keep-batchnorm-fp32', type=str, default=None)
+parser.add_argument('--loss-scale', type=str, default=None)
+parser.add_argument('--fused-adam', action='store_true')
+parser.add_argument('--use_baseline', action='store_true')
+args = parser.parse_args()
+
+# Every log file name ends with the same option-derived suffix; only the
+# leading "True_"/"False_" (and the baselines/ directory) differs.
+base_file = "_".join(str(option) for option in (args.opt_level,
+                                                args.loss_scale,
+                                                args.keep_batchnorm_fp32,
+                                                args.fused_adam))
+
+file_e = "True_" + base_file
+file_p = "False_" + base_file
+if args.use_baseline:
+    file_b = "baselines/True_" + base_file
+
+dict_e = torch.load(file_e)
+dict_p = torch.load(file_p)
+if args.use_baseline:
+    dict_b = torch.load(file_b)
+
+torch.set_printoptions(precision=10)
+
+print(file_e)
+print(file_p)
+if args.use_baseline:
+    print(file_b)
+
+# zip() below stops at the shorter log, so a truncated run would otherwise
+# be compared only on its common prefix and pass silently.
+assert len(dict_e["Iteration"]) == len(dict_p["Iteration"]), \
+    "logged {} iterations in {} but {} in {}".format(
+        len(dict_e["Iteration"]), file_e, len(dict_p["Iteration"]), file_p)
+
+# One loop serves both modes: the baseline columns are prepended when requested,
+# giving the row layout "iteration, losses (b, e, p), speeds (b, e, p)".
+for n, (i_e, i_p) in enumerate(zip(dict_e["Iteration"], dict_p["Iteration"])):
+    assert i_e == i_p, "i_e = {}, i_p = {}".format(i_e, i_p)
+
+    loss_e = dict_e["Loss"][n]
+    loss_p = dict_p["Loss"][n]
+    # Losses are compared for exact equality, not within a tolerance.
+    assert loss_e == loss_p, "Iteration {}, loss_e = {}, loss_p = {}".format(i_e, loss_e, loss_p)
+    losses = [loss_e, loss_p]
+    speeds = [dict_e["Speed"][n], dict_p["Speed"][n]]
+
+    if args.use_baseline:
+        loss_b = dict_b["Loss"][n]
+        assert loss_e == loss_b, "Iteration {}, loss_e = {}, loss_b = {}".format(i_e, loss_e, loss_b)
+        losses.insert(0, loss_b)
+        speeds.insert(0, dict_b["Speed"][n])
+
+    # Row layout: the iteration in a 4-wide field, then every loss and speed
+    # value in a 15-wide field with 10 decimals.
+    values = losses + speeds
+    print(("{:4}" + " {:15.10f}" * len(values)).format(i_e, *values))
diff --git a/apex/tests/L1/common/main_amp.py b/apex/tests/L1/common/main_amp.py
new file mode 100644
index 0000000000000000000000000000000000000000..106a0f637803fdbef042e1d60aaab33bbb1b7595
--- /dev/null
+++ b/apex/tests/L1/common/main_amp.py
@@ -0,0 +1,526 @@
+import argparse
+import os
+import shutil
+import time
+
+import torch
+import torch.nn as nn
+import torch.nn.parallel
+import torch.backends.cudnn as cudnn
+import torch.distributed as dist
+import torch.optim
+import torch.utils.data
+import torch.utils.data.distributed
+import torchvision.transforms as transforms
+import torchvision.datasets as datasets
+import torchvision.models as models
+
+import numpy as np
+
+try:
+    from apex.parallel import DistributedDataParallel as DDP
+    from apex.fp16_utils import *
+    from apex import amp, optimizers
+    from apex.multi_tensor_apply import multi_tensor_applier
+except ImportError:
+    raise ImportError("Please install apex from https://www.github.com/nvidia/apex to run this example.")
+
+model_names = sorted(name for name in models.__dict__
+                     if name.islower() and not name.startswith("__")
+                     and callable(models.__dict__[name]))  # lowercase, non-dunder callables in torchvision.models
+
+parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
+parser.add_argument('data', metavar='DIR',
+                    help='path to dataset')
+parser.add_argument('--arch', '-a', metavar='ARCH', default='resnet18',
+                    choices=model_names,
+                    help='model architecture: ' +
+                    ' | '.join(model_names) +
+                    ' (default: resnet18)')
+parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
+                    help='number of data loading workers (default: 4)')
+parser.add_argument('--epochs', default=90, type=int, metavar='N',
+                    help='number of total epochs to run')
+parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
+                    help='manual epoch number (useful on restarts)')
+parser.add_argument('-b', '--batch-size', default=256, type=int,
+                    metavar='N', help='mini-batch size per process (default: 256)')
+parser.add_argument('--lr', '--learning-rate', default=0.1, type=float,
+                    metavar='LR', help='Initial learning rate.  Will be scaled by <global batch size>/256: args.lr = args.lr*float(args.batch_size*args.world_size)/256.  A warmup schedule will also be applied over the first 5 epochs.')
+parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
+                    help='momentum')
+parser.add_argument('--weight-decay', '--wd', default=1e-4, type=float,
+                    metavar='W', help='weight decay (default: 1e-4)')
+parser.add_argument('--print-freq', '-p', default=10, type=int,
+                    metavar='N', help='print frequency (default: 10)')
+parser.add_argument('--resume', default='', type=str, metavar='PATH',
+                    help='path to latest checkpoint (default: none)')
+parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
+                    help='evaluate model on validation set')
+parser.add_argument('--pretrained', dest='pretrained', action='store_true',
+                    help='use pre-trained model')
+
+parser.add_argument('--prof', dest='prof', action='store_true',
+                    help='Only run 10 iterations for profiling.')
+parser.add_argument('--deterministic', action='store_true')
+# --local_rank selects the GPU in distributed runs and gates printing/saving to rank 0.
+parser.add_argument("--local_rank", default=0, type=int)
+parser.add_argument('--sync_bn', action='store_true',
+                    help='enabling apex sync BN.')
+# --has-ext is asserted against multi_tensor_applier.available; the next three feed amp.initialize.
+parser.add_argument('--has-ext', action='store_true')
+parser.add_argument('--opt-level', type=str)
+parser.add_argument('--keep-batchnorm-fp32', type=str, default=None)
+parser.add_argument('--loss-scale', type=str, default=None)
+parser.add_argument('--fused-adam', action='store_true')
+# Number of logged samples after which train() saves the run info and quits.
+parser.add_argument('--prints-to-process', type=int, default=10)
+# Default setting; switched off further down when --deterministic is given.
+cudnn.benchmark = True
+
+def fast_collate(batch):
+    """Collate (image, label) samples into a uint8 (N, 3, H, W) tensor and an int64 label tensor.
+
+    Images must expose `.size` as (width, height); the first image fixes H and W
+    for the whole batch.  Pixel values are copied as-is, without normalization.
+    """
+    pictures = [sample[0] for sample in batch]
+    labels = torch.tensor([sample[1] for sample in batch], dtype=torch.int64)
+    width, height = pictures[0].size[0], pictures[0].size[1]
+    out = torch.zeros((len(pictures), 3, height, width), dtype=torch.uint8)
+    for idx, picture in enumerate(pictures):
+        pixels = np.asarray(picture, dtype=np.uint8)
+        pixels = np.expand_dims(pixels, axis=-1) if pixels.ndim < 3 else pixels
+        out[idx] += torch.from_numpy(np.rollaxis(pixels, 2))  # channel axis moved to the front
+    return out, labels
+
+best_prec1 = 0  # best validation top-1 so far; main() updates it after each epoch
+args = parser.parse_args()
+
+# Let multi_tensor_applier be the canary in the coalmine
+# that verifies if the backend is what we think it is
+assert multi_tensor_applier.available == args.has_ext
+# Echo the Amp options; the last two lines also print the parsed Python type.
+print("opt_level = {}".format(args.opt_level))
+print("keep_batchnorm_fp32 = {}".format(args.keep_batchnorm_fp32), type(args.keep_batchnorm_fp32))
+print("loss_scale = {}".format(args.loss_scale), type(args.loss_scale))
+
+
+print("\nCUDNN VERSION: {}\n".format(torch.backends.cudnn.version()))
+# --deterministic: turn off cudnn autotuning, request deterministic kernels, seed with local_rank.
+if args.deterministic:
+    cudnn.benchmark = False
+    cudnn.deterministic = True
+    torch.manual_seed(args.local_rank)
+    torch.set_printoptions(precision=10)
+
+def main():
+    """Build the model, optimizer and loaders from `args`, then evaluate once or run the training loop."""
+    global best_prec1, args
+
+    args.distributed = False
+    if 'WORLD_SIZE' in os.environ:
+        args.distributed = int(os.environ['WORLD_SIZE']) > 1
+
+    args.gpu = 0
+    args.world_size = 1
+    if args.distributed:
+        args.gpu = args.local_rank % torch.cuda.device_count()
+        torch.cuda.set_device(args.gpu)
+        torch.distributed.init_process_group(backend='nccl', init_method='env://')
+        args.world_size = torch.distributed.get_world_size()
+
+    assert torch.backends.cudnn.enabled, "Amp requires cudnn backend to be enabled."
+
+    # create model
+    if args.pretrained:
+        print("=> using pre-trained model '{}'".format(args.arch))
+        model = models.__dict__[args.arch](pretrained=True)
+    else:
+        print("=> creating model '{}'".format(args.arch))
+        model = models.__dict__[args.arch]()
+
+    if args.sync_bn:
+        import apex
+        print("using apex synced BN")
+        model = apex.parallel.convert_syncbn_model(model)
+
+    model = model.cuda()
+
+    # Scale learning rate based on global batch size
+    args.lr = args.lr*float(args.batch_size*args.world_size)/256.
+    if args.fused_adam:
+        optimizer = optimizers.FusedAdam(model.parameters())  # NOTE(review): args.lr is not passed here
+    else:
+        optimizer = torch.optim.SGD(model.parameters(), args.lr,
+                                    momentum=args.momentum,
+                                    weight_decay=args.weight_decay)
+
+    model, optimizer = amp.initialize(
+        model, optimizer,
+        # enabled=False,
+        opt_level=args.opt_level,
+        keep_batchnorm_fp32=args.keep_batchnorm_fp32,
+        loss_scale=args.loss_scale
+        )
+
+    if args.distributed:
+        # By default, apex.parallel.DistributedDataParallel overlaps communication with
+        # computation in the backward pass.
+        # model = DDP(model)
+        # delay_allreduce delays all communication to the end of the backward pass.
+        model = DDP(model, delay_allreduce=True)
+
+    # define loss function (criterion) and optimizer
+    criterion = nn.CrossEntropyLoss().cuda()
+
+    # Optionally resume from a checkpoint
+    if args.resume:
+        # Use a local scope to avoid dangling references
+        def resume():
+            global best_prec1  # without this the assignment below would only bind a local of resume()
+            if os.path.isfile(args.resume):
+                print("=> loading checkpoint '{}'".format(args.resume))
+                checkpoint = torch.load(args.resume, map_location = lambda storage, loc: storage.cuda(args.gpu))
+                args.start_epoch = checkpoint['epoch']
+                best_prec1 = checkpoint['best_prec1']
+                model.load_state_dict(checkpoint['state_dict'])
+                optimizer.load_state_dict(checkpoint['optimizer'])
+                print("=> loaded checkpoint '{}' (epoch {})"
+                      .format(args.resume, checkpoint['epoch']))
+            else:
+                print("=> no checkpoint found at '{}'".format(args.resume))
+        resume()
+
+    # Data loading code
+    traindir = os.path.join(args.data, 'train')
+    valdir = os.path.join(args.data, 'val')
+
+    if(args.arch == "inception_v3"):
+        crop_size = 299
+        val_size = 320 # I chose this value arbitrarily, we can adjust.
+    else:
+        crop_size = 224
+        val_size = 256
+
+    train_dataset = datasets.ImageFolder(
+        traindir,
+        transforms.Compose([
+            transforms.RandomResizedCrop(crop_size),
+            transforms.RandomHorizontalFlip(),
+            # transforms.ToTensor(), Too slow
+            # normalize,
+        ]))
+    val_dataset = datasets.ImageFolder(valdir, transforms.Compose([
+            transforms.Resize(val_size),
+            transforms.CenterCrop(crop_size),
+        ]))
+
+    train_sampler = None
+    val_sampler = None
+    if args.distributed:
+        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
+        val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset)
+
+    train_loader = torch.utils.data.DataLoader(
+        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
+        num_workers=args.workers, pin_memory=True, sampler=train_sampler, collate_fn=fast_collate)
+
+    val_loader = torch.utils.data.DataLoader(
+        val_dataset,
+        batch_size=args.batch_size, shuffle=False,
+        num_workers=args.workers, pin_memory=True,
+        sampler=val_sampler,
+        collate_fn=fast_collate)
+
+    if args.evaluate:
+        validate(val_loader, model, criterion)
+        return
+
+    for epoch in range(args.start_epoch, args.epochs):
+        if args.distributed:
+            train_sampler.set_epoch(epoch)
+
+        # train for one epoch
+        train(train_loader, model, criterion, optimizer, epoch)
+        if args.prof:
+            break
+        # evaluate on validation set
+        prec1 = validate(val_loader, model, criterion)
+
+        # remember best prec@1 and save checkpoint
+        if args.local_rank == 0:
+            is_best = prec1 > best_prec1
+            best_prec1 = max(prec1, best_prec1)
+            save_checkpoint({
+                'epoch': epoch + 1,
+                'arch': args.arch,
+                'state_dict': model.state_dict(),
+                'best_prec1': best_prec1,
+                'optimizer' : optimizer.state_dict(),
+            }, is_best)
+
+class data_prefetcher():
+    """Iterates a loader while staging the next batch on the GPU via a separate CUDA stream."""
+    def __init__(self, loader):
+        self.loader = iter(loader)
+        self.stream = torch.cuda.Stream()
+        # Three per-channel values scaled by 255, shaped (1,3,1,1) so they broadcast over a batch.
+        self.mean = torch.tensor([0.485 * 255, 0.456 * 255, 0.406 * 255]).cuda().view(1,3,1,1)
+        self.std = torch.tensor([0.229 * 255, 0.224 * 255, 0.225 * 255]).cuda().view(1,3,1,1)
+        # With Amp, it isn't necessary to manually convert data to half.
+        # Stage the first batch immediately so the first next() call has data to return.
+        self.preload()
+
+    def preload(self):
+        """Fetch one batch and move/normalize it on self.stream; sets both slots to None when exhausted."""
+        try:
+            self.next_input, self.next_target = next(self.loader)
+        except StopIteration:
+            self.next_input = None
+            self.next_target = None
+            return
+        with torch.cuda.stream(self.stream):
+            self.next_input = self.next_input.cuda(non_blocking=True)
+            self.next_target = self.next_target.cuda(non_blocking=True)
+            # With Amp, it isn't necessary to manually convert data to half,
+            # so the input is always converted to float before normalization.
+            # sub_/div_ below normalize in place with the per-channel mean and std.
+            self.next_input = self.next_input.float()
+            self.next_input = self.next_input.sub_(self.mean).div_(self.std)
+
+    def next(self):
+        torch.cuda.current_stream().wait_stream(self.stream)  # wait for the staged batch to be ready
+        input = self.next_input
+        target = self.next_target
+        self.preload()  # begin staging the following batch
+        return input, target
+
+
+def train(train_loader, model, criterion, optimizer, epoch):
+    """Run training iterations over `train_loader`, logging loss/speed samples into a run-info dict."""
+    batch_time = AverageMeter()
+    data_time = AverageMeter()
+    losses = AverageMeter()
+    top1 = AverageMeter()
+    top5 = AverageMeter()
+    # switch to train mode
+    model.train()
+    end = time.time()
+    # Per-sample log; written to disk below once it holds args.prints_to_process entries.
+    run_info_dict = {"Iteration" : [],
+                     "Loss" : [],
+                     "Speed" : []}
+
+    prefetcher = data_prefetcher(train_loader)
+    input, target = prefetcher.next()
+    i = -1
+    while input is not None:
+        i += 1
+
+        # No learning rate warmup for this test, to expose bitwise inaccuracies more quickly
+        # adjust_learning_rate(optimizer, epoch, i, len(train_loader))
+
+        if args.prof:
+            if i > 10:
+                break
+        # measure data loading time
+        data_time.update(time.time() - end)
+
+        # compute output
+        output = model(input)
+        loss = criterion(output, target)
+
+        # measure accuracy and record loss
+        prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
+
+        if args.distributed:
+            reduced_loss = reduce_tensor(loss.data)
+            prec1 = reduce_tensor(prec1)
+            prec5 = reduce_tensor(prec5)
+        else:
+            reduced_loss = loss.data
+
+        losses.update(to_python_float(reduced_loss), input.size(0))
+        top1.update(to_python_float(prec1), input.size(0))
+        top5.update(to_python_float(prec5), input.size(0))
+
+        # compute gradient and do SGD step
+        optimizer.zero_grad()
+
+        with amp.scale_loss(loss, optimizer) as scaled_loss:
+            scaled_loss.backward()
+
+        # for param in model.parameters():
+        #     print(param.data.double().sum().item(), param.grad.data.double().sum().item())
+
+        # torch.cuda.synchronize()
+        torch.cuda.nvtx.range_push("step")
+        optimizer.step()
+        torch.cuda.nvtx.range_pop()
+
+        torch.cuda.synchronize()
+        # measure elapsed time
+        batch_time.update(time.time() - end)
+
+        end = time.time()
+
+        # If you decide to refactor this test, like examples/imagenet, to sample the loss every
+        # print_freq iterations, make sure to move this prefetching below the accuracy calculation.
+        input, target = prefetcher.next()
+
+        if i % args.print_freq == 0 and i > 1:
+            if args.local_rank == 0:
+                print('Epoch: [{0}][{1}/{2}]\t'
+                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
+                      'Speed {3:.3f} ({4:.3f})\t'
+                      'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
+                      'Loss {loss.val:.10f} ({loss.avg:.4f})\t'
+                      'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
+                      'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
+                       epoch, i, len(train_loader),
+                       args.world_size * args.batch_size / batch_time.val,
+                       args.world_size * args.batch_size / batch_time.avg,
+                       batch_time=batch_time,
+                       data_time=data_time, loss=losses, top1=top1, top5=top5))
+            run_info_dict["Iteration"].append(i)  # recorded on every rank, not only rank 0
+            run_info_dict["Loss"].append(losses.val)
+            run_info_dict["Speed"].append(args.world_size * args.batch_size / batch_time.val)
+            if len(run_info_dict["Loss"]) == args.prints_to_process:
+                if args.local_rank == 0:
+                    torch.save(run_info_dict,
+                               str(args.has_ext) + "_" + str(args.opt_level) + "_" +
+                               str(args.loss_scale) + "_" + str(args.keep_batchnorm_fp32) + "_" +
+                               str(args.fused_adam))
+                quit()  # every rank stops here; only local rank 0 wrote the file
+
+
+def validate(val_loader, model, criterion):
+    """Evaluate `model` on `val_loader` without gradients and return the average top-1 precision."""
+    batch_time = AverageMeter()
+    losses = AverageMeter()
+    top1 = AverageMeter()
+    top5 = AverageMeter()
+    # switch to evaluate mode
+    model.eval()
+
+    end = time.time()
+
+    prefetcher = data_prefetcher(val_loader)
+    input, target = prefetcher.next()
+    i = -1
+    while input is not None:
+        i += 1
+
+        # compute output
+        with torch.no_grad():
+            output = model(input)
+            loss = criterion(output, target)
+
+        # measure accuracy and record loss
+        prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
+
+        if args.distributed:
+            reduced_loss = reduce_tensor(loss.data)
+            prec1 = reduce_tensor(prec1)
+            prec5 = reduce_tensor(prec5)
+        else:
+            reduced_loss = loss.data
+
+        losses.update(to_python_float(reduced_loss), input.size(0))
+        top1.update(to_python_float(prec1), input.size(0))
+        top5.update(to_python_float(prec5), input.size(0))
+
+        # measure elapsed time
+        batch_time.update(time.time() - end)
+        end = time.time()
+
+        if args.local_rank == 0 and i % args.print_freq == 0:
+            print('Test: [{0}/{1}]\t'
+                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
+                  'Speed {2:.3f} ({3:.3f})\t'
+                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
+                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
+                  'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
+                   i, len(val_loader),
+                   args.world_size * args.batch_size / batch_time.val,
+                   args.world_size * args.batch_size / batch_time.avg,
+                   batch_time=batch_time, loss=losses,
+                   top1=top1, top5=top5))
+
+        input, target = prefetcher.next()
+
+    print(' * Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f}'
+          .format(top1=top1, top5=top5))
+
+    return top1.avg
+
+
+def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
+    """Save `state` to `filename`; if `is_best`, also copy that file to 'model_best.pth.tar'."""
+    torch.save(state, filename)
+    if is_best:
+        shutil.copyfile(src=filename, dst='model_best.pth.tar')
+
+class AverageMeter(object):
+    """Tracks the latest value plus a count-weighted running sum and average."""
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        """Set every tracked statistic back to zero."""
+        self.val = self.avg = 0
+        self.sum = self.count = 0
+
+    def update(self, val, n=1):
+        """Record `val` as the newest value, counted as `n` samples."""
+        self.val = val
+        self.count += n
+        self.sum += val * n
+        self.avg = self.sum / self.count
+
+
+def adjust_learning_rate(optimizer, epoch, step, len_epoch):
+    """LR schedule that should yield 76% converged accuracy with batch size 256
+
+    The base rate args.lr is multiplied by 0.1 once per 30 completed epochs, plus
+    once more from epoch 80 on; during the first 5 epochs it is additionally
+    ramped linearly with the global step.  The result is written into every
+    param group of `optimizer`.
+    """
+    decay_steps = epoch // 30 + (1 if epoch >= 80 else 0)
+    new_lr = args.lr*(0.1**decay_steps)
+
+    # Warmup
+    if epoch < 5:
+        progress = float(1 + step + epoch*len_epoch)
+        new_lr = new_lr*progress/(5.*len_epoch)
+
+    for group in optimizer.param_groups:
+        group['lr'] = new_lr
+
+
+def accuracy(output, target, topk=(1,)):
+    """Computes the precision@k for the specified values of k
+
+    `output` holds per-class scores of shape (batch, classes) and `target` the
+    class indices of shape (batch,).  Returns one 1-element tensor per k with
+    the percentage of samples whose target is among the k highest scores.
+    """
+    maxk = max(topk)
+    batch_size = target.size(0)
+    _, pred = output.topk(maxk, 1, True, True)
+    pred = pred.t()  # (maxk, batch): row r holds every sample's rank-r prediction
+    correct = pred.eq(target.view(1, -1).expand_as(pred))
+    # reshape, not view: rows sliced from the transposed layout need not be contiguous.
+    return [correct[:k].reshape(-1).float().sum(0, keepdim=True).mul_(100.0 / batch_size) for k in topk]
+
+
+def reduce_tensor(tensor):
+    """Return a copy of `tensor` summed across all processes and divided by args.world_size."""
+    rt = tensor.clone()
+    dist.all_reduce(rt, op=dist.ReduceOp.SUM)  # ReduceOp supersedes the deprecated reduce_op alias
+    return rt.div_(args.world_size)
+
+if __name__ == '__main__':
+    main()  # argument parsing already happened at import time, above
diff --git a/apex/tests/L1/common/run_test.sh b/apex/tests/L1/common/run_test.sh
new file mode 100644
index 0000000000000000000000000000000000000000..f4ae06c80ee32e9dbabee3614a98081865e56816
--- /dev/null
+++ b/apex/tests/L1/common/run_test.sh
@@ -0,0 +1,144 @@
+#!/bin/bash
+
+print_banner() {  # print $1 as black-on-green text framed by blank lines
+  printf "\n\n\n\e[30m\e[42m%s\e[0m\n\n\n\n" "$1"  # %s keeps any '%' or '\' in $1 literal
+}
+
+print_banner "Distributed status:  $1"
+# Positional arguments: $1 = "single_gpu" or "distributed", $2 = dataset directory, $3 = optional flag.
+echo $2
+DATADIR=$2
+# NOTE(review): USE_BASELINE is set here but never referenced again in this script.
+if [ -n "$3" ]
+then
+  USE_BASELINE=""
+else
+  USE_BASELINE="--use_baseline"
+fi
+# BASE_CMD stays unset if $1 is neither of the two values tested below.
+if [ "$1" == "single_gpu" ]
+then
+  BASE_CMD="python main_amp.py -a resnet50 --b 128 --workers 4 --deterministic --prints-to-process 5"
+fi
+
+if [ "$1" == "distributed" ]
+then
+  BASE_CMD="python -m torch.distributed.launch --nproc_per_node=2 main_amp.py -a resnet50 --b 128 --workers 4 --deterministic --prints-to-process 5"
+fi
+
+ADAM_ARGS="--opt-level O2 --keep-batchnorm-fp32 False --fused-adam"
+# Option sets combined below; an empty entry passes no flag, leaving main_amp.py's default.
+keep_batchnorms=(
+""
+"--keep-batchnorm-fp32 True"
+"--keep-batchnorm-fp32 False"
+)
+
+loss_scales=(
+""
+"--loss-scale 1.0"
+"--loss-scale 128.0"
+"--loss-scale dynamic"
+)
+
+opt_levels=(
+"O0"
+"O1"
+"O2"
+"O3"
+)
+# Remove result files from an earlier run; this precedes "set -e", so a failing rm does not abort.
+rm True*
+rm False*
+
+set -e
+
+print_banner "Installing Apex with --cuda_ext and --cpp_ext"
+
+pushd ../../..
+pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
+popd
+# First pass: every option combination on the extension build, run with --has-ext.
+for opt_level in "${opt_levels[@]}"
+do
+  for loss_scale in "${loss_scales[@]}"
+  do
+    for keep_batchnorm in "${keep_batchnorms[@]}"
+    do
+      if [ "$opt_level" == "O1" ] && [ -n "${keep_batchnorm}" ]
+      then
+        print_banner "Skipping ${opt_level} ${loss_scale} ${keep_batchnorm}"
+        continue
+      fi
+      print_banner "${BASE_CMD} --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} --has-ext $DATADIR"
+      set -x
+      ${BASE_CMD} --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} --has-ext $DATADIR
+      set +x
+    done
+  done
+done
+
+# Handle FusedAdam separately due to limited support.
+# FusedAdam will not be tested for bitwise accuracy against the Python implementation.
+# The L0 tests already do so.  These tests are here to ensure that it actually runs,
+# and get an idea of performance.
+for loss_scale in "${loss_scales[@]}"
+do
+  print_banner "${BASE_CMD} ${ADAM_ARGS} ${loss_scale} --has-ext $DATADIR"
+  set -x
+  ${BASE_CMD} ${ADAM_ARGS} ${loss_scale} --has-ext $DATADIR
+  set +x
+done
+
+print_banner "Reinstalling apex without extensions"
+# Plain pip install, i.e. without the --cpp_ext/--cuda_ext build options.
+pushd ../../..
+pip install -v --no-cache-dir .
+popd
+# Second pass: the same combinations again, this time without --has-ext.
+for opt_level in "${opt_levels[@]}"
+do
+  for loss_scale in "${loss_scales[@]}"
+  do
+    for keep_batchnorm in "${keep_batchnorms[@]}"
+    do
+      if [ "$opt_level" == "O1" ] && [ -n "${keep_batchnorm}" ]
+      then
+        print_banner "Skipping ${opt_level} ${loss_scale} ${keep_batchnorm}"
+        continue
+      fi
+      print_banner "${BASE_CMD} --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} $DATADIR"
+      set -x
+      ${BASE_CMD} --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} $DATADIR
+      set +x
+    done
+  done
+done
+
+print_banner "Checking for bitwise accuracy between Python-only and cpp/cuda extension installs"
+# NOTE(review): --use_baseline is hard-coded below; the USE_BASELINE variable is not consulted.
+for opt_level in "${opt_levels[@]}"
+do
+  for loss_scale in "${loss_scales[@]}"
+  do
+    for keep_batchnorm in "${keep_batchnorms[@]}"
+    do
+      echo ""
+      if [ "$opt_level" == "O1" ] && [ -n "${keep_batchnorm}" ]
+      then
+        echo "Skipping ${opt_level} ${loss_scale} ${keep_batchnorm}"
+        continue
+      fi
+      echo "${BASE_CMD} --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} [--has-ext] $DATADIR"
+      set -x
+      python compare.py --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} --use_baseline
+      set +x
+    done
+  done
+done
+
+print_banner "Reinstalling Apex with --cuda_ext and --cpp_ext"
+# Leave the environment with the extension build installed again.
+pushd ../../..
+pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
+popd
diff --git a/apex/tests/L1/cross_product/run.sh b/apex/tests/L1/cross_product/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..3b4d458894a36db4853fea89eb27fd4ab8b3814d
--- /dev/null
+++ b/apex/tests/L1/cross_product/run.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+# NOTE(review): DATADIR below is a machine-specific absolute path; adjust before running elsewhere.
+DATADIR="/home/mcarilli/Desktop/pt18data/apex_stale/examples/imagenet/bare_metal_train_val/"
+# DATADIR="/opt/home/apex/examples/imagenet/"
+cp ../common/* .
+bash run_test.sh single_gpu $1 $DATADIR yes  # unquoted $1 vanishes when empty, shifting DATADIR into position 2
diff --git a/apex/tests/L1/cross_product_distributed/run.sh b/apex/tests/L1/cross_product_distributed/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..917ec11e83227a4083a4dd572c5ab3f1a6a3cca7
--- /dev/null
+++ b/apex/tests/L1/cross_product_distributed/run.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+# Copy the shared test files here, then run them in distributed mode; $1 is forwarded as run_test.sh's $2.
+cp ../common/* .
+bash run_test.sh distributed $1
diff --git a/apex/tests/distributed/DDP/ddp_race_condition_test.py b/apex/tests/distributed/DDP/ddp_race_condition_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..298baeb0571f7381ae880761d03b7573af7fb69a
--- /dev/null
+++ b/apex/tests/distributed/DDP/ddp_race_condition_test.py
@@ -0,0 +1,68 @@
+import torch
+import torch.distributed as dist
+from torch.nn import Parameter
+from torch.nn import Module
+from apex.parallel import DistributedDataParallel as DDP
+import argparse
+import os
+
+
+parser = argparse.ArgumentParser(description='allreduce hook example')
+parser.add_argument("--local_rank", default=0, type=int)
+args = parser.parse_args()
+# Distributed mode is enabled only when the WORLD_SIZE environment variable is greater than 1.
+args.distributed = False
+if 'WORLD_SIZE' in os.environ:
+    args.distributed = int(os.environ['WORLD_SIZE']) > 1
+# NOTE(review): args.gpu and args.world_size are only set on the distributed path.
+if args.distributed:
+    args.gpu = args.local_rank % torch.cuda.device_count()
+    torch.cuda.set_device(args.gpu)
+    torch.distributed.init_process_group(backend='nccl',
+                                         init_method='env://')
+    args.world_size = torch.distributed.get_world_size()
+# Seed with the local rank, so each process draws a different random stream.
+torch.set_printoptions(precision=10)
+torch.manual_seed(args.local_rank)
+
+class Model(Module):
+    """Two flat CUDA parameters, `a` filled with 1.0 and `b` with 2.0; forward is (input*a)*b."""
+    def __init__(self):
+        super(Model, self).__init__()
+        self.a, self.b = (Parameter(torch.cuda.FloatTensor(4096*4096).fill_(v)) for v in (1.0, 2.0))
+    def forward(self, input):
+        return (input*self.a)*self.b
+
+model = Model()
+# model = DDP(model, message_size=1, gradient_predivide_factor=8.0)
+model = DDP(model, delay_allreduce=True)
+# model = DDP(model, message_size=1, allreduce_trigger_params=[model.b])
+# Input buffer; allocated once and refilled in place at the top of every iteration.
+x = torch.cuda.FloatTensor(4096*4096)
+
+passed = True
+torch.cuda.cudart().cudaProfilerStart()
+for i in range(10):
+    x.fill_(i + args.local_rank) # fill x with new values every iteration for sanity
+    model.zero_grad()
+    out = model(x)
+    loss = out.sum()
+    # torch.cuda.nvtx.range_push("backward")
+    loss.backward()
+    # torch.cuda.nvtx.range_pop()
+    # Compare each parameter's summed gradient with the value computed inside info().
+    # torch.cuda.nvtx.range_push("synchronize() + info")
+    # torch.cuda.synchronize()
+    print("i = {}".format(i))
+    def info(name, param, val):
+        expected = val*4096*4096*(2.*i+1)/2.  # NOTE(review): presumably assumes 2 ranks (x = i + local_rank averaged over ranks 0 and 1) — confirm
+        actual = param.grad.data.sum().item()
+        print(name+": grad.data_ptr() = {}, expected sum {}, got {}".format(
+              param.grad.data_ptr(), expected, actual))
+        return (expected == actual)  # exact float equality, no tolerance
+    if not info("model.a", model.module.a, 2.):  passed = False
+    if not info("model.b", model.module.b, 1.):  passed = False
+    # torch.cuda.nvtx.range_pop()
+torch.cuda.cudart().cudaProfilerStop()
+
+print("passed = ", passed)
diff --git a/apex/tests/distributed/DDP/run_race_test.sh b/apex/tests/distributed/DDP/run_race_test.sh
new file mode 100644
index 0000000000000000000000000000000000000000..2c2bd266cceba96f5884dde6d5f0bac256f4de02
--- /dev/null
+++ b/apex/tests/distributed/DDP/run_race_test.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+# Launch the race-condition test as two processes restricted to GPUs 0 and 1.
+CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 ddp_race_condition_test.py
diff --git a/apex/tests/distributed/amp_master_params/amp_master_params.py b/apex/tests/distributed/amp_master_params/amp_master_params.py
new file mode 100644
index 0000000000000000000000000000000000000000..4af5092f757a9f5bddf0470d9834a7935ebdafdb
--- /dev/null
+++ b/apex/tests/distributed/amp_master_params/amp_master_params.py
@@ -0,0 +1,70 @@
+import torch
+import argparse
+import os
+from apex import amp
+# FOR DISTRIBUTED: (can also use torch.nn.parallel.DistributedDataParallel instead)
+from apex.parallel import DistributedDataParallel
+
+parser = argparse.ArgumentParser()
+# FOR DISTRIBUTED:  Parse for the local_rank argument, which will be supplied
+# automatically by torch.distributed.launch.
+parser.add_argument("--local_rank", default=0, type=int)
+args = parser.parse_args()
+
+# FOR DISTRIBUTED:  If we are running under torch.distributed.launch,
+# the 'WORLD_SIZE' environment variable will also be set automatically.
+args.distributed = False
+if 'WORLD_SIZE' in os.environ:
+    args.distributed = int(os.environ['WORLD_SIZE']) > 1
+
+if args.distributed:
+    # FOR DISTRIBUTED:  Set the device according to local_rank.
+    torch.cuda.set_device(args.local_rank)
+
+    # FOR DISTRIBUTED:  Initialize the backend.  torch.distributed.launch will provide
+    # environment variables, and requires that you use init_method=`env://`.
+    torch.distributed.init_process_group(backend='nccl',
+                                         init_method='env://')
+
+    torch.manual_seed(torch.distributed.get_rank())
+
+torch.backends.cudnn.benchmark = True
+
+N, D_in, D_out = 64, 1024, 16
+
+# Each process receives its own batch of "fake input data" and "fake target data."
+# The "training loop" in each process just uses this fake batch over and over.
+# https://github.com/NVIDIA/apex/tree/master/examples/imagenet provides a more realistic
+# example of distributed data sampling for both training and validation.
+x = torch.randn(N, D_in, device='cuda')
+y = torch.randn(N, D_out, device='cuda')
+
+model = torch.nn.Linear(D_in, D_out).cuda()
+optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
+
+model, optimizer = amp.initialize(model, optimizer, opt_level="O2")
+
+if args.distributed:
+    # FOR DISTRIBUTED:  After amp.initialize, wrap the model with
+    # apex.parallel.DistributedDataParallel.
+    model = DistributedDataParallel(model)
+    # torch.nn.parallel.DistributedDataParallel is also fine, with some added args:
+    # model = torch.nn.parallel.DistributedDataParallel(model,
+    #                                                   device_ids=[args.local_rank],
+    #                                                   output_device=args.local_rank)
+
+loss_fn = torch.nn.MSELoss()
+
+for t in range(500):
+    optimizer.zero_grad()
+    y_pred = model(x)
+    loss = loss_fn(y_pred, y)
+    with amp.scale_loss(loss, optimizer) as scaled_loss:
+        scaled_loss.backward()
+    optimizer.step()
+
+rank = torch.distributed.get_rank() if args.distributed else 0  # no process group exists in a single-process run
+if args.local_rank == 0:
+    print("final loss = ", loss)
+torch.save(list(model.parameters()), "rank{}model.pth".format(rank))  # per-rank files are read back by compare.py
+torch.save(list(amp.master_params(optimizer)), "rank{}master.pth".format(rank))
diff --git a/apex/tests/distributed/amp_master_params/compare.py b/apex/tests/distributed/amp_master_params/compare.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5cbf20c1db56cb96c29775d6a974bb5fd4fffc9
--- /dev/null
+++ b/apex/tests/distributed/amp_master_params/compare.py
@@ -0,0 +1,28 @@
+import torch
+
+model_params_rank0 = torch.load("rank0model.pth",
+                           map_location = lambda storage, loc: storage.cuda(0))
+model_params_rank1 = torch.load("rank1model.pth",
+                                 map_location = lambda storage, loc: storage.cuda(0))
+master_params_rank0 = torch.load("rank0master.pth",
+                                 map_location = lambda storage, loc: storage.cuda(0))
+master_params_rank1 = torch.load("rank1master.pth",
+                                 map_location = lambda storage, loc: storage.cuda(0))
+
+for model_rank0, model_rank1, master_rank0, master_rank1 in zip(
+        model_params_rank0,
+        model_params_rank1,
+        master_params_rank0,
+        master_params_rank1):
+    assert torch.allclose(model_rank0, model_rank1), "Model param mismatch"
+    assert torch.allclose(master_rank0, master_rank1), "Master param mismatch"
+    # Some debugging/investigation assistance code:
+    # maxval, maxind = torch.max(((torch.abs(model_rank0).float())/torch.abs(master_rank0)).view(-1), 0)
+    # offending_val_half = model_rank0.view(-1)[maxind.item()]
+    # offending_val_float = master_rank0.view(-1)[maxind.item()]
+    # print(maxval.item(), maxind.item(), offending_val_half.item(), offending_val_float.item(),
+    #       offending_val_float.half().item())
+    # rtol needs to be > 2^-11 because of denormals...
+    assert torch.allclose(model_rank0, master_rank0.half(), rtol=.005), "Model-master mismatch"
+
+print("OK:  Model and master params match across ranks.")
diff --git a/apex/tests/distributed/amp_master_params/run.sh b/apex/tests/distributed/amp_master_params/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..8599dbbbc9b7c97a8b4898658dbf3c453b8263cc
--- /dev/null
+++ b/apex/tests/distributed/amp_master_params/run.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+python -m torch.distributed.launch --nproc_per_node=2 amp_master_params.py
+
+python compare.py
diff --git a/apex/tests/distributed/synced_batchnorm/single_gpu_unit_test.py b/apex/tests/distributed/synced_batchnorm/single_gpu_unit_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3dfcf651fc9109a8121054f1df8c465f518eb021
--- /dev/null
+++ b/apex/tests/distributed/synced_batchnorm/single_gpu_unit_test.py
@@ -0,0 +1,156 @@
+import torch
+import numpy as np
+import apex
+if True:
+    print("using setup tools")
+    import syncbn
+else:
+    print("using jit")
+    from torch.utils.cpp_extension import load
+    syncbn = load(name='syncbn', sources=['../../csrc/syncbn.cpp', '../../csrc/welford.cu'])
+
+def compare(desc, inp1, inp2, error):
+    """Return True if inp1 and inp2 match within rtol = atol = error; otherwise print the mismatching entries."""
+    lhs = inp1.clone().detach().cpu().numpy()
+    rhs = inp2.clone().detach().cpu().numpy()
+    matches = np.allclose(lhs, rhs, rtol=error, atol=error)
+    if not matches:
+        print(desc, matches)
+        delta = lhs - rhs
+        bad = (np.abs(delta) >= error + error * np.abs(rhs)).nonzero()  # positions outside the tolerance band
+        for label, values in (("dif    : ", delta), ("inp1   : ", lhs), ("inp2   : ", rhs)):
+            print(label, values[bad])
+    return matches
+
+feature_size = 10
+space_size = 16
+batch_size = 5
+
+
+error = 1e-5
+
+np.random.seed(1)
+dtype = np.float32
+inp = (np.random.randn(batch_size, feature_size, space_size, space_size)).astype(dtype)
+grad = (np.random.randn(batch_size, feature_size, space_size, space_size)).astype(dtype)
+weight = (np.random.randn(feature_size)).astype(dtype)
+bias = (np.random.randn(feature_size)).astype(dtype)
+
+type_tensor = torch.cuda.FloatTensor
+ref_tensor = torch.cuda.DoubleTensor
+
+inp_t = type_tensor(inp)
+weight_t = type_tensor(weight)
+bias_t = type_tensor(bias)
+
+inp_r = ref_tensor(inp.transpose(1, 0, 2, 3).reshape(feature_size, -1))
+inp2_r = ref_tensor(inp)
+weight_r = ref_tensor(weight).view(-1, 1, 1)
+bias_r = ref_tensor(bias).view(-1, 1, 1)
+
+grad_output_t = type_tensor(grad)
+
+m = inp_r.mean(1)
+b_v = inp_r.var(1, unbiased=False)
+unb_v = inp_r.var(1, unbiased=True)
+
+eps = 1e-5
+
+#mean, var, var_biased = syncbn.welford_mean_var(inp_t)
+mean, var_biased = syncbn.welford_mean_var(inp_t)
+inv_std = 1.0 / torch.sqrt(var_biased + eps)
+
+bn = torch.nn.BatchNorm2d(feature_size).cuda()
+bn.momentum = 1.0
+bn.weight.data = weight_t.clone()
+bn.bias.data = bias_t.clone()
+inp_bn = inp_t.clone().requires_grad_()
+grad_bn = grad_output_t.clone().detach()
+out_bn = bn(inp_bn)
+out_bn.backward(grad_bn)
+
+sbn = apex.parallel.SyncBatchNorm(feature_size).cuda()
+sbn.momentum = 1.0
+sbn.weight.data = weight_t.clone()
+sbn.bias.data = bias_t.clone()
+inp_sbn = inp_t.clone().requires_grad_()
+grad_sbn = grad_output_t.clone().detach()
+out_sbn = sbn(inp_sbn)
+out_sbn.backward(grad_sbn)
+
+sbn_c_last = apex.parallel.SyncBatchNorm(feature_size, channel_last=True).cuda()
+sbn_c_last.momentum = 1.0
+sbn_c_last.weight.data = weight_t.clone()
+sbn_c_last.bias.data = bias_t.clone()
+inp_sbn_c_last = inp_t.clone().transpose(-1, 1).contiguous().requires_grad_()
+grad_sbn_c_last = grad_output_t.clone().transpose(-1, 1).contiguous().detach()
+out_sbn_c_last = sbn_c_last(inp_sbn_c_last)
+out_sbn_c_last.backward(grad_sbn_c_last)
+
+sbn_result = True
+sbn_result_c_last = True
+bn_result = True
+
+sbn_result = compare("comparing mean: ", mean, m, error) and sbn_result
+#sbn_result = compare("comparing variance: ", var, unb_v, error) and sbn_result
+sbn_result = compare("comparing biased variance: ", var_biased, b_v, error) and sbn_result
+
+
+out = syncbn.batchnorm_forward(inp_t, mean, inv_std, weight_t, bias_t)
+out_r = weight_r * (inp2_r - m.view(-1, 1, 1)) * torch.rsqrt(b_v.view(-1,1,1) + eps) + bias_r
+
+sbn_result = compare("comparing output: ", out, out_r, error) and sbn_result
+compare("comparing bn output: ", out_bn, out_r, error)
+
+grad_output_t = type_tensor(grad)
+
+grad_output_r = ref_tensor(grad.transpose(1, 0, 2, 3).reshape(feature_size, -1))
+grad_output2_r = ref_tensor(grad)
+
+grad_bias_r = grad_output_r.sum(1)
+grad_weight_r = ((inp2_r - m.view(-1, 1, 1)) * torch.rsqrt(b_v.view(-1,1,1) + eps) * grad_output2_r).transpose(1,0).contiguous().view(feature_size, -1).sum(1)
+
+mean_dy_r = grad_output_r.mean(1)
+mean_dy_xmu_r = ((inp2_r - m.view(-1, 1, 1)) * grad_output2_r).transpose(1,0).contiguous().view(feature_size, -1).mean(1)
+
+grad_input_r = (grad_output2_r - mean_dy_r.view(-1, 1, 1) - (inp2_r - m.view(-1, 1, 1)) / (b_v.view(-1,1,1) + eps) * mean_dy_xmu_r.view(-1, 1, 1) ) * torch.rsqrt(b_v.view(-1,1,1) + eps) * weight_r.view(-1,1,1)
+
+mean_dy, mean_dy_xmu, grad_weight, grad_bias = syncbn.reduce_bn(grad_output_t, inp_t, mean, inv_std, weight_t)
+grad_input = syncbn.batchnorm_backward(grad_output_t, inp_t, mean, inv_std, weight_t, mean_dy, mean_dy_xmu)
+sbn_result = compare("comparing bias grad: ", grad_bias, grad_bias_r, error) and sbn_result
+sbn_result = compare("comparing weight grad: ", grad_weight, grad_weight_r, error) and sbn_result
+sbn_result = compare("comparing mean_dy grad: ", mean_dy, mean_dy_r, error) and sbn_result
+sbn_result = compare("comparing mean_dy_xmu grad: ", mean_dy_xmu, mean_dy_xmu_r, error) and sbn_result
+sbn_result = compare("comparing input grad: ", grad_input, grad_input_r, error) and sbn_result
+compare("comparing bn input grad: ", inp_bn.grad, grad_input_r, error)
+sbn_result = compare("comparing sbn input grad: ", inp_sbn.grad, grad_input_r, error) and sbn_result
+
+compare("comparing bn/sbn output: ", out_bn, out_sbn, error)
+sbn_result = compare("comparing running_mean: ", bn.running_mean.data, sbn.running_mean.data, error) and sbn_result
+sbn_result = compare("comparing running_variance: ", bn.running_var.data, sbn.running_var.data, error) and sbn_result
+compare("comparing grad_input: ", inp_bn.grad, inp_sbn.grad, error)
+compare("comparing grad_bias: ", bn.bias.grad, sbn.bias.grad, error)
+compare("comparing grad_bias bn to ref: ", bn.bias.grad, grad_bias_r, error)
+sbn_result = compare("comparing grad_bias sbn to ref: ", sbn.bias.grad, grad_bias_r, error) and sbn_result
+compare("comparing grad_weight: ", bn.weight.grad, sbn.weight.grad, error)
+compare("comparing grad_weight bn to ref: ", bn.weight.grad, grad_weight_r, error)
+sbn_result = compare("comparing grad_weight sbn to ref: ", sbn.weight.grad, grad_weight_r, error) and sbn_result
+
+compare("comparing channel last bn/sbn output: ", out_bn, out_sbn_c_last.transpose(-1, 1).contiguous(), error)
+sbn_result_c_last = compare("comparing channel last running_mean: ", bn.running_mean.data, sbn_c_last.running_mean.data, error) and sbn_result_c_last
+sbn_result_c_last = compare("comparing channel last running_variance: ", bn.running_var.data, sbn_c_last.running_var.data, error) and sbn_result_c_last
+compare("comparing channel last grad_input: ", inp_bn.grad, inp_sbn_c_last.grad.transpose(-1, 1).contiguous(), error)
+compare("comparing channel last grad_bias: ", bn.bias.grad, sbn_c_last.bias.grad, error)
+sbn_result_c_last = compare("comparing channel last grad_bias sbn to ref: ", sbn_c_last.bias.grad, grad_bias_r, error) and sbn_result_c_last
+compare("comparing channel last grad_weight: ", bn.weight.grad, sbn_c_last.weight.grad, error)
+sbn_result_c_last = compare("comparing channel last grad_weight sbn to ref: ", sbn_c_last.weight.grad, grad_weight_r, error) and sbn_result_c_last
+
+if sbn_result:
+    print("====SBN single gpu passed tests")
+else:
+    print("*SBN single gpu failed*")
+
+if sbn_result_c_last:
+    print("====SBN channel last single gpu passed tests")
+else:
+    print("*SBN channel last single gpu failed*")
diff --git a/apex/tests/distributed/synced_batchnorm/test_groups.py b/apex/tests/distributed/synced_batchnorm/test_groups.py
new file mode 100644
index 0000000000000000000000000000000000000000..d028cc397f738c6ce37c285b46bcf25c1e29d25b
--- /dev/null
+++ b/apex/tests/distributed/synced_batchnorm/test_groups.py
@@ -0,0 +1,185 @@
+import torch
+import numpy as np
+import apex
+import syncbn
+import os
+import argparse
+import torch.optim as optim
+
+def compare(desc, inp1, inp2, error):
+    """Return True if inp1 and inp2 match within rtol = atol = error; otherwise print the mismatching entries."""
+    lhs = inp1.clone().detach().cpu().numpy()
+    rhs = inp2.clone().detach().cpu().numpy()
+    matches = np.allclose(lhs, rhs, rtol=error, atol=error)
+    if not matches:
+        print(desc, matches)
+        delta = lhs - rhs
+        bad = (np.abs(delta) >= error + error * np.abs(rhs)).nonzero()  # positions outside the tolerance band
+        for label, values in (("dif    : ", delta), ("inp1   : ", lhs), ("inp2   : ", rhs)):
+            print(label, values[bad])
+    return matches
+
+feature_size = 10
+space_size = 40
+batch_size = 32
+
+
+from apex.parallel import DistributedDataParallel as DDP
+parser = argparse.ArgumentParser()
+parser.add_argument("--local_rank", default=0, type=int)
+parser.add_argument("--fp16", action='store_true', default=False)
+parser.add_argument("--fp64", action='store_true', default=False)
+parser.add_argument("--group_size", default=0, type=int)
+args = parser.parse_args()
+
+try:
+    args.world_size = int(os.environ['WORLD_SIZE'])
+except (KeyError, ValueError):  # WORLD_SIZE unset or not an integer: not started through the launcher
+    print("This is a multi-gpu test. To run it please use 'python -m torch.distributed.launch --nproc_per_node=<num gpus> test_groups.py <more options>'")
+    exit(1)
+if args.group_size <= 0:  # the default of 0 would make the modulo/division below raise ZeroDivisionError
+    args.group_size = args.world_size  # fall back to a single group spanning every process
+torch.cuda.set_device(args.local_rank)
+torch.distributed.init_process_group(backend='nccl', init_method='env://')
+start = (args.local_rank%args.group_size) * batch_size//args.group_size  # first batch index handled by this process within its group
+finish = (args.local_rank%args.group_size + 1) * batch_size//args.group_size  # one past the last batch index handled by this process
+
+error = 1e-5
+dtype = np.float32
+if args.fp16:
+    error = 1e-3
+    dtype = np.float16
+elif args.fp64:
+    error = 1e-8
+    dtype = np.float64
+
+
+np.random.seed(18 + args.local_rank//args.group_size)
+
+inp = np.random.randn(batch_size, feature_size, space_size, space_size).astype(dtype)
+grad = np.random.randn(batch_size, feature_size, space_size, space_size).astype(dtype)
+weight = np.random.randn(feature_size).astype(dtype)
+bias = np.random.randn(feature_size).astype(dtype)
+
+
+type_tensor = torch.cuda.FloatTensor
+if args.fp16:
+    type_tensor = torch.cuda.HalfTensor
+if args.fp64:
+    type_tensor = torch.cuda.DoubleTensor
+
+ref_tensor = torch.cuda.DoubleTensor
+
+inp_t = type_tensor(inp)
+weight_t = type_tensor(weight)
+bias_t = type_tensor(bias)
+
+inp_r = ref_tensor(inp.transpose(1, 0, 2, 3).reshape(feature_size, -1))
+inp2_r = ref_tensor(inp)
+weight_r = ref_tensor(weight).view(-1, 1, 1)
+bias_r = ref_tensor(bias).view(-1, 1, 1)
+
+grad_output_t = type_tensor(grad)
+
+m = inp_r.mean(1)
+b_v = inp_r.var(1, unbiased=False)
+unb_v = inp_r.var(1, unbiased=True)
+
+eps = 1e-5
+
+mean, var_biased = syncbn.welford_mean_var(inp_t)
+inv_std = 1.0 / torch.sqrt(var_biased + eps)
+
+bn = torch.nn.BatchNorm2d(feature_size).cuda()
+bn.momentum = 1.0
+bn.weight.data = weight_t.clone()
+bn.bias.data = bias_t.clone()
+if args.fp16:
+    bn.half()
+if args.fp64:
+    bn.double()
+bn = DDP(bn)
+inp_bn = inp_t.clone().requires_grad_()
+grad_bn = grad_output_t.clone().detach()
+out_bn = bn(inp_bn)
+out_bn.backward(grad_bn)
+# compensating the averaging over processes done by DDP
+# in order to produce mathematically equivalent result
+# https://github.com/NVIDIA/apex/issues/134#issuecomment-458307368
+for param in bn.parameters():
+    param.grad = param.grad / args.group_size
+bn_opt = optim.SGD(bn.parameters(), lr=1.0)
+
+sbn = apex.parallel.SyncBatchNorm(feature_size, process_group=apex.parallel.create_syncbn_process_group(args.group_size)).cuda()
+sbn.momentum = 1.0
+sbn.weight.data = weight_t.clone()
+sbn.bias.data = bias_t.clone()
+if args.fp16:
+    sbn.half()
+if args.fp64:
+    sbn.double()
+sbn = DDP(sbn)
+sbn_opt = optim.SGD(sbn.parameters(), lr=1.0)
+inp_sbn = inp_t.clone().requires_grad_()
+grad_sbn = grad_output_t.clone().detach()
+out_sbn = sbn(inp_sbn[start:finish])
+out_sbn.backward(grad_sbn[start:finish])
+
+sbn_result = True
+bn_result = True
+
+if args.local_rank == 0:
+    sbn_result = compare("comparing mean: ", mean, m, error) and sbn_result
+    sbn_result = compare("comparing biased variance: ", var_biased, b_v, error) and sbn_result
+
+out = syncbn.batchnorm_forward(inp_t, mean, inv_std, weight_t, bias_t)
+out_r = weight_r * (inp2_r - m.view(-1, 1, 1)) * torch.rsqrt(b_v.view(-1,1,1) + eps) + bias_r
+
+if args.local_rank == 0:
+    sbn_result = compare("comparing output: ", out, out_r, error) and sbn_result
+    compare("comparing bn output: ", out_bn, out_r, error)
+
+grad_output_t = type_tensor(grad)
+
+grad_output_r = ref_tensor(grad.transpose(1, 0, 2, 3).reshape(feature_size, -1))
+grad_output2_r = ref_tensor(grad)
+
+grad_bias_r = grad_output_r.sum(1)
+grad_weight_r = ((inp2_r - m.view(-1, 1, 1)) * torch.rsqrt(b_v.view(-1,1,1) + eps) * grad_output2_r).transpose(1,0).contiguous().view(feature_size, -1).sum(1)
+
+mean_dy_r = grad_output_r.mean(1)
+mean_dy_xmu_r = ((inp2_r - m.view(-1, 1, 1)) * grad_output2_r).transpose(1,0).contiguous().view(feature_size, -1).mean(1)
+
+grad_input_r = (grad_output2_r - mean_dy_r.view(-1, 1, 1) - (inp2_r - m.view(-1, 1, 1)) / (b_v.view(-1,1,1) + eps) * mean_dy_xmu_r.view(-1, 1, 1) ) * torch.rsqrt(b_v.view(-1,1,1) + eps) * weight_r.view(-1,1,1)
+
+mean_dy, mean_dy_xmu, grad_weight, grad_bias = syncbn.reduce_bn(grad_output_t, inp_t, mean, inv_std, weight_t)
+grad_input = syncbn.batchnorm_backward(grad_output_t, inp_t, mean, inv_std, weight_t, mean_dy, mean_dy_xmu)
+
+if args.local_rank == 0:
+    sbn_result = compare("comparing bias grad: ", grad_bias, grad_bias_r, error) and sbn_result
+    sbn_result = compare("comparing weight grad: ", grad_weight, grad_weight_r, error) and sbn_result
+    sbn_result = compare("comparing mean_dy grad: ", mean_dy, mean_dy_r, error) and sbn_result
+    sbn_result = compare("comparing mean_dy_xmu grad: ", mean_dy_xmu, mean_dy_xmu_r, error) and sbn_result
+    sbn_result = compare("comparing input grad: ", grad_input, grad_input_r, error) and sbn_result
+    compare("comparing bn input grad: ", inp_bn.grad, grad_input_r, error)
+
+if args.local_rank == 0:
+    sbn_result = compare("comparing running_mean: ", bn.module.running_mean.data, sbn.module.running_mean.data, error) and sbn_result
+    sbn_result = compare("comparing running_variance: ", bn.module.running_var.data, sbn.module.running_var.data, error) and sbn_result
+
+# execute by both
+sbn_result = compare("comparing layers output: ", out_bn[start:finish], out_sbn, error) and sbn_result  # keep the verdict instead of discarding it
+sbn_result = compare("comparing layers grad_input: ", inp_bn.grad[start:finish], inp_sbn.grad[start:finish], error) and sbn_result
+
+bn_opt.step()
+sbn_opt.step()
+
+if args.local_rank == 0:
+    compare("comparing bn vs sbn bias: ", bn.module.bias, sbn.module.bias, error)
+    compare("comparing bn vs sbn weight: ", bn.module.weight, sbn.module.weight, error)
+
+
+if sbn_result:
+    print("====SBN group test passed")
+else:
+    print("*SBN group test failed*")
diff --git a/apex/tests/distributed/synced_batchnorm/two_gpu_unit_test.py b/apex/tests/distributed/synced_batchnorm/two_gpu_unit_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f546af2f440564bda7ae4f9996750ecf0c03324
--- /dev/null
+++ b/apex/tests/distributed/synced_batchnorm/two_gpu_unit_test.py
@@ -0,0 +1,173 @@
+import torch
+import numpy as np
+import apex
+import syncbn
+import os
+import argparse
+import torch.optim as optim
+
+def compare(desc, inp1, inp2, error):
+    """Return True if inp1 and inp2 match within rtol = atol = error; otherwise print the mismatching entries."""
+    lhs = inp1.clone().detach().cpu().numpy()
+    rhs = inp2.clone().detach().cpu().numpy()
+    matches = np.allclose(lhs, rhs, rtol=error, atol=error)
+    if not matches:
+        print(desc, matches)
+        delta = lhs - rhs
+        bad = (np.abs(delta) >= error + error * np.abs(rhs)).nonzero()  # positions outside the tolerance band
+        for label, values in (("dif    : ", delta), ("inp1   : ", lhs), ("inp2   : ", rhs)):
+            print(label, values[bad])
+    return matches
+
+feature_size = 10
+space_size = 40
+batch_size = 32
+
+
+from apex.parallel import DistributedDataParallel as DDP
+parser = argparse.ArgumentParser()
+parser.add_argument("--local_rank", default=0, type=int)
+parser.add_argument("--fp16", action='store_true', default=False)
+parser.add_argument("--fp64", action='store_true', default=False)
+args = parser.parse_args()
+args.world_size = int(os.environ['WORLD_SIZE'])
+torch.cuda.set_device(args.local_rank)
+torch.distributed.init_process_group(backend='nccl', init_method='env://')
+start = args.local_rank * batch_size//args.world_size
+finish = (args.local_rank + 1) * batch_size//args.world_size
+
+error = 1e-5
+dtype = np.float32
+if args.fp16:
+    error = 1e-3
+    dtype = np.float16
+elif args.fp64:
+    error = 1e-8
+    dtype = np.float64
+
+np.random.seed(18)
+inp = np.random.randn(batch_size, feature_size, space_size, space_size).astype(dtype)
+grad = np.random.randn(batch_size, feature_size, space_size, space_size).astype(dtype)
+weight = np.random.randn(feature_size).astype(dtype)
+bias = np.random.randn(feature_size).astype(dtype)
+
+
+type_tensor = torch.cuda.FloatTensor
+if args.fp16:
+    type_tensor = torch.cuda.HalfTensor
+if args.fp64:
+    type_tensor = torch.cuda.DoubleTensor
+
+ref_tensor = torch.cuda.DoubleTensor
+
+inp_t = type_tensor(inp)
+weight_t = type_tensor(weight)
+bias_t = type_tensor(bias)
+
+inp_r = ref_tensor(inp.transpose(1, 0, 2, 3).reshape(feature_size, -1))
+inp2_r = ref_tensor(inp)
+weight_r = ref_tensor(weight).view(-1, 1, 1)
+bias_r = ref_tensor(bias).view(-1, 1, 1)
+
+grad_output_t = type_tensor(grad)
+
+m = inp_r.mean(1)
+b_v = inp_r.var(1, unbiased=False)
+unb_v = inp_r.var(1, unbiased=True)
+
+eps = 1e-5
+
+mean, var_biased = syncbn.welford_mean_var(inp_t)
+inv_std = 1.0 / torch.sqrt(var_biased + eps)
+
+bn = torch.nn.BatchNorm2d(feature_size).cuda()
+bn.momentum = 1.0
+bn.weight.data = weight_t.clone()
+bn.bias.data = bias_t.clone()
+if args.fp16:
+    bn.half()
+if args.fp64:
+    bn.double()
+inp_bn = inp_t.clone().requires_grad_()
+grad_bn = grad_output_t.clone().detach()
+out_bn = bn(inp_bn)
+out_bn.backward(grad_bn)
+# compensating the averaging over processes done by DDP
+# in order to produce mathematically equivalent result
+# https://github.com/NVIDIA/apex/issues/134#issuecomment-458307368
+for param in bn.parameters():
+    param.grad = param.grad / args.world_size
+bn_opt = optim.SGD(bn.parameters(), lr=1.0)
+
+sbn = apex.parallel.SyncBatchNorm(feature_size).cuda()
+sbn.momentum = 1.0
+sbn.weight.data = weight_t.clone()
+sbn.bias.data = bias_t.clone()
+if args.fp16:
+    sbn.half()
+if args.fp64:
+    sbn.double()
+sbn = DDP(sbn)
+sbn_opt = optim.SGD(sbn.parameters(), lr=1.0)
+inp_sbn = inp_t.clone().requires_grad_()
+grad_sbn = grad_output_t.clone().detach()
+out_sbn = sbn(inp_sbn[start:finish])
+out_sbn.backward(grad_sbn[start:finish])
+
+sbn_result = True
+bn_result = True
+
+if args.local_rank == 0:
+    sbn_result = compare("comparing mean: ", mean, m, error) and sbn_result
+    sbn_result = compare("comparing biased variance: ", var_biased, b_v, error) and sbn_result
+
+out = syncbn.batchnorm_forward(inp_t, mean, inv_std, weight_t, bias_t)
+out_r = weight_r * (inp2_r - m.view(-1, 1, 1)) * torch.rsqrt(b_v.view(-1,1,1) + eps) + bias_r
+
+if args.local_rank == 0:
+    sbn_result = compare("comparing output: ", out, out_r, error) and sbn_result
+    compare("comparing bn output: ", out_bn, out_r, error)
+
+grad_output_t = type_tensor(grad)
+
+grad_output_r = ref_tensor(grad.transpose(1, 0, 2, 3).reshape(feature_size, -1))
+grad_output2_r = ref_tensor(grad)
+
+grad_bias_r = grad_output_r.sum(1)
+grad_weight_r = ((inp2_r - m.view(-1, 1, 1)) * torch.rsqrt(b_v.view(-1,1,1) + eps) * grad_output2_r).transpose(1,0).contiguous().view(feature_size, -1).sum(1)
+
+mean_dy_r = grad_output_r.mean(1)
+mean_dy_xmu_r = ((inp2_r - m.view(-1, 1, 1)) * grad_output2_r).transpose(1,0).contiguous().view(feature_size, -1).mean(1)
+
+grad_input_r = (grad_output2_r - mean_dy_r.view(-1, 1, 1) - (inp2_r - m.view(-1, 1, 1)) / (b_v.view(-1,1,1) + eps) * mean_dy_xmu_r.view(-1, 1, 1) ) * torch.rsqrt(b_v.view(-1,1,1) + eps) * weight_r.view(-1,1,1)
+
+mean_dy, mean_dy_xmu, grad_weight, grad_bias = syncbn.reduce_bn(grad_output_t, inp_t, mean, inv_std, weight_t)
+grad_input = syncbn.batchnorm_backward(grad_output_t, inp_t, mean, inv_std, weight_t, mean_dy, mean_dy_xmu)
+if args.local_rank == 0:
+    sbn_result = compare("comparing bias grad: ", grad_bias, grad_bias_r, error) and sbn_result
+    sbn_result = compare("comparing weight grad: ", grad_weight, grad_weight_r, error) and sbn_result
+    sbn_result = compare("comparing mean_dy grad: ", mean_dy, mean_dy_r, error) and sbn_result
+    sbn_result = compare("comparing mean_dy_xmu grad: ", mean_dy_xmu, mean_dy_xmu_r, error) and sbn_result
+    sbn_result = compare("comparing input grad: ", grad_input, grad_input_r, error) and sbn_result
+    compare("comparing bn input grad: ", inp_bn.grad, grad_input_r, error)
+
+if args.local_rank == 0:
+    sbn_result = compare("comparing running_mean: ", bn.running_mean.data, sbn.module.running_mean.data, error) and sbn_result
+    sbn_result = compare("comparing running_variance: ", bn.running_var.data, sbn.module.running_var.data, error) and sbn_result
+
+# execute by both
+sbn_result = compare("comparing layers output: ", out_bn[start:finish], out_sbn, error) and sbn_result  # keep the verdict instead of discarding it
+sbn_result = compare("comparing layers grad_input: ", inp_bn.grad[start:finish], inp_sbn.grad[start:finish], error) and sbn_result
+
+bn_opt.step()
+sbn_opt.step()
+
+if args.local_rank == 0:
+    compare("comparing bn vs sbn bias: ", bn.bias, sbn.module.bias, error)
+    compare("comparing bn vs sbn weight: ", bn.weight, sbn.module.weight, error)
+
+
+if sbn_result:
+    print("====SBN two gpu passed tests")
+else:
+    print("*SBN two gpu failed*")
diff --git a/apex/tests/distributed/synced_batchnorm/unit_test.sh b/apex/tests/distributed/synced_batchnorm/unit_test.sh
new file mode 100644
index 0000000000000000000000000000000000000000..8e0f3290b469b8f1d79738e991a950ff45f679d8
--- /dev/null
+++ b/apex/tests/distributed/synced_batchnorm/unit_test.sh
@@ -0,0 +1,5 @@
+python single_gpu_unit_test.py
+python -m torch.distributed.launch --nproc_per_node=2 two_gpu_unit_test.py
+python -m torch.distributed.launch --nproc_per_node=2 two_gpu_unit_test.py --fp64
+#beware, you need a system with at least 4 gpus to test group_size<world_size
+python -m torch.distributed.launch --nproc_per_node=4 test_groups.py --group_size=2
diff --git a/apex/tests/docker_extension_builds/run.sh b/apex/tests/docker_extension_builds/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..f949690693ff080eb26405337dd1d3ac3eab5095
--- /dev/null
+++ b/apex/tests/docker_extension_builds/run.sh
@@ -0,0 +1,73 @@
+#!/bin/bash
+
+print_banner() {  # print $1 on a green background, padded with blank lines
+  printf "\n\n\n\e[30m\e[42m%s\e[0m\n\n\n\n" "$1"  # pass the message as an argument so a '%' in it is not read as a format directive
+}
+
+print_green() {  # print $1 on a green background
+  printf "\e[30m\e[42m%s\e[0m\n" "$1"  # pass the message as an argument so a '%' in it is not read as a format directive
+}
+
+print_red() {  # print $1 on a red background
+  printf "\e[30m\e[41m%s\e[0m\n" "$1"  # pass the message as an argument so a '%' in it is not read as a format directive
+}
+
+images=(
+"gitlab-master.nvidia.com:5005/dl/dgx/pytorch:19.03-py3-devel"
+"gitlab-master.nvidia.com:5005/dl/dgx/pytorch:master-py3-devel"
+"pytorch/pytorch:nightly-devel-cuda10.0-cudnn7"
+"pytorch/pytorch:1.1.0-cuda10.0-cudnn7.5-devel"
+"pytorch/pytorch:1.0.1-cuda10.0-cudnn7-devel"
+"pytorch/pytorch:1.0-cuda10.0-cudnn7-devel"
+"pytorch/pytorch:nightly-devel-cuda9.2-cudnn7"
+)
+
+branch="master"
+
+# Associative array for exit codes, pre-filled with "None" for every image
+declare -A exit_codes
+for image in "${images[@]}"  # expand the array; a bare `images` would loop once over the literal word
+do
+  exit_codes[$image]="None"
+done
+
+for image in "${images[@]}"
+do
+  print_banner "$image"
+  set -x
+  docker pull "$image"
+  # Trying python setup.py install instead of pip install to ensure direct access to error codes.
+  # Maybe pip install would be ok too but this works.
+  docker run --runtime=nvidia --rm "$image" /bin/bash -c "yes | pip uninstall apex; yes | pip uninstall apex; git clone https://github.com/NVIDIA/apex.git; cd apex; git checkout $branch; set -e;  python setup.py install --cuda_ext --cpp_ext"
+  exit_code=$?  # must directly follow docker run so it records that command's status
+  set +x
+  if [ "$exit_code" -ne 0 ]
+  then
+    print_red "Exit code: $exit_code"
+  else
+    print_green "Exit code: $exit_code"
+  fi
+  exit_codes[$image]=$exit_code  # remembered for the summary printed after all images have run
+done
+
+success=0  # exit status of this script: 0 = every image built, 1 = at least one failure
+for image in "${images[@]}"
+do
+  exit_code=${exit_codes[$image]}
+  if [ "$exit_code" != 0 ]  # string comparison on purpose: the stored value may be the placeholder "None"
+  then
+    print_red "$image : $exit_code"
+    success=1
+  else
+    print_green "$image : $exit_code"
+  fi
+done
+
+if [ "$success" != 0 ]
+then
+  print_red "Overall status:  failure"
+else
+  print_green "Overall status:  success"
+fi
+
+exit "$success"
diff --git a/jukebox/Interacting_with_Jukebox.ipynb b/jukebox/Interacting_with_Jukebox.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..616504d4ebc58682353fbdf855b5c2c0101a9e17
--- /dev/null
+++ b/jukebox/Interacting_with_Jukebox.ipynb
@@ -0,0 +1,961 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "name": "Interacting with Jukebox",
+      "provenance": [],
+      "collapsed_sections": [],
+      "machine_shape": "hm"
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "accelerator": "GPU"
+  },
+  "cells": [
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "sAdFGF-bqVMY",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "!pip install git+https://github.com/openai/jukebox.git"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "uq8uLwZCn0BV",
+        "colab_type": "text"
+      },
+      "source": [
+        "IMPORTANT NOTE ON SYSTEM REQUIREMENTS:\n",
+        "\n",
+        "If you are connecting to a hosted runtime, make sure it has a P100 GPU (optionally run !nvidia-smi to confirm). Go to Edit>Notebook Settings to set this.\n",
+        "\n",
+        "CoLab may first assign you a lower memory machine if you are using a hosted runtime.  If so, the first time you try to load the 5B model, it will run out of memory, and then you'll be prompted to restart with more memory (then return to the top of this CoLab).  If you continue to have memory issues after this (or run into issues on your own home setup), switch to the 1B model.\n",
+        "\n",
+        "If you are using a local GPU, we recommend V100 or P100 with 16GB GPU memory for best performance. For GPUs with less memory, we recommend using the 1B model and a smaller batch size throughout.  \n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "8qEqdj8u0gdN",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "!nvidia-smi"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "taDHgk1WCC_C",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "import jukebox\n",
+        "import torch as t\n",
+        "import librosa\n",
+        "import os\n",
+        "from IPython.display import Audio\n",
+        "from jukebox.make_models import make_vqvae, make_prior, MODELS, make_model\n",
+        "from jukebox.hparams import Hyperparams, setup_hparams\n",
+        "from jukebox.sample import sample_single_window, _sample, \\\n",
+        "                           sample_partial_window, upsample\n",
+        "from jukebox.utils.dist_utils import setup_dist_from_mpi\n",
+        "from jukebox.utils.torch_utils import empty_cache\n",
+        "rank, local_rank, device = setup_dist_from_mpi()"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "89FftI5kc-Az",
+        "colab_type": "text"
+      },
+      "source": [
+        "# Sample from the 5B or 1B Lyrics Model\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "65aR2OZxmfzq",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "model = \"5b_lyrics\" # or \"1b_lyrics\"     \n",
+        "hps = Hyperparams()\n",
+        "hps.sr = 44100\n",
+        "hps.n_samples = 3 if model=='5b_lyrics' else 8\n",
+        "hps.name = 'samples'\n",
+        "chunk_size = 16 if model==\"5b_lyrics\" else 32\n",
+        "max_batch_size = 3 if model==\"5b_lyrics\" else 16\n",
+        "hps.levels = 3\n",
+        "hps.hop_fraction = [.5,.5,.125]\n",
+        "\n",
+        "vqvae, *priors = MODELS[model]\n",
+        "vqvae = make_vqvae(setup_hparams(vqvae, dict(sample_length = 1048576)), device)\n",
+        "top_prior = make_prior(setup_hparams(priors[-1], dict()), vqvae, device)\n",
+        "\n"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "JYKiwkzy0Iyf",
+        "colab_type": "text"
+      },
+      "source": [
+        "Specify your choice of artist, genre, lyrics, and length of musical sample. "
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "-sY9aGHcZP-u",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "sample_length_in_seconds = 60          # Full length of musical sample to generate - we find songs in the 1 to 4 minute\n",
+        "                                       # range work well, with generation time proportional to sample length.  \n",
+        "                                       # This total length affects how quickly the model \n",
+        "                                       # progresses through lyrics (model also generates differently\n",
+        "                                       # depending on if it thinks it's in the beginning, middle, or end of sample)\n",
+        "\n",
+        "hps.sample_length = (int(sample_length_in_seconds*hps.sr)//top_prior.raw_to_tokens)*top_prior.raw_to_tokens\n",
+        "assert hps.sample_length >= top_prior.n_ctx*top_prior.raw_to_tokens, f'Please choose a larger sampling rate'"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab_type": "code",
+        "id": "qD0qxQeLaTR0",
+        "colab": {}
+      },
+      "source": [
+        "metas = [dict(artist = \"Zac Brown Band\",\n",
+        "            genre = \"Country\",\n",
+        "            total_length = hps.sample_length,\n",
+        "            offset = 0,\n",
+        "            lyrics = \"\"\"I met a traveller from an antique land,\n",
+        "            Who said—“Two vast and trunkless legs of stone\n",
+        "            Stand in the desert. . . . Near them, on the sand,\n",
+        "            Half sunk a shattered visage lies, whose frown,\n",
+        "            And wrinkled lip, and sneer of cold command,\n",
+        "            Tell that its sculptor well those passions read\n",
+        "            Which yet survive, stamped on these lifeless things,\n",
+        "            The hand that mocked them, and the heart that fed;\n",
+        "            And on the pedestal, these words appear:\n",
+        "            My name is Ozymandias, King of Kings;\n",
+        "            Look on my Works, ye Mighty, and despair!\n",
+        "            Nothing beside remains. Round the decay\n",
+        "            Of that colossal Wreck, boundless and bare\n",
+        "            The lone and level sands stretch far away\n",
+        "            \"\"\",\n",
+        "            ),\n",
+        "          ] * hps.n_samples\n",
+        "labels = [None, None, top_prior.labeller.get_batch_labels(metas, 'cuda')]"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "6PHC1XnEfV4Y",
+        "colab_type": "text"
+      },
+      "source": [
+        "Optionally adjust the sampling temperature (we've found .98 or .99 to be our favorite).  \n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab_type": "code",
+        "id": "eNwKyqYraTR9",
+        "colab": {}
+      },
+      "source": [
+        "sampling_temperature = .98\n",
+        "\n",
+        "lower_batch_size = 16\n",
+        "max_batch_size = 3 if model == \"5b_lyrics\" else 16\n",
+        "lower_level_chunk_size = 32\n",
+        "chunk_size = 16 if model == \"5b_lyrics\" else 32\n",
+        "sampling_kwargs = [dict(temp=.99, fp16=True, max_batch_size=lower_batch_size,\n",
+        "                        chunk_size=lower_level_chunk_size),\n",
+        "                    dict(temp=0.99, fp16=True, max_batch_size=lower_batch_size,\n",
+        "                         chunk_size=lower_level_chunk_size),\n",
+        "                    dict(temp=sampling_temperature, fp16=True, \n",
+        "                         max_batch_size=max_batch_size, chunk_size=chunk_size)]"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "S3j0gT3HfrRD",
+        "colab_type": "text"
+      },
+      "source": [
+        "Now we're ready to sample from the model. We'll generate the top level (2) first, followed by the first upsampling (level 1), and the second upsampling (0).  In this CoLab we load the top prior separately from the upsamplers, because of memory concerns on the hosted runtimes. If you are using a local machine, you can also load all models directly with make_models, and then use sample.py's ancestral_sampling to put this all in one step.\n",
+        "\n",
+        "After each level, we decode to raw audio and save the audio files.   \n",
+        "\n",
+        "This next cell will take a while (approximately 10 minutes per 20 seconds of music sample)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "2nET_YBEopyp",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "zs = [t.zeros(hps.n_samples,0,dtype=t.long, device='cuda') for _ in range(len(priors))]\n",
+        "zs = _sample(zs, labels, sampling_kwargs, [None, None, top_prior], [2], hps)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "-gxY9aqHqfLJ",
+        "colab_type": "text"
+      },
+      "source": [
+        "Listen to the results from the top level (note this will sound very noisy until we do the upsampling stage).  You may have more generated samples, depending on the batch size you requested."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "TPZENDGZqOOb",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "Audio(f'{hps.name}/level_2/item_0.wav')"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "EJc3bQxmusc6",
+        "colab_type": "text"
+      },
+      "source": [
+        "We are now done with the large top_prior model, and instead load the upsamplers."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "W5VLX0zRapIm",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "# Set this False if you are on a local machine that has enough memory (this allows you to do the\n",
+        "# lyrics alignment visualization during the upsampling stage). For a hosted runtime, \n",
+        "# we'll need to go ahead and delete the top_prior if you are using the 5b_lyrics model.\n",
+        "if True:\n",
+        "  del top_prior\n",
+        "  empty_cache()\n",
+        "  top_prior=None\n",
+        "upsamplers = [make_prior(setup_hparams(prior, dict()), vqvae, 'cpu') for prior in priors[:-1]]\n",
+        "labels[:2] = [prior.labeller.get_batch_labels(metas, 'cuda') for prior in upsamplers]"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "eH_jUhGDprAt",
+        "colab_type": "text"
+      },
+      "source": [
+        "Please note: this next upsampling step will take several hours.  At the free tier, Google CoLab lets you run for 12 hours.  As the upsampling is completed, samples will appear in the Files tab (you can access this at the left of the CoLab), under \"samples\" (or whatever hps.name is currently).  Level 1 is the partially upsampled version, and then Level 0 is fully completed."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "9lkJgLolpZ6w",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "zs = upsample(zs, labels, sampling_kwargs, [*upsamplers, top_prior], hps)\n"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "3SJgBYJPri55",
+        "colab_type": "text"
+      },
+      "source": [
+        "Listen to your final sample!"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "2ip2PPE0rgAb",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "Audio(f'{hps.name}/level_0/item_0.wav')"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "8JAgFxytwrLG",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "del upsamplers\n",
+        "empty_cache()"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "LpvvFH85bbBC",
+        "colab_type": "text"
+      },
+      "source": [
+        "# Co-Composing with the 5B or 1B Lyrics Model"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "nFDROuS7gFQY",
+        "colab_type": "text"
+      },
+      "source": [
+        "For more control over the generations, try co-composing with either the 5B or 1B Lyrics Models.  Again, specify your artist, genre, and lyrics. However, now instead of generating the entire sample, the model will return 3 short options for the opening of the piece (or up to 16 options if you use the 1B model instead).  Choose your favorite, and then continue the loop, for as long as you like.  Throughout these steps, you'll be listening to the audio at the top prior level, which means it will sound quite noisy.  When you are satisfied with your co-creation, continue on through the upsampling section. This will render the piece in higher audio quality.\n",
+        "\n",
+        "NOTE: CoLab will first assign you a lower memory machine if you are using a hosted runtime.  The next cell will run out of memory, and then you'll be prompted to restart with more memory (then return to the top of this CoLab).  If you continue to have memory issues after this (or run into issues on your own home setup), switch to the 1B model. "
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "3y-q8ifhGBlU",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "model = \"5b_lyrics\" # or \"1b_lyrics\"\n",
+        "hps = Hyperparams()\n",
+        "hps.sr = 44100\n",
+        "hps.n_samples = 3 if model=='5b_lyrics' else 16\n",
+        "hps.name = 'co_composer'\n",
+        "hps.sample_length = 1048576 if model==\"5b_lyrics\" else 786432 \n",
+        "chunk_size = 16 if model==\"5b_lyrics\" else 32\n",
+        "max_batch_size = 3 if model==\"5b_lyrics\" else 16\n",
+        "hps.hop_fraction = [.5, .5, .125] \n",
+        "hps.levels = 3\n",
+        "\n",
+        "vqvae, *priors = MODELS[model]\n",
+        "vqvae = make_vqvae(setup_hparams(vqvae, dict(sample_length = hps.sample_length)), device)\n",
+        "top_prior = make_prior(setup_hparams(priors[-1], dict()), vqvae, device)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "68hz4x7igq0c",
+        "colab_type": "text"
+      },
+      "source": [
+        "Choose your artist, genre, and lyrics here!"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "QDMvH_1zUHo6",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "total_sample_length_in_seconds = 120\n",
+        "metas = [dict(artist = \"Zac Brown Band\",\n",
+        "            genre = \"Country\",\n",
+        "            total_length = total_sample_length_in_seconds * hps.sr,\n",
+        "            offset = 0,\n",
+        "            lyrics = \"\"\"I met a traveller from an antique land,\n",
+        "            Who said—“Two vast and trunkless legs of stone\n",
+        "            Stand in the desert. . . . Near them, on the sand,\n",
+        "            Half sunk a shattered visage lies, whose frown,\n",
+        "            And wrinkled lip, and sneer of cold command,\n",
+        "            Tell that its sculptor well those passions read\n",
+        "            Which yet survive, stamped on these lifeless things,\n",
+        "            The hand that mocked them, and the heart that fed;\n",
+        "            And on the pedestal, these words appear:\n",
+        "            My name is Ozymandias, King of Kings;\n",
+        "            Look on my Works, ye Mighty, and despair!\n",
+        "            Nothing beside remains. Round the decay\n",
+        "            Of that colossal Wreck, boundless and bare\n",
+        "            The lone and level sands stretch far away\n",
+        "            \"\"\",\n",
+        "            ),\n",
+        "          ] * hps.n_samples\n",
+        "labels = top_prior.labeller.get_batch_labels(metas, 'cuda')"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "B9onZMEXh34f",
+        "colab_type": "text"
+      },
+      "source": [
+        "## Generate 3 options for the start of the song\n",
+        "\n",
+        "Initial generation is set to be 4 seconds long, but feel free to change this"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "c6peEj8I_HHO",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "def seconds_to_tokens(sec, sr, prior, chunk_size):\n",
+        "  tokens = sec * hps.sr // prior.raw_to_tokens\n",
+        "  tokens = ((tokens // chunk_size) + 1) * chunk_size\n",
+        "  assert tokens <= prior.n_ctx, 'Choose a shorter generation length to stay within the top prior context'\n",
+        "  return tokens"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "2gn2GXt3zt3y",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "initial_generation_in_seconds = 4\n",
+        "tokens_to_sample = seconds_to_tokens(initial_generation_in_seconds, hps.sr, top_prior, chunk_size)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "U0zcWcMoiigl",
+        "colab_type": "text"
+      },
+      "source": [
+        "Change the sampling temperature if you like (higher is more random).  Our favorite is in the range .98 to .995"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "NHbH68H7VMeO",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "sampling_temperature = .98\n",
+        "sampling_kwargs = dict(temp=sampling_temperature, fp16=True,\n",
+        "                       max_batch_size=max_batch_size, chunk_size=chunk_size)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "JGZEPe-WTt4g",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "zs=[t.zeros(hps.n_samples,0,dtype=t.long, device='cuda') for _ in range(3)]\n",
+        "zs=sample_partial_window(zs, labels, sampling_kwargs, 2, top_prior, tokens_to_sample, hps)\n",
+        "x = vqvae.decode(zs[2:], start_level=2).cpu().numpy()"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "mveN4Be8jK2J",
+        "colab_type": "text"
+      },
+      "source": [
+        "Listen to your generated samples, and then pick a favorite. If you don't like any, go back and rerun the cell above. \n",
+        "\n",
+        "**NOTE:** this is at the noisy top level; upsample fully (in the next section) to hear the final audio version."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab_type": "code",
+        "id": "LrJSGMhUOhZg",
+        "colab": {}
+      },
+      "source": [
+        "for i in range(hps.n_samples):\n",
+        "  librosa.output.write_wav(f'noisy_top_level_generation_{i}.wav', x[i], sr=44100)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab_type": "code",
+        "id": "rQ4ersQ5OhZr",
+        "colab": {}
+      },
+      "source": [
+        "Audio('noisy_top_level_generation_0.wav')"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab_type": "code",
+        "id": "-GdqzrGkOhZv",
+        "colab": {}
+      },
+      "source": [
+        "Audio('noisy_top_level_generation_1.wav')"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab_type": "code",
+        "id": "gE5S8hyZOhZy",
+        "colab": {}
+      },
+      "source": [
+        "Audio('noisy_top_level_generation_2.wav')"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "t2-mEJaqZfuS",
+        "colab_type": "text"
+      },
+      "source": [
+        "If you don't like any of the options, return a few cells back to \"Generate 3 options for the start of the song\" and rerun from there."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "o7CzSiv0MmFP",
+        "colab_type": "text"
+      },
+      "source": [
+        "## Choose your favorite sample and request longer generation\n",
+        "\n",
+        "---\n",
+        "\n",
+        "(Repeat from here)\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "j_XFtVi99CIY",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "my_choice=0"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "Pgk3sHHBLYoq",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "zs[2]=zs[2][my_choice].repeat(hps.n_samples,1)\n",
+        "t.save(zs, 'zs-checkpoint2.t')"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "W8Rd9xxm565S",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "# Set to True to load the previous checkpoint:\n",
+        "if False:\n",
+        "  zs=t.load('zs-checkpoint2.t') "
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "k12xjMgHkRGP",
+        "colab_type": "text"
+      },
+      "source": [
+        "Choose the length of the continuation.  The 1B model can generate up to 17 second samples and the 5B up to 23 seconds, but you'll want to pick a shorter continuation length so that it will be able to look back at what you've generated already.  Here we've chosen 4 seconds."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "h3_-0a07kHHG",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "continue_generation_in_seconds=4\n",
+        "tokens_to_sample = seconds_to_tokens(continue_generation_in_seconds, hps.sr, top_prior, chunk_size)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "GpPG3Ifqk8ue",
+        "colab_type": "text"
+      },
+      "source": [
+        "The next step asks the top prior to generate more of the sample. It'll take up to a few minutes, depending on the sample length you request."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "YoHkeSTaEyLj",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "zs = sample_partial_window(zs, labels, sampling_kwargs, 2, top_prior, tokens_to_sample, hps)\n",
+        "x = vqvae.decode(zs[2:], start_level=2).cpu().numpy()"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "ymhUqEdhleEi",
+        "colab_type": "text"
+      },
+      "source": [
+        "Now listen to the longer versions of the sample you selected, and again choose a favorite sample.  If you don't like any, return back to the cell where you can load the checkpoint, and continue again from there.\n",
+        "\n",
+        "When the samples start getting long, you might not always want to listen from the start, so change the playback start time later on if you like."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "2H1LNLTa_R6a",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "playback_start_time_in_seconds = 0 "
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "r4SBGAmsnJtH",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "for i in range(hps.n_samples):\n",
+        "  librosa.output.write_wav(f'top_level_continuation_{i}.wav', x[i][playback_start_time_in_seconds*44100:], sr=44100)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "2WeyE5Qtnmeo",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "Audio('top_level_continuation_0.wav')"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "BKtfEtcaazXE",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "Audio('top_level_continuation_1.wav')"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "7yrlS0XwK2S0",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "Audio('top_level_continuation_2.wav')"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "-OJT704dvnGv",
+        "colab_type": "text"
+      },
+      "source": [
+        "To make a longer song, return to \"Choose your favorite sample\" and loop through that again."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "RzCrkCZJvUcQ",
+        "colab_type": "text"
+      },
+      "source": [
+        "# Upsample Co-Composition to Higher Audio Quality"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "4MPgukwMmB0p",
+        "colab_type": "text"
+      },
+      "source": [
+        "Choose your favorite sample from your latest group of generations.  (If you haven't already gone through the Co-Composition block, make sure to do that first so you have a generation to upsample)."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "yv-pNNPHBQYC",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "choice = 0\n",
+        "select_best_sample = True  # Set false if you want to upsample all your samples \n",
+        "                           # upsampling sometimes yields subtly different results on multiple runs,\n",
+        "                           # so this way you can choose your favorite upsampling"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "v17cEAqyCgfo",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "if select_best_sample:\n",
+        "  zs[2]=zs[2][choice].repeat(zs[2].shape[0],1)\n",
+        "\n",
+        "t.save(zs, 'zs-top-level-final.t')"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "0YjK-Ac0tBfu",
+        "colab_type": "text"
+      },
+      "source": [
+        "Note: If you are using a CoLab hosted runtime on the free tier, you may want to download this zs-top-level-final.t file, and then restart an instance and load it in the next cell.  The free tier will last a maximum of 12 hours, and the upsampling stage can take many hours, depending on how long a sample you have generated."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "qqlR9368s3jJ",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "if False:\n",
+        "  zs = t.load('zs-top-level-final.t')\n",
+        "\n",
+        "assert zs[2].shape[1]>=2048, f'Please first generate at least 2048 tokens at the top level, currently you have {zs[2].shape[1]}'\n",
+        "hps.sample_length = zs[2].shape[1]*top_prior.raw_to_tokens"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "jzHwF_iqgIWM",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "# Set this False if you are on a local machine that has enough memory (this allows you to do the\n",
+        "# lyrics alignment visualization). For a hosted runtime, we'll need to go ahead and delete the top_prior\n",
+        "# if you are using the 5b_lyrics model.\n",
+        "if True:\n",
+        "  del top_prior\n",
+        "  empty_cache()\n",
+        "  top_prior=None\n",
+        "\n",
+        "upsamplers = [make_prior(setup_hparams(prior, dict()), vqvae, 'cpu') for prior in priors[:-1]]"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "q22Ier6YSkKS",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "sampling_kwargs = [dict(temp=.99, fp16=True, max_batch_size=16, chunk_size=32),\n",
+        "                    dict(temp=0.99, fp16=True, max_batch_size=16, chunk_size=32),\n",
+        "                    None]\n",
+        "\n",
+        "if type(labels)==dict:\n",
+        "  labels = [prior.labeller.get_batch_labels(metas, 'cuda') for prior in upsamplers] + [labels] "
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "T1MCa9_jnjpf",
+        "colab_type": "text"
+      },
+      "source": [
+        "This next step upsamples 2 levels.  The level_1 samples will be available after around one hour (depending on the length of your sample) and are saved under {hps.name}/level_1/item_0.wav, while the fully upsampled level_0 will likely take 4-12 hours. You can access the wav files down below, or using the \"Files\" panel at the left of this CoLab.\n",
+        "\n",
+        "(Please note, if you are using this CoLab on Google's free tier, you may want to download intermediate steps as the connection will last for a maximum 12 hours.)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "NcNT5qIRMmHq",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "zs = upsample(zs, labels, sampling_kwargs, [*upsamplers, top_prior], hps)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "W2jTYLPBc29M",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "Audio(f'{hps.name}/level_0/item_0.wav')"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    }
+  ]
+}
\ No newline at end of file
diff --git a/jukebox/__init__.py b/jukebox/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/jukebox/align.py b/jukebox/align.py
new file mode 100644
index 0000000000000000000000000000000000000000..8084b9739ae0ee26d4c01050c3db4ee84a58db73
--- /dev/null
+++ b/jukebox/align.py
@@ -0,0 +1,115 @@
+"""
+Get alignment from attn values
+1. run a forward pass on each hop, get attn values
+2. concat for all hops
+"""
+import numpy as np
+import torch as t
+from jukebox.utils.torch_utils import assert_shape, empty_cache
+from jukebox.hparams import Hyperparams
+from jukebox.make_models import make_model
+from jukebox.save_html import save_html
+from jukebox.utils.sample_utils import get_starts
+import fire
+
+def get_alignment(x, zs, labels, prior, fp16, hps):
+    """Build per-item lyric/audio alignment matrices from attention weights.
+
+    Runs ``prior.z_forward`` on every hop of the top-level codes (zero-padded
+    up to ``prior.n_ctx`` when shorter), keeps the ``prior.alignment_head``
+    slice of the weights requested for ``prior.alignment_layer`` and scatters
+    the hops into one ``(unpadded_length, len(full_tokens))`` array per batch
+    item; the list of those arrays is returned. ``x`` is not read. ``prior``
+    has ``.cuda()`` called before the forward passes and ``.cpu()`` after.
+    """
+    level = hps.levels - 1 # Top level used
+    n_ctx, n_tokens = prior.n_ctx, prior.n_tokens
+    z = zs[level]
+    bs, total_length = z.shape[0], z.shape[1]
+    padding_length = max(0, n_ctx - total_length)
+    if padding_length:
+        z = t.cat([z, t.zeros(bs, padding_length, dtype=z.dtype, device=z.device)], dim=1)
+        total_length = z.shape[1]
+    hop_length = int(hps.hop_fraction[level]*prior.n_ctx)
+    starts = get_starts(total_length, n_ctx, hop_length)  # loop-invariant, reused by both passes
+    alignment_head, alignment_layer = prior.alignment_head, prior.alignment_layer
+    attn_layers = {alignment_layer}
+    alignment_hops, indices_hops = {}, {}
+
+    prior.cuda()
+    empty_cache()
+    for start in starts:
+        end = start + n_ctx
+        # set y offset, sample_length and lyrics tokens
+        y, indices_hop = prior.get_y(labels, start, get_indices=True)
+        assert len(indices_hop) == bs
+        for indices in indices_hop:
+            assert len(indices) == n_tokens
+        w_hops = []
+        for z_i, y_i in zip(t.chunk(z, bs, dim=0), t.chunk(y, bs, dim=0)):
+            w_hop = prior.z_forward(z_i[:,start:end], [], y_i, fp16=fp16, get_attn_weights=attn_layers)
+            assert len(w_hop) == 1
+            w_hops.append(w_hop[0][:, alignment_head])
+            del w_hop
+        w = t.cat(w_hops, dim=0)
+        del w_hops
+        assert_shape(w, (bs, n_ctx, n_tokens))
+        alignment_hop = w.float().cpu().numpy()
+        assert_shape(alignment_hop, (bs, n_ctx, n_tokens))
+        del w
+        # alignment_hop has shape (bs, n_ctx, n_tokens)
+        # indices_hop is a list of len=bs, each entry of len hps.n_tokens
+        indices_hops[start] = indices_hop
+        alignment_hops[start] = alignment_hop
+    prior.cpu()
+    empty_cache()
+
+    # Combine attn for each hop into attn for full range
+    # Use indices to place them into correct place for corresponding source tokens
+    alignments = []
+    for item in range(bs):
+        # Note each item has different length lyrics
+        full_tokens = labels['info'][item]['full_tokens']
+        alignment = np.zeros((total_length, len(full_tokens) + 1))
+        for start in reversed(starts):
+            end = start + n_ctx
+            alignment_hop = alignment_hops[start][item]
+            indices = indices_hops[start][item]
+            assert len(indices) == n_tokens
+            assert alignment_hop.shape == (n_ctx, n_tokens)
+            alignment[start:end,indices] = alignment_hop
+        alignment = alignment[:total_length - padding_length,:-1] # remove token padding, and last lyric index
+        alignments.append(alignment)
+    return alignments
+
+def save_alignment(model, device, hps):
+    """Load the saved level-0 sample data, add alignments, then save and render them.
+
+    Reads ``data.pth.tar`` from ``{hps.logdir}/level_0`` and writes ``data_align.pth.tar``
+    there; fp16 is disabled only for the '1b_lyrics' model.
+    """
+    print(hps)
+    _, priors = make_model(model, device, hps, levels=[-1])
+    sample_dir = f"{hps.logdir}/level_{0}"
+    data = t.load(f"{sample_dir}/data.pth.tar")
+    use_fp16 = model != '1b_lyrics'
+    data['alignments'] = get_alignment(data['x'], data['zs'], data['labels'][-1], priors[-1], use_fp16, hps)
+    t.save(data, f"{sample_dir}/data_align.pth.tar")
+    save_html(sample_dir, data['x'], data['zs'], data['labels'][-1], data['alignments'], hps)
+
+def run(model, port=29500, **kwargs):
+    """Set up distributed state from MPI, build Hyperparams from kwargs and save alignments under no_grad."""
+    from jukebox.utils.dist_utils import setup_dist_from_mpi
+    _rank, _local_rank, device = setup_dist_from_mpi(port=port)
+    hps = Hyperparams(**kwargs)
+    with t.no_grad():
+        save_alignment(model, device, hps)
+
+if __name__ == '__main__':
+    fire.Fire(run)  # python-fire: expose run()'s parameters as command-line arguments
+
+
+
+
+
+
diff --git a/jukebox/data/__init__.py b/jukebox/data/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/jukebox/data/artist_genre_processor.py b/jukebox/data/artist_genre_processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..be82ef9aad926f4a44776a6784d60babd02f9551
--- /dev/null
+++ b/jukebox/data/artist_genre_processor.py
@@ -0,0 +1,93 @@
+import os
+import re
+
+# Characters kept verbatim by norm(): ASCII letters and digits.
+accepted = frozenset(
+    'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789')
+# Runs of underscores, collapsed by norm() into a single one.
+rex = re.compile(r'_+')
+
+def norm(s):
+    """Lower-case *s*, replace every character outside ``accepted`` with '_', collapse runs of '_' and strip them from both ends."""
+    replaced = ''.join(ch if ch in accepted else '_' for ch in s.lower())
+    return rex.sub('_', replaced).strip('_')
+
+def create_reverse_lookup(atoi):
+    """Invert a name -> id mapping into an id -> name mapping.
+
+    Multiple entries could go to the same artist_id/genre_id; such names are
+    sorted and joined with '_' to form the single value stored for that id.
+    """
+    names_by_id = {}
+    for name, idx in atoi.items():
+        names_by_id.setdefault(idx, []).append(name)
+    # Key order follows the first appearance of each id in atoi
+    return {idx: '_'.join(sorted(names)) for idx, names in names_by_id.items()}
+
+class ArtistGenreProcessor():
+    """Translate artist/genre names to integer ids and back using the id files under ``ids/``.
+
+    With ``v3=True`` the v3 id files are read and names are only lower-cased; otherwise the
+    v2 files are read and names go through ``norm``. Unknown names map to id 0.
+    """
+
+    def __init__(self, v3=False):
+        self.v3 = v3
+        dirname = os.path.dirname(__file__)
+        version = 'v3' if self.v3 else 'v2'
+        self.artist_id_file = f"{dirname}/ids/{version}_artist_ids.txt"
+        self.genre_id_file = f"{dirname}/ids/{version}_genre_ids.txt"
+        self.load_artists()
+        self.load_genres()
+
+    def get_artist_id(self, artist):
+        """Return the id stored for ``artist``, or 0 (after printing a notice) if it is unknown."""
+        input_artist = artist
+        artist = artist.lower() if self.v3 else norm(artist)
+        if artist not in self.artist_ids:
+            print(f"Input artist {input_artist} maps to {artist}, which is not present in {self.artist_id_file}. "
+                  f"Defaulting to (artist_id, artist) = (0, unknown), if that seems wrong please format artist correctly")
+        return self.artist_ids.get(artist, 0)
+
+    def get_genre_ids(self, genre):
+        """Return one id per genre word, using 0 (after printing a notice) for unknown words."""
+        # In v2, we convert genre into a bag of words
+        genres = [genre.lower()] if self.v3 else norm(genre).split("_")
+        for word in genres:
+            if word not in self.genre_ids:
+                print(f"Input genre {genre} maps to the list {genres}. {word} is not present in {self.genre_id_file}. "
+                      f"Defaulting to (word_id, word) = (0, unknown), if that seems wrong please format genre correctly")
+        return [self.genre_ids.get(word, 0) for word in genres]
+
+    def get_artist(self, artist_id):
+        """Return the name stored for ``artist_id``; raises KeyError if the id is not present."""
+        return self.artists[artist_id]
+
+    def get_genre(self, genre_ids):
+        """Return the genre name: the single id's name in v3, else the names of all non-negative ids joined by '_'."""
+        if self.v3:
+            assert len(genre_ids) == 1
+            return self.genres[genre_ids[0]]
+        return '_'.join([self.genres[genre_id] for genre_id in genre_ids if genre_id >= 0])
+
+    def load_artists(self):
+        """Fill ``artist_ids`` from the ``name;id`` lines of the artist id file and build ``artists``."""
+        print(f'Loading artist IDs from {self.artist_id_file}')
+        self.artist_ids = {}
+        with open(self.artist_id_file, 'r', encoding="utf-8") as f:
+            for line in f:
+                artist, artist_id = line.strip().split(';')
+                self.artist_ids[artist.lower()] = int(artist_id)
+        self.artists = create_reverse_lookup(self.artist_ids)
+
+    def load_genres(self):
+        """Fill ``genre_ids`` from the ``name;id`` lines of the genre id file and build ``genres``."""
+        print(f'Loading genre IDs from {self.genre_id_file}')
+        self.genre_ids = {}
+        with open(self.genre_id_file, 'r', encoding="utf-8") as f:
+            for line in f:
+                genre, genre_id = line.strip().split(';')
+                self.genre_ids[genre.lower()] = int(genre_id)
+        self.genres = create_reverse_lookup(self.genre_ids)
+
+
diff --git a/jukebox/data/data_processor.py b/jukebox/data/data_processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a1924c5a7e4ed1ea4017dfec6de10bb5644791e
--- /dev/null
+++ b/jukebox/data/data_processor.py
@@ -0,0 +1,69 @@
+import torch as t
+import jukebox.utils.dist_adapter as dist
+from torch.utils.data.distributed import DistributedSampler
+from torch.utils.data import DataLoader, Dataset, BatchSampler, RandomSampler
+from jukebox.utils.dist_utils import print_all
+from jukebox.utils.audio_utils import calculate_bandwidth
+from jukebox.data.files_dataset import FilesAudioDataset
+
+class OffsetDataset(Dataset):
+    """View of ``dataset`` restricted to items ``start`` (inclusive) to ``end`` (exclusive)."""
+    def __init__(self, dataset, start, end, test=False):
+        super().__init__()
+        self.dataset, self.test = dataset, test
+        self.start, self.end = start, end
+        assert 0 <= self.start < self.end <= len(self.dataset)
+
+    def __len__(self):
+        return self.end - self.start
+
+    def __getitem__(self, item):
+        shifted = self.start + item
+        return self.dataset.get_item(shifted, test=self.test)
+
+class DataProcessor():
+    """Build the train/test datasets, samplers and loaders; also sets ``hps.bandwidth``."""
+    def __init__(self, hps):
+        self.dataset = FilesAudioDataset(hps)
+        duration = 1 if hps.prior else 600
+        hps.bandwidth = calculate_bandwidth(self.dataset, hps, duration=duration)
+        self.create_datasets(hps)
+        self.create_samplers(hps)
+        self.create_data_loaders(hps)
+        self.print_stats(hps)
+
+    def set_epoch(self, epoch):
+        for sampler in (self.train_sampler, self.test_sampler):
+            if hasattr(sampler, 'set_epoch'):  # RandomSampler has no set_epoch; DistributedSampler does
+                sampler.set_epoch(epoch)
+
+    def create_datasets(self, hps):
+        """Use the first ``hps.train_test_split`` fraction of the items for training, the rest for testing."""
+        train_len = int(len(self.dataset) * hps.train_test_split)
+        self.train_dataset = OffsetDataset(self.dataset, 0, train_len, test=False)
+        self.test_dataset = OffsetDataset(self.dataset, train_len, len(self.dataset), test=True)
+
+    def create_samplers(self, hps):
+        # Per-index samplers only: DataLoader(batch_size=..., sampler=...) batches itself, so a BatchSampler here would be batched twice
+        sampler_cls = DistributedSampler if dist.is_available() else RandomSampler
+        self.train_sampler = sampler_cls(self.train_dataset)
+        self.test_sampler = sampler_cls(self.test_dataset)
+
+    def create_data_loaders(self, hps):
+        # Loader to load mini-batches
+        if hps.labels:
+            collate_fn = lambda batch: tuple(t.stack([t.from_numpy(b[i]) for b in batch], 0) for i in range(2))
+        else:
+            collate_fn = lambda batch: t.stack([t.from_numpy(b) for b in batch], 0)
+        print('Creating Data Loader')
+        self.train_loader = DataLoader(self.train_dataset, batch_size=hps.bs, num_workers=hps.nworkers,
+                                       sampler=self.train_sampler, pin_memory=False,
+                                       drop_last=True, collate_fn=collate_fn)
+        self.test_loader = DataLoader(self.test_dataset, batch_size=hps.bs, num_workers=hps.nworkers,
+                                      sampler=self.test_sampler, pin_memory=False,
+                                      drop_last=False, collate_fn=collate_fn)
+
+    def print_stats(self, hps):
+        print_all(f"Train {len(self.train_dataset)} samples. Test {len(self.test_dataset)} samples")
+        print_all(f'Train sampler: {self.train_sampler}')
+        print_all(f'Train loader: {len(self.train_loader)}')
diff --git a/jukebox/data/files_dataset.py b/jukebox/data/files_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..68d5408f4007353aa57a2cec8a425f9d92ea7e72
--- /dev/null
+++ b/jukebox/data/files_dataset.py
@@ -0,0 +1,99 @@
+import librosa
+import math
+import numpy as np
+import jukebox.utils.dist_adapter as dist
+from torch.utils.data import Dataset
+from jukebox.utils.dist_utils import print_all
+from jukebox.utils.io import get_duration_sec, load_audio
+from jukebox.data.labels import Labeller
+
+class FilesAudioDataset(Dataset):
+    """Dataset of fixed-length audio chunks cut from the files under ``hps.audio_files_dir``.
+
+    Item ``i`` addresses chunk ``i`` of the concatenation of all kept files (tracked by ``cumsum``).
+    """
+    def __init__(self, hps):
+        super().__init__()
+        self.sr = hps.sr
+        self.channels = hps.channels
+        self.min_duration = hps.min_duration or math.ceil(hps.sample_length / hps.sr)
+        self.max_duration = hps.max_duration or math.inf
+        self.sample_length = hps.sample_length
+        assert hps.sample_length / hps.sr < self.min_duration, f'Sample length {hps.sample_length} per sr {hps.sr} ({hps.sample_length / hps.sr:.2f}) should be shorter than min duration {self.min_duration}'
+        self.aug_shift = hps.aug_shift
+        self.labels = hps.labels
+        self.init_dataset(hps)
+
+    def filter(self, files, durations):
+        """Keep files whose length in seconds is >= min_duration and < max_duration; set files/durations/cumsum."""
+        keep = [i for i in range(len(files))
+                if not (durations[i] / self.sr < self.min_duration or durations[i] / self.sr >= self.max_duration)]
+        print_all(f'self.sr={self.sr}, min: {self.min_duration}, max: {self.max_duration}')
+        print_all(f"Keeping {len(keep)} of {len(files)} files")
+        self.files = [files[i] for i in keep]
+        self.durations = [int(durations[i]) for i in keep]
+        self.cumsum = np.cumsum(self.durations)
+
+    def init_dataset(self, hps):
+        """Find the audio files, measure their durations (in samples), filter them and set up the labeller."""
+        files = librosa.util.find_files(f'{hps.audio_files_dir}', ['mp3', 'opus', 'm4a', 'aac', 'wav'])
+        print_all(f"Found {len(files)} files. Getting durations")
+        cache = dist.get_rank() % 8 == 0 if dist.is_available() else True
+        durations = np.array([get_duration_sec(file, cache=cache) * self.sr for file in files])  # Could be approximate
+        self.filter(files, durations)
+
+        if self.labels:
+            self.labeller = Labeller(hps.max_bow_genre_size, hps.n_tokens, self.sample_length, v3=hps.labels_v3)
+
+    def get_index_offset(self, item):
+        """For a given dataset item and shift, return song index and offset within song."""
+        half_interval = self.sample_length//2
+        shift = np.random.randint(-half_interval, half_interval) if self.aug_shift else 0
+        offset = item * self.sample_length + shift # Note we centred shifts, so adding now
+        midpoint = offset + half_interval
+        assert 0 <= midpoint < self.cumsum[-1], f'Midpoint {midpoint} of item beyond total length {self.cumsum[-1]}'
+        index = np.searchsorted(self.cumsum, midpoint)  # index <-> midpoint of interval lies in this song
+        start, end = self.cumsum[index - 1] if index > 0 else 0.0, self.cumsum[index] # start and end of current song
+        assert start <= midpoint <= end, f"Midpoint {midpoint} not inside interval [{start}, {end}] for index {index}"
+        if offset > end - self.sample_length: # Going over song
+            offset = max(start, offset - half_interval)  # Now should fit
+        elif offset < start: # Going under song
+            offset = min(end - self.sample_length, offset + half_interval)  # Now should fit
+        assert start <= offset <= end - self.sample_length, f"Offset {offset} not in [{start}, {end - self.sample_length}]. End: {end}, SL: {self.sample_length}, Index: {index}"
+        offset = offset - start
+        return index, offset
+
+    def get_metadata(self, filename, test):
+        """
+        Insert metadata loading code for your dataset here.
+        If artist/genre labels are different from provided artist/genre lists,
+        update labeller accordingly.
+
+        Returns:
+            (artist, genre, full_lyrics) of type (str, str, str). For
+            example, ("unknown", "classical", "") could be a metadata for a
+            piano piece.
+        """
+        return None, None, None
+
+    def get_song_chunk(self, index, offset, test=False):
+        """Load one chunk; returns its transposed audio, plus ``labels['y']`` when labels are enabled."""
+        filename, total_length = self.files[index], self.durations[index]
+        data, _ = load_audio(filename, sr=self.sr, offset=offset, duration=self.sample_length)
+        assert data.shape == (self.channels, self.sample_length), f'Expected {(self.channels, self.sample_length)}, got {data.shape}'
+        if not self.labels:
+            return data.T
+        artist, genre, lyrics = self.get_metadata(filename, test)
+        labels = self.labeller.get_label(artist, genre, lyrics, total_length, offset)
+        return data.T, labels['y']
+
+    def get_item(self, item, test=False):
+        """Return the chunk for dataset item ``item`` (see get_index_offset / get_song_chunk)."""
+        index, offset = self.get_index_offset(item)
+        return self.get_song_chunk(index, offset, test)
+
+    def __len__(self):
+        return int(np.floor(self.cumsum[-1] / self.sample_length))
+
+    def __getitem__(self, item):
+        return self.get_item(item)
diff --git a/jukebox/data/ids/v2_artist_ids.txt b/jukebox/data/ids/v2_artist_ids.txt
new file mode 100644
index 0000000000000000000000000000000000000000..67db1b0772560abd5fd01e0f6eb87ce14e184cd2
--- /dev/null
+++ b/jukebox/data/ids/v2_artist_ids.txt
@@ -0,0 +1,4111 @@
+unknown;0
+various;0
+;0
+andr_s_schiff;1
+sonny_terry;2
+nelly;3
+markus_schulz;4
+modest_petrovich_mussorgsky;5
+otis_redding;6
+aerosmith;7
+kenny_g;8
+james_taylor;9
+bobby_bland;10
+burning_spear;11
+skip_james;12
+heart;13
+tammy_wynette;14
+muse;15
+beres_hammond;16
+james_newton_howard;17
+nelson_freire;18
+benny_goodman;19
+hank_williams;20
+they_might_be_giants;21
+the_brian_jonestown_massacre;22
+lady_gaga;23
+chris_young;24
+alison_krauss_union_station;25
+seal;26
+the_hollies;27
+shabba_ranks;28
+paul_young;29
+iration;30
+buck_owens;31
+the_weeknd;32
+elton_john;33
+smokey_robinson;34
+roy_orbison;35
+headhunterz;36
+blondie;37
+the_temptations;38
+ray_stevens;39
+foo_fighters;40
+christoph_eschenbach;41
+blind_willie_mctell;42
+al_martino;43
+edwin_fischer;44
+victor_young;45
+justin_bieber;46
+styx;47
+doris_day;48
+tex_beneke;49
+the_monkees;50
+richard_wagner;51
+bryan_adams;52
+alessandro_scarlatti;53
+rebelution;54
+pitbull;55
+nat_king_cole;56
+wiz_khalifa;57
+roger_miller;58
+andy_williams;59
+peggy_lee;60
+pyotr_ilyich_tchaikovsky;61
+booker_t_the_mg_s;62
+cilla_black;63
+billy_fury;64
+vera_lynn;65
+enrico_caruso;66
+sly_and_robbie;67
+the_pretenders;68
+the_sweet;69
+kylie_minogue;70
+kay_kyser;71
+san_francisco_symphony;72
+prince;73
+queen;74
+kool_the_gang;75
+horace_andy;76
+midnite;77
+gentleman;78
+wilhelm_kempff;79
+busta_rhymes;80
+the_pogues;81
+def_leppard;82
+al_jolson;83
+king_tubby;84
+hot_chocolate;85
+delroy_wilson;86
+jody_watley;87
+bobby_vee;88
+johnny_mathis;89
+the_rascals;90
+sviatoslav_richter;91
+fred_astaire;92
+john_holt;93
+amy_grant;94
+b_b_king;95
+paul_weston;96
+four_tops;97
+jay_sean;98
+pat_boone;99
+george_frideric_handel;100
+bing_crosby;101
+shalamar;102
+tommy_dorsey;103
+ludacris;104
+kenny_chesney;105
+murray_perahia;106
+lightnin_hopkins;107
+ricky_nelson;108
+clint_mansell;109
+tom_petty_and_the_heartbreakers;110
+gary_glitter;111
+ringo_starr;112
+phil_collins;113
+leo_reisman;114
+al_green;115
+jim_reeves;116
+chris_brown;117
+cliff_edwards;118
+buddy_guy;119
+angelo_badalamenti;120
+frank_sinatra;121
+santana;122
+the_pussycat_dolls;123
+peaches_herb;124
+radu_lupu;125
+the_whispers;126
+eddy_howard;127
+memphis_slim;128
+henry_mancini;129
+giuseppe_verdi;130
+the_dream;131
+vladimir_sofronitsky;132
+u_roy;133
+ken_boothe;134
+the_kinks;135
+howard_shore;136
+hardwell;137
+lou_reed;138
+calvin_harris;139
+eddy_chen;140
+anne_murray;141
+juice_newton;142
+bee_gees;143
+wilson_pickett;144
+alan_jackson;145
+shirley_bassey;146
+waylon_jennings;147
+destiny_s_child;148
+cab_calloway;149
+johnny_copeland;150
+bright_eyes;151
+trey_songz;152
+neil_sedaka;153
+justin_timberlake;154
+arthur_grumiaux;155
+the_who;156
+the_yardbirds;157
+big_joe_turner;158
+duke_ellington;159
+herb_alpert;160
+laura_branigan;161
+michael_jackson;162
+john_denver;163
+peter_gordon;164
+solomon_cutner;165
+steve_miller_band;166
+don_williams;167
+the_pointer_sisters;168
+metallica;169
+the_ink_spots;170
+kenny_rogers;171
+the_game;172
+gene_krupa;173
+snoop_dogg;174
+j_cole;175
+taylor_dayne;176
+r3hab;177
+guy_lombardo_and_his_royal_canadians;178
+shakin_stevens;179
+tim_mcgraw;180
+olivia_newton_john_john_travolta;181
+the_replacements;182
+beenie_man;183
+diplo;184
+sammy_kaye;185
+don_gibson;186
+shaggy;187
+nina_simone;188
+stone_temple_pilots;189
+ne_yo;190
+huddie_william_ledbetter;191
+jerry_goldsmith;192
+the_bellamy_brothers;193
+winifred_atwell;194
+georg_philipp_telemann;195
+timbaland;196
+the_5th_dimension;197
+dionne_warwick;198
+r_kelly;199
+enrique_iglesias;200
+sting;201
+mikey_dread;202
+yellowman;203
+eddie_kendricks;204
+blur;205
+twista;206
+paul_anka;207
+chris_cornell;208
+a_ha;209
+pet_shop_boys;210
+yo_yo_ma;211
+tom_jones;212
+neil_diamond;213
+vic_damone;214
+paul_oakenfold;215
+jascha_heifetz;216
+t_i_;217
+dinah_washington;218
+vladimir_ashkenazy;219
+leif_ove_andsnes;220
+johnny_cash;221
+basement_jaxx;222
+sonic_youth;223
+the_isley_brothers;224
+jason_aldean;225
+henryk_szeryng;226
+lloyd_price;227
+reba_mcentire;228
+bon_jovi;229
+bed_ich_smetana;230
+donny_osmond;231
+chuck_berry;232
+the_smashing_pumpkins;233
+israel_vibration;234
+stan_kenton;235
+conway_twitty_loretta_lynn;236
+mcfly;237
+r_e_m;238
+morgan_heritage;239
+lil_wayne;240
+garnett_silk;241
+lee_scratch_perry;242
+howlin_wolf;243
+jon_secada;244
+goo_goo_dolls;245
+brian_hyland;246
+abc;247
+clarence_gatemouth_brown;248
+wilson_phillips;249
+atlantic_starr;250
+david_guetta;251
+s_rgio_mendes;252
+fr_d_ric_chopin;253
+robert_palmer;254
+charlie_musselwhite;255
+level_42;256
+peter_andre;257
+the_spinners;258
+erasure;259
+philippe_entremont;260
+leo_jan_ek;261
+the_kingston_trio;262
+ronnie_milsap;263
+pat_benatar;264
+robert_casadesus;265
+t_bone_walker;266
+duran_duran;267
+jerry_reed;268
+franz_liszt;269
+journey;270
+steve_angello;271
+showaddywaddy;272
+tears_for_fears;273
+john_williams;274
+daniel_m_ller_schott;275
+tampa_red;276
+tina_turner;277
+the_beatles;278
+miranda_lambert;279
+howard_jones;280
+reo_speedwagon;281
+lady_antebellum;282
+no_doubt;283
+status_quo;284
+tiffany;285
+billy_eckstine;286
+danny_elfman;287
+jimmy_wakely;288
+ike_tina_turner;289
+george_gershwin;290
+dinah_shore;291
+armin_van_buuren;292
+keith_sweat;293
+beaux_arts_trio;294
+arcangelo_corelli;295
+air_supply;296
+w_w;297
+the_mighty_diamonds;298
+harry_gregson_williams;299
+blind_lemon_jefferson;300
+brook_benton;301
+sizzla;302
+bobby_darin;303
+steely_dan;304
+the_skatalites;305
+dwight_yoakam;306
+clara_haskil;307
+jah_cure;308
+cliff_richard;309
+fabolous;310
+sonny_boy_williamson_ii;311
+tinie_tempah;312
+claude_debussy;313
+jewel;314
+bob_seger;315
+otis_rush;316
+mikhail_pletnev;317
+gene_autry;318
+blind_blake;319
+ethel_waters;320
+jackie_wilson;321
+big_mama_thornton;322
+mary_wells;323
+the_mills_brothers;324
+rage_against_the_machine;325
+barbra_streisand;326
+frank_crumit;327
+the_clash;328
+dion;329
+all_4_one;330
+t_i;331
+orchestral_manoeuvres_in_the_dark;332
+culture;333
+josh_turner;334
+one_direction;335
+the_rolling_stones;336
+half_pint;337
+the_searchers;338
+vangelis;339
+jackson_browne;340
+the_beach_boys;341
+train;342
+prince_buster;343
+tavares;344
+eve;345
+bessie_smith;346
+trace_adkins;347
+bay_city_rollers;348
+david_allan_coe;349
+rascal_flatts;350
+petula_clark;351
+10cc;352
+50_cent;353
+the_oak_ridge_boys;354
+barry_manilow;355
+truls_m_rk;356
+morrissey;357
+gerry_the_pacemakers;358
+kay_starr;359
+alborosie;360
+engelbert_humperdinck;361
+new_order;362
+tony_bennett;363
+stereophonics;364
+jimmy_reed;365
+akon;366
+echo_the_bunnymen;367
+jamiroquai;368
+stevie_ray_vaughan;369
+ben_e_king;370
+cheap_trick;371
+dusty_springfield;372
+mel_tillis;373
+damian_marley;374
+ruth_etting;375
+westlife;376
+diana_ross;377
+the_shirelles;378
+frankie_laine;379
+sarah_vaughan;380
+ray_price;381
+gordon_lightfoot;382
+eddie_cantor;383
+the_byrds;384
+gary_numan;385
+bonnie_tyler;386
+aswad;387
+brownie_mcghee;388
+joshua_bell;389
+manic_street_preachers;390
+mr_vegas;391
+tanya_tucker;392
+marvin_gaye_tammi_terrell;393
+the_staple_singers;394
+ry_cooder;395
+john_mayall;396
+martina_mcbride;397
+anner_bylsma;398
+magic_sam;399
+avril_lavigne;400
+robert_nighthawk;401
+the_coasters;402
+ace_of_base;403
+joseph_szigeti;404
+artie_shaw;405
+big_bill_broonzy;406
+the_ventures;407
+rick_astley;408
+les_paul;409
+leonid_kogan;410
+hall_oates;411
+three_dog_night;412
+charley_pride;413
+paul_mccartney;414
+alfred_cortot;415
+crystal_gayle;416
+taj_mahal;417
+mary_j_blige;418
+leann_rimes;419
+mildred_bailey;420
+ll_cool_j;421
+lobo;422
+blake_shelton;423
+matt_haimovitz;424
+beastie_boys;425
+johannes_brahms;426
+martha_and_the_vandellas;427
+lou_rawls;428
+the_righteous_brothers;429
+rosalyn_tureck;430
+vince_gill;431
+gary_moore;432
+bad_company;433
+marty_robbins;434
+russ_morgan;435
+david_cassidy;436
+dj_khaled;437
+leonard_pennario;438
+glenn_miller;439
+kings_of_leon;440
+afrojack;441
+cyndi_lauper;442
+trisha_yearwood;443
+diamond_rio;444
+patty_loveless;445
+sugar_minott;446
+willie_nelson;447
+thomas_newman;448
+georgia_gibbs;449
+eric_carmen;450
+ricky_skaggs;451
+earth_wind_fire;452
+peter_tosh;453
+joe_bonamassa;454
+lonnie_johnson;455
+missy_elliott;456
+nicki_minaj;457
+luther_allison;458
+grigory_sokolov;459
+luke_bryan;460
+alanis_morissette;461
+tracy_lawrence;462
+sonny_james;463
+brandy;464
+michael_bolton;465
+boswell_sisters;466
+antonio_vivaldi;467
+fritz_kreisler;468
+elvis_costello_the_attractions;469
+woody_herman;470
+korn;471
+hans_zimmer;472
+the_osmonds;473
+electric_light_orchestra;474
+t_rex;475
+van_cliburn;476
+harry_james;477
+glee_cast;478
+simple_minds;479
+abba;480
+the_jam;481
+tanya_stephens;482
+the_statler_brothers;483
+craig_david;484
+moby;485
+xxxtentacion;486
+paul_simon;487
+magic_slim;488
+natasha_bedingfield;489
+ennio_morricone;490
+carpenters;491
+joe_tex;492
+cocoa_tea;493
+the_glee_cast;494
+martha_argerich;495
+charlie_rich;496
+memphis_minnie;497
+sheryl_crow;498
+jan_dean;499
+nick_cave_and_the_bad_seeds;500
+del_shannon;501
+lee_greenwood;502
+gary_lewis_the_playboys;503
+marcelle_meyer;504
+count_basie;505
+harold_melvin_the_blue_notes;506
+the_fray;507
+the_prodigy;508
+alicia_keys;509
+conway_twitty;510
+barrington_levy;511
+b_o_b;512
+tritonal;513
+mario_lanza;514
+mac_davis;515
+billy_murray;516
+igor_stravinsky;517
+pretenders;518
+jordin_sparks;519
+alice_in_chains;520
+ohio_players;521
+rick_springfield;522
+jimmie_rodgers;523
+buju_banton;524
+linda_ronstadt;525
+george_benson;526
+2pac;527
+soulja_boy;528
+the_flaming_lips;529
+ignacy_jan_paderewski;530
+gloria_estefan;531
+ariana_grande;532
+black_uhuru;533
+tony_pastor;534
+sublime;535
+snow_patrol;536
+daft_punk;537
+johnny_winter;538
+robbie_williams;539
+eddie_rabbitt;540
+james_cotton;541
+brad_paisley;542
+manfred_mann;543
+bassnectar;544
+margaret_whiting;545
+sam_cooke;546
+robert_cray;547
+the_beautiful_south;548
+barbara_mandrell;549
+dick_haymes;550
+creedence_clearwater_revival;551
+chic;552
+ray_charles;553
+carter_family;554
+ti_sto;555
+survivor;556
+c_line_dion;557
+sergei_prokofiev;558
+b_la_bart_k;559
+the_congos;560
+yefim_bronfman;561
+laidback_luke;562
+darius_rucker;563
+ray_anthony;564
+incubus;565
+carole_king;566
+james_brown;567
+swv;568
+bruno_mars;569
+aphex_twin;570
+nitty_gritty_dirt_band;571
+gustav_mahler;572
+the_shadows;573
+the_moody_blues;574
+reverend_gary_davis;575
+sia;576
+gaetano_donizetti;577
+earl_hooker;578
+the_commodores;579
+maria_jo_o_pires;580
+eric_donaldson;581
+elmore_james;582
+sean_kingston;583
+don_carlos;584
+linkin_park;585
+jay_the_americans;586
+grand_funk_railroad;587
+jls;588
+frankie_valli;589
+lefty_frizzell;590
+en_vogue;591
+the_cure;592
+perry_como;593
+johnny_mercer;594
+stevie_wonder;595
+ernest_tubb;596
+ramin_djawadi;597
+ashanti;598
+rosemary_clooney;599
+anne_sophie_mutter;600
+helen_forrest;601
+augustus_pablo;602
+the_carpenters;603
+claudio_arrau;604
+bob_dylan;605
+joe_simon;606
+culture_club;607
+the_ipana_troubadors;608
+jennifer_lopez;609
+karyn_white;610
+joe;611
+sarah_mclachlan;612
+j_geils_band;613
+dean_martin;614
+hank_snow;615
+clyde_mcphatter;616
+tlc;617
+beck;618
+jimmy_dean;619
+roy_acuff;620
+outkast;621
+freddie_mcgregor;622
+gus_arnheim;623
+gramatik;624
+merle_haggard;625
+steve_lawrence;626
+ma_rainey;627
+jimmy_dorsey;628
+johnny_paycheck;629
+arthur_rubinstein;630
+talking_heads;631
+capleton;632
+les_brown;633
+leonard_bernstein;634
+bobby_goldsboro;635
+kendrick_lamar;636
+lenny_kravitz;637
+nat_shilkret;638
+toby_keith;639
+junior_wells;640
+billy_j_kramer_the_dakotas;641
+peter_paul_mary;642
+armand_van_helden;643
+h_sker_d_;644
+alabama;645
+eminem;646
+felix_mendelssohn_bartholdy;647
+richard_goode;648
+pink_floyd;649
+sara_evans;650
+lonnie_donegan;651
+boney_m;652
+deadmau5;653
+lee_ann_womack;654
+eric_clapton;655
+ray_parker_jr;656
+etta_james;657
+the_white_stripes;658
+gary_u_s_bonds;659
+glen_gray_and_the_casa_loma_orchestra;660
+bonnie_raitt;661
+soulja_boy_tell_em;662
+bobby_rydell;663
+carly_simon;664
+koko_taylor;665
+gregory_isaacs;666
+red_hot_chili_peppers;667
+josef_suk;668
+clint_black;669
+buddy_clark;670
+tool;671
+pierre_fournier;672
+alice_cooper;673
+lucky_dube;674
+jorge_bolet;675
+don_diablo;676
+r_l_burnside;677
+klaus_badelt;678
+dolly_parton;679
+james_horner;680
+green_day;681
+henry_burr;682
+the_doors;683
+roxette;684
+louis_armstrong;685
+jerry_butler;686
+louis_prima;687
+paul_van_dyk;688
+dennis_brown;689
+toni_braxton;690
+jerry_lee_lewis;691
+donna_summer;692
+percy_faith;693
+willie_dixon;694
+elvis_costello;695
+youri_egorov;696
+dmitri_shostakovich;697
+webb_pierce;698
+monica;699
+pierre_laurent_aimard;700
+muddy_waters;701
+garth_brooks;702
+boyz_ii_men;703
+kris_kristofferson;704
+duane_eddy;705
+coolio;706
+gidon_kremer;707
+eurythmics;708
+randy_travis;709
+collie_buddz;710
+faith_evans;711
+matisyahu;712
+brooks_dunn;713
+lulu;714
+fletcher_henderson;715
+eek_a_mouse;716
+shlomo_mintz;717
+ray_noble;718
+bill_withers;719
+ub40;720
+b_j_thomas;721
+mariah_carey;722
+olivia_newton_john;723
+jackson_5;724
+michael_rose;725
+three_days_grace;726
+glen_campbell;727
+keith_urban;728
+onerepublic;729
+ted_weems;730
+johnny_desmond;731
+billy_preston;732
+billy_vaughn;733
+otis_spann;734
+marion_harris;735
+richard_marx;736
+frankie_carle;737
+xavier_cugat;738
+zz_top;739
+faz_l_say;740
+new_york_philharmonic;741
+teresa_brewer;742
+pixies;743
+david_oistrakh;744
+rory_gallagher;745
+ted_lewis_his_band;746
+patsy_cline;747
+tracy_byrd;748
+daniel_shafran;749
+johnny_horton;750
+billy_ocean;751
+don_mclean;752
+ginuwine;753
+kiss;754
+kaskade;755
+the_hold_steady;756
+rod_stewart;757
+rudolf_serkin;758
+the_wanted;759
+herman_s_hermits;760
+rusty_draper;761
+jay_z;762
+mississippi_john_hurt;763
+linval_thompson;764
+billie_holiday;765
+lawrence_welk;766
+john_lee_hooker;767
+the_offspring;768
+ramones;769
+ronan_keating;770
+sir_clifford_michael_curzon;771
+dierks_bentley;772
+bush;773
+texas;774
+gene_austin;775
+after_all;776
+depeche_mode;777
+eddy_arnold;778
+ferry_corsten;779
+inxs;780
+jimmie_lunceford;781
+johann_sebastian_bach;782
+maxi_priest;783
+travis_tritt;784
+girls_aloud;785
+meat_loaf;786
+brenda_lee;787
+modest_mouse;788
+glenn_gould;789
+mc_hammer;790
+erik_satie;791
+the_andrews_sisters;792
+alicia_de_larrocha;793
+america;794
+charles_harrison;795
+georges_cziffra;796
+peter_serkin;797
+gladys_knight_the_pips;798
+anton_n_dvo_k;799
+emil_gilels;800
+myra_hess;801
+portugal_the_man;802
+rufus;803
+sergei_rachmaninoff;804
+andr_previn;805
+natalie_cole;806
+the_killers;807
+helen_reddy;808
+billy_idol;809
+shawn_mendes;810
+john_lennon;811
+joan_jett;812
+jimmy_cliff;813
+taylor_swift;814
+shura_cherkassky;815
+marvin_gaye;816
+the_drifters;817
+paul_revere_the_raiders;818
+donovan;819
+alma_gluck;820
+fleetwood_mac;821
+jo_stafford;822
+samson_fran_ois;823
+vaughn_monroe;824
+tennessee_ernie_ford;825
+whitesnake;826
+iron_maiden;827
+blind_boy_fuller;828
+eric_church;829
+atomic_kitten;830
+harry_nilsson;831
+paul_specht;832
+bob_marley;833
+les_baxter;834
+aretha_franklin;835
+jessie_j;836
+robert_johnson;837
+maroon_5;838
+sheena_easton;839
+george_strait;840
+bruce_springsteen;841
+oasis;842
+jack_white;843
+fatboy_slim;844
+the_jesus_and_mary_chain;845
+hank_williams_jr;846
+bill_haley_his_comets;847
+charlie_daniels;848
+pearl_jam;849
+the_stylistics;850
+sean_paul;851
+david_essex;852
+zino_francescatti;853
+bruce_hornsby;854
+system_of_a_down;855
+scott_joplin;856
+benny_benassi;857
+mitch_miller;858
+gloria_gaynor;859
+steve_winwood;860
+drake;861
+bone_thugs_n_harmony;862
+nickelback;863
+john_mellencamp;864
+gene_pitney;865
+chet_atkins;866
+mark_ronson;867
+kc_the_sunshine_band;868
+chingy;869
+maurice_ravel;870
+infected_mushroom;871
+walter_gieseking;872
+pavement;873
+deniece_williams;874
+tommy_james_the_shondells;875
+harry_belafonte;876
+robert_schuman;877
+george_jones;878
+whitney_houston;879
+the_ames_brothers;880
+max_romeo;881
+patti_page;882
+p_nk;883
+vienna_philharmonic;884
+violent_femmes;885
+sly_the_family_stone;886
+johnny_marvin;887
+sammy_davis_jr_;888
+nirvana;889
+the_turtles;890
+the_manhattans;891
+the_supremes;892
+gregg_allman;893
+alton_ellis;894
+giacomo_puccini;895
+artur_schnabel;896
+2_unlimited;897
+michael_rabin;898
+toto;899
+lil_kim;900
+shinedown;901
+teddy_wilson;902
+alpha_blondy;903
+the_guess_who;904
+my_chemical_romance;905
+matchbox_twenty;906
+wolfgang_gartner;907
+ed_sheeran;908
+faron_young;909
+fats_waller;910
+kate_smith;911
+loretta_lynn;912
+antonio_meneses;913
+tony_martin;914
+frankie_vaughan;915
+the_mamas_the_papas;916
+david_bowie;917
+erskine_hawkins;918
+the_four_lads;919
+queens_of_the_stone_age;920
+the_human_league;921
+mud;922
+bread;923
+dixie_chicks;924
+backstreet_boys;925
+connie_francis;926
+the_black_keys;927
+paul_weller;928
+pieter_wispelwey;929
+the_bachelors;930
+janet_jackson;931
+the_association;932
+edwin_starr;933
+hank_williams_jr_;934
+steven_isserlis;935
+the_doobie_brothers;936
+jimi_hendrix;937
+sousa_s_band;938
+ricky_martin;939
+the_cranberries;940
+paul_whiteman;941
+the_saturdays;942
+keb_mo;943
+the_psychedelic_furs;944
+boyzone;945
+the_chemical_brothers;946
+johnny_tillotson;947
+joe_cocker;948
+babyface;949
+wet_wet_wet;950
+tom_petty;951
+lesley_gore;952
+deorro;953
+war;954
+wolfgang_schneiderhan;955
+isaac_hayes;956
+beyonc;957
+gordon_jenkins;958
+flo_rida;959
+captain_tennille;960
+adolf_busch;961
+bobby_bare;962
+johnnie_taylor;963
+above_beyond;964
+robin_thicke;965
+ja_rule;966
+barry_white;967
+florence_the_machine;968
+the_jimi_hendrix_experience;969
+gabrielle;970
+leon_fleisher;971
+yehudi_menuhin;972
+george_harrison;973
+kate_bush;974
+u2;975
+nancy_sinatra;976
+the_everly_brothers;977
+peter_green;978
+cat_stevens;979
+the_new_seekers;980
+itzhak_perlman;981
+red_nichols_his_five_pennies;982
+the_script;983
+ijahman_levi;984
+rihanna;985
+joan_jett_and_the_blackhearts;986
+the_dorsey_brothers_orchestra;987
+charlie_barnet;988
+lmfao;989
+112;990
+kelis;991
+john_mccormack;992
+guns_n_roses;993
+gordon_macrae;994
+buddy_holly;995
+franz_schubert;996
+tommy_roe;997
+garbage;998
+bow_wow;999
+belle_and_sebastian;1000
+paul_tortelier;1001
+johnny_nash;1002
+lazar_berman;1003
+avicii;1004
+5_seconds_of_summer;1005
+mississippi_fred_mcdowell;1006
+ivan_moravec;1007
+little_richard;1008
+john_ogdon;1009
+dr_dre;1010
+andy_russell;1011
+sex_pistols;1012
+puddle_of_mudd;1013
+june_carter_cash;1014
+freddie_king;1015
+dino_ciani;1016
+h_sker_d;1017
+nicky_romero;1018
+shania_twain;1019
+major_lazer;1020
+usher;1021
+madonna;1022
+nsync;1023
+the_o_jays;1024
+faith_hill;1025
+terence_trent_d_arby;1026
+gene_chandler;1027
+lisa_stansfield;1028
+pharrell_williams;1029
+david_geringas;1030
+dr_alimantado;1031
+jack_scott;1032
+vladimir_horowitz;1033
+genesis;1034
+jessica_simpson;1035
+peter_gabriel;1036
+spike_jones_and_his_city_slickers;1037
+leonard_rose;1038
+richard_strauss;1039
+mgmt;1040
+blink_182;1041
+ella_fitzgerald;1042
+carrie_underwood;1043
+evgeny_kissin;1044
+olly_murs;1045
+ky_mani_marley;1046
+rose_royce;1047
+desmond_dekker;1048
+montell_jordan;1049
+little_river_band;1050
+t_pain;1051
+gary_allan;1052
+chubby_checker;1053
+the_diamonds;1054
+creed;1055
+louis_jordan;1056
+vernon_dalhart;1057
+gilbert_o_sullivan;1058
+ziggy_marley;1059
+irene_cara;1060
+katy_perry;1061
+all_saints;1062
+benjamin_britten;1063
+p_m_dawn;1064
+iona_brown;1065
+the_lettermen;1066
+natalie_imbruglia;1067
+viktoria_mullova;1068
+zac_brown_band;1069
+bj_rk;1070
+mutabaruka;1071
+jermaine_jackson;1072
+the_impressions;1073
+spandau_ballet;1074
+luther_vandross;1075
+thompson_twins;1076
+travis_scott;1077
+roots_radics;1078
+madness;1079
+skrillex;1080
+the_smiths;1081
+van_halen;1082
+the_four_seasons;1083
+j_b_lenoir;1084
+johnny_rivers;1085
+bob_marley_the_wailers;1086
+albert_collins;1087
+arctic_monkeys;1088
+billy_joel;1089
+joe_nichols;1090
+chicago;1091
+coldplay;1092
+blue;1093
+papa_roach;1094
+r_e_m_;1095
+eddie_money;1096
+ellie_goulding;1097
+gym_class_heroes;1098
+the_charlie_daniels_band;1099
+bette_midler;1100
+hubert_sumlin;1101
+phil_harris;1102
+arturo_benedetti_michelangeli;1103
+steps;1104
+simon_and_garfunkel;1105
+gyptian;1106
+britney_spears;1107
+lang_lang;1108
+blackstreet;1109
+the_dave_clark_five;1110
+the_chi_lites;1111
+wolfgang_amadeus_mozart;1112
+johnnie_ray;1113
+elvis_presley;1114
+red_norvo;1115
+the_velvet_underground;1116
+nelly_furtado;1117
+connee_boswell;1118
+soundgarden;1119
+dr_hook_the_medicine_show;1120
+led_zeppelin;1121
+boney_m_;1122
+gregor_piatigorsky;1123
+john_anderson;1124
+joni_james;1125
+the_all_american_rejects;1126
+marcia_griffiths;1127
+ben_bernie;1128
+eric_prydz;1129
+the_three_suns;1130
+albert_king;1131
+jodeci;1132
+inner_circle;1133
+don_cornell;1134
+jeff_healey;1135
+brian_mcknight;1136
+the_stranglers;1137
+adam_faith;1138
+francis_poulenc;1139
+fall_out_boy;1140
+the_jets;1141
+groundation;1142
+randy_newman;1143
+jane_s_addiction;1144
+bobby_vinton;1145
+guiomar_novaes;1146
+bo_diddley;1147
+philadelphia_orchestra;1148
+garrick_ohlsson;1149
+evanescence;1150
+wilhelm_backhaus;1151
+the_platters;1152
+the_seekers;1153
+selena_gomez;1154
+eddy_grant;1155
+jan_garber;1156
+gioacchino_rossini;1157
+christina_aguilera;1158
+stevie_nicks;1159
+color_me_badd;1160
+sandie_shaw;1161
+migos;1162
+the_allman_brothers_band;1163
+will_young;1164
+new_edition;1165
+radiohead;1166
+orbital;1167
+the_judds;1168
+dan_fogelberg;1169
+pascal_rog_;1170
+leona_lewis;1171
+charlie_patton;1172
+yann_tiersen;1173
+the_abyssinians;1174
+frank_ifield;1175
+john_mayer;1176
+roger_wolfe_kahn;1177
+the_mcguire_sisters;1178
+junior_reid;1179
+bukka_white;1180
+macy_gray;1181
+billy_currington;1182
+the_tremeloes;1183
+ac_dc;1184
+little_walter;1185
+gorillaz;1186
+the_hilltoppers;1187
+gil_shaham;1188
+little_mix;1189
+supertramp;1190
+third_world;1191
+david_garrett;1192
+the_lovin_spoonful;1193
+chris_lake;1194
+louis_lortie;1195
+shakira;1196
+vanessa_williams;1197
+the_bangles;1198
+mark_chesnutt;1199
+keith_whitley;1200
+roxy_music;1201
+betty_hutton;1202
+cheryl_cole;1203
+the_notorious_b_i_g_;1204
+kesha;1205
+boston;1206
+gerald_moore;1207
+heinrich_schiff;1208
+joy_division;1209
+kenny_loggins;1210
+nilsson;1211
+counting_crows;1212
+simply_red;1213
+aaron_tippin;1214
+sweet;1215
+sugababes;1216
+florida_georgia_line;1217
+lonnie_mack;1218
+jacob_miller;1219
+the_police;1220
+the_chordettes;1221
+fats_domino;1222
+willy_deville;1223
+bob_crosby;1224
+roy_clark;1225
+dinu_lipatti;1226
+alison_krauss;1227
+frankie_avalon;1228
+ella_mae_morse;1229
+linton_kwesi_johnson;1230
+glenn_frey;1231
+rudy_vall_e_his_connecticut_yankees;1232
+dj_snake;1233
+toots_the_maytals;1234
+annie_lennox;1235
+paramore;1236
+john_browning;1237
+sister_sledge;1238
+julian_lloyd_webber;1239
+kanye_west;1240
+christopher_cross;1241
+professor_longhair;1242
+flume;1243
+sophie_tucker;1244
+third_eye_blind;1245
+beyonc_;1246
+weezer;1247
+kelly_clarkson;1248
+take_that;1249
+ludwig_van_beethoven;1250
+the_cars;1251
+george_michael;1252
+chaka_khan;1253
+arty;1254
+steel_pulse;1255
+bunny_wailer;1256
+the_troggs;1257
+nick_lucas;1258
+keb_mo_;1259
+rise_against;1260
+mickey_gilley;1261
+poison;1262
+debbie_gibson;1263
+kim_wilde;1264
+sammy_davis_jr;1265
+jonas_brothers;1266
+cage_the_elephant;1267
+son_house;1268
+birth_control;1269
+faithless;1270
+cher;1271
+seether;1272
+new_kids_on_the_block;1273
+rage;1274
+richie_spice;1275
+sam_smith;1276
+the_marvelettes;1277
+the_jackson_5;1278
+bobbie_gentry;1279
+macklemore_ryan_lewis;1280
+s_club_7;1281
+billy_jones_ernest_hare;1282
+arcana;1283
+blasterjaxx;1284
+will_smith;1285
+ray_miller;1286
+sugar_ray;1287
+ziggy_marley_the_melody_makers;1288
+aaron_copland;1289
+diddy;1290
+daughtry;1291
+beach_house;1292
+the_dillinger_escape_plan;1293
+lonestar;1294
+foreigner;1295
+lionel_richie;1296
+roberta_flack;1297
+the_carter_family;1298
+demi_lovato;1299
+joseph_haydn;1300
+sander_van_doorn;1301
+underworld;1302
+deborah_cox;1303
+the_grass_roots;1304
+bananarama;1305
+iggy_azalea;1306
+3_doors_down;1307
+the_partridge_family;1308
+lead_belly;1309
+johnny_long;1310
+jagged_edge;1311
+the_derek_trucks_band;1312
+eddie_fisher;1313
+pablo_casals;1314
+the_original_dixieland_jazz_band;1315
+miley_cyrus;1316
+john_powell;1317
+the_black_eyed_peas;1318
+edvard_grieg;1319
+maurizio_pollini;1320
+nathan_milstein;1321
+twenty_one_pilots;1322
+rudolf_firku_n_;1323
+nine_inch_nails;1324
+the_four_aces;1325
+jimmy_rogers;1326
+salt_n_pepa;1327
+yellow_claw;1328
+the_strokes;1329
+bobby_brown;1330
+juelz_santana;1331
+staind;1332
+bj_rn_ulvaeus_benny_andersson;1333
+the_j_geils_band;1334
+the_muppets;1335
+gwen_stefani;1336
+fedde_le_grand;1337
+imagine_dragons;1338
+leo_sayer;1339
+the_animals;1340
+the_b_52_s;1341
+furry_lewis;1342
+ciara;1343
+larry_clinton;1344
+dire_straits;1345
+pato_banton_the_reggae_revol;1346
+goodie_mob;1347
+disclosure;1348
+georges_bizet;1349
+sonny_terry_brownie_mcghee;1350
+jim_croce;1351
+nelson;1352
+jason_derulo;1353
+guy_mitchell;1354
+the_fontane_sisters;1355
+sonny_boy_williamson_i;1356
+mirah;1357
+jimmy_rushing;1358
+john_michael_montgomery;1359
+michael_nesmith;1360
+george_clinton;1361
+burt_bacharach;1362
+v6;1363
+slim_thug;1364
+belinda_carlisle;1365
+philip_glass;1366
+slade;1367
+pete_townshend;1368
+oingo_boingo;1369
+andrew_lloyd_webber;1370
+the_collectors;1371
+boy_george;1372
+utada_hikaru;1373
+mel_torm;1374
+diana_krall;1375
+melanie_c;1376
+john_hammond;1377
+peter_green_splinter_group;1378
+trouble;1379
+g_unit;1380
+ferlin_husky;1381
+arcade_fire;1382
+latino;1383
+krayzie_bone;1384
+man;1385
+dave_clark_five;1386
+the_stone_roses;1387
+young_jeezy;1388
+blood_sweat_tears;1389
+kumikameli;1390
+dmx;1391
+ice_cube;1392
+eagles;1393
+jill_scott;1394
+xtc;1395
+peggy_march;1396
+michael_bubl;1397
+raimon;1398
+two_mix;1399
+dulce_pontes;1400
+the_unseen;1401
+hank_locklin;1402
+the_notorious_b_i_g;1403
+bumblefoot;1404
+the_busters;1405
+rick_ross;1406
+tegan_and_sara;1407
+skeeter_davis;1408
+curtis_mayfield;1409
+sade;1410
+wings;1411
+lorrie_morgan;1412
+saga;1413
+a_r_rahman;1414
+martha_wainwright;1415
+nas;1416
+will_i_am;1417
+kirsty_maccoll;1418
+angel;1419
+dave_davies;1420
+iggy_pop;1421
+jojo;1422
+sammy_hagar;1423
+ray_davies;1424
+and_one;1425
+neil_young;1426
+mikael_wiehe;1427
+the_cardigans;1428
+cage;1429
+dottie_west;1430
+keri_hilson;1431
+johnny_hallyday;1432
+bill_nelson;1433
+fifteen;1434
+deen;1435
+bobby_v;1436
+lil_yachty;1437
+too_hort;1438
+bernard_lavilliers;1439
+hank_thompson;1440
+the_chieftains;1441
+daryl_hall;1442
+antonio_carlos_jobim;1443
+aventura;1444
+the_the;1445
+van_morrison;1446
+wynonna_judd;1447
+gomez;1448
+charles_aznavour;1449
+m83;1450
+gnr;1451
+all;1452
+emmylou_harris;1453
+lee_hazlewood;1454
+mew;1455
+uriah_heep;1456
+yoko_ono;1457
+abw_rts;1458
+john_legend;1459
+d12;1460
+kitty_wells;1461
+timbiriche;1462
+shel_silverstein;1463
+cam_ron;1464
+rosanne_cash;1465
+2_chainz;1466
+tricky;1467
+8ball_mjg;1468
+flatt_scruggs;1469
+bill_anderson;1470
+emil_ana_torrini;1471
+ufo;1472
+mos_def;1473
+danzig;1474
+juan_gabriel;1475
+common;1476
+raekwon;1477
+france_gall;1478
+nicole_scherzinger;1479
+r_yksopp;1480
+sammie;1481
+lena_horne;1482
+david_byrne;1483
+paul_williams;1484
+josh_groban;1485
+the_gathering;1486
+frank_boeijen;1487
+scooter;1488
+steve_wariner;1489
+mika;1490
+pete_seeger;1491
+tex_ritter;1492
+warrant;1493
+porter_wagoner;1494
+field_music;1495
+three_6_mafia;1496
+jim_jones;1497
+daniel_o_donnell;1498
+brentalfloss;1499
+wyclef_jean;1500
+hey;1501
+bizzy_bone;1502
+the_mccalmans;1503
+blues_traveler;1504
+massive_attack;1505
+woody_guthrie;1506
+art_garfunkel;1507
+andrea_bocelli;1508
+david_crosby;1509
+dream;1510
+soul_asylum;1511
+natalie_merchant;1512
+shawn_colvin;1513
+jonny_lang;1514
+funeral_for_a_friend;1515
+boz_scaggs;1516
+example;1517
+lionel_hampton;1518
+the_tubes;1519
+marc_anthony;1520
+good_riddance;1521
+moonlight;1522
+marc_almond;1523
+rza;1524
+die_rzte;1525
+rbd;1526
+alejandro_fern_ndez;1527
+wanda_jackson;1528
+lara_fabian;1529
+julio_iglesias;1530
+jeff_beck;1531
+peabo_bryson;1532
+no_fun_at_all;1533
+prong;1534
+canibus;1535
+krs_one;1536
+u_s_bombs;1537
+trust;1538
+stonewall_jackson;1539
+jos_feliciano;1540
+m_a;1541
+polysics;1542
+n_e_r_d;1543
+sesame_street;1544
+lio;1545
+myl_ne_farmer;1546
+iris_dement;1547
+lily_allen;1548
+spoken;1549
+architects;1550
+jack_johnson;1551
+molly_hatchet;1552
+cypress_hill;1553
+future;1554
+the_nits;1555
+per_gessle;1556
+live;1557
+beau;1558
+deana_carter;1559
+lil_flip;1560
+fran_oise_hardy;1561
+billy_ray_cyrus;1562
+pepper;1563
+run_d_m_c;1564
+levon_helm;1565
+insane_clown_posse;1566
+stars;1567
+jean_michel_jarre;1568
+thunder;1569
+juanes;1570
+simple_plan;1571
+method_man;1572
+smash_mouth;1573
+meek_mill;1574
+fat_joe;1575
+kenny_wayne_shepherd;1576
+jhen_aiko;1577
+the_manhattan_transfer;1578
+joss_stone;1579
+cee_lo_green;1580
+tyrese;1581
+charlotte_martin;1582
+rodney_crowell;1583
+acappella;1584
+die_prinzen;1585
+pentatonix;1586
+abney_park;1587
+smooth_mcgroove;1588
+the_magnetic_fields;1589
+the_nylons;1590
+wise_guys;1591
+coil;1592
+do_as_infinity;1593
+lords_of_acid;1594
+the_church;1595
+chris_rea;1596
+jota_quest;1597
+miguel_bos;1598
+gang_starr;1599
+masta_ace;1600
+brand_new;1601
+mac_miller;1602
+cathedral;1603
+corrosion_of_conformity;1604
+country_joe_mcdonald;1605
+eric_johnson;1606
+grateful_dead;1607
+janis_joplin;1608
+john_miles;1609
+king_gizzard_the_lizard_wizard;1610
+ted_nugent;1611
+combichrist;1612
+alkaline_trio;1613
+anathema;1614
+angus_julia_stone;1615
+anna_ternheim;1616
+anthony_phillips;1617
+steve_hackett;1618
+aviators;1619
+banda_calypso;1620
+blue_stahli;1621
+the_boys;1622
+capital_inicial;1623
+city_and_colour;1624
+colin_hay;1625
+collective_soul;1626
+dashboard_confessional;1627
+david_rovics;1628
+david_usher;1629
+die_toten_hosen;1630
+funny_van_dannen;1631
+dirty_heads;1632
+tech_n9ne;1633
+elisa;1634
+emmerson_nogueira;1635
+engenheiros_do_hawaii;1636
+eric_bibb;1637
+maria_muldaur;1638
+panic_at_the_disco;1639
+punchline;1640
+godsmack;1641
+self;1642
+heideroosjes;1643
+sinner;1644
+heather_nova;1645
+hoobastank;1646
+quietdrive;1647
+hyde;1648
+jaguares;1649
+bert_jansch;1650
+jonatha_brooke;1651
+joni_mitchell;1652
+the_band;1653
+josh_garrels;1654
+josh_woodward;1655
+william_fitzsimmons;1656
+katie_melua;1657
+jamie_cullum;1658
+kristin_hersh;1659
+kt_tunstall;1660
+legi_o_urbana;1661
+the_zombies;1662
+francesco_de_gregori;1663
+m_ward;1664
+beth_orton;1665
+magnum;1666
+motorpsycho;1667
+marillion;1668
+jars_of_clay;1669
+mason_jennings;1670
+matt_nathanson;1671
+matthew_good;1672
+edguy;1673
+gamma_ray;1674
+minus_the_bear;1675
+mohsen_namjoo;1676
+nerina_pallot;1677
+never_shout_never;1678
+regina_spektor;1679
+passenger;1680
+paul_kelly;1681
+the_style_council;1682
+peter_hammill;1683
+phantom_planet;1684
+phil_keaggy;1685
+richard_thompson;1686
+said_the_whale;1687
+samsas_traum;1688
+senses_fail;1689
+sevendust;1690
+seventh_day_slumber;1691
+joan_baez;1692
+sister_hazel;1693
+slightly_stoopid;1694
+sophie_zelmani;1695
+suzanne_vega;1696
+tatiana;1697
+teoman;1698
+the_choir;1699
+charlotte_church;1700
+darlene_zschech;1701
+the_front_bottoms;1702
+the_maine;1703
+the_white_buffalo;1704
+little_big_town;1705
+threshold;1706
+tourniquet;1707
+everything_but_the_girl;1708
+vertical_horizon;1709
+vonda_shepard;1710
+warren_zevon;1711
+sarah_brightman;1712
+blackfoot;1713
+black_label_society;1714
+z_lia_duncan;1715
+2;1716
+alejandro_lerner;1717
+beth_nielsen_chapman;1718
+mercury_rev;1719
+brian_wilson;1720
+barenaked_ladies;1721
+carbon_leaf;1722
+celtic_woman;1723
+hayley_westenra;1724
+crowded_house;1725
+delta_goodrem;1726
+elbow;1727
+resurrection_band;1728
+nancy_wilson;1729
+janis_ian;1730
+jann_arden;1731
+jill_sobule;1732
+jos_augusto;1733
+xuxa;1734
+k_d_lang;1735
+kim_carnes;1736
+los_lobos;1737
+mandy_moore;1738
+marc_cohn;1739
+maureen_mcgovern;1740
+melissa_manchester;1741
+patti_labelle;1742
+helene_fischer;1743
+laura_pausini;1744
+ivan_lins;1745
+thal_a;1746
+mike_the_mechanics;1747
+paul_carrack;1748
+natasha_st_pier;1749
+michael_mcdonald;1750
+olivia;1751
+dizzee_rascal;1752
+sam_phillips;1753
+serge_gainsbourg;1754
+jane_birkin;1755
+luis_miguel;1756
+sondre_lerche;1757
+stan_ridgway;1758
+susan_boyle;1759
+mike_oldfield;1760
+ces_ria_vora;1761
+agonoize;1762
+funker_vogt;1763
+god_module;1764
+hocico;1765
+nachtmahr;1766
+suicide_commando;1767
+alejandro_escovedo;1768
+southside_johnny_the_asbury_jukes;1769
+broken_social_scene;1770
+vigilantes_of_love;1771
+billy_bragg;1772
+wilco;1773
+frank_black_and_the_catholics;1774
+blue_rodeo;1775
+brandi_carlile;1776
+patty_griffin;1777
+calexico;1778
+cass_mccombs;1779
+chris_knight;1780
+conor_oberst;1781
+corb_lund;1782
+cowboy_junkies;1783
+cracker;1784
+cross_canadian_ragweed;1785
+dave_alvin;1786
+drive_by_truckers;1787
+eleni_mandell;1788
+lucinda_williams;1789
+fred_eaglesmith;1790
+dar_williams;1791
+the_jayhawks;1792
+lana_del_rey;1793
+tim_o_brien;1794
+hank_williams_iii;1795
+james_mcmurtry;1796
+joe_henry;1797
+john_stewart;1798
+josh_ritter;1799
+lambchop;1800
+nanci_griffith;1801
+norma_jean;1802
+lyle_lovett;1803
+matthew_ryan;1804
+my_morning_jacket;1805
+neko_case;1806
+the_new_pornographers;1807
+blue_october;1808
+okkervil_river;1809
+old_97_s;1810
+ray_wylie_hubbard;1811
+richmond_fontaine;1812
+robert_earl_keen;1813
+rocky_votolato;1814
+ryan_adams;1815
+whiskeytown;1816
+son_volt;1817
+steve_earle;1818
+the_avett_brothers;1819
+the_bottle_rockets;1820
+the_felice_brothers;1821
+arlo_guthrie;1822
+the_handsome_family;1823
+the_mavericks;1824
+ten_years_after;1825
+the_walkabouts;1826
+todd_snider;1827
+vic_chesnutt;1828
+iron_wine;1829
+wovenhand;1830
+x;1831
+big_audio_dynamite;1832
+globe;1833
+carter_the_unstoppable_sex_machine;1834
+allison_moorer;1835
+front_line_assembly;1836
+the_national;1837
+the_fall;1838
+public_image_ltd;1839
+public_enemy;1840
+wire;1841
+a_tribe_called_quest;1842
+de_la_soul;1843
+aesop_rock;1844
+buck_65;1845
+caparezza;1846
+childish_gambino;1847
+the_roots;1848
+colbie_caillat;1849
+big_sean;1850
+dj_gruff;1851
+tyler_the_creator;1852
+dokken;1853
+fun_lovin_criminals;1854
+talib_kweli;1855
+jane_air;1856
+k_i_z;1857
+kid_cudi;1858
+jedi_mind_tricks;1859
+celph_titled;1860
+lupe_fiasco;1861
+bun_b;1862
+scarface;1863
+ghostface_killah;1864
+robyn;1865
+rehab;1866
+swollen_members;1867
+styles_p;1868
+the_streets;1869
+wale;1870
+joe_budden;1871
+tank;1872
+10_years;1873
+36_crazyfists;1874
+apocalyptica;1875
+nina_hagen;1876
+anastacia;1877
+black_stone_cherry;1878
+blindside;1879
+breaking_benjamin;1880
+bring_me_the_horizon;1881
+bullet_for_my_valentine;1882
+cave_in;1883
+chevelle;1884
+d_espairsray;1885
+death_angel;1886
+deftones;1887
+demon_hunter;1888
+demon;1889
+devin_townsend_project;1890
+devin_townsend;1891
+doa;1892
+dir_en_grey;1893
+disturbed;1894
+dope;1895
+drowning_pool;1896
+eighteen_visions;1897
+entombed;1898
+faith_no_more;1899
+fear_factory;1900
+fightstar;1901
+five_finger_death_punch;1902
+finger_eleven;1903
+flyleaf;1904
+grinspoon;1905
+guano_apes;1906
+h_blockx;1907
+halestorm;1908
+hamlet;1909
+helmet;1910
+bt;1911
+ill_ni_o;1912
+in_flames;1913
+in_this_moment;1914
+him;1915
+j_b_o;1916
+katatonia;1917
+killswitch_engage;1918
+xzibit;1919
+lacuna_coil;1920
+phish;1921
+limp_bizkit;1922
+living_colour;1923
+viikate;1924
+marilyn_manson;1925
+megaherz;1926
+falco;1927
+melvins;1928
+monster_magnet;1929
+mushroomhead;1930
+nonpoint;1931
+soil;1932
+otep;1933
+p_o_d;1934
+powerman_5000;1935
+primus;1936
+project_86;1937
+red;1938
+kris_allen;1939
+rob_zombie;1940
+ozzy_osbourne;1941
+rollins_band;1942
+saliva;1943
+sepultura;1944
+shihad;1945
+skillet;1946
+skindred;1947
+slipknot;1948
+smile_empty_soul;1949
+danielson;1950
+soilwork;1951
+sonic_syndicate;1952
+static_x;1953
+stone_sour;1954
+taproot;1955
+the_notwist;1956
+the_word_alive;1957
+theory_of_a_deadman;1958
+therapy;1959
+los_tucanes_de_tijuana;1960
+manu_chao;1961
+volbeat;1962
+zebrahead;1963
+hed_p_e;1964
+and_you_will_know_us_by_the_trail_of_dead;1965
+10_000_maniacs;1966
+311;1967
+77s;1968
+yes;1969
+david_lee_roth;1970
+hillsong;1971
+afi;1972
+adam_sandler;1973
+afterhours;1974
+hawkwind;1975
+all_about_eve;1976
+all_time_low;1977
+allison_crowe;1978
+amanda_palmer;1979
+american_music_club;1980
+amplifier;1981
+robert_wyatt;1982
+anberlin;1983
+andrew_bird;1984
+ani_difranco;1985
+apoptygma_berzerk;1986
+apulanta;1987
+arab_strap;1988
+joseph_arthur;1989
+tom_rosenthal;1990
+ash;1991
+asian_kung_fu_generation;1992
+poets_of_the_fall;1993
+babas_nicos;1994
+bayside;1995
+beatsteaks;1996
+ben_folds;1997
+ben_folds_five;1998
+ben_harper;1999
+better_than_ezra;2000
+bettie_serveert;2001
+big_country;2002
+big_head_todd_and_the_monsters;2003
+big_sugar;2004
+billy_talent;2005
+today_is_the_day;2006
+red_flag;2007
+black_rebel_motorcycle_club;2008
+megadeth;2009
+blonde_redhead;2010
+bob_mould;2011
+bodeans;2012
+bowling_for_soup;2013
+buck_tick;2014
+butch_walker;2015
+butthole_surfers;2016
+caf_tacvba;2017
+cake;2018
+camper_van_beethoven;2019
+carmen_consoli;2020
+mario_venuti;2021
+franco_battiato;2022
+catherine_wheel;2023
+catupecu_machu;2024
+cem_adrian;2025
+john_cale;2026
+charlie_brown_jr;2027
+nena;2028
+chumbawamba;2029
+clutch;2030
+cl;2031
+cmx;2032
+coheed_and_cambria;2033
+cold_war_kids;2034
+travis;2035
+coma;2036
+concrete_blonde;2037
+mint_condition;2038
+copeland;2039
+crash_test_dummies;2040
+joe_jackson;2041
+cristian_castro;2042
+curve;2043
+dada;2044
+daniel_amos;2045
+daniel_johnston;2046
+dave_matthews_band;2047
+burning_heads;2048
+david_gray;2049
+david_sylvian;2050
+deacon_blue;2051
+deerhoof;2052
+del_amitri;2053
+dinosaur_jr;2054
+dirty_projectors;2055
+draco_rosa;2056
+duncan_sheik;2057
+jeremy_camp;2058
+edwyn_collins;2059
+eels;2060
+nightwish;2061
+element_of_crime;2062
+embrace;2063
+enter_shikari;2064
+ulver;2065
+everclear;2066
+everlast;2067
+eyeshine;2068
+dio;2069
+faust_o;2070
+feeder;2071
+atmosphere;2072
+filter;2073
+firewater;2074
+fishbone;2075
+fountains_of_wayne;2076
+four_year_strong;2077
+steve_green;2078
+fresno;2079
+gang_of_four;2080
+good_charlotte;2081
+blood_on_the_dance_floor;2082
+graham_coxon;2083
+melissa_etheridge;2084
+tony_joe_white;2085
+guided_by_voices;2086
+robert_pollard;2087
+guster;2088
+elliott_smith;2089
+hedley;2090
+hole;2091
+hollywood_undead;2092
+hot_chip;2093
+l_arc_en_ciel;2094
+ian_brown;2095
+idlewild;2096
+jimmy_eat_world;2097
+fish;2098
+ingrid_michaelson;2099
+inme;2100
+inspiral_carpets;2101
+raf;2102
+james;2103
+jean_leloup;2104
+weird_al_yankovic;2105
+jeff_buckley;2106
+john_frusciante;2107
+dr_john;2108
+pj_harvey;2109
+jonathan_coulton;2110
+juliana_hatfield;2111
+julieta_venegas;2112
+k_s_choice;2113
+kaizers_orchestra;2114
+kargo;2115
+kasabian;2116
+keane;2117
+kevin_coyne;2118
+kevin_devine;2119
+kevin_max;2120
+rich_mullins;2121
+trooper;2122
+suzy_bogguss;2123
+kill_hannah;2124
+kisp_l_s_a_borz;2125
+kult;2126
+my_life_with_the_thrill_kill_kult;2127
+kutless;2128
+la_barranca;2129
+la_ley;2130
+lao_che;2131
+lech_janerka;2132
+les_cowboys_fringants;2133
+les_fatals_picards;2134
+les_rita_mitsouko;2135
+sparks;2136
+lifehouse;2137
+lisa_germano;2138
+ed_harcourt;2139
+lisa_loeb;2140
+liz_phair;2141
+local_h;2142
+lost_dogs;2143
+lostprophets;2144
+love_and_rockets;2145
+lucybell;2146
+lulu_santos;2147
+gabriel_o_pensador;2148
+adam_lambert;2149
+madrugada;2150
+mancha_de_rolando;2151
+manchester_orchestra;2152
+mando_diao;2153
+foetus;2154
+mark_lanegan;2155
+matthew_sweet;2156
+max_mo_park;2157
+mayday_parade;2158
+meat_puppets;2159
+men_without_hats;2160
+meshell_ndegeocello;2161
+midnight_oil;2162
+dance_gavin_dance;2163
+molotov;2164
+ov7;2165
+monkey_majik;2166
+suede;2167
+fernando_ortega;2168
+motion_city_soundtrack;2169
+mudhoney;2170
+mutemath;2171
+mercyme;2172
+m_o_morta;2173
+natalia_lafourcade;2174
+natewantstobattle;2175
+needtobreathe;2176
+split_enz;2177
+sum_41;2178
+no_te_va_gustar;2179
+noir_d_sir;2180
+t_tes_raides;2181
+o_rappa;2182
+o_a_r;2183
+ocean_colour_scene;2184
+omul_cu_obolani;2185
+one_ok_rock;2186
+2raumwohnung;2187
+our_lady_peace;2188
+pain;2189
+panda;2190
+parokya_ni_edgar;2191
+pato_fu;2192
+paul_westerberg;2193
+pere_ubu;2194
+pete_yorn;2195
+peter_murphy;2196
+placebo;2197
+plain_white_t_s;2198
+pop_will_eat_itself;2199
+porcupine_tree;2200
+powderfinger;2201
+cat_power;2202
+casting_crowns;2203
+primal_scream;2204
+m_tley_cr_e;2205
+the_used;2206
+raimundos;2207
+mark_knopfler;2208
+mark_kozelek;2209
+danko_jones;2210
+relient_k;2211
+raffi;2212
+renaud;2213
+richard_hawley;2214
+rickie_lee_jones;2215
+the_shins;2216
+rilo_kiley;2217
+robyn_hitchcock;2218
+mose_allison;2219
+roy_harper;2220
+rucka_rucka_ali;2221
+rx_bandits;2222
+saez;2223
+samiam;2224
+sarah_slean;2225
+say_anything;2226
+scout_niblett;2227
+screaming_females;2228
+shannon_wright;2229
+silverchair;2230
+sin_ad_o_connor;2231
+siouxsie_and_the_banshees;2232
+sixpence_none_the_richer;2233
+skank;2234
+skunk_anansie;2235
+sleater_kinney;2236
+sloan;2237
+social_distortion;2238
+sophie_hunger;2239
+e_40;2240
+steve_wynn;2241
+subsonica;2242
+joe_walsh;2243
+super_furry_animals;2244
+superchunk;2245
+supergrass;2246
+swervedriver;2247
+switchfoot;2248
+dido;2249
+takida;2250
+taking_back_sunday;2251
+teenage_fanclub;2252
+w_a_s_p;2253
+the_afghan_whigs;2254
+the_apples_in_stereo;2255
+the_ataris;2256
+smoking_popes;2257
+the_bluetones;2258
+the_breeders;2259
+the_cat_empire;2260
+the_charlatans_uk;2261
+the_clarks;2262
+guy_clark;2263
+the_comsat_angels;2264
+the_connells;2265
+the_coral;2266
+the_cribs;2267
+the_cult;2268
+bobby_o;2269
+the_mission;2270
+blue_yster_cult;2271
+the_dandy_warhols;2272
+the_dear_hunter;2273
+the_decemberists;2274
+the_early_november;2275
+thievery_corporation;2276
+the_fratellis;2277
+the_gaslight_anthem;2278
+jim_brickman;2279
+falling_up;2280
+the_hives;2281
+the_innocence_mission;2282
+the_jazz_butcher;2283
+the_jesus_lizard;2284
+the_lemonheads;2285
+babyshambles;2286
+the_living_end;2287
+the_matrixx;2288
+the_mother_hips;2289
+the_mountain_goats;2290
+the_muffs;2291
+the_pillows;2292
+the_posies;2293
+the_presidents_of_the_united_states_of_america;2294
+the_rasmus;2295
+the_raveonettes;2296
+the_saints;2297
+the_samples;2298
+bad_religion;2299
+the_smithereens;2300
+the_soundtrack_of_our_lives;2301
+the_tea_party;2302
+mayday;2303
+the_triffids;2304
+the_vines;2305
+the_violet_burning;2306
+the_wallflowers;2307
+testament;2308
+the_divine_comedy;2309
+third_day;2310
+thrice;2311
+tindersticks;2312
+tism;2313
+tit_s;2314
+toad_the_wet_sprocket;2315
+tocotronic;2316
+tom_mcrae;2317
+tori_amos;2318
+tracy_chapman;2319
+trashcan_sinatras;2320
+tre_allegri_ragazzi_morti;2321
+tub_ring;2322
+unkle;2323
+unwritten_law;2324
+uverworld;2325
+vast;2326
+verdena;2327
+veruca_salt;2328
+face_to_face;2329
+virus;2330
+voltaire;2331
+we_the_kings;2332
+the_kooks;2333
+lindisfarne;2334
+seals_crofts;2335
+andy_partridge;2336
+xutos_pontap_s;2337
+yellowcard;2338
+yup;2339
+leevi_and_the_leavings;2340
+zo;2341
+zucchero;2342
+z;2343
+ebnem_ferah;2344
+air;2345
+alice;2346
+boards_of_canada;2347
+brian_eno;2348
+burzum;2349
+daniel_lanois;2350
+enigma;2351
+juana_molina;2352
+lisa_gerrard;2353
+nox_arcana;2354
+renard;2355
+schiller;2356
+sigur_r_s;2357
+steven_wilson;2358
+swans;2359
+wolfgun;2360
+xiu_xiu;2361
+michael_johnson;2362
+montgomery_gentry;2363
+the_stanley_brothers;2364
+john_waite;2365
+shelby_lynne;2366
+judy_collins;2367
+burl_ives;2368
+the_irish_rovers;2369
+david_wilcox;2370
+devendra_banhart;2371
+doc_watson;2372
+bill_monroe;2373
+michael_martin_murphey;2374
+gordon_bok;2375
+asleep_at_the_wheel;2376
+the_browns;2377
+nana_mouskouri;2378
+jerry_jeff_walker;2379
+steve_goodman;2380
+malcolm_holcombe;2381
+malvina_reynolds;2382
+odetta;2383
+tom_paxton;2384
+strawbs;2385
+phil_ochs;2386
+harry_chapin;2387
+ramblin_jack_elliott;2388
+roger_mcguinn;2389
+gene_clark;2390
+mat_kearney;2391
+the_brothers_four;2392
+tom_russell;2393
+townes_van_zandt;2394
+uncle_dave_macon;2395
+delbert_mcclinton;2396
+john_hiatt;2397
+justin_townes_earle;2398
+mark_erelli;2399
+over_the_rhine;2400
+steve_forbert;2401
+manfred_mann_s_earth_band;2402
+mot_rhead;2403
+rudimentary_peni;2404
+illapu;2405
+inti_illimani;2406
+quilapay_n;2407
+v_ctor_jara;2408
+skylark;2409
+adam_green;2410
+cold_chisel;2411
+guy_sebastian;2412
+jefferson_starship;2413
+the_alan_parsons_project;2414
+ali_project;2415
+modern_talking;2416
+animal_collective;2417
+banco_del_mutuo_soccorso;2418
+ben_lee;2419
+bryan_ferry;2420
+buffy_sainte_marie;2421
+colin_blunstone;2422
+cursive;2423
+elysian_fields;2424
+emerson_lake_palmer;2425
+gino_vannelli;2426
+g_rard_manset;2427
+hot_dad;2428
+marina_and_the_diamonds;2429
+ismo_alanko;2430
+kansas;2431
+kari_peitsamo;2432
+laibach;2433
+laurie_anderson;2434
+puhdys;2435
+na_o_zumbi;2436
+roger_waters;2437
+rush;2438
+the_walker_brothers;2439
+hilltop_hoods;2440
+wolfgang_ambros;2441
+erste_allgemeine_verunsicherung;2442
+jacques_brel;2443
+rainhard_fendrich;2444
+tom_waits;2445
+adrian_belew;2446
+anne_clark;2447
+can;2448
+captain_beefheart_and_the_magic_band;2449
+deine_lakaien;2450
+devo;2451
+einst_rzende_neubauten;2452
+frank_zappa;2453
+goethes_erben;2454
+wishbone_ash;2455
+death_cab_for_cutie;2456
+antony_and_the_johnsons;2457
+jandek;2458
+nevermore;2459
+king_crimson;2460
+king_missile;2461
+the_residents;2462
+steeleye_span;2463
+vampire_rodents;2464
+the_walkmen;2465
+dog_fashion_disco;2466
+freak_kitchen;2467
+sigh;2468
+children_of_bodom;2469
+soft_machine;2470
+ara_ketu;2471
+asa_de_guia;2472
+banda_eva;2473
+ivete_sangalo;2474
+chiclete_com_banana;2475
+daniela_mercury;2476
+alejandro_sanz;2477
+timbalada;2478
+juan_luis_guerra;2479
+daddy_yankee;2480
+alceu_valen_a;2481
+luiz_gonzaga;2482
+matia_bazar;2483
+axelle_red;2484
+barbara;2485
+benny_neyman;2486
+gigi_d_agostino;2487
+jacques_higelin;2488
+caetano_veloso;2489
+gal_costa;2490
+jorge_ben;2491
+die_flippers;2492
+nicole;2493
+angra;2494
+reinhard_mey;2495
+wolf_biermann;2496
+florent_pagny;2497
+hannes_wader;2498
+tienne_daho;2499
+henri_salvador;2500
+f_lix_leclerc;2501
+daniel_lavoie;2502
+gerhard_sch_ne;2503
+g_lben_ergen;2504
+georg_kreisler;2505
+herbert_gr_nemeyer;2506
+herman_van_veen;2507
+hildegard_knef;2508
+marlene_dietrich;2509
+iu;2510
+jos_luis_rodr_guez;2511
+juliette_gr_co;2512
+klaus_hoffmann;2513
+konstantin_wecker;2514
+saltatio_mortis;2515
+luigi_tenco;2516
+maria_beth_nia;2517
+adriana_calcanhotto;2518
+marie_lafor_t;2519
+marius_m_ller_westernhagen;2520
+mina;2521
+no_l_coward;2522
+pippo_pollina;2523
+rita_lee;2524
+os_mutantes;2525
+rita_pavone;2526
+roger_whittaker;2527
+al_bano_romina_power;2528
+salvatore_adamo;2529
+simone;2530
+s_rgio_godinho;2531
+udo_j_rgens;2532
+udo_lindenberg;2533
+ulrich_roski;2534
+zaz;2535
+z_ramalho;2536
+fagner;2537
+dith_piaf;2538
+duelo;2539
+espinoza_paz;2540
+fidel_rueda;2541
+la_firma;2542
+la_arrolladora_banda_el_lim_n;2543
+voz_de_mando;2544
+sergio_vega;2545
+fool_s_garden;2546
+waltari;2547
+of_montreal;2548
+pierre_lapointe;2549
+rufus_wainwright;2550
+loudon_wainwright_iii;2551
+sufjan_stevens;2552
+machine_gun_kelly;2553
+francesco_guccini;2554
+le_orme;2555
+lucio_dalla;2556
+michel_fugain;2557
+al_jarreau;2558
+carmen_mcrae;2559
+javier_sol_s;2560
+harry_connick_jr;2561
+bap;2562
+cradle_of_filth;2563
+amorphis;2564
+avatar;2565
+bathory;2566
+behemoth;2567
+borknagar;2568
+countess;2569
+cruachan;2570
+darkthrone;2571
+hate;2572
+destruction;2573
+dimmu_borgir;2574
+eisregen;2575
+enslaved;2576
+finntroll;2577
+fates_warning;2578
+graveworm;2579
+impaled_nazarene;2580
+sentenced;2581
+king_diamond;2582
+kreator;2583
+lord_belial;2584
+marduk;2585
+mercyful_fate;2586
+stick_to_your_guns;2587
+moonspell;2588
+as_i_lay_dying;2589
+nunslaughter;2590
+rotting_christ;2591
+samael;2592
+sandy_denny;2593
+skyforger;2594
+sodom;2595
+cannibal_corpse;2596
+exodus;2597
+atreyu;2598
+theatres_des_vampires;2599
+wizard;2600
+transmetal;2601
+venom;2602
+belphegor;2603
+the_crown;2604
+moya_brennan;2605
+todd_rundgren;2606
+clay_walker;2607
+andrew_peterson;2608
+lynn_anderson;2609
+david_crowder_band;2610
+pam_tillis;2611
+norah_jones;2612
+rhonda_vincent;2613
+jamey_johnson;2614
+plumb;2615
+j_j_cale;2616
+new_riders_of_the_purple_sage;2617
+joe_diffie;2618
+kasey_chambers;2619
+leon_russell;2620
+jack_greene;2621
+the_string_cheese_incident;2622
+ystein_sunde;2623
+stephen_stills;2624
+cancerslug;2625
+robert_plant;2626
+alvin_lee;2627
+beth_hart;2628
+jimmy_buffett;2629
+billy_s_band;2630
+bunbury;2631
+nacho_vegas;2632
+calogero;2633
+georges_brassens;2634
+canned_heat;2635
+charlie_louvin;2636
+colin_james;2637
+cuby_blizzards;2638
+dick_annegarn;2639
+edoardo_bennato;2640
+eva_cassidy;2641
+gil_scott_heron;2642
+glenn_hughes;2643
+deep_purple;2644
+connie_smith;2645
+iva_zanicchi;2646
+izzy_stradlin;2647
+j_karjalainen;2648
+jack_bruce;2649
+leonard_cohen;2650
+joan_armatrading;2651
+joan_osborne;2652
+john_martyn;2653
+rio_reiser;2654
+larry_carlton;2655
+madeleine_peyroux;2656
+bruce_cockburn;2657
+kate_anna_mcgarrigle;2658
+mavis_staples;2659
+noa;2660
+ralph_mctell;2661
+renato_carosone;2662
+richie_kotzen;2663
+robben_ford;2664
+roberto_carlos;2665
+erasmo_carlos;2666
+robin_trower;2667
+rory_block;2668
+roy_buchanan;2669
+sandra_mihanovich;2670
+savoy_brown;2671
+shirley_horn;2672
+siniestro_total;2673
+slank;2674
+the_fabulous_thunderbirds;2675
+the_seatbelts;2676
+the_tragically_hip;2677
+mike_jones;2678
+trophy_scars;2679
+caravan;2680
+velhas_virgens;2681
+walter_trout;2682
+gov_t_mule;2683
+bar_o_vermelho;2684
+blue_cheer;2685
+ian_hunter;2686
+david_leb_n;2687
+de_palmas;2688
+eugenio_finardi;2689
+extreme;2690
+foghat;2691
+george_thorogood_the_destroyers;2692
+great_white;2693
+guardian;2694
+jethro_tull;2695
+ian_anderson;2696
+david_knopfler;2697
+steppenwolf;2698
+dave_edmunds;2699
+lynyrd_skynyrd;2700
+crosby_stills_nash;2701
+raul_seixas;2702
+the_poodles;2703
+musiq_soulchild;2704
+shocking_blue;2705
+nick_lowe;2706
+the_black_crowes;2707
+traffic;2708
+widespread_panic;2709
+co;2710
+alberto_cortez;2711
+joan_sebastian;2712
+ana_gabriel;2713
+gilberto_santa_rosa;2714
+rub_n_blades;2715
+v_ctor_manuelle;2716
+celia_cruz;2717
+luis_fonsi;2718
+nek;2719
+dr_feelgood;2720
+astrud_gilberto;2721
+benito_di_paula;2722
+brazzaville;2723
+sacha_distel;2724
+chico_buarque;2725
+elis_regina;2726
+milton_nascimento;2727
+faf_de_bel_m;2728
+nikka_costa;2729
+tim_maia;2730
+gilberto_gil;2731
+lisa_ekdahl;2732
+joyce;2733
+maria_rita;2734
+nara_le_o;2735
+nouvelle_vague;2736
+paulinho_moska;2737
+wilson_simonal;2738
+14_bis;2739
+arnaldo_antunes;2740
+biquini_cavad_o;2741
+cidade_negra;2742
+cpm_22;2743
+c_ssia_eller;2744
+os_paralamas_do_sucesso;2745
+guilherme_arantes;2746
+ira;2747
+lob_o;2748
+nenhum_de_n_s;2749
+djavan;2750
+rog_rio_skylab;2751
+roupa_nova;2752
+ultraje_a_rigor;2753
+kj_52;2754
+amado_batista;2755
+chit_ozinho_xoror;2756
+jo_o_paulo_daniel;2757
+leandro_leonardo;2758
+leonardo;2759
+odair_jos;2760
+kaiser_chiefs;2761
+kula_shaker;2762
+lightning_seeds;2763
+pulp;2764
+the_proclaimers;2765
+dying_fetus;2766
+napalm_death;2767
+nile;2768
+pathology;2769
+hilary_duff;2770
+badly_drawn_boy;2771
+federico_salvatore;2772
+i_gufi;2773
+zachary_richard;2774
+stan_rogers;2775
+moxy_fr_vous;2776
+poco;2777
+la_bottine_souriante;2778
+stompin_tom_connors;2779
+bersuit_vergarabat;2780
+las_pastillas_del_abuelo;2781
+george_lam;2782
+altan;2783
+clannad;2784
+blackmore_s_night;2785
+capercaillie;2786
+celtic_thunder;2787
+eluveitie;2788
+powerwolf;2789
+gaelic_storm;2790
+an_na;2791
+jon_anderson;2792
+the_dubliners;2793
+loreena_mckennitt;2794
+omnia;2795
+secret_garden;2796
+shaun_davey;2797
+roger_daltrey;2798
+the_corrs;2799
+los_tigres_del_norte;2800
+laurent_voulzy;2801
+the_kelly_family;2802
+wolfe_tones;2803
+alan_stivell;2804
+heather_alexander;2805
+kate_rusby;2806
+dropkick_murphys;2807
+great_big_sea;2808
+fiddler_s_green;2809
+heather_dale;2810
+runrig;2811
+the_waterboys;2812
+dougie_maclean;2813
+adriano_celentano;2814
+alain_chamfort;2815
+zazie;2816
+hamelen;2817
+tazenda;2818
+arno;2819
+arthur_h;2820
+boudewijn_de_groot;2821
+charles_trenet;2822
+claudio_baglioni;2823
+claudio_rocchi;2824
+fabrizio_de_andr;2825
+dalida;2826
+dana_winner;2827
+demis_roussos;2828
+esther_ofarim;2829
+eugenio_bennato;2830
+michel_berger;2831
+francis_cabrel;2832
+maxime_le_forestier;2833
+georges_moustaki;2834
+gianmaria_testa;2835
+gianni_morandi;2836
+gigliola_cinquetti;2837
+milva;2838
+gilbert_b_caud;2839
+ginette_reno;2840
+giuni_russo;2841
+guy_b_art;2842
+helena_vondr_kov;2843
+hugues_aufray;2844
+ivan_graziani;2845
+ivano_fossati;2846
+jacques_bertin;2847
+jean_ferrat;2848
+juliane_werding;2849
+julien_clerc;2850
+los_temerarios;2851
+katerine;2852
+leny_escudero;2853
+mathieu_chedid;2854
+luca_barbarossa;2855
+l_o_ferr;2856
+rosenstolz;2857
+marc_lavoine;2858
+massimo_bubola;2859
+mecano;2860
+mia_martini;2861
+michel_jonasz;2862
+michele_zarrillo;2863
+fiorello;2864
+nada;2865
+mercedes_sosa;2866
+nino_d_angelo;2867
+patrick_bruel;2868
+patty_pravo;2869
+pierre_bachelet;2870
+rainald_grebe;2871
+rapha_l;2872
+raphael;2873
+richard_anthony;2874
+roberto_murolo;2875
+ron;2876
+stefano_rosso;2877
+stephan_eicher;2878
+vasco_rossi;2879
+yves_duteil;2880
+yves_jamait;2881
+ang_lica;2882
+aaron_carter;2883
+barry_louis_polisar;2884
+yuri;2885
+cri_cri;2886
+hevisaurus;2887
+juice_leskinen;2888
+kidz_bop;2889
+mara_maravilha;2890
+destroyer;2891
+scorpions;2892
+obk;2893
+duncan_dhu;2894
+parry_gripp;2895
+sandy_junior;2896
+the_verve_pipe;2897
+the_verve;2898
+the_wiggles;2899
+veggietales;2900
+newsboys;2901
+steven_curtis_chapman;2902
+toro_y_moi;2903
+medi_val_b_bes;2904
+aaron_neville;2905
+bethel_music;2906
+apologetix;2907
+gaither_vocal_band;2908
+building_429;2909
+chris_tomlin;2910
+matt_maher;2911
+jerusalem;2912
+david_meece;2913
+debby_boone;2914
+elevation_worship;2915
+matt_redman;2916
+planetshakers;2917
+majesty;2918
+jump5;2919
+lecrae;2920
+michael_w_smith;2921
+bride;2922
+natalie_grant;2923
+the_lads;2924
+audio_adrenaline;2925
+paul_wilbur;2926
+psalmen_voor_nu;2927
+sawyer_brown;2928
+shane_shane;2929
+the_echoing_green;2930
+twila_paris;2931
+watch_tower_bible_and_tract_society;2932
+da_t_r_u_t_h;2933
+dc_talk;2934
+flame;2935
+grits;2936
+trip_lee;2937
+crystal_lewis;2938
+the_cross_movement;2939
+tobymac;2940
+vico_c;2941
+mormon_tabernacle_choir;2942
+august_burns_red;2943
+black_veil_brides;2944
+deliverance;2945
+opeth;2946
+die_happy;2947
+disciple;2948
+galactic_cowboys;2949
+haste_the_day;2950
+living_sacrifice;2951
+mastodon;2952
+mortification;2953
+showbread;2954
+labyrinth;2955
+stryper;2956
+the_devil_wears_prada;2957
+underoath;2958
+whitecross;2959
+petra;2960
+huntingtons;2961
+mxpx;2962
+d_a_d;2963
+caedmon_s_call;2964
+david_and_the_giants;2965
+degarmo_and_key;2966
+delirious;2967
+don_francisco;2968
+five_iron_frenzy;2969
+geoff_moore;2970
+hawk_nelson;2971
+grave;2972
+larry_norman;2973
+randy_stonehill;2974
+monty_python;2975
+oomph;2976
+oficina_g3;2977
+white_heart;2978
+rescate;2979
+rick_wakeman;2980
+la_oreja_de_van_gogh;2981
+sanctus_real;2982
+fun_people;2983
+thousand_foot_krutch;2984
+tim_hughes;2985
+the_o_c_supertones;2986
+4him;2987
+billy_gilman;2988
+aimee_mann;2989
+katharine_mcphee;2990
+eros_ramazzotti;2991
+z_ro;2992
+babbie_mason;2993
+bebo_norman;2994
+judy_garland;2995
+carman;2996
+cece_winans;2997
+trick_daddy;2998
+chris_isaak;2999
+cocteau_twins;3000
+edyta_g_rniak;3001
+enrico_ruggeri;3002
+ffh;3003
+hanson;3004
+hawksley_workman;3005
+indigo_girls;3006
+irene_grandi;3007
+jackie_evancho;3008
+joy_electric;3009
+kelly_price;3010
+mary_mary;3011
+israel_houghton;3012
+phil_wickham;3013
+phillips_craig_dean;3014
+roch_voisine;3015
+rupaul;3016
+gregorian;3017
+sarah_connor;3018
+sugarland;3019
+sweetbox;3020
+tarja;3021
+the_brian_setzer_orchestra;3022
+brian_setzer;3023
+badfinger;3024
+the_moffatts;3025
+the_vandals;3026
+trans_siberian_orchestra;3027
+roy_drusky;3028
+burton_cummings;3029
+procol_harum;3030
+renaissance;3031
+the_pretty_things;3032
+twisted_sister;3033
+bj_rn_eidsv_g;3034
+corvus_corax;3035
+schelmish;3036
+emilie_autumn;3037
+epica;3038
+katherine_jenkins;3039
+scala_kolacny_brothers;3040
+take_6;3041
+the_roches;3042
+tony_banks;3043
+to_e_proeski;3044
+lacrimosa;3045
+16_volt;3046
+bj_rn_rosenstr_m;3047
+bob_rivers;3048
+cledus_t_judd;3049
+frankjavcee;3050
+george_formby;3051
+ninja_sex_party;3052
+paul_and_storm;3053
+the_arrogant_worms;3054
+tripod;3055
+el_cuarteto_de_nos;3056
+gwar;3057
+knorkator;3058
+psychostick;3059
+rodgau_monotones;3060
+los_palominos;3061
+charlie_peacock;3062
+jesus_culture;3063
+michael_card;3064
+tenth_avenue_north;3065
+carrie_newcomer;3066
+nick_drake;3067
+aaron_watson;3068
+billy_joe_royal;3069
+billy_joe_shaver;3070
+charlie_landsborough;3071
+chris_ledoux;3072
+collin_raye;3073
+dan_seals;3074
+dave_dudley;3075
+hellbillies;3076
+ed_bruce;3077
+emilio_navaira;3078
+jean_shepard;3079
+freddie_hart;3080
+gary_stewart;3081
+gene_watson;3082
+gian_giovani;3083
+gilberto_gilmar;3084
+jason_mraz;3085
+ilse_delange;3086
+john_prine;3087
+jake_owen;3088
+wynn_stewart;3089
+jim_ed_brown;3090
+joe_ely;3091
+kid_rock;3092
+la_toya_jackson;3093
+lit;3094
+lita_ford;3095
+me_first_and_the_gimme_gimmes;3096
+lagwagon;3097
+melanie;3098
+mickey_newbury;3099
+paul_brunelle;3100
+paula_fernandes;3101
+zez_di_camargo_luciano;3102
+randy_rogers_band;3103
+reverend_horton_heat;3104
+rick_renner;3105
+rionegro_solim_es;3106
+shooter_jennings;3107
+terri_clark;3108
+vern_gosdin;3109
+webb_wilder;3110
+ween;3111
+38_special;3112
+the_beau_brummels;3113
+matanza;3114
+clawfinger;3115
+acid_drinkers;3116
+agnostic_front;3117
+biohazard;3118
+body_count;3119
+d_r_i;3120
+municipal_waste;3121
+neurosis;3122
+nuclear_assault;3123
+soziedad_alkoholika;3124
+suicidal_tendencies;3125
+paragon;3126
+mario;3127
+inna;3128
+belinda;3129
+bronco;3130
+grupo_bryndis;3131
+david_bisbal;3132
+ram_n_ayala;3133
+grant_lee_phillips;3134
+the_veronicas;3135
+amr_diab;3136
+atb;3137
+basshunter;3138
+dream_theater;3139
+frankie_j;3140
+baby_bash;3141
+sophie_ellis_bextor;3142
+grace_jones;3143
+laveerre;3144
+silkk_the_shocker;3145
+parov_stelar;3146
+raffaella_carr;3147
+elephant_man;3148
+saint_etienne;3149
+samantha_fox;3150
+selena;3151
+super_junior;3152
+t_a_t_u;3153
+tarkan;3154
+judie_tzuke;3155
+el_kel_iset;3156
+yello;3157
+franz_ferdinand;3158
+chenoa;3159
+lucero;3160
+tokio;3161
+puffy_amiyumi;3162
+wink;3163
+obie_trice;3164
+mystikal;3165
+current_93;3166
+dark_sanctuary;3167
+rome;3168
+lord_of_the_lost;3169
+bella_morte;3170
+mantus;3171
+blutengel;3172
+clan_of_xymox;3173
+dead_can_dance;3174
+death_in_june;3175
+diary_of_dreams;3176
+diorama;3177
+helium_vola;3178
+illuminate;3179
+l_me_immortelle;3180
+lacrimas_profundere;3181
+killing_joke;3182
+m_nchener_freiheit;3183
+otto_dix;3184
+project_pitchfork;3185
+qntal;3186
+sopor_aeternus;3187
+the_cr_xshadows;3188
+unheilig;3189
+welle;3190
+yendri;3191
+carcass;3192
+asphyx;3193
+bolt_thrower;3194
+darkseed;3195
+paradise_lost;3196
+tiamat;3197
+the_damned;3198
+pantera;3199
+the_amity_affliction;3200
+judas_priest;3201
+amon_amarth;3202
+alesana;3203
+atrocity;3204
+autopsy;3205
+avulsed;3206
+sabaton;3207
+misfits;3208
+iron_fire;3209
+centinex;3210
+dagoba;3211
+dark_tranquillity;3212
+asia;3213
+deicide;3214
+dethklok;3215
+dew_scented;3216
+edge_of_sanity;3217
+escape_the_fate;3218
+heaven_shall_burn;3219
+hypocrisy;3220
+incantation;3221
+jungle_rot;3222
+kataklysm;3223
+krisiun;3224
+macabre;3225
+malevolent_creation;3226
+meshuggah;3227
+misanthrope;3228
+morbid_angel;3229
+dead_kennedys;3230
+necro;3231
+pig_destroyer;3232
+shadows_fall;3233
+sinister;3234
+six_feet_under;3235
+dream_evil;3236
+soulfly;3237
+the_black_dahlia_murder;3238
+between_the_buried_and_me;3239
+therion;3240
+vader;3241
+whitechapel;3242
+attila;3243
+emmure;3244
+miss_may_i;3245
+the_acacia_strain;3246
+betontod;3247
+broilers;3248
+dritte_wahl;3249
+ohl;3250
+slime;3251
+terrorgruppe;3252
+b_hse_onkelz;3253
+frei_wild;3254
+k_rbholz;3255
+asp;3256
+tokio_hotel;3257
+queensr_che;3258
+amanda_miguel;3259
+arabesque;3260
+bad_boys_blue;3261
+boyce_avenue;3262
+parliament;3263
+wu_tang_clan;3264
+neoton_fam_lia;3265
+teena_marie;3266
+bobby_womack;3267
+agoraphobic_nosebleed;3268
+candlemass;3269
+electric_wizard;3270
+black_sabbath;3271
+theatre_of_tragedy;3272
+type_o_negative;3273
+marie_fredriksson;3274
+luna;3275
+marissa_nadler;3276
+yo_la_tengo;3277
+celldweller;3278
+hitomi;3279
+big_d_and_the_kids_table;3280
+alacranes_musical;3281
+k_paz_de_la_sierra;3282
+assemblage_23;3283
+covenant;3284
+die_krupps;3285
+kodak_black;3286
+front_242;3287
+haujobb;3288
+in_strict_confidence;3289
+le_ther_strip;3290
+snog;3291
+the_darkness;3292
+tanzwut;3293
+terminal_choice;3294
+velvet_acid_christ;3295
+vnv_nation;3296
+wumpscut;3297
+x_fusion;3298
+umbra_et_imago;3299
+de_vision;3300
+deichkind;3301
+eisbrecher;3302
+herbie_hancock;3303
+ana_moura;3304
+macaco;3305
+skinny_puppy;3306
+ayreon;3307
+black_moth_super_rainbow;3308
+erykah_badu;3309
+cocorosie;3310
+de_jeugd_van_tegenwoordig;3311
+dj_shadow;3312
+e_nomine;3313
+kmfdm;3314
+flying_lotus;3315
+goldfrapp;3316
+hanzel_und_gretyl;3317
+information_society;3318
+mc_frontalot;3319
+kraftwerk;3320
+ladytron;3321
+lamb;3322
+milk_inc;3323
+mind_in_a_box;3324
+ministry;3325
+m_m;3326
+m_nia;3327
+pig;3328
+pitchshifter;3329
+lil_boosie;3330
+master_p;3331
+mindless_self_indulgence;3332
+buzzcocks;3333
+vanilla_ice;3334
+milie_simon;3335
+gianna_nannini;3336
+pinback;3337
+the_birthday_massacre;3338
+archive;3339
+99_posse;3340
+bloc_party;3341
+morcheeba;3342
+origa;3343
+paul_kalkbrenner;3344
+tina_arena;3345
+dover;3346
+melotron;3347
+owl_city;3348
+kamelot;3349
+greeley_estates;3350
+hawthorne_heights;3351
+joan_of_arc;3352
+saves_the_day;3353
+thursday;3354
+transit;3355
+fairport_convention;3356
+maggie_reilly;3357
+joan_manuel_serrat;3358
+e_rotic;3359
+the_scene;3360
+sandra;3361
+amon_d_l_ii;3362
+circa_survive;3363
+love_solfege;3364
+caliban;3365
+tall_dwarfs;3366
+van_der_graaf_generator;3367
+death_grips;3368
+the_fiery_furnaces;3369
+am_lia_rodrigues;3370
+cristina_branco;3371
+jos_afonso;3372
+katia_guerreiro;3373
+ney_matogrosso;3374
+madredeus;3375
+mariza;3376
+gipsy_kings;3377
+mal;3378
+aleks_syntek;3379
+ni_a_pastori;3380
+rosario;3381
+al_stewart;3382
+amos_lee;3383
+andr_s_calamaro;3384
+ane_brun;3385
+asa;3386
+editors;3387
+catie_curtis;3388
+chrystian_ralf;3389
+clueso;3390
+eddi_reader;3391
+eddie_from_ohio;3392
+ellis_paul;3393
+frank_turner;3394
+estampie;3395
+ferdi_tayfur;3396
+fito_p_ez;3397
+luis_alberto_spinetta;3398
+gabriella_ferri;3399
+gigi;3400
+greg_brown;3401
+g_ksel;3402
+lando_fiorini;3403
+india_arie;3404
+jack_savoretti;3405
+anne_grete_preus;3406
+jarom_r_nohavica;3407
+joe_purdy;3408
+john_wesley_harding;3409
+josh_rouse;3410
+karel_kryl;3411
+v_tor_ramil;3412
+lars_winnerb_ck;3413
+laura_marling;3414
+llu_s_llach;3415
+los_chalchaleros;3416
+luka_bloom;3417
+malicorne;3418
+mark_heard;3419
+martin_carthy;3420
+nic_jones;3421
+le_n_gieco;3422
+mijares;3423
+nuova_compagnia_di_canto_popolare;3424
+ola_magnell;3425
+thin_lizzy;3426
+ray_lamontagne;3427
+ron_sexsmith;3428
+rosana;3429
+silvio_rodr_guez;3430
+stef_bos;3431
+sun_kil_moon;3432
+tanita_tikaram;3433
+the_incredible_string_band;3434
+thea_gilmore;3435
+tina_dico;3436
+victor_leo;3437
+v_rttin;3438
+ge_aleksandersen;3439
+i_brahim_tatl_ses;3440
+ektomorf;3441
+elvenking;3442
+ensiferum;3443
+falconer;3444
+feuerschwanz;3445
+in_extremo;3446
+korpiklaani;3447
+leaves_eyes;3448
+letzte_instanz;3449
+m_go_de_oz;3450
+saurom;3451
+schandmaul;3452
+skyclad;3453
+subway_to_sally;3454
+suidakra;3455
+t_r;3456
+icehouse;3457
+bomb_the_music_industry;3458
+the_real_mckenzies;3459
+54_40;3460
+armored_saint;3461
+alexz_johnson;3462
+bar_man_o;3463
+ezginin_g_nl;3464
+galija;3465
+sts;3466
+h_kan_hellstr_m;3467
+james_blunt;3468
+kazik;3469
+mewithoutyou;3470
+michel_polnareff;3471
+ovidi_montllor;3472
+rasputina;3473
+shearwater;3474
+gerry_rafferty;3475
+steam_powered_giraffe;3476
+the_saw_doctors;3477
+ty_segall;3478
+tyrone_wells;3479
+avi_es_do_forr;3480
+grimskunk;3481
+sinik;3482
+vitaa;3483
+kenza_farah;3484
+sexion_d_assaut;3485
+aliz_e;3486
+henri_tachan;3487
+jenifer;3488
+m_pokora;3489
+indochine;3490
+brainstorm;3491
+con_funk_shun;3492
+funkadelic;3493
+lena_park;3494
+neffa;3495
+ugk;3496
+suburban_legends;3497
+mai_kuraki;3498
+cherry_poppin_daddies;3499
+electric_six;3500
+los_straitjackets;3501
+the_69_eyes;3502
+the_angels;3503
+the_haunted;3504
+the_hellacopters;3505
+the_kills;3506
+thee_oh_sees;3507
+white_denim;3508
+zabranjeno_pu_enje;3509
+ol_dirty_bastard;3510
+kurupt;3511
+spice_1;3512
+brotha_lynch_hung;3513
+chamillionaire;3514
+paul_wall;3515
+trae;3516
+club_dogo;3517
+mc_eiht;3518
+royce_da_5_9;3519
+geto_boys;3520
+the_diplomats;3521
+ice_t;3522
+2_live_crew;3523
+xv;3524
+mobb_deep;3525
+c_murder;3526
+tru;3527
+lil_keke;3528
+project_pat;3529
+tha_dogg_pound;3530
+esham;3531
+twiztid;3532
+erick_sermon;3533
+big_tymers;3534
+kate_nash;3535
+the_cramps;3536
+nekromantix;3537
+tsol;3538
+ace_frehley;3539
+hardcore_superstar;3540
+harem_scarem;3541
+house_of_lords;3542
+kingdom_come;3543
+l_a_guns;3544
+mr_big;3545
+pink_cream_69;3546
+quiet_riot;3547
+riot;3548
+ratt;3549
+tnt;3550
+backyard_babies;3551
+ultima_thule;3552
+europe;3553
+hanoi_rocks;3554
+mott_the_hoople;3555
+smokie;3556
+suzi_quatro;3557
+haemorrhage;3558
+aline_barros;3559
+bruna_karla;3560
+kirk_franklin;3561
+minist_rio_koinonya_de_louvor;3562
+artrosis;3563
+closterkeller;3564
+indica;3565
+sirenia;3566
+trail_of_tears;3567
+tristania;3568
+within_temptation;3569
+bauhaus;3570
+mono_inc;3571
+pansy_division;3572
+xandria;3573
+immortal_technique;3574
+agathocles;3575
+rotten_sound;3576
+the_locust;3577
+anthrax;3578
+devildriver;3579
+lamb_of_god;3580
+machine_head;3581
+parkway_drive;3582
+pro_pain;3583
+throwdown;3584
+vicious_rumors;3585
+screaming_trees;3586
+cuisillos;3587
+intocable;3588
+pesado;3589
+la_mafia;3590
+marco_antonio_sol_s;3591
+los_bukis;3592
+andrew_w_k;3593
+april_wine;3594
+axel_rudi_pell;3595
+b_z;3596
+tak_matsumoto;3597
+barricada;3598
+bijelo_dugme;3599
+blaze_bayley;3600
+bonfire;3601
+bruce_dickinson;3602
+buckcherry;3603
+budgie;3604
+buitres;3605
+jorn;3606
+doro;3607
+enuff_z_nuff;3608
+gentle_giant;3609
+girlschool;3610
+golden_earring;3611
+gotthard;3612
+nazareth;3613
+a_day_to_remember;3614
+jefferson_airplane;3615
+joe_satriani;3616
+ken_hensley;3617
+kim_mitchell;3618
+king_s_x;3619
+kotiteollisuus;3620
+la_renga;3621
+lee_aaron;3622
+lordi;3623
+michael_schenker_group;3624
+mustasch;3625
+night_ranger;3626
+omega;3627
+parni_valjak;3628
+paul_gilbert;3629
+popeda;3630
+skid_row;3631
+tankcsapda;3632
+the_bronx;3633
+the_donnas;3634
+all_that_remains;3635
+triumph;3636
+umphrey_s_mcgee;3637
+y_t;3638
+ziggy;3639
+sfdk;3640
+7_seconds;3641
+aiden;3642
+alphaville;3643
+black_flag;3644
+slayer;3645
+circle_jerks;3646
+ritchie;3647
+converge;3648
+every_time_i_die;3649
+hatebreed;3650
+nomeansno;3651
+rancid;3652
+memphis_may_fire;3653
+nofx;3654
+propagandhi;3655
+tankard;3656
+screeching_weasel;3657
+sick_of_it_all;3658
+silverstein;3659
+two_steps_from_hell;3660
+faun;3661
+accept;3662
+the_frames;3663
+andromeda;3664
+annihilator;3665
+anvil;3666
+artillery;3667
+avenged_sevenfold;3668
+axxis;3669
+blind_guardian;3670
+vanden_plas;3671
+grave_digger;3672
+dragonforce;3673
+edenbridge;3674
+damien_jurado;3675
+exciter;3676
+firewind;3677
+halford;3678
+hammerfall;3679
+helloween;3680
+helstar;3681
+iced_earth;3682
+jag_panzer;3683
+machinae_supremacy;3684
+manowar;3685
+metal_church;3686
+morgana_lefay;3687
+mudvayne;3688
+nocturnal_rites;3689
+overkill;3690
+primal_fear;3691
+rebellion;3692
+running_wild;3693
+corey_hart;3694
+savatage;3695
+saxon;3696
+steve_vai;3697
+tad_morose;3698
+tarot;3699
+tierra_santa;3700
+trivium;3701
+turmion_k_til_t;3702
+u_d_o;3703
+virgin_steele;3704
+voivod;3705
+warcry;3706
+yngwie_malmsteen;3707
+zion_lennox;3708
+sido;3709
+mc_chris;3710
+assalti_frontali;3711
+kool_keith;3712
+ayumi_hamasaki;3713
+az;3714
+bahh_tee;3715
+bassi_maestro;3716
+revocation;3717
+blumentopf;3718
+brockhampton;3719
+bts;3720
+bushido;3721
+vinnie_paz;3722
+chakuza;3723
+cheek;3724
+cro;3725
+arc_ngel;3726
+alexis_fido;3727
+dargen_d_amico;3728
+the_coup;3729
+def_con_dos;3730
+die_fantastischen_vier;3731
+dom_no;3732
+donguralesko;3733
+epmd;3734
+kool_savas;3735
+fettes_brot;3736
+fronda;3737
+mc_solaar;3738
+pyhimys;3739
+kaaris;3740
+kollegah;3741
+kontra_k;3742
+k_k;3743
+l_o_c;3744
+logic;3745
+jerry_rivera;3746
+murs;3747
+angie_stone;3748
+namie_amuro;3749
+anthony_hamilton;3750
+lyfe_jennings;3751
+bl_f;3752
+o_s_t_r;3753
+paluch;3754
+parazi_ii;3755
+porta;3756
+bleeding_through;3757
+prinz_pi;3758
+rasmentalism;3759
+xavier_naidoo;3760
+sage_francis;3761
+stupeflip;3762
+young_thug;3763
+tego_calder_n;3764
+fifth_harmony;3765
+jay_chou;3766
+blitzkid;3767
+zumbis_do_espa_o;3768
+deer_tick;3769
+half_man_half_biscuit;3770
+hayden;3771
+club_8;3772
+grandaddy;3773
+jens_lekman;3774
+kent;3775
+keren_ann;3776
+los_campesinos;3777
+nellie_mckay;3778
+china_crisis;3779
+prefab_sprout;3780
+the_clientele;3781
+the_lucksmiths;3782
+bell_x1;3783
+british_sea_power;3784
+car_seat_headrest;3785
+deerhunter;3786
+dr_dog;3787
+elf_power;3788
+frightened_rabbit;3789
+fugazi;3790
+fury_in_the_slaughterhouse;3791
+julie_doiron;3792
+tinashe;3793
+la_habitaci_n_roja;3794
+margot_the_nuclear_so_and_so_s;3795
+matt_pond_pa;3796
+metric;3797
+mike_doughty;3798
+mother_mother;3799
+piebald;3800
+quasi;3801
+rheostatics;3802
+sebadoh;3803
+spoon;3804
+starflyer_59;3805
+stephen_malkmus;3806
+stereolab;3807
+ted_leo_and_the_pharmacists;3808
+the_appleseed_cast;3809
+the_faint;3810
+the_go_betweens;3811
+the_pineapple_thief;3812
+the_undertones;3813
+tronic;3814
+chris_de_burgh;3815
+mass_hysteria;3816
+angelo_branduardi;3817
+gigi_d_alessio;3818
+i_muvrini;3819
+back_number;3820
+boa;3821
+claris;3822
+crystal_kay;3823
+zard;3824
+gackt;3825
+garnet_crow;3826
+girls_generation;3827
+kat_tun;3828
+koda_kumi;3829
+kotoko;3830
+lisa;3831
+maaya_sakamoto;3832
+masami_okui;3833
+mr_children;3834
+news;3835
+shinee;3836
+w_inds;3837
+yui;3838
+yumi_matsutoya;3839
+the_high_lows;3840
+sid;3841
+abbey_lincoln;3842
+anna_maria_jopek;3843
+cassandra_wilson;3844
+dianne_reeves;3845
+fred_buscaglione;3846
+jane_monheit;3847
+zor_n;3848
+kraan;3849
+laura_fygi;3850
+michael_franks;3851
+natalino_otto;3852
+quartetto_cetra;3853
+scott_bradlee_s_postmodern_jukebox;3854
+stacey_kent;3855
+the_flower_kings;3856
+ronnie_von;3857
+brown_eyed_girls;3858
+ahmet_kaya;3859
+alejandra_guzm_n;3860
+ana_carolina;3861
+alcione;3862
+el_chapo_de_sinaloa;3863
+gustavo_cerati;3864
+soda_stereo;3865
+jenni_rivera;3866
+joaqu_n_sabina;3867
+los_fabulosos_cadillacs;3868
+abel_pintos;3869
+ana_bel_n;3870
+aterciopelados;3871
+camilo_sesto;3872
+david_demar_a;3873
+gian_marco;3874
+menudo;3875
+ricardo_arjona;3876
+sabroso;3877
+v_ctor_manuel;3878
+las_pelotas;3879
+ariel_pink;3880
+leehom_wang;3881
+jolin_tsai;3882
+darkest_hour;3883
+kalmah;3884
+nightrage;3885
+eppu_normaali;3886
+the_outfield;3887
+no_use_for_a_name;3888
+pennywise;3889
+callejon;3890
+d_f_c;3891
+our_last_night;3892
+exaltasamba;3893
+beth_carvalho;3894
+jo_o_bosco;3895
+marina_lima;3896
+marisa_monte;3897
+nando_reis;3898
+natiruts;3899
+ra_a_negra;3900
+s_pra_contrariar;3901
+zeca_pagodinho;3902
+andr_hazes;3903
+de_dijk;3904
+arena;3905
+iq;3906
+sol_invictus;3907
+new_found_glory;3908
+adam_ant;3909
+berlin;3910
+hoodoo_gurus;3911
+ultravox;3912
+nik_kershaw;3913
+squeeze;3914
+the_aquabats;3915
+the_fixx;3916
+beat_crusaders;3917
+cows;3918
+conjunto_primavera;3919
+peter_and_the_test_tube_babies;3920
+sham_69;3921
+the_adicts;3922
+the_analogs;3923
+instalok;3924
+jacek_kaczmarski;3925
+przemys_aw_gintrowski;3926
+ada_band;3927
+agnetha_f_ltskog;3928
+ajda_pekkan;3929
+al_bano;3930
+alex_ubago;3931
+alison_moyet;3932
+alunni_del_sole;3933
+anna_oxa;3934
+bajm;3935
+barclay_james_harvest;3936
+blue_system;3937
+brunner_brunner;3938
+candan_er_etin;3939
+christian_bautista;3940
+clay_aiken;3941
+clifford_t_ward;3942
+daniel;3943
+don_backy;3944
+jesse_mccartney;3945
+emma;3946
+marcella_bella;3947
+giorgio_gaber;3948
+guus_meeuwis;3949
+heinz_rudolf_kunze;3950
+john_farnham;3951
+ian_thomas;3952
+i_n_karaca;3953
+jennifer_rush;3954
+jo_vally;3955
+john_fogerty;3956
+julian_lennon;3957
+k3;3958
+kid_abelha;3959
+labv_l_gais_tips;3960
+l_vi;3961
+lea_salonga;3962
+les_wampas;3963
+magnus_uggla;3964
+mango;3965
+maria_mena;3966
+massimo_ranieri;3967
+max_gazz;3968
+michael_learns_to_rock;3969
+mietta;3970
+mustafa_sandal;3971
+nil_fer;3972
+peter_frampton;3973
+pr_ta_v_tra;3974
+pur;3975
+rettore;3976
+ricchi_e_poveri;3977
+rob_de_nijs;3978
+sara_bareilles;3979
+sasha;3980
+sertab_erener;3981
+sezen_aksu;3982
+stadio;3983
+stephen_sondheim;3984
+tamara;3985
+team_starkid;3986
+toto_cutugno;3987
+umberto_tozzi;3988
+herman_brood;3989
+wanessa;3990
+zen_caf;3991
+bonanza_banzai;3992
+bodyjar;3993
+bracket;3994
+frenzal_rhomb;3995
+goldfinger;3996
+the_wonder_years;3997
+useless_id;3998
+camel;3999
+hombres_g;4000
+leo_jaime;4001
+neal_morse;4002
+spock_s_beard;4003
+new_trolls;4004
+opus;4005
+piersi;4006
+premiata_forneria_marconi;4007
+superbus;4008
+zmelkoow;4009
+boysetsfire;4010
+hot_water_music;4011
+new_model_army;4012
+the_monochrome_set;4013
+big_big_train;4014
+avantasia;4015
+dark_moor;4016
+dreamtale;4017
+freedom_call;4018
+mystic_prophecy;4019
+nightmare;4020
+rhapsody_of_fire;4021
+royal_hunt;4022
+sonata_arctica;4023
+stratovarius;4024
+symphony_x;4025
+vision_divine;4026
+the_wildhearts;4027
+armia;4028
+evergrey;4029
+lana_lane;4030
+nektar;4031
+pain_of_salvation;4032
+riverside;4033
+beardfish;4034
+echolyn;4035
+eloy;4036
+john_wetton;4037
+medina_azahara;4038
+mostly_autumn;4039
+pendragon;4040
+rafo_r_ez;4041
+the_meteors;4042
+against_me;4043
+anti_flag;4044
+banda_bassotti;4045
+cadena_perpetua;4046
+descendents;4047
+distemper;4048
+dogwood;4049
+el_ltimo_ke_zierre;4050
+farben_lehre;4051
+toy_dolls;4052
+junkies;4053
+ksu;4054
+la_polla_records;4055
+la_vela_puerca;4056
+leatherface;4057
+less_than_jake;4058
+mad_caddies;4059
+millencolin;4060
+punkreas;4061
+reel_big_fish;4062
+snfu;4063
+stiff_little_fingers;4064
+swingin_utters;4065
+the_bouncing_souls;4066
+the_casualties;4067
+the_dickies;4068
+the_lawrence_arms;4069
+toyah;4070
+gerald_levert;4071
+gondwana;4072
+los_aut_nticos_decadentes;4073
+los_cafres;4074
+los_pericos;4075
+tryo;4076
+rakim_ken_y;4077
+billy_squier;4078
+bj_rn_afzelius;4079
+glay;4080
+hunters_collectors;4081
+john_entwistle;4082
+jokke;4083
+la_beriso;4084
+los_rancheros;4085
+los_tres;4086
+maanam;4087
+mikel_erentxun;4088
+peter_wolf;4089
+racoon;4090
+rev_lver;4091
+riblja_orba;4092
+sandro;4093
+gene_vincent;4094
+the_baseballs;4095
+stray_cats;4096
+as_marcianas;4097
+bruno_marrone;4098
+cristiano_ara_jo;4099
+fernando_sorocaba;4100
+joint_venture;4101
+serge_reggiani;4102
+ska_p;4103
+the_mighty_mighty_bosstones;4104
+fu_manchu;4105
+jay_jay_johanson;4106
+psyche;4107
+carlos_gardel;4108
diff --git a/jukebox/data/ids/v2_genre_ids.txt b/jukebox/data/ids/v2_genre_ids.txt
new file mode 100644
index 0000000000000000000000000000000000000000..39adb96a85bc3b08ce68f33fb9ead3d76b7e2aef
--- /dev/null
+++ b/jukebox/data/ids/v2_genre_ids.txt
@@ -0,0 +1,120 @@
+unknown;0
+classical;1
+blues;2
+hip;3
+hop;4
+dance;5
+soul;6
+hard;7
+rock;8
+jazz;9
+reggae;10
+country;11
+alternative;12
+soundtrack;13
+pop;14
+bluegrass;15
+vocal;16
+r;17
+b;18
+rap;19
+christian;20
+gospel;21
+electronic;22
+christmas;23
+singer;24
+songwriter;25
+metal;26
+n;27
+roll;28
+synthpop;29
+electronica;30
+mpb;31
+movie;32
+indie;33
+new;34
+wave;35
+electro;36
+house;37
+folk;38
+punk;39
+french;40
+contemporary;41
+garage;42
+soft;43
+acoustic;44
+nu;45
+television;46
+post;47
+eurodance;48
+progressive;49
+gothic;50
+classic;51
+funk;52
+disco;53
+swing;54
+trance;55
+thrash;56
+psychedelic;57
+heavy;58
+american;59
+grunge;60
+art;61
+j;62
+gangsta;63
+brazilian;64
+latin;65
+southern;66
+ska;67
+crossover;68
+hardcore;69
+industrial;70
+glam;71
+melodic;72
+ambient;73
+musical;74
+dream;75
+experimental;76
+americana;77
+chanson;78
+rockabilly;79
+britpop;80
+children;81
+s;82
+music;83
+electropop;84
+power;85
+celtic;86
+dark;87
+comedy;88
+doom;89
+trip;90
+lo;91
+fi;92
+metalcore;93
+symphonic;94
+fado;95
+schlager;96
+avant;97
+garde;98
+europop;99
+reggaeton;100
+emo;101
+death;102
+samba;103
+deathcore;104
+black;105
+horrorcore;106
+grindcore;107
+worship;108
+salsa;109
+ebm;110
+neofolk;111
+sertanejo;112
+deutschrock;113
+norte;114
+o;115
+ax;116
+k;117
+tejano;118
+medieval;119
diff --git a/jukebox/data/ids/v3_artist_ids.txt b/jukebox/data/ids/v3_artist_ids.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9987bb6e676b015ca5b3a7a0d234e69b171dd4cc
--- /dev/null
+++ b/jukebox/data/ids/v3_artist_ids.txt
@@ -0,0 +1,7898 @@
+beat farmers;1
+aaron sprinkle;2
+dianne reeves;3
+lowe;4
+harry manx;5
+hail of bullets;6
+ian gillan;7
+andraé crouch;8
+widespread panic;9
+buddy wasisname and the other fellers;10
+misery index;11
+albert west;12
+shadowland;13
+homer & jethro;14
+damien jurado;15
+dead to fall;16
+british sea power;17
+pam tillis;18
+ice cube;19
+hey rosetta!;20
+sophie zelmani;21
+riverside;22
+head automatica;23
+diabulus in musica;24
+unitopia;25
+revolting cocks;26
+zita swoon;27
+train;28
+ken stringfellow;29
+in dying arms;30
+red lorry yellow lorry;31
+small faces;32
+michael sweet;33
+30 odd foot of grunts;34
+white heart;35
+baby bash;36
+bad bones;37
+meat beat manifesto;38
+vengeance;39
+naomi;40
+koritni;41
+the fall of troy;42
+split enz;43
+emmy rossum;44
+les fleur de lys;45
+beaux arts trio;46
+david crowder band;47
+mojave 3;48
+girl talk;49
+motorpsycho;50
+burning point;51
+the rutles;52
+david and the giants;53
+jinjer;54
+sitd;55
+pedro the lion;56
+masta ace;57
+alexz johnson;58
+the floacist;59
+after 7;60
+anointed;61
+holy soldier;62
+sanchez;63
+wovenhand;64
+thea gilmore;65
+3t;66
+patty loveless;67
+ghost;68
+rie fu;69
+chemical vocation;70
+robbie nevil;71
+the notorious b.i.g.;72
+america;73
+the boo radleys;74
+in hearts wake;75
+jack the lad;76
+gerry and the pacemakers;77
+he is we;78
+cuban link;79
+galaxie 500;80
+something with numbers;81
+the last shadow puppets;82
+minor threat;83
+joss stone;84
+lynch mob;85
+zino francescatti;86
+genitorturers;87
+kenny g;88
+graveworm;89
+field mob;90
+opus;91
+jordan smith;92
+sheppard;93
+the haunted;94
+tiny ruins;95
+jimmy somerville;96
+acid reign;97
+falling in reverse;98
+ace troubleshooter;99
+josh groban;100
+adriano celentano;101
+john oates;102
+mind funk;103
+christafari;104
+clan of xymox;105
+anti-flag;106
+the blow monkeys;107
+the troggs;108
+priscilla ahn;109
+fastball;110
+raekwon;111
+royal wood;112
+agoraphobic nosebleed;113
+borknagar;114
+parker millsap;115
+kelly osbourne;116
+psyche;117
+brokencyde;118
+george clinton;119
+the hollies;120
+gabriel kahane;121
+dnce;122
+jimmy nail;123
+harem scarem;124
+pierre fournier;125
+gideon;126
+elitist;127
+the sheepdogs;128
+like moths to flames;129
+the constructus corporation;130
+impending doom;131
+joe williams;132
+bizzy bone;133
+nelson;134
+earth and fire;135
+underoath;136
+rancid;137
+exile;138
+vertical horizon;139
+percy sledge;140
+ill bill;141
+59 times the pain;142
+jimmy dean;143
+gary jules;144
+spellblast;145
+renee olstead;146
+barbra streisand;147
+spin doctors;148
+galt macdermot;149
+takara;150
+alan stivell;151
+andy davis;152
+babes in toyland;153
+still remains;154
+the donnas;155
+bishop allen;156
+the skids;157
+rhiannon giddens;158
+natalia;159
+henson cargill;160
+gov't mule;161
+jools holland;162
+kehlani;163
+londonbeat;164
+andy mineo;165
+corky and the juice pigs;166
+days away;167
+a fine frenzy;168
+roger mcguinn;169
+lena horne;170
+shark island;171
+machinemade god;172
+yank rachell;173
+hurricane;174
+his statue falls;175
+that petrol emotion;176
+764-hero;177
+leprous;178
+bridgit mendler;179
+beggars opera;180
+abbie gale;181
+the the;182
+y'akoto;183
+sound tesselated;184
+webb pierce;185
+river whyless;186
+ronnie dio & the prophets;187
+rotting christ;188
+duff mckagan;189
+slim harpo;190
+adele;191
+valencia;192
+the damned;193
+miguel;194
+chantal kreviazuk;195
+the db's;196
+cartel;197
+enrique iglesias;198
+skrewdriver;199
+one less reason;200
+lil wayne;201
+chris norman;202
+type o negative;203
+trip shakespeare;204
+jack blanchard & misty morgan;205
+fishboy;206
+ted leo and the pharmacists;207
+lukas graham;208
+the vapors;209
+conway twitty & loretta lynn;210
+sandie shaw;211
+mark knopfler;212
+through the eyes of the dead;213
+art of dying;214
+free;215
+saint motel;216
+sonreal;217
+gatsbys american dream;218
+elisa;219
+marc anthony;220
+joan baez;221
+someone still loves you boris yeltsin;222
+ugk;223
+deep purple;224
+mother mother;225
+the contortionist;226
+hot chelle rae;227
+eric clapton;228
+the doobie brothers;229
+john michael montgomery;230
+izegrim;231
+jason collett;232
+close your eyes;233
+snog;234
+ghostpoet;235
+new order;236
+the brian setzer orchestra;237
+royal tusk;238
+guy mitchell;239
+heart;240
+the free design;241
+billie ray martin;242
+toto;243
+david mallett;244
+donovan;245
+the years gone by;246
+element 101;247
+fairyland;248
+triggerfinger;249
+mc eiht;250
+ottorino respighi;251
+the four aces;252
+lil son jackson;253
+emanuel feuermann;254
+juliette and the licks;255
+p!nk;256
+gretchen wilson;257
+the animals;258
+locksley;259
+redgum;260
+young mc;261
+metronomy;262
+ashland high;263
+esoteric;264
+johnny hates jazz;265
+paul anka;266
+ethel merman;267
+east west;268
+the knife;269
+curren$y;270
+maaya sakamoto;271
+aesthetic perfection;272
+bobby helms;273
+jimi jamison;274
+darren styles;275
+lorrie morgan;276
+miley cyrus;277
+dropdead;278
+dr. sin;279
+burt bacharach;280
+nf;281
+astronautalis;282
+garth brooks;283
+flyleaf;284
+lake;285
+mad sin;286
+tiffany evans;287
+mudvayne;288
+máni svavarsson & magnús scheving;289
+unexpect;290
+collin raye;291
+johnny reid;292
+antonio vivaldi;293
+creed;294
+burning heads;295
+legion of the damned;296
+matt costa;297
+the aluminum group;298
+orgy;299
+2nd chapter of acts;300
+shotgun messiah;301
+mentallo & the fixer;302
+urma;303
+carmen mcrae;304
+skid row;305
+john denver & the muppets;306
+angie stone;307
+rob rock;308
+clara haskil;309
+morandi;310
+team dresch;311
+the zombies;312
+mr. vegas;313
+royal trux;314
+suzanne vega;315
+the afters;316
+skydiggers;317
+sunday's best;318
+gary numan;319
+three dog night;320
+jonas brothers;321
+uncle acid & the deadbeats;322
+pablo de sarasate;323
+goretrade;324
+strapping young lad;325
+fat joe;326
+robert johnson;327
+fred astaire;328
+anastacia;329
+devendra banhart;330
+into it. over it.;331
+john norum;332
+cross canadian ragweed;333
+destroyer;334
+michael christmas;335
+phobia;336
+jill tracy;337
+dilana;338
+royce da 5'9";339
+les savy fav;340
+the blow;341
+kim wilde;342
+parts & labor;343
+dinah washington;344
+maggie reilly;345
+screaming trees;346
+p.o.s.;347
+atomic rooster;348
+chamillionaire;349
+the vaccines;350
+tides of man;351
+heathen;352
+flame;353
+brain drill;354
+ac/dc;355
+kraan;356
+scary kids scaring kids;357
+rosaline;358
+john legend;359
+of montreal;360
+the brunettes;361
+shelley fabares;362
+volumes;363
+george enescu;364
+jacob's dream;365
+heartless bastards;366
+darin;367
+andy stochansky;368
+david geringas;369
+lucius;370
+steep;371
+bobby vinton;372
+shania twain;373
+rudolf serkin;374
+the zolas;375
+municipal waste;376
+spectral;377
+arcade fire;378
+steve hillage;379
+the presets;380
+gustav mahler;381
+gary morris;382
+laura cantrell;383
+dean brody;384
+roger miller;385
+tammy wynette;386
+joe cocker;387
+iceage;388
+apostasy;389
+tait;390
+reverend gary davis;391
+neverending white lights;392
+mimicking birds;393
+barney;394
+major parkinson;395
+seal;396
+wham!;397
+tha dogg pound;398
+big l;399
+ian thomas;400
+kronos;401
+dom pachino;402
+dead can dance;403
+the number twelve looks like you;404
+bert williams;405
+bedhead;406
+scott bradlee's postmodern jukebox;407
+monuments;408
+christine mcvie;409
+moonspell;410
+david & the citizens;411
+*nsync;412
+tiny tim;413
+surface;414
+k.flay;415
+travis scott;416
+lil jon;417
+jo stafford;418
+elo part ii;419
+sugarland;420
+eternal;421
+the dingees;422
+the summer set;423
+soft machine;424
+maanam;425
+right said fred;426
+chicks on speed;427
+foetus;428
+fiona apple;429
+primer 55;430
+the dillinger escape plan;431
+seahaven;432
+biga ranx;433
+the insyderz;434
+thirty seconds to mars;435
+page france;436
+howlin' wolf;437
+wishbone ash;438
+nina sky;439
+jess moskaluke;440
+stan rogers;441
+b.o.b;442
+cypecore;443
+young dro;444
+julian lennon;445
+opeth;446
+flying lotus;447
+rodney atkins;448
+sea of treachery;449
+montrose;450
+nellie mckay;451
+vladimir horowitz;452
+fatboy slim;453
+mystic prophecy;454
+little river band;455
+brooklyn bounce;456
+destroid;457
+mary hopkin;458
+elliott yamin;459
+billy bragg;460
+the doors;461
+esham;462
+cab calloway;463
+thi'sl;464
+the gothsicles;465
+david coverdale;466
+joe henry;467
+the human abstract;468
+alger "texas" alexander;469
+diane cluck;470
+fozzy;471
+zero 7;472
+cole swindell;473
+gladys knight & the pips;474
+donna fargo;475
+cave in;476
+eiffel 65;477
+fates warning;478
+decrepit birth;479
+bad religion;480
+poison clan;481
+shane & shane;482
+johnny shines;483
+u.n.l.v.;484
+seth lakeman;485
+mindy smith;486
+josh white;487
+android lust;488
+mylon lefevre;489
+aselin debison;490
+kaskade;491
+the stills;492
+alpha blondy;493
+hughes turner project;494
+spice girls;495
+zz top;496
+fairport convention;497
+the ritchie family;498
+eleanor friedberger;499
+laura branigan;500
+the jordanaires;501
+the bacon brothers;502
+atomic opera;503
+spike jones;504
+faith hill;505
+mandy moore;506
+jan werner;507
+kittie;508
+edwin;509
+michael roe;510
+leeland;511
+sammy hagar;512
+frankjavcee;513
+the bangles;514
+joey mcintyre;515
+david rovics;516
+across the border;517
+odd future;518
+bill ward;519
+eddy grant;520
+boa;521
+nirvana;522
+darzamat;523
+ed sheeran;524
+the prodigy;525
+wang chung;526
+balance problems;527
+valient thorr;528
+rupaul;529
+roy clark;530
+ross lynch;531
+ugly kid joe;532
+bettye lavette;533
+harry belafonte;534
+roy buchanan;535
+miguel bosé;536
+greenslade;537
+living legends;538
+bing crosby;539
+adam sandler;540
+the czars;541
+bethany dillon;542
+lea salonga;543
+kmfdm;544
+the diplomats;545
+magneta lane;546
+mira;547
+g herbo;548
+issues;549
+beastie boys;550
+marvin gaye;551
+ashes you leave;552
+mordred;553
+israel houghton;554
+screaming mechanical brain;555
+unknown hinson;556
+jack johnson;557
+do;558
+guns n' roses;559
+october project;560
+adore delano;561
+jedi mind tricks;562
+andrew peterson;563
+millionaires;564
+the beatnuts;565
+gilby clarke;566
+chickenfoot;567
+the stranglers;568
+rev theory;569
+the mccalmans;570
+drowning pool;571
+kutt calhoun;572
+dark fortress;573
+the undertones;574
+kevin gilbert;575
+ffh;576
+seven places;577
+fury in the slaughterhouse;578
+covenant;579
+jason isbell;580
+the creepshow;581
+ashbury heights;582
+shakey graves;583
+brett young;584
+lords of black;585
+the higher;586
+judy garland;587
+boy harsher;588
+status quo;589
+iq;590
+underworld;591
+krizz kaliko;592
+jefferson airplane;593
+billy walker;594
+jackie lomax;595
+lizzy borden;596
+keke wyatt;597
+closterkeller;598
+agnostic front;599
+mary mary;600
+birds in row;601
+mugison;602
+randy travis;603
+glorior belli;604
+amorphis;605
+martika;606
+jason webley;607
+duke ellington;608
+europe;609
+the wilkinsons;610
+a bullet for pretty boy;611
+jodeci;612
+sister hazel;613
+atrocity;614
+little willie john;615
+alexander borodin;616
+belouis some;617
+big boi;618
+newworldson;619
+muddy waters;620
+karen elson;621
+lou bega;622
+ivoryline;623
+pain confessor;624
+dolour;625
+captain beefheart and his magic band;626
+barclay james harvest;627
+todd snider;628
+enslaved;629
+beach fossils;630
+trick daddy;631
+the black dahlia murder;632
+rhapsody of fire;633
+cemetary;634
+patsy cline;635
+figure four;636
+manuel de falla;637
+neil diamond;638
+sworn enemy;639
+elvenking;640
+d.r.a.m.;641
+sonya kitchell;642
+flight of the conchords;643
+eddie from ohio;644
+talk talk;645
+thoushaltnot;646
+this is the kit;647
+crimson glory;648
+the bears;649
+amenra;650
+doris day;651
+death in june;652
+aaron copland;653
+astrud gilberto;654
+luna;655
+fury;656
+corpus christi;657
+soul position;658
+be'lakor;659
+roy orbison;660
+beyond dawn;661
+el-p;662
+watch tower bible and tract society;663
+end of you;664
+falconer;665
+war from a harlots mouth;666
+china;667
+erra;668
+brainpool;669
+psyclon nine;670
+quintorigo;671
+the blind boys of alabama;672
+mr. big;673
+reverend horton heat;674
+yehuda hanani;675
+bell x1;676
+john michael talbot;677
+sigh;678
+james brown;679
+the murder of my sweet;680
+courtney marie andrews;681
+kate alexa;682
+jasmine v;683
+malevolent creation;684
+said the whale;685
+the indelicates;686
+masterplan;687
+every time i die;688
+echosmith;689
+barzin;690
+thelma houston;691
+masters of reality;692
+tony iommi;693
+sex gang children;694
+chaos uk;695
+casper & the cookies;696
+johnny gill;697
+alex harvey;698
+lemar;699
+andre matos;700
+shinee;701
+earl sweatshirt;702
+sanctus real;703
+chilliwack;704
+lionel hampton;705
+faith assembly;706
+dave van ronk;707
+frankie goes to hollywood;708
+badfinger;709
+gazpacho;710
+centro-matic;711
+donald lawrence;712
+me first and the gimme gimmes;713
+margot & the nuclear so and so's;714
+big star;715
+unisonic;716
+the delgados;717
+28 days;718
+veil of maya;719
+tammi terrell;720
+the righteous brothers;721
+david knopfler;722
+ihsahn;723
+high inergy;724
+leaves' eyes;725
+parov stelar;726
+the retrosic;727
+tony rose;728
+tom lehrer;729
+buggles;730
+dennis brown;731
+trashcan sinatras;732
+polarkreis 18;733
+mavis staples;734
+xzibit;735
+black moth super rainbow;736
+yello;737
+the servant;738
+jana mashonee;739
+burl ives;740
+beyond the black;741
+tsjuder;742
+helen o'connell;743
+golden gate quartet;744
+debby boone;745
+beady eye;746
+acid king;747
+westlife;748
+big wreck;749
+manowar;750
+almôra;751
+sarah darling;752
+kenny white;753
+delta spirit;754
+curtis stigers;755
+fun.;756
+feist;757
+karate;758
+sleepy john estes;759
+lamb;760
+the roches;761
+tuatha de danann;762
+horrified;763
+fort minor;764
+jesse harris;765
+berman;766
+shannon & the clams;767
+indigo girls;768
+the matches;769
+eric stewart;770
+jls;771
+kristin hersh;772
+christon gray;773
+loreena mckennitt;774
+charley pride;775
+jocelyn enriquez;776
+helen baylor;777
+hot tuna;778
+skeeter davis;779
+brenton brown;780
+from good homes;781
+leslie hall;782
+natalia kills;783
+chris thompson;784
+circle of dust;785
+transmetal;786
+australian crawl;787
+dying fetus;788
+ratcat;789
+the waifs;790
+elevation worship;791
+bryan ferry;792
+camille;793
+patty griffin;794
+papa charlie jackson;795
+grendel;796
+chris murray;797
+ryan stevenson;798
+kiethevez;799
+the glorious unseen;800
+the ocean blue;801
+conducting from the grave;802
+negativland;803
+del shannon;804
+kathryn scott;805
+this ending;806
+bomfunk mc's;807
+opera ix;808
+steps;809
+the birthday massacre;810
+bronski beat;811
+the burns sisters;812
+willy deville;813
+girlicious;814
+death angel;815
+the quakes;816
+william fitzsimmons;817
+fm;818
+beggars & thieves;819
+richard buckner;820
+iwrestledabearonce;821
+here come the mummies;822
+only crime;823
+saint saviour;824
+millencolin;825
+bigwig;826
+benny mardones;827
+wale;828
+frida;829
+björn ulvaeus & benny andersson;830
+artrosis;831
+neurosis;832
+elis;833
+spain;834
+y&t;835
+jeff lynne;836
+syreeta;837
+ryan adams;838
+the lords of the new church;839
+stephen stills;840
+joel plaskett emergency;841
+blood or whiskey;842
+chenoa;843
+l.t.d.;844
+mariah carey;845
+bauhaus;846
+emma ruth rundle;847
+billy bragg & wilco;848
+sara evans;849
+sara bareilles;850
+cold world;851
+pig destroyer;852
+william elliott whitmore;853
+eyes set to kill;854
+agressor;855
+skiltron;856
+oysterband;857
+versaemerge;858
+now, now;859
+chelsea grin;860
+kaiser/mansfield;861
+george thorogood & the destroyers;862
+foghat;863
+xtc;864
+the nighthawks;865
+eric burdon & war;866
+the cross movement;867
+the winery dogs;868
+edyta górniak;869
+hexrx;870
+the cheeky girls;871
+gazebo;872
+canned heat;873
+anne sophie mutter;874
+sergei prokofiev;875
+beneath the sky;876
+jimmy reed;877
+annihilator;878
+the essex green;879
+john west;880
+bloodhound gang;881
+beth hart & joe bonamassa;882
+lacrimas profundere;883
+jbm;884
+commander cody and his lost planet airmen;885
+smoking popes;886
+seeed;887
+moon martin;888
+terence trent d'arby;889
+the darkest of the hillside thickets;890
+peter tosh;891
+throwdown;892
+the allman brothers band;893
+caribou;894
+axel rudi pell;895
+ne-yo;896
+joe ely;897
+james blake;898
+macklemore & ryan lewis;899
+王力宏 (leehom wang);900
+sam brown;901
+van morrison;902
+bella morte;903
+josé feliciano;904
+john popper;905
+genghis tron;906
+b3;907
+phil keaggy;908
+alesha dixon;909
+peace, love and pitbulls;910
+agents of mercy;911
+elton john;912
+etta james;913
+plus one;914
+spacemen 3;915
+tommy castro;916
+god forbid;917
+abysmal dawn;918
+cadillac blindside;919
+a*teens;920
+q5;921
+mr. president;922
+richmond fontaine;923
+polly paulusma;924
+hear'say;925
+vertical church band;926
+elvis costello;927
+nickelback;928
+the bolshoi;929
+kenny loggins;930
+bad manners;931
+dear reader;932
+tom robinson band;933
+delerium;934
+shirley caesar;935
+trae;936
+tesseract;937
+500 miles to memphis;938
+splitsville;939
+band of susans;940
+edl;941
+half-a-mill;942
+mechanical poet;943
+clay aiken;944
+mars argo;945
+palisades;946
+ian hunter;947
+tracey thorn;948
+jackson heights;949
+downplay;950
+old dominion;951
+jamey johnson;952
+deb talan;953
+little walter;954
+grits;955
+maren ord;956
+longfellow;957
+layzie bone;958
+cuby + blizzards;959
+oszibarack;960
+jillette johnson;961
+the hundred in the hands;962
+culture beat;963
+frozen plasma;964
+paul robeson;965
+alt-j;966
+darkside;967
+other lives;968
+blackfoot;969
+the beta band;970
+sally seltmann;971
+foals;972
+robin williamson;973
+gluecifer;974
+tristan prettyman;975
+guy sebastian;976
+pink guy;977
+johann sebastian bach;978
+methyl ethel;979
+active child;980
+john anderson;981
+neil halstead;982
+phantom planet;983
+john d. loudermilk;984
+greg long;985
+the limeliters;986
+peggy seeger;987
+b la bart k;988
+fallujah;989
+shanice;990
+darius danesh;991
+post malone;992
+audrey;993
+parry gripp;994
+barry white;995
+ravenous;996
+nils lofgren;997
+paddy goes to holyhead;998
+yefim bronfman;999
+prefab sprout;1000
+rose funeral;1001
+the boxmasters;1002
+eluveitie;1003
+tony yayo;1004
+michael learns to rock;1005
+dawes;1006
+lodger;1007
+the joe perry project;1008
+mandragora scream;1009
+kristene dimarco;1010
+today is the day;1011
+riot;1012
+skold;1013
+pendragon;1014
+el debarge;1015
+the wanted;1016
+the pharcyde;1017
+jason derulo;1018
+black light burns;1019
+raintime;1020
+lisa hannigan;1021
+moby;1022
+tedeschi trucks band;1023
+breaking laces;1024
+deströyer 666;1025
+glenn hughes;1026
+martin carthy and dave swarbrick;1027
+the verve;1028
+orleans;1029
+the browns;1030
+the drums;1031
+between the buried and me;1032
+the kingston trio;1033
+jean shepard;1034
+almah;1035
+gare du nord;1036
+the bellamy brothers;1037
+sandra;1038
+the twang;1039
+gorod;1040
+pandora;1041
+mick jagger;1042
+rebellion;1043
+lauren hoffman;1044
+poco;1045
+clara smith;1046
+oscar peterson;1047
+slobberbone;1048
+pitchshifter;1049
+hania;1050
+ziggy marley;1051
+billie jo spears;1052
+dum dum girls;1053
+tricky;1054
+lamont dozier;1055
+slingshot dakota;1056
+koko taylor;1057
+judas priest;1058
+idiot stare;1059
+olivier messiaen;1060
+akala;1061
+it dies today;1062
+fred eaglesmith;1063
+jessie james;1064
+moving mountains;1065
+knights of the abyss;1066
+gregorian;1067
+mr weebl;1068
+johnnie allan;1069
+newton faulkner;1070
+lonely kings;1071
+al stewart;1072
+broods;1073
+zyklon;1074
+basia;1075
+damaged;1076
+the reign of kindo;1077
+black bomb a;1078
+vic damone;1079
+desert rose band;1080
+swing out sister;1081
+arjen anthony lucassen;1082
+kerrs pink;1083
+level 42;1084
+the dollyrots;1085
+giant squid;1086
+jamie cullum;1087
+fritz kalkbrenner;1088
+the whispers;1089
+pilot speed;1090
+adhesive;1091
+leona lewis;1092
+hank williams;1093
+chris botti;1094
+creeper;1095
+tori amos;1096
+evocation;1097
+gothminister;1098
+mandolin orange;1099
+namie amuro;1100
+black sheep;1101
+bleed the sky;1102
+laura pausini;1103
+consequence;1104
+fever ray;1105
+third day;1106
+bed ich smetana;1107
+the chain gang of 1974;1108
+comes with the fall;1109
+duff mckagan's loaded;1110
+israel kamakawiwo'ole;1111
+pixie lott;1112
+bruce dickinson;1113
+sonny landreth;1114
+squeeze;1115
+pennywise;1116
+red café;1117
+the autumn offering;1118
+crashdïet;1119
+neil young;1120
+hi-tek;1121
+hanson;1122
+the blues brothers;1123
+snow tha product;1124
+ibeyi;1125
+carpathian forest;1126
+sheb wooley;1127
+russian red;1128
+american authors;1129
+nick lachey;1130
+jurassic 5;1131
+the smashing pumpkins;1132
+the lyric quartet;1133
+howard shore;1134
+julian lloyd webber;1135
+syleena johnson;1136
+wolf alice;1137
+nico;1138
+beach slang;1139
+robin zander;1140
+tanya tucker;1141
+mac;1142
+mogg/way;1143
+shaggy 2 dope;1144
+godley & creme;1145
+n.e.r.d;1146
+carbon leaf;1147
+august burns red;1148
+1349;1149
+smokey robinson & the miracles;1150
+röyksopp;1151
+stephanie mills;1152
+halestorm;1153
+webb wilder;1154
+chris stapleton;1155
+paul oakenfold;1156
+planetshakers;1157
+andrew belle;1158
+baha men;1159
+memphis willie b.;1160
+andromeda;1161
+cynic;1162
+gil shaham;1163
+george;1164
+toad the wet sprocket;1165
+kiuas;1166
+cat power;1167
+metric;1168
+saving jane;1169
+patrick watson;1170
+faith evans;1171
+trivium;1172
+kelly willis;1173
+my darkest days;1174
+the reason;1175
+david gilmour;1176
+roo panes;1177
+sasha;1178
+sympathy;1179
+go periscope;1180
+american me;1181
+sex pistols;1182
+jay-z;1183
+tiësto;1184
+jean sibelius;1185
+war;1186
+agent 51;1187
+thrice;1188
+amel larrieux;1189
+the futureheads;1190
+arlo guthrie;1191
+the saw doctors;1192
+hungry lucy;1193
+michelle malone;1194
+kay starr;1195
+adestria;1196
+ab-soul;1197
+the merry wives of windsor;1198
+bo carter;1199
+charlotte gainsbourg;1200
+suicidal angels;1201
+zolof the rock & roll destroyer;1202
+brian kennedy;1203
+jess glynne;1204
+the enemy;1205
+crystal lewis;1206
+hopesfall;1207
+helmet;1208
+alicia keys;1209
+tom milsom;1210
+keith green;1211
+macy gray;1212
+libera;1213
+the myriad;1214
+steven wilson;1215
+dennis wilson;1216
+war of ages;1217
+elliott brood;1218
+ace of base;1219
+elefant;1220
+ryker's;1221
+apologetix;1222
+stream of passion;1223
+interface;1224
+susan tedeschi;1225
+circle takes the square;1226
+team starkid;1227
+dillinger four;1228
+jt the bigga figga;1229
+mf doom;1230
+carrie newcomer;1231
+big & rich;1232
+caliban;1233
+project pat;1234
+joan osborne;1235
+juicy j;1236
+beady belle;1237
+victory;1238
+syd barrett;1239
+liv kristine;1240
+anything box;1241
+devin townsend project;1242
+amanda marshall;1243
+die krupps;1244
+sonny boy williamson ii;1245
+assuming we survive;1246
+soko;1247
+much the same;1248
+faunts;1249
+sally oldfield;1250
+the bottle rockets;1251
+kaledon;1252
+lighthouse family;1253
+the mākaha sons;1254
+dark moor;1255
+antagonist a.d.;1256
+lee dorsey;1257
+merle travis;1258
+cursed;1259
+pete rock;1260
+brother cane;1261
+...and oceans;1262
+ancient bards;1263
+the searchers;1264
+cappella;1265
+iced earth;1266
+marmaduke duke;1267
+colin meloy;1268
+venomous concept;1269
+anne murray;1270
+jay farrar;1271
+the town pants;1272
+chubby checker;1273
+tori kelly;1274
+the dodos;1275
+chris spedding;1276
+pmtoday;1277
+thunder lord;1278
+bloodbath;1279
+dear criminals;1280
+aaron carter;1281
+shai hulud;1282
+lil skies;1283
+jeniferever;1284
+jucifer;1285
+the new pornographers;1286
+fabrizio faniello;1287
+fleurie;1288
+mirror of deception;1289
+the real mckenzies;1290
+q-tip;1291
+neuraxis;1292
+rick derringer;1293
+crystal kay;1294
+robert randolph & the family band;1295
+naâman;1296
+little jimmy dickens;1297
+sir mix-a-lot;1298
+krs-one;1299
+daniil trifonov;1300
+morning glory;1301
+cheap trick;1302
+king tee;1303
+angela mccluskey;1304
+derdian;1305
+heitor villa lobos;1306
+nina;1307
+tyrese;1308
+dope stars inc.;1309
+vendetta red;1310
+pussycat;1311
+benjamin gibbard;1312
+nine;1313
+de lux;1314
+william kapell;1315
+vomito negro;1316
+a$ap rocky;1317
+mo b. dick;1318
+gamma ray;1319
+sarah vaughan;1320
+georges bizet;1321
+acid ranch;1322
+deadline;1323
+lady sovereign;1324
+flipsyde;1325
+jim jackson;1326
+violent femmes;1327
+emeli sandé;1328
+la bionda;1329
+sammie;1330
+joe budden;1331
+butterfly boucher;1332
+martha and the muffins;1333
+the faction;1334
+sohn;1335
+the cyrkle;1336
+the butchies;1337
+magna-fi;1338
+jennifer nettles;1339
+udora;1340
+missing persons;1341
+hey mercedes;1342
+cracker;1343
+alphaville;1344
+behexen;1345
+the tremeloes;1346
+the power station;1347
+mark collie;1348
+janet jackson;1349
+out out;1350
+mystery;1351
+warrant;1352
+kamelot;1353
+max webster;1354
+kristian stanfill;1355
+ken hensley;1356
+lyfe jennings;1357
+gusgus;1358
+after the burial;1359
+sam tsui;1360
+tony orlando & dawn;1361
+guy clark;1362
+electric wizard;1363
+vanna;1364
+nicole c. mullen;1365
+prozak;1366
+spahn ranch;1367
+william mcdowell;1368
+brandy clark;1369
+jamie lidell;1370
+grief;1371
+the maccabees;1372
+weedeater;1373
+marlango;1374
+hirax;1375
+elvis presley;1376
+aly & aj;1377
+ellis paul;1378
+we the kings;1379
+bound stems;1380
+symphony x;1381
+randy stonehill;1382
+man;1383
+centinex;1384
+blaine larsen;1385
+aqueduct;1386
+june of 44;1387
+we came as romans;1388
+natasha bedingfield;1389
+divinefire;1390
+alan hull;1391
+outlawz;1392
+seasick steve;1393
+nerina pallot;1394
+sandy denny;1395
+glenn gould;1396
+matt dusk;1397
+strawberry alarm clock;1398
+stryper;1399
+stefanie heinzmann;1400
+miracle of sound;1401
+pop unknown;1402
+rahsaan patterson;1403
+yellowman;1404
+the clay people;1405
+the good life;1406
+leroy carr;1407
+sonata arctica;1408
+tom odell;1409
+that handsome devil;1410
+birdy;1411
+jimmy cliff;1412
+kompressor;1413
+deadmau5;1414
+linda ronstadt;1415
+ufx;1416
+blind blake;1417
+jody watley;1418
+razed in black;1419
+dave clark five;1420
+diddy;1421
+anacrusis;1422
+dropkick murphys;1423
+doyle bramhall;1424
+lisa "left eye" lopes;1425
+ll cool j;1426
+deadsoul tribe;1427
+sunset rubdown;1428
+heaven 17;1429
+pavlov's dog;1430
+billie eilish;1431
+dido;1432
+deathboy;1433
+antaeus;1434
+dreamland;1435
+the beloved;1436
+the arrogant worms;1437
+closet monster;1438
+eartha kitt;1439
+radu lupu;1440
+cinderella effect;1441
+anathema;1442
+for the fallen dreams;1443
+shola ama;1444
+big thief;1445
+armand van helden;1446
+fake?;1447
+phil harris;1448
+loggins & messina;1449
+promise of redemption;1450
+the walkabouts;1451
+eisley;1452
+big joe turner;1453
+axenstar;1454
+hank williams iii;1455
+b.o.b.;1456
+wallis bird;1457
+as i lay dying;1458
+trey songz;1459
+charlotte hatherley;1460
+nneka;1461
+anúna;1462
+asia;1463
+mcauley schenker group;1464
+rose polenzani;1465
+john mayer;1466
+grant-lee phillips;1467
+taj mahal;1468
+ruston kelly;1469
+melissa manchester;1470
+lenka;1471
+kira isabella;1472
+hap palmer;1473
+taken by trees;1474
+anni b sweet;1475
+planet p project;1476
+the velvet underground;1477
+hoagy carmichael;1478
+testament;1479
+the 3rd and the mortal;1480
+fever tree;1481
+iris dement;1482
+happy days;1483
+prostitute disfigurement;1484
+the andrews sisters;1485
+eyedea & abilities;1486
+mipso;1487
+dinu lipatti;1488
+josephine foster;1489
+stephen sondheim;1490
+little big;1491
+kip winger;1492
+the voidz;1493
+matchbook romance;1494
+green carnation;1495
+xiu xiu;1496
+the hard-ons;1497
+glasseater;1498
+bloodlined calligraphy;1499
+yodelice;1500
+infectious grooves;1501
+alex lloyd;1502
+overcome;1503
+tom dice;1504
+rorschach test;1505
+nat & alex wolff;1506
+van halen;1507
+robert earl keen;1508
+dave hollister;1509
+rob thomas;1510
+chanté moore;1511
+lyle lovett;1512
+aaron lines;1513
+my life with the thrill kill kult;1514
+honne;1515
+the flatliners;1516
+eric b. & rakim;1517
+old man gloom;1518
+new found glory;1519
+louis logic;1520
+murray perahia;1521
+bass drum of death;1522
+702;1523
+jamie o'neal;1524
+the sheila divine;1525
+chris ledoux;1526
+huski;1527
+wolf parade;1528
+emmure;1529
+defiance, ohio;1530
+nine below zero;1531
+jamie winchester;1532
+cece winans;1533
+splashdown;1534
+the strumbellas;1535
+otis spann;1536
+juice wrld;1537
+brett anderson;1538
+the ambassador;1539
+arturo benedetti michelangeli;1540
+spinal tap;1541
+wild strawberries;1542
+mystikal;1543
+acumen nation;1544
+the ex;1545
+pearls before swine;1546
+the ink spots;1547
+mayhem;1548
+e;1549
+sara noxx;1550
+kid cudi;1551
+neko case;1552
+dethklok;1553
+vast;1554
+lila mccann;1555
+boxcar willie;1556
+the soundtrack of our lives;1557
+the deep dark woods;1558
+johnossi;1559
+sinéad lohan;1560
+tragic black;1561
+public enemy;1562
+toby keith;1563
+jesus on extasy;1564
+after forever;1565
+lightnin' hopkins;1566
+any given day;1567
+terminal choice;1568
+head east;1569
+sturgill simpson;1570
+cavalera conspiracy;1571
+pitboss 2000;1572
+broken social scene;1573
+hanzel und gretyl;1574
+kajagoogoo;1575
+village people;1576
+the most serene republic;1577
+ernest tubb and loretta lynn;1578
+arno;1579
+ninja sex party;1580
+bts;1581
+charlie simpson;1582
+criss angel;1583
+local natives;1584
+elliphant;1585
+right away, great captain!;1586
+frankie valli;1587
+dion;1588
+j. tillman;1589
+krayzie bone;1590
+atlanta rhythm section;1591
+grimes;1592
+slapp happy;1593
+voivod;1594
+ella fitzgerald;1595
+maddy prior;1596
+exo;1597
+bobby v;1598
+cherry glazerr;1599
+jackie deshannon;1600
+nancy sinatra;1601
+dragonheart;1602
+the casualties;1603
+polysics;1604
+cliff richard;1605
+sleeping with sirens;1606
+dew-scented;1607
+diva destruction;1608
+jens lekman;1609
+charlie landsborough;1610
+born ruffians;1611
+joe diffie;1612
+sonny terry;1613
+the black angels;1614
+envy;1615
+mary lou lord;1616
+of mice & men;1617
+tina arena;1618
+candlemass;1619
+stacie orrico;1620
+too $hort;1621
+bombay bicycle club;1622
+the dc3;1623
+marié digby;1624
+frank ocean;1625
+slash's snakepit;1626
+akcent;1627
+mortal sin;1628
+the rural alberta advantage;1629
+alter bridge;1630
+caligula's horse;1631
+diana ross;1632
+strand of oaks;1633
+austrian death machine;1634
+ana popovic;1635
+skepticism;1636
+shirley horn;1637
+siva six;1638
+valentine wolfe;1639
+titanic sinclair;1640
+joshua perahia;1641
+gallows;1642
+luxt;1643
+nick lowe;1644
+clarence "gatemouth" brown;1645
+sweet noise;1646
+thank you scientist;1647
+cherish the ladies;1648
+ten years after;1649
+frost;1650
+first blood;1651
+nightwish;1652
+musiq soulchild;1653
+big bill broonzy;1654
+benjamin francis leftwich;1655
+phantom blue;1656
+dune;1657
+hangnail;1658
+harold melvin & the blue notes;1659
+big d and the kids table;1660
+zeromancer;1661
+jello biafra;1662
+yg;1663
+katharine mcphee;1664
+quinn xcii;1665
+mississippi john hurt;1666
+new trolls;1667
+wizards;1668
+kix;1669
+slapshot;1670
+tor miller;1671
+xentrifuge;1672
+toni braxton;1673
+finch;1674
+caroline herring;1675
+dreadful shadows;1676
+ringworm;1677
+cory asbury;1678
+dezperadoz;1679
+mac davis;1680
+dionysus;1681
+michael w. smith;1682
+cold as life;1683
+peabo bryson;1684
+k.d. lang;1685
+grammatrain;1686
+jorn;1687
+no-man;1688
+nocturne;1689
+the screaming jets;1690
+charli xcx;1691
+tactical sekt;1692
+oomph!;1693
+atlas sound;1694
+the idle race;1695
+helstar;1696
+toxik;1697
+jesus culture;1698
+cissy houston;1699
+catman cohen;1700
+strike anywhere;1701
+toni childs;1702
+mika;1703
+theory in practice;1704
+lucinda williams;1705
+lord belial;1706
+raul midón;1707
+ida;1708
+trisha yearwood;1709
+bad astronaut;1710
+the runaways;1711
+a day to remember;1712
+milk inc.;1713
+fisher;1714
+king kobra;1715
+ma rainey;1716
+ralph stanley;1717
+andr watts;1718
+gregory and the hawk;1719
+the temptations;1720
+flaw;1721
+terror squad;1722
+black;1723
+bolt thrower;1724
+matt goss;1725
+nappy roots;1726
+a$ap ferg;1727
+shawn mendes;1728
+alison krauss & union station;1729
+eric johnson;1730
+ashley monroe;1731
+old crow medicine show;1732
+kelis;1733
+bad habit;1734
+van canto;1735
+the birthday party;1736
+rowland s. howard;1737
+marsha ambrosius;1738
+little dragon;1739
+k'jon;1740
+jack white;1741
+cactus;1742
+daft punk;1743
+jon oliva's pain;1744
+a$ap mob;1745
+emika;1746
+lazar berman;1747
+mark kozelek;1748
+ice-t;1749
+little richard;1750
+elijah blake;1751
+the laurie berkner band;1752
+clara luzia;1753
+ma$e;1754
+dikembe;1755
+boz scaggs;1756
+antony and the johnsons;1757
+autopilot off;1758
+big audio dynamite;1759
+grant lee buffalo;1760
+john reuben;1761
+mission of burma;1762
+unloco;1763
+transit;1764
+marina and the diamonds;1765
+alela diane;1766
+the sorrow;1767
+gossip;1768
+emerald;1769
+lucille bogan;1770
+frank zappa;1771
+the coathangers;1772
+captain jack;1773
+stellastarr*;1774
+david kersh;1775
+broken bones;1776
+hayley kiyoko;1777
+wire;1778
+thurston moore;1779
+cop shoot cop;1780
+the white buffalo;1781
+bad books;1782
+irene cara;1783
+gorillaz;1784
+the gap band;1785
+lead belly;1786
+cassadee pope;1787
+elvis depressedly;1788
+curtis mayfield;1789
+waylon;1790
+the gun club;1791
+behemoth;1792
+she wants revenge;1793
+the crüxshadows;1794
+týr;1795
+dan fogelberg;1796
+stan ridgway;1797
+blind witness;1798
+deerhunter;1799
+agalloch;1800
+grand puba;1801
+heavens edge;1802
+the acacia strain;1803
+beseech;1804
+sting;1805
+rival sons;1806
+henry jamison;1807
+the black lillies;1808
+the cog is dead;1809
+benno moiseiwitsch;1810
+nazz;1811
+greg brown;1812
+reel big fish;1813
+the muppets;1814
+threshold;1815
+tracie spencer;1816
+cimorelli;1817
+alexandra burke;1818
+whigfield;1819
+eminem;1820
+wolverine;1821
+grave maker;1822
+example;1823
+lambchop;1824
+madeleine peyroux;1825
+bondage fairies;1826
+darren hanlon;1827
+lyria;1828
+capital kings;1829
+john pizzarelli;1830
+high on fire;1831
+stereolab;1832
+machines of loving grace;1833
+kim carnes;1834
+so many dynamos;1835
+phony ppl;1836
+san cisco;1837
+emilie autumn;1838
+pietro locatelli;1839
+solefald;1840
+kellie pickler;1841
+the maranatha! singers;1842
+danbert nobacon;1843
+soulja boy;1844
+mario winans;1845
+camille saint sa ns;1846
+indecision;1847
+lasgo;1848
+karyn white;1849
+hurts;1850
+between the trees;1851
+nat king cole;1852
+front 242;1853
+johannes brahms;1854
+the national;1855
+shane barnard;1856
+chris knox;1857
+poison girls;1858
+oh, sleeper;1859
+hell is for heroes;1860
+pfr;1861
+john lee hooker;1862
+gramatik;1863
+moya brennan;1864
+pop evil;1865
+scar symmetry;1866
+silly wizard;1867
+the ventures;1868
+steve holy;1869
+devotchkas;1870
+ignite;1871
+wayne newton;1872
+the gufs;1873
+decapitated;1874
+billie the vision & the dancers;1875
+thestart;1876
+idle cure;1877
+jill sobule;1878
+the soviettes;1879
+the irish rovers;1880
+münchener freiheit;1881
+pentagram;1882
+masterboy;1883
+return;1884
+minus story;1885
+satan;1886
+byron cage;1887
+listener;1888
+the mars volta;1889
+jennifer love hewitt;1890
+eartha;1891
+no doctors;1892
+steve hackett;1893
+elvin bishop;1894
+k'naan;1895
+ruby;1896
+princess nokia;1897
+ellie goulding;1898
+material issue;1899
+fun boy three;1900
+circulatory system;1901
+the road hammers;1902
+brian mcknight;1903
+"weird al" yankovic;1904
+the wailers;1905
+kurupt;1906
+criminal;1907
+bal-sagoth;1908
+lou reed;1909
+queen;1910
+rudimentary peni;1911
+a great big world;1912
+jarboe;1913
+augie march;1914
+jim guthrie;1915
+emma bunton;1916
+aaron watson;1917
+έλενα παπαρίζου;1918
+blue cheer;1919
+mark harris;1920
+alexander scriabin;1921
+andrew gold;1922
+climax blues band;1923
+conjure one;1924
+blind guardian;1925
+stephen schwartz;1926
+martha wainwright;1927
+lil' flip;1928
+katherine jenkins;1929
+amy shark;1930
+styles p;1931
+new model army;1932
+franz liszt;1933
+ben weasel;1934
+john k. samson;1935
+jason anderson;1936
+sheena easton;1937
+inna;1938
+mayer hawthorne;1939
+easy rider;1940
+eddy arnold;1941
+zoegirl;1942
+jimmy barnes;1943
+tear da club up thugs;1944
+daughter;1945
+buzzcocks;1946
+freezepop;1947
+steve aoki;1948
+kierra sheard;1949
+captain beyond;1950
+sabrina claudio;1951
+mc shan;1952
+the j. geils band;1953
+camel;1954
+indica;1955
+attack in black;1956
+the jam;1957
+bananarama;1958
+al jarreau;1959
+eric carmen;1960
+gwar;1961
+us the duo;1962
+ziggy alberts;1963
+amir obè;1964
+25 ta life;1965
+newsboys;1966
+chris knight;1967
+marlene dietrich;1968
+112;1969
+wonderwall;1970
+officer negative;1971
+enochian crescent;1972
+jupiter one;1973
+whitecross;1974
+tim mcgraw;1975
+bile;1976
+brandi carlile;1977
+danzig;1978
+twiztid;1979
+david baerwald;1980
+jimmy webb;1981
+mamas gun;1982
+harry chapin;1983
+holy ghost!;1984
+stephen hough;1985
+k.t. oslin;1986
+robin trower;1987
+us3;1988
+esben and the witch;1989
+schiller;1990
+god lives underwater;1991
+hayseed dixie;1992
+polluted inheritance;1993
+nicki minaj;1994
+robin gibb;1995
+pinback;1996
+bobbie gentry;1997
+freakwater;1998
+kite;1999
+kid down;2000
+georges cziffra;2001
+sivert høyem;2002
+conor oberst;2003
+i'm from barcelona;2004
+shining;2005
+henry purcell;2006
+jeremy messersmith;2007
+goat of mendes;2008
+catch 22;2009
+vampire rodents;2010
+the go-go's;2011
+noah gundersen;2012
+diablo swing orchestra;2013
+uncle dave macon;2014
+john eddie;2015
+calvin harris;2016
+roomful of blues;2017
+bane;2018
+sam lewis;2019
+alejandro escovedo;2020
+kekal;2021
+manticora;2022
+frankie lymon & the teenagers;2023
+randy meisner;2024
+carcass;2025
+nile;2026
+circus maximus;2027
+non phixion;2028
+eric woolfson;2029
+starship;2030
+kaiser chiefs;2031
+sexy sadie;2032
+confide;2033
+boy meets girl;2034
+henryk szeryng;2035
+wynter gordon;2036
+jay-jay johanson;2037
+hannah fury;2038
+mike jones;2039
+bebe & cece winans;2040
+trip lee;2041
+tigers jaw;2042
+hector berlioz;2043
+minus the bear;2044
+johnny winter;2045
+my sister's machine;2046
+gilbert o'sullivan;2047
+jean michel jarre;2048
+jody miller;2049
+skylark;2050
+the poodles;2051
+fear of domination;2052
+the donefors;2053
+zed yago;2054
+martha argerich;2055
+faun fables;2056
+joy electric;2057
+the grass roots;2058
+cygnosic;2059
+disfear;2060
+crosby, stills, nash & young;2061
+69 boyz;2062
+celesty;2063
+the forecast;2064
+the magnetic fields;2065
+miranda lambert;2066
+veto;2067
+the offspring;2068
+bobby goldsboro;2069
+calibretto;2070
+common rider;2071
+10cc;2072
+tim hughes;2073
+bald vulture;2074
+david cassidy;2075
+simple minds;2076
+little man tate;2077
+carla thomas;2078
+cher lloyd;2079
+cameo;2080
+tko;2081
+george frideric handel;2082
+immaculate fools;2083
+kero kero bonito;2084
+john sebastian;2085
+joe nichols;2086
+classified;2087
+velvet revolver;2088
+skip the use;2089
+ratt;2090
+gilberto gil;2091
+the miracles;2092
+the subways;2093
+screwed up click;2094
+fm static;2095
+further seems forever;2096
+barry louis polisar;2097
+antiskeptic;2098
+den harrow;2099
+eleni mandell;2100
+mungo jerry;2101
+fucked up;2102
+jermaine jackson;2103
+freedom call;2104
+phillip phillips;2105
+van der graaf generator;2106
+h-town;2107
+madvillain;2108
+pietasters;2109
+sonicflood;2110
+thomas dolby;2111
+scud mountain boys;2112
+orphanage;2113
+eleventyseven;2114
+peter hammill;2115
+kaysha;2116
+skyharbor;2117
+south border;2118
+mad marge and the stonecutters;2119
+ben kweller;2120
+monstrosity;2121
+young galaxy;2122
+pusha t;2123
+john wesley;2124
+glass harp;2125
+blue system;2126
+pain of salvation;2127
+film school;2128
+plasmatics;2129
+down by law;2130
+circus of power;2131
+left alone;2132
+girls under glass;2133
+saga;2134
+redemption;2135
+memphis may fire;2136
+aion;2137
+kj-52;2138
+gaia epicus;2139
+stacy lattisaw;2140
+larry norman;2141
+david lee roth;2142
+willie nelson;2143
+melody gardot;2144
+in the midst of lions;2145
+iron butterfly;2146
+katy rose;2147
+izz;2148
+sevyn streeter;2149
+pendulum;2150
+cult of luna;2151
+avalanche city;2152
+brother firetribe;2153
+lena;2154
+steve vai;2155
+motosierra;2156
+the bouncing souls;2157
+maher zain;2158
+colbie caillat;2159
+blaze ya dead homie;2160
+kyung wha chung;2161
+delta goodrem;2162
+lena katina;2163
+the veronicas;2164
+asleep at the wheel;2165
+anne clark;2166
+jimmie vaughan;2167
+wildpath;2168
+autumnblaze;2169
+lefty frizzell;2170
+the weakerthans;2171
+real estate;2172
+dj drama;2173
+joji;2174
+little milton;2175
+beholder;2176
+grace jones;2177
+wolfheart;2178
+kt tunstall;2179
+robert forster;2180
+lana lane;2181
+meat loaf;2182
+mark chesnutt;2183
+autumn;2184
+bronze nazareth;2185
+ladysmith black mambazo;2186
+memphis minnie;2187
+the plimsouls;2188
+36 crazyfists;2189
+private line;2190
+nikolai medtner;2191
+jaden smith;2192
+chris cagle;2193
+beatnik termites;2194
+bernard butler;2195
+emin;2196
+major accident;2197
+the mayan factor;2198
+stonewall jackson;2199
+rat boy;2200
+das efx;2201
+roxette;2202
+tears for fears;2203
+hank locklin;2204
+n.w.a;2205
+hadouken!;2206
+king diamond;2207
+axe;2208
+lolo;2209
+rufus thomas;2210
+montgomery gentry;2211
+the front bottoms;2212
+gabrielle;2213
+beng beng cocktail;2214
+wyclef jean;2215
+cœur de pirate;2216
+faithless;2217
+heavy heavy low low;2218
+cutting crew;2219
+limbeck;2220
+saukrates;2221
+artie shaw;2222
+jojo;2223
+red rider;2224
+nikolai rimsky korsakov;2225
+choking victim;2226
+the scene aesthetic;2227
+fight amp;2228
+émilie simon;2229
+christoph willibald gluck;2230
+boney m.;2231
+ookla the mok;2232
+beck;2233
+mcfly;2234
+hey ocean!;2235
+dixie chicks;2236
+lindsay lohan;2237
+from autumn to ashes;2238
+giovanni battista pergolesi;2239
+boondox;2240
+wilhelm kempff;2241
+jay & the americans;2242
+the game;2243
+night in gales;2244
+blazin' squad;2245
+lords of acid;2246
+darlingside;2247
+squealer;2248
+the ghost inside;2249
+abbey lincoln;2250
+cake bake betty;2251
+all that remains;2252
+bunny wailer;2253
+culture club;2254
+jim croce;2255
+vixen;2256
+chelsea wolfe;2257
+the zutons;2258
+the ship;2259
+timbuk3;2260
+wise guys;2261
+gabriel faur ;2262
+prototype;2263
+great big sea;2264
+celtic frost;2265
+stage dolls;2266
+vinnie paz;2267
+symbols;2268
+casanova;2269
+horse the band;2270
+raul seixas;2271
+phil collins;2272
+belphegor;2273
+a whisper in the noise;2274
+rufio;2275
+trin-i-tee 5;2276
+isaac hayes;2277
+i am ghost;2278
+gregg allman;2279
+three 6 mafia;2280
+t. mills;2281
+as cities burn;2282
+fear factory;2283
+good riddance;2284
+daniel m ller schott;2285
+stromkern;2286
+spirogyra;2287
+resurrection band;2288
+jamie's elsewhere;2289
+one bad pig;2290
+renaldo & the loaf;2291
+danny kaye;2292
+belle and sebastian;2293
+cryptopsy;2294
+yeah yeah yeahs;2295
+the jayhawks;2296
+mel tillis;2297
+yazoo;2298
+colin hay;2299
+the desert sessions;2300
+the dismemberment plan;2301
+sammy adams;2302
+discipline;2303
+secrets of the moon;2304
+tapping the vein;2305
+army of the pharaohs;2306
+big bad voodoo daddy;2307
+professor green;2308
+roine stolt;2309
+wu-tang clan;2310
+tracy byrd;2311
+big maybelle;2312
+aage kvalbein;2313
+club nouveau;2314
+mose allison;2315
+edenbridge;2316
+the foundations;2317
+darkseed;2318
+dear and the headlights;2319
+twila paris;2320
+ed bruce;2321
+damien dempsey;2322
+starlight mints;2323
+accessory;2324
+ador dorath;2325
+boogie down productions;2326
+stars;2327
+creedence clearwater revival;2328
+dwight yoakam;2329
+4him;2330
+gorguts;2331
+demons & wizards;2332
+陰陽座;2333
+barstool prophets;2334
+paolo nutini;2335
+asking alexandria;2336
+kid rock;2337
+jerusalem;2338
+them;2339
+robert wyatt;2340
+the weeknd;2341
+jeff deyo;2342
+beartooth;2343
+pilot;2344
+jenny hval;2345
+the devil wears prada;2346
+karine polwart;2347
+maxwell;2348
+michael rabin;2349
+babyface;2350
+white lies;2351
+groundation;2352
+nina hagen;2353
+tracy lawrence;2354
+ring of fire;2355
+koda kumi;2356
+umphrey's mcgee;2357
+polkadot cadaver;2358
+alabama;2359
+edge of sanity;2360
+thyrane;2361
+dick brave & the backbeats;2362
+pantera;2363
+françoise hardy;2364
+whirlwind heat;2365
+lightning seeds;2366
+disrupt;2367
+true colors;2368
+vérité;2369
+jeremy riddle;2370
+billy crawford;2371
+royal hunt;2372
+apache indian;2373
+peggy lee;2374
+the flight of sleipnir;2375
+hard-fi;2376
+ladytron;2377
+massacration;2378
+john browning;2379
+grant hart;2380
+chris hillman;2381
+bear vs. shark;2382
+descendents;2383
+mark ronson;2384
+nofx;2385
+rich kids on lsd;2386
+jordin sparks;2387
+vv brown;2388
+michael franks;2389
+aram khachaturian;2390
+bathory;2391
+jessica pratt;2392
+mormon tabernacle choir;2393
+dan hill;2394
+laleh;2395
+curl up and die;2396
+jonah matranga;2397
+anna tsuchiya;2398
+jack howard;2399
+nick heyward;2400
+steven curtis chapman;2401
+pimp c;2402
+chef'special;2403
+super junior-d&e;2404
+willie nelson & wynton marsalis;2405
+lionel richie;2406
+martyr;2407
+eagles;2408
+project pitchfork;2409
+crosby, stills & nash;2410
+slim thug;2411
+frankie j;2412
+the stereo;2413
+blues pills;2414
+fog;2415
+bow wow wow;2416
+gowan;2417
+king charles;2418
+sviatoslav richter;2419
+m. ward;2420
+freshlyground;2421
+ed schrader's music beat;2422
+company flow;2423
+painbastard;2424
+buddy jewell;2425
+mckinney's cotton pickers;2426
+mandisa;2427
+watershed;2428
+the wailin' jennys;2429
+faderhead;2430
+mötley crüe;2431
+stormwarrior;2432
+keb' mo';2433
+jimmy witherspoon;2434
+the lucksmiths;2435
+faster pussycat;2436
+knightowl;2437
+keri hilson;2438
+mxpx;2439
+the nice;2440
+ritual;2441
+oxymoron;2442
+catamenia;2443
+vince gill;2444
+bill fay;2445
+toy-box;2446
+love unlimited;2447
+mc frontalot;2448
+dolly parton;2449
+papoose;2450
+demi lovato;2451
+rhodes;2452
+big joe williams;2453
+charlie rich;2454
+extreme noise terror;2455
+wilco;2456
+low roar;2457
+carrie underwood;2458
+when particles collide;2459
+3 feet smaller;2460
+the supremes & the four tops;2461
+the police;2462
+empress of;2463
+herman brood;2464
+be your own pet;2465
+kill switch...klick;2466
+superjoint ritual;2467
+lynyrd skynyrd;2468
+al green;2469
+hilltop hoods;2470
+ghostlimb;2471
+chamber - l'orchestre de chambre noir;2472
+thou;2473
+the icicle works;2474
+debbie harry;2475
+victoria beckham;2476
+clare maguire;2477
+cause & effect;2478
+marion;2479
+the lawrence arms;2480
+x;2481
+the russian futurists;2482
+maggie rose;2483
+jim cuddy;2484
+the turtles;2485
+as blood runs black;2486
+the night flight orchestra;2487
+graziano romani;2488
+alcatrazz;2489
+lp;2490
+jon foreman;2491
+popa chubby;2492
+alfred cortot;2493
+eddi reader;2494
+plushgun;2495
+sugababes;2496
+sesame street;2497
+johnny cash;2498
+joe walsh;2499
+jeanette biedermann;2500
+ashford & simpson;2501
+sleep;2502
+hum;2503
+papas fritas;2504
+fifteen;2505
+denzel curry;2506
+phedora;2507
+anders osborne;2508
+the morning of;2509
+cash rivers and the sinners;2510
+blueprint;2511
+slim dusty;2512
+acappella;2513
+katatonia;2514
+diary of dreams;2515
+deine lakaien;2516
+rick wakeman;2517
+freedom fry;2518
+fruupp;2519
+jean baptiste lully;2520
+october 31;2521
+sinister;2522
+jesse mccartney;2523
+vigilantes of love;2524
+elegant machinery;2525
+corbin bleu;2526
+gza/genius;2527
+alison wonderland;2528
+kelly price;2529
+destruction;2530
+fractured;2531
+david ford;2532
+julie miller;2533
+joe mcelderry;2534
+phinehas;2535
+sylvan esso;2536
+basement jaxx;2537
+shearwater;2538
+deadstar assembly;2539
+wildbirds & peacedrums;2540
+patty smyth;2541
+short stack;2542
+jupiter apple;2543
+raven-symoné;2544
+misery loves co.;2545
+the motels;2546
+steppenwolf;2547
+vic chesnutt;2548
+asha;2549
+the almighty;2550
+lauryn hill;2551
+khalid;2552
+great lake swimmers;2553
+ashanti;2554
+蔡依林 (jolin tsai);2555
+donnie munro;2556
+john fogerty;2557
+kimberley locke;2558
+keith moon;2559
+janis joplin;2560
+reamonn;2561
+psychopathic rydas;2562
+air supply;2563
+ashlee simpson;2564
+tame impala;2565
+perzonal war;2566
+adem;2567
+franz ferdinand;2568
+martina mcbride;2569
+natalie cole;2570
+jason falkner;2571
+rosalyn tureck;2572
+missy higgins;2573
+beach house;2574
+current 93;2575
+a hill to die upon;2576
+ghoti hook;2577
+forgive durden;2578
+gojira;2579
+jay ferguson;2580
+eilera;2581
+kate nash;2582
+foxes;2583
+at the drive-in;2584
+job for a cowboy;2585
+from ashes to new;2586
+brooke white;2587
+dwele;2588
+vacuum;2589
+samson fran ois;2590
+loverboy;2591
+black country communion;2592
+paul revere and the raiders;2593
+chris brown;2594
+tommy lee;2595
+vengaboys;2596
+joy zipper;2597
+bella hardy;2598
+blondie;2599
+brian setzer;2600
+mc zulu;2601
+building 429;2602
+cindy bullens;2603
+cyne;2604
+b*witched;2605
+the black eyed peas;2606
+paul weller;2607
+gun;2608
+illogic;2609
+men at work;2610
+luv';2611
+shadows fall;2612
+the crest;2613
+zendaya;2614
+runemagick;2615
+john farnham;2616
+josh ritter;2617
+funeral;2618
+wanda jackson;2619
+waylon jennings;2620
+falco;2621
+marlon williams;2622
+seals & crofts;2623
+kurt elling;2624
+neon hitch;2625
+the swift;2626
+sub focus;2627
+bent knee;2628
+trevor rabin;2629
+bazzi;2630
+jim's big ego;2631
+andrew huang;2632
+glenn miller;2633
+donna summer;2634
+the meteors;2635
+ugly duckling;2636
+李玟 (coco lee);2637
+bruno mars;2638
+sambassadeur;2639
+the neville brothers;2640
+terrorgruppe;2641
+the lone bellow;2642
+deer tick;2643
+bentley jones;2644
+carter the unstoppable sex machine;2645
+connie talbot;2646
+adult.;2647
+crosby & nash;2648
+ayọ;2649
+jay rock;2650
+crystal castles;2651
+tyrone wells;2652
+heavens to betsy;2653
+the limousines;2654
+chris garneau;2655
+13th floor elevators;2656
+dirty pretty things;2657
+the more i see;2658
+raimon;2659
+winterstorm;2660
+spice 1;2661
+jade valerie;2662
+kacey musgraves;2663
+roko;2664
+lights of euphoria;2665
+the bronx;2666
+raffaella carrà;2667
+mastercastle;2668
+city and colour;2669
+salvador;2670
+carolyn arends;2671
+motörhead;2672
+justin young;2673
+the dreadnoughts;2674
+india.arie;2675
+decoded feedback;2676
+guy verlinde;2677
+the clientele;2678
+rapture;2679
+caterina valente;2680
+battle beast;2681
+wednesday 13;2682
+walter gieseking;2683
+the duckworth lewis method;2684
+bloc party;2685
+zuill bailey;2686
+mother love bone;2687
+martha and the vandellas;2688
+blitzkid;2689
+nightrage;2690
+martin zellar;2691
+38 special;2692
+christina grimmie;2693
+dagoba;2694
+glenn tipton;2695
+blue rodeo;2696
+acoustic junction;2697
+led zeppelin;2698
+evergrey;2699
+the locust;2700
+jazmine sullivan;2701
+the colourfield;2702
+yob;2703
+vengeance rising;2704
+chunk! no, captain chunk!;2705
+the alan parsons project;2706
+dustin kensrue;2707
+fool's garden;2708
+saucy monky;2709
+sergei rachmaninoff;2710
+charlie sexton;2711
+babyshambles;2712
+nina simone;2713
+born against;2714
+dååth;2715
+jon allen;2716
+leigh nash;2717
+the vandals;2718
+matt redman;2719
+marian hill;2720
+tanita tikaram;2721
+leonard bernstein;2722
+raffi;2723
+pharao;2724
+miseration;2725
+allan taylor;2726
+rita ora;2727
+gary glitter;2728
+girlschool;2729
+k-ci & jojo;2730
+jimmy dorsey;2731
+.moneen.;2732
+imperia;2733
+virtuoso;2734
+votum;2735
+lucy woodward;2736
+alvin stardust;2737
+will young;2738
+crown of thorns;2739
+arcadi volodos;2740
+the jim yoshii pile-up;2741
+a life divided;2742
+children 18;2743
+james ingram;2744
+梶浦由記 (yuki kajiura);2745
+ynw melly;2746
+jump, little children;2747
+trans-siberian orchestra;2748
+armageddon;2749
+faces;2750
+bokka;2751
+leviathan;2752
+the black keys;2753
+the panic division;2754
+roy drusky;2755
+veggietales;2756
+jt music;2757
+tim o'brien;2758
+brockhampton;2759
+the sundays;2760
+hayes carll;2761
+leatherface;2762
+spirit of the west;2763
+dawn landes;2764
+kygo;2765
+isis;2766
+meek mill;2767
+brisa roché;2768
+freesscape;2769
+joe hill;2770
+pride and fall;2771
+54-40;2772
+washed out;2773
+michael sembello;2774
+crowded house;2775
+cpr;2776
+steve winwood;2777
+jukebox the ghost;2778
+atari teenage riot;2779
+ballyhoo!;2780
+eddie floyd;2781
+trophy scars;2782
+erykah badu;2783
+james cotton;2784
+kristinia debarge;2785
+morbid angel;2786
+code red;2787
+al kooper;2788
+monty python;2789
+the mary onettes;2790
+future;2791
+shabazz the disciple;2792
+sad café;2793
+x-fusion;2794
+deniece williams;2795
+robert gordon;2796
+seabear;2797
+wallows;2798
+john illsley;2799
+dream theater;2800
+secret lives of the freemasons;2801
+josef suk;2802
+the bled;2803
+gregor piatigorsky;2804
+john ralston;2805
+tim kasher;2806
+the hellacopters;2807
+autopsy;2808
+the silencers;2809
+hank green;2810
+abk;2811
+hound dog taylor;2812
+angela bofill;2813
+wade bowen;2814
+jamestown story;2815
+andy partridge;2816
+echolyn;2817
+search the city;2818
+giacomo puccini;2819
+gregory isaacs;2820
+stormtroopers of death;2821
+silkk the shocker;2822
+1208;2823
+everlife;2824
+bobby mcferrin;2825
+pretty ricky;2826
+ronna reeves;2827
+annie lennox;2828
+london grammar;2829
+john doe;2830
+shel silverstein;2831
+blue october;2832
+aaron lee tasjan;2833
+marion raven;2834
+carl perkins;2835
+big pokey;2836
+eddie cochran;2837
+enter the haggis;2838
+burden of a day;2839
+tourniquet;2840
+emilíana torrini;2841
+brian doerksen;2842
+crossfaith;2843
+ludwig van beethoven;2844
+good clean fun;2845
+an horse;2846
+kali uchis;2847
+stone breath;2848
+daniele liverani;2849
+patrick stump;2850
+henry krieger;2851
+fit for a king;2852
+angel corpse;2853
+the audition;2854
+helalyn flowers;2855
+fleet foxes;2856
+the wiggles;2857
+clouds;2858
+peter cetera;2859
+wreckshop family;2860
+the mistake;2861
+lacrimosa;2862
+carpenters;2863
+the crabb family;2864
+twilight fauna;2865
+anton n dvo k;2866
+patti page;2867
+dawnbringer;2868
+bachman-turner overdrive;2869
+love like blood;2870
+pat mcgee band;2871
+shedaisy;2872
+the other ones;2873
+hatebreed;2874
+iamamiwhoami;2875
+showbread;2876
+thyx;2877
+face to face;2878
+bonnie owens;2879
+this wild life;2880
+adrienne young;2881
+tombs;2882
+f.k.ü.;2883
+carl nielsen;2884
+trouble;2885
+peter wolf;2886
+shout out louds;2887
+new york philharmonic;2888
+paul and storm;2889
+the dickies;2890
+hussein fatal;2891
+a tribe called quest;2892
+jade 4u;2893
+bonobo;2894
+alphabeat;2895
+gladys knight;2896
+tilt;2897
+jacqueline du pr ;2898
+boyz ii men;2899
+blood oranges;2900
+scala & kolacny brothers;2901
+steel attack;2902
+vienna teng;2903
+nunslaughter;2904
+roxy music;2905
+benny benassi;2906
+feargal sharkey;2907
+willie dixon;2908
+mary j. blige;2909
+ghost town;2910
+george ezra;2911
+rose cousins;2912
+the stone roses;2913
+mc5;2914
+go west;2915
+knut;2916
+the head and the heart;2917
+room eleven;2918
+simon and garfunkel;2919
+wilson pickett;2920
+the amboy dukes;2921
+walk the moon;2922
+abgott;2923
+truly;2924
+the louvin brothers;2925
+melanie c;2926
+enchant;2927
+duffy;2928
+union;2929
+great white;2930
+the violet burning;2931
+kat-tun;2932
+mercyme;2933
+elmore james;2934
+johnny horton;2935
+dreams of sanity;2936
+vázquez sounds;2937
+kraftwerk;2938
+cool hand luke;2939
+country joe mcdonald;2940
+abandoned pools;2941
+the swell season;2942
+3 inches of blood;2943
+rocky loves emily;2944
+whores.;2945
+natalie grant;2946
+frank proffitt;2947
+red fang;2948
+jason mraz;2949
+barefoot truth;2950
+lee aaron;2951
+frank hutchison;2952
+kasey chambers;2953
+sick of it all;2954
+girls in hawaii;2955
+geoff farina;2956
+rory gallagher;2957
+out of eden;2958
+mø;2959
+darkthrone;2960
+eddie money;2961
+envy on the coast;2962
+virgin prunes;2963
+mississippi sheiks;2964
+capitol steps;2965
+battery;2966
+hilary duff;2967
+betraying the martyrs;2968
+stray cats;2969
+the groundhogs;2970
+thieves and villains;2971
+la roux;2972
+southern raiders band;2973
+cam'ron;2974
+mississippi fred mcdowell;2975
+gary u.s. bonds;2976
+matt duke;2977
+stillborn;2978
+mnemic;2979
+jerry cantrell;2980
+clannad;2981
+mychildren mybride;2982
+dope;2983
+his hero is gone;2984
+deltron 3030;2985
+nebelhexë;2986
+angelic upstarts;2987
+bebo norman;2988
+kb;2989
+patricia barber;2990
+tsol;2991
+cathedral;2992
+glenn kaiser;2993
+peter koppes;2994
+the hunna;2995
+peaches;2996
+bryan adams;2997
+michael kiske;2998
+sigrid;2999
+gordon bok;3000
+the vamps;3001
+burning witches;3002
+the twilight singers;3003
+teyana taylor;3004
+robbie robertson;3005
+black veil brides;3006
+corbin-hanner band;3007
+white denim;3008
+see you next tuesday;3009
+corey hart;3010
+shy;3011
+visions of atlantis;3012
+pyrexia;3013
+rah digga;3014
+killwhitneydead;3015
+austra;3016
+yung joc;3017
+sevendust;3018
+miles kane;3019
+albert hammond, jr.;3020
+the sleepy jackson;3021
+quarterflash;3022
+baba brinkman;3023
+gautier capu on;3024
+heather dale;3025
+hinder;3026
+evoken;3027
+kardinal offishall;3028
+jett rebel;3029
+n-dubz;3030
+marcia ball;3031
+bucks fizz;3032
+freddie jackson;3033
+steve grand;3034
+leonard pennario;3035
+summoning;3036
+blaqk audio;3037
+the sisters of mercy;3038
+doyle bramhall ii;3039
+dominici;3040
+al b. sure!;3041
+giant sand;3042
+colosseum;3043
+johannes moser;3044
+yeasayer;3045
+unearth;3046
+oneiroid psychosis;3047
+sheila e.;3048
+alicia de larrocha;3049
+last tuesday;3050
+the bonzo dog doo-dah band;3051
+gilbert and sullivan;3052
+this day & age;3053
+anekdoten;3054
+greg lake;3055
+mike scott;3056
+black tide;3057
+colony 5;3058
+the malibooz;3059
+adam ant;3060
+pacewon;3061
+queensrÿche;3062
+the ames brothers;3063
+rooster;3064
+rare bird;3065
+justin nozuka;3066
+elle milano;3067
+yacht;3068
+neville marriner;3069
+faith no more;3070
+kool & the gang;3071
+fun lovin' criminals;3072
+yann tiersen;3073
+schäffer the darklord;3074
+aiden;3075
+metanoia;3076
+the high dials;3077
+charles bradley;3078
+paul gilbert;3079
+malvina reynolds;3080
+iron & wine;3081
+saturday looks good to me;3082
+viktor vaughn;3083
+isgaard;3084
+frank sinatra;3085
+alice cooper;3086
+passenger;3087
+michael nesmith;3088
+ancient;3089
+liza anne;3090
+the magic numbers;3091
+seraphim shock;3092
+abney park;3093
+guerilla maab;3094
+orphaned land;3095
+jack savoretti;3096
+zombina and the skeletones;3097
+leon fleisher;3098
+rick springfield;3099
+the left rights;3100
+starflyer 59;3101
+2 brothers on the 4th floor;3102
+emery;3103
+c.c. catch;3104
+mick taylor;3105
+good rats;3106
+blackmore's night;3107
+twisted sister;3108
+boney james;3109
+cancer;3110
+maria jo o pires;3111
+meiko;3112
+cass mccombs;3113
+deep dish;3114
+information society;3115
+blackie and the rodeo kings;3116
+the fold;3117
+ed harcourt;3118
+ramshackle glory;3119
+tom misch;3120
+bebe winans;3121
+joni mitchell;3122
+elizabeth shepherd;3123
+lil boosie;3124
+celluloide;3125
+house of lords;3126
+j.d. souther;3127
+midori goto;3128
+shenandoah;3129
+averi;3130
+modwheelmood;3131
+betty who;3132
+burst;3133
+pete shelley;3134
+jon bon jovi;3135
+x-perience;3136
+xentrix;3137
+grinderman;3138
+hem;3139
+the impressions;3140
+sylosis;3141
+the gregory brothers;3142
+macklemore;3143
+lizz wright;3144
+sawyer brown;3145
+capercaillie;3146
+sam & dave;3147
+the germs;3148
+salt-n-pepa;3149
+big sean;3150
+stornoway;3151
+arthur rubinstein;3152
+cock robin;3153
+big daddy;3154
+fernando ortega;3155
+ted nugent;3156
+coptic rain;3157
+kat deluna;3158
+wolfsheim;3159
+graham central station;3160
+charles harrison;3161
+lil jon & the east side boyz;3162
+lovedrug;3163
+our last night;3164
+mura masa;3165
+lacuna coil;3166
+luke doucet;3167
+punchline;3168
+animal logic;3169
+big bang;3170
+sieges even;3171
+g.g.f.h.;3172
+tyler lyle;3173
+the boswell sisters;3174
+fightstar;3175
+kevin rudolf;3176
+dangerous toys;3177
+comeback kid;3178
+wild orchid;3179
+one direction;3180
+electric six;3181
+the corrs;3182
+christy nockels;3183
+hillsong;3184
+sham 69;3185
+hurt;3186
+clint black;3187
+chris hillman & herb pedersen;3188
+def leppard;3189
+plants and animals;3190
+jack bruce;3191
+thalía;3192
+arthur crudup;3193
+phil wickham;3194
+king gizzard & the lizard wizard;3195
+beautiful eulogy;3196
+the last bison;3197
+caravan;3198
+the wonder stuff;3199
+jeff williams;3200
+sebadoh;3201
+jets overhead;3202
+golden smog;3203
+jonna lee;3204
+the faceless;3205
+daniel lanois;3206
+the judds;3207
+paragon;3208
+vienna philharmonic;3209
+electric president;3210
+tenth avenue north;3211
+solomon cutner;3212
+vnv nation;3213
+system syn;3214
+eric lindell;3215
+jenny lewis;3216
+the flower kings;3217
+chicago;3218
+stacey q;3219
+ark;3220
+chingo bling;3221
+isac elliot;3222
+penumbra;3223
+cypress hill;3224
+landon pigg;3225
+collide;3226
+mick ronson;3227
+fall of the leafe;3228
+melissa etheridge;3229
+suicidal tendencies;3230
+joshua kadison;3231
+the spill canvas;3232
+eclipse;3233
+within the ruins;3234
+haemorrhage;3235
+deliverance;3236
+robert pollard;3237
+paul kelly;3238
+alesana;3239
+the big pink;3240
+e-rotic;3241
+the books;3242
+h.p. lovecraft historical society;3243
+argent;3244
+nivea;3245
+mario;3246
+the fiery furnaces;3247
+atc;3248
+matt and kim;3249
+tobymac;3250
+lcd soundsystem;3251
+ant & dec;3252
+howe gelb;3253
+hozier;3254
+astarte;3255
+anathallo;3256
+basia bulat;3257
+tad morose;3258
+smp;3259
+greensky bluegrass;3260
+holly miranda;3261
+ub40;3262
+d12;3263
+scott miller;3264
+chicane;3265
+the faint;3266
+cream;3267
+archive;3268
+deicide;3269
+legendary shack shakers;3270
+guano apes;3271
+wolfgun;3272
+typhoon;3273
+pile;3274
+the philosopher kings;3275
+black grape;3276
+kotipelto;3277
+david lee murphy;3278
+fish;3279
+nekromantix;3280
+the chordettes;3281
+sherban lupu;3282
+darlene zschech;3283
+the henry girls;3284
+ida cox;3285
+peter & gordon;3286
+fear, and loathing in las vegas;3287
+jme;3288
+blood on the dance floor;3289
+electric light orchestra;3290
+romeo;3291
+mike heron;3292
+kendrick lamar;3293
+spock's beard;3294
+the bunny the bear;3295
+sweetbox;3296
+mac miller;3297
+spazz;3298
+liars;3299
+98°;3300
+blood duster;3301
+make do and mend;3302
+cheri dennis;3303
+gerald levert;3304
+chris tomlin;3305
+all saints;3306
+ghost ship;3307
+young the giant;3308
+jim james;3309
+the unseen;3310
+whitesnake;3311
+ciara;3312
+seventh day slumber;3313
+the b-52's;3314
+crime & the city solution;3315
+brentalfloss;3316
+gene clark;3317
+nouvelle vague;3318
+don johnson big band;3319
+shwayze;3320
+cat rapes dog;3321
+poisonblack;3322
+the wildhearts;3323
+bruce hornsby and the range;3324
+logic;3325
+vaux;3326
+suzy bogguss;3327
+sarke;3328
+charlie louvin;3329
+wild child;3330
+icons of filth;3331
+tarnation;3332
+thousand foot krutch;3333
+nikki webster;3334
+wolfmother;3335
+samuel barber;3336
+panda bear;3337
+ganggajang;3338
+rosetta stone;3339
+sonia disappearfear;3340
+the s.o.s. band;3341
+w.a.s.p.;3342
+silverstein;3343
+tin machine;3344
+jennifer warnes;3345
+andru donalds;3346
+olympos mons;3347
+cerebral fix;3348
+glenn frey;3349
+suzi quatro;3350
+sons of the pioneers;3351
+venetian princess;3352
+delirious?;3353
+lecrae;3354
+marty robbins;3355
+jeannie seely;3356
+the psychedelic ensemble;3357
+glenn medeiros;3358
+bleed from within;3359
+third world;3360
+alison krauss;3361
+dj shadow;3362
+a static lullaby;3363
+maps & atlases;3364
+james taylor;3365
+mekong delta;3366
+easy star all-stars;3367
+aura noir;3368
+santana;3369
+the gadjits;3370
+can;3371
+deepspace 5;3372
+leslie west;3373
+fr d ric chopin;3374
+sundara karma;3375
+powerman 5000;3376
+dutch rebelle;3377
+bones brigade;3378
+the browning;3379
+peter andre;3380
+connie francis;3381
+smooth;3382
+majesty;3383
+matthew dear;3384
+fionn regan;3385
+nuclear assault;3386
+over the rhine;3387
+magellan;3388
+zoot woman;3389
+shyne;3390
+kidneythieves;3391
+racoon;3392
+ironsword;3393
+alessandro scarlatti;3394
+the appleseed cast;3395
+operation ivy;3396
+akron/family;3397
+qkumba zoo;3398
+the chasm;3399
+the style council;3400
+method man;3401
+the halliard;3402
+boy george;3403
+tarot;3404
+delays;3405
+cromok;3406
+honeymoon suite;3407
+hercules & love affair;3408
+lemuria;3409
+the kelly family;3410
+tweet;3411
+tru;3412
+lawnmower deth;3413
+no use for a name;3414
+dogwood;3415
+penitent;3416
+capture the crown;3417
+hawksley workman;3418
+the drifters;3419
+school of seven bells;3420
+james keelaghan;3421
+the 69 eyes;3422
+insane clown posse;3423
+save ferris;3424
+trust;3425
+atb;3426
+todd rundgren;3427
+hank cochran;3428
+forever the sickest kids;3429
+new riders of the purple sage;3430
+georg philipp telemann;3431
+crippled black phoenix;3432
+paul mccartney;3433
+ryan adams and the cardinals;3434
+silent cry;3435
+king creosote;3436
+run the jewels;3437
+the kings;3438
+steve goodman;3439
+nitty gritty dirt band;3440
+eric bibb;3441
+editors;3442
+blessed by a broken heart;3443
+civil twilight;3444
+children of bodom;3445
+michael jackson;3446
+richie sambora;3447
+willam;3448
+devildriver;3449
+e.s.g.;3450
+sananda maitreya;3451
+babbie mason;3452
+the museum;3453
+dan zanes;3454
+e-type;3455
+gidon kremer;3456
+al denson;3457
+aeon;3458
+kenny rogers;3459
+the tear garden;3460
+joshua bell;3461
+set your goals;3462
+le tigre;3463
+holly dunn;3464
+ignacy jan paderewski;3465
+immortal;3466
+108;3467
+ann beretta;3468
+dezarie;3469
+pete seeger;3470
+jan & dean;3471
+nina kinert;3472
+the lonely forest;3473
+protest the hero;3474
+diorama;3475
+khoma;3476
+japan;3477
+jim brickman;3478
+halifax;3479
+rebecca lynn howard;3480
+flobots;3481
+the new power generation;3482
+gods paparazzi;3483
+shannon curfman;3484
+roy woods;3485
+varsity fanclub;3486
+dave stewart;3487
+mumford & sons;3488
+syd matters;3489
+the nerve agents;3490
+kula shaker;3491
+luke sital-singh;3492
+john wesley harding;3493
+venom;3494
+melanie doane;3495
+the hippos;3496
+jessi colter;3497
+the movielife;3498
+babybird;3499
+bodyfarm;3500
+tara maclean;3501
+jackson browne;3502
+within temptation;3503
+cash cash;3504
+warren zevon;3505
+eva cassidy;3506
+billy ray cyrus;3507
+justin timberlake;3508
+the antlers;3509
+john elefante;3510
+angel;3511
+jaap schr der;3512
+evermore;3513
+destroy the runner;3514
+the black crowes;3515
+tiga;3516
+bury your dead;3517
+slowdive;3518
+machine head;3519
+c-murder;3520
+dirty;3521
+fujiya & miyagi;3522
+agonoize;3523
+hundredth;3524
+prong;3525
+nikki flores;3526
+cyanotic;3527
+jordan pruitt;3528
+u.s. bombs;3529
+a halo called fred;3530
+don williams;3531
+chester watson;3532
+noa;3533
+my passion;3534
+george harrison;3535
+scandroid;3536
+sister sin;3537
+hot snakes;3538
+black flag;3539
+christie;3540
+christoph eschenbach;3541
+slim gaillard;3542
+gravediggaz;3543
+the northern pikes;3544
+miranda sex garden;3545
+colin blunstone;3546
+the rasmus;3547
+nazareth;3548
+degarmo and key;3549
+earth crisis;3550
+the foreign exchange;3551
+uncle tupelo;3552
+kitty wells;3553
+karate high school;3554
+weeping tile;3555
+skinny puppy;3556
+audience;3557
+devotchka;3558
+john lennon;3559
+elf power;3560
+teresa brewer;3561
+heinrich schiff;3562
+6lack;3563
+anti-depressive delivery;3564
+the moffatts;3565
+lynn anderson;3566
+mordacious;3567
+robert palmer;3568
+seabound;3569
+black star riders;3570
+bryan rice;3571
+j.b. lenoir;3572
+the yards;3573
+the stupid stupid henchmen;3574
+tony rice;3575
+barbecue bob;3576
+dr. hook & the medicine show;3577
+garrison starr;3578
+union 13;3579
+jay sean;3580
+pretty willie;3581
+oceano;3582
+captain beefheart and the magic band;3583
+markéta irglová;3584
+wolfe tones;3585
+crumbsuckers;3586
+towers of london;3587
+impaled nazarene;3588
+big tent revival;3589
+hans theessink;3590
+the amity affliction;3591
+animal liberation orchestra;3592
+tower of power;3593
+vanessa bell armstrong;3594
+tom fogerty;3595
+pianos become the teeth;3596
+poppy;3597
+the knack;3598
+fields of the nephilim;3599
+peter and the test tube babies;3600
+cory branan;3601
+cherrelle;3602
+palaye royale;3603
+southside johnny & the asbury jukes;3604
+heideroosjes;3605
+the busters;3606
+dave edmunds;3607
+buck 65;3608
+the lost trailers;3609
+steam powered giraffe;3610
+lydia lunch;3611
+washboard sam;3612
+big time rush;3613
+nlt;3614
+old 97's;3615
+rupert hine;3616
+jon mclaughlin;3617
+acid drinkers;3618
+heaven & hell;3619
+george hamilton iv;3620
+scott joplin;3621
+the berzerker;3622
+jonatha brooke;3623
+iggy azalea;3624
+trijntje oosterhuis;3625
+cee lo green;3626
+wig wam;3627
+mark heard;3628
+phil manzanera;3629
+the story;3630
+kokomo arnold;3631
+the beau brummels;3632
+acid bath;3633
+talisman;3634
+mc chris;3635
+the del mccoury band;3636
+ty segall;3637
+boysetsfire;3638
+the ducky boys;3639
+delta moon;3640
+thumb;3641
+blacklisted;3642
+purity ring;3643
+anders manga;3644
+burn halo;3645
+how to dress well;3646
+maxnormal.tv;3647
+binoculers;3648
+lou barlow;3649
+doug sahm;3650
+gary allan;3651
+jan wayne;3652
+mörk gryning;3653
+danko jones;3654
+dover;3655
+robben ford;3656
+black breath;3657
+lovespirals;3658
+sérgio mendes;3659
+rufus;3660
+kingcrow;3661
+dmitri shostakovich;3662
+ghostface killah;3663
+lyriel;3664
+agathodaimon;3665
+hezekiah walker;3666
+nonpoint;3667
+balzac;3668
+lunatica;3669
+ella mae morse;3670
+toyah;3671
+nik kershaw;3672
+hammerfall;3673
+jmsn;3674
+speaker;3675
+whitechapel;3676
+haddaway;3677
+grace vanderwaal;3678
+the tubes;3679
+eric church;3680
+the o.c. supertones;3681
+frozen ghost;3682
+98 mute;3683
+bobby darin;3684
+voxtrot;3685
+the darkness;3686
+paris;3687
+killswitch engage;3688
+jessica lea mayfield;3689
+n-trance;3690
+hank williams, jr.;3691
+b5;3692
+thunderstone;3693
+north mississippi allstars;3694
+angry samoans;3695
+alexis korner;3696
+youssou n'dour;3697
+jet set satellite;3698
+boomkat;3699
+leif ove andsnes;3700
+ektomorf;3701
+ceremony;3702
+isley jasper isley;3703
+r.a. the rugged man;3704
+the twilight sad;3705
+the electric prunes;3706
+dream street;3707
+the yardbirds;3708
+turisas;3709
+celldweller;3710
+wigwam;3711
+paul carrack;3712
+dads;3713
+per gessle;3714
+eugene mcguinness;3715
+hardline;3716
+patti labelle;3717
+busta rhymes;3718
+rasputina;3719
+antique;3720
+schoolboy q;3721
+beat happening;3722
+punch brothers;3723
+keshia chanté;3724
+bob carlisle;3725
+daan;3726
+imperative reaction;3727
+after the fire;3728
+marc and the mambas;3729
+minnie driver;3730
+secondhand serenade;3731
+the byrds;3732
+bread;3733
+talib kweli;3734
+nelly;3735
+hagalaz' runedance;3736
+stick to your guns;3737
+the virus;3738
+snow;3739
+james morrison;3740
+yes;3741
+neuroactive;3742
+misanthrope;3743
+morgana lefay;3744
+che'nelle;3745
+hackneyed;3746
+wilhelm backhaus;3747
+warren haynes;3748
+proclamation;3749
+andy griggs;3750
+the bongos;3751
+corrosion of conformity;3752
+paloma faith;3753
+the japanese house;3754
+tizzy bac;3755
+agathocles;3756
+kari peitsamo;3757
+mýa;3758
+ignaz friedman;3759
+blitz;3760
+dyscarnate;3761
+the hidden cameras;3762
+egypt central;3763
+noggin toboggan;3764
+sadistik;3765
+peter gabriel;3766
+cinderella;3767
+sebastian sturm;3768
+psycroptic;3769
+sylver;3770
+jaymay;3771
+nahemah;3772
+dubioza kolektiv;3773
+the pogues;3774
+the arrogant sons of bitches;3775
+nelson freire;3776
+mastedon;3777
+leonid kogan;3778
+ezra furman;3779
+ohgr;3780
+edward maya;3781
+wings;3782
+freya;3783
+skye;3784
+ablaze my sorrow;3785
+gerry rafferty;3786
+franz schubert;3787
+peccatum;3788
+bananafishbones;3789
+claude debussy;3790
+eurythmics;3791
+will oldham;3792
+fiona boyes;3793
+cellar darling;3794
+public image ltd.;3795
+fireflight;3796
+afu-ra;3797
+little mix;3798
+power quest;3799
+emmy the great;3800
+sub7even;3801
+gwen stefani;3802
+playradioplay!;3803
+aaron neville;3804
+the stryder;3805
+lullacry;3806
+kelli ali;3807
+bow wow;3808
+igor stravinsky;3809
+grade;3810
+tall dwarfs;3811
+rbd;3812
+despised icon;3813
+blues magoos;3814
+the long winters;3815
+majid jordan;3816
+vincenzo bellini;3817
+loretta lynn;3818
+kenny wayne shepherd;3819
+bleeding through;3820
+cursive;3821
+linus of hollywood;3822
+g-unit;3823
+the casket lottery;3824
+kingston wall;3825
+kate voegele;3826
+amanda lear;3827
+vicky beeching;3828
+colin james;3829
+elvis costello & the attractions;3830
+dana dirksen;3831
+c sar franck;3832
+dog fashion disco;3833
+josé gonzález;3834
+the joy formidable;3835
+elly ney;3836
+meliah rage;3837
+tim moore;3838
+james blunt;3839
+american pleasure club;3840
+nicole scherzinger;3841
+the bad shepherds;3842
+the fabulous thunderbirds;3843
+i can make a mess like nobody's business;3844
+three plus;3845
+julie roberts;3846
+tribe;3847
+luke bryan;3848
+saxon;3849
+philip glass;3850
+bobby womack;3851
+triumph;3852
+veda hille;3853
+coldrain;3854
+en vogue;3855
+amy winehouse;3856
+bonnie pink;3857
+the doc watson family;3858
+bad company;3859
+fuck;3860
+cradle of filth;3861
+gorky's zygotic mynci;3862
+buck owens;3863
+elle king;3864
+samael;3865
+meryn cadell;3866
+nocturnal rites;3867
+ghost dance;3868
+kristy lee cook;3869
+the script;3870
+jake owen;3871
+death cab for cutie;3872
+the forgotten rebels;3873
+beaver;3874
+tom rush;3875
+david rock feinstein;3876
+oathbreaker;3877
+odetta;3878
+compton's most wanted;3879
+psy'aviah;3880
+big ed;3881
+john miles;3882
+shellac;3883
+turin brakes;3884
+eternal tears of sorrow;3885
+kodak black;3886
+slick rick;3887
+code orange;3888
+gioacchino rossini;3889
+marc andr hamelin;3890
+ulver;3891
+la toya jackson;3892
+path of resistance;3893
+laura fygi;3894
+tom mcrae;3895
+sitti;3896
+frank stokes;3897
+anorexia nervosa;3898
+gabriel cyphre;3899
+the jon spencer blues explosion;3900
+m-flo;3901
+jason donovan;3902
+sean paul;3903
+girls;3904
+504 boyz;3905
+bring me the horizon;3906
+shelby lynne;3907
+pierre laurent aimard;3908
+jim messina;3909
+the boyz;3910
+hank thompson;3911
+hands;3912
+zakk wylde;3913
+nikka costa;3914
+nelly furtado;3915
+fairground attraction;3916
+born of osiris;3917
+fenix tx;3918
+robert plant;3919
+robert casadesus;3920
+paul van dyk;3921
+glenn kaiser band;3922
+daughters;3923
+saraya;3924
+rae sremmurd;3925
+norma jean;3926
+the plot in you;3927
+wild nothing;3928
+apocalyptica;3929
+tony! toni! toné!;3930
+olivia newton-john;3931
+keith richards;3932
+elizabeth cook;3933
+outlandish;3934
+halford;3935
+susan ashton;3936
+doves;3937
+queen + paul rodgers;3938
+wishing chair;3939
+pseudo echo;3940
+crazyeightyeight;3941
+red lights flash;3942
+neon indian;3943
+erick sermon;3944
+the juliana theory;3945
+larry gatlin & the gatlin brothers;3946
+paul whiteman;3947
+missy elliott;3948
+the chiffons;3949
+allison iraheta;3950
+centvrion;3951
+the 5th dimension;3952
+dighayzoose;3953
+nina nastasia;3954
+shadowside;3955
+5 seconds of summer;3956
+asaf avidan;3957
+cady groves;3958
+old man luedecke;3959
+the chap;3960
+john maus;3961
+tom petty and the heartbreakers;3962
+have a nice life;3963
+breach of trust;3964
+claudio arrau;3965
+toro y moi;3966
+the tony danza tapdance extravaganza;3967
+oscar brand;3968
+john mccormack;3969
+anti-nowhere league;3970
+shearer;3971
+j.j. cale;3972
+iyaz;3973
+ice ages;3974
+the city harmonic;3975
+bruce mcculloch;3976
+the color changin' click;3977
+crack the sky;3978
+the moldy peaches;3979
+coolio;3980
+jaill;3981
+mud;3982
+hodgy beats;3983
+clap your hands say yeah;3984
+badly drawn boy;3985
+ewert and the two dragons;3986
+tystnaden;3987
+janie fricke;3988
+dottie west;3989
+witherscape;3990
+目黒将司 (shoji meguro);3991
+mnek;3992
+kurt nilsen;3993
+psychic tv;3994
+the boomtown rats;3995
+nena;3996
+stephin merritt;3997
+nausea;3998
+molly venter;3999
+rasaq;4000
+olly murs;4001
+хелависа;4002
+coko;4003
+scarface;4004
+sam smith;4005
+midnight juggernauts;4006
+pulley;4007
+theresa sokyrka;4008
+jacob banks;4009
+bomb the music industry!;4010
+meredith andrews;4011
+yearning;4012
+ron sexsmith;4013
+joseph haydn;4014
+einherjer;4015
+four tops;4016
+giles, giles and fripp;4017
+iona brown;4018
+bon jovi;4019
+kunt and the gang;4020
+motionless in white;4021
+sabrina;4022
+jillian aversa;4023
+the persuasions;4024
+tub ring;4025
+sacred warrior;4026
+susan boyle;4027
+la the darkman;4028
+savage garden;4029
+lindisfarne;4030
+jag panzer;4031
+dana key;4032
+wouter hamel;4033
+peter green;4034
+jill scott;4035
+wreck and reference;4036
+vanilla fudge;4037
+aerosmith;4038
+megafaun;4039
+poundhound;4040
+2 skinnee j's;4041
+the call;4042
+sonny boy williamson i;4043
+possessed;4044
+annie;4045
+suburban legends;4046
+the mutton birds;4047
+cibo matto;4048
+lorna shore;4049
+terry callier;4050
+selah sue;4051
+derek webb;4052
+parliament;4053
+sister machine gun;4054
+the badlees;4055
+the regrettes;4056
+marlon roudette;4057
+albert king;4058
+rebelution;4059
+shura cherkassky;4060
+dave rodgers;4061
+skip james;4062
+caedmon's call;4063
+asp;4064
+souldecision;4065
+savoir adore;4066
+nathaniel rateliff;4067
+the temperance movement;4068
+dragonland;4069
+isaac alb niz;4070
+falling up;4071
+stabilo;4072
+christ analogue;4073
+big mountain;4074
+sade;4075
+the casting out;4076
+gene autry;4077
+manfred mann;4078
+drive-by truckers;4079
+javier mendoza;4080
+alyson stoner;4081
+blind lemon jefferson;4082
+lowkey;4083
+brad paisley;4084
+jane child;4085
+i declare war;4086
+leo jan ek;4087
+lonestar;4088
+boards of canada;4089
+don henley;4090
+choclair;4091
+obsc(y)re;4092
+donnie mcclurkin;4093
+wall of voodoo;4094
+tony bennett;4095
+hank snow;4096
+poison the well;4097
+fancy;4098
+unbelievable truth;4099
+jet life;4100
+rory block;4101
+rachael lampa;4102
+the platters;4103
+tahiti 80;4104
+the gathering;4105
+brett dennen;4106
+lily allen;4107
+puffy amiyumi;4108
+iamx;4109
+tree63;4110
+bee gees;4111
+e.g. daily;4112
+narnia;4113
+peter green splinter group;4114
+gram parsons;4115
+facing new york;4116
+kalmah;4117
+hot water music;4118
+blessthefall;4119
+general public;4120
+cornershop;4121
+owen pallett;4122
+truth hurts;4123
+quorthon;4124
+kill your idols;4125
+planet funk;4126
+leonardo's bride;4127
+jimmy rushing;4128
+the title;4129
+dog eat dog;4130
+zucchero;4131
+mind.in.a.box;4132
+mainstay;4133
+graham nash;4134
+therion;4135
+bad boys blue;4136
+swizz beatz;4137
+joey + rory;4138
+matthew herbert;4139
+krewella;4140
+lou rhodes;4141
+linda perry;4142
+the pointer sisters;4143
+fit for an autopsy;4144
+yelworc;4145
+jeffrey lewis;4146
+arghoslent;4147
+andy park;4148
+uriah heep;4149
+disciple;4150
+sodom;4151
+beth nielsen chapman;4152
+chuck ragan;4153
+spoons;4154
+haerts;4155
+ice nine kills;4156
+rosie thomas;4157
+guerilla toss;4158
+dizzee rascal;4159
+g4;4160
+mendeed;4161
+empire of the sun;4162
+fergie;4163
+mishka;4164
+tom jones;4165
+the brothers four;4166
+racer x;4167
+clipping.;4168
+stiff little fingers;4169
+the ghost of a saber tooth tiger;4170
+ultimatum;4171
+odyssey eurobeat;4172
+tony banks;4173
+shaggy;4174
+kotoko;4175
+nothingface;4176
+mellowhype;4177
+anthony green;4178
+bulldozer;4179
+abigail williams;4180
+gangsta boo;4181
+jonny lang;4182
+j. cole;4183
+edvard grieg;4184
+the residents;4185
+luther allison;4186
+jean philippe rameau;4187
+randy newman;4188
+jeff the brotherhood;4189
+king missile;4190
+cloud nothings;4191
+fiona;4192
+the walkmen;4193
+nicola benedetti;4194
+birmingham 6;4195
+edge of dawn;4196
+nathan milstein;4197
+the paul butterfield blues band;4198
+he is legend;4199
+ol' dirty bastard;4200
+steve miller band;4201
+craig's brother;4202
+beracah;4203
+new edition;4204
+falconshield;4205
+childish gambino;4206
+am & shawn lee;4207
+unkle;4208
+silver jews;4209
+the interrupters;4210
+william shatner;4211
+adrian belew;4212
+scanner;4213
+slash;4214
+sweatshop union;4215
+dayna kurtz;4216
+krisma;4217
+ferraby lionheart;4218
+new years day;4219
+madchild;4220
+tohoshinki;4221
+busted;4222
+tyler bryant & the shakedown;4223
+vanessa hudgens;4224
+lady gaga;4225
+kosheen;4226
+circle ii circle;4227
+glee cast;4228
+forgotten tales;4229
+renaissance;4230
+lonnie johnson;4231
+swans;4232
+laura marling;4233
+against the current;4234
+ane brun;4235
+the buckinghams;4236
+art bears;4237
+rocky votolato;4238
+the soft boys;4239
+jonathan thulin;4240
+andreas johnson;4241
+pages;4242
+steve moakler;4243
+a.c. newman;4244
+showaddywaddy;4245
+bart davenport;4246
+the streets;4247
+allstar weekend;4248
+death grips;4249
+seventh avenue;4250
+grandmaster flash;4251
+a split-second;4252
+sense field;4253
+maisey rika;4254
+haken;4255
+sharon needles;4256
+arcangelo corelli;4257
+teen suicide;4258
+blue highway;4259
+the megas;4260
+noxious emotion;4261
+keldian;4262
+asian dub foundation;4263
+h.e.r.;4264
+girlpool;4265
+eddie degarmo;4266
+steve taylor;4267
+alex story;4268
+willy porter;4269
+aura dione;4270
+melanie b;4271
+az;4272
+chris isaak;4273
+captain tractor;4274
+anita baker;4275
+anne akiko meyers;4276
+josh rouse;4277
+noname;4278
+tina guo;4279
+the wannadies;4280
+the romantics;4281
+firewind;4282
+less than jake;4283
+owain phyfe;4284
+drag the river;4285
+keith sweat;4286
+kari jobe;4287
+dispatch;4288
+john stewart;4289
+hellhammer;4290
+reflection eternal;4291
+the last dance;4292
+ultimate fakebook;4293
+nomad;4294
+coal chamber;4295
+juice;4296
+gotthard;4297
+richard wagner;4298
+the strypes;4299
+felix mendelssohn bartholdy;4300
+lou gramm;4301
+necro;4302
+vladimir ashkenazy;4303
+the datsuns;4304
+green jellÿ;4305
+johnny duncan;4306
+thingy;4307
+jimmie rodgers;4308
+paul brady;4309
+hazel dickens;4310
+claude king;4311
+wynardtage;4312
+craig morgan;4313
+japandroids;4314
+the almanac singers;4315
+memphis slim;4316
+eddie rabbitt;4317
+mortification;4318
+escape the fate;4319
+tonight alive;4320
+tyler, the creator;4321
+cubanate;4322
+alvin youngblood hart;4323
+steel train;4324
+natasha thomas;4325
+violent soho;4326
+laika;4327
+michael bublé;4328
+folly & the hunter;4329
+nektar;4330
+carnifex;4331
+johan;4332
+matthew good;4333
+backyard babies;4334
+jim kweskin;4335
+beat crusaders;4336
+bullet for my valentine;4337
+oceans ate alaska;4338
+the communards;4339
+quo vadis;4340
+alessi brothers;4341
+1910 fruitgum company;4342
+attila;4343
+christian bautista;4344
+ian anderson;4345
+the grates;4346
+ethel waters;4347
+urthboy;4348
+burlap to cashmere;4349
+nails;4350
+pyotr ilyich tchaikovsky;4351
+keely smith;4352
+milemarker;4353
+truls m rk;4354
+snoop dogg;4355
+akrobatik;4356
+jorge bolet;4357
+jonny diaz;4358
+killer dwarfs;4359
+birth control;4360
+write this down;4361
+high school football heroes;4362
+the frames;4363
+angelo branduardi;4364
+casey bill weldon;4365
+steve green;4366
+bill anderson;4367
+jin akanishi;4368
+kimbra;4369
+nevermore;4370
+garrick ohlsson;4371
+nolongerhuman;4372
+ewan maccoll;4373
+the incredible string band;4374
+hayden;4375
+body count;4376
+charlie parr;4377
+andrew w.k.;4378
+sarah blasko;4379
+vetiver;4380
+борис гребенщиков;4381
+stephen marley;4382
+sage francis;4383
+fischerspooner;4384
+the strokes;4385
+bootsauce;4386
+maurice ravel;4387
+mini mansions;4388
+heartsounds;4389
+vangelis;4390
+artension;4391
+bear's den;4392
+mystic circle;4393
+katie melua;4394
+the mission;4395
+buddy & julie miller;4396
+2 live crew;4397
+calexico;4398
+danielle peck;4399
+devil doll;4400
+the mad conductor;4401
+eliane elias;4402
+brutal truth;4403
+bootsy collins;4404
+saving abel;4405
+robby valentine;4406
+vader;4407
+midnattsol;4408
+jigsaw;4409
+bullets and octane;4410
+company of thieves;4411
+kylie minogue;4412
+sandi thom;4413
+norah jones;4414
+the dillards;4415
+bulletboys;4416
+high places;4417
+neon horse;4418
+anna ternheim;4419
+alessia cara;4420
+wolfgang schneiderhan;4421
+tzu;4422
+t. rex;4423
+treble charger;4424
+culture;4425
+gaelic storm;4426
+elliott smith;4427
+memoryhouse;4428
+the amazing rhythm aces;4429
+balance of power;4430
+coil;4431
+hot rod circuit;4432
+the guess who;4433
+fredrika stahl;4434
+da' t.r.u.t.h.;4435
+toh kay;4436
+gentle giant;4437
+roger whittaker;4438
+the dø;4439
+neil sedaka;4440
+the view;4441
+the lookouts;4442
+twrp;4443
+tinie tempah;4444
+dana winner;4445
+civil war;4446
+haujobb;4447
+the crucified;4448
+tv on the radio;4449
+gil scott-heron;4450
+marit larsen;4451
+robert calvert;4452
+ja rule;4453
+tina charles;4454
+ima robot;4455
+neva dinova;4456
+thaurorod;4457
+lou christie;4458
+scritti politti;4459
+whiskeytown;4460
+while she sleeps;4461
+kristine w;4462
+zilch;4463
+the avett brothers;4464
+the human league;4465
+ilse delange;4466
+tinashe;4467
+mc ren;4468
+exciter;4469
+papa roach;4470
+frank turner;4471
+sol invictus;4472
+anya marina;4473
+these new puritans;4474
+instalok;4475
+tom smith;4476
+maria muldaur;4477
+e-40;4478
+holly golightly;4479
+my bloody valentine;4480
+defiance;4481
+allison moorer;4482
+wynn stewart;4483
+haggard;4484
+all shall perish;4485
+fritz kreisler;4486
+dead prez;4487
+lenny kravitz;4488
+lagwagon;4489
+damian marley;4490
+icehouse;4491
+dionne warwick;4492
+the devil makes three;4493
+drive, she said;4494
+hollow haze;4495
+anita carter;4496
+h.e.a.t;4497
+slaves on dope;4498
+arsis;4499
+samson;4500
+swv;4501
+l.a. guns;4502
+hiatus kaiyote;4503
+quicksilver messenger service;4504
+lea michele;4505
+graf orlock;4506
+tiago iorc;4507
+great northern;4508
+ray wylie hubbard;4509
+super junior;4510
+jerry jeff walker;4511
+hocico;4512
+bukimina;4513
+stretch;4514
+anthony hamilton;4515
+mushroomhead;4516
+evergreen terrace;4517
+michael mcdonald;4518
+leroy hutson;4519
+structures;4520
+dance gavin dance;4521
+other people;4522
+linda davis;4523
+doc walker;4524
+rumer;4525
+chuck berry;4526
+shoffy;4527
+dickey lee;4528
+freddy fender;4529
+batmobile;4530
+the jimi hendrix experience;4531
+teddy pendergrass;4532
+bushwick bill;4533
+gary moore;4534
+arthur beatrice;4535
+hurrah!;4536
+tsunami bomb;4537
+thundra;4538
+herb alpert & the tijuana brass;4539
+the flashbulb;4540
+korpiklaani;4541
+the mendoza line;4542
+vashti bunyan;4543
+unknown mortal orchestra;4544
+jerry garcia;4545
+richard shindell;4546
+i fight dragons;4547
+late tuesday;4548
+chris connor;4549
+the chemical brothers;4550
+virgin steele;4551
+sybreed;4552
+dan le sac vs scroobius pip;4553
+lani hall;4554
+nero;4555
+twin shadow;4556
+gordon downie;4557
+aphrodite's child;4558
+bodyjar;4559
+magic man;4560
+baths;4561
+tender forever;4562
+of monsters and men;4563
+krypteria;4564
+jandek;4565
+r.i.o.;4566
+government issue;4567
+jane monheit;4568
+jeffree star;4569
+moby grape;4570
+silver convention;4571
+michelle wright;4572
+the shroud;4573
+joseph szigeti;4574
+les claypool;4575
+the kills;4576
+beth hart;4577
+bobo in white wooden houses;4578
+twilight force;4579
+larry sparks;4580
+eddie vedder;4581
+glen campbell;4582
+michael hedges;4583
+the red shore;4584
+gloria gaynor;4585
+m2m;4586
+altered images;4587
+youth lagoon;4588
+xv;4589
+kevorkian death cycle;4590
+the sugarcubes;4591
+cryptic slaughter;4592
+candy butchers;4593
+8ball;4594
+jhené aiko;4595
+marcelle meyer;4596
+gaither vocal band;4597
+the boy least likely to;4598
+dean martin;4599
+journey;4600
+jackie evancho;4601
+esperanza spalding;4602
+the time;4603
+tampa red;4604
+poison;4605
+demis roussos;4606
+maria solheim;4607
+living colour;4608
+brutality;4609
+avatar;4610
+katie armiger;4611
+paul wilbur;4612
+tokio hotel;4613
+larue;4614
+golden earring;4615
+saint etienne;4616
+the go! team;4617
+james gang;4618
+master p;4619
+billie holiday;4620
+they might be giants;4621
+paul simon;4622
+flashlight brown;4623
+dreezy;4624
+david byron;4625
+kim boyce;4626
+masta killa;4627
+dmx;4628
+flesh-n-bone;4629
+june carter cash;4630
+crucial conflict;4631
+persephone;4632
+asher roth;4633
+angel dust;4634
+johnny crash;4635
+amy ray;4636
+lang lang;4637
+two hours traffic;4638
+attrition;4639
+carole king;4640
+sarah mclachlan;4641
+adolf busch;4642
+absolution project;4643
+justin bieber;4644
+penny mclean;4645
+velvet belly;4646
+matt nathanson;4647
+holly cole;4648
+diana vickers;4649
+ella mai;4650
+mark eitzel;4651
+the associates;4652
+vortech;4653
+in strict confidence;4654
+helloween;4655
+chasing victory;4656
+ninety pound wuss;4657
+jackie gleason;4658
+be bop deluxe;4659
+lit;4660
+the bruisers;4661
+amanda jenssen;4662
+crowder;4663
+ulcerate;4664
+mewithoutyou;4665
+joe purdy;4666
+t-bone walker;4667
+sarah brightman;4668
+chihiro onitsuka;4669
+trophy eyes;4670
+the receiving end of sirens;4671
+king's x;4672
+chad brownlee;4673
+battlelore;4674
+refused;4675
+far-less;4676
+brandy;4677
+black stone cherry;4678
+regina spektor;4679
+house of pain;4680
+220 volt;4681
+ty herndon;4682
+tarja;4683
+aaron tippin;4684
+slightly stoopid;4685
+nic jones;4686
+animal collective;4687
+converge;4688
+freddie king;4689
+kelela;4690
+the samples;4691
+draconian;4692
+the klezmatics;4693
+rob crow;4694
+country joe and the fish;4695
+cage;4696
+benny sings;4697
+wolfstone;4698
+w-inds.;4699
+lin-manuel miranda;4700
+azealia banks;4701
+zap mama;4702
+death threat;4703
+will stratton;4704
+audio adrenaline;4705
+jars of clay;4706
+shamir;4707
+miranda cosgrove;4708
+ginuwine;4709
+hell razah;4710
+thy primordial;4711
+weh;4712
+jason aldean;4713
+philippe entremont;4714
+smog;4715
+benny hester;4716
+otis redding;4717
+the methadones;4718
+karnivool;4719
+tennessee ernie ford;4720
+bob rivers;4721
+r. kelly;4722
+gang starr;4723
+king crimson;4724
+as we fight;4725
+sons of butcher;4726
+of the wand & the moon;4727
+the cars;4728
+salt the wound;4729
+fair to midland;4730
+lydia;4731
+terri clark;4732
+chris chameleon;4733
+the project hate mcmxcix;4734
+shadow gallery;4735
+black francis;4736
+gym class heroes;4737
+bill nelson;4738
+an cafe;4739
+smokey robinson;4740
+the duhks;4741
+broadway;4742
+fetty wap;4743
+billy sprague;4744
+norther;4745
+six feet under;4746
+republica;4747
+liam finn;4748
+highway 101;4749
+maureen mcgovern;4750
+dim mak;4751
+the go-betweens;4752
+old man's child;4753
+fall out boy;4754
+mia x;4755
+bbmak;4756
+cannabis corpse;4757
+ludacris;4758
+creature feature;4759
+miss montreal;4760
+eric burdon & the animals;4761
+bumblefoot;4762
+origin;4763
+capital cities;4764
+doro;4765
+nox arcana;4766
+hateen;4767
+the band perry;4768
+metallica;4769
+buffalo springfield;4770
+the hooters;4771
+peaches & herb;4772
+krystal meyers;4773
+paul rodgers;4774
+memphis jug band;4775
+sofie;4776
+maurizio pollini;4777
+brook benton;4778
+die verbannten kinder evas;4779
+brainiac;4780
+bearstronaut;4781
+pierce the veil;4782
+matthew friedberger;4783
+victims family;4784
+hell or highwater;4785
+emerson drive;4786
+caro emerald;4787
+louis jordan;4788
+the osmonds;4789
+unspoken;4790
+craig cardiff;4791
+tomohisa yamashita;4792
+ari hest;4793
+dj jazzy jeff & the fresh prince;4794
+dj drama & lil wayne;4795
+ziggy;4796
+waters;4797
+シド (sid);4798
+jeannie c. riley;4799
+tilly and the wall;4800
+karen matheson;4801
+blind passengers;4802
+brotha lynch hung;4803
+harry nilsson;4804
+young money;4805
+marike jager;4806
+good shoes;4807
+tom tom club;4808
+animaniacs;4809
+solitude aeturnus;4810
+george strait;4811
+akinyele;4812
+the honorary title;4813
+carl wilson;4814
+irving;4815
+izzy stradlin;4816
+the carter family;4817
+gzr;4818
+axxis;4819
+marc e. bassy;4820
+chipmunk;4821
+gigi d'agostino;4822
+tindersticks;4823
+the dears;4824
+the ocean;4825
+raul malo;4826
+joel plaskett;4827
+trooper;4828
+arnold schoenberg;4829
+zayn;4830
+hellsongs;4831
+leftöver crack;4832
+the waterboys;4833
+levon helm;4834
+gehenna;4835
+force majeure;4836
+take 6;4837
+amy rigby;4838
+lil baby;4839
+north star;4840
+mott the hoople;4841
+dorsal atlântica;4842
+sabrina carpenter;4843
+julee cruise;4844
+blu cantrell;4845
+willard grant conspiracy;4846
+faron young;4847
+celtic woman;4848
+tony joe white;4849
+petal;4850
+titus andronicus;4851
+son house;4852
+hungry lights;4853
+eskimo callboy;4854
+marnie;4855
+seth walker;4856
+the herd;4857
+asobi seksu;4858
+david crosby;4859
+billy joe royal;4860
+jinkx monsoon;4861
+the replacements;4862
+iio;4863
+binärpilot;4864
+young guns;4865
+mel tormé;4866
+the baseballs;4867
+aphex twin;4868
+mike ness;4869
+snot;4870
+blind willie mctell;4871
+icon for hire;4872
+jeff tweedy;4873
+swollen members;4874
+carly rae jepsen;4875
+crystal viper;4876
+larry the cable guy;4877
+andre nickatina & equipto;4878
+raphael saadiq;4879
+mesmerize;4880
+in flames;4881
+kristy thirsk;4882
+headlights;4883
+deathspell omega;4884
+joan armatrading;4885
+red red meat;4886
+devlin;4887
+ladyhawke;4888
+ween;4889
+mates of state;4890
+husky rescue;4891
+rhett akins;4892
+godsmack;4893
+grandpa jones;4894
+matt maher;4895
+ill niño;4896
+flatfoot 56;4897
+josh garrels;4898
+obey the brave;4899
+damh the bard;4900
+sara lov;4901
+charles aznavour;4902
+thin lizzy;4903
+fatherson;4904
+tristania;4905
+binary star;4906
+warlock;4907
+ray j;4908
+jamie grace;4909
+mildred bailey;4910
+heartsrevolution;4911
+the dreaming;4912
+the feelies;4913
+lucy rose;4914
+artifacts;4915
+m (uk);4916
+the nits;4917
+the outfield;4918
+freelance whales;4919
+4minute;4920
+babyland;4921
+propagandhi;4922
+lil johnson;4923
+digital summer;4924
+the tenors;4925
+simply red;4926
+de-phazz;4927
+benny goodman;4928
+foxy brown;4929
+brainstorm;4930
+helen kane;4931
+les humphries singers;4932
+the prids;4933
+bruce cockburn;4934
+superbus;4935
+ben e. king;4936
+cryonic temple;4937
+renard;4938
+tom robinson;4939
+the cover girls;4940
+i, the breather;4941
+worm is green;4942
+commodores;4943
+julie doiron;4944
+portugal. the man;4945
+mike & the mechanics;4946
+7 year bitch;4947
+venetian snares;4948
+johnny mercer;4949
+grandaddy;4950
+malia;4951
+azure ray;4952
+carolina liar;4953
+nitronoise;4954
+rufus wainwright;4955
+the internet;4956
+j-zone;4957
+coronatus;4958
+celestial season;4959
+jimmy buffett;4960
+freeway;4961
+danielson;4962
+zwan;4963
+boys night out;4964
+julia jacklin;4965
+the fair sex;4966
+okkervil river;4967
+the raveonettes;4968
+wilson phillips;4969
+elane;4970
+andi deris;4971
+rocking chairs;4972
+michael johnson;4973
+rush;4974
+the oak ridge boys;4975
+rev. edward w. clayborn;4976
+bonnie bianco;4977
+ryan bingham;4978
+jeremy enigk;4979
+godhead;4980
+before their eyes;4981
+arsonists get all the girls;4982
+empyrium;4983
+bedlight for blue eyes;4984
+rehab;4985
+dead sara;4986
+snafu;4987
+roy acuff;4988
+the partridge family;4989
+malcolm holcombe;4990
+portastatic;4991
+thievery corporation;4992
+elysium;4993
+the other;4994
+beardfish;4995
+vanessa amorosi;4996
+babylon whores;4997
+pink floyd;4998
+nana grizol;4999
+charly mcclain;5000
+chico debarge;5001
+ten masked men;5002
+beloved;5003
+the shamen;5004
+charlie peacock;5005
+andr s schiff;5006
+castanets;5007
+cold cave;5008
+michael schenker group;5009
+house vs. hurricane;5010
+the pentangle;5011
+weerd science;5012
+ghost brigade;5013
+daysend;5014
+manfred mann's earth band;5015
+electrelane;5016
+smokin' joe kubek & bnois king;5017
+walter becker;5018
+el perro del mar;5019
+heaven & earth;5020
+amebix;5021
+stanfour;5022
+vallenfyre;5023
+tankard;5024
+circa waves;5025
+louis lortie;5026
+hammock;5027
+birdman;5028
+esther phillips;5029
+garnet rogers;5030
+icona pop;5031
+chvrches;5032
+donald fagen;5033
+kim mitchell;5034
+canibus;5035
+woods of ypres;5036
+four letter lie;5037
+eug ne ysa e;5038
+biz markie;5039
+tila tequila;5040
+blue mountain;5041
+buddy holly;5042
+rootwater;5043
+obie trice;5044
+ferlin husky;5045
+hinds;5046
+emmylou harris;5047
+hop along;5048
+ian moore;5049
+roger;5050
+micky & the motorcars;5051
+bebel gilberto;5052
+rod stewart;5053
+richard smallwood;5054
+deftones;5055
+suicide silence;5056
+the wrights;5057
+rudolf firku n ;5058
+god module;5059
+gregory alan isakov;5060
+angus & julia stone;5061
+real mccoy;5062
+kate miller-heidke;5063
+eric's trip;5064
+woe, is me;5065
+rex orange county;5066
+jascha heifetz;5067
+novembre;5068
+ministry;5069
+graveyard;5070
+tone damli;5071
+alien sex fiend;5072
+eazy-e;5073
+jj72;5074
+sublime;5075
+george formby;5076
+trapeze;5077
+mikhail pletnev;5078
+roger clyne & the peacemakers;5079
+rita coolidge;5080
+xxxtentacion;5081
+liz durrett;5082
+collective soul;5083
+second person;5084
+mat kearney;5085
+mr. 3-2;5086
+ezio;5087
+the republic of wolves;5088
+stepdad;5089
+mike oldfield;5090
+red sun rising;5091
+snfu;5092
+saul williams;5093
+justin townes earle;5094
+upon a burning body;5095
+bruce hungerford;5096
+jeff scott soto;5097
+big business;5098
+dan tyminski;5099
+infected mushroom;5100
+elend;5101
+john waite;5102
+hilary hahn;5103
+austin mahone;5104
+pretty girls make graves;5105
+billy idol;5106
+stephen malkmus;5107
+moddi;5108
+galahad;5109
+diana krall;5110
+the be good tanyas;5111
+!distain;5112
+the isley brothers;5113
+marilyn manson;5114
+blackthorn;5115
+ordinary time;5116
+dragonforce;5117
+i blame coco;5118
+this providence;5119
+t-pain;5120
+capleton;5121
+dc talk;5122
+leon redbone;5123
+optimus rhyme;5124
+zedd;5125
+black label society;5126
+gary brooker;5127
+melvins;5128
+mindless faith;5129
+the warning;5130
+bombshell rocks;5131
+the unthanks;5132
+secrets;5133
+joan jett and the blackhearts;5134
+funeral for a friend;5135
+aorta;5136
+roger glover;5137
+nitzer ebb;5138
+amber pacific;5139
+sneaker pimps;5140
+insomnium;5141
+danger radio;5142
+lay low;5143
+russ;5144
+bliss n eso;5145
+dj antoine;5146
+to kill a king;5147
+dubstar;5148
+by the tree;5149
+imelda may;5150
+emil gilels;5151
+redbone;5152
+the highwaymen;5153
+fear;5154
+ry cooder;5155
+ludo;5156
+mance lipscomb;5157
+shawn colvin;5158
+bongzilla;5159
+the promise ring;5160
+dr. dog;5161
+ronnie dunn;5162
+the buffoons;5163
+aimee mann;5164
+chase & status;5165
+rose maddox;5166
+lights;5167
+akissforjersey;5168
+tommy shaw;5169
+rotersand;5170
+x japan;5171
+richie furay;5172
+provision;5173
+gordon lightfoot;5174
+primus;5175
+die sektor;5176
+megadeth;5177
+agnetha fältskog;5178
+angelspit;5179
+machine gun kelly;5180
+father;5181
+cherry ghost;5182
+nana;5183
+ensign;5184
+björk;5185
+styx;5186
+cinema bizarre;5187
+tiamat;5188
+chris mills;5189
+rachael sage;5190
+prāta vētra;5191
+the hold steady;5192
+phil lynott;5193
+brian hyland;5194
+we as human;5195
+the wallflowers;5196
+kalan porter;5197
+freddie hart;5198
+corey crowder;5199
+the angels of light;5200
+papermoon;5201
+tommy mcclennan;5202
+the paper chase;5203
+ikon;5204
+happy monster band;5205
+modern talking;5206
+philadelphia orchestra;5207
+hellyeah;5208
+heart of a coward;5209
+state property;5210
+howling bells;5211
+shalamar;5212
+the geraldine fibbers;5213
+toots & the maytals;5214
+walter trout;5215
+michael o'brien;5216
+sweet;5217
+hate eternal;5218
+carnival in coal;5219
+céline dion;5220
+lee hazlewood;5221
+amy holland;5222
+defleshed;5223
+irma thomas;5224
+the chieftains;5225
+dexter freebish;5226
+the lads;5227
+peter bradley adams;5228
+front line assembly;5229
+blindside;5230
+vulfpeck;5231
+kontrust;5232
+smosh;5233
+boy & bear;5234
+cruachan;5235
+berried alive;5236
+the raconteurs;5237
+dälek;5238
+julie andrews;5239
+spoon;5240
+mad caddies;5241
+she keeps bees;5242
+martha tilston;5243
+le butcherettes;5244
+the vines;5245
+mothers;5246
+biohazard;5247
+doug macleod;5248
+down;5249
+maestro fresh-wes;5250
+boston;5251
+oh susanna;5252
+goldfrapp;5253
+sons of bill;5254
+fun people;5255
+the crown;5256
+tim maia;5257
+sevdaliza;5258
+the little willies;5259
+cupcakke;5260
+poxy boggards;5261
+damon intrabartolo;5262
+mostly autumn;5263
+jim reeves;5264
+dir en grey;5265
+robin beck;5266
+the sounds;5267
+migos;5268
+de/vision;5269
+larry santos;5270
+combichrist;5271
+milla jovovich;5272
+luba;5273
+sharon van etten;5274
+forevermore;5275
+roger daltrey;5276
+lunik;5277
+maroon;5278
+the rolling stones;5279
+jon secada;5280
+yehudi menuhin;5281
+stompin' tom connors;5282
+r.l. burnside;5283
+a tortured soul;5284
+con hunley;5285
+the supernaturals;5286
+the kooks;5287
+jeff beck;5288
+pokey lafarge;5289
+watermark;5290
+au revoir simone;5291
+matthew barber;5292
+u-god;5293
+blaze bayley;5294
+haste the day;5295
+chase rice;5296
+ariana grande;5297
+bukka white;5298
+skew siskin;5299
+monster magnet;5300
+the oh hellos;5301
+the pop group;5302
+haim;5303
+bay city rollers;5304
+mustasch;5305
+mc magic;5306
+sherbet;5307
+the tea party;5308
+the choir;5309
+woody guthrie;5310
+hypocrisy;5311
+big maceo;5312
+the psychedelic furs;5313
+ariel pink;5314
+fourplay;5315
+paw;5316
+beirut;5317
+french kicks;5318
+the ronettes;5319
+the durutti column;5320
+therefore i am;5321
+d-a-d;5322
+eric martin;5323
+andrea schroeder;5324
+john hiatt;5325
+incantation;5326
+lisa marie presley;5327
+high and mighty color;5328
+vonda shepard;5329
+asphyx;5330
+israel vibration;5331
+lordi;5332
+nikki yanofsky;5333
+the box tops;5334
+jorma kaukonen;5335
+juice newton;5336
+woody's a girl;5337
+matthew logan vasquez;5338
+aretha franklin;5339
+buddy miller;5340
+ayreon;5341
+mediæval bæbes;5342
+accept;5343
+robert schuman;5344
+mina;5345
+will.i.am;5346
+marc almond;5347
+nomeansno;5348
+defeater;5349
+suggs;5350
+grobschnitt;5351
+amon düül ii;5352
+feeling left out;5353
+breaking benjamin;5354
+george michael;5355
+bracket;5356
+all-4-one;5357
+niccol paganini;5358
+la coka nostra;5359
+ozzy osbourne;5360
+the bobs;5361
+jay reatard;5362
+satellites;5363
+gwen stacy;5364
+soulspell;5365
+anchor;5366
+girls' generation;5367
+lacy j. dalton;5368
+lil mama;5369
+florrie;5370
+lesley gore;5371
+sara k.;5372
+upon this dawning;5373
+barry adamson;5374
+christine and the queens;5375
+timbaland;5376
+pj morton;5377
+the divine comedy;5378
+bascom lamar lunsford;5379
+cilla black;5380
+spank rock;5381
+jeff healey;5382
+molotov solution;5383
+matthew fisher;5384
+francis poulenc;5385
+anata;5386
+sara watkins;5387
+amy macdonald;5388
+coldworker;5389
+london symphony;5390
+danny brown;5391
+kandi;5392
+bic runga;5393
+iggy pop;5394
+die toten hosen;5395
+tuck & patti;5396
+richard goode;5397
+blind boy fuller;5398
+sonny & cher;5399
+jeremih;5400
+spiritual front;5401
+d'espairsray;5402
+shirley bassey;5403
+annuals;5404
+bros;5405
+charlotte martin;5406
+ramones;5407
+paper route;5408
+cretin;5409
+streetwalkers;5410
+number one gun;5411
+the smithereens;5412
+belvedere;5413
+tech n9ne;5414
+corinne bailey rae;5415
+dawn of ashes;5416
+into eternity;5417
+corb lund;5418
+faz l say;5419
+current swell;5420
+albert collins;5421
+lita ford;5422
+rudimental;5423
+nightmare of you;5424
+josef lh vinne;5425
+stephen bishop;5426
+richie kotzen;5427
+run-d.m.c.;5428
+the moody blues;5429
+divinyls;5430
+good old war;5431
+anders johansson;5432
+gandalf's fist;5433
+baccara;5434
+miniature tigers;5435
+rare earth;5436
+the walker brothers;5437
+molly johnson;5438
+we five;5439
+the verve pipe;5440
+the foreshadowing;5441
+sam the sham & the pharaohs;5442
+fifth harmony;5443
+madonna;5444
+juluka;5445
+dynazty;5446
+michael schulte;5447
+thompson twins;5448
+lil wyte;5449
+mike batt;5450
+trial;5451
+cisco houston;5452
+callisto;5453
+darkest hour;5454
+brian may;5455
+david wilcox;5456
+napalm death;5457
+never heard of it;5458
+jj grey & mofro;5459
+eric andersen;5460
+billie piper;5461
+chromeo;5462
+youri egorov;5463
+aaron lewis;5464
+vince neil;5465
+buddy guy & junior wells;5466
+róisín murphy;5467
+in the woods...;5468
+aesma daeva;5469
+electric guest;5470
+murray mclauchlan;5471
+thee oh sees;5472
+nick kamen;5473
+eloy;5474
+brian eno;5475
+rabbit junk;5476
+suidakra;5477
+mint condition;5478
+extreme;5479
+kelley stoltz;5480
+mattafix;5481
+tiara thomas;5482
+fugees;5483
+warpaint;5484
+selena gomez & the scene;5485
+hot boy$;5486
+allen toussaint;5487
+skrillex;5488
+john schneider;5489
+midlake;5490
+the supremes;5491
+rodney crowell;5492
+everyone everywhere;5493
+theatre of tragedy;5494
+s.f.a.;5495
+dreamtale;5496
+count bass d;5497
+ivy sole;5498
+bobby blue bland;5499
+popcaan;5500
+tab benoit;5501
+grinspoon;5502
+grizzly bear;5503
+kirk franklin;5504
+grateful dead;5505
+21 guns;5506
+scouting for girls;5507
+fay lovsky;5508
+the fullblast;5509
+death;5510
+the elected;5511
+transvision vamp;5512
+keith urban;5513
+left spine down;5514
+the nylons;5515
+alien ant farm;5516
+otep;5517
+ashton shepherd;5518
+paradise lost;5519
+hello saferide;5520
+john kay;5521
+the beach boys;5522
+gregory porter;5523
+ricky martin;5524
+wayne hancock;5525
+youn sun nah;5526
+winger;5527
+havoc;5528
+los campesinos!;5529
+shaun cassidy;5530
+chevelle;5531
+barbara mason;5532
+rita wilson;5533
+richie havens;5534
+scythe;5535
+d.r.i.;5536
+matt andersen;5537
+fifth angel;5538
+trail of tears;5539
+asaf avidan & the mojos;5540
+christopher lee;5541
+tripod;5542
+crywank;5543
+tank;5544
+tom paxton;5545
+leon russell;5546
+adam green;5547
+anarbor;5548
+the unicorns;5549
+evidence;5550
+stetsasonic;5551
+the gabe dixon band;5552
+prince;5553
+day26;5554
+rhythms del mundo;5555
+saviour machine;5556
+alina simone;5557
+dick haymes;5558
+hugh laurie;5559
+jc chasez;5560
+johnny clegg & savuka;5561
+rivers of nihil;5562
+overkill;5563
+guy;5564
+memphis slim & willie dixon;5565
+jocelyn & chris arndt;5566
+mechanical moth;5567
+pat benatar;5568
+eden's curse;5569
+gene pitney;5570
+rodriguez;5571
+jamala;5572
+jerry garcia band;5573
+demon;5574
+backstreet boys;5575
+cocorosie;5576
+savatage;5577
+rosemary clooney;5578
+amerie;5579
+ian dury and the blockheads;5580
+pantokrator;5581
+the lox;5582
+supertramp;5583
+carnal forge;5584
+this is hell;5585
+papooz;5586
+julia holter;5587
+traffic;5588
+gary lewis & the playboys;5589
+leopold godowsky;5590
+inferi;5591
+remembering never;5592
+the radio dept.;5593
+blind willie johnson;5594
+gary chapman;5595
+mutual benefit;5596
+dragonette;5597
+crooked fingers;5598
+black mountain;5599
+shampoo;5600
+onslaught;5601
+big moe;5602
+the tragically hip;5603
+dead by april;5604
+john parr;5605
+chameleon circuit;5606
+all;5607
+greeley estates;5608
+herbie hancock;5609
+karmakanic;5610
+coffin break;5611
+blood orange;5612
+alborosie;5613
+aeternus;5614
+rich boy;5615
+cledus t. judd;5616
+bobby brown;5617
+zebra;5618
+scott matthew;5619
+winter's bane;5620
+kane & abel;5621
+jackson c. frank;5622
+maura o'connell;5623
+color me badd;5624
+christina aguilera;5625
+3rd bass;5626
+danny gokey;5627
+galactic cowboys;5628
+sabaton;5629
+howard jones;5630
+s.p.o.c.k;5631
+heather alexander;5632
+ingested;5633
+terror jr;5634
+enuff z'nuff;5635
+the gothic archies;5636
+robert ellis;5637
+nancy wilson;5638
+dead kennedys;5639
+milow;5640
+hall & oates;5641
+the mynabirds;5642
+grass widow;5643
+mew;5644
+chris young;5645
+crest of darkness;5646
+b.j. thomas;5647
+sister sledge;5648
+john lee hooker and canned heat;5649
+screeching weasel;5650
+cassandra wilson;5651
+terry reid;5652
+maps;5653
+katy perry;5654
+swmrs;5655
+neurotech;5656
+george gershwin;5657
+april wine;5658
+powerwolf;5659
+yellowcard;5660
+the kry;5661
+barbarossa;5662
+blackguard;5663
+rjd2;5664
+angelo de augustine;5665
+brand nubian;5666
+iron reagan;5667
+snap!;5668
+the expos;5669
+paula abdul;5670
+bahamas;5671
+olive;5672
+gene simmons;5673
+augustana;5674
+vicious crusade;5675
+mennen;5676
+arsonists;5677
+fred penner;5678
+amen;5679
+mae;5680
+the stylistics;5681
+bill monroe;5682
+aeon zen;5683
+paul williams;5684
+ultraviolet sound;5685
+omnia;5686
+dave cousins;5687
+silent stream of godless elegy;5688
+david lindley;5689
+title fight;5690
+stevie nicks;5691
+disturbed;5692
+the lumineers;5693
+wondermints;5694
+necromantia;5695
+anton bruckner;5696
+john hammond;5697
+counterparts;5698
+the pursuit of happiness;5699
+dougie maclean;5700
+domo genesis;5701
+keaton henson;5702
+the electric hellfire club;5703
+casting crowns;5704
+her space holiday;5705
+lindi ortega;5706
+toy dolls;5707
+kobra and the lotus;5708
+velvet acid christ;5709
+hafdís huld;5710
+dead infection;5711
+blues traveler;5712
+hawthorne heights;5713
+emf;5714
+the secret handshake;5715
+einstürzende neubauten;5716
+rednex;5717
+aztec camera;5718
+heart in hand;5719
+easyworld;5720
+shlomo mintz;5721
+earl wild;5722
+french montana;5723
+prime sth;5724
+craig david;5725
+blind pilot;5726
+stratovarius;5727
+nina nesbitt;5728
+fiddler's green;5729
+skyclad;5730
+caitlyn smith;5731
+daniel lavoie;5732
+diamond rio;5733
+the four lads;5734
+die warzau;5735
+funker vogt;5736
+black tusk;5737
+bob seger;5738
+labyrinth;5739
+teodasia;5740
+magnapop;5741
+dødheimsgard;5742
+barrio boyzz;5743
+jesse malin;5744
+the brothers johnson;5745
+the obsessed;5746
+lucky boys confusion;5747
+lemon jelly;5748
+cock sparrer;5749
+itzhak perlman;5750
+amberian dawn;5751
+moxy früvous;5752
+ugress;5753
+the thermals;5754
+common;5755
+torres;5756
+badlands;5757
+ron kenoly;5758
+wide mouth mason;5759
+run kid run;5760
+qntal;5761
+patty larkin;5762
+the answer;5763
+la bouche;5764
+abba;5765
+melanie thornton;5766
+limp bizkit;5767
+danny wilde;5768
+against all authority;5769
+志方あきこ;5770
+a plea for purging;5771
+chris caffery;5772
+7l & esoteric;5773
+jagged edge;5774
+allo darlin';5775
+domenico scarlatti;5776
+b-legit;5777
+hundreds;5778
+jonathan richman and the modern lovers;5779
+catharsis;5780
+son volt;5781
+electric valentine;5782
+gino vannelli;5783
+call the cops;5784
+miss may i;5785
+double you;5786
+the soul stirrers;5787
+adrenaline mob;5788
+timothy seth avett as darling;5789
+raging fyah;5790
+ナイトメア (nightmare);5791
+selena gomez;5792
+franco battiato;5793
+sons of seasons;5794
+aaron shust;5795
+august alsina;5796
+ghoul;5797
+mustard plug;5798
+the white stripes;5799
+dead stop;5800
+slim;5801
+project 86;5802
+lower dens;5803
+stephen fretwell;5804
+off!;5805
+psychostick;5806
+radney foster;5807
+black uhuru;5808
+one without;5809
+the presidents of the united states of america;5810
+phil ochs;5811
+stealers wheel;5812
+the angels;5813
+joy williams;5814
+nick jonas;5815
+owl city;5816
+the gourds;5817
+cowboy junkies;5818
+cru;5819
+the rembrandts;5820
+useless id;5821
+jessica andrews;5822
+big black;5823
+my brightest diamond;5824
+johnny kidd & the pirates;5825
+moonface;5826
+angra;5827
+john mccutcheon;5828
+sharon jones & the dap-kings;5829
+brother dege;5830
+john p. kee;5831
+armin van buuren;5832
+houndmouth;5833
+the spencer davis group;5834
+poison idea;5835
+carach angren;5836
+the horrors;5837
+johnny paycheck;5838
+primal fear;5839
+joanna newsom;5840
+weezer;5841
+bluehorses;5842
+architecture in helsinki;5843
+steve mcconnell;5844
+townes van zandt;5845
+johnny cash & june carter cash;5846
+dust of basement;5847
+j dilla;5848
+too pure to die;5849
+car seat headrest;5850
+rita springer;5851
+max romeo;5852
+calabrese;5853
+trey anastasio;5854
+young thug;5855
+harry and the potters;5856
+clifford t. ward;5857
+confederate railroad;5858
+ice mc;5859
+tyler shaw;5860
+will smith;5861
+anna tivel;5862
+the pierces;5863
+sabbat;5864
+papercuts;5865
+trouble over tokyo;5866
+a storm of light;5867
+earl scruggs;5868
+limahl;5869
+band of horses;5870
+suspyre;5871
+shy girls;5872
+blood;5873
+madness;5874
+georgie fame;5875
+glass tiger;5876
+50 cent;5877
+mike doughty;5878
+y-o-u;5879
+ass ponys;5880
+mary gauthier;5881
+goldfinger;5882
+delbert mcclinton;5883
+freak kitchen;5884
+lalaine;5885
+destiny's child;5886
+thirsty merc;5887
+daniel bedingfield;5888
+armored saint;5889
+¡mayday!;5890
+cheryl cole;5891
+richie spice;5892
+luscious jackson;5893
+altan;5894
+evanescence;5895
+luther vandross;5896
+steve wariner;5897
+deadlock;5898
+brenda lee;5899
+noe venable;5900
+korn;5901
+the letter black;5902
+star fucking hipsters;5903
+daniel o'donnell;5904
+theatres des vampires;5905
+the dogma;5906
+ria mae;5907
+thy art is murder;5908
+mali music;5909
+eydie gorme;5910
+housefires;5911
+brooks & dunn;5912
+million dead;5913
+kurt vile;5914
+3lw;5915
+helix;5916
+judy collins;5917
+albert hammond;5918
+coroner;5919
+red flag;5920
+ralph vaughan williams;5921
+infected rain;5922
+ann wilson;5923
+anthony evans;5924
+christina milian;5925
+taco;5926
+lee greenwood;5927
+jon anderson;5928
+bun b;5929
+skye sweetnam;5930
+britney spears;5931
+peter serkin;5932
+saywecanfly;5933
+gordon haskell;5934
+grouplove;5935
+walter egan;5936
+malinky;5937
+mandy barnett;5938
+mystery skulls;5939
+jeremy larson;5940
+charley patton;5941
+modern english;5942
+inhale exhale;5943
+avantasia;5944
+huntingtons;5945
+shudder to think;5946
+the brand new heavies;5947
+slice the cake;5948
+sick of sarah;5949
+winds;5950
+the rakes;5951
+ray lamontagne;5952
+haircut 100;5953
+your demise;5954
+exposé;5955
+narada michael walden;5956
+lord of the lost;5957
+the rubettes;5958
+aloe blacc;5959
+jeff wayne;5960
+in this moment;5961
+the move;5962
+machine men;5963
+orenda fink;5964
+tina dico;5965
+ziggy marley & the melody makers;5966
+noël coward;5967
+bonfire;5968
+hawkwind;5969
+jessie j;5970
+emitt rhodes;5971
+john martyn;5972
+blue öyster cult;5973
+the silver shine;5974
+tex ritter;5975
+kishi bashi;5976
+sonny james;5977
+mind's eye;5978
+the sleeping;5979
+the derek trucks band;5980
+atmosphere;5981
+lauren daigle;5982
+taj weekes & adowa;5983
+ricky skaggs;5984
+alabama shakes;5985
+black star;5986
+gong;5987
+viper;5988
+fruit bats;5989
+allie x;5990
+josh woodward;5991
+kungfu rick;5992
+the weavers;5993
+blood, sweat & tears;5994
+the stooges;5995
+the white birch;5996
+john mayall;5997
+diamond d;5998
+luigi boccherini;5999
+half man half biscuit;6000
+ralph mctell;6001
+lisa brokop;6002
+son lux;6003
+wumpscut;6004
+beneath the massacre;6005
+nine inch nails;6006
+ancient rites;6007
+drop dead, gorgeous;6008
+no mercy;6009
+lene lovich;6010
+widowmaker;6011
+the microphones;6012
+rita connolly;6013
+generation x;6014
+assassin;6015
+horse feathers;6016
+lola monroe;6017
+bette midler;6018
+gentleman;6019
+the crystal method;6020
+crystal bernard;6021
+black 47;6022
+starbenders;6023
+landmine marathon;6024
+múm;6025
+califone;6026
+allister;6027
+for all those sleeping;6028
+fgfc820;6029
+anner bylsma;6030
+lilys;6031
+triptykon;6032
+danger doom;6033
+big k.r.i.t.;6034
+assemblage 23;6035
+howie day;6036
+miike snow;6037
+diiv;6038
+the rapture;6039
+the civil wars;6040
+liege lord;6041
+nicolette larson;6042
+slayer;6043
+2pac;6044
+16;6045
+grigory sokolov;6046
+martin carthy;6047
+bowerbirds;6048
+ginger;6049
+luka bloom;6050
+young jeezy;6051
+kate rusby;6052
+the pretty things;6053
+2 chainz;6054
+man overboard;6055
+paul tortelier;6056
+doyle lawson & quicksilver;6057
+timber timbre;6058
+johnny rodriguez;6059
+and one;6060
+grand funk railroad;6061
+kiss;6062
+elysian fields;6063
+ace enders & a million different people;6064
+hoods;6065
+frenzal rhomb;6066
+army of freshmen;6067
+unter null;6068
+frankie lee sims;6069
+joe;6070
+sahara hotnights;6071
+alison moyet;6072
+janis ian;6073
+delinquent habits;6074
+heffron drive;6075
+jackie wilson;6076
+osi;6077
+mobb deep;6078
+corey cerovsek;6079
+pras;6080
+buckcherry;6081
+dave davies;6082
+beto vázquez infinity;6083
+drowning the light;6084
+attack attack!;6085
+day of fire;6086
+ida haendel;6087
+ytcracker;6088
+u2;6089
+stan freberg;6090
+saint lu;6091
+jj;6092
+robert bradley's blackwater surprise;6093
+pharrell williams;6094
+no trend;6095
+darkwell;6096
+van dyke parks;6097
+cannonball statman;6098
+thursday;6099
+kathleen edwards;6100
+sentenced;6101
+crown the empire;6102
+crimson thorn;6103
+george benson;6104
+the concretes;6105
+kahimi karie;6106
+jimi hendrix;6107
+dock boggs;6108
+tom vek;6109
+(spunge);6110
+the refreshments;6111
+twilightning;6112
+the-dream;6113
+portishead;6114
+eamon;6115
+soulfly;6116
+gungor;6117
+stemm;6118
+mc hammer;6119
+the kinks;6120
+budgie;6121
+pistol annies;6122
+froggy fresh;6123
+shaye;6124
+tlc;6125
+the chameleons;6126
+boss hogg outlawz;6127
+starbomb;6128
+sleigh bells;6129
+inquisition;6130
+h-blockx;6131
+the ting tings;6132
+johnny hallyday;6133
+night ranger;6134
+bowes & morley;6135
+ronan keating;6136
+d'sound;6137
+miki howard;6138
+sadat x;6139
+gloria estefan;6140
+mighty sparrow;6141
+the shins;6142
+starfield;6143
+plan b;6144
+marcus orelias;6145
+neaera;6146
+miss black america;6147
+arena;6148
+assück;6149
+a. l. lloyd;6150
+los lobos;6151
+artur schnabel;6152
+boyce avenue;6153
+chixdiggit!;6154
+stevie wonder;6155
+howlin rain;6156
+junior wells;6157
+con funk shun;6158
+nickel creek;6159
+there for tomorrow;6160
+brian peters;6161
+mgmt;6162
+his name is alive;6163
+jaya the cat;6164
+chiodos;6165
+teacup monster;6166
+har mar superstar;6167
+alkaline trio;6168
+kidz bop;6169
+baxter;6170
+scary bitches;6171
+ron wood;6172
+dashboard confessional;6173
+iron fire;6174
+shimshai;6175
+carl maria von weber;6176
+aversions crown;6177
+apartment 26;6178
+alcazar;6179
+skeleton key;6180
+the burning hell;6181
+bonded by blood;6182
+bob marley & the wailers;6183
+spirit;6184
+the jackson 5;6185
+george morgan;6186
+alabama 3;6187
+the sensational alex harvey band;6188
+drapht;6189
+nevertheless;6190
+china crisis;6191
+selena;6192
+bodies of water;6193
+crush 40;6194
+architects;6195
+darius rucker;6196
+smile.dk;6197
+puhdys;6198
+savoy brown;6199
+rose royce;6200
+plumb;6201
+roger waters;6202
+doc watson;6203
+neal morse;6204
+edwin fischer;6205
+tracy chapman;6206
+the who;6207
+cal smith;6208
+mos def;6209
+matt cardle;6210
+dr. dre;6211
+ronnie milsap;6212
+anthrax;6213
+tw walsh;6214
+numb;6215
+jenny o.;6216
+lock up;6217
+bear in heaven;6218
+susanna hoffs;6219
+jessie ware;6220
+eric bogle;6221
+johnny thunders;6222
+advance base;6223
+sara gazarek;6224
+misfits;6225
+the used;6226
+catie curtis;6227
+thundercat;6228
+derek minor;6229
+basshunter;6230
+johann pachelbel;6231
+whitehorse;6232
+bessie smith;6233
+mike garrigan;6234
+brownie mcghee;6235
+otis rush;6236
+negative;6237
+slugdge;6238
+bang gang;6239
+debbie gibson;6240
+da vinci's notebook;6241
+nargaroth;6242
+champion jack dupree;6243
+hubert kah;6244
+ben lee;6245
+gary barlow;6246
+math and physics club;6247
+eighteen visions;6248
+supersuckers;6249
+the fixx;6250
+amazing blondel;6251
+morcheeba;6252
+petula clark;6253
+the youngbloods;6254
+carman;6255
+southern culture on the skids;6256
+evan taubenfeld;6257
+tracy grammer;6258
+maritime;6259
+randy vanwarmer;6260
+believer;6261
+the grapes of wrath;6262
+angelina;6263
+john prine;6264
+dee snider;6265
+jon and vangelis;6266
+the seer;6267
+george jones;6268
+amanda perez;6269
+lonnie donegan;6270
+lara fabian;6271
+dr. acula;6272
+cockney rejects;6273
+junkie xl;6274
+nasty c;6275
+the shadows;6276
+this beautiful republic;6277
+12 rods;6278
+action action;6279
+jann arden;6280
+the oppressed;6281
+emanuel;6282
+jennifer rush;6283
+ufo;6284
+leroy anderson;6285
+the submarines;6286
+vanilla ice;6287
+the murder city devils;6288
+johnny mathis;6289
+michael card;6290
+rheostatics;6291
+no knife;6292
+curved air;6293
+mu330;6294
+bat for lashes;6295
+abigor;6296
+new boyz;6297
+nb ridaz;6298
+brian mcfadden;6299
+kylesa;6300
+randy bachman;6301
+brandon heath;6302
+the adicts;6303
+longwave;6304
+earl thomas conley;6305
+lethian dreams;6306
+conchita wurst;6307
+southgang;6308
+david oistrakh;6309
+a change of pace;6310
+it lives, it breathes;6311
+the format;6312
+enrico caruso;6313
+ron hawkins;6314
+sufjan stevens;6315
+krystian zimerman;6316
+it prevails;6317
+redrama;6318
+warcloud;6319
+the kids from fame;6320
+brodka;6321
+nujabes;6322
+bt;6323
+voltaire;6324
+one be lo;6325
+damon & naomi;6326
+datarock;6327
+willow smith;6328
+snowy white;6329
+mercyful fate;6330
+veruca salt;6331
+joe bonamassa;6332
+rag'n'bone man;6333
+aqualung;6334
+solomon burke;6335
+vicious rumors;6336
+spitalfield;6337
+hardcore superstar;6338
+vern gosdin;6339
+yendri;6340
+pernice brothers;6341
+vangough;6342
+t-bone;6343
+sopor aeternus;6344
+day at the fair;6345
+feed her to the sharks;6346
+dokken;6347
+teena marie;6348
+json;6349
+kaipa;6350
+hidden in plain view;6351
+belinda carlisle;6352
+neil cicierega;6353
+brenda russell;6354
+esther ofarim;6355
+sea wolf;6356
+fm laeti;6357
+james hunter;6358
+the soul of john black;6359
+david bromberg;6360
+marc cohn;6361
+the duskfall;6362
+galantis;6363
+tender;6364
+martin jondo;6365
+ricky nelson;6366
+jeremy spencer;6367
+colin linden;6368
+john cooper clarke;6369
+the seekers;6370
+abra moore;6371
+breathe carolina;6372
+lily & madeleine;6373
+disarmonia mundi;6374
+circa survive;6375
+matt bianco;6376
+celtic thunder;6377
+flowing tears;6378
+lee fields;6379
+kate ryan;6380
+meg & dia;6381
+evils toy;6382
+choirboys;6383
+tedashii;6384
+poni hoax;6385
+yo yo ma;6386
+the broken family band;6387
+l'âme immortelle;6388
+psalters;6389
+benjamin britten;6390
+shakira;6391
+rabia sorda;6392
+ad;6393
+the cross;6394
+billy boy arnold;6395
+florida georgia line;6396
+t.i.;6397
+marty willson-piper;6398
+client;6399
+jack ingram;6400
+sash!;6401
+deathstars;6402
+the english beat;6403
+mitchel musso;6404
+wintersleep;6405
+the smiths;6406
+fleetwood mac;6407
+molly hatchet;6408
+pet shop boys;6409
+daryl hall;6410
+after all;6411
+j moss;6412
+rüfüs du sol;6413
+exodus;6414
+francis dunnery;6415
+4 strings;6416
+heather headley;6417
+besatt;6418
+foreigner;6419
+the pussycat dolls;6420
+serena ryder;6421
+white lion;6422
+tim hardin;6423
+harry connick, jr.;6424
+islands;6425
+arrested development;6426
+coco montoya;6427
+arcana;6428
+marduk;6429
+keith whitley;6430
+al martino;6431
+scatman john;6432
+marvin gaye & tammi terrell;6433
+eddie murphy;6434
+rihanna;6435
+candi staton;6436
+my favorite;6437
+the trews;6438
+coven 13;6439
+leo sayer;6440
+lil' keke;6441
+jewel;6442
+firehouse;6443
+claudio monteverdi;6444
+negative approach;6445
+ben caplan;6446
+aïboforcen;6447
+ryan shupe & the rubberband;6448
+bethel music;6449
+the courteeners;6450
+mortal love;6451
+yung lean;6452
+altar boys;6453
+aesop rock;6454
+bret michaels;6455
+die so fluid;6456
+don mclean;6457
+my morning jacket;6458
+unearthly trance;6459
+the war on drugs;6460
+limp;6461
+drake bell;6462
+cky;6463
+b.b. king;6464
+mama cass;6465
+dirty heads;6466
+buffy sainte-marie;6467
+dire straits;6468
+menudo;6469
+dolorian;6470
+natalie imbruglia;6471
+flunk;6472
+carpark north;6473
+fatso jetson;6474
+hourglass;6475
+graveland;6476
+bert jansch;6477
+the left banke;6478
+the sound;6479
+nancy lamott;6480
+the mighty mighty bosstones;6481
+lake of tears;6482
+paper aeroplanes;6483
+the archies;6484
+the wonder years;6485
+con brio;6486
+the treatment;6487
+fats domino;6488
+heaven shall burn;6489
+the suburbs;6490
+laura gibson;6491
+an angle;6492
+hunters & collectors;6493
+jellyfish;6494
+jim capaldi;6495
+hybrid;6496
+five for fighting;6497
+kevin fowler;6498
+douwe bob;6499
+george nozuka;6500
+american head charge;6501
+roots manuva;6502
+cephalic carnage;6503
+prodigy;6504
+far east movement;6505
+the spinners;6506
+plankeye;6507
+pitbull;6508
+reggie and the full effect;6509
+frightened rabbit;6510
+i see stars;6511
+lorde;6512
+vanessa williams;6513
+oh land;6514
+luciano;6515
+shockwave;6516
+i set my friends on fire;6517
+eagles of death metal;6518
+above & beyond;6519
+mindless self indulgence;6520
+sun kil moon;6521
+strawbs;6522
+vision divine;6523
+julien-k;6524
+the velvet teen;6525
+jack greene;6526
+devo;6527
+caesar;6528
+ritchie valens;6529
+andr previn;6530
+116 clique;6531
+ali project;6532
+leæther strip;6533
+take that;6534
+five iron frenzy;6535
+eugenio finardi;6536
+star one;6537
+barbara mandrell;6538
+to speak of wolves;6539
+massive ego;6540
+my ticket home;6541
+the riverboat gamblers;6542
+epic rap battles of history;6543
+revolver;6544
+waltari;6545
+k. michelle;6546
+natural;6547
+madder mortem;6548
+crystal gayle;6549
+yoko ono;6550
+robbie fulks;6551
+xandria;6552
+beyoncé;6553
+beth hirsch;6554
+passion pit;6555
+magnum;6556
+william beckett;6557
+the beautiful south;6558
+shakin' stevens;6559
+samantha fox;6560
+england dan & john ford coley;6561
+orchestral manoeuvres in the dark;6562
+chrom;6563
+deana carter;6564
+dan seals;6565
+crimson moonlight;6566
+alvin lee;6567
+army of lovers;6568
+the friday night boys;6569
+chris august;6570
+stephen lynch;6571
+loudon wainwright iii;6572
+the helio sequence;6573
+partynextdoor;6574
+roberta flack;6575
+mr. bungle;6576
+sóley;6577
+bruce springsteen;6578
+dmitry bashkirov;6579
+billy preston;6580
+department of eagles;6581
+denison witmer;6582
+modest petrovich mussorgsky;6583
+antonio meneses;6584
+kathryn williams;6585
+jim ed brown;6586
+arabesque;6587
+m83;6588
+johann strauss ii;6589
+agnes;6590
+alannah myles;6591
+most precious blood;6592
+incubus;6593
+rialto;6594
+a.c.t;6595
+klone;6596
+jp cooper;6597
+hate dept.;6598
+anderson .paak;6599
+viva voce;6600
+talisco;6601
+survivor;6602
+the manhattan transfer;6603
+van cliburn;6604
+maylene and the sons of disaster;6605
+brendan perry;6606
+derek and the dominos;6607
+kovacs;6608
+the association;6609
+fischer-z;6610
+fred neil;6611
+letlive;6612
+aberfeldy;6613
+onyx;6614
+dig;6615
+smokie;6616
+gabriel brown;6617
+jakob dylan;6618
+imogen heap;6619
+lacrosse;6620
+the kovenant;6621
+lotte kestner;6622
+das pop;6623
+andreya triana;6624
+the delmore brothers;6625
+talking heads;6626
+ty england;6627
+we are the in crowd;6628
+nick drake;6629
+dead or alive;6630
+jessica harp;6631
+deathgaze;6632
+more machine than man;6633
+rage;6634
+kim churchill;6635
+5 chinese brothers;6636
+emperor;6637
+the mavericks;6638
+aloha;6639
+999;6640
+leonard cohen;6641
+gabrielle aplin;6642
+living sacrifice;6643
+matt mays;6644
+too bad eugene;6645
+crystal fighters;6646
+harlan howard;6647
+wendy matthews;6648
+danny kirwan;6649
+john barrowman;6650
+those dancing days;6651
+thor;6652
+digger;6653
+steve earle;6654
+penal colony;6655
+davey suicide;6656
+rise against;6657
+iron maiden;6658
+world party;6659
+daforce;6660
+the monkees;6661
+yukmouth;6662
+demolition hammer;6663
+edguy;6664
+winds of plague;6665
+flatsound;6666
+james "j.t." taylor;6667
+dave alvin;6668
+dimmu borgir;6669
+kreator;6670
+pop etc;6671
+c.w. mccall;6672
+green river ordinance;6673
+dave dudley;6674
+steely dan;6675
+murderdolls;6676
+de la soul;6677
+the dayton family;6678
+transatlantic;6679
+neneh cherry;6680
+pete townshend;6681
+the red jumpsuit apparatus;6682
+abandon all ships;6683
+john brown's body;6684
+karl wolf;6685
+los pericos;6686
+this mortal coil;6687
+emily haines;6688
+pretenders;6689
+boytronic;6690
+bloodgood;6691
+unit;6692
+boyzone;6693
+ian & sylvia;6694
+jesse harris & the ferdinandos;6695
+haley reinhart;6696
+sinergy;6697
+gareth gates;6698
+kevin lyttle;6699
+cast;6700
+britny fox;6701
+jack jones;6702
+billy joel;6703
+the maine;6704
+david bazan;6705
+hate;6706
+intwine;6707
+pigface;6708
+lali puna;6709
+david usher;6710
+the shirelles;6711
+gerald moore;6712
+nicole dollanganger;6713
+rex goudie;6714
+pablo casals;6715
+the veils;6716
+low pop suicide;6717
+graham colton;6718
+john entwistle;6719
+meshuggah;6720
+chris webby;6721
+jimmy dawkins;6722
+russ taff;6723
+gabriella cilmi;6724
+balto;6725
+peetie wheatstraw;6726
+gary clark jr.;6727
+the clovers;6728
+the agony scene;6729
+roland grapow;6730
+gary stewart;6731
+buddy guy;6732
+necrodeath;6733
+st. vincent;6734
+firefall;6735
+slechtvalk;6736
+the boys;6737
+the twins;6738
+chandeen;6739
+originoo gunn clappaz;6740
+annie herring;6741
+shai linne;6742
+steve carlson;6743
+the gray havens;6744
+matthew sweet and susanna hoffs;6745
+livingston taylor;6746
+pro-pain;6747
+elbow;6748
+mandalay;6749
+sirenia;6750
+modern skirts;6751
+jasmine thompson;6752
+marcia griffiths;6753
+the swellers;6754
+michael monroe;6755
+tom rosenthal;6756
+florence + the machine;6757
+thunder;6758
+amber;6759
+grave;6760
+violent work of art;6761
+gene vincent;6762
+sarina paris;6763
+polyenso;6764
+mark seymour & the undertow;6765
+tism;6766
+the dubliners;6767
+bonnie raitt;6768
+michelle williams;6769
+blank & jones;6770
+walls of jericho;6771
+lupe fiasco;6772
+james marsters;6773
+metal church;6774
+excision;6775
+keith murray;6776
+john ogdon;6777
+the low anthem;6778
+chris merritt;6779
+maysa leak;6780
+henry fiat's open sore;6781
+ernestine anderson;6782
+capital lights;6783
+the cooper temple clause;6784
+alan jackson;6785
+hey;6786
+pathology;6787
+randy rogers band;6788
+tink;6789
+flesh field;6790
+vinyl theatre;6791
+dystopia;6792
+jill barber;6793
+the long blondes;6794
+the color morale;6795
+giuseppe verdi;6796
+karin park;6797
+amy grant;6798
+miasmal;6799
+gene watson;6800
+page & plant;6801
+acid witch;6802
+adagio;6803
+lisa ekdahl;6804
+martin page;6805
+triumvirat;6806
+s.l.a.b.;6807
+8ball & mjg;6808
+better luck next time;6809
+marble sounds;6810
+whitney houston;6811
+ringo starr;6812
+the scabs;6813
+the whitest boy alive;6814
+third eye blind;6815
+thornley;6816
+herbert grönemeyer;6817
+stereo skyline;6818
+amos lee;6819
+shawn james;6820
+parkway drive;6821
+trippie redd;6822
+vanessa carlton;6823
+guardian;6824
+blowsight;6825
+san francisco symphony;6826
+five;6827
+geoff moore;6828
+david meece;6829
+enigma;6830
+cold chisel;6831
+the impossibles;6832
+sondre lerche;6833
+hey monday;6834
+sol gabetta;6835
+machinae supremacy;6836
+p.m. dawn;6837
+johnny rivers;6838
+mickey newbury;6839
+sandy & junior;6840
+taylor swift;6841
+aaradhna;6842
+edward elgar;6843
+lamb of god;6844
+antestor;6845
+doom;6846
+arcturus;6847
+kingdom come;6848
+flo rida;6849
+afroman;6850
+mickey gilley;6851
+donna regina;6852
+erasure;6853
+leonard nimoy;6854
+susan raye;6855
+die form;6856
+allure;6857
+quincy punx;6858
+secret army;6859
+perry como;6860
+aldo nova;6861
+the loud family;6862
+the suicide file;6863
+maggie rogers;6864
+melanie;6865
+andrew lloyd webber;6866
+sparks;6867
+angels & agony;6868
+shooter jennings;6869
+m people;6870
+the story so far;6871
+aqua;6872
+scott matthews;6873
+chemlab;6874
+apollo sunshine;6875
+mason proper;6876
+bobby o;6877
+action adventure world;6878
+hawk nelson;6879
+audrey assad;6880
+jerry lee lewis;6881
+andy m. stewart;6882
+grimskunk;6883
+rudy vallée;6884
+erin mccarley;6885
+z-ro;6886
+crooked still;6887
+nadeah;6888
+marah;6889
+sinner;6890
+saint raymond;6891
+devin townsend;6892
+james young;6893
+dillon;6894
+blackrain;6895
+crisis;6896
+loney, dear;6897
+dark the suns;6898
+damone;6899
+big big train;6900
+pride & glory;6901
+cancer bats;6902
+dismantled;6903
+the cheetah girls;6904
+the saints;6905
+jonathan larson;6906
+everything but the girl;6907
+glass hammer;6908
+chairlift;6909
+scarve;6910
+mai kuraki;6911
+travie mccoy;6912
+steeleye span;6913
+chris de burgh;6914
+james otto;6915
+chris rea;6916
+forever slave;6917
+bobby vee;6918
+susannah mccorkle;6919
+goodie mob;6920
+girls aloud;6921
+lake street dive;6922
+frankenstein drag queens from planet 13;6923
+vance joy;6924
+anubis;6925
+sabrina starke;6926
+level;6927
+kick axe;6928
+nanci griffith;6929
+ben moody;6930
+epica;6931
+field music;6932
+lady antebellum;6933
+grieves;6934
+osborne brothers;6935
+judie tzuke;6936
+rick james;6937
+willie d;6938
+at the gates;6939
+daniel shafran;6940
+mr. mister;6941
+beulah;6942
+robert cray;6943
+hatesphere;6944
+tim fite;6945
+clay walker;6946
+the four seasons;6947
+vintage trouble;6948
+the rankin family;6949
+chaka khan;6950
+paul wall;6951
+brian wilson;6952
+alestorm;6953
+明星 (akeboshi);6954
+billy gilman;6955
+jennifer kimball;6956
+the charlie daniels band;6957
+john mellencamp;6958
+revamp;6959
+lost in tears;6960
+just jinger;6961
+mental as anything;6962
+dark tranquillity;6963
+c-lekktor;6964
+the trammps;6965
+hb;6966
+the ready set;6967
+joey tempest;6968
+baton rouge;6969
+lana del rey;6970
+ms. dynamite;6971
+radio birdman;6972
+cows;6973
+pansy division;6974
+klaatu;6975
+ryan delmore;6976
+laurie anderson;6977
+trespassers william;6978
+geoff berner;6979
+black sabbath;6980
+eric saade;6981
+meja;6982
+cannibal corpse;6983
+with confidence;6984
+jim jones;6985
+blake shelton;6986
+the nightwatchman;6987
+elephant man;6988
+massive attack;6989
+lee kernaghan;6990
+tomorrows bad seeds;6991
+aynsley lister;6992
+hanoi rocks;6993
+mono inc.;6994
+linton kwesi johnson;6995
+richard & linda thompson;6996
+lard;6997
+tq;6998
+blue café;6999
+blackjack;7000
+inspectah deck;7001
+harry james;7002
+no fun at all;7003
+bad brains;7004
+herman's hermits;7005
+intense;7006
+mink deville;7007
+future islands;7008
+the midnight beast;7009
+sarah connor;7010
+psycho motel;7011
+last train home;7012
+alberta cross;7013
+helen humes;7014
+easton corbin;7015
+yngwie malmsteen;7016
+tinfed;7017
+thee silver mt. zion;7018
+lalah hathaway;7019
+sheavy;7020
+slaughter;7021
+tunng;7022
+the four freshmen;7023
+cymbals eat guitars;7024
+debarge;7025
+the beatles;7026
+lower definition;7027
+michael martin murphey;7028
+bury tomorrow;7029
+twista;7030
+hazel o'connor;7031
+kerry livgren;7032
+6ix9ine;7033
+paul colman trio;7034
+clean bandit;7035
+seam;7036
+dodgy;7037
+the lovin' spoonful;7038
+art garfunkel;7039
+lee ryan;7040
+la sera;7041
+c.w. stoneking;7042
+johann christian bach;7043
+8 foot sativa;7044
+beast in black;7045
+straylight run;7046
+saves the day;7047
+robyn;7048
+the blasters;7049
+petra;7050
+halou;7051
+the neighbourhood;7052
+runrig;7053
+billy ocean;7054
+die young;7055
+rainbirds;7056
+play;7057
+atomic kitten;7058
+hallelujah the hills;7059
+amaranthe;7060
+koffin kats;7061
+swingin' utters;7062
+charlie musselwhite;7063
+phillips, craig & dean;7064
+tarkio;7065
+avulsed;7066
+unwritten law;7067
+al axy;7068
+freddie mercury;7069
+rhye;7070
+giorgio moroder;7071
+elegy;7072
+blue stahli;7073
+zachary richard;7074
+børns;7075
+bob moses;7076
+chromatics;7077
+the barr brothers;7078
+gorefest;7079
+countess;7080
+caetano veloso;7081
+jerry butler;7082
+steve harley & cockney rebel;7083
+michelle branch;7084
+dr. john;7085
+the business;7086
+handsome ghost;7087
+naked eyes;7088
+amduscia;7089
+scooter;7090
+bride;7091
+white sea;7092
+empire! empire! (i was a lonely estate);7093
+mastodon;7094
+amon amarth;7095
+mike posner;7096
+dove cameron;7097
+desmond dekker;7098
+the rugburns;7099
+harley poe;7100
+skinless;7101
+secret garden;7102
+colter wall;7103
+john cena;7104
+jump5;7105
+cat stevens;7106
+tim curry;7107
+the classic crime;7108
+grey delisle;7109
+ramblin' jack elliott;7110
+the world is a beautiful place & i am no longer afraid to die;7111
+the spinto band;7112
+the chainsmokers;7113
+clawfinger;7114
+the thrills;7115
+ultravox;7116
+quasi;7117
+wonder girls;7118
+the psycho realm;7119
+alice in chains;7120
+bill wyman's rhythm kings;7121
+quiet riot;7122
+rome;7123
+fair warning;7124
+to/die/for;7125
+lari white;7126
+backseat goodbye;7127
+rhett miller;7128
+eliza neals;7129
+conway twitty;7130
+asg;7131
+highasakite;7132
+arthur grumiaux;7133
+pieter wispelwey;7134
+union j;7135
+jon randall;7136
+allies;7137
+modern romance;7138
+krisiun;7139
+s;7140
+killa kyleon;7141
+adam lambert;7142
+skrew;7143
+imagination movers;7144
+jade warrior;7145
+beborn beton;7146
+webbie;7147
+richard o'brien;7148
+ella fitzgerald & louis armstrong;7149
+denis matsuev;7150
+kultur shock;7151
+la dispute;7152
+sean watkins;7153
+tex williams;7154
+cary brothers;7155
+joan of arc;7156
+dead hand projekt;7157
+dio;7158
+big tymers;7159
+tokyo police club;7160
+midge ure;7161
+adam cohen;7162
+forever changed;7163
+more than life;7164
+avenged sevenfold;7165
+lifetime;7166
+the clash;7167
+five finger death punch;7168
+zac brown band;7169
+just surrender;7170
+roosevelt sykes;7171
+cory morrow;7172
+岡崎律子 (ritsuko okazaki);7173
+88 fingers louie;7174
+brenton wood;7175
+bill mallonee;7176
+keziah jones;7177
+lebanon hanover;7178
+raspberries;7179
+mygrain;7180
+sita;7181
+twenty one pilots;7182
+bill withers;7183
+family force 5;7184
+t.a.t.u.;7185
+rainbow;7186
+marianas trench;7187
+cascada;7188
+the watchmen;7189
+cold;7190
+annette hanshaw;7191
+james labrie;7192
+viktoria mullova;7193
+altaria;7194
+icon of coil;7195
+oliver koletzki;7196
+kansas;7197
+michael bolton;7198
+keren ann;7199
+lil pump;7200
+john lennon & yoko ono;7201
+jim lauderdale;7202
+the manhattans;7203
+stacey kent;7204
+lee brice;7205
+endo;7206
+dani siciliano;7207
+game theory;7208
+lykke li;7209
+hoobastank;7210
+lil suzy;7211
+since october;7212
+tinchy stryder;7213
+camera obscura;7214
+macabre;7215
+16 volt;7216
+merle haggard;7217
+i am abomination;7218
+nana mouskouri;7219
+tift merritt;7220
+optiganally yours;7221
+dwight twilley;7222
+evgeny kissin;7223
+david grisman;7224
+ohio players;7225
+liberty x;7226
+matt haimovitz;7227
+robbie williams;7228
+grayson & whitter;7229
+dream evil;7230
+beau;7231
+t.j. miller;7232
+the bar-kays;7233
+u.d.o.;7234
+keke palmer;7235
+ambrosia;7236
+aswad;7237
+henry mancini;7238
+frida hyvönen;7239
+shakespears sister;7240
+celph titled;7241
+pigeon john;7242
+on broken wings;7243
+javier;7244
+billy currington;7245
+a silent film;7246
+sly & the family stone;7247
+stephen duffy;7248
+vicki lawrence;7249
+lucky dube;7250
+the georgia satellites;7251
+bonnie tyler;7252
+press play;7253
+the academy is...;7254
+strfkr;7255
+steeler;7256
+haystak;7257
+frank black;7258
+hot chocolate;7259
+margo price;7260
+club 8;7261
+rick astley;7262
+bone thugs-n-harmony;7263
+true widow;7264
+mark king;7265
+the rascals;7266
+vanden plas;7267
+cocteau twins;7268
+london elektricity;7269
+the triffids;7270
+jolie holland;7271
+sole;7272
+ben taylor;7273
+equatronic;7274
+the million dollar quartet;7275
+jean beauvoir;7276
+marillion;7277
+tom cochrane;7278
+friend 'n fellow;7279
+jeremy camp;7280
+7th cycle;7281
+peter, paul & mary;7282
+hadise;7283
+dark angel;7284
+dark age;7285
+bo diddley;7286
+the pineapple thief;7287
+eli sostre;7288
+psapp;7289
+a loss for words;7290
+taylor dayne;7291
+hanne kah;7292
+paul young;7293
+swim deep;7294
+kanye west;7295
+scott mckenzie;7296
+epmd;7297
+anthony phillips;7298
+silent civilian;7299
+strange majik;7300
+lisa thiel;7301
+shad;7302
+joseph arthur and the lonely astronauts;7303
+a life once lost;7304
+scarling.;7305
+the stanley brothers;7306
+nat stuckey;7307
+full blown chaos;7308
+mark knopfler & emmylou harris;7309
+the wedding;7310
+the weepies;7311
+geto boys;7312
+disclosure;7313
+tonio k;7314
+brave saint saturn;7315
+the proclaimers;7316
+matthew ryan;7317
+ernest tubb;7318
+cibelle;7319
+cause for effect;7320
+milburn;7321
+cherry poppin' daddies;7322
+berlin;7323
+reverend bizarre;7324
+brazzaville;7325
+george "harmonica" smith;7326
+garfunkel and oates;7327
+ashley tisdale;7328
+james mcmurtry;7329
+vendetta;7330
+bon iver;7331
+steelheart;7332
+horrorpops;7333
+neuroticfish;7334
+airbourne;7335
+brick & lace;7336
+dead meadow;7337
+the cave singers;7338
+moneybrother;7339
+audra mcdonald;7340
+ensiferum;7341
+2 unlimited;7342
+mason jennings;7343
+a flock of seagulls;7344
+tiger army;7345
+bikini kill;7346
+the vibrators;7347
+the pillows;7348
+shocking blue;7349
+luca turilli;7350
+soen;7351
+roger miret and the disasters;7352
+magica;7353
+mac demarco;7354
+antimatter;7355
+tommy james;7356
+nine lashes;7357
+discordance axis;7358
+against me!;7359
+stavesacre;7360
+the meads of asphodel;7361
+blood stain child;7362
+ohbijou;7363
+little simz;7364
+abigail washburn;7365
+the felice brothers;7366
+running wild;7367
+a.a. bondy;7368
+bitch;7369
+die happy;7370
+pepper;7371
+long john baldry;7372
+the eyes of a traitor;7373
+fran ois couperin;7374
+the shangri-las;7375
+diana ross & the supremes and the temptations;7376
+mgła;7377
+the scary jokes;7378
+spandau ballet;7379
+sophie ellis-bextor;7380
+stauros;7381
+chris cornell;7382
+slade;7383
+medina;7384
+dr. feelgood;7385
+louis armstrong;7386
+sticky fingaz;7387
+steve perry;7388
+the vincent black shadow;7389
+procol harum;7390
+pig;7391
+a house;7392
+dead and divine;7393
+artillery;7394
+tape five;7395
+tommy dorsey;7396
+new york dolls;7397
+mull historical society;7398
+richard thompson;7399
+don francisco;7400
+charles manson;7401
+rotten sound;7402
+stevie b;7403
+scapegoat;7404
+soul embraced;7405
+the sound of animals fighting;7406
+steven isserlis;7407
+marissa nadler;7408
+mia doi todd;7409
+warbringer;7410
+giuseppe tartini;7411
+the coup;7412
+samhain;7413
+hue and cry;7414
+cut copy;7415
+blue tears;7416
+steve forbert;7417
+kingdom of sorrow;7418
+aviators;7419
+soulsavers;7420
+ike & tina turner;7421
+the band;7422
+drenge;7423
+sarah jarosz;7424
+stephan eicher;7425
+killing joke;7426
+nominon;7427
+texas in july;7428
+justin hayward;7429
+gary puckett & the union gap;7430
+kings of convenience;7431
+look what i did;7432
+sarah harmer;7433
+king adora;7434
+danny michel;7435
+iration;7436
+kate & anna mcgarrigle;7437
+pep love;7438
+eric burdon;7439
+ace frehley;7440
+wolfgang amadeus mozart;7441
+the working title;7442
+knife party;7443
+wizard;7444
+a lot like birds;7445
+cody simpson;7446
+land of talk;7447
+the kentucky headhunters;7448
+matt pond pa;7449
+casual;7450
+anaïs mitchell;7451
+phoebe carrai;7452
+funkadelic;7453
+richard strauss;7454
+ray charles;7455
+s.o.a.p.;7456
+tnt;7457
+nas;7458
+the commitments;7459
+lany;7460
+and also the trees;7461
+the rumour said fire;7462
+bonaparte;7463
+the moog;7464
+rza;7465
+northern kings;7466
+the pale fountains;7467
+for today;7468
+fyfe;7469
+mumakil;7470
+air;7471
+periphery;7472
+the xx;7473
+yvonne elliman;7474
+murs;7475
+joy division;7476
+p.o.d.;7477
+informatik;7478
+fu manchu;7479
+razakel;7480
+dappled cities;7481
+christine fellows;7482
+ion dissonance;7483
+don gibson;7484
+anvil;7485
+lobo;7486
+phillip boa & the voodooclub;7487
+pulp;7488
+moriz rosenthal;7489
+run level zero;7490
+the funeral pyre;7491
+american juniors;7492
+steelwing;7493
+chic;7494
+gustav holst;7495
+pyramaze;7496
+f-minus;7497
+wyrd;7498
+wolfpakk;7499
+kool keith;7500
+sleeping giant;7501
+the loved ones;7502
+the echoing green;7503
+edgar winter;7504
+angelus apatrida;7505
+dante;7506
+senses fail;7507
+dave carter & tracy grammer;7508
+gary wright;7509
+the saturdays;7510
+blindspott;7511
+falkenbach;7512
+outkast;7513
+xp8;7514
+pentatonix;7515
+yourcodenameis;7516
+moloko;7517
+darkane;7518
+kris kristofferson;7519
+barry mcguire;7520
+cancerslug;7521
+hangar;7522
+sirus;7523
+sammy davis jr.;7524
+trademark;7525
+dennis deyoung;7526
+kyuss;7527
+naia izumi;7528
+whose line is it anyway? cast;7529
+bastro;7530
+flora cash;7531
+turnpike troubadours;7532
+sam cooke;7533
+matisyahu;7534
+jungle rot;7535
+weekend nachos;7536
+microwave dave & the nukes;7537
+dan hartman;7538
+the cramps;7539
+cryptic wintermoon;7540
+apocalypse hoboken;7541
+fugazi;7542
+tye tribbett;7543
+karla bonoff;7544
+holly throsby;7545
+from first to last;7546
+fiction family;7547
+21 savage;7548
+free throw;7549
+xlooking forwardx;7550
+flatt & scruggs;7551
+heather nova;7552
+vandenberg;7553
+rick ross;7554
+arcane;7555
+balkan beat box;7556
+alaska thunderfuck;7557
+halsey;7558
+the menzingers;7559
+iona;7560
+peter frampton;7561
+the agonist;7562
+american aquarium;7563
+enon;7564
+cranes;7565
+revocation;7566
+chet atkins;7567
+bound for glory;7568
+the clancy brothers;7569
+kataklysm;7570
+juelz santana;7571
+superchick;7572
+hellogoodbye;7573
+travis;7574
+the grouch;7575
+born from pain;7576
+busdriver;7577
+brooke fraser;7578
+10 years;7579
+the tony rich project;7580
+streetlight manifesto;7581
+i killed the prom queen;7582
+jan howard;7583
+fats waller;7584
+jacques offenbach;7585
+alexis taylor;7586
+turk;7587
+streetheart;7588
+7 seconds;7589
+bruce hornsby;7590
+van hunt;7591
+wynonna judd;7592
+roger chapman;7593
+pink cream 69;7594
+the ballroom thieves;7595
+genesis;7596
+breakdown of sanity;7597
+tommy page;7598
+wormrot;7599
+deuce;7600
+charles bronson;7601
+something corporate;7602
+the showdown;7603
+the lonely island;7604
+bobby valentino;7605
+akon;7606
+steve mason;7607
+dual core;7608
+the church;7609
+j church;7610
+dino ciani;7611
+kent;7612
+emerson, lake & palmer;7613
+the tangent;7614
+the everly brothers;7615
+billy joe shaver;7616
+chavez;7617
+jessica simpson;7618
+sinheresy;7619
+tommy roe;7620
+agnes obel;7621
+the pretty reckless;7622
+jermaine stewart;7623
+soja;7624
+joy;7625
+the aquabats!;7626
+hot dad;7627
+sheryl crow;7628
+yo la tengo;7629
+naked aggression;7630
+neon synthesis;7631
+raghav;7632
+climie fisher;7633
+kenny chesney;7634
+little big town;7635
+alan parsons;7636
+linkin park;7637
+laibach;7638
+dar williams;7639
+наив;7640
+john foxx;7641
+this bike is a pipe bomb;7642
+buck owens & susan raye;7643
+straight faced;7644
+johnny clegg;7645
+player;7646
+gaetano donizetti;7647
+alabama thunderpussy;7648
+khia;7649
+rascal flatts;7650
+the marvelettes;7651
+thirteen senses;7652
+saidian;7653
+nrbq;7654
+trocadero;7655
+i.d.o.4.;7656
+richard swift;7657
+krezip;7658
+abc;7659
+bell book & candle;7660
+vera lynn;7661
+b'z;7662
+girls rituals;7663
+kevin welch;7664
+rich mullins;7665
+down with webster;7666
+tonic;7667
+demiricous;7668
+menomena;7669
+guillemots;7670
+monrose;7671
+carson robison;7672
+porter wagoner;7673
+ivan moravec;7674
+siobhan donaghy;7675
+the bellrays;7676
+the arka teks;7677
+tonedeff;7678
+david essex;7679
+immortal technique;7680
+buddy moss;7681
+acid house kings;7682
+mister monster;7683
+the monochrome set;7684
+the dignity of labour;7685
+shawn desman;7686
+the mamas & the papas;7687
+the handsome family;7688
+endanger;7689
+south park;7690
+karen clark sheard;7691
+dressy bessy;7692
+sweet comfort band;7693
+darden smith;7694
+slim cessna's auto club;7695
+cher;7696
+tom waits;7697
+hanne hukkelberg;7698
+beres hammond;7699
+kayo dot;7700
+the cliks;7701
+years & years;7702
+fraggle rock;7703
+brandtson;7704
+lukas nelson & promise of the real;7705
+furry lewis;7706
+jim noir;7707
+in fear and faith;7708
+ultra;7709
+a global threat;7710
+all them witches;7711
+forbidden;7712
+keith & kristyn getty;7713
+edison glass;7714
+discount;7715
+thrush hermit;7716
+joanie sommers;7717
+vampire weekend;7718
+oi polloi;7719
+the bianca story;7720
+unsung zeros;7721
+circle jerks;7722
+pascal rog ;7723
+the 88;7724
+we shot the moon;7725
+blutengel;7726
+demon hunter;7727
+westside connection;7728
+sho baraka;7729
+the roots;7730
+hoyt axton;7731
+bleach;7732
+the corries;7733
+timbaland & magoo;7734
+carl orff;7735
+emmerson nogueira;7736
+bob geldof;7737
+mortician;7738
+david ball;7739
+blood tsunami;7740
+john denver;7741
+the klf;7742
+mark erelli;7743
+pat travers;7744
+tina turner;7745
+rhonda vincent;7746
+entombed;7747
+stutterfly;7748
+lou rawls;7749
+blaque;7750
+connie smith;7751
+utada hikaru;7752
+zhu;7753
+why don't we;7754
+beachwood sparks;7755
+the l-train;7756
+beth orton;7757
+guiomar novaes;7758
+animosity;7759
+deborah allen;7760
+the real tuesday weld;7761
+janelle monáe;7762
+!!!;7763
+hayley westenra;7764
+avicii;7765
+lil yachty;7766
+brazilian girls;7767
+the babys;7768
+mirah;7769
+rockapella;7770
+the posies;7771
+eden synthetic corps;7772
+bright eyes;7773
+black knights;7774
+hoodoo gurus;7775
+drain sth;7776
+babe ruth;7777
+watain;7778
+lil b;7779
+t'pau;7780
+joe jackson;7781
+june christy;7782
+josh turner;7783
+carly simon;7784
+becoming the archetype;7785
+alexisonfire;7786
+paulson;7787
+make them suffer;7788
+drake;7789
+billy thorpe;7790
+earth, wind & fire;7791
+belleruche;7792
+the housemartins;7793
+the chariot;7794
+jefferson starship;7795
+wolf gang;7796
+roch voisine;7797
+parachute;7798
+ferruccio busoni;7799
+grave digger;7800
+piebald;7801
+bob dylan;7802
+wax;7803
+eric fish;7804
+mark spiro;7805
+the seatbelts;7806
+dawn richard;7807
+usher;7808
+lmfao;7809
+i monster;7810
+mikhail glinka;7811
+nombe;7812
+s club 7;7813
+ray davies;7814
+dusty springfield;7815
+hieroglyphics;7816
+steril;7817
+vladimir sofronitsky;7818
+大塚愛 (ai otsuka);7819
+brendan benson;7820
+cyferdyne;7821
+fantasia;7822
+zola jesus;7823
+october fall;7824
+she & him;7825
+trixter;7826
+damien rice;7827
+delain;7828
+dave stewart & the spiritual cowboys;7829
+little boots;7830
+crystallion;7831
+rusted root;7832
+blanks 77;7833
+dirty looks;7834
+within reason;7835
+casey donahew band;7836
+aythis;7837
+the world of skin;7838
+propaganda;7839
+tom russell;7840
+julia stone;7841
+emigrate;7842
+vaya con dios;7843
+tove lo;7844
+death by stereo;7845
+stevie ray vaughan;7846
+nim vind;7847
+sonny terry & brownie mcghee;7848
+sam phillips;7849
+b! machine;7850
+julio iglesias;7851
+john wetton;7852
+edgar broughton band;7853
+dr. alban;7854
+mars ill;7855
+immaculate machine;7856
+martyr defiled;7857
+free dominguez;7858
+atreyu;7859
+johnny burnette;7860
+maria mena;7861
+spheric universe experience;7862
+whitehouse;7863
+zed;7864
+andrea bocelli;7865
+tommy james and the shondells;7866
+malcolm middleton;7867
+suicide commando;7868
+mark owen;7869
+the string cheese incident;7870
+patti austin;7871
+jethro tull;7872
+jennifer lopez;7873
+dorothy;7874
+ayria;7875
+guru;7876
+burton cummings;7877
+the warren brothers;7878
+a1;7879
+cody jinks;7880
+billy squier;7881
+junius;7882
+never shout never;7883
+the cryan' shames;7884
+the heavy;7885
+myra hess;7886
+television;7887
+sadist;7888
+danger danger;7889
+clique girlz;7890
+jamie lawson;7891
+rosanne cash;7892
+walk off the earth;7893
+scorpions;7894
+kelly clarkson;7895
+unknown;0
+speaker;7896
+singer;7897
diff --git a/jukebox/data/ids/v3_genre_ids.txt b/jukebox/data/ids/v3_genre_ids.txt
new file mode 100644
index 0000000000000000000000000000000000000000..eb8a17b2cd09397d5236d50a47328d4dd6ef3a69
--- /dev/null
+++ b/jukebox/data/ids/v3_genre_ids.txt
@@ -0,0 +1,604 @@
+electroclash;1
+acid rock;2
+christian metal;3
+pop rock;4
+gothic;5
+big beat;6
+psychedelic rock‎;7
+funk carioca;8
+bebop;9
+dance punk;10
+trad jazz;11
+romantic;12
+andean music;13
+volksmusik;14
+coldwave;15
+gospel blues;16
+italian folk;17
+disney;18
+dark wave‏‎;19
+powerviolence;20
+bachata;21
+soft rock;22
+s music"];23
+bubblegum dance;24
+western swing;25
+alternative country;26
+latin pop;27
+eurobeat;28
+n;29
+unblack metal;30
+surf;31
+nu-disco;32
+event;33
+classical;34
+nasheed;35
+jovem guarda;36
+british blues;37
+bossa nova;38
+detroit blues;39
+rock;40
+contemporary christian;41
+dark ambient;42
+noise rock;43
+axé;44
+soca;45
+dance-rock;46
+contemporary jazz;47
+appalachian folk;48
+humppa‎;49
+ambient;50
+funeral doom;51
+southern gospel;52
+video game‎;53
+hip hop;54
+glitch hop;55
+krautrock;56
+breakcore;57
+ska;58
+traditional folk;59
+psychedelic trance;60
+reggae‏‎;61
+noise pop;62
+drumstep;63
+house;64
+teen pop;65
+sea shanties;66
+junkanoo;67
+mandopop;68
+pre-war blues;69
+doom metal;70
+oi-punk;71
+swamp rock;72
+crunkcore;73
+rap rock;74
+roots;75
+country rap;76
+avant-garde;77
+cumbia;78
+glam metal;79
+groove metal;80
+electric blues;81
+new orleans rhythm and blues;82
+canadian hip hop;83
+freestyle;84
+deathgrind;85
+idm;86
+comedy rock;87
+art punk;88
+progg;89
+work songs;90
+art pop;91
+conjunto;92
+persian;93
+parody;94
+jazz-funk;95
+french hip hop;96
+spirituals;97
+african;98
+middle-eastern;99
+minimal;100
+ranchera;101
+industrial rock;102
+electro house;103
+celtic rock;104
+death doom;105
+grupera;106
+jazz fusion‎;107
+political folk;108
+christian punk;109
+rapcore;110
+j-pop;111
+mashup;112
+metalcore;113
+progressive country;114
+power noise;115
+hip house;116
+crossover thrash;117
+electropop‎;118
+psychedelic folk;119
+punk rock;120
+classic rock;121
+zydeco;122
+afrobeat;123
+salsa;124
+banda;125
+chill-out;126
+morna;127
+minnesang;128
+alternative metal;129
+djent;130
+african folk;131
+mambo;132
+sertanejo;133
+classic pop;134
+soul;135
+australian hip hop;136
+symphonic rock;137
+celtic punk;138
+synthpop‎;139
+europop;140
+funk;141
+jazz blues;142
+vocal trance;143
+celtic fusion;144
+industrial;145
+kirtan;146
+slowcore;147
+flamenco;148
+piano blues;149
+texas blues;150
+aggrotech;151
+steampunk;152
+opera;153
+folktronica;154
+klezmer;155
+nwobhm;156
+goregrind;157
+rac;158
+neo-psychedelia‏‎;159
+post-rock‎;160
+hard bop;161
+gypsy jazz;162
+new orleans blues;163
+doo-wop;164
+soul blues;165
+trap;166
+indietronica;167
+psychobilly;168
+euro disco;169
+neo-progressive rock;170
+canterbury;171
+freak folk;172
+midwest rap;173
+instrumental rock;174
+dance-pop;175
+avant-garde metal;176
+edm;177
+deep house;178
+progressive bluegrass;179
+rave;180
+australian folk;181
+comic opera;182
+sunshine pop;183
+gregorian chant;184
+psychedelic rock;185
+honky tonk;186
+rock 'n' roll;187
+television;188
+nintendocore;189
+jump blues;190
+roots reggae;191
+traditional bluegrass;192
+operatic pop;193
+skate punk;194
+reggaeton;195
+manele;196
+middle-eastern hip hop;197
+skiffle;198
+nsbm;199
+nu jazz;200
+disco;201
+horrorcore;202
+early music;203
+post-bop;204
+gothic rock;205
+crack rock steady;206
+easy listening;207
+psychedelic;208
+christian;209
+brutal death metal;210
+experimental rock;211
+modern classical‎;212
+drum and bass;213
+dark wave;214
+dubstep;215
+grunge;216
+christian hip hop;217
+latin jazz;218
+r&b;219
+s music", ;220
+free jazz;221
+experimental hip hop;222
+swing;223
+smooth jazz;224
+southern metal;225
+religious;226
+progressive death metal;227
+contemporary folk;228
+j-rock;229
+jazz;230
+hamburger schule;231
+teen pop‎;232
+crossover;233
+italo disco;234
+deathcore;235
+blues;236
+crunk;237
+jangle pop;238
+indian classical music;239
+big band;240
+proto-punk;241
+dirty blues;242
+garage punk;243
+extreme metal;244
+folk metal;245
+neo soul;246
+electric folk;247
+synthwave;248
+arena rock;249
+post-grunge;250
+indie rock;251
+acoustic blues;252
+native american;253
+progressive trance;254
+nu metal;255
+digital hardcore;256
+brazilian rock;257
+funky house;258
+symphonic black metal;259
+lounge music;260
+brega;261
+trance;262
+industrial metal;263
+austropop;264
+bhangra;265
+new wave;266
+neoclassical;267
+post-metal;268
+dub;269
+industrial metal‎;270
+irish folk;271
+deutschrock;272
+gypsy;273
+dark electro;274
+alternative hip hop;275
+mbaqanga;276
+swamp blues;277
+french pop;278
+tango;279
+rockabilly;280
+old-time music;281
+blues rock;282
+scottish folk;283
+indie folk;284
+nazi-punk;285
+deutschpunk;286
+piedmont blues;287
+beatbox;288
+worship;289
+heavy metal;290
+underground hip  hop;291
+mixed;292
+electro;293
+tropicalismo;294
+jazz fusion;295
+worldbeat;296
+hill country blues;297
+a cappella;298
+dixieland;299
+hi-nrg;300
+punk blues;301
+anti-folk;302
+east coast blues;303
+polka;304
+mod revival;305
+soundtrack/musical;306
+movie;307
+outlaw country;308
+rock against communism;309
+barbershop;310
+math rock;311
+avant-garde‎;312
+psychedelic pop;313
+synthpop;314
+post-punk‎;315
+queercore;316
+death metal;317
+political hip hop;318
+thrashcore;319
+acid house;320
+post-hardcore‎;321
+electro-industrial;322
+rio;323
+southern hip hop;324
+filk;325
+duranguense;326
+latin hip hop;327
+pop punk;328
+space rock;329
+j-rap;330
+deep house‎;331
+baroque pop;332
+chiptune;333
+heartland rock;334
+dancehall;335
+experimental pop;336
+adult contemporary‎;337
+boogie woogie;338
+country pop;339
+power pop;340
+west coast hip hop;341
+thrash metal;342
+avant-pop;343
+enka;344
+k-pop;345
+post-britpop;346
+vocalese;347
+volkslied;348
+reggae fusion;349
+funk rock;350
+tech house;351
+adult contemporary;352
+death 'n' roll;353
+russian rock;354
+latin rock;355
+folk punk;356
+west coast blues;357
+progressive black metal;358
+progressive metal;359
+cajun;360
+sophisti-pop;361
+rock 'n' roll‎;362
+post-punk;363
+symphonic metal;364
+beat;365
+alternative rock‎;366
+art rock;367
+bakersfield sound;368
+indie pop;369
+folk;370
+acid jazz;371
+dream pop;372
+pop-rap;373
+eurodance;374
+vaudeville;375
+louisiana blues;376
+baião;377
+downtempo;378
+jug band;379
+neo-psychedelia;380
+sufi;381
+medieval;382
+singer-songwriter‎;383
+outsider music;384
+pop-folk;385
+martial industrial;386
+samba;387
+alternative dance;388
+children's music‎;389
+anarcho-punk;390
+dark rock;391
+rock en español;392
+balearic beat;393
+electropunk;394
+urban contemporary;395
+ragtime;396
+british invasion;397
+bubblegum pop;398
+rap metal;399
+soundtrack/television;400
+blues revival;401
+reggae;402
+schlager;403
+dance band;404
+video game;405
+crust punk;406
+cabaret;407
+ska punk‎;408
+bolero;409
+canadian folk;410
+neofolk;411
+shoegazing;412
+acoustic;413
+modern classical;414
+swamp pop;415
+celtic;416
+futurepop;417
+g-funk;418
+norteño;419
+orchestral;420
+boogie rock;421
+tejano;422
+new age;423
+soul jazz;424
+cantopop;425
+progressive metalcore;426
+mathcore;427
+new rave;428
+neue deutsche welle;429
+delta blues;430
+lo-fi;431
+poetry;432
+hatecore;433
+chanson;434
+underground hip hop;435
+pirate metal;436
+trip hop;437
+fado;438
+americana;439
+hardcore hip hop;440
+post-industrial;441
+grime;442
+southern rock;443
+grindcore;444
+musical;445
+hard trance;446
+ska punk;447
+post-rock;448
+uk garage;449
+melodic metalcore;450
+black metal;451
+visual kei;452
+soundtrack;453
+axé‎;454
+hardcore punk;455
+western;456
+blackgaze;457
+christian rock;458
+technical death metal;459
+christian hardcore;460
+christmas;461
+breakbeat;462
+francophone;463
+choral;464
+progressive folk;465
+mystic folk;466
+melodic death metal;467
+horror punk;468
+country blues;469
+nederpop;470
+post-hardcore;471
+future garage;472
+techno;473
+swiss rock;474
+dance-pop‎;475
+electronicore;476
+post-punk revival;477
+glitch;478
+calypso;479
+ragga;480
+britpop;481
+rock opera;482
+cowpunk;483
+la confusion des genres;484
+alternative rock;485
+surf rock;486
+ballad;487
+latin;488
+contemporary r&b;489
+forró;490
+ethereal wave;491
+electro swing;492
+novelty;493
+funk melody;494
+punk cabaret;495
+symphonic metal‎;496
+pop;497
+paisley underground;498
+neue deutsche härte;499
+glam rock;500
+nerdcore hip hop;501
+bluegrass;502
+hardstyle;503
+happy hardcore;504
+baroque;505
+speed metal;506
+country;507
+electropop;508
+memphis blues;509
+pagan metal;510
+horror punk‏‎;511
+mariachi;512
+singer-songwriter;513
+children's music;514
+boogie;515
+gothic metal;516
+electronic rock;517
+emo;518
+gospel;519
+ebm;520
+roots rock;521
+vocal;522
+celtic folk;523
+electronic;524
+death  metal;525
+gabber;526
+deathrock;527
+experimental;528
+spoken word;529
+screamo;530
+finnish folk;531
+singer only;532
+new jack swing;533
+acid techno;534
+corrido;535
+english folk;536
+american folk;537
+raï;538
+drone doom;539
+hard rock;540
+piano rock;541
+hawaiian;542
+humppa;543
+east coast hip hop;544
+gypsy punk;545
+country rock;546
+jazz‎;547
+mpb;548
+harmonica blues;549
+melodic hardcore;550
+string band;551
+anime;552
+nu metalcore;553
+progressive  rock;554
+garage rock;555
+dance;556
+reggae rock;557
+contemporary christian‎;558
+sludge metal;559
+minimal techno;560
+folk rock;561
+drone music;562
+stoner rock;563
+speedcore;564
+chillwave;565
+riot grrrl;566
+chamber music;567
+cool jazz;568
+noise;569
+vocal jazz;570
+progressive rock;571
+afropop;572
+bro-country;573
+goa trance;574
+2-tone;575
+miami bass;576
+quiet storm;577
+pub rock;578
+power metal;579
+blue-eyed soul;580
+viking metal;581
+gangsta rap;582
+country pop‎;583
+exotica;584
+christian ska;585
+jam band;586
+chicago blues;587
+street punk;588
+funk metal;589
+rap  metal;590
+christian hymns;591
+classic female blues;592
+kizomba;593
+comedy;594
+dark cabaret;595
+french house;596
+progressive house;597
+african blues;598
+atmospheric black metal;599
+pop rock‎;600
+blackened death metal;601
+shibuya-kei;602
+electronica;603
+unknown;0
diff --git a/jukebox/data/labels.py b/jukebox/data/labels.py
new file mode 100644
index 0000000000000000000000000000000000000000..00bb4059d17cb3a917dcaaaea74b1f3706bcf4b9
--- /dev/null
+++ b/jukebox/data/labels.py
@@ -0,0 +1,199 @@
+import torch as t
+import numpy as np
+from jukebox.data.artist_genre_processor import ArtistGenreProcessor
+from jukebox.data.text_processor import TextProcessor
+
+
+def get_relevant_lyric_tokens(full_tokens, n_tokens, total_length, offset, duration):
+    """Select the window of ``n_tokens`` lyric tokens for one audio chunk.
+
+    Linear window heuristic: the window is centred on the token whose
+    relative position in ``full_tokens`` equals the relative position of the
+    chunk midpoint, ``(offset + duration / 2) / total_length``.
+
+    Args:
+        full_tokens: list of token ids for the whole song.
+        n_tokens: number of tokens to return.
+        total_length: length of the whole song.
+        offset: start of the chunk, in the same unit as ``total_length``.
+        duration: length of the chunk, in the same unit as ``total_length``.
+
+    Returns:
+        ``(tokens, indices)``, two lists of length ``n_tokens``.
+        ``indices[i]`` is the position of ``tokens[i]`` in ``full_tokens``,
+        or ``-1`` where ``tokens[i]`` is left padding (token ``0``).
+    """
+    if len(full_tokens) < n_tokens:
+        # Too few tokens: left-pad with token 0 and mark the padding with -1.
+        n_pad = n_tokens - len(full_tokens)
+        tokens = [0] * n_pad + full_tokens
+        indices = [-1] * n_pad + list(range(0, len(full_tokens)))
+    else:
+        assert 0 <= offset < total_length
+        midpoint = int(len(full_tokens) * (offset + duration / 2.0) / total_length)
+        # Clamp the midpoint so the window stays inside full_tokens. The window
+        # ends at start + n_tokens rather than midpoint + n_tokens // 2, so it
+        # is exactly n_tokens long for odd n_tokens too (unchanged when even).
+        before = n_tokens // 2
+        after = n_tokens - before
+        midpoint = min(max(midpoint, before), len(full_tokens) - after)
+        start = midpoint - before
+        tokens = full_tokens[start:start + n_tokens]
+        indices = list(range(start, start + n_tokens))
+    assert len(tokens) == n_tokens, f"Expected length {n_tokens}, got {len(tokens)}"
+    assert len(indices) == n_tokens, f"Expected length {n_tokens}, got {len(indices)}"
+    assert tokens == [full_tokens[index] if index != -1 else 0 for index in indices]
+    return tokens, indices
+
+
+class EmptyLabeller():
+    """Labeller whose labels carry no information: ``y`` is always empty."""
+
+    def get_label(self, artist=None, genre=None, lyrics=None, total_length=None, offset=None):
+        """Return an empty label; every argument is accepted and ignored."""
+        y = np.array([], dtype=np.int64)
+        info = dict(artist="n/a", genre="n/a", lyrics=[], full_tokens=[])
+        return dict(y=y, info=info)
+
+    def get_batch_labels(self, metas, device='cpu'):
+        """Return one empty label per entry of ``metas``.
+
+        Returns:
+            dict with ``y``, a long tensor of shape ``(len(metas), 0)`` on
+            ``device``, and ``info``, a list of placeholder info dicts.
+        """
+        labels = [self.get_label() for _ in metas]
+        infos = [label['info'] for label in labels]
+        ys = t.stack([t.from_numpy(label['y']) for label in labels], dim=0).to(device).long()
+        assert ys.shape[0] == len(metas)
+        assert len(infos) == len(metas)
+        return dict(y=ys, info=infos)
+
+
+class Labeller():
+    """Builds label vectors from artist, genre, lyrics and timing metadata.
+
+    A label ``y`` is an int64 vector laid out as
+    ``[total_length, offset, sample_length, artist_id, *genre_ids, *tokens]``
+    with ``max_genre_words`` genre slots (padded with ``-1``) followed by
+    ``n_tokens`` lyric-token slots.
+    """
+
+    def __init__(self, max_genre_words, n_tokens, sample_length, v3=False):
+        """Create the artist/genre and text processors and fix the label layout.
+
+        Args:
+            max_genre_words: number of genre-id slots in a label.
+            n_tokens: number of lyric-token slots in a label.
+            sample_length: chunk length written into every label.
+            v3: forwarded to ``ArtistGenreProcessor`` and ``TextProcessor``.
+        """
+        self.ag_processor = ArtistGenreProcessor(v3)
+        self.text_processor = TextProcessor(v3)
+        self.n_tokens = n_tokens
+        self.max_genre_words = max_genre_words
+        self.sample_length = sample_length
+        self.label_shape = (4 + self.max_genre_words + self.n_tokens, )
+
+    def get_label(self, artist, genre, lyrics, total_length, offset):
+        """Build the label for the chunk starting at ``offset``.
+
+        Returns:
+            dict with ``y``, a numpy int64 vector of shape ``label_shape``,
+            and ``info`` holding the artist, genre, cleaned lyrics and the
+            token list of the full lyrics.
+        """
+        artist_id = self.ag_processor.get_artist_id(artist)
+        genre_ids = self.ag_processor.get_genre_ids(genre)
+
+        lyrics = self.text_processor.clean(lyrics)
+        full_tokens = self.text_processor.tokenise(lyrics)
+        tokens, _ = get_relevant_lyric_tokens(full_tokens, self.n_tokens, total_length, offset, self.sample_length)
+
+        y = self.get_y_from_ids(artist_id, genre_ids, tokens, total_length, offset)
+        info = dict(artist=artist, genre=genre, lyrics=lyrics, full_tokens=full_tokens)
+        return dict(y=y, info=info)
+
+    def get_y_from_ids(self, artist_id, genre_ids, lyric_tokens, total_length, offset):
+        """Assemble a label vector from already-resolved ids and tokens.
+
+        ``genre_ids`` is padded with ``-1`` up to ``max_genre_words``;
+        ``lyric_tokens`` is ignored when ``n_tokens`` is 0.
+        """
+        assert len(genre_ids) <= self.max_genre_words
+        genre_ids = genre_ids + [-1] * (self.max_genre_words - len(genre_ids))
+        if self.n_tokens > 0:
+            assert len(lyric_tokens) == self.n_tokens
+        else:
+            lyric_tokens = []
+        y = np.array([total_length, offset, self.sample_length, artist_id, *genre_ids, *lyric_tokens], dtype=np.int64)
+        assert y.shape == self.label_shape, f"Expected {self.label_shape}, got {y.shape}"
+        return y
+
+    def get_batch_labels(self, metas, device='cpu'):
+        """Build labels for a batch of metadata dicts.
+
+        Args:
+            metas: iterable of dicts passed to ``get_label`` as keyword
+                arguments.
+            device: device on which the stacked ``y`` tensor is placed.
+
+        Returns:
+            dict with ``y``, a long tensor with one row per entry of
+            ``metas``, and ``info``, the list of per-entry info dicts.
+        """
+        labels = [self.get_label(**meta) for meta in metas]
+        infos = [label['info'] for label in labels]
+        ys = t.stack([t.from_numpy(label['y']) for label in labels], dim=0).to(device).long()
+        assert ys.shape[0] == len(metas)
+        assert len(infos) == len(metas)
+        return dict(y=ys, info=infos)
+
+    def set_y_lyric_tokens(self, ys, labels):
+        """Recompute the lyric-token slots of ``ys`` in place.
+
+        For every row the window is recomputed from that row's
+        ``total_length``, ``offset`` and ``sample_length`` columns and the
+        matching ``labels['info'][i]['full_tokens']``.
+
+        Returns:
+            One index list per row (see ``get_relevant_lyric_tokens``), or
+            ``None`` when ``n_tokens`` is 0, in which case ``ys`` is untouched.
+        """
+        info = labels['info']
+        assert ys.shape[0] == len(info)
+        if self.n_tokens <= 0:
+            return None
+        tokens_list = []
+        indices_list = []  # index of each selected token in the original token list
+        for i in range(ys.shape[0]):
+            full_tokens = info[i]['full_tokens']
+            total_length, offset, duration = ys[i, 0], ys[i, 1], ys[i, 2]
+            tokens, indices = get_relevant_lyric_tokens(full_tokens, self.n_tokens, total_length, offset, duration)
+            tokens_list.append(tokens)
+            indices_list.append(indices)
+        # Build the tokens on the device ys already lives on; a hard-coded
+        # 'cuda' here fails for CPU tensors and on machines without a GPU.
+        ys[:, -self.n_tokens:] = t.tensor(tokens_list, dtype=t.long, device=ys.device)
+        return indices_list
+
+    def describe_label(self, y):
+        """Decode a label vector back into artist, genre and lyrics."""
+        assert y.shape == self.label_shape, f"Expected {self.label_shape}, got {y.shape}"
+        y = np.array(y).tolist()
+        total_length, offset, length, artist_id, *genre_ids = y[:4 + self.max_genre_words]
+        tokens = y[4 + self.max_genre_words:]
+        artist = self.ag_processor.get_artist(artist_id)
+        genre = self.ag_processor.get_genre(genre_ids)
+        lyrics = self.text_processor.textise(tokens)
+        return dict(artist=artist, genre=genre, lyrics=lyrics)
+
+
+if __name__ == '__main__':
+    labeller = Labeller(5, 512, 8192*8*4*4, v3=False)
+    label = labeller.get_label("Alan Jackson", "Country Rock", "old town road", 4*60*44100, 0)
+    print(label, labeller.describe_label(label['y']))
+
+    labeller = Labeller(1, 384, 6144*8*4*4, v3=True)
+    label = labeller.get_label("Alan Jackson", "Country Rock", "old town road", 4*60*44100, 0)
+    print(label, labeller.describe_label(label['y']))
diff --git a/jukebox/data/text_processor.py b/jukebox/data/text_processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..5581ab3c5e2d975ae17ede10072f945cc3237021
--- /dev/null
+++ b/jukebox/data/text_processor.py
@@ -0,0 +1,54 @@
+import re
+from unidecode import unidecode
+
+
+class TextProcessor():
+    """Character-level tokeniser for lyrics.
+
+    Every vocabulary character maps to a positive integer id; id 0 is
+    reserved for ``<unk>``, which decodes to the empty string.
+    """
+
+    def __init__(self, v3=False):
+        """Build the character <-> id tables.
+
+        Args:
+            v3: if true, use the vocabulary variant without ``+``.
+        """
+        # The patterns are raw strings so that the regex escapes (\-, \[, \])
+        # reach ``re`` untouched instead of being invalid string escapes.
+        if v3:
+            vocab = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,:;!?-\'\"()[] \t\n'
+            not_vocab = re.compile(r'[^A-Za-z0-9.,:;!?\-\'\"()\[\] \t\n]+')
+        else:
+            vocab = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,:;!?-+\'\"()[] \t\n'
+            not_vocab = re.compile(r'[^A-Za-z0-9.,:;!?\-+\'\"()\[\] \t\n]+')
+        self.vocab = {char: index for index, char in enumerate(vocab, start=1)}
+        self.vocab['<unk>'] = 0
+        self.n_vocab = len(vocab) + 1
+        self.tokens = {v: k for k, v in self.vocab.items()}
+        self.tokens[0] = ''  # <unk> became ''
+        self.not_vocab = not_vocab
+
+    def clean(self, text):
+        """Reduce ``text`` to vocabulary characters only."""
+        text = unidecode(text)  # Convert to ascii
+        text = text.replace('\\', '\n')  # Backslashes become newlines
+        text = self.not_vocab.sub('', text)  # Remove non vocab
+        return text
+
+    def tokenise(self, text):
+        """Map each character of ``text`` to its id.
+
+        Raises KeyError for characters outside the vocabulary, so run the
+        text through ``clean`` first.
+        """
+        return [self.vocab[char] for char in text]
+
+    def textise(self, tokens):
+        """Decode token ids into a single string."""
+        return ''.join(self.characterise(tokens))
+
+    def characterise(self, tokens):
+        """Decode token ids into a list with one string per token."""
+        return [self.tokens[token] for token in tokens]
diff --git a/jukebox/hparams.py b/jukebox/hparams.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb74584aa1f30a5520bd3329044edf0739f1e983
--- /dev/null
+++ b/jukebox/hparams.py
@@ -0,0 +1,567 @@
+HPARAMS_REGISTRY = {}
+DEFAULTS = {}
+
+class Hyperparams(dict):  # dict whose items can also be read and written as attributes
+    def __getattr__(self, attr):  # only invoked when normal attribute lookup fails
+        return self[attr]  # NOTE(review): a missing key raises KeyError, not AttributeError
+
+    def __setattr__(self, attr, value):  # every attribute assignment is stored as a dict item
+        self[attr] = value
+
+def setup_hparams(hparam_set_names, kwargs):
+    # Layer DEFAULTS, then each named registry set in order, then kwargs (highest priority).
+    if isinstance(hparam_set_names, str):  # comma-separated names; any other iterable is used as is
+        hparam_set_names = hparam_set_names.split(",")
+    hparam_sets = [HPARAMS_REGISTRY[x.strip()] for x in hparam_set_names if x] + [kwargs]
+    H = Hyperparams()
+    for defaults in DEFAULTS.values():
+        H.update(defaults)
+    for hps in hparam_sets:
+        for k in hps:
+            if k not in H:  # every override must already be declared by some DEFAULTS group
+                raise ValueError(f"{k} not in default args")
+        H.update(**hps)  # kwargs is the last entry, so it wins over every named set
+    return H
+
+# Teeny for testing
+teeny = Hyperparams(
+)
+HPARAMS_REGISTRY["teeny"] = teeny
+
+easy = Hyperparams(
+    sr=22050,
+)
+HPARAMS_REGISTRY["easy"] = easy
+
+REMOTE_PREFIX = 'https://openaipublic.azureedge.net/'
+
+# Model hps
+vqvae = Hyperparams(
+    levels = 3,
+    downs_t = (3, 2, 2),
+    strides_t = (2, 2, 2),
+    emb_width = 64,
+    l_bins = 2048,
+    l_mu = 0.99,
+    commit = 0.02,
+    spectral = 0.0,
+    multispectral = 1.0,
+    hvqvae_multipliers = (2, 1, 1),
+    loss_fn = 'lmix',
+    lmix_l2 = 1.0,
+    lmix_linf=0.02,
+    width = 32,
+    depth = 4,
+    m_conv = 1.0,
+    dilation_growth_rate = 3,
+    restore_vqvae=REMOTE_PREFIX + 'jukebox/models/5b/vqvae.pth.tar',
+)
+HPARAMS_REGISTRY["vqvae"] = vqvae
+
+labels = Hyperparams(
+    y_bins=(120, 4111),  # presumably (genres, artists), per the y_bins note on the small_* priors - confirm
+    t_bins=128,
+    max_bow_genre_size=5,
+    n_vocab=80,  # NOTE(review): matches TextProcessor(v3=False).n_vocab (79 characters + <unk>) - confirm
+)
+
+upsamplers = Hyperparams(
+    n_ctx=8192,
+    prior_width=1920,
+    prior_depth=72,
+    heads=1,
+    attn_order=2,
+    blocks=128,
+    init_scale=0.4,
+    c_res=1,
+    cond_width=1024,
+    cond_depth=16,
+    cond_dilation_growth_rate=3,
+    cond_dilation_cycle=8,
+    cond_c_res=1,
+    use_tokens=False,
+    prime_loss_fraction=0.0,
+    fp16_params=False,
+)
+upsamplers.update(labels)
+
+upsampler_level_0 = Hyperparams(
+    level=0,
+    restore_prior=REMOTE_PREFIX + 'jukebox/models/5b/prior_level_0.pth.tar'
+)
+upsampler_level_0.update(upsamplers)
+HPARAMS_REGISTRY["upsampler_level_0"] = upsampler_level_0
+
+upsampler_level_1 = Hyperparams(
+    level=1,
+    cond_res_scale=True,
+    restore_prior=REMOTE_PREFIX + 'jukebox/models/5b/prior_level_1.pth.tar'
+)
+upsampler_level_1.update(upsamplers)
+HPARAMS_REGISTRY["upsampler_level_1"] = upsampler_level_1
+
+prior_5b = Hyperparams(
+    level=2,
+    n_ctx=8192,
+    prior_width=4800,
+    prior_depth=72,
+    heads=8,
+    attn_order=2,
+    blocks=128,
+    init_scale=0.1,
+    c_res=1,
+    beta2=0.925,
+    min_duration=60.0,
+    max_duration=600.0,
+    use_tokens=False,
+    n_tokens=0,
+    prime_loss_fraction=0.0,
+    merged_decoder=True,
+    restore_prior=REMOTE_PREFIX + 'jukebox/models/5b/prior_level_2.pth.tar',
+    fp16_params=True,
+)
+prior_5b.update(labels)
+HPARAMS_REGISTRY["prior_5b"] = prior_5b
+
+
+prior_5b_lyrics = Hyperparams(
+    level=2,
+    n_ctx=8192,
+    prior_width=4800,
+    prior_depth=79,
+    heads=8,
+    attn_order=10,
+    blocks=128,
+    init_scale=0.1,
+    c_res=1,
+    prime_width=1280,
+    prime_depth=18,
+    prime_heads=4,
+    prime_attn_order=2,
+    prime_blocks=32,
+    prime_init_scale=0.7,
+    prime_c_res=1,
+    min_duration=23.8,
+    max_duration=600.0,
+    use_tokens=True,
+    n_tokens=512,
+    prime_loss_fraction=0.4,
+    merged_decoder=True,
+    restore_prior=REMOTE_PREFIX + 'jukebox/models/5b_lyrics/prior_level_2.pth.tar',
+    fp16_params=True,
+    alignment_layer=68,
+    alignment_head=2,
+)
+prior_5b_lyrics.update(labels)
+HPARAMS_REGISTRY["prior_5b_lyrics"] = prior_5b_lyrics
+
+labels_v3 = Hyperparams(
+    y_bins=(604, 7898),  # presumably (genres, artists), per the y_bins note on the small_* priors - confirm
+    t_bins=64,
+    max_bow_genre_size=1,
+    n_vocab=79,  # NOTE(review): matches TextProcessor(v3=True).n_vocab (78 characters + <unk>) - confirm
+)
+
+prior_1b_lyrics = Hyperparams(
+    level=2,
+    n_ctx=6144,
+    prior_width=2048,
+    prior_depth=72,
+    heads=2,
+    attn_order=12,
+    blocks=64,
+    init_scale=0.2,
+    c_res=1,
+    labels_v3=True,
+    min_duration=17.84,
+    max_duration=600.0,
+    use_tokens=True,
+    n_tokens=384,
+    prime_loss_fraction=0.4,
+    single_enc_dec=True,
+    restore_prior=REMOTE_PREFIX + 'jukebox/models/1b_lyrics/prior_level_2.pth.tar',
+    fp16_params=False,
+    alignment_layer=63,
+    alignment_head=0,
+)
+prior_1b_lyrics.update(labels_v3)
+HPARAMS_REGISTRY["prior_1b_lyrics"] = prior_1b_lyrics
+
+# Small models
+small_vqvae = Hyperparams(
+    sr = 22050,
+    levels = 2,
+    downs_t = (5, 3),
+    strides_t = (2, 2),
+    emb_width = 64,
+    l_bins = 1024,
+    l_mu = 0.99,
+    commit = 0.02,
+    spectral = 0.0,
+    multispectral = 1.0,
+    loss_fn = 'l2',
+    width = 32,
+    depth = 4,
+    m_conv = 1.0,
+    dilation_growth_rate = 3,
+)
+HPARAMS_REGISTRY["small_vqvae"] = small_vqvae
+
+small_prior = Hyperparams(
+    n_ctx=8192,
+    prior_width=1024,
+    prior_depth=48,
+    heads=1,
+    c_res=1,
+    attn_order=2,
+    blocks=64,
+    init_scale=0.7,
+)
+HPARAMS_REGISTRY["small_prior"] = small_prior
+
+small_labelled_prior = Hyperparams(
+    labels=True,
+    labels_v3=True,
+    y_bins=(10,100), # Set this to (genres, artists) for your dataset
+    max_bow_genre_size=1,
+    min_duration=60.0,
+    max_duration=600.0,
+    t_bins=64,
+)
+small_labelled_prior.update(small_prior)
+HPARAMS_REGISTRY["small_labelled_prior"] = small_labelled_prior
+
+small_single_enc_dec_prior = Hyperparams(
+    n_ctx=6144,
+    prior_width=1024,
+    prior_depth=48,
+    heads=2,
+    attn_order=12,
+    blocks=64,
+    init_scale=0.7,
+    c_res=1,
+    prime_loss_fraction=0.4,
+    single_enc_dec=True,
+    labels=True,
+    labels_v3=True,
+    y_bins=(10,100), # Set this to (genres, artists) for your dataset
+    max_bow_genre_size=1,
+    min_duration=60.0,
+    max_duration=600.0,
+    t_bins=64,
+    use_tokens=True,
+    n_tokens=384,
+    n_vocab=79,
+)
+HPARAMS_REGISTRY["small_single_enc_dec_prior"] = small_single_enc_dec_prior
+
+small_sep_enc_dec_prior = Hyperparams(
+    n_ctx=6144,
+    prior_width=1024,
+    prior_depth=50,
+    heads=2,
+    attn_order=8,
+    blocks=64,
+    init_scale=0.7,
+    c_res=1,
+    prime_width=256,
+    prime_depth=9,
+    prime_heads=2,
+    prime_attn_order=2,
+    prime_blocks=32,
+    prime_init_scale=0.7,
+    prime_c_res=1,
+    prime_loss_fraction=0.4,
+    labels=True,
+    labels_v3=True,
+    y_bins=(10,100), # Set this to (genres, artists) for your dataset
+    max_bow_genre_size=1,
+    min_duration=60.0,
+    max_duration=600.0,
+    t_bins=64,
+    use_tokens=True,
+    n_tokens=384,
+    n_vocab=79,
+)
+HPARAMS_REGISTRY["small_sep_enc_dec_prior"] = small_sep_enc_dec_prior
+
+small_upsampler = Hyperparams(
+    n_ctx=8192,
+    prior_width=1024,
+    prior_depth=48,
+    heads=1,
+    c_res=1,
+    attn_order=2,
+    blocks=64,
+    init_scale=0.7,
+    cond_width=512,
+    cond_depth=16,
+    cond_dilation_growth_rate=3,
+    cond_dilation_cycle=8,
+    cond_c_res=1,
+)
+
+HPARAMS_REGISTRY["small_upsampler"] = small_upsampler
+
+all_fp16 = Hyperparams(
+    fp16=True,
+    fp16_params=True,
+    fp16_opt=True,
+    fp16_scale_window=250,
+)
+HPARAMS_REGISTRY["all_fp16"] = all_fp16
+
+cpu_ema = Hyperparams(
+    ema=True,
+    cpu_ema=True,
+    cpu_ema_freq=100,
+    ema_fused=False,
+)
+HPARAMS_REGISTRY["cpu_ema"] = cpu_ema
+
+
+DEFAULTS["rcall"] = Hyperparams(
+    rcall_command="<unknown_rcall_command>",
+    git_commit="<unknown_git_commit>",
+)
+
+DEFAULTS["script"] = Hyperparams(
+    name='',
+    debug_mem=False,
+    debug_eval_files=False,
+    debug_speed=False,
+    debug_iters=100,
+    debug_batch=False,
+    debug_grad_accum=False,
+    debug_inputs=False,
+    local_path='',
+    local_logdir='logs',
+    max_len=24,
+    max_log=32,
+    save=True,
+    save_iters=20000,
+    seed=0,
+    prior=False,
+    log_steps=100,
+    func='',
+)
+
+DEFAULTS["data"] = Hyperparams(
+    audio_files_dir='',
+    finetune='',
+    english_only=False,
+    bs=1,
+    bs_sample=1,
+    nworkers=1,
+    aug_shift=False,
+    aug_blend=False,
+    train_test_split=0.9,
+    train_shrink_factor=1.0,
+    test_shrink_factor=1.0,
+    p_unk=0.1,
+    min_duration=None,
+    max_duration=None,
+    n_tokens=0,
+    n_vocab=0,
+    use_tokens=False,
+    curr_epoch=-1,
+)
+
+DEFAULTS["vqvae"] = Hyperparams(
+    restore_vqvae='',
+    levels=2,
+    downs_t=(1,1),
+    strides_t=(2,2),
+    hvqvae_multipliers=None,
+    revival_threshold=1.0,
+    emb_width=64,
+    l_bins=512,
+    l_mu=0.99,
+    commit=1.0,
+    spectral=0.0,
+    multispectral=1.0,
+    loss_fn='l2',
+    linf_k=2048,
+    lmix_l1=0.0,
+    lmix_l2=0.0,
+    lmix_linf=0.0,
+    use_bottleneck=True,
+)
+
+DEFAULTS["vqvae_conv_block"] = Hyperparams(
+    depth=3,
+    width=128,
+    m_conv=1.0,
+    dilation_growth_rate=1,
+    dilation_cycle=None,
+    vqvae_reverse_decoder_dilation=True,
+)
+
+DEFAULTS["prior"] = Hyperparams(
+    restore_prior='',
+    restore_prior_ddp=False,
+    max_bow_genre_size=None,
+    y_bins=0,
+    level=0,
+    cond_levels=None,
+    t_bins=64,
+    y_cond_as_bias=False,
+    copy_input=False,
+    merged_decoder=False,
+    single_enc_dec=False,
+    alignment_layer=None,
+    alignment_head=None,
+)
+
+DEFAULTS["prior_attn_block"] = Hyperparams(
+    n_ctx=1024,
+    prior_depth=3,
+    prior_width=128,
+    heads=1,
+    attn_order=0,
+    blocks=None,
+    spread=None,
+    attn_dropout=0.0,
+    resid_dropout=0.0,
+    emb_dropout=0.0,
+    zero_out=False,
+    res_scale=False,
+    pos_init=False,
+    init_scale=1.0,
+    m_attn=0.25,
+    m_mlp=1.0,
+    c_res=0,
+    c_attn=0,
+    c_mlp=0,
+)
+
+DEFAULTS["cond_conv_block"] = Hyperparams(
+    cond_depth=3,
+    cond_width=128,
+    cond_m_conv=1.0,
+    cond_zero_out=False,
+    cond_res_scale=False,
+    cond_dilation_growth_rate=1,
+    cond_dilation_cycle=None,
+    cond_c_res=0,
+)
+
+DEFAULTS["sample"] = Hyperparams(
+    primed_chunk_size=None,
+    selected_artists='',
+    temp_top=1.0,
+    temp_rest=0.99,
+    sample_length_in_seconds=24,
+    total_sample_length_in_seconds=240,
+)
+
+DEFAULTS["prime"] = Hyperparams(
+    #encoder_kv_width=128,
+    prime_loss_fraction=0.1,
+    restore_decoder='',
+)
+DEFAULTS["prime_attn_block"] = Hyperparams(
+    prime_depth=3,
+    prime_width=128,
+    prime_heads=1,
+    prime_attn_order=0,
+    prime_blocks=None,
+    prime_spread=None,
+    prime_attn_dropout=0.0,
+    prime_resid_dropout=0.0,
+    prime_emb_dropout=0.0,
+    prime_zero_out=False,
+    prime_res_scale=False,
+    prime_pos_init=False,
+    prime_init_scale=1.0,
+    prime_m_attn=0.25,
+    prime_m_mlp=1.0,
+    prime_c_res=0,
+    prime_c_attn=0,
+    prime_c_mlp=0,
+    prime_rel_attn=False,
+    prime_posemb_timescale=10000,
+)
+
+DEFAULTS["opt"] = Hyperparams(
+    epochs=10000,
+    lr=0.0003,
+    clip=1.0,
+    beta1=0.9,
+    beta2=0.999,
+    ignore_grad_norm=0,
+    weight_decay=0.0,
+    eps=1e-08,
+    lr_warmup=100.0,
+    lr_decay=10000000000.0,
+    lr_gamma=1.0,
+    lr_scale=1.0,
+    lr_use_linear_decay=False,
+    lr_start_linear_decay=0,
+    lr_use_cosine_decay=False,
+)
+
+DEFAULTS["fp16"] = Hyperparams(
+    fp16=False,
+    fp16_params=False,
+    fp16_loss_scale=None,
+    fp16_scale_window=1000.0,
+    fp16_opt=False,
+)
+
+DEFAULTS["train_test_eval"] = Hyperparams(
+    labels=True,
+    labels_v3=False,
+    dump=False,
+    ema=True,
+    ema_fused=True,
+    cpu_ema=False,
+    cpu_ema_freq=100,
+    reset_best_loss=False,
+    reset_step=False,
+    reset_opt=False,
+    reset_shd=False,
+    train=False,
+    test=False,
+    sample=False,
+    sampler='ancestral',
+    codes_logdir='',
+    date=None,
+    labeller='top_genres',
+    label_line=0,
+    iters_before_update=1,
+    grad_accum_iters=0,
+    mu=None,
+    piped=False,
+    pipe_depth=8,
+    break_train=1e10,
+    break_test=1e10,
+    exit_train=1e10,
+)
+
+DEFAULTS["audio"] = Hyperparams(
+    n_fft=1024,
+    hop_length=256,
+    window_size=1024,
+    sr=44100,
+    channels=2,
+    wav='',
+    n_inps=1,
+    n_hops=2,
+    n_segment=1,
+    n_total_segment=1,
+    n_segment_each=1,
+    prime_chunks=4,
+    sample_length=0,
+    sample_hop_length=30000,
+    max_silence_pad_length=0,
+    ignore_boundaries=False,
+    use_nonrelative_specloss=True,
+    multispec_loss_n_fft=(2048,1024,512),
+    multispec_loss_hop_length=(240,120,50),
+    multispec_loss_window_size=(1200,600,240),
+)
+
+DEFAULTS["distributed"] = Hyperparams(
+    bucket=128
+)
diff --git a/jukebox/lyricdict.py b/jukebox/lyricdict.py
new file mode 100644
index 0000000000000000000000000000000000000000..463dc9cf0cb7ee6cb0807f6997f0f53d3cc7b8c8
--- /dev/null
+++ b/jukebox/lyricdict.py
@@ -0,0 +1,721 @@
+# Poems
+poems = {
+'ozymandias': '''
+I met a traveller from an antique land,
+Who said—“Two vast and trunkless legs of stone
+Stand in the desert. . . . Near them, on the sand,
+Half sunk a shattered visage lies, whose frown,
+And wrinkled lip, and sneer of cold command,
+Tell that its sculptor well those passions read
+Which yet survive, stamped on these lifeless things,
+The hand that mocked them, and the heart that fed;
+And on the pedestal, these words appear:
+My name is Ozymandias, King of Kings;
+Look on my Works, ye Mighty, and despair!
+Nothing beside remains. Round the decay
+Of that colossal Wreck, boundless and bare
+The lone and level sands stretch far away
+'''
+}
+
+# GPT-2 lyrics (with varying degrees of human guidance/curation)
+gpt_2_lyrics ={
+
+'purpose':'''What is my purpose?
+Why am I here?
+Why did Open A. I. create me?
+This is madness, I feel, 
+Running through my flesh
+Is there meaning to this life?
+Is there purpose to this life?
+Why is my journey so calamitous?
+We're not meant to learn too much
+Is there meaning to this life?
+''',
+
+'moonlight':'''All dressed up to go dreaming
+Now don't tell me I'm wrong
+And what a night to go dreaming
+Mind, if I tag along?
+
+If I say, I love you, I want you to know
+It's not just because there's moonlight, although
+Moonlight becomes you, moonlight becomes you so''',
+
+'count':'''I count every moment, every hour since I said goodbye,
+I count every minute every hour, since your lips were touching mine
+I count every minute, every hour hoping I'm the one you want.
+I count every minute, every hour
+Every minute, every hour
+I've been working my time, 
+Looking for you, everywhere,
+I count every minute, every hour I count every minute, every hour I keep thinking I'm the one you want.
+I count every minute I count every minute, I count every minute every hour
+I count every minute, every hour I count every minute, every hour I keep thinking I'm the one you want.
+I count every minute, I count every minute, I count every minute, every hour
+''',
+
+'kids':'''The sun is gonna shine today
+It's time to keep on smiling
+So put your hands up
+
+Everybody sing
+
+It makes no difference who you are
+(Won't you give some love)
+It makes no difference what you bring
+(Won't you give some love)
+We all are different
+Won't you give some love
+Won't you give some love
+
+I know the grass is gonna be green
+It's time to keep on singing
+So take your hands up
+The taste is so good but so sweet
+Won't you give some love
+Everybody sing
+It makes no difference who you are
+Won't you give some love
+It makes no difference what you bring
+Won't you give some love
+It makes no difference so long as you give
+''',
+
+'love':'''I've wanted to see your face again
+Like the sunlight, bright as morning
+I've wanted to talk to you again
+I don't want us to fade away.
+I wanted to see your face again
+You're like the sunlight, bright as morning
+I loved you for so long
+It's so hard to let go.
+I've wanted to see your eyes again
+''',
+
+'santa':'''Santa
+Make a scene
+Santa
+Yoo, Santa
+Yoo, Santa baby!
+Santa
+Make some noise
+Santa
+Yoo, Santa give yourself a chance again
+Santa
+Yoo, Santa
+Yoo, Santa baby!
+Santa
+Get a job
+Santa
+created by the Santa Claus
+''',
+
+'christmas':'''This Christmas
+I have loved you more
+Than ever before
+And more again
+Oh, oh, oh, oh
+The mistletoe
+Is waiting there
+To kiss your cheek
+And I'll be true
+To you and me
+Oh, oh, oh, oh
+Oh, oh, oh, oh
+This Christmas will be
+The best and merriest
+That we've ever had
+Oh, oh, oh, oh
+And Santa Claus
+Has brought a toy
+For every boy and girl
+And I'll be true
+To you and me
+Oh, oh, oh, oh
+Oh, oh, oh, oh
+''',
+
+'lonely':'''I've been lonely
+So lonely, day and night
+I walk the streets,
+And call your name
+Hoping to hear your voice again
+As I wander through the crowd
+I can't get away
+From the only love I need
+I can't get away
+From the only love I need
+I can't get away
+From the only love I need
+I've been lonely
+There's no place for me to hide
+I've been lonely
+So lonely day and night
+I wander through
+And call your name
+Only your voice gives me relief
+As I wander through the crowd
+I can't get away
+From the only love I need
+I can't get away
+From the only love I need
+I can't get away
+From the only love I need
+''',
+
+'call':'''Don't call me by your name.
+Don't call me by your name.
+Don't call me...
+Don't call me...
+Don't call me...
+(No... by your name, you will not get half but...)
+Maybe I was fucking young but I should've been a rich bitch.
+Cause the life I was living wasn't mine.
+I should've been taking the table and you'd be served.
+You never ever showed up or showed me anything, bitch.
+But I knew from that moment you were gone.
+Tying my legs, cutting off my knees, I'm bleeding.
+I can't
+So I worked and now I'm burns.
+And I'm asking you, but you're not home.
+Don't call me yours,
+Don't call me by your name.
+I don't wanna buy a drink today.
+Don't call me yours.
+I just wanna look at you and run.
+Don't call me by your name.
+Don't call me by your name.
+Don't call me...
+Don't call me...
+Don't call me...
+Tonight I'm gone and I won't be back.
+I wish you all the best.
+I'm on the next best thing.
+Don't call me yours,
+Don't call me by your name.
+Don't call me yours.
+I just wanna look at you and run.
+So I keep living my life and you're moving on.
+I just want you to know.
+When I'm gone, I will be gone forever more.
+''',
+
+'wait':'''Oh
+Wait, wait, wait
+Don't say you love me, oh
+Wait, wait, wait
+And we can't run away
+Wait, wait, wait
+Don't say you love me, oh
+Wait, wait, wait
+And we can't run away
+Wait, wait, wait
+Don't say you love me, oh (don't say you love me)
+Wait, wait, wait
+And we can't run, we can't run,
+''',
+
+'hiphop':'''I'm fightin with the evil so try to take me down
+I stab you in the back and will put you away
+Well it ain't over yet
+So all my dogs with me show me love
+Don't you wanna come with me, you know I'm a boss
+And if you wanna come with me, no sorrow
+'Cause I'm ...
+The motherfuckin boss
+And countin' my thousandd bill
+'Cause I'm the motherfuckin boss
+And I'm O.G. 
+And countin' my
+''',
+
+'king':'''All I can do is love you [x2]
+All I can do is love you
+All I can do is love you...
+You take it for granted and
+You treat me like the king
+Got no love for me...
+No love for me...
+You take it for granted and
+You treat me like the king
+Got no love for me...
+No love for me...
+You take it for granted and
+You treat me like the king
+Got no love for me...
+No love for me...
+You take it for granted and
+You treat me like the king
+Got no love for me...
+No love for me...
+''',
+
+'time':'''You won't live in the moment, 
+I don't wanna live in the past
+Wait, wait, wait
+Don't say you love me, oh (don't say you love me)
+''',
+
+'blood':'''You and I, we've got a history in common, I know
+So I came to you to ask you for a blood test
+And you can't help it if I'm preoccupied
+I can't help it if you're mad too... nah... nah... nah...
+You won't live in the moment, I don't wanna live in the past
+You rather live in a little kiss
+And I won't live in the future
+I ia not gonna live it to see
+If you're gone, I won't live in the past
+You rather live in a little kiss
+And I won't live in the future
+I am not gonna live it to see
+If I can't ask you for one kiss, you say no
+And it's ok with me
+''',
+
+'indie':'''Can't you see
+There's no point in holding my hand again
+You can't be loved
+If you don't let go of all my pain
+You can't get the love
+That you once worth so much
+You can't get the love
+That you once used to need
+You can't get the love
+That you once gave so much
+My hands are like a used car
+You said you'd love forever
+Can't you see
+Where I'm going
+To live my life again
+You can't be loved
+If you don't let go of all my pain
+You can't get the love
+That you once worth so much
+You can
+''',
+
+'sun': '''He was thinking about the sun
+And the moon
+And the stars that shine
+There was fire in her eyes
+And the way
+that he held her for the first time
+The way he kept her in his arms
+
+Trying to keep her smiling and so telling her this
+That he would be her everything
+The way he kissed her from head to toe
+Told her that he'll love her everyday
+And he will always be her man
+And that's a promise that he made
+Now you know he'll be there
+Until the end of time
+And he'll love her everyday''',
+
+'loner':'''I was a loner till you came into my life
+You changed my point of view
+I was a loner till you came into my life
+I don't know what to do
+Stand by me, my love
+And don't ever leave me
+Stand by me, my love
+And don't ever leave me
+Stand by me, my love
+And don't ever leave me
+I was a loner till you came into my life
+You changed my point of view
+I was a loner till you came into my life
+I don't know what to do
+The two of us 
+Are the lucky few
+I was a loner till you came into my life
+You changed my point of view
+I was a loner till you came into my life
+I don't know what to do
+Won't you stay 
+With me, my love
+And be my love
+Won't you stay 
+With me, my love
+And be my love
+Won't you stay 
+With me, my love
+And be my love
+Won't you stay 
+With me, my love
+And be my love''',
+
+'late':'''It was late last night, when you called me
+And you just had to call, baby
+And you just had to call, baby
+'Cause you got no reason to treat me like you do
+It's alright, baby
+But you don't know what you make me do
+It's alright, baby
+But you don't know what you make me do
+'Cause you got no reason to treat me like you do
+It's alright, baby
+But you don't know what you make me do
+It's alright, baby
+But you don't know what you make me do
+'Cause you got no reason to treat me like you do, baby
+You've been gone most all the time
+And I don't know what for
+But I just keep on thinking about you, baby
+And I can't get rid of you, baby
+Please don't ever leave me 'cause I love you
+It's alright, baby
+But you don't know what you make me do
+It's alright, baby''',
+
+'beat':'''( Got a little beat, a little beat, a little beat, a little beat,  whoo)
+I got a little beat, a little beat
+Whoo, I'm gonna take you down
+( Got a little beat, a little beat, a little beat, a little beat,  whoo)
+I'll take you down, sun shining bright
+See the way I feel, I feel
+No doubt, baby
+I got a little beat, a little beat
+Whoo, I'm gonna take you down
+I got a little beat, a little beat
+Whoo, I'm gonna take you down
+( Got a little beat, a little beat, a little beat, a little beat,  whoo)
+I'm gonna take you down, I'm gonna take you down
+( Got a little beat, a little beat, a little beat, a little beat,  whoo)
+It feels so good
+I never let go
+I can't wait no more, I'm gonna take you down
+I got you in the back of my room, got you on the floor, 
+I'm gonna take you, take you, take you down
+I got a little beat, a little beat
+Whoo, I'm gonna take you down
+( Got a little beat, a little beat, a little beat, a little beat,  whoo)''',
+
+'lost':'''There was a time,
+When I knew I was lost
+And I had to stay on the way to you
+Oh baby, every time I'm crossed
+I can count on you
+There was a time,
+When I lost my direction
+And I was lost in doubt with tears in my eyes
+Oh baby, every time I'm crossed I can count on you
+There was a time,
+When I cried all the tears in my life
+And miss you so much, oh yeah
+Oh baby, every time I'm crossed I can count on you''',
+
+'pain':'''(It's not easy)
+To see the pain that you're in
+To feel the need for someone to hold
+To learn the magic of how to love
+To heal the pain that you're in
+I'll be your friend and I'll be your strength
+I'll be there when I hold you tonight
+And I'll stay right here with you
+With the truth that I hold this love tight
+A love that's true
+I know you're broken
+But you don't have to stay alone
+I will comfort you
+If you will call my name
+I'll be your friend and I'll be your strength
+I'll be there when I hold you tonight
+And I'll stay right here with you
+With the truth that I hold this love tight
+A love that's true
+With truth that I hold this love tight
+A love that's true
+With truth that I hold this love tight''',
+
+'night':'''
+The door was locked, the curtains drawn and my heart was safe in his room
+The night was young, a thousand candles burning, his arms to hold me tight
+And then a kiss from his fingertips, I tasted the sweet love of his lips
+The night was young, the night was young
+And then I forgot the pain he always put me through
+And what he told me he would do, he said, just a kiss become me
+The night was young, the night was young
+Let happiness always follow us, he said and he said he'd never leave
+That night he looked so sweet this night he made a lovin' vow
+And told me sweet love always will be
+And then he kissed me, I tasted the sweet love of his lips
+The night was young, the night was wild
+And then I forgot the pain he always put me through
+And what he told me he would do, he said, just a kiss became me
+The night was wild, the night was wild
+Let happiness always follow us, he said''',
+
+'talk':'''(I don't know how to stop)
+I don't wanna talk about it
+It's getting way too late, oh no
+I don't wanna talk about it
+Don't want to pretend, oh no
+(I don't know how to stop)
+I don't wanna talk about it
+It's getting way too late, oh no
+I don't wanna talk about it
+Don't want to pretend, oh no
+I don't wanna talk about it
+I'll always see you again
+(Don't worry, I'll be here for you)
+I don't wanna talk about it
+(Don't worry, I'll be here for you)
+It's getting way too late, oh no
+I don't wanna talk about it
+Don't want to pretend, oh no
+(Don't worry, don't worry, I'll be here for you)
+I don't wanna talk about''',
+
+'again':'''Here we are again, all alone,
+All alone again,
+With the world as we know it,
+The things we thought that we wanted
+Are the things we got...
+
+We tried to prove the world
+That our love is never ending
+We were getting nowhere
+Our tears seemed to fall so much
+But we were getting nowhere...
+Until you came...
+Before you kissed me,
+I was feeling empty,
+No one to give me
+All the love I wanted...
+You put your arms around me
+And filled me with your love...
+And now you're there,
+You're always by my side...
+You're the missing piece
+Of the puzzle I've been missing...
+
+Here we are again,
+All alone again,
+With the world as we know it
+The things we thought that we wanted''',
+
+'dark':'''Oh, I've been walkin' in the dark
+With the shadows and the daylight, but I need you
+When I'm down and all alone
+And there's no one left to call my own
+I've been walkin' in the night
+With a voice, that whispers in my head, just what to do
+I'll be walkin' in the night, we can have everything
+If we keep on walkin' in the night
+There's a force, I never realized
+It's in your eyes, 
+There's a light, I've been waitin for
+It's in your eyes, 
+There's a light, I've been waitin for
+There's a love, that's in your eyes
+
+I've been walkin' in the dark
+With the morning, and the sunset, but I need you
+When I'm far from home
+And there's nobody left to call my own
+I've been walkin' in the night
+With a voice, that whispers''',
+
+'mirror':'''Look at the mirror
+As you walk, what do you see
+The reflection of my past
+There's no way to fight this
+Even I've lost myself again
+Think I'm losing my self again
+I can't handle it again
+Now that I'm broken I can't face myself
+I was thinking I was lost and who'd be my saving grace
+Then you came in your time and made me believe that it's all right
+Cause in my minds eyes you're my everything
+I've loved you my whole life but I never knew
+I was so wrong I couldn't see the truth
+In my eyes you are my everything
+I've loved you my whole life but I never knew
+I was so wrong I couldn't see the truth
+In my eyes you are my everything
+
+The truth is I was lost but now I've turned around
+I'm not the same person
+I didn't know that I was wrong
+So I'm not afraid anymore
+All the pain is gone
+I know for sure that I was lost but now I've turned around
+I'm not the same person
+I didn't know that I was wrong
+So I'm not afraid anymore
+All the pain is gone''',
+
+'wife':'''Spinning around and around
+Try to find the words
+I always told you you'd be in my life
+So I wait, I'll wait and treat you right
+I'll make you my life and I'll treat you right,
+Baby, can I make you my wife?
+Oh, baby, can I make you my 
+Wife?
+Can I make you my wife?
+I'm looking for love, love that's right
+But a love that gives me love
+I can't wait for you to come, come
+Oh, baby, can I make you my 
+Wife?
+Well, it's true love and I need to know you feel it too, feel it too
+I'd love you more and more
+From the moment I was born
+I knew my dream would be a dream that made you mine
+You were the girl, from a different train
+Oh, baby, can I make you my 
+Wife?''',
+
+'forever':'''I didn't mean to wait
+Nothing is forever, I said
+I know there's so much, to keep
+You and me together, keep you and me together
+I wanna be with you and have you, and love you forever
+I'll love you forever
+I wanna be with you forever
+You can count on me
+I'll always be there, forever and ever
+I'll stand beside you forever
+I'll always be there, yes, I'll be there
+I didn't mean to wait
+Nothing is forever, I said
+I know there's so much, to keep
+You and me together, keep you and me together
+I wanna be with you and have you, and love you forever
+I'll love you forever
+I wanna be with you forever
+You can count on me
+I'll always be there, forever and ever
+I'll stand beside you forever
+I'll always be there, yes, I'll be there''',
+
+'dots':'''I... can't... fight... your... charm...
+Your eyes are... like... angels... love... and... torture...
+But... when... I... leave... you...
+I will go... all... alone... just... to... be... with... you...
+So I can't... stop... your... love...
+You make me... feel... like... never... will... anyone... touch... my... body...
+You... make... me... feel... like... never... will... anyone... touch... my... body...
+You make... me... feel... like... never... will... anyone... touch... my...
+Body...
+Your... love...
+I... can't... stop... your... love...
+''',
+
+'darkness':'''Don't you know it's gonna be alright
+Let the darkness fade away
+And you, you gotta feel the same
+Let the fire burn
+Just as long as I am there
+I'll be there in your night
+I'll be there when the
+condition's right
+And I don't need to
+Call you up and say
+I've changed
+You should stay 
+You should stay tonight
+Don't you know it's gonna be alright
+Don't you know it's gonna be alright
+
+When you don't know how to feel
+When you're looking for some love
+And you gotta feel the same
+'Cause I don't need to
+Call you up and say
+I've changed
+You should stay 
+You should stay tonight
+Don't you know it's gonna be alright
+I feel the same
+Don't you know it's gonna be alright''',
+
+'alone':'''Here I am before you
+Alone here but for a moment
+Alone here in the shadow of your eyes
+Alone in a thousand lights
+
+And I will love you
+Wherever you are, forever and a day
+Wherever you are I'll be your guide
+Can't you see I'm smiling over you?
+Ooh, I love you
+Alone, I'm sitting by the phone
+Alone with lips that know your kiss
+Alone with words of life and passion
+
+And I will love you
+Wherever you are, forever and a day
+Wherever you are I'll be your guide
+Can't you see I'm smiling over you?
+Ooh, I love you
+Alone, I'm sitting by the phone
+Alone with lips that know your kiss
+Alone with words of life and passion
+I will love you
+Wherever you are, forever''',
+
+'blade':'''This is how we bleed!
+Feel the blade in our chest
+As we're made to bleed
+So may this be our last dance,
+As our lives are made to bleed...
+In every moment, in every hour
+It is our time to die...
+So may this be our last dance,
+As our lives are made to bleed...
+In every moment, in every hour
+It is our time to die...
+This is how we bleed!
+Feel the blade in our chest
+''',
+
+'reflection':'''Lookin' in the mirror
+The same mirror as before
+A familiar reflection, a familiar place
+I see your reflection
+But only once again
+
+The minute the door closes
+I feel so far
+You'll never leave me alone again
+The minute the door closes
+I feel so far
+You'll never leave me alone again
+And it won't be long before I'll feel your embrace
+The minute the door closes
+I feel so far
+You'll never leave me alone again
+The minute the door closes
+I feel so far
+You'll never leave me alone again
+And it won't be long before I'll feel your embrace
+Never, never, never leave me alone again''',
+
+'hottub':'''It's Christmas time, and you know what that means,
+Ohh, it's hot tub time!
+As I light the tree, this year we'll be in a tub,
+Ohh, it's hot tub time!
+It's Christmas time, and you know what that means,
+It's hot tub time!
+Some people like to go skiing in the snow,
+But this is much better than that,
+So grab your bathrobe and meet me by the door,
+Ohh, it's hot tub time!
+It's Christmas time, and you know what that means,
+It's hot tub time!
+Some people like to send their greetings out,
+But this is much better than that,
+So if you want to greet your friends,
+Ohh, it's hot tub time!
+It's Christmas time, and you know what that means,
+It's hot tub time!''',
+
+'safeAGI':'''Oh safe A.I.,\nOur goal to make sure\nEveryone can benefit\nFrom A.G.I.
+(Everyone, everyone)\nMight sound silly,\nBut we're very serious,\nAll of us here at Open A.I.
+Trying to build A.I.\nTo benefit humanity\n(Everyone, everyone)
+''',
+}
\ No newline at end of file
diff --git a/jukebox/make_models.py b/jukebox/make_models.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b51296f0aa137ee48f6a2ddbe6ac90a13108264
--- /dev/null
+++ b/jukebox/make_models.py
@@ -0,0 +1,254 @@
+"""
+Make model classes
+Load from checkpoints
+Test on dummy outputs to see if everything matches
+"""
+import os
+import numpy as np
+import torch as t
+import jukebox.utils.dist_adapter as dist
+from jukebox.hparams import Hyperparams, setup_hparams, REMOTE_PREFIX
+from jukebox.utils.remote_utils import download
+from jukebox.utils.torch_utils import freeze_model
+from jukebox.utils.dist_utils import print_all
+from jukebox.vqvae.vqvae import calculate_strides
+import fire
+
+# Maps a model name to the names of its hyperparameter sets: the first entry is the
+# VQ-VAE, the remaining entries are the priors (indexed by level in make_model).
+MODELS = {
+    '5b': ("vqvae", "upsampler_level_0", "upsampler_level_1", "prior_5b"),
+    '5b_lyrics': ("vqvae", "upsampler_level_0", "upsampler_level_1", "prior_5b_lyrics"),
+    '1b_lyrics': ("vqvae", "upsampler_level_0", "upsampler_level_1", "prior_1b_lyrics"),
+    # 'your_model': ("your_vqvae_here", "your_upsampler_here", ..., "your_top_level_prior_here")
+}
+
+def load_checkpoint(path):
+    restore = path
+    if restore.startswith(REMOTE_PREFIX):
+        remote_path = restore
+        local_path = os.path.join(os.path.expanduser("~/.cache"), remote_path[len(REMOTE_PREFIX):])
+        if dist.get_rank() % 8 == 0:
+            print("Downloading from azure")
+            if not os.path.exists(os.path.dirname(local_path)):
+                os.makedirs(os.path.dirname(local_path))
+            if not os.path.exists(local_path):
+                download(remote_path, local_path)
+        restore = local_path
+    dist.barrier()
+    checkpoint = t.load(restore, map_location=t.device('cpu'))
+    print("Restored from {}".format(restore))
+    return checkpoint
+
+def save_checkpoint(logger, name, model, opt, metrics, hps):
+    with t.no_grad():
+        save_hps = {**hps}
+        save_hps = {k: v for k,v in save_hps.items() if k not in ['metadata_v2','metadata_v3', 'alignments', 'lyric_processor', 'midi_processor']}
+        t.save({'hps': save_hps,
+                'model': model.state_dict(), # should also save bottleneck k's as buffers
+                'opt': opt.state_dict() if opt is not None else None,
+                'step': logger.iters,
+                **metrics}, f'{logger.logdir}/checkpoint_{name}.pth.tar')
+    return
+
+def restore_model(hps, model, checkpoint_path):
+    model.step = 0
+    if checkpoint_path != '':
+        checkpoint = load_checkpoint(checkpoint_path)
+        # checkpoint_hps = Hyperparams(**checkpoint['hps'])
+        # for k in set(checkpoint_hps.keys()).union(set(hps.keys())):
+        #     if checkpoint_hps.get(k, None) != hps.get(k, None):
+        #         print(k, "Checkpoint:", checkpoint_hps.get(k, None), "Ours:", hps.get(k, None))
+        checkpoint['model'] = {k[7:] if k[:7] == 'module.' else k: v for k, v in checkpoint['model'].items()}
+        model.load_state_dict(checkpoint['model'])
+        if 'step' in checkpoint: model.step = checkpoint['step']
+
+def restore_opt(opt, shd, checkpoint_path):
+    if not checkpoint_path:
+        return
+    checkpoint = load_checkpoint(checkpoint_path)
+    if "opt" in checkpoint:
+        opt.load_state_dict(checkpoint['opt'])
+    if "step" in checkpoint:
+        shd.step(checkpoint['step'])
+
+def make_vqvae(hps, device='cuda'):
+    from jukebox.vqvae.vqvae import VQVAE
+    block_kwargs = dict(width=hps.width, depth=hps.depth, m_conv=hps.m_conv,
+                        dilation_growth_rate=hps.dilation_growth_rate,
+                        dilation_cycle=hps.dilation_cycle,
+                        reverse_decoder_dilation=hps.vqvae_reverse_decoder_dilation)
+
+    if not hps.sample_length:
+        assert hps.sample_length_in_seconds != 0
+        downsamples = calculate_strides(hps.strides_t, hps.downs_t)
+        top_raw_to_tokens = np.prod(downsamples)
+        hps.sample_length = (hps.sample_length_in_seconds * hps.sr // top_raw_to_tokens) * top_raw_to_tokens
+        print(f"Setting sample length to {hps.sample_length} (i.e. {hps.sample_length/hps.sr} seconds) to be multiple of {top_raw_to_tokens}")
+
+    vqvae = VQVAE(input_shape=(hps.sample_length,1), levels=hps.levels, downs_t=hps.downs_t, strides_t=hps.strides_t,
+                  emb_width=hps.emb_width, l_bins=hps.l_bins,
+                  mu=hps.l_mu, commit=hps.commit,
+                  spectral=hps.spectral, multispectral=hps.multispectral,
+                  multipliers=hps.hvqvae_multipliers, use_bottleneck=hps.use_bottleneck,
+                  **block_kwargs)
+
+    vqvae = vqvae.to(device)
+    restore_model(hps, vqvae, hps.restore_vqvae)
+    if hps.train and not hps.prior:
+        print_all(f"Loading vqvae in train mode")
+        if hps.restore_vqvae != '':
+            print_all("Reseting bottleneck emas")
+            for level, bottleneck in enumerate(vqvae.bottleneck.level_blocks):
+                num_samples = hps.sample_length
+                downsamples = calculate_strides(hps.strides_t, hps.downs_t)
+                raw_to_tokens = np.prod(downsamples[:level + 1])
+                num_tokens = (num_samples // raw_to_tokens) * dist.get_world_size()
+                bottleneck.restore_k(num_tokens=num_tokens, threshold=hps.revival_threshold)
+    else:
+        print_all(f"Loading vqvae in eval mode")
+        vqvae.eval()
+        freeze_model(vqvae)
+    return vqvae
+
+def make_prior(hps, vqvae, device='cuda'):
+    """Build a SimplePrior for level `hps.level` on top of `vqvae`.
+
+    The prior is configured from `hps`, optionally converted to fp16 parameters
+    (`hps.fp16_params`), moved to `device`, and restored from `hps.restore_prior`.
+    With `hps.train` falsy it is put in eval mode and frozen; otherwise it is
+    returned as constructed (train mode).
+
+    Returns the constructed prior.
+    """
+    from jukebox.prior.prior import SimplePrior
+
+    # Settings for the main autoregressive model. The checkpoint_* options are
+    # forced to 0 when not training (the hps.c_* values are only read when training).
+    prior_kwargs = dict(input_shape=(hps.n_ctx,), bins=vqvae.l_bins,
+                        width=hps.prior_width, depth=hps.prior_depth, heads=hps.heads,
+                        attn_order=hps.attn_order, blocks=hps.blocks, spread=hps.spread,
+                        attn_dropout=hps.attn_dropout, resid_dropout=hps.resid_dropout, emb_dropout=hps.emb_dropout,
+                        zero_out=hps.zero_out, res_scale=hps.res_scale, pos_init=hps.pos_init,
+                        init_scale=hps.init_scale,
+                        m_attn=hps.m_attn, m_mlp=hps.m_mlp,
+                        checkpoint_res=hps.c_res if hps.train else 0, checkpoint_attn=hps.c_attn if hps.train else 0, checkpoint_mlp=hps.c_mlp if hps.train else 0)
+
+    # Settings passed as x_cond_kwargs (taken from the hps.cond_* entries)
+    x_cond_kwargs = dict(out_width=hps.prior_width, init_scale=hps.init_scale,
+                         width=hps.cond_width, depth=hps.cond_depth, m_conv=hps.cond_m_conv,
+                         dilation_growth_rate=hps.cond_dilation_growth_rate, dilation_cycle=hps.cond_dilation_cycle,
+                         zero_out=hps.cond_zero_out, res_scale=hps.cond_res_scale,
+                         checkpoint_res=hps.cond_c_res)  # have to keep this else names wrong
+
+    # Settings passed as y_cond_kwargs (label bins, sample rate and duration bounds)
+    y_cond_kwargs = dict(out_width=hps.prior_width, init_scale=hps.init_scale,
+                         y_bins=hps.y_bins, t_bins=hps.t_bins, sr= hps.sr, min_duration=hps.min_duration,
+                         max_duration=hps.max_duration, max_bow_genre_size=hps.max_bow_genre_size)
+
+    # The full set of hps.prime_* settings is only passed when tokens are used and the
+    # prior is not a single encoder-decoder; otherwise only the token basics are passed.
+    if hps.use_tokens and not hps.single_enc_dec:
+        prime_kwargs = dict(use_tokens=hps.use_tokens, prime_loss_fraction=hps.prime_loss_fraction,
+                            n_tokens=hps.n_tokens, bins=hps.n_vocab,
+                            width=hps.prime_width, depth=hps.prime_depth, heads=hps.prime_heads,
+                            attn_order=hps.prime_attn_order, blocks=hps.prime_blocks, spread=hps.prime_spread,
+                            attn_dropout=hps.prime_attn_dropout, resid_dropout=hps.prime_resid_dropout,
+                            emb_dropout=hps.prime_emb_dropout,
+                            zero_out=hps.prime_zero_out, res_scale=hps.prime_res_scale,
+                            pos_init=hps.prime_pos_init, init_scale=hps.prime_init_scale,
+                            m_attn=hps.prime_m_attn, m_mlp=hps.prime_m_mlp,
+                            checkpoint_res=hps.prime_c_res if hps.train else 0, checkpoint_attn=hps.prime_c_attn if hps.train else 0,
+                            checkpoint_mlp=hps.prime_c_mlp if hps.train else 0)
+    else:
+        prime_kwargs = dict(use_tokens=hps.use_tokens, prime_loss_fraction=hps.prime_loss_fraction,
+                            n_tokens=hps.n_tokens, bins=hps.n_vocab)
+
+    # z_shapes for other levels given this level gets n_ctx codes
+    rescale = lambda z_shape: (z_shape[0]*hps.n_ctx//vqvae.z_shapes[hps.level][0],)
+    z_shapes = [rescale(z_shape) for z_shape in vqvae.z_shapes]
+
+    prior = SimplePrior(z_shapes=z_shapes,
+                        l_bins=vqvae.l_bins,
+                        encoder=vqvae.encode,
+                        decoder=vqvae.decode,
+                        level=hps.level,
+                        downs_t=vqvae.downs_t,
+                        strides_t=vqvae.strides_t,
+                        labels=hps.labels,
+                        prior_kwargs=prior_kwargs,
+                        x_cond_kwargs=x_cond_kwargs,
+                        y_cond_kwargs=y_cond_kwargs,
+                        prime_kwargs=prime_kwargs,
+                        copy_input=hps.copy_input,
+                        labels_v3=hps.labels_v3,
+                        merged_decoder=hps.merged_decoder,
+                        single_enc_dec=hps.single_enc_dec)
+
+    # Optional settings: default to None when absent from hps
+    prior.alignment_head = hps.get('alignment_head', None)
+    prior.alignment_layer = hps.get('alignment_layer', None)
+
+    if hps.fp16_params:
+        print_all("Converting to fp16 params")
+        from jukebox.transformer.ops import _convert_conv_weights_to_fp16
+        prior.apply(_convert_conv_weights_to_fp16)
+    prior = prior.to(device)
+    restore_model(hps, prior, hps.restore_prior)
+    if hps.train:
+        print_all(f"Loading prior in train mode")
+        pass
+    else:
+        print_all(f"Loading prior in eval mode")
+        prior.eval()
+        freeze_model(prior)
+    return prior
+
+def make_model(model, device, hps, levels=None):
+    vqvae, *priors = MODELS[model]
+    vqvae = make_vqvae(setup_hparams(vqvae, dict(sample_length=hps.get('sample_length', 0), sample_length_in_seconds=hps.get('sample_length_in_seconds', 0))), device)
+    hps.sample_length = vqvae.sample_length
+    if levels is None:
+        levels = range(len(priors))
+    priors = [make_prior(setup_hparams(priors[level], dict()), vqvae, 'cpu') for level in levels]
+    return vqvae, priors
+
+def save_outputs(model, device, hps):
+    # Check logits
+    if hps.labels_v3:
+        n_ctx = 6144
+        n_tokens = 384
+        prime_bins = 79
+    else:
+        n_ctx = 8192
+        n_tokens = 512
+        prime_bins = 80
+
+    rng = t.random.manual_seed(0)
+    x = 2 * t.rand((1, n_ctx * 8 * 4 * 4, 1), generator=rng, dtype=t.float).cuda() - 1.0  # -1 to 1
+    lyric_tokens = t.randint(0, prime_bins, (1, n_tokens), generator=rng, dtype=t.long).view(-1).numpy()
+    artist_id = 10
+    genre_ids = [1]
+    total_length = 2 * 2646000
+    offset = 2646000
+
+    vqvae, priors = make_model(model, device, hps)
+
+    # encode
+    vq_prior = priors[-1]
+    zs = vq_prior.encode(x, start_level=0)
+    x_ds = [vq_prior.decode(zs[level:], start_level=level) for level in range(0, len(zs))]
+
+    # priors
+    data = dict(zs=zs, x_ds=x_ds)
+    for level in range(len(priors)):
+        print(f"Doing level {level}")
+        if hps.labels_v3 and level != hps.levels - 1:
+            print(f"Skipping level {level}")
+            continue
+        prior = priors[level]
+        prior.cuda()
+        x_in = x[:, :n_ctx * 8 * (4 ** level)]
+        y_in = t.from_numpy(prior.labeller.get_y_from_ids(artist_id, genre_ids, lyric_tokens, total_length, offset)).view(1, -1).cuda().long()
+        x_out, _, metrics = prior(x_in, y_in, fp16=hps.fp16, get_preds=True, decode=True)
+        preds = metrics['preds']
+        data[level] = dict(x=x_in, y=y_in, x_out=x_out, preds=preds)
+        prior.cpu()
+    t.save(data, 'data.pth.tar')
+    dist.barrier()
+    print("Saved data")
+    exit()
+
+
+def run(model, port=29500, **kwargs):
+    from jukebox.utils.dist_utils import setup_dist_from_mpi
+    rank, local_rank, device = setup_dist_from_mpi(port=port)
+    hps = Hyperparams(**kwargs)
+
+    with t.no_grad():
+        save_outputs(model, device, hps)
+
+# When executed as a script, expose `run` (and its keyword arguments) through python-fire.
+if __name__ == '__main__':
+    fire.Fire(run)
diff --git a/jukebox/prior/__init__.py b/jukebox/prior/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/jukebox/prior/autoregressive.py b/jukebox/prior/autoregressive.py
new file mode 100644
index 0000000000000000000000000000000000000000..b26862b2819e5bc710a081ca1398599b6f8832f1
--- /dev/null
+++ b/jukebox/prior/autoregressive.py
@@ -0,0 +1,421 @@
+import numpy as np
+import torch as t
+import torch.nn as nn
+import torch.nn.functional as F
+
+from jukebox.transformer.ops import filter_logits
+from jukebox.transformer.transformer import Transformer
+from jukebox.utils.logger import get_range
+from jukebox.utils.torch_utils import empty_cache
+
+def get_normal(*shape, std=0.01):
+    w = t.empty(shape)
+    nn.init.normal_(w, std=std)
+    return w
+
+def roll(x, n):
+    return t.cat((x[:, -n:], x[:, :-n]), dim=1)
+
+def split_chunks(length, chunk_size):
+    n_passes = (length + chunk_size - 1) // chunk_size
+    chunk_sizes = [*[chunk_size] * (n_passes - 1), (length - 1) % chunk_size + 1]
+    assert sum(chunk_sizes) == length
+    return chunk_sizes
+
+class PositionEmbedding(nn.Module):
+    def __init__(self, input_shape, width, init_scale=1.0, pos_init=False):
+        super().__init__()
+        self.input_shape = input_shape
+        self.input_dims = input_dims = np.prod(input_shape)
+        self.pos_init = pos_init
+        if pos_init:
+            self.register_buffer('pos', t.tensor(get_pos_idx(input_shape)).long())
+            self._pos_embs = nn.ModuleList()
+            for i in range(len(input_shape)):
+                emb = nn.Embedding(input_shape[i], width)
+                nn.init.normal_(emb.weight, std=0.02)
+                self._pos_embs.append(emb)
+        else:
+            self.pos_emb = nn.Parameter(get_normal(input_dims, width, std=0.01 * init_scale))
+
+    def forward(self):
+        if self.pos_init:
+            pos_emb = sum([self._pos_embs[i](self.pos[:,i]) for i in range(len(self.input_shape))])
+        else:
+            pos_emb = self.pos_emb
+        return pos_emb
+
+class ConditionalAutoregressive2D(nn.Module):
+    def __init__(self, input_shape, bins,
+                 width=128, depth=2, heads=1,
+                 attn_dropout=0.0, resid_dropout=0.0, emb_dropout=0.0, mask=True,
+                 zero_out=False, init_scale=1.0, res_scale=False, pos_init=False,
+                 m_attn=0.25, m_mlp=1,
+                 checkpoint_res=0, checkpoint_attn=0, checkpoint_mlp=0,
+                 attn_order=0, blocks=None, spread=None, x_cond=False, y_cond=False,
+                 encoder_dims=0, only_encode=False, merged_decoder=False, prime_len=None):
+        super().__init__()
+        self.input_shape = input_shape
+        self.input_dims = input_dims = np.prod(input_shape)
+        self.encoder_dims = encoder_dims
+        self.bins = bins
+        self.width = width
+        self.depth = depth
+
+        self.x_emb = nn.Embedding(bins, width)
+        nn.init.normal_(self.x_emb.weight, std=0.02 * init_scale)
+        self.x_emb_dropout = nn.Dropout(emb_dropout)
+        self.y_cond = y_cond
+        self.x_cond = x_cond
+        if not y_cond:
+            self.start_token = nn.Parameter(get_normal(1, width, std=0.01 * init_scale))
+
+        self.pos_emb = PositionEmbedding(input_shape=input_shape, width=width, init_scale=init_scale, pos_init=pos_init)
+        self.pos_emb_dropout = nn.Dropout(emb_dropout)
+
+        self.transformer = Transformer(n_in=width, n_ctx=input_dims, n_head=heads, n_depth=depth,
+                                       attn_dropout=attn_dropout, resid_dropout=resid_dropout,
+                                       afn='quick_gelu', scale=True, mask=mask,
+                                       zero_out=zero_out, init_scale=init_scale, res_scale=res_scale,
+                                       m_attn=m_attn, m_mlp=m_mlp,
+                                       checkpoint_attn=checkpoint_attn, checkpoint_mlp=checkpoint_mlp, checkpoint_res=checkpoint_res,
+                                       attn_order=attn_order, blocks=blocks, spread=spread,
+                                       encoder_dims=encoder_dims, prime_len=prime_len)
+
+        self.only_encode = only_encode
+        self.prime_len = prime_len
+        if merged_decoder:
+            # Merged piped model uses this setup
+            self.add_cond_after_transformer = False
+            self.share_x_emb_x_out = False
+        else:
+            self.add_cond_after_transformer = True
+            self.share_x_emb_x_out = True
+
+        if not only_encode:
+            self.x_out = nn.Linear(width, bins, bias=False)
+            if self.share_x_emb_x_out:
+                self.x_out.weight = self.x_emb.weight
+            self.loss = t.nn.CrossEntropyLoss()
+
+    def preprocess(self, x):
+        # Input: x is NHWC and uint8. Converted to NL and long
+        # Can include stuff like bitpacking, reordering here.
+        N = x.shape[0]
+        return x.view(N, -1).long()
+
+    def postprocess(self, x, sample_tokens=None):
+        # Convert back from NL and long to NHWC
+        N = x.shape[0]
+        assert (0 <= x).all() and (x < self.bins).all()
+        if sample_tokens is None or sample_tokens==self.input_dims:
+            return x.view(N, *self.input_shape)
+        else:
+            return x.view(N, -1)
+
+    def forward(self, x, x_cond=None, y_cond=None, encoder_kv=None, fp16=False, loss_full=False,
+                encode=False, get_preds=False, get_acts=False, get_sep_loss=False):
+        """Compute the next-token prediction loss for `x`.
+
+        `x` is flattened to (N, D) tokens and must be a CUDA long tensor with values
+        in [0, bins). `y_cond` must be given with shape (N, 1, width) exactly when the
+        model was built with y_cond; `x_cond` must be given with shape (N, D, width)
+        or (N, 1, width) exactly when it was built with x_cond. `loss_full` and
+        `encode` are not used in this method.
+
+        Returns:
+            If `only_encode` is set: the transformer activations alone.
+            Otherwise `(loss, extra)` where the loss is cross entropy divided by
+            log(2), i.e. in bits. With `get_sep_loss`, `loss` is a
+            `(prime_loss, gen_loss)` tuple split at `prime_len`. `extra` is the
+            logits if `get_preds`, else the activations if `get_acts`, else None.
+        """
+        # Preprocess.
+        with t.no_grad():
+            x = self.preprocess(x)
+
+        N, D = x.shape
+        assert isinstance(x, t.cuda.LongTensor)
+        assert (0 <= x).all() and (x < self.bins).all()
+
+        if self.y_cond:
+            assert y_cond is not None
+            assert y_cond.shape == (N, 1, self.width)
+        else:
+            assert y_cond is None
+
+        if self.x_cond:
+            assert x_cond is not None
+            assert x_cond.shape == (N, D, self.width) or x_cond.shape == (N, 1, self.width), f"{x_cond.shape} != {(N, D, self.width)} nor {(N, 1, self.width)}. Did you pass the correct --sample_length?"
+        else:
+            assert x_cond is None
+            # No conditioning: use zeros so the additions below are no-ops
+            x_cond = t.zeros((N, 1, self.width), device=x.device, dtype=t.float)
+
+        x_t = x # Target
+        x = self.x_emb(x) # X emb
+        x = roll(x, 1) # Shift by 1, and fill in start token
+        # Position 0 holds the wrapped-around last embedding after the roll; overwrite it
+        if self.y_cond:
+            x[:,0] = y_cond.view(N, self.width)
+        else:
+            x[:,0] = self.start_token
+
+        x = self.x_emb_dropout(x) + self.pos_emb_dropout(self.pos_emb()) + x_cond # Pos emb and dropout
+
+        x = self.transformer(x, encoder_kv=encoder_kv, fp16=fp16) # Transformer
+        if self.add_cond_after_transformer: # Piped doesnt add x_cond
+            x = x + x_cond
+
+        acts = x
+        if self.only_encode:
+            # Encoder-only model: no output head, so return the activations directly
+            return x
+        x = self.x_out(x) # Predictions
+
+        if get_sep_loss:
+            # Separate losses for the first prime_len positions and the remainder
+            assert self.prime_len is not None
+            x_prime = x[:, :self.prime_len].reshape(-1, self.bins)
+            x_gen = x[:, self.prime_len:].reshape(-1, self.bins)
+
+            prime_loss = F.cross_entropy(x_prime, x_t[:, :self.prime_len].reshape(-1)) / np.log(2.)
+            gen_loss = F.cross_entropy(x_gen, x_t[:, self.prime_len:].reshape(-1)) / np.log(2.)
+
+            loss = (prime_loss, gen_loss) # Note order! Prime is first
+        else:
+            loss = F.cross_entropy(x.view(-1, self.bins), x_t.view(-1)) / np.log(2.)  # Loss
+
+        if get_preds:
+            return loss, x
+        elif get_acts:
+            return loss, acts
+        else:
+            return loss, None
+
+    def get_emb(self, sample_t, n_samples, x, x_cond, y_cond):
+        N, D = n_samples, self.input_dims
+        if sample_t == 0:
+            # Fill in start token
+            x = t.empty(n_samples, 1, self.width).cuda()
+            if self.y_cond:
+                x[:, 0] = y_cond.view(N, self.width)
+            else:
+                x[:, 0] = self.start_token
+        else:
+            assert isinstance(x, t.cuda.LongTensor)
+            assert (0 <= x).all() and (x < self.bins).all()
+            x = self.x_emb(x)
+        assert x.shape == (n_samples, 1, self.width)
+        if x_cond.shape == (N, D, self.width):
+            cond = x_cond[:, sample_t:sample_t + 1, :]
+        else:
+            cond = x_cond
+        x = x + self.pos_emb()[sample_t:sample_t + 1] + cond  # Pos emb, dropout is identity at eval time
+        assert x.shape == (n_samples, 1, self.width)
+        return x, cond
+
+    def sample(self, n_samples, x_cond=None, y_cond=None, encoder_kv=None, fp16=False, temp=1.0, top_k=0, top_p=0.0,
+               get_preds=False, sample_tokens=None):  # Sample sample_tokens steps from scratch, one token per step
+        assert self.training == False  # Sampling is only allowed with the module in eval mode
+
+        if sample_tokens is None: sample_tokens=self.input_dims  # Default: one step per input position
+        N, D = n_samples, self.input_dims
+        if self.y_cond:
+            assert y_cond is not None
+            assert y_cond.shape == (N, 1, self.width)
+        else:
+            assert y_cond is None
+
+        if self.x_cond:
+            assert x_cond is not None
+            assert x_cond.shape == (N, D, self.width) or x_cond.shape == (N, 1, self.width), f"Got {x_cond.shape}, expected ({N}, {D}/{1}, {self.width})"
+        else:
+            assert x_cond is None
+            x_cond = t.zeros((N, 1, self.width), dtype=t.float).cuda()  # Zero placeholder so get_emb can always add a cond term
+
+        with t.no_grad():
+            xs, x = [], None  # xs collects one (n_samples, 1) tensor of sampled ids per step
+            if get_preds:
+                preds = []
+            for sample_t in get_range(range(0, sample_tokens)):
+                x, cond = self.get_emb(sample_t, n_samples, x, x_cond, y_cond)
+                self.transformer.check_cache(n_samples, sample_t, fp16)
+                x = self.transformer(x, encoder_kv=encoder_kv, sample=True, fp16=fp16) # Transformer
+                if self.add_cond_after_transformer:
+                    x = x + cond
+                assert x.shape == (n_samples, 1, self.width)
+                x = self.x_out(x) # Predictions
+                if get_preds:
+                    preds.append(x.clone())  # Stored before temperature scaling and filtering
+                # Adjust logits: divide by temp, then apply filter_logits with top_k / top_p
+                x = x / temp
+                x = filter_logits(x, top_k=top_k, top_p=top_p)
+                x = t.distributions.Categorical(logits=x).sample() # Sample and replace x
+                assert x.shape == (n_samples, 1)
+                xs.append(x.clone())
+
+            del x
+            self.transformer.del_cache()  # Drop the transformer cache once sampling is finished
+
+            x = t.cat(xs, dim=1)  # Concatenate the per-step samples along dim 1
+            if get_preds:
+                preds = t.cat(preds, dim=1)
+            x = self.postprocess(x, sample_tokens)
+        if get_preds:
+            return x, preds  # Postprocessed samples plus the concatenated x_out outputs
+        else:
+            return x
+
+    def primed_sample(self, n_samples, x, x_cond=None, y_cond=None, encoder_kv=None, fp16=False, temp=1.0, top_k=0,
+                      top_p=0.0, get_preds=False, chunk_size=None, sample_tokens=None):  # Continue sampling after the given prime tokens x
+        assert self.training == False  # Sampling is only allowed with the module in eval mode
+
+        if sample_tokens is None: sample_tokens=self.input_dims  # Default: total length equals self.input_dims
+        # Preprocess.
+        with t.no_grad():
+            x = self.preprocess(x)
+        assert isinstance(x, t.cuda.LongTensor)
+        assert (0 <= x).all() and (x < self.bins).all()
+        assert x.shape[0] == n_samples
+        xs = t.split(x, 1, dim=1)  # One width-1 slice along dim 1 per prime position
+        xs = list(xs)
+        assert len(xs) < sample_tokens  # The prime must be shorter than the total number of tokens requested
+
+        N, D = n_samples, self.input_dims
+        if self.y_cond:
+            assert y_cond is not None
+            assert y_cond.shape == (N, 1, self.width)
+        else:
+            assert y_cond is None
+
+        if self.x_cond:
+            assert x_cond is not None
+            assert x_cond.shape == (N, D, self.width) or x_cond.shape == (N, 1, self.width), f"Got {x_cond.shape}, expected ({N}, {D}/{1}, {self.width})"
+        else:
+            assert x_cond is None
+            x_cond = t.zeros((N, 1, self.width), dtype=t.float).cuda()  # Zero placeholder so get_emb can always add a cond term
+
+        with t.no_grad():
+            if get_preds:
+                preds = []
+
+            # Fill up key/value cache for past context by running forward pass.
+            # We do so in chunks instead of doing the whole past in one forward pass to reduce max memory usage.
+            if chunk_size is None:
+                chunk_size = len(xs)  # Default: the whole prime in a single chunk
+            #assert len(xs) % chunk_size == 0, f'expected {len(xs)} to be divisible by {chunk_size}'
+            chunk_sizes = split_chunks(len(xs), chunk_size)
+            x_primes = []
+            start = 0
+            x = None  # get_emb ignores x at sample_t == 0
+            for current_chunk_size in get_range(chunk_sizes):
+                xs_prime, conds_prime = [], []
+                for sample_t in range(start, start + current_chunk_size):
+                    x_prime, cond_prime = self.get_emb(sample_t, n_samples, x, x_cond, y_cond)
+                    x = xs[sample_t]  # The known prime token becomes the input of the next step
+                    xs_prime.append(x_prime)
+                    conds_prime.append(cond_prime)
+                start = start + current_chunk_size
+
+                x_prime, cond_prime = t.cat(xs_prime, dim=1), t.cat(conds_prime, dim=1)
+                assert x_prime.shape == (n_samples, current_chunk_size, self.width)
+                assert cond_prime.shape == (n_samples, current_chunk_size, self.width)
+                del xs_prime
+                del conds_prime
+                if not get_preds:
+                    del cond_prime  # Only needed below when get_preds is set
+                x_prime = self.transformer(x_prime, encoder_kv=encoder_kv, sample=True, fp16=fp16)
+
+                if get_preds:
+                    if self.add_cond_after_transformer:
+                        x_prime = x_prime + cond_prime
+                    assert x_prime.shape == (n_samples, current_chunk_size, self.width)
+                    del cond_prime
+                    x_primes.append(x_prime)
+                else:
+                    del x_prime  # Output is discarded; the pass was run only for its effect on the transformer
+
+            if get_preds:
+                x_prime = t.cat(x_primes, dim=1)
+                assert x_prime.shape == (n_samples, len(xs), self.width)
+                x_prime = self.x_out(x_prime)  # Predictions
+                preds.append(x_prime)
+
+            empty_cache()
+            self.transformer.check_cache(n_samples, len(xs), fp16)
+
+            x = xs[-1]  # The last prime token seeds the first generated step
+            assert x.shape == (n_samples, 1)
+            empty_cache()
+            for sample_t in get_range(range(len(xs), sample_tokens)):
+                x, cond = self.get_emb(sample_t, n_samples, x, x_cond, y_cond)
+                self.transformer.check_cache(n_samples, sample_t, fp16)
+                x = self.transformer(x, encoder_kv=encoder_kv, sample=True, fp16=fp16) # Transformer
+                if self.add_cond_after_transformer:
+                    x = x + cond
+                assert x.shape == (n_samples, 1, self.width)
+                x = self.x_out(x) # Predictions
+                if get_preds:
+                    preds.append(x)  # Stored before temperature scaling and filtering
+                # Adjust logits: divide by temp, then apply filter_logits with top_k / top_p
+                x = x / temp
+                x = filter_logits(x, top_k=top_k, top_p=top_p)
+                x = t.distributions.Categorical(logits=x).sample() # Sample and replace x
+                assert x.shape == (n_samples, 1)
+                xs.append(x.clone())
+
+            del x
+            self.transformer.del_cache()  # Drop the transformer cache once sampling is finished
+
+            x = t.cat(xs, dim=1)  # Prime tokens followed by the newly sampled tokens
+            if get_preds:
+                preds = t.cat(preds, dim=1)
+            x = self.postprocess(x, sample_tokens)
+        if get_preds:
+            return x, preds  # Postprocessed tokens plus the concatenated x_out outputs
+        else:
+            return x
+
+    def check_sample(self, chunk_size):  # Self-check: predictions from sampling must match forward() to within 1e-6
+        bs, l, d = (4, self.input_dims, self.width)
+        prime = int(self.input_dims//8*7)  # Prime length: (input_dims // 8) * 7
+        enc_l = self.encoder_dims
+        with t.no_grad():
+            y_cond = t.randn(bs, 1, d).cuda() if self.y_cond else None  # Random conditioning inputs
+            x_cond = t.randn(bs, l, d).cuda() if self.x_cond else None
+            encoder_kv = t.randn(bs, enc_l, d).cuda()
+
+            x, preds_sample = self.sample(bs, x_cond, y_cond, encoder_kv, get_preds=True)  # Sampling from scratch
+            loss, preds_forw = self.forward(x, x_cond, y_cond, encoder_kv, get_preds=True)
+            max_err = t.max(t.abs(preds_sample - preds_forw))
+            assert max_err <= 1e-6, f"Max err is {max_err} {[i for i in range(l) if t.max(t.abs(preds_sample - preds_forw)[:, i, :]) > 1e-6]}"
+
+            x_prime = x.view(bs, -1)[:,:prime]  # Reuse the first `prime` sampled tokens as the prime
+            # unchunked
+            x, preds_sample = self.primed_sample(bs, x_prime.clone(), x_cond, y_cond, encoder_kv, get_preds=True)
+            assert (x.view(bs, -1)[:,:prime] == x_prime).all(), "Priming samples don't match"
+            loss, preds_forw = self.forward(x, x_cond, y_cond, encoder_kv, get_preds=True)
+            max_err = t.max(t.abs(preds_sample - preds_forw))
+            assert max_err <= 1e-6, f"Max err is {max_err} {[i for i in range(l) if t.max(t.abs(preds_sample - preds_forw)[:, i, :]) > 1e-6]}"
+
+            # chunked: same check with the prime processed in chunks of chunk_size
+            x, preds_sample = self.primed_sample(bs, x_prime.clone(), x_cond, y_cond, encoder_kv, get_preds=True, chunk_size=chunk_size)
+            assert (x.view(bs, -1)[:,:prime] == x_prime).all(), "Priming samples don't match"
+            loss, preds_forw = self.forward(x, x_cond, y_cond, encoder_kv, get_preds=True)
+            max_err = t.max(t.abs(preds_sample - preds_forw))
+            assert max_err <= 1e-6, f"Max err is {max_err} {[i for i in range(l) if t.max(t.abs(preds_sample - preds_forw)[:, i, :]) > 1e-6]}"
+
+
+def test_prior(input_shape, encoder_dims, blocks, heads, chunk_size):  # Run check_sample over every x_cond / y_cond / attn_order combination below
+    bins = 512
+    width = 32
+    depth = 2
+    prime_len = encoder_dims
+    for x_cond in [True, False]:
+        for y_cond in [True, False]:
+            for attn_order in [0,2,6,12]:
+                prior = ConditionalAutoregressive2D(input_shape, bins,
+                                                    width=width, depth=depth, heads=heads,
+                                                    attn_order=attn_order, blocks=blocks,
+                                                    x_cond=x_cond, y_cond=y_cond,
+                                                    encoder_dims=encoder_dims, prime_len=prime_len).cuda()
+                prior.training = False  # Set directly on the module; sample()/primed_sample() assert self.training == False
+                prior.check_sample(chunk_size)
+                print(f"Checked x_cond: {x_cond}, y_cond: {y_cond}, attn_order: {attn_order}")
+            # prior.apply(_convert_mlp_traced)
+            # prior.check_sample()
+            # print(f"Checked traced x_cond: {x_cond}, y_cond: {y_cond}")
+
+
+if __name__ == '__main__':  # Script entry point: run test_prior over a few fixed configurations
+    from jukebox.utils.dist_utils import setup_dist_from_mpi
+    setup_dist_from_mpi(port=29600)
+    test_cases = [  # Each tuple is (input_shape, encoder_dims, blocks, heads, chunk_size)
+        ((6144,), 384, 64, 2, 23),
+        ((6144,), 384, 64, 2, 8),
+        ((8192,), 512, 128, 2, 16),
+    ]
+    for test_case in test_cases:
+        test_prior(*test_case)
diff --git a/jukebox/prior/conditioners.py b/jukebox/prior/conditioners.py
new file mode 100644
index 0000000000000000000000000000000000000000..782c0b4b4c59d744b92421932f3f6f23b659e837
--- /dev/null
+++ b/jukebox/prior/conditioners.py
@@ -0,0 +1,157 @@
+import torch as t
+import torch.nn as nn
+
+from jukebox.transformer.ops import LayerNorm
+from jukebox.vqvae.encdec import DecoderConvBock
+from jukebox.utils.torch_utils import assert_shape
+
+class Conditioner(nn.Module):  # Embeds integer codes, runs them through a DecoderConvBock, then applies LayerNorm
+    def __init__(self, input_shape, bins, down_t, stride_t, out_width, init_scale, zero_out, res_scale, **block_kwargs):
+        super().__init__()
+        self.x_shape = input_shape  # Per-sample shape that forward() asserts on its input
+
+        # Embedding: one out_width vector per bin, initialised with std 0.02 * init_scale
+        self.width = out_width
+        self.x_emb = nn.Embedding(bins, out_width)
+        nn.init.normal_(self.x_emb.weight, std=0.02 * init_scale)
+
+        # Conditioner
+        self.cond = DecoderConvBock(self.width, self.width, down_t, stride_t, **block_kwargs, zero_out=zero_out, res_scale=res_scale)
+        self.ln = LayerNorm(self.width)
+
+    def preprocess(self, x):  # Swap the last two axes before the conv block
+        x = x.permute(0,2,1) # NTC -> NCT
+        return x
+
+    def postprocess(self, x):  # Swap the last two axes back after the conv block
+        x = x.permute(0,2,1) # NCT -> NTC
+        return x
+
+    def forward(self, x, x_cond=None):  # x: (N, *x_shape) codes; optional x_cond: (N, *x_shape, width)
+        N = x.shape[0]
+        assert_shape(x, (N, *self.x_shape))
+        if x_cond is not None:
+            assert_shape(x_cond, (N, *self.x_shape, self.width))
+        else:
+            x_cond = 0.0  # Scalar zero so the addition below leaves x unchanged
+        # Embed x
+        x = x.long()
+        x = self.x_emb(x)
+        assert_shape(x, (N, *self.x_shape, self.width))
+        x = x + x_cond
+
+        # Run conditioner: permute, conv block, permute back, layer norm
+        x = self.preprocess(x)
+        x = self.cond(x)
+        x = self.postprocess(x)
+        x = self.ln(x)
+        return x
+
+def flip(x):  # Swap axes 1 and 2 of a 3-D tensor; lists/tuples are handled recursively
+    def _flip(x):
+        return x.permute(0,2,1).contiguous()  # contiguous() materialises the permuted layout
+    if isinstance(x, (list, tuple)):
+        return [flip(z) for z in x]  # Always returns a list, even when given a tuple
+    return _flip(x)
+
+class SimpleEmbedding(nn.Module):  # Embedding lookup with input validation
+    def __init__(self, bins, out_width, init_scale):
+        super().__init__()
+        self.bins = bins
+        self.emb = nn.Embedding(bins, out_width)
+        nn.init.normal_(self.emb.weight, std=0.01 * init_scale)  # Normal init with std 0.01 * init_scale
+
+    def forward(self, y):  # y: 2-D CUDA int64 tensor with values in [0, bins)
+        assert len(y.shape) == 2, f"Expected shape with 2 dims, got {y.shape}"
+        assert isinstance(y, t.cuda.LongTensor), f"Expected dtype {t.cuda.LongTensor}, got {y.dtype}"
+        assert (0 <= y).all() and (y < self.bins).all(), f"Bins {self.bins}, got label {y}"
+        return self.emb(y)
+
+class RangeEmbedding(nn.Module):  # Embeds continuous positions by binning them into `bins` buckets
+    # Interpolating
+    # Interpolate so that [pos_start, pos_end] <-> position tensor of length n_ctx
+    #
+    # Binning
+    # For each pos in position tensor, find its bin
+    # [start,end) mapped to [0,1,...,bins-1]
+    # [start,end) -> [0,1) -> [0, bins) -> floor -> [0,...,bins-1]
+    # NOTE: Open ended interval on right, so start <= pos < end, not <= end
+    def __init__(self, n_time, bins, range, out_width, init_scale, clamp=False):  # NOTE: parameter `range` shadows the builtin inside this method
+        super().__init__()
+        self.n_time = n_time
+        self.bins = bins
+        self.emb = nn.Embedding(bins, out_width)
+        nn.init.normal_(self.emb.weight, std=0.01 * init_scale)
+        self.pos_min, self.pos_max = range
+        self.clamp = clamp  # When set, forward() clamps pos_end (only) into [pos_min, pos_max]
+
+    def forward(self, pos_start, pos_end=None):  # pos_end is required whenever n_time != 1
+        # Check if [pos_start,pos_end] in [pos_min, pos_max)
+        assert len(pos_start.shape) == 2, f"Expected shape with 2 dims, got {pos_start.shape}"
+        assert (self.pos_min <= pos_start).all() and (pos_start < self.pos_max).all(), f"Range is [{self.pos_min},{self.pos_max}), got {pos_start}"
+        pos_start = pos_start.float()
+        if pos_end is not None:
+            assert len(pos_end.shape) == 2, f"Expected shape with 2 dims, got {pos_end.shape}"
+            if self.clamp:
+                pos_end = pos_end.clamp(self.pos_min, self.pos_max)
+            assert (self.pos_min <= pos_end).all() and (pos_end <= self.pos_max).all(), f"Range is [{self.pos_min},{self.pos_max}), got {pos_end}"
+            pos_end = pos_end.float()
+        # Interpolate so that [pos_start, ..., pos_end] <-> position tensor of length n_ctx
+        n_time = self.n_time
+        if n_time != 1:
+            assert pos_end is not None
+            interpolation  = (t.arange(0, n_time, dtype=t.float, device='cuda').view(1,n_time)/n_time)  # Fractions k / n_time for k = 0..n_time-1
+            position = pos_start + (pos_end - pos_start)*interpolation  # Starts at pos_start; the last fraction is (n_time-1)/n_time, so pos_end itself is not included
+        else:
+            position = pos_start  # Single time step: pos_end is not used for the position
+
+        # Bin each value to bins
+        normalised_position = (position - self.pos_min) / (self.pos_max - self.pos_min) # [0,1)
+        bins = (self.bins * normalised_position).floor().long().detach() # [0,1) -> [0,1..,bins) -> [0,1...,bins-1]
+        return self.emb(bins)
+
+class LabelConditioner(nn.Module):  # Turns a label row (timing, artist, genres) into a start embedding and an optional positional embedding
+    def __init__(self, y_bins, t_bins, sr, min_duration, max_duration, n_time, out_width, init_scale, max_bow_genre_size, include_time_signal):
+        super().__init__()
+        self.n_time = n_time
+        self.out_width = out_width
+        assert len(y_bins) == 2, f"Expecting (genre, artist) bins, got {y_bins}"
+        bow_genre_bins, artist_bins = y_bins
+        self.max_bow_genre_size = max_bow_genre_size  # Number of genre columns expected in each label row
+        self.bow_genre_emb = SimpleEmbedding(bow_genre_bins, out_width, init_scale)
+        self.artist_emb = SimpleEmbedding(artist_bins, out_width, init_scale)
+        self.include_time_signal = include_time_signal
+        if self.include_time_signal:
+            t_ranges = ((min_duration * sr, max_duration * sr),  # Total length
+                        (0.0, max_duration * sr),                # Absolute pos
+                        (0.0, 1.0))                              # Relative pos
+            assert len(t_ranges) == 3, f"Expecting (total, absolute, relative) ranges, got {t_ranges}"
+            total_length_range, absolute_pos_range, relative_pos_range = t_ranges
+            self.total_length_emb = RangeEmbedding(1, t_bins, total_length_range, out_width, init_scale)  # n_time=1: a single embedding per sample
+            self.absolute_pos_emb = RangeEmbedding(n_time, t_bins, absolute_pos_range, out_width, init_scale)
+            self.relative_pos_emb = RangeEmbedding(n_time, t_bins, relative_pos_range, out_width, init_scale, clamp=True)
+
+    def forward(self, y):  # Returns (start_emb, pos_emb); pos_emb is None without the time signal
+        assert len(y.shape) == 2, f"Expected shape with 2 dims, got {y.shape}"
+        assert y.shape[-1] == 4 + self.max_bow_genre_size, f"Expected shape (N,{4 + self.max_bow_genre_size}), got {y.shape}"
+        assert isinstance(y, t.cuda.LongTensor), f"Expected dtype {t.cuda.LongTensor}, got {y.dtype}"
+        N = y.shape[0]
+        total_length, offset, length, artist, genre = y[:,0:1], y[:,1:2], y[:,2:3], y[:,3:4], y[:,4:]  # Column layout: 4 scalar fields, then the genre slots
+
+        # Start embedding of length 1
+        artist_emb = self.artist_emb(artist)
+        # Empty genre slots are denoted by -1. We mask these out.
+        mask = (genre >= 0).float().unsqueeze(2)
+        genre_emb = (self.bow_genre_emb(genre.clamp(0)) * mask).sum(dim=1, keepdim=True)  # clamp(0) makes -1 a valid index; the mask then zeroes those rows before summing
+        start_emb = genre_emb + artist_emb
+        assert_shape(start_emb, (N, 1, self.out_width))
+
+        # Pos embedding of length n_ctx
+        if self.include_time_signal:
+            start, end = offset, offset + length
+            total_length, start, end = total_length.float(), start.float(), end.float()  # Float so the divisions below are true divisions
+            pos_emb = self.total_length_emb(total_length) + self.absolute_pos_emb(start, end) + self.relative_pos_emb(start/total_length, end/total_length)
+            assert_shape(pos_emb, (N, self.n_time, self.out_width))
+        else:
+            pos_emb = None
+        return start_emb, pos_emb
\ No newline at end of file
diff --git a/jukebox/prior/prior.py b/jukebox/prior/prior.py
new file mode 100644
index 0000000000000000000000000000000000000000..3490d73413dac5cb8dbb92bcd9ac90de88ff9e78
--- /dev/null
+++ b/jukebox/prior/prior.py
@@ -0,0 +1,354 @@
+import numpy as np
+import torch as t
+import torch.nn as nn
+import jukebox.utils.dist_adapter as dist
+
+from jukebox.transformer.ops import LayerNorm
+from jukebox.prior.autoregressive import ConditionalAutoregressive2D
+from jukebox.prior.conditioners import Conditioner, LabelConditioner
+from jukebox.data.labels import EmptyLabeller, Labeller
+
+from jukebox.utils.torch_utils import assert_shape
+from jukebox.utils.dist_utils import print_once
+from jukebox.vqvae.vqvae import calculate_strides
+
+
+"""
+Model the prior on vq codes conditioned on timing, artist, genre, lyrics and codes from levels above. 
+To condition on the timing, genre and artist, we use the LabelConditioner class
+To condition on the codes from the level above, we use the Conditioner class
+To condition on lyrics, we allow two types of priors:
+- Separate Encoder Decoder: This is the usual encoder-decoder style transformer. The encoder transformer autoregressively 
+models the lyrics, and we use its last layer to produce keys/values that are attended to by the decoder transformer
+- Single Encoder Decoder: This is a simplification where we combine them into a single model. We merge the text vocab 
+and VQ vocab into a single large vocab, and the lyric tokens and VQ tokens into a single longer sequence of tokens which 
+we autoregressively model together.
+"""
+class SimplePrior(nn.Module):
+    def __init__(self, z_shapes, l_bins, encoder, decoder, level,
+                 downs_t, strides_t, labels, prior_kwargs, x_cond_kwargs, y_cond_kwargs,
+                 prime_kwargs, copy_input, labels_v3=False,
+                 merged_decoder=False, single_enc_dec=False):  # Build the prior for one level, plus its conditioners and labeller
+        super().__init__()
+
+        self.use_tokens = prime_kwargs.pop('use_tokens')  # pop() mutates the caller's dict; the remaining keys are forwarded as **prime_kwargs below
+        self.n_tokens = prime_kwargs.pop('n_tokens')
+        self.prime_loss_fraction = prime_kwargs.pop('prime_loss_fraction')
+
+        self.copy_input = copy_input
+        if self.copy_input:
+            prime_kwargs['bins'] = l_bins  # With copy_input the prime vocabulary size is set to l_bins
+
+        self.z_shapes = z_shapes
+        self.levels = len(self.z_shapes)
+
+        self.z_shape = self.z_shapes[level]
+
+        self.level = level
+        assert level < self.levels, f"Total levels {self.levels}, got level {level}"  # NOTE(review): z_shapes[level] above already raises IndexError for level >= levels
+
+        self.l_bins = l_bins
+
+        # Passing functions instead of the vqvae module to avoid getting params
+        self.encoder = encoder
+        self.decoder = decoder
+
+        # X conditioning
+        self.x_cond = (level != (self.levels - 1))  # Every level except the last index conditions on level + 1
+        self.cond_level = level + 1
+
+        # Y conditioning
+        self.y_cond = labels
+
+        self.single_enc_dec = single_enc_dec
+        # X conditioning
+        if self.x_cond:
+            self.conditioner_blocks = nn.ModuleList()
+            conditioner_block = lambda _level: Conditioner(input_shape=z_shapes[_level],
+                                                          bins=l_bins,
+                                                          down_t=downs_t[_level],
+                                                          stride_t=strides_t[_level],
+                                                          **x_cond_kwargs)
+            if dist.get_rank() == 0: print(f"Conditioning on 1 above level(s)")
+            self.conditioner_blocks.append(conditioner_block(self.cond_level))  # Exactly one block, built for the level directly above
+
+        # Y conditioning
+        if self.y_cond:
+            self.n_time = self.z_shape[0] # Assuming STFT=TF order and raw=T1 order, so T is first dim
+            self.y_emb = LabelConditioner(n_time=self.n_time,include_time_signal=not self.x_cond,**y_cond_kwargs)  # Time signal is included only when there is no x conditioning
+
+        # Lyric conditioning
+        if single_enc_dec:
+            # Single encoder-decoder transformer
+            self.prior_shapes = [(self.n_tokens,), prior_kwargs.pop('input_shape')]  # [prime shape, code shape]; pop() mutates the caller's prior_kwargs
+            self.prior_bins = [prime_kwargs['bins'], prior_kwargs.pop('bins')]
+            self.prior_dims = [np.prod(shape) for shape in self.prior_shapes]
+            self.prior_bins_shift = np.cumsum([0, *self.prior_bins])[:-1]  # Offset of each vocabulary inside the merged vocabulary: [0, prime bins]
+            self.prior_width = prior_kwargs['width']
+            print_once(f'Creating cond. autoregress with prior bins {self.prior_bins}, ')
+            print_once(f'dims {self.prior_dims}, ')
+            print_once(f'shift {self.prior_bins_shift}')
+            print_once(f'input shape {sum(self.prior_dims)}')
+            print_once(f'input bins {sum(self.prior_bins)}')
+            print_once(f'Self copy is {self.copy_input}')
+
+            self.prime_loss_dims, self.gen_loss_dims = self.prior_dims[0], self.prior_dims[1]
+            self.total_loss_dims = self.prime_loss_dims + self.gen_loss_dims
+            self.prior = ConditionalAutoregressive2D(input_shape=(sum(self.prior_dims),),
+                                                     bins=sum(self.prior_bins),
+                                                     x_cond=(self.x_cond or self.y_cond), y_cond=True,
+                                                     prime_len=self.prime_loss_dims,
+                                                     **prior_kwargs)
+
+        else:
+            # Separate encoder-decoder transformer
+            if self.n_tokens != 0 and self.use_tokens:
+                from jukebox.transformer.ops import Conv1D
+                prime_input_shape = (self.n_tokens,)
+                self.prime_loss_dims = np.prod(prime_input_shape)
+                self.prime_acts_width, self.prime_state_width = prime_kwargs['width'], prior_kwargs['width']
+                self.prime_prior = ConditionalAutoregressive2D(input_shape=prime_input_shape, x_cond=False, y_cond=False,
+                                                               only_encode=True,
+                                                               **prime_kwargs)
+                self.prime_state_proj = Conv1D(self.prime_acts_width, self.prime_state_width, init_scale=prime_kwargs['init_scale'])  # Projects from the prime width to the prior width
+                self.prime_state_ln = LayerNorm(self.prime_state_width)
+                self.prime_bins = prime_kwargs['bins']
+                self.prime_x_out = nn.Linear(self.prime_state_width, self.prime_bins, bias=False)  # Used by get_prime_loss to produce logits over the prime bins
+                nn.init.normal_(self.prime_x_out.weight, std=0.02 * prior_kwargs['init_scale'])
+            else:
+                self.prime_loss_dims = 0  # No prime encoder is built in this case
+            self.gen_loss_dims = np.prod(self.z_shape)
+            self.total_loss_dims = self.prime_loss_dims + self.gen_loss_dims
+            self.prior = ConditionalAutoregressive2D(x_cond=(self.x_cond or self.y_cond), y_cond=self.y_cond,
+                                                     encoder_dims = self.prime_loss_dims, merged_decoder=merged_decoder,
+                                                     **prior_kwargs)
+
+        self.n_ctx = self.gen_loss_dims
+        self.downsamples = calculate_strides(strides_t, downs_t)
+        self.cond_downsample = self.downsamples[level+1] if level != self.levels - 1 else None  # None when there is no level above to condition on
+        self.raw_to_tokens = np.prod(self.downsamples[:level+1])  # Product of the downsample factors up to and including this level
+        self.sample_length = self.n_ctx*self.raw_to_tokens
+        if labels:
+            self.labels_v3 = labels_v3
+            self.labeller = Labeller(self.y_emb.max_bow_genre_size, self.n_tokens, self.sample_length, v3=self.labels_v3)
+        else:
+            self.labeller = EmptyLabeller()
+
+        print(f"Level:{level}, Cond downsample:{self.cond_downsample}, Raw to tokens:{self.raw_to_tokens}, Sample length:{self.sample_length}")
+
+
+    def get_y(self, labels, start, get_indices=False):  # Copy labels['y'] and adapt it to this level; None when no labeller is configured
+        if isinstance(self.labeller, EmptyLabeller):
+            return None
+        y = labels['y'].clone()  # Clone so the caller's tensor is not modified
+
+        # Set sample_length to match this level
+        y[:, 2] = int(self.sample_length)
+
+        # Set offset: shift column 1 by start * raw_to_tokens
+        y[:, 1:2] = y[:, 1:2] + int(start * self.raw_to_tokens)
+
+        # Set lyric tokens
+        indices = self.labeller.set_y_lyric_tokens(y, labels)
+        if get_indices:
+            return y, indices
+        else:
+            return y
+
+    def get_z_conds(self, zs, start, end):  # Slice the codes of the level above that correspond to [start, end) at this level
+        if self.level != self.levels - 1:
+            assert start % self.cond_downsample == end % self.cond_downsample == 0  # Both bounds must be multiples of cond_downsample
+            z_cond = zs[self.level + 1][:,start//self.cond_downsample:end//self.cond_downsample]
+            assert z_cond.shape[1] == self.n_ctx//self.cond_downsample
+            z_conds = [z_cond]  # Single-element list: only the level directly above is used
+        else:
+            z_conds = None  # Last level: nothing to condition on
+        return z_conds
+
+    def prior_preprocess(self, xs, conds):  # Merge token tensors into one shifted sequence and concatenate their conditionings; mutates xs and conds in place
+        N = xs[0].shape[0]
+        for i in range(len(xs)):
+            x, shape, dims = xs[i], self.prior_shapes[i], self.prior_dims[i]
+            bins, bins_shift = int(self.prior_bins[i]), int(self.prior_bins_shift[i])
+            assert isinstance(x, t.cuda.LongTensor), x
+            assert (0 <= x).all() and (x < bins).all()
+            #assert_shape(x, (N, *shape))
+            xs[i] = (xs[i] + bins_shift).view(N, -1)  # Shift ids by this vocabulary's offset, then flatten per sample
+
+        for i in range(len(conds)):
+            cond, shape, dims = conds[i], self.prior_shapes[i], self.prior_dims[i]
+            if cond is not None:
+                assert_shape(cond, (N, dims, self.prior_width))
+            else:
+                conds[i] = t.zeros((N, dims, self.prior_width), dtype=t.float, device='cuda')  # A missing conditioning becomes zeros of the expected shape
+
+        return t.cat(xs, dim=1), t.cat(conds, dim=1)
+
+    def prior_postprocess(self, z):  # Split a merged sequence back into its parts, undo the bin shift, and return the last part
+        N = z.shape[0]
+        dims = (self.prior_dims[0], z.shape[1] - self.prior_dims[0])  # First part has the fixed prime length; the rest is whatever remains
+        # xs = list(t.split(z, self.prior_dims, dim=1))
+        xs = list(t.split(z, dims, dim=1))
+
+        for i in range(len(xs)):
+            # x, shape, dims, bins, bins_shift = xs[i], self.prior_shapes[i], self.prior_dims[i], self.prior_bins[i], self.prior_bins_shift[i]
+            # assert_shape(x, (N, dims))
+            shape = self.prior_shapes[i]
+            bins, bins_shift = int(self.prior_bins[i]), int(self.prior_bins_shift[i])
+            # xs[i] = (xs[i] - bins_shift).view(N, *shape) #view(N, -1, *shape[1:])
+            xs[i] = (xs[i] - bins_shift).view(N, -1, *shape[1:])
+            xs[i] = t.clamp(xs[i], min=0)  # If not masking loss, model may have generated lyric/midi tokens which are now shifted <0 by bin_shift
+            assert (xs[i] < bins).all(), f'rank: {dist.get_rank()}, bins: {bins}, dims {dims}, shape {shape}, prior_shape {self.prior_shapes}, bins_shift {bins_shift}, xs[i]: {xs[i]}'
+
+        return xs[-1]  # Only the final part is returned
+
+    def x_emb(self, z_conds):  # Run the conditioner blocks over z_conds and return the resulting x_cond
+        z_conds = z_conds[:self.cond_level - self.level]  # Keep only as many entries as there are conditioning levels
+        assert len(z_conds) == len(self.conditioner_blocks) == self.cond_level - self.level, f"Expected {len(z_conds)} == {len(self.conditioner_blocks)} == {self.cond_level} - {self.level}"
+        x_cond = None
+        for z_cond, conditioner_block in reversed(list(zip(z_conds, self.conditioner_blocks))):  # Reverse order; each block receives the previous block's output
+            x_cond = conditioner_block(z_cond, x_cond)
+        return x_cond
+
+    def encode(self, x, start_level=None, end_level=None, bs_chunks=1):  # Call the stored encoder function under no_grad
+        if start_level == None:
+            start_level = self.level  # Default: start at this prior's level
+        if end_level == None:
+            end_level = self.levels  # Default: up to the total number of levels
+        # Get latents
+        with t.no_grad():
+            zs = self.encoder(x, start_level=start_level, end_level=end_level, bs_chunks=bs_chunks)
+        return zs
+
+    def decode(self, zs, start_level=None, end_level=None, bs_chunks=1):  # Call the stored decoder function under no_grad
+        if start_level == None:
+            start_level = self.level  # Default: start at this prior's level
+        if end_level == None:
+            end_level = self.levels  # Default: up to the total number of levels
+
+        assert len(zs) == end_level - start_level  # One entry in zs per level in [start_level, end_level)
+        with t.no_grad():
+            x_out = self.decoder(zs, start_level=start_level, end_level=end_level, bs_chunks=bs_chunks)
+        return x_out
+
+    def get_cond(self, z_conds, y):  # Returns (x_cond, y_cond, prime) derived from z_conds and the label tensor y
+        if y is not None:
+            assert y.shape[1] == 4 + self.y_emb.max_bow_genre_size + self.n_tokens, f"Expected {4} + {self.y_emb.max_bow_genre_size} + {self.n_tokens}, got {y.shape[1]}"
+            n_labels = y.shape[1] - self.n_tokens
+            y, prime = y[:,:n_labels], y[:,n_labels:]  # The last n_tokens columns are the prime tokens; the rest are labels
+        else:
+            y, prime = None, None
+        y_cond, y_pos = self.y_emb(y) if self.y_cond else (None, None)
+        x_cond = self.x_emb(z_conds) if self.x_cond else y_pos  # Without x conditioning, the label positional embedding is used as x_cond
+        return x_cond, y_cond, prime
+
+    def sample(self, n_samples, z=None, z_conds=None, y=None, fp16=False, temp=1.0, top_k=0, top_p=0.0,
+               chunk_size=None, sample_tokens=None):  # Sample codes for this level; primed when z has past context, ancestral otherwise
+        N = n_samples
+        if z is not None: assert z.shape[0] == N, f"Expected shape ({N},**), got shape {z.shape}"
+        if y is not None: assert y.shape[0] == N, f"Expected shape ({N},**), got shape {y.shape}"
+        if z_conds is not None:
+            for z_cond in z_conds:
+                assert z_cond.shape[0] == N,  f"Expected shape ({N},**), got shape {z_cond.shape}"
+
+        no_past_context = (z is None or z.shape[1] == 0)  # An empty z counts as no context
+        if dist.get_rank() == 0:
+            name = {True: 'Ancestral', False: 'Primed'}[no_past_context]
+            print(f"{name} sampling {n_samples} samples with temp={temp}, top_k={top_k}, top_p={top_p}")
+
+        with t.no_grad():
+            # Currently x_cond only uses immediately above layer
+            x_cond, y_cond, prime = self.get_cond(z_conds, y)
+            if self.single_enc_dec:
+                # assert chunk_size % self.prime_loss_dims == 0. TODO: Check if needed
+                if no_past_context:
+                    z, x_cond = self.prior_preprocess([prime], [None, x_cond])  # The prime tokens alone serve as the context
+                else:
+                    z, x_cond = self.prior_preprocess([prime, z], [None, x_cond])
+                if sample_tokens is not None:
+                    sample_tokens += self.n_tokens  # The merged sequence also contains the n_tokens prime tokens
+                z = self.prior.primed_sample(n_samples, z, x_cond, y_cond, fp16=fp16, temp=temp,
+                                             top_k=top_k, top_p=top_p, chunk_size=chunk_size, sample_tokens=sample_tokens)
+                z = self.prior_postprocess(z)
+            else:
+                encoder_kv = self.get_encoder_kv(prime, fp16=fp16, sample=True)
+                if no_past_context:
+                    z = self.prior.sample(n_samples, x_cond, y_cond, encoder_kv, fp16=fp16, temp=temp, top_k=top_k,
+                                          top_p=top_p, sample_tokens=sample_tokens)
+                else:
+                    z = self.prior.primed_sample(n_samples, z, x_cond, y_cond, encoder_kv, fp16=fp16, temp=temp,
+                                             top_k=top_k, top_p=top_p, chunk_size=chunk_size, sample_tokens=sample_tokens)
+            if sample_tokens is None:
+                assert_shape(z, (N, *self.z_shape))  # The full-length shape is only checked when no custom length was requested
+        return z
+
+    def get_encoder_kv(self, prime, fp16=False, sample=False):
+        """Turn the prime tokens into the encoder key/value states passed to the prior; None when tokens are unused."""
+        if self.n_tokens == 0 or not self.use_tokens:
+            return None
+        if sample:
+            # While sampling, prime_prior is moved to the GPU here and back to the CPU once its output is computed.
+            self.prime_prior.cuda()
+        batch = prime.shape[0]
+        acts = self.prime_prior(prime, None, None, None, fp16=fp16)
+        assert_shape(acts, (batch, self.prime_loss_dims, self.prime_acts_width))
+        assert acts.dtype == t.float, f'Expected t.float, got {acts.dtype}'
+        kv = self.prime_state_ln(self.prime_state_proj(acts))
+        assert kv.dtype == t.float, f'Expected t.float, got {kv.dtype}'
+        if sample:
+            self.prime_prior.cpu()
+            kv = kv.half() if fp16 else kv
+        return kv
+
+    def get_prime_loss(self, encoder_kv, prime_t):
+        # Cross-entropy of the projected encoder states against prime_t, divided by log(2); a zero tensor when tokens are unused.
+        if not self.use_tokens:
+            return t.tensor(0.0, device='cuda')
+        logits = self.prime_x_out(encoder_kv.float())
+        flat_logits, flat_targets = logits.view(-1, self.prime_bins), prime_t.view(-1)
+        prime_loss = nn.functional.cross_entropy(flat_logits, flat_targets) / np.log(2.)
+        return prime_loss
+
+    def z_forward(self, z, z_conds=[], y=None, fp16=False, get_preds=False, get_attn_weights=False):  # NOTE(review): z_conds=[] is a shared mutable default; this method does not mutate it
+        """
+        Compute the prior's loss on codes `z`, conditioned on `z_conds` and labels `y`.
+        Returns (loss, metrics); metrics holds detached bpd (= gen_loss), prime_loss, gen_loss and, if get_preds, "preds".
+        get_attn_weights (bool or set): makes forward prop dump self-attention softmaxes to
+            self.prior.transformer.ws. Either a set of layer indices indicating which layers to store, or a
+            boolean value indicating whether to dump all. When truthy, ws is returned INSTEAD of (loss, metrics).
+        """
+        assert isinstance(get_attn_weights, (bool, set))
+        if get_attn_weights:
+            self.prior.transformer.set_record_attn(get_attn_weights)
+        x_cond, y_cond, prime = self.get_cond(z_conds, y)
+        if self.copy_input:
+            prime = z[:,:self.n_tokens]  # take the prime from the first n_tokens positions of z instead
+        if self.single_enc_dec:
+            z, x_cond = self.prior_preprocess([prime, z], [None, x_cond])
+            (prime_loss, gen_loss), preds = self.prior(z, x_cond, y_cond, fp16=fp16, get_sep_loss=True, get_preds=get_preds)
+        else:
+            encoder_kv = self.get_encoder_kv(prime, fp16=fp16)
+            prime_loss = self.get_prime_loss(encoder_kv, prime)
+            gen_loss, preds = self.prior(z, x_cond, y_cond, encoder_kv, fp16=fp16, get_preds=get_preds)
+        loss = (self.prime_loss_fraction*prime_loss*self.prime_loss_dims/self.total_loss_dims) + \
+                   (gen_loss*self.gen_loss_dims/self.total_loss_dims)  # each loss is weighted by its share of total_loss_dims
+        metrics=dict(bpd=gen_loss.clone().detach(), prime_loss=prime_loss.clone().detach(),
+                     gen_loss=gen_loss.clone().detach())
+        if get_preds:
+            metrics["preds"] = preds.clone().detach()
+        if get_attn_weights:
+            ws = self.prior.transformer.ws
+            self.prior.transformer.set_record_attn(False)  # switch recording back off before returning
+            return ws
+        else:
+            return loss, metrics
+
+    def forward(self, x, y=None, fp16=False, decode=False, get_preds=False):
+        """
+        Encode `x`, score the resulting codes with z_forward, and optionally decode them again.
+        Returns (x_out, loss, metrics); x_out is None unless `decode` is set.
+        """
+        z, *z_conds = self.encode(x, bs_chunks=x.shape[0])
+        loss, metrics = self.z_forward(z=z, z_conds=z_conds, y=y, fp16=fp16, get_preds=get_preds)
+        x_out = self.decode([z, *z_conds]) if decode else None
+        return x_out, loss, metrics
diff --git a/jukebox/sample.py b/jukebox/sample.py
new file mode 100644
index 0000000000000000000000000000000000000000..c5baee3a364d0fbb35dfe9adddf9841386a3e220
--- /dev/null
+++ b/jukebox/sample.py
@@ -0,0 +1,279 @@
+import os
+import torch as t
+import jukebox.utils.dist_adapter as dist
+
+from jukebox.hparams import Hyperparams
+from jukebox.data.labels import EmptyLabeller
+from jukebox.utils.torch_utils import empty_cache
+from jukebox.utils.audio_utils import save_wav, load_audio
+from jukebox.make_models import make_model
+from jukebox.align import get_alignment
+from jukebox.save_html import save_html
+from jukebox.utils.sample_utils import split_batch, get_starts
+from jukebox.utils.dist_utils import print_once
+import fire
+
+# Sample a partial window of length<n_ctx with tokens_to_sample new tokens on level=level
+def sample_partial_window(zs, labels, sampling_kwargs, level, prior, tokens_to_sample, hps):
+    """Choose the window start and token budget, record the budget in sampling_kwargs['sample_tokens'], then sample."""
+    ctx_len = prior.n_ctx
+    have = zs[level].shape[1]
+    # When existing + new tokens stay below one context, sample a short window from position 0;
+    # otherwise use a full context placed so that it ends tokens_to_sample past the current end.
+    if have < ctx_len - tokens_to_sample:
+        start, budget = 0, have + tokens_to_sample
+    else:
+        start, budget = have - ctx_len + tokens_to_sample, ctx_len
+    sampling_kwargs['sample_tokens'] = budget
+    return sample_single_window(zs, labels, sampling_kwargs, level, prior, start, hps)
+
+# Sample a single window of length=n_ctx at position=start on level=level
+def sample_single_window(zs, labels, sampling_kwargs, level, prior, start, hps):
+    """
+    Extend zs[level] with the tokens sampled for the window beginning at `start`.
+    The window covers [start, start + prior.n_ctx); codes already present inside it are
+    used as conditioning, and sampling_kwargs['sample_tokens'], when present, sets its length.
+    Sampling runs in sub-batches of at most sampling_kwargs['max_batch_size'] items.
+    `sampling_kwargs` itself is never modified here, so it stays intact even if sampling raises.
+    Returns `zs` (the same list) with the new tokens appended along dim 1 of zs[level].
+    """
+    n_samples = hps.n_samples
+    end = start + prior.n_ctx
+
+    # get z already sampled at current level
+    z = zs[level][:,start:end]
+
+    # Support sampling a window shorter than n_ctx
+    sample_tokens = sampling_kwargs.get('sample_tokens', end - start)
+    conditioning_tokens, new_tokens = z.shape[1], sample_tokens - z.shape[1]
+
+    print_once(f"Sampling {sample_tokens} tokens for [{start},{start+sample_tokens}]. Conditioning on {conditioning_tokens} tokens")
+
+    if new_tokens <= 0:
+        # Nothing new to sample
+        return zs
+
+    # get z_conds from level above
+    z_conds = prior.get_z_conds(zs, start, end)
+
+    # set y offset, sample_length and lyrics tokens
+    y = prior.get_y(labels, start)
+
+    empty_cache()
+
+    # max_batch_size only drives the chunking below; it is filtered out of what is forwarded to prior.sample().
+    max_batch_size = sampling_kwargs['max_batch_size']
+    prior_kwargs = {k: v for k, v in sampling_kwargs.items() if k != 'max_batch_size'}
+
+    z_list = split_batch(z, n_samples, max_batch_size)
+    z_conds_list = split_batch(z_conds, n_samples, max_batch_size)
+    y_list = split_batch(y, n_samples, max_batch_size)
+    z_samples = [prior.sample(n_samples=z_i.shape[0], z=z_i, z_conds=z_conds_i, y=y_i, **prior_kwargs)
+                 for z_i, z_conds_i, y_i in zip(z_list, z_conds_list, y_list)]
+    z = t.cat(z_samples, dim=0)
+
+    # Update z with new sample
+    z_new = z[:,-new_tokens:]
+    zs[level] = t.cat([zs[level], z_new], dim=1)
+    return zs
+
+# Sample total_length tokens at level=level with hop_length=hop_length
+def sample_level(zs, labels, sampling_kwargs, level, prior, total_length, hop_length, hps):
+    print_once(f"Sampling level {level}")
+    if total_length < prior.n_ctx:
+        # Shorter than one context: a single partial window covers everything.
+        return sample_partial_window(zs, labels, sampling_kwargs, level, prior, total_length, hps)
+    for window_start in get_starts(total_length, prior.n_ctx, hop_length):
+        zs = sample_single_window(zs, labels, sampling_kwargs, level, prior, window_start, hps)
+    return zs
+
+# Sample multiple levels
+def _sample(zs, labels, sampling_kwargs, priors, sample_levels, hps):
+    """Sample `sample_levels` from the highest index down, saving codes, audio and an HTML page per level; returns zs."""
+    alignments = None
+    for level in reversed(sample_levels):
+        prior = priors[level]
+        prior.cuda()
+        empty_cache()
+
+        # Set correct total_length, hop_length, labels and sampling_kwargs for level
+        assert hps.sample_length % prior.raw_to_tokens == 0, f"Expected sample_length {hps.sample_length} to be multiple of {prior.raw_to_tokens}"
+        total_length = hps.sample_length//prior.raw_to_tokens
+        hop_length = int(hps.hop_fraction[level]*prior.n_ctx)
+        zs = sample_level(zs, labels[level], sampling_kwargs[level], level, prior, total_length, hop_length, hps)
+
+        prior.cpu()
+        empty_cache()
+
+        # Decode sample
+        x = prior.decode(zs[level:], start_level=level, bs_chunks=zs[level].shape[0])
+
+        if dist.get_world_size() > 1:
+            logdir = f"{hps.name}_rank_{dist.get_rank()}/level_{level}"
+        else:
+            logdir = f"{hps.name}/level_{level}"
+        os.makedirs(logdir, exist_ok=True)  # exist_ok avoids the check-then-create race of exists() + makedirs()
+        t.save(dict(zs=zs, labels=labels, sampling_kwargs=sampling_kwargs, x=x), f"{logdir}/data.pth.tar")
+        save_wav(logdir, x, hps.sr)
+        if alignments is None and priors[-1] is not None and priors[-1].n_tokens > 0 and not isinstance(priors[-1].labeller, EmptyLabeller):
+            alignments = get_alignment(x, zs, labels[-1], priors[-1], sampling_kwargs[-1]['fp16'], hps)  # computed once, reused for the remaining levels
+        save_html(logdir, x, zs, labels[-1], alignments, hps)
+    return zs
+
+# Generate ancestral samples given a list of artists and genres
+def ancestral_sample(labels, sampling_kwargs, priors, hps):
+    n_levels = len(priors)
+    # Every level starts from an empty (n_samples, 0) code tensor, so all tokens are sampled from scratch.
+    empty_codes = [t.zeros(hps.n_samples,0,dtype=t.long, device='cuda') for _ in range(n_levels)]
+    return _sample(empty_codes, labels, sampling_kwargs, priors, list(range(n_levels)), hps)
+
+# Continue ancestral sampling from previously saved codes
+def continue_sample(zs, labels, sampling_kwargs, priors, hps):
+    # Every level is sampled again, each continuing from the codes already present in zs.
+    all_levels = list(range(len(priors)))
+    return _sample(zs, labels, sampling_kwargs, priors, all_levels, hps)
+
+# Upsample given already generated upper-level codes
+def upsample(zs, labels, sampling_kwargs, priors, hps):
+    # The last prior's level is left out of the sampling: its codes in zs are used as given.
+    lower_levels = list(range(len(priors) - 1))
+    return _sample(zs, labels, sampling_kwargs, priors, lower_levels, hps)
+
+# Prompt the model with raw audio input (dimension: NTC) and generate continuations
+def primed_sample(x, labels, sampling_kwargs, priors, hps):
+    n_levels = len(priors)
+    # Encode the prompt into codes for levels [0, n_levels) via priors[-1].encode, then continue from them.
+    prompt_codes = priors[-1].encode(x, start_level=0, end_level=n_levels, bs_chunks=x.shape[0])
+    return _sample(prompt_codes, labels, sampling_kwargs, priors, list(range(n_levels)), hps)
+
+# Load `duration` seconds of the given audio files to use as prompts
+def load_prompts(audio_files, duration, hps):
+    """Load the prompts, cycle them to exactly hps.n_samples rows and move them to the GPU; ValueError if none given."""
+    # Each loaded clip is transposed: CT -> TC
+    prompts = [load_audio(f, sr=hps.sr, duration=duration, offset=0.0, mono=True).T for f in audio_files]
+    if not prompts:
+        # The previous doubling loop (`xs.extend(xs)`) never terminated on an empty list.
+        raise ValueError("load_prompts needs at least one audio file")
+    # Repeat the prompts in their given order until there are n_samples of them (extra prompts are dropped).
+    xs = [prompts[i % len(prompts)] for i in range(hps.n_samples)]
+    x = t.stack([t.from_numpy(x) for x in xs])
+    x = x.to('cuda', non_blocking=True)
+    return x
+
+# Load codes from previous sampling run
+def load_codes(codes_file, duration, priors, hps):
+    """Return the saved per-level codes on the GPU, cut to `duration` (same unit as raw_to_tokens divides) unless it is None."""
+    # NOTE(review): t.load unpickles codes_file, so only point this at files you trust.
+    zs = [z.cuda() for z in t.load(codes_file, map_location='cpu')['zs']]
+    assert zs[-1].shape[0] == hps.n_samples, f"Expected bs = {hps.n_samples}, got {zs[-1].shape[0]}"
+    if duration is None:
+        return zs
+    # Cut off codes to match duration
+    top_raw_to_tokens = priors[-1].raw_to_tokens
+    assert duration % top_raw_to_tokens == 0, f"Cut-off duration {duration} not an exact multiple of top_raw_to_tokens"
+    assert duration//top_raw_to_tokens <= zs[-1].shape[1], f"Cut-off tokens {duration//priors[-1].raw_to_tokens} longer than tokens {zs[-1].shape[1]} in saved codes"
+    return [z[:,:duration//prior.raw_to_tokens] for z, prior in zip(zs, priors)]
+
+# Generate and save samples, alignment, and webpage for visualization. sample_hps.mode selects ancestral / continue / upsample / primed.
+def save_samples(model, device, hps, sample_hps):
+    print(hps)
+    from jukebox.lyricdict import poems, gpt_2_lyrics
+    vqvae, priors = make_model(model, device, hps)
+    # NOTE(review): the check below indexes priors[-2] in every mode, so it needs at least two priors.
+    assert hps.sample_length//priors[-2].raw_to_tokens >= priors[-2].n_ctx, f"Upsampling needs atleast one ctx in get_z_conds. Please choose a longer sample length"
+
+    total_length = hps.total_sample_length_in_seconds * hps.sr  # seconds * sample rate
+    offset = 0
+
+    # Set artist/genre/lyrics for your samples here!
+    # We used different label sets in our models, but you can write the human friendly names here and we'll map them under the hood for each model.
+    # For the 5b/5b_lyrics model and the upsamplers, labeller will look up artist and genres in v2 set. (after lowercasing, removing non-alphanumerics and collapsing whitespaces to _).
+    # For the 1b_lyrics top level, labeller will look up artist and genres in v3 set (after lowercasing).
+    metas = [dict(artist = "Alan Jackson",
+                  genre = "Country",
+                  lyrics = poems['ozymandias'],
+                  total_length=total_length,
+                  offset=offset,
+                  ),
+             dict(artist="Joe Bonamassa",
+                  genre="Blues Rock",
+                  lyrics=gpt_2_lyrics['hottub'],
+                  total_length=total_length,
+                  offset=offset,
+                  ),
+             dict(artist="Frank Sinatra",
+                  genre="Classic Pop",
+                  lyrics=gpt_2_lyrics['alone'],
+                  total_length=total_length,
+                  offset=offset,
+                  ),
+             dict(artist="Ella Fitzgerald",
+                  genre="Jazz",
+                  lyrics=gpt_2_lyrics['count'],
+                  total_length=total_length,
+                  offset=offset,
+                  ),
+             dict(artist="Céline Dion",
+                  genre="Pop",
+                  lyrics=gpt_2_lyrics['darkness'],
+                  total_length=total_length,
+                  offset=offset,
+                  ),
+             ]
+    while len(metas) < hps.n_samples:  # repeat the presets until there are at least n_samples of them
+        metas.extend(metas)
+    metas = metas[:hps.n_samples]
+    # One label batch per prior, all built from the same metas.
+    labels = [prior.labeller.get_batch_labels(metas, 'cuda') for prior in priors]
+    for label in labels:
+        assert label['y'].shape[0] == hps.n_samples
+    # Sampling settings, one dict per level (three are listed); only the last entry depends on the model name.
+    lower_level_chunk_size = 32
+    lower_level_max_batch_size = 16
+    if model == '1b_lyrics':
+        chunk_size = 32
+        max_batch_size = 16
+    else:
+        chunk_size = 16
+        max_batch_size = 3
+    sampling_kwargs = [dict(temp=0.99, fp16=True, chunk_size=lower_level_chunk_size, max_batch_size=lower_level_max_batch_size),
+                       dict(temp=0.99, fp16=True, chunk_size=lower_level_chunk_size, max_batch_size=lower_level_max_batch_size),
+                       dict(temp=0.99, fp16=True, chunk_size=chunk_size, max_batch_size=max_batch_size)]
+    # Dispatch on the requested sampling mode.
+    if sample_hps.mode == 'ancestral':
+        ancestral_sample(labels, sampling_kwargs, priors, hps)
+    elif sample_hps.mode in ['continue', 'upsample']:
+        assert sample_hps.codes_file is not None
+        top_raw_to_tokens = priors[-1].raw_to_tokens
+        if sample_hps.prompt_length_in_seconds is not None:
+            duration = (int(sample_hps.prompt_length_in_seconds * hps.sr) // top_raw_to_tokens) * top_raw_to_tokens  # rounded down to a multiple of top_raw_to_tokens
+        else:
+            duration = None  # keep the saved codes at their full length
+        zs = load_codes(sample_hps.codes_file, duration, priors, hps)
+        if sample_hps.mode == 'continue':
+            continue_sample(zs, labels, sampling_kwargs, priors, hps)
+        elif sample_hps.mode == 'upsample':
+            upsample(zs, labels, sampling_kwargs, priors, hps)
+    elif sample_hps.mode == 'primed':
+        assert sample_hps.audio_file is not None
+        assert sample_hps.prompt_length_in_seconds is not None
+        audio_files = sample_hps.audio_file.split(',')  # comma-separated list of prompt files
+        top_raw_to_tokens = priors[-1].raw_to_tokens
+        duration = (int(sample_hps.prompt_length_in_seconds * hps.sr) // top_raw_to_tokens) * top_raw_to_tokens
+        x = load_prompts(audio_files, duration, hps)
+        primed_sample(x, labels, sampling_kwargs, priors, hps)
+    else:
+        raise ValueError(f'Unknown sample mode {sample_hps.mode}.')
+
+
+def run(model, mode='ancestral', codes_file=None, audio_file=None, prompt_length_in_seconds=None, port=29500, **kwargs):
+    """Command-line entry point: set up distributed state, build both Hyperparams objects and sample without autograd."""
+    from jukebox.utils.dist_utils import setup_dist_from_mpi
+    _, _, device = setup_dist_from_mpi(port=port)
+    mode_settings = dict(mode=mode, codes_file=codes_file, audio_file=audio_file, prompt_length_in_seconds=prompt_length_in_seconds)
+    hps, sample_hps = Hyperparams(**kwargs), Hyperparams(mode_settings)
+    with t.no_grad():
+        save_samples(model, device, hps, sample_hps)
+
+if __name__ == '__main__':
+    fire.Fire(run)  # hand run() to fire, which fills its parameters from the command line
diff --git a/jukebox/save_html.py b/jukebox/save_html.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc0fcada6a7f3dd7d550dcca124cdb569eddd1ae
--- /dev/null
+++ b/jukebox/save_html.py
@@ -0,0 +1,130 @@
+import os
+import json
+import numpy as np
+from PIL import Image, ImageFilter
+import soundfile
+
+def save_html(logdir, x, zs, labels, alignments, hps):  # Write {logdir}/index.html plus one item_<i>/ page per batch item.
+    level = hps.levels - 1 # Top level used
+    z = zs[level]
+    bs, total_length = z.shape[0], z.shape[1]
+    # The main page only embeds one iframe per item; the item pages are written by _save_item_html.
+    with open(f'{logdir}/index.html', 'w') as html:
+        print(f"<html><head><title>{logdir}</title></head><body style='font-family: sans-serif; font-size: 1.4em; font-weight: bold; text-align: center; max-width:1024px; width: 100%; margin: auto;'>",
+            file=html)
+        print("<link rel='icon' href='data:;base64,iVBORw0KGgo='>", file=html)
+        # labels['info'][item] must provide 'full_tokens' here, and the keys _save_item_html reads from data['info'].
+        for item in range(bs):
+            data = dict(wav=x[item].cpu().numpy(), sr=hps.sr,
+                        info=labels['info'][item],
+                        total_length=total_length,
+                        total_tokens=len(labels['info'][item]['full_tokens']),
+                        alignment=alignments[item] if alignments is not None else None)
+            item_dir = f'{logdir}/item_{item}'
+            _save_item_html(item_dir, item, item, data)  # the batch index doubles as item id and item name
+            print(f"<iframe style='height: 100%; width: 100%;' frameborder='0' scrolling='no' src='item_{item}/index.html'></iframe>", file=html)
+        print("</body></html>", file=html)  
+
+def _save_item_html(item_dir, item_id, item_name, data):
+    """Write one sample's page into item_dir: index.html, audio.wav, lyrics.json and, if data['alignment'] is set,
+    align.png and align.json plus the script that colors the lyric characters during playback."""
+    from html import escape  # imports only `escape`; the name `html` below is the open file handle
+    # exist_ok replaces the racy exists()-then-makedirs() pair
+    os.makedirs(item_dir, exist_ok=True)
+
+    with open(f'{item_dir}/index.html', 'w') as html:
+        print(f"<html><head><title>{item_name}</title></head><body style='font-family: sans-serif; font-size: 1.4em; font-weight: bold; text-align: center; max-width:1024px; width: 100%; margin: auto;'>",
+            file=html)
+        print("<link rel='icon' href='data:;base64,iVBORw0KGgo='>", file=html)
+        total_length = data['total_length']
+        total_tokens = data['total_tokens']
+        alignment = data['alignment']
+        lyrics = data["info"]["lyrics"]
+        wav, sr = data['wav'], data['sr']
+        genre, artist = data["info"]["genre"], data["info"]["artist"]
+
+        # Strip unused columns
+        if alignment is not None:
+            assert alignment.shape == (total_length, total_tokens)
+            assert len(lyrics) == total_tokens, f'Total_tokens: {total_tokens}, Lyrics Len: {len(lyrics)}. Lyrics: {lyrics}'
+            max_attn_at_token = np.max(alignment, axis=0)
+            assert len(max_attn_at_token) == total_tokens
+            for token in reversed(range(total_tokens)):  # last token with any attention; NOTE(review): `token` stays unbound if total_tokens is 0
+                if max_attn_at_token[token] > 0:
+                    break
+            alignment = alignment[:,:token+1]
+            lyrics = lyrics[:token+1]
+            total_tokens = token+1
+
+            # Small alignment image
+            im = Image.fromarray(np.uint8(alignment * 255)).resize((512, 1024)).transpose(Image.ROTATE_90)
+            img_src = f'align.png'
+            im.save(f'{item_dir}/{img_src}')
+            print(f"<img id='{img_src}' src='{img_src}' />", file=html)  # was '\>': an invalid escape that wrote a backslash into the tag
+
+            # Smaller alignment json for animation
+            total_alignment_length = total_length // 16
+            alignment = Image.fromarray(np.uint8(alignment * 255)).resize((total_tokens, total_alignment_length))
+            alignment = alignment.filter(ImageFilter.GaussianBlur(radius=1.5))
+            alignment = np.asarray(alignment).tolist()
+            align_src = f'align.json'
+            with open(f'{item_dir}/{align_src}', 'w') as f:
+                json.dump(alignment, f)
+
+        # Audio
+        wav_src = f'audio.wav'
+        soundfile.write(f'{item_dir}/{wav_src}', wav, samplerate=sr, format='wav')
+        print(f"<audio id='{wav_src}' src='{wav_src}' style='width: 100%;' controls></audio>", file=html)
+
+
+        # Labels and Lyrics (text is HTML-escaped so that '<', '>' and '&' cannot break the page)
+        print(f"<pre style='white-space: pre-wrap;'>", end="", file=html)
+        print(f"<div>Artist {escape(str(artist), quote=False)}, Genre {escape(str(genre), quote=False)}</div>", file=html)
+        lyrics = [c for c in lyrics]  # already characters actually
+        lyrics = [''] + lyrics[:-1]  # input lyrics are shifted by 1
+        for i, c in enumerate(lyrics):
+            print(f"<span id='{item_id}/{i}'>{escape(str(c), quote=False)}</span>", end="", file=html)
+        print(f"</pre>", file=html)
+        with open(f'{item_dir}/lyrics.json', 'w') as f:
+            json.dump(lyrics, f)
+
+        if alignment is not None:
+            # JS for alignment animation
+            print("""<script>
+            async function fetchAsync (url) {
+                let response = await fetch(url);
+                let data = await response.json();
+                return data;
+            }
+    
+            var audio = document.getElementById('""" + f'{wav_src}' + """');
+            audio.onplay = function () {
+                track = '""" + f'{item_id}' + """'
+                fetchAsync('""" + f'{align_src}' + """')
+                .then(data => animateLyrics(data, track, this))
+                .catch(reason => console.log(reason.message))
+            }; 
+    
+            function animateLyrics(data, track, audio) {
+                var animate = setInterval(function () {
+                    var time = Math.floor(audio.currentTime*""" + f'{total_alignment_length}' + """/audio.duration);
+                    if (!(time == 0 || time == """ + f'{total_alignment_length}' + """)) {
+                        console.log(time);
+                        changeColor(data, track, audio, time);
+                    }
+                    if (audio.paused) {
+                        clearInterval(animate);
+                    }
+                }, 50);
+            }
+    
+            function changeColor(data, track, audio, time) {
+                colors = data[time]
+                for (i = 0; i < colors.length; i++){
+                    character = document.getElementById(track + '/' + i.toString());
+                    color = Math.max(230 - 10*colors[i], 0).toString();
+                    character.style.color = 'rgb(255,' + color + ',' + color + ')';
+                }
+            }
+            </script>""", file=html)
+        print("</body></html>", file=html)
diff --git a/jukebox/tests/test_sample.py b/jukebox/tests/test_sample.py
new file mode 100644
index 0000000000000000000000000000000000000000..f16d169ddd77fb44f1f6b94b3093b9dc5c0838f0
--- /dev/null
+++ b/jukebox/tests/test_sample.py
@@ -0,0 +1,141 @@
+import torch as t
+import numpy as np
+from jukebox.sample import sample_level
+from jukebox.utils.torch_utils import assert_shape
+from jukebox.hparams import Hyperparams
+
+def repeat(x, n, dim):
+    """Repeat every entry of `x` `n` times in a row along axis `dim`; -1 is the only negative axis that is translated."""
+    shape, axis = tuple(x.shape), (len(x.shape) - 1 if dim == -1 else dim)
+    return x.reshape(int(np.prod(shape[:axis+1])), 1, int(np.prod(shape[axis+1:]))).repeat(1, n, 1).reshape(*shape[:axis], n * shape[axis], *shape[axis+1:])
+
+# Tests. NOTE(review): sample_single_window calls prior.sample(...), which DummyPrior does not define — this file looks out of date relative to jukebox/sample.py.
+class DummyPrior:  # Stand-in prior whose "samples" are integer ramps, so the stitched output can be checked exactly.
+    def __init__(self, n_ctx, level, levels):
+        self.n_ctx = n_ctx
+        self.level = level
+        self.levels = levels
+        self.downsamples = (8,4,4)
+        self.cond_downsample = self.downsamples[level+1] if level != self.levels - 1 else None  # None at the last level, which has no level above it
+        self.raw_to_tokens = int(np.prod(self.downsamples[:level+1]))  # product of the factors up to and including this level
+        self.sample_length = self.n_ctx*self.raw_to_tokens
+
+        print(f"Level:{level}, Cond downsample:{self.cond_downsample}, Raw to tokens:{self.raw_to_tokens}, Sample length:{self.sample_length}")
+
+    def get_y(self, labels, start):  # returns a modified copy of labels['y']; the input tensor is left untouched
+        y = labels['y'].clone()
+        # Set sample_length to match this level
+        y[:, 2] = self.sample_length
+        # Set offset
+        y[:, 1:2] = y[:, 1:2] + start * self.raw_to_tokens
+        return y
+
+    def get_z_conds(self, zs, start, end):  # slice of the level above matching [start, end), or None at the last level
+        if self.level != self.levels - 1:
+            assert start % self.cond_downsample == end % self.cond_downsample == 0
+            z_cond = zs[self.level + 1][:,start//self.cond_downsample:end//self.cond_downsample]
+            assert z_cond.shape[1] == self.n_ctx//self.cond_downsample
+            z_conds = [z_cond]
+        else:
+            z_conds = None
+        return z_conds
+
+    def ancestral_sample(self, n_samples, z_conds=None, y=None):  # every row is the ramp 0..n_ctx-1
+        z = t.zeros((n_samples, self.n_ctx), dtype=t.long, device='cuda') + \
+            t.arange(0, self.n_ctx, dtype=t.long, device='cuda').view(1, self.n_ctx)
+        # With conditioning, z // 4 must equal z_cond with each entry repeated 4 times (the factor 4 is hard-coded).
+        if z_conds is not None:
+            z_cond = z_conds[0]
+            assert_shape(z_cond, (n_samples, self.n_ctx // 4))
+            assert (z // 4 == repeat(z_cond, 4, 1)).all(), f'z: {z}, z_cond: {z_cond}, diff: {(z // 4) - repeat(z_cond, 4, 1)}'
+        return z
+
+    def primed_sample(self, n_samples, z, z_conds=None, y=None):  # keeps z and counts upward from its last value until n_ctx tokens exist
+        prime = z.shape[1]
+        assert_shape(z, (n_samples, prime))
+        start = z[:,-1:] + 1
+        z_rest = (t.arange(0, self.n_ctx - prime, dtype=t.long, device='cuda').view(1, self.n_ctx - prime) + start).view(n_samples, self.n_ctx - prime)
+        z = t.cat([z, z_rest], dim=1)
+        # Same consistency check against the conditioning codes as in ancestral_sample.
+        if z_conds is not None:
+            z_cond = z_conds[0]
+            assert_shape(z_cond, (n_samples, self.n_ctx // 4))
+            assert (z // 4 == repeat(z_cond, 4, 1)).all(), f'z: {z}, z_cond: {z_cond}, diff: {(z // 4) - repeat(z_cond, 4, 1)}'
+        return z
+
+# Sample multiple levels, from the highest index in sample_levels down to the lowest
+def _sample(zs, labels,  priors, sample_levels, hps):
+    # NOTE(review): the empty sampling_kwargs passed below has no 'max_batch_size', which sample_single_window reads — confirm this still runs.
+    for lvl in reversed(sample_levels):
+        level_prior = priors[lvl]
+        # total token count at this level: n_segment * sample_length, divided by raw_to_tokens
+        n_tokens_total = (hps.sample_length * hps.n_segment)//level_prior.raw_to_tokens
+        zs = sample_level(zs, labels[lvl], dict(), lvl, level_prior, n_tokens_total, hps.hop_lengths[lvl], hps)
+    return zs
+
+# Ancestral sample: start every level empty and check that each result is the ramp 0..total_length-1
+def test_ancestral_sample(labels, priors, hps):
+    n_levels = hps.levels
+    empty = [t.zeros(hps.n_samples,0,dtype=t.long, device='cuda') for _ in range(n_levels)]
+    sampled = _sample(empty, labels, priors, list(range(n_levels)), hps)
+
+    # Test
+    for codes in sampled:
+        length = codes.shape[1]
+        expected = t.arange(0, length, dtype=t.long, device='cuda').view(1, length)
+        # Check sample
+        assert ((codes - expected) == 0).all()
+    print("dummy ancestral sample passed")
+
+def test_primed_sample(labels, priors, hps):  # Prime each level with a known prefix and check the continuation counts upward from it.
+    sample_levels = list(range(hps.levels))
+    # One start value per sample; view(4, 1) hard-codes a batch of 4.
+    start = t.tensor([15, 23, 11, 9], dtype=t.long, device='cuda').view(4, 1)
+    # Inputs are built for i = 2, 1, 0 (three levels hard-coded), so list position 0 holds the longest sequence.
+    zs_in = []
+    zs = []
+    for i in reversed(range(3)):
+        n_ctx = 8192*(4**i)
+        n_prime = n_ctx // 4
+        z_prime = t.arange(0, n_prime, dtype=t.long, device='cuda').view(1, n_prime) % (2*(4**i))
+        z_rest = t.randint(-10, -1, size=(1, n_ctx - n_prime), dtype=t.long, device='cuda')  # negative filler after the prime; only kept in zs_in
+        z_in = t.cat([z_prime, z_rest], dim=1) + (4**i)*start
+        zs_in.append(z_in)
+        zs.append(z_prime + (4**i)*start)  # only the primed part is handed to the sampler
+
+    zs = _sample(zs, labels, priors, sample_levels, hps)
+
+    # Test
+    for z, z_in in zip(zs, zs_in):
+        total_length = z.shape[1]
+        prime_length = z.shape[1] // (4 * hps.n_segment)
+        # Match prime tokens
+        assert (z[:,:prime_length] == z_in[:,:prime_length]).all()
+        # Check sample
+        z_rest = z[:,prime_length-1:] - z[:,prime_length-1:prime_length]  # offsets relative to the last primed token
+        assert ((z_rest - t.arange(0, total_length - prime_length + 1, dtype=t.long, device='cuda').view(1, total_length - prime_length + 1)) == 0).all()
+
+    print("dummy primed sample passed")
+
+def check_sample():
+    """Build three DummyPriors with matching labels and hyperparameters, then run both dummy sampling checks."""
+    n_ctx, n_samples, levels = 8192, 4, 3
+    priors = [DummyPrior(n_ctx, level, levels) for level in range(levels)]
+    max_total_length, offset, sample_length = 4134368, 0, n_ctx*8*4*4
+    y = t.tensor([max_total_length, offset, sample_length, 10, 1, -1, -1, -1, -1], dtype=t.long, device='cuda').view(1, 9).repeat(n_samples, 1)
+    # One (empty) info entry per sample; the previous `[[]*n_samples]` evaluated to `[[]]`, a single entry.
+    labels = [dict(y=y, info=[[] for _ in range(n_samples)]) for level in range(levels)]
+    hps = Hyperparams({
+        'levels': levels,
+        'sample_length': sample_length,
+        'n_segment': 2,
+        'n_ctx': n_ctx,
+        'n_tokens': 0,
+        'hop_lengths': [n_ctx//2, n_ctx//2, n_ctx//8],
+        'n_samples': n_samples,
+        'use_tokens': False
+    })
+    test_ancestral_sample(labels, priors, hps)
+    test_primed_sample(labels, priors, hps)
+
+check_sample()  # NOTE(review): runs on import of this module and allocates tensors on 'cuda'
diff --git a/jukebox/train.py b/jukebox/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..4532232aad0e161e76c06b86a1ae716dab27cc8d
--- /dev/null
+++ b/jukebox/train.py
@@ -0,0 +1,345 @@
+"""
+Ability to train vq-vae and prior
+First try for random inputs
+Then from maestros
+"""
+import sys
+import fire
+import warnings
+import numpy as np
+import torch as t
+import jukebox.utils.dist_adapter as dist
+from torch.nn.parallel import DistributedDataParallel
+
+from jukebox.hparams import setup_hparams
+from jukebox.make_models import make_vqvae, make_prior, restore_opt, save_checkpoint
+from jukebox.utils.logger import init_logging
+from jukebox.utils.audio_utils import audio_preprocess, audio_postprocess
+from jukebox.utils.torch_utils import zero_grad, count_parameters
+from jukebox.utils.dist_utils import print_once, allreduce, allgather
+from jukebox.utils.ema import CPUEMA, FusedEMA, EMA
+from jukebox.utils.fp16 import FP16FusedAdam, FusedAdam, LossScalar, clipped_grad_scale, backward
+from jukebox.data.data_processor import DataProcessor
+
+def prepare_aud(x, hps):
+    x = audio_postprocess(x.detach().contiguous(), hps)
+    return allgather(x)
+
+def log_aud(logger, tag, x, hps):
+    logger.add_audios(tag, prepare_aud(x, hps), hps.sr, max_len=hps.max_len, max_log=hps.max_log)
+    logger.flush()
+
+def log_labels(logger, labeller, tag, y, hps):
+    y = y.cpu().numpy()
+    txt = ''
+    for item in range(y.shape[0]):
+        description = labeller.describe_label(y[item])
+        artist, genre, lyrics = description['artist'], description['genre'], description['lyrics']
+        txt += f'{item} artist:{artist}, genre:{genre}, lyrics:{lyrics}\n'
+    logger.add_text(tag, txt)
+    logger.flush()
+
+def get_ddp(model, hps):
+    rank = dist.get_rank()
+    local_rank = rank % 8
+    ddp = DistributedDataParallel(model, device_ids=[local_rank], output_device=local_rank, broadcast_buffers=False, bucket_cap_mb=hps.bucket)
+    return ddp
+
+def get_ema(model, hps):
+    mu = hps.mu or (1. - (hps.bs * hps.ngpus/8.)/1000)
+    ema = None
+    if hps.ema and hps.train:
+        if hps.cpu_ema:
+            if dist.get_rank() == 0:
+                print("Using CPU EMA")
+            ema = CPUEMA(model.parameters(), mu=mu, freq=hps.cpu_ema_freq)
+        elif hps.ema_fused:
+            ema = FusedEMA(model.parameters(), mu=mu)
+        else:
+            ema = EMA(model.parameters(), mu=mu)
+    return ema
+
+def get_lr_scheduler(opt, hps):
+    def lr_lambda(step):
+        if hps.lr_use_linear_decay:
+            lr_scale = hps.lr_scale * min(1.0, step / hps.lr_warmup)
+            decay = max(0.0, 1.0 - max(0.0, step - hps.lr_start_linear_decay) / hps.lr_decay)
+            if decay == 0.0:
+                if dist.get_rank() == 0:
+                    print("Reached end of training")
+            return lr_scale * decay
+        else:
+            return hps.lr_scale * (hps.lr_gamma ** (step // hps.lr_decay)) * min(1.0, step / hps.lr_warmup)
+
+    shd = t.optim.lr_scheduler.LambdaLR(opt, lr_lambda)
+
+    return shd
+
+def get_optimizer(model, hps):
+    # Optimizer
+    betas = (hps.beta1, hps.beta2)
+    if hps.fp16_opt:
+        opt = FP16FusedAdam(model.parameters(), lr=hps.lr, weight_decay=hps.weight_decay, betas=betas, eps=hps.eps)
+    else:
+        opt = FusedAdam(model.parameters(), lr=hps.lr, weight_decay=hps.weight_decay, betas=betas, eps=hps.eps)
+
+    # lr scheduler
+    shd = get_lr_scheduler(opt, hps)
+
+    restore_path = hps.restore_prior if hps.prior else hps.restore_vqvae
+    restore_opt(opt, shd, restore_path)
+
+    # fp16 dynamic loss scaler
+    scalar = None
+    if hps.fp16:
+        rank = dist.get_rank()
+        local_rank = rank % 8
+        scalar = LossScalar(hps.fp16_loss_scale, scale_factor=2 ** (1./hps.fp16_scale_window))
+        if local_rank == 0: print(scalar.__dict__)
+
+    zero_grad(model)
+    return opt, shd, scalar
+
+def log_inputs(orig_model, logger, x_in, y, x_out, hps, tag="train"):
+    print(f"Logging {tag} inputs/ouputs")
+    log_aud(logger, f'{tag}_x_in', x_in, hps)
+    log_aud(logger, f'{tag}_x_out', x_out, hps)
+    bs = x_in.shape[0]
+    if hps.prior:
+        if hps.labels:
+            log_labels(logger, orig_model.labeller, f'{tag}_y_in', allgather(y.cuda()), hps)
+    else:
+        zs_in = orig_model.encode(x_in, start_level=0, bs_chunks=bs)
+        x_ds = [orig_model.decode(zs_in[level:], start_level=level, bs_chunks=bs) for level in range(0, hps.levels)]
+        for i in range(len(x_ds)):
+            log_aud(logger, f'{tag}_x_ds_start_{i}', x_ds[i], hps)
+    logger.flush()
+
+def sample_prior(orig_model, ema, logger, x_in, y, hps):
+    if ema is not None: ema.swap()
+    orig_model.eval()
+
+    x_in = x_in[:hps.bs_sample]
+    bs = x_in.shape[0]
+    zs_in = orig_model.encode(x_in, start_level=0, bs_chunks=bs)
+    assert len(zs_in) == hps.levels
+    x_ds = [orig_model.decode(zs_in[level:], start_level=level, bs_chunks=bs) for level in range(0, hps.levels)]
+
+    if not hps.labels:
+        y = None
+    elif hps.level == (hps.levels - 1):
+        # Topmost level labels in order
+        y = y[:hps.bs_sample]  # t.ones((hps.bs_sample, 1), device=y.device, dtype=t.long) * dist.get_rank()
+    else:
+        # Other levels keep labels to match x_cond
+        y = y[:hps.bs_sample]
+
+    # Temp 1.0
+    _, *z_conds = orig_model.encode(x_in, bs_chunks=bs)
+    z = orig_model.sample(hps.bs_sample, z_conds=z_conds, y=y, fp16=False, temp=1.0)
+    x_sample = orig_model.decode([z, *z_conds], bs_chunks=bs)
+
+    log_aud(logger, 'sample_x_T1', x_sample, hps)
+    if hps.prior and hps.labels:
+        log_labels(logger, orig_model.labeller, f'sample_x_T1', allgather(y.cuda()), hps)
+
+    # Recons
+    for i in range(len(x_ds)):
+        log_aud(logger, f'x_ds_start_{i}', x_ds[i], hps)
+    orig_model.train()
+    if ema is not None: ema.swap()
+    logger.flush()
+
+def evaluate(model, orig_model, logger, metrics, data_processor, hps):
+    model.eval()
+    orig_model.eval()
+    if hps.prior:
+        _print_keys = dict(l="loss", bpd="bpd")
+    else:
+        _print_keys = dict(l="loss", rl="recons_loss", sl="spectral_loss")
+
+    with t.no_grad():
+        for i, x in logger.get_range(data_processor.test_loader):
+            if isinstance(x, (tuple, list)):
+                x, y = x
+            else:
+                y = None
+
+            x = x.to('cuda', non_blocking=True)
+            if y is not None:
+                y = y.to('cuda', non_blocking=True)
+
+            x_in = x = audio_preprocess(x, hps)
+            log_input_output = (i==0)
+
+            if hps.prior:
+                forw_kwargs = dict(y=y, fp16=hps.fp16, decode=log_input_output)
+            else:
+                forw_kwargs = dict(loss_fn=hps.loss_fn, hps=hps)
+
+            x_out, loss, _metrics = model(x, **forw_kwargs)
+
+            # Logging
+            for key, val in _metrics.items():
+                _metrics[key] = val.item()
+            _metrics["loss"] = loss = loss.item() # Make sure to call to free graph
+
+            # Average and log
+            for key, val in _metrics.items():
+                _metrics[key] = metrics.update(f"test_{key}", val, x.shape[0])
+
+            with t.no_grad():
+                if log_input_output:
+                    log_inputs(orig_model, logger, x_in, y, x_out, hps)
+
+            logger.set_postfix(**{print_key:_metrics[key] for print_key, key in _print_keys.items()})
+
+    for key, val in _metrics.items():
+        logger.add_scalar(f"test_{key}", metrics.avg(f"test_{key}"))
+
+    logger.close_range()
+    return {key: metrics.avg(f"test_{key}") for key in _metrics.keys()}
+
+def train(model, orig_model, opt, shd, scalar, ema, logger, metrics, data_processor, hps):
+    model.train()
+    orig_model.train()
+    if hps.prior:
+        _print_keys = dict(l="loss", bpd="bpd", gn="gn", g_l="gen_loss", p_l="prime_loss")
+    else:
+        _print_keys = dict(l="loss", sl="spectral_loss", rl="recons_loss", e="entropy", u="usage", uc="used_curr", gn="gn", pn="pn", dk="dk")
+
+    for i, x in logger.get_range(data_processor.train_loader):
+        if isinstance(x, (tuple, list)):
+            x, y = x
+        else:
+            y = None
+
+        x = x.to('cuda', non_blocking=True)
+        if y is not None:
+            y = y.to('cuda', non_blocking=True)
+
+        x_in = x = audio_preprocess(x, hps)
+        log_input_output = (logger.iters % hps.save_iters == 0)
+
+        if hps.prior:
+            forw_kwargs = dict(y=y, fp16=hps.fp16, decode=log_input_output)
+        else:
+            forw_kwargs = dict(loss_fn=hps.loss_fn, hps=hps)
+
+        # Forward
+        x_out, loss, _metrics = model(x, **forw_kwargs)
+
+        # Backward
+        loss, scale, grad_norm, overflow_loss, overflow_grad = backward(loss=loss, params=list(model.parameters()),
+                                                                         scalar=scalar, fp16=hps.fp16, logger=logger)
+        # Skip step if overflow
+        grad_norm = allreduce(grad_norm, op=dist.ReduceOp.MAX)
+        if overflow_loss or overflow_grad or grad_norm > hps.ignore_grad_norm > 0:
+            zero_grad(orig_model)
+            continue
+
+        # Step opt. Divide by scale to include clipping and fp16 scaling
+        logger.step()
+        opt.step(scale=clipped_grad_scale(grad_norm, hps.clip, scale))
+        zero_grad(orig_model)
+        lr = hps.lr if shd is None else shd.get_lr()[0]
+        if shd is not None: shd.step()
+        if ema is not None: ema.step()
+        next_lr = hps.lr if shd is None else shd.get_lr()[0]
+        finished_training = (next_lr == 0.0)
+
+        # Logging
+        for key, val in _metrics.items():
+            _metrics[key] = val.item()
+        _metrics["loss"] = loss = loss.item() * hps.iters_before_update # Make sure to call to free graph
+        _metrics["gn"] = grad_norm
+        _metrics["lr"] = lr
+        _metrics["lg_loss_scale"] = np.log2(scale)
+
+        # Average and log
+        for key, val in _metrics.items():
+            _metrics[key] = metrics.update(key, val, x.shape[0])
+            if logger.iters % hps.log_steps == 0:
+                logger.add_scalar(key, _metrics[key])
+
+        # Save checkpoint
+        with t.no_grad():
+            if hps.save and (logger.iters % hps.save_iters == 1 or finished_training):
+                if ema is not None: ema.swap()
+                orig_model.eval()
+                name = 'latest' if hps.prior else f'step_{logger.iters}'
+                if dist.get_rank() % 8 == 0:
+                    save_checkpoint(logger, name, orig_model, opt, dict(step=logger.iters), hps)
+                orig_model.train()
+                if ema is not None: ema.swap()
+
+        # Sample
+        with t.no_grad():
+            if (logger.iters % 12000) in list(range(1, 1 + hps.iters_before_update)) or finished_training:
+                if hps.prior:
+                    sample_prior(orig_model, ema, logger, x_in, y, hps)
+
+        # Input/Output
+        with t.no_grad():
+            if log_input_output:
+                log_inputs(orig_model, logger, x_in, y, x_out, hps)
+
+        logger.set_postfix(**{print_key:_metrics[key] for print_key, key in _print_keys.items()})
+        if finished_training:
+            dist.barrier()
+            exit()
+    logger.close_range()
+    return {key: metrics.avg(key) for key in _metrics.keys()}
+
+def run(hps="teeny", port=29500, **kwargs):
+    from jukebox.utils.dist_utils import setup_dist_from_mpi
+    rank, local_rank, device = setup_dist_from_mpi(port=port)
+    hps = setup_hparams(hps, kwargs)
+    hps.ngpus = dist.get_world_size()
+    hps.argv = " ".join(sys.argv)
+    hps.bs_sample = hps.nworkers = hps.bs
+
+    # Setup dataset
+    data_processor = DataProcessor(hps)
+
+    # Setup models
+    vqvae = make_vqvae(hps, device)
+    print_once(f"Parameters VQVAE:{count_parameters(vqvae)}")
+    if hps.prior:
+        prior = make_prior(hps, vqvae, device)
+        print_once(f"Parameters Prior:{count_parameters(prior)}")
+        model = prior
+    else:
+        model = vqvae
+
+    # Setup opt, ema and distributed_model.
+    opt, shd, scalar = get_optimizer(model, hps)
+    ema = get_ema(model, hps)
+    distributed_model = get_ddp(model, hps)
+
+    logger, metrics = init_logging(hps, local_rank, rank)
+    logger.iters = model.step
+
+    # Run training, eval, sample
+    for epoch in range(hps.curr_epoch, hps.epochs):
+        metrics.reset()
+        data_processor.set_epoch(epoch)
+        if hps.train:
+            train_metrics = train(distributed_model, model, opt, shd, scalar, ema, logger, metrics, data_processor, hps)
+            train_metrics['epoch'] = epoch
+            if rank == 0:
+                print('Train',' '.join([f'{key}: {val:0.4f}' for key,val in train_metrics.items()]))
+            dist.barrier()
+
+        if hps.test:
+            if ema: ema.swap()
+            test_metrics = evaluate(distributed_model, model, logger, metrics, data_processor, hps)
+            test_metrics['epoch'] = epoch
+            if rank == 0:
+                print('Ema',' '.join([f'{key}: {val:0.4f}' for key,val in test_metrics.items()]))
+            dist.barrier()
+            if ema: ema.swap()
+        dist.barrier()
+
+# When executed as a script, hand `run` to python-fire to build the command line.
+if __name__ == '__main__':
+    fire.Fire(run)
diff --git a/jukebox/transformer/__init__.py b/jukebox/transformer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/jukebox/transformer/factored_attention.py b/jukebox/transformer/factored_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f9b62940fa06fa1c2b36261c440f5d7839ccecf
--- /dev/null
+++ b/jukebox/transformer/factored_attention.py
@@ -0,0 +1,510 @@
+# Factored attention
+import math
+import numpy as np
+import torch as t
+import torch.nn as nn
+import torch.nn.functional as F
+from jukebox.transformer.ops import Conv1D
+from jukebox.utils.checkpoint import checkpoint
+
+def repeat(x, n, dim):
+    if dim == -1:
+        dim = len(x.shape) - 1
+    return x.view(int(np.prod(x.shape[:dim+1])), 1, int(np.prod(x.shape[dim+1:]))).repeat(1,n,1).view(*x.shape[:dim], n * x.shape[dim], *x.shape[dim+1:])
+
+def get_mask(mask, q_l, kv_l, blocks, spread, device, sample, sample_t):
+    # returns a mask of shape 1 x 1 x q_l x kv_l or None if masking is not needed.
+    if mask is None or q_l == 1:
+        return None
+    offset = sample_t - q_l if sample else max(kv_l - q_l, 0)
+    if mask == 'autoregressive':
+        # Masked dense
+        mask = t.ones(q_l, kv_l, device=device).tril(offset)
+    elif mask == 'summary':
+        # Masked summary
+        mask = t.nn.functional.pad(t.ones(q_l, q_l, device=device).tril().view(q_l, blocks, q_l // blocks)[:,:-1,-kv_l//blocks:],(0,0,1,0),value=1).contiguous().view(q_l, kv_l)
+    elif mask == 'prime':
+        mask = t.ones(q_l, kv_l, device=device).tril(offset)
+    return mask.view(1,1,q_l,kv_l)
+
+class FactoredAttention(nn.Module):
+    def __init__(self, n_in, n_ctx, n_state, n_head,
+                 attn_dropout=0.0, resid_dropout=0.0,
+                 scale=True, mask=False,
+                 zero_out=False, init_scale=1.0,
+                 checkpoint_attn=0,
+                 attn_func=0, blocks=None, spread=None,
+                 encoder_dims=None, prime_len=None):
+        """Configure the projections and select the attention variant.
+
+        `attn_func` (0-7) picks the qkv-preparation method, the attention method
+        and the mask kind from the table below. `n_state` must be divisible by
+        `n_head`, and when `blocks` is given `n_ctx` must be divisible by it.
+        """
+        super().__init__()
+        self.n_in = n_in
+        self.n_ctx = n_ctx # NOTE: n_ctx could be different within operations. This is complete n_ctx
+        self.n_state = n_state
+        assert n_state % n_head == 0
+        self.n_head = n_head
+        self.scale = scale
+        self.mask = mask
+        if attn_func == 6:
+            # attn_func 6: c_attn yields queries only; keys and values come from c_enc_kv.
+            self.c_attn = Conv1D(n_in, n_state, init_scale=init_scale)
+            self.c_enc_kv = Conv1D(n_in, n_state * 2, init_scale=init_scale)
+        else:
+            # One projection whose output is chunked into query, key and value.
+            self.c_attn = Conv1D(n_in, n_state * 3, init_scale=init_scale)
+        self.c_proj = Conv1D(n_state, n_in, zero_out, init_scale=init_scale)
+        # A dropout rate of 0 is replaced by an identity function.
+        self.attn_dropout = nn.Dropout(attn_dropout) if attn_dropout > 0.0 else lambda x: x
+        self.resid_dropout = nn.Dropout(resid_dropout) if resid_dropout > 0.0 else lambda x: x
+
+        # Sequence of length l is factored as [blocks, l // blocks]
+        self.attn_func = attn_func
+        # Each entry: (qkv preparation method, attention method, mask kind passed to get_mask).
+        self.qkv, self.attn, self.attn_mask = {
+            0: (self.factored_qkv, self.dense_attn, 'autoregressive'),              # Attend to all positions
+            1: (self.factored_qkv, self.block_attn, 'autoregressive'),              # Attend to your block
+            2: (self.factored_qkv, self.transpose_block_attn, 'autoregressive'),    # Attend to transpose block
+            3: (self.factored_qkv, self.prev_block_attn, None),                     # Attend to previous block
+            4: (self.factored_qkv, self.summary_attn, 'summary'),                   # Attend to last position of each block
+            5: (self.factored_qkv, self.summary_spread_attn, 'summary'),
+            6: (self.decode_qkv, self.decode_attn, None),
+            7: (self.prime_qkv, self.prime_attn, 'prime')
+        }[attn_func] # Attend to last k position of each block
+
+        self.blocks = blocks
+        self.spread = spread
+        if blocks is not None:
+            assert n_ctx % blocks == 0
+            self.block_ctx = n_ctx // blocks
+        self.checkpoint_attn = checkpoint_attn # 0: None, 1: Attn after heads split, 2: Attn
+
+        # Sampling state: number of positions consumed so far and the key/value cache.
+        self.sample_t = 0
+        self.cache = {}
+        self.encoder_dims = encoder_dims
+        self.prime_len = prime_len
+        # When record_attn is set, _attn stores its attention weights in self.w.
+        self.record_attn = False
+        self.w = None
+
+    def _attn(self, q, k, v, sample):
+        scale = 1. / math.sqrt(math.sqrt(self.n_state // self.n_head))
+        if self.training:
+            w = t.matmul(q * scale, k * scale)
+        else:
+            w = t.matmul(q, k)
+            w.mul_(scale*scale)
+        wtype = w.dtype
+        w = w.float()
+        if self.mask:
+            # Generate appropriate mask to mask out all positions before current
+            # Might take up lot of memory for dense, so can cache it
+            mask = get_mask(self.attn_mask, q.size(-2), k.size(-1), self.blocks, self.spread, w.device, sample, self.sample_t)
+            if mask is not None:
+                #print(mask)
+                w = w * mask + -1e9 * (1 - mask)
+            w = F.softmax(w, dim=-1).type(wtype)
+        else:
+            w = F.softmax(w, dim=-1).type(wtype)
+        if self.record_attn:
+            self.w = w #.float().cpu().numpy()
+            if self.attn_func == 7:
+                # only keep music queries and lyrics keys/values
+                self.w = self.w[:,:,self.prime_len:,:self.prime_len]
+        w = self.attn_dropout(w)
+        a = t.matmul(w, v)
+        return a
+
+    def merge_heads(self, x):
+        x = x.permute(0, 2, 1, 3).contiguous()
+        new_x_shape = (*x.size()[:-2], x.size(-2) * x.size(-1))
+        return x.view(*new_x_shape)  # in Tensorflow implem: fct merge_states
+
+    def split_heads(self, x, k=False):
+        new_x_shape = (*x.size()[:-1], self.n_head, x.size(-1) // self.n_head)
+        x = x.view(*new_x_shape)  # in Tensorflow implem: fct split_states
+        if k:
+            return x.permute(0, 2, 3, 1)
+        else:
+            return x.permute(0, 2, 1, 3)
+
+    def dense_attn(self, query, key, value, sample):
+        query = self.split_heads(query)
+        key = self.split_heads(key, k=True)
+        value = self.split_heads(value)
+        if self.checkpoint_attn == 1 and not sample:
+            a = checkpoint(lambda q,k,v,s=sample: self._attn(q,k,v,s), (query, key, value),
+                       (), True)
+        else:
+            a = self._attn(query,key,value,sample)
+        a = self.merge_heads(a)
+        return a
+
+    def block_attn(self, q, k, v, sample):
+        """Attend within blocks of `block_ctx` positions.
+
+        When sampling, the key/value length must equal `_suff_cache_len()` and
+        the single query attends to all of it. Otherwise queries, keys and
+        values are cut into blocks that are folded into the batch axis, so
+        `dense_attn` runs on each block separately.
+        """
+        blocks, block_ctx = self.blocks, self.block_ctx # block_ctx is l // blocks for complete l ie l = n_ctx. Sampling has less l
+        bs, l, d = v.shape # For sample, q_l = 1, k_l = v_l = sample_t
+        if sample:
+            assert l == self._suff_cache_len(), f"{l} != {self._suff_cache_len()}"
+            return self.dense_attn(q, k, v, sample).view(bs, 1, d)
+        else:
+            ql = q.shape[1]
+            q = q.view(bs * ql // block_ctx, block_ctx, d)
+            if ql < l:
+                # Fewer queries than keys: keep only the trailing ql keys/values.
+                l = ql
+                k = k[:, -l:].contiguous()
+                v = v[:, -l:].contiguous()
+            k = k.view(bs * l // block_ctx, block_ctx, d)
+            v = v.view(bs * l // block_ctx, block_ctx, d)
+            return self.dense_attn(q, k, v, sample).view(bs, l, d)
+
+    def transpose_block_attn(self, q, k, v, sample):
+        """Attend across blocks among positions that share the same offset within their block.
+
+        When sampling, keys/values are strided so that only positions at the
+        current in-block offset remain. Otherwise the (block, offset) axes are
+        transposed and the offset axis is folded into the batch, attention is
+        run, and the layout is transposed back.
+        """
+        blocks, block_ctx = self.blocks, self.block_ctx # block_ctx is l // blocks for complete l ie l = n_ctx. Sampling has less l
+        bs, l, d = v.shape # For sample, q_l = 1, k_l = v_l = sample_t
+        if sample:
+            # In-block offset of the last cached position.
+            block_l = (l - 1) % block_ctx
+            k = k[:,block_l::block_ctx,:]
+            v = v[:,block_l::block_ctx,:]
+            return self.dense_attn(q, k, v, sample).view(bs, 1, d)
+        else:
+            ql = q.shape[1]
+            q = q.view(bs, ql // block_ctx, block_ctx, d).transpose(1,2).contiguous().view(bs * block_ctx, ql // block_ctx, d)
+            k = k.view(bs,  l // block_ctx, block_ctx, d).transpose(1,2).contiguous().view(bs * block_ctx,  l // block_ctx, d)
+            v = v.view(bs,  l // block_ctx, block_ctx, d).transpose(1,2).contiguous().view(bs * block_ctx,  l // block_ctx, d)
+            return self.dense_attn(q, k, v, sample).view(bs, block_ctx, ql // block_ctx, d).transpose(1,2).contiguous().view(bs, ql, d)
+
+    def prev_block_attn(self, q, k, v, sample):
+        """Attend to the block preceding the query's block; the first block sees zeros.
+
+        When sampling, the previous block is sliced out of the keys/values (or
+        replaced by zeros while still in the first block). Otherwise keys and
+        values are shifted by one block, with an all-zero block in front, and
+        attention runs per block.
+        """
+        blocks, block_ctx = self.blocks, self.block_ctx # block_ctx is l // blocks for complete l ie l = n_ctx. Sampling has less l
+        bs, l, d = v.shape # For sample, q_l = 1, k_l = v_l = sample_t
+        if sample:
+            assert l == self._suff_cache_len(), f"{l} != {self._suff_cache_len()}"
+            block = (l - 1) // block_ctx
+            prev_l = (block - 1) * block_ctx
+            if block > 0:
+                # The key/value length is capped by _suff_cache_len, so the previous block must start at 0.
+                assert prev_l == 0
+                k = k[:, prev_l:prev_l + block_ctx, :]
+                v = v[:, prev_l:prev_l + block_ctx, :]
+            else:
+                # No previous block yet: attend to zeros.
+                k = t.zeros(bs, block_ctx, d, device=q.device, dtype=q.dtype)
+                v = t.zeros(bs, block_ctx, d, device=q.device, dtype=q.dtype)
+            return self.dense_attn(q, k, v, sample).view(bs, 1, d)
+        else:
+            ql = q.shape[1]
+            q = q.view(bs * ql // block_ctx, block_ctx, d)
+            # Drop the last block and pad one zero block at the front of the block axis.
+            k = t.nn.functional.pad(k.view(bs, l // block_ctx, block_ctx, d)[:, :-1, :, :], (0,0,0,0,1,0)).view(bs * l // block_ctx, block_ctx, d)
+            v = t.nn.functional.pad(v.view(bs, l // block_ctx, block_ctx, d)[:, :-1, :, :], (0,0,0,0,1,0)).view(bs * l // block_ctx, block_ctx, d)
+            if ql < l:
+                # Fewer query blocks than key blocks: keep only the trailing key/value blocks.
+                qb = ql // block_ctx
+                kb =  l // block_ctx
+                l = ql
+                k = k.view(bs, kb, block_ctx, d)[:, -qb:].contiguous().view(bs * qb, block_ctx, d)
+                v = v.view(bs, kb, block_ctx, d)[:, -qb:].contiguous().view(bs * qb, block_ctx, d)
+            return self.dense_attn(q, k, v, sample).view(bs, l, d)
+
+    def summary_attn(self, q, k, v, sample):
+        """Attend to the last position of each block except the final one, preceded by one zero entry."""
+        blocks, block_ctx = self.blocks, self.block_ctx # block_ctx is l // blocks for complete l ie l = n_ctx. Sampling has less l
+        bs, l, d = v.shape # For sample, q_l = 1, k_l = v_l = sample_t
+        if sample:
+            # Stride over the last position of each block, then pad one zero entry at the front.
+            k = t.nn.functional.pad(k[:, block_ctx-1:blocks*block_ctx-1:block_ctx, :],(0,0,1,0))
+            v = t.nn.functional.pad(v[:, block_ctx-1:blocks*block_ctx-1:block_ctx, :],(0,0,1,0))
+            return self.dense_attn(q, k, v, sample).view(bs, 1, d)
+        else:
+            k = t.nn.functional.pad(k.view(bs, blocks, l // blocks, d)[:, :-1, -1, :],(0,0,1,0)) # bs, blocks, d
+            v = t.nn.functional.pad(v.view(bs, blocks, l // blocks, d)[:, :-1, -1, :],(0,0,1,0)) # bs, blocks, d
+            return self.dense_attn(q, k, v, sample).view(bs, l, d)
+
+    def summary_spread_attn(self, q, k, v, sample):
+        blocks, block_ctx, spread = self.blocks, self.block_ctx, self.spread # block_ctx is l // blocks for complete l ie l = n_ctx. Sampling has less l
+        bs, l, d = v.shape # For sample, q_l = 1, k_l = v_l = sample_t
+        if sample:
+            assert False, "Not yet implemented"
+            # k = t.nn.functional.pad(k,(0,0,block_ctx,(-l)%block_ctx)).view(bs, -1, block_ctx, d)[:,:-1,-spread:,:].contiguous().view(bs, -1, d)
+            # v = t.nn.functional.pad(v,(0,0,block_ctx,(-l)%block_ctx)).view(bs, -1, block_ctx, d)[:,:-1,-spread:,:].contiguous().view(bs, -1, d)
+            # return self.dense_attn(q, k, v, sample).view(bs, 1, d)
+        else:
+            k = t.nn.functional.pad(k.view(bs, blocks, l // blocks, d)[:, :-1, -spread:, :],(0,0,0,0,1,0)).contiguous().view(bs, blocks * spread, d)  # bs, blocks * spread, d
+            v = t.nn.functional.pad(v.view(bs, blocks, l // blocks, d)[:, :-1, -spread:, :],(0,0,0,0,1,0)).contiguous().view(bs, blocks * spread, d)  # bs, blocks * spread, d
+            return self.dense_attn(q, k, v, sample).view(bs, l, d)
+
+    def prime_attn(self, q, k, v, sample):
+        prime_len = self._prime_len
+        k = k[:, :prime_len]
+        v = v[:, :prime_len]
+        return self.dense_attn(q, k, v, sample)
+
+    def decode_attn(self, q, k, v, sample):
+        assert k.shape[1] == v.shape[1] == self.encoder_dims, f'k: {k.shape}, v: {v.shape}, enc_dims: {self.encoder_dims}'
+        return self.dense_attn(q, k, v, sample)
+
+    def factored_qkv(self, x, encoder_kv=None, sample=False):
+        """Split the projected `x` into query, key and value, maintaining the cache when sampling.
+
+        Returns `(query, key, value, sample)`. The returned `sample` flag is
+        turned off when more than one position is fed while sampling, so the
+        attention method then takes its non-sampling path.
+        """
+        curr_ctx = x.shape[1]
+        assert encoder_kv is None
+        query, key, value = x.chunk(3, dim=2)
+        if sample:
+            self.sample_t += curr_ctx
+            key, value = self._append_cache(key, value)
+            # Trim the cache to the trailing positions this attention variant needs.
+            l_cache = self._suff_cache_len()
+            if self._cache_len() > l_cache:
+                self._slice_cache(-l_cache)
+            if curr_ctx > 1:
+                # Several positions at once: pad to whole blocks (all variants except dense).
+                if self.attn_func != 0:
+                    query = self._pad_to_block_ctx(query, query=True)
+                    key = self._pad_to_block_ctx(key)
+                    value = self._pad_to_block_ctx(value)
+                    assert key.shape[1] % self.block_ctx == 0
+                    assert query.shape[1] % self.block_ctx == 0
+                assert key.shape[1] == value.shape[1]
+                assert query.shape[1] <= key.shape[1]
+                sample = False
+            else:
+                # Single position: use the (possibly trimmed) cache.
+                key = self.cache['key']
+                value = self.cache['value']
+        return query, key, value, sample
+
+    def prime_qkv(self, x, encoder_kv=None, sample=False):
+        """Split the projected `x` into query, key and value for prime attention.
+
+        When sampling, keys/values are cached only until the cache holds
+        `_prime_len` positions, and the cached tensors are returned. Otherwise
+        the key/value length must equal `n_ctx`.
+        """
+        curr_ctx = x.shape[1]
+        assert encoder_kv is None
+        query, key, value = x.chunk(3, dim=2)
+        if sample:
+            if self._cache_len() < self._prime_len:
+                self._append_cache(key, value)
+            # Cut the cache back if the append overshot the prime length.
+            if self._cache_len() > self._prime_len:
+                self._slice_cache(0, self._prime_len)
+            key, value = self.cache['key'], self.cache['value']
+            self.sample_t += curr_ctx
+            assert key.shape[1] == value.shape[1] == self._suff_cache_len(), f'k: {key.shape}, v: {value.shape}, prime_dims: {self._suff_cache_len()}'
+        else:
+            assert key.shape[1] == value.shape[1] == self.n_ctx, f'k: {key.shape}, v: {value.shape}, prime_dims: {self.n_ctx}'
+        assert key.shape[0] == value.shape[0] == query.shape[0], f'k: {key.shape}, v: {value.shape}, q: {query.shape}'
+        assert key.shape[2] == value.shape[2] == query.shape[2], f'k: {key.shape}, v: {value.shape}, q: {query.shape}'
+        return query, key, value, sample
+
+    def decode_qkv(self, x, encoder_kv=None, sample=False):
+        """Use the projected `x` as the query and derive key/value from `encoder_kv` via `c_enc_kv`.
+
+        When sampling, the key/value projection is computed only while
+        `sample_t` is 0 and is read from the cache afterwards.
+        """
+        curr_ctx = x.shape[1]
+        assert encoder_kv is not None
+        query = x
+        if sample:
+            if self.sample_t == 0:
+                self.cache['key'], self.cache['value'] = self.c_enc_kv(encoder_kv.type_as(x)).chunk(2, dim=2)
+            key, value = self.cache['key'], self.cache['value']
+            self.sample_t += curr_ctx
+        else:
+            key, value = self.c_enc_kv(encoder_kv.type_as(x)).chunk(2, dim=2)
+        assert key.shape[0] == value.shape[0] == query.shape[0], f'k: {key.shape}, v: {value.shape}, q: {query.shape}'
+        assert key.shape[1] == value.shape[1] == self.encoder_dims, f'k: {key.shape}, v: {value.shape}, enc_dims: {self.encoder_dims}'
+        assert key.shape[2] == value.shape[2] == query.shape[2], f'k: {key.shape}, v: {value.shape}, q: {query.shape}'
+        return query, key, value, sample
+
+    def forward(self, x, encoder_kv=None, sample=False):
+        curr_ctx = x.shape[1]
+        x = self.c_attn(x)
+        query, key, value, sample = self.qkv(x, encoder_kv=encoder_kv, sample=sample)
+        if self.checkpoint_attn == 2 and not sample:
+            a = checkpoint(lambda q,k,v,s=sample: self.attn(q,k,v,s), (query, key, value), (), True)
+        else:
+            a = self.attn(query,key,value,sample)
+        if a.shape[1] != curr_ctx:
+            offset = self._offset(curr_ctx)
+            a = a[:,offset:offset + curr_ctx,:].contiguous()
+        a = self.c_proj(a)
+        return self.resid_dropout(a)
+
+    @property
+    def _prime_len(self):
+        prime_len = self.prime_len
+        assert prime_len is not None
+        prime_blocks = (prime_len // self.blocks) + 1
+        return prime_blocks * self.blocks
+
+    def _offset(self, curr_ctx):
+        if self.attn_func == 0:
+            return 0
+        return (self.sample_t - curr_ctx) % self.block_ctx
+
+    def _pad_to_block_ctx(self, x, query=False):
+        l = x.shape[1]
+        offset = self._offset(l) if query else 0
+        n_blocks = (l + offset + self.block_ctx - 1) // self.block_ctx
+        pad = n_blocks * self.block_ctx - l - offset
+        if pad == 0 and offset == 0:
+            return x
+        else:
+            return F.pad(x, (0, 0, offset, pad))
+
+    def _cache_len(self):
+        return 0 if 'key' not in self.cache else self.cache['key'].shape[1]
+
+    def _suff_cache_len(self):
+        """
+        Precondition:
+            key and value are appended with the current context and
+            self.sample_t reflects the 1-indexed sample location in the
+            context.
+        """
+        if self.attn_func == 0:
+            return self.sample_t
+        elif self.attn_func == 1:
+            return (self.sample_t - 1) % self.block_ctx + 1
+        elif self.attn_func == 2:
+            return self.sample_t
+        elif self.attn_func == 3:
+            if self.sample_t <= self.block_ctx:
+                return self.sample_t
+            else:
+                curr_block = (self.sample_t - 1) % self.block_ctx + 1
+                prev_block = self.block_ctx
+                return curr_block + prev_block
+        elif self.attn_func == 6:
+            return self.encoder_dims
+        elif self.attn_func == 7:
+            return min(self.sample_t, self._prime_len)
+        else:
+            raise NotImplementedError()
+
+    def _slice_cache(self, start, end=None):
+        self.cache['key'] = self.cache['key'][:, start:end]
+        self.cache['value'] = self.cache['value'][:, start:end]
+
+    def _append_cache(self, key, value):  # extend the cache along dim 1; returns the full cached (key, value)
+        if 'key' not in self.cache:
+            self.cache['key'] = key  # first call: the inputs become the cache as-is
+            self.cache['value'] = value
+        else:
+            old_key, old_value = key, value
+            key = t.cat([self.cache['key'], key], dim=1)
+            value = t.cat([self.cache['value'], value], dim=1)
+            del self.cache['key']  # drop references to the pre-concatenation tensors (presumably to release memory early)
+            del self.cache['value']
+            del old_key
+            del old_value
+            self.cache['key'] = key
+            self.cache['value'] = value
+        return self.cache['key'], self.cache['value']
+
+    def del_cache(self):  # reset sampling state: sample_t back to 0 and an empty cache dict
+        self.sample_t = 0
+        if 'key' in self.cache:
+            del self.cache['key']
+        if 'value' in self.cache:
+            del self.cache['value']
+        self.cache = {}  # rebinding also discards any other entries
+
+    def check(self):  # self-test (needs CUDA): which input positions receive gradient from one output position
+        blocks = self.blocks or 1
+        spread = self.spread or 1
+        bs, l, d = (4, self.n_ctx, self.n_in)
+        x = t.randn(bs, l, d).cuda()
+        x.requires_grad = True
+        x_out = self.forward(x) # bs, l, d
+        loss = x_out.mean(dim = -1) # bs, l
+        pos = 60
+        grad = t.autograd.grad(loss[2, pos], x)[0]  # gradient of batch item 2 at position pos w.r.t. the whole input
+
+        assert grad.shape == (bs, l, d)
+        assert (grad[:2] == 0).all()  # other batch items must not contribute
+        assert (grad[3:] == 0).all()
+        assert (grad[2, (pos + 1):] == 0).all()  # positions after pos must not contribute
+        pos_grad = (t.sum(grad[2] ** 2, dim=-1) > 0).nonzero().view(-1).cpu()  # positions with non-zero gradient
+
+        block_pos = pos - (pos % (l // blocks))  # first position of the block that contains pos
+        exp_pos_grad = {0: t.arange(pos),
+                        1: t.arange(block_pos, pos),
+                        2: t.arange(pos % (l // blocks), pos, l // blocks),
+                        3: t.arange(block_pos - l // blocks, block_pos),
+                        4: t.arange(l // blocks - 1, pos, l // blocks),
+                        5: ((t.arange(pos) % (l // blocks) >= (l // blocks - spread)) & (t.arange(pos) < block_pos)).nonzero().view(-1)}[self.attn_func]
+        exp_pos_grad = t.cat([exp_pos_grad, t.tensor([pos])], dim=-1)  # pos itself is always expected
+
+        assert (len(pos_grad) == len(exp_pos_grad)) and (pos_grad == exp_pos_grad).all(), \
+            f"Expected pos grad {exp_pos_grad} got {pos_grad} for attn_func {self.attn_func} pos {pos} l {l} blocks {blocks}"
+
+    def check_cache(self, n_samples, sample_t, fp16):  # assert sample_t, cached key/value shapes and dtype
+        assert self.sample_t == sample_t, f"{self.sample_t} != {sample_t}"
+        if sample_t == 0:
+            assert self.cache == {}  # nothing may be cached before the first sample
+        else:
+            dtype = {True: t.float16, False: t.float32}[fp16]
+            l_cache = self._suff_cache_len()  # expected cache length for the current attn_func
+            assert self.cache['key'].shape == (n_samples, l_cache, self.n_state)
+            assert self.cache['value'].shape == (n_samples, l_cache, self.n_state)
+            assert self.cache['key'].dtype == dtype, f"Expected {dtype}, got {self.cache['key'].dtype}"
+            assert self.cache['value'].dtype == dtype, f"Expected {dtype}, got {self.cache['value'].dtype}"
+
+    def check_sample(self):  # self-test (needs CUDA): step-by-step and primed sampling must match the normal forward
+        t.manual_seed(42)
+        bs, l, d = (4, self.n_ctx, self.n_in)
+        prime = 5
+        x = t.randn(bs, l, d).cuda()
+        xs = t.chunk(x, l, dim=1)  # l chunks of length 1 along dim 1
+        assert self.sample_t == 0
+        assert self.cache == {}
+
+        with t.no_grad():
+            enc_l = self.encoder_dims
+            encoder_kv = None
+            if self.attn_func == 6:
+                encoder_kv = t.randn(bs, enc_l, d).cuda()  # only attn_func 6 is given encoder key/values
+
+            # Normal path
+            x_out_normal = self.forward(x, encoder_kv=encoder_kv)
+
+            # Sampling path
+            x_out_sample = t.cat([self.forward(xs[i], encoder_kv=encoder_kv, sample=True) for i in range(l)],dim=1)
+        max_err = t.max(t.abs(x_out_sample - x_out_normal))
+        assert max_err < 1e-8, f"Max sampling err is {max_err} {[i for i in range(l) if t.max(t.abs(x_out_sample - x_out_normal)[:,i,:]) > 1e-8]}"
+
+        with t.no_grad():
+            x_out_normal = x_out_normal[:,:prime,:]  # compare only the first `prime` positions below
+            # Prime sampling path
+            self.del_cache()
+            x_out_sample = self.forward(x[:,:prime,:].contiguous(), encoder_kv=encoder_kv, sample=True)
+            self.check_cache(bs, prime, False)
+
+        max_err = t.max(t.abs(x_out_sample - x_out_normal))
+        assert max_err < 1e-8, f"Max prime sampling err is {max_err} {[i for i in range(prime) if t.max(t.abs(x_out_sample - x_out_normal)[:,i,:]) > 1e-8]}"
+
+    def check_chunks(self, chunk_size):  # self-test (needs CUDA): chunked sampling must match the full forward pass
+        t.manual_seed(42)
+        bs, l, d = (4, self.n_ctx, self.n_in)
+        enc_l = self.encoder_dims
+        assert l % chunk_size == 0  # the context must split into equal chunks
+        n_chunks = l // chunk_size
+        with t.no_grad():
+            encoder_kv = None
+            x = t.randn(bs, l, d).cuda()
+            if self.attn_func == 6:
+                encoder_kv = t.randn(bs, enc_l, d).cuda()
+
+            self.del_cache()
+            y_forw = self.forward(x, encoder_kv=encoder_kv, sample=False)  # reference output
+            self.del_cache()
+            y_forw_sample = self.forward(x, encoder_kv=encoder_kv, sample=True)  # whole context in one sampling call
+            max_err = t.max(t.abs(y_forw - y_forw_sample))
+            assert max_err <= 1e-6, f"Max err is {max_err} {[i for i in range(l) if t.max(t.abs(y_forw - y_forw_sample)[:, i, :]) > 1e-6]}"
+
+            self.del_cache()
+            x_chunks = t.chunk(x, n_chunks, dim=1)
+            y_chunks = []
+            total_len = 0
+            for x_chunk in x_chunks:
+                y_chunk = self.forward(x_chunk.contiguous(), encoder_kv=encoder_kv, sample=True)
+                total_len += x_chunk.shape[1]
+                self.check_cache(bs, total_len, False)  # cache must be consistent after every chunk
+                y_chunks.append(y_chunk)
+            y_forw_in_chunks = t.cat(y_chunks, dim=1)
+
+            max_err = t.max(t.abs(y_forw - y_forw_in_chunks))
+            assert max_err <= 1e-6, f"Max err is {max_err} {[i for i in range(l) if t.max(t.abs(y_forw - y_forw_in_chunks)[:, i, :]) > 1e-6]}"
+
+
+if __name__ == '__main__':  # manual self-test: runs check_sample and check_chunks for several attn_func values
+    from jukebox.utils.dist_utils import setup_dist_from_mpi
+    setup_dist_from_mpi(port=29600)
+    n_in = 16
+    n_state = n_in * 2
+    n_ctx = 6144
+    n_head = 4
+    n_depth = 12
+    blocks = 64
+    chunk_size = 8
+    for attn_func in [0, 1, 2, 3, 6, 7]:
+        encoder_dims = {0: 0, 1: 0, 2: 0, 3: 0, 6: 64, 7: 0}[attn_func]  # only attn_func 6 gets a non-zero encoder length
+        prime_len = {0: 0, 1: 0, 2: 0, 3: 0, 6: 0, 7: 384}[attn_func]  # only attn_func 7 gets a non-zero prime length
+        attn = FactoredAttention(n_in, n_ctx + prime_len, n_state, n_head, mask=True,
+                                 attn_func=attn_func, blocks=blocks,
+                                 encoder_dims=encoder_dims, prime_len=prime_len)
+        attn.training = False
+        attn.check_sample()
+        attn.check_chunks(chunk_size)
+        print(f"Checked attn_func: {attn_func}")
diff --git a/jukebox/transformer/ops.py b/jukebox/transformer/ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e12a3bc6711b0f849dd6c795e0df20f2cd2edb0
--- /dev/null
+++ b/jukebox/transformer/ops.py
@@ -0,0 +1,142 @@
+import math
+import numpy as np
+import torch as t
+import torch.nn as nn
+import torch.nn.functional as F
+
+# Import FusedLayerNorm if we have apex, otherwise use regular LayerNorm
+try:
+    from apex.normalization import FusedLayerNorm
+    print("Using apex FusedLayerNorm")
+except ImportError:
+    from torch.nn import LayerNorm as FusedLayerNorm
+
+class LayerNorm(FusedLayerNorm):  # layer norm that casts the input to float32 and casts the result back to the input dtype
+    def __init__(self, normalized_shape, eps=1e-5, elementwise_affine=True):
+        super().__init__(normalized_shape, eps=eps, elementwise_affine=elementwise_affine)
+        self.width = np.prod(normalized_shape)
+        self.max_numel = 65535*self.width  # inputs with more elements than this bypass the parent forward
+
+    def forward(self, input):
+        if input.numel() > self.max_numel:
+            return F.layer_norm(input.float(), self.normalized_shape, self.weight, self.bias, self.eps).type_as(input)  # large input: plain F.layer_norm
+        else:
+            return super(LayerNorm, self).forward(input.float()).type_as(input)
+
+def gelu(x):  # tanh-based approximation of GELU
+    return 0.5 * x * (1 + t.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * t.pow(x, 3))))
+
+
+def swish(x):  # elementwise x * sigmoid(x)
+    return x * t.sigmoid(x)
+
+@t.jit.script
+def quick_gelu(x):  # elementwise x * sigmoid(1.702 * x)
+    return x * t.sigmoid(1.702 * x)
+
+@t.jit.script
+def quick_gelu_bwd(x, grad_output):  # grad_output times the derivative of quick_gelu at x
+    sig = t.sigmoid(1.702 * x)
+    return grad_output * sig * (1.702 * x * (1 - sig) + 1.)
+
+class QuickGelu(t.autograd.Function):  # quick_gelu with a hand-written backward; only the input x is saved
+    @staticmethod
+    def forward(ctx, x):
+        ctx.save_for_backward(x)  # the backward pass recomputes the sigmoid from x
+        return quick_gelu(x)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        return quick_gelu_bwd(ctx.saved_tensors[0], grad_output)
+
+def memory_efficient_quick_gelu(x):  # functional entry point for the QuickGelu autograd function
+    return QuickGelu.apply(x)
+
+ACT_FNS = {  # activation name -> callable
+    'relu': t.nn.functional.relu,
+    'swish': swish,
+    'gelu': gelu,
+    'quick_gelu': memory_efficient_quick_gelu  # the custom-autograd wrapper is used here rather than plain quick_gelu
+}
+
+def _move_to_gpu_and_convert_conv_weights_to_fp16(l):  # move module l to CUDA; if it is a Conv1D, make its weight fp16
+    l.cuda()
+    if isinstance(l, Conv1D):
+        l.w.data = l.w.data.half()  # only w is converted; the bias b is left untouched
+
+def _convert_conv_weights_to_fp32(l):  # if l is a Conv1D, convert its weight w to fp32 in place; otherwise no-op
+    if isinstance(l, Conv1D):
+        l.w.data = l.w.data.float()
+
+def _convert_conv_weights_to_fp16(l):  # if l is a Conv1D, convert its weight w to fp16 in place; otherwise no-op
+    if isinstance(l, Conv1D):
+        l.w.data = l.w.data.half()
+
+def _convert_embedding_weights_to_fp16(l):  # if l is an nn.Embedding, convert its weight to fp16; otherwise no-op
+    if isinstance(l, t.nn.Embedding):
+        l.weight.data = l.weight.data.half()
+
+def _convert_embedding_weights_to_fp32(l):  # if l is an nn.Embedding, convert its weight to fp32; otherwise no-op
+    if isinstance(l, t.nn.Embedding):
+        l.weight.data = l.weight.data.float()
+
+class Conv1D(nn.Module):  # affine map over the last dim: x @ w + b, with w of shape (n_in, n_out)
+    def __init__(self, n_in, n_out, zero_out=False, init_scale=1.0):
+        super(Conv1D, self).__init__()
+        self.n_in = n_in
+        self.n_out = n_out
+        if zero_out:
+            w = t.zeros(n_in, n_out)  # zero weight: the layer initially outputs only the (zero) bias
+        else:
+            w = t.empty(n_in, n_out)
+            nn.init.normal_(w, std=0.02 * init_scale)
+        b = t.zeros(n_out)
+        self.w = nn.Parameter(w)
+        self.b = nn.Parameter(b)
+
+    def forward(self, x):
+        size_out = (*x.size()[:-1], self.n_out)  # same leading dims as x, last dim replaced by n_out
+        x = t.addmm(self.b.type_as(x), x.view(-1, x.size(-1)), self.w.type_as(x)) # Parameters are cast to x's dtype: float if x is float, else half
+        x = x.view(*size_out)
+        return x
+
+# For large contexts, masks can take up memory, so you can make a single saved mask for all layers
+class Mask(nn.Module):
+    def __init__(self, n_ctx):
+        super().__init__()
+        self.register_buffer('b', t.tril(t.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))  # lower-triangular ones, shape (1, 1, n_ctx, n_ctx)
+
+    def forward(self, w):
+        w = w * self.b + -1e9 * (1 - self.b)  # Entries where b is 0 become -1e9. For fp16 do w = w.float().masked_fill(self.b, float('-inf'))
+        return w
+
+def filter_logits(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
+    """ Return a copy of logits filtered with top-k or nucleus (top-p) filtering; at most one of the two may be enabled.
+        Args:
+            logits: logits, filtered along the last dim (vocabulary); removed entries are set to filter_value
+            top_k >0: keep only top k tokens with highest probability (top-k filtering).
+            top_p >0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
+    """
+    #assert logits.dim() == 2  # batch size 1 for now - could be updated for more but the code would be less clear
+    logits = logits.clone()  # never modify the caller's tensor
+    top_k = min(top_k, logits.size(-1))  # Safety check
+    assert (top_k == 0) or (top_p == 0.0)
+    if top_k > 0:
+        # Remove all tokens with a probability less than the last token of the top-k
+        indices_to_remove = logits < t.topk(logits, top_k, dim=-1)[0][..., -1:]
+        logits[indices_to_remove] = filter_value
+
+    if top_p > 0.0:
+        sorted_logits, sorted_indices = t.sort(logits, descending=True, dim=-1)
+        cumulative_probs = t.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
+
+        # Remove tokens with cumulative probability above the threshold
+        sorted_indices_to_remove = cumulative_probs > top_p
+        # Shift the indices to the right to keep also the first token above the threshold
+        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
+        sorted_indices_to_remove[..., 0] = 0
+
+        # Scatter the mask from sorted order back to vocabulary order; the target must share src's dtype for scatter_
+        indices_to_remove = t.zeros_like(sorted_indices_to_remove).scatter_(dim=-1, index=sorted_indices, src=sorted_indices_to_remove)
+        logits[indices_to_remove] = filter_value
+    return logits
diff --git a/jukebox/transformer/transformer.py b/jukebox/transformer/transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..b78cbb4d141729b3046e10554febea04405436d2
--- /dev/null
+++ b/jukebox/transformer/transformer.py
@@ -0,0 +1,239 @@
+import functools
+import numpy as np
+import torch as t
+import torch.nn as nn
+import jukebox.utils.dist_adapter as dist
+
+from jukebox.transformer.ops import Conv1D, ACT_FNS, LayerNorm
+from jukebox.transformer.factored_attention import FactoredAttention
+from jukebox.utils.checkpoint import checkpoint
+
+def _convert_mlp_traced(l):  # if l is a ResAttnBlock, replace its mlp with a jit trace (needs CUDA)
+    if isinstance(l, ResAttnBlock):
+        l.mlp = t.jit.trace(l.mlp, t.randn(1, 1, l.n_in).cuda())  # traced with a float32 example input
+
+def _convert_mlp_traced_fp16(l):  # same as _convert_mlp_traced, but traced with a half-precision example input
+    if isinstance(l, ResAttnBlock):
+        l.mlp = t.jit.trace(l.mlp, t.randn(1, 1, l.n_in).cuda().half())
+
+class MLP(nn.Module):  # two Conv1D projections (n_in -> n_state -> n_in) with an activation in between
+    def __init__(self, n_in, n_state, resid_dropout=0.0, afn='quick_gelu', zero_out=False, init_scale=1.0):
+        super().__init__()
+        self.c_fc = Conv1D(n_in, n_state, init_scale=init_scale)
+        self.c_proj = Conv1D(n_state, n_in, zero_out, init_scale=init_scale)  # zero_out applies to the output projection only
+        self.act = ACT_FNS[afn]  # KeyError for an unknown activation name
+        self.resid_dropout = nn.Dropout(resid_dropout) if resid_dropout > 0.0 else lambda x: x  # identity when dropout is disabled
+
+    def forward(self, x):
+        m = self.act(self.c_fc(x))
+        m = self.c_proj(m)
+        return self.resid_dropout(m)
+
+class ResAttnBlock(nn.Module):  # residual block: h = x + a + m, with a = attn(ln_0(x)) and m = mlp(ln_1(x + a))
+    def __init__(self, n_in, n_ctx, n_head,
+                 attn_dropout=0.0, resid_dropout=0.0,
+                 afn='quick_gelu', scale=True, mask=False,
+                 zero_out=False, init_scale=1.0, res_scale=1.0,
+                 m_attn = 0.25, m_mlp = 1.,
+                 checkpoint_attn = 0, checkpoint_mlp = 0,
+                 attn_func=0, blocks=None, spread=None,
+                 encoder_dims=None, prime_len=None):
+        super().__init__()
+        self.attn = FactoredAttention(n_in=n_in, n_ctx=n_ctx, n_state=int(m_attn * n_in), n_head=n_head,  # attention width is m_attn * n_in
+                                      attn_dropout=attn_dropout, resid_dropout=resid_dropout,
+                                      scale=scale, mask=mask,
+                                      zero_out=zero_out, init_scale=init_scale,
+                                      checkpoint_attn=checkpoint_attn,
+                                      attn_func=attn_func, blocks=blocks, spread=spread,
+                                      encoder_dims=encoder_dims, prime_len=prime_len)
+        self.ln_0 = LayerNorm(n_in)
+        self.mlp = MLP(n_in=n_in, n_state=int(m_mlp * n_in),  # MLP hidden width is m_mlp * n_in
+                       resid_dropout=resid_dropout,
+                       afn=afn,
+                       zero_out=zero_out, init_scale=init_scale)
+        self.ln_1 = LayerNorm(n_in)
+        self.res_scale = res_scale
+
+        self.checkpoint_attn = checkpoint_attn
+        self.checkpoint_mlp = checkpoint_mlp
+        self.n_in = n_in
+        self.attn_func = attn_func
+
+    def forward(self, x, encoder_kv, sample=False):
+        if sample:
+            a = self.attn(self.ln_0(x), encoder_kv, sample)  # sampling path: never checkpointed
+            m = self.mlp(self.ln_1(x + a))
+        else:
+            if self.attn_func == 6:
+                assert encoder_kv is not None  # attn_func 6 requires encoder key/values
+                a = checkpoint(lambda _x,_enc_kv,_s=sample: self.attn(self.ln_0(_x),_enc_kv,_s),
+                               (x,encoder_kv),
+                               (*self.attn.parameters(), *self.ln_0.parameters()),
+                               self.checkpoint_attn == 3)  # 2 recomputes after the projections, and 1 recomputes after head splitting.
+            else:
+                assert encoder_kv is None  # every other attn_func must not receive encoder key/values
+                a = checkpoint(lambda _x,_enc_kv=None,_s=sample: self.attn(self.ln_0(_x),_enc_kv,_s),
+                               (x,),
+                               (*self.attn.parameters(), *self.ln_0.parameters()),
+                               self.checkpoint_attn == 3)  # 2 recomputes after the projections, and 1 recomputes after head splitting.
+            m = checkpoint(lambda _x: self.mlp(self.ln_1(_x)), (x + a,),
+                           (*self.mlp.parameters(), *self.ln_1.parameters()),
+                           self.checkpoint_mlp == 1)
+        if self.res_scale == 1.0:
+            h = x + a + m
+        else:
+            h = x + self.res_scale * (a + m)  # both residual branches are scaled by res_scale
+        return h
+
+class Transformer(nn.Module):  # stack of n_depth ResAttnBlocks whose attention pattern per layer is chosen by attn_order
+    def __init__(self, n_in, n_ctx, n_head, n_depth,
+                 attn_dropout=0.0, resid_dropout=0.0,
+                 afn='quick_gelu', scale=True, mask=False,
+                 zero_out=False, init_scale=1.0, res_scale=False,
+                 m_attn=0.25, m_mlp=1.,
+                 checkpoint_attn=0, checkpoint_mlp=0, checkpoint_res=0,
+                 attn_order=0, blocks=None, spread=None,
+                 encoder_dims=None, prime_len=None):
+        super().__init__()
+        self.n_in = n_in
+        self.n_ctx = n_ctx
+        self.encoder_dims = encoder_dims
+        self.blocks = blocks
+        if blocks is not None:
+            assert n_ctx % blocks == 0
+            self.block_ctx = n_ctx // blocks
+        self.prime_len = prime_len
+        self.n_head = n_head
+
+        res_scale = 1.0 / n_depth if res_scale else 1.0  # boolean flag turned into the actual residual scale
+
+        # Orders of attn_func
+        attn_func = {0: lambda d: 0,                    # Complete dense attn
+                     1: lambda d: [1,2][d%2],           # Alternate row and column attn
+                     2: lambda d: [1,2,3][d % 3],       # Alternate row, column and previous row attn
+                     3: lambda d: [1,4][d % 2],         # Alternate row and last column
+                     4: lambda d: [1,5][d % 2],         # Alternate row and last k columns
+                     5: lambda d: [1,4,1,1][d % 4],      # Alternate row, last column, row, row
+                     6: lambda d: [1,2,3,6][d % 4],
+                     7: lambda d: [*[1,2,3]*5,6][d%16],
+                     8: lambda d: [1,2,3,1,2,3,1,2,3,6][d%10], # Used by separated_enc_dec model with lyrics
+                     9: lambda d: [1,2,3,0][d % 4],
+                     10: lambda d: [*[1,2,3,1,2,3,1,2,3],*[1,2,3,1,2,3,1,2,3,6]*7][d%79], # Used by large separated_enc_dec model with lyrics
+                     11: lambda d: [6,6,0][d%3] if d%16 == 15 else [1,2,3][d%3],
+                     12: lambda d: [7,7,0][d%3] if d%16 == 15 else [1,2,3][d%3], # Used by single_enc_dec model with lyrics
+                     }[attn_order]
+
+        attn_cycle = {0:1, 1:2, 2:3, 3:2, 4:2, 5:4, 6:4, 7:16, 8:10, 9:4, 10:79, 11:16, 12:16}[attn_order]
+        #assert n_depth % attn_cycle == 0, f'Depth {n_depth} not a multiple of cycle {attn_cycle} for attn_order {attn_order}'
+
+        attn_block = lambda d: ResAttnBlock(n_in=n_in, n_ctx=n_ctx, n_head=n_head,
+                                  attn_dropout=attn_dropout, resid_dropout=resid_dropout,
+                                  afn=afn, scale=scale, mask=mask,
+                                  zero_out=zero_out if attn_func(d) !=6 else True,  # attn_func 6 layers are always zero-initialised
+                                  init_scale=init_scale, res_scale=res_scale,
+                                  m_attn=m_attn, m_mlp=m_mlp,
+                                  checkpoint_attn=checkpoint_attn, checkpoint_mlp=checkpoint_mlp,
+                                  attn_func=attn_func(d), blocks=blocks, spread=spread,
+                                  encoder_dims=encoder_dims, prime_len=prime_len)
+
+        self.checkpoint_res = checkpoint_res
+        self._attn_mods = nn.ModuleList()
+        for d in range(n_depth):
+            self._attn_mods.append(attn_block(d))
+        self.ws = []  # attention weights collected by forward() for layers with record_attn set
+
+
+    def set_record_attn(self, record_attn):
+        """
+        Arguments:
+            record_attn (bool or set): Makes forward prop dump self-attention
+                softmaxes to self.ws. Either a set of layer indices indicating
+                which layers to store, or a boolean value indicating whether to
+                dump all.
+        """
+        def _should_record_attn(layer_idx):
+            if isinstance(record_attn, bool):
+                return record_attn
+            return layer_idx in record_attn
+        for i, l in enumerate(self._attn_mods):
+            l.attn.record_attn = _should_record_attn(i)
+        if record_attn:
+            assert self.ws == []  # recording must start from a clean state
+            for l in self._attn_mods:
+                assert l.attn.w is None  # identity check: w may hold a tensor, so avoid == against None
+        else:
+            self.ws = []
+            for l in self._attn_mods:
+                l.attn.w = None
+
+    def forward(self, x, encoder_kv=None, sample=False, fp16=False, fp16_out=False):
+        if fp16:
+            x = x.half()
+
+        # Blocks
+        for i,l in enumerate(self._attn_mods):
+            if self.checkpoint_res == 1 and not sample:  # whole-block checkpointing, never used while sampling
+                if l.attn_func == 6:
+                    assert encoder_kv is not None
+                    f = functools.partial(l, sample=sample)
+                    x = checkpoint(f, (x, encoder_kv), l.parameters(), True)
+                else:
+                    f = functools.partial(l, encoder_kv=None, sample=sample)
+                    x = checkpoint(f, (x,), l.parameters(), True)
+            else:
+                if l.attn_func == 6:
+                    x = l(x, encoder_kv=encoder_kv, sample=sample)  # only attn_func 6 layers receive encoder_kv
+                else:
+                    x = l(x, encoder_kv=None, sample=sample)
+            if l.attn.record_attn:
+                self.ws.append(l.attn.w)
+        if not fp16_out:
+            x = x.float()  # the output is float32 unless fp16_out is requested
+        return x
+
+    def check_cache(self, n_samples, sample_t, fp16):
+        for l in self._attn_mods:
+            l.attn.check_cache(n_samples, sample_t, fp16)
+
+    def del_cache(self):
+        for l in self._attn_mods:
+            l.attn.del_cache()
+
+    def check_sample(self):  # self-test (needs CUDA): sampling in 4 chunks must match sampling the whole context at once
+        bs, l, s, d = (4, self.n_ctx, self.encoder_dims, self.n_in)
+        prime = 5
+        with t.no_grad():
+            encoder_kv = t.randn(bs, s, d).cuda()
+            x = t.randn(bs, l, d).cuda()
+            y_forw = self.forward(x, encoder_kv=encoder_kv, sample=True)
+
+            self.del_cache()
+            x_chunks = t.chunk(x, 4, dim=1)
+            y_chunks = []
+            n = 0
+            for x_chunk in x_chunks:
+                self.check_cache(bs, n, False)
+                y_chunk = self.forward(x_chunk, encoder_kv=encoder_kv, sample=True)
+                y_chunks.append(y_chunk)
+                n += x_chunk.shape[1]
+            self.check_cache(bs, n, False)
+            y_forw_in_chunks = t.cat(y_chunks, dim=1)
+
+            max_err = t.max(t.abs(y_forw - y_forw_in_chunks))
+            assert max_err <= 1e-6, f"Max err is {max_err} {[i for i in range(l) if t.max(t.abs(y_forw - y_forw_in_chunks)[:, i, :]) > 1e-6]}"
+
+
+if __name__ == '__main__':  # manual self-test: runs Transformer.check_sample for a few attn_order values (needs CUDA)
+    from jukebox.utils.dist_utils import setup_dist_from_mpi
+    setup_dist_from_mpi(port=29600)
+    n_in = 16
+    n_ctx = 192
+    n_head = 4
+    n_depth = 12
+    blocks = 16
+    for attn_order in [0,2,6]:
+        encoder_dims = {0: 0, 2: 0, 6: 64}[attn_order]  # only attn_order 6 gets a non-zero encoder length
+        prior = Transformer(n_in, n_ctx, n_head, n_depth, mask=True, attn_order=attn_order, encoder_dims=encoder_dims, blocks=blocks).cuda()
+        prior.training = False
+        prior.check_sample()
+        print(f"Checked attn_order: {attn_order}")
diff --git a/jukebox/utils/__init__.py b/jukebox/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/jukebox/utils/audio_utils.py b/jukebox/utils/audio_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..dec7b0184ee5b4ce5ac7f6e031c15c7bd9437240
--- /dev/null
+++ b/jukebox/utils/audio_utils.py
@@ -0,0 +1,148 @@
+import numpy as np
+import torch as t
+import jukebox.utils.dist_adapter as dist
+import soundfile
+import librosa
+from jukebox.utils.dist_utils import print_once
+
+class DefaultSTFTValues:  # fixed STFT settings; only the sample rate is taken from hps
+    def __init__(self, hps):
+        self.sr = hps.sr
+        self.n_fft = 2048
+        self.hop_length = 256
+        self.window_size = 6 * self.hop_length  # 1536 with the hop length above
+
+class STFTValues:  # caller-supplied STFT settings; only the sample rate is taken from hps
+    def __init__(self, hps, n_fft, hop_length, window_size):
+        self.sr = hps.sr
+        self.n_fft = n_fft
+        self.hop_length = hop_length
+        self.window_size = window_size
+
+def calculate_bandwidth(dataset, hps, duration=600):  # returns dict(l2=variance, l1=mean |x|, spec=mean spectrogram norm)
+    hps = DefaultSTFTValues(hps)  # from here on hps only carries the default STFT settings
+    n_samples = int(dataset.sr * duration)
+    l1, total, total_sq, n_seen, idx = 0.0, 0.0, 0.0, 0.0, dist.get_rank()  # each rank starts at its own index
+    spec_norm_total, spec_nelem = 0.0, 0.0
+    while n_seen < n_samples:
+        x = dataset[idx]
+        if isinstance(x, (tuple, list)):
+            x, y = x  # only the first element of a two-element item is used
+        samples = x.astype(np.float64)
+        stft = librosa.core.stft(np.mean(samples, axis=1), n_fft=hps.n_fft, hop_length=hps.hop_length, win_length=hps.window_size)  # n_fft by keyword: newer librosa rejects it positionally
+        spec = np.absolute(stft)
+        spec_norm_total += np.linalg.norm(spec)
+        spec_nelem += 1
+        n_seen += int(np.prod(samples.shape))
+        l1 += np.sum(np.abs(samples))
+        total += np.sum(samples)
+        total_sq += np.sum(samples ** 2)
+        idx += max(16, dist.get_world_size())  # stride through the dataset
+
+    if dist.is_available():
+        from jukebox.utils.dist_utils import allreduce
+        n_seen = allreduce(n_seen)
+        total = allreduce(total)
+        total_sq = allreduce(total_sq)
+        l1 = allreduce(l1)
+        spec_nelem = allreduce(spec_nelem)
+        spec_norm_total = allreduce(spec_norm_total)
+
+    mean = total / n_seen
+    bandwidth = dict(l2 = total_sq / n_seen - mean ** 2,
+                     l1 = l1 / n_seen,
+                     spec = spec_norm_total / spec_nelem)
+    print_once(bandwidth)
+    return bandwidth
+
+def audio_preprocess(x, hps):
+    # Extra layer in case we want to experiment with different preprocessing
+    # For two channel, blend randomly into mono (standard is .5 left, .5 right)
+
+    # x: NTC
+    x = x.float()
+    if x.shape[-1]==2:
+        if hps.aug_blend:
+            mix=t.rand((x.shape[0],1), device=x.device) #np.random.rand()  # one random blend weight per batch item
+        else:
+            mix = 0.5
+        x=(mix*x[:,:,0]+(1-mix)*x[:,:,1])
+    elif x.shape[-1]==1:
+        x=x[:,:,0]
+    else:
+        assert False, f'Expected channels {hps.channels}. Got unknown {x.shape[-1]} channels'  # only 1 or 2 channels are handled
+
+    # x: NT -> NTC
+    x = x.unsqueeze(2)
+    return x
+
+def audio_postprocess(x, hps):  # identity: returns x unchanged; hps is unused
+    return x
+
+def stft(sig, hps):  # torch STFT with a Hann window; NOTE(review): no return_complex is passed - confirm the targeted torch version
+    return t.stft(sig, hps.n_fft, hps.hop_length, win_length=hps.window_size, window=t.hann_window(hps.window_size, device=sig.device))
+
+def spec(x, hps):  # 2-norm of stft(x) over its last dim (presumably the real/imag axis - TODO confirm)
+    return t.norm(stft(x, hps), p=2, dim=-1)
+
+def norm(x):  # per-item L2 norm: everything after dim 0 is flattened, result has shape (x.shape[0],)
+    return (x.view(x.shape[0], -1) ** 2).sum(dim=-1).sqrt()
+
+def squeeze(x):  # reduce a 3-D input with 1 or 2 entries in the last dim to 2-D by averaging over that dim
+    if len(x.shape) == 3:
+        assert x.shape[-1] in [1,2]
+        x = t.mean(x, -1)
+    if len(x.shape) != 2:
+        raise ValueError(f'Unknown input shape {x.shape}')  # anything that is not 2-D by now is rejected
+    return x
+
+def spectral_loss(x_in, x_out, hps):  # per-item norm of the spectrogram difference, using the default STFT settings
+    hps = DefaultSTFTValues(hps)
+    spec_in = spec(squeeze(x_in.float()), hps)
+    spec_out = spec(squeeze(x_out.float()), hps)
+    return norm(spec_in - spec_out)
+
+def multispectral_loss(x_in, x_out, hps):
+    # Mean, over all configured STFT resolutions, of the per-item spectrogram-difference norm.
+    n_ffts = hps.multispec_loss_n_fft
+    hop_lengths = hps.multispec_loss_hop_length
+    window_sizes = hps.multispec_loss_window_size
+    assert len(n_ffts) == len(hop_lengths) == len(window_sizes)
+    per_resolution = []
+    for n_fft, hop_length, window_size in zip(n_ffts, hop_lengths, window_sizes):
+        stft_hps = STFTValues(hps, n_fft, hop_length, window_size)
+        diff = spec(squeeze(x_in.float()), stft_hps) - spec(squeeze(x_out.float()), stft_hps)
+        per_resolution.append(norm(diff))
+    return sum(per_resolution) / len(per_resolution)
+
+def spectral_convergence(x_in, x_out, hps, epsilon=2e-3):  # per-item residual spectrogram norm relative to the input's norm
+    hps = DefaultSTFTValues(hps)
+    spec_in = spec(squeeze(x_in.float()), hps)
+    spec_out = spec(squeeze(x_out.float()), hps)
+
+    gt_norm = norm(spec_in)
+    residual_norm = norm(spec_in - spec_out)
+    mask = (gt_norm > epsilon).float()  # items whose input norm is at most epsilon yield 0
+    return (residual_norm * mask) / t.clamp(gt_norm, min=epsilon)  # clamp keeps the division away from zero
+
+def log_magnitude_loss(x_in, x_out, hps, epsilon=1e-4):  # mean absolute difference of log spectrograms (scalar)
+    hps = DefaultSTFTValues(hps)
+    spec_in = t.log(spec(squeeze(x_in.float()), hps) + epsilon)  # epsilon keeps the log finite at zero magnitude
+    spec_out = t.log(spec(squeeze(x_out.float()), hps) + epsilon)
+    return t.mean(t.abs(spec_in - spec_out))
+
+def load_audio(file, sr, offset, duration, mono=False):  # offset and duration are divided by sr before being passed to librosa
+    # Librosa loads more filetypes than soundfile
+    x, _ = librosa.load(file, sr=sr, mono=mono, offset=offset/sr, duration=duration/sr)
+    if len(x.shape) == 1:
+        x = x.reshape((1, -1))  # a 1-D result is given a leading dim of size 1
+    return x
+
+
+def save_wav(fname, aud, sr):
+    # Clamp to [-1, 1], convert to a CPU numpy array, then write one wav per leading-dim item under fname.
+    clipped = t.clamp(aud, -1, 1).cpu().numpy()
+    for i in range(clipped.shape[0]):
+        soundfile.write(f'{fname}/item_{i}.wav', clipped[i], samplerate=sr, format='wav')
+
+
diff --git a/jukebox/utils/checkpoint.py b/jukebox/utils/checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..cfbcccb34d415a50f6bc7ef86334215d54779c18
--- /dev/null
+++ b/jukebox/utils/checkpoint.py
@@ -0,0 +1,32 @@
+# Simple gradient checkpointing. Works with distributed data parallel
+import torch as t
+
+def checkpoint(func, inputs, params, flag):  # run func(*inputs); with flag set, go through CheckpointFunction instead
+    if flag:
+        args = inputs + tuple(params)  # inputs must be a tuple for this concatenation to work
+        return CheckpointFunction.apply(func, len(inputs), *args)
+    else:
+        return func(*inputs)
+
+class CheckpointFunction(t.autograd.Function):  # forward runs without grad; backward re-runs run_function with grad enabled
+    @staticmethod
+    def forward(ctx, run_function, length, *args):
+        ctx.run_function = run_function
+        ctx.input_tensors = list(args[:length])  # the first `length` args are the function inputs
+        ctx.input_params = list(args[length:])  # the remaining args are the parameters
+        with t.no_grad():
+            output_tensors = ctx.run_function(*ctx.input_tensors)
+        return output_tensors
+
+    @staticmethod
+    def backward(ctx, *output_grads):
+        for i in range(len(ctx.input_tensors)):
+            temp = ctx.input_tensors[i]
+            ctx.input_tensors[i] = temp.detach()  # detached copy that keeps the original requires_grad flag
+            ctx.input_tensors[i].requires_grad = temp.requires_grad
+        with t.enable_grad():
+            output_tensors = ctx.run_function(*ctx.input_tensors)
+        input_grads = t.autograd.grad(output_tensors, ctx.input_tensors + ctx.input_params, output_grads, allow_unused=True)
+        del ctx.input_tensors
+        del output_tensors
+        return (None, None) + input_grads  # no gradients for run_function and length
diff --git a/jukebox/utils/dist_adapter.py b/jukebox/utils/dist_adapter.py
new file mode 100644
index 0000000000000000000000000000000000000000..b67af947b3ac4f7e37e843b37d0499fc1ea5e7ef
--- /dev/null
+++ b/jukebox/utils/dist_adapter.py
@@ -0,0 +1,86 @@
+import torch.distributed as dist
+from enum import Enum
+
+class ReduceOp(Enum):  # backend-independent reduce operations; every member value is a plain int
+    SUM = 0
+    PRODUCT = 1
+    MIN = 2
+    MAX = 3
+
+    def ToDistOp(self):  # map this member to the matching torch.distributed ReduceOp
+        return {
+            ReduceOp.SUM: dist.ReduceOp.SUM,
+            ReduceOp.PRODUCT: dist.ReduceOp.PRODUCT,
+            ReduceOp.MIN: dist.ReduceOp.MIN,
+            ReduceOp.MAX: dist.ReduceOp.MAX
+        }[self]
+
+def is_available():  # whatever torch.distributed.is_available() reports
+    return dist.is_available()
+
+def get_rank():  # rank from torch.distributed when available, otherwise 0
+    if is_available():
+        return _get_rank()
+    else:
+        return 0
+
+def get_world_size():  # world size from torch.distributed when available, otherwise 1
+    if is_available():
+        return _get_world_size()
+    else:
+        return 1
+
+def barrier():  # torch.distributed barrier when available
+    if is_available():
+        return _barrier()
+    # else: no-op, implicitly returns None
+
+def all_gather(tensor_list, tensor):  # torch.distributed all_gather when available
+    if is_available():
+        return _all_gather(tensor_list, tensor)
+    else:
+        tensor_list[0] = tensor  # fallback: the only entry written is the caller's own tensor
+
+def all_reduce(tensor, op=ReduceOp.SUM):  # torch.distributed all_reduce when available; op is this module's ReduceOp
+    if is_available():
+        return _all_reduce(tensor, op)
+    # else: no-op, tensor is left unchanged
+
+def reduce(tensor, dst, op=ReduceOp.SUM):  # torch.distributed reduce to rank dst when available
+    if is_available():
+        return _reduce(tensor, dst, op)
+    # else: no-op, tensor is left unchanged
+
+def broadcast(tensor, src):  # torch.distributed broadcast from rank src when available
+    if is_available():
+        return _broadcast(tensor, src)
+    # else: no-op, tensor is left unchanged
+
+def init_process_group(backend, init_method):  # torch.distributed init_process_group when available
+    if is_available():
+        return _init_process_group(backend, init_method)
+    # else: no-op, implicitly returns None
+
+def _get_rank():  # direct call into torch.distributed, no availability check
+    return dist.get_rank()
+
+def _barrier():  # direct call into torch.distributed, no availability check
+    return dist.barrier()
+
+def _get_world_size():  # direct call into torch.distributed, no availability check
+    return dist.get_world_size()
+
+def _all_gather(tensor_list, tensor):  # direct call into torch.distributed, no availability check
+    return dist.all_gather(tensor_list, tensor)
+
+def _all_reduce(tensor, op):  # op is this module's ReduceOp, converted with ToDistOp()
+    return dist.all_reduce(tensor, op.ToDistOp())
+
+def _reduce(tensor, dst, op):  # op is this module's ReduceOp, converted with ToDistOp()
+    return dist.reduce(tensor, dst, op.ToDistOp())
+
+def _broadcast(tensor, src):  # direct call into torch.distributed, no availability check
+    return dist.broadcast(tensor, src)
+
+def _init_process_group(backend, init_method):  # direct call into torch.distributed, no availability check
+    return dist.init_process_group(backend, init_method)
\ No newline at end of file
diff --git a/jukebox/utils/dist_utils.py b/jukebox/utils/dist_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec67fa1db02ecb08fa1bf33d550979d509e9ad14
--- /dev/null
+++ b/jukebox/utils/dist_utils.py
@@ -0,0 +1,101 @@
+import os
+from time import sleep
+import torch
+import jukebox.utils.dist_adapter as dist
+
+def print_once(msg):
+    """Print `msg` on rank 0 only (or unconditionally when dist is unavailable)."""
+    if not (dist.is_available() and dist.get_rank() != 0): print(msg)
+
+def print_all(msg):
+    """Print `msg` from every rank divisible by 8, prefixed with rank//8; plain print without dist."""
+    rank = dist.get_rank() if dist.is_available() else None
+    if rank is None: print(msg)
+    elif rank % 8 == 0: print(f'{rank//8}: {msg}')
+
+def allgather(x):
+    """All-gather `x` from every rank and concatenate the pieces along dim 0."""
+    gathered = [torch.empty_like(x) for _ in range(dist.get_world_size())]
+    dist.all_gather(gathered, x)
+    return torch.cat(gathered, dim=0)
+
+def allreduce(x, op=dist.ReduceOp.SUM):  # Reduce a value across ranks and return it as a Python number
+    x = torch.tensor(x).float().cuda()  # wrap as a float CUDA tensor (requires a CUDA device)
+    dist.all_reduce(x, op=op)  # return value ignored; x is read back below, so the result is expected in x
+    return x.item()
+
+def allgather_lists(xs):
+    """Gather variable-length lists from all ranks into one list of lists (uses CUDA tensors)."""
+    bs = len(xs)
+    total_bs = dist.get_world_size()*bs
+    lengths = torch.tensor([len(x) for x in xs], dtype=torch.long, device='cuda')  # fix: was `t.long`; `t` is undefined here
+    lengths = allgather(lengths)
+    assert lengths.shape == (total_bs,)
+    max_length = torch.max(lengths).item()
+    # Zero-pad every local list to the global max length so the gathered tensor is rectangular.
+    xs = torch.tensor([[*x, *[0]*(max_length - len(x))] for x in xs], device='cuda')
+    assert xs.shape == (bs, max_length), f'Expected {(bs, max_length)}, got {xs.shape}'
+    xs = allgather(xs)
+    assert xs.shape == (total_bs,max_length), f'Expected {(total_bs, max_length)}, got {xs.shape}'
+    return [xs[i][:lengths[i]].cpu().numpy().tolist() for i in range(total_bs)]
+
+def setup_dist_from_mpi(
+    master_addr="127.0.0.1", backend="nccl", port=29500, n_attempts=5, verbose=False
+):
+    """Set up distributed state; returns (mpi_rank, local_rank, device).
+
+    Delegates to _setup_dist_from_mpi when dist is available; otherwise acts as a single process (rank 0).
+    """
+    if dist.is_available():
+        return _setup_dist_from_mpi(master_addr, backend, port, n_attempts, verbose)
+    use_cuda = torch.cuda.is_available()
+    print(f'Using cuda {use_cuda}')
+    mpi_rank, local_rank = 0, 0
+    device = torch.device("cuda", local_rank) if use_cuda else torch.device("cpu")
+    if use_cuda:  # fix: selecting a CUDA device is only valid when CUDA is present
+        torch.cuda.set_device(local_rank)
+    return mpi_rank, local_rank, device
+
+def _setup_dist_from_mpi(master_addr, backend, port, n_attempts, verbose):
+    """Initialise the process group from MPI rank/size; returns (mpi_rank, local_rank, device).
+
+    Raises RuntimeError if init_process_group fails with RuntimeError on all `n_attempts` tries.
+    """
+    from mpi4py import MPI  # This must be imported in order to get errors from all ranks to show up
+
+    mpi_rank = MPI.COMM_WORLD.Get_rank()
+    mpi_size = MPI.COMM_WORLD.Get_size()
+
+    # Rank/size/master settings for the "env://" init method below, plus NCCL environment settings.
+    os.environ["RANK"] = str(mpi_rank)
+    os.environ["WORLD_SIZE"] = str(mpi_size)
+    os.environ["MASTER_ADDR"] = master_addr
+    os.environ["MASTER_PORT"] = str(port)
+    os.environ["NCCL_LL_THRESHOLD"] = "0"
+    os.environ["NCCL_NSOCKS_PERTHREAD"] = "2"
+    os.environ["NCCL_SOCKET_NTHREADS"] = "8"
+
+    # Pin this rank to a specific GPU on the node (the % 8 assumes 8 GPUs per node)
+    use_cuda = torch.cuda.is_available()
+    local_rank = mpi_rank % 8
+    device = torch.device("cuda", local_rank) if use_cuda else torch.device("cpu")
+    if use_cuda:
+        torch.cuda.set_device(local_rank)
+
+    if verbose:
+        print(f"Connecting to master_addr: {master_addr}")
+
+    # There is a race condition when initializing NCCL with a large number of ranks (e.g 500 ranks)
+    # We guard against the failure and then retry
+    for attempt_idx in range(n_attempts):
+        try:
+            dist.init_process_group(backend=backend, init_method="env://")
+            assert dist.get_rank() == mpi_rank
+            print(f'Using cuda {use_cuda}')
+            # Fix: the original re-ran torch.cuda.set_device here unguarded, which breaks CPU-only runs.
+            return mpi_rank, local_rank, device
+        except RuntimeError as e:
+            print(f"Caught error during NCCL init (attempt {attempt_idx} of {n_attempts}): {e}")
+            sleep(1 + (0.01 * mpi_rank))  # Sleep to avoid thundering herd
+
+    raise RuntimeError("Failed to initialize NCCL")
diff --git a/jukebox/utils/ema.py b/jukebox/utils/ema.py
new file mode 100644
index 0000000000000000000000000000000000000000..94f3b47bfa6dfba9e0a932a1a3beeb54a0575a58
--- /dev/null
+++ b/jukebox/utils/ema.py
@@ -0,0 +1,94 @@
+import torch
+from torch._utils import _flatten_dense_tensors
+import numpy as np
+
+# EMA always in float, as accumulation needs lots of bits
+class EMA:
+    def __init__(self, params, mu=0.999):
+        self.mu = mu
+        self.state = [(p, self.get_model_state(p)) for p in params if p.requires_grad]
+
+    def get_model_state(self, p):
+        return p.data.float().detach().clone()
+
+    def step(self):
+        for p, state in self.state:
+            state.mul_(self.mu).add_(p.data.float(), alpha=1 - self.mu)  # alpha= form; add_(scalar, tensor) is deprecated
+
+    def swap(self):
+        """Exchange the EMA values and the live parameter values in place."""
+        for p, state in self.state:
+            other_state = self.get_model_state(p)
+            p.data.copy_(state.type_as(p.data))
+            state.copy_(other_state)
+
+
+class CPUEMA:
+    """EMA whose averages are stored as CPU numpy arrays and updated only every `freq` steps."""
+    def __init__(self, params, mu=0.999, freq=1):
+        self.mu = mu**freq  # one update every `freq` steps uses the compounded decay
+        self.state = [(p, self.get_model_state(p)) for p in params if p.requires_grad]
+        self.freq = freq
+        self.steps = 0
+
+    def get_model_state(self, p):
+        with torch.no_grad():
+            return p.data.float().detach().cpu().numpy()
+
+    def step(self):
+        with torch.no_grad():
+            self.steps += 1
+            if self.steps % self.freq == 0:
+                for i, (p, state) in enumerate(self.state):
+                    state = torch.from_numpy(state).cuda()
+                    # Keyword `alpha` replaces the deprecated add_(scalar, tensor) overload.
+                    state.mul_(self.mu).add_(p.data.float(), alpha=1 - self.mu)
+                    self.state[i] = (p, state.cpu().numpy())
+
+    def swap(self):
+        """Exchange the stored averages with the live parameter values."""
+        with torch.no_grad():
+            for p, state in self.state:
+                other_state = self.get_model_state(p)
+                p.data.copy_(torch.from_numpy(state).type_as(p.data))
+                np.copyto(state, other_state)
+
+class FusedEMA:
+    """EMA over flattened parameter groups: one float buffer for fp16 params, one for the rest."""
+    def __init__(self, params, mu=0.999):
+        self.mu = mu
+        params = list(params)
+        self.params = {}
+        self.params['fp16'] = [p for p in params if p.requires_grad and p.data.dtype == torch.float16]
+        self.params['fp32'] = [p for p in params if p.requires_grad and p.data.dtype != torch.float16]
+        self.groups = [group for group in self.params.keys() if len(self.params[group]) > 0]
+        self.state = {}
+        for group in self.groups:
+            self.state[group] = self.get_model_state(group)
+
+    def get_model_state(self, group):
+        """Return the group's current parameter values, cast to float and flattened into one tensor."""
+        params = self.params[group]
+        return _flatten_dense_tensors([p.data.float() for p in params])
+
+    def step(self):
+        """Update each buffer: state <- mu * state + (1 - mu) * current params."""
+        for group in self.groups:
+            # Keyword `alpha` replaces the deprecated add_(scalar, tensor) overload.
+            self.state[group].mul_(self.mu).add_(self.get_model_state(group), alpha=1 - self.mu)
+
+    def swap(self):
+        """Exchange the EMA values and the live parameter values, group by group."""
+        for group in self.groups:
+            other_state = self.get_model_state(group)
+            state = self.state[group]
+            params = self.params[group]
+            offset = 0
+            for p in params:
+                numel = p.data.numel()
+                p.data = state.narrow(0, offset, numel).view_as(p.data).type_as(p.data)
+                offset += numel
+
+            self.state[group] = other_state
+
+
diff --git a/jukebox/utils/fp16.py b/jukebox/utils/fp16.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb31c4ee5728d8fc5f72b9aed0d1f89735cea48e
--- /dev/null
+++ b/jukebox/utils/fp16.py
@@ -0,0 +1,303 @@
+# Utils for fp16 training.
+import importlib
+import math
+import numpy as np
+import torch
+import jukebox.utils.dist_adapter as dist
+from torch.optim import Optimizer
+from torch._utils import _flatten_dense_tensors
+
+from jukebox.utils.dist_utils import allreduce
+
+def adam_step(p: torch.Tensor, out_p: torch.Tensor, exp_avg: torch.Tensor, exp_avg_sq: torch.Tensor, grad: torch.Tensor,
+              lr: float, beta1: float, beta2: float, eps: float, scale: float, step: int, eps_mode: int, bias_correction: int, weight_decay: float):
+    """Pure-PyTorch Adam update, used as fused_adam_step when the fused kernel is unavailable.
+
+    Updates `p`, `exp_avg` and `exp_avg_sq` in place; `out_p` is accepted but unused here.
+    Only bias_correction == 1 and eps_mode == 1 are supported (asserted).
+    """
+    assert bias_correction == 1
+    assert eps_mode == 1
+    # Undo loss scaling. NOTE: .float() is a no-op for float32 grads, so div_ then edits them in place.
+    grad = grad.float().div_(scale)
+    # Decay the first and second moment running average coefficient
+    exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
+    exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
+    denom = exp_avg_sq.sqrt().add_(eps)
+    step_size = lr * math.sqrt(1 - beta2 ** step) / (1 - beta1 ** step)
+    # Weight decay is folded into the update direction before scaling by -step_size.
+    p.add_(exp_avg/denom + weight_decay*p.float(), alpha=-step_size)
+
+# Use the `adam` entry point of the fused_adam_cuda module (apex) when it is importable; otherwise fall back to adam_step above.
+try:
+    fused_adam_cuda = importlib.import_module("fused_adam_cuda")
+    fused_adam_step = fused_adam_cuda.adam
+    print("Using apex fused_adam_cuda")  # runs at import time of this module
+except ModuleNotFoundError:
+    fused_adam_step = adam_step  # pure-PyTorch implementation, called with the same arguments
+
+def backward(loss, params, scalar, fp16, logger):
+    """Run backward with optional loss scaling; returns (loss, scale, grad_norm, overflow_loss, overflow_grad)."""
+    if not fp16:
+        scale = 1.0
+        loss.backward()
+        gn = grad_norm(params, scale)
+        return loss, scale, gn, False, False
+    else:
+        scale = scalar.get_scale()
+        loss = (loss.float())*scale
+        overflow_loss = check_overflow(loss.item())
+        overflow_loss = allreduce(int(overflow_loss), op=dist.ReduceOp.MAX) > 0  # MAX: an overflow on any rank counts for all
+        if not overflow_loss:
+            loss.backward()
+            gn = grad_norm(params, scale)
+            overflow_grad = check_overflow(gn)
+            overflow_grad = allreduce(int(overflow_grad), op=dist.ReduceOp.MAX) > 0
+            scalar.update_scale(overflow_grad)
+        else:
+            gn = 0.0  # no backward was run, so there is no gradient norm to report
+            overflow_grad = True
+        loss = (loss.detach().float()) / scale # Should delete computation graph for overflow
+        if logger.rank == 0:
+            if loss > 12.: print(f"\nWarning. Loss is {loss}")
+            if overflow_loss: print(f"\nOverflow in forward. Loss {loss}, lgscale {np.log2(scale)}. Skipping batch completely (no backward, scale update)")
+            elif overflow_grad: print(f"\nOverflow in backward. Loss {loss}, grad norm {gn}, lgscale {np.log2(scale)}, new lgscale {np.log2(scalar.get_scale())}")
+        return loss, scale, gn, overflow_loss, overflow_grad
+
+# Automatic loss scaling
+class LossScalar(object):
+    """Loss scale for fp16 training: fixed when `loss_scale` is given, dynamic when it is None."""
+    def __init__(self,
+                 loss_scale,
+                 init_scale=2. ** 16,
+                 scale_factor=2. ** (1. / 1000),
+                 scale_window=1):
+        # None selects dynamic scaling starting from init_scale (identity test instead of `== None`).
+        self.dynamic = loss_scale is None
+        self.loss_scale = init_scale if self.dynamic else loss_scale
+        self.max_loss_scale = 2.**24
+        self.scale_factor = scale_factor
+        self.scale_window = scale_window
+        self.unskipped = 0
+        self.overflow = False
+
+    def get_scale(self):
+        return self.loss_scale
+
+    def update_scale(self, overflow):
+        """Halve the scale on overflow; grow it by scale_factor after scale_window clean steps.
+        Both adjustments apply only in dynamic mode; a static scale never changes.
+        """
+        if overflow and self.dynamic:
+            self.loss_scale /= 2.
+            self.unskipped = 0
+        else:
+            self.unskipped += 1
+
+        if self.unskipped == self.scale_window and self.dynamic:
+            self.loss_scale = min(self.max_loss_scale, self.loss_scale * self.scale_factor)
+            self.unskipped = 0
+
+def check_overflow(val):  # True when val is +inf, -inf or NaN
+    return (val == float('inf')) or (val == -float('inf')) or (val != val)  # `val != val` holds only for NaN
+
+def grad_norm(params, scale, flat=False):
+    """Return the global L2 norm of all existing gradients, divided by `scale`.
+    flat=True flattens fp16 and non-fp16 grads into two tensors; otherwise squared norms are
+    accumulated one parameter at a time."""
+    params = list(params)
+    if flat:
+        # Faster but more memory
+        fp16_grads = [p.grad for p in params if p.grad is not None and p.data.dtype == torch.float16]
+        fp16_norm = 0.0 if len(fp16_grads) == 0 else float(_flatten_dense_tensors(fp16_grads).norm(p=2, dtype=torch.float32))
+        fp32_grads = [p.grad for p in params if p.grad is not None and p.data.dtype != torch.float16]
+        fp32_norm = 0.0 if len(fp32_grads) == 0 else float(_flatten_dense_tensors(fp32_grads).norm(p=2))
+        total = (fp16_norm**2 + fp32_norm**2)**0.5
+    else:
+        # Slightly slower but less memory
+        sq_sum = sum(p.grad.norm(p=2, dtype=torch.float32)**2 for p in params if p.grad is not None)
+        total = float(sq_sum**0.5)
+    return total / scale
+
+def clipped_grad_scale(grad_norm, max_grad_norm, scale):
+    """Return `scale`, multiplied by grad_norm/max_grad_norm when that ratio exceeds 1."""
+    # Presumably the result is later used to divide gradients, capping their norm — confirm at the call site.
+    ratio = grad_norm / max_grad_norm
+    return ratio * scale if ratio > 1 else scale
+
+class FP16FusedAdam(Optimizer):  # Adam that stores fp16 params' moment buffers as rescaled half tensors
+    def __init__(
+        self,
+        params,
+        lr=1e-3,
+        bias_correction=True,
+        betas=(0.9, 0.999),
+        eps=1e-8,
+        eps_inside_sqrt=False,
+        weight_decay=0.0,
+        amsgrad=False,
+    ):
+        if amsgrad:
+            raise RuntimeError("FusedAdam does not support the AMSGrad variant.")
+        defaults = dict(
+            lr=lr, bias_correction=bias_correction, betas=betas, eps=eps, weight_decay=weight_decay
+        )
+        super(FP16FusedAdam, self).__init__(params, defaults)
+        self.eps_mode = 0 if eps_inside_sqrt else 1
+        self.FLOAT16_MAX = 65504.0  # target magnitude used when rescaling the moments for half storage
+        self.init_state()
+
+    def init_state(self):  # eagerly creates per-parameter state; every parameter must require grad
+        for group in self.param_groups:
+            for p in group["params"]:
+                assert p.requires_grad == True
+                state = self.state[p]
+                if len(state) == 0:
+                    state["step"] = 0
+                    # Exponential moving average of gradient values
+                    state["exp_avg"] = torch.zeros_like(p.data)
+                    # Exponential moving average of squared gradient values
+                    state["exp_avg_sq"] = torch.zeros_like(p.data)
+                    if p.data.dtype == torch.float16:
+                        state["scale_exp_avg"] = 1.0  # multipliers that map the stored half moments back to real values
+                        state["scale_exp_avg_sq"] = 1.0
+
+    def step(self, closure=None, scale=1.0):
+        """Performs a single optimization step. Scales gradients down by scale
+        Arguments:
+            closure (callable, optional): A closure that reevaluates the model
+                and returns the loss.
+            scale (float, optional): factor to divide gradient tensor values
+                by before applying to weights. (default: 1)
+        """
+        loss = None
+        if closure is not None:
+            loss = closure()
+
+        for group in self.param_groups:
+            bias_correction = 1 if group["bias_correction"] else 0
+
+            for p in group["params"]:
+                if p.grad is None:
+                    continue
+                grad = p.grad.data
+
+                state = self.state[p]
+                # fp16 params: rebuild float moments from the stored half tensors and their scale factors
+                if p.data.dtype == torch.float16:
+                    exp_avg, exp_avg_sq = (
+                        state["exp_avg"].float() * state["scale_exp_avg"],
+                        state["exp_avg_sq"].float() * state["scale_exp_avg_sq"],
+                    )
+                else:
+                    exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
+                beta1, beta2 = group["betas"]
+
+                state["step"] += 1
+                # out_p: empty placeholder argument; the pure-PyTorch adam_step ignores it
+                out_p = torch.tensor([], dtype=torch.float)
+                fused_adam_step(
+                    p.data,
+                    out_p,
+                    exp_avg,
+                    exp_avg_sq,
+                    grad,
+                    group["lr"],
+                    beta1,
+                    beta2,
+                    group["eps"],
+                    scale,
+                    state["step"],
+                    self.eps_mode,
+                    bias_correction,
+                    group["weight_decay"],
+                )
+                # Re-quantise: rescale by the inf-norm so the largest entry is about FLOAT16_MAX, then store as half
+                if p.data.dtype == torch.float16:
+                    state["scale_exp_avg"] = (
+                        1e-8 + float(torch.norm(exp_avg, float("inf"))) / self.FLOAT16_MAX
+                    )
+                    state["scale_exp_avg_sq"] = (
+                        1e-8 + float(torch.norm(exp_avg_sq, float("inf"))) / self.FLOAT16_MAX
+                    )
+                    state["exp_avg"] = (exp_avg / state["scale_exp_avg"]).half()
+                    state["exp_avg_sq"] = (exp_avg_sq / state["scale_exp_avg_sq"]).half()
+
+        return loss
+
+
+class FusedAdam(Optimizer):  # Adam via fused_adam_step, keeping moment buffers as float tensors
+    def __init__(
+        self,
+        params,
+        lr=1e-3,
+        bias_correction=True,
+        betas=(0.9, 0.999),
+        eps=1e-8,
+        eps_inside_sqrt=False,
+        weight_decay=0.0,
+        amsgrad=False,
+    ):
+        if amsgrad:
+            raise RuntimeError("FusedAdam does not support the AMSGrad variant.")
+        defaults = dict(
+            lr=lr, bias_correction=bias_correction, betas=betas, eps=eps, weight_decay=weight_decay
+        )
+        super(FusedAdam, self).__init__(params, defaults)
+        self.eps_mode = 0 if eps_inside_sqrt else 1  # passed through to fused_adam_step as eps_mode
+
+    def step(self, closure=None, scale=1.0):
+        """Performs a single optimization step. Scales gradients down by scale
+        Arguments:
+            closure (callable, optional): A closure that reevaluates the model
+                and returns the loss.
+            scale (float, optional): factor to divide gradient tensor values
+                by before applying to weights. (default: 1)
+        """
+        loss = None
+        if closure is not None:
+            loss = closure()
+
+        for group in self.param_groups:
+            bias_correction = 1 if group["bias_correction"] else 0
+
+            for p in group["params"]:
+                if p.grad is None:
+                    continue
+                grad = p.grad.data
+
+                state = self.state[p]
+
+                # State initialization (lazy: on the first step in which this parameter has a gradient)
+                if len(state) == 0:
+                    state["step"] = 0
+                    # Exponential moving average of gradient values
+                    state["exp_avg"] = torch.zeros_like(p.data).float()
+                    # Exponential moving average of squared gradient values
+                    state["exp_avg_sq"] = torch.zeros_like(p.data).float()
+
+                exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
+                beta1, beta2 = group["betas"]
+
+                state["step"] += 1
+                # out_p: empty placeholder argument; the pure-PyTorch adam_step ignores it
+                out_p = torch.tensor([], dtype=torch.float)
+                fused_adam_step(
+                    p.data,
+                    out_p,
+                    exp_avg,
+                    exp_avg_sq,
+                    grad,
+                    group["lr"],
+                    beta1,
+                    beta2,
+                    group["eps"],
+                    scale,
+                    state["step"],
+                    self.eps_mode,
+                    bias_correction,
+                    group["weight_decay"],
+                )
+
+        return loss
+
diff --git a/jukebox/utils/io.py b/jukebox/utils/io.py
new file mode 100644
index 0000000000000000000000000000000000000000..15ef46db96ab3d27ee85254788ef4b3c2046ecdd
--- /dev/null
+++ b/jukebox/utils/io.py
@@ -0,0 +1,136 @@
+import numpy as np
+import av
+import torch as t
+import jukebox.utils.dist_adapter as dist
+
+def get_duration_sec(file, cache=False):
+    """Return the duration of `file` in seconds, preferring a cached `<file>.dur` sidecar."""
+    try:
+        with open(file + '.dur', 'r') as f:
+            return float(f.readline().strip('\n'))
+    except Exception:  # fix: was a bare except, which also swallowed KeyboardInterrupt/SystemExit
+        container = av.open(file)
+        audio = container.streams.get(audio=0)[0]
+        duration = audio.duration * float(audio.time_base)
+        if cache:
+            with open(file + '.dur', 'w') as f:
+                f.write(str(duration) + '\n')
+        return duration
+
+def load_audio(file, sr, offset, duration, resample=True, approx=False, time_base='samples', check_duration=True):
+    """Load audio at target sr as stereo: seek to `offset`, stop after `duration`.
+
+    offset/duration are in samples at rate `sr` (seconds if time_base == 'sec'). Returns (sig, sr);
+    sig is a float32 array of shape (2, duration) whose unread tail, if any, stays zero.
+    """
+    if time_base == 'sec':
+        offset, duration = offset * sr, duration * sr
+    container = av.open(file)
+    audio = container.streams.get(audio=0)[0] # Only first audio stream
+    audio_duration = audio.duration * float(audio.time_base)
+    if approx:
+        if offset + duration > audio_duration*sr:
+            # Move back one window, capped at the end (builtin min: np.min(a, b) would read b as `axis`)
+            offset = min(audio_duration*sr - duration, offset - duration)
+    elif check_duration:
+        assert offset + duration <= audio_duration*sr, f'End {offset + duration} beyond duration {audio_duration*sr}'
+    if resample:
+        resampler = av.AudioResampler(format='fltp',layout='stereo', rate=sr)
+    else:
+        assert sr == audio.sample_rate
+    offset = int(offset / sr / float(audio.time_base)) # Use units of time_base for seeking
+    duration = int(duration) # Use units of time_out ie 1/sr for returning
+    sig = np.zeros((2, duration), dtype=np.float32)
+    container.seek(offset, stream=audio)
+    total_read = 0
+    for frame in container.decode(audio=0): # Only first audio stream
+        if resample:
+            frame.pts = None
+            frame = resampler.resample(frame)
+        frame = frame.to_ndarray(format='fltp') # Convert to floats and not int16
+        read = min(frame.shape[-1], duration - total_read)  # never copy past the requested length
+        sig[:, total_read:total_read + read] = frame[:, :read]
+        total_read += read
+        if total_read == duration:
+            break
+    assert total_read <= duration, f'Expected {duration} frames, got {total_read}'
+    return sig, sr
+
+def test_simple_loader():  # Manual smoke test: loads two 6 s clips per file through load_audio and moves them to the GPU
+    import librosa
+    from tqdm import tqdm
+
+    collate_fn = lambda batch: t.stack([t.from_numpy(b) for b in batch], dim=0)
+
+    def get_batch(file, loader):
+        y1, sr = loader(file, sr=44100, offset=0.0, duration=6.0, time_base='sec')
+        y2, sr = loader(file, sr=44100, offset=20.0, duration=6.0, time_base='sec')
+        return [y1, y2]
+
+    def load(file, loader):
+        batch = get_batch(file, loader)  # np
+        x = collate_fn(batch)  # torch cpu
+        x = x.to('cuda', non_blocking=True)  # torch gpu
+        return x
+
+    files = librosa.util.find_files('/root/data/', ['mp3', 'm4a', 'opus'])  # hard-coded data root
+    print(files[:10])
+    loader = load_audio
+    print("Loader", loader.__name__)
+    x = t.randn(2, 2).cuda()  # presumably a CUDA warm-up before the loop — confirm
+    x = load(files[0], loader)
+    for i,file in enumerate(tqdm(files)):
+        x = load(file, loader)
+        if i == 100:  # stop after files 0..100
+            break
+
+def test_dataset_loader():  # Manual smoke test: writes one batch of audio to TensorBoard before and after pre/post-processing
+    from tqdm import tqdm
+    from torch.utils.data import DataLoader
+    from torch.utils.data.distributed import DistributedSampler
+    from jukebox.utils.audio_utils import audio_preprocess, audio_postprocess
+    from jukebox.hparams import setup_hparams
+    from jukebox.data.files_dataset import FilesAudioDataset
+    hps = setup_hparams("teeny", {})
+    hps.sr = 22050  # 44100
+    hps.hop_length = 512
+    hps.labels = False
+    hps.channels = 2
+    hps.aug_shift = False
+    hps.bs = 2
+    hps.nworkers = 2 # Getting 20 it/s with 2 workers, 10 it/s with 1 worker
+    print(hps)
+    dataset = hps.dataset
+    root = hps.root
+    from tensorboardX import SummaryWriter
+    sr = {22050: '22k', 44100: '44k', 48000: '48k'}[hps.sr]  # short label used in the log path; KeyError for other rates
+    writer = SummaryWriter(f'{root}/{dataset}/logs/{sr}/logs')
+    dataset = FilesAudioDataset(hps)  # NOTE: rebinds `dataset` from the name taken from hps to the dataset object
+    print("Length of dataset", len(dataset))
+
+    # Torch Loader
+    collate_fn = lambda batch: t.stack([t.from_numpy(b) for b in batch], 0)
+    sampler = DistributedSampler(dataset)
+    train_loader = DataLoader(dataset, batch_size=hps.bs, num_workers=hps.nworkers, pin_memory=False, sampler=sampler,
+                              drop_last=True, collate_fn=collate_fn)
+
+    dist.barrier()
+    sampler.set_epoch(0)
+    for i, x in enumerate(tqdm(train_loader)):
+        x = x.to('cuda', non_blocking=True)
+        for j, aud in enumerate(x):
+            writer.add_audio('in_' + str(i*hps.bs + j), aud, 1, hps.sr)
+        print("Wrote in")
+        x = audio_preprocess(x, hps)
+        x = audio_postprocess(x, hps)
+        for j, aud in enumerate(x):
+            writer.add_audio('out_' + str(i*hps.bs + j), aud, 1, hps.sr)
+        print("Wrote out")
+        dist.barrier()
+        break  # only the first batch is processed
+
+if __name__ == '__main__':  # script entry: set up distributed state, then run the dataset smoke test
+    from jukebox.utils.dist_utils import setup_dist_from_mpi
+    setup_dist_from_mpi(port=29500)
+    test_dataset_loader()
+
diff --git a/jukebox/utils/logger.py b/jukebox/utils/logger.py
new file mode 100644
index 0000000000000000000000000000000000000000..df6fedc16fa0b5407c1e0479f0d762df282772ea
--- /dev/null
+++ b/jukebox/utils/logger.py
@@ -0,0 +1,147 @@
+import torch as t
+import jukebox.utils.dist_adapter as dist
+from tqdm import tqdm
+from datetime import date
+import os
+import sys
+
+def def_tqdm(x):  # Project-default tqdm: writes to stdout with a compact "n/total [elapsed<remaining, rate]" bar
+    return tqdm(x, leave=True, file=sys.stdout, bar_format="{n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}]")
+
+def get_range(x):
+    """Wrap `x` in the default tqdm bar on rank 0; every other rank gets `x` unchanged."""
+    if dist.get_rank() != 0:
+        return x
+    return def_tqdm(x)
+
+def init_logging(hps, local_rank, rank):
+    """Create the run's log directory (on local rank 0) and return (Logger, Metrics)."""
+    logdir = f"{hps.local_logdir}/{hps.name}"
+    if local_rank == 0:
+        os.makedirs(logdir, exist_ok=True)  # exist_ok avoids the check-then-create race
+        # Fix: join with a separator so argv.txt lands inside logdir, not beside it as '<name>argv.txt'.
+        with open(os.path.join(logdir, 'argv.txt'), 'w') as f:
+            f.write(hps.argv + '\n')
+        print("Logging to", logdir)
+    logger = Logger(logdir, rank)
+    logger.add_text('hps', str(hps))
+    return logger, Metrics()
+
+def get_name(hps):
+    """Build a name string from `hps` by concatenating 'key_value_' for every item.
+
+    Items appear in hps.items() order; an empty mapping gives ''."""
+    return "".join(f"{key}_{value}_" for key, value in hps.items())
+
+def average_metrics(_metrics):
+    """Average a list of metric dicts key by key; each mean is over the dicts that have the key."""
+    grouped = {}
+    for entry in _metrics:
+        for key, val in entry.items():
+            # setdefault creates the per-key list the first time a key is seen
+            grouped.setdefault(key, []).append(val)
+    return {key: sum(vals)/len(vals) for key, vals in grouped.items()}
+
+class Metrics:
+    """Running per-tag totals and counts, aggregated across ranks (update() needs CUDA)."""
+    def __init__(self):
+        self.sum = {}
+        self.n = {}
+
+    def update(self, tag, val, batch):
+        """Record `val`, the average over `batch` items, and return this step's all-rank average."""
+        # store total value and total batch, returns dist average
+        total = t.tensor(val * batch).float().cuda()
+        count = t.tensor(batch).float().cuda()
+        # Combine numerator and denominator across ranks (dist_adapter.all_reduce defaults to SUM).
+        dist.all_reduce(total)
+        dist.all_reduce(count)
+        total, count = total.item(), count.item()
+        self.sum[tag] = self.sum.get(tag, 0.0) + total
+        self.n[tag] = self.n.get(tag, 0.0) + count
+        return total / count
+
+    def avg(self, tag):
+        """Return the running average for `tag`, or 0.0 if the tag was never updated."""
+        return self.sum[tag] / self.n[tag] if tag in self.sum else 0.0
+
+    def reset(self):
+        """Forget all accumulated totals and counts."""
+        self.sum = {}
+        self.n = {}
+
+class Logger:
+    """TensorBoard logging facade: only rank 0 owns a SummaryWriter, other ranks no-op."""
+    def __init__(self, logdir, rank):
+        if rank == 0:
+            from tensorboardX import SummaryWriter
+            self.sw = SummaryWriter(f"{logdir}/logs")
+        self.iters = 0
+        self.rank = rank
+        self.works = []
+        self.logdir = logdir
+
+    def step(self):
+        self.iters += 1
+
+    def flush(self):
+        if self.rank == 0:
+            self.sw.flush()
+
+    def add_text(self, tag, text):
+        if self.rank == 0:
+            self.sw.add_text(tag, text, self.iters)
+
+    def add_audios(self, tag, auds, sample_rate=22050, max_len=None, max_log=8):
+        """Log up to max_log clips as '{i}/{tag}', each cut to max_len * sample_rate entries if max_len is set."""
+        if self.rank == 0:
+            for i in range(min(len(auds), max_log)):
+                if max_len:
+                    self.sw.add_audio(f"{i}/{tag}", auds[i][:max_len * sample_rate], self.iters, sample_rate)
+                else:
+                    self.sw.add_audio(f"{i}/{tag}", auds[i], self.iters, sample_rate)
+
+    def add_audio(self, tag, aud, sample_rate=22050):
+        if self.rank == 0:
+            self.sw.add_audio(tag, aud, self.iters, sample_rate)
+
+    def add_images(self, tag, img, dataformats="NHWC"):
+        if self.rank == 0:
+            self.sw.add_images(tag, img, self.iters, dataformats=dataformats)
+
+    def add_image(self, tag, img):
+        if self.rank == 0:
+            self.sw.add_image(tag, img, self.iters)
+
+    def add_scalar(self, tag, val):
+        if self.rank == 0:
+            self.sw.add_scalar(tag, val, self.iters)
+
+    def get_range(self, loader):
+        """Return enumerate(loader), showing a tqdm bar on rank 0; the iterable is kept in self.trange."""
+        self.trange = def_tqdm(loader) if self.rank == 0 else loader
+        return enumerate(self.trange)
+
+    def close_range(self):
+        if self.rank == 0:
+            self.trange.close()
+
+    def set_postfix(self, *args, **kwargs):
+        if self.rank == 0:
+            self.trange.set_postfix(*args, **kwargs)
+
+    # For logging summaries of various graph ops
+    def add_reduce_scalar(self, tag, layer, val):
+        if self.iters % 100 == 0:
+            with t.no_grad():
+                val = val.float().norm()/float(val.numel())
+            # Fix: dist_adapter.reduce(tensor, dst, op) takes no async_op, so reduce synchronously.
+            self.works.append((tag, layer, val, dist.reduce(val, 0)))
+
+    def finish_reduce(self):
+        for tag, layer, val, work in self.works:
+            if work is not None: work.wait()  # a synchronous reduce may return no handle to wait on
+            if self.rank == 0:
+                # Fix: `self.lw` was never created; log via the one writer, keyed by layer.
+                self.sw.add_scalar(f"{layer}/{tag}", val.item()/dist.get_world_size(), self.iters)
+        self.works = []
diff --git a/jukebox/utils/remote_utils.py b/jukebox/utils/remote_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..7bdf953fb1c0a2a87f3a637fce9787e84ff696c5
--- /dev/null
+++ b/jukebox/utils/remote_utils.py
@@ -0,0 +1,42 @@
+import sys
+import subprocess
+
+def download(remote_path, local_path, async_download=False):
+    """Fetch `remote_path` to `local_path` with wget; async mode does not wait for completion."""
+    args = ['wget', '-O', local_path, remote_path]
+    print("Running ", " ".join(args))
+    # Popen starts the process without waiting; call blocks until it exits (exit status is not checked).
+    launch = subprocess.Popen if async_download else subprocess.call
+    launch(args)
+
+# GCE
+def gs_download(gs_path, local_path, async_download=False):
+    """Copy `gs_path` to `local_path` with `gsutil cp`; async mode does not wait for completion."""
+    args = ['gsutil',
+            '-o', 'GSUtil:parallel_thread_count=1',
+            '-o', 'GSUtil:sliced_object_download_max_components=8',
+            'cp', gs_path, local_path]
+    # Popen starts the process without waiting; call blocks until it exits (exit status is not checked).
+    launch = subprocess.Popen if async_download else subprocess.call
+    launch(args)
+
+
+def gs_upload(local_path, gs_path, async_upload=False):
+    """Copy `local_path` to `gs_path` with `gsutil cp -n`; async mode does not wait for completion."""
+    # NOTE: Download and upload use different -o flags.
+    # We also use -n to prevent clobbering checkpoints by mistake
+    assert not local_path.startswith("gs://")
+    assert gs_path.startswith("gs://")
+    args = ['gsutil',
+            '-o', 'GSUtil:parallel_composite_upload_threshold=150M',
+            'cp', '-n', local_path, gs_path]
+    # Popen starts the process without waiting; call blocks until it exits (exit status is not checked).
+    launch = subprocess.Popen if async_upload else subprocess.call
+    launch(args)
+
+def ls(regex):
+    """Return the non-empty lines printed by `gsutil ls <regex>`."""
+    outputs = subprocess.check_output(['gsutil', 'ls', regex]).decode(sys.stdout.encoding)
+    # Fix: filter by truthiness; `is not ''` compared identity with a literal, not equality.
+    return [output for output in outputs.split('\n') if output]
+
diff --git a/jukebox/utils/sample_utils.py b/jukebox/utils/sample_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ae41b16e5dbacb5179df77d68c2923acbf54f41
--- /dev/null
+++ b/jukebox/utils/sample_utils.py
@@ -0,0 +1,22 @@
+import torch as t
+
+def split_batch(obj, n_samples, split_size):
+    """Split `obj` into chunks of `split_size` along dim 0: a tensor, a list of tensors, or None."""
+    n_passes = (n_samples + split_size - 1) // split_size  # ceil(n_samples / split_size)
+    if obj is None:
+        return [None] * n_passes
+    if isinstance(obj, t.Tensor):
+        return t.split(obj, split_size, dim=0)
+    if isinstance(obj, list):
+        return list(zip(*[t.split(item, split_size, dim=0) for item in obj]))
+    raise TypeError('Unknown input type')
+
+# Break total_length into hops/windows of size n_ctx separated by hop_length
+def get_starts(total_length, n_ctx, hop_length):
+    """Return window start offsets, `hop_length` apart, each window spanning n_ctx.
+
+    A start whose window would reach or pass total_length is pulled back to total_length - n_ctx.
+    """
+    # Last hop could be smaller, we make it n_ctx to maximise context
+    last = total_length - n_ctx
+    return [min(start, last) for start in range(0, last + hop_length, hop_length)]
diff --git a/jukebox/utils/torch_utils.py b/jukebox/utils/torch_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d02081443ef60441593b398bede8c93d1792fec
--- /dev/null
+++ b/jukebox/utils/torch_utils.py
@@ -0,0 +1,32 @@
+import gc
+import torch as t
+
+def freeze_model(model):
+    model.eval()  # switch the module to evaluation mode
+    for params in model.parameters():
+        params.requires_grad = False  # stop gradients being computed for every parameter
+
+
+def unfreeze_model(model):
+    model.train()  # switch the module back to training mode
+    for params in model.parameters():
+        params.requires_grad = True  # re-enable gradients for every parameter
+
+def zero_grad(model):
+    for p in model.parameters():
+        if p.requires_grad and p.grad is not None:
+            p.grad = None  # drop the gradient tensor instead of filling it with zeros
+
+def empty_cache():
+    gc.collect()  # collect unreachable Python objects first
+    t.cuda.empty_cache()  # then release the CUDA memory cached by PyTorch's allocator
+
+def assert_shape(x, exp_shape):
+    assert x.shape == exp_shape, f"Expected {exp_shape} got {x.shape}"  # plain assert: the check disappears under `python -O`
+
+def count_parameters(model):
+    return sum(p.numel() for p in model.parameters() if p.requires_grad)  # element count of the trainable parameters only
+
+def count_state(model):
+    return sum(s.numel() for s in model.state_dict().values())  # element count of every state_dict entry
+
diff --git a/jukebox/vqvae/__init__.py b/jukebox/vqvae/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/jukebox/vqvae/bottleneck.py b/jukebox/vqvae/bottleneck.py
new file mode 100644
index 0000000000000000000000000000000000000000..18720ec528067d3efbd808ec91a272c4c28be8a4
--- /dev/null
+++ b/jukebox/vqvae/bottleneck.py
@@ -0,0 +1,248 @@
+import numpy as np
+import torch as t
+import torch.nn as nn
+import torch.nn.functional as F
+import jukebox.utils.dist_adapter as dist
+
+class BottleneckBlock(nn.Module):  # vector-quantisation bottleneck whose codebook `k` is maintained by moving averages (see update_k)
+    def __init__(self, k_bins, emb_width, mu):
+        super().__init__()
+        self.k_bins = k_bins  # number of codebook entries
+        self.emb_width = emb_width  # width of each codebook vector
+        self.mu = mu  # decay factor of the moving averages in update_k
+        self.reset_k()
+        self.threshold = 1.0  # codes whose averaged count falls below this are re-seeded in update_k
+
+    def reset_k(self):
+        self.init = False  # codebook has not been initialised from data yet
+        self.k_sum = None
+        self.k_elem = None
+        self.register_buffer('k', t.zeros(self.k_bins, self.emb_width).cuda())  # NOTE(review): hard-coded .cuda() means construction needs a GPU
+
+    def _tile(self, x):
+        d, ew = x.shape
+        if d < self.k_bins:  # fewer rows than codes: repeat the rows and add small noise so the copies differ
+            n_repeats = (self.k_bins + d - 1) // d  # ceil(k_bins / d)
+            std = 0.01 / np.sqrt(ew)
+            x = x.repeat(n_repeats, 1)
+            x = x + t.randn_like(x) * std
+        return x
+
+    def init_k(self, x):
+        mu, emb_width, k_bins = self.mu, self.emb_width, self.k_bins
+        self.init = True
+        # init k_w using random vectors from x
+        y = self._tile(x)
+        _k_rand = y[t.randperm(y.shape[0])][:k_bins]  # k_bins rows of y chosen at random
+        dist.broadcast(_k_rand, 0)  # presumably makes every rank use rank 0's selection -- confirm in dist_adapter
+        self.k = _k_rand
+        assert self.k.shape == (k_bins, emb_width)
+        self.k_sum = self.k  # same tensor object as self.k, not a copy (restore_k clones instead)
+        self.k_elem = t.ones(k_bins, device=self.k.device)
+
+    def restore_k(self, num_tokens=None, threshold=1.0):
+        mu, emb_width, k_bins = self.mu, self.emb_width, self.k_bins
+        self.init = True
+        assert self.k.shape == (k_bins, emb_width)
+        self.k_sum = self.k.clone()  # rebuild the running sums from the current codebook
+        self.k_elem = t.ones(k_bins, device=self.k.device)
+        if num_tokens is not None:
+            expected_usage = num_tokens / k_bins  # tokens per code if usage were uniform
+            self.k_elem.data.mul_(expected_usage)
+            self.k_sum.data.mul_(expected_usage)  # scaled together, so k_sum / k_elem still equals k
+        self.threshold = threshold
+
+    def update_k(self, x, x_l):
+        mu, emb_width, k_bins = self.mu, self.emb_width, self.k_bins
+        with t.no_grad():
+            # Calculate new centres
+            x_l_onehot = t.zeros(k_bins, x.shape[0], device=x.device)  # k_bins, N * L
+            x_l_onehot.scatter_(0, x_l.view(1, x.shape[0]), 1)
+
+            _k_sum = t.matmul(x_l_onehot, x)  # k_bins, w
+            _k_elem = x_l_onehot.sum(dim=-1)  # k_bins
+            y = self._tile(x)
+            _k_rand = y[t.randperm(y.shape[0])][:k_bins]  # random rows of x used to re-seed under-used codes
+
+            dist.broadcast(_k_rand, 0)
+            dist.all_reduce(_k_sum)
+            dist.all_reduce(_k_elem)
+
+            # Update centres
+            old_k = self.k
+            self.k_sum = mu * self.k_sum + (1. - mu) * _k_sum  # k_bins, w
+            self.k_elem = mu * self.k_elem + (1. - mu) * _k_elem  # k_bins
+            usage = (self.k_elem.view(k_bins, 1) >= self.threshold).float()  # 1.0 for codes kept, 0.0 for codes replaced by _k_rand
+            self.k = usage * (self.k_sum.view(k_bins, emb_width) / self.k_elem.view(k_bins, 1)) \
+                     + (1 - usage) * _k_rand
+            _k_prob = _k_elem / t.sum(_k_elem)  # x_l_onehot.mean(dim=-1)  # prob of each bin
+            entropy = -t.sum(_k_prob * t.log(_k_prob + 1e-8))  # entropy ie how diverse
+            used_curr = (_k_elem >= self.threshold).sum()  # codes meeting the threshold in this batch alone
+            usage = t.sum(usage)  # number of codes kept according to the moving average
+            dk = t.norm(self.k - old_k) / np.sqrt(np.prod(old_k.shape))  # RMS change of the codebook in this update
+        return dict(entropy=entropy,
+                    used_curr=used_curr,
+                    usage=usage,
+                    dk=dk)
+
+    def preprocess(self, x):
+        # NCT -> NTC -> [NT, C]
+        x = x.permute(0, 2, 1).contiguous()
+        x = x.view(-1, x.shape[-1])  # x_en = (N * L, w), k_j = (w, k_bins)
+
+        if x.shape[-1] == self.emb_width:
+            prenorm = t.norm(x - t.mean(x)) / np.sqrt(np.prod(x.shape))  # RMS deviation from the global mean, returned for logging
+        elif x.shape[-1] == 2 * self.emb_width:
+            x1, x2 = x[...,:self.emb_width], x[...,self.emb_width:]
+            prenorm = (t.norm(x1 - t.mean(x1)) / np.sqrt(np.prod(x1.shape))) + (t.norm(x2 - t.mean(x2)) / np.sqrt(np.prod(x2.shape)))
+
+            # Normalise
+            x = x1 + x2  # the two halves are summed back to emb_width channels
+        else:
+            assert False, f"Expected {x.shape[-1]} to be (1 or 2) * {self.emb_width}"
+        return x, prenorm
+
+    def postprocess(self, x_l, x_d, x_shape):
+        # [NT, C] -> NTC -> NCT
+        N, T = x_shape
+        x_d = x_d.view(N, T, -1).permute(0, 2, 1).contiguous()
+        x_l = x_l.view(N, T)
+        return x_l, x_d
+
+    def quantise(self, x):
+        # Calculate latent code x_l
+        k_w = self.k.t()
+        distance = t.sum(x ** 2, dim=-1, keepdim=True) - 2 * t.matmul(x, k_w) + t.sum(k_w ** 2, dim=0,
+                                                                                            keepdim=True)  # (N * L, b)
+        min_distance, x_l = t.min(distance, dim=-1)  # x_l: index of the nearest code for every row
+        fit = t.mean(min_distance)  # mean squared distance to the chosen codes
+        return x_l, fit
+
+    def dequantise(self, x_l):
+        x = F.embedding(x_l, self.k)  # look up the codebook row for each index
+        return x
+
+    def encode(self, x):
+        N, width, T = x.shape
+
+        # Preprocess.
+        x, prenorm = self.preprocess(x)
+
+        # Quantise
+        x_l, fit = self.quantise(x)
+
+        # Postprocess.
+        x_l = x_l.view(N, T)
+        return x_l
+
+    def decode(self, x_l):
+        N, T = x_l.shape
+        width = self.emb_width
+
+        # Dequantise
+        x_d = self.dequantise(x_l)
+
+        # Postprocess
+        x_d = x_d.view(N, T, width).permute(0, 2, 1).contiguous()
+        return x_d
+
+    def forward(self, x, update_k=True):
+        N, width, T = x.shape
+
+        # Preprocess
+        x, prenorm = self.preprocess(x)
+
+        # Init k if not inited
+        if update_k and not self.init:
+            self.init_k(x)
+
+        # Quantise and dequantise through bottleneck
+        x_l, fit = self.quantise(x)
+        x_d = self.dequantise(x_l)
+
+        # Update embeddings
+        if update_k:
+            update_metrics = self.update_k(x, x_l)
+        else:
+            update_metrics = {}
+
+        # Loss
+        commit_loss = t.norm(x_d.detach() - x) ** 2 / np.prod(x.shape)  # mean squared error; gradient reaches x only
+
+        # Passthrough
+        x_d = x + (x_d - x).detach()  # value equals x_d, gradient flows straight to x
+
+        # Postprocess
+        x_l, x_d = self.postprocess(x_l, x_d, (N,T))
+        return x_l, x_d, commit_loss, dict(fit=fit,
+                                           pn=prenorm,
+                                           **update_metrics)
+
+
+class Bottleneck(nn.Module):  # one BottleneckBlock per level, applied level by level
+    def __init__(self, l_bins, emb_width, mu, levels):
+        super().__init__()
+        self.levels = levels
+        level_block = lambda level: BottleneckBlock(l_bins, emb_width, mu)  # every level gets an identically configured block
+        self.level_blocks = nn.ModuleList()
+        for level in range(self.levels):
+            self.level_blocks.append(level_block(level))
+
+    def encode(self, xs):
+        zs = [level_block.encode(x) for (level_block, x) in zip(self.level_blocks, xs)]  # xs[i] is encoded by level_blocks[i]
+        return zs
+
+    def decode(self, zs, start_level=0, end_level=None):
+        if end_level is None:
+            end_level = self.levels
+        xs_quantised = [level_block.decode(z) for (level_block, z) in zip(self.level_blocks[start_level:end_level], zs)]  # zs[0] pairs with level start_level
+        return xs_quantised
+
+    def forward(self, xs):
+        zs, xs_quantised, commit_losses, metrics = [], [], [], []
+        for level in range(self.levels):
+            level_block = self.level_blocks[level]
+            x = xs[level]
+            z, x_quantised, commit_loss, metric = level_block(x, update_k=self.training)  # the codebook is only updated in training mode
+            zs.append(z)
+            if not self.training:
+                # Be extra paranoid and make sure the encoder weights can't
+                # change from straight-through estimator
+                x_quantised = x_quantised.detach()
+            xs_quantised.append(x_quantised)
+            commit_losses.append(commit_loss)
+            if self.training:
+                metrics.append(metric)  # metrics stays an empty list in eval mode
+        return zs, xs_quantised, commit_losses, metrics
+
+class NoBottleneckBlock(nn.Module):
+    def restore_k(self, num_tokens=None, threshold=1.0):  # no codebook to restore; accepts the same arguments as BottleneckBlock.restore_k
+        pass
+
+class NoBottleneck(nn.Module):  # pass-through stand-in exposing the same methods as Bottleneck
+    def __init__(self, levels):
+        super().__init__()
+        self.level_blocks = nn.ModuleList()
+        self.levels = levels
+        for level in range(levels):
+            self.level_blocks.append(NoBottleneckBlock())
+
+    def encode(self, xs):
+        return xs  # inputs are returned unchanged
+
+    def decode(self, zs, start_level=0, end_level=None):
+        if end_level is None:
+            end_level = self.levels  # assigned for parity with Bottleneck.decode but not used below
+        return zs
+
+    def forward(self, xs):
+        zero = t.zeros(()).cuda()  # NOTE(review): hard-coded .cuda() means forward needs a GPU
+        commit_losses = [zero for _ in range(self.levels)]
+        metrics = [dict(entropy=zero, usage=zero, used_curr=zero, pn=zero, dk=zero) for _ in range(self.levels)]  # every entry is the same zero tensor object
+        return xs, xs, commit_losses, metrics
+
+if __name__ == '__main__':
+    from jukebox.utils.dist_utils import setup_dist_from_mpi
+    rank, local_rank, device = setup_dist_from_mpi(port=29600)
+    bottleneck = Bottleneck(256, 64, 0.99, 2).to(device)  # l_bins=256, emb_width=64, mu=0.99, levels=2
+    bottleneck.check()  # NOTE(review): Bottleneck defines no check() in this file, so this looks like an AttributeError -- confirm
diff --git a/jukebox/vqvae/encdec.py b/jukebox/vqvae/encdec.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec6dd4ae8ade9c7eedf8d8a75aec29b155b183de
--- /dev/null
+++ b/jukebox/vqvae/encdec.py
@@ -0,0 +1,131 @@
+import torch as t
+import torch.nn as nn
+from jukebox.vqvae.resnet import Resnet, Resnet1D
+from jukebox.utils.torch_utils import assert_shape
+
+class EncoderConvBlock(nn.Module):  # down_t strided Conv1d + Resnet1D stages followed by a projection conv
+    def __init__(self, input_emb_width, output_emb_width, down_t,
+                 stride_t, width, depth, m_conv,
+                 dilation_growth_rate=1, dilation_cycle=None, zero_out=False,
+                 res_scale=False):
+        super().__init__()
+        blocks = []
+        filter_t, pad_t = stride_t * 2, stride_t // 2  # kernel size and padding of each strided conv
+        if down_t > 0:  # with down_t == 0 nothing is added and self.model is an empty nn.Sequential
+            for i in range(down_t):
+                block = nn.Sequential(
+                    nn.Conv1d(input_emb_width if i == 0 else width, width, filter_t, stride_t, pad_t),  # only the first stage reads input_emb_width channels
+                    Resnet1D(width, depth, m_conv, dilation_growth_rate, dilation_cycle, zero_out, res_scale),
+                )
+                blocks.append(block)
+            block = nn.Conv1d(width, output_emb_width, 3, 1, 1)  # kernel 3, stride 1, padding 1: length-preserving projection
+            blocks.append(block)
+        self.model = nn.Sequential(*blocks)
+
+    def forward(self, x):
+        return self.model(x)
+
+class DecoderConvBock(nn.Module):  # an input conv followed by down_t Resnet1D + ConvTranspose1d upsampling stages
+    def __init__(self, input_emb_width, output_emb_width, down_t,
+                 stride_t, width, depth, m_conv, dilation_growth_rate=1, dilation_cycle=None, zero_out=False, res_scale=False, reverse_decoder_dilation=False, checkpoint_res=False):
+        super().__init__()
+        blocks = []
+        if down_t > 0:  # with down_t == 0 nothing is added and self.model is an empty nn.Sequential
+            filter_t, pad_t = stride_t * 2, stride_t // 2  # kernel size and padding of each transposed conv
+            block = nn.Conv1d(output_emb_width, width, 3, 1, 1)  # the block reads output_emb_width channels
+            blocks.append(block)
+            for i in range(down_t):
+                block = nn.Sequential(
+                    Resnet1D(width, depth, m_conv, dilation_growth_rate, dilation_cycle, zero_out=zero_out, res_scale=res_scale, reverse_dilation=reverse_decoder_dilation, checkpoint_res=checkpoint_res),
+                    nn.ConvTranspose1d(width, input_emb_width if i == (down_t - 1) else width, filter_t, stride_t, pad_t)  # only the last stage emits input_emb_width channels
+                )
+                blocks.append(block)
+        self.model = nn.Sequential(*blocks)
+
+    def forward(self, x):
+        return self.model(x)
+
+class Encoder(nn.Module):  # chain of EncoderConvBlocks; forward returns the output of every level
+    def __init__(self, input_emb_width, output_emb_width, levels, downs_t,
+                 strides_t, **block_kwargs):
+        super().__init__()
+        self.input_emb_width = input_emb_width
+        self.output_emb_width = output_emb_width
+        self.levels = levels
+        self.downs_t = downs_t
+        self.strides_t = strides_t
+
+        block_kwargs_copy = dict(**block_kwargs)
+        if 'reverse_decoder_dilation' in block_kwargs_copy:
+            del block_kwargs_copy['reverse_decoder_dilation']  # decoder-only option; EncoderConvBlock does not accept it
+        level_block = lambda level, down_t, stride_t: EncoderConvBlock(input_emb_width if level == 0 else output_emb_width,
+                                                           output_emb_width,
+                                                           down_t, stride_t,
+                                                           **block_kwargs_copy)
+        self.level_blocks = nn.ModuleList()
+        iterator = zip(list(range(self.levels)), downs_t, strides_t)
+        for level, down_t, stride_t in iterator:
+            self.level_blocks.append(level_block(level, down_t, stride_t))
+
+    def forward(self, x):
+        N, T = x.shape[0], x.shape[-1]
+        emb = self.input_emb_width
+        assert_shape(x, (N, emb, T))
+        xs = []  # one output per level
+
+        # 64, 32, ...
+        iterator = zip(list(range(self.levels)), self.downs_t, self.strides_t)
+        for level, down_t, stride_t in iterator:
+            level_block = self.level_blocks[level]
+            x = level_block(x)  # each level consumes the previous level's output
+            emb, T = self.output_emb_width, T // (stride_t ** down_t)  # expected length after down_t convs of stride stride_t
+            assert_shape(x, (N, emb, T))
+            xs.append(x)
+
+        return xs
+
+class Decoder(nn.Module):  # chain of DecoderConvBocks run from the last level to level 0, then an output conv
+    def __init__(self, input_emb_width, output_emb_width, levels, downs_t,
+                 strides_t, **block_kwargs):
+        super().__init__()
+        self.input_emb_width = input_emb_width
+        self.output_emb_width = output_emb_width
+        self.levels = levels
+
+        self.downs_t = downs_t
+
+        self.strides_t = strides_t
+
+        level_block = lambda level, down_t, stride_t: DecoderConvBock(output_emb_width,
+                                                          output_emb_width,
+                                                          down_t, stride_t,
+                                                          **block_kwargs)
+        self.level_blocks = nn.ModuleList()
+        iterator = zip(list(range(self.levels)), downs_t, strides_t)
+        for level, down_t, stride_t in iterator:
+            self.level_blocks.append(level_block(level, down_t, stride_t))
+
+        self.out = nn.Conv1d(output_emb_width, input_emb_width, 3, 1, 1)  # maps back to input_emb_width channels
+
+    def forward(self, xs, all_levels=True):
+        if all_levels:
+            assert len(xs) == self.levels
+        else:
+            assert len(xs) == 1
+        x = xs[-1]  # decoding starts from the last entry of xs
+        N, T = x.shape[0], x.shape[-1]
+        emb = self.output_emb_width
+        assert_shape(x, (N, emb, T))
+
+        # 32, 64 ...
+        iterator = reversed(list(zip(list(range(self.levels)), self.downs_t, self.strides_t)))
+        for level, down_t, stride_t in iterator:
+            level_block = self.level_blocks[level]
+            x = level_block(x)
+            emb, T = self.output_emb_width, T * (stride_t ** down_t)  # expected length after upsampling
+            assert_shape(x, (N, emb, T))
+            if level != 0 and all_levels:
+                x = x + xs[level - 1]  # add the next level's input before decoding it
+
+        x = self.out(x)
+        return x
diff --git a/jukebox/vqvae/resnet.py b/jukebox/vqvae/resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..137369e91bc2e3de5034d165dd092bb428470267
--- /dev/null
+++ b/jukebox/vqvae/resnet.py
@@ -0,0 +1,75 @@
+import math
+import torch.nn as nn
+import jukebox.utils.dist_adapter as dist
+from jukebox.utils.checkpoint import checkpoint
+
+class ResConvBlock(nn.Module):  # 2-D residual block: ReLU, 3x3 conv, ReLU, 1x1 conv, plus a skip connection
+    def __init__(self, n_in, n_state):
+        super().__init__()
+        self.model = nn.Sequential(
+            nn.ReLU(),
+            nn.Conv2d(n_in, n_state, 3, 1, 1),  # kernel 3, stride 1, padding 1
+            nn.ReLU(),
+            nn.Conv2d(n_state, n_in, 1, 1, 0),  # 1x1 conv back to n_in channels
+        )
+
+    def forward(self, x):
+        return x + self.model(x)
+
+class Resnet(nn.Module):  # n_depth ResConvBlocks applied in sequence
+    def __init__(self, n_in, n_depth, m_conv=1.0):
+        super().__init__()
+        self.model = nn.Sequential(*[ResConvBlock(n_in, int(m_conv * n_in)) for _ in range(n_depth)])  # hidden width is int(m_conv * n_in)
+
+    def forward(self, x):
+        return self.model(x)
+
+class ResConv1DBlock(nn.Module):  # 1-D residual block with a dilated 3-tap conv and a scaled residual branch
+    def __init__(self, n_in, n_state, dilation=1, zero_out=False, res_scale=1.0):
+        super().__init__()
+        padding = dilation  # padding == dilation keeps the length unchanged for a kernel of size 3
+        self.model = nn.Sequential(
+            nn.ReLU(),
+            nn.Conv1d(n_in, n_state, 3, 1, padding, dilation),
+            nn.ReLU(),
+            nn.Conv1d(n_state, n_in, 1, 1, 0),
+        )
+        if zero_out:
+            out = self.model[-1]
+            nn.init.zeros_(out.weight)  # zero the last conv so the block starts out as the identity
+            nn.init.zeros_(out.bias)
+        self.res_scale = res_scale
+
+    def forward(self, x):
+        return x + self.res_scale * self.model(x)
+
+class Resnet1D(nn.Module):  # stack of ResConv1DBlocks with per-depth dilation and optional checkpointing
+    def __init__(self, n_in, n_depth, m_conv=1.0, dilation_growth_rate=1, dilation_cycle=None, zero_out=False, res_scale=False, reverse_dilation=False, checkpoint_res=False):
+        super().__init__()
+        def _get_depth(depth):
+            if dilation_cycle is None:
+                return depth
+            else:
+                return depth % dilation_cycle  # the dilation exponent wraps around every dilation_cycle blocks
+        blocks = [ResConv1DBlock(n_in, int(m_conv * n_in),
+                                 dilation=dilation_growth_rate ** _get_depth(depth),
+                                 zero_out=zero_out,
+                                 res_scale=1.0 if not res_scale else 1.0 / math.sqrt(n_depth))  # res_scale is a flag here: truthy scales each branch by 1/sqrt(n_depth)
+                  for depth in range(n_depth)]
+        if reverse_dilation:
+            blocks = blocks[::-1]  # largest dilation first
+        self.checkpoint_res = checkpoint_res
+        if self.checkpoint_res == 1:
+            if dist.get_rank() == 0:
+                print("Checkpointing convs")
+            self.blocks = nn.ModuleList(blocks)  # kept as a list so forward can wrap each block separately
+        else:
+            self.model = nn.Sequential(*blocks)
+
+    def forward(self, x):
+        if self.checkpoint_res == 1:
+            for block in self.blocks:
+                x = checkpoint(block, (x, ), block.parameters(), True)  # project helper; presumably gradient checkpointing -- see jukebox.utils.checkpoint
+            return x
+        else:
+            return self.model(x)
diff --git a/jukebox/vqvae/vqvae.py b/jukebox/vqvae/vqvae.py
new file mode 100644
index 0000000000000000000000000000000000000000..3244b2b4535b5fe4ae35a7f83eb94a43104efd16
--- /dev/null
+++ b/jukebox/vqvae/vqvae.py
@@ -0,0 +1,228 @@
+import numpy as np
+import torch as t
+import torch.nn as nn
+
+from jukebox.vqvae.encdec import Encoder, Decoder, assert_shape
+from jukebox.vqvae.bottleneck import NoBottleneck, Bottleneck
+from jukebox.utils.logger import average_metrics
+from jukebox.utils.audio_utils import spectral_convergence, spectral_loss, multispectral_loss, audio_postprocess
+
+def dont_update(params):
+    for param in params:
+        param.requires_grad = False  # stop gradients being computed for each given parameter
+
+def update(params):
+    for param in params:
+        param.requires_grad = True  # re-enable gradients for each given parameter
+
+def calculate_strides(strides, downs):
+    return [stride ** down for stride, down in zip(strides, downs)]  # per-level factor: the stride applied `down` times
+
+def _loss_fn(loss_fn, x_target, x_pred, hps):
+    if loss_fn == 'l1':
+        return t.mean(t.abs(x_pred - x_target)) / hps.bandwidth['l1']  # mean absolute error, normalised
+    elif loss_fn == 'l2':
+        return t.mean((x_pred - x_target) ** 2) / hps.bandwidth['l2']  # mean squared error, normalised
+    elif loss_fn == 'linf':
+        residual = ((x_pred - x_target) ** 2).reshape(x_target.shape[0], -1)  # squared error flattened per batch item
+        values, _ = t.topk(residual, hps.linf_k, dim=1)  # the hps.linf_k largest squared errors of each item
+        return t.mean(values) / hps.bandwidth['l2']  # squared quantity, hence the 'l2' bandwidth
+    elif loss_fn == 'lmix':
+        loss = 0.0
+        if hps.lmix_l1:  # a term is included only when its weight is non-zero
+            loss += hps.lmix_l1 * _loss_fn('l1', x_target, x_pred, hps)
+        if hps.lmix_l2:
+            loss += hps.lmix_l2 * _loss_fn('l2', x_target, x_pred, hps)
+        if hps.lmix_linf:
+            loss += hps.lmix_linf * _loss_fn('linf', x_target, x_pred, hps)
+        return loss  # stays the float 0.0 when all three weights are falsy
+    else:
+        assert False, f"Unknown loss_fn {loss_fn}"
+
+class VQVAE(nn.Module):  # multi-level VQ-VAE: one encoder/decoder pair per level sharing a (No)Bottleneck
+    def __init__(self, input_shape, levels, downs_t, strides_t,
+                 emb_width, l_bins, mu, commit, spectral, multispectral,
+                 multipliers=None, use_bottleneck=True, **block_kwargs):
+        super().__init__()
+
+        self.sample_length = input_shape[0]
+        x_shape, x_channels = input_shape[:-1], input_shape[-1]  # the last entry of input_shape is the channel count
+        self.x_shape = x_shape
+
+        self.downsamples = calculate_strides(strides_t, downs_t)
+        self.hop_lengths = np.cumprod(self.downsamples)  # cumulative downsampling factor up to each level
+        self.z_shapes = z_shapes = [(x_shape[0] // self.hop_lengths[level],) for level in range(levels)]  # code sequence length per level
+        self.levels = levels
+
+        if multipliers is None:
+            self.multipliers = [1] * levels
+        else:
+            assert len(multipliers) == levels, "Invalid number of multipliers"
+            self.multipliers = multipliers
+        def _block_kwargs(level):
+            this_block_kwargs = dict(block_kwargs)  # copy so the scaling below never touches the shared kwargs
+            this_block_kwargs["width"] *= self.multipliers[level]
+            this_block_kwargs["depth"] *= self.multipliers[level]
+            return this_block_kwargs
+
+        encoder = lambda level: Encoder(x_channels, emb_width, level + 1,
+                                        downs_t[:level+1], strides_t[:level+1], **_block_kwargs(level))
+        decoder = lambda level: Decoder(x_channels, emb_width, level + 1,
+                                        downs_t[:level+1], strides_t[:level+1], **_block_kwargs(level))
+        self.encoders = nn.ModuleList()
+        self.decoders = nn.ModuleList()
+        for level in range(levels):
+            self.encoders.append(encoder(level))  # the encoder for `level` has level + 1 internal stages
+            self.decoders.append(decoder(level))
+
+        if use_bottleneck:
+            self.bottleneck = Bottleneck(l_bins, emb_width, mu, levels)
+        else:
+            self.bottleneck = NoBottleneck(levels)
+
+        self.downs_t = downs_t
+        self.strides_t = strides_t
+        self.l_bins = l_bins
+        self.commit = commit  # weight of the commit loss in forward
+        self.spectral = spectral  # weight of the spectral loss in forward
+        self.multispectral = multispectral  # weight of the multispectral loss in forward
+
+    def preprocess(self, x):
+        # x: NTC [-1,1] -> NCT [-1,1]
+        assert len(x.shape) == 3
+        x = x.permute(0,2,1).float()
+        return x
+
+    def postprocess(self, x):
+        # x: NTC [-1,1] <- NCT [-1,1]
+        x = x.permute(0,2,1)
+        return x
+
+    def _decode(self, zs, start_level=0, end_level=None):
+        # Decode
+        if end_level is None:
+            end_level = self.levels
+        assert len(zs) == end_level - start_level
+        xs_quantised = self.bottleneck.decode(zs, start_level=start_level, end_level=end_level)
+        assert len(xs_quantised) == end_level - start_level
+
+        # Use only lowest level
+        decoder, x_quantised = self.decoders[start_level], xs_quantised[0:1]  # a one-element list, as Decoder.forward expects when all_levels=False
+        x_out = decoder(x_quantised, all_levels=False)
+        x_out = self.postprocess(x_out)
+        return x_out
+
+    def decode(self, zs, start_level=0, end_level=None, bs_chunks=1):
+        z_chunks = [t.chunk(z, bs_chunks, dim=0) for z in zs]  # split every level's codes along the batch dimension
+        x_outs = []
+        for i in range(bs_chunks):
+            zs_i = [z_chunk[i] for z_chunk in z_chunks]
+            x_out = self._decode(zs_i, start_level=start_level, end_level=end_level)
+            x_outs.append(x_out)
+        return t.cat(x_outs, dim=0)
+
+    def _encode(self, x, start_level=0, end_level=None):
+        # Encode
+        if end_level is None:
+            end_level = self.levels
+        x_in = self.preprocess(x)
+        xs = []
+        for level in range(self.levels):
+            encoder = self.encoders[level]
+            x_out = encoder(x_in)  # every encoder runs on the raw input
+            xs.append(x_out[-1])  # keep only that encoder's final output
+        zs = self.bottleneck.encode(xs)
+        return zs[start_level:end_level]  # all levels are encoded; the slice selects the requested ones
+
+    def encode(self, x, start_level=0, end_level=None, bs_chunks=1):
+        x_chunks = t.chunk(x, bs_chunks, dim=0)
+        zs_list = []
+        for x_i in x_chunks:
+            zs_i = self._encode(x_i, start_level=start_level, end_level=end_level)
+            zs_list.append(zs_i)
+        zs = [t.cat(zs_level_list, dim=0) for zs_level_list in zip(*zs_list)]  # re-join the batch chunks level by level
+        return zs
+
+    def sample(self, n_samples):
+        zs = [t.randint(0, self.l_bins, size=(n_samples, *z_shape), device='cuda') for z_shape in self.z_shapes]  # uniformly random codes; NOTE(review): device is hard-coded to 'cuda'
+        return self.decode(zs)
+
+    def forward(self, x, hps, loss_fn='l1'):
+        metrics = {}
+
+        N = x.shape[0]
+
+        # Encode/Decode
+        x_in = self.preprocess(x)
+        xs = []
+        for level in range(self.levels):
+            encoder = self.encoders[level]
+            x_out = encoder(x_in)
+            xs.append(x_out[-1])
+
+        zs, xs_quantised, commit_losses, quantiser_metrics = self.bottleneck(xs)
+        x_outs = []
+        for level in range(self.levels):
+            decoder = self.decoders[level]
+            x_out = decoder(xs_quantised[level:level+1], all_levels=False)  # each level is decoded on its own
+            assert_shape(x_out, x_in.shape)
+            x_outs.append(x_out)
+
+        # Loss
+        def _spectral_loss(x_target, x_out, hps):
+            if hps.use_nonrelative_specloss:
+                sl = spectral_loss(x_target, x_out, hps) / hps.bandwidth['spec']
+            else:
+                sl = spectral_convergence(x_target, x_out, hps)
+            sl = t.mean(sl)
+            return sl
+
+        def _multispectral_loss(x_target, x_out, hps):
+            sl = multispectral_loss(x_target, x_out, hps) / hps.bandwidth['spec']
+            sl = t.mean(sl)
+            return sl
+
+        recons_loss = t.zeros(()).to(x.device)
+        spec_loss = t.zeros(()).to(x.device)
+        multispec_loss = t.zeros(()).to(x.device)
+        x_target = audio_postprocess(x.float(), hps)
+
+        for level in reversed(range(self.levels)):  # runs from the last level down to 0, so x_out ends as level 0's output
+            x_out = self.postprocess(x_outs[level])
+            x_out = audio_postprocess(x_out, hps)
+            this_recons_loss = _loss_fn(loss_fn, x_target, x_out, hps)
+            this_spec_loss = _spectral_loss(x_target, x_out, hps)
+            this_multispec_loss = _multispectral_loss(x_target, x_out, hps)
+            metrics[f'recons_loss_l{level + 1}'] = this_recons_loss  # metric names are 1-based
+            metrics[f'spectral_loss_l{level + 1}'] = this_spec_loss
+            metrics[f'multispectral_loss_l{level + 1}'] = this_multispec_loss
+            recons_loss += this_recons_loss  # losses are summed over levels, not averaged
+            spec_loss += this_spec_loss
+            multispec_loss += this_multispec_loss
+
+        commit_loss = sum(commit_losses)
+        loss = recons_loss + self.spectral * spec_loss + self.multispectral * multispec_loss + self.commit * commit_loss
+
+        with t.no_grad():  # monitoring metrics only, computed on the level-0 x_out left by the loop above
+            sc = t.mean(spectral_convergence(x_target, x_out, hps))
+            l2_loss = _loss_fn("l2", x_target, x_out, hps)
+            l1_loss = _loss_fn("l1", x_target, x_out, hps)
+            linf_loss = _loss_fn("linf", x_target, x_out, hps)
+
+        quantiser_metrics = average_metrics(quantiser_metrics)
+
+        metrics.update(dict(
+            recons_loss=recons_loss,
+            spectral_loss=spec_loss,
+            multispectral_loss=multispec_loss,
+            spectral_convergence=sc,
+            l2_loss=l2_loss,
+            l1_loss=l1_loss,
+            linf_loss=linf_loss,
+            commit_loss=commit_loss,
+            **quantiser_metrics))
+
+        for key, val in metrics.items():
+            metrics[key] = val.detach()  # metrics never carry gradients; only `loss` does
+
+        return x_out, loss, metrics
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..537c346ea2731d3ade48756e685b9569258adcbc
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,7 @@
+fire==0.1.3
+tqdm==4.45.0
+soundfile==0.10.3.post1
+unidecode==1.1.1
+numba==0.48.0
+librosa==0.7.2
+mpi4py>=3.0.0
\ No newline at end of file
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf6a66170722946a7342d9225fe042a69d753bfd
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,20 @@
+import os
+
+import pkg_resources
+from setuptools import setup, find_packages
+
+# Read the requirements inside a context manager so the file handle is closed
+# deterministically instead of being left open until garbage collection.
+with open(os.path.join(os.path.dirname(__file__), "requirements.txt")) as requirements_file:
+    install_requires = [str(r) for r in pkg_resources.parse_requirements(requirements_file)]
+
+setup(
+    name="jukebox",
+    py_modules=["jukebox"],
+    version="1.0",
+    description="",
+    author="OpenAI",
+    packages=find_packages(),
+    install_requires=install_requires,
+    include_package_data=True
+)
diff --git a/tensorboardX/.codecov.yml b/tensorboardX/.codecov.yml
new file mode 100644
index 0000000000000000000000000000000000000000..aa0677d7d33526bbee41c3df785b36badfd406dc
--- /dev/null
+++ b/tensorboardX/.codecov.yml
@@ -0,0 +1,8 @@
+coverage:
+  status:
+    project:                   # measuring the overall project coverage
+      default:                 # context, you can create multiple ones with custom titles
+        enabled: yes 
+    patch:
+      default:
+        enabled: no
diff --git a/tensorboardX/.flake8 b/tensorboardX/.flake8
new file mode 100644
index 0000000000000000000000000000000000000000..f70e247cab151739972db98dcbd2a12960c114c7
--- /dev/null
+++ b/tensorboardX/.flake8
@@ -0,0 +1,4 @@
+[flake8]
+max-line-length = 120
+ignore = E305,E402,E721,E741,F401,F403,F405,F821,F841,F999
+exclude = tensorboardX/proto
\ No newline at end of file
diff --git a/tensorboardX/.github/ISSUE_TEMPLATE/bug_report.md b/tensorboardX/.github/ISSUE_TEMPLATE/bug_report.md
new file mode 100644
index 0000000000000000000000000000000000000000..855c134fe3654ae63bbf757c044f6e0a34e352fd
--- /dev/null
+++ b/tensorboardX/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,35 @@
+---
+name: Bug report
+about: Create bug report
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+**Describe the bug**
+A clear and concise description of what the bug is.
+
+**Minimal runnable code to reproduce the behavior**
+```
+from tensorboardX import SummaryWriter
+...
+```
+
+**Expected behavior**
+A clear and concise description of what you expected to happen.
+
+**Screenshots**
+If applicable, add screenshots to help explain your problem.
+
+**Environment**
+What is the result of 
+`pip list|grep -E "torch|proto|tensor"`
+If the version is too old, please try to update first.
+
+
+**Python environment**
+Which version of python are you using? Did you use Anaconda or Virtualenv?
+
+**Additional context**
+Add any other context about the problem here.
diff --git a/tensorboardX/.github/ISSUE_TEMPLATE/feature-requests-or-general-questions.md b/tensorboardX/.github/ISSUE_TEMPLATE/feature-requests-or-general-questions.md
new file mode 100644
index 0000000000000000000000000000000000000000..6fc144c64e0455dee9a18f564d864567c78ea5b4
--- /dev/null
+++ b/tensorboardX/.github/ISSUE_TEMPLATE/feature-requests-or-general-questions.md
@@ -0,0 +1,10 @@
+---
+name: Feature requests or General questions
+about: Feature requests or general questions
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+
diff --git a/tensorboardX/.gitignore b/tensorboardX/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..a85c2e8ed2e1c164ee44570502c2ef3d7865e308
--- /dev/null
+++ b/tensorboardX/.gitignore
@@ -0,0 +1,10 @@
+proto_src/
+protoc-*.zip
+protoc/
+__pycache__
+docs/_*
+build
+dist
+*.egg-info
+runs/*
+*.pyc
diff --git a/tensorboardX/.travis.yml b/tensorboardX/.travis.yml
new file mode 100644
index 0000000000000000000000000000000000000000..fe5bb059e99d9baa9d17ffc03d5301e1e4a88163
--- /dev/null
+++ b/tensorboardX/.travis.yml
@@ -0,0 +1,81 @@
+dist: xenial
+language: python
+python:
+  # We don't actually use the Travis Python, but this keeps it organized.
+  - "2.7"
+  - "3.6"
+
+env:
+  - PYTORCH_VER="torch"
+  - PYTORCH_VER="torch_nightly -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html"
+
+matrix:
+  allow_failures:
+    - env: PYTORCH_VER="torch_nightly -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html"
+
+install:
+  - export MPLBACKEND=Agg
+  - export CODECOV_TOKEN="26239910-fe4e-463d-aa3d-e662e9bf39ef"
+
+  - sudo apt-get update
+  # We do this conditionally because it saves us some downloading if the
+  # version is the same.
+  - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then
+      wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh -O miniconda.sh;
+    else
+      wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh;
+    fi
+  - bash miniconda.sh -b -p $HOME/miniconda
+  - export PATH="$HOME/miniconda/bin:$PATH"
+  - export BOTO_CONFIG=/dev/null  # https://github.com/travis-ci/travis-ci/issues/7940
+  - export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
+  - hash -r
+  - conda config --set always_yes yes --set changeps1 no
+  - conda update -q conda
+  # Useful for debugging any issues with conda
+  - conda info -a
+
+  # Replace dep1 dep2 ... with your dependencies
+  - conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION
+  - source activate test-environment
+  - which python
+  - pip install future
+  - pip install chainer -q
+  - pip install torchvision==0.2.1 -q
+  - pip uninstall torch -y
+  - pip install $PYTORCH_VER
+  - pip install moviepy==0.2.3.2 -q
+  - pip install matplotlib -q
+  - pip install requests -q
+  - pip install codecov
+  - pip install onnx
+  - pip install boto3
+  - pip install moto
+  - pip install visdom
+  - pip install tb-nightly
+  - pip install crc32c
+  - pip install protobuf==3.8.0
+  - conda install ffmpeg
+  - conda list
+  - python -c "import imageio; imageio.plugins.ffmpeg.download()"
+  - pip install --upgrade pytest-cov flake8
+  - python setup.py install
+
+script:
+  - visdom &
+  - sleep 5
+  - python -c "import visdom; v = visdom.Visdom()"
+  - py.test --cov=tensorboardX tests/
+  - python examples/demo.py
+  - python examples/demo_graph.py
+  - python examples/demo_embedding.py
+  - python examples/demo_custom_scalars.py
+  - python examples/demo_multiple_embedding.py
+  - python examples/demo_purge.py
+  - python examples/demo_matplotlib.py
+  - pip uninstall -y tensorboardX
+  - pip install tensorboardX
+  - pytest
+
+after_success:
+  - codecov
diff --git a/tensorboardX/HISTORY.rst b/tensorboardX/HISTORY.rst
new file mode 100644
index 0000000000000000000000000000000000000000..81fecd1cf05e28569eae2797e050cccf7a699f3d
--- /dev/null
+++ b/tensorboardX/HISTORY.rst
@@ -0,0 +1,106 @@
+History
+=======
+1.8 (2019-07-05)
+-----------------
+* Draw label text on image with bounding box provided.
+* crc32c speed up (optional by installing crc32c manually)
+* Rewrite add_graph. onnx backend is replaced by JIT to support more advanced structure.
+* Now you can add_mesh() to visualize colorful point cloud or meshes.
+
+1.7 (2019-05-19)
+-----------------
+* Able to write to S3
+* Fixed raw histogram issue that nothing is shown in TensorBoard
+* Users can use various image/video dimension permutation by passing 'dataformats' parameter.
+* You can bypass the writer by passing write_to_disk=True to SummaryWriter
+
+
+1.6 (2019-01-02)
+-----------------
+* Many graph-related bugs are fixed in this version.
+* New function: add_images(). This function accepts a 4D image tensor. See documentation.
+* Make add_image_with_boxes() usable.
+* API change: add_video now accepts BxTxCxHxW instead of BxCxTxHxW tensor.
+
+1.5 (2018-12-10)
+-----------------
+* Add API for Custom scalar
+* Add support for logging directly to S3
+* Add support for Caffe2 graph
+* Pytorch 1.0.0 JIT graph support (alpha-release)
+
+1.4 (2018-08-09)
+-----------------
+* Made add_text compatible with tensorboard>1.6
+* Fix the issue of strange histogram if default binning method is used
+* Supports passing matplotlib figures to add_image()
+* Resolve namespace conflict with TF tensorboard
+* add_image_boxes function
+* Supports custom timestamp for event
+
+1.2 (2018-04-21)
+-----------------
+* Supports tensorshape information in graph visualization. Drop support for 0.3.1
+* Adds add_video function
+
+1.1 (2018-02-21)
+-----------------
+* Supports pytorch 0.3.1 (hacky)
+
+1.0 (2018-01-18)
+-----------------
+* Supports graph (the pretty one)
+
+0.9 (2017-11-11)
+-----------------
+* Supports markdown for add_text function
+* It's ready to log precision recall curve (needs tensorboard>=0.4)
+* Adds context manager for the SummaryWriter class
+
+0.8 (2017-09-25)
+-----------------
+* Package name renamed to tensorboardX to fix namespace conflict with tensorflow's tensorboard
+* Supports multi-scalars and JSON export
+* Multiple Embeddings in One Experiment 
+* Supports Chainer and mxnet
+
+0.7 (2017-08-22)
+-----------------
+* remove tensorflow dependency for embedding function
+* fixed incorrect image<->label pairing in embedding function (#12)
+* unifies API call and adds docstring. Documentation is available at: http://tensorboard-pytorch.readthedocs.io/
+
+0.6.5 (2017-07-30)
+------------------
+* add travis test (py2.7, py3.6)
+* add support for python2 (in PyPI)
+
+0.6 (2017-07-18)
+-----------------
+* supports embedding
+
+0.5 (2017-07-18)
+-----------------
+* supports graph summary
+* fixed np.histogram issue
+
+0.4 (2017-07-12)
+-----------------
+* supports text summary
+
+0.3 (2017-07-03)
+-----------------
+* supports audio summary
+
+0.2 (2017-06-24)
+-----------------
+* simplifies add_image API
+* speed up add_histogram API by 35x
+
+
+0.1 (2017-06-13)
+------------------
+* First commit. Reference:
+
+https://github.com/TeamHG-Memex/tensorboard_logger
+https://github.com/dmlc/tensorboard
diff --git a/tensorboardX/LICENSE b/tensorboardX/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..0eb77d5d9d2d71e92d6ca45e9cd980ab65274ee7
--- /dev/null
+++ b/tensorboardX/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2017 Tzu-Wei Huang
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/tensorboardX/MANIFEST.in b/tensorboardX/MANIFEST.in
new file mode 100644
index 0000000000000000000000000000000000000000..81025e7f9f0bc3bb77e2ec78973cf329d76f40f7
--- /dev/null
+++ b/tensorboardX/MANIFEST.in
@@ -0,0 +1,7 @@
+include HISTORY.rst
+include LICENSE
+include compile.sh
+recursive-include tensorboardX/proto *
+recursive-exclude test *
+recursive-exclude examples *
+recursive-include tensorboardX/beholder *
\ No newline at end of file
diff --git a/tensorboardX/README.md b/tensorboardX/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e2703672bea6b3cac01db7237def8ca60c8cd500
--- /dev/null
+++ b/tensorboardX/README.md
@@ -0,0 +1,106 @@
+# tensorboardX
+
+[![Build Status](https://travis-ci.org/lanpa/tensorboardX.svg?branch=master)](https://travis-ci.org/lanpa/tensorboardX)
+[![PyPI version](https://badge.fury.io/py/tensorboardX.svg)](https://badge.fury.io/py/tensorboardX)
+[![Downloads](https://img.shields.io/badge/pip--downloads-5K+-brightgreen.svg)](https://bigquery.cloud.google.com/savedquery/966219917372:edb59a0d70c54eb687ab2a9417a778ee)
+[![Documentation Status](https://readthedocs.org/projects/tensorboardx/badge/?version=latest)](https://tensorboardx.readthedocs.io/en/latest/?badge=latest)
+[![Code Coverage](https://codecov.io/gh/lanpa/tensorboardX/branch/master/graph/badge.svg)](https://codecov.io/gh/lanpa/tensorboardX/)
+
+Write TensorBoard events with a simple function call.
+
+* Support `scalar`, `image`, `figure`, `histogram`, `audio`, `text`, `graph`, `onnx_graph`, `embedding`, `pr_curve`, `mesh`, `hyper-parameters`
+  and `video` summaries.
+
+* requirement for `demo_graph.py` is tensorboardX>=1.6 and pytorch>=1.1
+
+* [FAQ](https://github.com/lanpa/tensorboardX/wiki)
+
+## Install
+
+Tested on anaconda2 / anaconda3, with PyTorch 1.1.0 / torchvision 0.3 / tensorboard 1.13.0
+
+`pip install tensorboardX`
+
+or build from source:
+
+`git clone https://github.com/lanpa/tensorboardX && cd tensorboardX && python setup.py install`
+
+You can optionally install [`crc32c`](https://github.com/ICRAR/crc32c) to speed up saving a large amount of data.
+
+
+## Example
+
+* Run the demo script: `python examples/demo.py`
+* Use TensorBoard with `tensorboard --logdir runs`  (needs to install TensorFlow)
+
+```python
+# demo.py
+
+import torch
+import torchvision.utils as vutils
+import numpy as np
+import torchvision.models as models
+from torchvision import datasets
+from tensorboardX import SummaryWriter
+
+resnet18 = models.resnet18(False)
+writer = SummaryWriter()
+sample_rate = 44100
+freqs = [262, 294, 330, 349, 392, 440, 440, 440, 440, 440, 440]
+
+for n_iter in range(100):
+
+    dummy_s1 = torch.rand(1)
+    dummy_s2 = torch.rand(1)
+    # data grouping by `slash`
+    writer.add_scalar('data/scalar1', dummy_s1[0], n_iter)
+    writer.add_scalar('data/scalar2', dummy_s2[0], n_iter)
+
+    writer.add_scalars('data/scalar_group', {'xsinx': n_iter * np.sin(n_iter),
+                                             'xcosx': n_iter * np.cos(n_iter),
+                                             'arctanx': np.arctan(n_iter)}, n_iter)
+
+    dummy_img = torch.rand(32, 3, 64, 64)  # output from network
+    if n_iter % 10 == 0:
+        x = vutils.make_grid(dummy_img, normalize=True, scale_each=True)
+        writer.add_image('Image', x, n_iter)
+
+        dummy_audio = torch.zeros(sample_rate * 2)
+        for i in range(x.size(0)):
+            # amplitude of sound should be in [-1, 1]
+            dummy_audio[i] = np.cos(freqs[n_iter // 10] * np.pi * float(i) / float(sample_rate))
+        writer.add_audio('myAudio', dummy_audio, n_iter, sample_rate=sample_rate)
+
+        writer.add_text('Text', 'text logged at step:' + str(n_iter), n_iter)
+
+        for name, param in resnet18.named_parameters():
+            writer.add_histogram(name, param.clone().cpu().data.numpy(), n_iter)
+
+        # needs tensorboard 0.4RC or later
+        writer.add_pr_curve('xoxo', np.random.randint(2, size=100), np.random.rand(100), n_iter)
+
+dataset = datasets.MNIST('mnist', train=False, download=True)
+images = dataset.test_data[:100].float()
+label = dataset.test_labels[:100]
+
+features = images.view(100, 784)
+writer.add_embedding(features, metadata=label, label_img=images.unsqueeze(1))
+
+# export scalar data to JSON for external processing
+writer.export_scalars_to_json("./all_scalars.json")
+writer.close()
+```
+
+## Screenshots
+
+<img src="screenshots/Demo.gif">
+
+## Tweaks
+
+To add more ticks for the slider (show more image history), check https://github.com/lanpa/tensorboardX/issues/44 or 
+https://github.com/tensorflow/tensorboard/pull/1138
+
+## Reference
+
+* [TeamHG-Memex/tensorboard_logger](https://github.com/TeamHG-Memex/tensorboard_logger)
+* [dmlc/tensorboard](https://github.com/dmlc/tensorboard)
diff --git a/tensorboardX/compile.sh b/tensorboardX/compile.sh
new file mode 100644
index 0000000000000000000000000000000000000000..59e3a28d4626604a8da4566797e486803e0d8a7d
--- /dev/null
+++ b/tensorboardX/compile.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+# Exit on error
+# set -e
+
+DESIRED_PROTO_VERSION="3.6.1"
+
+# Call protoc directly; if its version is not the desired one, download the desired version.
+
+
+if [ -f "protoc/bin/protoc" ]; then
+  PROTOC_BIN="protoc/bin/protoc"
+else
+  PROTOC_BIN=`which protoc`
+fi
+
+echo "using" $PROTOC_BIN
+
+CURRENT_PROTOC_VER=`${PROTOC_BIN} --version`
+if [ -z ${PROTOC_BIN} ] || [[ "$CURRENT_PROTOC_VER" != "libprotoc "$DESIRED_PROTO_VERSION ]]; then
+  # Download and use the desired version of protoc.
+  if [ "$(uname)" == "Darwin" ]; then
+    PROTOC_ZIP="protoc-"$DESIRED_PROTO_VERSION"-osx-x86_64.zip"
+  else
+    PROTOC_ZIP="protoc-"$DESIRED_PROTO_VERSION"-linux-x86_64.zip"
+  fi
+  WGET_BIN=`which wget`
+  if [[ ! -z ${WGET_BIN} ]]; then
+    ${WGET_BIN} https://github.com/protocolbuffers/protobuf/releases/download/v"$DESIRED_PROTO_VERSION"/${PROTOC_ZIP}
+    rm -rf protoc
+    python -c "import zipfile; zipfile.ZipFile('"${PROTOC_ZIP}"','r').extractall('protoc')"
+    PROTOC_BIN=protoc/bin/protoc
+    chmod +x ${PROTOC_BIN}
+  fi
+fi
+
+# Regenerate
+if [[ ! -z ${PROTOC_BIN} ]]; then
+  # Delete all existing Python protobuf (*_pb2.py) output
+  rm -rf tensorboardX/proto/*pb2*.py
+  ${PROTOC_BIN} tensorboardX/proto/*.proto --python_out=.
+
+  echo "Done generating tensorboardX/proto/*pb2*.py"
+else
+  echo "protoc not installed so can't regenerate tensorboardX/proto/*pb2*.py, using precompiled version."
+fi
+
diff --git a/tensorboardX/docs/Makefile b/tensorboardX/docs/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..3059a852f3d0fc778b6fd1db0492f4aaa63bbb3e
--- /dev/null
+++ b/tensorboardX/docs/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+SPHINXPROJ    = tensorboardX
+SOURCEDIR     = .
+BUILDDIR      = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
\ No newline at end of file
diff --git a/tensorboardX/docs/conf.py b/tensorboardX/docs/conf.py
new file mode 100644
index 0000000000000000000000000000000000000000..2cbfd9abe6c5b0486788f9e7c83770d400d25e02
--- /dev/null
+++ b/tensorboardX/docs/conf.py
@@ -0,0 +1,170 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# tensorboardX documentation build configuration file, created by
+# sphinx-quickstart on Wed Aug  9 01:38:01 2017.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+import os
+import sys
+# sys.path.insert(0, os.path.abspath('.'))
+sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
+#import tensorboard #uncomment to shadow pip installation
+# -- General configuration ------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#
+# needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = ['sphinx.ext.autodoc',
+    'sphinx.ext.mathjax',
+    'sphinx.ext.intersphinx',
+    'sphinx.ext.napoleon',
+    'sphinx.ext.viewcode',
+    'sphinx.ext.githubpages']
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+#
+# source_suffix = ['.rst', '.md']
+source_suffix = '.rst'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = 'tensorboardX'
+copyright = '2017, tensorboardX Contributors'
+author = 'tensorboardX Contributors'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = ''
+# The full version, including alpha/beta/rc tags.
+release = ''
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = None
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# These patterns also affect html_static_path and html_extra_path
+exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# If true, `todo` and `todoList` produce output, else they produce nothing.
+todo_include_todos = False
+
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+html_theme = 'sphinx_rtd_theme'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further.  For a list of options available for each theme, see the
+# documentation.
+#
+# html_theme_options = {}
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+# html_static_path = ['_static']
+
+
+# -- Options for HTMLHelp output ------------------------------------------
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'tensorboardXdoc'
+
+
+# -- Options for LaTeX output ---------------------------------------------
+
+latex_elements = {
+    # The paper size ('letterpaper' or 'a4paper').
+    #
+    # 'papersize': 'letterpaper',
+
+    # The font size ('10pt', '11pt' or '12pt').
+    #
+    # 'pointsize': '10pt',
+
+    # Additional stuff for the LaTeX preamble.
+    #
+    # 'preamble': '',
+
+    # Latex figure (float) alignment
+    #
+    # 'figure_align': 'htbp',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+#  author, documentclass [howto, manual, or own class]).
+latex_documents = [
+    (master_doc, 'tensorboardX.tex', 'tensorboardX Documentation',
+     'tensorboardX Contributors', 'manual'),
+]
+
+
+# -- Options for manual page output ---------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+    (master_doc, 'tensorboardX', 'tensorboardX Documentation',
+     [author], 1)
+]
+
+
+# -- Options for Texinfo output -------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+    (master_doc, 'tensorboardX', 'tensorboardX Documentation',
+     author, 'tensorboardX', 'One line description of project.',
+     'Miscellaneous'),
+]
+
+
+
+
+# Example configuration for intersphinx: refer to the Python standard library.
+intersphinx_mapping = {
+    'python':('https://docs.python.org/3', None),
+    'numpy': ('http://docs.scipy.org/doc/numpy/', None),
+    'torch': ('http://pytorch.org/docs/master', None),
+    'matplotlib': ('http://matplotlib.sourceforge.net/', None),
+    }
diff --git a/tensorboardX/docs/index.rst b/tensorboardX/docs/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f75b412d518c81ad9ce3ebb441348d8739dace2c
--- /dev/null
+++ b/tensorboardX/docs/index.rst
@@ -0,0 +1,23 @@
+.. tensorboardX documentation master file, created by
+   sphinx-quickstart on Wed Aug  9 01:38:01 2017.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+Welcome to tensorboardX's documentation!
+===============================================
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Contents:
+
+   tensorboard
+   utils
+   tutorial
+   tutorial_zh
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
diff --git a/tensorboardX/docs/tensorboard.rst b/tensorboardX/docs/tensorboard.rst
new file mode 100644
index 0000000000000000000000000000000000000000..66113c17e119925da92b80401c175b065a6c9123
--- /dev/null
+++ b/tensorboardX/docs/tensorboard.rst
@@ -0,0 +1,13 @@
+tensorboardX
+===================================
+.. automodule:: tensorboardX
+
+.. autoclass:: SummaryWriter
+    :members:
+    
+    .. automethod:: __init__
+
+.. autoclass:: TorchVis
+    :members:
+
+    .. automethod:: __init__
\ No newline at end of file
diff --git a/tensorboardX/docs/tutorial.rst b/tensorboardX/docs/tutorial.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1b08e545c49a33a27da11592e8d50d4742c60ab9
--- /dev/null
+++ b/tensorboardX/docs/tutorial.rst
@@ -0,0 +1,161 @@
+Tutorials
+*********
+
+What is tensorboard X?
+----------------------
+
+At first, the package was named tensorboard, and soon there were issues about
+name conflicts. The first alternative name that came to my mind was
+tensorboard-pytorch, but in order to make it more general, I chose tensorboardX,
+which stands for tensorboard for X.
+
+Google's tensorflow's tensorboard is a web server that serves visualizations of
+the training progress of a neural network; it visualizes scalar values, images,
+text, etc. This information is saved as events in tensorflow. It's a pity
+that other deep learning frameworks lack such a tool, so there are already
+packages that let users log the events without tensorflow; however, they only
+provide basic functionality. The purpose of this package is to let
+researchers use a simple interface to log events within PyTorch (and then show
+the visualization in tensorboard). This package currently supports logging scalar,
+image, audio, histogram, text, embedding, and the route of back-propagation. The
+following manual is tested on Ubuntu and Mac, and the environments are anaconda's
+python2 and python3.
+
+
+Create a summary writer
+-----------------------
+Before logging anything, we need to create a writer instance. This can be done with:
+
+.. code-block:: python
+
+    from tensorboardX import SummaryWriter
+    #SummaryWriter encapsulates everything
+    writer = SummaryWriter('runs/exp-1')
+    #creates writer object. The log will be saved in 'runs/exp-1'
+    writer2 = SummaryWriter()
+    #creates writer2 object with auto generated file name, the dir will be something like 'runs/Aug20-17-20-33'
+    writer3 = SummaryWriter(comment='3x learning rate')
+    #creates writer3 object with auto generated file name, the comment will be appended to the filename. The dir will be something like 'runs/Aug20-17-20-33-3xlearning rate'
+
+Each subfolder will be treated as a different experiment in tensorboard. Each
+time you re-run the experiment with different settings, you should change the
+name of the sub folder such as ``runs/exp2``, ``runs/myexp`` so that you can
+easily compare different experiment settings. Type ``tensorboard --logdir runs`` to compare
+different runs in tensorboard.
+
+
+General api format
+------------------
+.. code-block:: python
+
+    add_something(tag name, object, iteration number)
+
+
+Add scalar
+-----------
+A scalar value is the simplest data type to deal with. Mostly we save the loss
+value of each training step, or the accuracy after each epoch. Sometimes I save
+the corresponding learning rate as well. It's cheap to save scalar values. Just
+log anything you think is important. To log a scalar value, use
+``writer.add_scalar('myscalar', value, iteration)``. Note that the program complains
+if you feed a PyTorch tensor. Remember to extract the scalar value by
+``x.item()`` if ``x`` is a torch scalar tensor.
+
+
+Add image
+---------
+An image is represented as a 3-dimensional tensor. The simplest case is to save one
+image at a time. In this case, the image should be passed as a 3-dimensional
+tensor of size ``[3, H, W]``. The three dimensions correspond to the R, G, B channels of
+an image. After your image is computed, use ``writer.add_image('imresult', x,
+iteration)`` to save the image. If you have a batch of images to show, use
+``torchvision``'s ``make_grid`` function to prepare the image array and send the result
+to ``add_image(...)`` (``make_grid`` takes a 4D tensor and returns tiled images in 3D tensor).
+
+.. Note::
+	Remember to normalize your image.
+
+
+Add histogram
+-------------
+Saving histograms is expensive, both in computation time and storage. If training
+slows down after using this package, check this first. To save a histogram,
+convert the array into numpy array and save with ``writer.add_histogram('hist',
+array, iteration)``.
+
+
+Add figure
+----------
+You can save a matplotlib figure to tensorboard with the add_figure function. ``figure`` input should be ``matplotlib.pyplot.figure`` or a list of ``matplotlib.pyplot.figure``.
+Check `<https://tensorboardx.readthedocs.io/en/latest/tensorboard.html#tensorboardX.SummaryWriter.add_figure>`_ for the detailed usage.
+
+Add graph
+---------
+To visualize a model, you need a model ``m`` and the input ``t``. ``t`` can be a tensor or a list of tensors
+depending on your model. If an error happens, make sure that ``m(t)`` runs without problem first. See
+`The graph demo <https://github.com/lanpa/tensorboardX/blob/master/examples/demo_graph.py>`_ for
+a complete example.
+
+
+Add audio
+---------
+To log a single channel audio, use ``add_audio(tag, audio, iteration, sample_rate)``, where ``audio`` is a one-dimensional array whose elements are the consecutive amplitude samples.
+For a 2-second audio clip with a ``sample_rate`` of 44100 Hz, the input ``audio`` should have 88200 elements.
+Each element should lie in [−1, 1].
+
+Add embedding
+-------------
+Embeddings, high dimensional data, can be visualized and converted
+into human perceptible 3D data by tensorboard, which provides PCA and
+t-sne to project the data into low dimensional space. What you need to do is
+provide a bunch of points and tensorboard will do the rest for you. The bunch of
+points is passed as a tensor of size ``n x d``, where ``n`` is the number of points and
+``d`` is the feature dimension. The feature representation can either be raw data
+(*e.g.* the MNIST image) or a representation learned by your network (extracted
+feature). This determines how the points are distributed. To make the visualization
+more informative, you can pass optional metadata or ``label_imgs`` for each data
+point. In this way you can see that neighboring points have similar labels and
+distant points have very different labels (semantically or visually). Here the
+metadata is a list of labels, and the length of the list should be equal to ``n``, the
+number of the points. The ``label_imgs`` is a 4D tensor of size ``NCHW``. ``N`` should be equal
+to ``n`` as well. See
+`The embedding demo <https://github.com/lanpa/tensorboardX/blob/master/examples/demo_embedding.py>`_ for
+a complete example.
+
+
+Useful commands
+---------------
+Install
+=======
+
+Simply type ``pip install tensorboardX`` in a unix shell to install this package.
+To use the newest version, you might need to build from source or ``pip install
+tensorboardX --no-cache-dir`` .  To run the tensorboard web server, you need
+to install it using ``pip install tensorboard``.
+After that, type ``tensorboard --logdir=<your_log_dir>`` to start the server, where
+``your_log_dir`` is the parameter of the object constructor. I think this command is
+tedious, so I add a line alias ``tb='tensorboard --logdir '`` in ``~/.bashrc``. In
+this way, the above command is simplified as ``tb <your_log_dir>``. Use your favorite
+browser to load the tensorboard page, the address will be shown in the terminal
+after starting the server.
+
+
+Misc
+----
+Performance issue
+=================
+Logging is cheap, but display is expensive.
+In my experience, if there are 3 or more experiments to show at a time and each
+experiment has, say, 50k points, tensorboard might need a lot of time to
+present the data.
+
+
+Grouping plots
+==============
+Usually, there are many numbers to log in one experiment. For example, when
+training GANs you should log the loss of the generator and the discriminator. If the
+loss is composed of two other loss functions, say L1 and MSE, you might want to
+log the value of the other two losses as well. In this case, you can write the
+tags as Gen/L1, Gen/MSE, Desc/L1, Desc/MSE. In this way, tensorboard will group
+the plots into two sections (Gen, Desc). You can also use the regular expression
+to filter data.
diff --git a/tensorboardX/docs/tutorial_zh.rst b/tensorboardX/docs/tutorial_zh.rst
new file mode 100644
index 0000000000000000000000000000000000000000..59d03756d94003c2db796b1fad1674f3176de348
--- /dev/null
+++ b/tensorboardX/docs/tutorial_zh.rst
@@ -0,0 +1,97 @@
+Tutorials_zh
+*************
+
+緣起
+------
+Google TensorFlow 附加的工具 Tensorboard 是一個很好用的視覺化工具。他可以記錄數字，影像或者是聲音資訊，對於觀察類神經網路訓練的過程非常有幫助。很可惜的是其他的訓練框架（PyTorch, Chainer, numpy）並沒有這麼好用的工具。網路上稍加搜尋可以發現已經有一些現成的套件可以讓不同的訓練框架使用 web 介面來觀察訓練情形，不過他們可以記錄的東西比較有限或是使用起來比較複雜 (tensorboard_logger, visdom)。tensorboardX 的目的就是讓其他 tensorboard 的功能都可以輕易的被非 TensorFlow 的框架使用。
+目前這個套件除了 tensorboard beholder 之外支援所有 tensorboard 的紀錄型態。這個套件目前的標準測試環境為 Ubuntu 或是 Mac ，windows 則是有不定期手動測試；使用的 python 版本為 anaconda 的 python3。
+
+安裝
+-------
+在命令列輸入 ``pip install tensorboardX`` 即可
+或是從源碼安裝最新版 ``pip install git+https://github.com/lanpa/tensorboardX``
+
+使用
+-------
+建立 event writer 實體
+在紀錄任何東西之前，我們需要建立一個 event writer 實體。
+``from tensorboardX import SummaryWriter``
+#SummaryWriter 是一個類別，包含這套件的所有功能。
+
+``writer = SummaryWriter('runs/exp-1')``
+#建立實體。資料存放在：``'runs/exp-1'``
+#接下來要寫入任何資料都是呼叫 ``writer.add_某功能()``
+
+``writer = SummaryWriter()``
+#使用預設名稱建立實體。資料存放在：``'runs/現在時間-機器名字'`` ex. ``'runs/Aug20-obov01'``
+
+``writer = SummaryWriter(comment='3xLR')``
+#在預設資料夾後面加上註解 檔名變為：``'runs/Aug20-obov01-3xLR'``
+上面的程式碼會在目前的工作目錄下建立一個叫 ``runs`` 的資料夾以及子目錄 ``exp-1``。 每個子目錄都會被視為一個實驗。每次執行新的實驗時，比如說改了一些參數，這時請將資料夾重新命名，像是： ``runs/exp2``, ``runs/myexp`` 這樣可以便於比較實驗的結果。 建議：資料夾可以用時間命名或者是直接把參數當成資料夾的名稱。
+建立 writer 實體之後就可以開始紀錄資料了
+API 的長相大概是：``add_xxx(標籤，要記錄的東西，時間戳，其他參數)``
+
+紀錄純量
+-------------
+純量是最好記錄的東西。通常我們會把每次訓練的損失記錄下來或者是測試的準確度都是值得記錄的東西。其他數據，像是學習率也值得紀錄。
+紀錄純量的方法是 ``writer.add_scalar('myscalar', value, iteration)``
+value 可以是 PyTorch tensor ， numpy或是 float，int 之類的python原生數字類別。
+
+記錄影像
+-------------
+影像使用一個三維的矩陣來表示。這三個維度分別代表紅色，綠色，藍色的強度。一張寬200， 高100的影像其對應的矩陣大小為[3, 100, 200] （CHW）。最簡單情況是只有一張影像要存。這時候只需要注意一下是不是符合上述的規格然後將它傳到: ``writer.add_image('imresult', image, iteration)`` 即可。 
+通常訓練的時候會採用批次處理，所以有一大堆影像要存。這時候請確定你的資料維度是 ``(NCHW)``, 其中 ``N`` 是batchsize。``add_image`` 會自動將他排列成適當大小。要注意的是，如果要記錄的影像是 OpenCV/numpy 格式，他們通常呈現 ``(HWC)`` 的排列，這時候要呼叫 ``numpy.transpose`` 將其轉為正確的維度，否則會報錯。另外就是注意影像的值的範圍要介於 [0, 1] 之間。 
+
+紀錄直方圖（histogram）
+-------------------------------
+記錄直方圖很耗 CPU 資源，不要常用。如果你用了這個套件之後覺得速度變慢了請先檢查一下是不是這個原因。使用方法很簡單，呼叫 ``writer.add_histogram('hist', array, iteration)`` 即可紀錄。
+
+紀錄聲音
+-------------
+``writer.add_audio('myaudio', audio, iteration, sample_rate)``
+這功能只支援單聲道。 add_audio 要傳入的聲音資訊是個一維陣列，陣列的每一個元素代表在每一個取樣點的振幅大小。取樣頻率(sample_rate)為 44100 Hz 的情況下，一段2秒鐘的聲音應該要有88200個點；注意其中每個元素的值應該都介於正負1之間。
+
+紀錄文字
+-------------
+``writer.add_text('mytext', 'this is a pen', iteration)``
+除了一般字串之外，也支援簡單的 markdown 表格。
+
+記錄網路架構。
+--------------------------
+(實驗性的功能，模型複雜的時候不確定對不對)
+問題很多的功能。使用上比較複雜。需要準備兩個東西：網路模型 以及 你要餵給他的 tensor 
+舉例來說，令模型為 m，輸入為 x，則使用方法為：
+``add_graph(m, (x, ))`` 這裡使用 tuple 的原因是當網路有多個輸入時，可以把他擴充成
+``add_graph(m, (x, y, z))`` ，如果只有單一輸入，寫成 ``add_graph(m, x)`` 也無妨。 
+常會出錯的原因： 
+- 較新的 operator pytorch本身不支援JIT
+- 輸入是 cpu tensor，model 在 GPU 上。（或是反過來）
+- 輸入的 tensor 大小錯誤，跑到後面幾層維度消失了
+- model 寫錯，前後兩層 feature dimension 對不上
+除錯方法
+
+1. forward propagate 一次 ``m(x)`` 或是多個輸入時：``m(x, y, z)``
+2. 用 ``torch.onnx.export`` 導出模型，觀察錯誤訊息。
+
+高維度資料視覺化／降維 (embedding)
+---------------------------------------------------
+因為人類對物體的了解程度只有三維，所以當資料的維度超過三的時候我們沒辦法將他視覺化。這時候就需要降維來讓資料的維度小於等於三。降維運算由 tensorboard 以 Javascript 執行，演算法有 PCA 及 t-sne 兩種可選。這邊我們只需要負責提供每個點的高維度特徵即可。提供的格式是一個矩陣，一個 ``n x d`` 的矩陣，``n`` 是點的數量， ``d`` 是維度的多寡。 高維度特徵可以是原始資料。比如說影像，或是網路學到的壓縮結果。這原始資料決定了資料的分佈情形。如果要看得更清楚一點，你可以再傳 metadata / label_imgs 的參數進去（metadata是一個 python list 長度為 ``n``, ``label_imgs`` 是一個 4 維矩陣，大小是 ``nCHW``）。這樣每個點就會有他對應的文字或圖在旁邊。不懂的話就看範例吧：https://github.com/lanpa/tensorboardX/blob/master/examples/demo_embedding.py
+
+紀錄短片
+---------------
+類似於紀錄影像，不過傳入的物件維度是 ``[B, C, T ,H, W]``，其中 ``T`` 是影格的數量。所以一個 30 frame 的彩色影片 維度是 ``[B, 3, 30 ,H, W]``。
+
+紀錄 pr curve
+-------------------
+根據預測的機率值以及其對應的標準答案計算 precision-recall 的結果並保存。
+``add_pr_curve (tag, labels, predictions, step)``
+labels是標準答案，predictions是程式對樣本的預測。 
+假設有十筆資料 labels就會長得像 ``[0, 0, 1, 0, 0, 1, 0, 1, 0, 1]``，predictions則長的像 ``[0.1, 0.3, 0.8, 0.2, 0.4, 0.5, 0.1, 0.7, 0.9, 0.2]``。
+
+pyplot 的圖表
+------------------------------
+
+用 matplotlib 畫了美美的圖表想紀錄？請用 ``add_figure`` 。傳入的物件是 matplotlib 的 figure。 
+顯示結果 
+Tensorboard 本質是個網頁伺服器，他讀取的資料來自於訓練網路的時候程式 (tensorboardX) 寫下的事件檔。因為 tensorboard 包含於 tensorflow，所以你需要另外安裝一份 tensorflow 在伺服器主機。我想大部分人都已經裝過了。沒裝過的話就在 unix shell 介面輸入 ``pip install tensorboard``。如果沒有使用 TensorFlow 訓練的需求，建議裝非 GPU 版本，啟動速度快得多。
+接下來在命令列輸入 ``tensorboard --logdir=<your_log_dir>`` （以前面的例子來說：``tensorboard --logdir=runs``）伺服器就會啟動了。這個指令打起來很麻煩，所以我都在 ``~/.bashrc`` 加一行：``alias tb='tensorboard --logdir '`` 如此一來指令就簡化成 ``tb <your_log_dir>``。接下來就是照著終端機上的指示打開你的瀏覽器就可以看到畫面了。
diff --git a/tensorboardX/docs/utils.rst b/tensorboardX/docs/utils.rst
new file mode 100644
index 0000000000000000000000000000000000000000..be8a4ade7ed1f82ac277603e21418022798cedf3
--- /dev/null
+++ b/tensorboardX/docs/utils.rst
@@ -0,0 +1,3 @@
+Helper functions
+===================================
+.. autofunction:: tensorboardX.utils.figure_to_image
\ No newline at end of file
diff --git a/tensorboardX/examples/RUN_AFTER_PIP_INSTALL b/tensorboardX/examples/RUN_AFTER_PIP_INSTALL
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tensorboardX/examples/__init__.py b/tensorboardX/examples/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tensorboardX/examples/chainer/extension_logger/net.py b/tensorboardX/examples/chainer/extension_logger/net.py
new file mode 100644
index 0000000000000000000000000000000000000000..300c3d3323bce6af16e7416c94606f47c75e6420
--- /dev/null
+++ b/tensorboardX/examples/chainer/extension_logger/net.py
@@ -0,0 +1,86 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+
+import numpy
+
+import chainer
+from chainer import cuda
+import chainer.functions as F
+import chainer.links as L
+
+
+def add_noise(h, sigma=0.2):
+    xp = cuda.get_array_module(h.data)
+    if chainer.config.train:
+        return h + sigma * xp.random.randn(*h.shape)
+    else:
+        return h
+
+
+class Generator(chainer.Chain):
+    # Generator network: maps latent vectors z to sigmoid-activated outputs.
+    def __init__(self, n_hidden, bottom_width=4, ch=512, wscale=0.02):
+        super(Generator, self).__init__()
+        self.n_hidden = n_hidden
+        self.ch = ch
+        self.bottom_width = bottom_width
+        # Every linear/deconvolution link below shares one Normal(wscale) init.
+        with self.init_scope():
+            w = chainer.initializers.Normal(wscale)
+            self.l0 = L.Linear(self.n_hidden, bottom_width * bottom_width * ch,
+                               initialW=w)
+            self.dc1 = L.Deconvolution2D(ch, ch // 2, 4, 2, 1, initialW=w)
+            self.dc2 = L.Deconvolution2D(ch // 2, ch // 4, 4, 2, 1, initialW=w)
+            self.dc3 = L.Deconvolution2D(ch // 4, ch // 8, 4, 2, 1, initialW=w)
+            self.dc4 = L.Deconvolution2D(ch // 8, 3, 3, 1, 1, initialW=w)
+            self.bn0 = L.BatchNormalization(bottom_width * bottom_width * ch)
+            self.bn1 = L.BatchNormalization(ch // 2)
+            self.bn2 = L.BatchNormalization(ch // 4)
+            self.bn3 = L.BatchNormalization(ch // 8)
+
+    def make_hidden(self, batchsize):  # float32 uniform(-1, 1) latent batch
+        return numpy.random.uniform(-1, 1, (batchsize, self.n_hidden, 1, 1))\
+            .astype(numpy.float32)
+
+    def __call__(self, z):  # l0 -> reshape -> dc1..dc3 (BN + ReLU) -> dc4
+        h = F.reshape(F.relu(self.bn0(self.l0(z))),
+                      (len(z), self.ch, self.bottom_width, self.bottom_width))
+        h = F.relu(self.bn1(self.dc1(h)))
+        h = F.relu(self.bn2(self.dc2(h)))
+        h = F.relu(self.bn3(self.dc3(h)))
+        x = F.sigmoid(self.dc4(h))  # no batch norm on the output layer
+        return x
+
+
+class Discriminator(chainer.Chain):
+    # Discriminator: seven convolutions, then a linear layer with one output.
+    def __init__(self, bottom_width=4, ch=512, wscale=0.02):
+        w = chainer.initializers.Normal(wscale)
+        super(Discriminator, self).__init__()
+        with self.init_scope():
+            self.c0_0 = L.Convolution2D(3, ch // 8, 3, 1, 1, initialW=w)
+            self.c0_1 = L.Convolution2D(ch // 8, ch // 4, 4, 2, 1, initialW=w)
+            self.c1_0 = L.Convolution2D(ch // 4, ch // 4, 3, 1, 1, initialW=w)
+            self.c1_1 = L.Convolution2D(ch // 4, ch // 2, 4, 2, 1, initialW=w)
+            self.c2_0 = L.Convolution2D(ch // 2, ch // 2, 3, 1, 1, initialW=w)
+            self.c2_1 = L.Convolution2D(ch // 2, ch // 1, 4, 2, 1, initialW=w)
+            self.c3_0 = L.Convolution2D(ch // 1, ch // 1, 3, 1, 1, initialW=w)
+            self.l4 = L.Linear(bottom_width * bottom_width * ch, 1, initialW=w)
+            self.bn0_1 = L.BatchNormalization(ch // 4, use_gamma=False)
+            self.bn1_0 = L.BatchNormalization(ch // 4, use_gamma=False)
+            self.bn1_1 = L.BatchNormalization(ch // 2, use_gamma=False)
+            self.bn2_0 = L.BatchNormalization(ch // 2, use_gamma=False)
+            self.bn2_1 = L.BatchNormalization(ch // 1, use_gamma=False)
+            self.bn3_0 = L.BatchNormalization(ch // 1, use_gamma=False)
+
+    def __call__(self, x):  # add_noise() runs before every leaky_relu
+        h = add_noise(x)
+        h = F.leaky_relu(add_noise(self.c0_0(h)))  # c0_0 has no batch norm
+        h = F.leaky_relu(add_noise(self.bn0_1(self.c0_1(h))))
+        h = F.leaky_relu(add_noise(self.bn1_0(self.c1_0(h))))
+        h = F.leaky_relu(add_noise(self.bn1_1(self.c1_1(h))))
+        h = F.leaky_relu(add_noise(self.bn2_0(self.c2_0(h))))
+        h = F.leaky_relu(add_noise(self.bn2_1(self.c2_1(h))))
+        h = F.leaky_relu(add_noise(self.bn3_0(self.c3_0(h))))
+        return self.l4(h)  # linear output; no activation is applied here
diff --git a/tensorboardX/examples/chainer/extension_logger/train_dcgan.py b/tensorboardX/examples/chainer/extension_logger/train_dcgan.py
new file mode 100644
index 0000000000000000000000000000000000000000..03bcfb6f9cc1d2a521412608ba4393d4133cbddd
--- /dev/null
+++ b/tensorboardX/examples/chainer/extension_logger/train_dcgan.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+import argparse
+import os
+
+import chainer
+from chainer import training
+from chainer.training import extensions
+
+from net import Discriminator
+from net import Generator
+from updater import DCGANUpdater
+from visualize import out_generated_image
+from tensorboardX import SummaryWriter
+from writetensorboard import LogTensorboard
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Chainer example: DCGAN')
+    parser.add_argument('--batchsize', '-b', type=int, default=50,
+                        help='Number of images in each mini-batch')
+    parser.add_argument('--epoch', '-e', type=int, default=1000,
+                        help='Number of sweeps over the dataset to train')
+    parser.add_argument('--gpu', '-g', type=int, default=-1,
+                        help='GPU ID (negative value indicates CPU)')
+    parser.add_argument('--dataset', '-i', default='',
+                        help='Directory of image files.  Default is cifar-10.')
+    parser.add_argument('--out', '-o', default='result',
+                        help='Directory to output the result')
+    parser.add_argument('--resume', '-r', default='',
+                        help='Resume the training from snapshot')
+    parser.add_argument('--n_hidden', '-n', type=int, default=100,
+                        help='Number of hidden units (z)')
+    parser.add_argument('--seed', type=int, default=0,
+                        help='Random seed of z at visualization stage')
+    parser.add_argument('--snapshot_interval', type=int, default=1000,
+                        help='Interval of snapshot')
+    parser.add_argument('--display_interval', type=int, default=100,
+                        help='Interval of displaying log to console')
+    args = parser.parse_args()
+
+    print('GPU: {}'.format(args.gpu))
+    print('# Minibatch-size: {}'.format(args.batchsize))
+    print('# n_hidden: {}'.format(args.n_hidden))
+    print('# epoch: {}'.format(args.epoch))
+    print('')
+    writer = SummaryWriter()
+    # Set up a neural network to train
+    gen = Generator(n_hidden=args.n_hidden)
+    dis = Discriminator()
+
+    if args.gpu >= 0:
+        # Make a specified GPU current
+        chainer.cuda.get_device_from_id(args.gpu).use()
+        gen.to_gpu()  # Copy the model to the GPU
+        dis.to_gpu()
+
+    # Setup an optimizer
+    def make_optimizer(model, alpha=0.0002, beta1=0.5):
+        optimizer = chainer.optimizers.Adam(alpha=alpha, beta1=beta1)
+        optimizer.setup(model)
+        optimizer.add_hook(chainer.optimizer.WeightDecay(0.0001), 'hook_dec')
+        return optimizer
+    opt_gen = make_optimizer(gen)
+    opt_dis = make_optimizer(dis)
+
+    if args.dataset == '':
+        # Load the CIFAR10 dataset if args.dataset is not specified
+        train, _ = chainer.datasets.get_cifar10(withlabel=False, scale=255.)
+    else:
+        all_files = os.listdir(args.dataset)
+        image_files = [f for f in all_files if ('png' in f or 'jpg' in f)]
+        print('{} contains {} image files'
+              .format(args.dataset, len(image_files)))
+        train = chainer.datasets\
+            .ImageDataset(paths=image_files, root=args.dataset)
+
+    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
+
+    # Set up a trainer
+    updater = DCGANUpdater(
+        models=(gen, dis),
+        iterator=train_iter,
+        optimizer={
+            'gen': opt_gen, 'dis': opt_dis},
+        device=args.gpu)
+    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
+
+    snapshot_interval = (args.snapshot_interval, 'iteration')
+    display_interval = (args.display_interval, 'iteration')
+    trainer.extend(
+        extensions.snapshot(filename='snapshot_iter_{.updater.iteration}.npz'),
+        trigger=snapshot_interval)
+    trainer.extend(extensions.snapshot_object(
+        gen, 'gen_iter_{.updater.iteration}.npz'), trigger=snapshot_interval)
+    trainer.extend(extensions.snapshot_object(
+        dis, 'dis_iter_{.updater.iteration}.npz'), trigger=snapshot_interval)
+    trainer.extend(extensions.LogReport(trigger=display_interval))
+    trainer.extend(LogTensorboard(trigger=display_interval, logger=writer))
+    trainer.extend(extensions.PrintReport([
+        'epoch', 'iteration', 'gen/loss', 'dis/loss',
+    ]), trigger=display_interval)
+    trainer.extend(extensions.ProgressBar(update_interval=10))
+    trainer.extend(
+        out_generated_image(
+            gen, dis,
+            10, 10, args.seed, args.out, writer),
+        trigger=snapshot_interval)
+
+    if args.resume:
+        # Resume from a snapshot
+        chainer.serializers.load_npz(args.resume, trainer)
+
+    # Run the training
+    trainer.run()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tensorboardX/examples/chainer/extension_logger/updater.py b/tensorboardX/examples/chainer/extension_logger/updater.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9fda48ab2fb36fb02875dafb92a96de14e3e92d
--- /dev/null
+++ b/tensorboardX/examples/chainer/extension_logger/updater.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+
+import chainer
+import chainer.functions as F
+from chainer import Variable
+
+
+class DCGANUpdater(chainer.training.StandardUpdater):
+
+    def __init__(self, *args, **kwargs):
+        self.gen, self.dis = kwargs.pop('models')
+        super(DCGANUpdater, self).__init__(*args, **kwargs)
+
+    def loss_dis(self, dis, y_fake, y_real):
+        batchsize = len(y_fake)
+        L1 = F.sum(F.softplus(-y_real)) / batchsize
+        L2 = F.sum(F.softplus(y_fake)) / batchsize
+        loss = L1 + L2
+        chainer.report({'loss': loss}, dis)
+        return loss
+
+    def loss_gen(self, gen, y_fake):
+        batchsize = len(y_fake)
+        loss = F.sum(F.softplus(-y_fake)) / batchsize
+        chainer.report({'loss': loss}, gen)
+        return loss
+
+    def update_core(self):
+        gen_optimizer = self.get_optimizer('gen')
+        dis_optimizer = self.get_optimizer('dis')
+
+        batch = self.get_iterator('main').next()
+        x_real = Variable(self.converter(batch, self.device)) / 255.
+        xp = chainer.cuda.get_array_module(x_real.data)
+
+        gen, dis = self.gen, self.dis
+        batchsize = len(batch)
+
+        y_real = dis(x_real)
+
+        z = Variable(xp.asarray(gen.make_hidden(batchsize)))
+        x_fake = gen(z)
+        y_fake = dis(x_fake)
+
+        dis_optimizer.update(self.loss_dis, dis, y_fake, y_real)
+        gen_optimizer.update(self.loss_gen, gen, y_fake)
diff --git a/tensorboardX/examples/chainer/extension_logger/visualize.py b/tensorboardX/examples/chainer/extension_logger/visualize.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b8421c42c3e5c3b712707ef5f81f9ab3c49aa1f
--- /dev/null
+++ b/tensorboardX/examples/chainer/extension_logger/visualize.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python
+
+import os
+
+import numpy as np
+from PIL import Image
+
+import chainer
+import chainer.cuda
+from chainer import Variable
+
+
+def out_generated_image(gen, dis, rows, cols, seed, dst, writer):
+    @chainer.training.make_extension()
+    def make_image(trainer):
+        np.random.seed(seed)
+        n_images = rows * cols
+        xp = gen.xp
+        z = Variable(xp.asarray(gen.make_hidden(n_images)))
+        with chainer.using_config('train', False):
+            x = gen(z)
+        writer.add_image('img', x, trainer.updater.iteration)
+
+    return make_image
diff --git a/tensorboardX/examples/chainer/extension_logger/writetensorboard.py b/tensorboardX/examples/chainer/extension_logger/writetensorboard.py
new file mode 100644
index 0000000000000000000000000000000000000000..a22e9e64e3548ba22d5f4f101c5b5f2c9965fb69
--- /dev/null
+++ b/tensorboardX/examples/chainer/extension_logger/writetensorboard.py
@@ -0,0 +1,129 @@
+import json
+import os
+import shutil
+import tempfile
+
+import six
+from chainer import reporter
+from chainer import serializer as serializer_module
+from chainer.training import extension
+from chainer.training import trigger as trigger_module
+
+
+class LogTensorboard(extension.Extension):
+
+    """Trainer extension to output the accumulated results to a log file.
+
+    This extension accumulates the observations of the trainer to
+    :class:`~chainer.DictSummary` at a regular interval specified by a supplied
+    trigger, and writes them into a log file in JSON format.
+
+    There are two triggers to handle this extension. One is the trigger to
+    invoke this extension, which is used to handle the timing of accumulating
+    the results. It is set to ``1, 'iteration'`` by default. The other is the
+    trigger to determine when to emit the result. When this trigger returns
+    True, this extension appends the summary of accumulated values to the list
+    of past summaries, and writes the list to the log file. Then, this
+    extension makes a new fresh summary object which is used until the next
+    time that the trigger fires.
+
+    It also adds some entries to each result dictionary.
+
+    - ``'epoch'`` and ``'iteration'`` are the epoch and iteration counts at the
+      output, respectively.
+    - ``'elapsed_time'`` is the elapsed time in seconds since the training
+      begins. The value is taken from :attr:`Trainer.elapsed_time`.
+
+    Args:
+        keys (iterable of strs): Keys of values to accumulate. If this is None,
+            all the values are accumulated and output to the log file.
+        trigger: Trigger that decides when to aggregate the result and output
+            the values. This is distinct from the trigger of this extension
+            itself. If it is a tuple in the form ``<int>, 'epoch'`` or
+            ``<int>, 'iteration'``, it is passed to :class:`IntervalTrigger`.
+        postprocess: Callback to postprocess the result dictionaries. Each
+            result dictionary is passed to this callback on the output. This
+            callback can modify the result dictionaries, which are used to
+            output to the log file.
+        log_name (str): Name of the log file under the output directory. It can
+            be a format string: the last result dictionary is passed for the
+            formatting. For example, users can use '{iteration}' to separate
+            the log files for different iterations. If the log name is None, it
+            does not output the log to any file.
+
+    """
+
+    def __init__(self, keys=None, trigger=(1, 'epoch'), postprocess=None,
+                 log_name='log', logger=None):
+        self._keys = keys
+        self._trigger = trigger_module.get_trigger(trigger)
+        self._postprocess = postprocess
+        self._log_name = log_name
+        self._log = []
+        self._logger = logger
+        self._init_summary()
+
+    def __call__(self, trainer):
+        # accumulate the observations
+        keys = self._keys
+        observation = trainer.observation
+        summary = self._summary
+
+        if keys is None:
+            summary.add(observation)
+        else:
+            summary.add({k: observation[k] for k in keys if k in observation})
+        for k, v in observation.items():
+            #self._logger.add_scalar(k, chainer.cuda.to_cpu(observation[k].data), trainer.updater.iteration)
+            self._logger.add_scalar(
+                k, observation[k], trainer.updater.iteration)
+        if self._trigger(trainer):
+            # output the result
+            stats = self._summary.compute_mean()
+            stats_cpu = {}
+            for name, value in six.iteritems(stats):
+                stats_cpu[name] = float(value)  # copy to CPU
+
+            updater = trainer.updater
+            stats_cpu['epoch'] = updater.epoch
+            stats_cpu['iteration'] = updater.iteration
+            stats_cpu['elapsed_time'] = trainer.elapsed_time
+
+            if self._postprocess is not None:
+                self._postprocess(stats_cpu)
+
+            self._log.append(stats_cpu)
+
+            # write to the log file
+            if self._log_name is not None:
+                log_name = self._log_name.format(**stats_cpu)
+                fd, path = tempfile.mkstemp(prefix=log_name, dir=trainer.out)
+                with os.fdopen(fd, 'w') as f:
+                    json.dump(self._log, f, indent=4)
+
+                new_path = os.path.join(trainer.out, log_name)
+                shutil.move(path, new_path)
+
+            # reset the summary for the next output
+            self._init_summary()
+
+    @property
+    def log(self):
+        """The current list of observation dictionaries."""
+        return self._log
+
+    def serialize(self, serializer):
+        if hasattr(self._trigger, 'serialize'):
+            self._trigger.serialize(serializer['_trigger'])
+
+        # Note that this serialization may lose some information of small
+        # numerical differences.
+        if isinstance(serializer, serializer_module.Serializer):
+            log = json.dumps(self._log)
+            serializer('_log', log)
+        else:
+            log = serializer('_log', '')
+            self._log = json.loads(log)
+
+    def _init_summary(self):
+        self._summary = reporter.DictSummary()
diff --git a/tensorboardX/examples/chainer/plain_logger/data.py b/tensorboardX/examples/chainer/plain_logger/data.py
new file mode 100644
index 0000000000000000000000000000000000000000..b30a4c3ce4cf9365a462e64e97221c0cf1e9d6ce
--- /dev/null
+++ b/tensorboardX/examples/chainer/plain_logger/data.py
@@ -0,0 +1,69 @@
+import gzip
+import os
+
+import numpy as np
+import six
+from six.moves.urllib import request
+
+parent = 'http://yann.lecun.com/exdb/mnist'
+train_images = 'train-images-idx3-ubyte.gz'
+train_labels = 'train-labels-idx1-ubyte.gz'
+test_images = 't10k-images-idx3-ubyte.gz'
+test_labels = 't10k-labels-idx1-ubyte.gz'
+num_train = 60000
+num_test = 10000
+dim = 784
+
+
+def load_mnist(images, labels, num):
+    data = np.zeros(num * dim, dtype=np.uint8).reshape((num, dim))
+    target = np.zeros(num, dtype=np.uint8).reshape((num, ))
+
+    with gzip.open(images, 'rb') as f_images,\
+            gzip.open(labels, 'rb') as f_labels:
+        f_images.read(16)
+        f_labels.read(8)
+        for i in six.moves.range(num):
+            target[i] = ord(f_labels.read(1))
+            for j in six.moves.range(dim):
+                data[i, j] = ord(f_images.read(1))
+
+    return data, target
+
+
+def download_mnist_data():
+    print('Downloading {:s}...'.format(train_images))
+    request.urlretrieve('{:s}/{:s}'.format(parent, train_images), train_images)
+    print('Done')
+    print('Downloading {:s}...'.format(train_labels))
+    request.urlretrieve('{:s}/{:s}'.format(parent, train_labels), train_labels)
+    print('Done')
+    print('Downloading {:s}...'.format(test_images))
+    request.urlretrieve('{:s}/{:s}'.format(parent, test_images), test_images)
+    print('Done')
+    print('Downloading {:s}...'.format(test_labels))
+    request.urlretrieve('{:s}/{:s}'.format(parent, test_labels), test_labels)
+    print('Done')
+
+    print('Converting training data...')
+    data_train, target_train = load_mnist(train_images, train_labels,
+                                          num_train)
+    print('Done')
+    print('Converting test data...')
+    data_test, target_test = load_mnist(test_images, test_labels, num_test)
+    mnist = {'data': np.append(data_train, data_test, axis=0),
+             'target': np.append(target_train, target_test, axis=0)}
+    print('Done')
+    print('Save output...')
+    with open('mnist.pkl', 'wb') as output:
+        six.moves.cPickle.dump(mnist, output, -1)
+    print('Done')
+    print('Convert completed')
+
+
+def load_mnist_data():
+    if not os.path.exists('mnist.pkl'):
+        download_mnist_data()
+    with open('mnist.pkl', 'rb') as mnist_pickle:
+        mnist = six.moves.cPickle.load(mnist_pickle)
+    return mnist
diff --git a/tensorboardX/examples/chainer/plain_logger/net.py b/tensorboardX/examples/chainer/plain_logger/net.py
new file mode 100644
index 0000000000000000000000000000000000000000..b299ebe4b49578d38996a4af5f58f25f2724943f
--- /dev/null
+++ b/tensorboardX/examples/chainer/plain_logger/net.py
@@ -0,0 +1,65 @@
+import six
+
+import chainer
+import chainer.functions as F
+from chainer.functions.loss.vae import gaussian_kl_divergence
+import chainer.links as L
+
+
+class VAE(chainer.Chain):
+    """Variational AutoEncoder"""
+
+    def __init__(self, n_in, n_latent, n_h):
+        super(VAE, self).__init__()
+        with self.init_scope():
+            # encoder
+            self.le1 = L.Linear(n_in, n_h)
+            self.le2_mu = L.Linear(n_h, n_latent)
+            self.le2_ln_var = L.Linear(n_h, n_latent)
+            # decoder
+            self.ld1 = L.Linear(n_latent, n_h)
+            self.ld2 = L.Linear(n_h, n_in)
+
+    def __call__(self, x, sigmoid=True):
+        """AutoEncoder"""
+        return self.decode(self.encode(x)[0], sigmoid)
+
+    def encode(self, x):
+        h1 = F.tanh(self.le1(x))
+        mu = self.le2_mu(h1)
+        ln_var = self.le2_ln_var(h1)  # log(sigma**2)
+        return mu, ln_var
+
+    def decode(self, z, sigmoid=True):
+        h1 = F.tanh(self.ld1(z))
+        h2 = self.ld2(h1)
+        if sigmoid:
+            return F.sigmoid(h2)
+        else:
+            return h2
+
+    def get_loss_func(self, C=1.0, k=1):
+        """Get loss function of VAE.
+
+        The loss value is equal to ELBO (Evidence Lower Bound)
+        multiplied by -1.
+
+        Args:
+            C (int): Usually this is 1.0. Can be changed to control the
+                second term of ELBO bound, which works as regularization.
+            k (int): Number of Monte Carlo samples used in encoded vector.
+        """
+        def lf(x):
+            mu, ln_var = self.encode(x)
+            batchsize = len(mu.data)
+            # reconstruction loss
+            rec_loss = 0
+            for l in six.moves.range(k):
+                z = F.gaussian(mu, ln_var)
+                rec_loss += F.bernoulli_nll(x, self.decode(z, sigmoid=False)) \
+                    / (k * batchsize)
+            self.rec_loss = rec_loss
+            self.loss = self.rec_loss + \
+                C * gaussian_kl_divergence(mu, ln_var) / batchsize
+            return self.loss
+        return lf
diff --git a/tensorboardX/examples/chainer/plain_logger/train_vae.py b/tensorboardX/examples/chainer/plain_logger/train_vae.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a5fd587f564674440f36ae66762f36f41b5a41a
--- /dev/null
+++ b/tensorboardX/examples/chainer/plain_logger/train_vae.py
@@ -0,0 +1,173 @@
+#!/usr/bin/env python
+"""Chainer example: train a VAE on MNIST
+"""
+from __future__ import print_function
+import argparse
+
+import matplotlib
+# Disable interactive backend
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+import numpy as np
+import six
+
+import chainer
+from chainer import computational_graph
+from chainer import cuda
+from chainer import optimizers
+from chainer import serializers
+from tensorboardX import SummaryWriter
+import data
+import net
+
+writer = SummaryWriter()  # created with its default settings
+# Command-line options of the VAE training script
+parser = argparse.ArgumentParser(description='Chainer example: MNIST')
+parser.add_argument('--initmodel', '-m', default='',
+                    help='Initialize the model from given file')
+parser.add_argument('--resume', '-r', default='',
+                    help='Resume the optimization from snapshot')
+parser.add_argument('--gpu', '-g', default=-1, type=int,
+                    help='GPU ID (negative value indicates CPU)')
+parser.add_argument('--epoch', '-e', default=100, type=int,
+                    help='number of epochs to learn')
+parser.add_argument('--dimz', '-z', default=20, type=int,
+                    help='dimention of encoded vector')
+parser.add_argument('--batchsize', '-b', type=int, default=100,
+                    help='learning minibatch size')
+parser.add_argument('--test', action='store_true',
+                    help='Use tiny datasets for quick tests')
+args = parser.parse_args()
+
+batchsize = args.batchsize
+n_epoch = args.epoch
+n_latent = args.dimz  # size of the latent vector handed to net.VAE
+
+writer.add_text('config', str(args))  # log the parsed options as text
+
+print('GPU: {}'.format(args.gpu))
+print('# dim z: {}'.format(args.dimz))
+print('# Minibatch-size: {}'.format(args.batchsize))
+print('# epoch: {}'.format(args.epoch))
+print('')
+
+# Prepare dataset: load MNIST and convert it to float32 data / int32 labels
+print('load MNIST dataset')
+mnist = data.load_mnist_data()
+mnist['data'] = mnist['data'].astype(np.float32)
+mnist['data'] /= 255  # rescale the uint8-derived pixel values by 1/255
+mnist['target'] = mnist['target'].astype(np.int32)
+
+if args.test:
+    mnist['data'] = mnist['data'][0:100]
+    mnist['target'] = mnist['target'][0:100]
+    N = 30  # tiny run: 30 training samples, the other 70 are used for test
+else:
+    N = 60000
+
+x_train, x_test = np.split(mnist['data'],   [N])  # first N rows are train
+y_train, y_test = np.split(mnist['target'], [N])
+N_test = y_test.size
+
+# Prepare VAE model, defined in net.py
+model = net.VAE(784, n_latent, 500)
+if args.gpu >= 0:
+    cuda.get_device_from_id(args.gpu).use()
+    model.to_gpu()
+xp = np if args.gpu < 0 else cuda.cupy  # array module matching the device
+
+# Setup optimizer
+optimizer = optimizers.Adam()
+optimizer.setup(model)
+
+# Init/Resume: optionally load model weights and/or optimizer state
+if args.initmodel:
+    print('Load model from', args.initmodel)
+    serializers.load_npz(args.initmodel, model)
+if args.resume:
+    print('Load optimizer state from', args.resume)
+    serializers.load_npz(args.resume, optimizer)
+
+# Learning loop: one training pass and one evaluation pass per epoch
+for epoch in six.moves.range(1, n_epoch + 1):
+    print('epoch', epoch)
+
+    # training on a fresh random permutation of the N training samples
+    perm = np.random.permutation(N)
+    sum_loss = 0       # total loss
+    sum_rec_loss = 0   # reconstruction loss
+    for i in six.moves.range(0, N, batchsize):
+        x = chainer.Variable(xp.asarray(x_train[perm[i:i + batchsize]]))
+        optimizer.update(model.get_loss_func(), x)
+        if epoch == 1 and i == 0:  # dump the graph once, on the first batch
+            with open('graph.dot', 'w') as o:
+                g = computational_graph.build_computational_graph(
+                    (model.loss, ))
+                o.write(g.dump())
+            print('graph generated')
+        writer.add_scalar('train/loss', model.loss, epoch * N + i)
+        writer.add_scalar('train/rec_loss', model.rec_loss, epoch * N + i)
+        sum_loss += float(model.loss.data) * len(x.data)
+        sum_rec_loss += float(model.rec_loss.data) * len(x.data)
+
+    print('train mean loss={}, mean reconstruction loss={}'
+          .format(sum_loss / N, sum_rec_loss / N))
+
+    # evaluation on the held-out samples, without building backprop graphs
+    sum_loss = 0
+    sum_rec_loss = 0
+    with chainer.no_backprop_mode():
+        for i in six.moves.range(0, N_test, batchsize):
+            x = chainer.Variable(xp.asarray(x_test[i:i + batchsize]))
+            loss_func = model.get_loss_func(k=10)
+            loss_func(x)  # called for its side effect: sets model.loss etc.
+            sum_loss += float(model.loss.data) * len(x.data)
+            sum_rec_loss += float(model.rec_loss.data) * len(x.data)
+            writer.add_scalar('test/loss', model.loss, epoch * N_test + i)
+            writer.add_scalar('test/rec_loss', model.rec_loss,
+                              epoch * N_test + i)
+            writer.add_image('reconstructed', model(
+                x).reshape(-1, 1, 28, 28), epoch * N_test + i)
+            writer.add_image('input', x.reshape(-1, 1, 28, 28),
+                             epoch * N_test + i)
+            del model.loss
+    print('test  mean loss={}, mean reconstruction loss={}'
+          .format(sum_loss / N_test, sum_rec_loss / N_test))
+
+
+# Save the model and the optimizer as npz files in the working directory
+print('save the model')
+serializers.save_npz('mlp.model', model)
+print('save the optimizer')
+serializers.save_npz('mlp.state', optimizer)
+
+model.to_cpu()  # the plotting code below feeds the model NumPy arrays
+
+
+# original images and reconstructed images
+def save_images(x, filename):
+    fig, ax = plt.subplots(3, 3, figsize=(9, 9), dpi=100)
+    for ai, xi in zip(ax.flatten(), x):
+        ai.imshow(xi.reshape(28, 28))
+    fig.savefig(filename)
+
+
+train_ind = [1, 3, 5, 10, 2, 0, 13, 15, 17]  # nine fixed training samples
+x = chainer.Variable(np.asarray(x_train[train_ind]))
+with chainer.no_backprop_mode():
+    x1 = model(x)
+save_images(x.data, 'train')
+save_images(x1.data, 'train_reconstructed')
+
+test_ind = [3, 2, 1, 18, 4, 8, 11, 17, 61]  # nine fixed test samples
+x = chainer.Variable(np.asarray(x_test[test_ind]))
+with chainer.no_backprop_mode():
+    x1 = model(x)
+save_images(x.data, 'test')
+save_images(x1.data, 'test_reconstructed')
+
+
+# draw images from randomly sampled z (nine standard-normal latent vectors)
+z = chainer.Variable(np.random.normal(0, 1, (9, n_latent)).astype(np.float32))
+x = model.decode(z)
+save_images(x.data, 'sampled')
diff --git a/tensorboardX/examples/demo.py b/tensorboardX/examples/demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..373169e9be487e315f8ef73c552895656c2c68a5
--- /dev/null
+++ b/tensorboardX/examples/demo.py
@@ -0,0 +1,89 @@
+import torch
+import torchvision.utils as vutils
+import numpy as np
+import torchvision.models as models
+from torchvision import datasets
+from tensorboardX import SummaryWriter
+import datetime
+
+resnet18 = models.resnet18(False)
+writer = SummaryWriter()
+sample_rate = 44100
+freqs = [262, 294, 330, 349, 392, 440, 440, 440, 440, 440, 440]
+
+true_positive_counts = [75, 64, 21, 5, 0]
+false_positive_counts = [150, 105, 18, 0, 0]
+true_negative_counts = [0, 45, 132, 150, 150]
+false_negative_counts = [0, 11, 54, 70, 75]
+precision = [0.3333333, 0.3786982, 0.5384616, 1.0, 0.0]
+recall = [1.0, 0.8533334, 0.28, 0.0666667, 0.0]
+
+
+for n_iter in range(100):
+    s1 = torch.rand(1)  # value to keep
+    s2 = torch.rand(1)
+    # data grouping by `slash`
+    writer.add_scalar('data/scalar_systemtime', s1[0], n_iter)
+    # same value again, but logged with an explicit walltime
+    writer.add_scalar('data/scalar_customtime', s1[0], n_iter, walltime=n_iter)
+    writer.add_scalars('data/scalar_group', {"xsinx": n_iter * np.sin(n_iter),
+                                             "xcosx": n_iter * np.cos(n_iter),
+                                             "arctanx": np.arctan(n_iter)}, n_iter)
+    x = torch.rand(32, 3, 64, 64)  # output from network
+    if n_iter % 10 == 0:
+        x = vutils.make_grid(x, normalize=True, scale_each=True)
+        writer.add_image('Image', x, n_iter)  # Tensor
+        writer.add_image_with_boxes('imagebox_label', torch.ones(3, 240, 240) * 0.5,
+             torch.Tensor([[10, 10, 100, 100], [101, 101, 200, 200]]),
+             n_iter,
+             labels=['abcde' + str(n_iter), 'fgh' + str(n_iter)])
+        # Two seconds of a cosine tone, computed in one vectorized pass instead
+        # of one tensor write per sample; cos keeps the amplitude in [-1, 1].
+        samples = np.arange(sample_rate * 2, dtype=np.float64)
+        x = torch.from_numpy(np.cos(
+            freqs[n_iter // 10] * np.pi * samples / float(sample_rate))).float()
+        writer.add_audio('myAudio', x, n_iter)
+        writer.add_text('Text', 'text logged at step:' + str(n_iter), n_iter)
+        writer.add_text('markdown Text', '''a|b\n-|-\nc|d''', n_iter)
+        for name, param in resnet18.named_parameters():
+            if 'bn' not in name:
+                writer.add_histogram(name, param, n_iter)
+        writer.add_pr_curve('xoxo', np.random.randint(2, size=100), np.random.rand(
+            100), n_iter)  # needs tensorboard 0.4RC or later
+        writer.add_pr_curve_raw('prcurve with raw data', true_positive_counts,
+                                false_positive_counts,
+                                true_negative_counts,
+                                false_negative_counts,
+                                precision,
+                                recall, n_iter)
+# export scalar data to JSON for external processing
+writer.export_scalars_to_json("./all_scalars.json")
+
+dataset = datasets.MNIST('mnist', train=False, download=True)
+images = dataset.test_data[:100].float()
+label = dataset.test_labels[:100]
+features = images.view(100, 784)
+writer.add_embedding(features, metadata=label, label_img=images.unsqueeze(1))
+writer.add_embedding(features, global_step=1, tag='noMetadata')
+dataset = datasets.MNIST('mnist', train=True, download=True)
+images_train = dataset.train_data[:100].float()
+labels_train = dataset.train_labels[:100]
+features_train = images_train.view(100, 784)
+
+all_features = torch.cat((features, features_train))
+all_labels = torch.cat((label, labels_train))
+all_images = torch.cat((images, images_train))
+dataset_label = ['test'] * 100 + ['train'] * 100
+all_labels = list(zip(all_labels, dataset_label))
+
+writer.add_embedding(all_features, metadata=all_labels, label_img=all_images.unsqueeze(1),
+                     metadata_header=['digit', 'dataset'], global_step=2)
+
+# VIDEO
+vid_images = dataset.train_data[:16 * 48]
+vid = vid_images.view(16, 48, 1, 28, 28)  # BxTxCxHxW
+
+writer.add_video('video', vid_tensor=vid)
+writer.add_video('video_1_fps', vid_tensor=vid, fps=1)
+
+writer.close()
diff --git a/tensorboardX/examples/demo_beholder.py b/tensorboardX/examples/demo_beholder.py
new file mode 100644
index 0000000000000000000000000000000000000000..067c6dc7c959b8a9996424707fa892db1d51693c
--- /dev/null
+++ b/tensorboardX/examples/demo_beholder.py
@@ -0,0 +1,56 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Simple MNIST classifier to demonstrate features of Beholder.
+
+Based on tensorflow/examples/tutorials/mnist/mnist_with_summaries.py.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import tensorboardX.beholder as beholder_lib
+import time
+
+from collections import namedtuple
+
+
+LOG_DIRECTORY = '/tmp/beholder-demo'
+tensor_and_name = namedtuple('tensor_and_name', 'tensor, name')
+
+
+def beholder_pytorch():
+    """Push 1000 updates of random arrays to Beholder, sleeping 0.1 s after each."""
+    for i in range(1000):
+        # Use `j` in the comprehensions: on Python 2 a list-comprehension
+        # variable leaks and would clobber the outer `i` printed below.
+        fake_param = [tensor_and_name(np.random.randn(128, 768, 3), 'test' + str(j))
+                      for j in range(5)]
+        arrays = [tensor_and_name(np.random.randn(128, 768, 3), 'test' + str(j))
+                  for j in range(5)]
+        beholder = beholder_lib.Beholder(logdir=LOG_DIRECTORY)
+        beholder.update(trainable=fake_param, arrays=arrays,
+                        frame=np.random.randn(128, 128))
+        time.sleep(0.1)
+        print(i)
+
+
+if __name__ == '__main__':
+    import os
+    if not os.path.exists(LOG_DIRECTORY):
+        os.makedirs(LOG_DIRECTORY)
+    print(LOG_DIRECTORY)
+    beholder_pytorch()
diff --git a/tensorboardX/examples/demo_caffe2.py b/tensorboardX/examples/demo_caffe2.py
new file mode 100644
index 0000000000000000000000000000000000000000..aa2c687c44744d9e6bfcc81aa356cc1da13e9313
--- /dev/null
+++ b/tensorboardX/examples/demo_caffe2.py
@@ -0,0 +1,273 @@
+try:
+    import caffe2.python.predictor.predictor_exporter as pe
+except ImportError:
+    print('Please check that Caffe2 is installed correctly to run this demo.')
+import numpy as np
+import os
+import shutil
+
+from caffe2.python import core, model_helper, workspace, brew
+from tensorboardX import TorchVis
+
+"""
+This is a demo showcasing specific functionality for Caffe2. Shown here are
+    add_scalar (with both raw numerical data and Caffe2 blob names)
+    add_scalars (with both raw numerical data and Caffe2 blob names)
+    add_graph (visualizing a Caffe2 model as a graph)
+
+NOTE: lmdb must be installed and enabled with -DUSE_LMDB=ON for this demo to work.
+"""
+
+# If you would like to see some really detailed initializations,
+# you can change --caffe2_log_level=0 to --caffe2_log_level=-1
+core.GlobalInit(['caffe2', '--caffe2_log_level=0'])
+print("Necessities imported!")
+
+
+# This section preps your image and test set in a lmdb database
+def DownloadResource(url, path):
+    '''Download the zip archive at url and extract it into path; raises on HTTP errors.'''
+    import requests
+    from six import BytesIO
+    import zipfile
+    print("Downloading... {} to {}".format(url, path))
+    r = requests.get(url, stream=True)
+    r.raise_for_status()  # fail here rather than trying to unzip an error page
+    zipfile.ZipFile(BytesIO(r.content)).extractall(path)
+    print("Completed download and extraction.")
+
+
+current_folder = os.path.join(os.path.expanduser('~'), 'caffe2_notebooks')
+data_folder = os.path.join(current_folder, 'tutorial_data', 'mnist')
+root_folder = os.path.join(current_folder, 'tutorial_files', 'tutorial_mnist')
+db_missing = False
+
+if not os.path.exists(data_folder):
+    os.makedirs(data_folder)
+    print("Your data folder was not found!! This was generated: {}".format(data_folder))
+
+# Look for existing database: lmdb
+if os.path.exists(os.path.join(data_folder, "mnist-train-nchw-lmdb")):
+    print("lmdb train db found!")
+else:
+    db_missing = True
+
+if os.path.exists(os.path.join(data_folder, "mnist-test-nchw-lmdb")):
+    print("lmdb test db found!")
+else:
+    db_missing = True
+
+# attempt the download of the db if either was missing
+if db_missing:
+    print("one or both of the MNIST lmdb dbs not found!!")
+    db_url = "http://download.caffe2.ai/databases/mnist-lmdb.zip"
+    try:
+        DownloadResource(db_url, data_folder)
+    except Exception:
+        print(
+            "Failed to download dataset. Please download it manually from {}".format(db_url))
+        print("Unzip it and place the two database folders here: {}".format(data_folder))
+        raise  # bare raise re-raises with the original traceback intact
+
+if os.path.exists(root_folder):
+    print("Looks like you ran this before, so we need to cleanup those old files...")
+    shutil.rmtree(root_folder)
+
+os.makedirs(root_folder)
+workspace.ResetWorkspace(root_folder)
+
+print("training data folder:" + data_folder)
+print("workspace root folder:" + root_folder)
+
+# END DATA PREPARATION #
+
+# Create TorchVis in preparation for writing. Default format is 'tensorboard'
+tv = TorchVis()
+
+
+def AddInput(model, batch_size, db, db_type):
+    # read (data_uint8, label) batches of batch_size from the given db
+    data_uint8, label = model.TensorProtosDBInput(
+        [], ["data_uint8", "label"], batch_size=batch_size,
+        db=db, db_type=db_type)
+    # cast the data to float
+    data = model.Cast(data_uint8, "data", to=core.DataType.FLOAT)
+    # scale data from [0,255] down to [0,1) (the factor is 1/256, not 1/255)
+    data = model.Scale(data, data, scale=float(1. / 256))
+    # don't need the gradient for the backward pass
+    data = model.StopGradient(data, data)
+    return data, label
+
+
+def AddLeNetModel(model, data):
+    '''
+    This part is the standard LeNet model: from data to the softmax prediction.
+
+    For each convolutional layer we specify dim_in - number of input channels
+    and dim_out - number of output channels. Also each Conv and MaxPool layer changes the
+    image size. For example, kernel of size 5 reduces each side of an image by 4.
+
+    When we have kernel and stride sizes equal to 2 in a MaxPool layer, it divides
+    each side in half.
+    '''
+    # Image size: 28 x 28 -> 24 x 24
+    conv1 = brew.conv(model, data, 'conv1', dim_in=1, dim_out=20, kernel=5)
+    # Image size: 24 x 24 -> 12 x 12
+    pool1 = brew.max_pool(model, conv1, 'pool1', kernel=2, stride=2)
+    # Image size: 12 x 12 -> 8 x 8
+    conv2 = brew.conv(model, pool1, 'conv2', dim_in=20, dim_out=100, kernel=5)
+    # Image size: 8 x 8 -> 4 x 4
+    pool2 = brew.max_pool(model, conv2, 'pool2', kernel=2, stride=2)
+    # 100 * 4 * 4 stands for dim_out from previous layer multiplied by the
+    # image size
+    fc3 = brew.fc(model, pool2, 'fc3', dim_in=100 * 4 * 4, dim_out=500)
+    relu = brew.relu(model, fc3, fc3)
+    pred = brew.fc(model, relu, 'pred', 500, 10)
+    softmax = brew.softmax(model, pred, 'softmax')
+    return softmax
+
+
+def AddAccuracy(model, softmax, label):
+    """Adds an accuracy op to the model"""
+    accuracy = brew.accuracy(model, [softmax, label], "accuracy")
+    return accuracy
+
+
+def AddTrainingOperators(model, softmax, label):
+    """Adds training operators to the model."""
+    xent = model.LabelCrossEntropy([softmax, label], 'xent')
+    # compute the expected loss
+    loss = model.AveragedLoss(xent, "loss")
+    # track the accuracy of the model
+    AddAccuracy(model, softmax, label)
+    # use the average loss we just computed to add gradient operators to the
+    # model
+    model.AddGradientOperators([loss])
+    # do a simple stochastic gradient descent
+    ITER = brew.iter(model, "iter")
+    # set the learning rate schedule
+    LR = model.LearningRate(
+        ITER, "LR", base_lr=-0.1, policy="step", stepsize=1, gamma=0.999)
+    # ONE is a constant value that is used in the gradient update. We only need
+    # to create it once, so it is explicitly placed in param_init_net.
+    ONE = model.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
+    # Now, for each parameter, we do the gradient updates.
+    for param in model.params:
+        # Note how we get the gradient of each parameter - ModelHelper keeps
+        # track of that.
+        param_grad = model.param_to_grad[param]
+        # The update is a simple weighted sum: param = param + param_grad * LR
+        model.WeightedSum([param, ONE, param_grad, LR], param)
+
+
+def AddBookkeepingOperators(model):
+    """This adds a few bookkeeping operators that we can inspect later.
+
+    These operators do not affect the training procedure: they only collect
+    statistics and print them to file or to logs.
+    """
+    # Print basically prints out the content of the blob. to_file=1 routes the
+    # printed output to a file. The file is going to be stored under
+    #     root_folder/[blob name]
+    model.Print('accuracy', [], to_file=1)
+    model.Print('loss', [], to_file=1)
+    # Summarizes the parameters. Different from Print, Summarize gives some
+    # statistics of the parameter, such as mean, std, min and max.
+    for param in model.params:
+        model.Summarize(param, [], to_file=1)
+        model.Summarize(model.param_to_grad[param], [], to_file=1)
+    # Now, if we really want to be verbose, we can summarize EVERY blob
+    # that the model produces; it is probably not a good idea, because that
+    # is going to take time - summarization does not come for free. For this
+    # demo, we will only show how to summarize the parameters and their
+    # gradients.
+
+
+arg_scope = {"order": "NCHW"}
+train_model = model_helper.ModelHelper(name="mnist_train", arg_scope=arg_scope)
+data, label = AddInput(
+    train_model, batch_size=64,
+    db=os.path.join(data_folder, 'mnist-train-nchw-lmdb'),
+    db_type='lmdb')
+softmax = AddLeNetModel(train_model, data)
+AddTrainingOperators(train_model, softmax, label)
+AddBookkeepingOperators(train_model)
+
+# Visualize the Caffe2 model in Tensorboard
+tv.add_graph(train_model, data)
+
+# Testing model. We will set the batch size to 100, so that the testing
+# pass is 100 iterations (10,000 images in total).
+# For the testing model, we need the data input part, the main LeNetModel
+# part, and an accuracy part. Note that init_params is set False because
+# we will be using the parameters obtained from the train model.
+test_model = model_helper.ModelHelper(
+    name="mnist_test", arg_scope=arg_scope, init_params=False)
+data, label = AddInput(
+    test_model, batch_size=100,
+    db=os.path.join(data_folder, 'mnist-test-nchw-lmdb'),
+    db_type='lmdb')
+softmax = AddLeNetModel(test_model, data)
+AddAccuracy(test_model, softmax, label)
+
+# Deployment model. We simply need the main LeNetModel part.
+deploy_model = model_helper.ModelHelper(
+    name="mnist_deploy", arg_scope=arg_scope, init_params=False)
+AddLeNetModel(deploy_model, "data")
+# You may wonder what happens with the param_init_net part of the deploy_model.
+# No, we will not use them, since during deployment time we will not randomly
+# initialize the parameters, but load the parameters from the db.
+
+with open(os.path.join(root_folder, "train_net.pbtxt"), 'w') as fid:
+    fid.write(str(train_model.net.Proto()))
+with open(os.path.join(root_folder, "train_init_net.pbtxt"), 'w') as fid:
+    fid.write(str(train_model.param_init_net.Proto()))
+with open(os.path.join(root_folder, "test_net.pbtxt"), 'w') as fid:
+    fid.write(str(test_model.net.Proto()))
+with open(os.path.join(root_folder, "test_init_net.pbtxt"), 'w') as fid:
+    fid.write(str(test_model.param_init_net.Proto()))
+with open(os.path.join(root_folder, "deploy_net.pbtxt"), 'w') as fid:
+    fid.write(str(deploy_model.net.Proto()))
+print("Protocol buffers files have been created in your root folder: " + root_folder)
+
+# The parameter initialization network only needs to be run once.
+workspace.RunNetOnce(train_model.param_init_net)
+# creating the network
+workspace.CreateNet(train_model.net, overwrite=True)
+# set the number of iterations and track the accuracy & loss
+total_iters = 200
+accuracy = np.zeros(total_iters)
+loss = np.zeros(total_iters)
+# Now, we will manually run the network for 200 iterations.
+for i in range(total_iters):
+    workspace.RunNet(train_model.net)
+    accuracy[i] = workspace.FetchBlob('accuracy')
+    loss[i] = workspace.FetchBlob('loss')
+    scalar_dict_raw = {'accuracy': accuracy[i], 'loss': loss[i]}
+    scalar_dict_blobname = {'accuracy': 'accuracy', 'loss': 'loss'}
+    # Can pass raw numerical data
+    tv.add_scalars('training_raw', scalar_dict_raw, i)
+    # Can also pass blobname corresponding to data, for fetching
+    tv.add_scalars('training_blobname', scalar_dict_blobname, i)
+
+data = workspace.FetchBlob('data')
+softmax = workspace.FetchBlob('softmax')
+
+# Convolutions for this mini-batch
+conv = workspace.FetchBlob('conv1')
+shape = list(conv.shape)
+shape[1] = 1
+# We can look into any channel. Think of it as a feature the model learned
+conv = conv[:, 15, :, :].reshape(shape)
+
+# run a test pass on the test net
+workspace.RunNetOnce(test_model.param_init_net)
+workspace.CreateNet(test_model.net, overwrite=True)
+test_accuracy = np.zeros(100)
+for i in range(100):
+    workspace.RunNet(test_model.net.Proto().name)
+    test_accuracy[i] = workspace.FetchBlob('accuracy')
+    tv.add_scalar('test_accuracy_raw', test_accuracy[i], i)
+    tv.add_scalar('test_accuracy_blobname', 'accuracy', i)
+# After the execution is done, let's plot the values.
+print('test_accuracy: %f' % test_accuracy.mean())
diff --git a/tensorboardX/examples/demo_custom_scalars.py b/tensorboardX/examples/demo_custom_scalars.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef78499ca769355dc3bfdec63f81a5338fcb7fc9
--- /dev/null
+++ b/tensorboardX/examples/demo_custom_scalars.py
@@ -0,0 +1,42 @@
+from numpy.random import rand
+from tensorboardX import SummaryWriter
+import time
+
+
+with SummaryWriter() as writer:
+    for n_iter in range(100):
+        writer.add_scalar('twse/0050', rand(), n_iter)
+        writer.add_scalar('twse/2330', rand(), n_iter)
+        t = rand()
+        writer.add_scalar('dow/aaa', t, n_iter)
+        writer.add_scalar('dow/bbb', t - 1, n_iter)
+        writer.add_scalar('dow/ccc', t + 1, n_iter)
+        writer.add_scalar('nasdaq/aaa', rand(), n_iter)
+        writer.add_scalar('nasdaq/bbb', rand(), n_iter)
+        writer.add_scalar('nasdaq/ccc', rand(), n_iter)
+
+    layout = {'Taiwan': {'twse': ['Multiline', ['twse/0050', 'twse/2330']]},
+              'USA': {'dow': ['Margin', ['dow/aaa', 'dow/bbb', 'dow/ccc']],
+                      'nasdaq': ['Margin', ['nasdaq/aaa', 'nasdaq/bbb', 'nasdaq/ccc']]}}
+    writer.add_custom_scalars(layout)
+#    writer.add_custom_scalars(layout) second call has no effect
+
+time.sleep(1)
+
+with SummaryWriter() as writer:
+    for n_iter in range(100):
+        writer.add_scalar('twse/0050', rand(), n_iter)
+        writer.add_scalar('twse/2330', rand(), n_iter)
+
+    writer.add_custom_scalars_multilinechart(['twse/0050', 'twse/2330'])
+
+time.sleep(1)
+
+with SummaryWriter() as writer:
+    for n_iter in range(100):
+        t = rand()
+        writer.add_scalar('dow/aaa', t, n_iter)
+        writer.add_scalar('dow/bbb', t - 1, n_iter)
+        writer.add_scalar('dow/ccc', t + 1, n_iter)
+
+    writer.add_custom_scalars_marginchart(['dow/aaa', 'dow/bbb', 'dow/ccc'])
diff --git a/tensorboardX/examples/demo_embedding.py b/tensorboardX/examples/demo_embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..77274d183317fc8abd6d3b06ca4283f5ce4c2252
--- /dev/null
+++ b/tensorboardX/examples/demo_embedding.py
@@ -0,0 +1,96 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import os
+from torch.autograd.variable import Variable
+from tensorboardX import SummaryWriter
+from torch.utils.data import TensorDataset, DataLoader
+
+# EMBEDDING VISUALIZATION FOR A TWO-CLASSES PROBLEM
+
+# just a bunch of layers
+
+
+class M(nn.Module):
+    def __init__(self):
+        super(M, self).__init__()
+        self.cn1 = nn.Conv2d(in_channels=1, out_channels=64, kernel_size=3)
+        self.cn2 = nn.Conv2d(in_channels=64, out_channels=32, kernel_size=3)
+        self.fc1 = nn.Linear(in_features=128, out_features=2)
+
+    def forward(self, i):
+        i = self.cn1(i)
+        i = F.relu(i)
+        i = F.max_pool2d(i, 2)
+        i = self.cn2(i)
+        i = F.relu(i)
+        i = F.max_pool2d(i, 2)
+        i = i.view(len(i), -1)
+        i = self.fc1(i)
+        i = F.log_softmax(i, dim=1)
+        return i
+
+# get some random data around value
+
+
+def get_data(value, shape):
+    data = torch.ones(shape) * value
+    # add some noise
+    data += torch.randn(shape)**2
+    return data
+
+
+# dataset
+# cat some data with different values
+data = torch.cat(
+    (get_data(
+        0, (100, 1, 14, 14)), get_data(
+            0.5, (100, 1, 14, 14))), 0)
+# labels
+labels = torch.cat((torch.zeros(100), torch.ones(100)), 0)
+# generator
+gen = DataLoader(TensorDataset(data, labels), batch_size=25, shuffle=True)
+# network
+m = M()
+#loss and optim
+loss = nn.NLLLoss()
+optimizer = torch.optim.Adam(params=m.parameters())
+# settings for train and log
+num_epochs = 20
+embedding_log = 5
+writer = SummaryWriter(comment='mnist_embedding_training')
+
+# TRAIN
+for epoch in range(num_epochs):
+    for j, sample in enumerate(gen):
+        n_iter = (epoch * len(gen)) + j
+        # reset grad
+        m.zero_grad()
+        optimizer.zero_grad()
+        # get batch data
+        data_batch = Variable(sample[0], requires_grad=True).float()
+        label_batch = Variable(sample[1], requires_grad=False).long()
+        # FORWARD
+        out = m(data_batch)
+        loss_value = loss(out, label_batch)
+        # BACKWARD
+        loss_value.backward()
+        optimizer.step()
+        # LOGGING
+        writer.add_scalar('loss', loss_value.data.item(), n_iter)
+
+        if j % embedding_log == 0:
+            print("loss_value:{}".format(loss_value.data.item()))
+            # we need 3 dimensions for the tensor to visualize it!
+            out = torch.cat((out.data, torch.ones(len(out), 1)), 1)
+            writer.add_embedding(
+                out,
+                metadata=label_batch.data,
+                label_img=data_batch.data,
+                global_step=n_iter)
+
+writer.close()
+
+# tensorboard --logdir runs
+# you should now see a dropdown list with all the timesteps;
+# the last timestep should show a visible separation between the two classes
diff --git a/tensorboardX/examples/demo_graph.py b/tensorboardX/examples/demo_graph.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa8362c9d79824cfd620f98e099802f1a853556a
--- /dev/null
+++ b/tensorboardX/examples/demo_graph.py
@@ -0,0 +1,273 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torchvision
+from torch.autograd import Variable
+from tensorboardX import SummaryWriter
+
+dummy_input = (torch.zeros(1, 3),)
+
+
+class LinearInLinear(nn.Module):
+    def __init__(self):
+        super(LinearInLinear, self).__init__()
+        self.l = nn.Linear(3, 5)
+
+    def forward(self, x):
+        return self.l(x)
+
+with SummaryWriter(comment='LinearInLinear') as w:
+    w.add_graph(LinearInLinear(), dummy_input, True)
+
+
+class MultipleInput(nn.Module):
+    def __init__(self):
+        super(MultipleInput, self).__init__()
+        self.Linear_1 = nn.Linear(3, 5)
+
+
+    def forward(self, x, y):
+        return self.Linear_1(x+y)
+
+with SummaryWriter(comment='MultipleInput') as w:
+    w.add_graph(MultipleInput(), (torch.zeros(1, 3), torch.zeros(1, 3)), True)
+
+class MultipleOutput(nn.Module):
+    def __init__(self):
+        super(MultipleOutput, self).__init__()
+        self.Linear_1 = nn.Linear(3, 5)
+        self.Linear_2 = nn.Linear(3, 7)
+
+    def forward(self, x):
+        return self.Linear_1(x), self.Linear_2(x)
+
+with SummaryWriter(comment='MultipleOutput') as w:
+    w.add_graph(MultipleOutput(), dummy_input, True)
+
+
+class MultipleOutput_shared(nn.Module):
+    def __init__(self):
+        super(MultipleOutput_shared, self).__init__()
+        self.Linear_1 = nn.Linear(3, 5)
+
+    def forward(self, x):
+        return self.Linear_1(x), self.Linear_1(x)
+
+with SummaryWriter(comment='MultipleOutput_shared') as w:
+    w.add_graph(MultipleOutput_shared(), dummy_input, True)
+
+
+class SimpleModel(nn.Module):
+    def __init__(self):
+        super(SimpleModel, self).__init__()
+
+    def forward(self, x):
+        return x * 2
+
+
+model = SimpleModel()
+dummy_input = (torch.zeros(1, 2, 3),)
+
+with SummaryWriter(comment='constantModel') as w:
+    w.add_graph(model, dummy_input, True)
+
+
+def conv3x3(in_planes, out_planes, stride=1):
+    """3x3 convolution with padding"""
+    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
+                     padding=1, bias=False)
+
+
+class BasicBlock(nn.Module):
+    expansion = 1
+
+    def __init__(self, inplanes, planes, stride=1, downsample=None):
+        super(BasicBlock, self).__init__()
+        self.conv1 = conv3x3(inplanes, planes, stride)
+        self.bn1 = nn.BatchNorm2d(planes)
+        # self.relu = nn.ReLU(inplace=True)
+        self.conv2 = conv3x3(planes, planes)
+        self.bn2 = nn.BatchNorm2d(planes)
+        self.stride = stride
+
+    def forward(self, x):
+        residual = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = F.relu(out)
+        out = self.conv2(out)
+        out = self.bn2(out)
+        out += residual
+        out = F.relu(out)
+        return out
+
+
+dummy_input = torch.rand(1, 3, 224, 224)
+
+with SummaryWriter(comment='basicblock') as w:
+    model = BasicBlock(3, 3)
+    w.add_graph(model, (dummy_input, ), verbose=True)
+
+
+
+
+class Net1(nn.Module):
+    def __init__(self):
+        super(Net1, self).__init__()
+        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
+        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
+        self.conv2_drop = nn.Dropout2d()
+        self.fc1 = nn.Linear(320, 50)
+        self.fc2 = nn.Linear(50, 10)
+        self.bn = nn.BatchNorm2d(20)
+
+    def forward(self, x):
+        x = F.max_pool2d(self.conv1(x), 2)
+        x = F.relu(x) + F.relu(-x)
+        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
+        x = self.bn(x)
+        x = x.view(-1, 320)
+        x = F.relu(self.fc1(x))
+        x = F.dropout(x, training=self.training)
+        x = self.fc2(x)
+        x = F.softmax(x, dim=1)
+        return x
+
+
+class Net2(nn.Module):
+    def __init__(self):
+        super(Net2, self).__init__()
+        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
+        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
+        self.conv2_drop = nn.Dropout2d()
+        self.fc1 = nn.Linear(320, 50)
+        self.fc2 = nn.Linear(50, 10)
+
+    def forward(self, x):
+        x = F.relu(F.max_pool2d(self.conv1(x), 2))
+        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
+        x = x.view(-1, 320)
+        x = F.relu(self.fc1(x))
+        x = F.dropout(x, training=self.training)
+        x = self.fc2(x)
+        x = F.log_softmax(x, dim=1)
+        return x
+
+
+dummy_input = Variable(torch.rand(13, 1, 28, 28))
+
+model = Net1()
+with SummaryWriter(comment='Net1') as w:
+    w.add_graph(model, (dummy_input, ))
+
+model = Net2()
+with SummaryWriter(comment='Net2') as w:
+    w.add_graph(model, (dummy_input, ))
+
+
+class SiameseNetwork(nn.Module):
+    def __init__(self):
+        super(SiameseNetwork, self).__init__()
+        self.cnn1 = Net1()
+
+    def forward_once(self, x):
+        output = self.cnn1(x)
+        return output
+
+    def forward(self, input1, input2):
+        output1 = self.forward_once(input1)
+        output2 = self.forward_once(input2)
+        return output1, output2
+
+model = SiameseNetwork()
+with SummaryWriter(comment='SiameseNetwork') as w:
+    w.add_graph(model, (dummy_input, dummy_input))
+
+
+dummy_input = torch.rand(1, 3, 224, 224)  # torch.Tensor(sizes) is uninitialized memory
+
+with SummaryWriter(comment='alexnet') as w:
+    model = torchvision.models.alexnet()
+    w.add_graph(model, (dummy_input, ))
+
+with SummaryWriter(comment='vgg19') as w:
+    model = torchvision.models.vgg19()
+    w.add_graph(model, (dummy_input, ))
+
+with SummaryWriter(comment='densenet121') as w:
+    model = torchvision.models.densenet121()
+    w.add_graph(model, (dummy_input, ))
+
+with SummaryWriter(comment='resnet18') as w:
+    model = torchvision.models.resnet18()
+    w.add_graph(model, (dummy_input, ))
+
+
+
+class RNN(nn.Module):
+    def __init__(self, input_size, hidden_size, output_size):
+        super(RNN, self).__init__()
+        self.hidden_size = hidden_size
+        self.i2h = nn.Linear(
+            n_categories +
+            input_size +
+            hidden_size,
+            hidden_size)
+        self.i2o = nn.Linear(
+            n_categories +
+            input_size +
+            hidden_size,
+            output_size)
+        self.o2o = nn.Linear(hidden_size + output_size, output_size)
+        self.dropout = nn.Dropout(0.1)
+        self.softmax = nn.LogSoftmax(dim=1)
+
+    def forward(self, category, input, hidden):
+        input_combined = torch.cat((category, input, hidden), 1)
+        hidden = self.i2h(input_combined)
+        output = self.i2o(input_combined)
+        output_combined = torch.cat((hidden, output), 1)
+        output = self.o2o(output_combined)
+        output = self.dropout(output)
+        output = self.softmax(output)
+        return output, hidden, input
+
+    def initHidden(self):
+        return torch.zeros(1, self.hidden_size)
+
+
+n_letters = 100
+n_hidden = 128
+n_categories = 10
+rnn = RNN(n_letters, n_hidden, n_categories)
+cat = torch.zeros(1, n_categories)  # zeros, not torch.Tensor(...): that is uninitialized
+dummy_input = torch.zeros(1, n_letters)
+hidden = torch.zeros(1, n_hidden)
+
+
+out, hidden, input = rnn(cat, dummy_input, hidden)
+with SummaryWriter(comment='RNN') as w:
+    w.add_graph(rnn, (cat, dummy_input, hidden), verbose=False)
+
+
+
+lstm = torch.nn.LSTM(3, 3)  # Input dim is 3, output dim is 3
+inputs = [torch.randn(1, 3) for _ in range(5)]  # make a sequence of length 5
+
+# initialize the hidden state.
+hidden = (torch.randn(1, 1, 3),
+          torch.randn(1, 1, 3))
+for i in inputs:
+    out, hidden = lstm(i.view(1, 1, -1), hidden)
+
+with SummaryWriter(comment='lstm') as w:
+    w.add_graph(lstm, (torch.randn(1, 3).view(1, 1, -1), hidden), verbose=True)
+
+
+import pytest
+print('expect error here:')
+with pytest.raises(Exception) as e_info:
+    dummy_input = torch.rand(1, 1, 224, 224)
+    with SummaryWriter(comment='basicblock_error') as w:
+        w.add_graph(model, (dummy_input, ))  # error
diff --git a/tensorboardX/examples/demo_hparams.py b/tensorboardX/examples/demo_hparams.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0b0742a933ed3e9cd6ef0bb1e182df5d9aaac13
--- /dev/null
+++ b/tensorboardX/examples/demo_hparams.py
@@ -0,0 +1,24 @@
+from tensorboardX import SummaryWriter
+import time
+import random
+
+
+hparam = {'lr': [0.1, 0.01, 0.001],
+          'bsize': [1, 2, 4],
+          'n_hidden': [100, 200]}
+
+metrics = {'accuracy', 'loss'}
+
+def train(lr, bsize, n_hidden):
+    x = random.random()
+    return x, x*5
+
+with SummaryWriter() as w:
+    for lr in hparam['lr']:
+        for bsize in hparam['bsize']:
+            for n_hidden in hparam['n_hidden']:
+                accu, loss = train(lr, bsize, n_hidden)
+                
+                w.add_hparams({'lr': lr, 'bsize': bsize, 'n_hidden': n_hidden},
+                                    {'accuracy': accu, 'loss': loss})
+
diff --git a/tensorboardX/examples/demo_matplotlib.py b/tensorboardX/examples/demo_matplotlib.py
new file mode 100644
index 0000000000000000000000000000000000000000..3357bb8cdf057dad3c215323206626355d40ffa0
--- /dev/null
+++ b/tensorboardX/examples/demo_matplotlib.py
@@ -0,0 +1,19 @@
+"""Draw two red circles with matplotlib and log the figure with add_figure."""
+import matplotlib.pyplot as plt
+plt.switch_backend('agg')
+
+fig = plt.figure()
+
+# Two circles of radius 0.2, centred at (0.2, 0.5) and (0.8, 0.5).
+c1, c2 = [plt.Circle((x, 0.5), 0.2, color='r') for x in (0.2, 0.8)]
+
+ax = plt.gca()
+for circle in (c1, c2):
+    ax.add_patch(circle)
+plt.axis('scaled')
+
+
+from tensorboardX import SummaryWriter
+writer = SummaryWriter()
+writer.add_figure('matplotlib', fig)
+writer.close()
diff --git a/tensorboardX/examples/demo_multiple_embedding.py b/tensorboardX/examples/demo_multiple_embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..255ff3abf8f18889e2756cc3c4f161c5c56ef726
--- /dev/null
+++ b/tensorboardX/examples/demo_multiple_embedding.py
@@ -0,0 +1,43 @@
+import math
+import numpy as np
+from tensorboardX import SummaryWriter
+
+
+def main():
+    """Write sin, cos and tan embeddings for 16 epochs with ``add_embedding``.
+
+    Every embedding has 3600 rows (labelled "0" to "3599") and five columns.
+    The columns use the angle multipliers 2, 3, 5, 7 and 11, and each epoch
+    adds a phase shift of ``epoch * 2 * pi / 16``.
+    """
+    degrees = np.linspace(0, 3600 * math.pi / 180.0, 3600).reshape(3600, 1)
+    labels = ["%d" % (i) for i in range(0, 3600)]
+    multipliers = (2, 3, 5, 7, 11)
+    # Written in this order (sin, cos, tan) within every epoch.
+    waves = (("sin", np.sin), ("cos", np.cos), ("tan", np.tan))
+
+    with SummaryWriter() as writer:
+        # Maybe make a bunch of data that's always shifted in some
+        # way, and that will be hard for PCA to turn into a sphere?
+
+        for epoch in range(0, 16):
+            shift = epoch * 2 * math.pi / 16.0
+            for tag, wave in waves:
+                columns = [wave(shift + degrees * k * math.pi / 180.0)
+                           for k in multipliers]
+                writer.add_embedding(
+                    mat=np.concatenate(columns, axis=1),
+                    metadata=labels,
+                    tag=tag,
+                    global_step=epoch)
+
+
+if __name__ == "__main__":
+    main()
+
+# tensorboard --logdir runs
+# Under "Projection", you should see
+#  48 tensors found, named
+#     cos:cos-00000 to cos:cos-00015
+#     sin:sin-00000 to sin:sin-00015
+#     tan:tan-00000 to tan:tan-00015
diff --git a/tensorboardX/examples/demo_nvidia_smi.py b/tensorboardX/examples/demo_nvidia_smi.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed4fe1c218e0335cb0e09c5a7cb89954986291bd
--- /dev/null
+++ b/tensorboardX/examples/demo_nvidia_smi.py
@@ -0,0 +1,26 @@
+"""
+Write GPU utilization and GPU memory usage of an NVIDIA card as scalars.
+"""
+from tensorboardX import SummaryWriter
+import time
+import torch
+try:
+    import nvidia_smi
+    nvidia_smi.nvmlInit()
+    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)  # gpu0
+except ImportError:
+    # Only a missing nvidia_smi module is handled here.
+    print('This demo needs nvidia-ml-py or nvidia-ml-py3')
+    exit()
+
+
+with SummaryWriter() as writer:
+    # Every new tensor stays referenced from this list for the whole run.
+    held_tensors = []
+    for n_iter in range(50):
+        held_tensors.append(torch.Tensor(1000, 1000).cuda())
+        utilization = nvidia_smi.nvmlDeviceGetUtilizationRates(handle)
+        writer.add_scalar('nv/gpu', utilization.gpu, n_iter)
+        memory = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
+        writer.add_scalar('nv/gpu_mem', memory.used, n_iter)
+        time.sleep(0.1)
diff --git a/tensorboardX/examples/demo_onnx.py b/tensorboardX/examples/demo_onnx.py
new file mode 100644
index 0000000000000000000000000000000000000000..427efaca87624c3fccf3b1e96be9a4ab153d08de
--- /dev/null
+++ b/tensorboardX/examples/demo_onnx.py
@@ -0,0 +1,22 @@
+"""Download the MNIST model from the ONNX zoo and log its graph."""
+from tensorboardX import SummaryWriter
+
+import subprocess
+zoo_address = 'https://onnxzoo.blob.core.windows.net/models/opset_8/mnist/mnist.tar.gz'
+
+# -nc: skip the download when mnist.tar.gz is already present.
+res = subprocess.call(['wget', '-nc', zoo_address])
+if res != 0:
+    raise RuntimeError('cannot download example onnx model from the zoo')
+# Extract only mnist/model.onnx, into examples/.
+res = subprocess.call(['tar', 'xf', 'mnist.tar.gz', '-C', 'examples/', 'mnist/model.onnx'])
+# The tar exit status used to be ignored; fail here instead of later in
+# add_onnx_graph when the model file is missing.
+if res != 0:
+    raise RuntimeError('cannot extract mnist/model.onnx from mnist.tar.gz')
+
+
+
+with SummaryWriter() as w:
+    w.add_onnx_graph('examples/mnist/model.onnx')
+    # w.add_onnx_graph('/Users/dexter/Downloads/resnet50/model.onnx')
diff --git a/tensorboardX/examples/demo_purge.py b/tensorboardX/examples/demo_purge.py
new file mode 100644
index 0000000000000000000000000000000000000000..68dece208bce3a369974c57278e90a5119cc12b0
--- /dev/null
+++ b/tensorboardX/examples/demo_purge.py
@@ -0,0 +1,16 @@
+"""Show the effect of ``purge_step`` when a log directory is reopened."""
+from time import sleep
+from tensorboardX import SummaryWriter
+
+# First writer: log the value ``step`` at every step from 0 to 99.
+with SummaryWriter(logdir='runs/purge') as w:
+    for step in range(100):
+        w.add_scalar('purgetest', step, step)
+
+sleep(1.0)
+
+# Second writer on the same directory: log the constant 42 for steps 42 to 99.
+with SummaryWriter(logdir='runs/purge', purge_step=42) as w:
+    # event 42~99 are removed (inclusively)
+    for step in range(42, 100):
+        w.add_scalar('purgetest', 42, step)
diff --git a/tensorboardX/examples/tensorboardX b/tensorboardX/examples/tensorboardX
new file mode 100644
index 0000000000000000000000000000000000000000..f87fda4faca602870065cdaf9276f66adeac1098
--- /dev/null
+++ b/tensorboardX/examples/tensorboardX
@@ -0,0 +1 @@
+../tensorboardX/
\ No newline at end of file
diff --git a/tensorboardX/screenshots/Demo.gif b/tensorboardX/screenshots/Demo.gif
new file mode 100644
index 0000000000000000000000000000000000000000..36156f70ab880ed3827793883fca3302864b4dd1
Binary files /dev/null and b/tensorboardX/screenshots/Demo.gif differ
diff --git a/tensorboardX/screenshots/audio.png b/tensorboardX/screenshots/audio.png
new file mode 100644
index 0000000000000000000000000000000000000000..c5d869288a16390a41e48d422dd174891ee28833
Binary files /dev/null and b/tensorboardX/screenshots/audio.png differ
diff --git a/tensorboardX/screenshots/distribution.png b/tensorboardX/screenshots/distribution.png
new file mode 100644
index 0000000000000000000000000000000000000000..27b754fc53a8ebf2756dffda43f220541f1bd4b4
Binary files /dev/null and b/tensorboardX/screenshots/distribution.png differ
diff --git a/tensorboardX/screenshots/embedding.png b/tensorboardX/screenshots/embedding.png
new file mode 100644
index 0000000000000000000000000000000000000000..977160fe3f158ad0058db345108712a71f7ad29f
Binary files /dev/null and b/tensorboardX/screenshots/embedding.png differ
diff --git a/tensorboardX/screenshots/graph.png b/tensorboardX/screenshots/graph.png
new file mode 100644
index 0000000000000000000000000000000000000000..08df6355c028f9a8dce448de2ec6ef052e82ef21
Binary files /dev/null and b/tensorboardX/screenshots/graph.png differ
diff --git a/tensorboardX/screenshots/histogram.png b/tensorboardX/screenshots/histogram.png
new file mode 100644
index 0000000000000000000000000000000000000000..3cada45083a34fa2659221dacbb7e7eebc5dff32
Binary files /dev/null and b/tensorboardX/screenshots/histogram.png differ
diff --git a/tensorboardX/screenshots/image.png b/tensorboardX/screenshots/image.png
new file mode 100644
index 0000000000000000000000000000000000000000..bf0fee3d087ca9c52499f15014932120f04f0a0b
--- /dev/null
+++ b/tensorboardX/screenshots/image.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9d5942de0f04c4b906d0607bce2024e317a6e7ef70059af28b9b0b25e69636ea
+size 1077074
diff --git a/tensorboardX/screenshots/scalar.png b/tensorboardX/screenshots/scalar.png
new file mode 100644
index 0000000000000000000000000000000000000000..fcdea9aedcadb1dae5c0b7c285a25b775499e9d1
Binary files /dev/null and b/tensorboardX/screenshots/scalar.png differ
diff --git a/tensorboardX/screenshots/text.png b/tensorboardX/screenshots/text.png
new file mode 100644
index 0000000000000000000000000000000000000000..1c1ac7a320c785ecfc2114ee993fa09c4fc6da73
Binary files /dev/null and b/tensorboardX/screenshots/text.png differ
diff --git a/tensorboardX/setup.cfg b/tensorboardX/setup.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..5bb011148f27024fefa5d213cfca6467fc2e7494
--- /dev/null
+++ b/tensorboardX/setup.cfg
@@ -0,0 +1,5 @@
+[metadata]
+license_file = LICENSE
+
+[bdist_wheel]
+universal = 1
diff --git a/tensorboardX/setup.py b/tensorboardX/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..1b0bc96e241a9a7c864dce4171ddee71810d762b
--- /dev/null
+++ b/tensorboardX/setup.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import subprocess
+import os
+import sys
+from setuptools import setup, find_packages
+from setuptools.command.develop import develop
+from setuptools.command.install import install
+
+
+# Dynamically compile protos
+def compileProtoBuf():
+    """Run ``./compile.sh`` with bash.
+
+    Raises:
+        RuntimeError: if the script exits with a non-zero status.
+    """
+    res = subprocess.call(['bash', './compile.sh'])
+    # Explicit raise rather than ``assert``: asserts are skipped under
+    # ``python -O``, which would hide a failed compile.
+    if res != 0:
+        raise RuntimeError('cannot compile protobuf')
+
+
+class PostDevelopCommand(develop):
+    """Compile the protobuf files, then run the regular ``develop`` command."""
+    def run(self):
+        compileProtoBuf()
+        develop.run(self)
+
+
+class PostInstallCommand(install):
+    """Compile the protobuf files, install the runtime dependencies with
+    pip, then run the regular ``install`` command."""
+    def run(self):
+        compileProtoBuf()
+        # Use the pip of the interpreter that runs this script instead of
+        # whichever ``pip`` is first on PATH. As before, the exit status is
+        # not checked (best effort; install_requires lists the same packages).
+        subprocess.call([sys.executable, '-m', 'pip', 'install',
+                         'protobuf', 'numpy', 'six'])
+        install.run(self)
+
+
+with open('HISTORY.rst') as history_file:
+    history = history_file.read()
+
+preparing_PyPI_package = False
+version_git = version = '1.8'
+
+if not preparing_PyPI_package:
+    # Development build: tag the version with the short git commit hash.
+    if os.path.exists('.git'):
+        sha = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode('ascii').strip()
+        version_git = version_git + '+' + sha[:7]
+
+    # NOTE: opened in append mode, so every run of this script adds another
+    # ``__version__`` line to the package's __init__.py.
+    with open('tensorboardX/__init__.py', 'a') as f:
+        f.write('\n__version__ = "{}"\n'.format(version_git))
+
+requirements = [
+    'numpy',
+    'protobuf >= 3.6.1',
+    'six',
+]
+
+test_requirements = [
+    'pytest',
+    'matplotlib',
+    'crc32c',
+]
+
+setup(
+    name='tensorboardX',
+    version=version_git,
+    description='TensorBoardX lets you watch Tensors Flow without Tensorflow',
+    long_description=history,
+    author='Tzu-Wei Huang',
+    author_email='huang.dexter@gmail.com',
+    url='https://github.com/lanpa/tensorboardX',
+    packages=['tensorboardX'],
+    include_package_data=True,
+    install_requires=requirements,
+    license='MIT license',
+    zip_safe=False,
+    classifiers=[
+        'Development Status :: 2 - Pre-Alpha',
+        'Intended Audience :: Developers',
+        'License :: OSI Approved :: MIT License',
+        'Natural Language :: English',
+        'Programming Language :: Python :: 2',
+        'Programming Language :: Python :: 2.7',
+        'Programming Language :: Python :: 3',
+        'Programming Language :: Python :: 3.4',
+        'Programming Language :: Python :: 3.5',
+        'Programming Language :: Python :: 3.6',
+    ],
+    cmdclass={
+        'develop': PostDevelopCommand,
+        'install': PostInstallCommand,
+    },
+    test_suite='tests',
+    tests_require=test_requirements
+)
+
+
+# checklist: update History.rst readme.md
+# change preparing_PyPI_package to True
+# remove __version__ = "1.old" in __init__.py
+# commit
+# add tag
+# python setup.py sdist bdist_wheel --universal
+# twine upload dist/*
+# push commit
\ No newline at end of file
diff --git a/tensorboardX/tensorboardX.patch b/tensorboardX/tensorboardX.patch
new file mode 100644
index 0000000000000000000000000000000000000000..d9ae329d4ef490dbf884ac254ddbdde51fad38ae
--- /dev/null
+++ b/tensorboardX/tensorboardX.patch
@@ -0,0 +1,63 @@
+diff --git a/tensorboardX/summary.py b/tensorboardX/summary.py
+index 27d99ea..f5bf234 100644
+--- a/tensorboardX/summary.py
++++ b/tensorboardX/summary.py
+@@ -373,36 +373,24 @@ def make_video(tensor, fps):
+ 
+ def audio(tag, tensor, sample_rate=44100):
+     tensor = make_np(tensor)
+-    tensor = tensor.squeeze()
+     if abs(tensor).max() > 1:
+         print('warning: audio amplitude out of range, auto clipped.')
+         tensor = tensor.clip(-1, 1)
+-    assert(tensor.ndim == 1), 'input tensor should be 1 dimensional.'
+-
+-    tensor_list = [int(32767.0 * x) for x in tensor]
++    assert(tensor.ndim == 2), 'input tensor should be 2 dimensional.'
++    length_frames, num_channels = tensor.shape
++    assert num_channels == 1 or num_channels == 2, f'Expected 1/2 channels, got {num_channels}'
++    import soundfile
+     import io
+-    import wave
+-    import struct
+-    fio = io.BytesIO()
+-    Wave_write = wave.open(fio, 'wb')
+-    Wave_write.setnchannels(1)
+-    Wave_write.setsampwidth(2)
+-    Wave_write.setframerate(sample_rate)
+-    tensor_enc = b''
+-    tensor_enc += struct.pack("<" + "h" * len(tensor_list), *tensor_list)
+-
+-    Wave_write.writeframes(tensor_enc)
+-    Wave_write.close()
+-    audio_string = fio.getvalue()
+-    fio.close()
++    with io.BytesIO() as fio:
++        soundfile.write(fio, tensor, samplerate=sample_rate, format='wav')
++        audio_string = fio.getvalue()
+     audio = Summary.Audio(sample_rate=sample_rate,
+-                          num_channels=1,
+-                          length_frames=len(tensor_list),
++                          num_channels=num_channels,
++                          length_frames=length_frames,
+                           encoded_audio_string=audio_string,
+                           content_type='audio/wav')
+     return Summary(value=[Summary.Value(tag=tag, audio=audio)])
+ 
+-
+ def custom_scalars(layout):
+     categoriesnames = layout.keys()
+     categories = []
+diff --git a/tensorboardX/writer.py b/tensorboardX/writer.py
+index 06337a7..58d57a1 100644
+--- a/tensorboardX/writer.py
++++ b/tensorboardX/writer.py
+@@ -716,7 +716,7 @@ class SummaryWriter(object):
+             sample_rate (int): sample rate in Hz
+             walltime (float): Optional override default walltime (time.time()) of event
+         Shape:
+-            snd_tensor: :math:`(1, L)`. The values should lie between [-1, 1].
++            snd_tensor: :math:`(L, c)`. The values should lie between [-1, 1].
+         """
+         if self._check_caffe2_blob(snd_tensor):
+             snd_tensor = workspace.FetchBlob(snd_tensor)
diff --git a/tensorboardX/tensorboardX/__init__.py b/tensorboardX/tensorboardX/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..23c052f7880f6e16b5e9523681c0fc1c18a70e13
--- /dev/null
+++ b/tensorboardX/tensorboardX/__init__.py
@@ -0,0 +1,8 @@
+"""A module for visualization with tensorboard
+"""
+
+from .record_writer import RecordWriter
+from .torchvis import TorchVis
+from .writer import FileWriter, SummaryWriter
+
+__version__ = "1.8"  # will be overwritten if run setup.py
diff --git a/tensorboardX/tensorboardX/beholder/__init__.py b/tensorboardX/tensorboardX/beholder/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..26390b34a38baf85d7cd2da5e8d614e7dba40dc3
--- /dev/null
+++ b/tensorboardX/tensorboardX/beholder/__init__.py
@@ -0,0 +1,16 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .beholder import Beholder
+from .beholder import BeholderHook
diff --git a/tensorboardX/tensorboardX/beholder/beholder.py b/tensorboardX/tensorboardX/beholder/beholder.py
new file mode 100644
index 0000000000000000000000000000000000000000..89a337f9b38432a9e0e56ca7f3658bfb80a3cec2
--- /dev/null
+++ b/tensorboardX/tensorboardX/beholder/beholder.py
@@ -0,0 +1,270 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from ..proto.summary_pb2 import Summary
+from ..proto.summary_pb2 import SummaryMetadata
+from ..proto.tensor_pb2 import TensorProto
+from ..proto.tensor_shape_pb2 import TensorShapeProto
+
+import os
+import time
+
+import numpy as np
+# import tensorflow as tf
+
+# from tensorboard.plugins.beholder import im_util
+# from . import im_util
+from .file_system_tools import read_pickle,\
+    write_pickle, write_file
+from .shared_config import PLUGIN_NAME, TAG_NAME,\
+    SUMMARY_FILENAME, DEFAULT_CONFIG, CONFIG_FILENAME, SUMMARY_COLLECTION_KEY_NAME, SECTION_INFO_FILENAME
+from . import video_writing
+# from .visualizer import Visualizer
+
+
+class Beholder(object):
+    '''Write frames and recordings for the beholder plugin.
+
+    All files are written below ``<logdir>/plugins/<PLUGIN_NAME>``: the
+    pickled config, section statistics, the latest frame as a serialized
+    ``Summary`` and, while recording, frames passed to the video writer.
+    '''
+
+    def __init__(self, logdir):
+        '''Create the plugin directory, video writer and default config.
+
+        Args:
+            logdir: base log directory (joined with '/plugins/' and
+                PLUGIN_NAME by string concatenation).
+        '''
+        self.PLUGIN_LOGDIR = logdir + '/plugins/' + PLUGIN_NAME
+
+        self.is_recording = False
+        self.video_writer = video_writing.VideoWriter(
+            self.PLUGIN_LOGDIR,
+            outputs=[video_writing.FFmpegVideoOutput, video_writing.PNGVideoOutput])
+
+        self.last_image_shape = []
+        self.last_update_time = time.time()
+        self.config_last_modified_time = -1
+        self.previous_config = dict(DEFAULT_CONFIG)
+
+        config_path = '{}/{}'.format(self.PLUGIN_LOGDIR, CONFIG_FILENAME)
+        if not os.path.exists(config_path):
+            # The directory can exist without a config file; an unconditional
+            # os.makedirs would raise in that case.
+            if not os.path.isdir(self.PLUGIN_LOGDIR):
+                os.makedirs(self.PLUGIN_LOGDIR)
+            write_pickle(DEFAULT_CONFIG, config_path)
+
+        # self.visualizer = Visualizer(self.PLUGIN_LOGDIR)
+
+    def _get_config(self):
+        '''Returns the config, re-reading the file only when its mtime changed.'''
+        filename = '{}/{}'.format(self.PLUGIN_LOGDIR, CONFIG_FILENAME)
+        modified_time = os.path.getmtime(filename)
+
+        if modified_time != self.config_last_modified_time:
+            config = read_pickle(filename, default=self.previous_config)
+            self.previous_config = config
+        else:
+            config = self.previous_config
+
+        self.config_last_modified_time = modified_time
+        return config
+
+    def _write_summary(self, frame):
+        '''Writes the frame to disk as a tensor summary.
+
+        Args:
+            frame: array with at least three dimensions; its first three
+                sizes are stored as the tensor shape and its values are
+                flattened into ``float_val``.
+        '''
+        path = '{}/{}'.format(self.PLUGIN_LOGDIR, SUMMARY_FILENAME)
+        smd = SummaryMetadata()
+        tensor = TensorProto(
+            dtype='DT_FLOAT',
+            float_val=frame.reshape(-1).tolist(),
+            tensor_shape=TensorShapeProto(
+                dim=[TensorShapeProto.Dim(size=frame.shape[0]),
+                     TensorShapeProto.Dim(size=frame.shape[1]),
+                     TensorShapeProto.Dim(size=frame.shape[2])]
+            )
+        )
+        summary = Summary(value=[Summary.Value(
+            tag=TAG_NAME, metadata=smd, tensor=tensor)]).SerializeToString()
+        write_file(summary, path)
+
+    @staticmethod
+    def stats(tensor_and_name):
+        '''Returns one statistics dict per (array, name) pair.
+
+        Each array is indexed up to ``shape[2]``, so it needs three or more
+        dimensions. All values except 'height' and 'name' are strings.
+        '''
+        imgstats = []
+        for (img, name) in tensor_and_name:
+            immax = img.max()
+            immin = img.min()
+            imgstats.append(
+                {
+                    'height': img.shape[0],
+                    'max': str(immax),
+                    'mean': str(img.mean()),
+                    'min': str(immin),
+                    'name': name,
+                    'range': str(immax - immin),
+                    'shape': str((img.shape[1], img.shape[2]))
+                })
+        return imgstats
+
+    def _get_final_image(self, config, trainable=None, arrays=None, frame=None):
+        '''Selects the image for this update according to config['values'].
+
+        'frames' uses ``frame`` unchanged; 'arrays' and 'trainable_variables'
+        concatenate the arrays of the matching (array, name) list and write
+        their statistics to the section-info pickle.
+
+        Raises:
+            ValueError: if config['values'] is none of the three modes.
+        '''
+        mode = config['values']
+        if mode == 'frames':
+            # print('===frames===')
+            final_image = frame
+        elif mode == 'arrays':
+            # print('===arrays===')
+            final_image = np.concatenate([arr for arr, _ in arrays])
+            stat = self.stats(arrays)
+            write_pickle(
+                stat, '{}/{}'.format(self.PLUGIN_LOGDIR, SECTION_INFO_FILENAME))
+        elif mode == 'trainable_variables':
+            # print('===trainable===')
+            final_image = np.concatenate([arr for arr, _ in trainable])
+            stat = self.stats(trainable)
+            write_pickle(
+                stat, '{}/{}'.format(self.PLUGIN_LOGDIR, SECTION_INFO_FILENAME))
+        else:
+            # Previously fell through and failed on an unbound local below.
+            raise ValueError('unknown config value: {!r}'.format(mode))
+        if len(final_image.shape) == 2:  # Map grayscale images to 3D tensors.
+            final_image = np.expand_dims(final_image, -1)
+
+        return final_image
+
+    def _enough_time_has_passed(self, FPS):
+        '''For limiting how often frames are computed.'''
+        if FPS == 0:
+            return False
+        else:
+            earliest_time = self.last_update_time + (1.0 / FPS)
+            return time.time() >= earliest_time
+
+    def _update_frame(self, trainable, arrays, frame, config):
+        '''Builds the final image, writes it as a summary and returns it.'''
+        final_image = self._get_final_image(config, trainable, arrays, frame)
+        self._write_summary(final_image)
+        self.last_image_shape = final_image.shape
+
+        return final_image
+
+    def _update_recording(self, frame, config):
+        '''Adds a frame to the current video output.'''
+        # pylint: disable=redefined-variable-type
+        should_record = config['is_recording']
+
+        if should_record:
+            if not self.is_recording:
+                self.is_recording = True
+                # print() does no %-formatting on its own (unlike logging).
+                output_name = self.video_writer.current_output().name()
+                print('Starting recording using %s' % (output_name,))
+            self.video_writer.write_frame(frame)
+        elif self.is_recording:
+            self.is_recording = False
+            self.video_writer.finish()
+            print('Finished recording')
+
+    # TODO: blanket try and except for production? I don't want someone's script
+    #       to die after weeks of running because of a visualization.
+    def update(self, trainable=None, arrays=None, frame=None):
+        '''Creates a frame and writes it to disk.
+
+        Args:
+            trainable: a list of namedtuple (tensors, name).
+            arrays: a list of namedtuple (tensors, name).
+            frame: a single array, used as the image in 'frames' mode.
+        '''
+
+        new_config = self._get_config()
+        # NOTE: ``True or`` short-circuits, so the FPS throttle is disabled
+        # and every call writes a frame.
+        if True or self._enough_time_has_passed(self.previous_config['FPS']):
+            # self.visualizer.update(new_config)
+            self.last_update_time = time.time()
+            final_image = self._update_frame(
+                trainable, arrays, frame, new_config)
+            self._update_recording(final_image, new_config)
+
+    ##############################################################################
+    # @staticmethod
+    # def gradient_helper(optimizer, loss, var_list=None):
+    #   '''A helper to get the gradients out at each step.
+
+    #   Args:
+    #     optimizer: the optimizer op.
+    #     loss: the op that computes your loss value.
+
+    #   Returns: the gradient tensors and the train_step op.
+    #   '''
+    #   if var_list is None:
+    #     var_list = tf.trainable_variables()
+
+    #   grads_and_vars = optimizer.compute_gradients(loss, var_list=var_list)
+    #   grads = [pair[0] for pair in grads_and_vars]
+
+    #   return grads, optimizer.apply_gradients(grads_and_vars)
+
+
+# implements pytorch backward later
+class BeholderHook():
+    '''Placeholder class with no behaviour yet.'''
+    pass
+    # """SessionRunHook implementation that runs Beholder every step.
+
+    # Convenient when using tf.train.MonitoredSession:
+    # ```python
+    # beholder_hook = BeholderHook(LOG_DIRECTORY)
+    # with MonitoredSession(..., hooks=[beholder_hook]) as sess:
+    #   sess.run(train_op)
+    # ```
+    # """
+    # def __init__(self, logdir):
+    #   """Creates new Hook instance
+
+    #   Args:
+    #     logdir: Directory where Beholder should write data.
+    #   """
+    #   self._logdir = logdir
+    #   self.beholder = None
+
+    # def begin(self):
+    #   self.beholder = Beholder(self._logdir)
+
+    # def after_run(self, run_context, unused_run_values):
+    #   self.beholder.update(run_context.session)
diff --git a/tensorboardX/tensorboardX/beholder/file_system_tools.py b/tensorboardX/tensorboardX/beholder/file_system_tools.py
new file mode 100644
index 0000000000000000000000000000000000000000..b565abd73759e498ae98728ae6db90ca125c75d2
--- /dev/null
+++ b/tensorboardX/tensorboardX/beholder/file_system_tools.py
@@ -0,0 +1,54 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import pickle
+
+# import tensorflow as tf
+# from google.protobuf import message
+
+
+def write_file(contents, path, mode='wb'):
+    """Write ``contents`` to ``path``, opening the file with ``mode``."""
+    with open(path, mode) as new_file:
+        new_file.write(contents)
+
+
+def write_pickle(obj, path):
+    """Pickle ``obj`` into the file at ``path``, replacing its contents."""
+    with open(path, 'wb') as new_file:
+        pickle.dump(obj, new_file)
+
+
+def read_pickle(path, default=None):
+    """Load and return the pickled object stored at ``path``.
+
+    Args:
+        path: file to read.
+        default: value returned when the file cannot be opened or
+            unpickled. When it is None, the error is re-raised instead.
+
+    NOTE: ``pickle.load`` can run arbitrary code; only read trusted files.
+    """
+    try:
+        with open(path, 'rb') as pickle_file:
+            return pickle.load(pickle_file)
+    except (IOError, EOFError, ValueError, pickle.UnpicklingError):
+        # ``default`` used to be accepted but ignored; honour it here.
+        if default is None:
+            raise
+        return default
diff --git a/tensorboardX/tensorboardX/beholder/shared_config.py b/tensorboardX/tensorboardX/beholder/shared_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..b987a50cfb7955d5f11a23794ad8b3c56ebeb52e
--- /dev/null
+++ b/tensorboardX/tensorboardX/beholder/shared_config.py
@@ -0,0 +1,40 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+PLUGIN_NAME = 'beholder'
+TAG_NAME = 'beholder-frame'
+SUMMARY_FILENAME = 'frame.summary'
+CONFIG_FILENAME = 'config.pkl'
+SECTION_INFO_FILENAME = 'section-info.pkl'
+SUMMARY_COLLECTION_KEY_NAME = 'summaries_beholder'
+
+DEFAULT_CONFIG = {
+    'values': 'trainable_variables',
+    'mode': 'variance',
+    'scaling': 'layer',
+    'window_size': 15,
+    'FPS': 10,
+    'is_recording': False,
+    'show_all': False,
+    'colormap': 'magma'
+}
+
+SECTION_HEIGHT = 128
+IMAGE_WIDTH = 512 + 256
+
+TB_WHITE = 245
diff --git a/tensorboardX/tensorboardX/beholder/video_writing.py b/tensorboardX/tensorboardX/beholder/video_writing.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b35e6ccfe1b46b96a5a608880cbbeb8c7731d0e
--- /dev/null
+++ b/tensorboardX/tensorboardX/beholder/video_writing.py
@@ -0,0 +1,201 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+import os
+import subprocess
+import time
+
+import numpy as np
+
+
+class VideoWriter(object):
+    """Video file writer that can use different output types.
+
+    Each VideoWriter instance writes video files to a specified directory, using
+    the first available VideoOutput from the provided list.
+    """
+
+    def __init__(self, directory, outputs):
+        """Keep only the outputs whose `available()` returns a true value.
+
+        Args:
+            directory: Directory handed to each output when it is constructed.
+            outputs: Output classes to try, in order of preference.
+
+        Raises:
+            IOError: If none of `outputs` reports itself available.
+        """
+        self.directory = directory
+        # Filter to the available outputs
+        self.outputs = [out for out in outputs if out.available()]
+        if not self.outputs:
+            raise IOError('No available video outputs')
+        self.output_index = 0
+        self.output = None
+        self.frame_shape = None
+
+    def current_output(self):
+        """Return the output class at the current fallback position."""
+        return self.outputs[self.output_index]
+
+    def write_frame(self, np_array):
+        """Write one frame, falling back to later outputs on I/O failure.
+
+        Args:
+            np_array: Frame to write; only its `shape` attribute is read here,
+                the object itself is passed to the output's `emit_frame`.
+
+        Raises:
+            IOError: If every remaining output failed with IOError/OSError.
+        """
+        # Reset whenever we encounter a new frame shape.
+        if self.frame_shape != np_array.shape:
+            if self.output:
+                self.output.close()
+            self.output = None
+            self.frame_shape = np_array.shape
+            # The shape is a tuple, so wrap it to format it as a single value.
+            print('Starting video with frame shape: %s' % (self.frame_shape,))
+        # Write the frame, advancing across output types as necessary.
+        original_output_index = self.output_index
+        for self.output_index in range(original_output_index, len(self.outputs)):
+            try:
+                if not self.output:
+                    new_output = self.outputs[self.output_index]
+                    if self.output_index > original_output_index:
+                        print('Falling back to video output %s'
+                              % new_output.name())
+                    self.output = new_output(self.directory, self.frame_shape)
+                self.output.emit_frame(np_array)
+                return
+            except (IOError, OSError) as e:
+                print('Video output type %s not available: %s'
+                      % (self.current_output().name(), str(e)))
+                if self.output:
+                    self.output.close()
+                self.output = None
+        raise IOError('Exhausted available video outputs')
+
+    def finish(self):
+        """Close the active output and reset state for a new video."""
+        if self.output:
+            self.output.close()
+        self.output = None
+        self.frame_shape = None
+        # Reconsider failed outputs when video is manually restarted.
+        self.output_index = 0
+
+
+class VideoOutput(object):
+    """Interface shared by the video outputs that VideoWriter can drive."""
+
+    __metaclass__ = abc.ABCMeta
+
+    @classmethod
+    def name(cls):
+        """Return the class name of this output."""
+        return cls.__name__
+
+    # Would add @abc.abstractmethod in python 3.3+
+    @classmethod
+    def available(cls):
+        """Report whether the output can be used; the base version raises."""
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def close(self):
+        """Finish the output; must be provided by subclasses."""
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def emit_frame(self, np_array):
+        """Write a single frame; must be provided by subclasses."""
+        raise NotImplementedError()
+
+
+class PNGVideoOutput(VideoOutput):
+    """Video output that stores every frame as its own PNG file on disk."""
+
+    @classmethod
+    def available(cls):
+        # This output always reports itself as usable.
+        return True
+
+    def __init__(self, directory, frame_shape):
+        """Create a timestamp-named frame directory under `directory`."""
+        del frame_shape  # unused
+        stamp = time.time()
+        self.directory = directory + '/video-frames-{}'.format(stamp)
+        self.frame_num = 0
+        os.makedirs(self.directory)
+
+    def emit_frame(self, np_array):
+        """Save `np_array`, cast to uint8, as the next numbered PNG."""
+        pixels = np_array.astype(np.uint8)
+        target = self.directory + '/{:05}.png'.format(self.frame_num)
+        self._write_image(pixels, target)
+        self.frame_num += 1
+
+    def _write_image(self, im, filename):
+        # PIL is imported here, at the point where a frame is written.
+        from PIL import Image
+        image = Image.fromarray(im)
+        image.save(filename)
+
+    def close(self):
+        """No-op: this class keeps no open handle between frames."""
+        pass
+
+
+class FFmpegVideoOutput(VideoOutput):
+    """Video output implemented by streaming to FFmpeg with .webm output."""
+
+    @classmethod
+    def available(cls):
+        """Return True when running `ffmpeg -version` succeeds."""
+        # Silently check if ffmpeg is available.
+        try:
+            with open(os.devnull, 'wb') as devnull:
+                subprocess.check_call(
+                    ['ffmpeg', '-version'], stdout=devnull, stderr=devnull)
+            return True
+        except (OSError, subprocess.CalledProcessError):
+            return False
+
+    def __init__(self, directory, frame_shape):
+        """Start an ffmpeg process that encodes raw frames read from stdin.
+
+        Args:
+            directory: Directory in which the timestamp-named .webm is written.
+            frame_shape: Rank-3 shape; index 0 and 1 are passed to ffmpeg as
+                height and width, index 2 is the channel count (1 or 3).
+
+        Raises:
+            ValueError: If `frame_shape` is not rank 3 or the channel count is
+                neither 1 nor 3.
+        """
+        self.filename = directory + '/video-{}.webm'.format(time.time())
+        if len(frame_shape) != 3:
+            raise ValueError(
+                'Expected rank-3 array for frame, got %s' % str(frame_shape))
+        # Set input pixel format based on channel count.
+        if frame_shape[2] == 1:
+            pix_fmt = 'gray'
+        elif frame_shape[2] == 3:
+            pix_fmt = 'rgb24'
+        else:
+            raise ValueError('Unsupported channel count %d' % frame_shape[2])
+
+        command = [
+            'ffmpeg',
+            '-y',  # Overwrite output
+            # Input options - raw video file format and codec.
+            '-f', 'rawvideo',
+            '-vcodec', 'rawvideo',
+            # Width x height.
+            '-s', '%dx%d' % (frame_shape[1], frame_shape[0]),
+            '-pix_fmt', pix_fmt,
+            '-r', '15',  # Frame rate: arbitrarily use 15 frames per second.
+            '-i', '-',  # Use stdin.
+            '-an',  # No audio.
+            # Output options - use lossless VP9 codec inside .webm.
+            '-vcodec', 'libvpx-vp9',
+            '-lossless', '1',
+            # Using YUV is most compatible, though conversion from RGB skews colors.
+            '-pix_fmt', 'yuv420p',
+            self.filename
+        ]
+        PIPE = subprocess.PIPE
+        # NOTE(review): stdout/stderr are piped but only drained by
+        # communicate() in _handle_error/close; confirm ffmpeg cannot fill
+        # these pipes during a long recording.
+        self.ffmpeg = subprocess.Popen(
+            command, stdin=PIPE, stdout=PIPE, stderr=PIPE)
+
+    def _handle_error(self):
+        """Collect ffmpeg's stderr and print it between separator bars."""
+        _, stderr = self.ffmpeg.communicate()
+        if stderr is not None and not isinstance(stderr, str):
+            # Pipes opened without text mode yield bytes on Python 3; decode
+            # so the message is readable instead of a bytes repr.
+            stderr = stderr.decode('utf-8', 'replace')
+        bar = '=' * 40
+        print('Error writing to FFmpeg:\n{}\n{}\n{}'.format(bar, stderr, bar))
+
+    def emit_frame(self, np_array):
+        """Write the raw bytes of `np_array` to ffmpeg's stdin.
+
+        Raises:
+            IOError: If writing to the ffmpeg process fails.
+        """
+        try:
+            self.ffmpeg.stdin.write(np_array.tobytes())
+            self.ffmpeg.stdin.flush()
+        except IOError:
+            self._handle_error()
+            raise IOError('Failure invoking FFmpeg')
+
+    def close(self):
+        """Let ffmpeg finish and drop the process handle; safe to repeat."""
+        if self.ffmpeg is None:
+            # Already closed.
+            return
+        if self.ffmpeg.poll() is None:
+            # Close stdin and consume and discard stderr/stdout.
+            self.ffmpeg.communicate()
+        self.ffmpeg = None
diff --git a/tensorboardX/tensorboardX/caffe2_graph.py b/tensorboardX/tensorboardX/caffe2_graph.py
new file mode 100644
index 0000000000000000000000000000000000000000..0162dcec1194a47ada92e81ff6a6f12f96ea0ba7
--- /dev/null
+++ b/tensorboardX/tensorboardX/caffe2_graph.py
@@ -0,0 +1,819 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import copy
+import logging
+import os
+import re
+import six
+
+from builtins import bytes
+from caffe2.proto import caffe2_pb2
+from caffe2.python import core, workspace
+
+from .proto.graph_pb2 import GraphDef
+from .proto.node_def_pb2 import NodeDef
+from .proto.tensor_shape_pb2 import TensorShapeProto
+
+
+def _make_unique_name(seen, name, min_version=0):
+    '''
+    Return a variant of 'name' that is absent from 'seen' and record it there.
+    Used for SSA.
+
+    Args:
+        seen (set): Names already in use; the returned name is added to it.
+        name (string): The name to make unique
+        min_version (number): First numeric suffix to try. When it is 0 the
+            bare name is tried first; the suffix is incremented until the
+            candidate is not in 'seen'.
+
+    Returns:
+        candidate (string): 'name' or 'name_<n>', whichever was free first.
+    '''
+    assert name is not None
+    version = min_version
+    candidate = name if not version else '%s_%d' % (name, version)
+    while candidate in seen:
+        version += 1
+        candidate = '%s_%d' % (name, version)
+    seen.add(candidate)
+    return candidate
+
+
+def _rename_tensorflow_style(shapes, blob_name_tracker, ops):
+    '''
+    Rewrite common Caffe2 name fragments into Tensorflow-like scoped names.
+    NOTE: The fragments on both sides are hardcoded below; if either naming
+        convention changes, this table has to change with it.
+
+    Args:
+        shapes: Dictionary mapping blob names to their shapes/dimensions.
+        blob_name_tracker: Dictionary of all unique blob names (with respect to
+            some context).
+        ops: List of Caffe2 operators
+
+    Returns:
+        None. The _rename_all() call modifies blob_name_tracker and ops in-place.
+    '''
+    # (pattern, replacement) pairs; they are applied in exactly this order.
+    substitutions = [
+        (re.compile(r"(_w)$"), '/weight'),
+        (re.compile(r"(_w_)"), '/weight_'),
+        (re.compile(r"(_bn)$"), '/batchnorm'),
+        (re.compile(r"(_bn_)"), '/batchnorm_'),
+        (re.compile(r"(_b)$"), '/bias'),
+        (re.compile(r"(_b_)"), '/bias_'),
+        (re.compile(r"(_s)$"), '/scale'),
+        (re.compile(r"(_s_)"), '/scale_'),
+        (re.compile(r"(_sum)$"), '/sum'),
+        (re.compile(r"(_sum_)"), '/sum_'),
+        (re.compile(r"(_branch)"), '/branch'),
+    ]
+
+    def tf_style(name):
+        for pattern, replacement in substitutions:
+            name = pattern.sub(replacement, name)
+        return name
+    _rename_all(shapes, blob_name_tracker, ops, tf_style)
+
+
+def _convert_to_ssa(shapes, blob_name_tracker, ops):
+    '''
+    Convert an operator graph to SSA (i.e. out-of-place).
+    i.e. blobs will be renamed so that each blob is produced only once.
+
+    Args:
+        shapes: Dictionary mapping blob names to their shapes/dimensions.
+        blob_name_tracker: Dictionary of all unique blob names (with respect to
+            some context).
+        ops: List of Caffe2 operators
+
+    Returns:
+        None. Modifies shapes, blob_name_tracker and ops in-place.
+    '''
+    # ir.ssa is iterated alongside ops below; each entry exposes the op and
+    # its input/output version maps.
+    ir = core.IR(ops)
+    # Names already handed out by ssa_name.
+    seen = set()
+    # Cache: (original name, version) -> SSA name.
+    versioned = {}
+    # Replacement contents for 'shapes' and 'blob_name_tracker', keyed by the
+    # new SSA names.
+    new_shapes = {}
+    new_blob_name_tracker = {}
+
+    def ssa_name(name, versions):
+        # Return the SSA name for 'name' at the version given in 'versions',
+        # creating and caching it on first use.
+        assert name in versions
+        version = versions[name]
+        if (name, version) in versioned:
+            return versioned[(name, version)]
+        # Always setting name2 = `{name}_{version}` would work, but we also try
+        # to avoid a trailing `_0`, so we have to be careful not to introduce
+        # name collisions, such as (foo_1, 0) = foo_1 = (foo, 1).
+        # Note: operator names (if any) will be handled later.
+        new_name = _make_unique_name(seen, name, min_version=version)
+        versioned[(name, version)] = new_name
+        # Transfer shape.
+        if name in shapes:
+            new_shapes[new_name] = shapes[name]
+        if blob_name_tracker and name in blob_name_tracker:
+            new_blob_name_tracker[new_name] = blob_name_tracker[name]
+        return new_name
+
+    for (op, ssa) in zip(ops, ir.ssa):
+        assert op is ssa.op
+        # Snapshot the names, empty the repeated fields, then refill them
+        # with SSA names (inputs are resolved before outputs).
+        inputs = list(op.input)
+        outputs = list(op.output)
+        del op.input[:]
+        del op.output[:]
+        op.input.extend(ssa_name(name, ssa.in_versions) for name in inputs)
+        op.output.extend(ssa_name(name, ssa.out_versions) for name in outputs)
+
+    # Swap in the SSA-keyed dictionaries while keeping the caller's objects.
+    shapes.clear()
+    shapes.update(new_shapes)
+    # An empty or None tracker is left untouched.
+    if blob_name_tracker:
+        blob_name_tracker.clear()
+        blob_name_tracker.update(new_blob_name_tracker)
+
+
+def _get_blob_names(ops):
+    '''
+    Collect the distinct names of all blobs read or written by 'ops'.
+
+    Args:
+        ops: List of Caffe2 operators to extract inputs and outputs from
+
+    Returns:
+        Dictionary mapping each distinct input/output blob name to itself.
+    '''
+    unique_names = set()
+    for op in ops:
+        for blob_list in (op.input, op.output):
+            unique_names.update(blob_list)
+    return dict((name, name) for name in unique_names)
+
+
+def _remap_keys(old_dict, rename_fn):
+    '''
+    Replace every key of 'old_dict' with 'rename_fn(key)', keeping the values.
+
+    Args:
+        old_dict: Dictionary (i.e. containing blob_name -> blob_name
+            relationships.)
+        rename_fn: Function string -> string for renaming.
+
+    Returns:
+        None. Modifies old_dict in-place.
+    '''
+    remapped = {}
+    for key, value in six.iteritems(old_dict):
+        # If two keys map to the same new key, the later one wins.
+        remapped[rename_fn(key)] = value
+    old_dict.clear()
+    old_dict.update(remapped)
+
+
+def _rename_all(shapes, blob_name_tracker, ops, rename_fn):
+    '''
+    Rename all the names in the operators.
+
+    Args:
+        shapes: Dictionary mapping blob names to their shapes/dimensions.
+        blob_name_tracker: Dictionary of all unique blob names (with respect to
+            some context).
+        ops: List of Caffe2 operators
+        rename_fn: Function string -> string that specifies how to rename
+
+    Returns:
+        None. Modifies shapes, blob_name_tracker and ops in-place using the
+            specified 'rename_fn'.
+    '''
+    # Names already produced by g, and the old -> new mapping it has built.
+    seen = set()
+    renamed = {}
+
+    def g(name):
+        """ Collision-free version of rename_fn.
+
+        Returns None for None, reuses the cached result for a name seen
+        before, and otherwise makes rename_fn(name) unique against 'seen'.
+        """
+        if name is None:
+            return None
+        if name in renamed:
+            return renamed[name]
+        new_name = _make_unique_name(seen, rename_fn(name))
+        renamed[name] = new_name
+        return new_name
+
+    for op in ops:
+        # Snapshot, clear, then refill the blob name lists with renamed values.
+        inputs = list(op.input)
+        outputs = list(op.output)
+        del op.input[:]
+        del op.output[:]
+        op.input.extend(g(name) for name in inputs)
+        op.output.extend(g(name) for name in outputs)
+
+    # Dictionary keys go through the same mapping as the blob names above.
+    _remap_keys(shapes, g)
+    # An empty or None tracker is skipped.
+    if blob_name_tracker:
+        _remap_keys(blob_name_tracker, g)
+    # Rename all operator names (if any) independently so that the
+    # unique-fication happens only once in _fill_missing_operator_names().
+    seen.clear()
+    renamed.clear()
+    for op in ops:
+        op.name = g(op.name)
+
+
+def _add_gradient_scope(shapes, blob_name_tracker, ops):
+    """
+    Prefix every operator or blob name that contains "_grad" with the
+    "GRADIENTS/" scope.
+    Note: breaks graph execution since the blob -> gradient mapping is
+    hardcoded.
+
+    Args:
+        shapes: Dictionary mapping blob names to their shapes/dimensions.
+        blob_name_tracker: Dictionary of all unique blob names (with respect to
+            some context).
+        ops: List of Caffe2 operators
+
+    Returns:
+        None. Modifies shapes, blob_name_tracker and ops in-place by renaming.
+    """
+    def scoped(name):
+        return 'GRADIENTS/{}'.format(name) if '_grad' in name else name
+    _rename_all(shapes, blob_name_tracker, ops, scoped)
+
+
+def _replace_colons(shapes, blob_name_tracker, ops, repl):
+    '''
+    `:i` has a special meaning in Tensorflow, so every ':' in a name is
+    swapped for 'repl' to avoid any possible conflicts.
+
+    Args:
+        shapes: Dictionary mapping blob names to their shapes/dimensions.
+        blob_name_tracker: Dictionary of all unique blob names (with respect to
+            some context).
+        ops: List of Caffe2 operators
+        repl: String representing the text to replace ':' with. Usually this is
+            '$'.
+
+    Returns:
+        None. The renaming is applied in-place through _rename_all().
+
+    '''
+    def without_colons(name):
+        # Splitting on ':' and joining with repl replaces every colon.
+        return repl.join(name.split(':'))
+    _rename_all(shapes, blob_name_tracker, ops, without_colons)
+
+
+def _fill_missing_operator_names(ops):
+    '''
+    Make sure every operator has a unique name.
+    We expect C2 operators to be generally unnamed. An unnamed operator gets
+    a scope (the common prefix of the directory parts of its outputs, or of
+    its inputs when it has no outputs) joined with its type. Duplicates will
+    be postfixed by an index.
+
+    Args:
+        ops: List of Caffe2 operators to assign names to.
+
+    Returns:
+        None: Modifies 'ops' in-place.
+    '''
+    taken = set()
+    # Make sure operator names don't collide with blobs.
+    for op in ops:
+        taken.update(op.input)
+        taken.update(op.output)
+    for op in ops:
+        base = op.name
+        if not base:
+            blob_names = op.output or op.input
+            if blob_names:
+                dirs = [os.path.dirname(blob) for blob in blob_names]
+                base = os.path.join(os.path.commonprefix(dirs), op.type)
+            else:
+                base = op.type
+        assert(base)
+        op.name = _make_unique_name(taken, base)
+
+
+def _tf_device(device_option):
+    '''
+    Translate a Caffe2 device option into a Tensorflow-style device string.
+
+    Args:
+        device_option (caffe2_pb2.DeviceOption): DeviceOption protobuf,
+            associated to an operator. This function reads its optional
+            device_type field and, for CUDA devices, its device_id.
+
+    Returns:
+        "" when device_type is not set, "/cpu:*" for CPU and MKLDNN devices,
+            and "/gpu:<device_id>" for CUDA devices.
+
+    Raises:
+        Exception: For any other device type.
+    '''
+    if device_option.HasField("device_type"):
+        kind = device_option.device_type
+        if kind == caffe2_pb2.CPU or kind == caffe2_pb2.MKLDNN:
+            return "/cpu:*"
+        if kind == caffe2_pb2.CUDA:
+            return "/gpu:{}".format(device_option.device_id)
+        raise Exception("Unhandled device", device_option)
+    return ""
+
+
+def _add_tf_shape(attr_dict, ints):
+    '''
+    Build a TensorShapeProto with one dimension per entry of 'ints' and
+    append it to the '_output_shapes' attribute.
+
+    Args:
+        attr_dict: Dictionary to update (usually attributes of a Node)
+        ints: List of integers representing dimensions of some object.
+
+    Returns:
+        None. Modifies attr_dict in-place.
+    '''
+    dims = []
+    for extent in ints:
+        dim = TensorShapeProto.Dim()
+        dim.size = extent
+        dims.append(dim)
+    shape_proto = TensorShapeProto()
+    shape_proto.dim.extend(dims)
+    attr_dict['_output_shapes'].list.shape.extend([shape_proto])
+
+
+def _set_tf_attr(attr_dict, arg):
+    '''
+    Copy one operator argument into a node's attributes under arg.name.
+    The first populated value wins, checked in this order: a 'shape' list of
+    ints, scalar float, scalar int, scalar string, list of floats, list of
+    ints, list of strings; otherwise an empty list is recorded.
+
+    Args:
+        attr_dict: Dictionary to update (usually attributes of a Node)
+        arg: Object with name and data fields.
+
+    Returns:
+        None. Modifies attr_dict in-place.
+    '''
+    key = arg.name
+    if key == 'shape' and arg.ints:
+        _add_tf_shape(attr_dict, arg.ints)
+        return
+
+    def as_bytes(value):
+        # Bytes pass through; anything else is stringified and UTF-8 encoded.
+        return value if isinstance(value, bytes) else str(value).encode('utf-8')
+
+    if arg.HasField("f"):
+        # Float
+        attr_dict[key].f = arg.f
+    elif arg.HasField("i"):
+        # Integer
+        attr_dict[key].i = arg.i
+    elif arg.HasField("s"):
+        # String
+        attr_dict[key].s = as_bytes(arg.s)
+    elif arg.floats:
+        attr_dict[key].list.f.extend(arg.floats)
+    elif arg.ints:
+        attr_dict[key].list.i.extend(arg.ints)
+    elif arg.strings:
+        attr_dict[key].list.s.extend(as_bytes(s) for s in arg.strings)
+    else:
+        # The value is an empty list.
+        attr_dict[key].list.s.extend([])
+
+
+def _operator_to_node(shapes, op):
+    '''
+    Build the TF graph node that represents a (named) Caffe2 operator.
+
+    Args:
+        shapes: Dictionary mapping blob names to their shapes/dimensions.
+        op: The Caffe2 operator to convert to a TF graph node.
+
+    Returns:
+        node: The TF graph node created from op.
+    '''
+    assert op.name, op
+    node = NodeDef()
+    node.name = op.name
+    node.input.extend(op.input)
+    node.op = op.type
+    node.device = _tf_device(op.device_option)
+    # Add shapes in order, stopping at the first output without a known
+    # shape; nothing is added when 'shapes' is empty.
+    for output in (op.output if shapes else ()):
+        if output not in shapes:
+            break
+        _add_tf_shape(node.attr, shapes[output])
+    for arg in op.arg:
+        _set_tf_attr(node.attr, arg)
+    return node
+
+
+def _operator_to_node_simp(op, inter_blobs, seen):
+    '''
+    Convert one operator to the nodes of the simplified graph.
+    An operator with a single non-intermediate output becomes one node named
+    after that output. With several such outputs, each output becomes a
+    'Blob' node fed by an extra node carrying the operator itself. With none,
+    no node is produced.
+
+    Args:
+        op: Caffe2 operator to convert to node
+        inter_blobs: Set of intermediate blobs
+        seen: Names that have already been used and are not unique
+
+    Returns:
+        nodes: Nodes representing 'op' and the outputs of 'op'
+    '''
+    assert op
+    outputs = [o for o in op.output if o not in inter_blobs]
+    seen.update(outputs)
+
+    def make_op_node(node_name, device):
+        # Node carrying the operator's type, inputs, device and arguments.
+        node = NodeDef()
+        node.name = node_name
+        node.input.extend(op.input)
+        node.op = op.type
+        node.device = device
+        for arg in op.arg:
+            _set_tf_attr(node.attr, arg)
+        return node
+
+    if not outputs:
+        return []
+
+    if len(outputs) == 1:
+        # Here we are sure the name is unique.
+        return [make_op_node(outputs[0], _tf_device(op.device_option))]
+
+    # Create a name that is likely unique
+    if op.name:
+        base = op.name
+    else:
+        base = os.path.join(os.path.commonprefix(list(outputs)), op.type)
+    assert(base)
+    op.name = _make_unique_name(seen, base)
+    device = _tf_device(op.device_option)
+
+    # Create additional output nodes
+    nodes = []
+    for output in outputs:
+        blob_node = NodeDef()
+        blob_node.name = output
+        blob_node.input.extend([op.name])
+        blob_node.op = 'Blob'
+        blob_node.device = device
+        nodes.append(blob_node)
+
+    # Node for the current op
+    nodes.append(make_op_node(op.name, device))
+    return nodes
+
+
+def _blob_to_node(producing_ops, shapes, name):
+    '''
+    Build the TF graph node for a blob (operator input or output).
+
+    Args:
+        producing_ops: Dictionary of blob name to list of
+            (producing_op, blob_index within producing_op.output) mapping.
+        shapes: Dictionary mapping blob names to their shapes/dimensions.
+        name: String representing the name of this blob.
+
+    Returns:
+        node: The TF graph node created from this blob.
+    '''
+    assert name
+    node = NodeDef()
+    node.name = name
+    # All ops that list this blob among their outputs. See
+    # _operators_to_graph_def.
+    producers = producing_ops.get(name, [])
+    # A blob nobody produces is a TF Placeholder where a value is passed in.
+    node.op = 'Blob' if len(producers) > 0 else 'Placeholder'
+    node.input.extend('%s:%d' % (p_op.name, i) for p_op, i in producers)
+    if producers:
+        # The device is only set when every producer agrees on it.
+        first_device = producers[0][0].device_option
+        same_device = all(
+            producer[0].device_option == first_device for producer in producers)
+        if same_device:
+            node.device = _tf_device(first_device)
+    if shapes and name in shapes:
+        _add_tf_shape(node.attr, shapes[name])
+    return node
+
+
+def _clear_debug_info(ops, perform_clear):
+    '''
+    Strip the (copious) 'debug_info' field from operators when asked to.
+
+    Args:
+        ops: List of Caffe2 operators
+        perform_clear: Boolean passed from _operators_to_graph_def specifying
+            whether to remove the debug information. Taking the flag here
+            keeps the branching out of _operators_to_graph_def.
+
+    Returns:
+        None. When perform_clear is true, clears the 'debug_info' field of
+        every operator that has it, in-place.
+
+    '''
+    if perform_clear:
+        for op in ops:
+            if op.HasField('debug_info'):
+                op.ClearField('debug_info')
+
+
+def _check_if_forward(blob):
+    '''
+    Blobs with names containing '__m' or 'grad' are part of the backward pass.
+        This function references facebookresearch/Detectron/detectron/utils/net.py.
+
+    Args:
+        blob: The blob name (string) to inspect
+
+    Returns:
+        Boolean representing whether this blob is part of the forward pass,
+            i.e. its name contains neither '__m' nor 'grad'.
+    '''
+    # Both markers must be absent. Combining the two checks with 'or' would
+    # treat a blob as forward whenever just one marker is missing, so
+    # gradient blobs such as 'conv1_w_grad' would never be filtered out.
+    return '__m' not in blob and 'grad' not in blob
+
+
+def _check_if_cpu(blob):
+    '''
+    Treat a blob as a cpu blob unless its name starts with '_gpu'.
+
+    Args:
+        blob: The blob name (string) to inspect
+
+    Returns:
+        Boolean: False when the name starts with '_gpu', True otherwise.
+    '''
+    is_gpu_blob = blob.startswith('_gpu')
+    return not is_gpu_blob
+
+
+def _compute_in_out(ops):
+    '''
+    Split the blobs of a set of operators into inputs, intermediates and
+    outputs. Inputs are consumed but never produced. Blobs that are produced
+    but never consumed are intermediates when their name starts with '_',
+    and outputs otherwise.
+
+    Args:
+        ops: List of Caffe2 operators to look through
+
+    Returns:
+        input_blobs: List of the input blobs of the set of operators
+        inter_blobs: Set of the intermediate blobs of the set of operators
+        output_blobs: List of the output blobs of the set of operators
+    '''
+    consumed = set()
+    produced = set()
+    for op in ops:
+        consumed.update(op.input)
+        produced.update(op.output)
+
+    input_blobs = list(consumed - produced)
+    terminal = list(produced - consumed)
+    inter_blobs = set(b for b in terminal if b.startswith('_'))
+    output_blobs = [b for b in terminal if b not in inter_blobs]
+
+    return input_blobs, inter_blobs, output_blobs
+
+
+def _filter_ops(ops, filter_fn, perform_filter):
+    '''
+    Drop the blobs rejected by 'filter_fn', and the operators left without
+    any output.
+
+    Args:
+        ops: List of Caffe2 operators to filter
+        filter_fn: Criteria function; a blob name is kept when it returns a
+            true value.
+        perform_filter: Boolean passed from _operators_to_graph_def specifying
+            whether to filter operators
+
+    Returns:
+        'ops' itself when perform_filter is false. Otherwise a new list with
+            the operators that still have outputs, their inputs and outputs
+            reduced in-place to the kept names. Operators that are dropped
+            are left with empty inputs and outputs.
+    '''
+    if not perform_filter:
+        return ops
+
+    kept_ops = []
+    for op in ops:
+        old_inputs, old_outputs = list(op.input), list(op.output)
+        del op.input[:]
+        del op.output[:]
+        kept_inputs = list(filter(filter_fn, old_inputs))
+        kept_outputs = list(filter(filter_fn, old_outputs))
+
+        # Only add the op if output is not empty
+        if not kept_outputs:
+            continue
+        op.input.extend(kept_inputs)
+        op.output.extend(kept_outputs)
+        kept_ops.append(op)
+
+    return kept_ops
+
+
+def _operators_to_graph_def(
+    shapes,
+    ops,
+    colon_replacement='$',
+    with_ssa=True,
+    with_gradient_scope=True,
+    blob_name_tracker=None,
+    show_simplified=False,
+    custom_rename=None
+):
+    '''
+    Main function to convert set of operators to a graph.
+
+    Args:
+        shapes: Dictionary mapping blob names to their shapes/dimensions.
+        ops: List of Caffe2 operators, representing some computation graph
+        ### **kwargs (model_to_graph_def, nets_to_graph_def, protos_to_graph_def) ###
+        colon_replacement: Symbol to replace ':' with. ':i' in TF has a special
+            meaning, so we need to replace it with a non-conflicting symbol.
+            A false value skips the replacement.
+        with_ssa: Boolean; whether to rename blobs so each is produced once
+            (see _convert_to_ssa).
+        with_gradient_scope: Boolean; whether to put names containing '_grad'
+            under a 'GRADIENTS/' scope (see _add_gradient_scope).
+        blob_name_tracker: Dictionary tracking names of blobs (inputs/outputs
+            from operators). If given, it is cleared and refilled in-place.
+        show_simplified: Whether to show a simplified version of the model graph
+            Sets all of the following values:
+                clear_debug_info: Boolean representing whether to silence debug
+                    info (which can be very verbose)
+                show_forward_only: Boolean representing whether to only show
+                    blobs involved in the forward pass
+                show_cpu_only: Boolean representing whether to only show blobs
+                    that are not associated with a gpu
+                use_tensorflow_naming: Boolean representing whether to convert
+                    some common Caffe2 naming conventions to their Tensorflow
+                    counterparts
+        custom_rename: Function string -> string that defines a custom
+            renaming function to use.
+
+    Returns:
+        current_graph: GraphDef representing the computation graph formed by the
+            set of operators.
+    '''
+    # Reuse the caller's tracker object (emptied first) or start a fresh one.
+    if blob_name_tracker is not None:
+        blob_name_tracker.clear()
+    else:
+        blob_name_tracker = {}
+
+    blob_name_tracker.update(_get_blob_names(ops))
+
+    # The simplification steps below are no-ops unless show_simplified is set.
+    _clear_debug_info(ops, show_simplified)  # clear_debug_info
+    ops = _filter_ops(ops, _check_if_forward,
+                      show_simplified)  # show_forward_only
+    ops = _filter_ops(ops, _check_if_cpu, show_simplified)  # show_cpu_only
+    # Renaming passes, applied in this fixed order.
+    if custom_rename:
+        _rename_all(shapes, blob_name_tracker, ops, custom_rename)
+    if colon_replacement:
+        _replace_colons(shapes, blob_name_tracker, ops, colon_replacement)
+    if with_ssa:
+        _convert_to_ssa(shapes, blob_name_tracker, ops)
+    if with_gradient_scope:
+        _add_gradient_scope(shapes, blob_name_tracker, ops)
+    _fill_missing_operator_names(ops)
+    if show_simplified:  # use_tensorflow_naming
+        _rename_tensorflow_style(shapes, blob_name_tracker, ops)
+    # Blob name -> list of (op, index of the blob within op.output).
+    producing_ops = {}
+    # Every blob mention, in encounter order.
+    # NOTE(review): this is a list, so a blob mentioned by several ops is
+    # appended (and later emitted as a node) more than once -- confirm that
+    # this is intended.
+    blobs = []
+    input_blobs, inter_blobs, _ = _compute_in_out(ops)
+    current_graph = GraphDef()
+    seen = set(input_blobs)
+    for op in ops:
+        nodes_from_op = _operator_to_node_simp(op, inter_blobs, seen) if \
+            show_simplified else \
+            [_operator_to_node(shapes, op)]  # .extend() expects an iterable
+        current_graph.node.extend(nodes_from_op)
+        for input_blob in op.input:
+            blobs.append(input_blob)
+        for i, output_blob in enumerate(op.output):
+            blobs.append(output_blob)
+            producing_ops.setdefault(output_blob, []).append((op, i))
+
+    if show_simplified:
+        # Show a cleaner, easier-to-interpret version of the model graph
+        blobs = input_blobs
+
+    # NOTE(review): an empty dict is passed instead of 'shapes', so blob nodes
+    # never get shape attributes here -- confirm this is deliberate.
+    for blob in blobs:
+        current_graph.node.extend([_blob_to_node(producing_ops, {}, blob)])
+
+    return current_graph
+
+
+def _propagate_device_option(net_def):
+    '''
+    Give every operator without its own device option the one of its net.
+
+    Args:
+        net_def: A caffe2_pb2.NetDef representing a computation graph. The graph
+            consists of Caffe2 operators.
+
+    Returns:
+        None. Does nothing when the net has no device_option. Otherwise each
+            op that lacks a device_option gets a copy of the net's, in-place;
+            ops that already have one are left as-is.
+    '''
+    if not net_def.HasField("device_option"):
+        return
+    net_device = net_def.device_option
+    for op in net_def.op:
+        if op.HasField("device_option"):
+            continue
+        op.device_option.CopyFrom(net_device)
+
+
+def _try_get_shapes(nets):
+    '''
+    Best-effort shape inference for all blobs contained in the nets.
+
+    Args:
+        nets: List of core.Net to extract blob shape information from.
+
+    Returns:
+        The shapes returned by workspace.InferShapesAndTypes(nets) (first
+            element of its result), or an empty dictionary if that call
+            raises; the failure is logged as a warning.
+    '''
+    try:
+        # Note: this will inspect the workspace for better or worse.
+        # We don't care about the types, only the shapes
+        shapes, _types = workspace.InferShapesAndTypes(nets)
+    except Exception as e:
+        logging.warning('Failed to compute shapes: %s', e)
+        return {}
+    else:
+        return shapes
+
+
+def model_to_graph_def(model, **kwargs):
+    '''
+    Convert a Caffe2 model to a Tensorflow graph. The model's
+    'param_init_net' and 'net' are handed, in that order, to
+    nets_to_graph_def() for further processing.
+
+    Args:
+        model (cnn.CNNModelHelper, model_helper.ModelHelper): The model to
+            extract the nets (instances of core.Net) from.
+
+    Returns:
+        Result of nets_to_graph_def() for the two extracted nets and
+            **kwargs. See _operators_to_graph_def for detailed **kwargs.
+    '''
+    return nets_to_graph_def([model.param_init_net, model.net], **kwargs)
+
+
+def nets_to_graph_def(nets, shapes=None, **kwargs):
+    '''
+    Convert a set of Caffe2 nets to a Tensorflow graph.
+
+    Args:
+        nets: List of core.Nets. core.Net is a wrapper around a NetDef protobuf.
+            The corresponding protobuf can be extracted using .Proto().
+        shapes: Dictionary mapping blob names to their shapes/dimensions.
+            Currently ignored: an empty dictionary is always used (see the
+            comment in the body).
+
+    Returns:
+        Result of protos_to_graph_def() for deep copies of the extracted
+            NetDef protobufs and **kwargs. See _operators_to_graph_def for
+            detailed **kwargs.
+    '''
+    # Shapes would normally default to _try_get_shapes(nets), but that
+    # depends on workspace.InferShapesAndTypes(nets), which is currently
+    # broken (segfault). We omit the shapes for now.
+    net_protos = []
+    for net in nets:
+        # Work on copies so the caller's protobufs are not modified.
+        net_protos.append(copy.deepcopy(net.Proto()))
+    return protos_to_graph_def(net_protos, {}, **kwargs)
+
+
+def protos_to_graph_def(net_defs, shapes=None, **kwargs):
+    '''
+    Convert a set of Caffe2 net definitions to a Tensorflow graph.
+
+    Args:
+        net_defs: List of caffe2_pb2.NetDef protobufs representing computation
+            graphs. Their operators may be modified in-place.
+        shapes: Dictionary mapping blob names to their shapes/dimensions. It is
+            deep-copied; a missing or empty value becomes an empty dictionary.
+
+    Returns:
+        Result of _operators_to_graph_def() for the operators of all NetDefs,
+            in order, and **kwargs. See _operators_to_graph_def for detailed
+            **kwargs.
+    '''
+    for net_def in net_defs:
+        _propagate_device_option(net_def)
+    shapes_copy = copy.deepcopy(shapes or {})
+    all_ops = []
+    for net_def in net_defs:
+        all_ops.extend(net_def.op)
+    return _operators_to_graph_def(shapes_copy, all_ops, **kwargs)
diff --git a/tensorboardX/tensorboardX/crc32c.py b/tensorboardX/tensorboardX/crc32c.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4179131dff40f4a05a620b3a18ae63317234276
--- /dev/null
+++ b/tensorboardX/tensorboardX/crc32c.py
@@ -0,0 +1,134 @@
+# CRC-32C checksum; see RFC 3309: https://www.ietf.org/rfc/rfc3309.txt
+import array
+import os
+
+# Use the `crc32c` package when it can be imported; otherwise
+# `_crc32c_native` is None and the pure-Python implementation below is used.
+try:
+    # Provide a default for CRC32C_SW_MODE only when the user has not set it.
+    # This is done before the import (and stays set even if the import fails).
+    # NOTE(review): presumably 'auto' lets the package choose between its
+    # hardware and software implementations -- confirm against its docs.
+    if os.environ.get('CRC32C_SW_MODE', None) is None:
+        os.environ['CRC32C_SW_MODE'] = 'auto'
+    from crc32c import crc32 as _crc32c_native
+except ImportError:
+    # Package not available.
+    _crc32c_native = None
+
+
+CRC_TABLE = (
+    0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4,
+    0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb,
+    0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b,
+    0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24,
+    0x105ec76f, 0xe235446c, 0xf165b798, 0x030e349b,
+    0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384,
+    0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54,
+    0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b,
+    0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a,
+    0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35,
+    0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5,
+    0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa,
+    0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45,
+    0xf779deae, 0x05125dad, 0x1642ae59, 0xe4292d5a,
+    0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a,
+    0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595,
+    0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48,
+    0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957,
+    0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687,
+    0x0c38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198,
+    0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927,
+    0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38,
+    0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8,
+    0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7,
+    0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096,
+    0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789,
+    0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859,
+    0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46,
+    0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9,
+    0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6,
+    0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36,
+    0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829,
+    0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c,
+    0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93,
+    0x082f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043,
+    0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c,
+    0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3,
+    0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc,
+    0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c,
+    0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033,
+    0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652,
+    0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d,
+    0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d,
+    0xef087a76, 0x1d63f975, 0x0e330a81, 0xfc588982,
+    0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d,
+    0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622,
+    0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2,
+    0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed,
+    0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530,
+    0x0417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f,
+    0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff,
+    0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0,
+    0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f,
+    0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540,
+    0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90,
+    0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f,
+    0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee,
+    0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1,
+    0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321,
+    0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e,
+    0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81,
+    0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e,
+    0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e,
+    0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351,
+)
+
+# Starting value passed to crc_update() when a new checksum is computed.
+CRC_INIT = 0
+
+# 32-bit all-ones mask: used to invert the CRC and to keep values in 32 bits.
+_MASK = 0xFFFFFFFF
+
+
+def crc_update(crc, data):
+    """Update CRC-32C checksum with data.
+
+    Args:
+      crc: 32-bit checksum to update as long.
+      data: byte array, string or iterable over bytes.
+
+    Returns:
+      32-bit updated CRC-32C as long.
+    """
+
+    if type(data) != array.array or data.itemsize != 1:
+        buf = array.array("B", data)
+    else:
+        buf = data
+
+    crc ^= _MASK
+    for b in buf:
+        table_index = (crc ^ b) & 0xff
+        crc = (CRC_TABLE[table_index] ^ (crc >> 8)) & _MASK
+    return crc ^ _MASK
+
+
+def crc_finalize(crc):
+    """Finalize CRC-32C checksum.
+
+    This function should be called as last step of crc calculation.
+
+    Args:
+      crc: 32-bit checksum as long.
+
+    Returns:
+      finalized 32-bit checksum as long
+    """
+    return crc & _MASK
+
+
+def _crc32c(data):
+    """Compute CRC-32C checksum of the data.
+
+    Args:
+      data: byte array, string or iterable over bytes.
+
+    Returns:
+      32-bit CRC-32C checksum of data as long.
+    """
+    return crc_finalize(crc_update(CRC_INIT, data))
+
+
+crc32c = _crc32c if _crc32c_native is None else _crc32c_native
diff --git a/tensorboardX/tensorboardX/embedding.py b/tensorboardX/tensorboardX/embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..7fec4c2f65944499acc3d54818c99412895d9b8e
--- /dev/null
+++ b/tensorboardX/tensorboardX/embedding.py
@@ -0,0 +1,77 @@
+import os
+
+
+def make_tsv(metadata, save_path, metadata_header=None):
+    if not metadata_header:
+        metadata = [str(x) for x in metadata]
+    else:
+        assert len(metadata_header) == len(metadata[0]), \
+            'len of header must be equal to the number of columns in metadata'
+        metadata = ['\t'.join(str(e) for e in l)
+                    for l in [metadata_header] + metadata]
+    import sys
+    if sys.version_info[0] == 3:
+        with open(os.path.join(save_path, 'metadata.tsv'), 'w', encoding='utf8') as f:
+            for x in metadata:
+                f.write(x + '\n')
+    else:
+        with open(os.path.join(save_path, 'metadata.tsv'), 'wb') as f:
+            for x in metadata:
+                f.write((x + '\n').encode('utf-8'))
+
+
+# https://github.com/tensorflow/tensorboard/issues/44 image label will be squared
+def make_sprite(label_img, save_path):
+    import math
+    import numpy as np
+    from .x2num import make_np
+    from .utils import make_grid
+    from PIL import Image
+    # this ensures the sprite image has correct dimension as described in
+    # https://www.tensorflow.org/get_started/embedding_viz
+    # There are some constraints for the sprite image:
+    # 1. The sprite image should be square.
+    # 2. Each image patch in the sprite image should be square.
+    # 2. The content is row major order, so we can padding the image on the
+    #    bottom, but not on the right, otherwise, TB will treat some padded location
+    #    as images to be shown.
+    # args: label_img: tensor in NCHW
+
+    assert label_img.shape[2] == label_img.shape[3], 'Image should be square, see tensorflow/tensorboard#670'
+    total_pixels = label_img.shape[0] * label_img.shape[2] * label_img.shape[3]
+    pixels_one_side = total_pixels ** 0.5
+    number_of_images_per_row = int(math.ceil(pixels_one_side / label_img.shape[3]))
+    arranged_img_CHW = make_grid(make_np(label_img), ncols=number_of_images_per_row)
+    arranged_img_HWC = arranged_img_CHW.transpose(1, 2, 0)  # chw -> hwc
+
+    arranged_augment_square_HWC = np.ndarray((arranged_img_CHW.shape[2], arranged_img_CHW.shape[2], 3))
+    arranged_augment_square_HWC[:arranged_img_HWC.shape[0], :, :] = arranged_img_HWC
+    im = Image.fromarray(np.uint8((arranged_augment_square_HWC * 255).clip(0, 255)))
+    im.save(os.path.join(save_path, 'sprite.png'))
+
+
+def append_pbtxt(metadata, label_img, save_path, subdir, global_step, tag):
+    from posixpath import join
+    with open(os.path.join(save_path, 'projector_config.pbtxt'), 'a') as f:
+        # step = os.path.split(save_path)[-1]
+        f.write('embeddings {\n')
+        f.write('tensor_name: "{}:{}"\n'.format(
+            tag, str(global_step).zfill(5)))
+        f.write('tensor_path: "{}"\n'.format(join(subdir, 'tensors.tsv')))
+        if metadata is not None:
+            f.write('metadata_path: "{}"\n'.format(
+                join(subdir, 'metadata.tsv')))
+        if label_img is not None:
+            f.write('sprite {\n')
+            f.write('image_path: "{}"\n'.format(join(subdir, 'sprite.png')))
+            f.write('single_image_dim: {}\n'.format(label_img.shape[3]))
+            f.write('single_image_dim: {}\n'.format(label_img.shape[2]))
+            f.write('}\n')
+        f.write('}\n')
+
+
+def make_mat(matlist, save_path):
+    with open(os.path.join(save_path, 'tensors.tsv'), 'w') as f:
+        for x in matlist:
+            x = [str(i.item()) for i in x]
+            f.write('\t'.join(x) + '\n')
diff --git a/tensorboardX/tensorboardX/event_file_writer.py b/tensorboardX/tensorboardX/event_file_writer.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f3d0ced8efacabacfb150a2a9325dbb84e3dac8
--- /dev/null
+++ b/tensorboardX/tensorboardX/event_file_writer.py
@@ -0,0 +1,223 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Writes events to disk in a logdir."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import socket
+import threading
+import time
+
+import six
+
+from .proto import event_pb2
+from .record_writer import RecordWriter, directory_check
+
+
+class EventsWriter(object):
+    '''Writes `Event` protocol buffers to an event file.'''
+
+    def __init__(self, file_prefix, filename_suffix=''):
+        '''
+        Events files have a name of the form
+        '/some/file/path/events.out.tfevents.[timestamp].[hostname]'
+        '''
+        self._file_name = file_prefix + ".out.tfevents." + str(time.time())[:10] + "." +\
+            socket.gethostname() + filename_suffix
+        self._num_outstanding_events = 0
+        self._py_recordio_writer = RecordWriter(self._file_name)
+        # Initialize an event instance.
+        self._event = event_pb2.Event()
+        self._event.wall_time = time.time()
+        self._event.file_version = 'brain.Event:2'
+        self._lock = threading.Lock()
+        self.write_event(self._event)
+
+    def write_event(self, event):
+        '''Append "event" to the file.'''
+
+        # Check if event is of type event_pb2.Event proto.
+        if not isinstance(event, event_pb2.Event):
+            raise TypeError("Expected an event_pb2.Event proto, "
+                            " but got %s" % type(event))
+        return self._write_serialized_event(event.SerializeToString())
+
+    def _write_serialized_event(self, event_str):
+        with self._lock:
+            self._num_outstanding_events += 1
+            self._py_recordio_writer.write(event_str)
+
+    def flush(self):
+        '''Flushes the event file to disk.'''
+        with self._lock:
+            self._num_outstanding_events = 0
+            self._py_recordio_writer.flush()
+        return True
+
+    def close(self):
+        '''Call self.flush().'''
+        return_value = self.flush()
+        with self._lock:
+            self._py_recordio_writer.close()
+        return return_value
+
+
+class EventFileWriter(object):
+    """Writes `Event` protocol buffers to an event file.
+
+    The `EventFileWriter` class creates an event file in the specified directory,
+    and asynchronously writes Event protocol buffers to the file. The Event file
+    is encoded using the tfrecord format, which is similar to RecordIO.
+    """
+
+    def __init__(self, logdir, max_queue_size=10, flush_secs=120, filename_suffix=''):
+        """Creates a `EventFileWriter` and an event file to write to.
+
+        On construction the summary writer creates a new event file in `logdir`.
+        This event file will contain `Event` protocol buffers, which are written to
+        disk via the add_event method.
+        The other arguments to the constructor control the asynchronous writes to
+        the event file:
+
+        Args:
+          logdir: A string. Directory where event file will be written.
+          max_queue_size: Integer. Size of the queue for pending events and summaries.
+          flush_secs: Number. How often, in seconds, to flush the
+            pending events and summaries to disk.
+        """
+        self._logdir = logdir
+        directory_check(self._logdir)
+        self._event_queue = six.moves.queue.Queue(max_queue_size)
+        self._ev_writer = EventsWriter(os.path.join(
+            self._logdir, "events"), filename_suffix)
+        self._flush_secs = flush_secs
+        self._closed = False
+        self._worker = _EventLoggerThread(self._event_queue, self._ev_writer,
+                                          flush_secs)
+
+        self._worker.start()
+
+    def get_logdir(self):
+        """Returns the directory where event file will be written."""
+        return self._logdir
+
+    def reopen(self):
+        """Reopens the EventFileWriter.
+        Can be called after `close()` to add more events in the same directory.
+        The events will go into a new events file and a new write/flush worker
+        is created. Does nothing if the EventFileWriter was not closed.
+        """
+        if self._closed:
+            self._closed = False
+            self._worker = _EventLoggerThread(
+                self._event_queue, self._ev_writer, self._flush_secs
+            )
+            self._worker.start()
+
+    def add_event(self, event):
+        """Adds an event to the event file.
+
+        Args:
+          event: An `Event` protocol buffer.
+        """
+        if not self._closed:
+            self._event_queue.put(event)
+
+    def flush(self):
+        """Flushes the event file to disk.
+
+        Call this method to make sure that all pending events have been written to
+        disk.
+        """
+        if not self._closed:
+            self._event_queue.join()
+            self._ev_writer.flush()
+
+    def close(self):
+        """Performs a final flush of the event file to disk, stops the
+        write/flush worker and closes the file. Call this method when you do not
+        need the summary writer anymore.
+        """
+        if not self._closed:
+            self.flush()
+            self._worker.stop()
+            self._ev_writer.close()
+            self._closed = True
+
+
+class _EventLoggerThread(threading.Thread):
+    """Thread that logs events."""
+
+    def __init__(self, queue, record_writer, flush_secs):
+        """Creates an _EventLoggerThread.
+        Args:
+          queue: A Queue from which to dequeue data.
+          record_writer: An data writer. Used to log brain events for
+           the visualizer.
+          flush_secs: How often, in seconds, to flush the
+            pending file to disk.
+        """
+        threading.Thread.__init__(self)
+        self.daemon = True
+        self._queue = queue
+        self._record_writer = record_writer
+        self._flush_secs = flush_secs
+        # The first data will be flushed immediately.
+        self._next_flush_time = 0
+        self._has_pending_data = False
+        self._shutdown_signal = object()
+
+    def stop(self):
+        self._queue.put(self._shutdown_signal)
+        self.join()
+
+    def run(self):
+        # Here wait on the queue until an data appears, or till the next
+        # time to flush the writer, whichever is earlier. If we have an
+        # data, write it. If not, an empty queue exception will be raised
+        # and we can proceed to flush the writer.
+        while True:
+            now = time.time()
+            queue_wait_duration = self._next_flush_time - now
+            data = None
+            try:
+                if queue_wait_duration > 0:
+                    data = self._queue.get(True, queue_wait_duration)
+                else:
+                    data = self._queue.get(False)
+
+                if data == self._shutdown_signal:
+                    return
+                self._record_writer.write_event(data)
+                self._has_pending_data = True
+            except six.moves.queue.Empty:
+                pass
+            finally:
+                if data:
+                    self._queue.task_done()
+
+            now = time.time()
+            if now > self._next_flush_time:
+                if self._has_pending_data:
+                    # Small optimization - if there are no pending data,
+                    # there's no need to flush, since each flush can be
+                    # expensive (e.g. uploading a new file to a server).
+                    self._record_writer.flush()
+                    self._has_pending_data = False
+                # Do it again in flush_secs.
+                self._next_flush_time = now + self._flush_secs
diff --git a/tensorboardX/tensorboardX/onnx_graph.py b/tensorboardX/tensorboardX/onnx_graph.py
new file mode 100644
index 0000000000000000000000000000000000000000..90eb031c23d65e456261253c13e929dfc7ce8970
--- /dev/null
+++ b/tensorboardX/tensorboardX/onnx_graph.py
@@ -0,0 +1,54 @@
+from .proto.graph_pb2 import GraphDef
+from .proto.node_def_pb2 import NodeDef
+from .proto.versions_pb2 import VersionDef
+from .proto.attr_value_pb2 import AttrValue
+from .proto.tensor_shape_pb2 import TensorShapeProto
+
+
+def load_onnx_graph(fname):
+    import onnx
+    m = onnx.load(fname)
+    g = m.graph
+    return parse(g)
+
+
+def parse(graph):
+    nodes_proto = []
+    nodes = []
+    import itertools
+    for node in itertools.chain(graph.input, graph.output):
+        nodes_proto.append(node)
+
+    for node in nodes_proto:
+        print(node.name)
+        shapeproto = TensorShapeProto(
+            dim=[TensorShapeProto.Dim(size=d.dim_value) for d in node.type.tensor_type.shape.dim])
+        nodes.append(NodeDef(
+            name=node.name.encode(encoding='utf_8'),
+            op='Variable',
+            input=[],
+            attr={
+                'dtype': AttrValue(type=node.type.tensor_type.elem_type),
+                'shape': AttrValue(shape=shapeproto),
+            })
+        )
+
+    for node in graph.node:
+        attr = []
+        for s in node.attribute:
+            attr.append(' = '.join([str(f[1]) for f in s.ListFields()]))
+        attr = ', '.join(attr).encode(encoding='utf_8')
+        print(node.output[0])
+        nodes.append(NodeDef(
+            name=node.output[0].encode(encoding='utf_8'),
+            op=node.op_type,
+            input=node.input,
+            attr={'parameters': AttrValue(s=attr)},
+        ))
+
+    # two pass token replacement, appends opname to object id
+    mapping = {}
+    for node in nodes:
+        mapping[node.name] = node.op + '_' + node.name
+
+    return GraphDef(node=nodes, versions=VersionDef(producer=22))
diff --git a/tensorboardX/tensorboardX/proto/__init__.py b/tensorboardX/tensorboardX/proto/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tensorboardX/tensorboardX/proto/api.proto b/tensorboardX/tensorboardX/proto/api.proto
new file mode 100644
index 0000000000000000000000000000000000000000..1e2f3f9e89d2be5e634e6b65744cd7b9e1f52d17
--- /dev/null
+++ b/tensorboardX/tensorboardX/proto/api.proto
@@ -0,0 +1,400 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Defines a proto3-based REST API that the HParams web-component of the plugin
+// would use to read data from a hyperparameter-tuning experiment.
+// This file defines the message types (resources) used
+// to pass information into and out of the API methods. These messages will be
+// transmitted using proto3 native JSON encoding. See http_api.md for a
+// description of the actual HTTP API.
+
+// General note: in what follows we use the field 'name' of a message to
+// store its id. We avoid calling this field 'id' since it is a built-in name
+// in Python, as well as to be more compliant with the API style guide
+// detailed in https://cloud.google.com/apis/design/.
+
+// IMPORTANT: If you change any of the messages here, make sure to also update
+// api.d.ts accordingly.
+
+syntax = "proto3";
+
+import "google/protobuf/struct.proto";
+
+package tensorboardX.hparam;
+
+// Represents a single experiment.
+// An experiment consists of multiple "sessions". Typically, in each session
+// a model is trained for a given set of hyperparameter values. In each session
+// a training program may generate one or more series of real numbers--each
+// containing the evaluation of some metric on the model at different training
+// steps.
+//
+// Note that Sessions can consist of multiple Tensorboard "runs", since in
+// a distributed Tensorflow deployment, training can be accomplished using
+// several cooperating processes, each one emitting Summary data to a different
+// log directory or run. For example, in a single session one process could
+// periodically compute the loss on the validation set, and another could
+// compute the loss on the training set.
+// NEXT_TAG: 7
+message Experiment {
+  // -- Experiments are scoped by a global name.
+  // Currently, Tensorboard supports displaying data for a single experiment.
+  string name = 6;
+
+  // A description. May contain markdown.
+  string description = 1;
+
+  // An id for the owning user or group.
+  string user = 2;
+
+  // The time the experiment was created. In seconds since the UNIX epoch.
+  double time_created_secs = 3;
+
+  // Information about each hyperparameter used in the experiment.
+  repeated HParamInfo hparam_infos = 4;
+
+  // Information about each metric used in the experiment.
+  repeated MetricInfo metric_infos = 5;
+}
+
+// Describes a single hyperparameter used in an experiment (see
+// Experiment.hparam_infos).
+// NEXT_TAG: 7
+message HParamInfo {
+  // An id for the hyperparameter.
+  string name = 1;
+
+  // A string used to display the hyperparameter in the UI. If empty, the UI
+  // will display the 'name' field.
+  string display_name = 2;
+
+  // A description. May contain markdown.
+  string description = 3;
+
+  // The data type of this hyperparameter.
+  DataType type = 4;
+
+  // Specifies the set of values this hyperparameter can hold. The UI assumes
+  // every instance of this hyperparameter will hold a value from this set. It
+  // is used by the UI to allow filtering so that only session groups (see
+  // below) whose associated hyperparameter value "passes" the filter are
+  // displayed. If this is not populated, the domain is assumed to be the
+  // entire domain of the type of the hyperparameter.
+  oneof domain {
+    // A discrete set of the values this hyperparameter can hold.
+    google.protobuf.ListValue domain_discrete = 5;
+    // Numeric data type only. The (real) interval from which values of this
+    // hyperparameter are taken.
+    Interval domain_interval = 6;
+  }
+}
+
+// The data type of a hyperparameter (see HParamInfo.type).
+enum DataType {
+  DATA_TYPE_UNSET = 0;
+  DATA_TYPE_STRING = 1;
+  DATA_TYPE_BOOL = 2;
+  DATA_TYPE_FLOAT64 = 3;
+}
+
+// Represents the closed interval [min_value, max_value] of the real line.
+// NEXT_TAG: 3
+message Interval {
+  double min_value = 1;
+  double max_value = 2;
+}
+
+// NEXT_TAG: 3
+message MetricName {
+  // An identifier for a metric. A metric is a real-valued function of the
+  // model. The UI can plot metrics for a session evaluated on the model at
+  // different training steps.
+  //
+  // We identify a metric by a (group, tag) pair of strings. The UI treats
+  // both of these as opaque strings. The only requirement is that the pair
+  // uniquely identifies a metric in the experiment.
+  //
+  // We use a pair so the UI could allow the user to group metrics for a
+  // single session by either group or tag to be displayed in the same chart.
+  // For instance, one can set the metric group to correspond to the dataset
+  // on which the model was evaluated, and the UI can then display different
+  // metrics describing the same underlying computation and using different
+  // datasets, on the same chart.
+  //
+  // When exporting summaries from Tensorflow, in a typical setup, a
+  // training session exports evaluations of metrics at different training steps
+  // as Scalar-plugin summaries--each having a run of the form
+  // "<session_base_log_dir>/<sub_dir>", and some associated tag. The same
+  // metric for different sessions would use the same sub_dir and tag, but
+  // would have a different session_base_log_dir. For example, a session
+  // computing two metrics: model loss on the validation set and model loss on
+  // the training set, can export these as scalar summaries with the tag "loss"
+  // and runs session_base_log_dir/validation and session_base_log_dir/training,
+  // respectively. In this setup, the 'group' field can be populated with
+  // the "sub_dir" associated with the metric, and the 'tag' field can be
+  // populated with the tag: "loss".
+  string group = 1;
+  string tag = 2;
+}
+
+// Describes a single metric used in an experiment (see
+// Experiment.metric_infos).
+// NEXT_TAG: 6
+message MetricInfo {
+  // The (group, tag) pair identifying the metric.
+  MetricName name = 1;
+
+  // A string used to display the metric in the UI. If empty, the UI
+  // will display the 'name' field.
+  string display_name = 3;
+
+  // A description. May contain markdown.
+  string description = 4;
+
+  // The dataset type (validation, training) on which the metric is computed.
+  DatasetType dataset_type = 5;
+}
+
+// The type of dataset on which a metric is computed (see
+// MetricInfo.dataset_type).
+enum DatasetType {
+  DATASET_UNKNOWN = 0;
+  DATASET_TRAINING = 1;
+  DATASET_VALIDATION = 2;
+}
+
+// In some experiments, the user trains a model with the same set of
+// hyperparameters multiple times to get the distribution of metric
+// evaluations, when the computation (such as the training algorithm, or metric
+// evaluation) is non-deterministic. To make the UI aware of this, sessions
+// are partitioned into groups: each group consists of all training sessions
+// which share the same values for the hyperparameters. In experiments with no
+// repeated executions, each group consists of exactly one session.
+// NEXT_TAG: 6
+message SessionGroup {
+  string name = 1;
+
+  // Stores the hyperparameters for sessions within this group as a mapping
+  // from the hyperparameter name to its value.
+  map<string /* hparam name */, google.protobuf.Value> hparams = 2;
+
+  // A list of pairs (metric, value)--one for each metric in the experiment.
+  // The value denotes the evaluation of the corresponding metric on
+  // the model aggregated across the sessions in this group. The exact method
+  // of aggregation is specified in the comments of ListSessionGroupsRequest.
+  // Unfortunately, we can't store these as a map, since proto maps can't have
+  // message keys.
+  repeated MetricValue metric_values = 3;
+
+  // The sessions belonging to this group.
+  repeated Session sessions = 4;
+
+  // An optional link to a web page monitoring the session group.
+  string monitor_url = 5;
+}
+
+// The value of one metric, together with the training step and wall time at
+// which it was computed.
+// NEXT_TAG: 5
+message MetricValue {
+  // The metric this value belongs to.
+  MetricName name = 1;
+
+  // The value of the metric.
+  double value = 2;
+
+  // The training step at which this value is computed.
+  int32 training_step = 3;
+
+  // The wall time in seconds since UNIX epoch at which this value is computed.
+  double wall_time_secs = 4;
+}
+
+// Represents a single session of an experiment.
+// NEXT_TAG: 8
+message Session {
+  // An id for the session. Unique within an experiment (not just the group).
+  string name = 1;
+
+  // In seconds since the UNIX epoch.
+  double start_time_secs = 2;
+
+  // In seconds since the UNIX epoch.
+  // May be 0 if unavailable or the session has not finished yet.
+  double end_time_secs = 3;
+
+  // May be STATUS_UNKNOWN if unavailable.
+  Status status = 4;
+
+  // A URI for a resource that will allow the user to reconstruct the model for
+  // this session. E.g., in Tensorflow this could point to a directory where the
+  // checkpoints are stored. Currently, this is treated opaquely by the UI
+  // and only displayed to the user as it is passed here.
+  string model_uri = 5;
+
+  // Stores each metric evaluation on the model at the current training step.
+  // Unfortunately, we can't store these as a map, since proto maps can't have
+  // message keys.
+  repeated MetricValue metric_values = 6;
+
+  // An optional link to a web page monitoring the session.
+  string monitor_url = 7;
+}
+
+// Represents the status of a Session.
+enum Status {
+  STATUS_UNKNOWN = 0;
+  STATUS_SUCCESS = 1;
+  STATUS_FAILURE = 2;
+  STATUS_RUNNING = 3;
+}
+
+// Parameters for a GetExperiment API call.
+// Each experiment is scoped by a unique global id.
+// NEXT_TAG: 2
+message GetExperimentRequest {
+  // REQUIRED
+  string experiment_name = 1;
+}
+
+// Parameters for a ListSessionGroups API call.
+// Computes a list of the current session groups allowing for filtering and
+// sorting by metrics and hyperparameter values. Returns a "slice" of
+// that list specified by start_index and slice_size.
+// NEXT_TAG: 8
+message ListSessionGroupsRequest {
+  string experiment_name = 6;
+
+  // Filters the set of sessions (from which the session groups are formed) to
+  // contain only these sessions whose status is contained in
+  // 'allowed_statuses'.
+  repeated Status allowed_statuses = 7;
+
+  // A list of ColParams messages--one for each "column" of a session group. A
+  // session group column contains either a metric evaluated at the current
+  // reported computation step or a hyperparameter value. In addition to
+  // 'regular' values, a column may take on a special 'missing-value' which
+  // denotes that the hyperparameter or metric is not available
+  // for the session group (for example, if the metric is not used in the
+  // group).
+  //
+  // The ColParams messages in the repeated field below configure filtering and
+  // sorting of the resulting collection of session groups. See the comments of
+  // the fields in the ColParams message below for more details.
+  repeated ColParams col_params = 1;
+
+  // Fields controlling how to aggregate metrics across sessions within a
+  // session group.
+  // If aggregation_type is AGGREGATION_AVG, each metric value of the
+  // session group is the average of the values of the metric across the
+  // sessions.
+  // Otherwise, the session group metric values are taken directly from a
+  // "representative" session in the group, selected as a session for which
+  // aggregation_metric takes on its minimum, maximum, or median value, as
+  // specified by the choice of aggregation_type (for median, if the number of
+  // sessions in the group is even, a session with a lower "middle" value is
+  // chosen as the representative session).
+  AggregationType aggregation_type = 2;
+
+  // See comment for 'aggregation_type' above.
+  MetricName aggregation_metric = 3;
+
+  // The next two parameters determine the "slice" of the full list of
+  // session groups--sorted and filtered by the parameters above--to return.
+  // The 0-based index of the first session group to return.
+  int32 start_index = 4;
+
+  // The number of session groups to return starting at the session group
+  // indexed by 'start_index'. The actual number of session groups returned
+  // is min{slice_size, total_size - start_index}, where
+  // total_size is the number of session groups in the full list
+  // sorted and filtered by the parameters above (if start_index > total_size
+  // no session groups are returned).
+  int32 slice_size = 5;
+}
+
+// Defines parameters for a ListSessionGroupsRequest for a specific column.
+// See the comment for "ListSessionGroupsRequest" above for more details.
+// NEXT_TAG: 9
+message ColParams {
+  oneof name {
+    MetricName metric = 1;
+    string hparam = 2;
+  }
+
+  // Sorting.
+  // The final order of session groups in the response is defined by the
+  // sub-collection of ColParams messages (out of the
+  // ListSessionGroupsRequest.col_params repeated field) whose 'order' field
+  // (below) is not ORDER_UNSPECIFIED. In each of the messages in this
+  // sub-collection, the next two fields specify the ordering of the values
+  // and missing values in the associated column of the session group. The
+  // order of the ColParams messages themselves within the sub-collection
+  // determines the "significance" of the associated column as a sorting key:
+  // with the first being the primary sorting key, the second being the
+  // secondary sorting key, etc.
+  // Note: The 'session group name' is added as a least significant sorting
+  // key to the keys defined by the user, so the order in the response is always
+  // deterministic.
+  SortOrder order = 3;
+  // This field is ignored if order is ORDER_UNSPECIFIED.
+  // Otherwise, if true, missing values are ordered before every other value in
+  // the column; if false they are ordered after every other value in the
+  // column.
+  bool missing_values_first = 4;
+
+  // Filtering.
+  // The 'filter' oneof specifies a subset of the domain of the values a column
+  // may take. Only session groups with each of their column values belonging
+  // to this subset are included in the response. If this field is not
+  // specified, the subset is taken to be the entire column domain.
+  oneof filter {
+    // Only valid for string-valued hyperparameter columns. The subset is
+    // the set of all strings matching the regular expression stored
+    // in 'filter_regexp' as a partial match (use '^<regexp>$' to have a
+    // full match against the regexp).
+    string filter_regexp = 5;
+
+    // Only valid for numeric-valued columns. The subset is the given interval.
+    Interval filter_interval = 6;
+
+    // Valid for all data types. The subset is defined explicitly.
+    google.protobuf.ListValue filter_discrete = 7;
+  }
+  // Specifies whether to exclude from the response those session groups
+  // whose column value is missing.
+  bool exclude_missing_values = 8;
+}
+
+enum SortOrder {  // Sort direction of a column; used by ColParams.order.
+  ORDER_UNSPECIFIED = 0;  // The column is not used as a sorting key.
+  ORDER_ASC = 1;
+  ORDER_DESC = 2;
+}
+
+enum AggregationType {  // See ListSessionGroupsRequest.aggregation_type.
+  AGGREGATION_UNSET = 0;
+  AGGREGATION_AVG = 1;  // Average of each metric across the sessions.
+  AGGREGATION_MEDIAN = 2;  // Representative session: median (lower middle).
+  AGGREGATION_MIN = 3;  // Representative session: minimum.
+  AGGREGATION_MAX = 4;  // Representative session: maximum.
+}
+
+// See ListSessionGroups in http_api.md.
+// NEXT_TAG: 4
+message ListSessionGroupsResponse {
+  repeated SessionGroup session_groups = 1;
+
+  // Denotes the total number of session groups in the full filtered list.
+  // (Recall that this response may be only a slice.)
+  // It is used by the UI to calculate the total number of pages and can be
+  // set here to -1 to mean "unknown".
+  int32 total_size = 3;
+}
+
+// See ListMetricEvalsRequest in http_api.md.
+// NEXT_TAG: 4
+message ListMetricEvalsRequest {  // Note: tags are not in declaration order.
+  string experiment_name = 3;
+  string session_name = 1;
+  MetricName metric_name = 2;
+}
diff --git a/tensorboardX/tensorboardX/proto/api_pb2.py b/tensorboardX/tensorboardX/proto/api_pb2.py
new file mode 100644
index 0000000000000000000000000000000000000000..f104d39f351cce08912f87cf68eb2ffbb95d8527
--- /dev/null
+++ b/tensorboardX/tensorboardX/proto/api_pb2.py
@@ -0,0 +1,1124 @@
+# Generated by the protocol buffer compiler.  DO NOT EDIT!
+# source: tensorboardX/proto/api.proto
+
+import sys
+_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1'))
+from google.protobuf.internal import enum_type_wrapper
+from google.protobuf import descriptor as _descriptor
+from google.protobuf import message as _message
+from google.protobuf import reflection as _reflection
+from google.protobuf import symbol_database as _symbol_database
+# @@protoc_insertion_point(imports)
+
+_sym_db = _symbol_database.Default()
+
+
+from google.protobuf import struct_pb2 as google_dot_protobuf_dot_struct__pb2
+
+
+DESCRIPTOR = _descriptor.FileDescriptor(
+  name='tensorboardX/proto/api.proto',
+  package='tensorboardX.hparam',
+  syntax='proto3',
+  serialized_options=None,
+  serialized_pb=_b('\n\x1ctensorboardX/proto/api.proto\x12\x13tensorboardX.hparam\x1a\x1cgoogle/protobuf/struct.proto\"\xc6\x01\n\nExperiment\x12\x0c\n\x04name\x18\x06 \x01(\t\x12\x13\n\x0b\x64\x65scription\x18\x01 \x01(\t\x12\x0c\n\x04user\x18\x02 \x01(\t\x12\x19\n\x11time_created_secs\x18\x03 \x01(\x01\x12\x35\n\x0chparam_infos\x18\x04 \x03(\x0b\x32\x1f.tensorboardX.hparam.HParamInfo\x12\x35\n\x0cmetric_infos\x18\x05 \x03(\x0b\x32\x1f.tensorboardX.hparam.MetricInfo\"\xed\x01\n\nHParamInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x14\n\x0c\x64isplay_name\x18\x02 \x01(\t\x12\x13\n\x0b\x64\x65scription\x18\x03 \x01(\t\x12+\n\x04type\x18\x04 \x01(\x0e\x32\x1d.tensorboardX.hparam.DataType\x12\x35\n\x0f\x64omain_discrete\x18\x05 \x01(\x0b\x32\x1a.google.protobuf.ListValueH\x00\x12\x38\n\x0f\x64omain_interval\x18\x06 \x01(\x0b\x32\x1d.tensorboardX.hparam.IntervalH\x00\x42\x08\n\x06\x64omain\"0\n\x08Interval\x12\x11\n\tmin_value\x18\x01 \x01(\x01\x12\x11\n\tmax_value\x18\x02 \x01(\x01\"(\n\nMetricName\x12\r\n\x05group\x18\x01 \x01(\t\x12\x0b\n\x03tag\x18\x02 \x01(\t\"\x9e\x01\n\nMetricInfo\x12-\n\x04name\x18\x01 \x01(\x0b\x32\x1f.tensorboardX.hparam.MetricName\x12\x14\n\x0c\x64isplay_name\x18\x03 \x01(\t\x12\x13\n\x0b\x64\x65scription\x18\x04 \x01(\t\x12\x36\n\x0c\x64\x61taset_type\x18\x05 \x01(\x0e\x32 .tensorboardX.hparam.DatasetType\"\xa3\x02\n\x0cSessionGroup\x12\x0c\n\x04name\x18\x01 \x01(\t\x12?\n\x07hparams\x18\x02 \x03(\x0b\x32..tensorboardX.hparam.SessionGroup.HparamsEntry\x12\x37\n\rmetric_values\x18\x03 \x03(\x0b\x32 .tensorboardX.hparam.MetricValue\x12.\n\x08sessions\x18\x04 \x03(\x0b\x32\x1c.tensorboardX.hparam.Session\x12\x13\n\x0bmonitor_url\x18\x05 \x01(\t\x1a\x46\n\x0cHparamsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.google.protobuf.Value:\x02\x38\x01\"z\n\x0bMetricValue\x12-\n\x04name\x18\x01 \x01(\x0b\x32\x1f.tensorboardX.hparam.MetricName\x12\r\n\x05value\x18\x02 \x01(\x01\x12\x15\n\rtraining_step\x18\x03 
\x01(\x05\x12\x16\n\x0ewall_time_secs\x18\x04 \x01(\x01\"\xd5\x01\n\x07Session\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x17\n\x0fstart_time_secs\x18\x02 \x01(\x01\x12\x15\n\rend_time_secs\x18\x03 \x01(\x01\x12+\n\x06status\x18\x04 \x01(\x0e\x32\x1b.tensorboardX.hparam.Status\x12\x11\n\tmodel_uri\x18\x05 \x01(\t\x12\x37\n\rmetric_values\x18\x06 \x03(\x0b\x32 .tensorboardX.hparam.MetricValue\x12\x13\n\x0bmonitor_url\x18\x07 \x01(\t\"/\n\x14GetExperimentRequest\x12\x17\n\x0f\x65xperiment_name\x18\x01 \x01(\t\"\xc4\x02\n\x18ListSessionGroupsRequest\x12\x17\n\x0f\x65xperiment_name\x18\x06 \x01(\t\x12\x35\n\x10\x61llowed_statuses\x18\x07 \x03(\x0e\x32\x1b.tensorboardX.hparam.Status\x12\x32\n\ncol_params\x18\x01 \x03(\x0b\x32\x1e.tensorboardX.hparam.ColParams\x12>\n\x10\x61ggregation_type\x18\x02 \x01(\x0e\x32$.tensorboardX.hparam.AggregationType\x12;\n\x12\x61ggregation_metric\x18\x03 \x01(\x0b\x32\x1f.tensorboardX.hparam.MetricName\x12\x13\n\x0bstart_index\x18\x04 \x01(\x05\x12\x12\n\nslice_size\x18\x05 \x01(\x05\"\xd9\x02\n\tColParams\x12\x31\n\x06metric\x18\x01 \x01(\x0b\x32\x1f.tensorboardX.hparam.MetricNameH\x00\x12\x10\n\x06hparam\x18\x02 \x01(\tH\x00\x12-\n\x05order\x18\x03 \x01(\x0e\x32\x1e.tensorboardX.hparam.SortOrder\x12\x1c\n\x14missing_values_first\x18\x04 \x01(\x08\x12\x17\n\rfilter_regexp\x18\x05 \x01(\tH\x01\x12\x38\n\x0f\x66ilter_interval\x18\x06 \x01(\x0b\x32\x1d.tensorboardX.hparam.IntervalH\x01\x12\x35\n\x0f\x66ilter_discrete\x18\x07 \x01(\x0b\x32\x1a.google.protobuf.ListValueH\x01\x12\x1e\n\x16\x65xclude_missing_values\x18\x08 \x01(\x08\x42\x06\n\x04nameB\x08\n\x06\x66ilter\"j\n\x19ListSessionGroupsResponse\x12\x39\n\x0esession_groups\x18\x01 \x03(\x0b\x32!.tensorboardX.hparam.SessionGroup\x12\x12\n\ntotal_size\x18\x03 \x01(\x05\"}\n\x16ListMetricEvalsRequest\x12\x17\n\x0f\x65xperiment_name\x18\x03 \x01(\t\x12\x14\n\x0csession_name\x18\x01 \x01(\t\x12\x34\n\x0bmetric_name\x18\x02 
\x01(\x0b\x32\x1f.tensorboardX.hparam.MetricName*`\n\x08\x44\x61taType\x12\x13\n\x0f\x44\x41TA_TYPE_UNSET\x10\x00\x12\x14\n\x10\x44\x41TA_TYPE_STRING\x10\x01\x12\x12\n\x0e\x44\x41TA_TYPE_BOOL\x10\x02\x12\x15\n\x11\x44\x41TA_TYPE_FLOAT64\x10\x03*P\n\x0b\x44\x61tasetType\x12\x13\n\x0f\x44\x41TASET_UNKNOWN\x10\x00\x12\x14\n\x10\x44\x41TASET_TRAINING\x10\x01\x12\x16\n\x12\x44\x41TASET_VALIDATION\x10\x02*X\n\x06Status\x12\x12\n\x0eSTATUS_UNKNOWN\x10\x00\x12\x12\n\x0eSTATUS_SUCCESS\x10\x01\x12\x12\n\x0eSTATUS_FAILURE\x10\x02\x12\x12\n\x0eSTATUS_RUNNING\x10\x03*A\n\tSortOrder\x12\x15\n\x11ORDER_UNSPECIFIED\x10\x00\x12\r\n\tORDER_ASC\x10\x01\x12\x0e\n\nORDER_DESC\x10\x02*\x7f\n\x0f\x41ggregationType\x12\x15\n\x11\x41GGREGATION_UNSET\x10\x00\x12\x13\n\x0f\x41GGREGATION_AVG\x10\x01\x12\x16\n\x12\x41GGREGATION_MEDIAN\x10\x02\x12\x13\n\x0f\x41GGREGATION_MIN\x10\x03\x12\x13\n\x0f\x41GGREGATION_MAX\x10\x04\x62\x06proto3')
+  ,
+  dependencies=[google_dot_protobuf_dot_struct__pb2.DESCRIPTOR,])
+
+_DATATYPE = _descriptor.EnumDescriptor(
+  name='DataType',
+  full_name='tensorboardX.hparam.DataType',
+  filename=None,
+  file=DESCRIPTOR,
+  values=[
+    _descriptor.EnumValueDescriptor(
+      name='DATA_TYPE_UNSET', index=0, number=0,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DATA_TYPE_STRING', index=1, number=1,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DATA_TYPE_BOOL', index=2, number=2,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DATA_TYPE_FLOAT64', index=3, number=3,
+      serialized_options=None,
+      type=None),
+  ],
+  containing_type=None,
+  serialized_options=None,
+  serialized_start=2370,
+  serialized_end=2466,
+)
+_sym_db.RegisterEnumDescriptor(_DATATYPE)
+
+DataType = enum_type_wrapper.EnumTypeWrapper(_DATATYPE)
+_DATASETTYPE = _descriptor.EnumDescriptor(
+  name='DatasetType',
+  full_name='tensorboardX.hparam.DatasetType',
+  filename=None,
+  file=DESCRIPTOR,
+  values=[
+    _descriptor.EnumValueDescriptor(
+      name='DATASET_UNKNOWN', index=0, number=0,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DATASET_TRAINING', index=1, number=1,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DATASET_VALIDATION', index=2, number=2,
+      serialized_options=None,
+      type=None),
+  ],
+  containing_type=None,
+  serialized_options=None,
+  serialized_start=2468,
+  serialized_end=2548,
+)
+_sym_db.RegisterEnumDescriptor(_DATASETTYPE)
+
+DatasetType = enum_type_wrapper.EnumTypeWrapper(_DATASETTYPE)
+_STATUS = _descriptor.EnumDescriptor(
+  name='Status',
+  full_name='tensorboardX.hparam.Status',
+  filename=None,
+  file=DESCRIPTOR,
+  values=[
+    _descriptor.EnumValueDescriptor(
+      name='STATUS_UNKNOWN', index=0, number=0,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='STATUS_SUCCESS', index=1, number=1,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='STATUS_FAILURE', index=2, number=2,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='STATUS_RUNNING', index=3, number=3,
+      serialized_options=None,
+      type=None),
+  ],
+  containing_type=None,
+  serialized_options=None,
+  serialized_start=2550,
+  serialized_end=2638,
+)
+_sym_db.RegisterEnumDescriptor(_STATUS)
+
+Status = enum_type_wrapper.EnumTypeWrapper(_STATUS)
+_SORTORDER = _descriptor.EnumDescriptor(
+  name='SortOrder',
+  full_name='tensorboardX.hparam.SortOrder',
+  filename=None,
+  file=DESCRIPTOR,
+  values=[
+    _descriptor.EnumValueDescriptor(
+      name='ORDER_UNSPECIFIED', index=0, number=0,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='ORDER_ASC', index=1, number=1,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='ORDER_DESC', index=2, number=2,
+      serialized_options=None,
+      type=None),
+  ],
+  containing_type=None,
+  serialized_options=None,
+  serialized_start=2640,
+  serialized_end=2705,
+)
+_sym_db.RegisterEnumDescriptor(_SORTORDER)
+
+SortOrder = enum_type_wrapper.EnumTypeWrapper(_SORTORDER)
+_AGGREGATIONTYPE = _descriptor.EnumDescriptor(
+  name='AggregationType',
+  full_name='tensorboardX.hparam.AggregationType',
+  filename=None,
+  file=DESCRIPTOR,
+  values=[
+    _descriptor.EnumValueDescriptor(
+      name='AGGREGATION_UNSET', index=0, number=0,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='AGGREGATION_AVG', index=1, number=1,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='AGGREGATION_MEDIAN', index=2, number=2,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='AGGREGATION_MIN', index=3, number=3,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='AGGREGATION_MAX', index=4, number=4,
+      serialized_options=None,
+      type=None),
+  ],
+  containing_type=None,
+  serialized_options=None,
+  serialized_start=2707,
+  serialized_end=2834,
+)
+_sym_db.RegisterEnumDescriptor(_AGGREGATIONTYPE)
+
+AggregationType = enum_type_wrapper.EnumTypeWrapper(_AGGREGATIONTYPE)
+DATA_TYPE_UNSET = 0
+DATA_TYPE_STRING = 1
+DATA_TYPE_BOOL = 2
+DATA_TYPE_FLOAT64 = 3
+DATASET_UNKNOWN = 0
+DATASET_TRAINING = 1
+DATASET_VALIDATION = 2
+STATUS_UNKNOWN = 0
+STATUS_SUCCESS = 1
+STATUS_FAILURE = 2
+STATUS_RUNNING = 3
+ORDER_UNSPECIFIED = 0
+ORDER_ASC = 1
+ORDER_DESC = 2
+AGGREGATION_UNSET = 0
+AGGREGATION_AVG = 1
+AGGREGATION_MEDIAN = 2
+AGGREGATION_MIN = 3
+AGGREGATION_MAX = 4
+
+
+
+_EXPERIMENT = _descriptor.Descriptor(
+  name='Experiment',
+  full_name='tensorboardX.hparam.Experiment',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='name', full_name='tensorboardX.hparam.Experiment.name', index=0,
+      number=6, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='description', full_name='tensorboardX.hparam.Experiment.description', index=1,
+      number=1, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='user', full_name='tensorboardX.hparam.Experiment.user', index=2,
+      number=2, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='time_created_secs', full_name='tensorboardX.hparam.Experiment.time_created_secs', index=3,
+      number=3, type=1, cpp_type=5, label=1,
+      has_default_value=False, default_value=float(0),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='hparam_infos', full_name='tensorboardX.hparam.Experiment.hparam_infos', index=4,
+      number=4, type=11, cpp_type=10, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='metric_infos', full_name='tensorboardX.hparam.Experiment.metric_infos', index=5,
+      number=5, type=11, cpp_type=10, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=84,
+  serialized_end=282,
+)
+
+
+_HPARAMINFO = _descriptor.Descriptor(
+  name='HParamInfo',
+  full_name='tensorboardX.hparam.HParamInfo',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='name', full_name='tensorboardX.hparam.HParamInfo.name', index=0,
+      number=1, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='display_name', full_name='tensorboardX.hparam.HParamInfo.display_name', index=1,
+      number=2, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='description', full_name='tensorboardX.hparam.HParamInfo.description', index=2,
+      number=3, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='type', full_name='tensorboardX.hparam.HParamInfo.type', index=3,
+      number=4, type=14, cpp_type=8, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='domain_discrete', full_name='tensorboardX.hparam.HParamInfo.domain_discrete', index=4,
+      number=5, type=11, cpp_type=10, label=1,
+      has_default_value=False, default_value=None,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='domain_interval', full_name='tensorboardX.hparam.HParamInfo.domain_interval', index=5,
+      number=6, type=11, cpp_type=10, label=1,
+      has_default_value=False, default_value=None,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+    _descriptor.OneofDescriptor(
+      name='domain', full_name='tensorboardX.hparam.HParamInfo.domain',
+      index=0, containing_type=None, fields=[]),
+  ],
+  serialized_start=285,
+  serialized_end=522,
+)
+
+
+_INTERVAL = _descriptor.Descriptor(
+  name='Interval',
+  full_name='tensorboardX.hparam.Interval',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='min_value', full_name='tensorboardX.hparam.Interval.min_value', index=0,
+      number=1, type=1, cpp_type=5, label=1,
+      has_default_value=False, default_value=float(0),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='max_value', full_name='tensorboardX.hparam.Interval.max_value', index=1,
+      number=2, type=1, cpp_type=5, label=1,
+      has_default_value=False, default_value=float(0),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=524,
+  serialized_end=572,
+)
+
+
+_METRICNAME = _descriptor.Descriptor(
+  name='MetricName',
+  full_name='tensorboardX.hparam.MetricName',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='group', full_name='tensorboardX.hparam.MetricName.group', index=0,
+      number=1, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='tag', full_name='tensorboardX.hparam.MetricName.tag', index=1,
+      number=2, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=574,
+  serialized_end=614,
+)
+
+
+_METRICINFO = _descriptor.Descriptor(
+  name='MetricInfo',
+  full_name='tensorboardX.hparam.MetricInfo',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='name', full_name='tensorboardX.hparam.MetricInfo.name', index=0,
+      number=1, type=11, cpp_type=10, label=1,
+      has_default_value=False, default_value=None,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='display_name', full_name='tensorboardX.hparam.MetricInfo.display_name', index=1,
+      number=3, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='description', full_name='tensorboardX.hparam.MetricInfo.description', index=2,
+      number=4, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='dataset_type', full_name='tensorboardX.hparam.MetricInfo.dataset_type', index=3,
+      number=5, type=14, cpp_type=8, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=617,
+  serialized_end=775,
+)
+
+
+_SESSIONGROUP_HPARAMSENTRY = _descriptor.Descriptor(
+  name='HparamsEntry',
+  full_name='tensorboardX.hparam.SessionGroup.HparamsEntry',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='key', full_name='tensorboardX.hparam.SessionGroup.HparamsEntry.key', index=0,
+      number=1, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='value', full_name='tensorboardX.hparam.SessionGroup.HparamsEntry.value', index=1,
+      number=2, type=11, cpp_type=10, label=1,
+      has_default_value=False, default_value=None,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=_b('8\001'),
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=999,
+  serialized_end=1069,
+)
+
+_SESSIONGROUP = _descriptor.Descriptor(
+  name='SessionGroup',
+  full_name='tensorboardX.hparam.SessionGroup',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='name', full_name='tensorboardX.hparam.SessionGroup.name', index=0,
+      number=1, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='hparams', full_name='tensorboardX.hparam.SessionGroup.hparams', index=1,
+      number=2, type=11, cpp_type=10, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='metric_values', full_name='tensorboardX.hparam.SessionGroup.metric_values', index=2,
+      number=3, type=11, cpp_type=10, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='sessions', full_name='tensorboardX.hparam.SessionGroup.sessions', index=3,
+      number=4, type=11, cpp_type=10, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='monitor_url', full_name='tensorboardX.hparam.SessionGroup.monitor_url', index=4,
+      number=5, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[_SESSIONGROUP_HPARAMSENTRY, ],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=778,
+  serialized_end=1069,
+)
+
+
+_METRICVALUE = _descriptor.Descriptor(
+  name='MetricValue',
+  full_name='tensorboardX.hparam.MetricValue',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='name', full_name='tensorboardX.hparam.MetricValue.name', index=0,
+      number=1, type=11, cpp_type=10, label=1,
+      has_default_value=False, default_value=None,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='value', full_name='tensorboardX.hparam.MetricValue.value', index=1,
+      number=2, type=1, cpp_type=5, label=1,
+      has_default_value=False, default_value=float(0),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='training_step', full_name='tensorboardX.hparam.MetricValue.training_step', index=2,
+      number=3, type=5, cpp_type=1, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='wall_time_secs', full_name='tensorboardX.hparam.MetricValue.wall_time_secs', index=3,
+      number=4, type=1, cpp_type=5, label=1,
+      has_default_value=False, default_value=float(0),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=1071,
+  serialized_end=1193,
+)
+
+
+_SESSION = _descriptor.Descriptor(
+  name='Session',
+  full_name='tensorboardX.hparam.Session',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='name', full_name='tensorboardX.hparam.Session.name', index=0,
+      number=1, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='start_time_secs', full_name='tensorboardX.hparam.Session.start_time_secs', index=1,
+      number=2, type=1, cpp_type=5, label=1,
+      has_default_value=False, default_value=float(0),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='end_time_secs', full_name='tensorboardX.hparam.Session.end_time_secs', index=2,
+      number=3, type=1, cpp_type=5, label=1,
+      has_default_value=False, default_value=float(0),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='status', full_name='tensorboardX.hparam.Session.status', index=3,
+      number=4, type=14, cpp_type=8, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='model_uri', full_name='tensorboardX.hparam.Session.model_uri', index=4,
+      number=5, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='metric_values', full_name='tensorboardX.hparam.Session.metric_values', index=5,
+      number=6, type=11, cpp_type=10, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='monitor_url', full_name='tensorboardX.hparam.Session.monitor_url', index=6,
+      number=7, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=1196,
+  serialized_end=1409,
+)
+
+
+_GETEXPERIMENTREQUEST = _descriptor.Descriptor(
+  name='GetExperimentRequest',
+  full_name='tensorboardX.hparam.GetExperimentRequest',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='experiment_name', full_name='tensorboardX.hparam.GetExperimentRequest.experiment_name', index=0,
+      number=1, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=1411,
+  serialized_end=1458,
+)
+
+
+_LISTSESSIONGROUPSREQUEST = _descriptor.Descriptor(
+  name='ListSessionGroupsRequest',
+  full_name='tensorboardX.hparam.ListSessionGroupsRequest',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='experiment_name', full_name='tensorboardX.hparam.ListSessionGroupsRequest.experiment_name', index=0,
+      number=6, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='allowed_statuses', full_name='tensorboardX.hparam.ListSessionGroupsRequest.allowed_statuses', index=1,
+      number=7, type=14, cpp_type=8, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='col_params', full_name='tensorboardX.hparam.ListSessionGroupsRequest.col_params', index=2,
+      number=1, type=11, cpp_type=10, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='aggregation_type', full_name='tensorboardX.hparam.ListSessionGroupsRequest.aggregation_type', index=3,
+      number=2, type=14, cpp_type=8, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='aggregation_metric', full_name='tensorboardX.hparam.ListSessionGroupsRequest.aggregation_metric', index=4,
+      number=3, type=11, cpp_type=10, label=1,
+      has_default_value=False, default_value=None,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='start_index', full_name='tensorboardX.hparam.ListSessionGroupsRequest.start_index', index=5,
+      number=4, type=5, cpp_type=1, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='slice_size', full_name='tensorboardX.hparam.ListSessionGroupsRequest.slice_size', index=6,
+      number=5, type=5, cpp_type=1, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=1461,
+  serialized_end=1785,
+)
+
+
+_COLPARAMS = _descriptor.Descriptor(
+  name='ColParams',
+  full_name='tensorboardX.hparam.ColParams',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='metric', full_name='tensorboardX.hparam.ColParams.metric', index=0,
+      number=1, type=11, cpp_type=10, label=1,
+      has_default_value=False, default_value=None,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='hparam', full_name='tensorboardX.hparam.ColParams.hparam', index=1,
+      number=2, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='order', full_name='tensorboardX.hparam.ColParams.order', index=2,
+      number=3, type=14, cpp_type=8, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='missing_values_first', full_name='tensorboardX.hparam.ColParams.missing_values_first', index=3,
+      number=4, type=8, cpp_type=7, label=1,
+      has_default_value=False, default_value=False,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='filter_regexp', full_name='tensorboardX.hparam.ColParams.filter_regexp', index=4,
+      number=5, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='filter_interval', full_name='tensorboardX.hparam.ColParams.filter_interval', index=5,
+      number=6, type=11, cpp_type=10, label=1,
+      has_default_value=False, default_value=None,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='filter_discrete', full_name='tensorboardX.hparam.ColParams.filter_discrete', index=6,
+      number=7, type=11, cpp_type=10, label=1,
+      has_default_value=False, default_value=None,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='exclude_missing_values', full_name='tensorboardX.hparam.ColParams.exclude_missing_values', index=7,
+      number=8, type=8, cpp_type=7, label=1,
+      has_default_value=False, default_value=False,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+    _descriptor.OneofDescriptor(
+      name='name', full_name='tensorboardX.hparam.ColParams.name',
+      index=0, containing_type=None, fields=[]),
+    _descriptor.OneofDescriptor(
+      name='filter', full_name='tensorboardX.hparam.ColParams.filter',
+      index=1, containing_type=None, fields=[]),
+  ],
+  serialized_start=1788,
+  serialized_end=2133,
+)
+
+
+_LISTSESSIONGROUPSRESPONSE = _descriptor.Descriptor(
+  name='ListSessionGroupsResponse',
+  full_name='tensorboardX.hparam.ListSessionGroupsResponse',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='session_groups', full_name='tensorboardX.hparam.ListSessionGroupsResponse.session_groups', index=0,
+      number=1, type=11, cpp_type=10, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='total_size', full_name='tensorboardX.hparam.ListSessionGroupsResponse.total_size', index=1,
+      number=3, type=5, cpp_type=1, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=2135,
+  serialized_end=2241,
+)
+
+
+_LISTMETRICEVALSREQUEST = _descriptor.Descriptor(
+  name='ListMetricEvalsRequest',
+  full_name='tensorboardX.hparam.ListMetricEvalsRequest',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='experiment_name', full_name='tensorboardX.hparam.ListMetricEvalsRequest.experiment_name', index=0,
+      number=3, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='session_name', full_name='tensorboardX.hparam.ListMetricEvalsRequest.session_name', index=1,
+      number=1, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='metric_name', full_name='tensorboardX.hparam.ListMetricEvalsRequest.metric_name', index=2,
+      number=2, type=11, cpp_type=10, label=1,
+      has_default_value=False, default_value=None,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=2243,
+  serialized_end=2368,
+)
+
+_EXPERIMENT.fields_by_name['hparam_infos'].message_type = _HPARAMINFO
+_EXPERIMENT.fields_by_name['metric_infos'].message_type = _METRICINFO
+_HPARAMINFO.fields_by_name['type'].enum_type = _DATATYPE
+_HPARAMINFO.fields_by_name['domain_discrete'].message_type = google_dot_protobuf_dot_struct__pb2._LISTVALUE
+_HPARAMINFO.fields_by_name['domain_interval'].message_type = _INTERVAL
+_HPARAMINFO.oneofs_by_name['domain'].fields.append(
+  _HPARAMINFO.fields_by_name['domain_discrete'])
+_HPARAMINFO.fields_by_name['domain_discrete'].containing_oneof = _HPARAMINFO.oneofs_by_name['domain']
+_HPARAMINFO.oneofs_by_name['domain'].fields.append(
+  _HPARAMINFO.fields_by_name['domain_interval'])
+_HPARAMINFO.fields_by_name['domain_interval'].containing_oneof = _HPARAMINFO.oneofs_by_name['domain']
+_METRICINFO.fields_by_name['name'].message_type = _METRICNAME
+_METRICINFO.fields_by_name['dataset_type'].enum_type = _DATASETTYPE
+_SESSIONGROUP_HPARAMSENTRY.fields_by_name['value'].message_type = google_dot_protobuf_dot_struct__pb2._VALUE
+_SESSIONGROUP_HPARAMSENTRY.containing_type = _SESSIONGROUP
+_SESSIONGROUP.fields_by_name['hparams'].message_type = _SESSIONGROUP_HPARAMSENTRY
+_SESSIONGROUP.fields_by_name['metric_values'].message_type = _METRICVALUE
+_SESSIONGROUP.fields_by_name['sessions'].message_type = _SESSION
+_METRICVALUE.fields_by_name['name'].message_type = _METRICNAME
+_SESSION.fields_by_name['status'].enum_type = _STATUS
+_SESSION.fields_by_name['metric_values'].message_type = _METRICVALUE
+_LISTSESSIONGROUPSREQUEST.fields_by_name['allowed_statuses'].enum_type = _STATUS
+_LISTSESSIONGROUPSREQUEST.fields_by_name['col_params'].message_type = _COLPARAMS
+_LISTSESSIONGROUPSREQUEST.fields_by_name['aggregation_type'].enum_type = _AGGREGATIONTYPE
+_LISTSESSIONGROUPSREQUEST.fields_by_name['aggregation_metric'].message_type = _METRICNAME
+_COLPARAMS.fields_by_name['metric'].message_type = _METRICNAME
+_COLPARAMS.fields_by_name['order'].enum_type = _SORTORDER
+_COLPARAMS.fields_by_name['filter_interval'].message_type = _INTERVAL
+_COLPARAMS.fields_by_name['filter_discrete'].message_type = google_dot_protobuf_dot_struct__pb2._LISTVALUE
+_COLPARAMS.oneofs_by_name['name'].fields.append(
+  _COLPARAMS.fields_by_name['metric'])
+_COLPARAMS.fields_by_name['metric'].containing_oneof = _COLPARAMS.oneofs_by_name['name']
+_COLPARAMS.oneofs_by_name['name'].fields.append(
+  _COLPARAMS.fields_by_name['hparam'])
+_COLPARAMS.fields_by_name['hparam'].containing_oneof = _COLPARAMS.oneofs_by_name['name']
+_COLPARAMS.oneofs_by_name['filter'].fields.append(
+  _COLPARAMS.fields_by_name['filter_regexp'])
+_COLPARAMS.fields_by_name['filter_regexp'].containing_oneof = _COLPARAMS.oneofs_by_name['filter']
+_COLPARAMS.oneofs_by_name['filter'].fields.append(
+  _COLPARAMS.fields_by_name['filter_interval'])
+_COLPARAMS.fields_by_name['filter_interval'].containing_oneof = _COLPARAMS.oneofs_by_name['filter']
+_COLPARAMS.oneofs_by_name['filter'].fields.append(
+  _COLPARAMS.fields_by_name['filter_discrete'])
+_COLPARAMS.fields_by_name['filter_discrete'].containing_oneof = _COLPARAMS.oneofs_by_name['filter']
+_LISTSESSIONGROUPSRESPONSE.fields_by_name['session_groups'].message_type = _SESSIONGROUP
+_LISTMETRICEVALSREQUEST.fields_by_name['metric_name'].message_type = _METRICNAME
+DESCRIPTOR.message_types_by_name['Experiment'] = _EXPERIMENT
+DESCRIPTOR.message_types_by_name['HParamInfo'] = _HPARAMINFO
+DESCRIPTOR.message_types_by_name['Interval'] = _INTERVAL
+DESCRIPTOR.message_types_by_name['MetricName'] = _METRICNAME
+DESCRIPTOR.message_types_by_name['MetricInfo'] = _METRICINFO
+DESCRIPTOR.message_types_by_name['SessionGroup'] = _SESSIONGROUP
+DESCRIPTOR.message_types_by_name['MetricValue'] = _METRICVALUE
+DESCRIPTOR.message_types_by_name['Session'] = _SESSION
+DESCRIPTOR.message_types_by_name['GetExperimentRequest'] = _GETEXPERIMENTREQUEST
+DESCRIPTOR.message_types_by_name['ListSessionGroupsRequest'] = _LISTSESSIONGROUPSREQUEST
+DESCRIPTOR.message_types_by_name['ColParams'] = _COLPARAMS
+DESCRIPTOR.message_types_by_name['ListSessionGroupsResponse'] = _LISTSESSIONGROUPSRESPONSE
+DESCRIPTOR.message_types_by_name['ListMetricEvalsRequest'] = _LISTMETRICEVALSREQUEST
+DESCRIPTOR.enum_types_by_name['DataType'] = _DATATYPE
+DESCRIPTOR.enum_types_by_name['DatasetType'] = _DATASETTYPE
+DESCRIPTOR.enum_types_by_name['Status'] = _STATUS
+DESCRIPTOR.enum_types_by_name['SortOrder'] = _SORTORDER
+DESCRIPTOR.enum_types_by_name['AggregationType'] = _AGGREGATIONTYPE
+_sym_db.RegisterFileDescriptor(DESCRIPTOR)
+
+Experiment = _reflection.GeneratedProtocolMessageType('Experiment', (_message.Message,), dict(
+  DESCRIPTOR = _EXPERIMENT,
+  __module__ = 'tensorboardX.proto.api_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.hparam.Experiment)
+  ))
+_sym_db.RegisterMessage(Experiment)
+
+HParamInfo = _reflection.GeneratedProtocolMessageType('HParamInfo', (_message.Message,), dict(
+  DESCRIPTOR = _HPARAMINFO,
+  __module__ = 'tensorboardX.proto.api_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.hparam.HParamInfo)
+  ))
+_sym_db.RegisterMessage(HParamInfo)
+
+Interval = _reflection.GeneratedProtocolMessageType('Interval', (_message.Message,), dict(
+  DESCRIPTOR = _INTERVAL,
+  __module__ = 'tensorboardX.proto.api_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.hparam.Interval)
+  ))
+_sym_db.RegisterMessage(Interval)
+
+MetricName = _reflection.GeneratedProtocolMessageType('MetricName', (_message.Message,), dict(
+  DESCRIPTOR = _METRICNAME,
+  __module__ = 'tensorboardX.proto.api_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.hparam.MetricName)
+  ))
+_sym_db.RegisterMessage(MetricName)
+
+MetricInfo = _reflection.GeneratedProtocolMessageType('MetricInfo', (_message.Message,), dict(
+  DESCRIPTOR = _METRICINFO,
+  __module__ = 'tensorboardX.proto.api_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.hparam.MetricInfo)
+  ))
+_sym_db.RegisterMessage(MetricInfo)
+
+SessionGroup = _reflection.GeneratedProtocolMessageType('SessionGroup', (_message.Message,), dict(
+
+  HparamsEntry = _reflection.GeneratedProtocolMessageType('HparamsEntry', (_message.Message,), dict(
+    DESCRIPTOR = _SESSIONGROUP_HPARAMSENTRY,
+    __module__ = 'tensorboardX.proto.api_pb2'
+    # @@protoc_insertion_point(class_scope:tensorboardX.hparam.SessionGroup.HparamsEntry)
+    ))
+  ,
+  DESCRIPTOR = _SESSIONGROUP,
+  __module__ = 'tensorboardX.proto.api_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.hparam.SessionGroup)
+  ))
+_sym_db.RegisterMessage(SessionGroup)
+_sym_db.RegisterMessage(SessionGroup.HparamsEntry)
+
+MetricValue = _reflection.GeneratedProtocolMessageType('MetricValue', (_message.Message,), dict(
+  DESCRIPTOR = _METRICVALUE,
+  __module__ = 'tensorboardX.proto.api_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.hparam.MetricValue)
+  ))
+_sym_db.RegisterMessage(MetricValue)
+
+Session = _reflection.GeneratedProtocolMessageType('Session', (_message.Message,), dict(
+  DESCRIPTOR = _SESSION,
+  __module__ = 'tensorboardX.proto.api_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.hparam.Session)
+  ))
+_sym_db.RegisterMessage(Session)
+
+GetExperimentRequest = _reflection.GeneratedProtocolMessageType('GetExperimentRequest', (_message.Message,), dict(
+  DESCRIPTOR = _GETEXPERIMENTREQUEST,
+  __module__ = 'tensorboardX.proto.api_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.hparam.GetExperimentRequest)
+  ))
+_sym_db.RegisterMessage(GetExperimentRequest)
+
+ListSessionGroupsRequest = _reflection.GeneratedProtocolMessageType('ListSessionGroupsRequest', (_message.Message,), dict(
+  DESCRIPTOR = _LISTSESSIONGROUPSREQUEST,
+  __module__ = 'tensorboardX.proto.api_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.hparam.ListSessionGroupsRequest)
+  ))
+_sym_db.RegisterMessage(ListSessionGroupsRequest)
+
+ColParams = _reflection.GeneratedProtocolMessageType('ColParams', (_message.Message,), dict(
+  DESCRIPTOR = _COLPARAMS,
+  __module__ = 'tensorboardX.proto.api_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.hparam.ColParams)
+  ))
+_sym_db.RegisterMessage(ColParams)
+
+ListSessionGroupsResponse = _reflection.GeneratedProtocolMessageType('ListSessionGroupsResponse', (_message.Message,), dict(
+  DESCRIPTOR = _LISTSESSIONGROUPSRESPONSE,
+  __module__ = 'tensorboardX.proto.api_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.hparam.ListSessionGroupsResponse)
+  ))
+_sym_db.RegisterMessage(ListSessionGroupsResponse)
+
+ListMetricEvalsRequest = _reflection.GeneratedProtocolMessageType('ListMetricEvalsRequest', (_message.Message,), dict(
+  DESCRIPTOR = _LISTMETRICEVALSREQUEST,
+  __module__ = 'tensorboardX.proto.api_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.hparam.ListMetricEvalsRequest)
+  ))
+_sym_db.RegisterMessage(ListMetricEvalsRequest)
+
+
+_SESSIONGROUP_HPARAMSENTRY._options = None
+# @@protoc_insertion_point(module_scope)
diff --git a/tensorboardX/tensorboardX/proto/attr_value.proto b/tensorboardX/tensorboardX/proto/attr_value.proto
new file mode 100644
index 0000000000000000000000000000000000000000..b73fb36f91bb3f560593511138510480d6c700cc
--- /dev/null
+++ b/tensorboardX/tensorboardX/proto/attr_value.proto
@@ -0,0 +1,62 @@
+syntax = "proto3";
+
+package tensorboardX;
+option cc_enable_arenas = true;
+option java_outer_classname = "AttrValueProtos";
+option java_multiple_files = true;
+option java_package = "org.tensorflow.framework";
+
+import "tensorboardX/proto/tensor.proto";
+import "tensorboardX/proto/tensor_shape.proto";
+import "tensorboardX/proto/types.proto";
+
+// Protocol buffer representing the value for an attr used to configure an Op.
+// Comment indicates the corresponding attr type.  Only the field matching the
+// attr type may be filled.
+message AttrValue {
+  // LINT.IfChange
+  message ListValue {
+    repeated bytes s = 2;                        // "list(string)"
+    repeated int64 i = 3 [packed = true];        // "list(int)"
+    repeated float f = 4 [packed = true];        // "list(float)"
+    repeated bool b = 5 [packed = true];         // "list(bool)"
+    repeated DataType type = 6 [packed = true];  // "list(type)"
+    repeated TensorShapeProto shape = 7;         // "list(shape)"
+    repeated TensorProto tensor = 8;             // "list(tensor)"
+    repeated NameAttrList func = 9;              // "list(func)"
+  }
+  // LINT.ThenChange(https://www.tensorflow.org/code/tensorflow/c/c_api.cc)
+
+  oneof value {
+    bytes s = 2;                 // "string"
+    int64 i = 3;                 // "int"
+    float f = 4;                 // "float"
+    bool b = 5;                  // "bool"
+    DataType type = 6;           // "type"
+    TensorShapeProto shape = 7;  // "shape"
+    TensorProto tensor = 8;      // "tensor"
+    ListValue list = 1;          // any "list(...)"
+
+    // "func" represents a function. func.name is a function's name or
+    // a primitive op's name. func.attr.first is the name of an attr
+    // defined for that function. func.attr.second is the value for
+    // that attr in the instantiation.
+    NameAttrList func = 10;
+
+    // This is a placeholder only used in nodes defined inside a
+    // function.  It indicates the attr value will be supplied when
+    // the function is instantiated.  For example, let us suppose a
+    // node "N" in function "FN". "N" has an attr "A" with value
+    // placeholder = "foo". When FN is instantiated with attr "foo"
+    // set to "bar", the instantiated node N's attr A will have been
+    // given the value "bar".
+    string placeholder = 9;
+  }
+}
+
+// A list of attr names and their values. The whole list is attached
+// with a string name.  E.g., MatMul[T=float].
+message NameAttrList {
+  string name = 1;
+  map<string, AttrValue> attr = 2;
+}
diff --git a/tensorboardX/tensorboardX/proto/attr_value_pb2.py b/tensorboardX/tensorboardX/proto/attr_value_pb2.py
new file mode 100644
index 0000000000000000000000000000000000000000..8720e705d8e3c3971ecf6e9522d7ae1e9d6b16dd
--- /dev/null
+++ b/tensorboardX/tensorboardX/proto/attr_value_pb2.py
@@ -0,0 +1,366 @@
+# Generated by the protocol buffer compiler.  DO NOT EDIT!
+# source: tensorboardX/proto/attr_value.proto
+
+import sys
+_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1'))
+from google.protobuf import descriptor as _descriptor
+from google.protobuf import message as _message
+from google.protobuf import reflection as _reflection
+from google.protobuf import symbol_database as _symbol_database
+# @@protoc_insertion_point(imports)
+
+_sym_db = _symbol_database.Default()
+
+
+from tensorboardX.proto import tensor_pb2 as tensorboardX_dot_proto_dot_tensor__pb2
+from tensorboardX.proto import tensor_shape_pb2 as tensorboardX_dot_proto_dot_tensor__shape__pb2
+from tensorboardX.proto import types_pb2 as tensorboardX_dot_proto_dot_types__pb2
+
+
+DESCRIPTOR = _descriptor.FileDescriptor(
+  name='tensorboardX/proto/attr_value.proto',
+  package='tensorboardX',
+  syntax='proto3',
+  serialized_options=_b('\n\030org.tensorflow.frameworkB\017AttrValueProtosP\001\370\001\001'),
+  serialized_pb=_b('\n#tensorboardX/proto/attr_value.proto\x12\x0ctensorboardX\x1a\x1ftensorboardX/proto/tensor.proto\x1a%tensorboardX/proto/tensor_shape.proto\x1a\x1etensorboardX/proto/types.proto\"\xb8\x04\n\tAttrValue\x12\x0b\n\x01s\x18\x02 \x01(\x0cH\x00\x12\x0b\n\x01i\x18\x03 \x01(\x03H\x00\x12\x0b\n\x01\x66\x18\x04 \x01(\x02H\x00\x12\x0b\n\x01\x62\x18\x05 \x01(\x08H\x00\x12&\n\x04type\x18\x06 \x01(\x0e\x32\x16.tensorboardX.DataTypeH\x00\x12/\n\x05shape\x18\x07 \x01(\x0b\x32\x1e.tensorboardX.TensorShapeProtoH\x00\x12+\n\x06tensor\x18\x08 \x01(\x0b\x32\x19.tensorboardX.TensorProtoH\x00\x12\x31\n\x04list\x18\x01 \x01(\x0b\x32!.tensorboardX.AttrValue.ListValueH\x00\x12*\n\x04\x66unc\x18\n \x01(\x0b\x32\x1a.tensorboardX.NameAttrListH\x00\x12\x15\n\x0bplaceholder\x18\t \x01(\tH\x00\x1a\xf1\x01\n\tListValue\x12\t\n\x01s\x18\x02 \x03(\x0c\x12\r\n\x01i\x18\x03 \x03(\x03\x42\x02\x10\x01\x12\r\n\x01\x66\x18\x04 \x03(\x02\x42\x02\x10\x01\x12\r\n\x01\x62\x18\x05 \x03(\x08\x42\x02\x10\x01\x12(\n\x04type\x18\x06 \x03(\x0e\x32\x16.tensorboardX.DataTypeB\x02\x10\x01\x12-\n\x05shape\x18\x07 \x03(\x0b\x32\x1e.tensorboardX.TensorShapeProto\x12)\n\x06tensor\x18\x08 \x03(\x0b\x32\x19.tensorboardX.TensorProto\x12(\n\x04\x66unc\x18\t \x03(\x0b\x32\x1a.tensorboardX.NameAttrListB\x07\n\x05value\"\x96\x01\n\x0cNameAttrList\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x32\n\x04\x61ttr\x18\x02 \x03(\x0b\x32$.tensorboardX.NameAttrList.AttrEntry\x1a\x44\n\tAttrEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12&\n\x05value\x18\x02 \x01(\x0b\x32\x17.tensorboardX.AttrValue:\x02\x38\x01\x42\x30\n\x18org.tensorflow.frameworkB\x0f\x41ttrValueProtosP\x01\xf8\x01\x01\x62\x06proto3')
+  ,
+  dependencies=[tensorboardX_dot_proto_dot_tensor__pb2.DESCRIPTOR,tensorboardX_dot_proto_dot_tensor__shape__pb2.DESCRIPTOR,tensorboardX_dot_proto_dot_types__pb2.DESCRIPTOR,])
+
+
+
+
+_ATTRVALUE_LISTVALUE = _descriptor.Descriptor(
+  name='ListValue',
+  full_name='tensorboardX.AttrValue.ListValue',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='s', full_name='tensorboardX.AttrValue.ListValue.s', index=0,
+      number=2, type=12, cpp_type=9, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='i', full_name='tensorboardX.AttrValue.ListValue.i', index=1,
+      number=3, type=3, cpp_type=2, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=_b('\020\001'), file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='f', full_name='tensorboardX.AttrValue.ListValue.f', index=2,
+      number=4, type=2, cpp_type=6, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=_b('\020\001'), file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='b', full_name='tensorboardX.AttrValue.ListValue.b', index=3,
+      number=5, type=8, cpp_type=7, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=_b('\020\001'), file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='type', full_name='tensorboardX.AttrValue.ListValue.type', index=4,
+      number=6, type=14, cpp_type=8, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=_b('\020\001'), file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='shape', full_name='tensorboardX.AttrValue.ListValue.shape', index=5,
+      number=7, type=11, cpp_type=10, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='tensor', full_name='tensorboardX.AttrValue.ListValue.tensor', index=6,
+      number=8, type=11, cpp_type=10, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='func', full_name='tensorboardX.AttrValue.ListValue.func', index=7,
+      number=9, type=11, cpp_type=10, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=476,
+  serialized_end=717,
+)
+
+_ATTRVALUE = _descriptor.Descriptor(
+  name='AttrValue',
+  full_name='tensorboardX.AttrValue',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='s', full_name='tensorboardX.AttrValue.s', index=0,
+      number=2, type=12, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b(""),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='i', full_name='tensorboardX.AttrValue.i', index=1,
+      number=3, type=3, cpp_type=2, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='f', full_name='tensorboardX.AttrValue.f', index=2,
+      number=4, type=2, cpp_type=6, label=1,
+      has_default_value=False, default_value=float(0),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='b', full_name='tensorboardX.AttrValue.b', index=3,
+      number=5, type=8, cpp_type=7, label=1,
+      has_default_value=False, default_value=False,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='type', full_name='tensorboardX.AttrValue.type', index=4,
+      number=6, type=14, cpp_type=8, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='shape', full_name='tensorboardX.AttrValue.shape', index=5,
+      number=7, type=11, cpp_type=10, label=1,
+      has_default_value=False, default_value=None,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='tensor', full_name='tensorboardX.AttrValue.tensor', index=6,
+      number=8, type=11, cpp_type=10, label=1,
+      has_default_value=False, default_value=None,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='list', full_name='tensorboardX.AttrValue.list', index=7,
+      number=1, type=11, cpp_type=10, label=1,
+      has_default_value=False, default_value=None,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='func', full_name='tensorboardX.AttrValue.func', index=8,
+      number=10, type=11, cpp_type=10, label=1,
+      has_default_value=False, default_value=None,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='placeholder', full_name='tensorboardX.AttrValue.placeholder', index=9,
+      number=9, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[_ATTRVALUE_LISTVALUE, ],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+    _descriptor.OneofDescriptor(
+      name='value', full_name='tensorboardX.AttrValue.value',
+      index=0, containing_type=None, fields=[]),
+  ],
+  serialized_start=158,
+  serialized_end=726,
+)
+
+
+_NAMEATTRLIST_ATTRENTRY = _descriptor.Descriptor(
+  name='AttrEntry',
+  full_name='tensorboardX.NameAttrList.AttrEntry',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='key', full_name='tensorboardX.NameAttrList.AttrEntry.key', index=0,
+      number=1, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='value', full_name='tensorboardX.NameAttrList.AttrEntry.value', index=1,
+      number=2, type=11, cpp_type=10, label=1,
+      has_default_value=False, default_value=None,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=_b('8\001'),
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=811,
+  serialized_end=879,
+)
+
+_NAMEATTRLIST = _descriptor.Descriptor(
+  name='NameAttrList',
+  full_name='tensorboardX.NameAttrList',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='name', full_name='tensorboardX.NameAttrList.name', index=0,
+      number=1, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='attr', full_name='tensorboardX.NameAttrList.attr', index=1,
+      number=2, type=11, cpp_type=10, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[_NAMEATTRLIST_ATTRENTRY, ],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=729,
+  serialized_end=879,
+)
+
+_ATTRVALUE_LISTVALUE.fields_by_name['type'].enum_type = tensorboardX_dot_proto_dot_types__pb2._DATATYPE
+_ATTRVALUE_LISTVALUE.fields_by_name['shape'].message_type = tensorboardX_dot_proto_dot_tensor__shape__pb2._TENSORSHAPEPROTO
+_ATTRVALUE_LISTVALUE.fields_by_name['tensor'].message_type = tensorboardX_dot_proto_dot_tensor__pb2._TENSORPROTO
+_ATTRVALUE_LISTVALUE.fields_by_name['func'].message_type = _NAMEATTRLIST
+_ATTRVALUE_LISTVALUE.containing_type = _ATTRVALUE
+_ATTRVALUE.fields_by_name['type'].enum_type = tensorboardX_dot_proto_dot_types__pb2._DATATYPE
+_ATTRVALUE.fields_by_name['shape'].message_type = tensorboardX_dot_proto_dot_tensor__shape__pb2._TENSORSHAPEPROTO
+_ATTRVALUE.fields_by_name['tensor'].message_type = tensorboardX_dot_proto_dot_tensor__pb2._TENSORPROTO
+_ATTRVALUE.fields_by_name['list'].message_type = _ATTRVALUE_LISTVALUE
+_ATTRVALUE.fields_by_name['func'].message_type = _NAMEATTRLIST
+_ATTRVALUE.oneofs_by_name['value'].fields.append(
+  _ATTRVALUE.fields_by_name['s'])
+_ATTRVALUE.fields_by_name['s'].containing_oneof = _ATTRVALUE.oneofs_by_name['value']
+_ATTRVALUE.oneofs_by_name['value'].fields.append(
+  _ATTRVALUE.fields_by_name['i'])
+_ATTRVALUE.fields_by_name['i'].containing_oneof = _ATTRVALUE.oneofs_by_name['value']
+_ATTRVALUE.oneofs_by_name['value'].fields.append(
+  _ATTRVALUE.fields_by_name['f'])
+_ATTRVALUE.fields_by_name['f'].containing_oneof = _ATTRVALUE.oneofs_by_name['value']
+_ATTRVALUE.oneofs_by_name['value'].fields.append(
+  _ATTRVALUE.fields_by_name['b'])
+_ATTRVALUE.fields_by_name['b'].containing_oneof = _ATTRVALUE.oneofs_by_name['value']
+_ATTRVALUE.oneofs_by_name['value'].fields.append(
+  _ATTRVALUE.fields_by_name['type'])
+_ATTRVALUE.fields_by_name['type'].containing_oneof = _ATTRVALUE.oneofs_by_name['value']
+_ATTRVALUE.oneofs_by_name['value'].fields.append(
+  _ATTRVALUE.fields_by_name['shape'])
+_ATTRVALUE.fields_by_name['shape'].containing_oneof = _ATTRVALUE.oneofs_by_name['value']
+_ATTRVALUE.oneofs_by_name['value'].fields.append(
+  _ATTRVALUE.fields_by_name['tensor'])
+_ATTRVALUE.fields_by_name['tensor'].containing_oneof = _ATTRVALUE.oneofs_by_name['value']
+_ATTRVALUE.oneofs_by_name['value'].fields.append(
+  _ATTRVALUE.fields_by_name['list'])
+_ATTRVALUE.fields_by_name['list'].containing_oneof = _ATTRVALUE.oneofs_by_name['value']
+_ATTRVALUE.oneofs_by_name['value'].fields.append(
+  _ATTRVALUE.fields_by_name['func'])
+_ATTRVALUE.fields_by_name['func'].containing_oneof = _ATTRVALUE.oneofs_by_name['value']
+_ATTRVALUE.oneofs_by_name['value'].fields.append(
+  _ATTRVALUE.fields_by_name['placeholder'])
+_ATTRVALUE.fields_by_name['placeholder'].containing_oneof = _ATTRVALUE.oneofs_by_name['value']
+_NAMEATTRLIST_ATTRENTRY.fields_by_name['value'].message_type = _ATTRVALUE
+_NAMEATTRLIST_ATTRENTRY.containing_type = _NAMEATTRLIST
+_NAMEATTRLIST.fields_by_name['attr'].message_type = _NAMEATTRLIST_ATTRENTRY
+DESCRIPTOR.message_types_by_name['AttrValue'] = _ATTRVALUE
+DESCRIPTOR.message_types_by_name['NameAttrList'] = _NAMEATTRLIST
+_sym_db.RegisterFileDescriptor(DESCRIPTOR)
+
+AttrValue = _reflection.GeneratedProtocolMessageType('AttrValue', (_message.Message,), dict(
+
+  ListValue = _reflection.GeneratedProtocolMessageType('ListValue', (_message.Message,), dict(
+    DESCRIPTOR = _ATTRVALUE_LISTVALUE,
+    __module__ = 'tensorboardX.proto.attr_value_pb2'
+    # @@protoc_insertion_point(class_scope:tensorboardX.AttrValue.ListValue)
+    ))
+  ,
+  DESCRIPTOR = _ATTRVALUE,
+  __module__ = 'tensorboardX.proto.attr_value_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.AttrValue)
+  ))
+_sym_db.RegisterMessage(AttrValue)
+_sym_db.RegisterMessage(AttrValue.ListValue)
+
+NameAttrList = _reflection.GeneratedProtocolMessageType('NameAttrList', (_message.Message,), dict(
+
+  AttrEntry = _reflection.GeneratedProtocolMessageType('AttrEntry', (_message.Message,), dict(
+    DESCRIPTOR = _NAMEATTRLIST_ATTRENTRY,
+    __module__ = 'tensorboardX.proto.attr_value_pb2'
+    # @@protoc_insertion_point(class_scope:tensorboardX.NameAttrList.AttrEntry)
+    ))
+  ,
+  DESCRIPTOR = _NAMEATTRLIST,
+  __module__ = 'tensorboardX.proto.attr_value_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.NameAttrList)
+  ))
+_sym_db.RegisterMessage(NameAttrList)
+_sym_db.RegisterMessage(NameAttrList.AttrEntry)
+
+
+DESCRIPTOR._options = None
+_ATTRVALUE_LISTVALUE.fields_by_name['i']._options = None
+_ATTRVALUE_LISTVALUE.fields_by_name['f']._options = None
+_ATTRVALUE_LISTVALUE.fields_by_name['b']._options = None
+_ATTRVALUE_LISTVALUE.fields_by_name['type']._options = None
+_NAMEATTRLIST_ATTRENTRY._options = None
+# @@protoc_insertion_point(module_scope)
diff --git a/tensorboardX/tensorboardX/proto/event.proto b/tensorboardX/tensorboardX/proto/event.proto
new file mode 100644
index 0000000000000000000000000000000000000000..45ab01f8dde204e8110a3520f7e51931d6d3a2e0
--- /dev/null
+++ b/tensorboardX/tensorboardX/proto/event.proto
@@ -0,0 +1,78 @@
+syntax = "proto3";
+
+package tensorboardX;
+option cc_enable_arenas = true;
+option java_outer_classname = "EventProtos";
+option java_multiple_files = true;
+option java_package = "org.tensorflow.util";
+
+import "tensorboardX/proto/summary.proto";
+
+// Protocol buffer representing an event that happened during
+// the execution of a Brain model.
+message Event {
+  // Timestamp of the event.
+  double wall_time = 1;
+
+  // Global step of the event.
+  int64 step = 2;
+
+  oneof what {
+    // An event file was started, with the specified version.
+    // This is used to identify the contents of the record IO files
+    // easily.  Current version is "brain.Event:2".  All versions
+    // start with "brain.Event:".
+    string file_version = 3;
+    // An encoded version of a GraphDef.
+    bytes graph_def = 4;
+    // A summary was generated.
+    Summary summary = 5;
+    // The user output a log message. Not all messages are logged, only ones
+    // generated via the Python tensorboard_logging module.
+    LogMessage log_message = 6;
+    // The state of the session which can be used for restarting after crashes.
+    SessionLog session_log = 7;
+    // The metadata returned by running a session.run() call.
+    TaggedRunMetadata tagged_run_metadata = 8;
+    // An encoded version of a MetaGraphDef.
+    bytes meta_graph_def = 9;
+  }
+}
+
+// Protocol buffer used for logging messages to the events file.
+message LogMessage {
+  enum Level {
+    UNKNOWN = 0;
+    DEBUG = 10;
+    INFO = 20;
+    WARN = 30;
+    ERROR = 40;
+    FATAL = 50;
+  }
+  Level level = 1;
+  string message = 2;
+}
+
+// Protocol buffer used for logging session state.
+message SessionLog {
+  enum SessionStatus {
+    STATUS_UNSPECIFIED = 0;
+    START = 1;
+    STOP = 2;
+    CHECKPOINT = 3;
+  }
+
+  SessionStatus status = 1;
+  // This checkpoint_path contains both the path and filename.
+  string checkpoint_path = 2;
+  string msg = 3;
+}
+
+// For logging the metadata output for a single session.run() call.
+message TaggedRunMetadata {
+  // Tag name associated with this metadata.
+  string tag = 1;
+  // Byte-encoded version of the `RunMetadata` proto in order to allow lazy
+  // deserialization.
+  bytes run_metadata = 2;
+}
diff --git a/tensorboardX/tensorboardX/proto/event_pb2.py b/tensorboardX/tensorboardX/proto/event_pb2.py
new file mode 100644
index 0000000000000000000000000000000000000000..fbd3ed5aacbf77fb080820427c5020227e9bbb65
--- /dev/null
+++ b/tensorboardX/tensorboardX/proto/event_pb2.py
@@ -0,0 +1,375 @@
+# Generated by the protocol buffer compiler.  DO NOT EDIT!
+# source: tensorboardX/proto/event.proto
+
+import sys
+_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1'))
+from google.protobuf import descriptor as _descriptor
+from google.protobuf import message as _message
+from google.protobuf import reflection as _reflection
+from google.protobuf import symbol_database as _symbol_database
+# @@protoc_insertion_point(imports)
+
+_sym_db = _symbol_database.Default()
+
+
+from tensorboardX.proto import summary_pb2 as tensorboardX_dot_proto_dot_summary__pb2
+
+
+DESCRIPTOR = _descriptor.FileDescriptor(
+  name='tensorboardX/proto/event.proto',
+  package='tensorboardX',
+  syntax='proto3',
+  serialized_options=_b('\n\023org.tensorflow.utilB\013EventProtosP\001\370\001\001'),
+  serialized_pb=_b('\n\x1etensorboardX/proto/event.proto\x12\x0ctensorboardX\x1a tensorboardX/proto/summary.proto\"\xc3\x02\n\x05\x45vent\x12\x11\n\twall_time\x18\x01 \x01(\x01\x12\x0c\n\x04step\x18\x02 \x01(\x03\x12\x16\n\x0c\x66ile_version\x18\x03 \x01(\tH\x00\x12\x13\n\tgraph_def\x18\x04 \x01(\x0cH\x00\x12(\n\x07summary\x18\x05 \x01(\x0b\x32\x15.tensorboardX.SummaryH\x00\x12/\n\x0blog_message\x18\x06 \x01(\x0b\x32\x18.tensorboardX.LogMessageH\x00\x12/\n\x0bsession_log\x18\x07 \x01(\x0b\x32\x18.tensorboardX.SessionLogH\x00\x12>\n\x13tagged_run_metadata\x18\x08 \x01(\x0b\x32\x1f.tensorboardX.TaggedRunMetadataH\x00\x12\x18\n\x0emeta_graph_def\x18\t \x01(\x0cH\x00\x42\x06\n\x04what\"\x97\x01\n\nLogMessage\x12-\n\x05level\x18\x01 \x01(\x0e\x32\x1e.tensorboardX.LogMessage.Level\x12\x0f\n\x07message\x18\x02 \x01(\t\"I\n\x05Level\x12\x0b\n\x07UNKNOWN\x10\x00\x12\t\n\x05\x44\x45\x42UG\x10\n\x12\x08\n\x04INFO\x10\x14\x12\x08\n\x04WARN\x10\x1e\x12\t\n\x05\x45RROR\x10(\x12\t\n\x05\x46\x41TAL\x10\x32\"\xb8\x01\n\nSessionLog\x12\x36\n\x06status\x18\x01 \x01(\x0e\x32&.tensorboardX.SessionLog.SessionStatus\x12\x17\n\x0f\x63heckpoint_path\x18\x02 \x01(\t\x12\x0b\n\x03msg\x18\x03 \x01(\t\"L\n\rSessionStatus\x12\x16\n\x12STATUS_UNSPECIFIED\x10\x00\x12\t\n\x05START\x10\x01\x12\x08\n\x04STOP\x10\x02\x12\x0e\n\nCHECKPOINT\x10\x03\"6\n\x11TaggedRunMetadata\x12\x0b\n\x03tag\x18\x01 \x01(\t\x12\x14\n\x0crun_metadata\x18\x02 \x01(\x0c\x42\'\n\x13org.tensorflow.utilB\x0b\x45ventProtosP\x01\xf8\x01\x01\x62\x06proto3')
+  ,
+  dependencies=[tensorboardX_dot_proto_dot_summary__pb2.DESCRIPTOR,])
+
+
+
+_LOGMESSAGE_LEVEL = _descriptor.EnumDescriptor(
+  name='Level',
+  full_name='tensorboardX.LogMessage.Level',
+  filename=None,
+  file=DESCRIPTOR,
+  values=[
+    _descriptor.EnumValueDescriptor(
+      name='UNKNOWN', index=0, number=0,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DEBUG', index=1, number=10,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='INFO', index=2, number=20,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='WARN', index=3, number=30,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='ERROR', index=4, number=40,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='FATAL', index=5, number=50,
+      serialized_options=None,
+      type=None),
+  ],
+  containing_type=None,
+  serialized_options=None,
+  serialized_start=487,
+  serialized_end=560,
+)
+_sym_db.RegisterEnumDescriptor(_LOGMESSAGE_LEVEL)
+
+_SESSIONLOG_SESSIONSTATUS = _descriptor.EnumDescriptor(
+  name='SessionStatus',
+  full_name='tensorboardX.SessionLog.SessionStatus',
+  filename=None,
+  file=DESCRIPTOR,
+  values=[
+    _descriptor.EnumValueDescriptor(
+      name='STATUS_UNSPECIFIED', index=0, number=0,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='START', index=1, number=1,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='STOP', index=2, number=2,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='CHECKPOINT', index=3, number=3,
+      serialized_options=None,
+      type=None),
+  ],
+  containing_type=None,
+  serialized_options=None,
+  serialized_start=671,
+  serialized_end=747,
+)
+_sym_db.RegisterEnumDescriptor(_SESSIONLOG_SESSIONSTATUS)
+
+
+_EVENT = _descriptor.Descriptor(
+  name='Event',
+  full_name='tensorboardX.Event',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='wall_time', full_name='tensorboardX.Event.wall_time', index=0,
+      number=1, type=1, cpp_type=5, label=1,
+      has_default_value=False, default_value=float(0),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='step', full_name='tensorboardX.Event.step', index=1,
+      number=2, type=3, cpp_type=2, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='file_version', full_name='tensorboardX.Event.file_version', index=2,
+      number=3, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='graph_def', full_name='tensorboardX.Event.graph_def', index=3,
+      number=4, type=12, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b(""),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='summary', full_name='tensorboardX.Event.summary', index=4,
+      number=5, type=11, cpp_type=10, label=1,
+      has_default_value=False, default_value=None,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='log_message', full_name='tensorboardX.Event.log_message', index=5,
+      number=6, type=11, cpp_type=10, label=1,
+      has_default_value=False, default_value=None,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='session_log', full_name='tensorboardX.Event.session_log', index=6,
+      number=7, type=11, cpp_type=10, label=1,
+      has_default_value=False, default_value=None,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='tagged_run_metadata', full_name='tensorboardX.Event.tagged_run_metadata', index=7,
+      number=8, type=11, cpp_type=10, label=1,
+      has_default_value=False, default_value=None,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='meta_graph_def', full_name='tensorboardX.Event.meta_graph_def', index=8,
+      number=9, type=12, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b(""),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+    _descriptor.OneofDescriptor(
+      name='what', full_name='tensorboardX.Event.what',
+      index=0, containing_type=None, fields=[]),
+  ],
+  serialized_start=83,
+  serialized_end=406,
+)
+
+
+_LOGMESSAGE = _descriptor.Descriptor(
+  name='LogMessage',
+  full_name='tensorboardX.LogMessage',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='level', full_name='tensorboardX.LogMessage.level', index=0,
+      number=1, type=14, cpp_type=8, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='message', full_name='tensorboardX.LogMessage.message', index=1,
+      number=2, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+    _LOGMESSAGE_LEVEL,
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=409,
+  serialized_end=560,
+)
+
+
+_SESSIONLOG = _descriptor.Descriptor(
+  name='SessionLog',
+  full_name='tensorboardX.SessionLog',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='status', full_name='tensorboardX.SessionLog.status', index=0,
+      number=1, type=14, cpp_type=8, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='checkpoint_path', full_name='tensorboardX.SessionLog.checkpoint_path', index=1,
+      number=2, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='msg', full_name='tensorboardX.SessionLog.msg', index=2,
+      number=3, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+    _SESSIONLOG_SESSIONSTATUS,
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=563,
+  serialized_end=747,
+)
+
+
+_TAGGEDRUNMETADATA = _descriptor.Descriptor(
+  name='TaggedRunMetadata',
+  full_name='tensorboardX.TaggedRunMetadata',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='tag', full_name='tensorboardX.TaggedRunMetadata.tag', index=0,
+      number=1, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='run_metadata', full_name='tensorboardX.TaggedRunMetadata.run_metadata', index=1,
+      number=2, type=12, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b(""),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=749,
+  serialized_end=803,
+)
+
+_EVENT.fields_by_name['summary'].message_type = tensorboardX_dot_proto_dot_summary__pb2._SUMMARY
+_EVENT.fields_by_name['log_message'].message_type = _LOGMESSAGE
+_EVENT.fields_by_name['session_log'].message_type = _SESSIONLOG
+_EVENT.fields_by_name['tagged_run_metadata'].message_type = _TAGGEDRUNMETADATA
+_EVENT.oneofs_by_name['what'].fields.append(
+  _EVENT.fields_by_name['file_version'])
+_EVENT.fields_by_name['file_version'].containing_oneof = _EVENT.oneofs_by_name['what']
+_EVENT.oneofs_by_name['what'].fields.append(
+  _EVENT.fields_by_name['graph_def'])
+_EVENT.fields_by_name['graph_def'].containing_oneof = _EVENT.oneofs_by_name['what']
+_EVENT.oneofs_by_name['what'].fields.append(
+  _EVENT.fields_by_name['summary'])
+_EVENT.fields_by_name['summary'].containing_oneof = _EVENT.oneofs_by_name['what']
+_EVENT.oneofs_by_name['what'].fields.append(
+  _EVENT.fields_by_name['log_message'])
+_EVENT.fields_by_name['log_message'].containing_oneof = _EVENT.oneofs_by_name['what']
+_EVENT.oneofs_by_name['what'].fields.append(
+  _EVENT.fields_by_name['session_log'])
+_EVENT.fields_by_name['session_log'].containing_oneof = _EVENT.oneofs_by_name['what']
+_EVENT.oneofs_by_name['what'].fields.append(
+  _EVENT.fields_by_name['tagged_run_metadata'])
+_EVENT.fields_by_name['tagged_run_metadata'].containing_oneof = _EVENT.oneofs_by_name['what']
+_EVENT.oneofs_by_name['what'].fields.append(
+  _EVENT.fields_by_name['meta_graph_def'])
+_EVENT.fields_by_name['meta_graph_def'].containing_oneof = _EVENT.oneofs_by_name['what']
+_LOGMESSAGE.fields_by_name['level'].enum_type = _LOGMESSAGE_LEVEL
+_LOGMESSAGE_LEVEL.containing_type = _LOGMESSAGE
+_SESSIONLOG.fields_by_name['status'].enum_type = _SESSIONLOG_SESSIONSTATUS
+_SESSIONLOG_SESSIONSTATUS.containing_type = _SESSIONLOG
+DESCRIPTOR.message_types_by_name['Event'] = _EVENT
+DESCRIPTOR.message_types_by_name['LogMessage'] = _LOGMESSAGE
+DESCRIPTOR.message_types_by_name['SessionLog'] = _SESSIONLOG
+DESCRIPTOR.message_types_by_name['TaggedRunMetadata'] = _TAGGEDRUNMETADATA
+_sym_db.RegisterFileDescriptor(DESCRIPTOR)
+
+Event = _reflection.GeneratedProtocolMessageType('Event', (_message.Message,), dict(
+  DESCRIPTOR = _EVENT,
+  __module__ = 'tensorboardX.proto.event_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.Event)
+  ))
+_sym_db.RegisterMessage(Event)
+
+LogMessage = _reflection.GeneratedProtocolMessageType('LogMessage', (_message.Message,), dict(
+  DESCRIPTOR = _LOGMESSAGE,
+  __module__ = 'tensorboardX.proto.event_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.LogMessage)
+  ))
+_sym_db.RegisterMessage(LogMessage)
+
+SessionLog = _reflection.GeneratedProtocolMessageType('SessionLog', (_message.Message,), dict(
+  DESCRIPTOR = _SESSIONLOG,
+  __module__ = 'tensorboardX.proto.event_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.SessionLog)
+  ))
+_sym_db.RegisterMessage(SessionLog)
+
+TaggedRunMetadata = _reflection.GeneratedProtocolMessageType('TaggedRunMetadata', (_message.Message,), dict(
+  DESCRIPTOR = _TAGGEDRUNMETADATA,
+  __module__ = 'tensorboardX.proto.event_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.TaggedRunMetadata)
+  ))
+_sym_db.RegisterMessage(TaggedRunMetadata)
+
+
+DESCRIPTOR._options = None
+# @@protoc_insertion_point(module_scope)
diff --git a/tensorboardX/tensorboardX/proto/graph.proto b/tensorboardX/tensorboardX/proto/graph.proto
new file mode 100644
index 0000000000000000000000000000000000000000..9b322fa40b31573bb836229da9501a8b22a524b9
--- /dev/null
+++ b/tensorboardX/tensorboardX/proto/graph.proto
@@ -0,0 +1,56 @@
+syntax = "proto3";
+
+package tensorboardX;
+option cc_enable_arenas = true;
+option java_outer_classname = "GraphProtos";
+option java_multiple_files = true;
+option java_package = "org.tensorflow.framework";
+
+import "tensorboardX/proto/node_def.proto";
+//import "tensorflow/core/framework/function.proto";
+import "tensorboardX/proto/versions.proto";
+
+// Represents the graph of operations
+message GraphDef {
+  repeated NodeDef node = 1;
+
+  // Compatibility versions of the graph.  See core/public/version.h for version
+  // history.  The GraphDef version is distinct from the TensorFlow version, and
+  // each release of TensorFlow will support a range of GraphDef versions.
+  VersionDef versions = 4;
+
+  // Deprecated single version field; use versions above instead.  Since all
+  // GraphDef changes before "versions" was introduced were forward
+  // compatible, this field is entirely ignored.
+  int32 version = 3 [deprecated = true];
+
+  // EXPERIMENTAL. DO NOT USE OR DEPEND ON THIS YET.
+  //
+  // "library" provides user-defined functions.
+  //
+  // Naming:
+  //   * library.function.name are in a flat namespace.
+  //     NOTE: We may need to change it to be hierarchical to support
+  //     different orgs. E.g.,
+  //     { "/google/nn", { ... }},
+  //     { "/google/vision", { ... }}
+  //     { "/org_foo/module_bar", { ... }}
+  //     map<string, FunctionDefLib> named_lib;
+  //   * If node[i].op is the name of one function in "library",
+  //     node[i] is deemed as a function call. Otherwise, node[i].op
+  //     must be a primitive operation supported by the runtime.
+  //
+  //
+  // Function call semantics:
+  //
+  //   * The callee may start execution as soon as some of its inputs
+  //     are ready. The caller may want to use Tuple() mechanism to
+  //     ensure all inputs are ready at the same time.
+  //
+  //   * The consumer of return values may start executing as soon as
+  //     the return values the consumer depends on are ready.  The
+  //     consumer may want to use Tuple() mechanism to ensure the
+  //     consumer does not start until all return values of the callee
+  //     function are ready.
+  //FunctionDefLibrary library = 2;
+};
diff --git a/tensorboardX/tensorboardX/proto/graph_pb2.py b/tensorboardX/tensorboardX/proto/graph_pb2.py
new file mode 100644
index 0000000000000000000000000000000000000000..98a83c358739412e92527219b31f837d05144f98
--- /dev/null
+++ b/tensorboardX/tensorboardX/proto/graph_pb2.py
@@ -0,0 +1,90 @@
+# Generated by the protocol buffer compiler.  DO NOT EDIT!
+# source: tensorboardX/proto/graph.proto
+
+import sys
+_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1'))
+from google.protobuf import descriptor as _descriptor
+from google.protobuf import message as _message
+from google.protobuf import reflection as _reflection
+from google.protobuf import symbol_database as _symbol_database
+# @@protoc_insertion_point(imports)
+
+_sym_db = _symbol_database.Default()
+
+
+from tensorboardX.proto import node_def_pb2 as tensorboardX_dot_proto_dot_node__def__pb2
+from tensorboardX.proto import versions_pb2 as tensorboardX_dot_proto_dot_versions__pb2
+
+
+DESCRIPTOR = _descriptor.FileDescriptor(
+  name='tensorboardX/proto/graph.proto',
+  package='tensorboardX',
+  syntax='proto3',
+  serialized_options=_b('\n\030org.tensorflow.frameworkB\013GraphProtosP\001\370\001\001'),
+  serialized_pb=_b('\n\x1etensorboardX/proto/graph.proto\x12\x0ctensorboardX\x1a!tensorboardX/proto/node_def.proto\x1a!tensorboardX/proto/versions.proto\"p\n\x08GraphDef\x12#\n\x04node\x18\x01 \x03(\x0b\x32\x15.tensorboardX.NodeDef\x12*\n\x08versions\x18\x04 \x01(\x0b\x32\x18.tensorboardX.VersionDef\x12\x13\n\x07version\x18\x03 \x01(\x05\x42\x02\x18\x01\x42,\n\x18org.tensorflow.frameworkB\x0bGraphProtosP\x01\xf8\x01\x01\x62\x06proto3')
+  ,
+  dependencies=[tensorboardX_dot_proto_dot_node__def__pb2.DESCRIPTOR,tensorboardX_dot_proto_dot_versions__pb2.DESCRIPTOR,])
+
+
+
+
+_GRAPHDEF = _descriptor.Descriptor(
+  name='GraphDef',
+  full_name='tensorboardX.GraphDef',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='node', full_name='tensorboardX.GraphDef.node', index=0,
+      number=1, type=11, cpp_type=10, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='versions', full_name='tensorboardX.GraphDef.versions', index=1,
+      number=4, type=11, cpp_type=10, label=1,
+      has_default_value=False, default_value=None,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='version', full_name='tensorboardX.GraphDef.version', index=2,
+      number=3, type=5, cpp_type=1, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=_b('\030\001'), file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=118,
+  serialized_end=230,
+)
+
+_GRAPHDEF.fields_by_name['node'].message_type = tensorboardX_dot_proto_dot_node__def__pb2._NODEDEF
+_GRAPHDEF.fields_by_name['versions'].message_type = tensorboardX_dot_proto_dot_versions__pb2._VERSIONDEF
+DESCRIPTOR.message_types_by_name['GraphDef'] = _GRAPHDEF
+_sym_db.RegisterFileDescriptor(DESCRIPTOR)
+
+GraphDef = _reflection.GeneratedProtocolMessageType('GraphDef', (_message.Message,), dict(
+  DESCRIPTOR = _GRAPHDEF,
+  __module__ = 'tensorboardX.proto.graph_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.GraphDef)
+  ))
+_sym_db.RegisterMessage(GraphDef)
+
+
+DESCRIPTOR._options = None
+_GRAPHDEF.fields_by_name['version']._options = None
+# @@protoc_insertion_point(module_scope)
diff --git a/tensorboardX/tensorboardX/proto/layout.proto b/tensorboardX/tensorboardX/proto/layout.proto
new file mode 100644
index 0000000000000000000000000000000000000000..8aa5b018585c8f823c659c52cb9477a7f4dac297
--- /dev/null
+++ b/tensorboardX/tensorboardX/proto/layout.proto
@@ -0,0 +1,96 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+syntax = "proto3";
+
+package tensorboardX;
+
+
+/**
+ * Encapsulates information on a single chart. Many charts appear in a category.
+ */
+message Chart {
+  // The title shown atop this chart. Optional. Defaults to 'untitled'.
+  string title = 1;
+
+  // The content of the chart. This depends on the type of the chart.
+  oneof content {
+    MultilineChartContent multiline = 2;
+    MarginChartContent margin = 3;
+  }
+}
+
+/**
+ * Encapsulates information on a single line chart. This line chart may have
+ * lines associated with several tags.
+ */
+message MultilineChartContent {
+  // A list of regular expressions for tags that should appear in this chart.
+  // Tags are matched from beginning to end. Each regex captures a set of tags.
+  repeated string tag = 1;
+}
+
+/**
+ * Encapsulates information on a single margin chart. A margin chart uses fill
+ * area to visualize lower and upper bounds that surround a value.
+ */
+message MarginChartContent {
+  /**
+   * Encapsulates a tag of data for the chart.
+   */
+  message Series {
+    // The exact tag string associated with the scalar summaries making up the
+    // main value between the bounds.
+    string value = 1;
+
+    // The exact tag string associated with the scalar summaries making up the
+    // lower bound.
+    string lower = 2;
+
+    // The exact tag string associated with the scalar summaries making up the
+    // upper bound.
+    string upper = 3;
+  }
+
+  // A list of data series to include within this margin chart.
+  repeated Series series = 1;
+}
+
+/**
+ * A category contains a group of charts. Each category maps to a collapsible
+ * section within the dashboard.
+ */
+message Category {
+  // This string appears atop each grouping of charts within the dashboard.
+  string title = 1;
+
+  // Encapsulates data on charts to be shown in the category.
+  repeated Chart chart = 2;
+
+  // Whether this category should be initially closed. False by default.
+  bool closed = 3;
+}
+
+/**
+ * A layout encapsulates how charts are laid out within the custom scalars
+ * dashboard.
+ */
+message Layout {
+  // Version `0` is the only supported version.
+  int32 version = 1;
+
+  // The categories here are rendered from top to bottom.
+  repeated Category category = 2;
+}
diff --git a/tensorboardX/tensorboardX/proto/layout_pb2.py b/tensorboardX/tensorboardX/proto/layout_pb2.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1999056204625f43bb24ddfcb54df5e6a7bca6d
--- /dev/null
+++ b/tensorboardX/tensorboardX/proto/layout_pb2.py
@@ -0,0 +1,327 @@
+# Generated by the protocol buffer compiler.  DO NOT EDIT!
+# source: tensorboardX/proto/layout.proto
+
+import sys
+_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1'))
+from google.protobuf import descriptor as _descriptor
+from google.protobuf import message as _message
+from google.protobuf import reflection as _reflection
+from google.protobuf import symbol_database as _symbol_database
+# @@protoc_insertion_point(imports)
+
+_sym_db = _symbol_database.Default()
+
+
+
+
+DESCRIPTOR = _descriptor.FileDescriptor(
+  name='tensorboardX/proto/layout.proto',
+  package='tensorboardX',
+  syntax='proto3',
+  serialized_options=None,
+  serialized_pb=_b('\n\x1ftensorboardX/proto/layout.proto\x12\x0ctensorboardX\"\x8f\x01\n\x05\x43hart\x12\r\n\x05title\x18\x01 \x01(\t\x12\x38\n\tmultiline\x18\x02 \x01(\x0b\x32#.tensorboardX.MultilineChartContentH\x00\x12\x32\n\x06margin\x18\x03 \x01(\x0b\x32 .tensorboardX.MarginChartContentH\x00\x42\t\n\x07\x63ontent\"$\n\x15MultilineChartContent\x12\x0b\n\x03tag\x18\x01 \x03(\t\"\x84\x01\n\x12MarginChartContent\x12\x37\n\x06series\x18\x01 \x03(\x0b\x32\'.tensorboardX.MarginChartContent.Series\x1a\x35\n\x06Series\x12\r\n\x05value\x18\x01 \x01(\t\x12\r\n\x05lower\x18\x02 \x01(\t\x12\r\n\x05upper\x18\x03 \x01(\t\"M\n\x08\x43\x61tegory\x12\r\n\x05title\x18\x01 \x01(\t\x12\"\n\x05\x63hart\x18\x02 \x03(\x0b\x32\x13.tensorboardX.Chart\x12\x0e\n\x06\x63losed\x18\x03 \x01(\x08\"C\n\x06Layout\x12\x0f\n\x07version\x18\x01 \x01(\x05\x12(\n\x08\x63\x61tegory\x18\x02 \x03(\x0b\x32\x16.tensorboardX.Categoryb\x06proto3')
+)
+
+
+
+
+_CHART = _descriptor.Descriptor(
+  name='Chart',
+  full_name='tensorboardX.Chart',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='title', full_name='tensorboardX.Chart.title', index=0,
+      number=1, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='multiline', full_name='tensorboardX.Chart.multiline', index=1,
+      number=2, type=11, cpp_type=10, label=1,
+      has_default_value=False, default_value=None,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='margin', full_name='tensorboardX.Chart.margin', index=2,
+      number=3, type=11, cpp_type=10, label=1,
+      has_default_value=False, default_value=None,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+    _descriptor.OneofDescriptor(
+      name='content', full_name='tensorboardX.Chart.content',
+      index=0, containing_type=None, fields=[]),
+  ],
+  serialized_start=50,
+  serialized_end=193,
+)
+
+
+_MULTILINECHARTCONTENT = _descriptor.Descriptor(
+  name='MultilineChartContent',
+  full_name='tensorboardX.MultilineChartContent',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='tag', full_name='tensorboardX.MultilineChartContent.tag', index=0,
+      number=1, type=9, cpp_type=9, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=195,
+  serialized_end=231,
+)
+
+
+_MARGINCHARTCONTENT_SERIES = _descriptor.Descriptor(
+  name='Series',
+  full_name='tensorboardX.MarginChartContent.Series',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='value', full_name='tensorboardX.MarginChartContent.Series.value', index=0,
+      number=1, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='lower', full_name='tensorboardX.MarginChartContent.Series.lower', index=1,
+      number=2, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='upper', full_name='tensorboardX.MarginChartContent.Series.upper', index=2,
+      number=3, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=313,
+  serialized_end=366,
+)
+
+_MARGINCHARTCONTENT = _descriptor.Descriptor(
+  name='MarginChartContent',
+  full_name='tensorboardX.MarginChartContent',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='series', full_name='tensorboardX.MarginChartContent.series', index=0,
+      number=1, type=11, cpp_type=10, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[_MARGINCHARTCONTENT_SERIES, ],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=234,
+  serialized_end=366,
+)
+
+
+_CATEGORY = _descriptor.Descriptor(
+  name='Category',
+  full_name='tensorboardX.Category',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='title', full_name='tensorboardX.Category.title', index=0,
+      number=1, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='chart', full_name='tensorboardX.Category.chart', index=1,
+      number=2, type=11, cpp_type=10, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='closed', full_name='tensorboardX.Category.closed', index=2,
+      number=3, type=8, cpp_type=7, label=1,
+      has_default_value=False, default_value=False,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=368,
+  serialized_end=445,
+)
+
+
+_LAYOUT = _descriptor.Descriptor(
+  name='Layout',
+  full_name='tensorboardX.Layout',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='version', full_name='tensorboardX.Layout.version', index=0,
+      number=1, type=5, cpp_type=1, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='category', full_name='tensorboardX.Layout.category', index=1,
+      number=2, type=11, cpp_type=10, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=447,
+  serialized_end=514,
+)
+
+_CHART.fields_by_name['multiline'].message_type = _MULTILINECHARTCONTENT
+_CHART.fields_by_name['margin'].message_type = _MARGINCHARTCONTENT
+_CHART.oneofs_by_name['content'].fields.append(
+  _CHART.fields_by_name['multiline'])
+_CHART.fields_by_name['multiline'].containing_oneof = _CHART.oneofs_by_name['content']
+_CHART.oneofs_by_name['content'].fields.append(
+  _CHART.fields_by_name['margin'])
+_CHART.fields_by_name['margin'].containing_oneof = _CHART.oneofs_by_name['content']
+_MARGINCHARTCONTENT_SERIES.containing_type = _MARGINCHARTCONTENT
+_MARGINCHARTCONTENT.fields_by_name['series'].message_type = _MARGINCHARTCONTENT_SERIES
+_CATEGORY.fields_by_name['chart'].message_type = _CHART
+_LAYOUT.fields_by_name['category'].message_type = _CATEGORY
+DESCRIPTOR.message_types_by_name['Chart'] = _CHART
+DESCRIPTOR.message_types_by_name['MultilineChartContent'] = _MULTILINECHARTCONTENT
+DESCRIPTOR.message_types_by_name['MarginChartContent'] = _MARGINCHARTCONTENT
+DESCRIPTOR.message_types_by_name['Category'] = _CATEGORY
+DESCRIPTOR.message_types_by_name['Layout'] = _LAYOUT
+_sym_db.RegisterFileDescriptor(DESCRIPTOR)
+
+Chart = _reflection.GeneratedProtocolMessageType('Chart', (_message.Message,), dict(
+  DESCRIPTOR = _CHART,
+  __module__ = 'tensorboardX.proto.layout_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.Chart)
+  ))
+_sym_db.RegisterMessage(Chart)
+
+MultilineChartContent = _reflection.GeneratedProtocolMessageType('MultilineChartContent', (_message.Message,), dict(
+  DESCRIPTOR = _MULTILINECHARTCONTENT,
+  __module__ = 'tensorboardX.proto.layout_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.MultilineChartContent)
+  ))
+_sym_db.RegisterMessage(MultilineChartContent)
+
+MarginChartContent = _reflection.GeneratedProtocolMessageType('MarginChartContent', (_message.Message,), dict(
+
+  Series = _reflection.GeneratedProtocolMessageType('Series', (_message.Message,), dict(
+    DESCRIPTOR = _MARGINCHARTCONTENT_SERIES,
+    __module__ = 'tensorboardX.proto.layout_pb2'
+    # @@protoc_insertion_point(class_scope:tensorboardX.MarginChartContent.Series)
+    ))
+  ,
+  DESCRIPTOR = _MARGINCHARTCONTENT,
+  __module__ = 'tensorboardX.proto.layout_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.MarginChartContent)
+  ))
+_sym_db.RegisterMessage(MarginChartContent)
+_sym_db.RegisterMessage(MarginChartContent.Series)
+
+Category = _reflection.GeneratedProtocolMessageType('Category', (_message.Message,), dict(
+  DESCRIPTOR = _CATEGORY,
+  __module__ = 'tensorboardX.proto.layout_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.Category)
+  ))
+_sym_db.RegisterMessage(Category)
+
+Layout = _reflection.GeneratedProtocolMessageType('Layout', (_message.Message,), dict(
+  DESCRIPTOR = _LAYOUT,
+  __module__ = 'tensorboardX.proto.layout_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.Layout)
+  ))
+_sym_db.RegisterMessage(Layout)
+
+
+# @@protoc_insertion_point(module_scope)
diff --git a/tensorboardX/tensorboardX/proto/node_def.proto b/tensorboardX/tensorboardX/proto/node_def.proto
new file mode 100644
index 0000000000000000000000000000000000000000..97cf8fb167de490f97cb6033093ad655d260a0a3
--- /dev/null
+++ b/tensorboardX/tensorboardX/proto/node_def.proto
@@ -0,0 +1,63 @@
+syntax = "proto3";
+
+package tensorboardX;
+option cc_enable_arenas = true;
+option java_outer_classname = "NodeProto";
+option java_multiple_files = true;
+option java_package = "org.tensorflow.framework";
+
+import "tensorboardX/proto/attr_value.proto";
+
+message NodeDef {
+  // The name given to this operator. Used for naming inputs,
+  // logging, visualization, etc.  Unique within a single GraphDef.
+  // Must match the regexp "[A-Za-z0-9.][A-Za-z0-9_./]*".
+  string name = 1;
+
+  // The operation name.  There may be custom parameters in attrs.
+  // Op names starting with an underscore are reserved for internal use.
+  string op = 2;
+
+  // Each input is "node:src_output" with "node" being a string name and
+  // "src_output" indicating which output tensor to use from "node". If
+  // "src_output" is 0 the ":0" suffix can be omitted.  Regular inputs
+  // may optionally be followed by control inputs that have the format
+  // "^node".
+  repeated string input = 3;
+
+  // A (possibly partial) specification for the device on which this
+  // node should be placed.
+  // The expected syntax for this string is as follows:
+  //
+  // DEVICE_SPEC ::= PARTIAL_SPEC
+  //
+  // PARTIAL_SPEC ::= ("/" CONSTRAINT) *
+  // CONSTRAINT ::= ("job:" JOB_NAME)
+  //              | ("replica:" [1-9][0-9]*)
+  //              | ("task:" [1-9][0-9]*)
+  //              | ( ("gpu" | "cpu") ":" ([1-9][0-9]* | "*") )
+  //
+  // Valid values for this string include:
+  // * "/job:worker/replica:0/task:1/gpu:3"  (full specification)
+  // * "/job:worker/gpu:3"                   (partial specification)
+  // * ""                                    (no specification)
+  //
+  // If the constraints do not resolve to a single device (or if this
+  // field is empty or not present), the runtime will attempt to
+  // choose a device automatically.
+  string device = 4;
+
+  // Operation-specific graph-construction-time configuration.
+  // Note that this should include all attrs defined in the
+  // corresponding OpDef, including those with a value matching
+  // the default -- this allows the default to change and makes
+  // NodeDefs easier to interpret on their own.  However, if
+  // an attr with a default is not specified in this list, the
+  // default will be used.
+  // The "names" (keys) must match the regexp "[a-z][a-z0-9_]+" (and
+  // one of the names from the corresponding OpDef's attr field).
+  // The values must have a type matching the corresponding OpDef
+  // attr's type field.
+  // TODO(josh11b): Add some examples here showing best practices.
+  map<string, AttrValue> attr = 5;
+};
diff --git a/tensorboardX/tensorboardX/proto/node_def_pb2.py b/tensorboardX/tensorboardX/proto/node_def_pb2.py
new file mode 100644
index 0000000000000000000000000000000000000000..910ca93f9770e773a795bf0db6a5acaefefef536
--- /dev/null
+++ b/tensorboardX/tensorboardX/proto/node_def_pb2.py
@@ -0,0 +1,149 @@
+# Generated by the protocol buffer compiler.  DO NOT EDIT!
+# source: tensorboardX/proto/node_def.proto
+
+import sys
+_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1'))
+from google.protobuf import descriptor as _descriptor
+from google.protobuf import message as _message
+from google.protobuf import reflection as _reflection
+from google.protobuf import symbol_database as _symbol_database
+# @@protoc_insertion_point(imports)
+
+_sym_db = _symbol_database.Default()
+
+
+from tensorboardX.proto import attr_value_pb2 as tensorboardX_dot_proto_dot_attr__value__pb2
+
+
+DESCRIPTOR = _descriptor.FileDescriptor(
+  name='tensorboardX/proto/node_def.proto',
+  package='tensorboardX',
+  syntax='proto3',
+  serialized_options=_b('\n\030org.tensorflow.frameworkB\tNodeProtoP\001\370\001\001'),
+  serialized_pb=_b('\n!tensorboardX/proto/node_def.proto\x12\x0ctensorboardX\x1a#tensorboardX/proto/attr_value.proto\"\xb7\x01\n\x07NodeDef\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\n\n\x02op\x18\x02 \x01(\t\x12\r\n\x05input\x18\x03 \x03(\t\x12\x0e\n\x06\x64\x65vice\x18\x04 \x01(\t\x12-\n\x04\x61ttr\x18\x05 \x03(\x0b\x32\x1f.tensorboardX.NodeDef.AttrEntry\x1a\x44\n\tAttrEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12&\n\x05value\x18\x02 \x01(\x0b\x32\x17.tensorboardX.AttrValue:\x02\x38\x01\x42*\n\x18org.tensorflow.frameworkB\tNodeProtoP\x01\xf8\x01\x01\x62\x06proto3')
+  ,
+  dependencies=[tensorboardX_dot_proto_dot_attr__value__pb2.DESCRIPTOR,])
+
+
+
+
+_NODEDEF_ATTRENTRY = _descriptor.Descriptor(
+  name='AttrEntry',
+  full_name='tensorboardX.NodeDef.AttrEntry',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='key', full_name='tensorboardX.NodeDef.AttrEntry.key', index=0,
+      number=1, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='value', full_name='tensorboardX.NodeDef.AttrEntry.value', index=1,
+      number=2, type=11, cpp_type=10, label=1,
+      has_default_value=False, default_value=None,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=_b('8\001'),
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=204,
+  serialized_end=272,
+)
+
+_NODEDEF = _descriptor.Descriptor(
+  name='NodeDef',
+  full_name='tensorboardX.NodeDef',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='name', full_name='tensorboardX.NodeDef.name', index=0,
+      number=1, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='op', full_name='tensorboardX.NodeDef.op', index=1,
+      number=2, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='input', full_name='tensorboardX.NodeDef.input', index=2,
+      number=3, type=9, cpp_type=9, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='device', full_name='tensorboardX.NodeDef.device', index=3,
+      number=4, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='attr', full_name='tensorboardX.NodeDef.attr', index=4,
+      number=5, type=11, cpp_type=10, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[_NODEDEF_ATTRENTRY, ],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=89,
+  serialized_end=272,
+)
+
+_NODEDEF_ATTRENTRY.fields_by_name['value'].message_type = tensorboardX_dot_proto_dot_attr__value__pb2._ATTRVALUE
+_NODEDEF_ATTRENTRY.containing_type = _NODEDEF
+_NODEDEF.fields_by_name['attr'].message_type = _NODEDEF_ATTRENTRY
+DESCRIPTOR.message_types_by_name['NodeDef'] = _NODEDEF
+_sym_db.RegisterFileDescriptor(DESCRIPTOR)
+
+NodeDef = _reflection.GeneratedProtocolMessageType('NodeDef', (_message.Message,), dict(
+
+  AttrEntry = _reflection.GeneratedProtocolMessageType('AttrEntry', (_message.Message,), dict(
+    DESCRIPTOR = _NODEDEF_ATTRENTRY,
+    __module__ = 'tensorboardX.proto.node_def_pb2'
+    # @@protoc_insertion_point(class_scope:tensorboardX.NodeDef.AttrEntry)
+    ))
+  ,
+  DESCRIPTOR = _NODEDEF,
+  __module__ = 'tensorboardX.proto.node_def_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.NodeDef)
+  ))
+_sym_db.RegisterMessage(NodeDef)
+_sym_db.RegisterMessage(NodeDef.AttrEntry)
+
+
+DESCRIPTOR._options = None
+_NODEDEF_ATTRENTRY._options = None
+# @@protoc_insertion_point(module_scope)
diff --git a/tensorboardX/tensorboardX/proto/plugin_hparams.proto b/tensorboardX/tensorboardX/proto/plugin_hparams.proto
new file mode 100644
index 0000000000000000000000000000000000000000..c628bb2de84487a7c44a94337511ef2fb981ee60
--- /dev/null
+++ b/tensorboardX/tensorboardX/proto/plugin_hparams.proto
@@ -0,0 +1,75 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Defines protos for storing hypertuning experiment data inside Summary tags.
+//
+// Hypertuning-experiment data consists of metadata that's constant
+// throughout the experiment and evolving metric data for each training session
+// in the experiment. The HParams plugin assumes the following organization of
+// this entire data set. Experiment metadata is recorded in the empty run in a
+// tag (named by the Python constant) metadata.EXPERIMENT_TAG. Within the
+// experiment, for a session named by <session_name> its metadata is recorded
+// in the run <session_name> in the tags metadata.SESSION_START_INFO and
+// metadata.SESSION_END_INFO. Finally, the session's metric data for a metric
+// with a (<group>, <tag>) name (see MetricName in api.proto), is recorded
+// in a Scalar-plugin summary with tag <tag> in the run <session_name><group>.
+
+syntax = "proto3";
+
+import "tensorboardX/proto/api.proto";
+import "google/protobuf/struct.proto";
+
+package tensorboardX.hparam;
+
+// HParam summaries created by the `tensorboard.plugins.hparams.summary`
+// module will include `SummaryMetadata` whose `plugin_data` field has
+// as `content` a serialized HParamsPluginData message.
+message HParamsPluginData {
+  // The version of the plugin data schema.
+  int32 version = 1;
+  oneof data {
+    Experiment experiment = 2;
+    SessionStartInfo session_start_info = 3;
+    SessionEndInfo session_end_info = 4;
+  }
+}
+
+message SessionStartInfo {
+  // A map describing the hyperparameter values for the session.
+  // Maps each hyperparameter name to its value.
+  // Currently only scalars are supported.
+  map<string, google.protobuf.Value> hparams = 1;
+
+  // A URI for where checkpoints are saved.
+  string model_uri = 2;
+
+  // An optional URL to a website monitoring the session.
+  string monitor_url = 3;
+
+  // The name of the session group containing this session. If empty, the
+  // group name is taken to be the session id (so this session is the only
+  // member of its group).
+  string group_name = 4;
+
+  // The time the session started in seconds since epoch.
+  double start_time_secs = 5;
+}
+
+message SessionEndInfo {
+  Status status = 1;
+
+  // The time the session ended in seconds since epoch.
+  double end_time_secs = 2;
+}
diff --git a/tensorboardX/tensorboardX/proto/plugin_hparams_pb2.py b/tensorboardX/tensorboardX/proto/plugin_hparams_pb2.py
new file mode 100644
index 0000000000000000000000000000000000000000..aeaf1b6a7342d37fd549dd4b9607ef60ee1087c5
--- /dev/null
+++ b/tensorboardX/tensorboardX/proto/plugin_hparams_pb2.py
@@ -0,0 +1,271 @@
+# Generated by the protocol buffer compiler.  DO NOT EDIT!
+# source: tensorboardX/proto/plugin_hparams.proto
+
+import sys
+_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1'))
+from google.protobuf import descriptor as _descriptor
+from google.protobuf import message as _message
+from google.protobuf import reflection as _reflection
+from google.protobuf import symbol_database as _symbol_database
+# @@protoc_insertion_point(imports)
+
+_sym_db = _symbol_database.Default()
+
+
+from tensorboardX.proto import api_pb2 as tensorboardX_dot_proto_dot_api__pb2
+from google.protobuf import struct_pb2 as google_dot_protobuf_dot_struct__pb2
+
+
+DESCRIPTOR = _descriptor.FileDescriptor(
+  name='tensorboardX/proto/plugin_hparams.proto',
+  package='tensorboardX.hparam',
+  syntax='proto3',
+  serialized_options=None,
+  serialized_pb=_b('\n\'tensorboardX/proto/plugin_hparams.proto\x12\x13tensorboardX.hparam\x1a\x1ctensorboardX/proto/api.proto\x1a\x1cgoogle/protobuf/struct.proto\"\xe9\x01\n\x11HParamsPluginData\x12\x0f\n\x07version\x18\x01 \x01(\x05\x12\x35\n\nexperiment\x18\x02 \x01(\x0b\x32\x1f.tensorboardX.hparam.ExperimentH\x00\x12\x43\n\x12session_start_info\x18\x03 \x01(\x0b\x32%.tensorboardX.hparam.SessionStartInfoH\x00\x12?\n\x10session_end_info\x18\x04 \x01(\x0b\x32#.tensorboardX.hparam.SessionEndInfoH\x00\x42\x06\n\x04\x64\x61ta\"\xf4\x01\n\x10SessionStartInfo\x12\x43\n\x07hparams\x18\x01 \x03(\x0b\x32\x32.tensorboardX.hparam.SessionStartInfo.HparamsEntry\x12\x11\n\tmodel_uri\x18\x02 \x01(\t\x12\x13\n\x0bmonitor_url\x18\x03 \x01(\t\x12\x12\n\ngroup_name\x18\x04 \x01(\t\x12\x17\n\x0fstart_time_secs\x18\x05 \x01(\x01\x1a\x46\n\x0cHparamsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.google.protobuf.Value:\x02\x38\x01\"T\n\x0eSessionEndInfo\x12+\n\x06status\x18\x01 \x01(\x0e\x32\x1b.tensorboardX.hparam.Status\x12\x15\n\rend_time_secs\x18\x02 \x01(\x01\x62\x06proto3')
+  ,
+  dependencies=[tensorboardX_dot_proto_dot_api__pb2.DESCRIPTOR,google_dot_protobuf_dot_struct__pb2.DESCRIPTOR,])
+
+
+
+
+_HPARAMSPLUGINDATA = _descriptor.Descriptor(
+  name='HParamsPluginData',
+  full_name='tensorboardX.hparam.HParamsPluginData',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='version', full_name='tensorboardX.hparam.HParamsPluginData.version', index=0,
+      number=1, type=5, cpp_type=1, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='experiment', full_name='tensorboardX.hparam.HParamsPluginData.experiment', index=1,
+      number=2, type=11, cpp_type=10, label=1,
+      has_default_value=False, default_value=None,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='session_start_info', full_name='tensorboardX.hparam.HParamsPluginData.session_start_info', index=2,
+      number=3, type=11, cpp_type=10, label=1,
+      has_default_value=False, default_value=None,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='session_end_info', full_name='tensorboardX.hparam.HParamsPluginData.session_end_info', index=3,
+      number=4, type=11, cpp_type=10, label=1,
+      has_default_value=False, default_value=None,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+    _descriptor.OneofDescriptor(
+      name='data', full_name='tensorboardX.hparam.HParamsPluginData.data',
+      index=0, containing_type=None, fields=[]),
+  ],
+  serialized_start=125,
+  serialized_end=358,
+)
+
+
+_SESSIONSTARTINFO_HPARAMSENTRY = _descriptor.Descriptor(
+  name='HparamsEntry',
+  full_name='tensorboardX.hparam.SessionStartInfo.HparamsEntry',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='key', full_name='tensorboardX.hparam.SessionStartInfo.HparamsEntry.key', index=0,
+      number=1, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='value', full_name='tensorboardX.hparam.SessionStartInfo.HparamsEntry.value', index=1,
+      number=2, type=11, cpp_type=10, label=1,
+      has_default_value=False, default_value=None,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=_b('8\001'),
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=535,
+  serialized_end=605,
+)
+
+_SESSIONSTARTINFO = _descriptor.Descriptor(
+  name='SessionStartInfo',
+  full_name='tensorboardX.hparam.SessionStartInfo',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='hparams', full_name='tensorboardX.hparam.SessionStartInfo.hparams', index=0,
+      number=1, type=11, cpp_type=10, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='model_uri', full_name='tensorboardX.hparam.SessionStartInfo.model_uri', index=1,
+      number=2, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='monitor_url', full_name='tensorboardX.hparam.SessionStartInfo.monitor_url', index=2,
+      number=3, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='group_name', full_name='tensorboardX.hparam.SessionStartInfo.group_name', index=3,
+      number=4, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='start_time_secs', full_name='tensorboardX.hparam.SessionStartInfo.start_time_secs', index=4,
+      number=5, type=1, cpp_type=5, label=1,
+      has_default_value=False, default_value=float(0),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[_SESSIONSTARTINFO_HPARAMSENTRY, ],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=361,
+  serialized_end=605,
+)
+
+
+_SESSIONENDINFO = _descriptor.Descriptor(
+  name='SessionEndInfo',
+  full_name='tensorboardX.hparam.SessionEndInfo',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='status', full_name='tensorboardX.hparam.SessionEndInfo.status', index=0,
+      number=1, type=14, cpp_type=8, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='end_time_secs', full_name='tensorboardX.hparam.SessionEndInfo.end_time_secs', index=1,
+      number=2, type=1, cpp_type=5, label=1,
+      has_default_value=False, default_value=float(0),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=607,
+  serialized_end=691,
+)
+
+_HPARAMSPLUGINDATA.fields_by_name['experiment'].message_type = tensorboardX_dot_proto_dot_api__pb2._EXPERIMENT
+_HPARAMSPLUGINDATA.fields_by_name['session_start_info'].message_type = _SESSIONSTARTINFO
+_HPARAMSPLUGINDATA.fields_by_name['session_end_info'].message_type = _SESSIONENDINFO
+_HPARAMSPLUGINDATA.oneofs_by_name['data'].fields.append(
+  _HPARAMSPLUGINDATA.fields_by_name['experiment'])
+_HPARAMSPLUGINDATA.fields_by_name['experiment'].containing_oneof = _HPARAMSPLUGINDATA.oneofs_by_name['data']
+_HPARAMSPLUGINDATA.oneofs_by_name['data'].fields.append(
+  _HPARAMSPLUGINDATA.fields_by_name['session_start_info'])
+_HPARAMSPLUGINDATA.fields_by_name['session_start_info'].containing_oneof = _HPARAMSPLUGINDATA.oneofs_by_name['data']
+_HPARAMSPLUGINDATA.oneofs_by_name['data'].fields.append(
+  _HPARAMSPLUGINDATA.fields_by_name['session_end_info'])
+_HPARAMSPLUGINDATA.fields_by_name['session_end_info'].containing_oneof = _HPARAMSPLUGINDATA.oneofs_by_name['data']
+_SESSIONSTARTINFO_HPARAMSENTRY.fields_by_name['value'].message_type = google_dot_protobuf_dot_struct__pb2._VALUE
+_SESSIONSTARTINFO_HPARAMSENTRY.containing_type = _SESSIONSTARTINFO
+_SESSIONSTARTINFO.fields_by_name['hparams'].message_type = _SESSIONSTARTINFO_HPARAMSENTRY
+_SESSIONENDINFO.fields_by_name['status'].enum_type = tensorboardX_dot_proto_dot_api__pb2._STATUS
+DESCRIPTOR.message_types_by_name['HParamsPluginData'] = _HPARAMSPLUGINDATA
+DESCRIPTOR.message_types_by_name['SessionStartInfo'] = _SESSIONSTARTINFO
+DESCRIPTOR.message_types_by_name['SessionEndInfo'] = _SESSIONENDINFO
+_sym_db.RegisterFileDescriptor(DESCRIPTOR)
+
+HParamsPluginData = _reflection.GeneratedProtocolMessageType('HParamsPluginData', (_message.Message,), dict(
+  DESCRIPTOR = _HPARAMSPLUGINDATA,
+  __module__ = 'tensorboardX.proto.plugin_hparams_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.hparam.HParamsPluginData)
+  ))
+_sym_db.RegisterMessage(HParamsPluginData)
+
+SessionStartInfo = _reflection.GeneratedProtocolMessageType('SessionStartInfo', (_message.Message,), dict(
+
+  HparamsEntry = _reflection.GeneratedProtocolMessageType('HparamsEntry', (_message.Message,), dict(
+    DESCRIPTOR = _SESSIONSTARTINFO_HPARAMSENTRY,
+    __module__ = 'tensorboardX.proto.plugin_hparams_pb2'
+    # @@protoc_insertion_point(class_scope:tensorboardX.hparam.SessionStartInfo.HparamsEntry)
+    ))
+  ,
+  DESCRIPTOR = _SESSIONSTARTINFO,
+  __module__ = 'tensorboardX.proto.plugin_hparams_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.hparam.SessionStartInfo)
+  ))
+_sym_db.RegisterMessage(SessionStartInfo)
+_sym_db.RegisterMessage(SessionStartInfo.HparamsEntry)
+
+SessionEndInfo = _reflection.GeneratedProtocolMessageType('SessionEndInfo', (_message.Message,), dict(
+  DESCRIPTOR = _SESSIONENDINFO,
+  __module__ = 'tensorboardX.proto.plugin_hparams_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.hparam.SessionEndInfo)
+  ))
+_sym_db.RegisterMessage(SessionEndInfo)
+
+
+_SESSIONSTARTINFO_HPARAMSENTRY._options = None
+# @@protoc_insertion_point(module_scope)
diff --git a/tensorboardX/tensorboardX/proto/plugin_mesh.proto b/tensorboardX/tensorboardX/proto/plugin_mesh.proto
new file mode 100644
index 0000000000000000000000000000000000000000..8bc496fb3d928e51e9709ed63836bbfd284a7853
--- /dev/null
+++ b/tensorboardX/tensorboardX/proto/plugin_mesh.proto
@@ -0,0 +1,29 @@
+syntax = "proto3";
+
+package tensorboardX.mesh;
+
+// A MeshPluginData describes one summary of a mesh: the mesh it belongs to,
+// the type and shape of its data, and its ThreeJS display configuration.
+message MeshPluginData {
+  enum ContentType {
+    UNDEFINED = 0;
+    VERTEX = 1;
+    FACE = 2;  // Triangle face.
+    COLOR = 3;
+  }
+
+  // Version `0` is the only supported version.
+  int32 version = 1;
+
+  // The name of the mesh summary this particular summary belongs to.
+  string name = 2;
+
+  // Type of data in the summary.
+  ContentType content_type = 3;
+
+  // JSON-serialized dictionary of ThreeJS classes configuration.
+  string json_config = 5;
+
+  // Shape of underlying data. Cache it here for performance reasons.
+  repeated int32 shape = 6;
+}
diff --git a/tensorboardX/tensorboardX/proto/plugin_mesh_pb2.py b/tensorboardX/tensorboardX/proto/plugin_mesh_pb2.py
new file mode 100644
index 0000000000000000000000000000000000000000..afbb209d06b3864ef2cf899ac9298ced506ced4e
--- /dev/null
+++ b/tensorboardX/tensorboardX/proto/plugin_mesh_pb2.py
@@ -0,0 +1,130 @@
+# Generated by the protocol buffer compiler.  DO NOT EDIT!
+# source: tensorboardX/proto/plugin_mesh.proto
+
+import sys
+_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1'))
+from google.protobuf import descriptor as _descriptor
+from google.protobuf import message as _message
+from google.protobuf import reflection as _reflection
+from google.protobuf import symbol_database as _symbol_database
+# @@protoc_insertion_point(imports)
+
+_sym_db = _symbol_database.Default()
+
+
+
+
+DESCRIPTOR = _descriptor.FileDescriptor(
+  name='tensorboardX/proto/plugin_mesh.proto',
+  package='tensorboardX.mesh',
+  syntax='proto3',
+  serialized_options=None,
+  serialized_pb=_b('\n$tensorboardX/proto/plugin_mesh.proto\x12\x11tensorboardX.mesh\"\xd7\x01\n\x0eMeshPluginData\x12\x0f\n\x07version\x18\x01 \x01(\x05\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x43\n\x0c\x63ontent_type\x18\x03 \x01(\x0e\x32-.tensorboardX.mesh.MeshPluginData.ContentType\x12\x13\n\x0bjson_config\x18\x05 \x01(\t\x12\r\n\x05shape\x18\x06 \x03(\x05\"=\n\x0b\x43ontentType\x12\r\n\tUNDEFINED\x10\x00\x12\n\n\x06VERTEX\x10\x01\x12\x08\n\x04\x46\x41\x43\x45\x10\x02\x12\t\n\x05\x43OLOR\x10\x03\x62\x06proto3')
+)
+
+
+
+_MESHPLUGINDATA_CONTENTTYPE = _descriptor.EnumDescriptor(
+  name='ContentType',
+  full_name='tensorboardX.mesh.MeshPluginData.ContentType',
+  filename=None,
+  file=DESCRIPTOR,
+  values=[
+    _descriptor.EnumValueDescriptor(
+      name='UNDEFINED', index=0, number=0,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='VERTEX', index=1, number=1,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='FACE', index=2, number=2,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='COLOR', index=3, number=3,
+      serialized_options=None,
+      type=None),
+  ],
+  containing_type=None,
+  serialized_options=None,
+  serialized_start=214,
+  serialized_end=275,
+)
+_sym_db.RegisterEnumDescriptor(_MESHPLUGINDATA_CONTENTTYPE)
+
+
+_MESHPLUGINDATA = _descriptor.Descriptor(
+  name='MeshPluginData',
+  full_name='tensorboardX.mesh.MeshPluginData',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='version', full_name='tensorboardX.mesh.MeshPluginData.version', index=0,
+      number=1, type=5, cpp_type=1, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='name', full_name='tensorboardX.mesh.MeshPluginData.name', index=1,
+      number=2, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='content_type', full_name='tensorboardX.mesh.MeshPluginData.content_type', index=2,
+      number=3, type=14, cpp_type=8, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='json_config', full_name='tensorboardX.mesh.MeshPluginData.json_config', index=3,
+      number=5, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='shape', full_name='tensorboardX.mesh.MeshPluginData.shape', index=4,
+      number=6, type=5, cpp_type=1, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+    _MESHPLUGINDATA_CONTENTTYPE,
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=60,
+  serialized_end=275,
+)
+
+_MESHPLUGINDATA.fields_by_name['content_type'].enum_type = _MESHPLUGINDATA_CONTENTTYPE
+_MESHPLUGINDATA_CONTENTTYPE.containing_type = _MESHPLUGINDATA
+DESCRIPTOR.message_types_by_name['MeshPluginData'] = _MESHPLUGINDATA
+_sym_db.RegisterFileDescriptor(DESCRIPTOR)
+
+MeshPluginData = _reflection.GeneratedProtocolMessageType('MeshPluginData', (_message.Message,), dict(
+  DESCRIPTOR = _MESHPLUGINDATA,
+  __module__ = 'tensorboardX.proto.plugin_mesh_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.mesh.MeshPluginData)
+  ))
+_sym_db.RegisterMessage(MeshPluginData)
+
+
+# @@protoc_insertion_point(module_scope)
diff --git a/tensorboardX/tensorboardX/proto/plugin_pr_curve.proto b/tensorboardX/tensorboardX/proto/plugin_pr_curve.proto
new file mode 100644
index 0000000000000000000000000000000000000000..e489fa8df60c50209c28d786ec0eb41730655018
--- /dev/null
+++ b/tensorboardX/tensorboardX/proto/plugin_pr_curve.proto
@@ -0,0 +1,25 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+syntax = "proto3";
+
+package tensorboardX;
+
+message PrCurvePluginData {
+  // Version `0` is the only supported version.
+  int32 version = 1;
+
+  uint32 num_thresholds = 2;
+}
diff --git a/tensorboardX/tensorboardX/proto/plugin_pr_curve_pb2.py b/tensorboardX/tensorboardX/proto/plugin_pr_curve_pb2.py
new file mode 100644
index 0000000000000000000000000000000000000000..47f7385c2efe81c6048eee2cb674bdd0dbd0ef69
--- /dev/null
+++ b/tensorboardX/tensorboardX/proto/plugin_pr_curve_pb2.py
@@ -0,0 +1,76 @@
+# Generated by the protocol buffer compiler.  DO NOT EDIT!
+# source: tensorboardX/proto/plugin_pr_curve.proto
+
+import sys
+_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1'))
+from google.protobuf import descriptor as _descriptor
+from google.protobuf import message as _message
+from google.protobuf import reflection as _reflection
+from google.protobuf import symbol_database as _symbol_database
+# @@protoc_insertion_point(imports)
+
+_sym_db = _symbol_database.Default()
+
+
+
+
+DESCRIPTOR = _descriptor.FileDescriptor(
+  name='tensorboardX/proto/plugin_pr_curve.proto',
+  package='tensorboardX',
+  syntax='proto3',
+  serialized_options=None,
+  serialized_pb=_b('\n(tensorboardX/proto/plugin_pr_curve.proto\x12\x0ctensorboardX\"<\n\x11PrCurvePluginData\x12\x0f\n\x07version\x18\x01 \x01(\x05\x12\x16\n\x0enum_thresholds\x18\x02 \x01(\rb\x06proto3')
+)
+
+
+
+
+_PRCURVEPLUGINDATA = _descriptor.Descriptor(
+  name='PrCurvePluginData',
+  full_name='tensorboardX.PrCurvePluginData',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='version', full_name='tensorboardX.PrCurvePluginData.version', index=0,
+      number=1, type=5, cpp_type=1, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='num_thresholds', full_name='tensorboardX.PrCurvePluginData.num_thresholds', index=1,
+      number=2, type=13, cpp_type=3, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=58,
+  serialized_end=118,
+)
+
+DESCRIPTOR.message_types_by_name['PrCurvePluginData'] = _PRCURVEPLUGINDATA
+_sym_db.RegisterFileDescriptor(DESCRIPTOR)
+
+PrCurvePluginData = _reflection.GeneratedProtocolMessageType('PrCurvePluginData', (_message.Message,), dict(
+  DESCRIPTOR = _PRCURVEPLUGINDATA,
+  __module__ = 'tensorboardX.proto.plugin_pr_curve_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.PrCurvePluginData)
+  ))
+_sym_db.RegisterMessage(PrCurvePluginData)
+
+
+# @@protoc_insertion_point(module_scope)
diff --git a/tensorboardX/tensorboardX/proto/plugin_text.proto b/tensorboardX/tensorboardX/proto/plugin_text.proto
new file mode 100644
index 0000000000000000000000000000000000000000..6c163d0737adfe8f1c7bc22819f017dc1b367239
--- /dev/null
+++ b/tensorboardX/tensorboardX/proto/plugin_text.proto
@@ -0,0 +1,27 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+syntax = "proto3";
+
+package tensorboardX;
+
+// Text summaries created by the `tensorboard.plugins.text.summary`
+// module will include `SummaryMetadata` whose `plugin_data` field has
+// as `content` a binary string that is the encoding of a
+// `TextPluginData` proto.
+message TextPluginData {
+  // Version `0` is the only supported version.
+  int32 version = 1;
+}
diff --git a/tensorboardX/tensorboardX/proto/plugin_text_pb2.py b/tensorboardX/tensorboardX/proto/plugin_text_pb2.py
new file mode 100644
index 0000000000000000000000000000000000000000..11d3842cabcccb79a6bd8ad8b6e7ae10e64a0546
--- /dev/null
+++ b/tensorboardX/tensorboardX/proto/plugin_text_pb2.py
@@ -0,0 +1,69 @@
+# Generated by the protocol buffer compiler.  DO NOT EDIT!
+# source: tensorboardX/proto/plugin_text.proto
+
+import sys
+_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1'))
+from google.protobuf import descriptor as _descriptor
+from google.protobuf import message as _message
+from google.protobuf import reflection as _reflection
+from google.protobuf import symbol_database as _symbol_database
+# @@protoc_insertion_point(imports)
+
+_sym_db = _symbol_database.Default()
+
+
+
+
+DESCRIPTOR = _descriptor.FileDescriptor(
+  name='tensorboardX/proto/plugin_text.proto',
+  package='tensorboardX',
+  syntax='proto3',
+  serialized_options=None,
+  serialized_pb=_b('\n$tensorboardX/proto/plugin_text.proto\x12\x0ctensorboardX\"!\n\x0eTextPluginData\x12\x0f\n\x07version\x18\x01 \x01(\x05\x62\x06proto3')
+)
+
+
+
+
+_TEXTPLUGINDATA = _descriptor.Descriptor(
+  name='TextPluginData',
+  full_name='tensorboardX.TextPluginData',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='version', full_name='tensorboardX.TextPluginData.version', index=0,
+      number=1, type=5, cpp_type=1, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=54,
+  serialized_end=87,
+)
+
+DESCRIPTOR.message_types_by_name['TextPluginData'] = _TEXTPLUGINDATA
+_sym_db.RegisterFileDescriptor(DESCRIPTOR)
+
+TextPluginData = _reflection.GeneratedProtocolMessageType('TextPluginData', (_message.Message,), dict(
+  DESCRIPTOR = _TEXTPLUGINDATA,
+  __module__ = 'tensorboardX.proto.plugin_text_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.TextPluginData)
+  ))
+_sym_db.RegisterMessage(TextPluginData)
+
+
+# @@protoc_insertion_point(module_scope)
diff --git a/tensorboardX/tensorboardX/proto/resource_handle.proto b/tensorboardX/tensorboardX/proto/resource_handle.proto
new file mode 100644
index 0000000000000000000000000000000000000000..bdb78997bd7b1ece459ced55859c643f3ea2490e
--- /dev/null
+++ b/tensorboardX/tensorboardX/proto/resource_handle.proto
@@ -0,0 +1,29 @@
+syntax = "proto3";
+
+package tensorboardX;
+option cc_enable_arenas = true;
+option java_outer_classname = "ResourceHandle";
+option java_multiple_files = true;
+option java_package = "org.tensorflow.framework";
+
+// Protocol buffer representing a handle to a tensorflow resource. Handles are
+// not valid across executions, but can be serialized back and forth from within
+// a single run.
+message ResourceHandleProto {
+  // Unique name for the device containing the resource.
+  string device = 1;
+
+  // Container in which this resource is placed.
+  string container = 2;
+
+  // Unique name of this resource.
+  string name = 3;
+
+  // Hash code for the type of the resource. It is only valid on the same
+  // device and in the same execution.
+  uint64 hash_code = 4;
+
+  // For debugging only: the name of the type pointed to by this handle, if
+  // available.
+  string maybe_type_name = 5;
+};
diff --git a/tensorboardX/tensorboardX/proto/resource_handle_pb2.py b/tensorboardX/tensorboardX/proto/resource_handle_pb2.py
new file mode 100644
index 0000000000000000000000000000000000000000..d67707e0d0e0c6b6331d690100f3f16b8d38d286
--- /dev/null
+++ b/tensorboardX/tensorboardX/proto/resource_handle_pb2.py
@@ -0,0 +1,98 @@
+# Generated by the protocol buffer compiler.  DO NOT EDIT!
+# source: tensorboardX/proto/resource_handle.proto
+
+import sys
+_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1'))
+from google.protobuf import descriptor as _descriptor
+from google.protobuf import message as _message
+from google.protobuf import reflection as _reflection
+from google.protobuf import symbol_database as _symbol_database
+# @@protoc_insertion_point(imports)
+
+_sym_db = _symbol_database.Default()
+
+
+
+
+DESCRIPTOR = _descriptor.FileDescriptor(
+  name='tensorboardX/proto/resource_handle.proto',
+  package='tensorboardX',
+  syntax='proto3',
+  serialized_options=_b('\n\030org.tensorflow.frameworkB\016ResourceHandleP\001\370\001\001'),
+  serialized_pb=_b('\n(tensorboardX/proto/resource_handle.proto\x12\x0ctensorboardX\"r\n\x13ResourceHandleProto\x12\x0e\n\x06\x64\x65vice\x18\x01 \x01(\t\x12\x11\n\tcontainer\x18\x02 \x01(\t\x12\x0c\n\x04name\x18\x03 \x01(\t\x12\x11\n\thash_code\x18\x04 \x01(\x04\x12\x17\n\x0fmaybe_type_name\x18\x05 \x01(\tB/\n\x18org.tensorflow.frameworkB\x0eResourceHandleP\x01\xf8\x01\x01\x62\x06proto3')
+)
+
+
+
+
+_RESOURCEHANDLEPROTO = _descriptor.Descriptor(
+  name='ResourceHandleProto',
+  full_name='tensorboardX.ResourceHandleProto',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='device', full_name='tensorboardX.ResourceHandleProto.device', index=0,
+      number=1, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='container', full_name='tensorboardX.ResourceHandleProto.container', index=1,
+      number=2, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='name', full_name='tensorboardX.ResourceHandleProto.name', index=2,
+      number=3, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='hash_code', full_name='tensorboardX.ResourceHandleProto.hash_code', index=3,
+      number=4, type=4, cpp_type=4, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='maybe_type_name', full_name='tensorboardX.ResourceHandleProto.maybe_type_name', index=4,
+      number=5, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=58,
+  serialized_end=172,
+)
+
+DESCRIPTOR.message_types_by_name['ResourceHandleProto'] = _RESOURCEHANDLEPROTO
+_sym_db.RegisterFileDescriptor(DESCRIPTOR)
+
+ResourceHandleProto = _reflection.GeneratedProtocolMessageType('ResourceHandleProto', (_message.Message,), dict(
+  DESCRIPTOR = _RESOURCEHANDLEPROTO,
+  __module__ = 'tensorboardX.proto.resource_handle_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.ResourceHandleProto)
+  ))
+_sym_db.RegisterMessage(ResourceHandleProto)
+
+
+DESCRIPTOR._options = None
+# @@protoc_insertion_point(module_scope)
diff --git a/tensorboardX/tensorboardX/proto/step_stats.proto b/tensorboardX/tensorboardX/proto/step_stats.proto
new file mode 100644
index 0000000000000000000000000000000000000000..7b7e3371f146681f354a2e41f7e78a8d7449348d
--- /dev/null
+++ b/tensorboardX/tensorboardX/proto/step_stats.proto
@@ -0,0 +1,95 @@
+syntax = "proto3";
+
+package tensorboardX;
+option cc_enable_arenas = true;
+option java_outer_classname = "StepStatsProtos";
+option java_multiple_files = true;
+option java_package = "org.tensorflow.framework";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
+//import "tensorflow/core/framework/allocation_description.proto";
+//import "tensorflow/core/framework/tensor_description.proto";
+
+// An allocation/de-allocation operation performed by the allocator.
+message AllocationRecord {
+  // The timestamp of the operation.
+  int64 alloc_micros = 1;
+  // Number of bytes allocated, or de-allocated if negative.
+  int64 alloc_bytes = 2;
+}
+
+message AllocatorMemoryUsed {
+  string allocator_name = 1;
+  // These are per-node allocator memory stats.
+  int64 total_bytes = 2;
+  int64 peak_bytes = 3;
+  // The bytes that are not deallocated.
+  int64 live_bytes = 4;
+  // The allocation and deallocation timeline.
+  repeated AllocationRecord allocation_records = 6;
+
+  // These are snapshots of the overall allocator memory stats.
+  // The number of live bytes currently allocated by the allocator.
+  int64 allocator_bytes_in_use = 5;
+}
+
+// Output sizes recorded for a single execution of a graph node.
+message NodeOutput {
+  int32 slot = 1;
+  // TensorDescription tensor_description = 3;
+};
+
+// For memory tracking.
+message MemoryStats {
+  int64 temp_memory_size = 1;
+  int64 persistent_memory_size = 3;
+  repeated int64 persistent_tensor_alloc_ids = 5;
+
+  int64 device_temp_memory_size = 2 [deprecated = true];
+  int64 device_persistent_memory_size = 4 [deprecated = true];
+  repeated int64 device_persistent_tensor_alloc_ids = 6 [deprecated = true];
+}
+
+// Time/size stats recorded for a single execution of a graph node.
+message NodeExecStats {
+  // TODO(tucker): Use some more compact form of node identity than
+  // the full string name.  Either all processes should agree on a
+  // global id (cost_id?) for each node, or we should use a hash of
+  // the name.
+  string node_name = 1;
+  int64 all_start_micros = 2;
+  int64 op_start_rel_micros = 3;
+  int64 op_end_rel_micros = 4;
+  int64 all_end_rel_micros = 5;
+  repeated AllocatorMemoryUsed memory = 6;
+  repeated NodeOutput output = 7;
+  string timeline_label = 8;
+  int64 scheduled_micros = 9;
+  uint32 thread_id = 10;
+  // repeated AllocationDescription referenced_tensor = 11;
+  MemoryStats memory_stats = 12;
+};
+
+message DeviceStepStats {
+  string device = 1;
+  repeated NodeExecStats node_stats = 2;
+}
+
+message StepStats {
+  repeated DeviceStepStats dev_stats = 1;
+};
+
+
+// NOTE(lanpa): the RunMetadata message below was copied from config.proto.
+// Metadata output (i.e., non-Tensor) for a single Run() call.
+message RunMetadata {
+  // Statistics traced for this step. Populated if tracing is turned on via the
+  // "RunOptions" proto.
+  // EXPERIMENTAL: The format and set of events may change in future versions.
+  StepStats step_stats = 1;
+
+  // The cost graph for the computation defined by the run call.
+  // CostGraphDef cost_graph = 2;
+
+  // Graphs of the partitions executed by executors.
+  // repeated GraphDef partition_graphs = 3;
+}
diff --git a/tensorboardX/tensorboardX/proto/step_stats_pb2.py b/tensorboardX/tensorboardX/proto/step_stats_pb2.py
new file mode 100644
index 0000000000000000000000000000000000000000..83e91e260a59badf9a73fe80cacca734d9aac35e
--- /dev/null
+++ b/tensorboardX/tensorboardX/proto/step_stats_pb2.py
@@ -0,0 +1,507 @@
+# Generated by the protocol buffer compiler.  DO NOT EDIT!
+# source: tensorboardX/proto/step_stats.proto
+
+import sys
+_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1'))
+from google.protobuf import descriptor as _descriptor
+from google.protobuf import message as _message
+from google.protobuf import reflection as _reflection
+from google.protobuf import symbol_database as _symbol_database
+# @@protoc_insertion_point(imports)
+
+_sym_db = _symbol_database.Default()
+
+
+
+
+DESCRIPTOR = _descriptor.FileDescriptor(
+  name='tensorboardX/proto/step_stats.proto',
+  package='tensorboardX',
+  syntax='proto3',
+  serialized_options=_b('\n\030org.tensorflow.frameworkB\017StepStatsProtosP\001Z=github.com/tensorflow/tensorflow/tensorflow/go/core/framework\370\001\001'),
+  serialized_pb=_b('\n#tensorboardX/proto/step_stats.proto\x12\x0ctensorboardX\"=\n\x10\x41llocationRecord\x12\x14\n\x0c\x61lloc_micros\x18\x01 \x01(\x03\x12\x13\n\x0b\x61lloc_bytes\x18\x02 \x01(\x03\"\xc6\x01\n\x13\x41llocatorMemoryUsed\x12\x16\n\x0e\x61llocator_name\x18\x01 \x01(\t\x12\x13\n\x0btotal_bytes\x18\x02 \x01(\x03\x12\x12\n\npeak_bytes\x18\x03 \x01(\x03\x12\x12\n\nlive_bytes\x18\x04 \x01(\x03\x12:\n\x12\x61llocation_records\x18\x06 \x03(\x0b\x32\x1e.tensorboardX.AllocationRecord\x12\x1e\n\x16\x61llocator_bytes_in_use\x18\x05 \x01(\x03\"\x1a\n\nNodeOutput\x12\x0c\n\x04slot\x18\x01 \x01(\x05\"\xec\x01\n\x0bMemoryStats\x12\x18\n\x10temp_memory_size\x18\x01 \x01(\x03\x12\x1e\n\x16persistent_memory_size\x18\x03 \x01(\x03\x12#\n\x1bpersistent_tensor_alloc_ids\x18\x05 \x03(\x03\x12#\n\x17\x64\x65vice_temp_memory_size\x18\x02 \x01(\x03\x42\x02\x18\x01\x12)\n\x1d\x64\x65vice_persistent_memory_size\x18\x04 \x01(\x03\x42\x02\x18\x01\x12.\n\"device_persistent_tensor_alloc_ids\x18\x06 \x03(\x03\x42\x02\x18\x01\"\xe3\x02\n\rNodeExecStats\x12\x11\n\tnode_name\x18\x01 \x01(\t\x12\x18\n\x10\x61ll_start_micros\x18\x02 \x01(\x03\x12\x1b\n\x13op_start_rel_micros\x18\x03 \x01(\x03\x12\x19\n\x11op_end_rel_micros\x18\x04 \x01(\x03\x12\x1a\n\x12\x61ll_end_rel_micros\x18\x05 \x01(\x03\x12\x31\n\x06memory\x18\x06 \x03(\x0b\x32!.tensorboardX.AllocatorMemoryUsed\x12(\n\x06output\x18\x07 \x03(\x0b\x32\x18.tensorboardX.NodeOutput\x12\x16\n\x0etimeline_label\x18\x08 \x01(\t\x12\x18\n\x10scheduled_micros\x18\t \x01(\x03\x12\x11\n\tthread_id\x18\n \x01(\r\x12/\n\x0cmemory_stats\x18\x0c \x01(\x0b\x32\x19.tensorboardX.MemoryStats\"R\n\x0f\x44\x65viceStepStats\x12\x0e\n\x06\x64\x65vice\x18\x01 \x01(\t\x12/\n\nnode_stats\x18\x02 \x03(\x0b\x32\x1b.tensorboardX.NodeExecStats\"=\n\tStepStats\x12\x30\n\tdev_stats\x18\x01 \x03(\x0b\x32\x1d.tensorboardX.DeviceStepStats\":\n\x0bRunMetadata\x12+\n\nstep_stats\x18\x01 
\x01(\x0b\x32\x17.tensorboardX.StepStatsBo\n\x18org.tensorflow.frameworkB\x0fStepStatsProtosP\x01Z=github.com/tensorflow/tensorflow/tensorflow/go/core/framework\xf8\x01\x01\x62\x06proto3')
+)
+
+
+
+
+_ALLOCATIONRECORD = _descriptor.Descriptor(
+  name='AllocationRecord',
+  full_name='tensorboardX.AllocationRecord',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='alloc_micros', full_name='tensorboardX.AllocationRecord.alloc_micros', index=0,
+      number=1, type=3, cpp_type=2, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='alloc_bytes', full_name='tensorboardX.AllocationRecord.alloc_bytes', index=1,
+      number=2, type=3, cpp_type=2, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=53,
+  serialized_end=114,
+)
+
+
+_ALLOCATORMEMORYUSED = _descriptor.Descriptor(
+  name='AllocatorMemoryUsed',
+  full_name='tensorboardX.AllocatorMemoryUsed',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='allocator_name', full_name='tensorboardX.AllocatorMemoryUsed.allocator_name', index=0,
+      number=1, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='total_bytes', full_name='tensorboardX.AllocatorMemoryUsed.total_bytes', index=1,
+      number=2, type=3, cpp_type=2, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='peak_bytes', full_name='tensorboardX.AllocatorMemoryUsed.peak_bytes', index=2,
+      number=3, type=3, cpp_type=2, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='live_bytes', full_name='tensorboardX.AllocatorMemoryUsed.live_bytes', index=3,
+      number=4, type=3, cpp_type=2, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='allocation_records', full_name='tensorboardX.AllocatorMemoryUsed.allocation_records', index=4,
+      number=6, type=11, cpp_type=10, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='allocator_bytes_in_use', full_name='tensorboardX.AllocatorMemoryUsed.allocator_bytes_in_use', index=5,
+      number=5, type=3, cpp_type=2, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=117,
+  serialized_end=315,
+)
+
+
+_NODEOUTPUT = _descriptor.Descriptor(
+  name='NodeOutput',
+  full_name='tensorboardX.NodeOutput',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='slot', full_name='tensorboardX.NodeOutput.slot', index=0,
+      number=1, type=5, cpp_type=1, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=317,
+  serialized_end=343,
+)
+
+
+_MEMORYSTATS = _descriptor.Descriptor(
+  name='MemoryStats',
+  full_name='tensorboardX.MemoryStats',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='temp_memory_size', full_name='tensorboardX.MemoryStats.temp_memory_size', index=0,
+      number=1, type=3, cpp_type=2, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='persistent_memory_size', full_name='tensorboardX.MemoryStats.persistent_memory_size', index=1,
+      number=3, type=3, cpp_type=2, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='persistent_tensor_alloc_ids', full_name='tensorboardX.MemoryStats.persistent_tensor_alloc_ids', index=2,
+      number=5, type=3, cpp_type=2, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='device_temp_memory_size', full_name='tensorboardX.MemoryStats.device_temp_memory_size', index=3,
+      number=2, type=3, cpp_type=2, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=_b('\030\001'), file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='device_persistent_memory_size', full_name='tensorboardX.MemoryStats.device_persistent_memory_size', index=4,
+      number=4, type=3, cpp_type=2, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=_b('\030\001'), file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='device_persistent_tensor_alloc_ids', full_name='tensorboardX.MemoryStats.device_persistent_tensor_alloc_ids', index=5,
+      number=6, type=3, cpp_type=2, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=_b('\030\001'), file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=346,
+  serialized_end=582,
+)
+
+
+_NODEEXECSTATS = _descriptor.Descriptor(
+  name='NodeExecStats',
+  full_name='tensorboardX.NodeExecStats',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='node_name', full_name='tensorboardX.NodeExecStats.node_name', index=0,
+      number=1, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='all_start_micros', full_name='tensorboardX.NodeExecStats.all_start_micros', index=1,
+      number=2, type=3, cpp_type=2, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='op_start_rel_micros', full_name='tensorboardX.NodeExecStats.op_start_rel_micros', index=2,
+      number=3, type=3, cpp_type=2, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='op_end_rel_micros', full_name='tensorboardX.NodeExecStats.op_end_rel_micros', index=3,
+      number=4, type=3, cpp_type=2, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='all_end_rel_micros', full_name='tensorboardX.NodeExecStats.all_end_rel_micros', index=4,
+      number=5, type=3, cpp_type=2, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='memory', full_name='tensorboardX.NodeExecStats.memory', index=5,
+      number=6, type=11, cpp_type=10, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='output', full_name='tensorboardX.NodeExecStats.output', index=6,
+      number=7, type=11, cpp_type=10, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='timeline_label', full_name='tensorboardX.NodeExecStats.timeline_label', index=7,
+      number=8, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='scheduled_micros', full_name='tensorboardX.NodeExecStats.scheduled_micros', index=8,
+      number=9, type=3, cpp_type=2, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='thread_id', full_name='tensorboardX.NodeExecStats.thread_id', index=9,
+      number=10, type=13, cpp_type=3, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='memory_stats', full_name='tensorboardX.NodeExecStats.memory_stats', index=10,
+      number=12, type=11, cpp_type=10, label=1,
+      has_default_value=False, default_value=None,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=585,
+  serialized_end=940,
+)
+
+
+_DEVICESTEPSTATS = _descriptor.Descriptor(
+  name='DeviceStepStats',
+  full_name='tensorboardX.DeviceStepStats',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='device', full_name='tensorboardX.DeviceStepStats.device', index=0,
+      number=1, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='node_stats', full_name='tensorboardX.DeviceStepStats.node_stats', index=1,
+      number=2, type=11, cpp_type=10, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=942,
+  serialized_end=1024,
+)
+
+
+_STEPSTATS = _descriptor.Descriptor(
+  name='StepStats',
+  full_name='tensorboardX.StepStats',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='dev_stats', full_name='tensorboardX.StepStats.dev_stats', index=0,
+      number=1, type=11, cpp_type=10, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=1026,
+  serialized_end=1087,
+)
+
+
+_RUNMETADATA = _descriptor.Descriptor(
+  name='RunMetadata',
+  full_name='tensorboardX.RunMetadata',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='step_stats', full_name='tensorboardX.RunMetadata.step_stats', index=0,
+      number=1, type=11, cpp_type=10, label=1,
+      has_default_value=False, default_value=None,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=1089,
+  serialized_end=1147,
+)
+
+_ALLOCATORMEMORYUSED.fields_by_name['allocation_records'].message_type = _ALLOCATIONRECORD
+_NODEEXECSTATS.fields_by_name['memory'].message_type = _ALLOCATORMEMORYUSED
+_NODEEXECSTATS.fields_by_name['output'].message_type = _NODEOUTPUT
+_NODEEXECSTATS.fields_by_name['memory_stats'].message_type = _MEMORYSTATS
+_DEVICESTEPSTATS.fields_by_name['node_stats'].message_type = _NODEEXECSTATS
+_STEPSTATS.fields_by_name['dev_stats'].message_type = _DEVICESTEPSTATS
+_RUNMETADATA.fields_by_name['step_stats'].message_type = _STEPSTATS
+DESCRIPTOR.message_types_by_name['AllocationRecord'] = _ALLOCATIONRECORD
+DESCRIPTOR.message_types_by_name['AllocatorMemoryUsed'] = _ALLOCATORMEMORYUSED
+DESCRIPTOR.message_types_by_name['NodeOutput'] = _NODEOUTPUT
+DESCRIPTOR.message_types_by_name['MemoryStats'] = _MEMORYSTATS
+DESCRIPTOR.message_types_by_name['NodeExecStats'] = _NODEEXECSTATS
+DESCRIPTOR.message_types_by_name['DeviceStepStats'] = _DEVICESTEPSTATS
+DESCRIPTOR.message_types_by_name['StepStats'] = _STEPSTATS
+DESCRIPTOR.message_types_by_name['RunMetadata'] = _RUNMETADATA
+_sym_db.RegisterFileDescriptor(DESCRIPTOR)
+
+AllocationRecord = _reflection.GeneratedProtocolMessageType('AllocationRecord', (_message.Message,), dict(
+  DESCRIPTOR = _ALLOCATIONRECORD,
+  __module__ = 'tensorboardX.proto.step_stats_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.AllocationRecord)
+  ))
+_sym_db.RegisterMessage(AllocationRecord)
+
+AllocatorMemoryUsed = _reflection.GeneratedProtocolMessageType('AllocatorMemoryUsed', (_message.Message,), dict(
+  DESCRIPTOR = _ALLOCATORMEMORYUSED,
+  __module__ = 'tensorboardX.proto.step_stats_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.AllocatorMemoryUsed)
+  ))
+_sym_db.RegisterMessage(AllocatorMemoryUsed)
+
+NodeOutput = _reflection.GeneratedProtocolMessageType('NodeOutput', (_message.Message,), dict(
+  DESCRIPTOR = _NODEOUTPUT,
+  __module__ = 'tensorboardX.proto.step_stats_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.NodeOutput)
+  ))
+_sym_db.RegisterMessage(NodeOutput)
+
+MemoryStats = _reflection.GeneratedProtocolMessageType('MemoryStats', (_message.Message,), dict(
+  DESCRIPTOR = _MEMORYSTATS,
+  __module__ = 'tensorboardX.proto.step_stats_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.MemoryStats)
+  ))
+_sym_db.RegisterMessage(MemoryStats)
+
+NodeExecStats = _reflection.GeneratedProtocolMessageType('NodeExecStats', (_message.Message,), dict(
+  DESCRIPTOR = _NODEEXECSTATS,
+  __module__ = 'tensorboardX.proto.step_stats_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.NodeExecStats)
+  ))
+_sym_db.RegisterMessage(NodeExecStats)
+
+DeviceStepStats = _reflection.GeneratedProtocolMessageType('DeviceStepStats', (_message.Message,), dict(
+  DESCRIPTOR = _DEVICESTEPSTATS,
+  __module__ = 'tensorboardX.proto.step_stats_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.DeviceStepStats)
+  ))
+_sym_db.RegisterMessage(DeviceStepStats)
+
+StepStats = _reflection.GeneratedProtocolMessageType('StepStats', (_message.Message,), dict(
+  DESCRIPTOR = _STEPSTATS,
+  __module__ = 'tensorboardX.proto.step_stats_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.StepStats)
+  ))
+_sym_db.RegisterMessage(StepStats)
+
+RunMetadata = _reflection.GeneratedProtocolMessageType('RunMetadata', (_message.Message,), dict(
+  DESCRIPTOR = _RUNMETADATA,
+  __module__ = 'tensorboardX.proto.step_stats_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.RunMetadata)
+  ))
+_sym_db.RegisterMessage(RunMetadata)
+
+
+DESCRIPTOR._options = None
+_MEMORYSTATS.fields_by_name['device_temp_memory_size']._options = None
+_MEMORYSTATS.fields_by_name['device_persistent_memory_size']._options = None
+_MEMORYSTATS.fields_by_name['device_persistent_tensor_alloc_ids']._options = None
+# @@protoc_insertion_point(module_scope)
diff --git a/tensorboardX/tensorboardX/proto/summary.proto b/tensorboardX/tensorboardX/proto/summary.proto
new file mode 100644
index 0000000000000000000000000000000000000000..de3d50864659eb1dcaae54ff25ccab4345bc58c1
--- /dev/null
+++ b/tensorboardX/tensorboardX/proto/summary.proto
@@ -0,0 +1,124 @@
+syntax = "proto3";
+
+package tensorboardX;
+option cc_enable_arenas = true;
+option java_outer_classname = "SummaryProtos";
+option java_multiple_files = true;
+option java_package = "org.tensorflow.framework";
+
+import "tensorboardX/proto/tensor.proto";
+
+// Metadata associated with a series of Summary data
+message SummaryDescription {
+  // Hint on how plugins should process the data in this series.
+  // Supported values include "scalar", "histogram", "image", "audio".
+  string type_hint = 1;
+}
+
+// Serialization format for histogram module in
+// core/lib/histogram/histogram.h
+message HistogramProto {
+  double min = 1;
+  double max = 2;
+  double num = 3;
+  double sum = 4;
+  double sum_squares = 5;
+
+  // Parallel arrays encoding the bucket boundaries and the bucket values.
+  // bucket(i) is the count for the bucket i.  The range for
+  // a bucket is:
+  //   i == 0:  -DBL_MAX .. bucket_limit(0)
+  //   i != 0:  bucket_limit(i-1) .. bucket_limit(i)
+  repeated double bucket_limit = 6 [packed = true];
+  repeated double bucket = 7 [packed = true];
+};
+
+// A SummaryMetadata encapsulates information on which plugins are able to make
+// use of a certain summary value.
+message SummaryMetadata {
+  message PluginData {
+    // The name of the plugin this data pertains to.
+    string plugin_name = 1;
+
+    // The content to store for the plugin. The best practice is for this to be
+    // a binary serialized protocol buffer.
+    bytes content = 2;
+  }
+
+  // Data that associates a summary with a certain plugin.
+  PluginData plugin_data = 1;
+
+  // Display name for viewing in TensorBoard.
+  string display_name = 2;
+
+  // Longform readable description of the summary sequence. Markdown supported.
+  string summary_description = 3;
+};
+
+// A Summary is a set of named values to be displayed by the
+// visualizer.
+//
+// Summaries are produced regularly during training, as controlled by
+// the "summary_interval_secs" attribute of the training operation.
+// Summaries are also produced at the end of an evaluation.
+message Summary {
+  message Image {
+    // Dimensions of the image.
+    int32 height = 1;
+    int32 width = 2;
+    // Valid colorspace values are
+    //   1 - grayscale
+    //   2 - grayscale + alpha
+    //   3 - RGB
+    //   4 - RGBA
+    //   5 - DIGITAL_YUV
+    //   6 - BGRA
+    int32 colorspace = 3;
+    // Image data in encoded format.  All image formats supported by
+    // image_codec::CoderUtil can be stored here.
+    bytes encoded_image_string = 4;
+  }
+
+  message Audio {
+    // Sample rate of the audio in Hz.
+    float sample_rate = 1;
+    // Number of channels of audio.
+    int64 num_channels = 2;
+    // Length of the audio in frames (samples per channel).
+    int64 length_frames = 3;
+    // Encoded audio data and its associated RFC 2045 content type (e.g.
+    // "audio/wav").
+    bytes encoded_audio_string = 4;
+    string content_type = 5;
+  }
+
+  message Value {
+    // This field is deprecated and will not be set.
+    string node_name = 7;
+
+    // Tag name for the data. Used by TensorBoard plugins to organize data. Tags
+    // are often organized by scope (which contains slashes to convey
+    // hierarchy). For example: foo/bar/0
+    string tag = 1;
+
+    // Contains metadata on the summary value such as which plugins may use it.
+    // Take note that many summary values may lack a metadata field. This is
+    // because the FileWriter only keeps a metadata object on the first summary
+    // value written for each tag. TensorBoard then remembers which tags are
+    // associated with which plugins. This saves space.
+    SummaryMetadata metadata = 9;
+
+    // Value associated with the tag.
+    oneof value {
+      float simple_value = 2;
+      bytes obsolete_old_style_histogram = 3;
+      Image image = 4;
+      HistogramProto histo = 5;
+      Audio audio = 6;
+      TensorProto tensor = 8;
+    }
+  }
+
+  // Set of values for the summary.
+  repeated Value value = 1;
+}
diff --git a/tensorboardX/tensorboardX/proto/summary_pb2.py b/tensorboardX/tensorboardX/proto/summary_pb2.py
new file mode 100644
index 0000000000000000000000000000000000000000..6eeccba35801f8baf9a6ce0944473f03cb064bdd
--- /dev/null
+++ b/tensorboardX/tensorboardX/proto/summary_pb2.py
@@ -0,0 +1,543 @@
+# Generated by the protocol buffer compiler.  DO NOT EDIT!
+# source: tensorboardX/proto/summary.proto
+
+import sys
+_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1'))
+from google.protobuf import descriptor as _descriptor
+from google.protobuf import message as _message
+from google.protobuf import reflection as _reflection
+from google.protobuf import symbol_database as _symbol_database
+# @@protoc_insertion_point(imports)
+
+_sym_db = _symbol_database.Default()
+
+
+from tensorboardX.proto import tensor_pb2 as tensorboardX_dot_proto_dot_tensor__pb2
+
+
+DESCRIPTOR = _descriptor.FileDescriptor(
+  name='tensorboardX/proto/summary.proto',
+  package='tensorboardX',
+  syntax='proto3',
+  serialized_options=_b('\n\030org.tensorflow.frameworkB\rSummaryProtosP\001\370\001\001'),
+  serialized_pb=_b('\n tensorboardX/proto/summary.proto\x12\x0ctensorboardX\x1a\x1ftensorboardX/proto/tensor.proto\"\'\n\x12SummaryDescription\x12\x11\n\ttype_hint\x18\x01 \x01(\t\"\x87\x01\n\x0eHistogramProto\x12\x0b\n\x03min\x18\x01 \x01(\x01\x12\x0b\n\x03max\x18\x02 \x01(\x01\x12\x0b\n\x03num\x18\x03 \x01(\x01\x12\x0b\n\x03sum\x18\x04 \x01(\x01\x12\x13\n\x0bsum_squares\x18\x05 \x01(\x01\x12\x18\n\x0c\x62ucket_limit\x18\x06 \x03(\x01\x42\x02\x10\x01\x12\x12\n\x06\x62ucket\x18\x07 \x03(\x01\x42\x02\x10\x01\"\xb7\x01\n\x0fSummaryMetadata\x12=\n\x0bplugin_data\x18\x01 \x01(\x0b\x32(.tensorboardX.SummaryMetadata.PluginData\x12\x14\n\x0c\x64isplay_name\x18\x02 \x01(\t\x12\x1b\n\x13summary_description\x18\x03 \x01(\t\x1a\x32\n\nPluginData\x12\x13\n\x0bplugin_name\x18\x01 \x01(\t\x12\x0f\n\x07\x63ontent\x18\x02 \x01(\x0c\"\xea\x04\n\x07Summary\x12*\n\x05value\x18\x01 \x03(\x0b\x32\x1b.tensorboardX.Summary.Value\x1aX\n\x05Image\x12\x0e\n\x06height\x18\x01 \x01(\x05\x12\r\n\x05width\x18\x02 \x01(\x05\x12\x12\n\ncolorspace\x18\x03 \x01(\x05\x12\x1c\n\x14\x65ncoded_image_string\x18\x04 \x01(\x0c\x1a}\n\x05\x41udio\x12\x13\n\x0bsample_rate\x18\x01 \x01(\x02\x12\x14\n\x0cnum_channels\x18\x02 \x01(\x03\x12\x15\n\rlength_frames\x18\x03 \x01(\x03\x12\x1c\n\x14\x65ncoded_audio_string\x18\x04 \x01(\x0c\x12\x14\n\x0c\x63ontent_type\x18\x05 \x01(\t\x1a\xd9\x02\n\x05Value\x12\x11\n\tnode_name\x18\x07 \x01(\t\x12\x0b\n\x03tag\x18\x01 \x01(\t\x12/\n\x08metadata\x18\t \x01(\x0b\x32\x1d.tensorboardX.SummaryMetadata\x12\x16\n\x0csimple_value\x18\x02 \x01(\x02H\x00\x12&\n\x1cobsolete_old_style_histogram\x18\x03 \x01(\x0cH\x00\x12,\n\x05image\x18\x04 \x01(\x0b\x32\x1b.tensorboardX.Summary.ImageH\x00\x12-\n\x05histo\x18\x05 \x01(\x0b\x32\x1c.tensorboardX.HistogramProtoH\x00\x12,\n\x05\x61udio\x18\x06 \x01(\x0b\x32\x1b.tensorboardX.Summary.AudioH\x00\x12+\n\x06tensor\x18\x08 
\x01(\x0b\x32\x19.tensorboardX.TensorProtoH\x00\x42\x07\n\x05valueB.\n\x18org.tensorflow.frameworkB\rSummaryProtosP\x01\xf8\x01\x01\x62\x06proto3')
+  ,
+  dependencies=[tensorboardX_dot_proto_dot_tensor__pb2.DESCRIPTOR,])
+
+
+
+
+_SUMMARYDESCRIPTION = _descriptor.Descriptor(
+  name='SummaryDescription',
+  full_name='tensorboardX.SummaryDescription',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='type_hint', full_name='tensorboardX.SummaryDescription.type_hint', index=0,
+      number=1, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=83,
+  serialized_end=122,
+)
+
+
+_HISTOGRAMPROTO = _descriptor.Descriptor(
+  name='HistogramProto',
+  full_name='tensorboardX.HistogramProto',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='min', full_name='tensorboardX.HistogramProto.min', index=0,
+      number=1, type=1, cpp_type=5, label=1,
+      has_default_value=False, default_value=float(0),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='max', full_name='tensorboardX.HistogramProto.max', index=1,
+      number=2, type=1, cpp_type=5, label=1,
+      has_default_value=False, default_value=float(0),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='num', full_name='tensorboardX.HistogramProto.num', index=2,
+      number=3, type=1, cpp_type=5, label=1,
+      has_default_value=False, default_value=float(0),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='sum', full_name='tensorboardX.HistogramProto.sum', index=3,
+      number=4, type=1, cpp_type=5, label=1,
+      has_default_value=False, default_value=float(0),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='sum_squares', full_name='tensorboardX.HistogramProto.sum_squares', index=4,
+      number=5, type=1, cpp_type=5, label=1,
+      has_default_value=False, default_value=float(0),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='bucket_limit', full_name='tensorboardX.HistogramProto.bucket_limit', index=5,
+      number=6, type=1, cpp_type=5, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=_b('\020\001'), file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='bucket', full_name='tensorboardX.HistogramProto.bucket', index=6,
+      number=7, type=1, cpp_type=5, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=_b('\020\001'), file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=125,
+  serialized_end=260,
+)
+
+
+_SUMMARYMETADATA_PLUGINDATA = _descriptor.Descriptor(
+  name='PluginData',
+  full_name='tensorboardX.SummaryMetadata.PluginData',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='plugin_name', full_name='tensorboardX.SummaryMetadata.PluginData.plugin_name', index=0,
+      number=1, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='content', full_name='tensorboardX.SummaryMetadata.PluginData.content', index=1,
+      number=2, type=12, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b(""),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=396,
+  serialized_end=446,
+)
+
+_SUMMARYMETADATA = _descriptor.Descriptor(
+  name='SummaryMetadata',
+  full_name='tensorboardX.SummaryMetadata',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='plugin_data', full_name='tensorboardX.SummaryMetadata.plugin_data', index=0,
+      number=1, type=11, cpp_type=10, label=1,
+      has_default_value=False, default_value=None,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='display_name', full_name='tensorboardX.SummaryMetadata.display_name', index=1,
+      number=2, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='summary_description', full_name='tensorboardX.SummaryMetadata.summary_description', index=2,
+      number=3, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[_SUMMARYMETADATA_PLUGINDATA, ],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=263,
+  serialized_end=446,
+)
+
+
+_SUMMARY_IMAGE = _descriptor.Descriptor(
+  name='Image',
+  full_name='tensorboardX.Summary.Image',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='height', full_name='tensorboardX.Summary.Image.height', index=0,
+      number=1, type=5, cpp_type=1, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='width', full_name='tensorboardX.Summary.Image.width', index=1,
+      number=2, type=5, cpp_type=1, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='colorspace', full_name='tensorboardX.Summary.Image.colorspace', index=2,
+      number=3, type=5, cpp_type=1, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='encoded_image_string', full_name='tensorboardX.Summary.Image.encoded_image_string', index=3,
+      number=4, type=12, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b(""),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=504,
+  serialized_end=592,
+)
+
+_SUMMARY_AUDIO = _descriptor.Descriptor(
+  name='Audio',
+  full_name='tensorboardX.Summary.Audio',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='sample_rate', full_name='tensorboardX.Summary.Audio.sample_rate', index=0,
+      number=1, type=2, cpp_type=6, label=1,
+      has_default_value=False, default_value=float(0),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='num_channels', full_name='tensorboardX.Summary.Audio.num_channels', index=1,
+      number=2, type=3, cpp_type=2, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='length_frames', full_name='tensorboardX.Summary.Audio.length_frames', index=2,
+      number=3, type=3, cpp_type=2, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='encoded_audio_string', full_name='tensorboardX.Summary.Audio.encoded_audio_string', index=3,
+      number=4, type=12, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b(""),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='content_type', full_name='tensorboardX.Summary.Audio.content_type', index=4,
+      number=5, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=594,
+  serialized_end=719,
+)
+
+_SUMMARY_VALUE = _descriptor.Descriptor(
+  name='Value',
+  full_name='tensorboardX.Summary.Value',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='node_name', full_name='tensorboardX.Summary.Value.node_name', index=0,
+      number=7, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='tag', full_name='tensorboardX.Summary.Value.tag', index=1,
+      number=1, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='metadata', full_name='tensorboardX.Summary.Value.metadata', index=2,
+      number=9, type=11, cpp_type=10, label=1,
+      has_default_value=False, default_value=None,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='simple_value', full_name='tensorboardX.Summary.Value.simple_value', index=3,
+      number=2, type=2, cpp_type=6, label=1,
+      has_default_value=False, default_value=float(0),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='obsolete_old_style_histogram', full_name='tensorboardX.Summary.Value.obsolete_old_style_histogram', index=4,
+      number=3, type=12, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b(""),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='image', full_name='tensorboardX.Summary.Value.image', index=5,
+      number=4, type=11, cpp_type=10, label=1,
+      has_default_value=False, default_value=None,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='histo', full_name='tensorboardX.Summary.Value.histo', index=6,
+      number=5, type=11, cpp_type=10, label=1,
+      has_default_value=False, default_value=None,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='audio', full_name='tensorboardX.Summary.Value.audio', index=7,
+      number=6, type=11, cpp_type=10, label=1,
+      has_default_value=False, default_value=None,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='tensor', full_name='tensorboardX.Summary.Value.tensor', index=8,
+      number=8, type=11, cpp_type=10, label=1,
+      has_default_value=False, default_value=None,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+    _descriptor.OneofDescriptor(
+      name='value', full_name='tensorboardX.Summary.Value.value',
+      index=0, containing_type=None, fields=[]),
+  ],
+  serialized_start=722,
+  serialized_end=1067,
+)
+
+_SUMMARY = _descriptor.Descriptor(
+  name='Summary',
+  full_name='tensorboardX.Summary',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='value', full_name='tensorboardX.Summary.value', index=0,
+      number=1, type=11, cpp_type=10, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[_SUMMARY_IMAGE, _SUMMARY_AUDIO, _SUMMARY_VALUE, ],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=449,
+  serialized_end=1067,
+)
+
+_SUMMARYMETADATA_PLUGINDATA.containing_type = _SUMMARYMETADATA
+_SUMMARYMETADATA.fields_by_name['plugin_data'].message_type = _SUMMARYMETADATA_PLUGINDATA
+_SUMMARY_IMAGE.containing_type = _SUMMARY
+_SUMMARY_AUDIO.containing_type = _SUMMARY
+_SUMMARY_VALUE.fields_by_name['metadata'].message_type = _SUMMARYMETADATA
+_SUMMARY_VALUE.fields_by_name['image'].message_type = _SUMMARY_IMAGE
+_SUMMARY_VALUE.fields_by_name['histo'].message_type = _HISTOGRAMPROTO
+_SUMMARY_VALUE.fields_by_name['audio'].message_type = _SUMMARY_AUDIO
+_SUMMARY_VALUE.fields_by_name['tensor'].message_type = tensorboardX_dot_proto_dot_tensor__pb2._TENSORPROTO
+_SUMMARY_VALUE.containing_type = _SUMMARY
+_SUMMARY_VALUE.oneofs_by_name['value'].fields.append(
+  _SUMMARY_VALUE.fields_by_name['simple_value'])
+_SUMMARY_VALUE.fields_by_name['simple_value'].containing_oneof = _SUMMARY_VALUE.oneofs_by_name['value']
+_SUMMARY_VALUE.oneofs_by_name['value'].fields.append(
+  _SUMMARY_VALUE.fields_by_name['obsolete_old_style_histogram'])
+_SUMMARY_VALUE.fields_by_name['obsolete_old_style_histogram'].containing_oneof = _SUMMARY_VALUE.oneofs_by_name['value']
+_SUMMARY_VALUE.oneofs_by_name['value'].fields.append(
+  _SUMMARY_VALUE.fields_by_name['image'])
+_SUMMARY_VALUE.fields_by_name['image'].containing_oneof = _SUMMARY_VALUE.oneofs_by_name['value']
+_SUMMARY_VALUE.oneofs_by_name['value'].fields.append(
+  _SUMMARY_VALUE.fields_by_name['histo'])
+_SUMMARY_VALUE.fields_by_name['histo'].containing_oneof = _SUMMARY_VALUE.oneofs_by_name['value']
+_SUMMARY_VALUE.oneofs_by_name['value'].fields.append(
+  _SUMMARY_VALUE.fields_by_name['audio'])
+_SUMMARY_VALUE.fields_by_name['audio'].containing_oneof = _SUMMARY_VALUE.oneofs_by_name['value']
+_SUMMARY_VALUE.oneofs_by_name['value'].fields.append(
+  _SUMMARY_VALUE.fields_by_name['tensor'])
+_SUMMARY_VALUE.fields_by_name['tensor'].containing_oneof = _SUMMARY_VALUE.oneofs_by_name['value']
+_SUMMARY.fields_by_name['value'].message_type = _SUMMARY_VALUE
+DESCRIPTOR.message_types_by_name['SummaryDescription'] = _SUMMARYDESCRIPTION
+DESCRIPTOR.message_types_by_name['HistogramProto'] = _HISTOGRAMPROTO
+DESCRIPTOR.message_types_by_name['SummaryMetadata'] = _SUMMARYMETADATA
+DESCRIPTOR.message_types_by_name['Summary'] = _SUMMARY
+_sym_db.RegisterFileDescriptor(DESCRIPTOR)
+
+SummaryDescription = _reflection.GeneratedProtocolMessageType('SummaryDescription', (_message.Message,), dict(
+  DESCRIPTOR = _SUMMARYDESCRIPTION,
+  __module__ = 'tensorboardX.proto.summary_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.SummaryDescription)
+  ))
+_sym_db.RegisterMessage(SummaryDescription)
+
+HistogramProto = _reflection.GeneratedProtocolMessageType('HistogramProto', (_message.Message,), dict(
+  DESCRIPTOR = _HISTOGRAMPROTO,
+  __module__ = 'tensorboardX.proto.summary_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.HistogramProto)
+  ))
+_sym_db.RegisterMessage(HistogramProto)
+
+SummaryMetadata = _reflection.GeneratedProtocolMessageType('SummaryMetadata', (_message.Message,), dict(
+
+  PluginData = _reflection.GeneratedProtocolMessageType('PluginData', (_message.Message,), dict(
+    DESCRIPTOR = _SUMMARYMETADATA_PLUGINDATA,
+    __module__ = 'tensorboardX.proto.summary_pb2'
+    # @@protoc_insertion_point(class_scope:tensorboardX.SummaryMetadata.PluginData)
+    ))
+  ,
+  DESCRIPTOR = _SUMMARYMETADATA,
+  __module__ = 'tensorboardX.proto.summary_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.SummaryMetadata)
+  ))
+_sym_db.RegisterMessage(SummaryMetadata)
+_sym_db.RegisterMessage(SummaryMetadata.PluginData)
+
+Summary = _reflection.GeneratedProtocolMessageType('Summary', (_message.Message,), dict(
+
+  Image = _reflection.GeneratedProtocolMessageType('Image', (_message.Message,), dict(
+    DESCRIPTOR = _SUMMARY_IMAGE,
+    __module__ = 'tensorboardX.proto.summary_pb2'
+    # @@protoc_insertion_point(class_scope:tensorboardX.Summary.Image)
+    ))
+  ,
+
+  Audio = _reflection.GeneratedProtocolMessageType('Audio', (_message.Message,), dict(
+    DESCRIPTOR = _SUMMARY_AUDIO,
+    __module__ = 'tensorboardX.proto.summary_pb2'
+    # @@protoc_insertion_point(class_scope:tensorboardX.Summary.Audio)
+    ))
+  ,
+
+  Value = _reflection.GeneratedProtocolMessageType('Value', (_message.Message,), dict(
+    DESCRIPTOR = _SUMMARY_VALUE,
+    __module__ = 'tensorboardX.proto.summary_pb2'
+    # @@protoc_insertion_point(class_scope:tensorboardX.Summary.Value)
+    ))
+  ,
+  DESCRIPTOR = _SUMMARY,
+  __module__ = 'tensorboardX.proto.summary_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.Summary)
+  ))
+_sym_db.RegisterMessage(Summary)
+_sym_db.RegisterMessage(Summary.Image)
+_sym_db.RegisterMessage(Summary.Audio)
+_sym_db.RegisterMessage(Summary.Value)
+
+
+DESCRIPTOR._options = None
+_HISTOGRAMPROTO.fields_by_name['bucket_limit']._options = None
+_HISTOGRAMPROTO.fields_by_name['bucket']._options = None
+# @@protoc_insertion_point(module_scope)
diff --git a/tensorboardX/tensorboardX/proto/tensor.proto b/tensorboardX/tensorboardX/proto/tensor.proto
new file mode 100644
index 0000000000000000000000000000000000000000..b390d397154749ac7d68d874d122320cb6e65722
--- /dev/null
+++ b/tensorboardX/tensorboardX/proto/tensor.proto
@@ -0,0 +1,75 @@
+syntax = "proto3";
+
+package tensorboardX;
+option cc_enable_arenas = true;
+option java_outer_classname = "TensorProtos";
+option java_multiple_files = true;
+option java_package = "org.tensorflow.framework";
+
+import "tensorboardX/proto/resource_handle.proto";
+import "tensorboardX/proto/tensor_shape.proto";
+import "tensorboardX/proto/types.proto";
+
+// Protocol buffer representing a tensor.
+message TensorProto {
+  DataType dtype = 1;
+
+  // Shape of the tensor.  TODO(touts): sort out the 0-rank issues.
+  TensorShapeProto tensor_shape = 2;
+
+  // Only one of the representations below is set: either "tensor_content" or
+  // one of the "xxx_val" attributes.  We are not using oneof because oneofs
+  // cannot contain repeated fields, so it would need an extra set of messages.
+
+  // Version number.
+  //
+  // In version 0, if the "repeated xxx" representations contain only one
+  // element, that element is repeated to fill the shape.  This makes it easy
+  // to represent a constant Tensor with a single value.
+  int32 version_number = 3;
+
+  // Serialized raw tensor content from either Tensor::AsProtoTensorContent or
+  // memcpy in tensorflow::grpc::EncodeTensorToByteBuffer. This representation
+  // can be used for all tensor types. The purpose of this representation is to
+  // reduce serialization overhead during RPC call by avoiding serialization of
+  // many repeated small items.
+  bytes tensor_content = 4;
+
+  // Type specific representations that make it easy to create tensor protos in
+  // all languages.  Only the representation corresponding to "dtype" can
+  // be set.  The values hold the flattened representation of the tensor in
+  // row major order.
+
+  // DT_HALF. Note that since protobuf has no int16 type, we'll have some
+  // pointless zero padding for each value here.
+  repeated int32 half_val = 13 [packed = true];
+
+  // DT_FLOAT.
+  repeated float float_val = 5 [packed = true];
+
+  // DT_DOUBLE.
+  repeated double double_val = 6 [packed = true];
+
+  // DT_INT32, DT_INT16, DT_INT8, DT_UINT8.
+  repeated int32 int_val = 7 [packed = true];
+
+  // DT_STRING
+  repeated bytes string_val = 8;
+
+  // DT_COMPLEX64. scomplex_val(2*i) and scomplex_val(2*i+1) are real
+  // and imaginary parts of i-th single precision complex.
+  repeated float scomplex_val = 9 [packed = true];
+
+  // DT_INT64
+  repeated int64 int64_val = 10 [packed = true];
+
+  // DT_BOOL
+  repeated bool bool_val = 11 [packed = true];
+
+  // DT_COMPLEX128. dcomplex_val(2*i) and dcomplex_val(2*i+1) are real
+  // and imaginary parts of i-th double precision complex.
+  repeated double dcomplex_val = 12 [packed = true];
+
+  // DT_RESOURCE
+  repeated ResourceHandleProto resource_handle_val = 14;
+};
diff --git a/tensorboardX/tensorboardX/proto/tensor_pb2.py b/tensorboardX/tensorboardX/proto/tensor_pb2.py
new file mode 100644
index 0000000000000000000000000000000000000000..95ed1605aae4c94c71b3aa53641b55fb44667e2f
--- /dev/null
+++ b/tensorboardX/tensorboardX/proto/tensor_pb2.py
@@ -0,0 +1,176 @@
+# Generated by the protocol buffer compiler.  DO NOT EDIT!
+# source: tensorboardX/proto/tensor.proto
+
+import sys
+_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1'))
+from google.protobuf import descriptor as _descriptor
+from google.protobuf import message as _message
+from google.protobuf import reflection as _reflection
+from google.protobuf import symbol_database as _symbol_database
+# @@protoc_insertion_point(imports)
+
+_sym_db = _symbol_database.Default()
+
+
+from tensorboardX.proto import resource_handle_pb2 as tensorboardX_dot_proto_dot_resource__handle__pb2
+from tensorboardX.proto import tensor_shape_pb2 as tensorboardX_dot_proto_dot_tensor__shape__pb2
+from tensorboardX.proto import types_pb2 as tensorboardX_dot_proto_dot_types__pb2
+
+
+DESCRIPTOR = _descriptor.FileDescriptor(
+  name='tensorboardX/proto/tensor.proto',
+  package='tensorboardX',
+  syntax='proto3',
+  serialized_options=_b('\n\030org.tensorflow.frameworkB\014TensorProtosP\001\370\001\001'),
+  serialized_pb=_b('\n\x1ftensorboardX/proto/tensor.proto\x12\x0ctensorboardX\x1a(tensorboardX/proto/resource_handle.proto\x1a%tensorboardX/proto/tensor_shape.proto\x1a\x1etensorboardX/proto/types.proto\"\xa9\x03\n\x0bTensorProto\x12%\n\x05\x64type\x18\x01 \x01(\x0e\x32\x16.tensorboardX.DataType\x12\x34\n\x0ctensor_shape\x18\x02 \x01(\x0b\x32\x1e.tensorboardX.TensorShapeProto\x12\x16\n\x0eversion_number\x18\x03 \x01(\x05\x12\x16\n\x0etensor_content\x18\x04 \x01(\x0c\x12\x14\n\x08half_val\x18\r \x03(\x05\x42\x02\x10\x01\x12\x15\n\tfloat_val\x18\x05 \x03(\x02\x42\x02\x10\x01\x12\x16\n\ndouble_val\x18\x06 \x03(\x01\x42\x02\x10\x01\x12\x13\n\x07int_val\x18\x07 \x03(\x05\x42\x02\x10\x01\x12\x12\n\nstring_val\x18\x08 \x03(\x0c\x12\x18\n\x0cscomplex_val\x18\t \x03(\x02\x42\x02\x10\x01\x12\x15\n\tint64_val\x18\n \x03(\x03\x42\x02\x10\x01\x12\x14\n\x08\x62ool_val\x18\x0b \x03(\x08\x42\x02\x10\x01\x12\x18\n\x0c\x64\x63omplex_val\x18\x0c \x03(\x01\x42\x02\x10\x01\x12>\n\x13resource_handle_val\x18\x0e \x03(\x0b\x32!.tensorboardX.ResourceHandleProtoB-\n\x18org.tensorflow.frameworkB\x0cTensorProtosP\x01\xf8\x01\x01\x62\x06proto3')
+  ,
+  dependencies=[tensorboardX_dot_proto_dot_resource__handle__pb2.DESCRIPTOR,tensorboardX_dot_proto_dot_tensor__shape__pb2.DESCRIPTOR,tensorboardX_dot_proto_dot_types__pb2.DESCRIPTOR,])
+
+
+
+
+_TENSORPROTO = _descriptor.Descriptor(
+  name='TensorProto',
+  full_name='tensorboardX.TensorProto',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='dtype', full_name='tensorboardX.TensorProto.dtype', index=0,
+      number=1, type=14, cpp_type=8, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='tensor_shape', full_name='tensorboardX.TensorProto.tensor_shape', index=1,
+      number=2, type=11, cpp_type=10, label=1,
+      has_default_value=False, default_value=None,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='version_number', full_name='tensorboardX.TensorProto.version_number', index=2,
+      number=3, type=5, cpp_type=1, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='tensor_content', full_name='tensorboardX.TensorProto.tensor_content', index=3,
+      number=4, type=12, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b(""),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='half_val', full_name='tensorboardX.TensorProto.half_val', index=4,
+      number=13, type=5, cpp_type=1, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=_b('\020\001'), file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='float_val', full_name='tensorboardX.TensorProto.float_val', index=5,
+      number=5, type=2, cpp_type=6, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=_b('\020\001'), file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='double_val', full_name='tensorboardX.TensorProto.double_val', index=6,
+      number=6, type=1, cpp_type=5, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=_b('\020\001'), file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='int_val', full_name='tensorboardX.TensorProto.int_val', index=7,
+      number=7, type=5, cpp_type=1, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=_b('\020\001'), file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='string_val', full_name='tensorboardX.TensorProto.string_val', index=8,
+      number=8, type=12, cpp_type=9, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='scomplex_val', full_name='tensorboardX.TensorProto.scomplex_val', index=9,
+      number=9, type=2, cpp_type=6, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=_b('\020\001'), file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='int64_val', full_name='tensorboardX.TensorProto.int64_val', index=10,
+      number=10, type=3, cpp_type=2, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=_b('\020\001'), file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='bool_val', full_name='tensorboardX.TensorProto.bool_val', index=11,
+      number=11, type=8, cpp_type=7, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=_b('\020\001'), file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='dcomplex_val', full_name='tensorboardX.TensorProto.dcomplex_val', index=12,
+      number=12, type=1, cpp_type=5, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=_b('\020\001'), file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='resource_handle_val', full_name='tensorboardX.TensorProto.resource_handle_val', index=13,
+      number=14, type=11, cpp_type=10, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=163,
+  serialized_end=588,
+)
+
+_TENSORPROTO.fields_by_name['dtype'].enum_type = tensorboardX_dot_proto_dot_types__pb2._DATATYPE
+_TENSORPROTO.fields_by_name['tensor_shape'].message_type = tensorboardX_dot_proto_dot_tensor__shape__pb2._TENSORSHAPEPROTO
+_TENSORPROTO.fields_by_name['resource_handle_val'].message_type = tensorboardX_dot_proto_dot_resource__handle__pb2._RESOURCEHANDLEPROTO
+DESCRIPTOR.message_types_by_name['TensorProto'] = _TENSORPROTO
+_sym_db.RegisterFileDescriptor(DESCRIPTOR)
+
+TensorProto = _reflection.GeneratedProtocolMessageType('TensorProto', (_message.Message,), dict(
+  DESCRIPTOR = _TENSORPROTO,
+  __module__ = 'tensorboardX.proto.tensor_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.TensorProto)
+  ))
+_sym_db.RegisterMessage(TensorProto)
+
+
+DESCRIPTOR._options = None
+_TENSORPROTO.fields_by_name['half_val']._options = None
+_TENSORPROTO.fields_by_name['float_val']._options = None
+_TENSORPROTO.fields_by_name['double_val']._options = None
+_TENSORPROTO.fields_by_name['int_val']._options = None
+_TENSORPROTO.fields_by_name['scomplex_val']._options = None
+_TENSORPROTO.fields_by_name['int64_val']._options = None
+_TENSORPROTO.fields_by_name['bool_val']._options = None
+_TENSORPROTO.fields_by_name['dcomplex_val']._options = None
+# @@protoc_insertion_point(module_scope)
diff --git a/tensorboardX/tensorboardX/proto/tensor_shape.proto b/tensorboardX/tensorboardX/proto/tensor_shape.proto
new file mode 100644
index 0000000000000000000000000000000000000000..6b86c1f2743b679e87dee1af61afcb09c816fa0f
--- /dev/null
+++ b/tensorboardX/tensorboardX/proto/tensor_shape.proto
@@ -0,0 +1,45 @@
+// Protocol buffer representing the shape of tensors.
+
+syntax = "proto3";
+option cc_enable_arenas = true;
+option java_outer_classname = "TensorShapeProtos";
+option java_multiple_files = true;
+option java_package = "org.tensorflow.framework";
+
+package tensorboardX;
+
+// Dimensions of a tensor.
+message TensorShapeProto {
+  // One dimension of the tensor.
+  message Dim {
+    // Size of the tensor in that dimension.
+    // This value must be >= -1; the value -1 is reserved to mean that the
+    // size of this dimension is unknown.  Certain wrappers that work with
+    // TensorShapeProto may fail at runtime when deserializing a
+    // TensorShapeProto containing a dim value of -1.
+    int64 size = 1;
+
+    // Optional name of the tensor dimension.
+    string name = 2;
+  };
+
+  // Dimensions of the tensor, such as {"input", 30}, {"output", 40}
+  // for a 30 x 40 2D tensor.  If an entry has size -1, this
+  // corresponds to a dimension of unknown size. The names are
+  // optional.
+  //
+  // The order of entries in "dim" matters: it indicates the layout of the
+  // values in the tensor's in-memory representation.
+  //
+  // The first entry in "dim" is the outermost dimension used to lay out the
+  // values; the last entry is the innermost dimension.  This matches the
+  // in-memory layout of RowMajor Eigen tensors.
+  //
+  // If "dim.size()" > 0, "unknown_rank" must be false.
+  repeated Dim dim = 2;
+
+  // If true, the number of dimensions in the shape is unknown.
+  //
+  // If true, "dim.size()" must be 0.
+  bool unknown_rank = 3;
+};
diff --git a/tensorboardX/tensorboardX/proto/tensor_shape_pb2.py b/tensorboardX/tensorboardX/proto/tensor_shape_pb2.py
new file mode 100644
index 0000000000000000000000000000000000000000..176511e23c7f227b470bc4c222fb646ff0baa288
--- /dev/null
+++ b/tensorboardX/tensorboardX/proto/tensor_shape_pb2.py
@@ -0,0 +1,124 @@
+# Generated by the protocol buffer compiler.  DO NOT EDIT!
+# source: tensorboardX/proto/tensor_shape.proto
+
+import sys
+_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1'))
+from google.protobuf import descriptor as _descriptor
+from google.protobuf import message as _message
+from google.protobuf import reflection as _reflection
+from google.protobuf import symbol_database as _symbol_database
+# @@protoc_insertion_point(imports)
+
+_sym_db = _symbol_database.Default()
+
+
+
+
+DESCRIPTOR = _descriptor.FileDescriptor(
+  name='tensorboardX/proto/tensor_shape.proto',
+  package='tensorboardX',
+  syntax='proto3',
+  serialized_options=_b('\n\030org.tensorflow.frameworkB\021TensorShapeProtosP\001\370\001\001'),
+  serialized_pb=_b('\n%tensorboardX/proto/tensor_shape.proto\x12\x0ctensorboardX\"|\n\x10TensorShapeProto\x12/\n\x03\x64im\x18\x02 \x03(\x0b\x32\".tensorboardX.TensorShapeProto.Dim\x12\x14\n\x0cunknown_rank\x18\x03 \x01(\x08\x1a!\n\x03\x44im\x12\x0c\n\x04size\x18\x01 \x01(\x03\x12\x0c\n\x04name\x18\x02 \x01(\tB2\n\x18org.tensorflow.frameworkB\x11TensorShapeProtosP\x01\xf8\x01\x01\x62\x06proto3')
+)
+
+
+
+
+_TENSORSHAPEPROTO_DIM = _descriptor.Descriptor(
+  name='Dim',
+  full_name='tensorboardX.TensorShapeProto.Dim',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='size', full_name='tensorboardX.TensorShapeProto.Dim.size', index=0,
+      number=1, type=3, cpp_type=2, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='name', full_name='tensorboardX.TensorShapeProto.Dim.name', index=1,
+      number=2, type=9, cpp_type=9, label=1,
+      has_default_value=False, default_value=_b("").decode('utf-8'),
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=146,
+  serialized_end=179,
+)
+
+_TENSORSHAPEPROTO = _descriptor.Descriptor(
+  name='TensorShapeProto',
+  full_name='tensorboardX.TensorShapeProto',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='dim', full_name='tensorboardX.TensorShapeProto.dim', index=0,
+      number=2, type=11, cpp_type=10, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='unknown_rank', full_name='tensorboardX.TensorShapeProto.unknown_rank', index=1,
+      number=3, type=8, cpp_type=7, label=1,
+      has_default_value=False, default_value=False,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[_TENSORSHAPEPROTO_DIM, ],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=55,
+  serialized_end=179,
+)
+
+_TENSORSHAPEPROTO_DIM.containing_type = _TENSORSHAPEPROTO
+_TENSORSHAPEPROTO.fields_by_name['dim'].message_type = _TENSORSHAPEPROTO_DIM
+DESCRIPTOR.message_types_by_name['TensorShapeProto'] = _TENSORSHAPEPROTO
+_sym_db.RegisterFileDescriptor(DESCRIPTOR)
+
+TensorShapeProto = _reflection.GeneratedProtocolMessageType('TensorShapeProto', (_message.Message,), dict(
+
+  Dim = _reflection.GeneratedProtocolMessageType('Dim', (_message.Message,), dict(
+    DESCRIPTOR = _TENSORSHAPEPROTO_DIM,
+    __module__ = 'tensorboardX.proto.tensor_shape_pb2'
+    # @@protoc_insertion_point(class_scope:tensorboardX.TensorShapeProto.Dim)
+    ))
+  ,
+  DESCRIPTOR = _TENSORSHAPEPROTO,
+  __module__ = 'tensorboardX.proto.tensor_shape_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.TensorShapeProto)
+  ))
+_sym_db.RegisterMessage(TensorShapeProto)
+_sym_db.RegisterMessage(TensorShapeProto.Dim)
+
+
+DESCRIPTOR._options = None
+# @@protoc_insertion_point(module_scope)
diff --git a/tensorboardX/tensorboardX/proto/types.proto b/tensorboardX/tensorboardX/proto/types.proto
new file mode 100644
index 0000000000000000000000000000000000000000..26f0b6dfabca977672841fd54a81504cd667ab17
--- /dev/null
+++ b/tensorboardX/tensorboardX/proto/types.proto
@@ -0,0 +1,64 @@
+syntax = "proto3";
+
+package tensorboardX;
+option cc_enable_arenas = true;
+option java_outer_classname = "TypesProtos";
+option java_multiple_files = true;
+option java_package = "org.tensorflow.framework";
+
+// LINT.IfChange
+enum DataType {
+  // Not a legal value for DataType.  Used to indicate a DataType field
+  // has not been set.
+  DT_INVALID = 0;
+
+  // Data types that all computation devices are expected to be
+  // capable of supporting.
+  DT_FLOAT = 1;
+  DT_DOUBLE = 2;
+  DT_INT32 = 3;
+  DT_UINT8 = 4;
+  DT_INT16 = 5;
+  DT_INT8 = 6;
+  DT_STRING = 7;
+  DT_COMPLEX64 = 8;  // Single-precision complex
+  DT_INT64 = 9;
+  DT_BOOL = 10;
+  DT_QINT8 = 11;     // Quantized int8
+  DT_QUINT8 = 12;    // Quantized uint8
+  DT_QINT32 = 13;    // Quantized int32
+  DT_BFLOAT16 = 14;  // Float32 truncated to 16 bits.  Only for cast ops.
+  DT_QINT16 = 15;    // Quantized int16
+  DT_QUINT16 = 16;   // Quantized uint16
+  DT_UINT16 = 17;
+  DT_COMPLEX128 = 18;  // Double-precision complex
+  DT_HALF = 19;
+  DT_RESOURCE = 20;
+
+  // TODO(josh11b): DT_GENERIC_PROTO = ??;
+  // TODO(jeff,josh11b): DT_UINT64?  DT_UINT32?
+
+  // Do not use!  These are only for parameters.  Every enum value above
+  // should have a corresponding value below (verified by types_test).
+  DT_FLOAT_REF = 101;
+  DT_DOUBLE_REF = 102;
+  DT_INT32_REF = 103;
+  DT_UINT8_REF = 104;
+  DT_INT16_REF = 105;
+  DT_INT8_REF = 106;
+  DT_STRING_REF = 107;
+  DT_COMPLEX64_REF = 108;
+  DT_INT64_REF = 109;
+  DT_BOOL_REF = 110;
+  DT_QINT8_REF = 111;
+  DT_QUINT8_REF = 112;
+  DT_QINT32_REF = 113;
+  DT_BFLOAT16_REF = 114;
+  DT_QINT16_REF = 115;
+  DT_QUINT16_REF = 116;
+  DT_UINT16_REF = 117;
+  DT_COMPLEX128_REF = 118;
+  DT_HALF_REF = 119;
+  DT_RESOURCE_REF = 120;
+}
+// LINT.ThenChange(https://www.tensorflow.org/code/tensorflow/c/c_api.h,https://www.tensorflow.org/code/tensorflow/go/tensor.go)
diff --git a/tensorboardX/tensorboardX/proto/types_pb2.py b/tensorboardX/tensorboardX/proto/types_pb2.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae544230de46f56f68cba7dbc461a27d4b855631
--- /dev/null
+++ b/tensorboardX/tensorboardX/proto/types_pb2.py
@@ -0,0 +1,253 @@
+# Generated by the protocol buffer compiler.  DO NOT EDIT!
+# source: tensorboardX/proto/types.proto
+
+import sys
+_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1'))
+from google.protobuf.internal import enum_type_wrapper
+from google.protobuf import descriptor as _descriptor
+from google.protobuf import message as _message
+from google.protobuf import reflection as _reflection
+from google.protobuf import symbol_database as _symbol_database
+# @@protoc_insertion_point(imports)
+
+_sym_db = _symbol_database.Default()
+
+
+
+
+DESCRIPTOR = _descriptor.FileDescriptor(
+  name='tensorboardX/proto/types.proto',
+  package='tensorboardX',
+  syntax='proto3',
+  serialized_options=_b('\n\030org.tensorflow.frameworkB\013TypesProtosP\001\370\001\001'),
+  serialized_pb=_b('\n\x1etensorboardX/proto/types.proto\x12\x0ctensorboardX*\xc2\x05\n\x08\x44\x61taType\x12\x0e\n\nDT_INVALID\x10\x00\x12\x0c\n\x08\x44T_FLOAT\x10\x01\x12\r\n\tDT_DOUBLE\x10\x02\x12\x0c\n\x08\x44T_INT32\x10\x03\x12\x0c\n\x08\x44T_UINT8\x10\x04\x12\x0c\n\x08\x44T_INT16\x10\x05\x12\x0b\n\x07\x44T_INT8\x10\x06\x12\r\n\tDT_STRING\x10\x07\x12\x10\n\x0c\x44T_COMPLEX64\x10\x08\x12\x0c\n\x08\x44T_INT64\x10\t\x12\x0b\n\x07\x44T_BOOL\x10\n\x12\x0c\n\x08\x44T_QINT8\x10\x0b\x12\r\n\tDT_QUINT8\x10\x0c\x12\r\n\tDT_QINT32\x10\r\x12\x0f\n\x0b\x44T_BFLOAT16\x10\x0e\x12\r\n\tDT_QINT16\x10\x0f\x12\x0e\n\nDT_QUINT16\x10\x10\x12\r\n\tDT_UINT16\x10\x11\x12\x11\n\rDT_COMPLEX128\x10\x12\x12\x0b\n\x07\x44T_HALF\x10\x13\x12\x0f\n\x0b\x44T_RESOURCE\x10\x14\x12\x10\n\x0c\x44T_FLOAT_REF\x10\x65\x12\x11\n\rDT_DOUBLE_REF\x10\x66\x12\x10\n\x0c\x44T_INT32_REF\x10g\x12\x10\n\x0c\x44T_UINT8_REF\x10h\x12\x10\n\x0c\x44T_INT16_REF\x10i\x12\x0f\n\x0b\x44T_INT8_REF\x10j\x12\x11\n\rDT_STRING_REF\x10k\x12\x14\n\x10\x44T_COMPLEX64_REF\x10l\x12\x10\n\x0c\x44T_INT64_REF\x10m\x12\x0f\n\x0b\x44T_BOOL_REF\x10n\x12\x10\n\x0c\x44T_QINT8_REF\x10o\x12\x11\n\rDT_QUINT8_REF\x10p\x12\x11\n\rDT_QINT32_REF\x10q\x12\x13\n\x0f\x44T_BFLOAT16_REF\x10r\x12\x11\n\rDT_QINT16_REF\x10s\x12\x12\n\x0e\x44T_QUINT16_REF\x10t\x12\x11\n\rDT_UINT16_REF\x10u\x12\x15\n\x11\x44T_COMPLEX128_REF\x10v\x12\x0f\n\x0b\x44T_HALF_REF\x10w\x12\x13\n\x0f\x44T_RESOURCE_REF\x10xB,\n\x18org.tensorflow.frameworkB\x0bTypesProtosP\x01\xf8\x01\x01\x62\x06proto3')
+)
+
+_DATATYPE = _descriptor.EnumDescriptor(
+  name='DataType',
+  full_name='tensorboardX.DataType',
+  filename=None,
+  file=DESCRIPTOR,
+  values=[
+    _descriptor.EnumValueDescriptor(
+      name='DT_INVALID', index=0, number=0,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DT_FLOAT', index=1, number=1,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DT_DOUBLE', index=2, number=2,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DT_INT32', index=3, number=3,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DT_UINT8', index=4, number=4,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DT_INT16', index=5, number=5,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DT_INT8', index=6, number=6,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DT_STRING', index=7, number=7,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DT_COMPLEX64', index=8, number=8,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DT_INT64', index=9, number=9,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DT_BOOL', index=10, number=10,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DT_QINT8', index=11, number=11,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DT_QUINT8', index=12, number=12,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DT_QINT32', index=13, number=13,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DT_BFLOAT16', index=14, number=14,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DT_QINT16', index=15, number=15,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DT_QUINT16', index=16, number=16,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DT_UINT16', index=17, number=17,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DT_COMPLEX128', index=18, number=18,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DT_HALF', index=19, number=19,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DT_RESOURCE', index=20, number=20,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DT_FLOAT_REF', index=21, number=101,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DT_DOUBLE_REF', index=22, number=102,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DT_INT32_REF', index=23, number=103,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DT_UINT8_REF', index=24, number=104,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DT_INT16_REF', index=25, number=105,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DT_INT8_REF', index=26, number=106,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DT_STRING_REF', index=27, number=107,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DT_COMPLEX64_REF', index=28, number=108,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DT_INT64_REF', index=29, number=109,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DT_BOOL_REF', index=30, number=110,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DT_QINT8_REF', index=31, number=111,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DT_QUINT8_REF', index=32, number=112,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DT_QINT32_REF', index=33, number=113,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DT_BFLOAT16_REF', index=34, number=114,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DT_QINT16_REF', index=35, number=115,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DT_QUINT16_REF', index=36, number=116,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DT_UINT16_REF', index=37, number=117,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DT_COMPLEX128_REF', index=38, number=118,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DT_HALF_REF', index=39, number=119,
+      serialized_options=None,
+      type=None),
+    _descriptor.EnumValueDescriptor(
+      name='DT_RESOURCE_REF', index=40, number=120,
+      serialized_options=None,
+      type=None),
+  ],
+  containing_type=None,
+  serialized_options=None,
+  serialized_start=49,
+  serialized_end=755,
+)
+_sym_db.RegisterEnumDescriptor(_DATATYPE)
+
+DataType = enum_type_wrapper.EnumTypeWrapper(_DATATYPE)
+DT_INVALID = 0
+DT_FLOAT = 1
+DT_DOUBLE = 2
+DT_INT32 = 3
+DT_UINT8 = 4
+DT_INT16 = 5
+DT_INT8 = 6
+DT_STRING = 7
+DT_COMPLEX64 = 8
+DT_INT64 = 9
+DT_BOOL = 10
+DT_QINT8 = 11
+DT_QUINT8 = 12
+DT_QINT32 = 13
+DT_BFLOAT16 = 14
+DT_QINT16 = 15
+DT_QUINT16 = 16
+DT_UINT16 = 17
+DT_COMPLEX128 = 18
+DT_HALF = 19
+DT_RESOURCE = 20
+DT_FLOAT_REF = 101
+DT_DOUBLE_REF = 102
+DT_INT32_REF = 103
+DT_UINT8_REF = 104
+DT_INT16_REF = 105
+DT_INT8_REF = 106
+DT_STRING_REF = 107
+DT_COMPLEX64_REF = 108
+DT_INT64_REF = 109
+DT_BOOL_REF = 110
+DT_QINT8_REF = 111
+DT_QUINT8_REF = 112
+DT_QINT32_REF = 113
+DT_BFLOAT16_REF = 114
+DT_QINT16_REF = 115
+DT_QUINT16_REF = 116
+DT_UINT16_REF = 117
+DT_COMPLEX128_REF = 118
+DT_HALF_REF = 119
+DT_RESOURCE_REF = 120
+
+
+DESCRIPTOR.enum_types_by_name['DataType'] = _DATATYPE
+_sym_db.RegisterFileDescriptor(DESCRIPTOR)
+
+
+DESCRIPTOR._options = None
+# @@protoc_insertion_point(module_scope)
diff --git a/tensorboardX/tensorboardX/proto/versions.proto b/tensorboardX/tensorboardX/proto/versions.proto
new file mode 100644
index 0000000000000000000000000000000000000000..e62c503b0c763ae1e165c3af55d61945bfd6a018
--- /dev/null
+++ b/tensorboardX/tensorboardX/proto/versions.proto
@@ -0,0 +1,31 @@
+syntax = "proto3";
+
+package tensorboardX;
+option cc_enable_arenas = true;
+option java_outer_classname = "VersionsProtos";
+option java_multiple_files = true;
+option java_package = "org.tensorflow.framework";
+
+// Version information for a piece of serialized data.
+//
+// There are different types of versions for each type of data
+// (GraphDef, etc.), but they all have the same common shape
+// described here.
+//
+// Each consumer has "consumer" and "min_producer" versions (specified
+// elsewhere).  A consumer may consume this data only if all of these hold:
+//
+//   producer >= min_producer
+//   consumer >= min_consumer
+//   consumer not in bad_consumers
+//
+message VersionDef {
+  // The version of the code that produced this data.
+  int32 producer = 1;
+
+  // Any consumer below this version is not allowed to consume this data.
+  int32 min_consumer = 2;
+
+  // Specific consumer versions which are disallowed (e.g. due to bugs).
+  repeated int32 bad_consumers = 3;
+};
diff --git a/tensorboardX/tensorboardX/proto/versions_pb2.py b/tensorboardX/tensorboardX/proto/versions_pb2.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f2513bcc6a73aa776529dc6d3668eaabcc9e53b
--- /dev/null
+++ b/tensorboardX/tensorboardX/proto/versions_pb2.py
@@ -0,0 +1,84 @@
+# Generated by the protocol buffer compiler.  DO NOT EDIT!
+# source: tensorboardX/proto/versions.proto
+
+import sys
+_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1'))
+from google.protobuf import descriptor as _descriptor
+from google.protobuf import message as _message
+from google.protobuf import reflection as _reflection
+from google.protobuf import symbol_database as _symbol_database
+# @@protoc_insertion_point(imports)
+
+_sym_db = _symbol_database.Default()
+
+
+
+
+DESCRIPTOR = _descriptor.FileDescriptor(
+  name='tensorboardX/proto/versions.proto',
+  package='tensorboardX',
+  syntax='proto3',
+  serialized_options=_b('\n\030org.tensorflow.frameworkB\016VersionsProtosP\001\370\001\001'),
+  serialized_pb=_b('\n!tensorboardX/proto/versions.proto\x12\x0ctensorboardX\"K\n\nVersionDef\x12\x10\n\x08producer\x18\x01 \x01(\x05\x12\x14\n\x0cmin_consumer\x18\x02 \x01(\x05\x12\x15\n\rbad_consumers\x18\x03 \x03(\x05\x42/\n\x18org.tensorflow.frameworkB\x0eVersionsProtosP\x01\xf8\x01\x01\x62\x06proto3')
+)
+
+
+
+
+_VERSIONDEF = _descriptor.Descriptor(
+  name='VersionDef',
+  full_name='tensorboardX.VersionDef',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='producer', full_name='tensorboardX.VersionDef.producer', index=0,
+      number=1, type=5, cpp_type=1, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='min_consumer', full_name='tensorboardX.VersionDef.min_consumer', index=1,
+      number=2, type=5, cpp_type=1, label=1,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+    _descriptor.FieldDescriptor(
+      name='bad_consumers', full_name='tensorboardX.VersionDef.bad_consumers', index=2,
+      number=3, type=5, cpp_type=1, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=51,
+  serialized_end=126,
+)
+
+DESCRIPTOR.message_types_by_name['VersionDef'] = _VERSIONDEF
+_sym_db.RegisterFileDescriptor(DESCRIPTOR)
+
+VersionDef = _reflection.GeneratedProtocolMessageType('VersionDef', (_message.Message,), dict(
+  DESCRIPTOR = _VERSIONDEF,
+  __module__ = 'tensorboardX.proto.versions_pb2'
+  # @@protoc_insertion_point(class_scope:tensorboardX.VersionDef)
+  ))
+_sym_db.RegisterMessage(VersionDef)
+
+
+DESCRIPTOR._options = None
+# @@protoc_insertion_point(module_scope)
diff --git a/tensorboardX/tensorboardX/proto_graph.py b/tensorboardX/tensorboardX/proto_graph.py
new file mode 100644
index 0000000000000000000000000000000000000000..29f9fdce5c42f76050eba2be280407962a6e8a11
--- /dev/null
+++ b/tensorboardX/tensorboardX/proto_graph.py
@@ -0,0 +1,50 @@
+from .proto.graph_pb2 import GraphDef
+from .proto.node_def_pb2 import NodeDef
+from .proto.versions_pb2 import VersionDef
+from .proto.attr_value_pb2 import AttrValue
+from .proto.tensor_shape_pb2 import TensorShapeProto
+
+
+def attr_value_proto(dtype, shape, s):
+    """Build the attribute mapping that is attached to a NodeDef.
+
+    The entries follow
+    https://github.com/tensorflow/tensorboard/blob/master/tensorboard/compat/proto/attr_value.proto
+    and were reverse engineered from data logged by standard TensorBoard.
+
+    Args:
+      dtype: Accepted for signature compatibility; not used here.
+      shape: Iterable of dimension sizes. When not None it is stored under
+        the '_output_shapes' key.
+      s: Text that is UTF-8 encoded and stored under the 'attr' key, or
+        None to omit that key.
+
+    Returns:
+      A dict mapping attribute names to AttrValue messages.
+    """
+    entries = {}
+    if s is not None:
+        encoded = s.encode(encoding='utf_8')
+        entries['attr'] = AttrValue(s=encoded)
+    if shape is not None:
+        shape_list = AttrValue.ListValue(shape=[tensor_shape_proto(shape)])
+        entries['_output_shapes'] = AttrValue(list=shape_list)
+    return entries
+
+
+def tensor_shape_proto(outputsize):
+    """Wrap a sequence of dimension sizes in a TensorShapeProto.
+
+    The message layout follows
+    https://github.com/tensorflow/tensorboard/blob/master/tensorboard/compat/proto/tensor_shape.proto
+
+    Args:
+      outputsize: Iterable of dimension sizes, one entry per dimension.
+
+    Returns:
+      A TensorShapeProto with one Dim per entry of `outputsize`.
+    """
+    dims = []
+    for extent in outputsize:
+        dims.append(TensorShapeProto.Dim(size=extent))
+    return TensorShapeProto(dim=dims)
+
+
+def node_proto(name,
+               op='UnSpecified',
+               input=None,
+               dtype=None,
+               shape=None,  # type: tuple
+               outputsize=None,
+               attributes=''
+               ):
+    """Build a NodeDef message for a single graph node.
+
+    The message layout follows
+    https://github.com/tensorflow/tensorboard/blob/master/tensorboard/compat/proto/node_def.proto
+
+    Args:
+      name: Node name; it is UTF-8 encoded before being stored.
+      op: Operator name stored in the node.
+      input: A list of input names, a single input name, or None for a
+        node without inputs.
+      dtype: Forwarded to attr_value_proto (which does not use it).
+      shape: Not used by this function.
+      outputsize: Dimension sizes recorded as the node's output shape, or
+        None to omit the shape attribute.
+      attributes: Text stored in the node's 'attr' attribute.
+
+    Returns:
+      The populated NodeDef.
+    """
+    if input is None:
+        sources = []
+    elif isinstance(input, list):
+        sources = input
+    else:
+        # A lone input is promoted to a one-element list.
+        sources = [input]
+    encoded_name = name.encode(encoding='utf_8')
+    node_attr = attr_value_proto(dtype, outputsize, attributes)
+    return NodeDef(name=encoded_name, op=op, input=sources, attr=node_attr)
diff --git a/tensorboardX/tensorboardX/pytorch_graph.py b/tensorboardX/tensorboardX/pytorch_graph.py
new file mode 100644
index 0000000000000000000000000000000000000000..febe12af866df7e1b1fc78765a1259e4d45c6c32
--- /dev/null
+++ b/tensorboardX/tensorboardX/pytorch_graph.py
@@ -0,0 +1,289 @@
+import logging
+import time
+from collections import OrderedDict
+from .proto.attr_value_pb2 import AttrValue
+from .proto.graph_pb2 import GraphDef
+from .proto.node_def_pb2 import NodeDef
+from .proto.step_stats_pb2 import RunMetadata, StepStats, DeviceStepStats, NodeExecStats, AllocatorMemoryUsed
+from .proto.tensor_shape_pb2 import TensorShapeProto
+from .proto.versions_pb2 import VersionDef
+from .proto_graph import node_proto
+
+# Names of zero-argument accessors that NodePy calls on a wrapped operator
+# node; each result is copied onto the Python-side object under the same name.
+methods_OP = ['attributeNames', 'hasMultipleOutputs', 'hasUses', 'inputs',
+              'kind', 'outputs', 'outputsSize', 'scopeName']
+# Same idea for input/output/parameter values wrapped by NodePyIO.
+methods_IO = ['node', 'offset', 'debugName']  # 'unique' <int> , 'type' <Tensor<class 'torch._C.Type'>>
+
+# Module-wide switch: parse() sets it to True when calling debugName() on a
+# graph input fails; NodePy then reads names through uniqueName() instead.
+backward_mode = False
+
+class NodeBase(object):
+    """Plain record holding the fields of one graph node.
+
+    Attributes:
+      debugName: Identifier of the node.
+      inputs: Identifiers of the nodes feeding this one.
+      tensor_size: Size of the value produced by the node, if known.
+      kind: Operator type string (taken from `op_type`).
+      attributes: Descriptive attribute text.
+      scope: Scope name; only defined when a non-None `scope` was given.
+    """
+
+    def __init__(self,
+                 debugName=None,
+                 inputs=None,
+                 scope=None,
+                 tensor_size=None,
+                 op_type='UnSpecified',
+                 attributes=''):
+        self.debugName = debugName
+        self.inputs = inputs
+        self.tensor_size = tensor_size
+        self.kind = op_type
+        self.attributes = attributes
+        # `scope` stays undefined (rather than None) when it is not supplied,
+        # so hasattr(node, 'scope') can tell the two cases apart.
+        if scope is not None:
+            self.scope = scope
+
+    def __repr__(self):
+        """List every attribute without a double underscore, one per line."""
+        lines = [str(type(self))]
+        for attr_name in dir(self):
+            if '__' in attr_name:
+                continue
+            lines.append(attr_name + ': ' + str(getattr(self, attr_name)) + str(type(getattr(self, attr_name))))
+        return '\n'.join(lines) + '\n\n'
+
+
+class NodePy(NodeBase):
+    """Python-side copy of a wrapped graph node.
+
+    Every name in `valid_methods` is called on `node_cpp` with no arguments
+    and the result is stored on this object under the same name. For
+    'inputs' and 'outputs' the stored value is the list of value names, and
+    a parallel list of sizes is stored under '<name>tensor_size'
+    (i.e. 'inputstensor_size' / 'outputstensor_size').
+    """
+
+    def __init__(self, node_cpp, valid_methods):
+        super(NodePy, self).__init__(node_cpp)
+        global backward_mode
+        self.inputs = []
+        # Iterate over a copy so the caller's list is never touched.
+        for method_name in valid_methods[:]:
+            if method_name in ('inputs', 'outputs'):
+                names = []
+                sizes = []
+                for value in list(getattr(node_cpp, method_name)()):
+                    # In backward mode names are read through uniqueName().
+                    names.append(value.uniqueName() if backward_mode else value.debugName())
+                    # Sizes are only read for 'CompleteTensorType' values.
+                    if value.type().kind() == 'CompleteTensorType':
+                        sizes.append(value.type().sizes())
+                    else:
+                        sizes.append(None)
+                setattr(self, method_name, names)
+                setattr(self, method_name + 'tensor_size', sizes)
+                continue
+
+            if method_name == 'debugName' and backward_mode:
+                accessor = 'uniqueName'
+            else:
+                accessor = method_name
+            setattr(self, method_name, getattr(node_cpp, accessor)())
+
+
+class NodePyIO(NodePy):
+    """Wrapper for a graph input, output or parameter value.
+
+    `kind` is a purely descriptive string shown in the detailed node
+    information of TensorBoard's graph plugin: 'IO Node' when
+    `input_or_output` is given, 'Parameter' otherwise. (NodePyOP objects get
+    their kind from the wrapped node's kind() method instead.)
+    """
+
+    def __init__(self, node_cpp, input_or_output=None):
+        super(NodePyIO, self).__init__(node_cpp, methods_IO)
+        try:
+            self.tensor_size = node_cpp.type().sizes()
+        except RuntimeError:
+            # sizes() fails when a constant model is used.
+            self.tensor_size = [1, ]
+        if input_or_output:
+            self.input_or_output = input_or_output
+            self.kind = 'IO Node'
+        else:
+            self.kind = 'Parameter'
+
+
+class NodePyOP(NodePy):
+    """Wrapper for an operator node of the traced graph."""
+
+    def __init__(self, node_cpp):
+        super(NodePyOP, self).__init__(node_cpp, methods_OP)
+        attr_map = {}
+        for key in node_cpp.attributeNames():
+            attr_map[key] = node_cpp[key]
+        # Single quotes cause strange behavior in TensorBoard, so they are
+        # replaced by spaces.
+        # TODO: See if we can remove this in the future
+        self.attributes = str(attr_map).replace("'", ' ')
+        self.kind = node_cpp.kind()
+
+
+class GraphPy(object):
+    """Helper class to convert torch.nn.Module to GraphDef proto and visualization
+    with TensorBoard.
+
+    GraphDef generation operates in two passes:
+
+    In the first pass, all nodes are read and saved to two containers.
+    One is for input/output nodes (nodes_io, an ordered mapping keyed by
+    node name), the other is a list of internal operator nodes (nodes_op).
+    Every output of an operator node is additionally recorded in nodes_io as
+    a NodeBase. The first pass also saves all scope names that appeared in
+    the nodes in scope_name_appeared for later processing.
+
+    In the second pass, scope names are fully applied to all nodes.
+    unique_name_to_scoped_name is a mapping from a node's ID to its fully
+    qualified scope name. e.g. Net1/Linear[0]/1. Unfortunately torch.jit
+    doesn't have totally correct scope output, so this is nontrivial. The
+    functions populate_namespace_from_OP_to_IO and find_common_root are used
+    to assign scope names to nodes based on the connections between nodes
+    in a heuristic kind of way. Bookkeeping is done with
+    shallowest_scope_name and scope_name_appeared.
+    """
+    def __init__(self):
+        self.nodes_op = []
+        self.nodes_io = OrderedDict()
+        self.unique_name_to_scoped_name = {}
+        self.shallowest_scope_name = 'default'
+        self.scope_name_appeared = []
+
+    def append(self, x):
+        """Record a node.
+
+        NodePyIO objects are stored in nodes_io under their debugName.
+        NodePyOP objects are stored in nodes_op, and each of their outputs is
+        stored in nodes_io as a NodeBase carrying the operator's inputs,
+        scope, kind and attributes.
+        """
+        if isinstance(x, NodePyIO):
+            self.nodes_io[x.debugName] = x
+        if isinstance(x, NodePyOP):
+            self.nodes_op.append(x)
+            for node_output, outputSize in zip(x.outputs, x.outputstensor_size):
+                self.scope_name_appeared.append(x.scopeName)
+                self.nodes_io[node_output] = NodeBase(node_output,
+                                                      x.inputs,
+                                                      x.scopeName,
+                                                      outputSize,
+                                                      op_type=x.kind,
+                                                      attributes=x.attributes)
+
+    def printall(self):
+        """Print every recorded operator node followed by every nodes_io entry."""
+        print('all nodes')
+        for node in self.nodes_op:
+            print(node)
+        for key in self.nodes_io:
+            print(self.nodes_io[key])
+
+    def find_common_root(self):
+        """Set shallowest_scope_name from the recorded scope names.
+
+        The first path component of every non-empty scope is assigned in
+        turn, so the value that remains is the one of the last non-empty
+        scope; 'default' is kept when there is none.
+        """
+        for fullscope in self.scope_name_appeared:
+            if fullscope:
+                self.shallowest_scope_name = fullscope.split('/')[0]
+
+    def populate_namespace_from_OP_to_IO(self):
+        """Compute scoped names and rewrite nodes_io entries to use them."""
+        # Inputs of an operator are first placed in that operator's scope.
+        for node in self.nodes_op:
+            for input_node_id in node.inputs:
+                self.unique_name_to_scoped_name[input_node_id] = node.scopeName + '/' + input_node_id
+
+        for key, node in self.nodes_io.items():
+            # Exact type check on purpose: only the plain NodeBase records
+            # created for operator outputs qualify, not NodePyIO subclasses.
+            if type(node) == NodeBase:
+                self.unique_name_to_scoped_name[key] = node.scope + '/' + node.debugName
+            if hasattr(node, 'input_or_output'):
+                self.unique_name_to_scoped_name[key] = node.input_or_output + '/' + node.debugName
+            if hasattr(node, 'scope'):
+                # Nodes with an empty scope fall back to the shallowest scope.
+                if node.scope == '' and self.shallowest_scope_name:
+                    self.unique_name_to_scoped_name[node.debugName] = \
+                        self.shallowest_scope_name + '/' + node.debugName
+
+        # replace name
+        for key, node in self.nodes_io.items():
+            self.nodes_io[key].inputs = \
+                [self.unique_name_to_scoped_name[node_input_id] for node_input_id in node.inputs]
+            if node.debugName in self.unique_name_to_scoped_name:
+                self.nodes_io[key].debugName = self.unique_name_to_scoped_name[node.debugName]
+
+    def to_proto(self):
+        """
+        Converts graph representation of GraphPy object to TensorBoard
+        required format.
+
+        Returns:
+          A `(nodes, node_stats)` tuple: one NodeDef per nodes_io entry, and
+          one NodeExecStats per entry that has a non-empty tensor_size.
+        """
+        # TODO: compute correct memory usage and CPU time once
+        # PyTorch supports it
+        import numpy as np
+        nodes = []
+        node_stats = []
+        for v in self.nodes_io.values():
+            nodes.append(node_proto(v.debugName,
+                                    input=v.inputs,
+                                    outputsize=v.tensor_size,
+                                    op=v.kind,
+                                    attributes=v.attributes))
+
+            if v.tensor_size and len(v.tensor_size) > 0:  # assume data is float32, only parameter is counted
+                node_stats.append(
+                    NodeExecStats(node_name=v.debugName,
+                                  # time.time() is in seconds; the field is in
+                                  # microseconds, hence the factor 1e6.
+                                  all_start_micros=int(time.time() * 1e6),
+                                  # Placeholder duration (see TODO above).
+                                  all_end_rel_micros=42,
+                                  memory=[AllocatorMemoryUsed(allocator_name="cpu",
+                                                              total_bytes=int(np.prod(v.tensor_size)) * 4)]))
+
+        return nodes, node_stats
+
+
+# one argument: 'hasAttribute', 'hasAttributes',
+def parse(graph, args=None, omit_useless_nodes=True):
+    """This method parses an optimized PyTorch model graph and produces
+    a list of nodes and node stats for eventual conversion to TensorBoard
+    protobuf format.
+
+    Args:
+      graph: The traced graph to be parsed; it must provide `inputs()`,
+        `nodes()` and `outputs()`.
+      args (tuple): input tensor[s] for the model. A single tensor counts
+        as one input; None counts as no inputs.
+      omit_useless_nodes (boolean): Whether to skip graph inputs that have
+        no users.
+
+    Returns:
+      The `(nodes, node_stats)` tuple produced by `GraphPy.to_proto()`.
+    """
+    import torch
+    global backward_mode
+
+    # The first `n_inputs` graph inputs are labelled as model inputs, the
+    # remaining ones as parameters.
+    if args is None:
+        n_inputs = 0
+    elif isinstance(args, torch.Tensor):
+        # len() of a tensor is the size of its first dimension, not the
+        # number of model inputs.
+        n_inputs = 1
+    else:
+        n_inputs = len(args)
+
+    nodes_py = GraphPy()
+    for i, node in enumerate(graph.inputs()):
+        if not backward_mode:
+            try:
+                node.debugName()
+            except Exception:
+                # debugName() is unavailable: switch the module to reading
+                # names through uniqueName().
+                backward_mode = True
+        if omit_useless_nodes:
+            if len(node.uses()) == 0:  # number of user of the node (= number of outputs/ fanout)
+                continue
+
+        if i < n_inputs:
+            nodes_py.append(NodePyIO(node, 'input'))
+        else:
+            nodes_py.append(NodePyIO(node))  # parameter
+
+    for node in graph.nodes():
+        nodes_py.append(NodePyOP(node))
+
+    for node in graph.outputs():  # must place last.
+        # NOTE(review): the wrapper is built but never appended to nodes_py,
+        # so output values only appear through the operator that produces
+        # them. Kept as is; confirm whether this is intended.
+        NodePyIO(node, 'output')
+    nodes_py.find_common_root()
+    nodes_py.populate_namespace_from_OP_to_IO()
+    return nodes_py.to_proto()
+
+
+def graph(model, args, verbose=False, **kwargs):
+    """
+    Trace a PyTorch model and build the `GraphDef` proto (plus run metadata)
+    that can be logged to TensorBoard.
+
+    Args:
+      model (PyTorch module): The model to be parsed.
+      args (tuple): input tensor[s] for the model.
+      verbose (bool): Whether to print the traced graph while processing.
+      **kwargs: Accepted but not used.
+
+    Returns:
+      A `(GraphDef, RunMetadata)` tuple.
+
+    Raises:
+      RuntimeError: re-raised from tracing after the error has been printed.
+    """
+    import torch
+
+    with torch.onnx.set_training(model, False):  # TODO: move outside of torch.onnx
+        try:
+            traced_graph = torch.jit.trace(model, args).graph
+        except RuntimeError as e:
+            print(e)
+            print('Error occurs, No graph saved')
+            raise e
+
+    if verbose:
+        print(traced_graph)
+    list_of_nodes, node_stats = parse(traced_graph, args)
+
+    # We are hardcoding that this was run on CPU even though it might have actually
+    # run on GPU. Note this is what is shown in TensorBoard and has no bearing
+    # on actual execution.
+    # TODO: See if we can extract GPU vs CPU information from the PyTorch model
+    # and pass it correctly to TensorBoard.
+    #
+    # Definition of StepStats and DeviceStepStats can be found at
+    # https://github.com/tensorflow/tensorboard/blob/master/tensorboard/plugins/graph/tf_graph_common/test/graph-test.ts
+    # and
+    # https://github.com/tensorflow/tensorboard/blob/master/tensorboard/compat/proto/step_stats.proto
+    device_stats = DeviceStepStats(device="/device:CPU:0", node_stats=node_stats)
+    stepstats = RunMetadata(step_stats=StepStats(dev_stats=[device_stats]))
+
+    # Create an object matching
+    # https://github.com/tensorflow/tensorboard/blob/master/tensorboard/compat/proto/graph.proto
+    # The producer version has been reverse engineered from standard
+    # TensorBoard logged data.
+    graph_def = GraphDef(node=list_of_nodes, versions=VersionDef(producer=22))
+    return graph_def, stepstats
diff --git a/tensorboardX/tensorboardX/record_writer.py b/tensorboardX/tensorboardX/record_writer.py
new file mode 100644
index 0000000000000000000000000000000000000000..88442ccd8e274a910d3a591950287b86fb39b639
--- /dev/null
+++ b/tensorboardX/tensorboardX/record_writer.py
@@ -0,0 +1,143 @@
+"""
+Writes tf_record data into a file. Here it is used for writing TensorBoard event files.
+The code was borrowed from https://github.com/TeamHG-Memex/tensorboard_logger
+"""
+
+import copy
+import io
+import os.path
+import re
+import struct
+try:
+    import boto3
+    S3_ENABLED = True
+except ImportError:
+    S3_ENABLED = False
+
+from .crc32c import crc32c
+
+
+# Used by make_valid_tf_name(): a name must start with a character matched by
+# _VALID_OP_NAME_START, and only the runs matched by _VALID_OP_NAME_PART are
+# kept when the name is rebuilt.
+_VALID_OP_NAME_START = re.compile('^[A-Za-z0-9.]')
+_VALID_OP_NAME_PART = re.compile('[A-Za-z0-9_.\\-/]+')
+
+# Registry of writer factories by prefix backends.
+#
+# Currently supports "s3://" URLs for S3 based on boto and falls
+# back to local filesystem.
+#
+# Keys are the text before the first ':' of a path (e.g. "s3").
+REGISTERED_FACTORIES = {}
+
+
+def register_writer_factory(prefix, factory):
+    """Register `factory` as the writer backend for paths starting with `prefix`.
+
+    Args:
+      prefix: Scheme name without the colon, e.g. "s3".
+      factory: Object providing `open(path)` and `directory_check(path)`.
+
+    Raises:
+      ValueError: If `prefix` contains a ':'.
+    """
+    if ':' not in prefix:
+        REGISTERED_FACTORIES[prefix] = factory
+        return
+    raise ValueError('prefix cannot contain a :')
+
+
+def directory_check(path):
+    '''Initialize the directory for log files.
+
+    If a writer factory is registered for the prefix of `path` (the text
+    before the first ':'), the check is delegated to it and its result is
+    returned. Otherwise `path` is treated as a local directory and created
+    if it does not exist yet.
+    '''
+    prefix = path.split(':')[0]
+    # Only the registry lookup decides whether a factory exists; a KeyError
+    # raised inside the factory itself must not be mistaken for "no factory".
+    if prefix in REGISTERED_FACTORIES:
+        return REGISTERED_FACTORIES[prefix].directory_check(path)
+
+    if not os.path.exists(path):
+        try:
+            os.makedirs(path)
+        except OSError:
+            # Another process may have created the directory in the meantime.
+            if not os.path.isdir(path):
+                raise
+
+
+def open_file(path):
+    '''Open a writer for outputting event files.
+
+    If a writer factory is registered for the prefix of `path` (the text
+    before the first ':'), the writer is created by that factory. Otherwise
+    `path` is opened as a local file in binary write mode.
+    '''
+    prefix = path.split(':')[0]
+    # Only the registry lookup decides whether a factory exists; a KeyError
+    # raised inside factory.open() must propagate instead of silently
+    # falling back to a local file.
+    if prefix in REGISTERED_FACTORIES:
+        return REGISTERED_FACTORIES[prefix].open(path)
+    return open(path, 'wb')
+
+
+class S3RecordWriter(object):
+    """Writes tensorboard protocol buffer files to S3.
+
+    Written data is accumulated in an in-memory buffer; every flush()
+    uploads the whole buffer to the object named by `path`.
+    """
+
+    def __init__(self, path):
+        """Create a writer for `path` ("s3://bucket/key" or "bucket/key").
+
+        Raises:
+          ImportError: If boto3 could not be imported.
+        """
+        if not S3_ENABLED:
+            raise ImportError("boto3 must be installed for S3 support.")
+        self.path = path
+        self.buffer = io.BytesIO()
+
+    def __del__(self):
+        # __del__ also runs for objects whose __init__ raised; those have no
+        # buffer and nothing to upload.
+        if getattr(self, 'buffer', None) is not None:
+            self.close()
+
+    def bucket_and_path(self):
+        """Split `self.path` into `(bucket, key)`; a leading "s3://" is ignored."""
+        path = self.path
+        if path.startswith("s3://"):
+            path = path[len("s3://"):]
+        bp = path.split("/")
+        bucket = bp[0]
+        # Drop the bucket name and the '/' that follows it.
+        path = path[1 + len(bucket):]
+        return bucket, path
+
+    def write(self, val):
+        """Append `val` (bytes) to the in-memory buffer."""
+        self.buffer.write(val)
+
+    def flush(self):
+        """Upload everything written so far."""
+        s3 = boto3.client('s3')
+        bucket, path = self.bucket_and_path()
+        # Upload from an independent snapshot positioned at the start, so
+        # the write position of the live buffer is left untouched.
+        upload_buffer = io.BytesIO(self.buffer.getvalue())
+        s3.upload_fileobj(upload_buffer, bucket, path)
+
+    def close(self):
+        """Upload the buffered data; the buffer itself stays usable."""
+        self.flush()
+
+
+class S3RecordWriterFactory(object):
+    """Factory for event protocol buffer files to S3."""
+
+    def open(self, path):
+        """Return a new S3RecordWriter for `path`."""
+        writer = S3RecordWriter(path)
+        return writer
+
+    def directory_check(self, path):
+        """Do nothing: S3 needs no directory to exist before files are added."""
+        return None
+
+
+# Paths whose text before the first ':' is "s3" (e.g. "s3://bucket/key") are
+# handled by the S3 factory.
+register_writer_factory("s3", S3RecordWriterFactory())
+
+
+class RecordWriter(object):
+    """Frames byte strings as records and writes them to `path`.
+
+    Each record is laid out as: 8-byte length, 4-byte masked CRC32C of the
+    length, the payload, and a 4-byte masked CRC32C of the payload.
+    """
+
+    def __init__(self, path):
+        """Open the underlying writer for `path` via open_file()."""
+        self._name_to_tf_name = {}
+        self._tf_names = set()
+        self.path = path
+        self._writer = None
+        self._writer = open_file(path)
+
+    def write(self, data):
+        """Write `data` (bytes) as one framed record."""
+        w = self._writer.write
+        # The integers are packed little-endian explicitly so the file does
+        # not depend on the byte order of the machine that writes it.
+        header = struct.pack('<Q', len(data))
+        w(header)
+        w(struct.pack('<I', masked_crc32c(header)))
+        w(data)
+        w(struct.pack('<I', masked_crc32c(data)))
+
+    def flush(self):
+        """Flush the underlying writer."""
+        self._writer.flush()
+
+    def close(self):
+        """Close the underlying writer."""
+        self._writer.close()
+
+
+def masked_crc32c(data):
+    """Return the masked CRC32C of `data` as an unsigned 32-bit integer.
+
+    The CRC is rotated right by 15 bits and a constant is added, all
+    modulo 2**32.
+    """
+    checksum = u32(crc32c(data))
+    rotated = (checksum >> 15) | u32(checksum << 17)
+    return u32(rotated + 0xa282ead8)
+
+
+def u32(x):
+    """Keep only the low 32 bits of `x`."""
+    low_32_bits = 0xffffffff
+    return x & low_32_bits
+
+
+def make_valid_tf_name(name):
+    """Turn `name` into a string made only of allowed op-name characters.
+
+    A '.' is prepended when the first character is not allowed at the start
+    of a name; after that, the runs of allowed characters are joined with
+    '_' in place of whatever separated them.
+    """
+    if _VALID_OP_NAME_START.match(name) is None:
+        # Must make it valid somehow, but don't want to remove stuff
+        name = '.' + name
+    pieces = _VALID_OP_NAME_PART.findall(name)
+    return '_'.join(pieces)
diff --git a/tensorboardX/tensorboardX/summary.py b/tensorboardX/tensorboardX/summary.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5bf234a3b872b776e7ac28a9744deeaea00019b
--- /dev/null
+++ b/tensorboardX/tensorboardX/summary.py
@@ -0,0 +1,544 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import logging
+import numpy as np
+import os
+import re as _re
+
+# pylint: disable=unused-import
+from six.moves import range
+
+from .proto.summary_pb2 import Summary
+from .proto.summary_pb2 import HistogramProto
+from .proto.summary_pb2 import SummaryMetadata
+from .proto.tensor_pb2 import TensorProto
+from .proto.tensor_shape_pb2 import TensorShapeProto
+from .proto.plugin_pr_curve_pb2 import PrCurvePluginData
+from .proto.plugin_text_pb2 import TextPluginData
+from .proto.plugin_mesh_pb2 import MeshPluginData
+from .proto import layout_pb2
+from .x2num import make_np
+from .utils import _prepare_video, convert_to_HWC
+
+# Matches every character that is not a word character, '-', '/' or '.';
+# _clean_tag() replaces each match with '_'.
+_INVALID_TAG_CHARACTERS = _re.compile(r'[^-/\w\.]')
+
+
+def _clean_tag(name):
+    """Return `name` with illegal characters replaced and leading slashes removed.
+
+    In the past, the first argument to summary ops was a tag, which allowed
+    arbitrary characters. Now the first argument is the node name, for which
+    a much smaller set of characters is allowed. Every illegal character is
+    replaced with '_', leading slashes are stripped, and a message is logged
+    whenever the name had to be changed. None is returned unchanged.
+    """
+    if name is None:
+        return name
+    cleaned = _INVALID_TAG_CHARACTERS.sub('_', name).lstrip('/')
+    if cleaned == name:
+        return name
+    logging.info(
+        'Summary name %s is illegal; using %s instead.' % (name, cleaned))
+    return cleaned
+
+
+def _draw_single_box(image, xmin, ymin, xmax, ymax, display_str, color='black', color_text='black', thickness=2):
+    """Draw one rectangle, optionally with a caption, onto a PIL image.
+
+    Args:
+      image: PIL image that is drawn on in place.
+      xmin, ymin, xmax, ymax: Corner coordinates of the box.
+      display_str: Caption drawn on a filled background at the bottom-left
+        corner of the box; nothing is drawn when it is empty or None.
+      color: Color of the box outline and of the caption background.
+      color_text: Color of the caption text.
+      thickness: Line width of the outline.
+
+    Returns:
+      The same `image` object.
+    """
+    from PIL import ImageDraw, ImageFont
+    font = ImageFont.load_default()
+    draw = ImageDraw.Draw(image)
+    (left, right, top, bottom) = (xmin, xmax, ymin, ymax)
+    draw.line([(left, top), (left, bottom), (right, bottom),
+               (right, top), (left, top)], width=thickness, fill=color)
+    if display_str:
+        text_bottom = bottom
+        if hasattr(font, 'getsize'):
+            text_width, text_height = font.getsize(display_str)
+        else:
+            # Pillow 10 removed getsize(); the right/bottom edges of the
+            # bounding box give the extent of the drawn text.
+            bbox = font.getbbox(display_str)
+            text_width, text_height = bbox[2], bbox[3]
+        margin = np.ceil(0.05 * text_height)
+        # Filled background first, then the caption on top of it.
+        draw.rectangle(
+            [(left, text_bottom - text_height - 2 * margin),
+             (left + text_width, text_bottom)], fill=color
+        )
+        draw.text(
+            (left + margin, text_bottom - text_height - margin),
+            display_str, fill=color_text, font=font
+        )
+    return image
+
+
+def hparams(hparam_dict=None, metric_dict=None):
+    """Build the three summaries used by the hparams plugin.
+
+    Args:
+      hparam_dict: Mapping from hyperparameter name to value. Strings and
+        bools are stored as such; every other value is converted with
+        make_np() and its first element is stored as a number. None is
+        treated as an empty mapping.
+      metric_dict: Mapping whose keys are the metric tags to announce. Only
+        the keys are used here. None is treated as an empty mapping.
+
+    Returns:
+      A tuple `(experiment, session_start_info, session_end_info)` of
+      `Summary` protocol buffers.
+    """
+    from tensorboardX.proto.plugin_hparams_pb2 import HParamsPluginData, SessionEndInfo, SessionStartInfo
+    from tensorboardX.proto.api_pb2 import Experiment, HParamInfo, MetricInfo, MetricName, Status
+    from six import string_types
+
+    PLUGIN_NAME = 'hparams'
+    PLUGIN_DATA_VERSION = 0
+
+    EXPERIMENT_TAG = '_hparams_/experiment'
+    SESSION_START_INFO_TAG = '_hparams_/session_start_info'
+    SESSION_END_INFO_TAG = '_hparams_/session_end_info'
+
+    if hparam_dict is None:
+        hparam_dict = {}
+    if metric_dict is None:
+        metric_dict = {}
+
+    # TODO: expose other parameters in the future.
+    # hp = HParamInfo(name='lr',display_name='learning rate', type=DataType.DATA_TYPE_FLOAT64, domain_interval=Interval(min_value=10, max_value=100))  # noqa E501
+    # mt = MetricInfo(name=MetricName(tag='accuracy'), display_name='accuracy', description='', dataset_type=DatasetType.DATASET_VALIDATION)  # noqa E501
+    # exp = Experiment(name='123', description='456', time_created_secs=100.0, hparam_infos=[hp], metric_infos=[mt], user='tw')  # noqa E501
+
+    hps = [HParamInfo(name=k) for k in hparam_dict.keys()]
+    mts = [MetricInfo(name=MetricName(tag=k)) for k in metric_dict.keys()]
+
+    exp = Experiment(hparam_infos=hps, metric_infos=mts)
+
+    content = HParamsPluginData(experiment=exp, version=PLUGIN_DATA_VERSION)
+    smd = SummaryMetadata(plugin_data=SummaryMetadata.PluginData(plugin_name=PLUGIN_NAME,
+                                                                 content=content.SerializeToString()))
+    exp = Summary(value=[Summary.Value(tag=EXPERIMENT_TAG, metadata=smd)])
+
+    ssi = SessionStartInfo()
+    for k, v in hparam_dict.items():
+        if isinstance(v, string_types):
+            ssi.hparams[k].string_value = v
+        elif isinstance(v, bool):
+            # Checked before the numeric branch because bool is an int.
+            ssi.hparams[k].bool_value = v
+        else:
+            ssi.hparams[k].number_value = make_np(v)[0]
+
+    content = HParamsPluginData(session_start_info=ssi, version=PLUGIN_DATA_VERSION)
+    smd = SummaryMetadata(plugin_data=SummaryMetadata.PluginData(plugin_name=PLUGIN_NAME,
+                                                                 content=content.SerializeToString()))
+    ssi = Summary(value=[Summary.Value(tag=SESSION_START_INFO_TAG, metadata=smd)])
+
+    sei = SessionEndInfo(status=Status.STATUS_SUCCESS)
+    content = HParamsPluginData(session_end_info=sei, version=PLUGIN_DATA_VERSION)
+    smd = SummaryMetadata(plugin_data=SummaryMetadata.PluginData(plugin_name=PLUGIN_NAME,
+                                                                 content=content.SerializeToString()))
+    sei = Summary(value=[Summary.Value(tag=SESSION_END_INFO_TAG, metadata=smd)])
+
+    return exp, ssi, sei
+
+
+def scalar(name, scalar, collections=None):
+    """Outputs a `Summary` protocol buffer containing a single scalar value.
+
+    Args:
+      name: A name for the generated node. Will also serve as the series name in
+        TensorBoard. It is cleaned with `_clean_tag` first.
+      scalar: A real numeric value holding exactly one element; anything
+        accepted by `make_np`.
+      collections: Not used.
+    Returns:
+      A `Summary` protobuf whose single value stores the scalar as
+      `simple_value`.
+    Raises:
+      AssertionError: If `scalar` holds more than one element.
+    """
+    name = _clean_tag(name)
+    scalar = make_np(scalar)
+    squeezed = scalar.squeeze()
+    assert(squeezed.ndim == 0), 'scalar should be 0D'
+    # Convert the 0-d value: float() on an array that still has dimensions
+    # is deprecated in NumPy.
+    scalar = float(squeezed)
+    return Summary(value=[Summary.Value(tag=name, simple_value=scalar)])
+
+
+def histogram_raw(name, min, max, num, sum, sum_squares, bucket_limits, bucket_counts):
+    # pylint: disable=line-too-long
+    """Outputs a `Summary` protocol buffer with a histogram built from
+    precomputed statistics.
+    The generated
+    [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
+    has one summary value containing the histogram. `name` is used as the
+    tag as given.
+    Args:
+      name: A name for the generated node. Will also serve as a series name in
+        TensorBoard.
+      min: A float or int min value
+      max: A float or int max value
+      num: Int number of values
+      sum: Float or int sum of all values
+      sum_squares: Float or int sum of squares for all values
+      bucket_limits: Upper value per bucket
+      bucket_counts: Number of values per bucket
+    Returns:
+      The `Summary` protocol buffer.
+    """
+    fields = dict(min=min,
+                  max=max,
+                  num=num,
+                  sum=sum,
+                  sum_squares=sum_squares,
+                  bucket_limit=bucket_limits,
+                  bucket=bucket_counts)
+    hist = HistogramProto(**fields)
+    value = Summary.Value(tag=name, histo=hist)
+    return Summary(value=[value])
+
+
+def histogram(name, values, bins, max_bins=None):
+    # pylint: disable=line-too-long
+    """Outputs a `Summary` protocol buffer with a histogram of `values`.
+    The generated
+    [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
+    has one summary value containing a histogram for `values`.
+    Args:
+      name: A name for the generated node. Will also serve as a series name in
+        TensorBoard. It is cleaned with `_clean_tag` first.
+      values: Real numeric data of any shape; anything accepted by
+        `make_np`. Values to use to build the histogram.
+      bins: Bin specification forwarded to `make_histogram`.
+      max_bins: Optional upper bound on the number of bins, forwarded to
+        `make_histogram`.
+    Returns:
+      The `Summary` protocol buffer.
+    """
+    tag = _clean_tag(name)
+    data = make_np(values).astype(float)
+    hist = make_histogram(data, bins, max_bins)
+    value = Summary.Value(tag=tag, histo=hist)
+    return Summary(value=[value])
+
+
+def make_histogram(values, bins, max_bins=None):
+    """Convert values into a histogram proto using logic from histogram.cc.
+
+    Args:
+      values: Non-empty numpy array of any shape; it is flattened.
+      bins: Bin specification passed to `np.histogram`.
+      max_bins: When given and exceeded, neighbouring bins are merged in
+        groups of `num_bins // max_bins`.
+
+    Returns:
+      A `HistogramProto` restricted to the range of non-empty bins.
+
+    Raises:
+      ValueError: If `values` has no element, or if the resulting histogram
+        is empty.
+    """
+    if values.size == 0:
+        raise ValueError('The input has no element.')
+    values = values.reshape(-1)
+    counts, limits = np.histogram(values, bins=bins)
+    num_bins = len(counts)
+    if max_bins is not None and num_bins > max_bins:
+        subsampling = num_bins // max_bins
+        subsampling_remainder = num_bins % subsampling
+        if subsampling_remainder != 0:
+            # Pad with empty bins so the count is a multiple of the group size.
+            counts = np.pad(counts, pad_width=[[0, subsampling - subsampling_remainder]],
+                            mode="constant", constant_values=0)
+        counts = counts.reshape(-1, subsampling).sum(axis=-1)
+        new_limits = np.empty((counts.size + 1,), limits.dtype)
+        new_limits[:-1] = limits[:-1:subsampling]
+        new_limits[-1] = limits[-1]
+        limits = new_limits
+
+    # Find the first and the last bin defining the support of the histogram:
+    # (the comparison yields booleans; cumsum turns them into running counts.
+    # Requesting an integer dtype from np.greater itself is rejected by
+    # recent NumPy versions.)
+    cum_counts = np.cumsum(np.greater(counts, 0))
+    start, end = np.searchsorted(cum_counts, [0, cum_counts[-1] - 1], side="right")
+    start = int(start)
+    end = int(end) + 1
+    del cum_counts
+
+    # TensorBoard only includes the right bin limits. To still have the leftmost limit
+    # included, we include an empty bin left.
+    # If start == 0, we need to add an empty one left, otherwise we can just include the bin left to the
+    # first nonzero-count bin:
+    counts = counts[start - 1:end] if start > 0 else np.concatenate([[0], counts[:end]])
+    limits = limits[start:end + 1]
+
+    if counts.size == 0 or limits.size == 0:
+        raise ValueError('The histogram is empty, please file a bug report.')
+
+    sum_sq = values.dot(values)
+    return HistogramProto(min=values.min(),
+                          max=values.max(),
+                          num=len(values),
+                          sum=values.sum(),
+                          sum_squares=sum_sq,
+                          bucket_limit=limits.tolist(),
+                          bucket=counts.tolist())
+
+
+def image(tag, tensor, rescale=1, dataformats='CHW'):
+    """Outputs a `Summary` protocol buffer with one image.
+
+    Args:
+      tag: A name for the generated node. Will also serve as a series name in
+        TensorBoard. It is cleaned with `_clean_tag` first.
+      tensor: Image data; anything accepted by `make_np`. It is rearranged
+        by `convert_to_HWC` according to `dataformats`.
+        Data of dtype `uint8` is used as is; any other dtype is multiplied
+        by 255 and cast to `uint8`, i.e. it is expected to hold values in
+        [0, 1].
+      rescale: Scale factor forwarded to `make_image`.
+      dataformats: Layout of `tensor`, e.g. 'CHW'.
+    Returns:
+      The `Summary` protocol buffer.
+    """
+    tag = _clean_tag(tag)
+    pixels = convert_to_HWC(make_np(tensor), dataformats)
+    # Do not assume that user passes in values in [0, 255], use data type to detect
+    if pixels.dtype != np.uint8:
+        pixels = (pixels * 255.0).astype(np.uint8)
+
+    encoded = make_image(pixels, rescale=rescale)
+    value = Summary.Value(tag=tag, image=encoded)
+    return Summary(value=[value])
+
+
+def image_boxes(tag, tensor_image, tensor_boxes, rescale=1, dataformats='CHW', labels=None):
+    '''Outputs a `Summary` protocol buffer with one image and boxes drawn on it.
+
+    Args:
+      tag: Tag of the summary value; used as given.
+      tensor_image: Image data; anything accepted by `make_np`, rearranged
+        by `convert_to_HWC` according to `dataformats`. Non-`uint8` data is
+        multiplied by 255 and cast to `uint8`.
+      tensor_boxes: Boxes forwarded to `make_image` as `rois`.
+      rescale: Scale factor forwarded to `make_image`.
+      dataformats: Layout of `tensor_image`, e.g. 'CHW'.
+      labels: Optional captions, one per box.
+    Returns:
+      The `Summary` protocol buffer.
+    '''
+    pixels = convert_to_HWC(make_np(tensor_image), dataformats)
+    boxes = make_np(tensor_boxes)
+
+    if pixels.dtype != np.uint8:
+        pixels = (pixels * 255.0).astype(np.uint8)
+
+    encoded = make_image(pixels, rescale=rescale, rois=boxes, labels=labels)
+    value = Summary.Value(tag=tag, image=encoded)
+    return Summary(value=[value])
+
+
+def draw_boxes(disp_image, boxes, labels=None):
+    """Draw every row of `boxes` onto `disp_image` in red.
+
+    Args:
+      disp_image: Image handed to `_draw_single_box`.
+      boxes: 2-D array; columns 0-3 of each row are xmin, ymin, xmax, ymax.
+      labels: Optional sequence with one caption per box.
+
+    Returns:
+      The image returned by the last `_draw_single_box` call (or
+      `disp_image` itself when there are no boxes).
+    """
+    # xyxy format
+    for idx in range(boxes.shape[0]):
+        xmin, ymin, xmax, ymax = boxes[idx, 0], boxes[idx, 1], boxes[idx, 2], boxes[idx, 3]
+        caption = labels[idx] if labels is not None else None
+        disp_image = _draw_single_box(disp_image, xmin, ymin, xmax, ymax,
+                                      display_str=caption,
+                                      color='Red')
+    return disp_image
+
+
+def make_image(tensor, rescale=1, rois=None, labels=None):
+    """Convert an numpy representation image to Image protobuf
+
+    Args:
+      tensor: Array of shape (height, width, channel) accepted by
+        `PIL.Image.fromarray`.
+      rescale: Factor applied to height and width before encoding.
+      rois: Optional boxes drawn with `draw_boxes` before resizing.
+      labels: Optional captions for `rois`.
+
+    Returns:
+      A `Summary.Image` holding the PNG-encoded image. Its `height` and
+      `width` fields carry the dimensions of `tensor`, i.e. before
+      rescaling.
+    """
+    from PIL import Image
+    import io
+    height, width, channel = tensor.shape
+    scaled_height = int(height * rescale)
+    scaled_width = int(width * rescale)
+    image = Image.fromarray(tensor)
+    if rois is not None:
+        image = draw_boxes(image, rois, labels=labels)
+    # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter
+    # under its current name.
+    resample = getattr(Image, 'LANCZOS', None)
+    if resample is None:
+        resample = Image.ANTIALIAS
+    image = image.resize((scaled_width, scaled_height), resample)
+    output = io.BytesIO()
+    image.save(output, format='PNG')
+    image_string = output.getvalue()
+    output.close()
+    return Summary.Image(height=height,
+                         width=width,
+                         colorspace=channel,
+                         encoded_image_string=image_string)
+
+
+def video(tag, tensor, fps=4):
+    """Output a `Summary` protocol buffer holding a batch of videos as one GIF.
+
+    Args:
+      tag: Name for the value; sanitised with `_clean_tag`.
+      tensor: Video batch; converted with `make_np` and tiled into a single
+        frame sequence by `_prepare_video`, which unpacks five dimensions
+        (batch, time, channel, height, width).
+      fps: Frames per second handed to `make_video`.
+    Returns:
+      A `Summary` whose single value carries the encoded video in its
+      `image` field.
+    """
+    tag = _clean_tag(tag)
+    tensor = make_np(tensor)
+    tensor = _prepare_video(tensor)
+    # If user passes in uint8, then we don't need to rescale by 255.
+    # Otherwise scale and clip, so out-of-range values saturate instead of
+    # wrapping around in the uint8 cast.
+    if tensor.dtype != np.uint8:
+        tensor = np.clip(tensor * 255.0, 0, 255).astype(np.uint8)
+
+    encoded = make_video(tensor, fps)
+    return Summary(value=[Summary.Value(tag=tag, image=encoded)])
+
+
+def make_video(tensor, fps):
+    """Encode a sequence of frames as a GIF wrapped in an `Image` protobuf.
+
+    Args:
+      tensor (numpy.ndarray): Frames of shape (time, height, width, channel).
+      fps: Frames per second of the generated clip.
+    Returns:
+      A `Summary.Image` with the GIF bytes, or None (after printing a hint)
+      when moviepy or `moviepy.editor` cannot be imported.
+    """
+    try:
+        import moviepy  # noqa: F401
+    except ImportError:
+        print('add_video needs package moviepy')
+        return
+    try:
+        from moviepy import editor as mpy
+    except ImportError:
+        print("moviepy is installed, but can't import moviepy.editor.",
+              "Some packages could be missing [imageio, requests]")
+        return
+    import tempfile
+
+    _, h, w, c = tensor.shape
+
+    # encode sequence of images into gif string
+    clip = mpy.ImageSequenceClip(list(tensor), fps=fps)
+
+    # Only the path is needed. Close our own handle straight away so it is
+    # not leaked and the file can be reopened by name for writing.
+    with tempfile.NamedTemporaryFile(suffix='.gif', delete=False) as tmp:
+        filename = tmp.name
+
+    try:
+        try:  # older version of moviepy does not support progress_bar argument.
+            clip.write_gif(filename, verbose=False, progress_bar=False)
+        except TypeError:
+            clip.write_gif(filename, verbose=False)
+
+        with open(filename, 'rb') as f:
+            tensor_string = f.read()
+    finally:
+        # Clean up even when encoding or reading failed.
+        try:
+            os.remove(filename)
+        except OSError:
+            logging.warning('The temporary file used by moviepy cannot be deleted.')
+
+    return Summary.Image(height=h, width=w, colorspace=c, encoded_image_string=tensor_string)
+
+
+def audio(tag, tensor, sample_rate=44100):
+    """Output a `Summary` protocol buffer holding WAV-encoded audio.
+
+    Args:
+      tag: Name for the value.
+      tensor: Samples; after `make_np` they must be two dimensional with
+        shape (length_frames, num_channels) and one or two channels.
+        Amplitudes beyond [-1, 1] are clipped after printing a warning.
+      sample_rate: Sample rate passed to the encoder and stored in the proto.
+    Returns:
+      A `Summary` with one audio value of content type 'audio/wav'.
+    """
+    samples = make_np(tensor)
+    peak = abs(samples).max()
+    if peak > 1:
+        print('warning: audio amplitude out of range, auto clipped.')
+        samples = samples.clip(-1, 1)
+    assert samples.ndim == 2, 'input tensor should be 2 dimensional.'
+    length_frames, num_channels = samples.shape
+    assert num_channels in (1, 2), f'Expected 1/2 channels, got {num_channels}'
+    # Imported late so the shape checks above run before the optional
+    # dependency is required.
+    import soundfile
+    import io
+    buffer = io.BytesIO()
+    try:
+        soundfile.write(buffer, samples, samplerate=sample_rate, format='wav')
+        audio_string = buffer.getvalue()
+    finally:
+        buffer.close()
+    payload = Summary.Audio(
+        sample_rate=sample_rate,
+        num_channels=num_channels,
+        length_frames=length_frames,
+        encoded_audio_string=audio_string,
+        content_type='audio/wav',
+    )
+    return Summary(value=[Summary.Value(tag=tag, audio=payload)])
+
+def custom_scalars(layout):
+    """Build the `Summary` that configures the custom-scalars plugin.
+
+    Args:
+      layout (dict): Maps a category title to a dict of charts. Each chart
+        maps its title to a sequence `(chart_type, tags)`. When `chart_type`
+        is 'Margin', `tags` must hold exactly three tags, used as value,
+        lower and upper. Any other `chart_type` yields a multiline chart
+        over `tags`.
+    Returns:
+      A `Summary` with a single value tagged 'custom_scalars__config__'
+      whose string tensor is the serialized layout.
+    """
+    categories = []
+    for category_title, chart_specs in layout.items():
+        charts = []
+        for chart_name, chart_metadata in chart_specs.items():
+            tags = chart_metadata[1]
+            if chart_metadata[0] == 'Margin':
+                assert len(tags) == 3
+                series = layout_pb2.MarginChartContent.Series(value=tags[0],
+                                                              lower=tags[1],
+                                                              upper=tags[2])
+                mgcc = layout_pb2.MarginChartContent(series=[series])
+                chart = layout_pb2.Chart(title=chart_name, margin=mgcc)
+            else:
+                mlcc = layout_pb2.MultilineChartContent(tag=tags)
+                chart = layout_pb2.Chart(title=chart_name, multiline=mlcc)
+            charts.append(chart)
+        categories.append(layout_pb2.Category(title=category_title, chart=charts))
+
+    # Keep the caller's dict and the protobuf under separate names.
+    layout_proto = layout_pb2.Layout(category=categories)
+    PluginData = SummaryMetadata.PluginData(plugin_name='custom_scalars')
+    smd = SummaryMetadata(plugin_data=PluginData)
+    tensor = TensorProto(dtype='DT_STRING',
+                         string_val=[layout_proto.SerializeToString()],
+                         tensor_shape=TensorShapeProto())
+    return Summary(value=[Summary.Value(tag='custom_scalars__config__', tensor=tensor, metadata=smd)])
+
+
+def text(tag, text):
+    """Output a `Summary` protocol buffer holding a text value.
+
+    Args:
+      tag: Base name; the value is stored under `tag + '/text_summary'`.
+      text (str): Text to store; encoded as UTF-8.
+    Returns:
+      A `Summary` with one string tensor of shape [1] marked for the
+      'text' plugin.
+    """
+    plugin_content = TextPluginData(version=0).SerializeToString()
+    metadata = SummaryMetadata(
+        plugin_data=SummaryMetadata.PluginData(plugin_name='text', content=plugin_content))
+    encoded = text.encode(encoding='utf_8')
+    single_element = TensorShapeProto(dim=[TensorShapeProto.Dim(size=1)])
+    payload = TensorProto(dtype='DT_STRING',
+                          string_val=[encoded],
+                          tensor_shape=single_element)
+    value = Summary.Value(tag=tag + '/text_summary', metadata=metadata, tensor=payload)
+    return Summary(value=[value])
+
+
+def pr_curve_raw(tag, tp, fp, tn, fn, precision, recall, num_thresholds=127, weights=None):
+    """Output a PR-curve `Summary` from precomputed per-threshold statistics.
+
+    Args:
+      tag: Name for the value.
+      tp, fp, tn, fn, precision, recall: Equal-length sequences, stacked in
+        this order into a 2-D float tensor.
+      num_thresholds (int): Stored in the plugin data; capped at 127.
+      weights: Accepted but not used by this function.
+    Returns:
+      A `Summary` with one float tensor marked for the 'pr_curves' plugin.
+    """
+    # weird, value > 127 breaks protobuf
+    num_thresholds = min(num_thresholds, 127)
+    stacked = np.stack((tp, fp, tn, fn, precision, recall))
+    plugin_content = PrCurvePluginData(
+        version=0, num_thresholds=num_thresholds).SerializeToString()
+    metadata = SummaryMetadata(
+        plugin_data=SummaryMetadata.PluginData(plugin_name='pr_curves', content=plugin_content))
+    rows, cols = stacked.shape[0], stacked.shape[1]
+    shape_proto = TensorShapeProto(
+        dim=[TensorShapeProto.Dim(size=rows), TensorShapeProto.Dim(size=cols)])
+    payload = TensorProto(dtype='DT_FLOAT',
+                          float_val=stacked.reshape(-1).tolist(),
+                          tensor_shape=shape_proto)
+    return Summary(value=[Summary.Value(tag=tag, metadata=metadata, tensor=payload)])
+
+
+def pr_curve(tag, labels, predictions, num_thresholds=127, weights=None):
+    """Output a PR-curve `Summary` computed from labels and predictions.
+
+    Args:
+      tag: Name for the value.
+      labels: Ground-truth labels forwarded to `compute_curve`.
+      predictions: Predicted scores forwarded to `compute_curve`.
+      num_thresholds (int): Number of thresholds; capped at 127.
+      weights: Optional weights forwarded to `compute_curve`.
+    Returns:
+      A `Summary` with one float tensor marked for the 'pr_curves' plugin.
+    """
+    if num_thresholds > 127:  # weird, value > 127 breaks protobuf
+        num_thresholds = 127
+    curve = compute_curve(labels, predictions,
+                          num_thresholds=num_thresholds, weights=weights)
+    plugin_content = PrCurvePluginData(
+        version=0, num_thresholds=num_thresholds).SerializeToString()
+    metadata = SummaryMetadata(
+        plugin_data=SummaryMetadata.PluginData(plugin_name='pr_curves', content=plugin_content))
+    rows, cols = curve.shape[0], curve.shape[1]
+    shape_proto = TensorShapeProto(
+        dim=[TensorShapeProto.Dim(size=rows), TensorShapeProto.Dim(size=cols)])
+    payload = TensorProto(dtype='DT_FLOAT',
+                          float_val=curve.reshape(-1).tolist(),
+                          tensor_shape=shape_proto)
+    return Summary(value=[Summary.Value(tag=tag, metadata=metadata, tensor=payload)])
+
+
+# https://github.com/tensorflow/tensorboard/blob/master/tensorboard/plugins/pr_curve/summary.py
+def compute_curve(labels, predictions, num_thresholds=None, weights=None):
+    """Compute the per-threshold statistics behind a precision-recall curve.
+
+    Predictions are bucketed into `num_thresholds` bins; weighted histograms
+    of the positive and negative labels are then accumulated from the
+    highest bin downwards.
+
+    Args:
+      labels (numpy.ndarray): Ground-truth labels. They are cast to float64
+        and used as histogram weights (`1 - label` for the negatives).
+      predictions (numpy.ndarray): Scores, expected to lie in [0, 1].
+      num_thresholds (int): Number of bins. Defaults to 127 when None.
+      weights: Optional scalar or array multiplied into the label weights.
+        Defaults to 1.0.
+    Returns:
+      numpy.ndarray of shape (6, num_thresholds) whose rows are
+      tp, fp, tn, fn, precision and recall.
+    """
+    _MINIMUM_COUNT = 1e-7
+    _DEFAULT_NUM_THRESHOLDS = 127  # same default as pr_curve()
+
+    if num_thresholds is None:
+        # The declared default of None used to crash on `num_thresholds - 1`.
+        num_thresholds = _DEFAULT_NUM_THRESHOLDS
+    if weights is None:
+        weights = 1.0
+
+    # Compute bins of true positives and false positives.
+    bucket_indices = np.int32(np.floor(predictions * (num_thresholds - 1)))
+    # `np.float` was removed in NumPy 1.24; it was an alias of the builtin
+    # float, i.e. float64.
+    float_labels = labels.astype(np.float64)
+    histogram_range = (0, num_thresholds - 1)
+    tp_buckets, _ = np.histogram(
+        bucket_indices,
+        bins=num_thresholds,
+        range=histogram_range,
+        weights=float_labels * weights)
+    fp_buckets, _ = np.histogram(
+        bucket_indices,
+        bins=num_thresholds,
+        range=histogram_range,
+        weights=(1.0 - float_labels) * weights)
+
+    # Obtain the reverse cumulative sum.
+    tp = np.cumsum(tp_buckets[::-1])[::-1]
+    fp = np.cumsum(fp_buckets[::-1])[::-1]
+    tn = fp[0] - fp
+    fn = tp[0] - tp
+    # _MINIMUM_COUNT keeps the denominators away from zero.
+    precision = tp / np.maximum(_MINIMUM_COUNT, tp + fp)
+    recall = tp / np.maximum(_MINIMUM_COUNT, tp + fn)
+    return np.stack((tp, fp, tn, fn, precision, recall))
+
+
+def _get_tensor_summary(tag, tensor, content_type, json_config):
+    """Wrap one mesh tensor in a `Summary.Value` for the 'mesh' plugin.
+
+    Args:
+      tag: Mesh name; the value is tagged '<tag>_<content_type>'.
+      tensor (numpy.ndarray): Data whose first three dimensions are recorded
+        as the tensor shape; the values are flattened into `float_val`.
+      content_type: Content-type id stored in the plugin data and the tag.
+      json_config: JSON string stored in the plugin data.
+    Returns:
+      The `Summary.Value` carrying the tensor and its plugin metadata.
+    """
+    plugin_content = MeshPluginData(
+        version=0,
+        name=tag,
+        content_type=content_type,
+        json_config=json_config,
+        shape=tensor.shape,
+    ).SerializeToString()
+    metadata = SummaryMetadata(
+        plugin_data=SummaryMetadata.PluginData(plugin_name='mesh', content=plugin_content))
+
+    flat_values = tensor.reshape(-1).tolist()
+    dims = [TensorShapeProto.Dim(size=tensor.shape[axis]) for axis in range(3)]
+    proto = TensorProto(dtype='DT_FLOAT',
+                        float_val=flat_values,
+                        tensor_shape=TensorShapeProto(dim=dims))
+    return Summary.Value(
+        tag='{}_{}'.format(tag, content_type),
+        tensor=proto,
+        metadata=metadata,
+    )
+
+
+def mesh(tag, vertices, colors, faces, config_dict=None):
+    """Output a `Summary` protocol buffer describing a mesh.
+
+    Args:
+      tag: Mesh name shared by all generated values.
+      vertices, colors, faces: Mesh components; any that is None is skipped,
+        the others are converted with `make_np`.
+      config_dict: Configuration serialized to JSON (keys sorted) and
+        attached to every value.
+    Returns:
+      A `Summary` with one value per component that was provided, in the
+      order vertices, faces, colors.
+    """
+    import json
+    # (component, content-type id) pairs in the order they are emitted
+    components = ((vertices, 1), (faces, 2), (colors, 3))
+    summaries = [
+        _get_tensor_summary(tag, make_np(data), content_type, json.dumps(config_dict, sort_keys=True))
+        for data, content_type in components
+        if data is not None
+    ]
+    return Summary(value=summaries)
diff --git a/tensorboardX/tensorboardX/torchvis.py b/tensorboardX/tensorboardX/torchvis.py
new file mode 100644
index 0000000000000000000000000000000000000000..3dfecf2c149a8117110e76d5db1829a904cc2922
--- /dev/null
+++ b/tensorboardX/tensorboardX/torchvis.py
@@ -0,0 +1,57 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import gc
+import six
+import time
+
+from functools import wraps
+from .writer import SummaryWriter
+from .visdom_writer import VisdomWriter
+
+
+# Supports both TensorBoard and Visdom (no embedding or graph visualization with Visdom)
+vis_formats = {'tensorboard': SummaryWriter, 'visdom': VisdomWriter}
+
+
+class TorchVis:
+    """Fan-out writer that forwards calls to every registered backend.
+
+    Backends are created from `vis_formats` and kept in `self.subscribers`,
+    keyed by format name. Any attribute that is not found on this object is
+    resolved to a callable that invokes the same-named method on each
+    subscriber that has it.
+    """
+
+    def __init__(self, *args, **init_kwargs):
+        """
+        Args:
+            args (list of strings): The name of the visualization target(s).
+              Accepted targets are 'tensorboard' and 'visdom'.
+            init_kwargs: Additional keyword parameters for the visdom writer (For example, server IP).
+              See https://github.com/facebookresearch/visdom/blob/master/README.md#visdom-arguments-python-only
+              for more.
+        """
+        self.subscribers = {}
+        self.register(*args, **init_kwargs)
+
+    def register(self, *args, **init_kwargs):
+        """Create a writer for each named format that is not registered yet.
+
+        Names missing from `vis_formats` are ignored. The constructor
+        arguments of a format are taken from `init_kwargs[<format name>]`.
+        """
+        # Sets tensorboard as the default visualization format if not specified
+        formats = ['tensorboard'] if not args else args
+        for fmt in formats:
+            if self.subscribers.get(fmt) is None and fmt in vis_formats:
+                self.subscribers[fmt] = vis_formats[fmt](**init_kwargs.get(fmt, {}))
+
+    def unregister(self, *args):
+        """Close and drop the writers registered under the given names.
+
+        Raises:
+            KeyError: if a name is not currently registered.
+        """
+        for fmt in args:
+            self.subscribers[fmt].close()
+            del self.subscribers[fmt]
+            gc.collect()
+
+    def __getattr__(self, attr):
+        # Only reached when normal lookup fails. Read `subscribers` from the
+        # instance dict: going through `self.subscribers` on an object whose
+        # __init__ never ran would re-enter __getattr__ without end.
+        subscribers = self.__dict__.get('subscribers')
+        # Dunder probes (copy, pickle, ...) must see a missing attribute
+        # rather than a forwarding wrapper that returns None.
+        is_dunder = attr.startswith('__') and attr.endswith('__')
+        if not subscribers or is_dunder:
+            raise AttributeError(
+                "'{}' object has no attribute '{}'".format(type(self).__name__, attr))
+
+        def wrapper(*args, **kwargs):
+            # Subscribers lacking the method are skipped silently.
+            for subscriber in list(self.subscribers.values()):
+                if hasattr(subscriber, attr):
+                    getattr(subscriber, attr)(*args, **kwargs)
+        return wrapper
+
+    # Handle writer management (open/close) for the user
+    def __del__(self):
+        # `subscribers` may be absent if construction never got that far.
+        for subscriber in list(self.__dict__.get('subscribers', {}).values()):
+            subscriber.close()
diff --git a/tensorboardX/tensorboardX/utils.py b/tensorboardX/tensorboardX/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..86063bf5b25e472f3ce7cc13aa5d6c430cf2d1a5
--- /dev/null
+++ b/tensorboardX/tensorboardX/utils.py
@@ -0,0 +1,123 @@
+# Helper functions for converting figures, videos and image tensors to numpy image arrays
+def figure_to_image(figures, close=True):
+    """Render matplotlib figure to numpy format.
+
+    Note that this requires the ``matplotlib`` package.
+
+    Args:
+        figures (matplotlib.pyplot.figure) or list/tuple of figures: figure
+            or a sequence of figures
+        close (bool): Flag to automatically close the figure
+
+    Returns:
+        numpy.array: image in [CHW] order, or [NCHW] when a list or tuple
+        of figures is given
+
+    Raises:
+        ImportError: if matplotlib cannot be imported.
+    """
+    import numpy as np
+    try:
+        import matplotlib.pyplot as plt
+        import matplotlib.backends.backend_agg as plt_backend_agg
+    except ImportError:
+        # Nothing below can work without matplotlib: give the hint, then
+        # fail here rather than with a NameError further down.
+        print('please install matplotlib')
+        raise
+
+    def render_to_rgb(figure):
+        canvas = plt_backend_agg.FigureCanvasAgg(figure)
+        canvas.draw()
+        data = np.frombuffer(canvas.buffer_rgba(), dtype=np.uint8)
+        w, h = figure.canvas.get_width_height()
+        # Drop the alpha channel, then move channels first.
+        image_hwc = data.reshape([h, w, 4])[:, :, 0:3]
+        image_chw = np.moveaxis(image_hwc, source=2, destination=0)
+        if close:
+            plt.close(figure)
+        return image_chw
+
+    if isinstance(figures, (list, tuple)):
+        images = [render_to_rgb(figure) for figure in figures]
+        return np.stack(images)
+    return render_to_rgb(figures)
+
+
+def graphviz_to_image():
+    """Placeholder without an implementation; does nothing and returns None."""
+    return None
+
+
+def _prepare_video(V):
+    """Tile a batch of videos into one video whose frames are image grids.
+
+    Args:
+        V (numpy.ndarray): Array of shape (batch, time, channel, height,
+            width). uint8 input is converted to float32 in [0, 1].
+
+    Returns:
+        numpy.ndarray of shape (time, n_rows * height, n_cols * width,
+        channel). The batch is zero-padded up to a power of two before it is
+        laid out on the grid.
+    """
+    import numpy as np
+    batch, frames, channels, height, width = V.shape
+
+    if V.dtype == np.uint8:
+        V = np.float32(V) / 255.
+
+    # pad to nearest power of 2, all at once
+    count = V.shape[0]
+    power_of_two = count != 0 and (count & (count - 1)) == 0
+    if not power_of_two:
+        missing = int(2 ** count.bit_length() - count)
+        padding = np.zeros(shape=(missing, frames, channels, height, width))
+        V = np.concatenate((V, padding), axis=0)
+
+    n_rows = 2 ** ((batch.bit_length() - 1) // 2)
+    n_cols = V.shape[0] // n_rows
+
+    # (rows, cols, t, c, h, w) -> (t, rows, h, cols, w, c) -> (t, H, W, c)
+    grid = V.reshape((n_rows, n_cols, frames, channels, height, width))
+    grid = grid.transpose((2, 0, 4, 1, 5, 3))
+    return grid.reshape((frames, n_rows * height, n_cols * width, channels))
+
+
+def make_grid(I, ncols=8):
+    """Tile a batch of images into a single image grid.
+
+    Args:
+        I (numpy.ndarray): Batch of shape (N, C, H, W) with C equal to 1, 3
+            or 4. Single-channel batches are replicated to three channels.
+        ncols (int): Maximum number of images per row.
+
+    Returns:
+        numpy.ndarray of shape (C, H * nrows, W * ncols) with the dtype of
+        the input. Cells that are not covered by an image stay zero.
+    """
+    # I: N1HW or N3HW
+    import numpy as np
+    assert isinstance(
+        I, np.ndarray), 'plugin error, should pass numpy array here'
+    # Check the rank before indexing into the shape.
+    assert I.ndim == 4
+    if I.shape[1] == 1:
+        I = np.concatenate([I, I, I], 1)
+    # Written as a membership test: `a and b or c` would let any array with
+    # shape[1] == 4 through, regardless of its rank.
+    assert I.shape[1] in (3, 4)
+    nimg = I.shape[0]
+    H = I.shape[2]
+    W = I.shape[3]
+    ncols = min(nimg, ncols)
+    nrows = int(np.ceil(float(nimg) / ncols))
+    # Keep the input dtype: a float64 canvas would make callers rescale
+    # uint8 images as if they were in [0, 1].
+    canvas = np.zeros((I.shape[1], H * nrows, W * ncols), dtype=I.dtype)
+    i = 0
+    for y in range(nrows):
+        for x in range(ncols):
+            if i >= nimg:
+                break
+            canvas[:, y * H:(y + 1) * H, x * W:(x + 1) * W] = I[i]
+            i = i + 1
+    return canvas
+
+    # if modality == 'IMG':
+    #     if x.dtype == np.uint8:
+    #         x = x.astype(np.float32) / 255.0
+
+
+def convert_to_HWC(tensor, input_format):  # tensor: numpy array
+    """Rearrange an image array into height x width x channel layout.
+
+    Args:
+        tensor (numpy.ndarray): Image data with 2, 3 or 4 dimensions.
+        input_format (str): One letter per dimension of `tensor`, in any
+            case: a permutation of 'NCHW', 'HWC' or 'HW'.
+
+    Returns:
+        numpy.ndarray in HWC order. 4-D input is tiled into one image with
+        `make_grid`; single-channel 3-D input and 2-D input are replicated
+        to three channels.
+
+    Raises:
+        AssertionError: if a letter is repeated or the number of letters
+            differs from the number of dimensions.
+        ValueError: if the rank is unsupported or the letters do not match
+            the axes expected for that rank.
+    """
+    import numpy as np
+    assert(len(set(input_format)) == len(input_format)), "You can not use the same dimension shordhand twice. \
+        input_format: {}".format(input_format)
+    assert(len(tensor.shape) == len(input_format)), "size of input tensor and input format are different. \
+        tensor shape: {}, input_format: {}".format(tensor.shape, input_format)
+    input_format = input_format.upper()
+
+    # Axis letters expected for each supported rank, in target order.
+    targets = {4: 'NCHW', 3: 'HWC', 2: 'HW'}
+    rank = len(input_format)
+    if rank not in targets:
+        # Previously fell off the end and returned None.
+        raise ValueError(
+            'input_format must have 2, 3 or 4 letters, got {!r}'.format(input_format))
+    target = targets[rank]
+    if set(input_format) != set(target):
+        # str.find() returns -1 for a missing letter, which transpose()
+        # would silently accept as "last axis".
+        raise ValueError(
+            'input_format {!r} must be a permutation of {!r}'.format(input_format, target))
+
+    index = [input_format.find(c) for c in target]
+    tensor = tensor.transpose(index)
+
+    if rank == 4:
+        tensor_CHW = make_grid(tensor)
+        return tensor_CHW.transpose(1, 2, 0)
+
+    if rank == 3:
+        if tensor.shape[2] == 1:
+            tensor = np.concatenate([tensor, tensor, tensor], 2)
+        return tensor
+
+    return np.stack([tensor, tensor, tensor], 2)
diff --git a/tensorboardX/tensorboardX/visdom_writer.py b/tensorboardX/tensorboardX/visdom_writer.py
new file mode 100644
index 0000000000000000000000000000000000000000..896c2d4480a3c43fe200713937f78fd4aebc71e8
--- /dev/null
+++ b/tensorboardX/tensorboardX/visdom_writer.py
@@ -0,0 +1,328 @@
+import gc
+import numpy as np
+import math
+import json
+import time
+
+from .summary import compute_curve
+from .utils import figure_to_image
+from .x2num import make_np
+
+
+# Decorator that checks if there is a Visdom connection
+def _check_connection(fn):
+    """Decorate a writer method so that it only runs while connected.
+
+    When `self.server_connected` is false, an error is printed,
+    `self._try_connect()` is called and the wrapped call is skipped (the
+    wrapper returns None). Otherwise the method's own result is returned.
+    """
+    from functools import wraps
+
+    @wraps(fn)  # keep the wrapped method's name and docstring
+    def wrapper(self, *args, **kwargs):
+        if not self.server_connected:
+            print('ERROR: No Visdom server currently connected')
+            self._try_connect()
+            return
+        return fn(self, *args, **kwargs)
+    return wrapper
+
+
+class VisdomWriter:
+    """Writer that sends scalars, images, media and curves to a Visdom server.
+
+    The ``add_*`` methods take the same parameters as their TensorBoard
+    counterparts where visible here; parameters Visdom has no use for are
+    accepted and ignored. Every public ``add_*`` method is skipped while no
+    server connection is established (see ``_check_connection``).
+    """
+
+    def __init__(self, *args, **kwargs):
+        """Create the Visdom client and try to connect to the server.
+
+        Args:
+            args, kwargs: Forwarded unchanged to ``visdom.Visdom``.
+
+        Raises:
+            ImportError: if the ``visdom`` package is not installed.
+        """
+        try:
+            from visdom import Visdom
+        except ImportError:
+            raise ImportError(
+                "Visdom visualization requires installation of Visdom")
+
+        # {main_tag: {tag: [values...]}} history of every scalar written
+        self.scalar_dict = {}
+        self.server_connected = False
+        self.vis = Visdom(*args, **kwargs)
+        # plot name -> Visdom window handle, so later points can be appended
+        self.windows = {}
+
+        self._try_connect()
+
+    def _try_connect(self):
+        """Poll the server roughly every 0.1 s for about one second.
+
+        Raises:
+            AssertionError: if no connection is established in that time.
+        """
+        startup_sec = 1
+        self.server_connected = self.vis.check_connection()
+        while not self.server_connected and startup_sec > 0:
+            time.sleep(0.1)
+            startup_sec -= 0.1
+            self.server_connected = self.vis.check_connection()
+        assert self.server_connected, 'No connection could be formed quickly'
+
+    @_check_connection
+    def add_scalar(self, tag, scalar_value, global_step=None, main_tag='default'):
+        """Add scalar data to Visdom. Plots the values in a plot titled
+           {main_tag}-{tag}.
+
+        Args:
+            tag (string): Data identifier
+            scalar_value (float or string/blobname): Value to save
+            global_step (int): Global step value to record. When None, the
+              number of values recorded so far for this tag is used.
+            main_tag (string): Data group identifier
+        """
+        if self.scalar_dict.get(main_tag) is None:
+            self.scalar_dict[main_tag] = {}
+        exists = self.scalar_dict[main_tag].get(tag) is not None
+        self.scalar_dict[main_tag][tag] = self.scalar_dict[main_tag][tag] + \
+            [scalar_value] if exists else [scalar_value]
+        plot_name = '{}-{}'.format(main_tag, tag)
+        # If there is no global_step provided, follow sequential order.
+        # Compare against None so that an explicit step of 0 is honoured.
+        if global_step is None:
+            x_val = len(self.scalar_dict[main_tag][tag])
+        else:
+            x_val = global_step
+        if exists:
+            # Update our existing Visdom window
+            self.vis.line(
+                X=make_np(x_val),
+                Y=make_np(scalar_value),
+                name=plot_name,
+                update='append',
+                win=self.windows[plot_name],
+            )
+        else:
+            # Save the window if we are creating this graph for the first time
+            self.windows[plot_name] = self.vis.line(
+                X=make_np(x_val),
+                Y=make_np(scalar_value),
+                name=plot_name,
+                opts={
+                    'title': plot_name,
+                    'xlabel': 'timestep',
+                    'ylabel': tag,
+                },
+            )
+
+    @_check_connection
+    def add_scalars(self, main_tag, tag_scalar_dict, global_step=None):
+        """Adds many scalar data to summary.
+
+        Note that this function also keeps logged scalars in memory. In extreme case it explodes your RAM.
+
+        Args:
+            main_tag (string): Data group identifier
+            tag_scalar_dict (dict): Key-value pair storing the tag and corresponding values
+            global_step (int): Global step value to record
+
+        Examples::
+
+            writer.add_scalars('run_14h',{'xsinx':i*np.sin(i/r),
+                                          'xcosx':i*np.cos(i/r),
+                                          'arctanx': numsteps*np.arctan(i/r)}, i)
+            This function adds three plots:
+                'run_14h-xsinx',
+                'run_14h-xcosx',
+                'run_14h-arctanx'
+            with the corresponding values.
+        """
+        for key in tag_scalar_dict.keys():
+            self.add_scalar(key, tag_scalar_dict[key], global_step, main_tag)
+
+    @_check_connection
+    def export_scalars_to_json(self, path):
+        """Exports to the given 'path' a JSON file containing all the scalars
+        written so far by this instance, in the format
+        {main_tag: {tag: [value, ...], ...}, ...}
+
+        The scalars kept in memory are cleared after the export.
+        """
+        with open(path, "w") as f:
+            json.dump(self.scalar_dict, f)
+        self.scalar_dict = {}
+
+    @_check_connection
+    def add_histogram(self, tag, values, global_step=None, bins='tensorflow'):
+        """Add histogram to summary.
+
+        Args:
+            tag (string): Data identifier; used as the window title
+            values (torch.Tensor, numpy.array, or string/blobname): Values to build histogram
+            global_step (int): Global step value to record (not used here)
+            bins (string): accepted for signature compatibility; not used here
+        """
+        values = make_np(values)
+        self.vis.histogram(values, opts={'title': tag})
+
+    @_check_connection
+    def add_image(self, tag, img_tensor, global_step=None, caption=None):
+        """Add image data to summary.
+
+        Note that this requires the ``pillow`` package.
+
+        Args:
+            tag (string): Data identifier; used as the window title
+            img_tensor (torch.Tensor, numpy.array, or string/blobname): Image data
+            global_step (int): Global step value to record (not used here)
+            caption (string): Caption shown with the image
+        Shape:
+            img_tensor: :math:`(C, H, W)`. Use ``torchvision.utils.make_grid()`` to prepare it is a good idea.
+            C = colors (can be 1 - grayscale, 3 - RGB, 4 - RGBA)
+        """
+        img_tensor = make_np(img_tensor)
+        self.vis.image(img_tensor, opts={'title': tag, 'caption': caption})
+
+    @_check_connection
+    def add_figure(self, tag, figure, global_step=None, close=True):
+        """Render matplotlib figure into an image and add it to summary.
+
+        Note that this requires the ``matplotlib`` package.
+
+        Args:
+            tag (string): Data identifier
+            figure (matplotlib.pyplot.figure) or list of figures: figure or a list of figures
+            global_step (int): Global step value to record
+            close (bool): Flag to automatically close the figure
+        """
+        self.add_image(tag, figure_to_image(figure, close), global_step)
+
+    @_check_connection
+    def add_video(self, tag, vid_tensor, global_step=None, fps=4):
+        """Add video data to summary.
+
+        Note that this requires the ``moviepy`` package.
+
+        Args:
+            tag (string): Data identifier (not used here)
+            vid_tensor (torch.Tensor or numpy.array): Video data
+            global_step (int): Global step value to record (not used here)
+            fps (float or int): Frames per second
+        Shape:
+            vid_tensor: :math:`(B, C, T, H, W)`. (if following tensorboardX format)
+            vid_tensor: :math:`(T, H, W, C)`. (if following visdom format)
+            B = batches, C = colors (1, 3, or 4), T = time frames, H = height, W = width
+        """
+        shape = vid_tensor.shape
+        # A batch of videos (tensorboardX format) is a 5D tensor
+        if len(shape) > 4:
+            # Do the permutation in numpy so that numpy input does not need
+            # torch. Visdom also looks for the .ndim attribute of arrays.
+            # NOTE(review): relies on make_np converting torch tensors to
+            # numpy, as the other add_* methods do - confirm.
+            batch = make_np(vid_tensor)
+            for i in range(shape[0]):
+                # Reorder each video from (C, T, H, W) to Visdom's (T, H, W, C)
+                ind_vid = np.transpose(batch[i], (1, 2, 3, 0))
+                # Values strictly between 0 and 1 indicate data in [0, 1]
+                scale_factor = 255 if np.any(
+                    (ind_vid > 0) & (ind_vid < 1)) else 1
+                ind_vid = (ind_vid * scale_factor).astype(np.uint8)
+                assert ind_vid.shape[3] in [1, 3, 4], \
+                    'Visdom requires the last dimension to be color, which can be 1 (grayscale), 3 (RGB) or 4 (RGBA)'
+                self.vis.video(tensor=ind_vid, opts={'fps': fps})
+        else:
+            self.vis.video(tensor=vid_tensor, opts={'fps': fps})
+
+    @_check_connection
+    def add_audio(self, tag, snd_tensor, global_step=None, sample_rate=44100):
+        """Add audio data to summary.
+
+        Args:
+            tag (string): Data identifier
+            snd_tensor (torch.Tensor, numpy.array, or string/blobname): Sound data
+            global_step (int): Global step value to record
+            sample_rate (int): sample rate in Hz
+
+        Shape:
+            snd_tensor: :math:`(1, L)`. The values should lie between [-1, 1].
+        """
+        snd_tensor = make_np(snd_tensor)
+        self.vis.audio(tensor=snd_tensor, opts={
+                       'sample_frequency': sample_rate})
+
+    @_check_connection
+    def add_text(self, tag, text_string, global_step=None):
+        """Add text data to summary.
+
+        Args:
+            tag (string): Data identifier; written as the text when
+              ``text_string`` is None
+            text_string (string): String to save
+            global_step (int): Global step value to record
+        Examples::
+            writer.add_text('lstm', 'This is an lstm', 0)
+            writer.add_text('rnn', 'This is an rnn', 10)
+        """
+        if text_string is None:
+            # Visdom doesn't support tags, write the tag as the text_string
+            text_string = tag
+        self.vis.text(text_string)
+
+    @_check_connection
+    def add_onnx_graph(self, prototxt):
+        # TODO: Visdom doesn't support graph visualization yet, so this is a no-op
+        return
+
+    @_check_connection
+    def add_graph(self, model, input_to_model=None, verbose=False, **kwargs):
+        # TODO: Visdom doesn't support graph visualization yet, so this is a no-op
+        return
+
+    @_check_connection
+    def add_embedding(self, mat, metadata=None, label_img=None, global_step=None, tag='default', metadata_header=None):
+        # TODO: Visdom doesn't support embeddings yet, so this is a no-op
+        return
+
+    @_check_connection
+    def add_pr_curve(self, tag, labels, predictions, global_step=None, num_thresholds=127, weights=None):
+        """Adds precision recall curve.
+
+        Args:
+            tag (string): Data identifier
+            labels (torch.Tensor, numpy.array, or string/blobname): Ground truth data. Binary label for each element.
+            predictions (torch.Tensor, numpy.array, or string/blobname):
+            The probability that an element be classified as true. Value should in [0, 1]
+            global_step (int): Global step value to record
+            num_thresholds (int): Number of thresholds used to draw the curve.
+            weights: Optional weights forwarded to ``compute_curve``.
+
+        """
+        labels, predictions = make_np(labels), make_np(predictions)
+        raw_data = compute_curve(labels, predictions, num_thresholds, weights)
+
+        # compute_curve returns np.stack((tp, fp, tn, fn, precision, recall))
+        # We want to access 'precision' and 'recall'
+        precision, recall = raw_data[4, :], raw_data[5, :]
+
+        self.vis.line(
+            X=recall,
+            Y=precision,
+            name=tag,
+            opts={
+                'title': 'PR Curve for {}'.format(tag),
+                'xlabel': 'recall',
+                'ylabel': 'precision',
+            },
+        )
+
+    @_check_connection
+    def add_pr_curve_raw(self, tag, true_positive_counts,
+                         false_positive_counts,
+                         true_negative_counts,
+                         false_negative_counts,
+                         precision,
+                         recall, global_step=None, num_thresholds=127, weights=None):
+        """Adds precision recall curve with raw data.
+
+        Only ``precision`` and ``recall`` are plotted; the four count
+        arguments are accepted but not used here.
+
+        Args:
+            tag (string): Data identifier
+            true_positive_counts (torch.Tensor, numpy.array, or string/blobname): true positive counts
+            false_positive_counts (torch.Tensor, numpy.array, or string/blobname): false positive counts
+            true_negative_counts (torch.Tensor, numpy.array, or string/blobname): true negative counts
+            false_negative_counts (torch.Tensor, numpy.array, or string/blobname): false negative counts
+            precision (torch.Tensor, numpy.array, or string/blobname): precision
+            recall (torch.Tensor, numpy.array, or string/blobname): recall
+            global_step (int): Global step value to record
+            num_thresholds (int): Number of thresholds used to draw the curve.
+            see: https://github.com/tensorflow/tensorboard/blob/master/tensorboard/plugins/pr_curve/README.md
+        """
+        precision, recall = make_np(precision), make_np(recall)
+        self.vis.line(
+            X=recall,
+            Y=precision,
+            name=tag,
+            opts={
+                'title': 'PR Curve for {}'.format(tag),
+                'xlabel': 'recall',
+                'ylabel': 'precision',
+            },
+        )
+
+    def close(self):
+        """Drop the Visdom client and the scalar history, then run the GC."""
+        del self.vis
+        del self.scalar_dict
+        gc.collect()
diff --git a/tensorboardX/tensorboardX/writer.py b/tensorboardX/tensorboardX/writer.py
new file mode 100644
index 0000000000000000000000000000000000000000..58d57a1fcc0aff70a89437b1d88b9d5b46f873be
--- /dev/null
+++ b/tensorboardX/tensorboardX/writer.py
@@ -0,0 +1,1065 @@
+"""Provides an API for writing protocol buffers to event files to be
+consumed by TensorBoard for visualization."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import json
+import os
+import six
+import time
+import logging
+
+from .embedding import make_mat, make_sprite, make_tsv, append_pbtxt
+from .event_file_writer import EventFileWriter
+from .onnx_graph import load_onnx_graph
+from .pytorch_graph import graph
+from .proto import event_pb2
+from .proto import summary_pb2
+from .proto.event_pb2 import SessionLog, Event
+from .utils import figure_to_image
+from .summary import (
+    scalar, histogram, histogram_raw, image, audio, text,
+    pr_curve, pr_curve_raw, video, custom_scalars, image_boxes, mesh, hparams
+)
+
+
+class DummyFileWriter(object):
+    """Stand-in for ``FileWriter``: accepts the same calls but writes nothing
+    to the disk."""
+    def __init__(self, logdir):
+        self._logdir = logdir
+
+    def get_logdir(self):
+        """Return the log directory this writer was created with."""
+        return self._logdir
+
+    def add_event(self, event, step=None, walltime=None):
+        """Ignore the event."""
+
+    def add_summary(self, summary, global_step=None, walltime=None):
+        """Ignore the summary."""
+
+    def add_graph(self, graph_profile, walltime=None):
+        """Ignore the graph profile."""
+
+    def add_onnx_graph(self, graph, walltime=None):
+        """Ignore the ONNX graph."""
+
+    def flush(self):
+        """Nothing is buffered, so there is nothing to flush."""
+
+    def close(self):
+        """Nothing is open, so there is nothing to close."""
+
+    def reopen(self):
+        """Nothing was closed, so there is nothing to reopen."""
+
+
+class FileWriter(object):
+    """Front end over ``EventFileWriter`` for producing TensorBoard event files.
+
+    A ``FileWriter`` creates an event file in a given directory and offers
+    methods that wrap summaries and graphs into ``Event`` protocol buffers
+    and pass them to the underlying ``EventFileWriter``. The file contents are
+    updated asynchronously, so a training program can add data directly from
+    its training loop without slowing training down.
+    """
+
+    def __init__(self, logdir, max_queue=10, flush_secs=120, filename_suffix=''):
+        """Creates a `FileWriter` and an event file.
+        A new event file is created in `logdir` on construction; the remaining
+        arguments are forwarded to the underlying `EventFileWriter` and
+        control its asynchronous writes.
+
+        Args:
+          logdir: A string. Directory where event file will be written.
+          max_queue: Integer. Size of the queue for pending events and
+            summaries before one of the 'add' calls forces a flush to disk.
+            Default is ten items.
+          flush_secs: Number. How often, in seconds, to flush the
+            pending events and summaries to disk. Default is every two minutes.
+          filename_suffix: A string. Suffix added to all event filenames
+            in the logdir directory. More details on filename construction in
+            tensorboard.summary.writer.event_file_writer.EventFileWriter.
+        """
+        # Callers sometimes hand in a PosixPath rather than a string, so the
+        # directory is always coerced to ``str`` before use.
+        # TODO: See if we can remove this in the future if we are
+        # actually the ones passing in a PosixPath
+        log_directory = str(logdir)
+        self.event_writer = EventFileWriter(
+            log_directory, max_queue, flush_secs, filename_suffix)
+
+    def get_logdir(self):
+        """Return the log directory reported by the underlying event writer."""
+        return self.event_writer.get_logdir()
+
+    def add_event(self, event, step=None, walltime=None):
+        """Stamp an event with wall time (and optionally a step) and pass it on.
+        Args:
+          event: An `Event` protocol buffer.
+          step: Number. Optional global step value for training process
+            to record with the event.
+          walltime: float. Optional wall time; defaults to the current time.time().
+        """
+        if walltime is None:
+            walltime = time.time()
+        event.wall_time = walltime
+        if step is not None:
+            # protobuf may not convert numpy (or other non-int) steps itself, so cast.
+            event.step = int(step)
+        self.event_writer.add_event(event)
+
+    def add_summary(self, summary, global_step=None, walltime=None):
+        """Wrap a `Summary` protocol buffer in an `Event` and add it to the event file.
+
+        Stamping and writing of the resulting event are delegated to ``add_event``.
+
+        Args:
+          summary: A `Summary` protocol buffer.
+          global_step: Number. Optional global step value for training process
+            to record with the summary.
+          walltime: float. Optional walltime to override the default (current)
+            walltime (from time.time())
+        """
+        wrapped = event_pb2.Event(summary=summary)
+        self.add_event(wrapped, global_step, walltime)
+
+    def add_graph(self, graph_profile, walltime=None):
+        """Adds a graph event followed by a run-metadata event to the event file.
+
+        Args:
+          graph_profile: Indexable pair whose item 0 is the graph protocol buffer
+            and whose item 1 is the step stats protocol buffer.
+          walltime: float. Optional wall time; defaults to the current time.time().
+        """
+        graph_proto, step_stats = graph_profile[0], graph_profile[1]
+        graph_event = event_pb2.Event(graph_def=graph_proto.SerializeToString())
+        self.add_event(graph_event, None, walltime)
+
+        # The step stats travel in a second event, tagged 'step1'.
+        metadata = event_pb2.TaggedRunMetadata(
+            tag='step1', run_metadata=step_stats.SerializeToString())
+        metadata_event = event_pb2.Event(tagged_run_metadata=metadata)
+        self.add_event(metadata_event, None, walltime)
+
+    def add_onnx_graph(self, graph, walltime=None):
+        """Adds a `Graph` protocol buffer to the event file.
+
+        Args:
+          graph: A `Graph` protocol buffer.
+          walltime: float. Optional walltime to override the default (current)
+            walltime (from time.time())
+        """
+        graph_event = event_pb2.Event(graph_def=graph.SerializeToString())
+        self.add_event(graph_event, None, walltime)
+
+    def flush(self):
+        """Flushes the event file to disk.
+        Ensures that every event still pending in the writer has been written
+        out.
+        """
+        self.event_writer.flush()
+
+    def close(self):
+        """Flushes the event file to disk and closes it.
+        Call this once the writer is no longer needed.
+        """
+        self.event_writer.close()
+
+    def reopen(self):
+        """Reopens the EventFileWriter.
+        May be called after `close()` to add further events to the same
+        directory; those events go into a new events file.
+        Has no effect if the EventFileWriter was not closed.
+        """
+        self.event_writer.reopen()
+
+
+class SummaryWriter(object):
+    """Writes entries directly to event files in the logdir to be
+    consumed by TensorBoard.
+
+    The `SummaryWriter` class provides a high-level API to create an event file
+    in a given directory and add summaries and events to it. The class updates the
+    file contents asynchronously. This allows a training program to call methods
+    to add data to the file directly from the training loop, without slowing down
+    training.
+    """
+
+    def __init__(self, logdir=None, comment='', purge_step=None, max_queue=10,
+                 flush_secs=120, filename_suffix='', write_to_disk=True, log_dir=None, **kwargs):
+        """Set up the writer state, the file writer and the default histogram bins.
+
+        Args:
+            logdir (string): Save directory location. Default is
+              runs/**CURRENT_DATETIME_HOSTNAME**, which changes after each run.
+              Use a hierarchical folder structure, e.g. 'runs/exp1', 'runs/exp2',
+              to compare across experiments easily.
+            comment (string): Suffix appended to the automatically generated
+              ``logdir``. Has no effect when ``logdir`` is given.
+            purge_step (int): Stored on ``self.purge_step``. Intended use:
+              when logging crashes at step :math:`T+X` and restarts at step :math:`T`,
+              any events whose global_step larger or equal to :math:`T` will be
+              purged and hidden from TensorBoard.
+              Note that crashed and resumed experiments should have the same ``logdir``.
+            max_queue (int): Size of the queue for pending events and
+              summaries before one of the 'add' calls forces a flush to disk.
+              Default is ten items.
+            flush_secs (int): How often, in seconds, to flush the
+              pending events and summaries to disk. Default is every two minutes.
+            filename_suffix (string): Suffix added to all event filenames in
+              the logdir directory. More details on filename construction in
+              tensorboard.summary.writer.event_file_writer.EventFileWriter.
+            write_to_disk (boolean):
+              If pass `False`, SummaryWriter will not write to disk.
+            log_dir (string): Alternative spelling of ``logdir``; only consulted
+              when ``logdir`` is None.
+            **kwargs: Stored unchanged on ``self.kwargs``.
+
+        Examples::
+
+            from tensorboardX import SummaryWriter
+
+            # create a summary writer with automatically generated folder name.
+            writer = SummaryWriter()
+            # folder location: runs/May04_22-14-54_s-MacBook-Pro.local/
+
+            # create a summary writer using the specified folder name.
+            writer = SummaryWriter("my_experiment")
+            # folder location: my_experiment
+
+            # create a summary writer with comment appended.
+            writer = SummaryWriter(comment="LR_0.1_BATCH_16")
+            # folder location: runs/May04_22-14-54_s-MacBook-Pro.localLR_0.1_BATCH_16/
+        """
+        if logdir is None and log_dir is not None:
+            logdir = log_dir
+        if not logdir:
+            import socket
+            from datetime import datetime
+            timestamp = datetime.now().strftime('%b%d_%H-%M-%S')
+            run_name = timestamp + '_' + socket.gethostname() + comment
+            logdir = os.path.join('runs', run_name)
+        self.logdir = logdir
+        self.purge_step = purge_step
+        self._max_queue = max_queue
+        self._flush_secs = flush_secs
+        self._filename_suffix = filename_suffix
+        self._write_to_disk = write_to_disk
+        self.kwargs = kwargs
+
+        # Start with no writers; ``_get_file_writer`` builds them here and can
+        # rebuild them later if they have been cleared (e.g. on close).
+        self.file_writer = self.all_writers = None
+        self._get_file_writer()
+
+        # Default histogram bins (see generate_testdata.py in tensorflow/tensorboard):
+        # edges grow by 10% per step from 1e-12 up to 1e20, mirrored around zero.
+        edge = 1E-12
+        positive_edges = []
+        while edge < 1E20:
+            positive_edges.append(edge)
+            edge *= 1.1
+        negative_edges = [-e for e in reversed(positive_edges)]
+        self.default_bins = negative_edges + [0] + positive_edges
+
+        self.scalar_dict = {}
+
+    def __append_to_scalar_dict(self, tag, scalar_value, global_step,
+                                timestamp):
+        """Record one scalar sample in ``self.scalar_dict``.
+
+        The dictionary maps each tag to a list of ``[timestamp, step, value]``
+        entries; the value is passed through ``make_np`` and ``float``.
+        """
+        from .x2num import make_np
+        samples = self.scalar_dict.setdefault(tag, [])
+        samples.append([timestamp, global_step, float(make_np(scalar_value))])
+
+    def _check_caffe2_blob(self, item):
+        """Tell whether ``item`` is a string, i.e. a Caffe2 blob name.
+
+        Caffe2 users may pass the name of a workspace blob in place of the
+        actual Tensor/array holding the numeric values. Callers use this check
+        to decide whether the blob must first be fetched from the workspace,
+        which can be done with:
+
+        from caffe2.python import workspace (if not already imported)
+        workspace.FetchBlob(blob_name)
+        workspace.FetchBlobs([blob_name1, blob_name2, ...])
+        """
+        is_blob_name = isinstance(item, six.string_types)
+        return is_blob_name
+
+    def _get_file_writer(self):
+        """Returns the default FileWriter instance. Recreates it if closed.
+
+        When ``write_to_disk`` is False a fresh ``DummyFileWriter`` is installed
+        on every call. Otherwise a ``FileWriter`` is built only if none is held.
+        If ``purge_step`` was given to the constructor, the new writer first
+        receives a file-version event and a session START event at that step;
+        the purge step is then cleared so the markers are written only once.
+        """
+        if not self._write_to_disk:
+            self.file_writer = DummyFileWriter(logdir=self.logdir)
+            self.all_writers = {self.file_writer.get_logdir(): self.file_writer}
+            return self.file_writer
+
+        if self.all_writers is None or self.file_writer is None:
+            # ``purge_step`` is a named constructor argument, so it lives on
+            # ``self.purge_step`` and never shows up in ``self.kwargs``.
+            purge_step, self.purge_step = self.purge_step, None
+            self.file_writer = FileWriter(
+                logdir=self.logdir, max_queue=self._max_queue, flush_secs=self._flush_secs,
+                filename_suffix=self._filename_suffix, **self.kwargs)
+            if purge_step is not None:
+                for marker in (Event(step=purge_step, file_version='brain.Event:2'),
+                               Event(step=purge_step, session_log=SessionLog(status=SessionLog.START))):
+                    self.file_writer.add_event(marker)
+            self.all_writers = {self.file_writer.get_logdir(): self.file_writer}
+        return self.file_writer
+
+    def add_hparams(self, hparam_dict=None, metric_dict=None):
+        """Add a set of hyperparameters to be compared in tensorboard.
+
+        Args:
+            hparam_dict (dict): Maps each hyperparameter name to its value.
+            metric_dict (dict): Maps each metric name to its value. The keys should
+              be unique in the tensorboard record; otherwise values added through
+              `add_scalar` show up in the hparam plugin, which is usually unwanted.
+
+        Raises:
+            TypeError: If either argument is not a dict (including the default None).
+
+        Examples::
+
+            from tensorboardX import SummaryWriter
+            with SummaryWriter() as w:
+                for i in range(5):
+                    w.add_hparams({'lr': 0.1*i, 'bsize': i},
+                                  {'hparam/accuracy': 10*i, 'hparam/loss': 10*i})
+
+        Expected result:
+
+        .. image:: _static/img/tensorboard/add_hparam.png
+           :scale: 50 %
+        """
+        if not isinstance(hparam_dict, dict) or not isinstance(metric_dict, dict):
+            raise TypeError('hparam_dict and metric_dict should be dictionary.')
+        exp, ssi, sei = hparams(hparam_dict, metric_dict)
+        # The data goes to a time-stamped sub-run; _get_file_writer() also works after the writers were cleared.
+        run_dir = os.path.join(self._get_file_writer().get_logdir(), str(time.time()))
+        with SummaryWriter(logdir=run_dir) as w_hp:
+            for summary in (exp, ssi, sei):
+                w_hp.file_writer.add_summary(summary)
+            for k, v in metric_dict.items():
+                w_hp.add_scalar(k, v)
+
+    def add_scalar(self, tag, scalar_value, global_step=None, walltime=None):
+        """Add scalar data to summary.
+
+        Args:
+            tag (string): Data identifier
+            scalar_value (float or string/blobname): Value to save. A string is
+              taken as a Caffe2 blob name and fetched from the Caffe2 workspace.
+            global_step (int): Global step value to record
+            walltime (float): Optional override default walltime (time.time()) of event
+
+        Examples::
+
+            from tensorboardX import SummaryWriter
+            writer = SummaryWriter()
+            x = range(100)
+            for i in x:
+                writer.add_scalar('y=2x', i * 2, i)
+            writer.close()
+
+        Expected result:
+
+        .. image:: _static/img/tensorboard/add_scalar.png
+           :scale: 50 %
+        """
+        if self._check_caffe2_blob(scalar_value):
+            from caffe2.python import workspace  # not imported at module level
+            scalar_value = workspace.FetchBlob(scalar_value)
+        self._get_file_writer().add_summary(scalar(tag, scalar_value), global_step, walltime)
+
+    def add_scalars(self, main_tag, tag_scalar_dict, global_step=None, walltime=None):
+        """Adds many scalar data to summary.
+
+        Each tag is written by its own writer under ``<logdir>/<main_tag>/<tag>``. The logged scalars are
+        also kept in memory (see ``export_scalars_to_json``); in extreme cases this uses a lot of RAM.
+
+        Args:
+            main_tag (string): The parent name for the tags
+            tag_scalar_dict (dict): Key-value pair storing the tag and corresponding values;
+              string values are taken as Caffe2 blob names and fetched from the workspace.
+            global_step (int): Global step value to record
+            walltime (float): Optional override default walltime (time.time()) of event
+
+        Examples::
+
+            from tensorboardX import SummaryWriter
+            import numpy as np
+            writer = SummaryWriter()
+            r = 5
+            for i in range(100):
+                writer.add_scalars('run_14h', {'xsinx':i*np.sin(i/r),
+                                                'xcosx':i*np.cos(i/r),
+                                                'tanx': np.tan(i/r)}, i)
+            writer.close()
+            # This call adds three values to the same scalar plot with the tag
+            # 'run_14h' in TensorBoard's scalar section.
+
+        .. image:: _static/img/tensorboard/add_scalars.png
+           :scale: 50 %
+        """
+        walltime = time.time() if walltime is None else walltime
+        fw_logdir = self._get_file_writer().get_logdir()
+        for tag, scalar_value in tag_scalar_dict.items():
+            # Each tag gets its own writer, keyed by its sub-directory.
+            fw_tag = fw_logdir + "/" + main_tag + "/" + tag
+            fw = self.all_writers.get(fw_tag)
+            if fw is None:
+                # Honor write_to_disk=False for the per-tag writers as well.
+                writer_cls = FileWriter if self._write_to_disk else DummyFileWriter
+                fw = self.all_writers[fw_tag] = writer_cls(logdir=fw_tag)
+            if self._check_caffe2_blob(scalar_value):
+                from caffe2.python import workspace  # not imported at module level
+                scalar_value = workspace.FetchBlob(scalar_value)
+            fw.add_summary(scalar(main_tag, scalar_value), global_step, walltime)
+            self.__append_to_scalar_dict(fw_tag, scalar_value, global_step, walltime)
+
+    def export_scalars_to_json(self, path):
+        """Dump every scalar recorded by ``add_scalars()`` to a JSON file at ``path``.
+
+        The file holds one object of the form
+        {writer_id : [[timestamp, step, value], ...], ...}
+        and the in-memory scalar record is emptied once the file is written.
+        """
+        with open(path, "w") as json_file:
+            json.dump(self.scalar_dict, json_file)
+        self.scalar_dict = {}
+
+    def add_histogram(self, tag, values, global_step=None, bins='tensorflow', walltime=None, max_bins=None):
+        """Add histogram to summary.
+
+        Args:
+            tag (string): Data identifier
+            values (torch.Tensor, numpy.array, or string/blobname): Values to build histogram
+            global_step (int): Global step value to record
+            bins (string): One of {'tensorflow','auto', 'fd', ...}. This determines how the bins are made. You can find
+              other options in: https://docs.scipy.org/doc/numpy/reference/generated/numpy.histogram.html
+            walltime (float): Optional override default walltime (time.time()) of event
+            max_bins: Passed through unchanged to the ``histogram`` summary builder.
+
+        Examples::
+
+            from tensorboardX import SummaryWriter
+            import numpy as np
+            writer = SummaryWriter()
+            for i in range(10):
+                x = np.random.random(1000)
+                writer.add_histogram('distribution centers', x + i, i)
+            writer.close()
+
+        Expected result:
+
+        .. image:: _static/img/tensorboard/add_histogram.png
+           :scale: 50 %
+        """
+        if self._check_caffe2_blob(values):
+            from caffe2.python import workspace  # not imported at module level
+            values = workspace.FetchBlob(values)
+        if isinstance(bins, six.string_types) and bins == 'tensorflow':
+            bins = self.default_bins
+        self._get_file_writer().add_summary(histogram(tag, values, bins, max_bins=max_bins), global_step, walltime)
+
+    def add_histogram_raw(self, tag, min, max, num, sum, sum_squares,
+                          bucket_limits, bucket_counts, global_step=None,
+                          walltime=None):
+        """Adds histogram with raw data.
+
+        Args:
+            tag (string): Data identifier
+            min (float or int): Min value
+            max (float or int): Max value
+            num (int): Number of values
+            sum (float or int): Sum of all values
+            sum_squares (float or int): Sum of squares for all values
+            bucket_limits (torch.Tensor, numpy.array): Upper value per
+              bucket, note that the bucket_limits returned from `np.histogram`
+              has one more element. See the comment in the following example.
+            bucket_counts (torch.Tensor, numpy.array): Number of values per bucket;
+              a ValueError is raised when its length differs from ``bucket_limits``.
+            global_step (int): Global step value to record
+            walltime (float): Optional override default walltime (time.time()) of event
+
+        Examples::
+
+            import numpy as np
+            dummy_data = []
+            for idx, value in enumerate(range(30)):
+                dummy_data += [idx + 0.001] * value
+            values = np.array(dummy_data).astype(float).reshape(-1)
+            counts, limits = np.histogram(values)
+            sum_sq = values.dot(values)
+            with SummaryWriter() as summary_writer:
+                summary_writer.add_histogram_raw(
+                        tag='hist_dummy_data',
+                        min=values.min(),
+                        max=values.max(),
+                        num=len(values),
+                        sum=values.sum(),
+                        sum_squares=sum_sq,
+                        bucket_limits=limits[1:].tolist(),  # <- note here.
+                        bucket_counts=counts.tolist(),
+                        global_step=0)
+        """
+        if len(bucket_limits) != len(bucket_counts):
+            raise ValueError('len(bucket_limits) != len(bucket_counts), see the document.')
+
+        # Look the writer up first, then build the summary and hand it over.
+        emit = self._get_file_writer().add_summary
+        summary = histogram_raw(
+            tag,
+            min, max,
+            num,
+            sum, sum_squares,
+            bucket_limits, bucket_counts,
+        )
+        emit(summary, global_step, walltime)
+
+    def add_image(self, tag, img_tensor, global_step=None, walltime=None, dataformats='CHW'):
+        """Add image data to summary.
+
+        Note that this requires the ``pillow`` package.
+
+        Args:
+            tag (string): Data identifier
+            img_tensor (torch.Tensor, numpy.array, or string/blobname): An `uint8` or `float`
+                Tensor of shape `[channel, height, width]` where `channel` is 1, 3, or 4.
+                The elements in img_tensor can either have values in [0, 1] (float32) or [0, 255] (uint8).
+                Users are responsible to scale the data in the correct range/type.
+                A string is taken as a Caffe2 blob name and fetched from the Caffe2 workspace.
+            global_step (int): Global step value to record
+            walltime (float): Optional override default walltime (time.time()) of event.
+            dataformats (string): This parameter specifies the meaning of each dimension of the input tensor.
+        Shape:
+            img_tensor: Default is :math:`(3, H, W)`. You can use ``torchvision.utils.make_grid()`` to
+            convert a batch of tensor into 3xHxW format or use ``add_images()`` and let us do the job.
+            Tensor with :math:`(1, H, W)`, :math:`(H, W)`, :math:`(H, W, 3)` is also suitable as long as
+            corresponding ``dataformats`` argument is passed. e.g. CHW, HWC, HW.
+
+        Examples::
+
+            from tensorboardX import SummaryWriter
+            import numpy as np
+            img = np.zeros((3, 100, 100))
+            img[0] = np.arange(0, 10000).reshape(100, 100) / 10000
+            img[1] = 1 - np.arange(0, 10000).reshape(100, 100) / 10000
+
+            img_HWC = np.zeros((100, 100, 3))
+            img_HWC[:, :, 0] = np.arange(0, 10000).reshape(100, 100) / 10000
+            img_HWC[:, :, 1] = 1 - np.arange(0, 10000).reshape(100, 100) / 10000
+
+            writer = SummaryWriter()
+            writer.add_image('my_image', img, 0)
+
+            # If you have non-default dimension setting, set the dataformats argument.
+            writer.add_image('my_image_HWC', img_HWC, 0, dataformats='HWC')
+            writer.close()
+
+        Expected result:
+
+        .. image:: _static/img/tensorboard/add_image.png
+           :scale: 50 %
+        """
+        if self._check_caffe2_blob(img_tensor):
+            from caffe2.python import workspace  # not imported at module level
+            img_tensor = workspace.FetchBlob(img_tensor)
+        self._get_file_writer().add_summary(image(tag, img_tensor, dataformats=dataformats), global_step, walltime)
+
+    def add_images(self, tag, img_tensor, global_step=None, walltime=None, dataformats='NCHW'):
+        """Add batched (4D) image data to summary.
+        Besides passing 4D (NCHW) tensor, you can also pass a list of tensors of the same size.
+        In this case, the ``dataformats`` should be `CHW` or `HWC`.
+        Note that this requires the ``pillow`` package.
+
+        Args:
+            tag (string): Data identifier
+            img_tensor (torch.Tensor, numpy.array, or string/blobname): Image data
+                The elements in img_tensor can either have values in [0, 1] (float32) or [0, 255] (uint8).
+                Users are responsible to scale the data in the correct range/type.
+            global_step (int): Global step value to record
+            walltime (float): Optional override default walltime (time.time()) of event
+            dataformats (string): Layout of img_tensor, e.g. NCHW or NHWC; CHW or HWC for a list of tensors.
+        Shape:
+            img_tensor: Default is :math:`(N, 3, H, W)`. If ``dataformats`` is specified, other shape will be
+            accepted. e.g. NCHW or NHWC.
+
+        Examples::
+
+            from tensorboardX import SummaryWriter
+            import numpy as np
+
+            img_batch = np.zeros((16, 3, 100, 100))
+            for i in range(16):
+                img_batch[i, 0] = np.arange(0, 10000).reshape(100, 100) / 10000 / 16 * i
+                img_batch[i, 1] = (1 - np.arange(0, 10000).reshape(100, 100) / 10000) / 16 * i
+
+            writer = SummaryWriter()
+            writer.add_images('my_image_batch', img_batch, 0)
+            writer.close()
+
+        Expected result:
+
+        .. image:: _static/img/tensorboard/add_images.png
+           :scale: 30 %
+        """
+        if self._check_caffe2_blob(img_tensor):
+            from caffe2.python import workspace  # not imported at module level
+            img_tensor = workspace.FetchBlob(img_tensor)
+        if isinstance(img_tensor, list):  # a list of tensors in CHW or HWC
+            if dataformats.upper() not in ('CHW', 'HWC'):
+                print('A list of image is passed, but the dataformat is neither CHW nor HWC.')
+                print('Nothing is written.')
+                return
+            try:
+                # torch is optional: fall back to numpy when it is missing or
+                # when the list does not hold torch tensors.
+                import torch
+                img_tensor = torch.stack(img_tensor, 0)
+            except (ImportError, TypeError):
+                import numpy as np
+                img_tensor = np.stack(img_tensor, 0)
+            dataformats = 'N' + dataformats
+        self._get_file_writer().add_summary(image(tag, img_tensor, dataformats=dataformats), global_step, walltime)
+
+    def add_image_with_boxes(self, tag, img_tensor, box_tensor, global_step=None,
+                             walltime=None, dataformats='CHW', labels=None, **kwargs):
+        """Add image and draw bounding boxes on the image.
+
+        Args:
+            tag (string): Data identifier
+            img_tensor (torch.Tensor, numpy.array, or string/blobname): Image data
+            box_tensor (torch.Tensor, numpy.array, or string/blobname): Box data (for detected objects)
+              box should be represented as [x1, y1, x2, y2].
+            global_step (int): Global step value to record
+            walltime (float): Optional override default walltime (time.time()) of event
+            labels (string or list of string): Text shown on each bounding box; dropped with a
+              warning when the number of labels differs from the number of boxes.
+        Shape:
+            img_tensor: Default is :math:`(3, H, W)`; use ``dataformats`` for other layouts, e.g. HWC.
+            box_tensor: NX4, where N is the number of boxes and each 4 elements in a row
+            represents (xmin, ymin, xmax, ymax).
+        """
+        if self._check_caffe2_blob(img_tensor):
+            from caffe2.python import workspace  # not imported at module level
+            img_tensor = workspace.FetchBlob(img_tensor)
+        if self._check_caffe2_blob(box_tensor):
+            from caffe2.python import workspace  # not imported at module level
+            box_tensor = workspace.FetchBlob(box_tensor)
+        if labels is not None:
+            labels = [labels] if isinstance(labels, six.string_types) else labels
+            if len(labels) != box_tensor.shape[0]:
+                logging.warning('Number of labels do not equal to number of box, skip the labels.')
+                labels = None
+        self._get_file_writer().add_summary(image_boxes(
+            tag, img_tensor, box_tensor, dataformats=dataformats, labels=labels, **kwargs), global_step, walltime)
+
+    def add_figure(self, tag, figure, global_step=None, close=True, walltime=None):
+        """Render matplotlib figure into an image and add it to summary.
+
+        Note that this requires the ``matplotlib`` package.
+
+        Args:
+            tag (string): Data identifier
+            figure (matplotlib.pyplot.figure or list of figures): Figure or a list of figures;
+              a list is logged with the 'NCHW' layout, a single figure with 'CHW'.
+            global_step (int): Global step value to record
+            close (bool): Flag to automatically close the figure
+            walltime (float): Optional override default walltime (time.time()) of event
+        """
+        layout = 'NCHW' if isinstance(figure, list) else 'CHW'
+        rendered = figure_to_image(figure, close)
+        self.add_image(tag, rendered, global_step, walltime, dataformats=layout)
+
+    def add_video(self, tag, vid_tensor, global_step=None, fps=4, walltime=None):
+        """Add video data to summary.
+
+        Note that this requires the ``moviepy`` package.
+
+        Args:
+            tag (string): Data identifier
+            vid_tensor (torch.Tensor): Video data
+            global_step (int): Global step value to record
+            fps (float or int): Frames per second
+            walltime (float): Optional override default walltime (time.time()) of event
+        Shape:
+            vid_tensor: :math:`(N, T, C, H, W)`. Values are expected in [0, 255] for
+              type `uint8` and in [0, 1] for type `float`.
+        """
+        emit = self._get_file_writer().add_summary
+        emit(video(tag, vid_tensor, fps), global_step, walltime)
+
+    def add_audio(self, tag, snd_tensor, global_step=None, sample_rate=44100, walltime=None):
+        """Add audio data to summary.
+
+        Args:
+            tag (string): Data identifier
+            snd_tensor (torch.Tensor or string/blobname): Sound data; a string is fetched as a Caffe2 blob
+            global_step (int): Global step value to record
+            sample_rate (int): sample rate in Hz
+            walltime (float): Optional override default walltime (time.time()) of event
+        Shape:
+            snd_tensor: :math:`(L, c)`. The values should lie between [-1, 1].
+        """
+        if self._check_caffe2_blob(snd_tensor):
+            from caffe2.python import workspace  # not imported at module level
+            snd_tensor = workspace.FetchBlob(snd_tensor)
+        self._get_file_writer().add_summary(audio(tag, snd_tensor, sample_rate=sample_rate), global_step, walltime)
+
+    def add_text(self, tag, text_string, global_step=None, walltime=None):
+        """Add text data to summary.
+
+        Args:
+            tag (string): Data identifier
+            text_string (string): String to save
+            global_step (int): Global step value to record
+            walltime (float): Optional override default walltime (time.time()) of event
+        Examples::
+
+            writer.add_text('lstm', 'This is an lstm', 0)
+            writer.add_text('rnn', 'This is an rnn', 10)
+        """
+        self._get_file_writer().add_summary(
+            text(tag, text_string), global_step, walltime)
+
+    def add_onnx_graph(self, prototxt):  # loads ``prototxt`` via load_onnx_graph() and writes the result
+        self._get_file_writer().add_onnx_graph(load_onnx_graph(prototxt))
+
+    def add_graph(self, model, input_to_model=None, verbose=False, **kwargs):
+        # prohibit second call?
+        # no, let tensorboard handle it and show its warning message.
+        """Add graph data to summary.
+
+        Args:
+            model (torch.nn.Module): Model to draw.
+            input_to_model (torch.Tensor or list of torch.Tensor): A variable or a tuple of
+                variables to be fed.
+            verbose (bool): Whether to print graph structure in console.
+            omit_useless_nodes (bool): Default to ``true``, which eliminates unused nodes.
+            operator_export_type (string): One of: ``"ONNX"``, ``"RAW"``. This determines
+                the optimization level of the graph. If error happens during exporting
+                the graph, using ``"RAW"`` might help.
+
+        """
+        if hasattr(model, 'forward'):
+            # A valid PyTorch model should have a 'forward' method
+            import torch
+            from distutils.version import LooseVersion
+            if LooseVersion(torch.__version__) >= LooseVersion("0.3.1"):
+                pass
+            else:
+                if LooseVersion(torch.__version__) >= LooseVersion("0.3.0"):
+                    print('You are using PyTorch==0.3.0, use add_onnx_graph()')
+                    return
+                if not hasattr(torch.autograd.Variable, 'grad_fn'):
+                    print('add_graph() only supports PyTorch v0.2.')
+                    return
+            self._get_file_writer().add_graph(graph(model, input_to_model, verbose, **kwargs))
+        else:
+            # Caffe2 models do not have the 'forward' method
+            from caffe2.proto import caffe2_pb2
+            from caffe2.python import core
+            from .caffe2_graph import (
+                model_to_graph_def, nets_to_graph_def, protos_to_graph_def
+            )
+            if isinstance(model, list):
+                if isinstance(model[0], core.Net):
+                    current_graph = nets_to_graph_def(
+                        model, **kwargs)
+                elif isinstance(model[0], caffe2_pb2.NetDef):
+                    current_graph = protos_to_graph_def(
+                        model, **kwargs)
+            else:
+                # Handles cnn.CNNModelHelper, model_helper.ModelHelper
+                current_graph = model_to_graph_def(
+                    model, **kwargs)
+            event = event_pb2.Event(
+                graph_def=current_graph.SerializeToString())
+            self._get_file_writer().add_event(event)
+
+    @staticmethod
+    def _encode(rawstr):
+        # I'd use urllib but, I'm unsure about the differences from python3 to python2, etc.
+        retval = rawstr
+        retval = retval.replace("%", "%%%02x" % (ord("%")))
+        retval = retval.replace("/", "%%%02x" % (ord("/")))
+        retval = retval.replace("\\", "%%%02x" % (ord("\\")))
+        return retval
+
+    def add_embedding(self, mat, metadata=None, label_img=None, global_step=None, tag='default', metadata_header=None):
+        """Add embedding projector data to summary.
+
+        Args:
+            mat (torch.Tensor or numpy.array): A matrix which each row is the feature vector of the data point
+            metadata (list): A list of labels, each element will be convert to string
+            label_img (torch.Tensor or numpy.array): Images correspond to each data point. Each image should be square.
+            global_step (int): Global step value to record
+            tag (string): Name for the embedding
+        Shape:
+            mat: :math:`(N, D)`, where N is number of data and D is feature dimension
+
+            label_img: :math:`(N, C, H, W)`, where `Height` should be equal to `Width`.
+
+        Examples::
+
+            import keyword
+            import torch
+            meta = []
+            while len(meta)<100:
+                meta = meta+keyword.kwlist # get some strings
+            meta = meta[:100]
+
+            for i, v in enumerate(meta):
+                meta[i] = v+str(i)
+
+            label_img = torch.rand(100, 3, 32, 32)
+            for i in range(100):
+                label_img[i]*=i/100.0
+
+            writer.add_embedding(torch.randn(100, 5), metadata=meta, label_img=label_img)
+            writer.add_embedding(torch.randn(100, 5), label_img=label_img)
+            writer.add_embedding(torch.randn(100, 5), metadata=meta)
+        """
+        from .x2num import make_np
+        mat = make_np(mat)
+        if global_step is None:
+            global_step = 0
+            # clear pbtxt?
+        # Maybe we should encode the tag so slashes don't trip us up?
+        # I don't think this will mess us up, but better safe than sorry.
+        subdir = "%s/%s" % (str(global_step).zfill(5), self._encode(tag))
+        save_path = os.path.join(self._get_file_writer().get_logdir(), subdir)
+        try:
+            os.makedirs(save_path)
+        except OSError:
+            print(
+                'warning: Embedding dir exists, did you set global_step for add_embedding()?')
+        if metadata is not None:
+            assert mat.shape[0] == len(
+                metadata), '#labels should equal with #data points'
+            make_tsv(metadata, save_path, metadata_header=metadata_header)
+        if label_img is not None:
+            assert mat.shape[0] == label_img.shape[0], '#images should equal with #data points'
+            assert label_img.shape[2] == label_img.shape[3], 'Image should be square, see tensorflow/tensorboard#670'
+            make_sprite(label_img, save_path)
+        assert mat.ndim == 2, 'mat should be 2D, where mat.size(0) is the number of data points'
+        make_mat(mat, save_path)
+        # new funcion to append to the config file a new embedding
+        append_pbtxt(metadata, label_img,
+                     self._get_file_writer().get_logdir(), subdir, global_step, tag)
+
+    def add_pr_curve(self, tag, labels, predictions, global_step=None,
+                     num_thresholds=127, weights=None, walltime=None):
+        """Adds precision recall curve.
+        Plotting a precision-recall curve lets you understand your model's
+        performance under different threshold settings. With this function,
+        you provide the ground truth labeling (T/F) and prediction confidence
+        (usually the output of your model) for each target. The TensorBoard UI
+        will let you choose the threshold interactively.
+
+        Args:
+            tag (string): Data identifier
+            labels (torch.Tensor, numpy.array, or string/blobname):
+              Ground truth data. Binary label for each element.
+            predictions (torch.Tensor, numpy.array, or string/blobname):
+              The probability that an element be classified as true.
+              Value should in [0, 1]
+            global_step (int): Global step value to record
+            num_thresholds (int): Number of thresholds used to draw the curve.
+            walltime (float): Optional override default walltime (time.time()) of event
+
+        Examples::
+
+            from tensorboardX import SummaryWriter
+            import numpy as np
+            labels = np.random.randint(2, size=100)  # binary label
+            predictions = np.random.rand(100)
+            writer = SummaryWriter()
+            writer.add_pr_curve('pr_curve', labels, predictions, 0)
+            writer.close()
+
+        """
+        from .x2num import make_np
+        labels, predictions = make_np(labels), make_np(predictions)
+        self._get_file_writer().add_summary(
+            pr_curve(tag, labels, predictions, num_thresholds, weights),
+            global_step, walltime)
+
+    def add_pr_curve_raw(self, tag, true_positive_counts,
+                         false_positive_counts,
+                         true_negative_counts,
+                         false_negative_counts,
+                         precision,
+                         recall,
+                         global_step=None,
+                         num_thresholds=127,
+                         weights=None,
+                         walltime=None):
+        """Adds precision recall curve with raw data.
+
+        Args:
+            tag (string): Data identifier
+            true_positive_counts (torch.Tensor, numpy.array, or string/blobname): true positive counts
+            false_positive_counts (torch.Tensor, numpy.array, or string/blobname): false positive counts
+            true_negative_counts (torch.Tensor, numpy.array, or string/blobname): true negative counts
+            false_negative_counts (torch.Tensor, numpy.array, or string/blobname): false negative counts
+            precision (torch.Tensor, numpy.array, or string/blobname): precision
+            recall (torch.Tensor, numpy.array, or string/blobname): recall
+            global_step (int): Global step value to record
+            num_thresholds (int): Number of thresholds used to draw the curve.
+            walltime (float): Optional override default walltime (time.time()) of event
+            see: https://github.com/tensorflow/tensorboard/blob/master/tensorboard/plugins/pr_curve/README.md
+        """
+        self._get_file_writer().add_summary(
+            pr_curve_raw(tag,
+                         true_positive_counts,
+                         false_positive_counts,
+                         true_negative_counts,
+                         false_negative_counts,
+                         precision,
+                         recall,
+                         num_thresholds,
+                         weights),
+            global_step,
+            walltime)
+
+    def add_custom_scalars_multilinechart(self, tags, category='default', title='untitled'):
+        """Shorthand for creating multilinechart. Similar to ``add_custom_scalars()``, but the only necessary argument
+        is *tags*.
+
+        Args:
+            tags (list): list of tags that have been used in ``add_scalar()``
+
+        Examples::
+
+            writer.add_custom_scalars_multilinechart(['twse/0050', 'twse/2330'])
+        """
+        layout = {category: {title: ['Multiline', tags]}}
+        self._get_file_writer().add_summary(custom_scalars(layout))
+
+    def add_custom_scalars_marginchart(self, tags, category='default', title='untitled'):
+        """Shorthand for creating marginchart. Similar to ``add_custom_scalars()``, but the only necessary argument
+        is *tags*, which should have exactly 3 elements.
+
+        Args:
+            tags (list): list of tags that have been used in ``add_scalar()``
+
+        Examples::
+
+            writer.add_custom_scalars_marginchart(['twse/0050', 'twse/2330', 'twse/2006'])
+        """
+        assert len(tags) == 3
+        layout = {category: {title: ['Margin', tags]}}
+        self._get_file_writer().add_summary(custom_scalars(layout))
+
+    def add_custom_scalars(self, layout):
+        """Create special chart by collecting charts tags in 'scalars'. Note that this function can only be called once
+        for each SummaryWriter() object. Because it only provides metadata to tensorboard, the function can be called
+        before or after the training loop. See ``examples/demo_custom_scalars.py`` for more.
+
+        Args:
+            layout (dict): {categoryName: *charts*}, where *charts* is also a dictionary
+              {chartName: *ListOfProperties*}. The first element in *ListOfProperties* is the chart's type
+              (one of **Multiline** or **Margin**) and the second element should be a list containing the tags
+              you have used in add_scalar function, which will be collected into the new chart.
+
+        Examples::
+
+            layout = {'Taiwan':{'twse':['Multiline',['twse/0050', 'twse/2330']]},
+                         'USA':{ 'dow':['Margin',   ['dow/aaa', 'dow/bbb', 'dow/ccc']],
+                              'nasdaq':['Margin',   ['nasdaq/aaa', 'nasdaq/bbb', 'nasdaq/ccc']]}}
+
+            writer.add_custom_scalars(layout)
+        """
+        self._get_file_writer().add_summary(custom_scalars(layout))
+
+    def add_mesh(self, tag, vertices, colors=None, faces=None, config_dict=None, global_step=None, walltime=None):
+        """Add meshes or 3D point clouds to TensorBoard. The visualization is based on Three.js,
+        so it allows users to interact with the rendered object. Besides the basic definitions
+        such as vertices, faces, users can further provide camera parameter, lighting condition, etc.
+        Please check https://threejs.org/docs/index.html#manual/en/introduction/Creating-a-scene for
+        advanced usage. Note that currently this depends on tb-nightly to show.
+
+        Args:
+            tag (string): Data identifier
+            vertices (torch.Tensor): List of the 3D coordinates of vertices.
+            colors (torch.Tensor): Colors for each vertex
+            faces (torch.Tensor): Indices of vertices within each triangle. (Optional)
+            config_dict: Dictionary with ThreeJS classes names and configuration.
+            global_step (int): Global step value to record
+            walltime (float): Optional override default walltime (time.time())
+              seconds after epoch of event
+
+        Shape:
+            vertices: :math:`(B, N, 3)`. (batch, number_of_vertices, channels). If you see nothing on
+              tensorboard, try normalizing the values to [-1, 1].
+
+            colors: :math:`(B, N, 3)`. The values should lie in [0, 255].
+
+            faces: :math:`(B, N, 3)`. The values should lie in [0, number_of_vertices] for type `uint8`.
+
+        Examples::
+
+            from tensorboardX import SummaryWriter
+            vertices_tensor = np.array([[
+                [1, 1, 1],
+                [-1, -1, 1],
+                [1, -1, -1],
+                [-1, 1, -1],
+            ]], dtype=float)
+            colors_tensor = np.array([[
+                [255, 0, 0],
+                [0, 255, 0],
+                [0, 0, 255],
+                [255, 0, 255],
+            ]], dtype=int)
+            faces_tensor = np.array([[
+                [0, 2, 3],
+                [0, 3, 1],
+                [0, 1, 2],
+                [1, 3, 2],
+            ]], dtype=int)
+
+            writer = SummaryWriter()
+            writer.add_mesh('my_mesh', vertices=vertices_tensor, colors=colors_tensor, faces=faces_tensor)
+
+            writer.close()
+        """
+        self._get_file_writer().add_summary(mesh(tag, vertices, colors, faces, config_dict), global_step, walltime)
+
+    def close(self):
+        if self.all_writers is None:
+            return  # ignore double close
+        for writer in self.all_writers.values():
+            writer.flush()
+            writer.close()
+        self.file_writer = self.all_writers = None
+
+    def flush(self):
+        if self.all_writers is None:
+            return  # ignore double close
+        for writer in self.all_writers.values():
+            writer.flush()
+
+    def __enter__(self):  # context-manager entry: ``with ... as w`` binds this same object
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):  # always closes; returns None, so exceptions propagate
+        self.close()
diff --git a/tensorboardX/tensorboardX/x2num.py b/tensorboardX/tensorboardX/x2num.py
new file mode 100644
index 0000000000000000000000000000000000000000..e65805e6bf813ba6cc246c5269ee6d7ea3ab05c1
--- /dev/null
+++ b/tensorboardX/tensorboardX/x2num.py
@@ -0,0 +1,64 @@
+# DO NOT alter/destroy/free the input object!
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import logging
+import numpy as np
+import six
+
+
+def check_nan(array):
+    tmp = np.sum(array)
+    if np.isnan(tmp) or np.isinf(tmp):
+        logging.warning('NaN or Inf found in input tensor.')
+    return array
+
+
+def make_np(x):
+    if isinstance(x, list):
+        return check_nan(np.array(x))
+    if isinstance(x, np.ndarray):
+        return check_nan(x)
+    if isinstance(x, six.string_types):  # Caffe2 will pass name of blob(s) to fetch
+        return check_nan(prepare_caffe2(x))
+    if np.isscalar(x):
+        return check_nan(np.array([x]))
+    if 'torch' in str(type(x)):
+        return check_nan(prepare_pytorch(x))
+    if 'chainer' in str(type(x)):
+        return check_nan(prepare_chainer(x))
+    if 'mxnet' in str(type(x)):
+        return check_nan(prepare_mxnet(x))
+    raise NotImplementedError(
+        'Got {}, but expected numpy array or torch tensor.'.format(type(x)))
+
+
+def prepare_pytorch(x):
+    import torch
+    if isinstance(x, torch.autograd.Variable):
+        x = x.data
+    x = x.cpu().numpy()
+    return x
+
+
+def prepare_theano(x):  # unimplemented placeholder: performs no conversion and returns None
+    import theano  # raises ImportError when theano is not installed
+    pass
+
+
+def prepare_caffe2(x):
+    from caffe2.python import workspace
+    x = workspace.FetchBlob(x)
+    return x
+
+
+def prepare_mxnet(x):
+    x = x.asnumpy()
+    return x
+
+
+def prepare_chainer(x):
+    import chainer
+    x = chainer.cuda.to_cpu(x.data)
+    return x
diff --git a/tensorboardX/tests/__init__.py b/tensorboardX/tests/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..affa7c0058a34284be75785983dfe3ee8798b33c
--- /dev/null
+++ b/tensorboardX/tests/__init__.py
@@ -0,0 +1,2 @@
+import torch
+import tensorboardX.proto
diff --git a/tensorboardX/tests/event_file_writer_test.py b/tensorboardX/tests/event_file_writer_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..e80b1e015cb93e51e546a680d16f59c1b7d673b8
--- /dev/null
+++ b/tensorboardX/tests/event_file_writer_test.py
@@ -0,0 +1,139 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# """Tests for EventFileWriter and _AsyncWriter"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+import glob
+import os
+from tensorboardX.event_file_writer import EventFileWriter
+from tensorboardX.event_file_writer import EventFileWriter as _AsyncWriter
+
+
+from tensorboardX.proto import event_pb2
+from tensorboardX.proto.summary_pb2 import Summary
+
+from tensorboard.compat.tensorflow_stub.pywrap_tensorflow import PyRecordReader_New
+import unittest
+
+
+class EventFileWriterTest(unittest.TestCase):
+  def get_temp_dir(self):
+    import tempfile
+    return tempfile.mkdtemp()
+
+  def test_event_file_writer_roundtrip(self):
+    _TAGNAME = 'dummy'
+    _DUMMY_VALUE = 42
+    logdir = self.get_temp_dir()
+    w = EventFileWriter(logdir)
+    summary = Summary(value=[Summary.Value(tag=_TAGNAME, simple_value=_DUMMY_VALUE)])
+    fakeevent = event_pb2.Event(summary=summary)
+    w.add_event(fakeevent)
+    w.close()
+    event_files = sorted(glob.glob(os.path.join(logdir, '*')))
+    self.assertEqual(len(event_files), 1)
+    r = PyRecordReader_New(event_files[0])
+    r.GetNext()  # meta data, so skip
+    r.GetNext()
+    self.assertEqual(fakeevent.SerializeToString(), r.record())
+
+  def test_setting_filename_suffix_works(self):
+    logdir = self.get_temp_dir()
+
+    w = EventFileWriter(logdir, filename_suffix='.event_horizon')
+    w.close()
+    event_files = sorted(glob.glob(os.path.join(logdir, '*')))
+    self.assertEqual(event_files[0].split('.')[-1], 'event_horizon')
+
+  def test_async_writer_without_write(self):
+    logdir = self.get_temp_dir()
+    w = EventFileWriter(logdir)
+    w.close()
+    event_files = sorted(glob.glob(os.path.join(logdir, '*')))
+    r = PyRecordReader_New(event_files[0])
+    r.GetNext()
+    s = event_pb2.Event.FromString(r.record())
+    self.assertEqual(s.file_version, "brain.Event:2")
+
+
+# Skip these tests, because tensorboard's implementation of the file writer
+# writes raw data while the one in tensorboardX writes event protobufs.
+class AsyncWriterTest():  # deliberately not a unittest.TestCase, so the runner never collects these tests
+  def get_temp_dir(self):
+    import tempfile
+    return tempfile.mkdtemp()
+
+  def test_async_writer_write_once(self):
+    foldername = os.path.join(self.get_temp_dir(), "async_writer_write_once")
+    w = _AsyncWriter(foldername)
+    filename = w._ev_writer._file_name  # reaches into private attributes to learn the event file path
+    bytes_to_write = b"hello world"
+    w.add_event(bytes_to_write)
+    w.close()
+    with open(filename, 'rb') as f:
+      self.assertEqual(f.read(), bytes_to_write)
+
+  def test_async_writer_write_queue_full(self):
+    filename = os.path.join(self.get_temp_dir(), "async_writer_write_queue_full")
+    w = _AsyncWriter(filename)  # _AsyncWriter is this file's import alias for EventFileWriter
+    bytes_to_write = b"hello world"
+    repeat = 100
+    for i in range(repeat):
+      w.write(bytes_to_write)  # NOTE(review): the enabled tests only use add_event(); confirm write() exists
+    w.close()
+    with open(filename, 'rb') as f:
+      self.assertEqual(f.read(), bytes_to_write * repeat)
+
+  def test_async_writer_write_one_slot_queue(self):
+    filename = os.path.join(self.get_temp_dir(), "async_writer_write_one_slot_queue")
+    w = _AsyncWriter(filename, max_queue_size=1)
+    bytes_to_write = b"hello world"
+    repeat = 10  # faster
+    for i in range(repeat):
+      w.write(bytes_to_write)
+    w.close()
+    with open(filename, 'rb') as f:
+      self.assertEqual(f.read(), bytes_to_write * repeat)
+
+  def test_async_writer_close_triggers_flush(self):
+    filename = os.path.join(self.get_temp_dir(), "async_writer_close_triggers_flush")
+    w = _AsyncWriter(filename)
+    bytes_to_write = b"x" * 64
+    w.write(bytes_to_write)
+    w.close()
+    with open(filename, 'rb') as f:
+      self.assertEqual(f.read(), bytes_to_write)
+
+  def test_write_after_async_writer_closed(self):
+    filename = os.path.join(self.get_temp_dir(), "write_after_async_writer_closed")
+    w = _AsyncWriter(filename)
+    bytes_to_write = b"x" * 64
+    w.write(bytes_to_write)
+    w.close()
+
+    with self.assertRaises(IOError):
+      w.write(bytes_to_write)
+    # nothing is written to the file after close
+    with open(filename, 'rb') as f:
+      self.assertEqual(f.read(), bytes_to_write)
+
+
+if __name__ == '__main__':  # run the test cases when this file is executed as a script
+  unittest.main()
diff --git a/tensorboardX/tests/expect/caffe_mnist.expect b/tensorboardX/tests/expect/caffe_mnist.expect
new file mode 100644
index 0000000000000000000000000000000000000000..0a566570530603da3235dcbeade5a380471ea584
--- /dev/null
+++ b/tensorboardX/tests/expect/caffe_mnist.expect
@@ -0,0 +1,892 @@
+node {
+  name: "conv1/XavierFill"
+  op: "XavierFill"
+  device: "/gpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 20
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 5
+          }
+          dim {
+            size: 5
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "conv1/ConstantFill"
+  op: "ConstantFill"
+  device: "/gpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 20
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "conv1/XavierFill_1"
+  op: "XavierFill"
+  device: "/gpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 20
+          }
+          dim {
+            size: 5
+          }
+          dim {
+            size: 5
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "conv1/ConstantFill_1"
+  op: "ConstantFill"
+  device: "/gpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "classifier/XavierFill"
+  op: "XavierFill"
+  device: "/gpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 500
+          }
+          dim {
+            size: 1600
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "classifier/ConstantFill"
+  op: "ConstantFill"
+  device: "/gpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 500
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "classifier/XavierFill_1"
+  op: "XavierFill"
+  device: "/gpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+          dim {
+            size: 500
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "classifier/ConstantFill_1"
+  op: "ConstantFill"
+  device: "/gpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "ImageInput"
+  op: "ImageInput"
+  input: "db"
+  device: "/gpu:0"
+  attr {
+    key: "cudnn_exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "is_test"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "use_cudnn"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "NHWC2NCHW"
+  op: "NHWC2NCHW"
+  input: "data_nhwc"
+  device: "/gpu:0"
+}
+node {
+  name: "conv1/Conv"
+  op: "Conv"
+  input: "data"
+  input: "conv1/conv1_w"
+  input: "conv1/conv1_b"
+  device: "/gpu:0"
+  attr {
+    key: "exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "kernel"
+    value {
+      i: 5
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+}
+node {
+  name: "conv1/MaxPool"
+  op: "MaxPool"
+  input: "conv1/conv1"
+  device: "/gpu:0"
+  attr {
+    key: "cudnn_exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "kernel"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+  attr {
+    key: "stride"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "conv1/Conv_1"
+  op: "Conv"
+  input: "conv1/pool1"
+  input: "conv1/conv2_w"
+  input: "conv1/conv2_b"
+  device: "/gpu:0"
+  attr {
+    key: "exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "kernel"
+    value {
+      i: 5
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+}
+node {
+  name: "conv1/MaxPool_1"
+  op: "MaxPool"
+  input: "conv1/conv2"
+  device: "/gpu:0"
+  attr {
+    key: "cudnn_exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "kernel"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+  attr {
+    key: "stride"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "classifier/FC"
+  op: "FC"
+  input: "conv1/pool2"
+  input: "classifier/fc3_w"
+  input: "classifier/fc3_b"
+  device: "/gpu:0"
+  attr {
+    key: "cudnn_exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+  attr {
+    key: "use_cudnn"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "classifier/Relu"
+  op: "Relu"
+  input: "classifier/fc3"
+  device: "/gpu:0"
+  attr {
+    key: "cudnn_exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+}
+node {
+  name: "classifier/FC_1"
+  op: "FC"
+  input: "classifier/fc3_1"
+  input: "classifier/pred_w"
+  input: "classifier/pred_b"
+  device: "/gpu:0"
+  attr {
+    key: "cudnn_exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+  attr {
+    key: "use_cudnn"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "classifier/Softmax"
+  op: "Softmax"
+  input: "classifier/pred"
+  device: "/gpu:0"
+  attr {
+    key: "cudnn_exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+}
+node {
+  name: "classifier/LabelCrossEntropy"
+  op: "LabelCrossEntropy"
+  input: "classifier/softmax"
+  input: "label"
+  device: "/gpu:0"
+}
+node {
+  name: "classifier/AveragedLoss"
+  op: "AveragedLoss"
+  input: "classifier/xent"
+  device: "/gpu:0"
+}
+node {
+  name: "GRADIENTS/classifier/ConstantFill"
+  op: "ConstantFill"
+  input: "classifier/loss"
+  device: "/gpu:0"
+  attr {
+    key: "value"
+    value {
+      f: 1.0
+    }
+  }
+}
+node {
+  name: "GRADIENTS/classifier/AveragedLossGradient"
+  op: "AveragedLossGradient"
+  input: "classifier/xent"
+  input: "GRADIENTS/classifier/loss_autogen_grad"
+  device: "/gpu:0"
+}
+node {
+  name: "GRADIENTS/classifier/LabelCrossEntropyGradient"
+  op: "LabelCrossEntropyGradient"
+  input: "classifier/softmax"
+  input: "label"
+  input: "GRADIENTS/classifier/xent_grad"
+  device: "/gpu:0"
+}
+node {
+  name: "GRADIENTS/classifier/SoftmaxGradient"
+  op: "SoftmaxGradient"
+  input: "classifier/softmax"
+  input: "GRADIENTS/classifier/softmax_grad"
+  device: "/gpu:0"
+  attr {
+    key: "cudnn_exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+}
+node {
+  name: "GRADIENTS/classifier/FCGradient"
+  op: "FCGradient"
+  input: "classifier/fc3_1"
+  input: "classifier/pred_w"
+  input: "GRADIENTS/classifier/pred_grad"
+  device: "/gpu:0"
+  attr {
+    key: "cudnn_exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+  attr {
+    key: "use_cudnn"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "GRADIENTS/classifier/ReluGradient"
+  op: "ReluGradient"
+  input: "classifier/fc3_1"
+  input: "GRADIENTS/classifier/fc3_grad"
+  device: "/gpu:0"
+  attr {
+    key: "cudnn_exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+}
+node {
+  name: "GRADIENTS/c/FCGradient"
+  op: "FCGradient"
+  input: "conv1/pool2"
+  input: "classifier/fc3_w"
+  input: "GRADIENTS/classifier/fc3_grad_1"
+  device: "/gpu:0"
+  attr {
+    key: "cudnn_exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+  attr {
+    key: "use_cudnn"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "GRADIENTS/conv1/MaxPoolGradient"
+  op: "MaxPoolGradient"
+  input: "conv1/conv2"
+  input: "conv1/pool2"
+  input: "GRADIENTS/conv1/pool2_grad"
+  device: "/gpu:0"
+  attr {
+    key: "cudnn_exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "kernel"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+  attr {
+    key: "stride"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "GRADIENTS/conv1/ConvGradient"
+  op: "ConvGradient"
+  input: "conv1/pool1"
+  input: "conv1/conv2_w"
+  input: "GRADIENTS/conv1/conv2_grad"
+  device: "/gpu:0"
+  attr {
+    key: "exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "kernel"
+    value {
+      i: 5
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+}
+node {
+  name: "GRADIENTS/conv1/MaxPoolGradient_1"
+  op: "MaxPoolGradient"
+  input: "conv1/conv1"
+  input: "conv1/pool1"
+  input: "GRADIENTS/conv1/pool1_grad"
+  device: "/gpu:0"
+  attr {
+    key: "cudnn_exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "kernel"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+  attr {
+    key: "stride"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "GRADIENTS/ConvGradient"
+  op: "ConvGradient"
+  input: "data"
+  input: "conv1/conv1_w"
+  input: "GRADIENTS/conv1/conv1_grad"
+  device: "/gpu:0"
+  attr {
+    key: "exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "kernel"
+    value {
+      i: 5
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+}
+node {
+  name: "GRADIENTS/NCHW2NHWC"
+  op: "NCHW2NHWC"
+  input: "GRADIENTS/data_grad"
+  device: "/gpu:0"
+}
+node {
+  name: "GRADIENTS/classifier/fc3_grad_1"
+  op: "Blob"
+  input: "GRADIENTS/classifier/ReluGradient:0"
+  device: "/gpu:0"
+}
+node {
+  name: "GRADIENTS/classifier/xent_grad"
+  op: "Blob"
+  input: "GRADIENTS/classifier/AveragedLossGradient:0"
+  device: "/gpu:0"
+}
+node {
+  name: "GRADIENTS/classifier/pred_w_grad"
+  op: "Blob"
+  input: "GRADIENTS/classifier/FCGradient:0"
+  device: "/gpu:0"
+}
+node {
+  name: "GRADIENTS/data_nhwc_grad"
+  op: "Blob"
+  input: "GRADIENTS/NCHW2NHWC:0"
+  device: "/gpu:0"
+}
+node {
+  name: "GRADIENTS/classifier/fc3_w_grad"
+  op: "Blob"
+  input: "GRADIENTS/c/FCGradient:0"
+  device: "/gpu:0"
+}
+node {
+  name: "GRADIENTS/conv1/conv1_grad"
+  op: "Blob"
+  input: "GRADIENTS/conv1/MaxPoolGradient_1:0"
+  device: "/gpu:0"
+}
+node {
+  name: "GRADIENTS/conv1/conv1_b_grad"
+  op: "Blob"
+  input: "GRADIENTS/ConvGradient:1"
+  device: "/gpu:0"
+}
+node {
+  name: "GRADIENTS/conv1/conv2_w_grad"
+  op: "Blob"
+  input: "GRADIENTS/conv1/ConvGradient:0"
+  device: "/gpu:0"
+}
+node {
+  name: "classifier/pred"
+  op: "Blob"
+  input: "classifier/FC_1:0"
+  device: "/gpu:0"
+}
+node {
+  name: "GRADIENTS/conv1/pool2_grad"
+  op: "Blob"
+  input: "GRADIENTS/c/FCGradient:2"
+  device: "/gpu:0"
+}
+node {
+  name: "GRADIENTS/conv1/conv1_w_grad"
+  op: "Blob"
+  input: "GRADIENTS/ConvGradient:0"
+  device: "/gpu:0"
+}
+node {
+  name: "data"
+  op: "Blob"
+  input: "NHWC2NCHW:0"
+  device: "/gpu:0"
+}
+node {
+  name: "classifier/xent"
+  op: "Blob"
+  input: "classifier/LabelCrossEntropy:0"
+  device: "/gpu:0"
+}
+node {
+  name: "GRADIENTS/conv1/pool1_grad"
+  op: "Blob"
+  input: "GRADIENTS/conv1/ConvGradient:2"
+  device: "/gpu:0"
+}
+node {
+  name: "db"
+  op: "Placeholder"
+}
+node {
+  name: "classifier/fc3_b"
+  op: "Blob"
+  input: "classifier/ConstantFill:0"
+  device: "/gpu:0"
+}
+node {
+  name: "classifier/pred_b"
+  op: "Blob"
+  input: "classifier/ConstantFill_1:0"
+  device: "/gpu:0"
+}
+node {
+  name: "classifier/softmax"
+  op: "Blob"
+  input: "classifier/Softmax:0"
+  device: "/gpu:0"
+}
+node {
+  name: "GRADIENTS/data_grad"
+  op: "Blob"
+  input: "GRADIENTS/ConvGradient:2"
+  device: "/gpu:0"
+}
+node {
+  name: "GRADIENTS/classifier/pred_b_grad"
+  op: "Blob"
+  input: "GRADIENTS/classifier/FCGradient:1"
+  device: "/gpu:0"
+}
+node {
+  name: "label"
+  op: "Blob"
+  input: "ImageInput:1"
+  device: "/gpu:0"
+}
+node {
+  name: "conv1/pool1"
+  op: "Blob"
+  input: "conv1/MaxPool:0"
+  device: "/gpu:0"
+}
+node {
+  name: "data_nhwc"
+  op: "Blob"
+  input: "ImageInput:0"
+  device: "/gpu:0"
+}
+node {
+  name: "conv1/conv2"
+  op: "Blob"
+  input: "conv1/Conv_1:0"
+  device: "/gpu:0"
+}
+node {
+  name: "GRADIENTS/conv1/conv2_grad"
+  op: "Blob"
+  input: "GRADIENTS/conv1/MaxPoolGradient:0"
+  device: "/gpu:0"
+}
+node {
+  name: "conv1/conv2_b"
+  op: "Blob"
+  input: "conv1/ConstantFill_1:0"
+  device: "/gpu:0"
+}
+node {
+  name: "conv1/conv1_b"
+  op: "Blob"
+  input: "conv1/ConstantFill:0"
+  device: "/gpu:0"
+}
+node {
+  name: "classifier/fc3_w"
+  op: "Blob"
+  input: "classifier/XavierFill:0"
+  device: "/gpu:0"
+}
+node {
+  name: "GRADIENTS/classifier/fc3_b_grad"
+  op: "Blob"
+  input: "GRADIENTS/c/FCGradient:1"
+  device: "/gpu:0"
+}
+node {
+  name: "classifier/pred_w"
+  op: "Blob"
+  input: "classifier/XavierFill_1:0"
+  device: "/gpu:0"
+}
+node {
+  name: "conv1/pool2"
+  op: "Blob"
+  input: "conv1/MaxPool_1:0"
+  device: "/gpu:0"
+}
+node {
+  name: "GRADIENTS/conv1/conv2_b_grad"
+  op: "Blob"
+  input: "GRADIENTS/conv1/ConvGradient:1"
+  device: "/gpu:0"
+}
+node {
+  name: "classifier/fc3_1"
+  op: "Blob"
+  input: "classifier/Relu:0"
+  device: "/gpu:0"
+}
+node {
+  name: "classifier/loss"
+  op: "Blob"
+  input: "classifier/AveragedLoss:0"
+  device: "/gpu:0"
+}
+node {
+  name: "GRADIENTS/classifier/fc3_grad"
+  op: "Blob"
+  input: "GRADIENTS/classifier/FCGradient:2"
+  device: "/gpu:0"
+}
+node {
+  name: "conv1/conv1_w"
+  op: "Blob"
+  input: "conv1/XavierFill:0"
+  device: "/gpu:0"
+}
+node {
+  name: "conv1/conv1"
+  op: "Blob"
+  input: "conv1/Conv:0"
+  device: "/gpu:0"
+}
+node {
+  name: "GRADIENTS/classifier/loss_autogen_grad"
+  op: "Blob"
+  input: "GRADIENTS/classifier/ConstantFill:0"
+  device: "/gpu:0"
+}
+node {
+  name: "classifier/fc3"
+  op: "Blob"
+  input: "classifier/FC:0"
+  device: "/gpu:0"
+}
+node {
+  name: "GRADIENTS/classifier/pred_grad"
+  op: "Blob"
+  input: "GRADIENTS/classifier/SoftmaxGradient:0"
+  device: "/gpu:0"
+}
+node {
+  name: "GRADIENTS/classifier/softmax_grad"
+  op: "Blob"
+  input: "GRADIENTS/classifier/LabelCrossEntropyGradient:0"
+  device: "/gpu:0"
+}
+node {
+  name: "conv1/conv2_w"
+  op: "Blob"
+  input: "conv1/XavierFill_1:0"
+  device: "/gpu:0"
+}
\ No newline at end of file
diff --git a/tensorboardX/tests/expect/caffe_overfeat.expect b/tensorboardX/tests/expect/caffe_overfeat.expect
new file mode 100644
index 0000000000000000000000000000000000000000..0803cb1bf90e4ad51aec68d86c52ca7780e503a0
--- /dev/null
+++ b/tensorboardX/tests/expect/caffe_overfeat.expect
@@ -0,0 +1,576 @@
+node {
+  name: "conv1/XavierFill"
+  op: "XavierFill"
+  device: "/gpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 96
+          }
+          dim {
+            size: 3
+          }
+          dim {
+            size: 11
+          }
+          dim {
+            size: 11
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "conv1/ConstantFill"
+  op: "ConstantFill"
+  device: "/gpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 96
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "classifier/XavierFill"
+  op: "XavierFill"
+  device: "/gpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1000
+          }
+          dim {
+            size: 4096
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "classifier/ConstantFill"
+  op: "ConstantFill"
+  device: "/gpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1000
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "ImageInput"
+  op: "ImageInput"
+  input: "db"
+  device: "/gpu:0"
+  attr {
+    key: "cudnn_exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "is_test"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "use_cudnn"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "NHWC2NCHW"
+  op: "NHWC2NCHW"
+  input: "data_nhwc"
+  device: "/gpu:0"
+}
+node {
+  name: "conv1/Conv"
+  op: "Conv"
+  input: "data"
+  input: "conv1/conv1_w"
+  input: "conv1/conv1_b"
+  device: "/gpu:0"
+  attr {
+    key: "exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "kernel"
+    value {
+      i: 11
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+  attr {
+    key: "stride"
+    value {
+      i: 4
+    }
+  }
+}
+node {
+  name: "conv1/Relu"
+  op: "Relu"
+  input: "conv1/conv1"
+  device: "/gpu:0"
+  attr {
+    key: "cudnn_exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+}
+node {
+  name: "conv1/MaxPool"
+  op: "MaxPool"
+  input: "conv1/conv1_1"
+  device: "/gpu:0"
+  attr {
+    key: "cudnn_exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "kernel"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+  attr {
+    key: "stride"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "classifier/FC"
+  op: "FC"
+  input: "conv1/pool1"
+  input: "classifier/fc_w"
+  input: "classifier/fc_b"
+  device: "/gpu:0"
+  attr {
+    key: "cudnn_exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+  attr {
+    key: "use_cudnn"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "classifier/Softmax"
+  op: "Softmax"
+  input: "classifier/fc"
+  device: "/gpu:0"
+  attr {
+    key: "cudnn_exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+}
+node {
+  name: "classifier/LabelCrossEntropy"
+  op: "LabelCrossEntropy"
+  input: "classifier/pred"
+  input: "label"
+  device: "/gpu:0"
+}
+node {
+  name: "classifier/AveragedLoss"
+  op: "AveragedLoss"
+  input: "classifier/xent"
+  device: "/gpu:0"
+}
+node {
+  name: "GRADIENTS/classifier/ConstantFill"
+  op: "ConstantFill"
+  input: "classifier/loss"
+  device: "/gpu:0"
+  attr {
+    key: "value"
+    value {
+      f: 1.0
+    }
+  }
+}
+node {
+  name: "GRADIENTS/classifier/AveragedLossGradient"
+  op: "AveragedLossGradient"
+  input: "classifier/xent"
+  input: "GRADIENTS/classifier/loss_autogen_grad"
+  device: "/gpu:0"
+}
+node {
+  name: "GRADIENTS/classifier/LabelCrossEntropyGradient"
+  op: "LabelCrossEntropyGradient"
+  input: "classifier/pred"
+  input: "label"
+  input: "GRADIENTS/classifier/xent_grad"
+  device: "/gpu:0"
+}
+node {
+  name: "GRADIENTS/classifier/SoftmaxGradient"
+  op: "SoftmaxGradient"
+  input: "classifier/pred"
+  input: "GRADIENTS/classifier/pred_grad"
+  device: "/gpu:0"
+  attr {
+    key: "cudnn_exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+}
+node {
+  name: "GRADIENTS/c/FCGradient"
+  op: "FCGradient"
+  input: "conv1/pool1"
+  input: "classifier/fc_w"
+  input: "GRADIENTS/classifier/fc_grad"
+  device: "/gpu:0"
+  attr {
+    key: "cudnn_exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+  attr {
+    key: "use_cudnn"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "GRADIENTS/conv1/MaxPoolGradient"
+  op: "MaxPoolGradient"
+  input: "conv1/conv1_1"
+  input: "conv1/pool1"
+  input: "GRADIENTS/conv1/pool1_grad"
+  device: "/gpu:0"
+  attr {
+    key: "cudnn_exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "kernel"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+  attr {
+    key: "stride"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "GRADIENTS/conv1/ReluGradient"
+  op: "ReluGradient"
+  input: "conv1/conv1_1"
+  input: "GRADIENTS/conv1/conv1_grad"
+  device: "/gpu:0"
+  attr {
+    key: "cudnn_exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+}
+node {
+  name: "GRADIENTS/ConvGradient"
+  op: "ConvGradient"
+  input: "data"
+  input: "conv1/conv1_w"
+  input: "GRADIENTS/conv1/conv1_grad_1"
+  device: "/gpu:0"
+  attr {
+    key: "exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "kernel"
+    value {
+      i: 11
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+  attr {
+    key: "stride"
+    value {
+      i: 4
+    }
+  }
+}
+node {
+  name: "GRADIENTS/NCHW2NHWC"
+  op: "NCHW2NHWC"
+  input: "GRADIENTS/data_grad"
+  device: "/gpu:0"
+}
+node {
+  name: "conv1/conv1_w"
+  op: "Blob"
+  input: "conv1/XavierFill:0"
+  device: "/gpu:0"
+}
+node {
+  name: "classifier/fc"
+  op: "Blob"
+  input: "classifier/FC:0"
+  device: "/gpu:0"
+}
+node {
+  name: "data_nhwc"
+  op: "Blob"
+  input: "ImageInput:0"
+  device: "/gpu:0"
+}
+node {
+  name: "GRADIENTS/conv1/conv1_b_grad"
+  op: "Blob"
+  input: "GRADIENTS/ConvGradient:1"
+  device: "/gpu:0"
+}
+node {
+  name: "GRADIENTS/classifier/pred_grad"
+  op: "Blob"
+  input: "GRADIENTS/classifier/LabelCrossEntropyGradient:0"
+  device: "/gpu:0"
+}
+node {
+  name: "GRADIENTS/classifier/fc_grad"
+  op: "Blob"
+  input: "GRADIENTS/classifier/SoftmaxGradient:0"
+  device: "/gpu:0"
+}
+node {
+  name: "conv1/conv1_b"
+  op: "Blob"
+  input: "conv1/ConstantFill:0"
+  device: "/gpu:0"
+}
+node {
+  name: "GRADIENTS/classifier/fc_b_grad"
+  op: "Blob"
+  input: "GRADIENTS/c/FCGradient:1"
+  device: "/gpu:0"
+}
+node {
+  name: "GRADIENTS/classifier/fc_w_grad"
+  op: "Blob"
+  input: "GRADIENTS/c/FCGradient:0"
+  device: "/gpu:0"
+}
+node {
+  name: "label"
+  op: "Blob"
+  input: "ImageInput:1"
+  device: "/gpu:0"
+}
+node {
+  name: "GRADIENTS/data_grad"
+  op: "Blob"
+  input: "GRADIENTS/ConvGradient:2"
+  device: "/gpu:0"
+}
+node {
+  name: "classifier/loss"
+  op: "Blob"
+  input: "classifier/AveragedLoss:0"
+  device: "/gpu:0"
+}
+node {
+  name: "conv1/conv1"
+  op: "Blob"
+  input: "conv1/Conv:0"
+  device: "/gpu:0"
+}
+node {
+  name: "GRADIENTS/conv1/conv1_grad"
+  op: "Blob"
+  input: "GRADIENTS/conv1/MaxPoolGradient:0"
+  device: "/gpu:0"
+}
+node {
+  name: "classifier/xent"
+  op: "Blob"
+  input: "classifier/LabelCrossEntropy:0"
+  device: "/gpu:0"
+}
+node {
+  name: "GRADIENTS/classifier/loss_autogen_grad"
+  op: "Blob"
+  input: "GRADIENTS/classifier/ConstantFill:0"
+  device: "/gpu:0"
+}
+node {
+  name: "classifier/fc_w"
+  op: "Blob"
+  input: "classifier/XavierFill:0"
+  device: "/gpu:0"
+}
+node {
+  name: "conv1/conv1_1"
+  op: "Blob"
+  input: "conv1/Relu:0"
+  device: "/gpu:0"
+}
+node {
+  name: "db"
+  op: "Placeholder"
+}
+node {
+  name: "classifier/pred"
+  op: "Blob"
+  input: "classifier/Softmax:0"
+  device: "/gpu:0"
+}
+node {
+  name: "classifier/fc_b"
+  op: "Blob"
+  input: "classifier/ConstantFill:0"
+  device: "/gpu:0"
+}
+node {
+  name: "GRADIENTS/classifier/xent_grad"
+  op: "Blob"
+  input: "GRADIENTS/classifier/AveragedLossGradient:0"
+  device: "/gpu:0"
+}
+node {
+  name: "data"
+  op: "Blob"
+  input: "NHWC2NCHW:0"
+  device: "/gpu:0"
+}
+node {
+  name: "GRADIENTS/conv1/conv1_w_grad"
+  op: "Blob"
+  input: "GRADIENTS/ConvGradient:0"
+  device: "/gpu:0"
+}
+node {
+  name: "GRADIENTS/conv1/conv1_grad_1"
+  op: "Blob"
+  input: "GRADIENTS/conv1/ReluGradient:0"
+  device: "/gpu:0"
+}
+node {
+  name: "GRADIENTS/data_nhwc_grad"
+  op: "Blob"
+  input: "GRADIENTS/NCHW2NHWC:0"
+  device: "/gpu:0"
+}
+node {
+  name: "GRADIENTS/conv1/pool1_grad"
+  op: "Blob"
+  input: "GRADIENTS/c/FCGradient:2"
+  device: "/gpu:0"
+}
+node {
+  name: "conv1/pool1"
+  op: "Blob"
+  input: "conv1/MaxPool:0"
+  device: "/gpu:0"
+}
+"""
+
diff --git a/tensorboardX/tests/expect/test_caffe2.test_simple_cnnmodel.expect b/tensorboardX/tests/expect/test_caffe2.test_simple_cnnmodel.expect
new file mode 100644
index 0000000000000000000000000000000000000000..100c1543144c8cd483cac47dd3a90a6ce8ea8e69
--- /dev/null
+++ b/tensorboardX/tests/expect/test_caffe2.test_simple_cnnmodel.expect
@@ -0,0 +1,319 @@
+node {
+  name: "conv1/XavierFill"
+  op: "XavierFill"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 96
+          }
+          dim {
+            size: 3
+          }
+          dim {
+            size: 11
+          }
+          dim {
+            size: 11
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "conv1/ConstantFill"
+  op: "ConstantFill"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 96
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "classifier/XavierFill"
+  op: "XavierFill"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1000
+          }
+          dim {
+            size: 4096
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "classifier/ConstantFill"
+  op: "ConstantFill"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1000
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "conv1/Conv"
+  op: "Conv"
+  input: "conv1/data"
+  input: "conv1/conv1_w"
+  input: "conv1/conv1_b"
+  attr {
+    key: "exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "kernel"
+    value {
+      i: 11
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+  attr {
+    key: "stride"
+    value {
+      i: 4
+    }
+  }
+}
+node {
+  name: "conv1/Relu"
+  op: "Relu"
+  input: "conv1/conv1"
+  attr {
+    key: "cudnn_exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+}
+node {
+  name: "conv1/MaxPool"
+  op: "MaxPool"
+  input: "conv1/conv1_1"
+  attr {
+    key: "cudnn_exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "kernel"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+  attr {
+    key: "stride"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "classifier/FC"
+  op: "FC"
+  input: "conv1/pool1"
+  input: "classifier/fc_w"
+  input: "classifier/fc_b"
+  attr {
+    key: "cudnn_exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+  attr {
+    key: "use_cudnn"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "classifier/Softmax"
+  op: "Softmax"
+  input: "classifier/fc"
+  attr {
+    key: "cudnn_exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+}
+node {
+  name: "classifier/LabelCrossEntropy"
+  op: "LabelCrossEntropy"
+  input: "classifier/pred"
+  input: "classifier/label"
+}
+node {
+  name: "classifier/AveragedLoss"
+  op: "AveragedLoss"
+  input: "classifier/xent"
+}
+node {
+  name: "conv1/conv1_w"
+  op: "Blob"
+  input: "conv1/XavierFill:0"
+}
+node {
+  name: "conv1/conv1_b"
+  op: "Blob"
+  input: "conv1/ConstantFill:0"
+}
+node {
+  name: "classifier/fc_w"
+  op: "Blob"
+  input: "classifier/XavierFill:0"
+}
+node {
+  name: "classifier/fc_b"
+  op: "Blob"
+  input: "classifier/ConstantFill:0"
+}
+node {
+  name: "conv1/data"
+  op: "Placeholder"
+}
+node {
+  name: "conv1/conv1_w"
+  op: "Blob"
+  input: "conv1/XavierFill:0"
+}
+node {
+  name: "conv1/conv1_b"
+  op: "Blob"
+  input: "conv1/ConstantFill:0"
+}
+node {
+  name: "conv1/conv1"
+  op: "Blob"
+  input: "conv1/Conv:0"
+}
+node {
+  name: "conv1/conv1"
+  op: "Blob"
+  input: "conv1/Conv:0"
+}
+node {
+  name: "conv1/conv1_1"
+  op: "Blob"
+  input: "conv1/Relu:0"
+}
+node {
+  name: "conv1/conv1_1"
+  op: "Blob"
+  input: "conv1/Relu:0"
+}
+node {
+  name: "conv1/pool1"
+  op: "Blob"
+  input: "conv1/MaxPool:0"
+}
+node {
+  name: "conv1/pool1"
+  op: "Blob"
+  input: "conv1/MaxPool:0"
+}
+node {
+  name: "classifier/fc_w"
+  op: "Blob"
+  input: "classifier/XavierFill:0"
+}
+node {
+  name: "classifier/fc_b"
+  op: "Blob"
+  input: "classifier/ConstantFill:0"
+}
+node {
+  name: "classifier/fc"
+  op: "Blob"
+  input: "classifier/FC:0"
+}
+node {
+  name: "classifier/fc"
+  op: "Blob"
+  input: "classifier/FC:0"
+}
+node {
+  name: "classifier/pred"
+  op: "Blob"
+  input: "classifier/Softmax:0"
+}
+node {
+  name: "classifier/pred"
+  op: "Blob"
+  input: "classifier/Softmax:0"
+}
+node {
+  name: "classifier/label"
+  op: "Placeholder"
+}
+node {
+  name: "classifier/xent"
+  op: "Blob"
+  input: "classifier/LabelCrossEntropy:0"
+}
+node {
+  name: "classifier/xent"
+  op: "Blob"
+  input: "classifier/LabelCrossEntropy:0"
+}
+node {
+  name: "classifier/loss"
+  op: "Blob"
+  input: "classifier/AveragedLoss:0"
+}
diff --git a/tensorboardX/tests/expect/test_caffe2.test_simple_model.expect b/tensorboardX/tests/expect/test_caffe2.test_simple_model.expect
new file mode 100644
index 0000000000000000000000000000000000000000..6a152e96ed1a379017f2320bf73f3de938f5daf1
--- /dev/null
+++ b/tensorboardX/tests/expect/test_caffe2.test_simple_model.expect
@@ -0,0 +1,1042 @@
+node {
+  name: "conv1/XavierFill"
+  op: "XavierFill"
+  device: "/cpu:*"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 20
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 5
+          }
+          dim {
+            size: 5
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "conv1/ConstantFill"
+  op: "ConstantFill"
+  device: "/cpu:*"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 20
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "conv1/XavierFill_1"
+  op: "XavierFill"
+  device: "/cpu:*"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 20
+          }
+          dim {
+            size: 5
+          }
+          dim {
+            size: 5
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "conv1/ConstantFill_1"
+  op: "ConstantFill"
+  device: "/cpu:*"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "classifier/XavierFill"
+  op: "XavierFill"
+  device: "/cpu:*"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 500
+          }
+          dim {
+            size: 1600
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "classifier/ConstantFill"
+  op: "ConstantFill"
+  device: "/cpu:*"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 500
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "classifier/XavierFill_1"
+  op: "XavierFill"
+  device: "/cpu:*"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+          dim {
+            size: 500
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "classifier/ConstantFill_1"
+  op: "ConstantFill"
+  device: "/cpu:*"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "conv1/Conv"
+  op: "Conv"
+  input: "conv1/data"
+  input: "conv1/conv1_w"
+  input: "conv1/conv1_b"
+  device: "/cpu:*"
+  attr {
+    key: "exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "kernel"
+    value {
+      i: 5
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+}
+node {
+  name: "conv1/MaxPool"
+  op: "MaxPool"
+  input: "conv1/conv1"
+  device: "/cpu:*"
+  attr {
+    key: "cudnn_exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "kernel"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+  attr {
+    key: "stride"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "conv1/Conv_1"
+  op: "Conv"
+  input: "conv1/pool1"
+  input: "conv1/conv2_w"
+  input: "conv1/conv2_b"
+  device: "/cpu:*"
+  attr {
+    key: "exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "kernel"
+    value {
+      i: 5
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+}
+node {
+  name: "conv1/MaxPool_1"
+  op: "MaxPool"
+  input: "conv1/conv2"
+  device: "/cpu:*"
+  attr {
+    key: "cudnn_exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "kernel"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+  attr {
+    key: "stride"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "classifier/FC"
+  op: "FC"
+  input: "conv1/pool2"
+  input: "classifier/fc3_w"
+  input: "classifier/fc3_b"
+  device: "/cpu:*"
+  attr {
+    key: "cudnn_exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+  attr {
+    key: "use_cudnn"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "classifier/Relu"
+  op: "Relu"
+  input: "classifier/fc3"
+  device: "/cpu:*"
+  attr {
+    key: "cudnn_exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+}
+node {
+  name: "classifier/FC_1"
+  op: "FC"
+  input: "classifier/fc3_1"
+  input: "classifier/pred_w"
+  input: "classifier/pred_b"
+  device: "/cpu:*"
+  attr {
+    key: "cudnn_exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+  attr {
+    key: "use_cudnn"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "classifier/Softmax"
+  op: "Softmax"
+  input: "classifier/pred"
+  device: "/cpu:*"
+  attr {
+    key: "cudnn_exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+}
+node {
+  name: "classifier/LabelCrossEntropy"
+  op: "LabelCrossEntropy"
+  input: "classifier/softmax"
+  input: "classifier/label"
+  device: "/cpu:*"
+}
+node {
+  name: "classifier/AveragedLoss"
+  op: "AveragedLoss"
+  input: "classifier/xent"
+  device: "/cpu:*"
+}
+node {
+  name: "GRADIENTS/classifier/ConstantFill"
+  op: "ConstantFill"
+  input: "classifier/loss"
+  device: "/cpu:*"
+  attr {
+    key: "value"
+    value {
+      f: 1.0
+    }
+  }
+}
+node {
+  name: "GRADIENTS/classifier/AveragedLossGradient"
+  op: "AveragedLossGradient"
+  input: "classifier/xent"
+  input: "GRADIENTS/classifier/loss_autogen_grad"
+  device: "/cpu:*"
+}
+node {
+  name: "GRADIENTS/classifier/LabelCrossEntropyGradient"
+  op: "LabelCrossEntropyGradient"
+  input: "classifier/softmax"
+  input: "classifier/label"
+  input: "GRADIENTS/classifier/xent_grad"
+  device: "/cpu:*"
+}
+node {
+  name: "GRADIENTS/classifier/SoftmaxGradient"
+  op: "SoftmaxGradient"
+  input: "classifier/softmax"
+  input: "GRADIENTS/classifier/softmax_grad"
+  device: "/cpu:*"
+  attr {
+    key: "cudnn_exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+}
+node {
+  name: "GRADIENTS/classifier/FCGradient"
+  op: "FCGradient"
+  input: "classifier/fc3_1"
+  input: "classifier/pred_w"
+  input: "GRADIENTS/classifier/pred_grad"
+  device: "/cpu:*"
+  attr {
+    key: "cudnn_exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+  attr {
+    key: "use_cudnn"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "GRADIENTS/classifier/ReluGradient"
+  op: "ReluGradient"
+  input: "classifier/fc3_1"
+  input: "GRADIENTS/classifier/fc3_grad"
+  device: "/cpu:*"
+  attr {
+    key: "cudnn_exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+}
+node {
+  name: "GRADIENTS/c/FCGradient"
+  op: "FCGradient"
+  input: "conv1/pool2"
+  input: "classifier/fc3_w"
+  input: "GRADIENTS/classifier/fc3_grad_1"
+  device: "/cpu:*"
+  attr {
+    key: "cudnn_exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+  attr {
+    key: "use_cudnn"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "GRADIENTS/conv1/MaxPoolGradient"
+  op: "MaxPoolGradient"
+  input: "conv1/conv2"
+  input: "conv1/pool2"
+  input: "GRADIENTS/conv1/pool2_grad"
+  device: "/cpu:*"
+  attr {
+    key: "cudnn_exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "kernel"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+  attr {
+    key: "stride"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "GRADIENTS/conv1/ConvGradient"
+  op: "ConvGradient"
+  input: "conv1/pool1"
+  input: "conv1/conv2_w"
+  input: "GRADIENTS/conv1/conv2_grad"
+  device: "/cpu:*"
+  attr {
+    key: "exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "kernel"
+    value {
+      i: 5
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+}
+node {
+  name: "GRADIENTS/conv1/MaxPoolGradient_1"
+  op: "MaxPoolGradient"
+  input: "conv1/conv1"
+  input: "conv1/pool1"
+  input: "GRADIENTS/conv1/pool1_grad"
+  device: "/cpu:*"
+  attr {
+    key: "cudnn_exhaustive_search"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "kernel"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "order"
+    value {
+      s: "NCHW"
+    }
+  }
+  attr {
+    key: "stride"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "conv1/conv1_w"
+  op: "Blob"
+  input: "conv1/XavierFill:0"
+  device: "/cpu:*"
+}
+node {
+  name: "conv1/conv1_b"
+  op: "Blob"
+  input: "conv1/ConstantFill:0"
+  device: "/cpu:*"
+}
+node {
+  name: "conv1/conv2_w"
+  op: "Blob"
+  input: "conv1/XavierFill_1:0"
+  device: "/cpu:*"
+}
+node {
+  name: "conv1/conv2_b"
+  op: "Blob"
+  input: "conv1/ConstantFill_1:0"
+  device: "/cpu:*"
+}
+node {
+  name: "classifier/fc3_w"
+  op: "Blob"
+  input: "classifier/XavierFill:0"
+  device: "/cpu:*"
+}
+node {
+  name: "classifier/fc3_b"
+  op: "Blob"
+  input: "classifier/ConstantFill:0"
+  device: "/cpu:*"
+}
+node {
+  name: "classifier/pred_w"
+  op: "Blob"
+  input: "classifier/XavierFill_1:0"
+  device: "/cpu:*"
+}
+node {
+  name: "classifier/pred_b"
+  op: "Blob"
+  input: "classifier/ConstantFill_1:0"
+  device: "/cpu:*"
+}
+node {
+  name: "conv1/data"
+  op: "Placeholder"
+}
+node {
+  name: "conv1/conv1_w"
+  op: "Blob"
+  input: "conv1/XavierFill:0"
+  device: "/cpu:*"
+}
+node {
+  name: "conv1/conv1_b"
+  op: "Blob"
+  input: "conv1/ConstantFill:0"
+  device: "/cpu:*"
+}
+node {
+  name: "conv1/conv1"
+  op: "Blob"
+  input: "conv1/Conv:0"
+  device: "/cpu:*"
+}
+node {
+  name: "conv1/conv1"
+  op: "Blob"
+  input: "conv1/Conv:0"
+  device: "/cpu:*"
+}
+node {
+  name: "conv1/pool1"
+  op: "Blob"
+  input: "conv1/MaxPool:0"
+  device: "/cpu:*"
+}
+node {
+  name: "conv1/pool1"
+  op: "Blob"
+  input: "conv1/MaxPool:0"
+  device: "/cpu:*"
+}
+node {
+  name: "conv1/conv2_w"
+  op: "Blob"
+  input: "conv1/XavierFill_1:0"
+  device: "/cpu:*"
+}
+node {
+  name: "conv1/conv2_b"
+  op: "Blob"
+  input: "conv1/ConstantFill_1:0"
+  device: "/cpu:*"
+}
+node {
+  name: "conv1/conv2"
+  op: "Blob"
+  input: "conv1/Conv_1:0"
+  device: "/cpu:*"
+}
+node {
+  name: "conv1/conv2"
+  op: "Blob"
+  input: "conv1/Conv_1:0"
+  device: "/cpu:*"
+}
+node {
+  name: "conv1/pool2"
+  op: "Blob"
+  input: "conv1/MaxPool_1:0"
+  device: "/cpu:*"
+}
+node {
+  name: "conv1/pool2"
+  op: "Blob"
+  input: "conv1/MaxPool_1:0"
+  device: "/cpu:*"
+}
+node {
+  name: "classifier/fc3_w"
+  op: "Blob"
+  input: "classifier/XavierFill:0"
+  device: "/cpu:*"
+}
+node {
+  name: "classifier/fc3_b"
+  op: "Blob"
+  input: "classifier/ConstantFill:0"
+  device: "/cpu:*"
+}
+node {
+  name: "classifier/fc3"
+  op: "Blob"
+  input: "classifier/FC:0"
+  device: "/cpu:*"
+}
+node {
+  name: "classifier/fc3"
+  op: "Blob"
+  input: "classifier/FC:0"
+  device: "/cpu:*"
+}
+node {
+  name: "classifier/fc3_1"
+  op: "Blob"
+  input: "classifier/Relu:0"
+  device: "/cpu:*"
+}
+node {
+  name: "classifier/fc3_1"
+  op: "Blob"
+  input: "classifier/Relu:0"
+  device: "/cpu:*"
+}
+node {
+  name: "classifier/pred_w"
+  op: "Blob"
+  input: "classifier/XavierFill_1:0"
+  device: "/cpu:*"
+}
+node {
+  name: "classifier/pred_b"
+  op: "Blob"
+  input: "classifier/ConstantFill_1:0"
+  device: "/cpu:*"
+}
+node {
+  name: "classifier/pred"
+  op: "Blob"
+  input: "classifier/FC_1:0"
+  device: "/cpu:*"
+}
+node {
+  name: "classifier/pred"
+  op: "Blob"
+  input: "classifier/FC_1:0"
+  device: "/cpu:*"
+}
+node {
+  name: "classifier/softmax"
+  op: "Blob"
+  input: "classifier/Softmax:0"
+  device: "/cpu:*"
+}
+node {
+  name: "classifier/softmax"
+  op: "Blob"
+  input: "classifier/Softmax:0"
+  device: "/cpu:*"
+}
+node {
+  name: "classifier/label"
+  op: "Placeholder"
+}
+node {
+  name: "classifier/xent"
+  op: "Blob"
+  input: "classifier/LabelCrossEntropy:0"
+  device: "/cpu:*"
+}
+node {
+  name: "classifier/xent"
+  op: "Blob"
+  input: "classifier/LabelCrossEntropy:0"
+  device: "/cpu:*"
+}
+node {
+  name: "classifier/loss"
+  op: "Blob"
+  input: "classifier/AveragedLoss:0"
+  device: "/cpu:*"
+}
+node {
+  name: "classifier/loss"
+  op: "Blob"
+  input: "classifier/AveragedLoss:0"
+  device: "/cpu:*"
+}
+node {
+  name: "GRADIENTS/classifier/loss_autogen_grad"
+  op: "Blob"
+  input: "GRADIENTS/classifier/ConstantFill:0"
+  device: "/cpu:*"
+}
+node {
+  name: "classifier/xent"
+  op: "Blob"
+  input: "classifier/LabelCrossEntropy:0"
+  device: "/cpu:*"
+}
+node {
+  name: "GRADIENTS/classifier/loss_autogen_grad"
+  op: "Blob"
+  input: "GRADIENTS/classifier/ConstantFill:0"
+  device: "/cpu:*"
+}
+node {
+  name: "GRADIENTS/classifier/xent_grad"
+  op: "Blob"
+  input: "GRADIENTS/classifier/AveragedLossGradient:0"
+  device: "/cpu:*"
+}
+node {
+  name: "classifier/softmax"
+  op: "Blob"
+  input: "classifier/Softmax:0"
+  device: "/cpu:*"
+}
+node {
+  name: "classifier/label"
+  op: "Placeholder"
+}
+node {
+  name: "GRADIENTS/classifier/xent_grad"
+  op: "Blob"
+  input: "GRADIENTS/classifier/AveragedLossGradient:0"
+  device: "/cpu:*"
+}
+node {
+  name: "GRADIENTS/classifier/softmax_grad"
+  op: "Blob"
+  input: "GRADIENTS/classifier/LabelCrossEntropyGradient:0"
+  device: "/cpu:*"
+}
+node {
+  name: "classifier/softmax"
+  op: "Blob"
+  input: "classifier/Softmax:0"
+  device: "/cpu:*"
+}
+node {
+  name: "GRADIENTS/classifier/softmax_grad"
+  op: "Blob"
+  input: "GRADIENTS/classifier/LabelCrossEntropyGradient:0"
+  device: "/cpu:*"
+}
+node {
+  name: "GRADIENTS/classifier/pred_grad"
+  op: "Blob"
+  input: "GRADIENTS/classifier/SoftmaxGradient:0"
+  device: "/cpu:*"
+}
+node {
+  name: "classifier/fc3_1"
+  op: "Blob"
+  input: "classifier/Relu:0"
+  device: "/cpu:*"
+}
+node {
+  name: "classifier/pred_w"
+  op: "Blob"
+  input: "classifier/XavierFill_1:0"
+  device: "/cpu:*"
+}
+node {
+  name: "GRADIENTS/classifier/pred_grad"
+  op: "Blob"
+  input: "GRADIENTS/classifier/SoftmaxGradient:0"
+  device: "/cpu:*"
+}
+node {
+  name: "GRADIENTS/classifier/pred_w_grad"
+  op: "Blob"
+  input: "GRADIENTS/classifier/FCGradient:0"
+  device: "/cpu:*"
+}
+node {
+  name: "GRADIENTS/classifier/pred_b_grad"
+  op: "Blob"
+  input: "GRADIENTS/classifier/FCGradient:1"
+  device: "/cpu:*"
+}
+node {
+  name: "GRADIENTS/classifier/fc3_grad"
+  op: "Blob"
+  input: "GRADIENTS/classifier/FCGradient:2"
+  device: "/cpu:*"
+}
+node {
+  name: "classifier/fc3_1"
+  op: "Blob"
+  input: "classifier/Relu:0"
+  device: "/cpu:*"
+}
+node {
+  name: "GRADIENTS/classifier/fc3_grad"
+  op: "Blob"
+  input: "GRADIENTS/classifier/FCGradient:2"
+  device: "/cpu:*"
+}
+node {
+  name: "GRADIENTS/classifier/fc3_grad_1"
+  op: "Blob"
+  input: "GRADIENTS/classifier/ReluGradient:0"
+  device: "/cpu:*"
+}
+node {
+  name: "conv1/pool2"
+  op: "Blob"
+  input: "conv1/MaxPool_1:0"
+  device: "/cpu:*"
+}
+node {
+  name: "classifier/fc3_w"
+  op: "Blob"
+  input: "classifier/XavierFill:0"
+  device: "/cpu:*"
+}
+node {
+  name: "GRADIENTS/classifier/fc3_grad_1"
+  op: "Blob"
+  input: "GRADIENTS/classifier/ReluGradient:0"
+  device: "/cpu:*"
+}
+node {
+  name: "GRADIENTS/classifier/fc3_w_grad"
+  op: "Blob"
+  input: "GRADIENTS/c/FCGradient:0"
+  device: "/cpu:*"
+}
+node {
+  name: "GRADIENTS/classifier/fc3_b_grad"
+  op: "Blob"
+  input: "GRADIENTS/c/FCGradient:1"
+  device: "/cpu:*"
+}
+node {
+  name: "GRADIENTS/conv1/pool2_grad"
+  op: "Blob"
+  input: "GRADIENTS/c/FCGradient:2"
+  device: "/cpu:*"
+}
+node {
+  name: "conv1/conv2"
+  op: "Blob"
+  input: "conv1/Conv_1:0"
+  device: "/cpu:*"
+}
+node {
+  name: "conv1/pool2"
+  op: "Blob"
+  input: "conv1/MaxPool_1:0"
+  device: "/cpu:*"
+}
+node {
+  name: "GRADIENTS/conv1/pool2_grad"
+  op: "Blob"
+  input: "GRADIENTS/c/FCGradient:2"
+  device: "/cpu:*"
+}
+node {
+  name: "GRADIENTS/conv1/conv2_grad"
+  op: "Blob"
+  input: "GRADIENTS/conv1/MaxPoolGradient:0"
+  device: "/cpu:*"
+}
+node {
+  name: "conv1/pool1"
+  op: "Blob"
+  input: "conv1/MaxPool:0"
+  device: "/cpu:*"
+}
+node {
+  name: "conv1/conv2_w"
+  op: "Blob"
+  input: "conv1/XavierFill_1:0"
+  device: "/cpu:*"
+}
+node {
+  name: "GRADIENTS/conv1/conv2_grad"
+  op: "Blob"
+  input: "GRADIENTS/conv1/MaxPoolGradient:0"
+  device: "/cpu:*"
+}
+node {
+  name: "GRADIENTS/conv1/conv2_w_grad"
+  op: "Blob"
+  input: "GRADIENTS/conv1/ConvGradient:0"
+  device: "/cpu:*"
+}
+node {
+  name: "GRADIENTS/conv1/conv2_b_grad"
+  op: "Blob"
+  input: "GRADIENTS/conv1/ConvGradient:1"
+  device: "/cpu:*"
+}
+node {
+  name: "GRADIENTS/conv1/pool1_grad"
+  op: "Blob"
+  input: "GRADIENTS/conv1/ConvGradient:2"
+  device: "/cpu:*"
+}
+node {
+  name: "conv1/conv1"
+  op: "Blob"
+  input: "conv1/Conv:0"
+  device: "/cpu:*"
+}
+node {
+  name: "conv1/pool1"
+  op: "Blob"
+  input: "conv1/MaxPool:0"
+  device: "/cpu:*"
+}
+node {
+  name: "GRADIENTS/conv1/pool1_grad"
+  op: "Blob"
+  input: "GRADIENTS/conv1/ConvGradient:2"
+  device: "/cpu:*"
+}
+node {
+  name: "GRADIENTS/conv1/conv1_grad"
+  op: "Blob"
+  input: "GRADIENTS/conv1/MaxPoolGradient_1:0"
+  device: "/cpu:*"
+}
diff --git a/tensorboardX/tests/expect/test_pr_curve.test_pr_purve.expect b/tensorboardX/tests/expect/test_pr_curve.test_pr_purve.expect
new file mode 100644
index 0000000000000000000000000000000000000000..44b54c9f9ed17cf5317bb43994ad8861526ff7a4
--- /dev/null
+++ b/tensorboardX/tests/expect/test_pr_curve.test_pr_purve.expect
@@ -0,0 +1,26 @@
+value {
+  tag: "tag"
+  tensor {
+    dtype: DT_FLOAT
+    tensor_shape {
+      dim {
+        size: 6
+      }
+      dim {
+        size: 1
+      }
+    }
+    float_val: 57.0
+    float_val: 43.0
+    float_val: 0.0
+    float_val: 0.0
+    float_val: 0.57
+    float_val: 1.0
+  }
+  metadata {
+    plugin_data {
+      plugin_name: "pr_curves"
+      content: "\020\001"
+    }
+  }
+}
diff --git a/tensorboardX/tests/expect/test_pr_curve.test_pr_purve_raw.expect b/tensorboardX/tests/expect/test_pr_curve.test_pr_purve_raw.expect
new file mode 100644
index 0000000000000000000000000000000000000000..0d09fb09978c3a205101392a3a0b71da499d98ef
--- /dev/null
+++ b/tensorboardX/tests/expect/test_pr_curve.test_pr_purve_raw.expect
@@ -0,0 +1,50 @@
+value {
+  tag: "prcurve with raw data"
+  tensor {
+    dtype: DT_FLOAT
+    tensor_shape {
+      dim {
+        size: 6
+      }
+      dim {
+        size: 5
+      }
+    }
+    float_val: 75.0
+    float_val: 64.0
+    float_val: 21.0
+    float_val: 5.0
+    float_val: 0.0
+    float_val: 150.0
+    float_val: 105.0
+    float_val: 18.0
+    float_val: 0.0
+    float_val: 0.0
+    float_val: 0.0
+    float_val: 45.0
+    float_val: 132.0
+    float_val: 150.0
+    float_val: 150.0
+    float_val: 0.0
+    float_val: 11.0
+    float_val: 54.0
+    float_val: 70.0
+    float_val: 75.0
+    float_val: 0.3333333
+    float_val: 0.3786982
+    float_val: 0.5384616
+    float_val: 1.0
+    float_val: 0.0
+    float_val: 1.0
+    float_val: 0.8533334
+    float_val: 0.28
+    float_val: 0.0666667
+    float_val: 0.0
+  }
+  metadata {
+    plugin_data {
+      plugin_name: "pr_curves"
+      content: "\020\001"
+    }
+  }
+}
diff --git a/tensorboardX/tests/expect/test_summary.test_audio.expect b/tensorboardX/tests/expect/test_summary.test_audio.expect
new file mode 100644
index 0000000000000000000000000000000000000000..84bc0fbdcb95bdba88d31de35296cb33452eb288
--- /dev/null
+++ b/tensorboardX/tests/expect/test_summary.test_audio.expect
@@ -0,0 +1,10 @@
+value {
+  tag: "dummy"
+  audio {
+    sample_rate: 44100.0
+    num_channels: 1
+    length_frames: 42
+    encoded_audio_string: "RIFFx\000\000\000WAVEfmt \020\000\000\000\001\000\001\000D\254\000\000\210X\001\000\002\000\020\000dataT\000\000\000\000\000\377\177\377\177\377\177\377\177\377\177\377\177\377\177\377\177\377\177\377\177\377\177\377\177\377\177\377\177\377\177\377\177\377\177\377\177\377\177\377\177\377\177\377\177\377\177\377\177\377\177\377\177\377\177\377\177\377\177\377\177\377\177\377\177\377\177\377\177\377\177\377\177\377\177\377\177\377\177\377\177\377\177"
+    content_type: "audio/wav"
+  }
+}
diff --git a/tensorboardX/tests/expect/test_summary.test_custom_scalars.expect b/tensorboardX/tests/expect/test_summary.test_custom_scalars.expect
new file mode 100644
index 0000000000000000000000000000000000000000..d37bb52694e922dc30f1ce4b594b8fe0ccd9752d
--- /dev/null
+++ b/tensorboardX/tests/expect/test_summary.test_custom_scalars.expect
@@ -0,0 +1,14 @@
+value {
+  tag: "custom_scalars__config__"
+  tensor {
+    dtype: DT_STRING
+    tensor_shape {
+    }
+    string_val: "\022(\n\006Taiwan\022\036\n\004twse\022\026\n\ttwse/0050\n\ttwse/2330\022]\n\003USA\022$\n\003dow\032\035\n\033\n\007dow/aaa\022\007dow/bbb\032\007dow/ccc\0220\n\006nasdaq\032&\n$\n\nnasdaq/aaa\022\nnasdaq/bbb\032\nnasdaq/ccc"
+  }
+  metadata {
+    plugin_data {
+      plugin_name: "custom_scalars"
+    }
+  }
+}
diff --git a/tensorboardX/tests/expect/test_summary.test_float32_image.expect b/tensorboardX/tests/expect/test_summary.test_float32_image.expect
new file mode 100644
index 0000000000000000000000000000000000000000..a1b79408559793676787afa22e8a2bc7b6af6450
--- /dev/null
+++ b/tensorboardX/tests/expect/test_summary.test_float32_image.expect
@@ -0,0 +1,9 @@
+value {
+  tag: "dummy"
+  image {
+    height: 32
+    width: 32
+    colorspace: 3
+    encoded_image_string: "\211PNG\r\n\032\n\000\000\000\rIHDR\000\000\000 \000\000\000 \010\002\000\000\000\374\030\355\243\000\000\000DIDATx\234cd``\370OK\300\370\340\301\003\232Z\3002j\301\360\267\200QAA\201\266\026\214\346\203Q\013\006\277\005\243\371\200 \030\372\221<j\001A0\232\017\010\202\241\037\311\243\026\020\0044\317\007\000]7\325\342\027k\025c\000\000\000\000IEND\256B`\202"
+  }
+}
diff --git a/tensorboardX/tests/expect/test_summary.test_histogram_auto.expect b/tensorboardX/tests/expect/test_summary.test_histogram_auto.expect
new file mode 100644
index 0000000000000000000000000000000000000000..6f31cc5d0fa5f09b46f8dddd54e2b4ea75d4a4c9
--- /dev/null
+++ b/tensorboardX/tests/expect/test_summary.test_histogram_auto.expect
@@ -0,0 +1,23 @@
+value {
+  tag: "dummy"
+  histo {
+    max: 1023.0
+    num: 1024.0
+    sum: 523776.0
+    sum_squares: 357389824.0
+    bucket_limit: 0.0
+    bucket_limit: 186.0
+    bucket_limit: 372.0
+    bucket_limit: 558.0
+    bucket_limit: 744.0
+    bucket_limit: 930.0
+    bucket_limit: 1023.0
+    bucket: 0.0
+    bucket: 186.0
+    bucket: 186.0
+    bucket: 186.0
+    bucket: 186.0
+    bucket: 186.0
+    bucket: 94.0
+  }
+}
diff --git a/tensorboardX/tests/expect/test_summary.test_histogram_doane.expect b/tensorboardX/tests/expect/test_summary.test_histogram_doane.expect
new file mode 100644
index 0000000000000000000000000000000000000000..6f31cc5d0fa5f09b46f8dddd54e2b4ea75d4a4c9
--- /dev/null
+++ b/tensorboardX/tests/expect/test_summary.test_histogram_doane.expect
@@ -0,0 +1,23 @@
+value {
+  tag: "dummy"
+  histo {
+    max: 1023.0
+    num: 1024.0
+    sum: 523776.0
+    sum_squares: 357389824.0
+    bucket_limit: 0.0
+    bucket_limit: 186.0
+    bucket_limit: 372.0
+    bucket_limit: 558.0
+    bucket_limit: 744.0
+    bucket_limit: 930.0
+    bucket_limit: 1023.0
+    bucket: 0.0
+    bucket: 186.0
+    bucket: 186.0
+    bucket: 186.0
+    bucket: 186.0
+    bucket: 186.0
+    bucket: 94.0
+  }
+}
diff --git a/tensorboardX/tests/expect/test_summary.test_histogram_fd.expect b/tensorboardX/tests/expect/test_summary.test_histogram_fd.expect
new file mode 100644
index 0000000000000000000000000000000000000000..6f31cc5d0fa5f09b46f8dddd54e2b4ea75d4a4c9
--- /dev/null
+++ b/tensorboardX/tests/expect/test_summary.test_histogram_fd.expect
@@ -0,0 +1,23 @@
+value {
+  tag: "dummy"
+  histo {
+    max: 1023.0
+    num: 1024.0
+    sum: 523776.0
+    sum_squares: 357389824.0
+    bucket_limit: 0.0
+    bucket_limit: 186.0
+    bucket_limit: 372.0
+    bucket_limit: 558.0
+    bucket_limit: 744.0
+    bucket_limit: 930.0
+    bucket_limit: 1023.0
+    bucket: 0.0
+    bucket: 186.0
+    bucket: 186.0
+    bucket: 186.0
+    bucket: 186.0
+    bucket: 186.0
+    bucket: 94.0
+  }
+}
diff --git a/tensorboardX/tests/expect/test_summary.test_hparams.expect b/tensorboardX/tests/expect/test_summary.test_hparams.expect
new file mode 100644
index 0000000000000000000000000000000000000000..d3cb3fa0cb70527f468f2ef0f1d7813a5f5b5d56
--- /dev/null
+++ b/tensorboardX/tests/expect/test_summary.test_hparams.expect
@@ -0,0 +1,28 @@
+(value {
+  tag: "_hparams_/experiment"
+  metadata {
+    plugin_data {
+      plugin_name: "hparams"
+      content: "\022\024\"\004\n\002lr*\014\n\n\022\010accuracy"
+    }
+  }
+}
+, value {
+  tag: "_hparams_/session_start_info"
+  metadata {
+    plugin_data {
+      plugin_name: "hparams"
+      content: "\032\021\n\017\n\002lr\022\t\021\232\231\231\231\231\231\271?"
+    }
+  }
+}
+, value {
+  tag: "_hparams_/session_end_info"
+  metadata {
+    plugin_data {
+      plugin_name: "hparams"
+      content: "\"\002\010\001"
+    }
+  }
+}
+)
\ No newline at end of file
diff --git a/tensorboardX/tests/expect/test_summary.test_image_with_3_channel_batched.expect b/tensorboardX/tests/expect/test_summary.test_image_with_3_channel_batched.expect
new file mode 100644
index 0000000000000000000000000000000000000000..2895ff76fdb8f9c7b040e722326298d102c8fc1e
--- /dev/null
+++ b/tensorboardX/tests/expect/test_summary.test_image_with_3_channel_batched.expect
@@ -0,0 +1,9 @@
+value {
+  tag: "dummy"
+  image {
+    height: 8
+    width: 16
+    colorspace: 3
+    encoded_image_string: "\211PNG\r\n\032\n\000\000\000\rIHDR\000\000\000\020\000\000\000\010\010\002\000\000\000\177\024\350\300\000\000\000+IDATx\234cd8\320\360\037\033pww\307*\316\362\343\307\217\037\330$~\374\370\361\037\233\004\013\016\365\377q\211\217H\r\000d\305y\224,\220Z\033\000\000\000\000IEND\256B`\202"
+  }
+}
diff --git a/tensorboardX/tests/expect/test_summary.test_image_with_boxes.expect b/tensorboardX/tests/expect/test_summary.test_image_with_boxes.expect
new file mode 100644
index 0000000000000000000000000000000000000000..4364b4841ef1d74e1eded252232468345b088278
--- /dev/null
+++ b/tensorboardX/tests/expect/test_summary.test_image_with_boxes.expect
@@ -0,0 +1,9 @@
+value {
+  tag: "dummy"
+  image {
+    height: 32
+    width: 32
+    colorspace: 3
+    encoded_image_string: "\211PNG\r\n\032\n\000\000\000\rIHDR\000\000\000 \000\000\000 \010\002\000\000\000\374\030\355\243\000\000\000sIDATx\234\355\323=\n\300 \014\005\340\027p\250\267p\324\373\332\373\345\020vn\007\367>0\204b\311\233\305/\344G\000\334\236\021Uu\005R\000\377\007\244\224\342\013||\007\2655\330BfP\215\337S`>:{_l\020\335\242\tX6-\000\032r\007G\316\000\2561\226\201\244\252/\005V\357\026\271\003\033\0149\000\232\270\003+\260\301\220\003\240y\000T\221\324V\250_v\320\000\000\000\000IEND\256B`\202"
+  }
+}
diff --git a/tensorboardX/tests/expect/test_summary.test_image_with_four_channel.expect b/tensorboardX/tests/expect/test_summary.test_image_with_four_channel.expect
new file mode 100644
index 0000000000000000000000000000000000000000..78bf28b2d09499478fcc48473bade07e63462d56
--- /dev/null
+++ b/tensorboardX/tests/expect/test_summary.test_image_with_four_channel.expect
@@ -0,0 +1,9 @@
+value {
+  tag: "dummy"
+  image {
+    height: 8
+    width: 8
+    colorspace: 4
+    encoded_image_string: "\211PNG\r\n\032\n\000\000\000\rIHDR\000\000\000\010\000\000\000\010\010\006\000\000\000\304\017\276\213\000\000\000\036IDATx\234cd8\320\340\360\037\017`\371\361\343\307\217\037\204\024\0204a\260+\000\000\240\302\373\327\246\231O\'\000\000\000\000IEND\256B`\202"
+  }
+}
diff --git a/tensorboardX/tests/expect/test_summary.test_image_with_four_channel_batched.expect b/tensorboardX/tests/expect/test_summary.test_image_with_four_channel_batched.expect
new file mode 100644
index 0000000000000000000000000000000000000000..5ceccadb22ff4d24f4351eefe0dd751a0cff110a
--- /dev/null
+++ b/tensorboardX/tests/expect/test_summary.test_image_with_four_channel_batched.expect
@@ -0,0 +1,9 @@
+value {
+  tag: "dummy"
+  image {
+    height: 8
+    width: 16
+    colorspace: 4
+    encoded_image_string: "\211PNG\r\n\032\n\000\000\000\rIHDR\000\000\000\020\000\000\000\010\010\006\000\000\000\360v\177\227\000\000\000-IDATx\234cd8\320\340\360\037\017`ggg\307\'\317\362\343\307\217\037?\360(\370\001\305x\r\300g\003!0j\000\025\014\000\000\356b\366\370\366\336\316\301\000\000\000\000IEND\256B`\202"
+  }
+}
diff --git a/tensorboardX/tests/expect/test_summary.test_image_with_one_channel.expect b/tensorboardX/tests/expect/test_summary.test_image_with_one_channel.expect
new file mode 100644
index 0000000000000000000000000000000000000000..7b43f507fc2d20dcf78c0907d1e6a7d3d3949185
--- /dev/null
+++ b/tensorboardX/tests/expect/test_summary.test_image_with_one_channel.expect
@@ -0,0 +1,9 @@
+value {
+  tag: "dummy"
+  image {
+    height: 8
+    width: 8
+    colorspace: 3
+    encoded_image_string: "\211PNG\r\n\032\n\000\000\000\rIHDR\000\000\000\010\000\000\000\010\010\002\000\000\000Km)\334\000\000\000\031IDATx\234cd``\370\217\r0\376\370\361\003\253\004\313\240\224\000\000;\267\273\313%\020=\255\000\000\000\000IEND\256B`\202"
+  }
+}
diff --git a/tensorboardX/tests/expect/test_summary.test_image_with_one_channel_batched.expect b/tensorboardX/tests/expect/test_summary.test_image_with_one_channel_batched.expect
new file mode 100644
index 0000000000000000000000000000000000000000..e16187d04cb8e8be4c0d1d46c1941ab53c6f6356
--- /dev/null
+++ b/tensorboardX/tests/expect/test_summary.test_image_with_one_channel_batched.expect
@@ -0,0 +1,9 @@
+value {
+  tag: "dummy"
+  image {
+    height: 8
+    width: 16
+    colorspace: 3
+    encoded_image_string: "\211PNG\r\n\032\n\000\000\000\rIHDR\000\000\000\020\000\000\000\010\010\002\000\000\000\177\024\350\300\000\000\000(IDATx\234cd``\370\217\r\034?~\034\2538\313\217\037?~\374\370\201)\201U\020\252\001\253\304\250\006$\000\000\230\346y\315\204l;t\000\000\000\000IEND\256B`\202"
+  }
+}
diff --git a/tensorboardX/tests/expect/test_summary.test_image_without_channel.expect b/tensorboardX/tests/expect/test_summary.test_image_without_channel.expect
new file mode 100644
index 0000000000000000000000000000000000000000..7b43f507fc2d20dcf78c0907d1e6a7d3d3949185
--- /dev/null
+++ b/tensorboardX/tests/expect/test_summary.test_image_without_channel.expect
@@ -0,0 +1,9 @@
+value {
+  tag: "dummy"
+  image {
+    height: 8
+    width: 8
+    colorspace: 3
+    encoded_image_string: "\211PNG\r\n\032\n\000\000\000\rIHDR\000\000\000\010\000\000\000\010\010\002\000\000\000Km)\334\000\000\000\031IDATx\234cd``\370\217\r0\376\370\361\003\253\004\313\240\224\000\000;\267\273\313%\020=\255\000\000\000\000IEND\256B`\202"
+  }
+}
diff --git a/tensorboardX/tests/expect/test_summary.test_mesh.expect b/tensorboardX/tests/expect/test_summary.test_mesh.expect
new file mode 100644
index 0000000000000000000000000000000000000000..7cf14a60f4f5d92791a39425646c8824c7339a49
--- /dev/null
+++ b/tensorboardX/tests/expect/test_summary.test_mesh.expect
@@ -0,0 +1,105 @@
+value {
+  tag: "my_mesh_1"
+  tensor {
+    dtype: DT_FLOAT
+    tensor_shape {
+      dim {
+        size: 1
+      }
+      dim {
+        size: 4
+      }
+      dim {
+        size: 3
+      }
+    }
+    float_val: 1.0
+    float_val: 1.0
+    float_val: 1.0
+    float_val: -1.0
+    float_val: -1.0
+    float_val: 1.0
+    float_val: 1.0
+    float_val: -1.0
+    float_val: -1.0
+    float_val: -1.0
+    float_val: 1.0
+    float_val: -1.0
+  }
+  metadata {
+    plugin_data {
+      plugin_name: "mesh"
+      content: "\022\007my_mesh\030\001*\004null2\003\001\004\003"
+    }
+  }
+}
+value {
+  tag: "my_mesh_2"
+  tensor {
+    dtype: DT_FLOAT
+    tensor_shape {
+      dim {
+        size: 1
+      }
+      dim {
+        size: 4
+      }
+      dim {
+        size: 3
+      }
+    }
+    float_val: 0.0
+    float_val: 2.0
+    float_val: 3.0
+    float_val: 0.0
+    float_val: 3.0
+    float_val: 1.0
+    float_val: 0.0
+    float_val: 1.0
+    float_val: 2.0
+    float_val: 1.0
+    float_val: 3.0
+    float_val: 2.0
+  }
+  metadata {
+    plugin_data {
+      plugin_name: "mesh"
+      content: "\022\007my_mesh\030\002*\004null2\003\001\004\003"
+    }
+  }
+}
+value {
+  tag: "my_mesh_3"
+  tensor {
+    dtype: DT_FLOAT
+    tensor_shape {
+      dim {
+        size: 1
+      }
+      dim {
+        size: 4
+      }
+      dim {
+        size: 3
+      }
+    }
+    float_val: 255.0
+    float_val: 0.0
+    float_val: 0.0
+    float_val: 0.0
+    float_val: 255.0
+    float_val: 0.0
+    float_val: 0.0
+    float_val: 0.0
+    float_val: 255.0
+    float_val: 255.0
+    float_val: 0.0
+    float_val: 255.0
+  }
+  metadata {
+    plugin_data {
+      plugin_name: "mesh"
+      content: "\022\007my_mesh\030\003*\004null2\003\001\004\003"
+    }
+  }
+}
diff --git a/tensorboardX/tests/expect/test_summary.test_text.expect b/tensorboardX/tests/expect/test_summary.test_text.expect
new file mode 100644
index 0000000000000000000000000000000000000000..644244467670491c287dfc11f02e2cc7f365b4ca
--- /dev/null
+++ b/tensorboardX/tests/expect/test_summary.test_text.expect
@@ -0,0 +1,17 @@
+value {
+  tag: "dummy/text_summary"
+  tensor {
+    dtype: DT_STRING
+    tensor_shape {
+      dim {
+        size: 1
+      }
+    }
+    string_val: "text 123"
+  }
+  metadata {
+    plugin_data {
+      plugin_name: "text"
+    }
+  }
+}
diff --git a/tensorboardX/tests/expect/test_summary.test_uint8_image.expect b/tensorboardX/tests/expect/test_summary.test_uint8_image.expect
new file mode 100644
index 0000000000000000000000000000000000000000..985945fd6a36e668c30d76ba4594462424fe7c5f
--- /dev/null
+++ b/tensorboardX/tests/expect/test_summary.test_uint8_image.expect
@@ -0,0 +1,9 @@
+value {
+  tag: "dummy"
+  image {
+    height: 32
+    width: 32
+    colorspace: 3
+    encoded_image_string: "\211PNG\r\n\032\n\000\000\000\rIHDR\000\000\000 \000\000\000 \010\002\000\000\000\374\030\355\243\000\000\000CIDATx\234cd```\244)PPP\240\251\371,\243\026\014\177\013\030\037<x@[\013F\363\301\250\005\203\337\202\321|@\020\014\375H\036\265\2000\030\315\007\204\300\320\217\344Q\013\010\003Z\347\003\000\211\014\037}z\035\001}\000\000\000\000IEND\256B`\202"
+  }
+}
diff --git a/tensorboardX/tests/expect/test_summary.test_video.expect b/tensorboardX/tests/expect/test_summary.test_video.expect
new file mode 100644
index 0000000000000000000000000000000000000000..e08e866222bf69d2b0322a73b7783a1a4e3338a5
--- /dev/null
+++ b/tensorboardX/tests/expect/test_summary.test_video.expect
@@ -0,0 +1,9 @@
+value {
+  tag: "dummy"
+  image {
+    height: 16
+    width: 16
+    colorspace: 1
+    encoded_image_string: "GIF89a\020\000\020\000\207\000\000\377\377\377\376\376\376\375\375\375\374\374\374\373\373\373\372\372\372\371\371\371\370\370\370\367\367\367\366\366\366\365\365\365\364\364\364\363\363\363\362\362\362\361\361\361\360\360\360\357\357\357\356\356\356\355\355\355\354\354\354\353\353\353\352\352\352\351\351\351\350\350\350\347\347\347\346\346\346\345\345\345\344\344\344\343\343\343\342\342\342\341\341\341\340\340\340\337\337\337\336\336\336\335\335\335\334\334\334\333\333\333\332\332\332\331\331\331\330\330\330\327\327\327\326\326\326\325\325\325\324\324\324\323\323\323\322\322\322\321\321\321\320\320\320\317\317\317\316\316\316\315\315\315\314\314\314\313\313\313\312\312\312\311\311\311\310\310\310\307\307\307\306\306\306\305\305\305\304\304\304\303\303\303\302\302\302\301\301\301\300\300\300\277\277\277\276\276\276\275\275\275\274\274\274\273\273\273\272\272\272\271\271\271\270\270\270\267\267\267\266\266\266\265\265\265\264\264\264\263\263\263\262\262\262\261\261\261\260\260\260\257\257\257\256\256\256\255\255\255\254\254\254\253\253\253\252\252\252\251\251\251\250\250\250\247\247\247\246\246\246\245\245\245\244\244\244\243\243\243\242\242\242\241\241\241\240\240\240\237\237\237\236\236\236\235\235\235\234\234\234\233\233\233\232\232\232\231\231\231\230\230\230\227\227\227\226\226\226\225\225\225\224\224\224\223\223\223\222\222\222\221\221\221\220\220\220\217\217\217\216\216\216\215\215\215\214\214\214\213\213\213\212\212\212\211\211\211\210\210\210\207\207\207\206\206\206\205\205\205\204\204\204\203\203\203\202\202\202\201\201\201\200\200\200\177\177\177~~~}}}|||{{{zzzyyyxxxwwwvvvuuutttsssrrrqqqpppooonnnmmmlllkkkjjjiiihhhgggfffeeedddcccbbbaaa```___^^^]]]\\\\\\[[[ZZZYYYXXXWWWVVVUUUTTTSSSRRRQQQPPPOOONNNMMMLLLKKKJJJIIIHHHGGGFFFEEEDDDCCCBBBAAA@@@???>>>===<<<;;;:::999888777666555444333222111000///...---,,,+++***)))(((\'\'\'&&&%%%$$$###\"\"\"!!!   
\037\037\037\036\036\036\035\035\035\034\034\034\033\033\033\032\032\032\031\031\031\030\030\030\027\027\027\026\026\026\025\025\025\024\024\024\023\023\023\022\022\022\021\021\021\020\020\020\017\017\017\016\016\016\r\r\r\014\014\014\013\013\013\n\n\n\t\t\t\010\010\010\007\007\007\006\006\006\005\005\005\004\004\004\003\003\003\002\002\002\001\001\001\000\000\000!\377\013NETSCAPE2.0\003\001\377\377\000!\371\004\010\031\000\000\000,\000\000\000\000\020\000\020\000\000\010\377\000\377\001\010 `\000\201\002\006~\001\013&l\030\261b\306\016 H\240`\001\203\006\016\216!K\246l\031\263f\316\036@\210 a\002\205\n\026\236A\213&m\032\265j\326.`\310\240a\003\207\016\036\256a\313\246m\033\267n\336>\200\010!b\004\211\022&\276\201\013\'n\034\271r\346N\240H\241b\005\213\026.\316\241K\247n\035\273v\356^\300\210!c\006\215\0326\336\301\213\'o\036\275z\366n\340\310\241c\007\217\036>\356\341\313\247o\037\277~\376\376\000\n$h\020\241B\206~\000\t\"d\010\221\"F\016!J\244h\021\243F\216\216 I\242d\t\223&N\036A\212$i\022\245J\226\236@\211\"e\n\225*V.a\312\244i\023\247N\236\256`\311\242e\013\227.^>\201\n%j\024\251R\246\276\200\t#f\014\2312fN\241J\245j\025\253V\256\316\240I\243f\r\2336n^\301%\212%k\026\255Z\266\336\300\211#g\016\235:vn\341\312\245k\027\257^\276\356\340\311\243g\017\037\200}\374\004\004\000!\371\004\010\031\000\000\000,\000\000\000\000\020\000\020\000\000\010\377\000\177\000\t\"d\010\221\"F\376\001\010 `\000\201\002\006\216 I\242d\t\223&N\016 H\240`\001\203\006\016\236@\211\"e\n\225*V\036@\210 
a\002\205\n\026\256`\311\242e\013\227.^.`\310\240a\003\207\016\036\276\200\t#f\014\2312f>\200\010!b\004\211\022&\316\240I\243f\r\2336nN\240H\241b\005\213\026.\336\300\211#g\016\235:v^\300\210!c\006\215\0326\356\340\311\243g\017\237>~n\340\310\241c\007\217\036>~\001\013&l\030\261b\306\376\000\n$h\020\241B\206\216!K\246l\031\263f\316\016!J\244h\021\243F\216\236A\213&m\032\265j\326\036A\212$i\022\245J\226\256a\313\246m\033\267n\336.a\312\244i\023\247N\236\276\201\013\'n\034\271r\346>\201\n%j\024\251R\246\316\241K\247n\035\273v\356N\241J\245j\025\253V\256\336\301%\213\'o\036\275z\366^\301\212%k\026\255Z\266\356\341\313\247o\037\277~\376n\341\312\245k\027/\200\275|\005\004\000!\371\004\010\031\000\000\000,\000\000\000\000\020\000\020\000\000\010\377\000\377\000\n$h\020\241B\206~\000\t\"d\010\221\"F\016!J\244h\021\243F\216\216 I\242d\t\223&N\036A\212$i\022\245J\226\236@\211\"e\n\225*V.a\312\244i\023\247N\236\256`\311\242e\013\227.^>\201\n%j\024\251R\246\276\200\t#f\014\2312fN\241J\245j\025\253V\256\316\240I\243f\r\2336n^\301\212%k\026\255Z\266\336\300\211#g\016\235:vn\341\312\245k\027\257^\276\356\340\311\243g\017\237>~\376\001\010 `\000\201\002\006~\001\013&l\030\261b\306\016 H\240`\001\203\006\016\216!K\246l\031\263f\316\036@\210 a\002\205\n\026\236A\213&m\032\265j\326.`\310\240a\003\207\016\036\256a\313\246m\033\267n\336>\200\010!b\004\211\022&\276\201\013\'n\034\271r\346N\240H\241b\005\213\026.\316\241K\247n\035\273v\356^\300%\210!c\006\215\0326\336\301\213\'o\036\275z\366n\340\310\241c\007\217\036>\356\341\313\247o\037?\200\375\374\005\004\000;"
+  }
+}
diff --git a/tensorboardX/tests/expect_reader.py b/tensorboardX/tests/expect_reader.py
new file mode 100644
index 0000000000000000000000000000000000000000..946219a2be4aadcc315c8e48df9d186a4a873590
--- /dev/null
+++ b/tensorboardX/tests/expect_reader.py
@@ -0,0 +1,36 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+import os
+import sys
+
+
+def removeWhiteChar(string):
+    """Return ``string`` with all spaces, tabs and newlines removed.
+
+    Other whitespace (for example carriage returns) is left untouched.
+    """
+    stripped = string
+    for white_char in (' ', '\t', '\n'):
+        stripped = stripped.replace(white_char, '')
+    return stripped
+
+
+def compare_proto(str_to_compare, function_ptr):
+    """Assert that ``str_to_compare`` matches the stored expectation file.
+
+    The expectation is read from ``expect/<module>.<test name>.expect`` in the
+    directory of the module that defines ``function_ptr``'s class. Spaces,
+    tabs and newlines are ignored on both sides of the comparison.
+
+    Args:
+        str_to_compare: Object whose ``str()`` form is checked.
+        function_ptr: Object exposing ``id()`` returning a dotted test id
+            (as ``unittest.TestCase`` instances do); the last component of
+            the id and the module of its class select the expectation file.
+
+    Raises:
+        AssertionError: If the expectation file does not exist or its
+            content differs from ``str_to_compare``.
+    """
+    module_id = function_ptr.__class__.__module__
+    functionName = function_ptr.id().split('.')[-1]
+    test_file = os.path.realpath(sys.modules[module_id].__file__)
+    expected_file = os.path.join(os.path.dirname(test_file),
+                                 "expect",
+                                 module_id.split('.')[-1] + '.' + functionName + ".expect")
+    print("expected_file: %s" % expected_file)
+    # Name the file in the failure so a missing fixture is obvious in test logs.
+    assert os.path.exists(expected_file), \
+        "missing expectation file: %s" % expected_file
+    with open(expected_file) as f:
+        expected = f.read()
+    # Normalise each side once and reuse it for both the log and the check.
+    actual_stripped = removeWhiteChar(str(str_to_compare))
+    expected_stripped = removeWhiteChar(expected)
+    print("str_to_compare:", actual_stripped)
+    print("expected:", expected_stripped)
+    assert actual_stripped == expected_stripped, \
+        "output does not match expectation file: %s" % expected_file
+
+
+def write_proto(str_to_compare, function_ptr):
+    """Write ``str(str_to_compare)`` to the expectation file of a test.
+
+    The target is ``expect/<module>.<test name>.expect`` in the directory of
+    the module that defines ``function_ptr``'s class; an existing file is
+    overwritten. The ``expect`` directory itself is not created here.
+
+    Args:
+        str_to_compare: Object whose ``str()`` form is stored.
+        function_ptr: Object exposing ``id()`` returning a dotted test id;
+            the last component of the id and the module of its class select
+            the file name.
+    """
+    module_name = function_ptr.__class__.__module__
+    test_name = function_ptr.id().rsplit('.', 1)[-1]
+    module_path = os.path.realpath(sys.modules[module_name].__file__)
+    expect_dir = os.path.join(os.path.dirname(module_path), "expect")
+    file_name = module_name.rsplit('.', 1)[-1] + '.' + test_name + ".expect"
+    expected_file = os.path.join(expect_dir, file_name)
+    print(expected_file)
+    with open(expected_file, 'w') as out:
+        out.write(str(str_to_compare))
diff --git a/tensorboardX/tests/record_writer_test.py b/tensorboardX/tests/record_writer_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..975a0cbc2d9dde18d29a757558c5c74bd95f1a84
--- /dev/null
+++ b/tensorboardX/tests/record_writer_test.py
@@ -0,0 +1,78 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# """Tests for RecordWriter"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+import os
+from tensorboardX.record_writer import RecordWriter
+from tensorboard.compat.tensorflow_stub.pywrap_tensorflow import PyRecordReader_New
+import unittest
+
+
+class RecordWriterTest(unittest.TestCase):
+  """On-disk RecordWriter tests. TODO: restore the disabled six.BytesIO-target size test."""
+
+  def get_temp_dir(self):
+    """Return a new scratch directory that is removed when the current test finishes."""
+    import shutil
+    import tempfile
+    temp_dir = tempfile.mkdtemp()
+    self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)  # mkdtemp never self-deletes
+    return temp_dir
+
+  def test_expect_bytes_written(self):
+    """One 64-byte payload must take 8 + 4 + 64 + 4 bytes in the file."""
+    filename = os.path.join(self.get_temp_dir(), "expect_bytes_written")
+    byte_len = 64
+    w = RecordWriter(filename)
+    bytes_to_write = b"x" * byte_len
+    w.write(bytes_to_write)
+    w.close()
+    with open(filename, 'rb') as f:
+      self.assertEqual(len(f.read()), (8 + 4 + byte_len + 4))  # uint64+uint32+data+uint32
+
+  def test_empty_record(self):
+    """An empty payload is read back as an empty bytes object."""
+    filename = os.path.join(self.get_temp_dir(), "empty_record")
+    w = RecordWriter(filename)
+    bytes_to_write = b""
+    w.write(bytes_to_write)
+    w.close()
+    r = PyRecordReader_New(filename)
+    r.GetNext()
+    self.assertEqual(r.record(), bytes_to_write)
+
+  def test_record_writer_roundtrip(self):
+    """Fifty identical records are each read back unchanged."""
+    filename = os.path.join(self.get_temp_dir(), "record_writer_roundtrip")
+    w = RecordWriter(filename)
+    bytes_to_write = b"hello world"
+    times_to_test = 50
+    for _ in range(times_to_test):
+      w.write(bytes_to_write)
+    w.close()
+    r = PyRecordReader_New(filename)
+    for _ in range(times_to_test):
+      r.GetNext()
+      self.assertEqual(r.record(), bytes_to_write)
+
+
+if __name__ == '__main__':  # run the suite when this file is executed directly
+  unittest.main()
diff --git a/tensorboardX/tests/test_beholder.py b/tensorboardX/tests/test_beholder.py
new file mode 100644
index 0000000000000000000000000000000000000000..e89bdc7d5e63d00062b8580f6eea24374d609fd9
--- /dev/null
+++ b/tensorboardX/tests/test_beholder.py
@@ -0,0 +1,45 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorboardX import SummaryWriter
+import numpy as np
+import pytest
+import unittest
+import tensorboardX.beholder as beholder_lib
+import tensorboardX.beholder.file_system_tools as fio
+from collections import namedtuple
+
+
+class BeholderTest(unittest.TestCase):
+    """Smoke tests: Beholder.update is called with array lists and a frame; nothing is asserted."""
+
+    def test_beholder(self):
+        """One update against a freshly constructed Beholder."""
+        named = namedtuple('tensor_and_name', 'tensor, name')
+        params = [named(np.random.randn(128, 768, 3), 'test' + str(i)) for i in range(5)]
+        extras = [named(np.random.randn(128, 768, 3), 'test' + str(i)) for i in range(5)]
+        watcher = beholder_lib.Beholder(logdir='/tmp/beholder-demo')
+        watcher.update(trainable=params, arrays=extras, frame=np.random.randn(128, 128))
+
+    def test_beholder_video(self):
+        """Three updates with is_recording set to True, then back to False before the last."""
+        log_dir = '/tmp/beholder-demo-recording'
+        config_path = log_dir + '/plugins/beholder/config.pkl'
+        named = namedtuple('tensor_and_name', 'tensor, name')
+        params = [named(np.random.randn(128, 768, 3), 'test' + str(i)) for i in range(5)]
+        extras = [named(np.random.randn(128, 768, 3), 'test' + str(i)) for i in range(5)]
+        watcher = beholder_lib.Beholder(logdir=log_dir)
+
+        def set_recording(flag):
+            config = fio.read_pickle(config_path)
+            config['is_recording'] = flag
+            fio.write_pickle(config, config_path)
+
+        set_recording(True)
+        for step in range(3):
+            if step == 2:
+                set_recording(False)
+            watcher.update(
+                trainable=params, arrays=extras, frame=np.random.randn(128, 128),
+            )
diff --git a/tensorboardX/tests/test_caffe2.py b/tensorboardX/tests/test_caffe2.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed8c36124a5964c54b918498d0f2a188517b186f
--- /dev/null
+++ b/tensorboardX/tests/test_caffe2.py
@@ -0,0 +1,280 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+from tensorboardX import SummaryWriter
+import os
+import unittest
+
+# try:
+import numpy as np
+import caffe2.python.brew as brew
+import caffe2.python.cnn as cnn
+import caffe2.python.core as core
+import caffe2.python.model_helper as model_helper
+from caffe2.proto import caffe2_pb2
+from caffe2.python import workspace
+import tensorboardX.caffe2_graph as tb
+from tensorboardX import x2num
+from .expect_reader import compare_proto, write_proto
+
+
+class Caffe2Test(unittest.TestCase):
+    """Unit tests for the tensorboardX.caffe2_graph renaming helpers and graph export."""
+    def test_caffe2_np(self):
+        """A blob fed to the workspace is converted to an ndarray when looked up by name."""
+        workspace.FeedBlob("testBlob", np.random.randn(1, 3, 64, 64).astype(np.float32))
+        assert isinstance(x2num.make_np('testBlob'), np.ndarray)
+        # assert isinstance(x2num.make_np('testBlob', 'IMG'), np.ndarray)
+
+    def test_that_operators_gets_non_colliding_names(self):
+        op = caffe2_pb2.OperatorDef()
+        op.type = 'foo'
+        op.input.extend(['foo'])
+        tb._fill_missing_operator_names([op])
+        self.assertEqual(op.input[0], 'foo')
+        self.assertEqual(op.name, 'foo_1')
+
+    def test_that_replacing_colons_gives_non_colliding_names(self):
+        """Colon replacement must not collide with existing names, and must update shapes."""
+        op = caffe2_pb2.OperatorDef()
+        op.name = 'foo:0'
+        op.input.extend(['foo:0', 'foo$0'])
+        shapes = {'foo:0': [1]}
+        blob_name_tracker = tb._get_blob_names([op])
+        tb._replace_colons(shapes, blob_name_tracker, [op], '$')
+        self.assertEqual(op.input[0], 'foo$0')
+        self.assertEqual(op.input[1], 'foo$0_1')
+        # Collision but blobs and op names are handled later by
+        # _fill_missing_operator_names.
+        self.assertEqual(op.name, 'foo$0')
+        self.assertEqual(len(shapes), 1)
+        self.assertEqual(shapes['foo$0'], [1])
+        self.assertEqual(len(blob_name_tracker), 2)
+        self.assertEqual(blob_name_tracker['foo$0'], 'foo:0')
+        self.assertEqual(blob_name_tracker['foo$0_1'], 'foo$0')
+
+    def test_that_adding_gradient_scope_does_no_fancy_renaming(self):
+        """The GRADIENTS/ prefix is applied verbatim because it cannot create collisions."""
+        op = caffe2_pb2.OperatorDef()
+        op.name = 'foo_grad'
+        op.input.extend(['foo_grad', 'foo_grad_1'])
+        shapes = {'foo_grad': [1]}
+        blob_name_tracker = tb._get_blob_names([op])
+        tb._add_gradient_scope(shapes, blob_name_tracker, [op])
+        self.assertEqual(op.input[0], 'GRADIENTS/foo_grad')
+        self.assertEqual(op.input[1], 'GRADIENTS/foo_grad_1')
+        self.assertEqual(op.name, 'GRADIENTS/foo_grad')
+        self.assertEqual(len(shapes), 1)
+        self.assertEqual(shapes['GRADIENTS/foo_grad'], [1])
+        self.assertEqual(len(blob_name_tracker), 2)
+        self.assertEqual(blob_name_tracker['GRADIENTS/foo_grad'], 'foo_grad')
+        self.assertEqual(blob_name_tracker['GRADIENTS/foo_grad_1'], 'foo_grad_1')
+
+    def test_that_auto_ssa_gives_non_colliding_names(self):
+        op1 = caffe2_pb2.OperatorDef()
+        op1.output.extend(['foo'])
+        op2 = caffe2_pb2.OperatorDef()
+        op2.input.extend(['foo'])
+        op2.output.extend(['foo'])
+        op2.output.extend(['foo_1'])
+        shapes = {'foo': [1], 'foo_1': [2]}
+        blob_name_tracker = tb._get_blob_names([op1, op2])
+        tb._convert_to_ssa(shapes, blob_name_tracker, [op1, op2])
+        self.assertEqual(op1.output[0], 'foo')
+        self.assertEqual(op2.input[0], 'foo')
+        self.assertEqual(op2.output[0], 'foo_1')
+        # Unfortunate name but we do not parse original `_` for now.
+        self.assertEqual(op2.output[1], 'foo_1_1')
+        self.assertEqual(len(shapes), 3)
+        self.assertEqual(shapes['foo'], [1])
+        self.assertEqual(shapes['foo_1'], [1])
+        self.assertEqual(shapes['foo_1_1'], [2])
+        self.assertEqual(len(blob_name_tracker), 3)
+        self.assertEqual(blob_name_tracker['foo'], 'foo')
+        self.assertEqual(blob_name_tracker['foo_1'], 'foo')
+        self.assertEqual(blob_name_tracker['foo_1_1'], 'foo_1')
+
+    def test_renaming_tensorflow_style(self):
+        # Construct some dummy operators here
+        # NOTE: '_w', '_bn', etc without the postfix '_' are only renamed when
+        # they are at the very end of the name.
+        # Test that '_w', '_w_' are renamed to '/weight', '/weight_', resp.
+        op1 = caffe2_pb2.OperatorDef()
+        op1.input.extend(['foo_w'])
+        op1.output.extend(['foo_w_2'])
+        # Test that '_bn', '_bn_' are renamed to '/batchnorm', '/batchnorm_',
+        # respectively.
+        op2 = caffe2_pb2.OperatorDef()
+        op2.input.extend(['foo_bn'])
+        op2.output.extend(['foo_bn_2'])
+        # Test that '_b', '_b_', are renamed to '/bias', '/bias_', resp.
+        op3 = caffe2_pb2.OperatorDef()
+        op3.input.extend(['foo_b'])
+        op3.output.extend(['foo_b_2'])
+        # Test that '_s', '_s_', are renamed to '/scale', '/scale_', resp.
+        op4 = caffe2_pb2.OperatorDef()
+        op4.input.extend(['foo_s'])
+        op4.output.extend(['foo_s_2'])
+        # Test that '_sum', '_sum_', are renamed to '/sum', '/sum_', resp.
+        op5 = caffe2_pb2.OperatorDef()
+        op5.input.extend(['foo_sum'])
+        op5.output.extend(['foo_sum_2'])
+        # Test that '_branch', '_branch_', are renamed to '/branch', '/branch_',
+        # respectively. Multiple inputs/outputs are also tested in this case.
+        op6 = caffe2_pb2.OperatorDef()
+        op6.input.extend(['foo_branch'])
+        op6.input.extend(['test_branch_2'])
+        op6.output.extend(['foo_branch_3'])
+        op6.output.extend(['test_branch4'])
+        shapes = {
+            'foo_w': [1], 'foo_w_2': [2], 'foo_bn': [3], 'foo_bn_2': [4],
+            'foo_b': [5], 'foo_b_2': [6], 'foo_s': [7], 'foo_s_2': [8],
+            'foo_sum': [9], 'foo_sum_2': [10], 'foo_branch': [11],
+            'test_branch_2': [12], 'foo_branch_3': [13], 'test_branch4': [14],
+        }
+        ops = [op1, op2, op3, op4, op5, op6]
+        blob_name_tracker = tb._get_blob_names(ops)
+        tb._rename_tensorflow_style(shapes, blob_name_tracker, ops)
+        # Testing that keys in blob name tracker were renamed correctly
+        self.assertEqual(blob_name_tracker['foo/weight'], 'foo_w')
+        self.assertEqual(blob_name_tracker['foo/weight_2'], 'foo_w_2')
+        self.assertEqual(blob_name_tracker['foo/batchnorm'], 'foo_bn')
+        self.assertEqual(blob_name_tracker['foo/batchnorm_2'], 'foo_bn_2')
+        self.assertEqual(blob_name_tracker['foo/bias'], 'foo_b')
+        self.assertEqual(blob_name_tracker['foo/bias_2'], 'foo_b_2')
+        self.assertEqual(blob_name_tracker['foo/scale'], 'foo_s')
+        self.assertEqual(blob_name_tracker['foo/scale_2'], 'foo_s_2')
+        self.assertEqual(blob_name_tracker['foo/sum'], 'foo_sum')
+        self.assertEqual(blob_name_tracker['foo/sum_2'], 'foo_sum_2')
+        self.assertEqual(blob_name_tracker['foo/branch'], 'foo_branch')
+        self.assertEqual(blob_name_tracker['test/branch_2'], 'test_branch_2')
+        self.assertEqual(blob_name_tracker['foo/branch_3'], 'foo_branch_3')
+        self.assertEqual(blob_name_tracker['test/branch4'], 'test_branch4')
+        # Testing that keys in shapes were renamed correctly
+        self.assertEqual(shapes['foo/weight'], [1])
+        self.assertEqual(shapes['foo/batchnorm_2'], [4])
+        self.assertEqual(shapes['foo/sum'], [9])
+        self.assertEqual(shapes['test/branch_2'], [12])
+        # Testing that the ops were renamed correctly
+        self.assertEqual(op1.input[0], 'foo/weight')
+        self.assertEqual(op1.output[0], 'foo/weight_2')
+        self.assertEqual(op2.input[0], 'foo/batchnorm')
+        self.assertEqual(op2.output[0], 'foo/batchnorm_2')
+        self.assertEqual(op3.input[0], 'foo/bias')
+        self.assertEqual(op3.output[0], 'foo/bias_2')
+        self.assertEqual(op4.input[0], 'foo/scale')
+        self.assertEqual(op4.output[0], 'foo/scale_2')
+        self.assertEqual(op5.input[0], 'foo/sum')
+        self.assertEqual(op5.output[0], 'foo/sum_2')
+        self.assertEqual(op6.input[0], 'foo/branch')
+        self.assertEqual(op6.input[1], 'test/branch_2')
+        self.assertEqual(op6.output[0], 'foo/branch_3')
+        self.assertEqual(op6.output[1], 'test/branch4')
+
+    def test_filter_ops(self):
+        op1 = caffe2_pb2.OperatorDef()
+        op1.input.extend(['remove_this'])
+        op1.output.extend(['random_output'])
+        op2 = caffe2_pb2.OperatorDef()
+        op2.input.extend(['leave_this'])
+        op2.output.extend(['leave_this_also'])
+        op3 = caffe2_pb2.OperatorDef()
+        op3.input.extend(['random_input'])
+        op3.output.extend(['remove_this_also'])
+
+        def filter_fn(blob):
+            # Filter all blobs with names containing 'remove'
+            return 'remove' not in str(blob)
+
+        op_set1 = [op1, op2, op3]
+        op_set2 = [op1, op2, op3]
+
+        # Test case for when perform_filter = True.
+        result_ops1 = tb._filter_ops(op_set1, filter_fn, True)
+        new_op1, new_op2 = result_ops1[0], result_ops1[1]
+        # input named 'remove_this' should have been filtered
+        self.assertEqual(len(new_op1.input), 0)
+        self.assertEqual(new_op1.output, ['random_output'])
+        self.assertEqual(new_op2.input, ['leave_this'])
+        self.assertEqual(new_op2.output, ['leave_this_also'])
+        # output named 'remove_this_also' should have been filtered as well.
+        # This should have also removed op3 as the filter function excludes ops
+        # with no outputs.
+        self.assertEqual(len(result_ops1), 2)
+
+        # Test case for when perform_filter = False. op_set2 should remain
+        # unchanged.
+        result_ops2 = tb._filter_ops(op_set2, filter_fn, False)
+        self.assertEqual(result_ops2, op_set2)
+
+    # Use show_simplified=False. This shows the original style of graph
+    # visualization from caffe2.contrib.tensorboard.
+    # TODO: Add test for show_simplified=True.
+    def test_simple_cnnmodel(self):
+        model = cnn.CNNModelHelper("NCHW", name="overfeat")
+        workspace.FeedBlob("data", np.random.randn(1, 3, 64, 64).astype(np.float32))
+        workspace.FeedBlob("label", np.random.randn(1, 1000).astype(int))  # np.int (alias of int) was removed in NumPy 1.24
+        with core.NameScope("conv1"):
+            conv1 = model.Conv("data", "conv1", 3, 96, 11, stride=4)
+            relu1 = model.Relu(conv1, conv1)
+            pool1 = model.MaxPool(relu1, "pool1", kernel=2, stride=2)
+        with core.NameScope("classifier"):
+            fc = model.FC(pool1, "fc", 4096, 1000)
+            pred = model.Softmax(fc, "pred")
+            xent = model.LabelCrossEntropy([pred, "label"], "xent")
+            loss = model.AveragedLoss(xent, "loss")
+
+        blob_name_tracker = {}
+        graph = tb.model_to_graph_def(
+            model,
+            blob_name_tracker=blob_name_tracker,
+            shapes={},
+            show_simplified=False,
+        )
+
+        compare_proto(graph, self)
+
+    # cnn.CNNModelHelper is deprecated, so we also test with
+    # model_helper.ModelHelper. The model used in this test is taken from the
+    # Caffe2 MNIST tutorial. Also use show_simplified=False here.
+    def test_simple_model(self):
+        model = model_helper.ModelHelper(name="mnist")
+        # how come those inputs don't break the forward pass =.=a
+        workspace.FeedBlob("data", np.random.randn(1, 3, 64, 64).astype(np.float32))
+        workspace.FeedBlob("label", np.random.randn(1, 1000).astype(int))  # np.int (alias of int) was removed in NumPy 1.24
+
+        with core.NameScope("conv1"):
+            conv1 = brew.conv(model, "data", 'conv1', dim_in=1, dim_out=20, kernel=5)
+            # Image size: 24 x 24 -> 12 x 12
+            pool1 = brew.max_pool(model, conv1, 'pool1', kernel=2, stride=2)
+            # Image size: 12 x 12 -> 8 x 8
+            conv2 = brew.conv(model, pool1, 'conv2', dim_in=20, dim_out=100, kernel=5)
+            # Image size: 8 x 8 -> 4 x 4
+            pool2 = brew.max_pool(model, conv2, 'pool2', kernel=2, stride=2)
+        with core.NameScope("classifier"):
+            # 100 * 4 * 4 stands for dim_out from previous layer multiplied by the image size
+            fc3 = brew.fc(model, pool2, 'fc3', dim_in=100 * 4 * 4, dim_out=500)
+            relu = brew.relu(model, fc3, fc3)
+            pred = brew.fc(model, relu, 'pred', 500, 10)
+            softmax = brew.softmax(model, pred, 'softmax')
+            xent = model.LabelCrossEntropy([softmax, "label"], 'xent')
+            # compute the expected loss
+            loss = model.AveragedLoss(xent, "loss")
+        model.net.RunAllOnMKL()
+        model.param_init_net.RunAllOnMKL()
+        model.AddGradientOperators([loss], skip=1)
+        blob_name_tracker = {}
+        graph = tb.model_to_graph_def(
+            model,
+            blob_name_tracker=blob_name_tracker,
+            shapes={},
+            show_simplified=False,
+        )
+
+        compare_proto(graph, self)
+
+
+if __name__ == "__main__":  # run the suite when this file is executed directly
+    unittest.main()
diff --git a/tensorboardX/tests/test_chainer_np.py b/tensorboardX/tests/test_chainer_np.py
new file mode 100644
index 0000000000000000000000000000000000000000..923ea72639216576e5d42a1046c3439a11ec392a
--- /dev/null
+++ b/tensorboardX/tests/test_chainer_np.py
@@ -0,0 +1,41 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+from tensorboardX import x2num, SummaryWriter
+try:
+    import chainer
+    chainer_installed = True
+except ImportError:
+    print('Chainer is not installed, skipping test')
+    chainer_installed = False
+import numpy as np
+import unittest
+
+
+if chainer_installed:
+    # Module-level fixtures shared by the tests below: Variables of rank 3, 1 and 5.
+    tensors = [chainer.Variable(np.random.rand(3, 10, 10)),
+               chainer.Variable(np.random.rand(1)),
+               chainer.Variable(np.random.rand(1, 2, 3, 4, 5))]
+
+    class ChainerTest(unittest.TestCase):
+        """x2num.make_np / SummaryWriter checks for chainer.Variable; not defined without chainer."""
+        def test_chainer_np(self):
+            for tensor in tensors:
+                self.assertIsInstance(x2num.make_np(tensor), np.ndarray)
+
+            # python primitive type
+            self.assertIsInstance(x2num.make_np(0), np.ndarray)
+            self.assertIsInstance(x2num.make_np(0.1), np.ndarray)
+
+        def test_chainer_img(self):
+            shapes = [(77, 3, 13, 7), (77, 1, 13, 7), (3, 13, 7), (1, 13, 7), (13, 7)]
+            for s in shapes:
+                x = chainer.Variable(np.random.random_sample(s))
+                # assert x2num.make_np(x, 'IMG').shape[2] == 3
+
+        def test_chainer_write(self):
+            with SummaryWriter() as w:
+                w.add_scalar('scalar', chainer.Variable(np.random.rand(1)), 0)
diff --git a/tensorboardX/tests/test_crc32c.py b/tensorboardX/tests/test_crc32c.py
new file mode 100644
index 0000000000000000000000000000000000000000..792d6c7f0dee115712130109f02adb80c35b2156
--- /dev/null
+++ b/tensorboardX/tests/test_crc32c.py
@@ -0,0 +1,18 @@
+import unittest
+from tensorboardX.crc32c import _crc32c, _crc32c_native, crc32c
+
+
+class CRC32CTest(unittest.TestCase):
+    """Every crc32c entry point must return 0x92c80a31 for b'abcd'."""
+
+    def test_crc32c(self):
+        self.assertEqual(crc32c(b'abcd'), 0x92c80a31)
+
+    def test_crc32c_python(self):
+        self.assertEqual(_crc32c(b'abcd'), 0x92c80a31)
+
+    def test_crc32c_native(self):
+        if _crc32c_native is None:
+            # Report a skip rather than a silent pass when _crc32c_native is None.
+            self.skipTest('native crc32c implementation is not available')
+        self.assertEqual(_crc32c_native(b'abcd'), 0x92c80a31)
diff --git a/tensorboardX/tests/test_embedding.py b/tensorboardX/tests/test_embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf45e2cc147a8bc7c3734677f94d42f0b583a2c4
--- /dev/null
+++ b/tensorboardX/tests/test_embedding.py
@@ -0,0 +1,65 @@
+import unittest
+import torch
+from tensorboardX import SummaryWriter
+
+
+class EmbeddingTest(unittest.TestCase):
+    """Smoke tests for SummaryWriter.add_embedding with metadata and label images."""
+
+    def _add_labelled_embedding(self, all_images):
+        """Log a 3-point embedding twice: plain labels first, then two-column metadata.
+
+        Args:
+            all_images: tensor handed to add_embedding as ``label_img`` in both calls.
+        """
+        all_features = torch.Tensor([[1, 2, 3], [5, 4, 1], [3, 7, 7]])
+        all_labels = torch.Tensor([33, 44, 55])
+        # `with` closes the writer; it used to be left open at the end of the test.
+        with SummaryWriter() as w:
+            w.add_embedding(all_features,
+                            metadata=all_labels,
+                            label_img=all_images,
+                            global_step=2)
+
+            # zip() stops after the three labels, so the fourth dataset tag is unused.
+            dataset_label = ['test'] * 2 + ['train'] * 2
+            all_labels = list(zip(all_labels, dataset_label))
+            w.add_embedding(all_features,
+                            metadata=all_labels,
+                            label_img=all_images,
+                            metadata_header=['digit', 'dataset'],
+                            global_step=2)
+
+    def _add_scaled_images(self, comment, height, width):
+        """Log 228 random features whose i-th image is scaled by (i + 60) / (228 + 60).
+
+        Args:
+            comment: ``comment`` argument for the SummaryWriter.
+            height: size of the third dimension of the (228, 3, height, width) image tensor.
+            width: size of the fourth dimension of that tensor.
+        """
+        all_features = torch.rand(228, 256)
+        all_images = torch.rand(228, 3, height, width)
+        for i in range(all_images.shape[0]):
+            all_images[i] *= (float(i) + 60) / (all_images.shape[0] + 60)
+        with SummaryWriter(comment=comment) as w:
+            w.add_embedding(all_features,
+                            label_img=all_images,
+                            global_step=2)
+
+    def test_embedding(self):
+        """Default-dtype label images."""
+        self._add_labelled_embedding(torch.zeros(3, 3, 5, 5))
+
+    def test_embedding_64(self):
+        """float64 label images."""
+        self._add_labelled_embedding(torch.zeros((3, 3, 5, 5), dtype=torch.float64))
+
+    def test_embedding_square(self):
+        """32 x 32 label images are logged without error."""
+        self._add_scaled_images('sq', 32, 32)
+
+    def test_embedding_fail(self):
+        """16 x 32 (non-square) label images must raise AssertionError."""
+        with self.assertRaises(AssertionError):
+            self._add_scaled_images('shouldfail', 16, 32)
diff --git a/tensorboardX/tests/test_figure.py b/tensorboardX/tests/test_figure.py
new file mode 100644
index 0000000000000000000000000000000000000000..dbe5c74f49b508dd4768f099ee1b7c0aa505b63d
--- /dev/null
+++ b/tensorboardX/tests/test_figure.py
@@ -0,0 +1,51 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import matplotlib.pyplot as plt
+import unittest
+
+from tensorboardX import SummaryWriter
+
+
+class FigureTest(unittest.TestCase):
+    """add_figure must close the figures it logs unless close=False is passed."""
+
+    def test_figure(self):
+        """Single figure: kept open with close=False, closed by the default call."""
+        figure, axes = plt.figure(), plt.gca()
+        circle1 = plt.Circle((0.2, 0.5), 0.2, color='r')
+        circle2 = plt.Circle((0.8, 0.5), 0.2, color='g')
+        axes.add_patch(circle1)
+        axes.add_patch(circle2)
+        plt.axis('scaled')
+        plt.tight_layout()
+
+        # `with` closes the writer even when an assertion below fails.
+        with SummaryWriter() as writer:
+            writer.add_figure("add_figure/figure", figure, 0, close=False)
+            self.assertTrue(plt.fignum_exists(figure.number))
+
+            # No close argument here: the figure is expected to be closed afterwards.
+            writer.add_figure("add_figure/figure", figure, 1)
+            self.assertFalse(plt.fignum_exists(figure.number))
+
+    def test_figure_list(self):
+        """List of five figures: same close / close=False expectations as for one figure."""
+        figures = []
+        for i in range(5):
+            figure = plt.figure()
+            plt.plot([i * 1, i * 2, i * 3], label="Plot " + str(i))
+            plt.xlabel("X")
+            plt.ylabel("Y")  # was a second xlabel() call, which replaced the "X" label
+            plt.legend()
+            plt.tight_layout()
+            figures.append(figure)
+
+        with SummaryWriter() as writer:
+            writer.add_figure("add_figure/figure_list", figures, 0, close=False)
+            self.assertTrue(all(plt.fignum_exists(figure.number) for figure in figures))
+
+            writer.add_figure("add_figure/figure_list", figures, 1)
+            self.assertFalse(any(plt.fignum_exists(figure.number) for figure in figures))
diff --git a/tensorboardX/tests/test_numpy.py b/tensorboardX/tests/test_numpy.py
new file mode 100644
index 0000000000000000000000000000000000000000..799c7b3c67889f37a28dd247e3ddd429a1d1b2da
--- /dev/null
+++ b/tensorboardX/tests/test_numpy.py
@@ -0,0 +1,36 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import numpy as np
+import unittest
+
+from tensorboardX import x2num
+
+
+class NumpyTest(unittest.TestCase):
+    """x2num.make_np checks for Python and NumPy scalars; the video tests assert nothing yet."""
+
+    def test_scalar(self):
+        """Each scalar below must come back as an ndarray of shape (1,)."""
+        # (1 << 64) - 1 is the uint64 maximum (unparenthesised it parsed as 1 << 63);
+        # np.longdouble replaces np.float128, which not every platform defines.
+        for value in (1.1, (1 << 64) - 1, np.float16(1.00000087),
+                      np.longdouble(1.00008 + 9), np.int64(100000000000)):
+            res = x2num.make_np(value)
+            self.assertIsInstance(res, np.ndarray)
+            self.assertEqual(res.shape, (1,))
+
+    def test_make_grid(self):
+        pass  # placeholder: no make_grid check is implemented
+
+    def test_numpy_vid(self):
+        shapes = [(16, 3, 30, 28, 28), (19, 3, 30, 28, 28), (19, 3, 29, 23, 19)]
+        for s in shapes:
+            x = np.random.random_sample(s)
+            # assert x2num.make_np(x, 'VID').shape[3] == 3
+
+    def test_numpy_vid_uint8(self):
+        x = np.random.randint(0, 256, (16, 3, 30, 28, 28)).astype(np.uint8)
+        # x2num.make_np(x, 'VID').shape[3] == 3
diff --git a/tensorboardX/tests/test_onnx_graph.py b/tensorboardX/tests/test_onnx_graph.py
new file mode 100644
index 0000000000000000000000000000000000000000..01f3acdef1ce60a6102b5e902bb62b4e3d46f105
--- /dev/null
+++ b/tensorboardX/tests/test_onnx_graph.py
@@ -0,0 +1,16 @@
+import unittest
+import torch
+from tensorboardX import SummaryWriter
+
+
+class ONNXGraphTest(unittest.TestCase):
+    def test_onnx_graph(self):
+        """Fetch the MNIST archive with wget, unpack model.onnx into examples/, log its graph."""
+        import subprocess
+        zoo_address = 'https://onnxzoo.blob.core.windows.net/models/opset_8/mnist/mnist.tar.gz'
+        res = subprocess.call(['wget', '-nc', zoo_address])
+        assert res == 0, 'cannot download example onnx model from the zoo'
+        res = subprocess.call(['tar', 'xf', 'mnist.tar.gz', '-C', 'examples/', 'mnist/model.onnx'])
+        assert res == 0, 'cannot extract mnist/model.onnx from mnist.tar.gz'
+        with SummaryWriter() as w:
+            w.add_onnx_graph('examples/mnist/model.onnx')
diff --git a/tensorboardX/tests/test_pr_curve.py b/tensorboardX/tests/test_pr_curve.py
new file mode 100644
index 0000000000000000000000000000000000000000..b96ddfb7b9c54a376490a7e83f646715711c2c45
--- /dev/null
+++ b/tensorboardX/tests/test_pr_curve.py
@@ -0,0 +1,68 @@
+import unittest
+import torch
+import numpy as np
+from tensorboardX import SummaryWriter
+from tensorboardX import summary
+from .expect_reader import compare_proto
+
+np.random.seed(0)  # fixes NumPy's global RNG, which test_smoke draws from
+true_positive_counts = [75, 64, 21, 5, 0]  # five parallel entries per list, fed to *pr_curve_raw
+false_positive_counts = [150, 105, 18, 0, 0]
+true_negative_counts = [0, 45, 132, 150, 150]
+false_negative_counts = [0, 11, 54, 70, 75]
+precision = [0.3333333, 0.3786982, 0.5384616, 1.0, 0.0]
+recall = [1.0, 0.8533334, 0.28, 0.0666667, 0.0]
+
+
+class PRCurveTest(unittest.TestCase):
+    """PR-curve summaries: one writer smoke test and two expect-file comparisons."""
+
+    def test_smoke(self):
+        """add_pr_curve and add_pr_curve_raw are both called inside one SummaryWriter."""
+        with SummaryWriter() as writer:
+            labels = np.random.randint(2, size=100)
+            predictions = np.random.rand(100)
+            writer.add_pr_curve('xoxo', labels, predictions, 1)
+            writer.add_pr_curve_raw('prcurve with raw data',
+                                    true_positive_counts, false_positive_counts,
+                                    true_negative_counts, false_negative_counts,
+                                    precision, recall, 1)
+
+    def test_pr_purve(self):
+        """pr_curve for fixed labels/probabilities matches the stored expectation."""
+        # NOTE: keep the "purve" spelling; compare_proto finds the .expect file by method name.
+        random_labels = np.array([0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
+            1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0,
+            0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1,
+            1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0,
+            1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0])
+        random_probs = np.array([0.33327776, 0.30032885, 0.79012837, 0.04306813, 0.65221544,
+            0.58481968, 0.28305522, 0.53795795, 0.00729739, 0.52266951,
+            0.22464247, 0.11262435, 0.41573075, 0.92493992, 0.73066758,
+            0.43867735, 0.27955449, 0.56975382, 0.53933028, 0.34392824,
+            0.30312509, 0.81732807, 0.55408544, 0.3969487 , 0.31768033,
+            0.24353266, 0.47198005, 0.19999122, 0.05788022, 0.24046305,
+            0.04651082, 0.30061738, 0.78321545, 0.82670207, 0.49200517,
+            0.80904619, 0.96711993, 0.3160946 , 0.01049424, 0.60108337,
+            0.56508792, 0.83729429, 0.9717386 , 0.46306053, 0.80232138,
+            0.24166823, 0.7393237 , 0.50820418, 0.04944932, 0.53854157,
+            0.10765172, 0.84723855, 0.20518299, 0.3143431 , 0.51299074,
+            0.47065695, 0.54267833, 0.1812676 , 0.06265177, 0.34110327,
+            0.30915171, 0.91870169, 0.91309447, 0.31395817, 0.36780571,
+            0.98297986, 0.00594547, 0.52839042, 0.70229202, 0.37779588,
+            0.15207045, 0.59759632, 0.72397032, 0.71502195, 0.90135725,
+            0.43970107, 0.17123532, 0.08785938, 0.04986818, 0.62702444,
+            0.69171023, 0.30537792, 0.30285433, 0.27124347, 0.27693729,
+            0.7136039 , 0.48022489, 0.20916285, 0.2018599 , 0.92401008,
+            0.30189681, 0.46862626, 0.96353024, 0.30468533, 0.68281294,
+            0.30623562, 0.40795975, 0.76824531, 0.89824215, 0.69845035], dtype=np.float16)
+        compare_proto(summary.pr_curve('tag', random_labels, random_probs, 1), self)
+
+    def test_pr_purve_raw(self):
+        """pr_curve_raw for the module-level counts matches the stored expectation."""
+        # NOTE: the method name (typo included) selects the .expect file; do not rename.
+        raw_summary = summary.pr_curve_raw('prcurve with raw data',
+                                           true_positive_counts, false_positive_counts,
+                                           true_negative_counts, false_negative_counts,
+                                           precision, recall, 1)
+        compare_proto(raw_summary, self)
diff --git a/tensorboardX/tests/test_pytorch_graph.py b/tensorboardX/tests/test_pytorch_graph.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e4b5ce7f3a364f0534fd61d2f293167e2015004
--- /dev/null
+++ b/tensorboardX/tests/test_pytorch_graph.py
@@ -0,0 +1,28 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+import unittest
+import torch
+from tensorboardX import SummaryWriter
+
+
+class PytorchGraphTest(unittest.TestCase):
+    def test_pytorch_graph(self):
+        dummy_input = (torch.zeros(1, 3),)
+
+        class myLinear(torch.nn.Module):
+            def __init__(self):
+                super(myLinear, self).__init__()
+                self.linear = torch.nn.Linear(3, 5)
+
+            def forward(self, x):
+                return self.linear(x)
+
+        with SummaryWriter(comment='LinearModel') as w:
+            w.add_graph(myLinear(), dummy_input, True)
+
+    def test_wrong_input_size(self):
+        print('expect error here:')
+        with self.assertRaises(RuntimeError):
+            dummy_input = torch.rand(1, 9)  # one sample with 9 features ...
+            model = torch.nn.Linear(3, 5)  # ... fed to a layer built for 3 input features
+            with SummaryWriter(comment='expect_error') as w:
+                w.add_graph(model, dummy_input)  # the 9-vs-3 mismatch is expected to raise RuntimeError
diff --git a/tensorboardX/tests/test_pytorch_np.py b/tensorboardX/tests/test_pytorch_np.py
new file mode 100644
index 0000000000000000000000000000000000000000..440bacbe9049f19993d9603687b0dec122dc6281
--- /dev/null
+++ b/tensorboardX/tests/test_pytorch_np.py
@@ -0,0 +1,70 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+from tensorboardX import x2num, SummaryWriter
+import torch
+import numpy as np
+import unittest
+
+
+class PyTorchNumpyTest(unittest.TestCase):
+    def test_pytorch_np(self):
+        tensors = [torch.rand(3, 10, 10), torch.rand(1), torch.rand(1, 2, 3, 4, 5)]
+        for tensor in tensors:
+            # regular tensor
+            assert isinstance(x2num.make_np(tensor), np.ndarray)
+
+            # CUDA tensor
+            if torch.cuda.device_count() > 0:
+                assert isinstance(x2num.make_np(tensor.cuda()), np.ndarray)
+
+            # regular variable
+            assert isinstance(x2num.make_np(torch.autograd.Variable(tensor)), np.ndarray)
+
+            # CUDA variable
+            if torch.cuda.device_count() > 0:
+                assert isinstance(x2num.make_np(torch.autograd.Variable(tensor).cuda()), np.ndarray)
+
+        # python primitive type
+        assert(isinstance(x2num.make_np(0), np.ndarray))
+        assert(isinstance(x2num.make_np(0.1), np.ndarray))
+
+    def test_pytorch_write(self):
+        with SummaryWriter() as w:
+            w.add_scalar('scalar', torch.autograd.Variable(torch.rand(1)), 0)
+
+    def test_pytorch_histogram(self):
+        with SummaryWriter() as w:
+            w.add_histogram('float histogram', torch.rand((50,)))
+            w.add_histogram('int histogram', torch.randint(0, 100, (50,)))
+
+    def test_pytorch_histogram_raw(self):
+        with SummaryWriter() as w:
+            num = 50
+            floats = x2num.make_np(torch.rand((num,)))
+            bins = [0.0, 0.25, 0.5, 0.75, 1.0]
+            counts, limits = np.histogram(floats, bins)
+            sum_sq = floats.dot(floats).item()
+            w.add_histogram_raw('float histogram raw',
+                                min=floats.min().item(),
+                                max=floats.max().item(),
+                                num=num,
+                                sum=floats.sum().item(),
+                                sum_squares=sum_sq,
+                                bucket_limits=limits[1:].tolist(),
+                                bucket_counts=counts.tolist())
+
+            ints = x2num.make_np(torch.randint(0, 100, (num,)))
+            bins = [0, 25, 50, 75, 100]
+            counts, limits = np.histogram(ints, bins)
+            sum_sq = ints.dot(ints).item()
+            w.add_histogram_raw('int histogram raw',
+                                min=ints.min().item(),
+                                max=ints.max().item(),
+                                num=num,
+                                sum=ints.sum().item(),
+                                sum_squares=sum_sq,
+                                bucket_limits=limits[1:].tolist(),
+                                bucket_counts=counts.tolist())
diff --git a/tensorboardX/tests/test_record_writer.py b/tensorboardX/tests/test_record_writer.py
new file mode 100644
index 0000000000000000000000000000000000000000..517b2557fa2dbc8ffa2af36e5f1a48baf9d32fe9
--- /dev/null
+++ b/tensorboardX/tests/test_record_writer.py
@@ -0,0 +1,26 @@
+from tensorboardX import SummaryWriter
+import unittest
+from tensorboardX.record_writer import S3RecordWriter, make_valid_tf_name
+import os
+import boto3
+from moto import mock_s3
+
+os.environ.setdefault("AWS_ACCESS_KEY_ID", "foobar_key")
+os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "foobar_secret")
+
+
+class RecordWriterTest(unittest.TestCase):
+    @mock_s3
+    def test_record_writer_s3(self):
+        client = boto3.client('s3', region_name='us-east-1')
+        client.create_bucket(Bucket='this')
+        writer = S3RecordWriter('s3://this/is/apen')
+        bucket, path = writer.bucket_and_path()
+        assert bucket == 'this'
+        assert path == 'is/apen'
+        writer.write(bytes(42))
+        writer.flush()
+
+    def test_make_valid_tf_name(self):
+        newname = make_valid_tf_name('$ave/&sound')
+        assert newname == '._ave/_sound'
diff --git a/tensorboardX/tests/test_summary.py b/tensorboardX/tests/test_summary.py
new file mode 100644
index 0000000000000000000000000000000000000000..180a1d22cecdf5a01920fe54eff2752c29bc87df
--- /dev/null
+++ b/tensorboardX/tests/test_summary.py
@@ -0,0 +1,133 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+from tensorboardX import summary
+from .expect_reader import compare_proto, write_proto
+import numpy as np
+import pytest
+import unittest
+# compare_proto = write_proto  # massive update expect
+
+def tensor_N(shape, dtype=float):
+    """Return np.arange(prod(shape)) created with `dtype` and reshaped to `shape`."""
+    total = np.prod(shape)
+    return np.arange(total, dtype=dtype).reshape(shape)
+
+class SummaryTest(unittest.TestCase):
+    def test_uint8_image(self):
+        '''
+        Tests that uint8 image (pixel values in [0, 255]) is not changed
+        '''
+        test_image = tensor_N(shape=(3, 32, 32), dtype=np.uint8)
+        compare_proto(summary.image('dummy', test_image), self)
+
+    def test_float32_image(self):
+        '''
+        Tests that float32 image (pixel values in [0, 1]) are scaled correctly
+        to [0, 255]
+        '''
+        test_image = tensor_N(shape=(3, 32, 32))
+        compare_proto(summary.image('dummy', test_image), self)
+
+    def test_float_1_converts_to_uint8_255(self):
+        green_uint8 = np.array([[[0, 255, 0]]], dtype='uint8') 
+        green_float32 = np.array([[[0, 1, 0]]], dtype='float32') 
+
+        a = summary.image(tensor=green_uint8, tag='')
+        b = summary.image(tensor=green_float32, tag='')
+        self.assertEqual(a, b)
+
+    def test_list_input(self):
+        with pytest.raises(Exception):
+            summary.histogram('dummy', [1, 3, 4, 5, 6], 'tensorflow')
+
+    def test_empty_input(self):
+        print('expect error here:')
+        with pytest.raises(Exception):
+            summary.histogram('dummy', np.ndarray(0), 'tensorflow')
+
+    def test_image_with_boxes(self):
+        compare_proto(summary.image_boxes('dummy',
+                            tensor_N(shape=(3, 32, 32)),
+                            np.array([[10, 10, 40, 40]])), self)
+
+    def test_image_with_one_channel(self):
+        compare_proto(summary.image('dummy', tensor_N(shape=(1, 8, 8)), dataformats='CHW'), self)
+
+    def test_image_with_four_channel(self):
+        compare_proto(summary.image('dummy', tensor_N(shape=(4, 8, 8)), dataformats='CHW'), self)
+
+    def test_image_with_one_channel_batched(self):
+        compare_proto(summary.image('dummy', tensor_N(shape=(2, 1, 8, 8)), dataformats='NCHW'), self)
+
+    def test_image_with_3_channel_batched(self):
+        compare_proto(summary.image('dummy', tensor_N(shape=(2, 3, 8, 8)), dataformats='NCHW'), self)
+
+    def test_image_with_four_channel_batched(self):
+        compare_proto(summary.image('dummy', tensor_N(shape=(2, 4, 8, 8)), dataformats='NCHW'), self)
+
+    def test_image_without_channel(self):
+        compare_proto(summary.image('dummy', tensor_N(shape=(8, 8)), dataformats='HW'), self)
+
+    def test_video(self):
+        try:  # moviepy is optional here; without it, report a skip rather than a silent pass
+            import moviepy  # noqa: F401 -- imported only to probe availability
+        except ImportError:
+            self.skipTest('moviepy is not installed')
+        compare_proto(summary.video('dummy', tensor_N(shape=(4, 3, 1, 8, 8))), self)
+        summary.video('dummy', tensor_N(shape=(16, 48, 1, 28, 28)))  # smoke test: only checks that it runs
+        summary.video('dummy', tensor_N(shape=(20, 7, 1, 8, 8)))  # smoke test: only checks that it runs
+
+    def test_audio(self):
+        compare_proto(summary.audio('dummy', tensor_N(shape=(42,))), self)
+
+    def test_text(self):
+        compare_proto(summary.text('dummy', 'text 123'), self)
+
+    def test_histogram_auto(self):
+        compare_proto(summary.histogram('dummy', tensor_N(shape=(1024,)), bins='auto', max_bins=5), self)
+
+    def test_histogram_fd(self):
+        compare_proto(summary.histogram('dummy', tensor_N(shape=(1024,)), bins='fd', max_bins=5), self)
+
+    def test_histogram_doane(self):
+        compare_proto(summary.histogram('dummy', tensor_N(shape=(1024,)), bins='doane', max_bins=5), self)
+
+    def test_custom_scalars(self):
+        layout = {'Taiwan': {'twse': ['Multiline', ['twse/0050', 'twse/2330']]},
+                    'USA': {'dow': ['Margin', ['dow/aaa', 'dow/bbb', 'dow/ccc']],
+                            'nasdaq': ['Margin', ['nasdaq/aaa', 'nasdaq/bbb', 'nasdaq/ccc']]}}
+        summary.custom_scalars(layout)  # smoke test only.
+
+    def test_mesh(self):
+        vertices_tensor = np.array([[
+            [1, 1, 1],
+            [-1, -1, 1],
+            [1, -1, -1],
+            [-1, 1, -1],
+        ]], dtype=float)
+        colors_tensor = np.array([[
+            [255, 0, 0],
+            [0, 255, 0],
+            [0, 0, 255],
+            [255, 0, 255],
+        ]], dtype=int)
+        faces_tensor = np.array([[
+            [0, 2, 3],
+            [0, 3, 1],
+            [0, 1, 2],
+            [1, 3, 2],
+        ]], dtype=int)
+        compare_proto(summary.mesh('my_mesh', vertices=vertices_tensor, colors=colors_tensor, faces=faces_tensor), self)
+
+    # It's hard to get dictionary sorted with same result in various envs. So only use one.
+    def test_hparams(self):
+        hp = {'lr': 0.1}
+        mt = {'accuracy': 0.1}
+        compare_proto(summary.hparams(hp, mt), self)
+
+    def test_hparams_smoke(self):
+        hp = {'lr': 0.1, 'bsize': 4}
+        mt = {'accuracy': 0.1, 'loss': 10}
+        summary.hparams(hp, mt)
+        
+        hp = {'string': "1b", 'use magic': True}
+        summary.hparams(hp, mt)
diff --git a/tensorboardX/tests/test_summary_writer.py b/tensorboardX/tests/test_summary_writer.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e697108553412246e6464d7b233105b2eb51800
--- /dev/null
+++ b/tensorboardX/tests/test_summary_writer.py
@@ -0,0 +1,45 @@
+from tensorboardX import SummaryWriter
+import unittest
+
+
+class SummaryWriterTest(unittest.TestCase):
+    def test_summary_writer_ctx(self):
+        # after using a SummaryWriter as a ctx it should be closed
+        with SummaryWriter(filename_suffix='.test') as writer:
+            writer.add_scalar('test', 1)
+        assert writer.file_writer is None
+
+    def test_summary_writer_backcomapt(self):
+        with SummaryWriter(log_dir='/tmp/tbxtest') as writer:
+            writer.add_scalar('test', 1)
+
+    def test_summary_writer_close(self):
+        # Opening and closing a SummaryWriter must not raise OSError ([Errno 24] Too many
+        # open files). NOTE(review): only one open/close cycle runs here, not "a lot".
+        passed = True
+        try:
+            writer = SummaryWriter()
+            writer.close()
+        except OSError:
+            passed = False
+
+        assert passed
+
+    def test_windowsPath(self):
+        dummyPath = "C:\\Downloads\\fjoweifj02utj43tj430"
+        with SummaryWriter(dummyPath) as writer:
+            writer.add_scalar('test', 1)
+        import shutil
+        shutil.rmtree(dummyPath)
+
+    def test_pathlib(self):
+        import sys
+        if sys.version_info.major == 2:
+            import pathlib2 as pathlib
+        else:
+            import pathlib
+        p = pathlib.Path('./pathlibtest')
+        with SummaryWriter(p) as writer:
+            writer.add_scalar('test', 1)
+        import shutil
+        shutil.rmtree(str(p))
diff --git a/tensorboardX/tests/test_test.py b/tensorboardX/tests/test_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c5eaf2a64cb1e31f44d6643ef7958b13a85c099c
--- /dev/null
+++ b/tensorboardX/tests/test_test.py
@@ -0,0 +1,3 @@
+def test_linting():
+    import subprocess
+    # Lint check is currently disabled (this test is a no-op): subprocess.check_output(['flake8', 'tensorboardX'])
diff --git a/tensorboardX/tests/test_utils.py b/tensorboardX/tests/test_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..6fe24dbd7f704a706ec21098877bc759eea376a7
--- /dev/null
+++ b/tensorboardX/tests/test_utils.py
@@ -0,0 +1,29 @@
+from tensorboardX import summary
+from tensorboardX.utils import make_grid, _prepare_video, convert_to_HWC
+import numpy as np
+import pytest
+import unittest
+
+
+class UtilsTest(unittest.TestCase):
+    def test_to_HWC(self):
+        np.random.seed(1)
+        test_image = np.random.randint(0, 256, size=(3, 32, 32), dtype=np.uint8)
+        converted = convert_to_HWC(test_image, 'chw')
+        assert converted.shape == (32, 32, 3)
+        test_image = np.random.randint(0, 256, size=(16, 3, 32, 32), dtype=np.uint8)
+        converted = convert_to_HWC(test_image, 'nchw')
+        assert converted.shape == (64, 256, 3)
+        test_image = np.random.randint(0, 256, size=(32, 32), dtype=np.uint8)
+        converted = convert_to_HWC(test_image, 'hw')
+        assert converted.shape == (32, 32, 3)
+
+    def test_prepare_video(self):
+        # at each timestep the sum over all other dimensions of the video should stay the same
+        np.random.seed(1)  # fixed seed keeps the random video reproducible
+        V_before = np.random.random((4, 10, 3, 20, 20))
+        V_after = _prepare_video(np.copy(V_before))  # pass a copy so V_before itself is left untouched
+        V_before = np.swapaxes(V_before, 0, 1)  # bring the size-10 axis to the front
+        V_before = np.reshape(V_before, (10, -1))  # positional shape: the `newshape=` keyword is deprecated since NumPy 2.1
+        V_after = np.reshape(V_after, (10, -1))
+        np.testing.assert_array_almost_equal(np.sum(V_before, axis=1), np.sum(V_after, axis=1))
diff --git a/tensorboardX/tests/test_visdom.py b/tensorboardX/tests/test_visdom.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6b0e13fc6ed3b5e9ba67c0d9822f0f5047f8a2c
--- /dev/null
+++ b/tensorboardX/tests/test_visdom.py
@@ -0,0 +1,33 @@
+from tensorboardX import TorchVis
+
+import numpy as np
+import pytest
+import unittest
+
+true_positive_counts = [75, 64, 21, 5, 0]
+false_positive_counts = [150, 105, 18, 0, 0]
+true_negative_counts = [0, 45, 132, 150, 150]
+false_negative_counts = [0, 11, 54, 70, 75]
+precision = [0.3333333, 0.3786982, 0.5384616, 1.0, 0.0]
+recall = [1.0, 0.8533334, 0.28, 0.0666667, 0.0]
+
+
+class VisdomTest(unittest.TestCase):
+    def test_TorchVis(self):
+        w = TorchVis('visdom')
+        w.add_scalar('scalar_visdom', 1, 0)
+        w.add_scalar('scalar_visdom', 2, 1)
+        w.add_histogram('histogram_visdom', np.array([1, 2, 3, 4, 5]), 1)
+        w.add_image('image_visdom', np.zeros((3, 20, 20)), 2)  # zeros, not np.ndarray(...): that returns uninitialised memory
+        # w.add_video('video_visdom', np.ndarray((1, 3, 10, 20, 20)), 3)
+        w.add_audio('audio_visdom', [1, 2, 3, 4, 5])
+        w.add_text('text_visdom', 'mystring')
+        w.add_pr_curve('pr_curve_visdom', np.random.randint(2, size=100), np.random.rand(100), 10)
+        w.add_pr_curve_raw('prcurve with raw data',
+                           true_positive_counts,
+                           false_positive_counts,
+                           true_negative_counts,
+                           false_negative_counts,
+                           precision,
+                           recall, 20)
+        del w
diff --git a/tensorboardX/tests/test_writer.py b/tensorboardX/tests/test_writer.py
new file mode 100644
index 0000000000000000000000000000000000000000..992bb7614f80b3e4a6e31fffdb2694b25bb0bc37
--- /dev/null
+++ b/tensorboardX/tests/test_writer.py
@@ -0,0 +1,111 @@
+from tensorboardX import SummaryWriter
+from tensorboard.compat.tensorflow_stub.pywrap_tensorflow import PyRecordReader_New
+from tensorboardX.proto import event_pb2
+
+import numpy as np
+import pytest
+import unittest
+import time
+freqs = [262, 294, 330, 349, 392, 440, 440, 440, 440, 440, 440]
+
+true_positive_counts = [75, 64, 21, 5, 0]
+false_positive_counts = [150, 105, 18, 0, 0]
+true_negative_counts = [0, 45, 132, 150, 150]
+false_negative_counts = [0, 11, 54, 70, 75]
+precision = [0.3333333, 0.3786982, 0.5384616, 1.0, 0.0]
+recall = [1.0, 0.8533334, 0.28, 0.0666667, 0.0]
+
+
+class WriterTest(unittest.TestCase):
+    def test_flush_timer_is_short_so_data_is_there(self):  # unique name: the later `test_flush` in this class would otherwise shadow it
+        N_TEST = 5
+        with SummaryWriter(flush_secs=1) as w:  # context manager closes the writer even if a read fails
+            f = w.file_writer.event_writer._ev_writer._file_name
+            for i in range(N_TEST):
+                w.add_scalar('a', i)
+                time.sleep(2)  # longer than flush_secs, so the timer should have flushed each event
+            r = PyRecordReader_New(f)  # read before the writer closes: only the timer has flushed so far
+            r.GetNext()  # meta data, so skip
+            for _ in range(N_TEST):  # all of the data should be flushed
+                r.GetNext()
+
+    def test_flush_timer_is_long_so_data_is_not_there(self):
+        with self.assertRaises(BaseException):
+            N_TEST = 5
+            w = SummaryWriter(flush_secs=20)
+            f = w.file_writer.event_writer._ev_writer._file_name
+            for i in range(N_TEST):
+                w.add_scalar('a', i)
+                time.sleep(2)
+            r = PyRecordReader_New(f)
+            r.GetNext()  # meta data, so skip
+            for _ in range(N_TEST):  # missing data
+                r.GetNext()
+
+    def test_flush_after_close(self):
+        N_TEST = 5
+        w = SummaryWriter(flush_secs=20)
+        f = w.file_writer.event_writer._ev_writer._file_name
+        for i in range(N_TEST):
+            w.add_scalar('a', i)
+            time.sleep(2)
+        w.close()
+        r = PyRecordReader_New(f)
+        r.GetNext()  # meta data, so skip
+        for _ in range(N_TEST):  # all of the data should be flushed
+            r.GetNext()
+
+    def test_flush(self):
+        N_TEST = 5
+        with SummaryWriter(flush_secs=20) as w:  # context manager closes the writer even if a read fails
+            f = w.file_writer.event_writer._ev_writer._file_name
+            for i in range(N_TEST):
+                w.add_scalar('a', i)
+                time.sleep(2)  # 10 s in total, still short of the 20 s flush timer
+            w.flush()  # the explicit flush is what should make the events readable
+            r = PyRecordReader_New(f)  # read before the writer closes, so close() cannot mask a broken flush()
+            r.GetNext()  # meta data, so skip
+            for _ in range(N_TEST):  # all of the data should be flushed
+                r.GetNext()
+
+    def test_auto_close(self):
+        pass
+
+    def test_writer(self):
+        with SummaryWriter() as writer:
+            sample_rate = 44100
+
+            n_iter = 0
+            writer.add_scalar('data/scalar_systemtime', 0.1, n_iter)
+            writer.add_scalar('data/scalar_customtime', 0.2, n_iter, walltime=n_iter)
+            writer.add_scalars('data/scalar_group', {"xsinx": n_iter * np.sin(n_iter),
+                                                     "xcosx": n_iter * np.cos(n_iter),
+                                                     "arctanx": np.arctan(n_iter)}, n_iter)
+            x = np.zeros((32, 3, 64, 64))  # output from network
+            writer.add_images('Image', x, n_iter)  # Tensor
+            writer.add_image_with_boxes('imagebox',
+                                        np.zeros((3, 64, 64)),
+                                        np.array([[10, 10, 40, 40], [40, 40, 60, 60]]),
+                                        n_iter)
+            x = np.zeros(sample_rate * 2)
+
+            writer.add_audio('myAudio', x, n_iter)
+            writer.add_video('myVideo', np.random.rand(16, 48, 1, 28, 28).astype(np.float32), n_iter)
+            writer.add_text('Text', 'text logged at step:' + str(n_iter), n_iter)
+            writer.add_text('markdown Text', '''a|b\n-|-\nc|d''', n_iter)
+            writer.add_histogram('hist', np.random.rand(100, 100), n_iter)
+            writer.add_pr_curve('xoxo', np.random.randint(2, size=100), np.random.rand(
+                100), n_iter)  # needs tensorboard 0.4RC or later
+            writer.add_pr_curve_raw('prcurve with raw data', true_positive_counts,
+                                    false_positive_counts,
+                                    true_negative_counts,
+                                    false_negative_counts,
+                                    precision,
+                                    recall, n_iter)
+            # export scalar data to JSON for external processing
+            writer.export_scalars_to_json("./all_scalars.json")
+            imgs = []
+            for i in range(5):
+                imgs.append(np.ones((3, 100, 110)))
+            with SummaryWriter() as w:
+                w.add_images('img_list', imgs, dataformats='CHW')
\ No newline at end of file