Spaces: Running on Zero
Serhiy Stetskovych committed
Commit 78e32cc
0 Parent(s)
Initial code

- .gitattributes +35 -0
- .gitignore +4 -0
- README.md +10 -0
- app.py +142 -0
- configs/apollo.yaml +106 -0
- inference.py +150 -0
- look2hear/__init__.py +0 -0
- look2hear/datas/__init__.py +11 -0
- look2hear/datas/musdb_moisesdb_datamodule.py +215 -0
- look2hear/discriminators/__init__.py +47 -0
- look2hear/discriminators/frequencydis.py +81 -0
- look2hear/losses/__init__.py +14 -0
- look2hear/losses/gan_losses.py +58 -0
- look2hear/losses/matrix.py +46 -0
- look2hear/metrics/__init__.py +9 -0
- look2hear/metrics/wrapper.py +86 -0
- look2hear/models/__init__.py +49 -0
- look2hear/models/apollo.py +303 -0
- look2hear/models/base_model.py +96 -0
- look2hear/system/__init__.py +17 -0
- look2hear/system/audio_litmodule.py +245 -0
- look2hear/system/optimizers.py +113 -0
- look2hear/system/schedulers.py +129 -0
- look2hear/utils/__init__.py +53 -0
- look2hear/utils/complex_utils.py +191 -0
- look2hear/utils/get_layer_from_string.py +43 -0
- look2hear/utils/inversible_interface.py +13 -0
- look2hear/utils/lightning_utils.py +110 -0
- look2hear/utils/nets_utils.py +503 -0
- look2hear/utils/parser_utils.py +178 -0
- look2hear/utils/pylogger.py +54 -0
- look2hear/utils/separator.py +138 -0
- look2hear/utils/stft.py +797 -0
- look2hear/utils/torch_utils.py +49 -0
- requirements.txt +11 -0
- weights/apollo.bin +3 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,4 @@
*.pyc
__pycache__
.venv
.DS_Store
README.md
ADDED
@@ -0,0 +1,10 @@
---
title: Apollo
emoji: 💻
colorFrom: green
colorTo: purple
sdk: gradio
sdk_version: 5.5.0
app_file: app.py
pinned: false
---
app.py
ADDED
@@ -0,0 +1,142 @@
import os
import torchaudio
import torch
import numpy as np
import gradio as gr
import spaces  # needed for the @spaces.GPU (ZeroGPU) decorator used below
import yaml
import librosa
from tqdm import tqdm  # the bare module is not callable; import the tqdm class

import look2hear.models
from ml_collections import ConfigDict

def load_audio(file_path):
    audio, samplerate = librosa.load(file_path, mono=False, sr=44100)
    print(f'INPUT audio.shape = {audio.shape} | samplerate = {samplerate}')
    #audio = dBgain(audio, -6)
    return torch.from_numpy(audio), samplerate


def get_config(config_path):
    with open(config_path) as f:
        #config = OmegaConf.load(config_path)
        config = ConfigDict(yaml.load(f, Loader=yaml.FullLoader))
    return config


def _getWindowingArray(window_size, fade_size):
    # IMPORTANT NOTE:
    # no fades here in the end, only removing the failed ending of the chunk
    fadein = torch.linspace(1, 1, fade_size)
    fadeout = torch.linspace(0, 0, fade_size)
    window = torch.ones(window_size)
    window[-fade_size:] *= fadeout
    window[:fade_size] *= fadein
    return window


description = f'''
texts
'''


apollo_config = get_config('configs/apollo.yaml')
apollo_model = look2hear.models.BaseModel.from_pretrain('weights/apollo.bin', **apollo_config['model']).cuda()

models = [
    ('MP3 restore', apollo_model)
]

@spaces.GPU
def enchance(model, audio):
    test_data, samplerate = load_audio(audio)
    C = 10 * samplerate  # chunk_size seconds to samples
    N = 2
    step = C // N
    fade_size = 3 * 44100  # 3 seconds
    print(f"N = {N} | C = {C} | step = {step} | fade_size = {fade_size}")

    border = C - step

    # handle mono inputs correctly
    if len(test_data.shape) == 1:
        test_data = test_data.unsqueeze(0)

    # Pad the input if necessary
    if test_data.shape[1] > 2 * border and (border > 0):
        test_data = torch.nn.functional.pad(test_data, (border, border), mode='reflect')

    windowingArray = _getWindowingArray(C, fade_size)

    result = torch.zeros((1,) + tuple(test_data.shape), dtype=torch.float32)
    counter = torch.zeros((1,) + tuple(test_data.shape), dtype=torch.float32)

    i = 0
    progress_bar = tqdm(total=test_data.shape[1], desc="Processing audio chunks", leave=False)

    while i < test_data.shape[1]:
        part = test_data[:, i:i + C]
        length = part.shape[-1]
        if length < C:
            if length > C // 2 + 1:
                part = torch.nn.functional.pad(input=part, pad=(0, C - length), mode='reflect')
            else:
                part = torch.nn.functional.pad(input=part, pad=(0, C - length, 0, 0), mode='constant', value=0)

        chunk = part.unsqueeze(0).cuda()
        with torch.no_grad():
            out = model(chunk).squeeze(0).squeeze(0).cpu()

        window = windowingArray
        if i == 0:  # First audio chunk, no fadein
            window[:fade_size] = 1
        elif i + C >= test_data.shape[1]:  # Last audio chunk, no fadeout
            window[-fade_size:] = 1

        result[..., i:i+length] += out[..., :length] * window[..., :length]
        counter[..., i:i+length] += window[..., :length]

        i += step
        progress_bar.update(step)

    progress_bar.close()

    final_output = result / counter
    final_output = final_output.squeeze(0).numpy()
    np.nan_to_num(final_output, copy=False, nan=0.0)

    # Remove padding if added earlier
    if test_data.shape[1] > 2 * border and (border > 0):
        final_output = final_output[..., border:-border]

    return samplerate, final_output.T


if __name__ == "__main__":
    i = gr.Interface(
        fn=enchance,
        description=description,
        inputs=[
            gr.Dropdown(label="Model", choices=models, value=models[0]),
            gr.Audio(label="Input Audio:", interactive=True, type='filepath', max_length=300, waveform_options={'waveform_progress_color': '#3C82F6'}),
        ],
        outputs=[
            gr.Audio(
                label="Output Audio",
                autoplay=False,
                streaming=False,
                type="numpy",
            ),
        ],
        allow_flagging='never',
        cache_examples=False,
        title='Enhancer',
    )
    i.queue(max_size=20, default_concurrency_limit=4)
    i.launch(share=False, server_name="0.0.0.0")
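For orientation, here is a minimal sketch (not part of the commit, with illustrative values) of the overlap-add bookkeeping that `enchance` performs above: chunks of C samples are taken every step = C // N samples, each weighted by the fade window, and a per-sample weight sum is tracked so the final division undoes the overlap.

import torch

sr = 44100
C, N = 10 * sr, 2            # 10-second chunks, hop of half a chunk
step, fade = C // N, 3 * sr

window = torch.ones(C)
window[-fade:] = 0           # same shape as _getWindowingArray(C, fade)

total = 30 * sr              # pretend we have 30 s of mono audio
counter = torch.zeros(total)
i = 0
while i < total:
    length = min(C, total - i)
    counter[i:i + length] += window[:length]
    i += step

# counter records how many chunks contributed at each sample (here 1 or 2),
# so result / counter averages overlapping predictions without changing scale
print(counter.min().item(), counter.max().item())  # 1.0 2.0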
configs/apollo.yaml
ADDED
@@ -0,0 +1,106 @@
exp:
  dir: ./Exps
  name: Apollo

# seed: 614020

datas:
  _target_: look2hear.datas.MusdbMoisesdbDataModule
  train_dir: ./hdf5_datas
  eval_dir: ./eval
  codec_type: mp3
  codec_options:
    bitrate: random
    compression: random
    complexity: random
    vbr: random
  sr: 44100
  segments: 3
  num_stems: 8
  snr_range: [-10, 10]
  num_samples: 40000
  batch_size: 1
  num_workers: 8

model:
  sr: 44100
  win: 20 # ms
  feature_dim: 256
  layer: 6

discriminator:
  _target_: look2hear.discriminators.frequencydis.MultiFrequencyDiscriminator
  nch: 2
  window: [32, 64, 128, 256, 512, 1024, 2048]

optimizer_g:
  _target_: torch.optim.AdamW
  lr: 0.001
  weight_decay: 0.01

optimizer_d:
  _target_: torch.optim.AdamW
  lr: 0.0001
  weight_decay: 0.01
  betas: [0.5, 0.99]

scheduler_g:
  _target_: torch.optim.lr_scheduler.StepLR
  step_size: 2
  gamma: 0.98

scheduler_d:
  _target_: torch.optim.lr_scheduler.StepLR
  step_size: 2
  gamma: 0.98

loss_g:
  _target_: look2hear.losses.gan_losses.MultiFrequencyGenLoss
  eps: 1e-8

loss_d:
  _target_: look2hear.losses.gan_losses.MultiFrequencyDisLoss
  eps: 1e-8

metrics:
  _target_: look2hear.losses.MultiSrcNegSDR
  sdr_type: sisdr

system:
  _target_: look2hear.system.audio_litmodule.AudioLightningModule

early_stopping:
  _target_: pytorch_lightning.callbacks.EarlyStopping
  monitor: val_loss
  patience: 20
  mode: min
  verbose: true

checkpoint:
  _target_: pytorch_lightning.callbacks.ModelCheckpoint
  dirpath: ${exp.dir}/${exp.name}/checkpoints
  monitor: val_loss
  mode: min
  verbose: true
  save_top_k: 5
  save_last: true
  filename: '{epoch}-{val_loss:.4f}'

logger:
  _target_: pytorch_lightning.loggers.WandbLogger
  name: ${exp.name}
  save_dir: ${exp.dir}/${exp.name}/logs
  offline: false
  project: Audio-Restoration

trainer:
  _target_: pytorch_lightning.Trainer
  devices: [0,1,2,3,4,5,6,7]
  max_epochs: 500
  sync_batchnorm: true
  default_root_dir: ${exp.dir}/${exp.name}/
  accelerator: cuda
  limit_train_batches: 1.0
  fast_dev_run: false
inference.py
ADDED
@@ -0,0 +1,150 @@
import os
import torch
import librosa
import look2hear.models
import soundfile as sf
from tqdm.auto import tqdm
import argparse
import numpy as np
import yaml
from ml_collections import ConfigDict
#from omegaconf import OmegaConf

import warnings
warnings.filterwarnings("ignore")

def get_config(config_path):
    with open(config_path) as f:
        #config = OmegaConf.load(config_path)
        config = ConfigDict(yaml.load(f, Loader=yaml.FullLoader))
    return config

def load_audio(file_path):
    audio, samplerate = librosa.load(file_path, mono=False, sr=44100)
    print(f'INPUT audio.shape = {audio.shape} | samplerate = {samplerate}')
    #audio = dBgain(audio, -6)
    return torch.from_numpy(audio), samplerate

def save_audio(file_path, audio, samplerate=44100):
    #audio = dBgain(audio, +6)
    sf.write(file_path, audio.T, samplerate, subtype="PCM_16")

def process_chunk(chunk):
    chunk = chunk.unsqueeze(0).cpu()
    with torch.no_grad():
        return model(chunk).squeeze(0).squeeze(0).cpu()

def _getWindowingArray(window_size, fade_size):
    # IMPORTANT NOTE:
    # no fades here in the end, only removing the failed ending of the chunk
    fadein = torch.linspace(1, 1, fade_size)
    fadeout = torch.linspace(0, 0, fade_size)
    window = torch.ones(window_size)
    window[-fade_size:] *= fadeout
    window[:fade_size] *= fadein
    return window

def dBgain(audio, volume_gain_dB):
    gain = 10 ** (volume_gain_dB / 20)
    gained_audio = audio * gain
    return gained_audio


def main(input_wav, output_wav, ckpt_path):
    os.environ['CUDA_VISIBLE_DEVICES'] = "0"

    global model
    feature_dim = config['model']['feature_dim']
    sr = config['model']['sr']
    win = config['model']['win']
    layer = config['model']['layer']
    model = look2hear.models.BaseModel.from_pretrain(ckpt_path, sr=sr, win=win, feature_dim=feature_dim, layer=layer).cpu()

    test_data, samplerate = load_audio(input_wav)

    C = chunk_size * samplerate  # chunk_size seconds to samples
    N = overlap
    step = C // N
    fade_size = 3 * 44100  # 3 seconds
    print(f"N = {N} | C = {C} | step = {step} | fade_size = {fade_size}")

    border = C - step

    # handle mono inputs correctly
    if len(test_data.shape) == 1:
        test_data = test_data.unsqueeze(0)

    # Pad the input if necessary
    if test_data.shape[1] > 2 * border and (border > 0):
        test_data = torch.nn.functional.pad(test_data, (border, border), mode='reflect')

    windowingArray = _getWindowingArray(C, fade_size)

    result = torch.zeros((1,) + tuple(test_data.shape), dtype=torch.float32)
    counter = torch.zeros((1,) + tuple(test_data.shape), dtype=torch.float32)

    i = 0
    progress_bar = tqdm(total=test_data.shape[1], desc="Processing audio chunks", leave=False)

    while i < test_data.shape[1]:
        part = test_data[:, i:i + C]
        length = part.shape[-1]
        if length < C:
            if length > C // 2 + 1:
                part = torch.nn.functional.pad(input=part, pad=(0, C - length), mode='reflect')
            else:
                part = torch.nn.functional.pad(input=part, pad=(0, C - length, 0, 0), mode='constant', value=0)

        out = process_chunk(part)

        window = windowingArray
        if i == 0:  # First audio chunk, no fadein
            window[:fade_size] = 1
        elif i + C >= test_data.shape[1]:  # Last audio chunk, no fadeout
            window[-fade_size:] = 1

        result[..., i:i+length] += out[..., :length] * window[..., :length]
        counter[..., i:i+length] += window[..., :length]

        i += step
        progress_bar.update(step)

    progress_bar.close()

    final_output = result / counter
    final_output = final_output.squeeze(0).numpy()
    np.nan_to_num(final_output, copy=False, nan=0.0)

    # Remove padding if added earlier
    if test_data.shape[1] > 2 * border and (border > 0):
        final_output = final_output[..., border:-border]

    save_audio(output_wav, final_output, samplerate)
    print(f'Success! Output file saved as {output_wav}')

    # Memory clearing
    model.cpu()
    del model
    torch.cuda.empty_cache()

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Audio Inference Script")
    parser.add_argument("--in_wav", type=str, required=True, help="Path to input wav file")
    parser.add_argument("--out_wav", type=str, required=True, help="Path to output wav file")
    parser.add_argument("--ckpt", type=str, required=True, help="Path to model checkpoint file", default="model/pytorch_model.bin")
    parser.add_argument("--config", type=str, help="Path to model config file", default="config/apollo.yaml")
    parser.add_argument("--chunk_size", type=int, help="chunk size value in seconds", default=10)
    parser.add_argument("--overlap", type=int, help="Overlap", default=2)
    args = parser.parse_args()

    ckpt_path = args.ckpt
    chunk_size = args.chunk_size
    overlap = args.overlap
    config = get_config(args.config)
    print(config['model'])
    print(f'ckpt_path = {ckpt_path}')
    #print(f'config = {config}')
    print(f'chunk_size = {chunk_size}, overlap = {overlap}')


    main(args.in_wav, args.out_wav, ckpt_path)
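A usage sketch (not part of the commit): inference.py is driven entirely by argparse, so a typical offline run against the files added in this commit would look like

python inference.py --in_wav input.wav --out_wav restored.wav --ckpt weights/apollo.bin --config configs/apollo.yaml --chunk_size 10 --overlap 2

The input and output file names above are placeholders. Note that the parser defaults (model/pytorch_model.bin and config/apollo.yaml) do not match the paths this commit actually adds (weights/apollo.bin and configs/apollo.yaml), so --ckpt and --config should be passed explicitly.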
look2hear/__init__.py
ADDED
File without changes
look2hear/datas/__init__.py
ADDED
@@ -0,0 +1,11 @@
###
# Author: Kai Li
# Date: 2021-06-03 18:29:46
# LastEditors: Please set LastEditors
# LastEditTime: 2022-07-29 06:23:03
###
from .musdb_moisesdb_datamodule import MusdbMoisesdbDataModule

__all__ = [
    "MusdbMoisesdbDataModule"
]
look2hear/datas/musdb_moisesdb_datamodule.py
ADDED
@@ -0,0 +1,215 @@
import os
import h5py
import numpy as np
from typing import Any, Tuple
import torch
import random
from pytorch_lightning import LightningDataModule
import torchaudio
from torchaudio.functional import apply_codec
from torch.utils.data import DataLoader, Dataset
from typing import Any, Dict, Optional, Tuple

def compute_mch_rms_dB(mch_wav, fs=16000, energy_thresh=-50):
    """Return the wav RMS calculated only in the active portions"""
    mean_square = max(1e-20, torch.mean(mch_wav ** 2))
    return 10 * np.log10(mean_square)

def match2(x, d):
    assert x.dim()==2, x.shape
    assert d.dim()==2, d.shape
    minlen = min(x.shape[-1], d.shape[-1])
    x, d = x[:,0:minlen], d[:,0:minlen]
    Fx = torch.fft.rfft(x, dim=-1)
    Fd = torch.fft.rfft(d, dim=-1)
    Phi = Fd*Fx.conj()
    Phi = Phi / (Phi.abs() + 1e-3)
    Phi[:,0] = 0
    tmp = torch.fft.irfft(Phi, dim=-1)
    tau = torch.argmax(tmp.abs(),dim=-1).tolist()
    return tau

def codec_simu(wav, sr=16000, options={'bitrate':'random','compression':'random', 'complexity':'random', 'vbr':'random'}):

    if options['bitrate'] == 'random':
        options['bitrate'] = random.choice([24000, 32000, 48000, 64000, 96000, 128000])
    compression = int(options['bitrate']//1000)
    param = {'format': "mp3", "compression": compression}
    wav_encdec = apply_codec(wav, sr, **param)
    if wav_encdec.shape[-1] >= wav.shape[-1]:
        wav_encdec = wav_encdec[...,:wav.shape[-1]]
    else:
        wav_encdec = torch.cat([wav_encdec, wav[..., wav_encdec.shape[-1]:]], -1)
    tau = match2(wav, wav_encdec)
    wav_encdec = torch.roll(wav_encdec, -tau[0], -1)

    return wav_encdec

def get_wav_files(root_dir):
    wav_files = []
    for dirpath, dirnames, filenames in os.walk(root_dir):
        for filename in filenames:
            if filename.endswith('.wav'):
                if "musdb18hq" in dirpath and "mixture" not in filename:
                    wav_files.append(os.path.join(dirpath, filename))
                elif "moisesdb" in dirpath:
                    wav_files.append(os.path.join(dirpath, filename))
    return wav_files

class MusdbMoisesdbDataset(Dataset):
    def __init__(
        self,
        data_dir: str,
        codec_type: str,
        codec_options: dict,
        sr: int = 16000,
        segments: int = 10,
        num_stems: int = 4,
        snr_range: Tuple[int, int] = (-10, 10),
        num_samples: int = 1000,
    ) -> None:

        self.data_dir = data_dir
        self.codec_type = codec_type
        self.codec_options = codec_options
        self.segments = int(segments * sr)
        self.sr = sr
        self.num_stems = num_stems
        self.snr_range = snr_range
        self.num_samples = num_samples

        self.instruments = [
            "bass",
            "bowed_strings",
            "drums",
            "guitar",
            "other",
            "other_keys",
            "other_plucked",
            "percussion",
            "piano",
            "vocals",
            "wind"
        ]

    def __len__(self) -> int:
        return self.num_samples

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        if random.random() > 0.5:
            select_stems = random.randint(1, self.num_stems)
            select_stems = random.choices(self.instruments, k=select_stems)
            ori_wav = []
            for stem in select_stems:
                h5path = random.choice(os.listdir(os.path.join(self.data_dir, stem)))
                datas = h5py.File(os.path.join(self.data_dir, stem, h5path), 'r')['data']
                random_index = random.randint(0, datas.shape[0]-1)
                music_wav = torch.FloatTensor(datas[random_index])
                start = random.randint(0, music_wav.shape[-1] - self.segments)
                music_wav = music_wav[:, start:start+self.segments]

                rescale_snr = random.randint(self.snr_range[0], self.snr_range[1])
                music_wav = music_wav * np.sqrt(10**(rescale_snr/10))
                ori_wav.append(music_wav)
            ori_wav = torch.stack(ori_wav).sum(0)
        else:
            h5path = random.choice(os.listdir(os.path.join(self.data_dir, "mixture")))
            datas = h5py.File(os.path.join(self.data_dir, "mixture", h5path), 'r')['data']
            random_index = random.randint(0, datas.shape[0]-1)
            music_wav = torch.FloatTensor(datas[random_index])
            start = random.randint(0, music_wav.shape[-1] - self.segments)
            ori_wav = music_wav[:, start:start+self.segments]

        codec_wav = codec_simu(ori_wav, sr=self.sr, options=self.codec_options)

        max_scale = max(ori_wav.abs().max(), codec_wav.abs().max())

        if max_scale > 0:
            ori_wav = ori_wav / max_scale
            codec_wav = codec_wav / max_scale

        return ori_wav, codec_wav


class MusdbMoisesdbEval(Dataset):
    def __init__(
        self,
        data_dir: str
    ) -> None:
        self.data_path = os.listdir(data_dir)
        self.data_path = [os.path.join(data_dir, i) for i in self.data_path]

    def __len__(self) -> int:
        return len(self.data_path)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        ori_wav = torchaudio.load(self.data_path[idx]+"/ori_wav.wav")[0]
        codec_wav = torchaudio.load(self.data_path[idx]+"/codec_wav.wav")[0]

        return ori_wav, codec_wav, self.data_path[idx]

class MusdbMoisesdbDataModule(LightningDataModule):
    def __init__(
        self,
        train_dir: str,
        eval_dir: str,
        codec_type: str,
        codec_options: dict,
        sr: int = 16000,
        segments: int = 10,
        num_stems: int = 4,
        snr_range: Tuple[int, int] = (-10, 10),
        num_samples: int = 1000,
        batch_size: int = 32,
        num_workers: int = 4,
    ) -> None:
        super().__init__()
        self.save_hyperparameters(logger=False)

        self.data_train: Optional[Dataset] = None
        self.data_val: Optional[Dataset] = None

    def setup(self, stage: Optional[str] = None) -> None:
        """Load data. Set variables: `self.data_train`, `self.data_val`, `self.data_test`.

        This method is called by Lightning before `trainer.fit()`, `trainer.validate()`, `trainer.test()`, and
        `trainer.predict()`, so be careful not to execute things like random split twice! Also, it is called after
        `self.prepare_data()` and there is a barrier in between which ensures that all the processes proceed to
        `self.setup()` once the data is prepared and available for use.

        :param stage: The stage to setup. Either `"fit"`, `"validate"`, `"test"`, or `"predict"`. Defaults to ``None``.
        """
        # load and split datasets only if not loaded already
        if not self.data_train and not self.data_val:
            self.data_train = MusdbMoisesdbDataset(
                data_dir=self.hparams.train_dir,
                codec_type=self.hparams.codec_type,
                codec_options=self.hparams.codec_options,
                sr=self.hparams.sr,
                segments=self.hparams.segments,
                num_stems=self.hparams.num_stems,
                snr_range=self.hparams.snr_range,
                num_samples=self.hparams.num_samples,
            )

            self.data_val = MusdbMoisesdbEval(
                data_dir=self.hparams.eval_dir
            )

    def train_dataloader(self) -> DataLoader:
        return DataLoader(
            self.data_train,
            batch_size=self.hparams.batch_size,
            num_workers=self.hparams.num_workers,
            shuffle=True,
            pin_memory=True,
        )

    def val_dataloader(self) -> DataLoader:
        return DataLoader(
            self.data_val,
            batch_size=self.hparams.batch_size,
            num_workers=self.hparams.num_workers,
            shuffle=False,
            pin_memory=True,
        )
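A minimal sketch (illustrative only; it assumes the ./hdf5_datas and ./eval directories referenced in configs/apollo.yaml exist locally with the expected HDF5 stem files) of how the `datas:` block instantiates this datamodule:

from look2hear.datas import MusdbMoisesdbDataModule

# mirrors the values in configs/apollo.yaml
dm = MusdbMoisesdbDataModule(
    train_dir="./hdf5_datas",
    eval_dir="./eval",
    codec_type="mp3",
    codec_options={"bitrate": "random", "compression": "random",
                   "complexity": "random", "vbr": "random"},
    sr=44100,
    segments=3,
    num_stems=8,
    snr_range=(-10, 10),
    num_samples=40000,
    batch_size=1,
    num_workers=8,
)
dm.setup("fit")
# each training item is a (clean, codec-degraded) pair of 3-second clips
ori_wav, codec_wav = next(iter(dm.train_dataloader()))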
look2hear/discriminators/__init__.py
ADDED
@@ -0,0 +1,47 @@
###
# Author: Kai Li
# Date: 2022-02-12 15:16:35
# Email: [email protected]
# LastEditTime: 2022-10-04 16:24:53
###
from .frequencydis import MultiFrequencyDiscriminator, FrequencyDiscriminator

__all__ = [
    "MultiFrequencyDiscriminator",
    "FrequencyDiscriminator"
]


def register_model(custom_model):
    """Register a custom model, gettable with `models.get`.

    Args:
        custom_model: Custom model to register.

    """
    if (
        custom_model.__name__ in globals().keys()
        or custom_model.__name__.lower() in globals().keys()
    ):
        raise ValueError(
            f"Model {custom_model.__name__} already exists. Choose another name."
        )
    globals().update({custom_model.__name__: custom_model})


def get(identifier):
    """Returns an model class from a string (case-insensitive).

    Args:
        identifier (str): the model name.

    Returns:
        :class:`torch.nn.Module`
    """
    if isinstance(identifier, str):
        to_get = {k.lower(): v for k, v in globals().items()}
        cls = to_get.get(identifier.lower())
        if cls is None:
            raise ValueError(f"Could not interpret model name : {str(identifier)}")
        return cls
    raise ValueError(f"Could not interpret model name : {str(identifier)}")
look2hear/discriminators/frequencydis.py
ADDED
@@ -0,0 +1,81 @@
import torch
import torch.nn as nn
import numpy as np

class MultiFrequencyDiscriminator(nn.Module):
    def __init__(self, nch, window):
        super(MultiFrequencyDiscriminator, self).__init__()

        self.nch = nch
        self.window = window
        self.hidden_channels = 8
        self.eps = torch.finfo(torch.float32).eps
        self.discriminators = nn.ModuleList([FrequencyDiscriminator(2*nch, self.hidden_channels) for _ in range(len(self.window))])

    def forward(self, est, sample_rate=44100):

        B, nch, _ = est.shape
        assert nch == self.nch

        # normalize power
        est = est / (est.pow(2).sum((1,2)) + self.eps).sqrt().reshape(B, 1, 1)
        est = est.view(-1, est.shape[-1])

        est_outputs = []
        est_feature_maps = []

        for i in range(len(self.discriminators)):
            est_spec = torch.stft(est.float(), self.window[i], self.window[i]//2,
                                  window=torch.hann_window(self.window[i]).to(est.device).float(),
                                  return_complex=True)
            est_RI = torch.stack([est_spec.real, est_spec.imag], dim=1)
            est_RI = est_RI.view(B, nch*2, est_RI.shape[-2], est_RI.shape[-1]).type(est.type())

            valid_enc = int(est_RI.shape[2] * sample_rate / 44100)
            est_out, est_feat_map = self.discriminators[i](est_RI[:,:,:valid_enc].contiguous())
            est_outputs.append(est_out)
            est_feature_maps.append(est_feat_map)

        return est_outputs, est_feature_maps


class FrequencyDiscriminator(nn.Module):
    def __init__(self, in_channels, hidden_channels=512):
        super(FrequencyDiscriminator, self).__init__()

        self.eps = torch.finfo(torch.float32).eps
        self.discriminator = nn.ModuleList()
        self.discriminator += [
            nn.Sequential(
                nn.utils.spectral_norm(nn.Conv2d(in_channels, hidden_channels, kernel_size=(3, 3), padding=(1, 1), stride=(1, 1))),
                nn.LeakyReLU(0.2, True)
            ),
            nn.Sequential(
                nn.utils.spectral_norm(nn.Conv2d(hidden_channels, hidden_channels*2, kernel_size=(3, 3), padding=(1, 1), stride=(2, 2))),
                nn.LeakyReLU(0.2, True)
            ),
            nn.Sequential(
                nn.utils.spectral_norm(nn.Conv2d(hidden_channels*2, hidden_channels*4, kernel_size=(3, 3), padding=(1, 1), stride=(1, 1))),
                nn.LeakyReLU(0.2, True)
            ),
            nn.Sequential(
                nn.utils.spectral_norm(nn.Conv2d(hidden_channels*4, hidden_channels*8, kernel_size=(3, 3), padding=(1, 1), stride=(2, 2))),
                nn.LeakyReLU(0.2, True)
            ),
            nn.Sequential(
                nn.utils.spectral_norm(nn.Conv2d(hidden_channels*8, hidden_channels*16, kernel_size=(3, 3), padding=(1, 1), stride=(1, 1))),
                nn.LeakyReLU(0.2, True)
            ),
            nn.Sequential(
                nn.utils.spectral_norm(nn.Conv2d(hidden_channels*16, hidden_channels*32, kernel_size=(3, 3), padding=(1, 1), stride=(2, 2))),
                nn.LeakyReLU(0.2, True)
            ),
            nn.Conv2d(hidden_channels*32, 1, kernel_size=(3, 3), padding=(1, 1), stride=(1, 1))
        ]

    def forward(self, x):
        hiddens = []
        for layer in self.discriminator:
            x = layer(x)
            hiddens.append(x)
        return x, hiddens[:-1]
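A minimal sketch (illustrative only) of how the discriminator configured in configs/apollo.yaml (nch: 2, window: [32 ... 2048]) consumes audio: it returns one patch score map and one list of intermediate feature maps per STFT resolution.

import torch
from look2hear.discriminators import MultiFrequencyDiscriminator

disc = MultiFrequencyDiscriminator(nch=2, window=[32, 64, 128, 256, 512, 1024, 2048])
fake_audio = torch.randn(1, 2, 44100)     # (batch, channels, samples): 1 s of stereo
scores, feature_maps = disc(fake_audio)
print(len(scores), scores[0].shape)       # 7 resolutions, each score is a 2-D map over (freq, frames)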
look2hear/losses/__init__.py
ADDED
@@ -0,0 +1,14 @@
###
# Author: Kai Li
# Date: 2021-06-09 16:34:19
# LastEditors: Kai Li
# LastEditTime: 2021-07-12 20:55:35
###
from .gan_losses import MultiFrequencyDisLoss, MultiFrequencyGenLoss
from .matrix import MultiSrcNegSDR

__all__ = [
    "MultiFrequencyDisLoss",
    "MultiFrequencyGenLoss",
    "MultiSrcNegSDR"
]
look2hear/losses/gan_losses.py
ADDED
@@ -0,0 +1,58 @@
###
# Author: Kai Li
# Date: 2021-06-09 16:43:09
# LastEditors: Please set LastEditors
# LastEditTime: 2024-01-24 00:00:52
###

import torch
from torch.nn.modules.loss import _Loss

def freq_MAE(output, target):
    loss = 0.
    eps = torch.finfo(torch.float32).eps
    all_win = [32, 64, 128, 256, 512, 1024, 2048]
    for win in all_win:
        est_spec = torch.stft(output.view(-1, output.shape[-1]), n_fft=win, hop_length=win//2,
                              window=torch.hann_window(win).to(output.device).float(),
                              return_complex=True)
        target_spec = torch.stft(target.view(-1, target.shape[-1]), n_fft=win, hop_length=win//2,
                                 window=torch.hann_window(win).to(target.device).float(),
                                 return_complex=True)

        loss = loss + (est_spec.abs() - target_spec.abs()).abs().mean() / (target_spec.abs().mean() + eps)

    return loss / len(all_win)

class MultiFrequencyDisLoss(_Loss):
    def __init__(self, eps=1e-8):
        super(MultiFrequencyDisLoss, self).__init__()

    def forward(self, target_outputs, est_outputs):
        D_real = 0
        D_fake = 0
        for i in range(len(target_outputs)):
            D_real = D_real + (target_outputs[i] - 1).pow(2).mean() / len(target_outputs)
            D_fake = D_fake + (est_outputs[i]).pow(2).mean() / len(est_outputs)
        return D_real + D_fake

class MultiFrequencyGenLoss(_Loss):
    def __init__(self, eps=1e-8):
        super(MultiFrequencyGenLoss, self).__init__()
        self.eps = eps

    def forward(self, est_outputs, est_feature_maps, targets_feature_maps, output, ori_data):
        G_fake = 0
        feature_matching = 0
        eps = self.eps

        for i in range(len(est_outputs)):
            G_fake = G_fake + (est_outputs[i] - 1).pow(2).mean() / len(est_outputs)
            for j in range(len(est_feature_maps[i])):
                feature_matching = feature_matching + (est_feature_maps[i][j] - targets_feature_maps[i][j].detach()).abs().mean() / (targets_feature_maps[i][j].detach().abs().mean() + eps)

        feature_matching = feature_matching / (len(est_outputs) * len(est_feature_maps[0]))
        freq_loss = freq_MAE(output, ori_data.unsqueeze(1))
        total_loss = freq_loss + G_fake + feature_matching

        return total_loss
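A minimal sketch (illustrative only) of the multi-resolution magnitude term used inside MultiFrequencyGenLoss: freq_MAE averages a relative STFT-magnitude error over the seven window sizes, so identical signals give a loss of zero.

import torch
from look2hear.losses.gan_losses import freq_MAE

target = torch.randn(1, 2, 44100)                        # a dummy stereo clip
print(freq_MAE(target, target).item())                   # 0.0 for identical signals
print(freq_MAE(torch.randn_like(target), target).item()) # > 0 for unrelated signals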
look2hear/losses/matrix.py
ADDED
@@ -0,0 +1,46 @@
import torch
from torch.nn.modules.loss import _Loss

class MultiSrcNegSDR(_Loss):
    def __init__(self, sdr_type, zero_mean=True, take_log=True, EPS=1e-8):
        super().__init__()

        assert sdr_type in ["snr", "sisdr", "sdsdr"]
        self.sdr_type = sdr_type
        self.zero_mean = zero_mean
        self.take_log = take_log
        self.EPS = 1e-8

    def forward(self, ests, targets):
        if targets.size() != ests.size() or targets.ndim != 3:
            raise TypeError(
                f"Inputs must be of shape [batch, n_src, time], got {targets.size()} and {ests.size()} instead"
            )
        # Step 1. Zero-mean norm
        if self.zero_mean:
            mean_source = torch.mean(targets, dim=2, keepdim=True)
            mean_est = torch.mean(ests, dim=2, keepdim=True)
            targets = targets - mean_source
            ests = ests - mean_est
        # Step 2. Pair-wise SI-SDR.
        if self.sdr_type in ["sisdr", "sdsdr"]:
            # [batch, n_src]
            pair_wise_dot = torch.sum(ests * targets, dim=2, keepdim=True)
            # [batch, n_src]
            s_target_energy = torch.sum(targets ** 2, dim=2, keepdim=True) + self.EPS
            # [batch, n_src, time]
            scaled_targets = pair_wise_dot * targets / s_target_energy
        else:
            # [batch, n_src, time]
            scaled_targets = targets
        if self.sdr_type in ["sdsdr", "snr"]:
            e_noise = ests - targets
        else:
            e_noise = ests - scaled_targets
        # [batch, n_src]
        pair_wise_sdr = torch.sum(scaled_targets ** 2, dim=2) / (
            torch.sum(e_noise ** 2, dim=2) + self.EPS
        )
        if self.take_log:
            pair_wise_sdr = 10 * torch.log10(pair_wise_sdr + self.EPS)
        return -torch.mean(pair_wise_sdr, dim=-1).mean(0)
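A minimal sketch (illustrative only) of this SI-SDR wrapper, which the config wires in under `metrics:`; inputs must be shaped [batch, n_src, time] and the return value is the negative SI-SDR in dB, so a near-perfect estimate yields a strongly negative number.

import torch
from look2hear.losses import MultiSrcNegSDR

loss_fn = MultiSrcNegSDR(sdr_type="sisdr")
targets = torch.randn(4, 2, 44100)                 # [batch, n_src, time]
ests = targets + 0.01 * torch.randn_like(targets)  # lightly perturbed estimate
print(loss_fn(ests, targets))                      # large negative value = high SI-SDR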
look2hear/metrics/__init__.py
ADDED
@@ -0,0 +1,9 @@
###
# Author: Kai Li
# Date: 2021-06-22 12:22:41
# LastEditors: Kai Li
# LastEditTime: 2021-07-14 19:15:22
###
from .wrapper import MetricsTracker

__all__ = ["MetricsTracker"]
look2hear/metrics/wrapper.py
ADDED
@@ -0,0 +1,86 @@
###
# Author: Kai Li
# Date: 2021-06-22 12:41:36
# LastEditors: Please set LastEditors
# LastEditTime: 2022-06-05 14:48:00
###
import csv
from sympy import im
import torch
import numpy as np
import logging
import os
import librosa
from torch_mir_eval.separation import bss_eval_sources
import fast_bss_eval
from visqol import visqol_lib_py
from visqol.pb2 import visqol_config_pb2
from visqol.pb2 import similarity_result_pb2

logger = logging.getLogger(__name__)

def is_silent(wav, threshold=1e-4):
    return torch.sum(wav ** 2) / wav.numel() < threshold

class MetricsTracker:
    def __init__(self, save_file: str = ""):
        self.all_sdrs = []
        self.all_sisnrs = []
        self.all_visqols = []

        csv_columns = ["snt_id", "sdr", "si-snr", "visqol"]
        self.visqol_config = visqol_config_pb2.VisqolConfig()
        self.visqol_config.audio.sample_rate = 48000
        self.visqol_config.options.use_speech_scoring = False
        svr_model_path = "libsvm_nu_svr_model.txt"
        self.visqol_config.options.svr_model_path = os.path.join(os.path.dirname(visqol_lib_py.__file__), "model", svr_model_path)
        self.visqol_api = visqol_lib_py.VisqolApi()
        self.visqol_api.Create(self.visqol_config)

        self.results_csv = open(save_file, "w")
        self.writer = csv.DictWriter(self.results_csv, fieldnames=csv_columns)
        self.writer.writeheader()

    def __call__(self, clean, estimate, key):
        sisnr = fast_bss_eval.si_sdr(clean.unsqueeze(0), estimate.unsqueeze(0), zero_mean=True).mean()
        sdr = fast_bss_eval.sdr(clean.unsqueeze(0), estimate.unsqueeze(0), zero_mean=True).mean()

        clean = librosa.resample(clean.squeeze(0).mean(0).cpu().numpy(), orig_sr=44100, target_sr=48000).astype(np.float64)
        estimate = librosa.resample(estimate.squeeze(0).mean(0).cpu().numpy(), orig_sr=44100, target_sr=48000).astype(np.float64)

        visqol = self.visqol_api.Measure(clean, estimate).moslqo
        # import pdb; pdb.set_trace()
        row = {
            "snt_id": key,
            "sdr": sdr.item(),
            "si-snr": sisnr.item(),
            "visqol": visqol
        }

        self.writer.writerow(row)
        # Metric Accumulation
        self.all_sdrs.append(sdr.item())
        self.all_sisnrs.append(sisnr.item())
        self.all_visqols.append(visqol)

    def update(self, ):
        return {"sdr": np.array(self.all_sdrs).mean(),
                "si-snr": np.array(self.all_sisnrs).mean(),
                "visqol": np.array(self.all_visqols).mean()}

    def final(self,):
        row = {
            "snt_id": "avg",
            "sdr": np.array(self.all_sdrs).mean(),
            "si-snr": np.array(self.all_sisnrs).mean(),
            "visqol": np.array(self.all_visqols).mean()
        }
        self.writer.writerow(row)
        row = {
            "snt_id": "std",
            "sdr": np.array(self.all_sdrs).std(),
            "si-snr": np.array(self.all_sisnrs).std(),
            "visqol": np.array(self.all_visqols).std()
        }
        self.writer.writerow(row)
        self.results_csv.close()
look2hear/models/__init__.py
ADDED
@@ -0,0 +1,49 @@
###
# Author: Kai Li
# Date: 2022-02-12 15:16:35
# Email: [email protected]
# LastEditTime: 2022-10-04 16:24:53
###
from .base_model import BaseModel
from .apollo import Apollo

__all__ = [
    "BaseModel",
    "GullFullband",
    "Apollo"
]


def register_model(custom_model):
    """Register a custom model, gettable with `models.get`.

    Args:
        custom_model: Custom model to register.

    """
    if (
        custom_model.__name__ in globals().keys()
        or custom_model.__name__.lower() in globals().keys()
    ):
        raise ValueError(
            f"Model {custom_model.__name__} already exists. Choose another name."
        )
    globals().update({custom_model.__name__: custom_model})


def get(identifier):
    """Returns an model class from a string (case-insensitive).

    Args:
        identifier (str): the model name.

    Returns:
        :class:`torch.nn.Module`
    """
    if isinstance(identifier, str):
        to_get = {k.lower(): v for k, v in globals().items()}
        cls = to_get.get(identifier.lower())
        if cls is None:
            raise ValueError(f"Could not interpret model name : {str(identifier)}")
        return cls
    raise ValueError(f"Could not interpret model name : {str(identifier)}")
look2hear/models/apollo.py
ADDED
@@ -0,0 +1,303 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
import torch.nn.functional as F
|
4 |
+
import numpy as np
|
5 |
+
from .base_model import BaseModel
|
6 |
+
|
7 |
+
class RMSNorm(nn.Module):
|
8 |
+
def __init__(self, dimension, groups=1):
|
9 |
+
super().__init__()
|
10 |
+
|
11 |
+
self.weight = nn.Parameter(torch.ones(dimension))
|
12 |
+
self.groups = groups
|
13 |
+
self.eps = 1e-5
|
14 |
+
|
15 |
+
def forward(self, input):
|
16 |
+
# input size: (B, N, T)
|
17 |
+
B, N, T = input.shape
|
18 |
+
assert N % self.groups == 0
|
19 |
+
|
20 |
+
input_float = input.reshape(B, self.groups, -1, T).float()
|
21 |
+
input_norm = input_float * torch.rsqrt(input_float.pow(2).mean(-2, keepdim=True) + self.eps)
|
22 |
+
|
23 |
+
return input_norm.type_as(input).reshape(B, N, T) * self.weight.reshape(1, -1, 1)
|
24 |
+
|
25 |
+
class RMVN(nn.Module):
|
26 |
+
"""
|
27 |
+
Rescaled MVN.
|
28 |
+
"""
|
29 |
+
def __init__(self, dimension, groups=1):
|
30 |
+
super(RMVN, self).__init__()
|
31 |
+
|
32 |
+
self.mean = nn.Parameter(torch.zeros(dimension))
|
33 |
+
self.std = nn.Parameter(torch.ones(dimension))
|
34 |
+
self.groups = groups
|
35 |
+
self.eps = 1e-5
|
36 |
+
|
37 |
+
def forward(self, input):
|
38 |
+
# input size: (B, N, *)
|
39 |
+
B, N = input.shape[:2]
|
40 |
+
assert N % self.groups == 0
|
41 |
+
input_reshape = input.reshape(B, self.groups, N // self.groups, -1)
|
42 |
+
T = input_reshape.shape[-1]
|
43 |
+
|
44 |
+
input_norm = (input_reshape - input_reshape.mean(2).unsqueeze(2)) / (input_reshape.var(2).unsqueeze(2) + self.eps).sqrt()
|
45 |
+
input_norm = input_norm.reshape(B, N, T) * self.std.reshape(1, -1, 1) + self.mean.reshape(1, -1, 1)
|
46 |
+
|
47 |
+
return input_norm.reshape(input.shape)
|
48 |
+
|
49 |
+
class Roformer(nn.Module):
|
50 |
+
"""
|
51 |
+
Transformer with rotary positional embedding.
|
52 |
+
"""
|
53 |
+
def __init__(self, input_size, hidden_size, num_head=8, theta=10000, window=10000,
|
54 |
+
input_drop=0., attention_drop=0., causal=True):
|
55 |
+
super().__init__()
|
56 |
+
|
57 |
+
self.input_size = input_size
|
58 |
+
self.hidden_size = hidden_size // num_head
|
59 |
+
self.num_head = num_head
|
60 |
+
self.theta = theta # base frequency for RoPE
|
61 |
+
self.window = window
|
62 |
+
# pre-calculate rotary embeddings
|
63 |
+
cos_freq, sin_freq = self._calc_rotary_emb()
|
64 |
+
self.register_buffer("cos_freq", cos_freq) # win, N
|
65 |
+
self.register_buffer("sin_freq", sin_freq) # win, N
|
66 |
+
|
67 |
+
self.attention_drop = attention_drop
|
68 |
+
self.causal = causal
|
69 |
+
self.eps = 1e-5
|
70 |
+
|
71 |
+
self.input_norm = RMSNorm(self.input_size)
|
72 |
+
self.input_drop = nn.Dropout(p=input_drop)
|
73 |
+
self.weight = nn.Conv1d(self.input_size, self.hidden_size*self.num_head*3, 1, bias=False)
|
74 |
+
self.output = nn.Conv1d(self.hidden_size*self.num_head, self.input_size, 1, bias=False)
|
75 |
+
|
76 |
+
self.MLP = nn.Sequential(RMSNorm(self.input_size),
|
77 |
+
nn.Conv1d(self.input_size, self.input_size*8, 1, bias=False),
|
78 |
+
nn.SiLU()
|
79 |
+
)
|
80 |
+
self.MLP_output = nn.Conv1d(self.input_size*4, self.input_size, 1, bias=False)
|
81 |
+
|
82 |
+
def _calc_rotary_emb(self):
|
83 |
+
freq = 1. / (self.theta ** (torch.arange(0, self.hidden_size, 2)[:(self.hidden_size // 2)] / self.hidden_size)) # theta_i
|
84 |
+
freq = freq.reshape(1, -1) # 1, N//2
|
85 |
+
pos = torch.arange(0, self.window).reshape(-1, 1) # win, 1
|
86 |
+
cos_freq = torch.cos(pos*freq) # win, N//2
|
87 |
+
sin_freq = torch.sin(pos*freq) # win, N//2
|
88 |
+
cos_freq = torch.stack([cos_freq]*2, -1).reshape(self.window, self.hidden_size) # win, N
|
89 |
+
sin_freq = torch.stack([sin_freq]*2, -1).reshape(self.window, self.hidden_size) # win, N
|
90 |
+
|
91 |
+
return cos_freq, sin_freq
|
92 |
+
|
93 |
+
def _add_rotary_emb(self, feature, pos):
|
94 |
+
# feature shape: ..., N
|
95 |
+
N = feature.shape[-1]
|
96 |
+
|
97 |
+
feature_reshape = feature.reshape(-1, N)
|
98 |
+
pos = min(pos, self.window-1)
|
99 |
+
cos_freq = self.cos_freq[pos]
|
100 |
+
sin_freq = self.sin_freq[pos]
|
101 |
+
reverse_sign = torch.from_numpy(np.asarray([-1, 1])).to(feature.device).type(feature.dtype)
|
102 |
+
feature_reshape_neg = (torch.flip(feature_reshape.reshape(-1, N//2, 2), [-1]) * reverse_sign.reshape(1, 1, 2)).reshape(-1, N)
|
103 |
+
feature_rope = feature_reshape * cos_freq.unsqueeze(0) + feature_reshape_neg * sin_freq.unsqueeze(0)
|
104 |
+
|
105 |
+
return feature_rope.reshape(feature.shape)
|
106 |
+
|
107 |
+
def _add_rotary_sequence(self, feature):
|
108 |
+
# feature shape: ..., T, N
|
109 |
+
T, N = feature.shape[-2:]
|
110 |
+
feature_reshape = feature.reshape(-1, T, N)
|
111 |
+
|
112 |
+
cos_freq = self.cos_freq[:T]
|
113 |
+
sin_freq = self.sin_freq[:T]
|
114 |
+
reverse_sign = torch.from_numpy(np.asarray([-1, 1])).to(feature.device).type(feature.dtype)
|
115 |
+
feature_reshape_neg = (torch.flip(feature_reshape.reshape(-1, N//2, 2), [-1]) * reverse_sign.reshape(1, 1, 2)).reshape(-1, T, N)
|
116 |
+
feature_rope = feature_reshape * cos_freq.unsqueeze(0) + feature_reshape_neg * sin_freq.unsqueeze(0)
|
117 |
+
|
118 |
+
return feature_rope.reshape(feature.shape)
|
119 |
+
|
120 |
+
def forward(self, input):
|
121 |
+
# input shape: B, N, T
|
122 |
+
|
123 |
+
B, _, T = input.shape
|
124 |
+
|
125 |
+
weight = self.weight(self.input_drop(self.input_norm(input))).reshape(B, self.num_head, self.hidden_size*3, T).mT
|
126 |
+
Q, K, V = torch.split(weight, self.hidden_size, dim=-1) # B, num_head, T, N
|
127 |
+
|
128 |
+
# rotary positional embedding
|
129 |
+
Q_rot = self._add_rotary_sequence(Q)
|
130 |
+
K_rot = self._add_rotary_sequence(K)
|
131 |
+
|
132 |
+
attention_output = F.scaled_dot_product_attention(Q_rot.contiguous(), K_rot.contiguous(), V.contiguous(), dropout_p=self.attention_drop, is_causal=self.causal) # B, num_head, T, N
|
133 |
+
attention_output = attention_output.mT.reshape(B, -1, T)
|
134 |
+
output = self.output(attention_output) + input
|
135 |
+
|
136 |
+
gate, z = self.MLP(output).chunk(2, dim=1)
|
137 |
+
output = output + self.MLP_output(F.silu(gate) * z)
|
138 |
+
|
139 |
+
return output, (K_rot, V)
|
140 |
+
|
141 |
+
class ConvActNorm1d(nn.Module):
|
142 |
+
def __init__(self, in_channel, hidden_channel, kernel=7, causal=False):
|
143 |
+
super(ConvActNorm1d, self).__init__()
|
144 |
+
|
145 |
+
self.in_channel = in_channel
|
146 |
+
self.kernel = kernel
|
147 |
+
self.causal = causal
|
148 |
+
if not causal:
|
149 |
+
self.conv = nn.Sequential(nn.Conv1d(in_channel, in_channel, kernel, padding=(kernel-1)//2, groups=in_channel),
|
150 |
+
RMSNorm(in_channel),
|
151 |
+
nn.Conv1d(in_channel, hidden_channel, 1),
|
152 |
+
nn.SiLU(),
|
153 |
+
nn.Conv1d(hidden_channel, in_channel, 1)
|
154 |
+
)
|
155 |
+
else:
|
156 |
+
self.conv = nn.Sequential(nn.Conv1d(in_channel, in_channel, kernel, padding=kernel-1, groups=in_channel),
|
157 |
+
RMSNorm(in_channel),
|
158 |
+
nn.Conv1d(in_channel, hidden_channel, 1),
|
159 |
+
nn.SiLU(),
|
160 |
+
nn.Conv1d(hidden_channel, in_channel, 1)
|
161 |
+
)
|
162 |
+
|
163 |
+
def forward(self, input):
|
164 |
+
|
165 |
+
output = self.conv(input)
|
166 |
+
if self.causal:
|
167 |
+
output = output[...,:-self.kernel+1]
|
168 |
+
return input + output
|
169 |
+
|
170 |
+
class ICB(nn.Module):
|
171 |
+
def __init__(self, in_channel, kernel=7, causal=False):
|
172 |
+
super(ICB, self).__init__()
|
173 |
+
|
174 |
+
self.blocks = nn.Sequential(ConvActNorm1d(in_channel, in_channel*4, kernel, causal=causal),
|
175 |
+
ConvActNorm1d(in_channel, in_channel*4, kernel, causal=causal),
|
176 |
+
ConvActNorm1d(in_channel, in_channel*4, kernel, causal=causal)
|
177 |
+
)
|
178 |
+
|
179 |
+
def forward(self, input):
|
180 |
+
|
181 |
+
return self.blocks(input)
|
182 |
+
|
183 |
+
class BSNet(nn.Module):
|
184 |
+
def __init__(self, feature_dim, kernel=7):
|
185 |
+
super(BSNet, self).__init__()
|
186 |
+
|
187 |
+
self.feature_dim = feature_dim
|
188 |
+
|
189 |
+
self.band_net = Roformer(self.feature_dim, self.feature_dim, num_head=8, window=100, causal=False)
|
190 |
+
self.seq_net = ICB(self.feature_dim, kernel=kernel)
|
191 |
+
|
192 |
+
def forward(self, input):
|
193 |
+
# input shape: B, nband, N, T
|
194 |
+
|
195 |
+
B, nband, N, T = input.shape
|
196 |
+
|
197 |
+
# band comm
|
198 |
+
band_input = input.permute(0,3,2,1).reshape(B*T, -1, nband)
|
199 |
+
band_output, _ = self.band_net(band_input)
|
200 |
+
band_output = band_output.reshape(B, T, -1, nband).permute(0,3,2,1)
|
201 |
+
|
202 |
+
# sequence modeling
|
203 |
+
output = self.seq_net(band_output.reshape(B*nband, -1, T)).reshape(B, nband, -1, T) # B, nband, N, T
|
204 |
+
|
205 |
+
return output
|
206 |
+
|
207 |
+
class Apollo(BaseModel):
|
208 |
+
def __init__(
|
209 |
+
self,
|
210 |
+
sr: int,
|
211 |
+
win: int,
|
212 |
+
feature_dim: int,
|
213 |
+
layer: int
|
214 |
+
):
|
215 |
+
super().__init__(sample_rate=sr)
|
216 |
+
|
217 |
+
self.sr = sr
|
218 |
+
self.win = int(sr * win // 1000)
|
219 |
+
self.stride = self.win // 2
|
220 |
+
self.enc_dim = self.win // 2 + 1
|
221 |
+
self.feature_dim = feature_dim
|
222 |
+
self.eps = torch.finfo(torch.float32).eps
|
223 |
+
|
224 |
+
# 80 bands
|
225 |
+
bandwidth = int(self.win / 160)
|
226 |
+
self.band_width = [bandwidth]*79
|
227 |
+
self.band_width.append(self.enc_dim - np.sum(self.band_width))
|
228 |
+
self.nband = len(self.band_width)
|
229 |
+
print(self.band_width, self.nband)
|
230 |
+
|
231 |
+
self.BN = nn.ModuleList([])
|
232 |
+
for i in range(self.nband):
|
233 |
+
self.BN.append(nn.Sequential(RMSNorm(self.band_width[i]*2+1),
|
234 |
+
nn.Conv1d(self.band_width[i]*2+1, self.feature_dim, 1))
|
235 |
+
)
|
236 |
+
|
237 |
+
self.net = []
|
238 |
+
for _ in range(layer):
|
239 |
+
self.net.append(BSNet(self.feature_dim))
|
240 |
+
self.net = nn.Sequential(*self.net)
|
241 |
+
|
242 |
+
self.output = nn.ModuleList([])
|
243 |
+
for i in range(self.nband):
|
244 |
+
self.output.append(nn.Sequential(RMSNorm(self.feature_dim),
|
245 |
+
nn.Conv1d(self.feature_dim, self.band_width[i]*4, 1),
|
246 |
+
nn.GLU(dim=1)
|
247 |
+
)
|
248 |
+
)
|
249 |
+
|
250 |
+
def spec_band_split(self, input):
|
251 |
+
|
252 |
+
B, nch, nsample = input.shape
|
253 |
+
|
254 |
+
spec = torch.stft(input.view(B*nch, nsample), n_fft=self.win, hop_length=self.stride,
|
255 |
+
window=torch.hann_window(self.win).to(input.device), return_complex=True)
|
256 |
+
|
257 |
+
subband_spec = []
|
258 |
+
subband_spec_norm = []
|
259 |
+
subband_power = []
|
260 |
+
band_idx = 0
|
261 |
+
for i in range(self.nband):
|
262 |
+
this_spec = spec[:,band_idx:band_idx+self.band_width[i]]
|
263 |
+
subband_spec.append(this_spec) # B, BW, T
|
264 |
+
subband_power.append((this_spec.abs().pow(2).sum(1) + self.eps).sqrt().unsqueeze(1)) # B, 1, T
|
265 |
+
subband_spec_norm.append(torch.complex(this_spec.real / subband_power[-1], this_spec.imag / subband_power[-1])) # B, BW, T
|
266 |
+
band_idx += self.band_width[i]
|
267 |
+
subband_power = torch.cat(subband_power, 1) # B, nband, T
|
268 |
+
|
269 |
+
return subband_spec_norm, subband_power
|
270 |
+
|
271 |
+
def feature_extractor(self, input):
|
272 |
+
|
273 |
+
subband_spec_norm, subband_power = self.spec_band_split(input)
|
274 |
+
|
275 |
+
# normalization and bottleneck
|
276 |
+
subband_feature = []
|
277 |
+
for i in range(self.nband):
|
278 |
+
concat_spec = torch.cat([subband_spec_norm[i].real, subband_spec_norm[i].imag, torch.log(subband_power[:,i].unsqueeze(1))], 1)
|
279 |
+
subband_feature.append(self.BN[i](concat_spec))
|
280 |
+
subband_feature = torch.stack(subband_feature, 1) # B, nband, N, T
|
281 |
+
|
282 |
+
return subband_feature
|
283 |
+
|
284 |
+
def forward(self, input):
|
285 |
+
|
286 |
+
B, nch, nsample = input.shape
|
287 |
+
|
288 |
+
subband_feature = self.feature_extractor(input)
|
289 |
+
feature = self.net(subband_feature)
|
290 |
+
|
291 |
+
est_spec = []
|
292 |
+
for i in range(self.nband):
|
293 |
+
this_RI = self.output[i](feature[:,i]).view(B*nch, 2, self.band_width[i], -1)
|
294 |
+
est_spec.append(torch.complex(this_RI[:,0], this_RI[:,1]))
|
295 |
+
est_spec = torch.cat(est_spec, 1)
|
296 |
+
output = torch.istft(est_spec, n_fft=self.win, hop_length=self.stride,
|
297 |
+
window=torch.hann_window(self.win).to(input.device), length=nsample).view(B, nch, -1)
|
298 |
+
|
299 |
+
return output
|
300 |
+
|
301 |
+
def get_model_args(self):
|
302 |
+
model_args = {"n_sample_rate": 2}
|
303 |
+
return model_args
|
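Usage note: the snippet below is a minimal inference sketch for the Apollo class defined above. The hyperparameters (sr, win in milliseconds, feature_dim, layer) are illustrative assumptions; the values actually used by this Space come from configs/apollo.yaml and the pretrained weights in weights/apollo.bin.

import torch
from look2hear.models.apollo import Apollo

# Assumed hyperparameters, for illustration only.
model = Apollo(sr=44100, win=20, feature_dim=256, layer=6).eval()

wav = torch.randn(1, 2, 44100)      # batch, channels, samples (1 s of stereo audio)
with torch.no_grad():
    restored = model(wav)           # STFT -> band split -> BSNet stack -> iSTFT
print(restored.shape)               # torch.Size([1, 2, 44100]), same shape as the input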
look2hear/models/base_model.py
ADDED
@@ -0,0 +1,96 @@
1 |
+
###
|
2 |
+
# Author: Kai Li
|
3 |
+
# Date: 2021-06-17 23:08:32
|
4 |
+
# LastEditors: Please set LastEditors
|
5 |
+
# LastEditTime: 2022-05-26 18:06:22
|
6 |
+
###
|
7 |
+
import torch
|
8 |
+
import torch.nn as nn
|
9 |
+
|
10 |
+
from huggingface_hub import PyTorchModelHubMixin
|
11 |
+
|
12 |
+
|
13 |
+
def _unsqueeze_to_3d(x):
|
14 |
+
"""Normalize shape of `x` to [batch, n_chan, time]."""
|
15 |
+
if x.ndim == 1:
|
16 |
+
return x.reshape(1, 1, -1)
|
17 |
+
elif x.ndim == 2:
|
18 |
+
return x.unsqueeze(1)
|
19 |
+
else:
|
20 |
+
return x
|
21 |
+
|
22 |
+
|
23 |
+
def pad_to_appropriate_length(x, lcm):
|
24 |
+
values_to_pad = int(x.shape[-1]) % lcm
|
25 |
+
if values_to_pad:
|
26 |
+
appropriate_shape = x.shape
|
27 |
+
padded_x = torch.zeros(
|
28 |
+
list(appropriate_shape[:-1])
|
29 |
+
+ [appropriate_shape[-1] + lcm - values_to_pad],
|
30 |
+
dtype=torch.float32,
|
31 |
+
).to(x.device)
|
32 |
+
padded_x[..., : x.shape[-1]] = x
|
33 |
+
return padded_x
|
34 |
+
return x
|
35 |
+
|
36 |
+
|
37 |
+
class BaseModel(nn.Module, PyTorchModelHubMixin, repo_url="https://github.com/JusperLee/Apollo", pipeline_tag="audio-to-audio"):
|
38 |
+
def __init__(self, sample_rate, in_chan=1):
|
39 |
+
super().__init__()
|
40 |
+
self._sample_rate = sample_rate
|
41 |
+
self._in_chan = in_chan
|
42 |
+
|
43 |
+
def forward(self, *args, **kwargs):
|
44 |
+
raise NotImplementedError
|
45 |
+
|
46 |
+
def sample_rate(self,):
|
47 |
+
return self._sample_rate
|
48 |
+
|
49 |
+
@staticmethod
|
50 |
+
def load_state_dict_in_audio(model, pretrained_dict):
|
51 |
+
model_dict = model.state_dict()
|
52 |
+
update_dict = {}
|
53 |
+
for k, v in pretrained_dict.items():
|
54 |
+
if "audio_model" in k:
|
55 |
+
update_dict[k[12:]] = v
|
56 |
+
model_dict.update(update_dict)
|
57 |
+
model.load_state_dict(model_dict)
|
58 |
+
return model
|
59 |
+
|
60 |
+
@staticmethod
|
61 |
+
def from_pretrain(pretrained_model_conf_or_path, *args, **kwargs):
|
62 |
+
from . import get
|
63 |
+
|
64 |
+
conf = torch.load(
|
65 |
+
pretrained_model_conf_or_path, map_location="cpu"
|
66 |
+
) # Attempt to find the model and instantiate it.
|
67 |
+
|
68 |
+
model_class = get(conf["model_name"])
|
69 |
+
# model_class = get("Conv_TasNet")
|
70 |
+
model = model_class(*args, **kwargs)
|
71 |
+
model.load_state_dict(conf["state_dict"])
|
72 |
+
return model
|
73 |
+
|
74 |
+
def serialize(self):
|
75 |
+
import pytorch_lightning as pl # Not used in torch.hub
|
76 |
+
|
77 |
+
model_conf = dict(
|
78 |
+
model_name=self.__class__.__name__,
|
79 |
+
state_dict=self.get_state_dict(),
|
80 |
+
model_args=self.get_model_args(),
|
81 |
+
)
|
82 |
+
# Additional infos
|
83 |
+
infos = dict()
|
84 |
+
infos["software_versions"] = dict(
|
85 |
+
torch_version=torch.__version__, pytorch_lightning_version=pl.__version__,
|
86 |
+
)
|
87 |
+
model_conf["infos"] = infos
|
88 |
+
return model_conf
|
89 |
+
|
90 |
+
def get_state_dict(self):
|
91 |
+
"""In case the state dict needs to be modified before sharing the model."""
|
92 |
+
return self.state_dict()
|
93 |
+
|
94 |
+
def get_model_args(self):
|
95 |
+
"""Should return args to re-instantiate the class."""
|
96 |
+
raise NotImplementedError
|
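The serialize() / from_pretrain() pair above is meant to round-trip a model through a single checkpoint dict. A hedged sketch follows; the concrete subclass, hyperparameters, and file name are assumptions (the Space itself loads weights/apollo.bin).

import torch
from look2hear.models.apollo import Apollo

kwargs = dict(sr=44100, win=20, feature_dim=256, layer=6)   # assumed hyperparameters
model = Apollo(**kwargs)

# serialize() packs model_name, the state dict and get_model_args() into one dict.
torch.save(model.serialize(), "apollo_demo.bin")

# from_pretrain() looks the class up by model_name and restores the weights;
# constructor arguments still have to be passed explicitly.
restored = Apollo.from_pretrain("apollo_demo.bin", **kwargs)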
look2hear/system/__init__.py
ADDED
@@ -0,0 +1,17 @@
1 |
+
###
|
2 |
+
# Author: Kai Li
|
3 |
+
# Date: 2021-06-20 17:52:35
|
4 |
+
# LastEditors: Please set LastEditors
|
5 |
+
# LastEditTime: 2022-05-26 18:27:43
|
6 |
+
###
|
7 |
+
|
8 |
+
|
9 |
+
from .optimizers import make_optimizer
|
10 |
+
from .audio_litmodule import AudioLightningModule
|
11 |
+
from .schedulers import DPTNetScheduler
|
12 |
+
|
13 |
+
__all__ = [
|
14 |
+
"make_optimizer",
|
15 |
+
"AudioLightningModule",
|
16 |
+
"DPTNetScheduler"
|
17 |
+
]
|
look2hear/system/audio_litmodule.py
ADDED
@@ -0,0 +1,245 @@
1 |
+
###
|
2 |
+
# Author: Kai Li
|
3 |
+
# Date: 2022-05-26 18:09:54
|
4 |
+
# Email: [email protected]
|
5 |
+
# LastEditTime: 2024-01-24 00:00:28
|
6 |
+
###
|
7 |
+
import gc
|
8 |
+
from omegaconf import OmegaConf
|
9 |
+
import torch
|
10 |
+
import pytorch_lightning as pl
|
11 |
+
from torch.optim.lr_scheduler import ReduceLROnPlateau
|
12 |
+
from collections.abc import MutableMapping
|
13 |
+
from omegaconf import ListConfig
|
14 |
+
|
15 |
+
def flatten_dict(d, parent_key="", sep="_"):
|
16 |
+
"""Flattens a dictionary into a single-level dictionary while preserving
|
17 |
+
parent keys. Taken from
|
18 |
+
`SO <https://stackoverflow.com/questions/6027558/flatten-nested-dictionaries-compressing-keys>`_
|
19 |
+
|
20 |
+
Args:
|
21 |
+
d (MutableMapping): Dictionary to be flattened.
|
22 |
+
parent_key (str): String to use as a prefix to all subsequent keys.
|
23 |
+
sep (str): String to use as a separator between two key levels.
|
24 |
+
|
25 |
+
Returns:
|
26 |
+
dict: Single-level dictionary, flattened.
|
27 |
+
"""
|
28 |
+
items = []
|
29 |
+
for k, v in d.items():
|
30 |
+
new_key = parent_key + sep + k if parent_key else k
|
31 |
+
if isinstance(v, MutableMapping):
|
32 |
+
items.extend(flatten_dict(v, new_key, sep=sep).items())
|
33 |
+
else:
|
34 |
+
items.append((new_key, v))
|
35 |
+
return dict(items)
|
36 |
+
|
37 |
+
|
38 |
+
class AudioLightningModule(pl.LightningModule):
|
39 |
+
def __init__(
|
40 |
+
self,
|
41 |
+
model=None,
|
42 |
+
discriminator=None,
|
43 |
+
optimizer=None,
|
44 |
+
loss_func=None,
|
45 |
+
metrics=None,
|
46 |
+
scheduler=None,
|
47 |
+
):
|
48 |
+
super().__init__()
|
49 |
+
self.audio_model = model
|
50 |
+
self.discriminator = discriminator
|
51 |
+
self.optimizer = list(optimizer)
|
52 |
+
self.loss_func = loss_func
|
53 |
+
self.metrics = metrics
|
54 |
+
self.scheduler = list(scheduler)
|
55 |
+
|
56 |
+
# Save lightning"s AttributeDict under self.hparams
|
57 |
+
self.default_monitor = "val_loss"
|
58 |
+
# self.print(self.audio_model)
|
59 |
+
self.validation_step_outputs = []
|
60 |
+
self.test_step_outputs = []
|
61 |
+
self.automatic_optimization = False
|
62 |
+
|
63 |
+
def forward(self, wav):
|
64 |
+
"""Applies forward pass of the model.
|
65 |
+
|
66 |
+
Returns:
|
67 |
+
:class:`torch.Tensor`
|
68 |
+
"""
|
69 |
+
return self.audio_model(wav)
|
70 |
+
|
71 |
+
def training_step(self, batch, batch_nb):
|
72 |
+
ori_data, codec_data = batch
|
73 |
+
optimizer_g, optimizer_d = self.optimizers()
|
74 |
+
# multiple schedulers
|
75 |
+
scheduler_g, scheduler_d = self.lr_schedulers()
|
76 |
+
|
77 |
+
# train discriminator
|
78 |
+
optimizer_g.zero_grad()
|
79 |
+
output = self(codec_data)
|
80 |
+
|
81 |
+
optimizer_d.zero_grad()
|
82 |
+
est_outputs, _ = self.discriminator(output.detach(), sample_rate=44100)
|
83 |
+
target_outputs, _ = self.discriminator(ori_data, sample_rate=44100)
|
84 |
+
|
85 |
+
loss_d = self.loss_func["d"](target_outputs, est_outputs)
|
86 |
+
self.manual_backward(loss_d)
|
87 |
+
self.clip_gradients(optimizer_d, gradient_clip_val=5, gradient_clip_algorithm="norm")
|
88 |
+
optimizer_d.step()
|
89 |
+
# train generator
|
90 |
+
est_outputs, est_feature_maps = self.discriminator(output, sample_rate=44100)
|
91 |
+
_, targets_feature_maps = self.discriminator(ori_data, sample_rate=44100)
|
92 |
+
|
93 |
+
loss_g = self.loss_func["g"](est_outputs, est_feature_maps, targets_feature_maps, output, ori_data)
|
94 |
+
self.manual_backward(loss_g)
|
95 |
+
self.clip_gradients(optimizer_g, gradient_clip_val=5, gradient_clip_algorithm="norm")
|
96 |
+
optimizer_g.step()
|
97 |
+
# print(loss)
|
98 |
+
|
99 |
+
if self.trainer.is_last_batch:
|
100 |
+
scheduler_g.step()
|
101 |
+
scheduler_d.step()
|
102 |
+
|
103 |
+
self.log(
|
104 |
+
"train_loss_d",
|
105 |
+
loss_d,
|
106 |
+
on_epoch=True,
|
107 |
+
prog_bar=True,
|
108 |
+
sync_dist=True,
|
109 |
+
logger=True,
|
110 |
+
)
|
111 |
+
|
112 |
+
self.log(
|
113 |
+
"train_loss_g",
|
114 |
+
loss_g,
|
115 |
+
on_epoch=True,
|
116 |
+
prog_bar=True,
|
117 |
+
sync_dist=True,
|
118 |
+
logger=True,
|
119 |
+
)
|
120 |
+
|
121 |
+
|
122 |
+
def validation_step(self, batch, batch_nb):
|
123 |
+
# cal val loss
|
124 |
+
ori_data, codec_data = batch
|
125 |
+
# print(mixtures.shape)
|
126 |
+
est_sources = self(codec_data)
|
127 |
+
loss = self.metrics(est_sources, ori_data)
|
128 |
+
|
129 |
+
self.log(
|
130 |
+
"val_loss",
|
131 |
+
loss,
|
132 |
+
on_epoch=True,
|
133 |
+
prog_bar=True,
|
134 |
+
sync_dist=True,
|
135 |
+
logger=True,
|
136 |
+
)
|
137 |
+
|
138 |
+
self.validation_step_outputs.append(loss)
|
139 |
+
|
140 |
+
return {"val_loss": loss}
|
141 |
+
|
142 |
+
def on_validation_epoch_end(self):
|
143 |
+
# val
|
144 |
+
avg_loss = torch.stack(self.validation_step_outputs).mean()
|
145 |
+
val_loss = torch.mean(self.all_gather(avg_loss))
|
146 |
+
self.log(
|
147 |
+
"lr",
|
148 |
+
self.optimizer[0].param_groups[0]["lr"],
|
149 |
+
on_epoch=True,
|
150 |
+
prog_bar=True,
|
151 |
+
sync_dist=True,
|
152 |
+
)
|
153 |
+
self.logger.experiment.log(
|
154 |
+
{"learning_rate": self.optimizer[0].param_groups[0]["lr"], "epoch": self.current_epoch}
|
155 |
+
)
|
156 |
+
self.logger.experiment.log(
|
157 |
+
{"val_pit_sisnr": -val_loss, "epoch": self.current_epoch}
|
158 |
+
)
|
159 |
+
|
160 |
+
self.validation_step_outputs.clear() # free memory
|
161 |
+
torch.cuda.empty_cache()
|
162 |
+
|
163 |
+
def test_step(self, batch, batch_nb):
|
164 |
+
mixtures, targets = batch
|
165 |
+
est_sources = self(mixtures)
|
166 |
+
loss = self.metrics(est_sources, targets)
|
167 |
+
self.log(
|
168 |
+
"test_loss",
|
169 |
+
loss,
|
170 |
+
on_epoch=True,
|
171 |
+
prog_bar=True,
|
172 |
+
sync_dist=True,
|
173 |
+
logger=True,
|
174 |
+
)
|
175 |
+
self.test_step_outputs.append(loss)
|
176 |
+
return {"test_loss": loss}
|
177 |
+
|
178 |
+
def on_test_epoch_end(self):
|
179 |
+
# val
|
180 |
+
avg_loss = torch.stack(self.test_step_outputs).mean()
|
181 |
+
test_loss = torch.mean(self.all_gather(avg_loss))
|
182 |
+
self.log(
|
183 |
+
"lr",
|
184 |
+
self.optimizer[0].param_groups[0]["lr"],
|
185 |
+
on_epoch=True,
|
186 |
+
prog_bar=True,
|
187 |
+
sync_dist=True,
|
188 |
+
)
|
189 |
+
self.logger.experiment.log(
|
190 |
+
{"learning_rate": self.optimizer.param_groups[0]["lr"], "epoch": self.current_epoch}
|
191 |
+
)
|
192 |
+
self.logger.experiment.log(
|
193 |
+
{"test_pit_sisnr": -test_loss, "epoch": self.current_epoch}
|
194 |
+
)
|
195 |
+
|
196 |
+
self.test_step_outputs.clear()
|
197 |
+
|
198 |
+
def configure_optimizers(self):
|
199 |
+
"""Initialize optimizers, batch-wise and epoch-wise schedulers."""
|
200 |
+
if self.scheduler is None:
|
201 |
+
return self.optimizer
|
202 |
+
if not isinstance(self.scheduler, (list, tuple)):
|
203 |
+
self.scheduler = [self.scheduler] # support multiple schedulers
|
204 |
+
|
205 |
+
if not isinstance(self.optimizer, (list, tuple)):
|
206 |
+
self.optimizer = [self.optimizer] # support multiple schedulers
|
207 |
+
|
208 |
+
epoch_schedulers = []
|
209 |
+
for sched in self.scheduler:
|
210 |
+
if not isinstance(sched, dict):
|
211 |
+
if isinstance(sched, ReduceLROnPlateau):
|
212 |
+
sched = {"scheduler": sched, "monitor": self.default_monitor}
|
213 |
+
epoch_schedulers.append(sched)
|
214 |
+
else:
|
215 |
+
sched.setdefault("monitor", self.default_monitor)
|
216 |
+
sched.setdefault("frequency", 1)
|
217 |
+
# Backward compat
|
218 |
+
if sched["interval"] == "batch":
|
219 |
+
sched["interval"] = "step"
|
220 |
+
assert sched["interval"] in [
|
221 |
+
"epoch",
|
222 |
+
"step",
|
223 |
+
], "Scheduler interval should be either step or epoch"
|
224 |
+
epoch_schedulers.append(sched)
|
225 |
+
return self.optimizer, epoch_schedulers
|
226 |
+
|
227 |
+
@staticmethod
|
228 |
+
def config_to_hparams(dic):
|
229 |
+
"""Sanitizes the config dict to be handled correctly by torch
|
230 |
+
SummaryWriter. It flattens the config dict, converts ``None`` to
|
231 |
+
``"None"`` and any list and tuple into torch.Tensors.
|
232 |
+
|
233 |
+
Args:
|
234 |
+
dic (dict): Dictionary to be transformed.
|
235 |
+
|
236 |
+
Returns:
|
237 |
+
dict: Transformed dictionary.
|
238 |
+
"""
|
239 |
+
dic = flatten_dict(dic)
|
240 |
+
for k, v in dic.items():
|
241 |
+
if v is None:
|
242 |
+
dic[k] = str(v)
|
243 |
+
elif isinstance(v, (list, tuple)):
|
244 |
+
dic[k] = torch.tensor(v)
|
245 |
+
return dic
|
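To make the manual-optimization flow above concrete, here is a hedged wiring sketch. The Conv1d discriminator, the None losses/metric, and the optimizer settings are placeholders, not the project's actual configuration (which is instantiated from configs/apollo.yaml); a real discriminator must return (outputs, feature_maps) as used in training_step.

import torch
import pytorch_lightning as pl
from look2hear.models.apollo import Apollo
from look2hear.system import AudioLightningModule, make_optimizer

generator = Apollo(sr=44100, win=20, feature_dim=256, layer=6)   # assumed hyperparameters
discriminator = torch.nn.Conv1d(2, 1, 3)                         # placeholder module only

opt_g = make_optimizer(generator.parameters(), optim_name="adamw", lr=1e-3)
opt_d = make_optimizer(discriminator.parameters(), optim_name="adamw", lr=1e-3)
sched_g = torch.optim.lr_scheduler.StepLR(opt_g, step_size=2, gamma=0.98)
sched_d = torch.optim.lr_scheduler.StepLR(opt_d, step_size=2, gamma=0.98)

system = AudioLightningModule(
    model=generator,
    discriminator=discriminator,
    optimizer=[opt_g, opt_d],          # generator optimizer first, discriminator second
    loss_func={"g": None, "d": None},  # placeholders for the GAN generator/discriminator losses
    metrics=None,                      # placeholder for the validation metric
    scheduler=[sched_g, sched_d],
)
# trainer = pl.Trainer(max_epochs=..., devices=1)
# trainer.fit(system, datamodule=...)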
look2hear/system/optimizers.py
ADDED
@@ -0,0 +1,113 @@
1 |
+
###
|
2 |
+
# Author: Kai Li
|
3 |
+
# Date: 2021-06-20 00:21:33
|
4 |
+
# LastEditors: Please set LastEditors
|
5 |
+
# LastEditTime: 2022-05-27 11:19:51
|
6 |
+
###
|
7 |
+
|
8 |
+
from torch.optim.optimizer import Optimizer
|
9 |
+
from torch.optim import Adam, RMSprop, SGD, Adadelta, Adagrad, Adamax, AdamW, ASGD
|
10 |
+
from torch_optimizer import (
|
11 |
+
AccSGD,
|
12 |
+
AdaBound,
|
13 |
+
AdaMod,
|
14 |
+
DiffGrad,
|
15 |
+
Lamb,
|
16 |
+
NovoGrad,
|
17 |
+
PID,
|
18 |
+
QHAdam,
|
19 |
+
QHM,
|
20 |
+
RAdam,
|
21 |
+
SGDW,
|
22 |
+
Yogi,
|
23 |
+
Ranger,
|
24 |
+
RangerQH,
|
25 |
+
RangerVA,
|
26 |
+
)
|
27 |
+
|
28 |
+
|
29 |
+
__all__ = [
|
30 |
+
"AccSGD",
|
31 |
+
"AdaBound",
|
32 |
+
"AdaMod",
|
33 |
+
"DiffGrad",
|
34 |
+
"Lamb",
|
35 |
+
"NovoGrad",
|
36 |
+
"PID",
|
37 |
+
"QHAdam",
|
38 |
+
"QHM",
|
39 |
+
"RAdam",
|
40 |
+
"SGDW",
|
41 |
+
"Yogi",
|
42 |
+
"Ranger",
|
43 |
+
"RangerQH",
|
44 |
+
"RangerVA",
|
45 |
+
"Adam",
|
46 |
+
"RMSprop",
|
47 |
+
"SGD",
|
48 |
+
"Adadelta",
|
49 |
+
"Adagrad",
|
50 |
+
"Adamax",
|
51 |
+
"AdamW",
|
52 |
+
"ASGD",
|
53 |
+
"make_optimizer",
|
54 |
+
"get",
|
55 |
+
]
|
56 |
+
|
57 |
+
|
58 |
+
def make_optimizer(params, optim_name="adam", **kwargs):
|
59 |
+
"""
|
60 |
+
|
61 |
+
Args:
|
62 |
+
params (iterable): Output of `nn.Module.parameters()`.
|
63 |
+
optim_name (str or :class:`torch.optim.Optimizer`): Identifier understood
|
64 |
+
by :func:`~.get`.
|
65 |
+
**kwargs (dict): keyword arguments for the optimizer.
|
66 |
+
|
67 |
+
Returns:
|
68 |
+
torch.optim.Optimizer
|
69 |
+
Examples
|
70 |
+
>>> from torch import nn
|
71 |
+
>>> model = nn.Sequential(nn.Linear(10, 10))
|
72 |
+
>>> optimizer = make_optimizer(model.parameters(), optim_name='sgd',
|
73 |
+
...                            lr=1e-3)
|
74 |
+
"""
|
75 |
+
return get(optim_name)(params, **kwargs)
|
76 |
+
|
77 |
+
|
78 |
+
def register_optimizer(custom_opt):
|
79 |
+
"""Register a custom opt, gettable with `optimzers.get`.
|
80 |
+
|
81 |
+
Args:
|
82 |
+
custom_opt: Custom optimizer to register.
|
83 |
+
|
84 |
+
"""
|
85 |
+
if (
|
86 |
+
custom_opt.__name__ in globals().keys()
|
87 |
+
or custom_opt.__name__.lower() in globals().keys()
|
88 |
+
):
|
89 |
+
raise ValueError(
|
90 |
+
f"Activation {custom_opt.__name__} already exists. Choose another name."
|
91 |
+
)
|
92 |
+
globals().update({custom_opt.__name__: custom_opt})
|
93 |
+
|
94 |
+
|
95 |
+
def get(identifier):
|
96 |
+
"""Returns an optimizer function from a string. Returns its input if it
|
97 |
+
is callable (already a :class:`torch.optim.Optimizer` for example).
|
98 |
+
|
99 |
+
Args:
|
100 |
+
identifier (str or Callable): the optimizer identifier.
|
101 |
+
|
102 |
+
Returns:
|
103 |
+
:class:`torch.optim.Optimizer` or None
|
104 |
+
"""
|
105 |
+
if isinstance(identifier, Optimizer):
|
106 |
+
return identifier
|
107 |
+
elif isinstance(identifier, str):
|
108 |
+
to_get = {k.lower(): v for k, v in globals().items()}
|
109 |
+
cls = to_get.get(identifier.lower())
|
110 |
+
if cls is None:
|
111 |
+
raise ValueError(f"Could not interpret optimizer : {str(identifier)}")
|
112 |
+
return cls
|
113 |
+
raise ValueError(f"Could not interpret optimizer : {str(identifier)}")
|
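A short, hedged example of the helpers above: make_optimizer() resolves a name case-insensitively, and register_optimizer() makes a new class resolvable by get(). MyAdam is a toy stand-in, not part of the repository.

import torch
from look2hear.system.optimizers import make_optimizer, register_optimizer, get

model = torch.nn.Linear(16, 16)
opt = make_optimizer(model.parameters(), optim_name="adamw", lr=1e-3)

class MyAdam(torch.optim.Adam):     # toy custom optimizer for illustration
    pass

register_optimizer(MyAdam)
opt2 = get("myadam")(model.parameters(), lr=1e-4)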
look2hear/system/schedulers.py
ADDED
@@ -0,0 +1,129 @@
1 |
+
import torch
|
2 |
+
from torch.optim.optimizer import Optimizer
|
3 |
+
import pytorch_lightning as pl
|
4 |
+
from torch.optim.lr_scheduler import _LRScheduler
|
5 |
+
|
6 |
+
|
7 |
+
class BaseScheduler(object):
|
8 |
+
"""Base class for the step-wise scheduler logic.
|
9 |
+
|
10 |
+
Args:
|
11 |
+
optimizer (Optimizer): Optimizer instance to apply lr schedule on.
|
12 |
+
|
13 |
+
Subclass this and overwrite ``_get_lr`` to write your own step-wise scheduler.
|
14 |
+
"""
|
15 |
+
|
16 |
+
def __init__(self, optimizer):
|
17 |
+
self.optimizer = optimizer
|
18 |
+
self.step_num = 0
|
19 |
+
|
20 |
+
def zero_grad(self):
|
21 |
+
self.optimizer.zero_grad()
|
22 |
+
|
23 |
+
def _get_lr(self):
|
24 |
+
raise NotImplementedError
|
25 |
+
|
26 |
+
def _set_lr(self, lr):
|
27 |
+
for param_group in self.optimizer.param_groups:
|
28 |
+
param_group["lr"] = lr
|
29 |
+
|
30 |
+
def step(self, metrics=None, epoch=None):
|
31 |
+
"""Update step-wise learning rate before optimizer.step."""
|
32 |
+
self.step_num += 1
|
33 |
+
lr = self._get_lr()
|
34 |
+
self._set_lr(lr)
|
35 |
+
|
36 |
+
def load_state_dict(self, state_dict):
|
37 |
+
self.__dict__.update(state_dict)
|
38 |
+
|
39 |
+
def state_dict(self):
|
40 |
+
return {key: value for key, value in self.__dict__.items() if key != "optimizer"}
|
41 |
+
|
42 |
+
def as_tensor(self, start=0, stop=100_000):
|
43 |
+
"""Returns the scheduler values from start to stop."""
|
44 |
+
lr_list = []
|
45 |
+
for _ in range(start, stop):
|
46 |
+
self.step_num += 1
|
47 |
+
lr_list.append(self._get_lr())
|
48 |
+
self.step_num = 0
|
49 |
+
return torch.tensor(lr_list)
|
50 |
+
|
51 |
+
def plot(self, start=0, stop=100_000): # noqa
|
52 |
+
"""Plot the scheduler values from start to stop."""
|
53 |
+
import matplotlib.pyplot as plt
|
54 |
+
|
55 |
+
all_lr = self.as_tensor(start=start, stop=stop)
|
56 |
+
plt.plot(all_lr.numpy())
|
57 |
+
plt.show()
|
58 |
+
|
59 |
+
class DPTNetScheduler(BaseScheduler):
|
60 |
+
"""Dual Path Transformer Scheduler used in [1]
|
61 |
+
|
62 |
+
Args:
|
63 |
+
optimizer (Optimizer): Optimizer instance to apply lr schedule on.
|
64 |
+
steps_per_epoch (int): Number of steps per epoch.
|
65 |
+
d_model(int): The number of units in the layer output.
|
66 |
+
warmup_steps (int): The number of steps in the warmup stage of training.
|
67 |
+
noam_scale (float): Linear increase rate in first phase.
|
68 |
+
exp_max (float): Max learning rate in second phase.
|
69 |
+
exp_base (float): Exp learning rate base in second phase.
|
70 |
+
|
71 |
+
Schedule:
|
72 |
+
This scheduler increases the learning rate linearly for the first
|
73 |
+
``warmup_steps``, and then decay it by 0.98 for every two epochs.
|
74 |
+
|
75 |
+
References
|
76 |
+
[1]: Jingjing Chen et al. "Dual-Path Transformer Network: Direct Context-
|
77 |
+
Aware Modeling for End-to-End Monaural Speech Separation" Interspeech 2020.
|
78 |
+
"""
|
79 |
+
|
80 |
+
def __init__(
|
81 |
+
self,
|
82 |
+
optimizer,
|
83 |
+
steps_per_epoch,
|
84 |
+
d_model,
|
85 |
+
warmup_steps=4000,
|
86 |
+
noam_scale=1.0,
|
87 |
+
exp_max=0.0004,
|
88 |
+
exp_base=0.98,
|
89 |
+
):
|
90 |
+
super().__init__(optimizer)
|
91 |
+
self.noam_scale = noam_scale
|
92 |
+
self.d_model = d_model
|
93 |
+
self.warmup_steps = warmup_steps
|
94 |
+
self.exp_max = exp_max
|
95 |
+
self.exp_base = exp_base
|
96 |
+
self.steps_per_epoch = steps_per_epoch
|
97 |
+
self.epoch = 0
|
98 |
+
|
99 |
+
def _get_lr(self):
|
100 |
+
if self.step_num % self.steps_per_epoch == 0:
|
101 |
+
self.epoch += 1
|
102 |
+
|
103 |
+
if self.step_num > self.warmup_steps:
|
104 |
+
# exp decaying
|
105 |
+
lr = self.exp_max * (self.exp_base ** ((self.epoch - 1) // 2))
|
106 |
+
else:
|
107 |
+
# noam
|
108 |
+
lr = (
|
109 |
+
self.noam_scale
|
110 |
+
* self.d_model ** (-0.5)
|
111 |
+
* min(self.step_num ** (-0.5), self.step_num * self.warmup_steps ** (-1.5))
|
112 |
+
)
|
113 |
+
return lr
|
114 |
+
|
115 |
+
class CustomExponentialLR(_LRScheduler):
|
116 |
+
def __init__(self, optimizer, gamma, step_size, last_epoch=-1):
|
117 |
+
self.gamma = gamma
|
118 |
+
self.step_size = step_size
|
119 |
+
self.base_lrs = list(map(lambda group: group['lr'], optimizer.param_groups))
|
120 |
+
super(CustomExponentialLR, self).__init__(optimizer, last_epoch)
|
121 |
+
|
122 |
+
def get_lr(self):
|
123 |
+
if self.last_epoch == 0 or (self.last_epoch + 1) % self.step_size != 0:
|
124 |
+
return [group['lr'] for group in self.optimizer.param_groups]
|
125 |
+
return [lr * self.gamma for lr in self.base_lrs]
|
126 |
+
|
127 |
+
|
128 |
+
# Backward compat
|
129 |
+
_BaseScheduler = BaseScheduler
|
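An illustrative way to drive the step-wise DPTNetScheduler by hand; the toy model and hyperparameters below are assumptions, not the settings used to train Apollo.

import torch
from look2hear.system.schedulers import DPTNetScheduler

model = torch.nn.Linear(8, 8)
opt = torch.optim.Adam(model.parameters(), lr=0.0)   # lr is overwritten on every step
sched = DPTNetScheduler(opt, steps_per_epoch=100, d_model=256, warmup_steps=4000)

for step in range(10):
    loss = model(torch.randn(4, 8)).pow(2).mean()
    loss.backward()
    sched.step()          # noam warmup first, then 0.98 decay every two epochs
    opt.step()
    opt.zero_grad()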
look2hear/utils/__init__.py
ADDED
@@ -0,0 +1,53 @@
1 |
+
###
|
2 |
+
# Author: Kai Li
|
3 |
+
# Date: 2021-06-18 16:53:49
|
4 |
+
# LastEditors: Please set LastEditors
|
5 |
+
# LastEditTime: 2024-01-22 01:01:02
|
6 |
+
###
|
7 |
+
from .stft import STFT
|
8 |
+
from .torch_utils import pad_x_to_y, shape_reconstructed, tensors_to_device
|
9 |
+
from .parser_utils import (
|
10 |
+
prepare_parser_from_dict,
|
11 |
+
parse_args_as_dict,
|
12 |
+
str_int_float,
|
13 |
+
str2bool,
|
14 |
+
str2bool_arg,
|
15 |
+
isfloat,
|
16 |
+
isint,
|
17 |
+
instantiate
|
18 |
+
)
|
19 |
+
from .lightning_utils import print_only, RichProgressBarTheme, MyRichProgressBar, BatchesProcessedColumn, MyMetricsTextColumn
|
20 |
+
from .complex_utils import is_complex, is_torch_complex_tensor, new_complex_like
|
21 |
+
from .get_layer_from_string import get_layer
|
22 |
+
from .inversible_interface import InversibleInterface
|
23 |
+
from .nets_utils import make_pad_mask
|
24 |
+
from .pylogger import RankedLogger
|
25 |
+
from .separator import wav_chunk_inference
|
26 |
+
|
27 |
+
__all__ = [
|
28 |
+
"wav_chunk_inference",
|
29 |
+
"RankedLogger",
|
30 |
+
"instantiate",
|
31 |
+
"STFT",
|
32 |
+
"pad_x_to_y",
|
33 |
+
"shape_reconstructed",
|
34 |
+
"tensors_to_device",
|
35 |
+
"prepare_parser_from_dict",
|
36 |
+
"parse_args_as_dict",
|
37 |
+
"str_int_float",
|
38 |
+
"str2bool",
|
39 |
+
"str2bool_arg",
|
40 |
+
"isfloat",
|
41 |
+
"isint",
|
42 |
+
"print_only",
|
43 |
+
"RichProgressBarTheme",
|
44 |
+
"MyRichProgressBar",
|
45 |
+
"BatchesProcessedColumn",
|
46 |
+
"MyMetricsTextColumn",
|
47 |
+
"is_complex",
|
48 |
+
"is_torch_complex_tensor",
|
49 |
+
"new_complex_like",
|
50 |
+
"get_layer",
|
51 |
+
"InversibleInterface",
|
52 |
+
"make_pad_mask",
|
53 |
+
]
|
look2hear/utils/complex_utils.py
ADDED
@@ -0,0 +1,191 @@
1 |
+
"""Beamformer module."""
|
2 |
+
from typing import Sequence, Tuple, Union
|
3 |
+
|
4 |
+
import torch
|
5 |
+
from packaging.version import parse as V
|
6 |
+
from torch_complex import functional as FC
|
7 |
+
from torch_complex.tensor import ComplexTensor
|
8 |
+
|
9 |
+
EPS = torch.finfo(torch.double).eps
|
10 |
+
is_torch_1_8_plus = V(torch.__version__) >= V("1.8.0")
|
11 |
+
is_torch_1_9_plus = V(torch.__version__) >= V("1.9.0")
|
12 |
+
|
13 |
+
|
14 |
+
def new_complex_like(
|
15 |
+
ref: Union[torch.Tensor, ComplexTensor],
|
16 |
+
real_imag: Tuple[torch.Tensor, torch.Tensor],
|
17 |
+
):
|
18 |
+
if isinstance(ref, ComplexTensor):
|
19 |
+
return ComplexTensor(*real_imag)
|
20 |
+
elif is_torch_complex_tensor(ref):
|
21 |
+
return torch.complex(*real_imag)
|
22 |
+
else:
|
23 |
+
raise ValueError(
|
24 |
+
"Please update your PyTorch version to 1.9+ for complex support."
|
25 |
+
)
|
26 |
+
|
27 |
+
|
28 |
+
def is_torch_complex_tensor(c):
|
29 |
+
return (
|
30 |
+
not isinstance(c, ComplexTensor) and is_torch_1_9_plus and torch.is_complex(c)
|
31 |
+
)
|
32 |
+
|
33 |
+
|
34 |
+
def is_complex(c):
|
35 |
+
return isinstance(c, ComplexTensor) or is_torch_complex_tensor(c)
|
36 |
+
|
37 |
+
|
38 |
+
def to_double(c):
|
39 |
+
if not isinstance(c, ComplexTensor) and is_torch_1_9_plus and torch.is_complex(c):
|
40 |
+
return c.to(dtype=torch.complex128)
|
41 |
+
else:
|
42 |
+
return c.double()
|
43 |
+
|
44 |
+
|
45 |
+
def to_float(c):
|
46 |
+
if not isinstance(c, ComplexTensor) and is_torch_1_9_plus and torch.is_complex(c):
|
47 |
+
return c.to(dtype=torch.complex64)
|
48 |
+
else:
|
49 |
+
return c.float()
|
50 |
+
|
51 |
+
|
52 |
+
def cat(seq: Sequence[Union[ComplexTensor, torch.Tensor]], *args, **kwargs):
|
53 |
+
if not isinstance(seq, (list, tuple)):
|
54 |
+
raise TypeError(
|
55 |
+
"cat(): argument 'tensors' (position 1) must be tuple of Tensors, "
|
56 |
+
"not Tensor"
|
57 |
+
)
|
58 |
+
if isinstance(seq[0], ComplexTensor):
|
59 |
+
return FC.cat(seq, *args, **kwargs)
|
60 |
+
else:
|
61 |
+
return torch.cat(seq, *args, **kwargs)
|
62 |
+
|
63 |
+
|
64 |
+
def complex_norm(
|
65 |
+
c: Union[torch.Tensor, ComplexTensor], dim=-1, keepdim=False
|
66 |
+
) -> torch.Tensor:
|
67 |
+
if not is_complex(c):
|
68 |
+
raise TypeError("Input is not a complex tensor.")
|
69 |
+
if is_torch_complex_tensor(c):
|
70 |
+
return torch.norm(c, dim=dim, keepdim=keepdim)
|
71 |
+
else:
|
72 |
+
if dim is None:
|
73 |
+
return torch.sqrt((c.real**2 + c.imag**2).sum() + EPS)
|
74 |
+
else:
|
75 |
+
return torch.sqrt(
|
76 |
+
(c.real**2 + c.imag**2).sum(dim=dim, keepdim=keepdim) + EPS
|
77 |
+
)
|
78 |
+
|
79 |
+
|
80 |
+
def einsum(equation, *operands):
|
81 |
+
# NOTE: Do not mix ComplexTensor and torch.complex in the input!
|
82 |
+
# NOTE (wangyou): Until PyTorch 1.9.0, torch.einsum does not support
|
83 |
+
# mixed input with complex and real tensors.
|
84 |
+
if len(operands) == 1:
|
85 |
+
if isinstance(operands[0], (tuple, list)):
|
86 |
+
operands = operands[0]
|
87 |
+
complex_module = FC if isinstance(operands[0], ComplexTensor) else torch
|
88 |
+
return complex_module.einsum(equation, *operands)
|
89 |
+
elif len(operands) != 2:
|
90 |
+
op0 = operands[0]
|
91 |
+
same_type = all(op.dtype == op0.dtype for op in operands[1:])
|
92 |
+
if same_type:
|
93 |
+
_einsum = FC.einsum if isinstance(op0, ComplexTensor) else torch.einsum
|
94 |
+
return _einsum(equation, *operands)
|
95 |
+
else:
|
96 |
+
raise ValueError("0 or More than 2 operands are not supported.")
|
97 |
+
a, b = operands
|
98 |
+
if isinstance(a, ComplexTensor) or isinstance(b, ComplexTensor):
|
99 |
+
return FC.einsum(equation, a, b)
|
100 |
+
elif is_torch_1_9_plus and (torch.is_complex(a) or torch.is_complex(b)):
|
101 |
+
if not torch.is_complex(a):
|
102 |
+
o_real = torch.einsum(equation, a, b.real)
|
103 |
+
o_imag = torch.einsum(equation, a, b.imag)
|
104 |
+
return torch.complex(o_real, o_imag)
|
105 |
+
elif not torch.is_complex(b):
|
106 |
+
o_real = torch.einsum(equation, a.real, b)
|
107 |
+
o_imag = torch.einsum(equation, a.imag, b)
|
108 |
+
return torch.complex(o_real, o_imag)
|
109 |
+
else:
|
110 |
+
return torch.einsum(equation, a, b)
|
111 |
+
else:
|
112 |
+
return torch.einsum(equation, a, b)
|
113 |
+
|
114 |
+
|
115 |
+
def inverse(
|
116 |
+
c: Union[torch.Tensor, ComplexTensor]
|
117 |
+
) -> Union[torch.Tensor, ComplexTensor]:
|
118 |
+
if isinstance(c, ComplexTensor):
|
119 |
+
return c.inverse2()
|
120 |
+
else:
|
121 |
+
return c.inverse()
|
122 |
+
|
123 |
+
|
124 |
+
def matmul(
|
125 |
+
a: Union[torch.Tensor, ComplexTensor], b: Union[torch.Tensor, ComplexTensor]
|
126 |
+
) -> Union[torch.Tensor, ComplexTensor]:
|
127 |
+
# NOTE: Do not mix ComplexTensor and torch.complex in the input!
|
128 |
+
# NOTE (wangyou): Until PyTorch 1.9.0, torch.matmul does not support
|
129 |
+
# multiplication between complex and real tensors.
|
130 |
+
if isinstance(a, ComplexTensor) or isinstance(b, ComplexTensor):
|
131 |
+
return FC.matmul(a, b)
|
132 |
+
elif is_torch_1_9_plus and (torch.is_complex(a) or torch.is_complex(b)):
|
133 |
+
if not torch.is_complex(a):
|
134 |
+
o_real = torch.matmul(a, b.real)
|
135 |
+
o_imag = torch.matmul(a, b.imag)
|
136 |
+
return torch.complex(o_real, o_imag)
|
137 |
+
elif not torch.is_complex(b):
|
138 |
+
o_real = torch.matmul(a.real, b)
|
139 |
+
o_imag = torch.matmul(a.imag, b)
|
140 |
+
return torch.complex(o_real, o_imag)
|
141 |
+
else:
|
142 |
+
return torch.matmul(a, b)
|
143 |
+
else:
|
144 |
+
return torch.matmul(a, b)
|
145 |
+
|
146 |
+
|
147 |
+
def trace(a: Union[torch.Tensor, ComplexTensor]):
|
148 |
+
# NOTE (wangyou): until PyTorch 1.9.0, torch.trace does not
|
149 |
+
# support batch processing. Use FC.trace() as fallback.
|
150 |
+
return FC.trace(a)
|
151 |
+
|
152 |
+
|
153 |
+
def reverse(a: Union[torch.Tensor, ComplexTensor], dim=0):
|
154 |
+
if isinstance(a, ComplexTensor):
|
155 |
+
return FC.reverse(a, dim=dim)
|
156 |
+
else:
|
157 |
+
return torch.flip(a, dims=(dim,))
|
158 |
+
|
159 |
+
|
160 |
+
def solve(b: Union[torch.Tensor, ComplexTensor], a: Union[torch.Tensor, ComplexTensor]):
|
161 |
+
"""Solve the linear equation ax = b."""
|
162 |
+
# NOTE: Do not mix ComplexTensor and torch.complex in the input!
|
163 |
+
# NOTE (wangyou): Until PyTorch 1.9.0, torch.solve does not support
|
164 |
+
# mixed input with complex and real tensors.
|
165 |
+
if isinstance(a, ComplexTensor) or isinstance(b, ComplexTensor):
|
166 |
+
if isinstance(a, ComplexTensor) and isinstance(b, ComplexTensor):
|
167 |
+
return FC.solve(b, a, return_LU=False)
|
168 |
+
else:
|
169 |
+
return matmul(inverse(a), b)
|
170 |
+
elif is_torch_1_9_plus and (torch.is_complex(a) or torch.is_complex(b)):
|
171 |
+
if torch.is_complex(a) and torch.is_complex(b):
|
172 |
+
return torch.linalg.solve(a, b)
|
173 |
+
else:
|
174 |
+
return matmul(inverse(a), b)
|
175 |
+
else:
|
176 |
+
if is_torch_1_8_plus:
|
177 |
+
return torch.linalg.solve(a, b)
|
178 |
+
else:
|
179 |
+
return torch.solve(b, a)[0]
|
180 |
+
|
181 |
+
|
182 |
+
def stack(seq: Sequence[Union[ComplexTensor, torch.Tensor]], *args, **kwargs):
|
183 |
+
if not isinstance(seq, (list, tuple)):
|
184 |
+
raise TypeError(
|
185 |
+
"stack(): argument 'tensors' (position 1) must be tuple of Tensors, "
|
186 |
+
"not Tensor"
|
187 |
+
)
|
188 |
+
if isinstance(seq[0], ComplexTensor):
|
189 |
+
return FC.stack(seq, *args, **kwargs)
|
190 |
+
else:
|
191 |
+
return torch.stack(seq, *args, **kwargs)
|
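A small sketch of the mixed real/complex helpers above, using native torch complex tensors (so the ComplexTensor fallback paths are not exercised); the shapes are arbitrary.

import torch
from look2hear.utils.complex_utils import einsum, matmul, new_complex_like, complex_norm

spec = torch.randn(2, 4, 8, dtype=torch.complex64)   # e.g. batch, freq, time
mask = torch.rand(2, 4, 8)                           # real-valued mask

masked = einsum("bft,bft->bft", spec, mask)          # complex x real handled explicitly
proj = matmul(mask, spec.transpose(1, 2))            # (2,4,8) @ (2,8,4) -> complex (2,4,4)
rebuilt = new_complex_like(spec, (masked.real, masked.imag))
print(complex_norm(masked, dim=-1).shape)            # torch.Size([2, 4])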
look2hear/utils/get_layer_from_string.py
ADDED
@@ -0,0 +1,43 @@
1 |
+
import difflib
|
2 |
+
|
3 |
+
import torch
|
4 |
+
|
5 |
+
|
6 |
+
def get_layer(l_name, library=torch.nn):
|
7 |
+
"""Return layer object handler from library e.g. from torch.nn
|
8 |
+
|
9 |
+
E.g. if l_name=="elu", returns torch.nn.ELU.
|
10 |
+
|
11 |
+
Args:
|
12 |
+
l_name (string): Case-insensitive name for layer in library (e.g. 'elu').
|
13 |
+
library (module): Name of library/module where to search for object handler
|
14 |
+
with l_name e.g. "torch.nn".
|
15 |
+
|
16 |
+
Returns:
|
17 |
+
layer_handler (object): handler for the requested layer e.g. (torch.nn.ELU)
|
18 |
+
|
19 |
+
"""
|
20 |
+
|
21 |
+
all_torch_layers = [x for x in dir(torch.nn)]
|
22 |
+
match = [x for x in all_torch_layers if l_name.lower() == x.lower()]
|
23 |
+
if len(match) == 0:
|
24 |
+
close_matches = difflib.get_close_matches(
|
25 |
+
l_name, [x.lower() for x in all_torch_layers]
|
26 |
+
)
|
27 |
+
raise NotImplementedError(
|
28 |
+
"Layer with name {} not found in {}.\n Closest matches: {}".format(
|
29 |
+
l_name, str(library), close_matches
|
30 |
+
)
|
31 |
+
)
|
32 |
+
elif len(match) > 1:
|
33 |
+
close_matches = difflib.get_close_matches(
|
34 |
+
l_name, [x.lower() for x in all_torch_layers]
|
35 |
+
)
|
36 |
+
raise NotImplementedError(
|
37 |
+
"Multiple matchs for layer with name {} not found in {}.\n "
|
38 |
+
"All matches: {}".format(l_name, str(library), close_matches)
|
39 |
+
)
|
40 |
+
else:
|
41 |
+
# valid
|
42 |
+
layer_handler = getattr(library, match[0])
|
43 |
+
return layer_handler
|
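For example, resolving a layer class by its case-insensitive name and instantiating it:

from look2hear.utils.get_layer_from_string import get_layer

act_cls = get_layer("prelu")          # -> torch.nn.PReLU
activation = act_cls(num_parameters=1)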
look2hear/utils/inversible_interface.py
ADDED
@@ -0,0 +1,13 @@
1 |
+
from abc import ABC, abstractmethod
|
2 |
+
from typing import Tuple
|
3 |
+
|
4 |
+
import torch
|
5 |
+
|
6 |
+
|
7 |
+
class InversibleInterface(ABC):
|
8 |
+
@abstractmethod
|
9 |
+
def inverse(
|
10 |
+
self, input: torch.Tensor, input_lengths: torch.Tensor = None
|
11 |
+
) -> Tuple[torch.Tensor, torch.Tensor]:
|
12 |
+
# return output, output_lengths
|
13 |
+
raise NotImplementedError
|
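A minimal sketch of a subclass of this interface; the sign-flip transform is purely illustrative (an STFT-style front-end would be a more realistic implementer).

from typing import Tuple
import torch
from look2hear.utils.inversible_interface import InversibleInterface

class SignFlip(InversibleInterface):
    """Toy invertible transform: apply() negates the signal, inverse() undoes it."""

    def apply(self, input: torch.Tensor, input_lengths: torch.Tensor = None):
        return -input, input_lengths

    def inverse(
        self, input: torch.Tensor, input_lengths: torch.Tensor = None
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        return -input, input_lengths

x = torch.randn(2, 100)
flip = SignFlip()
y, _ = flip.apply(x)
x_rec, _ = flip.inverse(y)
assert torch.allclose(x, x_rec)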
look2hear/utils/lightning_utils.py
ADDED
@@ -0,0 +1,110 @@
1 |
+
###
|
2 |
+
# Author: Kai Li
|
3 |
+
# Date: 2022-05-27 10:27:56
|
4 |
+
# Email: [email protected]
|
5 |
+
# LastEditTime: 2022-06-13 12:11:15
|
6 |
+
###
|
7 |
+
from rich import print
|
8 |
+
from dataclasses import dataclass
|
9 |
+
from pytorch_lightning.utilities import rank_zero_only
|
10 |
+
from typing import Union
|
11 |
+
from pytorch_lightning.callbacks.progress.rich_progress import *
|
12 |
+
from rich.console import Console, RenderableType
|
13 |
+
from rich.progress_bar import ProgressBar
|
14 |
+
from rich.style import Style
|
15 |
+
from rich.text import Text
|
16 |
+
from rich.progress import (
|
17 |
+
BarColumn,
|
18 |
+
DownloadColumn,
|
19 |
+
Progress,
|
20 |
+
TaskID,
|
21 |
+
TextColumn,
|
22 |
+
TimeRemainingColumn,
|
23 |
+
TransferSpeedColumn,
|
24 |
+
ProgressColumn
|
25 |
+
)
|
26 |
+
from rich import print, reconfigure
|
27 |
+
|
28 |
+
@rank_zero_only
|
29 |
+
def print_only(message: str):
|
30 |
+
print(message)
|
31 |
+
|
32 |
+
@dataclass
|
33 |
+
class RichProgressBarTheme:
|
34 |
+
"""Styles to associate to different base components.
|
35 |
+
|
36 |
+
Args:
|
37 |
+
description: Style for the progress bar description. For eg., Epoch x, Testing, etc.
|
38 |
+
progress_bar: Style for the bar in progress.
|
39 |
+
progress_bar_finished: Style for the finished progress bar.
|
40 |
+
progress_bar_pulse: Style for the progress bar when `IterableDataset` is being processed.
|
41 |
+
batch_progress: Style for the progress tracker (i.e 10/50 batches completed).
|
42 |
+
time: Style for the processed time and estimate time remaining.
|
43 |
+
processing_speed: Style for the speed of the batches being processed.
|
44 |
+
metrics: Style for the metrics
|
45 |
+
|
46 |
+
https://rich.readthedocs.io/en/stable/style.html
|
47 |
+
"""
|
48 |
+
|
49 |
+
description: Union[str, Style] = "#FF4500"
|
50 |
+
progress_bar: Union[str, Style] = "#f92672"
|
51 |
+
progress_bar_finished: Union[str, Style] = "#b7cc8a"
|
52 |
+
progress_bar_pulse: Union[str, Style] = "#f92672"
|
53 |
+
batch_progress: Union[str, Style] = "#fc608a"
|
54 |
+
time: Union[str, Style] = "#45ada2"
|
55 |
+
processing_speed: Union[str, Style] = "#DC143C"
|
56 |
+
metrics: Union[str, Style] = "#228B22"
|
57 |
+
|
58 |
+
class BatchesProcessedColumn(ProgressColumn):
|
59 |
+
def __init__(self, style: Union[str, Style]):
|
60 |
+
self.style = style
|
61 |
+
super().__init__()
|
62 |
+
|
63 |
+
def render(self, task) -> RenderableType:
|
64 |
+
total = task.total if task.total != float("inf") else "--"
|
65 |
+
return Text(f"{int(task.completed)}/{int(total)}", style=self.style)
|
66 |
+
|
67 |
+
class MyMetricsTextColumn(ProgressColumn):
|
68 |
+
"""A column containing text."""
|
69 |
+
|
70 |
+
def __init__(self, style):
|
71 |
+
self._tasks = {}
|
72 |
+
self._current_task_id = 0
|
73 |
+
self._metrics = {}
|
74 |
+
self._style = style
|
75 |
+
super().__init__()
|
76 |
+
|
77 |
+
def update(self, metrics):
|
78 |
+
# Called when metrics are ready to be rendered.
|
79 |
+
# This is to prevent render from causing deadlock issues by requesting metrics
|
80 |
+
# in separate threads.
|
81 |
+
self._metrics = metrics
|
82 |
+
|
83 |
+
def render(self, task) -> Text:
|
84 |
+
text = ""
|
85 |
+
for k, v in self._metrics.items():
|
86 |
+
text += f"{k}: {round(v, 3) if isinstance(v, float) else v} "
|
87 |
+
return Text(text, justify="left", style=self._style)
|
88 |
+
|
89 |
+
class MyRichProgressBar(RichProgressBar):
|
90 |
+
"""A progress bar prints metrics at the end of each epoch
|
91 |
+
"""
|
92 |
+
|
93 |
+
def _init_progress(self, trainer):
|
94 |
+
if self.is_enabled and (self.progress is None or self._progress_stopped):
|
95 |
+
self._reset_progress_bar_ids()
|
96 |
+
reconfigure(**self._console_kwargs)
|
97 |
+
# file = open("/home/likai/data/Look2Hear/Experiments/run_logs/EdgeFRCNN-Noncausal.log", 'w')
|
98 |
+
self._console: Console = Console(force_terminal=True)
|
99 |
+
self._console.clear_live()
|
100 |
+
self._metric_component = MetricsTextColumn(trainer, self.theme.metrics)
|
101 |
+
self.progress = CustomProgress(
|
102 |
+
*self.configure_columns(trainer),
|
103 |
+
self._metric_component,
|
104 |
+
auto_refresh=False,
|
105 |
+
disable=self.is_disabled,
|
106 |
+
console=self._console,
|
107 |
+
)
|
108 |
+
self.progress.start()
|
109 |
+
# progress has started
|
110 |
+
self._progress_stopped = False
|
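A hedged sketch of plugging the customised progress bar into a Trainer; the theme values are the dataclass defaults above and the Trainer arguments are placeholders (fit() is not invoked here).

import pytorch_lightning as pl
from look2hear.utils.lightning_utils import MyRichProgressBar, RichProgressBarTheme

progress_bar = MyRichProgressBar(theme=RichProgressBarTheme())
trainer = pl.Trainer(max_epochs=1, callbacks=[progress_bar])
# trainer.fit(system, datamodule=...)   # `system` would be an AudioLightningModule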
look2hear/utils/nets_utils.py
ADDED
@@ -0,0 +1,503 @@
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
|
3 |
+
"""Network related utility tools."""
|
4 |
+
|
5 |
+
import logging
|
6 |
+
from typing import Dict
|
7 |
+
|
8 |
+
import numpy as np
|
9 |
+
import torch
|
10 |
+
|
11 |
+
|
12 |
+
def to_device(m, x):
|
13 |
+
"""Send tensor into the device of the module.
|
14 |
+
|
15 |
+
Args:
|
16 |
+
m (torch.nn.Module): Torch module.
|
17 |
+
x (Tensor): Torch tensor.
|
18 |
+
|
19 |
+
Returns:
|
20 |
+
Tensor: Torch tensor located in the same place as torch module.
|
21 |
+
|
22 |
+
"""
|
23 |
+
if isinstance(m, torch.nn.Module):
|
24 |
+
device = next(m.parameters()).device
|
25 |
+
elif isinstance(m, torch.Tensor):
|
26 |
+
device = m.device
|
27 |
+
else:
|
28 |
+
raise TypeError(
|
29 |
+
"Expected torch.nn.Module or torch.tensor, " f"bot got: {type(m)}"
|
30 |
+
)
|
31 |
+
return x.to(device)
|
32 |
+
|
33 |
+
|
34 |
+
def pad_list(xs, pad_value):
|
35 |
+
"""Perform padding for the list of tensors.
|
36 |
+
|
37 |
+
Args:
|
38 |
+
xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)].
|
39 |
+
pad_value (float): Value for padding.
|
40 |
+
|
41 |
+
Returns:
|
42 |
+
Tensor: Padded tensor (B, Tmax, `*`).
|
43 |
+
|
44 |
+
Examples:
|
45 |
+
>>> x = [torch.ones(4), torch.ones(2), torch.ones(1)]
|
46 |
+
>>> x
|
47 |
+
[tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])]
|
48 |
+
>>> pad_list(x, 0)
|
49 |
+
tensor([[1., 1., 1., 1.],
|
50 |
+
[1., 1., 0., 0.],
|
51 |
+
[1., 0., 0., 0.]])
|
52 |
+
|
53 |
+
"""
|
54 |
+
n_batch = len(xs)
|
55 |
+
max_len = max(x.size(0) for x in xs)
|
56 |
+
pad = xs[0].new(n_batch, max_len, *xs[0].size()[1:]).fill_(pad_value)
|
57 |
+
|
58 |
+
for i in range(n_batch):
|
59 |
+
pad[i, : xs[i].size(0)] = xs[i]
|
60 |
+
|
61 |
+
return pad
|
62 |
+
|
63 |
+
|
64 |
+
def make_pad_mask(lengths, xs=None, length_dim=-1, maxlen=None):
|
65 |
+
"""Make mask tensor containing indices of padded part.
|
66 |
+
|
67 |
+
Args:
|
68 |
+
lengths (LongTensor or List): Batch of lengths (B,).
|
69 |
+
xs (Tensor, optional): The reference tensor.
|
70 |
+
If set, masks will be the same shape as this tensor.
|
71 |
+
length_dim (int, optional): Dimension indicator of the above tensor.
|
72 |
+
See the example.
|
73 |
+
|
74 |
+
Returns:
|
75 |
+
Tensor: Mask tensor containing indices of padded part.
|
76 |
+
dtype=torch.uint8 in PyTorch 1.2-
|
77 |
+
dtype=torch.bool in PyTorch 1.2+ (including 1.2)
|
78 |
+
|
79 |
+
Examples:
|
80 |
+
With only lengths.
|
81 |
+
|
82 |
+
>>> lengths = [5, 3, 2]
|
83 |
+
>>> make_pad_mask(lengths)
|
84 |
+
masks = [[0, 0, 0, 0 ,0],
|
85 |
+
[0, 0, 0, 1, 1],
|
86 |
+
[0, 0, 1, 1, 1]]
|
87 |
+
|
88 |
+
With the reference tensor.
|
89 |
+
|
90 |
+
>>> xs = torch.zeros((3, 2, 4))
|
91 |
+
>>> make_pad_mask(lengths, xs)
|
92 |
+
tensor([[[0, 0, 0, 0],
|
93 |
+
[0, 0, 0, 0]],
|
94 |
+
[[0, 0, 0, 1],
|
95 |
+
[0, 0, 0, 1]],
|
96 |
+
[[0, 0, 1, 1],
|
97 |
+
[0, 0, 1, 1]]], dtype=torch.uint8)
|
98 |
+
>>> xs = torch.zeros((3, 2, 6))
|
99 |
+
>>> make_pad_mask(lengths, xs)
|
100 |
+
tensor([[[0, 0, 0, 0, 0, 1],
|
101 |
+
[0, 0, 0, 0, 0, 1]],
|
102 |
+
[[0, 0, 0, 1, 1, 1],
|
103 |
+
[0, 0, 0, 1, 1, 1]],
|
104 |
+
[[0, 0, 1, 1, 1, 1],
|
105 |
+
[0, 0, 1, 1, 1, 1]]], dtype=torch.uint8)
|
106 |
+
|
107 |
+
With the reference tensor and dimension indicator.
|
108 |
+
|
109 |
+
>>> xs = torch.zeros((3, 6, 6))
|
110 |
+
>>> make_pad_mask(lengths, xs, 1)
|
111 |
+
tensor([[[0, 0, 0, 0, 0, 0],
|
112 |
+
[0, 0, 0, 0, 0, 0],
|
113 |
+
[0, 0, 0, 0, 0, 0],
|
114 |
+
[0, 0, 0, 0, 0, 0],
|
115 |
+
[0, 0, 0, 0, 0, 0],
|
116 |
+
[1, 1, 1, 1, 1, 1]],
|
117 |
+
[[0, 0, 0, 0, 0, 0],
|
118 |
+
[0, 0, 0, 0, 0, 0],
|
119 |
+
[0, 0, 0, 0, 0, 0],
|
120 |
+
[1, 1, 1, 1, 1, 1],
|
121 |
+
[1, 1, 1, 1, 1, 1],
|
122 |
+
[1, 1, 1, 1, 1, 1]],
|
123 |
+
[[0, 0, 0, 0, 0, 0],
|
124 |
+
[0, 0, 0, 0, 0, 0],
|
125 |
+
[1, 1, 1, 1, 1, 1],
|
126 |
+
[1, 1, 1, 1, 1, 1],
|
127 |
+
[1, 1, 1, 1, 1, 1],
|
128 |
+
[1, 1, 1, 1, 1, 1]]], dtype=torch.uint8)
|
129 |
+
>>> make_pad_mask(lengths, xs, 2)
|
130 |
+
tensor([[[0, 0, 0, 0, 0, 1],
|
131 |
+
[0, 0, 0, 0, 0, 1],
|
132 |
+
[0, 0, 0, 0, 0, 1],
|
133 |
+
[0, 0, 0, 0, 0, 1],
|
134 |
+
[0, 0, 0, 0, 0, 1],
|
135 |
+
[0, 0, 0, 0, 0, 1]],
|
136 |
+
[[0, 0, 0, 1, 1, 1],
|
137 |
+
[0, 0, 0, 1, 1, 1],
|
138 |
+
[0, 0, 0, 1, 1, 1],
|
139 |
+
[0, 0, 0, 1, 1, 1],
|
140 |
+
[0, 0, 0, 1, 1, 1],
|
141 |
+
[0, 0, 0, 1, 1, 1]],
|
142 |
+
[[0, 0, 1, 1, 1, 1],
|
143 |
+
[0, 0, 1, 1, 1, 1],
|
144 |
+
[0, 0, 1, 1, 1, 1],
|
145 |
+
[0, 0, 1, 1, 1, 1],
|
146 |
+
[0, 0, 1, 1, 1, 1],
|
147 |
+
[0, 0, 1, 1, 1, 1]]], dtype=torch.uint8)
|
148 |
+
|
149 |
+
"""
|
150 |
+
if length_dim == 0:
|
151 |
+
raise ValueError("length_dim cannot be 0: {}".format(length_dim))
|
152 |
+
|
153 |
+
if not isinstance(lengths, list):
|
154 |
+
lengths = lengths.long().tolist()
|
155 |
+
|
156 |
+
bs = int(len(lengths))
|
157 |
+
if maxlen is None:
|
158 |
+
if xs is None:
|
159 |
+
maxlen = int(max(lengths))
|
160 |
+
else:
|
161 |
+
maxlen = xs.size(length_dim)
|
162 |
+
else:
|
163 |
+
assert xs is None
|
164 |
+
assert maxlen >= int(max(lengths))
|
165 |
+
|
166 |
+
seq_range = torch.arange(0, maxlen, dtype=torch.int64)
|
167 |
+
seq_range_expand = seq_range.unsqueeze(0).expand(bs, maxlen)
|
168 |
+
seq_length_expand = seq_range_expand.new(lengths).unsqueeze(-1)
|
169 |
+
mask = seq_range_expand >= seq_length_expand
|
170 |
+
|
171 |
+
if xs is not None:
|
172 |
+
assert xs.size(0) == bs, (xs.size(0), bs)
|
173 |
+
|
174 |
+
if length_dim < 0:
|
175 |
+
length_dim = xs.dim() + length_dim
|
176 |
+
# ind = (:, None, ..., None, :, , None, ..., None)
|
177 |
+
ind = tuple(
|
178 |
+
slice(None) if i in (0, length_dim) else None for i in range(xs.dim())
|
179 |
+
)
|
180 |
+
mask = mask[ind].expand_as(xs).to(xs.device)
|
181 |
+
return mask
|
182 |
+
|
183 |
+
|
184 |
+
def make_non_pad_mask(lengths, xs=None, length_dim=-1):
|
185 |
+
"""Make mask tensor containing indices of non-padded part.
|
186 |
+
|
187 |
+
Args:
|
188 |
+
lengths (LongTensor or List): Batch of lengths (B,).
|
189 |
+
xs (Tensor, optional): The reference tensor.
|
190 |
+
If set, masks will be the same shape as this tensor.
|
191 |
+
length_dim (int, optional): Dimension indicator of the above tensor.
|
192 |
+
See the example.
|
193 |
+
|
194 |
+
Returns:
|
195 |
+
ByteTensor: mask tensor containing indices of padded part.
|
196 |
+
dtype=torch.uint8 in PyTorch 1.2-
|
197 |
+
dtype=torch.bool in PyTorch 1.2+ (including 1.2)
|
198 |
+
|
199 |
+
Examples:
|
200 |
+
With only lengths.
|
201 |
+
|
202 |
+
>>> lengths = [5, 3, 2]
|
203 |
+
>>> make_non_pad_mask(lengths)
|
204 |
+
masks = [[1, 1, 1, 1 ,1],
|
205 |
+
[1, 1, 1, 0, 0],
|
206 |
+
[1, 1, 0, 0, 0]]
|
207 |
+
|
208 |
+
With the reference tensor.
|
209 |
+
|
210 |
+
>>> xs = torch.zeros((3, 2, 4))
|
211 |
+
>>> make_non_pad_mask(lengths, xs)
|
212 |
+
tensor([[[1, 1, 1, 1],
|
213 |
+
[1, 1, 1, 1]],
|
214 |
+
[[1, 1, 1, 0],
|
215 |
+
[1, 1, 1, 0]],
|
216 |
+
[[1, 1, 0, 0],
|
217 |
+
[1, 1, 0, 0]]], dtype=torch.uint8)
|
218 |
+
>>> xs = torch.zeros((3, 2, 6))
|
219 |
+
>>> make_non_pad_mask(lengths, xs)
|
220 |
+
tensor([[[1, 1, 1, 1, 1, 0],
|
221 |
+
[1, 1, 1, 1, 1, 0]],
|
222 |
+
[[1, 1, 1, 0, 0, 0],
|
223 |
+
[1, 1, 1, 0, 0, 0]],
|
224 |
+
[[1, 1, 0, 0, 0, 0],
|
225 |
+
[1, 1, 0, 0, 0, 0]]], dtype=torch.uint8)
|
226 |
+
|
227 |
+
With the reference tensor and dimension indicator.
|
228 |
+
|
229 |
+
>>> xs = torch.zeros((3, 6, 6))
|
230 |
+
>>> make_non_pad_mask(lengths, xs, 1)
|
231 |
+
tensor([[[1, 1, 1, 1, 1, 1],
|
232 |
+
[1, 1, 1, 1, 1, 1],
|
233 |
+
[1, 1, 1, 1, 1, 1],
|
234 |
+
[1, 1, 1, 1, 1, 1],
|
235 |
+
[1, 1, 1, 1, 1, 1],
|
236 |
+
[0, 0, 0, 0, 0, 0]],
|
237 |
+
[[1, 1, 1, 1, 1, 1],
|
238 |
+
[1, 1, 1, 1, 1, 1],
|
239 |
+
[1, 1, 1, 1, 1, 1],
|
240 |
+
[0, 0, 0, 0, 0, 0],
|
241 |
+
[0, 0, 0, 0, 0, 0],
|
242 |
+
[0, 0, 0, 0, 0, 0]],
|
243 |
+
[[1, 1, 1, 1, 1, 1],
|
244 |
+
[1, 1, 1, 1, 1, 1],
|
245 |
+
[0, 0, 0, 0, 0, 0],
|
246 |
+
[0, 0, 0, 0, 0, 0],
|
247 |
+
[0, 0, 0, 0, 0, 0],
|
248 |
+
[0, 0, 0, 0, 0, 0]]], dtype=torch.uint8)
|
249 |
+
>>> make_non_pad_mask(lengths, xs, 2)
|
250 |
+
tensor([[[1, 1, 1, 1, 1, 0],
|
251 |
+
                 [1, 1, 1, 1, 1, 0],
                 [1, 1, 1, 1, 1, 0],
                 [1, 1, 1, 1, 1, 0],
                 [1, 1, 1, 1, 1, 0],
                 [1, 1, 1, 1, 1, 0]],
                [[1, 1, 1, 0, 0, 0],
                 [1, 1, 1, 0, 0, 0],
                 [1, 1, 1, 0, 0, 0],
                 [1, 1, 1, 0, 0, 0],
                 [1, 1, 1, 0, 0, 0],
                 [1, 1, 1, 0, 0, 0]],
                [[1, 1, 0, 0, 0, 0],
                 [1, 1, 0, 0, 0, 0],
                 [1, 1, 0, 0, 0, 0],
                 [1, 1, 0, 0, 0, 0],
                 [1, 1, 0, 0, 0, 0],
                 [1, 1, 0, 0, 0, 0]]], dtype=torch.uint8)

    """
    return ~make_pad_mask(lengths, xs, length_dim)


def mask_by_length(xs, lengths, fill=0):
    """Mask tensor according to length.

    Args:
        xs (Tensor): Batch of input tensor (B, `*`).
        lengths (LongTensor or List): Batch of lengths (B,).
        fill (int or float): Value to fill masked part.

    Returns:
        Tensor: Batch of masked input tensor (B, `*`).

    Examples:
        >>> x = torch.arange(5).repeat(3, 1) + 1
        >>> x
        tensor([[1, 2, 3, 4, 5],
                [1, 2, 3, 4, 5],
                [1, 2, 3, 4, 5]])
        >>> lengths = [5, 3, 2]
        >>> mask_by_length(x, lengths)
        tensor([[1, 2, 3, 4, 5],
                [1, 2, 3, 0, 0],
                [1, 2, 0, 0, 0]])

    """
    assert xs.size(0) == len(lengths)
    ret = xs.data.new(*xs.size()).fill_(fill)
    for i, l in enumerate(lengths):
        ret[i, :l] = xs[i, :l]
    return ret


def th_accuracy(pad_outputs, pad_targets, ignore_label):
    """Calculate accuracy.

    Args:
        pad_outputs (Tensor): Prediction tensors (B * Lmax, D).
        pad_targets (LongTensor): Target label tensors (B, Lmax, D).
        ignore_label (int): Ignore label id.

    Returns:
        float: Accuracy value (0.0 - 1.0).

    """
    pad_pred = pad_outputs.view(
        pad_targets.size(0), pad_targets.size(1), pad_outputs.size(1)
    ).argmax(2)
    mask = pad_targets != ignore_label
    numerator = torch.sum(
        pad_pred.masked_select(mask) == pad_targets.masked_select(mask)
    )
    denominator = torch.sum(mask)
    return float(numerator) / float(denominator)


def to_torch_tensor(x):
    """Change to torch.Tensor or ComplexTensor from numpy.ndarray.

    Args:
        x: Inputs. It should be one of numpy.ndarray, Tensor, ComplexTensor, and dict.

    Returns:
        Tensor or ComplexTensor: Type converted inputs.

    Examples:
        >>> xs = np.ones(3, dtype=np.float32)
        >>> xs = to_torch_tensor(xs)
        tensor([1., 1., 1.])
        >>> xs = torch.ones(3, 4, 5)
        >>> assert to_torch_tensor(xs) is xs
        >>> xs = {'real': xs, 'imag': xs}
        >>> to_torch_tensor(xs)
        ComplexTensor(
        Real:
        tensor([1., 1., 1.])
        Imag;
        tensor([1., 1., 1.])
        )

    """
    # If numpy, change to torch tensor
    if isinstance(x, np.ndarray):
        if x.dtype.kind == "c":
            # Dynamically importing because torch_complex requires python3
            from torch_complex.tensor import ComplexTensor

            return ComplexTensor(x)
        else:
            return torch.from_numpy(x)

    # If {'real': ..., 'imag': ...}, convert to ComplexTensor
    elif isinstance(x, dict):
        # Dynamically importing because torch_complex requires python3
        from torch_complex.tensor import ComplexTensor

        if "real" not in x or "imag" not in x:
            raise ValueError("has 'real' and 'imag' keys: {}".format(list(x)))
        # Relative importing because of using python3 syntax
        return ComplexTensor(x["real"], x["imag"])

    # If torch.Tensor, as it is
    elif isinstance(x, torch.Tensor):
        return x

    else:
        error = (
            "x must be numpy.ndarray, torch.Tensor or a dict like "
            "{{'real': torch.Tensor, 'imag': torch.Tensor}}, "
            "but got {}".format(type(x))
        )
        try:
            from torch_complex.tensor import ComplexTensor
        except Exception:
            # If PY2
            raise ValueError(error)
        else:
            # If PY3
            if isinstance(x, ComplexTensor):
                return x
            else:
                raise ValueError(error)


def get_subsample(train_args, mode, arch):
    """Parse the subsampling factors from the args for the specified `mode` and `arch`.

    Args:
        train_args: argument Namespace containing options.
        mode: one of ('asr', 'mt', 'st')
        arch: one of ('rnn', 'rnn-t', 'rnn_mix', 'rnn_mulenc', 'transformer')

    Returns:
        np.ndarray / List[np.ndarray]: subsampling factors.
    """
    if arch == "transformer":
        return np.array([1])

    elif mode == "mt" and arch == "rnn":
        # +1 means input (+1) and layers outputs (train_args.elayer)
        subsample = np.ones(train_args.elayers + 1, dtype=np.int64)
        logging.warning("Subsampling is not performed for machine translation.")
        logging.info("subsample: " + " ".join([str(x) for x in subsample]))
        return subsample

    elif (
        (mode == "asr" and arch in ("rnn", "rnn-t"))
        or (mode == "mt" and arch == "rnn")
        or (mode == "st" and arch == "rnn")
    ):
        subsample = np.ones(train_args.elayers + 1, dtype=np.int64)
        if train_args.etype.endswith("p") and not train_args.etype.startswith("vgg"):
            ss = train_args.subsample.split("_")
            for j in range(min(train_args.elayers + 1, len(ss))):
                subsample[j] = int(ss[j])
        else:
            logging.warning(
                "Subsampling is not performed for vgg*. "
                "It is performed in max pooling layers at CNN."
            )
        logging.info("subsample: " + " ".join([str(x) for x in subsample]))
        return subsample

    elif mode == "asr" and arch == "rnn_mix":
        subsample = np.ones(
            train_args.elayers_sd + train_args.elayers + 1, dtype=np.int64
        )
        if train_args.etype.endswith("p") and not train_args.etype.startswith("vgg"):
            ss = train_args.subsample.split("_")
            for j in range(
                min(train_args.elayers_sd + train_args.elayers + 1, len(ss))
            ):
                subsample[j] = int(ss[j])
        else:
            logging.warning(
                "Subsampling is not performed for vgg*. "
                "It is performed in max pooling layers at CNN."
            )
        logging.info("subsample: " + " ".join([str(x) for x in subsample]))
        return subsample

    elif mode == "asr" and arch == "rnn_mulenc":
        subsample_list = []
        for idx in range(train_args.num_encs):
            subsample = np.ones(train_args.elayers[idx] + 1, dtype=np.int64)
            if train_args.etype[idx].endswith("p") and not train_args.etype[
                idx
            ].startswith("vgg"):
                ss = train_args.subsample[idx].split("_")
                for j in range(min(train_args.elayers[idx] + 1, len(ss))):
                    subsample[j] = int(ss[j])
            else:
                logging.warning(
                    "Encoder %d: Subsampling is not performed for vgg*. "
                    "It is performed in max pooling layers at CNN.",
                    idx + 1,
                )
            logging.info("subsample: " + " ".join([str(x) for x in subsample]))
            subsample_list.append(subsample)
        return subsample_list

    else:
        raise ValueError("Invalid options: mode={}, arch={}".format(mode, arch))


def rename_state_dict(
    old_prefix: str, new_prefix: str, state_dict: Dict[str, torch.Tensor]
):
    """Replace keys of old prefix with new prefix in state dict."""
    # need this list not to break the dict iterator
    old_keys = [k for k in state_dict if k.startswith(old_prefix)]
    if len(old_keys) > 0:
        logging.warning(f"Rename: {old_prefix} -> {new_prefix}")
    for k in old_keys:
        v = state_dict.pop(k)
        new_k = k.replace(old_prefix, new_prefix)
        state_dict[new_k] = v


def get_activation(act):
    """Return activation function."""
    # Lazy load to avoid unused import
    from espnet.nets.pytorch_backend.conformer.swish import Swish

    activation_funcs = {
        "hardtanh": torch.nn.Hardtanh,
        "tanh": torch.nn.Tanh,
        "relu": torch.nn.ReLU,
        "selu": torch.nn.SELU,
        "swish": Swish,
    }

    return activation_funcs[act]()
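A minimal usage sketch of the masking helpers above (shapes and lengths are illustrative; it assumes the repo root is on PYTHONPATH so `look2hear.utils.nets_utils` is importable):

import torch
from look2hear.utils.nets_utils import make_non_pad_mask, mask_by_length

lengths = [5, 3, 2]
feats = torch.randn(3, 5, 8)                        # (B, Tmax, D) padded batch
valid = make_non_pad_mask(lengths, feats[..., 0])   # (B, Tmax) mask, True on valid frames
feats = feats * valid.unsqueeze(-1)                 # zero out padded frames
x = torch.arange(5).repeat(3, 1) + 1
x_masked = mask_by_length(x, lengths)               # padded positions replaced by 0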
look2hear/utils/parser_utils.py
ADDED
@@ -0,0 +1,178 @@
###
# Author: Kai Li
# Date: 2021-06-20 00:36:46
# LastEditors: Please set LastEditors
# LastEditTime: 2024-01-22 03:02:57
###
import sys
import argparse
import importlib
from omegaconf import DictConfig


def prepare_parser_from_dict(dic, parser=None):
    """Prepare an argparser from a dictionary.

    Args:
        dic (dict): Two-level config dictionary with unique bottom-level keys.
        parser (argparse.ArgumentParser, optional): If a parser already
            exists, add the keys from the dictionary on the top of it.

    Returns:
        argparse.ArgumentParser:
            Parser instance with groups corresponding to the first level keys
            and arguments corresponding to the second level keys with default
            values given by the values.
    """

    def standardized_entry_type(value):
        """If the default value is None, replace NoneType by str_int_float.
        If the default value is boolean, look for boolean strings."""
        if value is None:
            return str_int_float
        if isinstance(str2bool(value), bool):
            return str2bool_arg
        return type(value)

    if parser is None:
        parser = argparse.ArgumentParser()
    for k in dic.keys():
        group = parser.add_argument_group(k)
        if isinstance(dic[k], list):
            entry_type = standardized_entry_type(dic[k])
            group.add_argument("--" + k, default=dic[k], type=entry_type)
        elif isinstance(dic[k], dict):
            for kk in dic[k].keys():
                entry_type = standardized_entry_type(dic[k][kk])
                group.add_argument("--" + kk, default=dic[k][kk], type=entry_type)
        elif isinstance(dic[k], str):
            entry_type = standardized_entry_type(dic[k])
            group.add_argument("--" + k, default=dic[k], type=entry_type)
    return parser


def str_int_float(value):
    """Type to convert strings to int, float (in this order) if possible.

    Args:
        value (str): Value to convert.

    Returns:
        int, float, str: Converted value.
    """
    if isint(value):
        return int(value)
    if isfloat(value):
        return float(value)
    elif isinstance(value, str):
        return value


def str2bool(value):
    """Type to convert strings to Boolean (returns input if not boolean)"""
    if not isinstance(value, str):
        return value
    if value.lower() in ("yes", "true", "y", "1"):
        return True
    elif value.lower() in ("no", "false", "n", "0"):
        return False
    else:
        return value


def str2bool_arg(value):
    """Argparse type to convert strings to Boolean"""
    value = str2bool(value)
    if isinstance(value, bool):
        return value
    raise argparse.ArgumentTypeError("Boolean value expected.")


def isfloat(value):
    """Computes whether `value` can be cast to a float.

    Args:
        value (str): Value to check.

    Returns:
        bool: Whether `value` can be cast to a float.

    """
    try:
        float(value)
        return True
    except ValueError:
        return False


def isint(value):
    """Computes whether `value` can be cast to an int

    Args:
        value (str): Value to check.

    Returns:
        bool: Whether `value` can be cast to an int.

    """
    try:
        int(value)
        return True
    except ValueError:
        return False


def parse_args_as_dict(parser, return_plain_args=False, args=None):
    """Get a dict of dicts out of process `parser.parse_args()`

    Top-level keys corresponding to groups and bottom-level keys corresponding
    to arguments. Under `'main_args'`, the arguments which don't belong to a
    argparse group (i.e main arguments defined before parsing from a dict) can
    be found.

    Args:
        parser (argparse.ArgumentParser): ArgumentParser instance containing
            groups. Output of `prepare_parser_from_dict`.
        return_plain_args (bool): Whether to return the output or
            `parser.parse_args()`.
        args (list): List of arguments as read from the command line.
            Used for unit testing.

    Returns:
        dict:
            Dictionary of dictionaries containing the arguments. Optionally the
            direct output `parser.parse_args()`.
    """
    args = parser.parse_args(args=args)
    args_dic = {}
    for group in parser._action_groups:
        group_dict = {a.dest: getattr(args, a.dest, None) for a in group._group_actions}
        args_dic[group.title] = group_dict
    if sys.version_info.minor == 10:
        args_dic["main_args"] = args_dic["positional arguments"]
        del args_dic["positional arguments"]
    else:
        args_dic["main_args"] = args_dic["optional arguments"]
        del args_dic["optional arguments"]
    if return_plain_args:
        return args_dic, args
    return args_dic


def instantiate(config, **kwargs):
    if '__target__' in config:
        module_path, class_name = config['__target__'].rsplit('.', 1)
        module = importlib.import_module(module_path)
        cls = getattr(module, class_name)
        # Resolve nested configs first
        params = {}
        for key, value in config.items():
            if key != '__target__':
                if isinstance(value, DictConfig) and '__target__' in value:
                    params[key] = instantiate(value)
                else:
                    params[key] = value
        # Add the extra keyword arguments
        params.update(kwargs)
        return cls(**params)
    else:
        # For dicts without '__target__', recursively process each value
        return {k: instantiate(v, **kwargs) if isinstance(v, DictConfig) else v for k, v in config.items()}
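A hypothetical usage sketch of the parser helpers above. The config dict and override values are made up, and the example assumes a Python version (3.10 or older) whose default argparse group titles match what `parse_args_as_dict` looks up:

import argparse
from look2hear.utils.parser_utils import prepare_parser_from_dict, parse_args_as_dict

conf = {
    "datamodule": {"sample_rate": 44100, "segments": 4.0},
    "training": {"batch_size": 1, "half_lr": True},
}
parser = argparse.ArgumentParser()
parser = prepare_parser_from_dict(conf, parser=parser)
# command-line flags override the dict defaults; booleans accept "yes"/"no" etc.
arg_dic, plain_args = parse_args_as_dict(
    parser, return_plain_args=True, args=["--batch_size", "2", "--half_lr", "no"]
)
# arg_dic["training"] -> {"batch_size": 2, "half_lr": False}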
look2hear/utils/pylogger.py
ADDED
@@ -0,0 +1,54 @@
import logging
from typing import Mapping, Optional

from lightning_utilities.core.rank_zero import rank_prefixed_message, rank_zero_only


class RankedLogger(logging.LoggerAdapter):
    """A multi-GPU-friendly python command line logger."""

    def __init__(
        self,
        name: str = __name__,
        rank_zero_only: bool = False,
        extra: Optional[Mapping[str, object]] = None,
        log_file: str = "log.txt",  # name of the log file to write to
    ) -> None:
        logger = logging.getLogger(name)

        # Configure the log format
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

        # Add a file handler
        file_handler = logging.FileHandler(log_file)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

        super().__init__(logger=logger, extra=extra)
        self.rank_zero_only = rank_zero_only

    def log(self, level: int, msg: str, rank: Optional[int] = None, *args, **kwargs) -> None:
        """Delegate a log call to the underlying logger, after prefixing its message with the rank
        of the process it's being logged from. If `'rank'` is provided, then the log will only
        occur on that rank/process.

        :param level: The level to log at. Look at `logging.__init__.py` for more information.
        :param msg: The message to log.
        :param rank: The rank to log at.
        :param args: Additional args to pass to the underlying logging function.
        :param kwargs: Any additional keyword args to pass to the underlying logging function.
        """
        if self.isEnabledFor(level):
            msg, kwargs = self.process(msg, kwargs)
            current_rank = getattr(rank_zero_only, "rank", None)
            if current_rank is None:
                raise RuntimeError("The `rank_zero_only.rank` needs to be set before use")
            msg = rank_prefixed_message(msg, current_rank)
            if self.rank_zero_only:
                if current_rank == 0:
                    self.logger.log(level, msg, *args, **kwargs)
            else:
                if rank is None:
                    self.logger.log(level, msg, *args, **kwargs)
                elif current_rank == rank:
                    self.logger.log(level, msg, *args, **kwargs)
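A minimal sketch of how this logger might be used. In a Lightning run `rank_zero_only.rank` is set by the framework; it is set manually here only so the sketch is self-contained:

import logging
from lightning_utilities.core.rank_zero import rank_zero_only
from look2hear.utils.pylogger import RankedLogger

rank_zero_only.rank = 0          # normally populated by Lightning at startup
log = RankedLogger(__name__, rank_zero_only=True)   # also appends to log.txt
log.info("Instantiating model...")                   # emitted only on rank 0
log.log(logging.WARNING, "message for one process", rank=0)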
look2hear/utils/separator.py
ADDED
@@ -0,0 +1,138 @@
###
# Author: Kai Li
# Date: 2021-06-18 16:32:50
# LastEditors: Kai Li
# LastEditTime: 2021-06-19 01:02:04
###
import os
import warnings
import torch
import numpy as np
import soundfile as sf


def get_device(tensor_or_module, default=None):
    if hasattr(tensor_or_module, "device"):
        return tensor_or_module.device
    elif hasattr(tensor_or_module, "parameters"):
        return next(tensor_or_module.parameters()).device
    elif default is None:
        raise TypeError(
            f"Don't know how to get device of {type(tensor_or_module)} object"
        )
    else:
        return torch.device(default)


class Separator:
    def forward_wav(self, wav, **kwargs):
        raise NotImplementedError

    def sample_rate(self):
        raise NotImplementedError


def separate(model, wav, **kwargs):
    if isinstance(wav, np.ndarray):
        return numpy_separate(model, wav, **kwargs)
    elif isinstance(wav, torch.Tensor):
        return torch_separate(model, wav, **kwargs)
    else:
        raise ValueError(
            f"Only support filenames, numpy arrays and torch tensors, received {type(wav)}"
        )


@torch.no_grad()
def torch_separate(model: Separator, wav: torch.Tensor, **kwargs) -> torch.Tensor:
    """Core logic of `separate`."""
    if model.in_channels is not None and wav.shape[-2] != model.in_channels:
        raise RuntimeError(
            f"Model supports {model.in_channels}-channel inputs but found audio with {wav.shape[-2]} channels."
            f"Please match the number of channels."
        )
    # Handle device placement
    input_device = get_device(wav, default="cpu")
    model_device = get_device(model, default="cpu")
    wav = wav.to(model_device)
    # Forward
    separate_func = getattr(model, "forward_wav", model)
    out_wavs = separate_func(wav, **kwargs)

    # FIXME: for now this is the best we can do.
    out_wavs *= wav.abs().sum() / (out_wavs.abs().sum())

    # Back to input device (and numpy if necessary)
    out_wavs = out_wavs.to(input_device)
    return out_wavs


def numpy_separate(model: Separator, wav: np.ndarray, **kwargs) -> np.ndarray:
    """Numpy interface to `separate`."""
    wav = torch.from_numpy(wav)
    out_wavs = torch_separate(model, wav, **kwargs)
    out_wavs = out_wavs.data.numpy()
    return out_wavs


def wav_chunk_inference(model, mixture_tensor, sr=16000, target_length=12.0, hop_length=4.0, batch_size=10, n_tracks=3):
    """
    Input:
        mixture_tensor: Tensor, [nch, input_length]

    Output:
        all_target_tensor: Tensor, [nch, n_track, input_length]
    """
    batch_mixture = mixture_tensor

    # split data into segments
    batch_length = batch_mixture.shape[-1]

    session = int(sr * target_length)
    target = int(sr * target_length)
    ignore = (session - target) // 2
    hop = int(sr * hop_length)
    tr_ratio = target_length / hop_length
    if ignore > 0:
        zero_pad = torch.zeros(batch_mixture.shape[0], batch_mixture.shape[1], ignore).type(batch_mixture.type()).to(batch_mixture.device)
        batch_mixture_pad = torch.cat([zero_pad, batch_mixture, zero_pad], -1)
    else:
        batch_mixture_pad = batch_mixture
    if target - hop > 0:
        hop_pad = torch.zeros(batch_mixture.shape[0], batch_mixture.shape[1], target-hop).type(batch_mixture.type()).to(batch_mixture.device)
        batch_mixture_pad = torch.cat([hop_pad, batch_mixture_pad, hop_pad], -1)

    skip_idx = ignore + target - hop
    zero_pad = torch.zeros(batch_mixture.shape[0], batch_mixture.shape[1], session).type(batch_mixture.type()).to(batch_mixture.device)
    num_session = (batch_mixture_pad.shape[-1] - session) // hop + 2
    all_target = torch.zeros(batch_mixture_pad.shape[0], n_tracks, batch_mixture_pad.shape[1], batch_mixture_pad.shape[2]).to(batch_mixture_pad.device)
    all_input = []
    all_segment_length = []

    for i in range(num_session):
        this_input = batch_mixture_pad[:,:,i*hop:i*hop+session]
        segment_length = this_input.shape[-1]
        if segment_length < session:
            this_input = torch.cat([this_input, zero_pad[:,:,:session-segment_length]], -1)
        all_input.append(this_input)
        all_segment_length.append(segment_length)

    all_input = torch.cat(all_input, 0)
    num_batch = num_session // batch_size
    if num_session % batch_size > 0:
        num_batch += 1

    for i in range(num_batch):
        this_input = all_input[i*batch_size:(i+1)*batch_size]
        actual_batch_size = this_input.shape[0]
        with torch.no_grad():
            est_target = model(this_input)
        for j in range(actual_batch_size):
            this_est_target = est_target[j,:,:,:all_segment_length[i*batch_size+j]][:,:,ignore:ignore+target].unsqueeze(0)
            all_target[:,:,:,ignore+(i*batch_size+j)*hop:ignore+(i*batch_size+j)*hop+target] += this_est_target

    all_target = all_target[:,:,:,skip_idx:skip_idx+batch_length].contiguous() / tr_ratio

    return all_target.squeeze(0)
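A sketch of how `wav_chunk_inference` can be driven. The model here is a stand-in identity network, and the sample rate, chunk/hop lengths and track count are placeholders, not the values used by the Apollo config:

import torch
from look2hear.utils.separator import wav_chunk_inference

class DummyModel(torch.nn.Module):
    # stand-in with the expected interface: (B, nch, T) -> (B, n_tracks, nch, T)
    def forward(self, x):
        return x.unsqueeze(1)

model = DummyModel()
mixture = torch.randn(2, 44100 * 30)      # (nch, T): 30 s of stereo audio
out = wav_chunk_inference(model, mixture.unsqueeze(0), sr=44100,
                          target_length=10.0, hop_length=5.0,
                          batch_size=4, n_tracks=1)
# out: (n_tracks, nch, T); overlapping windows are summed and divided by
# target_length / hop_length to stitch the chunks back together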
look2hear/utils/stft.py
ADDED
@@ -0,0 +1,797 @@
# Copyright 2019 Jian Wu
# License: Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)

import math

import numpy as np
import torch as th
import torch.nn as nn
import torch.nn.functional as tf
import librosa.filters as filters

from typing import Optional, Tuple
from distutils.version import LooseVersion

EPSILON = float(np.finfo(np.float32).eps)
TORCH_VERSION = th.__version__

if TORCH_VERSION >= LooseVersion("1.7"):
    from torch.fft import fft as fft_func
else:
    pass


def export_jit(transform: nn.Module) -> nn.Module:
    """
    Export transform module for inference
    """
    export_out = [module for module in transform if module.exportable()]
    return nn.Sequential(*export_out)


def init_window(wnd: str, frame_len: int, device: th.device = "cpu") -> th.Tensor:
    """
    Return window coefficient
    Args:
        wnd: window name
        frame_len: length of the frame
    """

    def sqrthann(frame_len, periodic=True):
        return th.hann_window(frame_len, periodic=periodic) ** 0.5

    if wnd not in ["bartlett", "hann", "hamm", "blackman", "rect", "sqrthann"]:
        raise RuntimeError(f"Unknown window type: {wnd}")

    wnd_tpl = {
        "sqrthann": sqrthann,
        "hann": th.hann_window,
        "hamm": th.hamming_window,
        "blackman": th.blackman_window,
        "bartlett": th.bartlett_window,
        "rect": th.ones,
    }
    if wnd != "rect":
        # match with librosa
        c = wnd_tpl[wnd](frame_len, periodic=True)
    else:
        c = wnd_tpl[wnd](frame_len)
    return c.to(device)


def init_kernel(
    frame_len: int,
    frame_hop: int,
    window: th.Tensor,
    round_pow_of_two: bool = True,
    normalized: bool = False,
    inverse: bool = False,
    mode: str = "librosa",
) -> Tuple[th.Tensor, th.Tensor]:
    """
    Return STFT kernels
    Args:
        frame_len: length of the frame
        frame_hop: hop size between frames
        window: window tensor
        round_pow_of_two: if true, choose round(#power_of_two) as the FFT size
        normalized: return normalized DFT matrix
        inverse: return iDFT matrix
        mode: framing mode (librosa or kaldi)
    """
    if mode not in ["librosa", "kaldi"]:
        raise ValueError(f"Unsupported mode: {mode}")
    # FFT size: B
    if round_pow_of_two or mode == "kaldi":
        fft_size = 2 ** math.ceil(math.log2(frame_len))
    else:
        fft_size = frame_len
    # center padding window if needed
    if mode == "librosa" and fft_size != frame_len:
        lpad = (fft_size - frame_len) // 2
        window = tf.pad(window, (lpad, fft_size - frame_len - lpad))
    if normalized:
        # make K^H * K = I
        S = fft_size ** 0.5
    else:
        S = 1
    # W x B x 2
    if TORCH_VERSION >= LooseVersion("1.7"):
        K = fft_func(th.eye(fft_size) / S, dim=-1)
        K = th.stack([K.real, K.imag], dim=-1)
    else:
        I = th.stack([th.eye(fft_size), th.zeros(fft_size, fft_size)], dim=-1)
        K = th.fft(I / S, 1)
    if mode == "kaldi":
        K = K[:frame_len]
    if inverse and not normalized:
        # to make K^H * K = I
        K = K / fft_size
    # 2 x B x W
    K = th.transpose(K, 0, 2)
    # 2B x 1 x W
    K = th.reshape(K, (fft_size * 2, 1, K.shape[-1]))
    return K.to(window.device), window


def mel_filter(
    frame_len: int,
    round_pow_of_two: bool = True,
    num_bins: Optional[int] = None,
    sr: int = 16000,
    num_mels: int = 80,
    fmin: float = 0.0,
    fmax: Optional[float] = None,
    norm: bool = False,
) -> th.Tensor:
    """
    Return mel filter coefficients
    Args:
        frame_len: length of the frame
        round_pow_of_two: if true, choose round(#power_of_two) as the FFT size
        num_bins: number of the frequency bins produced by STFT
        num_mels: number of the mel bands
        fmin: lowest frequency (in Hz)
        fmax: highest frequency (in Hz)
        norm: normalize the mel filter coefficients
    """
    # FFT points
    if num_bins is None:
        N = 2 ** math.ceil(math.log2(frame_len)) if round_pow_of_two else frame_len
    else:
        N = (num_bins - 1) * 2
    # fmin & fmax
    freq_upper = sr // 2
    if fmax is None:
        fmax = freq_upper
    else:
        fmax = min(fmax + freq_upper if fmax < 0 else fmax, freq_upper)
    fmin = max(0, fmin)
    # mel filter coefficients
    mel = filters.mel(
        sr,
        N,
        n_mels=num_mels,
        fmax=fmax,
        fmin=fmin,
        htk=True,
        norm="slaney" if norm else None,
    )
    # num_mels x (N // 2 + 1)
    return th.tensor(mel, dtype=th.float32)


def speed_perturb_filter(
    src_sr: int, dst_sr: int, cutoff_ratio: float = 0.95, num_zeros: int = 64
) -> th.Tensor:
    """
    Return speed perturb filters, reference:
        https://github.com/danpovey/filtering/blob/master/lilfilter/resampler.py
    Args:
        src_sr: sample rate of the source signal
        dst_sr: sample rate of the target signal
    Return:
        weight (Tensor): coefficients of the filter
    """
    if src_sr == dst_sr:
        raise ValueError(f"src_sr should not be equal to dst_sr: {src_sr}/{dst_sr}")
    gcd = math.gcd(src_sr, dst_sr)
    src_sr = src_sr // gcd
    dst_sr = dst_sr // gcd
    if src_sr == 1 or dst_sr == 1:
        raise ValueError("do not support integer downsample/upsample")
    zeros_per_block = min(src_sr, dst_sr) * cutoff_ratio
    padding = 1 + int(num_zeros / zeros_per_block)
    # dst_sr x src_sr x K
    times = (
        np.arange(dst_sr)[:, None, None] / float(dst_sr)
        - np.arange(src_sr)[None, :, None] / float(src_sr)
        - np.arange(2 * padding + 1)[None, None, :]
        + padding
    )
    window = np.heaviside(1 - np.abs(times / padding), 0.0) * (
        0.5 + 0.5 * np.cos(times / padding * math.pi)
    )
    weight = np.sinc(times * zeros_per_block) * window * zeros_per_block / float(src_sr)
    return th.tensor(weight, dtype=th.float32)


def splice_feature(
    feats: th.Tensor, lctx: int = 1, rctx: int = 1, op: str = "cat"
) -> th.Tensor:
    """
    Splice feature
    Args:
        feats (Tensor): N x ... x T x F, original feature
        lctx: left context
        rctx: right context
        op: operator on feature context
    Return:
        splice (Tensor): feature with context padded
    """
    if lctx + rctx == 0:
        return feats
    if op not in ["cat", "stack"]:
        raise ValueError(f"Unknown op for feature splicing: {op}")
    # [N x ... x T x F, ...]
    ctx = []
    T = feats.shape[-2]
    for c in range(-lctx, rctx + 1):
        idx = th.arange(c, c + T, device=feats.device, dtype=th.int64)
        idx = th.clamp(idx, min=0, max=T - 1)
        ctx.append(th.index_select(feats, -2, idx))
    if op == "cat":
        # N x ... x T x FD
        splice = th.cat(ctx, -1)
    else:
        # N x ... x T x F x D
        splice = th.stack(ctx, -1)
    return splice


def _forward_stft(
    wav: th.Tensor,
    kernel: th.Tensor,
    window: th.Tensor,
    return_polar: bool = False,
    pre_emphasis: float = 0,
    frame_hop: int = 256,
    onesided: bool = False,
    center: bool = False,
    eps: float = EPSILON,
) -> th.Tensor:
    """
    STFT function implemented by conv1d (not efficient, but we don't care during training)
    Args:
        wav (Tensor): N x (C) x S
        kernel (Tensor): STFT transform kernels, from init_kernel(...)
        return_polar: return [magnitude; phase] Tensor or [real; imag] Tensor
        pre_emphasis: factor of preemphasis
        frame_hop: frame hop size in number samples
        onesided: return half FFT bins
        center: if true, we assumed to have centered frames
    Return:
        transform (Tensor): STFT transform results
    """
    wav_dim = wav.dim()
    if wav_dim not in [2, 3]:
        raise RuntimeError(f"STFT expect 2D/3D tensor, but got {wav_dim:d}D")
    # if N x S, reshape N x 1 x S
    # else: reshape NC x 1 x S
    N, S = wav.shape[0], wav.shape[-1]
    wav = wav.view(-1, 1, S)
    # NC x 1 x S+2P
    if center:
        pad = kernel.shape[-1] // 2
        # NOTE: match with librosa
        wav = tf.pad(wav, (pad, pad), mode="reflect")
    # STFT
    kernel = kernel * window
    if pre_emphasis > 0:
        # NC x W x T
        frames = tf.unfold(
            wav[:, None], (1, kernel.shape[-1]), stride=frame_hop, padding=0
        )
        # follow Kaldi's Preemphasize
        frames[:, 1:] = frames[:, 1:] - pre_emphasis * frames[:, :-1]
        frames[:, 0] *= 1 - pre_emphasis
        # 1 x 2B x W, NC x W x T, NC x 2B x T
        packed = th.matmul(kernel[:, 0][None, ...], frames)
    else:
        packed = tf.conv1d(wav, kernel, stride=frame_hop, padding=0)
    # NC x 2B x T => N x C x 2B x T
    if wav_dim == 3:
        packed = packed.view(N, -1, packed.shape[-2], packed.shape[-1])
    # N x (C) x B x T
    real, imag = th.chunk(packed, 2, dim=-2)
    # N x (C) x B/2+1 x T
    if onesided:
        num_bins = kernel.shape[0] // 4 + 1
        real = real[..., :num_bins, :]
        imag = imag[..., :num_bins, :]
    if return_polar:
        mag = (real ** 2 + imag ** 2 + eps) ** 0.5
        pha = th.atan2(imag, real)
        return th.stack([mag, pha], dim=-1)
    else:
        return th.stack([real, imag], dim=-1)


def _inverse_stft(
    transform: th.Tensor,
    kernel: th.Tensor,
    window: th.Tensor,
    return_polar: bool = False,
    frame_hop: int = 256,
    onesided: bool = False,
    center: bool = False,
    eps: float = EPSILON,
) -> th.Tensor:
    """
    iSTFT function implemented by conv1d
    Args:
        transform (Tensor): STFT transform results
        kernel (Tensor): STFT transform kernels, from init_kernel(...)
        return_polar (bool): keep same with the one in _forward_stft
        frame_hop: frame hop size in number samples
        onesided: return half FFT bins
        center: used in _forward_stft
    Return:
        wav (Tensor), N x S
    """
    # (N) x F x T x 2
    transform_dim = transform.dim()
    # if F x T x 2, reshape 1 x F x T x 2
    if transform_dim == 3:
        transform = th.unsqueeze(transform, 0)
    if transform_dim != 4:
        raise RuntimeError(f"Expect 4D tensor, but got {transform_dim}D")

    if return_polar:
        real = transform[..., 0] * th.cos(transform[..., 1])
        imag = transform[..., 0] * th.sin(transform[..., 1])
    else:
        real, imag = transform[..., 0], transform[..., 1]

    if onesided:
        # [self.num_bins - 2, ..., 1]
        reverse = range(kernel.shape[0] // 4 - 1, 0, -1)
        # extend matrix: N x B x T
        real = th.cat([real, real[:, reverse]], 1)
        imag = th.cat([imag, -imag[:, reverse]], 1)
    # pack: N x 2B x T
    packed = th.cat([real, imag], dim=1)
    # N x 1 x T
    wav = tf.conv_transpose1d(packed, kernel * window, stride=frame_hop, padding=0)
    # normalized audio samples
    # refer: https://github.com/pytorch/audio/blob/2ebbbf511fb1e6c47b59fd32ad7e66023fa0dff1/torchaudio/functional.py#L171
    num_frames = packed.shape[-1]
    win_length = window.shape[0]
    # W x T
    win = th.repeat_interleave(window[..., None] ** 2, num_frames, dim=-1)
    # Do OLA on windows
    # v1)
    I = th.eye(win_length, device=win.device)[:, None]
    denorm = tf.conv_transpose1d(win[None, ...], I, stride=frame_hop, padding=0)
    # v2)
    # num_samples = (num_frames - 1) * frame_hop + win_length
    # denorm = tf.fold(win[None, ...], (num_samples, 1), (win_length, 1),
    #                  stride=frame_hop)[..., 0]
    if center:
        pad = kernel.shape[-1] // 2
        wav = wav[..., pad:-pad]
        denorm = denorm[..., pad:-pad]
    wav = wav / (denorm + eps)
    # N x S
    return wav.squeeze(1)


def _pytorch_stft(
    wav: th.Tensor,
    frame_len: int,
    frame_hop: int,
    n_fft: int = 512,
    return_polar: bool = False,
    window: str = "sqrthann",
    normalized: bool = False,
    onesided: bool = True,
    center: bool = False,
    eps: float = EPSILON,
) -> th.Tensor:
    """
    Wrapper of PyTorch STFT function
    Args:
        wav (Tensor): source audio signal
        frame_len: length of the frame
        frame_hop: hop size between frames
        n_fft: number of the FFT size
        return_polar: return the results in polar coordinate
        window: window tensor
        center: same definition with the parameter in librosa.stft
        normalized: use normalized DFT kernel
        onesided: output onesided STFT
    Return:
        transform (Tensor), STFT transform results
    """
    if TORCH_VERSION < LooseVersion("1.7"):
        raise RuntimeError("Can not use this function as TORCH_VERSION < 1.7")
    wav_dim = wav.dim()
    if wav_dim not in [2, 3]:
        raise RuntimeError(f"STFT expect 2D/3D tensor, but got {wav_dim:d}D")
    # if N x C x S, reshape NC x S
    wav = wav.view(-1, wav.shape[-1])
    # STFT: N x F x T x 2
    stft = th.stft(
        wav,
        n_fft,
        hop_length=frame_hop,
        win_length=window.shape[-1],
        window=window,
        center=center,
        normalized=normalized,
        onesided=onesided,
        return_complex=False,
    )
    if wav_dim == 3:
        N, F, T, _ = stft.shape
        stft = stft.view(N, -1, F, T, 2)
    # N x (C) x F x T x 2
    if not return_polar:
        return stft
    # N x (C) x F x T
    real, imag = stft[..., 0], stft[..., 1]
    mag = (real ** 2 + imag ** 2 + eps) ** 0.5
    pha = th.atan2(imag, real)
    return th.stack([mag, pha], dim=-1)


def _pytorch_istft(
    transform: th.Tensor,
    frame_len: int,
    frame_hop: int,
    window: th.Tensor,
    n_fft: int = 512,
    return_polar: bool = False,
    normalized: bool = False,
    onesided: bool = True,
    center: bool = False,
    eps: float = EPSILON,
) -> th.Tensor:
    """
    Wrapper of PyTorch iSTFT function
    Args:
        transform (Tensor): results of STFT
        frame_len: length of the frame
        frame_hop: hop size between frames
        window: window tensor
        n_fft: number of the FFT size
        return_polar: keep same with _pytorch_stft
        center: same definition with the parameter in librosa.stft
        normalized: use normalized DFT kernel
        onesided: output onesided STFT
    Return:
        wav (Tensor): synthetic audio
    """
    if TORCH_VERSION < LooseVersion("1.7"):
        raise RuntimeError("Can not use this function as TORCH_VERSION < 1.7")

    transform_dim = transform.dim()
    # if F x T x 2, reshape 1 x F x T x 2
    if transform_dim == 3:
        transform = th.unsqueeze(transform, 0)
    if transform_dim != 4:
        raise RuntimeError(f"Expect 4D tensor, but got {transform_dim}D")

    if return_polar:
        real = transform[..., 0] * th.cos(transform[..., 1])
        imag = transform[..., 0] * th.sin(transform[..., 1])
        transform = th.stack([real, imag], -1)
    # stft is a complex tensor of PyTorch
    stft = th.view_as_complex(transform)
    # (N) x S
    wav = th.istft(
        stft,
        n_fft,
        hop_length=frame_hop,
        win_length=window.shape[-1],
        window=window,
        center=center,
        normalized=normalized,
        onesided=onesided,
        return_complex=False,
    )
    return wav


def forward_stft(
    wav: th.Tensor,
    frame_len: int,
    frame_hop: int,
    window: str = "sqrthann",
    round_pow_of_two: bool = True,
    return_polar: bool = False,
    pre_emphasis: float = 0,
    normalized: bool = False,
    onesided: bool = True,
    center: bool = False,
    mode: str = "librosa",
    eps: float = EPSILON,
) -> th.Tensor:
    """
    STFT function implementation, equals to STFT layer
    Args:
        wav: source audio signal
        frame_len: length of the frame
        frame_hop: hop size between frames
        return_polar: return [magnitude; phase] Tensor or [real; imag] Tensor
        window: window name
        center: center flag (similar with that in librosa.stft)
        round_pow_of_two: if true, choose round(#power_of_two) as the FFT size
        pre_emphasis: factor of preemphasis
        normalized: use normalized DFT kernel
        onesided: output onesided STFT
        inverse: using iDFT kernel (for iSTFT)
        mode: STFT mode, "kaldi" or "librosa" or "torch"
    Return:
        transform: results of STFT
    """
    window = init_window(window, frame_len, device=wav.device)
    if mode == "torch":
        n_fft = 2 ** math.ceil(math.log2(frame_len)) if round_pow_of_two else frame_len
        return _pytorch_stft(
            wav,
            frame_len,
            frame_hop,
            n_fft=n_fft,
            return_polar=return_polar,
            window=window,
            normalized=normalized,
            onesided=onesided,
            center=center,
            eps=eps,
        )
    else:
        kernel, window = init_kernel(
            frame_len,
            frame_hop,
            window=window,
            round_pow_of_two=round_pow_of_two,
            normalized=normalized,
            inverse=False,
            mode=mode,
        )
        return _forward_stft(
            wav,
            kernel,
            window,
            return_polar=return_polar,
            frame_hop=frame_hop,
            pre_emphasis=pre_emphasis,
            onesided=onesided,
            center=center,
            eps=eps,
        )


def inverse_stft(
    transform: th.Tensor,
    frame_len: int,
    frame_hop: int,
    return_polar: bool = False,
    window: str = "sqrthann",
    round_pow_of_two: bool = True,
    normalized: bool = False,
    onesided: bool = True,
    center: bool = False,
    mode: str = "librosa",
    eps: float = EPSILON,
) -> th.Tensor:
    """
    iSTFT function implementation, equals to iSTFT layer
    Args:
        transform: results of STFT
        frame_len: length of the frame
        frame_hop: hop size between frames
        return_polar: keep same with function forward_stft(...)
        window: window name
        center: center flag (similar with that in librosa.stft)
        round_pow_of_two: if true, choose round(#power_of_two) as the FFT size
        normalized: use normalized DFT kernel
        onesided: output onesided STFT
        mode: STFT mode, "kaldi" or "librosa" or "torch"
    Return:
        wav: synthetic signals
    """
    window = init_window(window, frame_len, device=transform.device)
    if mode == "torch":
        n_fft = 2 ** math.ceil(math.log2(frame_len)) if round_pow_of_two else frame_len
        return _pytorch_istft(
            transform,
            frame_len,
            frame_hop,
            n_fft=n_fft,
            return_polar=return_polar,
            window=window,
            normalized=normalized,
            onesided=onesided,
            center=center,
            eps=eps,
        )
    else:
        kernel, window = init_kernel(
            frame_len,
            frame_hop,
            window,
            round_pow_of_two=round_pow_of_two,
            normalized=normalized,
            inverse=True,
            mode=mode,
        )
        return _inverse_stft(
            transform,
            kernel,
            window,
            return_polar=return_polar,
            frame_hop=frame_hop,
            onesided=onesided,
            center=center,
            eps=eps,
        )


class STFTBase(nn.Module):
    """
    Base layer for (i)STFT
    Args:
        frame_len: length of the frame
        frame_hop: hop size between frames
        window: window name
        center: center flag (similar with that in librosa.stft)
        round_pow_of_two: if true, choose round(#power_of_two) as the FFT size
        normalized: use normalized DFT kernel
        pre_emphasis: factor of preemphasis
        mode: STFT mode, "kaldi" or "librosa" or "torch"
        onesided: output onesided STFT
        inverse: using iDFT kernel (for iSTFT)
    """

    def __init__(
        self,
        frame_len: int,
        frame_hop: int,
        window: str = "sqrthann",
        round_pow_of_two: bool = True,
        normalized: bool = False,
        pre_emphasis: float = 0,
        onesided: bool = True,
        inverse: bool = False,
        center: bool = False,
        mode: str = "librosa",
    ) -> None:
        super(STFTBase, self).__init__()
        if mode != "torch":
            K, w = init_kernel(
                frame_len,
                frame_hop,
                init_window(window, frame_len),
                round_pow_of_two=round_pow_of_two,
                normalized=normalized,
                inverse=inverse,
                mode=mode,
            )
            self.K = nn.Parameter(K, requires_grad=False)
            self.w = nn.Parameter(w, requires_grad=False)
            self.num_bins = self.K.shape[0] // 4 + 1
            self.pre_emphasis = pre_emphasis
            self.win_length = self.K.shape[2]
        else:
            self.K = None
            w = init_window(window, frame_len)
            self.w = nn.Parameter(w, requires_grad=False)
            fft_size = (
                2 ** math.ceil(math.log2(frame_len)) if round_pow_of_two else frame_len
            )
            self.num_bins = fft_size // 2 + 1
            self.pre_emphasis = 0
            self.win_length = fft_size
        self.frame_len = frame_len
        self.frame_hop = frame_hop
        self.window = window
        self.normalized = normalized
        self.onesided = onesided
        self.center = center
        self.mode = mode

    def num_frames(self, wav_len: th.Tensor) -> th.Tensor:
        """
        Compute number of the frames
        """
        assert th.sum(wav_len <= self.win_length) == 0
        if self.center:
            wav_len += self.win_length
        return (
            th.div(wav_len - self.win_length, self.frame_hop, rounding_mode="trunc") + 1
        )

    def extra_repr(self) -> str:
        str_repr = (
            f"num_bins={self.num_bins}, win_length={self.win_length}, "
            + f"stride={self.frame_hop}, window={self.window}, "
            + f"center={self.center}, mode={self.mode}"
        )
        if not self.onesided:
            str_repr += f", onesided={self.onesided}"
        if self.pre_emphasis > 0:
            str_repr += f", pre_emphasis={self.pre_emphasis}"
        if self.normalized:
            str_repr += f", normalized={self.normalized}"
        return str_repr


class STFT(STFTBase):
    """
    Short-time Fourier Transform as a Layer
    """

    def __init__(self, *args, **kwargs):
        super(STFT, self).__init__(*args, inverse=False, **kwargs)

    def forward(
        self, wav: th.Tensor, return_polar: bool = False, eps: float = EPSILON
    ) -> th.Tensor:
        """
        Accept (single or multiple channel) raw waveform and output magnitude and phase
        Args
            wav (Tensor) input signal, N x (C) x S
        Return
            transform (Tensor), N x (C) x F x T x 2
        """
        if self.mode == "torch":
            return _pytorch_stft(
                wav,
                self.frame_len,
                self.frame_hop,
                n_fft=(self.num_bins - 1) * 2,
                return_polar=return_polar,
                window=self.w,
                normalized=self.normalized,
                onesided=self.onesided,
                center=self.center,
                eps=eps,
            )
        else:
            return _forward_stft(
                wav,
                self.K,
                self.w,
                return_polar=return_polar,
                frame_hop=self.frame_hop,
                pre_emphasis=self.pre_emphasis,
                onesided=self.onesided,
                center=self.center,
                eps=eps,
            )


class iSTFT(STFTBase):
    """
    Inverse Short-time Fourier Transform as a Layer
    """

    def __init__(self, *args, **kwargs):
        super(iSTFT, self).__init__(*args, inverse=True, **kwargs)

    def forward(
        self, transform: th.Tensor, return_polar: bool = False, eps: float = EPSILON
    ) -> th.Tensor:
        """
        Accept phase & magnitude and output raw waveform
        Args
            transform (Tensor): STFT output, N x F x T x 2
        Return
            s (Tensor): N x S
        """
        if self.mode == "torch":
            return _pytorch_istft(
                transform,
                self.frame_len,
                self.frame_hop,
                n_fft=(self.num_bins - 1) * 2,
                return_polar=return_polar,
                window=self.w,
                normalized=self.normalized,
                onesided=self.onesided,
                center=self.center,
                eps=eps,
            )
        else:
            return _inverse_stft(
                transform,
                self.K,
                self.w,
                return_polar=return_polar,
                frame_hop=self.frame_hop,
                onesided=self.onesided,
                center=self.center,
                eps=eps,
            )
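A short analysis/synthesis round-trip sketch of the STFT/iSTFT layers above (window and frame parameters are illustrative, not the values used by the Apollo config):

import torch as th
from look2hear.utils.stft import STFT, iSTFT

stft = STFT(frame_len=1024, frame_hop=256, window="hann", center=True)
istft = iSTFT(frame_len=1024, frame_hop=256, window="hann", center=True)

wav = th.randn(2, 16000)               # N x S batch of waveforms
spec = stft(wav, return_polar=False)   # N x F x T x 2 (real/imag)
rec = istft(spec, return_polar=False)  # N x S' reconstruction via overlap-add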
look2hear/utils/torch_utils.py
ADDED
@@ -0,0 +1,49 @@
###
# Author: Kai Li
# Date: 2021-06-18 17:29:21
# LastEditors: Kai Li
# LastEditTime: 2021-06-21 23:52:52
###

import torch
import torch.nn as nn


def pad_x_to_y(x, y, axis: int = -1):
    if axis != -1:
        raise NotImplementedError
    inp_len = y.shape[axis]
    output_len = x.shape[axis]
    return nn.functional.pad(x, [0, inp_len - output_len])


def shape_reconstructed(reconstructed, size):
    if len(size) == 1:
        return reconstructed.squeeze(0)
    return reconstructed


def tensors_to_device(tensors, device):
    """Transfer tensor, dict or list of tensors to device.

    Args:
        tensors (:class:`torch.Tensor`): May be a single, a list or a
            dictionary of tensors.
        device (:class: `torch.device`): the device where to place the tensors.

    Returns:
        Union [:class:`torch.Tensor`, list, tuple, dict]:
            Same as input but transferred to device.
            Goes through lists and dicts and transfers the torch.Tensor to
            device. Leaves the rest untouched.
    """
    if isinstance(tensors, torch.Tensor):
        return tensors.to(device)
    elif isinstance(tensors, (list, tuple)):
        return [tensors_to_device(tens, device) for tens in tensors]
    elif isinstance(tensors, dict):
        for key in tensors.keys():
            tensors[key] = tensors_to_device(tensors[key], device)
        return tensors
    else:
        return tensors
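A brief sketch of the two most commonly used helpers above (shapes and device string are arbitrary):

import torch
from look2hear.utils.torch_utils import pad_x_to_y, tensors_to_device

est = torch.randn(1, 15872)   # model output, slightly shorter than the reference
ref = torch.randn(1, 16000)
est = pad_x_to_y(est, ref)    # zero-pad est along the last axis to match ref

batch = {"mixture": torch.randn(1, 16000), "lengths": [16000]}
batch = tensors_to_device(batch, "cpu")  # moves only the tensors, leaves the list alone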
requirements.txt
ADDED
@@ -0,0 +1,11 @@
torchaudio==2.2.0
torch==2.2.0
huggingface
huggingface_hub
numpy<2.0
omegaconf
ml_collections
librosa
gradio
tqdm
spaces
weights/apollo.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:99d9af7f1ff20e63c393035513a655392818d66b4d7fc23d658175c1f15e8d76
size 66541845