Commit 557fb53 · Parent: e82ec2b
Refactor config style and reorganize files
- .gitignore +1 -0
- TODO.md +5 -2
- environment.yml +5 -0
- models/audio_spectrogram_transformer.py +117 -76
- models/config/decision_tree.yaml +47 -0
- models/config/train.yaml +5 -5
- models/config/train_local.yaml +47 -36
- models/decision_tree.py +124 -37
- models/residual.py +82 -86
- models/training_environment.py +90 -0
- models/utils.py +47 -20
- models/wav2vec2.py +84 -0
- preprocessing/dataset.py +230 -198
- preprocessing/pipelines.py +56 -42
- preprocessing/preprocess.py +66 -44
- tests.py +0 -22
- tests/test_datasets.py +17 -0
- tests/test_pipelines.py +13 -0
- tests/utils.py +7 -0
- train.py +9 -176
.gitignore
CHANGED
@@ -9,3 +9,4 @@ lightning_logs
 .lr_find_*
 .cache
 .vscode
+models/weights/ast
TODO.md
CHANGED
@@ -6,10 +6,13 @@
 - Create an attention-based network
 - ✅ Increase parameter count in network
 - Verify that labels really match what is on the music4dance site
-- Read the Medium series about audio DL
+- ✅ Read the Medium series about audio DL
 - double check \_rectify_duration
 - ✅ Filter out songs that have only one vote
+- ✅ Download songs from [Best Ballroom](https://www.youtube.com/channel/UC0bYSnzAFMwPiEjmVsrvmRg)
+
+- ✅ fix nan values
 
 ## Notes
 
-2xM60 insufficient memory.
+2xM60 insufficient memory for the AST.
environment.yml
CHANGED
@@ -23,6 +23,11 @@ dependencies:
   - scikit-learn
   - tensorboard
   - transformers
+  - accelerate
+  - pytest
+
   - pip:
       - evaluate
      - wakepy
+      - soundfile
+      - youtube_dl
models/audio_spectrogram_transformer.py
CHANGED
@@ -1,93 +1,138 @@
+from typing import Any
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from transformers import (
+    AutoFeatureExtractor,
+    AutoModelForAudioClassification,
+    TrainingArguments,
+    Trainer,
+    ASTConfig,
+    ASTFeatureExtractor,
+    ASTForAudioClassification,
+)
 import torch
 from torch import nn
-import numpy as np
+from models.training_environment import TrainingEnvironment
+from preprocessing.pipelines import WaveformTrainingPipeline
 
+from preprocessing.dataset import (
+    DanceDataModule,
+    HuggingFaceDatasetWrapper,
+    get_datasets,
+)
+from preprocessing.dataset import get_music4dance_examples
+from .utils import get_id_label_mapping, compute_hf_metrics
 
+import pytorch_lightning as pl
+from pytorch_lightning import callbacks as cb
 
+MODEL_CHECKPOINT = "MIT/ast-finetuned-audioset-10-10-0.4593"
 
 
+class AST(nn.Module):
+    def __init__(self, labels, *args, **kwargs) -> None:
         super().__init__(*args, **kwargs)
         id2label, label2id = get_id_label_mapping(labels)
-            ignore_mismatched_sizes=True
-        )
-        self.sample_rate = sample_rate
-
-        self.bpm_model = nn.Sequential(
-            nn.Linear(len(labels), 100),
-            nn.Linear(100, 50)
-        )
-
-        out_dim = 50 # TODO: Calculate output dimension
-        self.classifier = nn.Sequential(
-            nn.Linear(out_dim, 100),
-            nn.Linear(100, len(labels))
+        config = ASTConfig(
+            hidden_size=300,
+            num_attention_heads=5,
+            num_hidden_layers=3,
+            id2label=id2label,
+            label2id=label2id,
+            num_labels=len(label2id),
+            ignore_mismatched_sizes=True,
         )
+        self.model = ASTForAudioClassification(config)
 
-    def vectorize_bpm(self, waveform):
-        pass
-
-    def forward(self, audio):
-        bpm_vector = self.vectorize_bpm(audio)
-        bpm_out = self.bpm_model(bpm_vector)
-
-        spectrogram = self.ast_feature_extractor(audio)
-        ast_out = self.ast_model(spectrogram)
-
-        # Late fusion
-        z = torch.cat([ast_out, bpm_out]) # Which dimension?
-        return self.classifier(z)
-
-
-def compute_metrics(eval_pred):
-    predictions = np.argmax(eval_pred.predictions, axis=1)
-    return accuracy.compute(predictions=predictions, references=eval_pred.label_ids)
+    def forward(self, x):
+        return self.model(x).logits
 
 
+class ASTExtractorWrapper:
+    def __init__(self, sampling_rate=16000, return_tensors="pt") -> None:
+        self.extractor = ASTFeatureExtractor()
+        self.sampling_rate = sampling_rate
+        self.return_tensors = return_tensors
+        self.waveform_pipeline = WaveformTrainingPipeline()  # TODO configure from yaml
+
+    def __call__(self, x) -> Any:
+        x = self.waveform_pipeline(x)
+        device = x.device
+        x = x.squeeze(0).numpy()
+        x = self.extractor(
+            x, return_tensors=self.return_tensors, sampling_rate=self.sampling_rate
+        )
+        return x["input_values"].squeeze(0).to(device)
+
+
+def train_lightning_ast(config: dict):
+    """
+    work on integration between waveform dataset and environment. Should work for both HF and PTL.
+    """
+    TARGET_CLASSES = config["dance_ids"]
+    DEVICE = config["device"]
+    SEED = config["seed"]
+    pl.seed_everything(SEED, workers=True)
+    feature_extractor = ASTExtractorWrapper()
+    dataset = get_datasets(config["datasets"], feature_extractor)
+    data = DanceDataModule(
+        dataset,
+        target_classes=TARGET_CLASSES,
+        **config["data_module"],
+    )
+
+    model = AST(TARGET_CLASSES).to(DEVICE)
+    label_weights = data.get_label_weights().to(DEVICE)
+    criterion = nn.CrossEntropyLoss(
+        label_weights
+    )  # LabelWeightedBCELoss(label_weights)
+    train_env = TrainingEnvironment(model, criterion, config)
+    callbacks = [
+        # cb.LearningRateFinder(update_attr=True),
+        cb.EarlyStopping("val/loss", patience=5),
+        cb.RichProgressBar(),
+    ]
+    trainer = pl.Trainer(callbacks=callbacks, **config["trainer"])
+    trainer.fit(train_env, datamodule=data)
+    trainer.test(train_env, datamodule=data)
+
+
+def train_huggingface_ast(config: dict):
+    TARGET_CLASSES = config["dance_ids"]
+    DEVICE = config["device"]
+    SEED = config["seed"]
+    OUTPUT_DIR = "models/weights/ast"
+    batch_size = config["data_module"]["batch_size"]
+    epochs = config["data_module"]["min_epochs"]
+    test_proportion = config["data_module"].get("test_proportion", 0.2)
+    pl.seed_everything(SEED, workers=True)
+    dataset = get_datasets(config["datasets"])
+    hf_dataset = HuggingFaceDatasetWrapper(dataset)
+    id2label, label2id = get_id_label_mapping(TARGET_CLASSES)
     model_checkpoint = "MIT/ast-finetuned-audioset-10-10-0.4593"
     feature_extractor = AutoFeatureExtractor.from_pretrained(model_checkpoint)
-    preprocess_waveform = lambda wf
+    preprocess_waveform = lambda wf: feature_extractor(
+        wf,
+        sampling_rate=train_ds.resample_frequency,
+        # padding="max_length",
+        # return_tensors="pt",
+    )
+    hf_dataset.append_to_pipeline(preprocess_waveform)
+    test_proportion = config["data_module"]["test_proportion"]
+    train_proporition = 1 - test_proportion
+    train_ds, test_ds = torch.utils.data.random_split(
+        hf_dataset, [train_proporition, test_proportion]
+    )
 
     model = AutoModelForAudioClassification.from_pretrained(
+        model_checkpoint,
+        num_labels=len(TARGET_CLASSES),
+        label2id=label2id,
+        id2label=id2label,
+        ignore_mismatched_sizes=True,
-    ).to(
+    ).to(DEVICE)
     training_args = TrainingArguments(
-        output_dir=
+        output_dir=OUTPUT_DIR,
         evaluation_strategy="epoch",
         save_strategy="epoch",
         learning_rate=5e-5,
@@ -100,7 +145,7 @@ def train(
         load_best_model_at_end=True,
         metric_for_best_model="accuracy",
         push_to_hub=False,
-        use_mps_device=
+        use_mps_device=DEVICE == "mps",
     )
 
     trainer = Trainer(
@@ -109,11 +154,7 @@ def train(
         train_dataset=train_ds,
        eval_dataset=test_ds,
         tokenizer=feature_extractor,
-        compute_metrics=
+        compute_metrics=compute_hf_metrics,
     )
     trainer.train()
     return model
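
A minimal driver sketch (not part of this commit) showing how the new Lightning entry point can be fed one of the YAML configs under models/config/. It only assumes the keys that train_lightning_ast actually reads (dance_ids, device, seed, datasets, data_module, trainer):

import yaml

from models.audio_spectrogram_transformer import train_lightning_ast

if __name__ == "__main__":
    # Load the flat-style local config and hand the whole dict to the entry point.
    with open("models/config/train_local.yaml") as f:
        config = yaml.safe_load(f)
    train_lightning_ast(config)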
models/config/decision_tree.yaml
ADDED
@@ -0,0 +1,47 @@
global:
  id: decision_tree
  device: mps
  seed: 42
  dance_ids:
    - ATN
    - BCH
    - CHA
    - ECS
    - HST
    - JIV
    - QST
    - RMB
    - SFT
    - SLS
    - SMB
    - SWZ
    - TGO
    - VWZ
    - WCS
data_module:
  song_data_path: data/songs_cleaned.csv
  song_audio_path: data/samples
  batch_size: 32
  num_workers: 7
  min_votes: 1
  dataset_kwargs:
    audio_window_duration: 6
    audio_window_jitter: 1.5
    audio_pipeline_kwargs:
      mask_count: 0 # Don't mask the data
      snr_mean: 15.0 # Pretty much eliminate the noise
      freq_mask_size: 10
      time_mask_size: 80

trainer:
  log_every_n_steps: 15
  accelerator: gpu
  max_epochs: 50
  min_epochs: 5
  fast_dev_run: False
  # gradient_clip_val: 0.5
  # overfit_batches: 1
training_environment:
  learning_rate: 0.00053
model:
  n_channels: 128
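
A small illustration (not from the repo) of the two config styles this commit leaves in place: decision_tree.yaml nests run-level settings under a global: block, so the decision-tree trainer reads config["global"]["dance_ids"], while the flat train_local.yaml style exposes dance_ids at the top level.

import yaml

# Nested style introduced by this file.
with open("models/config/decision_tree.yaml") as f:
    tree_config = yaml.safe_load(f)

print(tree_config["global"]["device"])          # "mps"
print(len(tree_config["global"]["dance_ids"]))  # 15 dance classes
print(tree_config["trainer"]["min_epochs"])     # 5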
models/config/train.yaml
CHANGED
@@ -27,11 +27,11 @@ data_module:
   dataset_kwargs:
     audio_window_duration: 6
     audio_window_jitter: 1.5
-    audio_pipeline_kwargs:
+    # audio_pipeline_kwargs:
+    #   mask_count: 0 # Don't mask the data
+    #   snr_mean: 15.0 # Pretty much eliminate the noise
+    #   freq_mask_size: 10
+    #   time_mask_size: 80
 
 trainer:
   log_every_n_steps: 15
models/config/train_local.yaml
CHANGED
@@ -1,47 +1,58 @@
-  - VWZ
-  - WCS
+training_fn: audio_spectrogram_transformer.train_lightning_ast
+device: mps
+seed: 42
+dance_ids: &dance_ids
+  - BCH
+  - CHA
+  - JIV
+  - ECS
+  - QST
+  - RMB
+  - SFT
+  - SLS
+  - SMB
+  - SWZ
+  - TGO
+  - VWZ
+  - WCS
+
 data_module:
+  batch_size: 64
+  num_workers: 10
+  test_proportion: 0.2
+
+datasets:
+  preprocessing.dataset.BestBallroomDataset:
+    audio_dir: data/ballroom-songs
+    class_list: *dance_ids
+    audio_window_jitter: 0.7
+
+  preprocessing.dataset.Music4DanceDataset:
+    song_data_path: data/songs_cleaned.csv
+    song_audio_path: data/samples # data/samples
+    class_list: *dance_ids
+    multi_label: False
+    min_votes: 1
+    audio_window_jitter: 0.7
+
+model:
+  n_channels: 128
+
+feature_extractor:
+  mask_count: 0 # Don't mask the data
+  snr_mean: 15.0 # Pretty much eliminate the noise
+  freq_mask_size: 10
+  time_mask_size: 80
 
 trainer:
   log_every_n_steps: 15
   accelerator: gpu
   max_epochs: 50
-  min_epochs:
+  min_epochs: 7
   fast_dev_run: False
   # gradient_clip_val: 0.5
   # overfit_batches: 1
+
 training_environment:
   learning_rate: 0.00053
-  n_channels: 128
+  log_spectrograms: False
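
The new training_fn key names a module-qualified training function. train.py shrank from 176 lines to a handful in this commit but is not shown here, so the dispatch sketch below is only a guess at the idea, not the committed code; the "models." package prefix in particular is an assumption.

import importlib

import yaml


def run(config_path: str):
    # Resolve e.g. "audio_spectrogram_transformer.train_lightning_ast"
    # and call it with the full config dict.
    with open(config_path) as f:
        config = yaml.safe_load(f)
    module_name, fn_name = config["training_fn"].rsplit(".", 1)
    train_fn = getattr(importlib.import_module(f"models.{module_name}"), fn_name)
    return train_fn(config)


if __name__ == "__main__":
    run("models/config/train_local.yaml")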
models/decision_tree.py
CHANGED
@@ -1,3 +1,4 @@
+import pytorch_lightning as pl
 from sklearn.base import ClassifierMixin, BaseEstimator
 import pandas as pd
 from torch import nn
@@ -5,8 +6,14 @@ import torch
 from typing import Iterator
 import numpy as np
 import json
+from torch.utils.data import random_split
 from tqdm import tqdm
 import librosa
+from joblib import dump, load
+from os import path
+import os
+
+from preprocessing.dataset import get_music4dance_examples
 
 DANCE_INFO_FILE = "data/dance_info.csv"
 dance_info_df = pd.read_csv(
@@ -24,9 +31,8 @@ class DanceTreeClassifier(BaseEstimator, ClassifierMixin):
     - BPM
     """
 
-    def __init__(self, device="cpu", lr=1e-4,
+    def __init__(self, device="cpu", lr=1e-4, verbose=True) -> None:
         self.device = device
-        self.epochs = epochs
         self.verbose = verbose
         self.lr = lr
         self.classifiers = {}
@@ -44,41 +50,40 @@ class DanceTreeClassifier(BaseEstimator, ClassifierMixin):
         x: (specs, bpms). The first element is the spectrogram, second element is the bpm. spec shape should be (channel, freq_bins, sr * time)
         y: (batch_size, n_classes)
         """
-                self.optimizers[dance] = torch.optim.Adam(
-                    classifier.parameters(), lr=self.lr
-                )
-            models = [
-                (dance, model, self.optimizers[dance])
-                for dance, model in self.classifiers.items()
-                if dance in matching_dances
-            ]
-            for model_i, (dance, model, opt) in enumerate(models):
-                opt.zero_grad()
-                output = model(spec)
-                target = torch.tensor([float(dance == label)], device=self.device)
-                loss = self.criterion(output, target)
-                epoch_loss += loss.item()
-                pred_count += 1
-                loss.backward()
-                opt.step()
-                progress_bar.set_description(
-                    f"Loss: {epoch_loss / pred_count}, Step: {step}, Model: {model_i+1}/{len(models)}"
-                )
+        epoch_loss = 0
+        pred_count = 0
+        data_loader = zip(x, y)
+        if self.verbose:
+            data_loader = tqdm(data_loader, total=len(y))
+        for (spec, bpm), label in data_loader:
+            # find all models that are in the bpm range
+            matching_dances = self.get_valid_dances_from_bpm(bpm)
+            spec = torch.from_numpy(spec).to(self.device)
+            for dance in matching_dances:
+                if dance not in self.classifiers or dance not in self.optimizers:
+                    classifier = DanceCNN().to(self.device)
+                    self.classifiers[dance] = classifier
+                    self.optimizers[dance] = torch.optim.Adam(
+                        classifier.parameters(), lr=self.lr
+                    )
+            models = [
+                (dance, model, self.optimizers[dance])
+                for dance, model in self.classifiers.items()
+                if dance in matching_dances
+            ]
+            for model_i, (dance, model, opt) in enumerate(models, start=1):
+                opt.zero_grad()
+                output = model(spec)
+                target = torch.tensor([float(dance == label)], device=self.device)
+                loss = self.criterion(output, target)
+                epoch_loss += loss.item()
+                pred_count += 1
+                loss.backward()
+                if self.verbose:
+                    data_loader.set_description(
+                        f"model: {model_i}/{len(models)}, loss: {loss.item()}"
+                    )
+                opt.step()
 
     def predict(self, x) -> list[str]:
         results = []
@@ -90,6 +95,52 @@ class DanceTreeClassifier(BaseEstimator, ClassifierMixin):
             results.append(matching_dances[dance_i])
         return results
 
+    def save(self, folder: str):
+        # Create a folder
+        classifier_path = path.join(folder, "classifier")
+        os.makedirs(classifier_path, exist_ok=True)
+
+        # Swap out model reference
+        classifiers = self.classifiers
+        optimizers = self.optimizers
+        criterion = self.criterion
+
+        self.classifiers = None
+        self.optimizers = None
+        self.criterion = None
+
+        # Save the Pth models
+        for dance, classifier in classifiers.items():
+            torch.save(
+                classifier.state_dict(), path.join(classifier_path, dance + ".pth")
+            )
+
+        # Save the Sklearn model
+        dump(path.join(folder, "sklearn.joblib"))
+
+        # Reload values
+        self.classifiers = classifiers
+        self.optimizers = optimizers
+        self.criterion = criterion
+
+    @staticmethod
+    def from_config(folder: str, device="cpu") -> "DanceTreeClassifier":
+        # load in weights
+        model_paths = (
+            p for p in os.listdir(path.join(folder, "classifier")) if p.endswith("pth")
+        )
+        classifiers = {}
+        for model_path in model_paths:
+            dance = model_path.split(".")[0]
+            model = DanceCNN().to(device)
+            model.load_state_dict(
+                torch.load(path.join(folder, "classifier", model_path))
+            )
+            classifiers[dance] = model
+        wrapper = load(path.join(folder, "sklearn.joblib"))
+        wrapper.classifiers = classifiers
+        return wrapper
+
 
 class DanceCNN(nn.Module):
     def __init__(self, sr=16000, freq_bins=20, duration=6, *args, **kwargs) -> None:
@@ -136,7 +187,6 @@ def features_from_path(
     num_frames = audio_window_duration * sr
     tempo, _ = librosa.beat.beat_track(y=waveform, sr=sr)
     spec = librosa.feature.melspectrogram(y=waveform, sr=sr)
-    mfccs = librosa.feature.mfcc(y=waveform, sr=sr, n_mfcc=20)
     spec_normalized = (spec - spec.mean()) / spec.std()
     spec_padded = librosa.util.fix_length(
         spec_normalized, size=sr * audio_duration, axis=1
@@ -145,3 +195,40 @@ def features_from_path(
     for i in range(audio_duration // audio_window_duration):
         spec_window = batched_spec[:, :, i * num_frames : (i + 1) * num_frames]
         yield (spec_window, tempo)
+
+
+def train_decision_tree(config: dict):
+    TARGET_CLASSES = config["global"]["dance_ids"]
+    DEVICE = config["global"]["device"]
+    SEED = config["global"]["seed"]
+    SEED = config["global"]["seed"]
+    EPOCHS = config["trainer"]["min_epochs"]
+    song_data_path = config["data_module"]["song_data_path"]
+    song_audio_path = config["data_module"]["song_audio_path"]
+    pl.seed_everything(SEED, workers=True)
+
+    df = pd.read_csv(song_data_path)
+    x, y = get_music4dance_examples(
+        df, song_audio_path, class_list=TARGET_CLASSES, multi_label=True
+    )
+    # Convert y back to string classes
+    y = np.array(TARGET_CLASSES)[y.argmax(-1)]
+    train_i, test_i = random_split(
+        np.arange(len(x)), [0.1, 0.9]
+    )  # Temporary to test efficacy
+    train_paths, train_y = x[train_i], y[train_i]
+    model = DanceTreeClassifier(device=DEVICE)
+    for epoch in tqdm(range(1, EPOCHS + 1)):
+        # Shuffle the data
+        i = np.arange(len(train_paths))
+        np.random.shuffle(i)
+        train_paths = train_paths[i]
+        train_y = train_y[i]
+        train_x = features_from_path(train_paths)
+        model.fit(train_x, train_y)
+
+    # evaluate the model
+    preds = model.predict(x[test_i])
+    accuracy = (preds == y[test_i]).mean()
+    print(f"{accuracy=}")
+    model.save("models/weights/decision_tree")
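
A hypothetical inference sketch for the saved decision tree. It reuses the folder written by model.save above; the input format for predict() is inferred from the evaluation block (an array of audio paths), which the diff does not document explicitly, so treat it as an assumption.

import numpy as np

from models.decision_tree import DanceTreeClassifier

model = DanceTreeClassifier.from_config("models/weights/decision_tree", device="cpu")
audio_paths = np.array(["data/samples/example_song.wav"])  # hypothetical file
print(model.predict(audio_paths))  # e.g. ["CHA"]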
models/residual.py
CHANGED
@@ -1,18 +1,25 @@
+import pytorch_lightning as pl
+from pytorch_lightning import callbacks as cb
 import torch
+from torch import nn
 import torch.nn as nn
 import torch.nn.functional as F
 
-import pytorch_lightning as pl
 import numpy as np
 import torchaudio
 import yaml
-from .
-from preprocessing.
+from models.training_environment import TrainingEnvironment
+from preprocessing.dataset import DanceDataModule, get_datasets
+from preprocessing.pipelines import (
+    SpectrogramTrainingPipeline,
+    WaveformPreprocessing,
+)
 
 # Architecture based on: https://github.com/minzwon/sota-music-tagging-models/blob/36aa13b7205ff156cf4dcab60fd69957da453151/training/model.py
 
+
 class ResidualDancer(nn.Module):
-    def __init__(self,n_channels=128, n_classes=50):
+    def __init__(self, n_channels=128, n_classes=50):
         super().__init__()
 
         self.n_channels = n_channels
@@ -25,17 +32,17 @@ class ResidualDancer(nn.Module):
         self.res_layers = nn.Sequential(
             ResBlock(1, n_channels, stride=2),
             ResBlock(n_channels, n_channels, stride=2),
-            ResBlock(n_channels, n_channels*2, stride=2),
-            ResBlock(n_channels*2, n_channels*2, stride=2),
-            ResBlock(n_channels*2, n_channels*2, stride=2),
-            ResBlock(n_channels*2, n_channels*2, stride=2),
-            ResBlock(n_channels*2, n_channels*4, stride=2)
+            ResBlock(n_channels, n_channels * 2, stride=2),
+            ResBlock(n_channels * 2, n_channels * 2, stride=2),
+            ResBlock(n_channels * 2, n_channels * 2, stride=2),
+            ResBlock(n_channels * 2, n_channels * 2, stride=2),
+            ResBlock(n_channels * 2, n_channels * 4, stride=2),
         )
 
         # Dense
-        self.dense1 = nn.Linear(n_channels*4, n_channels*4)
-        self.bn = nn.BatchNorm1d(n_channels*4)
-        self.dense2 = nn.Linear(n_channels*4, n_classes)
+        self.dense1 = nn.Linear(n_channels * 4, n_channels * 4)
+        self.bn = nn.BatchNorm1d(n_channels * 4)
+        self.dense2 = nn.Linear(n_channels * 4, n_classes)
         self.dropout = nn.Dropout(0.2)
 
     def forward(self, x):
@@ -56,24 +63,34 @@ class ResidualDancer(nn.Module):
         x = F.relu(x)
         x = self.dropout(x)
         x = self.dense2(x)
-        x = nn.Sigmoid()(x)
+        # x = nn.Sigmoid()(x)
 
         return x
 
 
 class ResBlock(nn.Module):
     def __init__(self, input_channels, output_channels, shape=3, stride=2):
         super().__init__()
         # convolution
+        self.conv_1 = nn.Conv2d(
+            input_channels, output_channels, shape, stride=stride, padding=shape // 2
+        )
         self.bn_1 = nn.BatchNorm2d(output_channels)
+        self.conv_2 = nn.Conv2d(
+            output_channels, output_channels, shape, padding=shape // 2
+        )
         self.bn_2 = nn.BatchNorm2d(output_channels)
 
         # residual
         self.diff = False
         if (stride != 1) or (input_channels != output_channels):
+            self.conv_3 = nn.Conv2d(
+                input_channels,
+                output_channels,
+                shape,
+                stride=stride,
+                padding=shape // 2,
+            )
             self.bn_3 = nn.BatchNorm2d(output_channels)
             self.diff = True
         self.relu = nn.ReLU()
@@ -89,79 +106,31 @@ class ResBlock(nn.Module):
         out = self.relu(out)
         return out
 
-class TrainingEnvironment(pl.LightningModule):
-
-    def __init__(self, model: nn.Module, criterion: nn.Module, config:dict, learning_rate=1e-4, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.model = model
-        self.criterion = criterion
-        self.learning_rate = learning_rate
-        self.config=config
-        self.save_hyperparameters({
-            "model": type(model).__name__,
-            "loss": type(criterion).__name__,
-            "config": config,
-            **kwargs
-        })
-
-    def training_step(self, batch: tuple[torch.Tensor, torch.TensorType], batch_index: int) -> torch.Tensor:
-        features, labels = batch
-        outputs = self.model(features)
-        loss = self.criterion(outputs, labels)
-        metrics = calculate_metrics(outputs, labels, prefix="train/", multi_label=True)
-        self.log_dict(metrics, prog_bar=True)
-        # Log spectrograms
-        if batch_index % 100 == 0:
-            tensorboard = self.logger.experiment
-            img_index = torch.randint(0, len(features), (1,)).item()
-            img = features[img_index][0]
-            img = (img - img.min()) / (img.max() - img.min())
-            tensorboard.add_image(f"batch: {batch_index}, element: {img_index}", img, 0, dataformats='HW')
-        return loss
-
-    def validation_step(self, batch:tuple[torch.Tensor, torch.TensorType], batch_index:int):
-        x, y = batch
-        preds = self.model(x)
-        metrics = calculate_metrics(preds, y, prefix="val/", multi_label=True)
-        metrics["val/loss"] = self.criterion(preds, y)
-        self.log_dict(metrics,prog_bar=True)
-
-    def test_step(self, batch:tuple[torch.Tensor, torch.TensorType], batch_index:int):
-        x, y = batch
-        preds = self.model(x)
-        self.log_dict(calculate_metrics(preds, y, prefix="test/", multi_label=True), prog_bar=True)
-
-    def configure_optimizers(self):
-        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
-        # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min') {"scheduler": scheduler, "monitor": "val/loss"}
-        return [optimizer]
-
 
 class DancePredictor:
     def __init__(
-        self,
-        weight_path:str,
-        labels:list[str],
-        expected_duration=6,
+        self,
+        weight_path: str,
+        labels: list[str],
+        expected_duration=6,
         threshold=0.5,
         resample_frequency=16000,
-        device="cpu"
+        device="cpu",
+    ):
         super().__init__()
+
         self.expected_duration = expected_duration
         self.threshold = threshold
         self.resample_frequency = resample_frequency
+        self.preprocess_waveform = WaveformPreprocessing(
+            resample_frequency * expected_duration
+        )
+        self.audio_to_spectrogram = lambda x: x  # TODO: Fix
        self.labels = np.array(labels)
         self.device = device
         self.model = self.get_model(weight_path)
 
-    def get_model(self, weight_path:str) -> nn.Module:
+    def get_model(self, weight_path: str) -> nn.Module:
@@ -170,21 +139,25 @@ class DancePredictor:
         weights = torch.load(weight_path, map_location=self.device)["state_dict"]
         model = ResidualDancer(n_classes=len(self.labels))
         for key in list(weights):
         return model.to(self.device).eval()
 
     @classmethod
-    def from_config(cls, config_path:str) -> "DancePredictor":
+    def from_config(cls, config_path: str) -> "DancePredictor":
         with open(config_path, "r") as f:
             config = yaml.safe_load(f)
         return DancePredictor(**config)
 
     @torch.no_grad()
-    def __call__(self, waveform: np.ndarray, sample_rate:int) -> dict[str,float]:
+    def __call__(self, waveform: np.ndarray, sample_rate: int) -> dict[str, float]:
         if len(waveform.shape) > 1 and waveform.shape[1] < waveform.shape[0]:
-            waveform = waveform.transpose(1,0)
+            waveform = waveform.transpose(1, 0)
         elif len(waveform.shape) == 1:
             waveform = np.expand_dims(waveform, 0)
         waveform = torch.from_numpy(waveform.astype("int16"))
+        waveform = torchaudio.functional.apply_codec(
+            waveform, sample_rate, "wav", channels_first=True
+        )
 
+        waveform = torchaudio.functional.resample(
+            waveform, sample_rate, self.resample_frequency
+        )
         waveform = self.preprocess_waveform(waveform)
         spectrogram = self.audio_to_spectrogram(waveform)
         spectrogram = spectrogram.unsqueeze(0).to(self.device)
@@ -194,8 +167,31 @@ class DancePredictor:
         result_mask = results > self.threshold
         probs = results[result_mask]
         dances = self.labels[result_mask]
 
         return {dance: float(prob) for dance, prob in zip(dances, probs)}
+
+
+def train_residual_dancer(config: dict):
+    TARGET_CLASSES = config["dance_ids"]
+    DEVICE = config["device"]
+    SEED = config["seed"]
+    pl.seed_everything(SEED, workers=True)
+    feature_extractor = SpectrogramTrainingPipeline(**config["feature_extractor"])
+    dataset = get_datasets(config["datasets"], feature_extractor)
+
+    data = DanceDataModule(dataset, **config["data_module"])
+    model = ResidualDancer(n_classes=len(TARGET_CLASSES), **config["model"])
+    label_weights = data.get_label_weights().to(DEVICE)
+    criterion = nn.CrossEntropyLoss(label_weights)
+
+    train_env = TrainingEnvironment(model, criterion, config)
+    callbacks = [
+        # cb.LearningRateFinder(update_attr=True),
+        cb.EarlyStopping("val/loss", patience=5),
+        cb.StochasticWeightAveraging(1e-2),
+        cb.RichProgressBar(),
+        cb.DeviceStatsMonitor(),
+    ]
+    trainer = pl.Trainer(callbacks=callbacks, **config["trainer"])
+    trainer.fit(train_env, datamodule=data)
+    trainer.test(train_env, datamodule=data)
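
A hypothetical inference sketch for DancePredictor: from_config reads a YAML file whose keys are the constructor arguments (weight_path, labels, ...), and the callable returns a dance-to-probability dict for every class above the threshold. The config path and audio file below are placeholders, not files from the repo.

import torchaudio

from models.residual import DancePredictor

predictor = DancePredictor.from_config("models/config/dance_predictor.yaml")  # assumed path
waveform, sample_rate = torchaudio.load("data/samples/example_song.wav")  # hypothetical file
scores = predictor(waveform.numpy(), sample_rate)
print(scores)  # e.g. {"SWZ": 0.81, "VWZ": 0.64}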
models/training_environment.py
ADDED
@@ -0,0 +1,90 @@
from models.utils import calculate_metrics


import pytorch_lightning as pl
import torch
import torch.nn as nn


class TrainingEnvironment(pl.LightningModule):
    def __init__(
        self,
        model: nn.Module,
        criterion: nn.Module,
        config: dict,
        learning_rate=1e-4,
        log_spectrograms=False,
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        self.model = model
        self.criterion = criterion
        self.learning_rate = learning_rate
        self.log_spectrograms = log_spectrograms
        self.config = config
        self.has_multi_label_predictions = (
            not type(criterion).__name__ == "CrossEntropyLoss"
        )
        self.save_hyperparameters(
            {
                "model": type(model).__name__,
                "loss": type(criterion).__name__,
                "config": config,
                **kwargs,
            }
        )

    def training_step(
        self, batch: tuple[torch.Tensor, torch.TensorType], batch_index: int
    ) -> torch.Tensor:
        features, labels = batch
        outputs = self.model(features)
        loss = self.criterion(outputs, labels)
        metrics = calculate_metrics(
            outputs,
            labels,
            prefix="train/",
            multi_label=self.has_multi_label_predictions,
        )
        self.log_dict(metrics, prog_bar=True)
        # Log spectrograms
        if self.log_spectrograms and batch_index % 100 == 0:
            tensorboard = self.logger.experiment
            img_index = torch.randint(0, len(features), (1,)).item()
            img = features[img_index][0]
            img = (img - img.min()) / (img.max() - img.min())
            tensorboard.add_image(
                f"batch: {batch_index}, element: {img_index}", img, 0, dataformats="HW"
            )
        return loss

    def validation_step(
        self, batch: tuple[torch.Tensor, torch.TensorType], batch_index: int
    ):
        x, y = batch
        preds = self.model(x)
        metrics = calculate_metrics(
            preds, y, prefix="val/", multi_label=self.has_multi_label_predictions
        )
        metrics["val/loss"] = self.criterion(preds, y)
        self.log_dict(metrics, prog_bar=True)

    def test_step(self, batch: tuple[torch.Tensor, torch.TensorType], batch_index: int):
        x, y = batch
        preds = self.model(x)
        self.log_dict(
            calculate_metrics(
                preds, y, prefix="test/", multi_label=self.has_multi_label_predictions
            ),
            prog_bar=True,
        )

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, "min")
        return {
            "optimizer": optimizer,
            "lr_scheduler": scheduler,
            "monitor": "val/loss",
        }
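
A toy sketch (not from the repo) of how the extracted TrainingEnvironment wraps an arbitrary nn.Module for a plain PyTorch Lightning run. The random tensors stand in for DanceDataModule; with CrossEntropyLoss the environment computes single-label metrics, so the targets are one-hot floats.

import pytorch_lightning as pl
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

from models.training_environment import TrainingEnvironment

n_samples, n_features, n_classes = 256, 40, 14
features = torch.randn(n_samples, n_features)
labels = torch.eye(n_classes)[torch.randint(0, n_classes, (n_samples,))]  # one-hot floats

env = TrainingEnvironment(
    model=nn.Linear(n_features, n_classes),
    criterion=nn.CrossEntropyLoss(),
    config={},
    learning_rate=1e-3,
)
loader = DataLoader(TensorDataset(features, labels), batch_size=32)
trainer = pl.Trainer(fast_dev_run=True)
trainer.fit(env, train_dataloaders=loader, val_dataloaders=loader)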
models/utils.py
CHANGED
@@ -1,14 +1,20 @@
 import torch.nn as nn
 import torch
 import numpy as np
+import evaluate
 from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
 
+
+accuracy = evaluate.load("accuracy")
+
+
 class LabelWeightedBCELoss(nn.Module):
     """
     Binary Cross Entropy loss that assumes each float in the final dimension is a binary probability distribution.
     Allows for the weighing of each probability distribution wrt loss.
     """
+
+    def __init__(self, label_weights: torch.Tensor, reduction="mean"):
         super().__init__()
         self.label_weights = label_weights
 
@@ -17,46 +23,67 @@ class LabelWeightedBCELoss(nn.Module):
                 self.reduction = torch.mean
             case "sum":
                 self.reduction = torch.sum
+
-    def _log(self,x:torch.Tensor) -> torch.Tensor:
+    def _log(self, x: torch.Tensor) -> torch.Tensor:
         return torch.clamp_min(torch.log(x), -100)
 
     def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
-        losses = -self.label_weights * (
+        losses = -self.label_weights * (
+            target * self._log(input) + (1 - target) * self._log(1 - input)
+        )
         return self.reduction(losses)
 
 
 # TODO: Code a onehot
 
 
-def calculate_metrics(
+def calculate_metrics(
+    pred, target, threshold=0.5, prefix="", multi_label=True
+) -> dict[str, torch.Tensor]:
     target = target.detach().cpu().numpy()
     pred = pred.detach().cpu().numpy()
     params = {
+        "y_true": target if multi_label else target.argmax(1),
+        "y_pred": np.array(pred > threshold, dtype=float)
+        if multi_label
+        else pred.argmax(1),
+        "zero_division": 0,
+        "average": "macro",
+    }
+    metrics = {
+        "precision": precision_score(**params),
+        "recall": recall_score(**params),
+        "f1": f1_score(**params),
+        "accuracy": accuracy_score(y_true=params["y_true"], y_pred=params["y_pred"]),
+    }
+    return {
+        prefix + k: torch.tensor(v, dtype=torch.float32) for k, v in metrics.items()
+    }
 
 
 class EarlyStopping:
     def __init__(self, patience=0):
         self.patience = patience
         self.last_measure = np.inf
         self.consecutive_increase = 0
+
     def step(self, val) -> bool:
         if self.last_measure <= val:
-            self.consecutive_increase +=1
+            self.consecutive_increase += 1
         else:
             self.consecutive_increase = 0
         self.last_measure = val
 
         return self.patience < self.consecutive_increase
+
+
+def get_id_label_mapping(labels: list[str]) -> tuple[dict, dict]:
+    id2label = {str(i): label for i, label in enumerate(labels)}
+    label2id = {label: str(i) for i, label in enumerate(labels)}
+
+    return id2label, label2id
+
+
+def compute_hf_metrics(eval_pred):
+    predictions = np.argmax(eval_pred.predictions, axis=1)
+    return accuracy.compute(predictions=predictions, references=eval_pred.label_ids)
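
A quick illustrative check (not in the repo) of the helpers above: get_id_label_mapping builds the id/label dicts the Hugging Face models expect, and calculate_metrics turns logits plus one-hot targets into prefixed metric tensors.

import torch

from models.utils import calculate_metrics, get_id_label_mapping

id2label, label2id = get_id_label_mapping(["CHA", "JIV", "SWZ"])
assert id2label["0"] == "CHA" and label2id["SWZ"] == "2"

preds = torch.tensor([[2.0, 0.1, -1.0], [0.2, 1.5, 0.3]])   # logits for two samples
targets = torch.tensor([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]])  # one-hot labels
metrics = calculate_metrics(preds, targets, prefix="val/", multi_label=False)
print(metrics["val/accuracy"])  # tensor(1.)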
models/wav2vec2.py
ADDED
@@ -0,0 +1,84 @@
import os
from typing import Any
import pytorch_lightning as pl
from torch.utils.data import random_split
from transformers import AutoFeatureExtractor
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

from preprocessing.dataset import (
    HuggingFaceDatasetWrapper,
    BestBallroomDataset,
    get_datasets,
)
from preprocessing.pipelines import WaveformTrainingPipeline

from .utils import get_id_label_mapping, compute_hf_metrics

MODEL_CHECKPOINT = "facebook/wav2vec2-base"


class Wav2VecFeatureExtractor:
    def __init__(self) -> None:
        self.waveform_pipeline = WaveformTrainingPipeline()
        self.feature_extractor = AutoFeatureExtractor.from_pretrained(
            MODEL_CHECKPOINT,
        )

    def __call__(self, waveform) -> Any:
        waveform = self.waveform_pipeline(waveform)
        return self.feature_extractor(
            waveform, sampling_rate=self.feature_extractor.sampling_rate
        )

    def __getattr__(self, attr):
        return getattr(self.feature_extractor, attr)


def train_wav_model(config: dict):
    TARGET_CLASSES = config["dance_ids"]
    DEVICE = config["device"]
    SEED = config["seed"]
    OUTPUT_DIR = "models/weights/wav2vec2"
    batch_size = config["data_module"]["batch_size"]
    epochs = config["trainer"]["min_epochs"]
    test_proportion = config["data_module"].get("test_proportion", 0.2)
    pl.seed_everything(SEED, workers=True)
    dataset = get_datasets(config["datasets"])
    id2label, label2id = get_id_label_mapping(TARGET_CLASSES)
    test_proportion = config["data_module"]["test_proportion"]
    train_proporition = 1 - test_proportion
    train_ds, test_ds = random_split(dataset, [train_proporition, test_proportion])
    feature_extractor = Wav2VecFeatureExtractor()
    model = AutoModelForAudioClassification.from_pretrained(
        MODEL_CHECKPOINT,
        num_labels=len(TARGET_CLASSES),
        label2id=label2id,
        id2label=id2label,
        ignore_mismatched_sizes=True,
    ).to(DEVICE)
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=3e-5,
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=5,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        warmup_ratio=0.1,
        logging_steps=10,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        push_to_hub=False,
        use_mps_device=DEVICE == "mps",
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=test_ds,
        tokenizer=feature_extractor,
        compute_metrics=compute_hf_metrics,
    )
    trainer.train()
    return model
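
A brief note on the wrapper above, with a tiny sketch: because Wav2VecFeatureExtractor defines __getattr__, any attribute it does not define itself falls through to the wrapped Hugging Face extractor, which lets the Trainer treat the wrapper like the real feature extractor. Running this downloads the facebook/wav2vec2-base checkpoint.

from models.wav2vec2 import Wav2VecFeatureExtractor

extractor = Wav2VecFeatureExtractor()
print(extractor.sampling_rate)  # delegated to the wrapped AutoFeatureExtractor (16000)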
preprocessing/dataset.py
CHANGED

@@ -1,15 +1,21 @@
+import importlib
+import os
+from typing import Any
 import torch
-from torch.utils.data import Dataset, DataLoader, random_split
+from torch.utils.data import Dataset, DataLoader, random_split, ConcatDataset
 import numpy as np
 import pandas as pd
 import torchaudio as ta
-from .pipelines import AudioTrainingPipeline
 import pytorch_lightning as pl
+
+from preprocessing.preprocess import (
+    fix_dance_rating_counts,
+    get_unique_labels,
+    has_valid_audio,
+    url_to_filename,
+    vectorize_label_probs,
+    vectorize_multi_label,
+)


 class SongDataset(Dataset):
@@ -17,60 +23,67 @@ class SongDataset(Dataset):
         self,
         audio_paths: list[str],
         dance_labels: list[np.ndarray],
+        audio_start_offset=6,  # seconds
         audio_window_duration=6,  # seconds
-        audio_pipeline_kwargs={},
-        resample_frequency=16000,
+        audio_window_jitter=1.0,  # seconds
     ):
-        assert (
-            audio_duration % audio_window_duration == 0
-        ), "Audio window should divide duration evenly."
         assert (
             audio_window_duration > audio_window_jitter
         ), "Jitter should be a small fraction of the audio window duration."

         self.audio_paths = audio_paths
         self.dance_labels = dance_labels
+        audio_metadata = [ta.info(audio) for audio in audio_paths]
+        self.audio_durations = [
+            meta.num_frames / meta.sample_rate for meta in audio_metadata
+        ]
+        self.sample_rate = audio_metadata[0].sample_rate  # assuming same sample rate
         self.audio_window_duration = int(audio_window_duration)
+        self.audio_start_offset = audio_start_offset
         self.audio_window_jitter = audio_window_jitter
-        self.audio_duration = int(audio_duration)
-
-        self.audio_pipeline = AudioTrainingPipeline(
-            self.sample_rate,
-            resample_frequency,
-            audio_window_duration,
-            **audio_pipeline_kwargs,
-        )

     def __len__(self):
+        return int(
+            sum(
+                max(duration - self.audio_start_offset, 0) // self.audio_window_duration
+                for duration in self.audio_durations
+            )
+        )

     def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]:
-        spectrogram = self.audio_pipeline(waveform)
+        if isinstance(idx, list):
+            return [
+                (self._waveform_from_index(i), self._label_from_index(i)) for i in idx
+            ]

+        waveform = self._waveform_from_index(idx)
         dance_labels = self._label_from_index(idx)
+        return waveform, dance_labels
+
+    def _idx2audio_idx(self, idx: int) -> int:
+        return self._get_audio_loc_from_idx(idx)[0]
+
+    def _get_audio_loc_from_idx(self, idx: int) -> tuple[int, int]:
+        """
+        Converts dataset index to the indices that reference the target audio path
+        and window offset.
+        """
+        total_slices = 0
+        for audio_index, duration in enumerate(self.audio_durations):
+            audio_slices = max(
+                (duration - self.audio_start_offset) // self.audio_window_duration, 1
+            )
+            if total_slices + audio_slices > idx:
+                frame_index = idx - total_slices
+                return audio_index, frame_index
+            total_slices += audio_slices
+
+    def get_label_weights(self):
+        n_examples, n_classes = self.dance_labels.shape
+        return torch.from_numpy(n_examples / (n_classes * sum(self.dance_labels)))

     def _backtrace_audio_path(self, index: int) -> str:
+        return self.audio_paths[self._idx2audio_idx(index)]

     def _validate_output(self, x, y):
         is_finite = not torch.any(torch.isinf(x))
@@ -80,16 +93,18 @@ class SongDataset(Dataset):
         return all((is_finite, is_numerical, has_data, is_binary))

     def _waveform_from_index(self, idx: int) -> torch.Tensor:
+        audio_index, frame_index = self._get_audio_loc_from_idx(idx)
+        audio_filepath = self.audio_paths[audio_index]
+        num_windows = self.audio_durations[audio_index] // self.audio_window_duration
         jitter_start = -self.audio_window_jitter if frame_index > 0 else 0.0
         jitter_end = self.audio_window_jitter if frame_index != num_windows - 1 else 0.0
         jitter = int(
             torch.FloatTensor(1).uniform_(jitter_start, jitter_end) * self.sample_rate
         )
-        frame_offset = (
-            frame_index * self.audio_window_duration * self.sample_rate
+        frame_offset = int(
+            frame_index * self.audio_window_duration * self.sample_rate
+            + jitter
+            + self.audio_start_offset * self.sample_rate
         )
         num_frames = self.sample_rate * self.audio_window_duration
         waveform, sample_rate = ta.load(
@@ -101,41 +116,21 @@ class SongDataset(Dataset):
         return waveform

     def _label_from_index(self, idx: int) -> torch.Tensor:
+        return torch.from_numpy(self.dance_labels[self._idx2audio_idx(idx)])


+class HuggingFaceDatasetWrapper(Dataset):
     """
+    Makes a standard PyTorch Dataset compatible with a HuggingFace Trainer.
     """

+    def __init__(self, dataset, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.resampler = taT.Resample(self.sample_rate, self.resample_frequency)
+        self.dataset = dataset
         self.pipeline = []

     def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
-        assert (
-            waveform.shape[1] > 10
-        ), f"No data found: {self._backtrace_audio_path(idx)}"
-        # resample the waveform
-        waveform = self.resampler(waveform)
-
-        waveform = waveform.mean(0)
-
-        dance_labels = self._label_from_index(idx)
-        return waveform, dance_labels
-
-
-class HuggingFaceWaveformSongDataset(WaveformSongDataset):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.pipeline = []
-
-    def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
-        x, y = super().__getitem__(idx)
+        x, y = self.dataset[idx]
         if len(self.pipeline) > 0:
            for fn in self.pipeline:
                 x = fn(x)
@@ -146,59 +141,158 @@
             "label": dance_labels,
         }

+    def __len__(self):
+        return len(self.dataset)
+
+    def append_to_pipeline(self, fn):
         """
+        Adds a preprocessing step to the dataset.
         """
         self.pipeline.append(fn)


+class BestBallroomDataset(Dataset):
+    def __init__(
+        self, audio_dir="data/ballroom-songs", class_list=None, **kwargs
+    ) -> None:
+        super().__init__()
+        song_paths, labels = self.get_examples(audio_dir, class_list)
+        self.song_dataset = SongDataset(song_paths, labels, **kwargs)
+
+    def __getitem__(self, index) -> tuple[torch.Tensor, torch.Tensor]:
+        return self.song_dataset[index]
+
+    def __len__(self):
+        return len(self.song_dataset)
+
+    def get_examples(self, audio_dir, class_list=None):
+        dances = set(
+            f
+            for f in os.listdir(audio_dir)
+            if os.path.isdir(os.path.join(audio_dir, f))
+        )
+        common_dances = dances
+        if class_list is not None:
+            common_dances = dances & set(class_list)
+            dances = class_list
+        dances = np.array(sorted(dances))
+        song_paths = []
+        labels = []
+        for dance in common_dances:
+            dance_label = (dances == dance).astype("float32")
+            folder_path = os.path.join(audio_dir, dance)
+            folder_contents = [f for f in os.listdir(folder_path) if f.endswith(".wav")]
+            song_paths.extend(os.path.join(folder_path, f) for f in folder_contents)
+            labels.extend([dance_label] * len(folder_contents))
+
+        return np.array(song_paths), np.stack(labels)
+
+
+class Music4DanceDataset(Dataset):
+    def __init__(
+        self,
+        song_data_path,
+        song_audio_path,
+        class_list=None,
+        multi_label=True,
+        min_votes=1,
+        **kwargs,
+    ) -> None:
+        super().__init__()
+        df = pd.read_csv(song_data_path)
+        song_paths, labels = get_music4dance_examples(
+            df,
+            song_audio_path,
+            class_list=class_list,
+            multi_label=multi_label,
+            min_votes=min_votes,
+        )
+        self.song_dataset = SongDataset(song_paths, labels, **kwargs)
+
+    def __getitem__(self, index) -> tuple[torch.Tensor, torch.Tensor]:
+        return self.song_dataset[index]
+
+    def __len__(self):
+        return len(self.song_dataset)
+
+
+def get_music4dance_examples(
+    df: pd.DataFrame, audio_dir: str, class_list=None, multi_label=True, min_votes=1
+) -> tuple[np.ndarray, np.ndarray]:
+    sampled_songs = df[has_valid_audio(df["Sample"], audio_dir)].copy(deep=True)
+    sampled_songs["DanceRating"] = fix_dance_rating_counts(sampled_songs["DanceRating"])
+    if class_list is not None:
+        class_list = set(class_list)
+        sampled_songs["DanceRating"] = sampled_songs["DanceRating"].apply(
+            lambda labels: {k: v for k, v in labels.items() if k in class_list}
+            if not pd.isna(labels)
+            and any(label in class_list and amt > 0 for label, amt in labels.items())
+            else np.nan
+        )
+    sampled_songs = sampled_songs.dropna(subset=["DanceRating"])
+    vote_mask = sampled_songs["DanceRating"].apply(
+        lambda dances: any(votes >= min_votes for votes in dances.values())
+    )
+    sampled_songs = sampled_songs[vote_mask]
+    labels = sampled_songs["DanceRating"].apply(
+        lambda dances: {
+            dance: votes for dance, votes in dances.items() if votes >= min_votes
+        }
+    )
+    unique_labels = np.array(get_unique_labels(labels))
+    vectorizer = vectorize_multi_label if multi_label else vectorize_label_probs
+    labels = labels.apply(lambda i: vectorizer(i, unique_labels))
+
+    audio_paths = [
+        os.path.join(audio_dir, url_to_filename(url)) for url in sampled_songs["Sample"]
+    ]
+
+    return np.array(audio_paths), np.stack(labels)
+
+
+class PipelinedDataset(Dataset):
+    """
+    Adds a feature extractor preprocessing step to a dataset.
+    """
+
+    def __init__(self, dataset, feature_extractor):
+        self._data = dataset
+        self.feature_extractor = feature_extractor
+
+    def __len__(self):
+        return len(self._data)
+
+    def __getitem__(self, index):
+        sample, label = self._data[index]
+
+        features = self.feature_extractor(sample)
+        return features, label
+
+
 class DanceDataModule(pl.LightningDataModule):
     def __init__(
         self,
-        song_audio_path="data/samples",
+        dataset: Dataset,
         test_proportion=0.15,
         val_proportion=0.1,
         target_classes: list[str] = None,
-        min_votes=1,
         batch_size: int = 64,
         num_workers=10,
-        dataset_cls=None,
-        dataset_kwargs={},
     ):
         super().__init__()
-        self.song_data_path = song_data_path
-        self.song_audio_path = song_audio_path
         self.val_proportion = val_proportion
         self.test_proportion = test_proportion
         self.train_proportion = 1.0 - test_proportion - val_proportion
         self.target_classes = target_classes
         self.batch_size = batch_size
         self.num_workers = num_workers
-        self.dataset_cls = dataset_cls if dataset_cls is not None else SongDataset
-
-        df = pd.read_csv(song_data_path)
-        self.x, self.y = get_examples(
-            df,
-            self.song_audio_path,
-            class_list=self.target_classes,
-            multi_label=True,
-            min_votes=min_votes,
-        )
+        self.dataset = dataset

     def setup(self, stage: str):
+        self.train_ds, self.val_ds, self.test_ds = random_split(
+            self.dataset,
             [self.train_proportion, self.val_proportion, self.test_proportion],
         )
-        self.train_ds = self._dataset_from_indices(train_i)
-        self.val_ds = self._dataset_from_indices(val_i)
-        self.test_ds = self._dataset_from_indices(test_i)
-
-    def _dataset_from_indices(self, idx: list[int]) -> SongDataset:
-        return self.dataset_cls(self.x[idx], self.y[idx], **self.dataset_kwargs)

     def train_dataloader(self):
         return DataLoader(
@@ -210,110 +304,48 @@ class DanceDataModule(pl.LightningDataModule):

     def val_dataloader(self):
         return DataLoader(
             self.val_ds,
+            batch_size=self.batch_size,
+            num_workers=self.num_workers,
         )

     def test_dataloader(self):
         return DataLoader(
             self.test_ds,
+            batch_size=self.batch_size,
+            num_workers=self.num_workers,
         )

     def get_label_weights(self):
+        weights = [
+            ds.song_dataset.get_label_weights() for ds in self.dataset._data.datasets
+        ]
+        return torch.mean(torch.stack(weights), dim=0)  # TODO: Make this weighted

-    def preprocess_inputs(self, x):
-        device = x.device
-        x = list(x.squeeze(1).cpu().numpy())
-        x = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000)
-        return x["input_values"].to(device)
-
-    def training_step(
-        self, batch: tuple[torch.Tensor, torch.TensorType], batch_index: int
-    ) -> torch.Tensor:
-        features, labels = batch
-        features = self.preprocess_inputs(features)
-        outputs = self.model(features).logits
-        outputs = nn.Sigmoid()(
-            outputs
-        )  # good for multi label classification, should be softmax otherwise
-        loss = self.criterion(outputs, labels)
-        metrics = calculate_metrics(outputs, labels, prefix="train/", multi_label=True)
-        self.log_dict(metrics, prog_bar=True)
-        return loss
-
-    def validation_step(
-        self, batch: tuple[torch.Tensor, torch.TensorType], batch_index: int
-    ):
-        x, y = batch
-        x = self.preprocess_inputs(x)
-        preds = self.model(x).logits
-        preds = nn.Sigmoid()(preds)
-        metrics = calculate_metrics(preds, y, prefix="val/", multi_label=True)
-        metrics["val/loss"] = self.criterion(preds, y)
-        self.log_dict(metrics, prog_bar=True)
-
-    def test_step(self, batch: tuple[torch.Tensor, torch.TensorType], batch_index: int):
-        x, y = batch
-        x = self.preprocess_inputs(x)
-        preds = self.model(x).logits
-        preds = nn.Sigmoid()(preds)
-        self.log_dict(
-            calculate_metrics(preds, y, prefix="test/", multi_label=True), prog_bar=True
-        )
-
-    def configure_optimizers(self):
-        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
-        # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min') {"scheduler": scheduler, "monitor": "val/loss"}
-        return [optimizer]
-
-
-def calculate_metrics(
-    pred, target, threshold=0.5, prefix="", multi_label=True
-) -> dict[str, torch.Tensor]:
-    target = target.detach().cpu().numpy()
-    pred = pred.detach().cpu().numpy()
-    params = {
-        "y_true": target if multi_label else target.argmax(1),
-        "y_pred": np.array(pred > threshold, dtype=float)
-        if multi_label
-        else pred.argmax(1),
-        "zero_division": 0,
-        "average": "macro",
-    }
-    metrics = {
-        "precision": precision_score(**params),
-        "recall": recall_score(**params),
-        "f1": f1_score(**params),
-        "accuracy": accuracy_score(y_true=params["y_true"], y_pred=params["y_pred"]),
-    }
-    return {
-        prefix + k: torch.tensor(v, dtype=torch.float32) for k, v in metrics.items()
-    }
+
+def find_mean_std(dataset: Dataset, zscore=1.96, moe=0.02, p=0.5):
+    """
+    Estimates the mean and standard deviations of the a dataset.
+    """
+    sample_size = int(np.ceil((zscore**2 * p * (1 - p)) / (moe**2)))
+    sample_indices = np.random.choice(
+        np.arange(len(dataset)), size=sample_size, replace=False
+    )
+    mean = 0
+    std = 0
+    for i in sample_indices:
+        features = dataset[i][0]
+        mean += features.mean().item()
+        std += features.std().item()
+    print("std", std / sample_size)
+    print("mean", mean / sample_size)
+
+
+def get_datasets(dataset_config: dict, feature_extractor) -> Dataset:
+    datasets = []
+    for dataset_path, kwargs in dataset_config.items():
+        module_name, class_name = dataset_path.rsplit(".", 1)
+        module = importlib.import_module(module_name)
+        ProvidedDataset = getattr(module, class_name)
+        datasets.append(ProvidedDataset(**kwargs))
+    return PipelinedDataset(ConcatDataset(datasets), feature_extractor)
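Note: the reworked SongDataset now derives its length from per-file durations rather than a fixed audio_duration. A small standalone sketch of the window bookkeeping follows, mirroring `__len__` and `_get_audio_loc_from_idx` with made-up durations; it is illustrative only and does not touch the real dataset classes.

# Two hypothetical files: 30 s and 20 s long, 6 s skipped at the start of each,
# 6 s training windows. Each file contributes floor((duration - offset) / window)
# examples, and a flat dataset index maps back to (file index, window index).
audio_durations = [30.0, 20.0]
audio_start_offset = 6
audio_window_duration = 6

total = int(
    sum(
        max(duration - audio_start_offset, 0) // audio_window_duration
        for duration in audio_durations
    )
)
print(total)  # 6: four windows from the first file, two from the second


def locate(idx: int) -> tuple[int, int]:
    # Same walk as _get_audio_loc_from_idx: advance through files until the
    # cumulative window count passes the requested index.
    total_slices = 0
    for audio_index, duration in enumerate(audio_durations):
        audio_slices = max(
            (duration - audio_start_offset) // audio_window_duration, 1
        )
        if total_slices + audio_slices > idx:
            return audio_index, idx - total_slices
        total_slices += audio_slices


print(locate(0))  # (0, 0): first window of the first file
print(locate(4))  # (1, 0.0): first window of the second file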
preprocessing/pipelines.py
CHANGED

@@ -3,29 +3,26 @@ import torchaudio
 from torchaudio import transforms as taT, functional as taF
 import torch.nn as nn

-class AudioTrainingPipeline(torch.nn.Module):
+
+class WaveformTrainingPipeline(torch.nn.Module):
+    def __init__(
+        self,
+        input_freq=16000,
+        resample_freq=16000,
+        expected_duration=6,
+        snr_mean=6.0,
+        noise_path=None,
+    ):
         super().__init__()
         self.input_freq = input_freq
         self.snr_mean = snr_mean
-        self.mask_count = mask_count
         self.noise = self.get_noise(noise_path)
+        self.resample_frequency = resample_freq
+        self.resample = taT.Resample(input_freq, resample_freq)
+
+        self.preprocess_waveform = WaveformPreprocessing(
+            resample_freq * expected_duration
         )
-        self.freq_mask = taT.FrequencyMasking(freq_mask_size)
-        self.time_mask = taT.TimeMasking(time_mask_size)

     def get_noise(self, path) -> torch.Tensor:
         if path is None:
@@ -34,13 +31,15 @@ class AudioTrainingPipeline(torch.nn.Module):
         if noise.shape[0] > 1:
             noise = noise.mean(0, keepdim=True)
         if sr != self.input_freq:
-            noise = taF.resample(noise,sr, self.input_freq)
+            noise = taF.resample(noise, sr, self.input_freq)
         return noise

-    def add_noise(self, waveform:torch.Tensor) -> torch.Tensor:
+    def add_noise(self, waveform: torch.Tensor) -> torch.Tensor:
+        assert (
+            self.noise is not None
+        ), "Cannot add noise because a noise file was not provided."
         num_repeats = waveform.shape[1] // self.noise.shape[1] + 1
-        noise = self.noise.repeat(1,num_repeats)[:, :waveform.shape[1]]
+        noise = self.noise.repeat(1, num_repeats)[:, : waveform.shape[1]]
         noise_power = noise.norm(p=2)
         signal_power = waveform.norm(p=2)
         snr_db = torch.normal(self.snr_mean, 1.5, (1,)).clamp_min(1.0)
@@ -49,14 +48,28 @@ class AudioTrainingPipeline(torch.nn.Module):
         noisy_waveform = (scale * waveform + noise) / 2
         return noisy_waveform

-    def forward(self, waveform:torch.Tensor) -> torch.Tensor:
-        try:
-            waveform = self.resample(waveform)
-        except:
-            print("oops")
+    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
+        waveform = self.resample(waveform)
         waveform = self.preprocess_waveform(waveform)
         if self.noise is not None:
             waveform = self.add_noise(waveform)
+        return waveform
+
+
+class SpectrogramTrainingPipeline(WaveformTrainingPipeline):
+    def __init__(
+        self, freq_mask_size=10, time_mask_size=80, mask_count=2, *args, **kwargs
+    ):
+        super().__init__(*args, **kwargs)
+        self.mask_count = mask_count
+        self.audio_to_spectrogram = AudioToSpectrogram(
+            sample_rate=self.resample_frequency,
+        )
+        self.freq_mask = taT.FrequencyMasking(freq_mask_size)
+        self.time_mask = taT.TimeMasking(time_mask_size)
+
+    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
+        waveform = super().forward(waveform)
         spec = self.audio_to_spectrogram(waveform)

         # Spectrogram augmentation
@@ -67,14 +80,11 @@


 class WaveformPreprocessing(torch.nn.Module):
-
-    def __init__(self, expected_sample_length:int):
+    def __init__(self, expected_sample_length: int):
         super().__init__()
         self.expected_sample_length = expected_sample_length
-

-
-    def forward(self, waveform:torch.Tensor) -> torch.Tensor:
+    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
         # Take out extra channels
         if waveform.shape[0] > 1:
             waveform = waveform.mean(0, keepdim=True)
@@ -83,30 +93,34 @@ class WaveformPreprocessing(torch.nn.Module):
         waveform = self._rectify_duration(waveform)
         return waveform

-
-    def _rectify_duration(self,waveform:torch.Tensor):
+    def _rectify_duration(self, waveform: torch.Tensor):
         expected_samples = self.expected_sample_length
         sample_count = waveform.shape[1]
         if expected_samples == sample_count:
             return waveform
         elif expected_samples > sample_count:
             pad_amount = expected_samples - sample_count
+            return torch.nn.functional.pad(
+                waveform, (0, pad_amount), mode="constant", value=0.0
+            )
         else:
+            return waveform[:, :expected_samples]


+class AudioToSpectrogram:
     def __init__(
         self,
         sample_rate=16000,
     ):
+        self.spec = taT.MelSpectrogram(
+            sample_rate=sample_rate, n_mels=128, n_fft=1024
+        )  # Note: this doesn't work on mps right now.
         self.to_db = taT.AmplitudeToDB()

+    def __call__(self, waveform: torch.Tensor) -> torch.Tensor:
         spectrogram = self.spec(waveform)
         spectrogram = self.to_db(spectrogram)
+
+        # Normalize
+        spectrogram = (spectrogram - spectrogram.mean()) / (2 * spectrogram.std())
+        return spectrogram
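Note: the refactor splits the old AudioTrainingPipeline into a waveform-only stage and a spectrogram stage that inherits from it. A quick smoke-test sketch follows; it feeds synthetic noise instead of a real song file (the in-repo test uses BestBallroomDataset), and it assumes the hidden augmentation step keeps the (channel, mel, time) layout, which the new test_pipelines.py asserts.

# Synthetic 6 s mono "waveform" standing in for real audio.
import torch
from preprocessing.pipelines import (
    WaveformTrainingPipeline,
    SpectrogramTrainingPipeline,
)

waveform = torch.randn(1, 16000 * 6)

wave_pipeline = WaveformTrainingPipeline()    # resample + channel/duration cleanup only
spec_pipeline = SpectrogramTrainingPipeline() # adds mel spectrogram + masking on top

print(wave_pipeline(waveform).shape)  # torch.Size([1, 96000])
print(spec_pipeline(waveform).shape)  # (1, 128, n_frames): a 3-D spectrogram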
preprocessing/preprocess.py
CHANGED

@@ -3,7 +3,9 @@ import numpy as np
 import re
 import json
 from pathlib import Path
+import glob
 import os
+import shutil
 import torchaudio
 import torch
 from tqdm import tqdm
@@ -95,7 +97,6 @@ def vectorize_label_probs(
     for k, v in labels.items():
         item_vec = (unique_labels == k) * v
         label_vec += item_vec
-    lv_cache = label_vec.copy()
     label_vec[label_vec < 0] = 0
     label_vec /= label_vec.sum()
     assert not any(np.isnan(label_vec)), f"Provided labels are invalid: {labels}"
@@ -113,49 +114,70 @@ def vectorize_multi_label(
     return probs


+def sort_yt_files(
+    aliases_path="data/dance_aliases.json",
+    all_dances_folder="data/best-ballroom-music",
+    original_location="data/yt-ballroom-music/",
+):
+    def normalize_string(s):
+        # Lowercase string and remove special characters
+        return re.sub(r"\W+", "", s.lower())
+
+    with open(aliases_path, "r") as f:
+        dances = json.load(f)
+
+    # Normalize the dance inputs and aliases
+    normalized_dances = {
+        normalize_string(dance_id): [normalize_string(alias) for alias in aliases]
+        for dance_id, aliases in dances.items()
+    }
+
+    # For every wav file in the target folder
+    bad_files = []
+    progress_bar = tqdm(os.listdir(all_dances_folder), unit="files moved")
+    for file_name in progress_bar:
+        if file_name.endswith(".wav"):
+            # check if the normalized wav file name contains the normalized dance alias
+            normalized_file_name = normalize_string(file_name)
+
+            matching_dance_ids = [
+                dance_id
+                for dance_id, aliases in normalized_dances.items()
+                if any(alias in normalized_file_name for alias in aliases)
+            ]
+
+            if len(matching_dance_ids) == 0:
+                # See if the dance is in the path
+                original_filename = file_name.replace(".wav", "")
+                matches = glob.glob(
+                    os.path.join(original_location, "**", original_filename),
+                    recursive=True,
+                )
+                if len(matches) == 1:
+                    normalized_file_name = normalize_string(matches[0])
+                    matching_dance_ids = [
+                        dance_id
+                        for dance_id, aliases in normalized_dances.items()
+                        if any(alias in normalized_file_name for alias in aliases)
+                    ]
+
+            if "swz" in matching_dance_ids and "vwz" in matching_dance_ids:
+                matching_dance_ids.remove("swz")
+            if len(matching_dance_ids) > 1 and "lhp" in matching_dance_ids:
+                matching_dance_ids.remove("lhp")
+
+            if len(matching_dance_ids) != 1:
+                bad_files.append(file_name)
+                progress_bar.set_description(f"bad files: {len(bad_files)}")
+                continue
+            dst = os.path.join("data", "ballroom-songs", matching_dance_ids[0].upper())
+            os.makedirs(dst, exist_ok=True)
+            filepath = os.path.join(all_dances_folder, file_name)
+            shutil.copy(filepath, os.path.join(dst, file_name))
+
+    with open("data/bad_files.json", "w") as f:
+        json.dump(bad_files, f)


 if __name__ == "__main__":
-    df = pd.read_csv("data/songs.csv")
-    l = links["link"].str.strip()
-    l = l.apply(lambda url: url if "http" in url else np.nan)
-    l = l.dropna()
-    df["Sample"].update(l)
-    addna = lambda url: url if type(url) == str and "http" in url else np.nan
-    df["Sample"] = df["Sample"].apply(addna)
-    is_valid = validate_audio(df["Sample"], "data/samples")
-    df["valid"] = is_valid
-    df.to_csv("data/songs_validated.csv")
+    sort_yt_files()
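Note: sort_yt_files expects data/dance_aliases.json to map a dance id to a list of name variants. That file is not part of this commit; the ids "swz", "vwz", and "lhp" appear in the disambiguation logic above, but the alias strings in this sketch are illustrative guesses, not the project's actual alias table.

import re

# Hypothetical alias map with the same shape the sorter loads from JSON.
aliases = {
    "swz": ["slow waltz", "waltz"],
    "vwz": ["viennese waltz"],
    "lhp": ["lindy hop"],
}


def normalize_string(s: str) -> str:
    # Same normalization the sorter applies to file names and aliases.
    return re.sub(r"\W+", "", s.lower())


file_name = "Viennese Waltz - Best Ballroom Mix.wav"
normalized = normalize_string(file_name)
matches = [
    dance_id
    for dance_id, names in aliases.items()
    if any(normalize_string(alias) in normalized for alias in names)
]
print(matches)  # ['swz', 'vwz']: both waltzes match, so the sorter drops "swz" and keeps "vwz"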
tests.py
DELETED

@@ -1,22 +0,0 @@
-import torchaudio
-import numpy as np
-from audio_utils import play_audio
-from preprocessing.dataset import SongDataset
-
-def test_audio_splitting():
-
-
-
-    audio_paths = ["data/samples/95f2df65f7450db3b1af29aa77ba7edc6ab52075?cid=7ffadeb2e136495fb5a62d1ac9be8f62.wav"]
-    labels = [np.array([1,0,1,0])]
-    whole_song, sr = torchaudio.load("data/samples/95f2df65f7450db3b1af29aa77ba7edc6ab52075?cid=7ffadeb2e136495fb5a62d1ac9be8f62.wav")
-
-    ds = SongDataset(audio_paths, labels)
-    song_parts = (ds._waveform_from_index(i) for i in range(len(ds)))
-    print("Sample Parts")
-    for part in song_parts:
-        play_audio(part,sr)
-
-
-    print("Whole Sample")
-    play_audio(whole_song,sr)
tests/test_datasets.py
ADDED

@@ -0,0 +1,17 @@
+from utils import set_path
+import pytest
+
+set_path()
+from preprocessing.dataset import PipelinedDataset, BestBallroomDataset, SongDataset
+import numpy as np
+
+
+def test_preprocess_dataset():
+    dataset = BestBallroomDataset()
+    dataset = PipelinedDataset(dataset, lambda x: x * 0.0)
+    assert isinstance(dataset._data.song_dataset, SongDataset)
+    assert hasattr(dataset, "feature_extractor")
+    features, _ = dataset[0]
+    assert np.unique(features.numpy())[0] == 0.0
+    with pytest.raises(AttributeError):
+        dataset.foo
tests/test_pipelines.py
ADDED

@@ -0,0 +1,13 @@
+from utils import set_path
+
+set_path()
+from preprocessing.dataset import BestBallroomDataset
+from preprocessing.pipelines import SpectrogramTrainingPipeline
+
+
+def test_spectrogram_training_pipeline():
+    ds = BestBallroomDataset()
+    pipeline = SpectrogramTrainingPipeline()
+    waveform, _ = ds[0]
+    out = pipeline(waveform)
+    assert len(out.shape) == 3
tests/utils.py
ADDED

@@ -0,0 +1,7 @@
+import sys
+import os
+
+
+# Add parent directory to Python path
+def set_path():
+    sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
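Note: the new tests adjust sys.path themselves through tests/utils.set_path(), so they can be collected from the repository root. One way to launch them programmatically is sketched below; running plain pytest against the tests directory achieves the same thing.

import pytest

# Equivalent to running `pytest tests -q` from the project root.
exit_code = pytest.main(["tests", "-q"])
print(exit_code)  # 0 when all tests pass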
train.py
CHANGED

@@ -1,49 +1,16 @@
-from torch.utils.data import DataLoader
-import pandas as pd
 from typing import Callable
-
-from torch.utils.data import SubsetRandomSampler
-from sklearn.model_selection import KFold
-import pytorch_lightning as pl
-from pytorch_lightning import callbacks as cb
-from models.utils import LabelWeightedBCELoss
-from models.audio_spectrogram_transformer import (
-    train as train_audio_spectrogram_transformer,
-    get_id_label_mapping,
-)
-from preprocessing.dataset import SongDataset, WaveformTrainingEnvironment
-from preprocessing.preprocess import get_examples
-from models.residual import ResidualDancer, TrainingEnvironment
-from models.decision_tree import DanceTreeClassifier, features_from_path
+import importlib
 import yaml
-from preprocessing.dataset import (
-    DanceDataModule,
-    WaveformSongDataset,
-    HuggingFaceWaveformSongDataset,
-)
-from torch.utils.data import random_split
-import numpy as np
-from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
 from argparse import ArgumentParser
+import os

-
-import torch
-from torch import nn
-from sklearn.utils.class_weight import compute_class_weight
+ROOT_DIR = os.path.basename(os.path.dirname(__file__))


 def get_training_fn(id: str) -> Callable:
-    match id:
-        case "ast_hf":
-            return train_ast
-        case "residual_dancer":
-            return train_model
-        case "decision_tree":
-            return train_decision_tree
-        case _:
-            raise Exception(f"Couldn't find a training function for '{id}'.")
+    module_name, fn_name = id.rsplit(".", 1)
+    module = importlib.import_module("models." + module_name, ROOT_DIR)
+    return getattr(module, fn_name)


 def get_config(filepath: str) -> dict:
@@ -52,141 +19,6 @@ def get_config(filepath: str) -> dict:
     return config


-def cross_validation(config, k=5):
-    df = pd.read_csv("data/songs.csv")
-    g_config = config["global"]
-    batch_size = config["data_module"]["batch_size"]
-    x, y = get_examples(df, "data/samples", class_list=g_config["dance_ids"])
-    dataset = SongDataset(x, y)
-    splits = KFold(n_splits=k, shuffle=True, random_state=g_config["seed"])
-    trainer = pl.Trainer(accelerator=g_config["device"])
-    for fold, (train_idx, val_idx) in enumerate(splits.split(x, y)):
-        print(f"Fold {fold+1}")
-        model = ResidualDancer(n_classes=len(g_config["dance_ids"]))
-        train_env = TrainingEnvironment(model, nn.BCELoss())
-        train_sampler = SubsetRandomSampler(train_idx)
-        test_sampler = SubsetRandomSampler(val_idx)
-        train_loader = DataLoader(dataset, batch_size=batch_size, sampler=train_sampler)
-        test_loader = DataLoader(dataset, batch_size=batch_size, sampler=test_sampler)
-        trainer.fit(train_env, train_loader)
-        trainer.test(train_env, test_loader)
-
-
-def train_model(config: dict):
-    TARGET_CLASSES = config["global"]["dance_ids"]
-    DEVICE = config["global"]["device"]
-    SEED = config["global"]["seed"]
-    pl.seed_everything(SEED, workers=True)
-    data = DanceDataModule(target_classes=TARGET_CLASSES, **config["data_module"])
-    model = ResidualDancer(n_classes=len(TARGET_CLASSES), **config["model"])
-    label_weights = data.get_label_weights().to(DEVICE)
-    criterion = LabelWeightedBCELoss(
-        label_weights
-    )  # nn.CrossEntropyLoss(label_weights)
-    train_env = TrainingEnvironment(model, criterion, config)
-    callbacks = [
-        # cb.LearningRateFinder(update_attr=True),
-        cb.EarlyStopping("val/loss", patience=5),
-        cb.StochasticWeightAveraging(1e-2),
-        cb.RichProgressBar(),
-        cb.DeviceStatsMonitor(),
-    ]
-    trainer = pl.Trainer(callbacks=callbacks, **config["trainer"])
-    trainer.fit(train_env, datamodule=data)
-    trainer.test(train_env, datamodule=data)
-
-
-def train_ast(config: dict):
-    TARGET_CLASSES = config["global"]["dance_ids"]
-    DEVICE = config["global"]["device"]
-    SEED = config["global"]["seed"]
-    dataset_kwargs = config["data_module"]["dataset_kwargs"]
-    test_proportion = config["data_module"].get("test_proportion", 0.2)
-    train_proportion = 1.0 - test_proportion
-    song_data_path = "data/songs_cleaned.csv"
-    song_audio_path = "data/samples"
-    pl.seed_everything(SEED, workers=True)
-
-    df = pd.read_csv(song_data_path)
-    x, y = get_examples(
-        df, song_audio_path, class_list=TARGET_CLASSES, multi_label=True
-    )
-    train_i, test_i = random_split(
-        np.arange(len(x)), [train_proportion, test_proportion]
-    )
-    train_ds = HuggingFaceWaveformSongDataset(
-        x[train_i], y[train_i], **dataset_kwargs, resample_frequency=16000
-    )
-    test_ds = HuggingFaceWaveformSongDataset(
-        x[test_i], y[test_i], **dataset_kwargs, resample_frequency=16000
-    )
-    train_audio_spectrogram_transformer(
-        TARGET_CLASSES, train_ds, test_ds, device=DEVICE
-    )
-
-
-def train_ast_lightning(config: dict):
-    """
-    work on integration between waveform dataset and environment. Should work for both HF and PTL.
-    """
-    TARGET_CLASSES = config["global"]["dance_ids"]
-    DEVICE = config["global"]["device"]
-    SEED = config["global"]["seed"]
-    pl.seed_everything(SEED, workers=True)
-    data = DanceDataModule(
-        target_classes=TARGET_CLASSES,
-        dataset_cls=WaveformSongDataset,
-        **config["data_module"],
-    )
-    id2label, label2id = get_id_label_mapping(TARGET_CLASSES)
-    model_checkpoint = "MIT/ast-finetuned-audioset-10-10-0.4593"
-    feature_extractor = AutoFeatureExtractor.from_pretrained(model_checkpoint)
-
-    model = AutoModelForAudioClassification.from_pretrained(
-        model_checkpoint,
-        num_labels=len(label2id),
-        label2id=label2id,
-        id2label=id2label,
-        ignore_mismatched_sizes=True,
-    ).to(DEVICE)
-    label_weights = data.get_label_weights().to(DEVICE)
-    criterion = LabelWeightedBCELoss(
-        label_weights
-    )  # nn.CrossEntropyLoss(label_weights)
-    train_env = WaveformTrainingEnvironment(model, criterion, feature_extractor, config)
-    callbacks = [
-        # cb.LearningRateFinder(update_attr=True),
-        cb.EarlyStopping("val/loss", patience=5),
-        cb.StochasticWeightAveraging(1e-2),
-        cb.RichProgressBar(),
-    ]
-    trainer = pl.Trainer(callbacks=callbacks, **config["trainer"])
-    trainer.fit(train_env, datamodule=data)
-    trainer.test(train_env, datamodule=data)
-
-
-def train_decision_tree(config: dict):
-    TARGET_CLASSES = config["global"]["dance_ids"]
-    DEVICE = config["global"]["device"]
-    SEED = config["global"]["seed"]
-    song_data_path = config["data_module"]["song_data_path"]
-    song_audio_path = config["data_module"]["song_audio_path"]
-    pl.seed_everything(SEED, workers=True)
-
-    df = pd.read_csv(song_data_path)
-    x, y = get_examples(
-        df, song_audio_path, class_list=TARGET_CLASSES, multi_label=True
-    )
-    # Convert y back to string classes
-    y = np.array(TARGET_CLASSES)[y.argmax(-1)]
-    train_i, test_i = random_split(np.arange(len(x)), [0.8, 0.2])
-    train_paths, train_y = x[train_i], y[train_i]
-    train_x = features_from_path(train_paths)
-    model = DanceTreeClassifier(device=DEVICE)
-    model.fit(train_x, train_y)
-    model.save()
-
-
 if __name__ == "__main__":
     parser = ArgumentParser(
         description="Trains models on the dance dataset and saves weights."
@@ -198,6 +30,7 @@ if __name__ == "__main__":
     )
     args = parser.parse_args()
     config = get_config(args.config)
+    training_fn_path = config["training_fn"]
+    print(f"Config: {args.config}\nTrainer Id: {training_fn_path}")
+    train = get_training_fn(training_fn_path)
     train(config)
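Note: train.py no longer hard-codes a match statement; it resolves the config's `training_fn` id to a function inside the models package at runtime. The sketch below mirrors that dispatch; the id "decision_tree.train_decision_tree" is only an illustrative example, since the actual training-function names live in the model modules touched by this commit and are not shown here.

import importlib
from typing import Callable


def get_training_fn(id: str) -> Callable:
    # "module.function" relative to the models package, e.g. from the YAML config's
    # training_fn field (the package-relative ROOT_DIR argument is omitted here).
    module_name, fn_name = id.rsplit(".", 1)
    module = importlib.import_module("models." + module_name)
    return getattr(module, fn_name)


# With a config like {"training_fn": "decision_tree.train_decision_tree", ...},
# train.py effectively runs:
#     train = get_training_fn(config["training_fn"])
#     train(config)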