TuringsSolutions posted an update 1 day ago
Hyperdimensional Computing + Neural Network, tell your friends. To my knowledge, this is a completely novel implementation of HDC + neural networks, and it would be a direct competitor to Transformers. It is dramatically more computationally efficient than Transformers could ever hope to be (which is why I tested it in the first place), and it is far closer to biological processes. My testing so far shows that it works surprisingly well. One surprise from my testing: adding an attention mechanism to the model does almost nothing, maybe a 1% performance increase. Weirdest thing. I guess Attention Is Not All You Need?


I made a Github repository for my Hyperdimensional Computing Neural Network: https://github.com/RichardAragon/HyperDimensionalComputingNeuralNetwork


I made a YouTube video showcasing the model and some of my experiments with it: https://youtu.be/Eg51o519zVM

@teknium @Sephfox @rombodawg @MrDragonFox @numen-tech @hf-staff 🤔

Could this replace transformers in the future? 😋
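
For anyone new to hyperdimensional computing, here is a minimal, self-contained sketch (illustrative only, not code from the repo) of the two primitives the script in the reply below builds on: random bipolar hypervectors and majority-vote bundling. The key property is that a bundle stays measurably similar to each of its members while staying near-orthogonal to everything else, which is what lets one fixed-size vector stand in for a whole token sequence:

import numpy as np

dim = 10000
rng = np.random.default_rng(0)

# Random bipolar hypervectors for three tokens
cat = rng.choice([-1, 1], size=dim)
dog = rng.choice([-1, 1], size=dim)
car = rng.choice([-1, 1], size=dim)

# Bundling (element-wise majority vote) gives one fixed-size vector
# that stands in for the whole set
sentence = np.sign(cat + dog)

def cosine(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

# The bundle stays similar to its members, near-orthogonal to the rest
print(cosine(sentence, cat))  # ~0.7
print(cosine(sentence, dog))  # ~0.7
print(cosine(sentence, car))  # ~0.0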

·

I wrote a script to pretrain a model with this architecture using an Alpaca-formatted dataset like my dataset below. It takes way too much RAM for me to run, though (a memory-lighter variant is sketched after the script).

https://huggingface.co/datasets/Replete-AI/Everything_Instruct
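
For reference, the script below assumes each line of the file is one JSON object with Alpaca-style keys (this example record is made up, just showing the shape):

{"instruction": "Translate to French.", "input": "Hello, world.", "output": "Bonjour, le monde."}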

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm import tqdm
import numpy as np
import json

# Hyperdimensional Computing primitives: random bipolar hypervectors,
# element-wise binding, and majority-vote bundling
class HDComputing:
    def __init__(self, dim):
        self.dim = dim

    def random_hv(self):
        # Random bipolar hypervector with entries drawn from {-1, +1}
        return np.random.choice([-1, 1], size=self.dim)

    def bind(self, hv1, hv2):
        # Binding: element-wise multiplication associates two hypervectors
        return hv1 * hv2

    def bundle(self, hvs):
        # Bundling: element-wise majority vote over a list of hypervectors
        return np.sign(np.sum(hvs, axis=0))

# HDCNN model: a small MLP head that classifies a bundled hypervector
class HDCNNClassifier(nn.Module):
    def __init__(self, dim, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(dim, 512)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, x):
        out = self.fc1(x)
        out = self.activation(out)
        out = self.dropout(out)
        out = self.fc2(out)
        return out
    
class CustomDataset(Dataset):
    def __init__(self, data, hd_computer, max_seq_len):
        self.data = data
        self.hd_computer = hd_computer
        self.max_seq_len = max_seq_len
        self.vocab = self.build_vocab()
        # One dim-sized hypervector per vocabulary token; on a large
        # corpus this table is the main source of the RAM blow-up
        self.token_hvs = {token: self.hd_computer.random_hv() for token in self.vocab}

    def build_vocab(self):
        vocab = set()
        for item in self.data:
            text = f"{item['instruction']} {item['input']} {item['output']}"
            vocab.update(text.split())
        return list(vocab)

    def encode_text(self, text):
        tokens = text.split()[:self.max_seq_len]
        if not tokens:
            # Guard against empty text: bundling an empty list would fail
            return np.zeros(self.hd_computer.dim)
        # Every token is in the vocab by construction, so look it up
        # directly; .get() with a default would eagerly generate a fresh
        # random hypervector on every single call
        hvs = [self.token_hvs[token] for token in tokens]
        return self.hd_computer.bundle(hvs)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        text = f"{item['instruction']} {item['input']} {item['output']}"
        encoded = self.encode_text(text)
        return torch.tensor(encoded, dtype=torch.float32), 0  # Placeholder label

# Progress Bar
class ProgressBar:
    def __init__(self, total_steps):
        self.pbar = tqdm(total=total_steps, desc="Training Progress", unit="step")

    def update(self):
        self.pbar.update(1)

    def close(self):
        self.pbar.close()

if __name__ == "__main__":
    # Load the dataset (assumes one JSON object per line, i.e. JSONL)
    dataset_path = "E:/DATASETS/Everything-Instruct.json"
    dataset = []
    with open(dataset_path, 'r', encoding='utf-8') as f:
        for line in f:
            dataset.append(json.loads(line.strip()))

    # Initialize HD Computing
    hd_dim = 5000
    hd_computer = HDComputing(hd_dim)

    # Create custom dataset
    custom_dataset = CustomDataset(dataset, hd_computer, max_seq_len=8192)

    # Create data loader
    batch_size = 32
    dataloader = DataLoader(custom_dataset, batch_size=batch_size, shuffle=True)

    # Initialize model
    num_classes = 1  # Adjust based on your task
    model = HDCNNClassifier(hd_dim, num_classes)

    # Training setup
    num_epochs = 5
    learning_rate = 2e-4
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()  # Adjust based on your task

    # Calculate total steps for the progress bar (len(dataloader) also
    # counts the final partial batch, unlike integer division)
    total_steps = len(dataloader) * num_epochs
    progress_bar = ProgressBar(total_steps)

    # Training loop
    model.train()
    for epoch in range(num_epochs):
        for batch in dataloader:
            inputs, labels = batch
            outputs = model(inputs)
            loss = criterion(outputs, labels.float().unsqueeze(1))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            progress_bar.update()

    progress_bar.close()

    # Save the model
    torch.save(model.state_dict(), "E:/models/HD_model.pth")

    print("Training completed and model saved.")