Upload 4 files
- base_model.pth +3 -0
- model.py +199 -0
- requirements.txt +4 -0
- utils.py +121 -0
base_model.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:065d18e5699492bb121d4011df95c85bf5505eed62225aa843db7c558258b9e2
size 201382377
model.py
ADDED
@@ -0,0 +1,199 @@
import torch
import torch.nn as nn
from torch.nn import functional as F
from utils import DEVICE


class AttentionHead(nn.Module):
    """
    One head of the self-attention layer
    """

    def __init__(self, head_size, num_embed, block_size, dropout):
        super().__init__()
        self.key = nn.Linear(num_embed, head_size, bias=False)
        self.query = nn.Linear(num_embed, head_size, bias=False)
        self.value = nn.Linear(num_embed, head_size, bias=False)
        # tril is a lower triangular matrix. It is not a parameter
        # of the model, so we assign it to the module using register_buffer
        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))

        # let's also add dropout
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)
        # compute attention scores
        # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = q @ k.transpose(-2, -1) * C**-0.5
        # the tril (lower triangular) matrix is used to mask
        # future positions (setting them to -inf) so that the
        # decoder "learns" to predict the next word
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float("-inf"))  # (B,T,T)
        wei = F.softmax(wei, dim=-1)  # (B,T,T)
        wei = self.dropout(wei)
        # weighted aggregation of the values
        v = self.value(x)
        out = wei @ v  # (B,T,T) @ (B,T,C) ---> (B,T,C)
        return out


class MultiHeadAttention(nn.Module):
    """
    Multiple heads of self-attention in parallel
    """

    def __init__(self, num_heads, head_size, num_embed, block_size, dropout):
        super().__init__()
        self.heads = nn.ModuleList(
            [
                AttentionHead(
                    head_size=head_size,
                    num_embed=num_embed,
                    block_size=block_size,
                    dropout=dropout,
                )
                for _ in range(num_heads)
            ]
        )
        self.proj = nn.Linear(num_embed, num_embed)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # concatenate the outputs of the individual heads
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        # apply the linear projection layer
        out = self.dropout(self.proj(out))
        return out


class FeedForward(nn.Module):
    """
    A simple linear layer followed by ReLU
    """

    def __init__(self, num_embed, dropout):
        super().__init__()
        self.net = nn.Sequential(
            # in the "Attention Is All You Need" paper the authors
            # use a feed-forward layer of size 2048 while the model
            # dimension is 512, so we apply the same factor of 4
            nn.Linear(num_embed, 4 * num_embed),
            nn.ReLU(),
            # apply the linear projection layer
            nn.Linear(4 * num_embed, num_embed),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)


class TransformerBlock(nn.Module):
    """
    This class groups together MultiHeadAttention and the
    FeedForward NN, so that we can replicate it inside the Transformer
    """

    def __init__(self, num_heads, block_size, num_embed, dropout):
        super().__init__()
        head_size = num_embed // num_heads
        self.sa = MultiHeadAttention(
            num_heads=num_heads,
            head_size=head_size,
            num_embed=num_embed,
            block_size=block_size,
            dropout=dropout,
        )
        self.ffwd = FeedForward(num_embed=num_embed, dropout=dropout)
        # add the layer normalization
        self.ln1 = nn.LayerNorm(num_embed)
        self.ln2 = nn.LayerNorm(num_embed)

    def forward(self, x):
        # "x +" is the skip (or residual) connection;
        # it helps with optimization.
        # We also apply layer normalization before self-attention
        # and feed-forward (a reshuffle from the original paper)
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x


class Transformer(nn.Module):
    def __init__(self, **kwargs):
        super().__init__()
        # a simple lookup table that stores embeddings of a fixed dictionary and size
        # each token directly reads off the logits for the next token from a lookup table
        # see more: https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html
        self.vocab_size = kwargs.get("vocab_size", 100)
        self.num_embed = kwargs.get("num_embed", 32)
        self.block_size = kwargs.get("block_size", 8)
        self.num_heads = kwargs.get("num_heads", 4)
        self.num_layers = kwargs.get("num_layers", 4)
        self.dropout = kwargs.get("dropout", 0.2)
        # each token reads the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(self.vocab_size, self.num_embed)
        # each position from 0 to block_size-1 gets its own embedding
        self.position_embedding_table = nn.Embedding(self.block_size, self.num_embed)
        self.blocks = nn.Sequential(
            *[
                TransformerBlock(
                    num_heads=self.num_heads,
                    block_size=self.block_size,
                    num_embed=self.num_embed,
                    dropout=self.dropout,
                )
                for _ in range(self.num_layers)
            ]
        )
        # we add the layer norm before the Linear layer
        self.ln_f = nn.LayerNorm(self.num_embed)
        self.lm_head = nn.Linear(self.num_embed, self.vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        # idx and targets are (B,T) tensors of integers
        # the token_emb is (B, T, C), C = NUM_EMBED
        token_emb = self.token_embedding_table(idx)
        # (T, C)
        posit_emb = self.position_embedding_table(torch.arange(T, device=DEVICE))

        x = token_emb + posit_emb
        # apply the stack of transformer blocks
        x = self.blocks(x)
        # (B, T, vocab_size)
        logits = self.lm_head(x)
        # compute the loss
        if targets is not None:
            # cross_entropy accepts inputs of shape (batch_size, num_classes),
            # so we need to reshape our logits to
            # (batch_size * time, dim_vocabulary), time = block_size
            B, T, C = logits.shape
            logits = torch.reshape(logits, (B * T, C))
            targets = torch.reshape(targets, (B * T,))
            loss = F.cross_entropy(logits, targets)
        else:
            loss = None
        return logits, loss

    def generate(self, idx: torch.Tensor, max_new_tokens: int, block_size: int):
        # idx is a (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop the context to the last block_size tokens
            # because tokens don't communicate between blocks
            idx_crop = idx[:, -block_size:]
            # get the predictions
            logits, loss = self.forward(idx_crop)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution with probabilities probs
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append the sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx
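For orientation, here is a minimal usage sketch (not part of the uploaded files) showing how the Transformer class above could be instantiated and sampled from, using the hyperparameter constants defined in utils.py below. The vocab_size of 100 is an illustrative placeholder, not the vocabulary size used to train base_model.pth.

import torch
from model import Transformer
from utils import DEVICE, BLOCK_SIZE, NUM_EMBED, NUM_HEAD, NUM_LAYER, DROPOUT

# build a model from the constants in utils.py; vocab_size=100 is a placeholder
model = Transformer(
    vocab_size=100,
    num_embed=NUM_EMBED,
    block_size=BLOCK_SIZE,
    num_heads=NUM_HEAD,
    num_layers=NUM_LAYER,
    dropout=DROPOUT,
).to(DEVICE)

# start from a single (B=1, T=1) context containing token id 0
context = torch.zeros((1, 1), dtype=torch.long, device=DEVICE)
out = model.generate(context, max_new_tokens=20, block_size=BLOCK_SIZE)
print(out.shape)  # torch.Size([1, 21]): the starting token plus 20 sampled tokens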
requirements.txt
ADDED
@@ -0,0 +1,4 @@
# python>=3.9 is the recommended Python version (not installable via pip; listed here for reference only)
torch>=1.13.1
transformers>=4.25.1
numpy
utils.py
ADDED
@@ -0,0 +1,121 @@
import os
import torch
from datetime import datetime
from typing import Any

# hyperparameters
BATCH_SIZE = 32  # how many independent sequences will we process in parallel?
BLOCK_SIZE = 64  # what is the maximum context length for predictions?
MAX_ITER = 500  # number of training iterations
EVAL_INTER = 1
LEARNING_RATE = 3e-4
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
NUM_HEAD = 6
NUM_EMBED = NUM_HEAD * 128
NUM_LAYER = 6
DROPOUT = 0.2


def encode(text_seq: str, tokenizer: Any) -> torch.Tensor:
    """
    Function to encode input text using a pre-trained tokenizer and vectorized lookups
    """
    # tokenize the input text
    tokens = tokenizer.tokenize(text_seq)
    # convert the tokens to their corresponding ids
    token_indices = tokenizer.convert_tokens_to_ids(tokens)
    token_indices = torch.tensor(token_indices, dtype=torch.long)
    return token_indices


def decode(enc_sec: torch.Tensor, tokenizer: Any) -> str:
    """
    Function to decode a sequence of token indices back to a string
    """
    # convert the indices to a list
    enc_sec = enc_sec.tolist()
    # decode the indices to a string
    text = tokenizer.decode(enc_sec)
    return text


def get_batch(data: torch.Tensor, block_size: int, batch_size: int):
    """
    This is a simple function to create batches of data.
    GPUs allow for parallel processing, so we can feed multiple chunks at once;
    that is why we need batches - how many independent sequences
    we process in parallel.

    Parameters:
    data (torch.Tensor): 1-D tensor of token ids to take a batch from
    block_size (int): size of the text that is processed at once
    batch_size (int): number of sequences to process in parallel

    Returns:
    x, y: a tuple with the token sequences and the token targets
    """
    ix = torch.randint(len(data) - block_size, (batch_size,))
    # we stack batch_size rows of sentences,
    # so x and y are matrices with rows_num=batch_size
    # and col_num=block_size
    x = torch.stack([data[i : i + block_size] for i in ix])
    # y is x shifted one position right - because we predict
    # the next word in y having all the previous words as context
    y = torch.stack([data[i + 1 : i + block_size + 1] for i in ix])
    x, y = x.to(DEVICE), y.to(DEVICE)
    return x, y


@torch.no_grad()
def estimate_loss(
    data: torch.Tensor,
    model: torch.nn.Module,
    block_size: int,
    batch_size: int,
    eval_iters: int = 10,
):
    model.eval()
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
        X, Y = get_batch(data=data, block_size=block_size, batch_size=batch_size)
        logits, loss = model.forward(X, Y)
        losses[k] = loss.item()
    out = losses.mean()
    model.train()
    return out


def load_model_from_checkpoint(
    model_class: torch.nn.Module,
    path_to_checkpoint: str = "checkpoints/state_dict_model.pt",
    **kwargs: dict,
) -> torch.nn.Module:
    try:
        # map_location ensures a GPU-trained checkpoint can also be loaded on CPU
        state_dict = torch.load(path_to_checkpoint, map_location=DEVICE)
        print("Successfully loaded model from the checkpoint")
    except Exception as e:
        print(f"Error loading the model from the checkpoint. {e}")
        raise

    model = model_class(**kwargs)
    # load the state_dict into the model
    model.load_state_dict(state_dict)
    return model


def save_model_to_chekpoint(
    model: torch.nn.Module, path_to_checkpoint: str = "checkpoints", epoch: int = 0
):
    # check if the path exists, otherwise create it
    if not os.path.exists(path_to_checkpoint):
        os.makedirs(path_to_checkpoint)

    # datetime object containing current date and time
    now = datetime.now()
    # dd.mm.YYYY_H:M:S
    dt_string = now.strftime("%d.%m.%Y_%H:%M:%S")
    checkpoint_name = "checkpoint_epoch-" + str(epoch) + "_" + dt_string + ".pt"
    full_path = os.path.join(path_to_checkpoint, checkpoint_name)
    try:
        torch.save(model.state_dict(), full_path)
        print("Successfully saved the model to {}".format(full_path))
    except Exception as e:
        print(f"Error saving the model to checkpoint. {e}")
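This upload does not include a training script, so as a rough sketch of how the pieces in model.py and utils.py are meant to fit together, the loop below trains a freshly initialized Transformer on a placeholder corpus of random token ids. The random train_data tensor and the vocab_size of 100 are illustrative assumptions only; in practice the corpus would come from encode() with a real tokenizer, and vocab_size would match that tokenizer.

import torch
from model import Transformer
from utils import (
    BATCH_SIZE, BLOCK_SIZE, MAX_ITER, LEARNING_RATE, DEVICE,
    NUM_EMBED, NUM_HEAD, NUM_LAYER, DROPOUT,
    get_batch, estimate_loss, save_model_to_chekpoint,
)

# placeholder corpus: a 1-D LongTensor of token ids (normally produced by encode())
train_data = torch.randint(0, 100, (10_000,))

model = Transformer(
    vocab_size=100,  # placeholder; use the tokenizer's vocabulary size in practice
    num_embed=NUM_EMBED,
    block_size=BLOCK_SIZE,
    num_heads=NUM_HEAD,
    num_layers=NUM_LAYER,
    dropout=DROPOUT,
).to(DEVICE)
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

for step in range(MAX_ITER):
    # sample a batch of (context, target) sequences and take one optimizer step
    xb, yb = get_batch(data=train_data, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE)
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# average loss over a few held-in batches, then write a timestamped checkpoint
print("eval loss:", estimate_loss(data=train_data, model=model, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE))
save_model_to_chekpoint(model=model, path_to_checkpoint="checkpoints", epoch=0)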