satyanayak committed
Commit 4eeaa77 · 1 Parent(s): d4e4236

first deploy

Files changed (3)
  1. app.py +68 -0
  2. model.py +134 -0
  3. requirement.txt +3 -0
app.py ADDED
@@ -0,0 +1,68 @@
+ import gradio as gr
+ import torch
+ from transformers import AutoTokenizer
+ from model import LlamaForCausalLM  # custom model definition in model.py
+
+ # Load the tokenizer and make sure a pad token exists
+ tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/cosmo2-tokenizer")
+ if tokenizer.pad_token is None:
+     tokenizer.pad_token = tokenizer.eos_token if tokenizer.eos_token else "[PAD]"
+
+ # Initialize model with reduced parameters (135M config)
+ model = LlamaForCausalLM(
+     vocab_size=tokenizer.vocab_size,
+     dim=576,
+     num_layers=22,
+     hidden_dim=1280,
+     num_heads=8
+ )
+
+ # Load trained weights
+ state_dict = torch.hub.load_state_dict_from_url(
+     "https://huggingface.co/satyanayak/custom-smallmv2135/resolve/main/model-dict-step-5500.pt",
+     map_location="cpu"
+ )
+ model.load_state_dict(state_dict)
+ model.eval()
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ model.to(device)
+
+ def generate_text(prompt, max_length=100, temperature=0.7, top_k=50):
+     # Gradio sliders may deliver floats; cast the integer-valued arguments
+     max_length, top_k = int(max_length), int(top_k)
+     input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
+
+     with torch.no_grad():
+         for _ in range(max_length):
+             outputs = model(input_ids)
+             next_token_logits = outputs[:, -1, :] / temperature
+
+             # Apply top-k sampling
+             top_k_logits, top_k_indices = torch.topk(next_token_logits, top_k, dim=-1)
+             probs = torch.softmax(top_k_logits, dim=-1)
+
+             # Sample from the truncated distribution
+             next_token_idx = torch.multinomial(probs, num_samples=1)
+             next_token = top_k_indices[0, next_token_idx[0]]
+
+             if next_token.item() == tokenizer.eos_token_id:
+                 break
+
+             input_ids = torch.cat([input_ids, next_token.unsqueeze(0)], dim=1)
+
+     return tokenizer.decode(input_ids[0], skip_special_tokens=True)
+
+ # Gradio interface
+ demo = gr.Interface(
+     fn=generate_text,
+     inputs=[
+         gr.Textbox(label="Input Prompt", lines=3),
+         gr.Slider(50, 200, value=100, step=1, label="Max Length"),
+         gr.Slider(0.1, 2.0, value=0.7, label="Temperature"),
+         gr.Slider(10, 100, value=50, step=1, label="Top-k")
+     ],
+     outputs=gr.Textbox(label="Generated Text", lines=5),
+     title="🦙 Custom SmolLLM Demo",
+     description="A 135M-parameter language model trained on the smollm-corpus"
+ )
+
+ if __name__ == "__main__":
+     demo.launch()
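A quick way to exercise the sampling loop above without the UI is to call generate_text directly. The snippet below is a minimal sketch and not part of the commit: it assumes app.py and model.py sit side by side and that the checkpoint URL is reachable, since importing app runs the module-level tokenizer load, model construction, and weight download.

# Hypothetical smoke test (assumes the checkpoint download in app.py succeeds)
from app import generate_text

sample = generate_text("Once upon a time", max_length=50, temperature=0.8, top_k=40)
print(sample)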
model.py ADDED
@@ -0,0 +1,134 @@
+ import torch
+ import torch.nn as nn
+ import math
+
+ # RMSNorm: normalize each token vector by its root-mean-square, with a small eps to avoid division by zero
+ class LlamaRMSNorm(nn.Module):
+     def __init__(self, hidden_size, eps=1e-5):  # hidden_size is the embedding dimension
+         super().__init__()
+         self.weight = nn.Parameter(torch.ones(hidden_size))  # learnable per-dimension scale
+         self.eps = eps
+
+     def forward(self, x):
+         norm = x.pow(2).mean(-1, keepdim=True).sqrt() + self.eps  # root-mean-square of the features
+         return x / norm * self.weight  # normalize, then rescale by the learned weight
+
+
+ # Rotary position embedding: position information encoded as rotation angles built from fixed inverse frequencies
+ class LlamaRotaryEmbedding(nn.Module):
+     def __init__(self, dim, base=10000, device=None):  # dim is the embedding dimension, base sets the frequency scale
+         super().__init__()
+         inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, device=device).float() / dim))  # inverse frequencies
+         self.register_buffer("inv_freq", inv_freq)  # store as a non-trainable buffer
+
+     def forward(self, x, seq_len):
+         t = torch.arange(seq_len, device=x.device)  # position indices 0..seq_len-1
+         freqs = torch.einsum("i,j->ij", t, self.inv_freq)  # outer product: one angle per (position, frequency)
+         emb = torch.cat((freqs, freqs), dim=-1)  # duplicate so the angles span the full dimension
+         return emb
+
+ class LlamaMLP(nn.Module):
+     def __init__(self, dim, hidden_dim):
+         super().__init__()
+         self.gate_proj = nn.Linear(dim, hidden_dim, bias=False)  # gate projection
+         self.up_proj = nn.Linear(dim, hidden_dim, bias=False)  # up projection
+         self.down_proj = nn.Linear(hidden_dim, dim, bias=False)  # down projection back to the model dimension
+         self.act_fn = nn.SiLU()  # SiLU activation
+
+     def forward(self, x):
+         gated = self.gate_proj(x)  # gate branch
+         hidden = self.up_proj(x)  # up branch
+         return self.down_proj(self.act_fn(gated * hidden))  # gated feed-forward
+
+ class LlamaAttention(nn.Module):
+     def __init__(self, dim, num_heads=8):
+         super().__init__()
+         self.num_heads = num_heads
+         self.head_dim = dim // num_heads
+
+         self.q_proj = nn.Linear(dim, dim, bias=False)
+         self.k_proj = nn.Linear(dim, dim, bias=False)
+         self.v_proj = nn.Linear(dim, dim, bias=False)
+         self.o_proj = nn.Linear(dim, dim, bias=False)
+
+     def forward(self, x):
+         batch_size, seq_len, dim = x.size()  # [batch_size, seq_len, dim], e.g. [4, 128, 576]
+         q = self.q_proj(x)
+         k = self.k_proj(x)
+         v = self.v_proj(x)
+
+         # Split heads
+         q = q.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)  # [batch_size, num_heads, seq_len, head_dim]
+         k = k.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
+         v = v.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
+
+         # Scaled dot-product attention
+         scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)
+         attention = torch.softmax(scores, dim=-1)
+         context = torch.matmul(attention, v)
+
+         # Combine heads
+         context = context.transpose(1, 2).reshape(batch_size, seq_len, dim)
+         return self.o_proj(context)
+
+ class LlamaDecoderLayer(nn.Module):
+     def __init__(self, dim, hidden_dim, num_heads):
+         super().__init__()
+         self.self_attn = LlamaAttention(dim, num_heads)
+         self.mlp = LlamaMLP(dim, hidden_dim)
+         self.input_layernorm = LlamaRMSNorm(dim)
+         self.post_attention_layernorm = LlamaRMSNorm(dim)
+
+     def forward(self, x):
+         # Pre-norm attention block with residual connection
+         residual = x
+         x = self.input_layernorm(x)
+         x = self.self_attn(x)
+         x = x + residual
+
+         # Pre-norm MLP block with residual connection
+         residual = x
+         x = self.post_attention_layernorm(x)
+         x = self.mlp(x)
+         x = x + residual
+         return x
+
+
+ class LlamaModel(nn.Module):
+     def __init__(self, vocab_size, dim, num_layers, hidden_dim, num_heads):
+         super().__init__()
+         self.embed_tokens = nn.Embedding(vocab_size, dim)
+         self.layers = nn.ModuleList([
+             LlamaDecoderLayer(dim, hidden_dim, num_heads) for _ in range(num_layers)
+         ])
+         self.norm = LlamaRMSNorm(dim)
+         self.rotary_emb = LlamaRotaryEmbedding(dim)  # instantiated here but not applied in forward()
+
+     def forward(self, x):
+         x = self.embed_tokens(x)
+         for layer in self.layers:
+             x = layer(x)
+         return self.norm(x)
+
+ class LlamaForCausalLM(nn.Module):
+     def __init__(self, vocab_size, dim, num_layers, hidden_dim, num_heads):
+         super().__init__()
+         self.model = LlamaModel(vocab_size, dim, num_layers, hidden_dim, num_heads)
+         self.lm_head = nn.Linear(dim, vocab_size, bias=False)
+
+     def forward(self, x):
+         x = self.model(x)
+         return self.lm_head(x)
+
+ def get_model(tokenizer):
+     vocab_size = tokenizer.vocab_size  # use the actual tokenizer vocab size
+     return LlamaForCausalLM(
+         vocab_size=vocab_size,
+         dim=576,
+         num_layers=30,
+         hidden_dim=1536,
+         num_heads=8
+     )
+
+ # model = get_model(tokenizer)
+ # print(model)
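As a shape sanity check for the modules above, the sketch below builds a deliberately tiny configuration and runs one forward pass. The dimensions are assumed throwaway values chosen for a fast CPU test, not the deployed 135M config or the get_model() defaults.

import torch
from model import LlamaForCausalLM

# Tiny assumed config: 2 layers, 64-dim embeddings, 4 heads, 1000-token vocab
tiny = LlamaForCausalLM(vocab_size=1000, dim=64, num_layers=2, hidden_dim=128, num_heads=4)
tokens = torch.randint(0, 1000, (2, 16))  # [batch_size=2, seq_len=16]
logits = tiny(tokens)
print(logits.shape)  # expected: torch.Size([2, 16, 1000]) -> [batch, seq_len, vocab_size]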
requirement.txt ADDED
@@ -0,0 +1,3 @@
+ torch>=2.0.0
+ transformers>=4.40.0
+ gradio>=4.0.0