import numpy as np
import torch
import torch.nn as nn

config = {
    "learning_rate": 1e-4,
    "batch_size": 32,
    "vocab_size": 30522,
    "max_len": 256,
    "hidden_size": 768,
    "dropout": 0.1,
    "n_layer": 12,
    "n_head": 12,
    "ff_expansion_factor": 4,
    "rnn_units": 768,
    "num_labels": 5
}

class MyClass:
    def __init__(self, value):
        self.value = value

def custom_initializer(shape):
    # Sample initial weights from N(0, 0.02), a common choice for transformer embeddings.
    return torch.normal(mean=0.0, std=0.02, size=shape)

class CustomEmbedding(nn.Module):
    """Token embedding whose weight matrix is initialized by custom_initializer."""

    def __init__(self, vocab_size, hidden_size):
        super(CustomEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_size,
                                      _weight=custom_initializer((vocab_size, hidden_size)))

    def forward(self, inputs):
        return self.embedding(inputs)

class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding, stored batch-first as (1, max_len, n_embd)."""

    def __init__(self, n_embd, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.n_embd = n_embd
        self.max_len = max_len

        pe = torch.zeros(max_len, n_embd)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, n_embd, 2).float() * -(np.log(10000.0) / n_embd))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # (1, max_len, n_embd) so it broadcasts over the batch dimension
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x: (batch, seq_len, n_embd); add the encodings for the first seq_len positions.
        return x + self.pe[:, :x.size(1), :]

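# Illustrative shape check for the embedding + positional-encoding pipeline; this helper is not
# part of the original script, and the batch/sequence sizes are arbitrary.
def _check_embedding_shapes(cfg=config, batch_size=2, seq_len=16):
    emb = CustomEmbedding(cfg['vocab_size'], cfg['hidden_size'])
    pos = PositionalEncoding(cfg['hidden_size'], cfg['max_len'])
    tokens = torch.randint(0, cfg['vocab_size'], (batch_size, seq_len))
    out = pos(emb(tokens))
    assert out.shape == (batch_size, seq_len, cfg['hidden_size'])
    return out.shape
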
class MultiheadAttention(nn.Module):
    def __init__(self, config):
        super(MultiheadAttention, self).__init__()
        # batch_first=True so inputs are (batch, seq_len, hidden_size), matching the rest of the model.
        self.attention = nn.MultiheadAttention(config['hidden_size'], config['n_head'],
                                               dropout=config['dropout'], batch_first=True)

    def forward(self, v, k, q, mask=None):
        attn_output, attn_output_weights = self.attention(q, k, v, attn_mask=mask)
        return attn_output

class FeedForward(nn.Module):
    def __init__(self, config):
        super(FeedForward, self).__init__()
        self.dense1 = nn.Linear(config['hidden_size'], config['hidden_size'] * config['ff_expansion_factor'])
        self.dense2 = nn.Linear(config['hidden_size'] * config['ff_expansion_factor'], config['hidden_size'])
        self.dropout = nn.Dropout(config['dropout'])

    def forward(self, x):
        x = torch.nn.functional.gelu(self.dense1(x))
        x = self.dropout(x)
        return self.dense2(x)

class TransformerXLBlock(nn.Module):
    def __init__(self, config):
        super(TransformerXLBlock, self).__init__()
        self.attn = MultiheadAttention(config)
        self.ff = FeedForward(config)
        self.ln1 = nn.LayerNorm(config['hidden_size'])
        self.ln2 = nn.LayerNorm(config['hidden_size'])

    def forward(self, x, mask=None):
        attn_out = self.attn(v=x, k=x, q=x, mask=mask)
        out1 = self.ln1(x + attn_out)
        ff_out = self.ff(out1)
        return self.ln2(out1 + ff_out)

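# Illustrative check (not in the original script): a TransformerXLBlock should preserve the
# (batch, seq_len, hidden_size) shape, since attention, feed-forward, and layer norm are all
# shape-preserving. Sizes below are arbitrary.
def _check_block_shape(cfg=config, batch_size=2, seq_len=16):
    block = TransformerXLBlock(cfg)
    x = torch.randn(batch_size, seq_len, cfg['hidden_size'])
    y = block(x)
    assert y.shape == x.shape
    return y.shape
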
class JudgeXL(nn.Module):
    def __init__(self, config):
        super(JudgeXL, self).__init__()
        self.token_embedding = CustomEmbedding(config['vocab_size'], config['hidden_size'])
        self.pos_encoding = PositionalEncoding(config['hidden_size'], config['max_len'])
        self.transformer_blocks = nn.ModuleList([TransformerXLBlock(config) for _ in range(config['n_layer'])])
        self.ln_f = nn.LayerNorm(config['hidden_size'])
        self.rnn = nn.LSTM(config['hidden_size'], config['rnn_units'], num_layers=2,
                           dropout=config['dropout'], bidirectional=True, batch_first=True)
        # The bidirectional LSTM doubles the feature dimension, hence rnn_units * 2.
        self.fc = nn.Linear(config['rnn_units'] * 2, config['vocab_size'])

    def forward(self, x, mask=None):
        x = self.token_embedding(x)
        x = self.pos_encoding(x)
        for block in self.transformer_blocks:
            x = block(x, mask=mask)
        x = self.ln_f(x)
        x, _ = self.rnn(x)
        x = self.fc(x)
        return x  # (batch, seq_len, vocab_size) logits

    def generate(self, prompt, max_len=100):
        # Greedy decoding. Assumes a Hugging Face-style tokenizer has been attached as
        # self.tokenizer (it is not created in __init__) and that `device` is defined.
        self.eval()
        input_ids = self.tokenizer(prompt, return_tensors='pt').input_ids.to(device)
        generated = input_ids
        with torch.no_grad():
            for _ in range(max_len):
                outputs = self.forward(generated)
                next_token_logits = outputs[:, -1, :]  # logits for the last position only
                next_token_id = torch.argmax(next_token_logits, dim=-1, keepdim=True)
                generated = torch.cat((generated, next_token_id), dim=1)
                if next_token_id.item() == self.tokenizer.sep_token_id:
                    break
        generated_text = self.tokenizer.decode(generated[0], skip_special_tokens=True)
        return generated_text

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = JudgeXL(config)
# The checkpoint appears to have been saved as a full pickled model (hence weights_only=False),
# so torch.load returns a JudgeXL instance that replaces the freshly constructed one above.
model = torch.load('C:/AIstuffing/Judge_XL-LLM/xl-llm_weights/judgeXL-LLm_wiki.pth', weights_only=False)
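
# Usage sketch (not part of the original script): generate() expects a tokenizer attached as
# model.tokenizer. The vocab_size of 30522 suggests a BERT-style vocabulary, so a
# bert-base-uncased tokenizer from the `transformers` package is assumed here; swap in whatever
# tokenizer the checkpoint was actually trained with. The prompt is arbitrary.
from transformers import AutoTokenizer

model.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = model.to(device)
print(model.generate("The judge ruled that", max_len=50))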