End of training
Browse files
- README.md +48 -0
- config.json +102 -0
- model.safetensors +3 -0
- pyJudgeXL_model.py +122 -0
- tokenizer1.pickle +3 -0
- training_args.bin +3 -0
README.md
ADDED
@@ -0,0 +1,48 @@
---
library_name: transformers
base_model: Wonder-Griffin/JudgeLLM2
tags:
- generated_from_trainer
model-index:
- name: The_Judge
  results: []
---

<!-- This model card has been generated automatically according to the information the Trainer had access to. You
should probably proofread and complete it, then remove this comment. -->

# The_Judge

This model is a fine-tuned version of [Wonder-Griffin/JudgeLLM2](https://huggingface.co/Wonder-Griffin/JudgeLLM2) on an unknown dataset.

## Model description

More information needed

## Intended uses & limitations

More information needed

## Training and evaluation data

More information needed

## Training procedure

### Training hyperparameters

The following hyperparameters were used during training:
- learning_rate: 5e-05
- train_batch_size: 8
- eval_batch_size: 8
- seed: 42
- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
- lr_scheduler_type: linear
- num_epochs: 3.0

### Framework versions

- Transformers 4.45.0.dev0
- Pytorch 2.4.0+cu124
- Datasets 2.20.0
- Tokenizers 0.19.1
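For reference, a minimal sketch of how the hyperparameters above would map onto `transformers.TrainingArguments` (the output directory is a placeholder; the Adam betas and epsilon are the Trainer defaults, which match the card):

```python
# Hypothetical reconstruction of the listed hyperparameters; not the actual
# training script from this commit.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="The_Judge",         # placeholder
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    seed=42,
    lr_scheduler_type="linear",
    num_train_epochs=3.0,
    adam_beta1=0.9,                 # Trainer defaults, matching the card
    adam_beta2=0.999,
    adam_epsilon=1e-8,
)
```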
config.json
ADDED
@@ -0,0 +1,102 @@
{
  "_name_": "Judge-GPT2",
  "_name_or_path": "Wonder-Griffin/JudgeLLM2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2Model"
  ],
  "attn_pdrop": 0.1,
  "batch_size": 32,
  "bias": true,
  "block_size": 512,
  "bos_token_id": 50256,
  "dim_feedforward": 3072,
  "dropout": 0.1,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "ff_expansion_factor": 4,
  "hidden_act": "gelu",
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "inference_mode": true,
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "label_smoothing": 0.1,
  "layer_norm_epsilon": 1e-05,
  "learning_rate": 0.0003,
  "log_interval": 100,
  "max_grad_norm": 1.0,
  "model_type": "gpt2",
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 512,
  "output_dir": "C:/Users/wonde/output",
  "pretrained_weights": "Wonder-Griffin/JudgeLLM2",
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_heads": {
    "classifier_head": {
      "params": {
        "num_labels": 5
      },
      "type": "JudgeClassifier"
    },
    "lm_head": {
      "params": {
        "vocab_size": 50257
      },
      "type": "JudgeCasualLMHead"
    },
    "qa_head": {
      "params": {
        "num_labels": 2
      },
      "type": "JudgeWithQA"
    }
  },
  "task_specific_params": {
    "question-answering": {
      "max_answer_length": 100
    },
    "sequence-classification": {
      "eval_steps": 500
    },
    "text-generation": {
      "do_sample": true,
      "max_length": 100
    }
  },
  "tokenizer": {
    "params": {
      "vocab_size": 50257
    },
    "type": "AutoTokenizer"
  },
  "torch_dtype": "float32",
  "total_steps": 10000,
  "transformers_version": "4.45.0.dev0",
  "use_cache": true,
  "vocab_size": 30522,
  "warmup_steps": 1000,
  "weight_decay": 0.01
}
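Worth noting: this config mixes standard GPT-2 fields with training-run settings (`learning_rate`, `output_dir`, `batch_size`), and the top-level `vocab_size` (30522) differs from the `lm_head` vocab size (50257). A minimal sketch of loading it, assuming the repo id `Wonder-Griffin/The_Judge`:

```python
# Hypothetical repo id. Non-standard keys such as "task_heads" survive as
# plain attributes on the resulting GPT2Config object.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("Wonder-Griffin/The_Judge")
print(config.model_type)  # "gpt2"
print(config.vocab_size)  # 30522
```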
model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e7611a89d0d92c222df86ba901d724f356efb57ae7f96a425528464f3c3a410e
size 435573648
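This is a Git LFS pointer, not the weights themselves; the actual ~436 MB file is fetched on checkout. Once present, a sketch of inspecting it:

```python
# Sketch: load the tensors on CPU and list their names and shapes.
from safetensors.torch import load_file

state_dict = load_file("model.safetensors")
for name, tensor in state_dict.items():
    print(name, tuple(tensor.shape))
```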
pyJudgeXL_model.py
ADDED
@@ -0,0 +1,122 @@
# Required imports (missing from the original file).
import numpy as np
import torch
import torch.nn as nn

# Configuration
config = {
    "learning_rate": 1e-4,
    "batch_size": 32,
    "vocab_size": 30522,
    "max_len": 256,
    "hidden_size": 768,
    "dropout": 0.1,
    "n_layer": 12,
    "n_head": 12,
    "ff_expansion_factor": 4,
    "rnn_units": 768,
    "num_labels": 5
}

class MyClass:  # unused placeholder kept from the original file
    def __init__(self, value):
        self.value = value

# Custom initializer: normal(0.0, 0.02), matching the config's initializer_range.
def custom_initializer(shape):
    return torch.normal(mean=0.0, std=0.02, size=shape)

class CustomEmbedding(nn.Module):
    def __init__(self, vocab_size, hidden_size):
        super(CustomEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_size,
                                      _weight=custom_initializer((vocab_size, hidden_size)))

    def forward(self, inputs):
        return self.embedding(inputs)

class PositionalEncoding(nn.Module):
    def __init__(self, n_embd, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.n_embd = n_embd
        self.max_len = max_len

        # Standard sinusoidal table, shaped (1, max_len, n_embd) so it
        # broadcasts over batch-first inputs.
        pe = torch.zeros(max_len, n_embd)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, n_embd, 2).float() * -(np.log(10000.0) / n_embd))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x: (batch, seq_len, n_embd). The original sliced the table by batch
        # size, which mixed up positions for batch-first tensors.
        return x + self.pe[:, :x.size(1), :]

class MultiheadAttention(nn.Module):
    def __init__(self, config):
        super(MultiheadAttention, self).__init__()
        # batch_first=True so (batch, seq, hidden) inputs work without transposing.
        self.attention = nn.MultiheadAttention(config['hidden_size'], config['n_head'],
                                               dropout=config['dropout'], batch_first=True)

    def forward(self, v, k, q, mask=None):
        attn_output, attn_output_weights = self.attention(q, k, v, attn_mask=mask)
        return attn_output

class FeedForward(nn.Module):
    def __init__(self, config):
        super(FeedForward, self).__init__()
        self.dense1 = nn.Linear(config['hidden_size'], config['hidden_size'] * config['ff_expansion_factor'])
        self.dense2 = nn.Linear(config['hidden_size'] * config['ff_expansion_factor'], config['hidden_size'])
        self.dropout = nn.Dropout(config['dropout'])

    def forward(self, x):
        x = torch.nn.functional.gelu(self.dense1(x))
        x = self.dropout(x)
        return self.dense2(x)

class TransformerXLBlock(nn.Module):
    def __init__(self, config):
        super(TransformerXLBlock, self).__init__()
        self.attn = MultiheadAttention(config)
        self.ff = FeedForward(config)
        self.ln1 = nn.LayerNorm(config['hidden_size'])
        self.ln2 = nn.LayerNorm(config['hidden_size'])

    def forward(self, x, mask=None):
        # Post-norm residual layout: self-attention, then feed-forward.
        attn_out = self.attn(v=x, k=x, q=x, mask=mask)
        out1 = self.ln1(x + attn_out)
        ff_out = self.ff(out1)
        return self.ln2(out1 + ff_out)

class JudgeXL(nn.Module):
    def __init__(self, config, tokenizer=None):
        super(JudgeXL, self).__init__()
        self.token_embedding = CustomEmbedding(config['vocab_size'], config['hidden_size'])
        self.pos_encoding = PositionalEncoding(config['hidden_size'], config['max_len'])
        self.transformer_blocks = nn.ModuleList([TransformerXLBlock(config) for _ in range(config['n_layer'])])
        self.ln_f = nn.LayerNorm(config['hidden_size'])
        self.rnn = nn.LSTM(config['hidden_size'], config['rnn_units'], num_layers=2,
                           dropout=config['dropout'], bidirectional=True, batch_first=True)
        self.fc = nn.Linear(config['rnn_units'] * 2, config['vocab_size'])  # *2 for the bidirectional LSTM
        # generate() needs a tokenizer; the original referenced self.tokenizer
        # without ever setting it, so it is made an explicit constructor argument.
        self.tokenizer = tokenizer

    def forward(self, x, mask=None):
        x = self.token_embedding(x)
        x = self.pos_encoding(x)
        for block in self.transformer_blocks:
            x = block(x, mask=mask)
        x = self.ln_f(x)
        x, _ = self.rnn(x)
        x = self.fc(x)
        return x

    def generate(self, prompt, max_len=100):
        self.eval()
        input_ids = self.tokenizer(prompt, return_tensors='pt').input_ids.to(device)
        generated = input_ids
        with torch.no_grad():
            for _ in range(max_len):
                outputs = self.forward(generated)
                # Greedy decoding: take the logits of the last position only
                # (the original indexed outputs[:, :], i.e. every position).
                next_token_logits = outputs[:, -1, :]
                next_token_id = torch.argmax(next_token_logits, dim=-1, keepdim=True)
                generated = torch.cat((generated, next_token_id), dim=1)
                if next_token_id.item() == self.tokenizer.sep_token_id:
                    break
        generated_text = self.tokenizer.decode(generated[0], skip_special_tokens=True)
        return generated_text

# Load the last saved model. Note that torch.load on a whole-module checkpoint
# returns the pickled model object, replacing the instance built just above.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = JudgeXL(config)
model = torch.load('C:/AIstuffing/Judge_XL-LLM/xl-llm_weights/judgeXL-LLm_wiki.pth', weights_only=False)
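A quick shape check of the reconstructed module (hypothetical, not part of the commit): random token ids in, per-token vocabulary logits out.

```python
# Hypothetical smoke test: a batch of 2 sequences of 16 random token ids.
import torch

model = JudgeXL(config)
tokens = torch.randint(0, config["vocab_size"], (2, 16))
logits = model(tokens)
print(logits.shape)  # torch.Size([2, 16, 30522])
```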
tokenizer1.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0a4bfa2daf9cb9275703fcadd2e7953704653c2a206b1ea0852fad26a5e76c80
size 82362540
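Again an LFS pointer; the ~82 MB pickle presumably holds the tokenizer object. A minimal sketch of restoring it (unpickling executes arbitrary code, so only load files you trust):

```python
# Sketch: restore the pickled tokenizer saved as tokenizer1.pickle.
import pickle

with open("tokenizer1.pickle", "rb") as f:
    tokenizer = pickle.load(f)
```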
training_args.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ca93e44304ed6ec37809ad1da1d61576ecf6389b60e134a029f36fbbbf24ebec
size 5176
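`training_args.bin` is conventionally the Trainer's pickled `TrainingArguments`; a sketch of inspecting it:

```python
# Sketch: torch.load restores the TrainingArguments object for inspection.
import torch

args = torch.load("training_args.bin", weights_only=False)
print(args.learning_rate)     # 5e-05 per the model card
print(args.num_train_epochs)  # 3.0
```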