Commit
·
ef571c5
1
Parent(s):
e660fa1
NER_Second_Try
Browse files- README.md +53 -0
- adapter_config.json +23 -0
- adapter_model.safetensors +3 -0
- api_experiment_run/description.json +352 -0
- api_experiment_run/model/logs/training/events.out.tfevents.1700048114.jupyter-carlosruizmoreno.1721.0 +3 -0
- api_experiment_run/model/model_hyperparameters.json +0 -0
- api_experiment_run/model/training_set_metadata.json +0 -0
- api_experiment_run_0/description.json +352 -0
- api_experiment_run_0/model/training_set_metadata.json +0 -0
- final_checkpoint/README.md +257 -0
- final_checkpoint/adapter_config.json +23 -0
- final_checkpoint/adapter_model.bin +3 -0
- runs/Nov14_21-53-35_jupyter-carlosruizmoreno/events.out.tfevents.1699998818.jupyter-carlosruizmoreno.526.0 +3 -0
- runs/Nov15_09-42-53_jupyter-carlosruizmoreno/events.out.tfevents.1700041375.jupyter-carlosruizmoreno.623.0 +3 -0
- runs/Nov15_11-40-06_jupyter-carlosruizmoreno/events.out.tfevents.1700048408.jupyter-carlosruizmoreno.623.1 +3 -0
- runs/Nov15_11-41-48_jupyter-carlosruizmoreno/events.out.tfevents.1700048511.jupyter-carlosruizmoreno.2192.0 +3 -0
- runs/Nov15_11-47-35_jupyter-carlosruizmoreno/events.out.tfevents.1700048867.jupyter-carlosruizmoreno.2530.0 +3 -0
- runs/Nov15_11-49-46_jupyter-carlosruizmoreno/events.out.tfevents.1700048989.jupyter-carlosruizmoreno.2530.1 +3 -0
- runs/Nov15_11-50-51_jupyter-carlosruizmoreno/events.out.tfevents.1700049053.jupyter-carlosruizmoreno.2971.0 +3 -0
- special_tokens_map.json +24 -0
- tokenizer.json +0 -0
- tokenizer.model +3 -0
- tokenizer_config.json +39 -0
- training_args.bin +3 -0
README.md
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
base_model: meta-llama/Llama-2-7b-hf
|
3 |
+
tags:
|
4 |
+
- generated_from_trainer
|
5 |
+
model-index:
|
6 |
+
- name: results
|
7 |
+
results: []
|
8 |
+
---
|
9 |
+
|
10 |
+
<!-- This model card has been generated automatically according to the information the Trainer had access to. You
|
11 |
+
should probably proofread and complete it, then remove this comment. -->
|
12 |
+
|
13 |
+
# results
|
14 |
+
|
15 |
+
This model is a fine-tuned version of [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf) on an unknown dataset.
|
16 |
+
|
17 |
+
## Model description
|
18 |
+
|
19 |
+
More information needed
|
20 |
+
|
21 |
+
## Intended uses & limitations
|
22 |
+
|
23 |
+
More information needed
|
24 |
+
|
25 |
+
## Training and evaluation data
|
26 |
+
|
27 |
+
More information needed
|
28 |
+
|
29 |
+
## Training procedure
|
30 |
+
|
31 |
+
### Training hyperparameters
|
32 |
+
|
33 |
+
The following hyperparameters were used during training:
|
34 |
+
- learning_rate: 0.0002
|
35 |
+
- train_batch_size: 4
|
36 |
+
- eval_batch_size: 8
|
37 |
+
- seed: 42
|
38 |
+
- gradient_accumulation_steps: 4
|
39 |
+
- total_train_batch_size: 16
|
40 |
+
- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
|
41 |
+
- lr_scheduler_type: linear
|
42 |
+
- training_steps: 200
|
43 |
+
|
44 |
+
### Training results
|
45 |
+
|
46 |
+
|
47 |
+
|
48 |
+
### Framework versions
|
49 |
+
|
50 |
+
- Transformers 4.35.1
|
51 |
+
- Pytorch 2.1.0+cu121
|
52 |
+
- Datasets 2.14.7
|
53 |
+
- Tokenizers 0.14.1
|
adapter_config.json
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"alpha_pattern": {},
|
3 |
+
"auto_mapping": null,
|
4 |
+
"base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
|
5 |
+
"bias": "none",
|
6 |
+
"fan_in_fan_out": false,
|
7 |
+
"inference_mode": true,
|
8 |
+
"init_lora_weights": true,
|
9 |
+
"layers_pattern": null,
|
10 |
+
"layers_to_transform": null,
|
11 |
+
"lora_alpha": 16,
|
12 |
+
"lora_dropout": 0.05,
|
13 |
+
"modules_to_save": null,
|
14 |
+
"peft_type": "LORA",
|
15 |
+
"r": 32,
|
16 |
+
"rank_pattern": {},
|
17 |
+
"revision": null,
|
18 |
+
"target_modules": [
|
19 |
+
"v_proj",
|
20 |
+
"q_proj"
|
21 |
+
],
|
22 |
+
"task_type": "CAUSAL_LM"
|
23 |
+
}
|
adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e6fdb318d0515c6f87e91e0936391b1fc4cc2bf4882cee7a4b1f9a8183fc9309
|
3 |
+
size 67126104
|
api_experiment_run/description.json
ADDED
@@ -0,0 +1,352 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"command": "/opt/conda/lib/python3.10/site-packages/ipykernel_launcher.py -f /home/jovyan/.local/share/jupyter/runtime/kernel-67de967a-aa00-4e36-bdc4-17a4dd6f9d51.json",
|
3 |
+
"compute": {
|
4 |
+
"arch_list": [
|
5 |
+
"sm_50",
|
6 |
+
"sm_60",
|
7 |
+
"sm_70",
|
8 |
+
"sm_75",
|
9 |
+
"sm_80",
|
10 |
+
"sm_86",
|
11 |
+
"sm_90"
|
12 |
+
],
|
13 |
+
"devices": {
|
14 |
+
"0": {
|
15 |
+
"device_capability": [
|
16 |
+
8,
|
17 |
+
0
|
18 |
+
],
|
19 |
+
"device_properties": "_CudaDeviceProperties(name='NVIDIA A100-PCIE-40GB MIG 7g.40gb', major=8, minor=0, total_memory=40339MB, multi_processor_count=98)",
|
20 |
+
"gpu_type": "NVIDIA A100-PCIE-40GB MIG 7g.40gb"
|
21 |
+
}
|
22 |
+
},
|
23 |
+
"gencode_flags": "-gencode compute=compute_50,code=sm_50 -gencode compute=compute_60,code=sm_60 -gencode compute=compute_70,code=sm_70 -gencode compute=compute_75,code=sm_75 -gencode compute=compute_80,code=sm_80 -gencode compute=compute_86,code=sm_86 -gencode compute=compute_90,code=sm_90",
|
24 |
+
"gpus_per_node": 1,
|
25 |
+
"num_nodes": 1
|
26 |
+
},
|
27 |
+
"config": {
|
28 |
+
"adapter": {
|
29 |
+
"alpha": 16,
|
30 |
+
"bias_type": "none",
|
31 |
+
"dropout": 0.05,
|
32 |
+
"postprocessor": {
|
33 |
+
"merge_adapter_into_base_model": false,
|
34 |
+
"progressbar": false
|
35 |
+
},
|
36 |
+
"pretrained_adapter_weights": null,
|
37 |
+
"r": 8,
|
38 |
+
"type": "lora"
|
39 |
+
},
|
40 |
+
"backend": null,
|
41 |
+
"base_model": "meta-llama/Llama-2-7b-hf",
|
42 |
+
"defaults": {
|
43 |
+
"text": {
|
44 |
+
"decoder": {
|
45 |
+
"fc_activation": "relu",
|
46 |
+
"fc_bias_initializer": "zeros",
|
47 |
+
"fc_dropout": 0.0,
|
48 |
+
"fc_layers": null,
|
49 |
+
"fc_norm": null,
|
50 |
+
"fc_norm_params": null,
|
51 |
+
"fc_output_size": 256,
|
52 |
+
"fc_use_bias": true,
|
53 |
+
"fc_weights_initializer": "xavier_uniform",
|
54 |
+
"input_size": null,
|
55 |
+
"max_new_tokens": null,
|
56 |
+
"num_fc_layers": 0,
|
57 |
+
"pretrained_model_name_or_path": "",
|
58 |
+
"tokenizer": "hf_tokenizer",
|
59 |
+
"type": "text_extractor",
|
60 |
+
"vocab_file": ""
|
61 |
+
},
|
62 |
+
"encoder": {
|
63 |
+
"skip": false,
|
64 |
+
"type": "passthrough"
|
65 |
+
},
|
66 |
+
"loss": {
|
67 |
+
"class_similarities": null,
|
68 |
+
"class_similarities_temperature": 0,
|
69 |
+
"class_weights": null,
|
70 |
+
"confidence_penalty": 0,
|
71 |
+
"robust_lambda": 0,
|
72 |
+
"type": "next_token_softmax_cross_entropy",
|
73 |
+
"unique": false,
|
74 |
+
"weight": 1.0
|
75 |
+
},
|
76 |
+
"preprocessing": {
|
77 |
+
"cache_encoder_embeddings": false,
|
78 |
+
"compute_idf": false,
|
79 |
+
"computed_fill_value": "<UNK>",
|
80 |
+
"fill_value": "<UNK>",
|
81 |
+
"lowercase": true,
|
82 |
+
"max_sequence_length": 256,
|
83 |
+
"missing_value_strategy": "fill_with_const",
|
84 |
+
"most_common": 20000,
|
85 |
+
"ngram_size": 2,
|
86 |
+
"padding": "right",
|
87 |
+
"padding_symbol": "<PAD>",
|
88 |
+
"pretrained_model_name_or_path": null,
|
89 |
+
"prompt": {
|
90 |
+
"retrieval": {
|
91 |
+
"index_name": null,
|
92 |
+
"k": 0,
|
93 |
+
"model_name": null,
|
94 |
+
"type": null
|
95 |
+
},
|
96 |
+
"task": null,
|
97 |
+
"template": null
|
98 |
+
},
|
99 |
+
"sequence_length": null,
|
100 |
+
"tokenizer": "space_punct",
|
101 |
+
"unknown_symbol": "<UNK>",
|
102 |
+
"vocab_file": null
|
103 |
+
}
|
104 |
+
}
|
105 |
+
},
|
106 |
+
"generation": {
|
107 |
+
"bad_words_ids": null,
|
108 |
+
"begin_suppress_tokens": null,
|
109 |
+
"bos_token_id": null,
|
110 |
+
"diversity_penalty": 0.0,
|
111 |
+
"do_sample": true,
|
112 |
+
"early_stopping": false,
|
113 |
+
"encoder_repetition_penalty": 1.0,
|
114 |
+
"eos_token_id": null,
|
115 |
+
"epsilon_cutoff": 0.0,
|
116 |
+
"eta_cutoff": 0.0,
|
117 |
+
"exponential_decay_length_penalty": null,
|
118 |
+
"force_words_ids": null,
|
119 |
+
"forced_bos_token_id": null,
|
120 |
+
"forced_decoder_ids": null,
|
121 |
+
"forced_eos_token_id": null,
|
122 |
+
"guidance_scale": null,
|
123 |
+
"length_penalty": 1.0,
|
124 |
+
"max_length": 32,
|
125 |
+
"max_new_tokens": 512,
|
126 |
+
"max_time": null,
|
127 |
+
"min_length": 0,
|
128 |
+
"min_new_tokens": null,
|
129 |
+
"no_repeat_ngram_size": 0,
|
130 |
+
"num_beam_groups": 1,
|
131 |
+
"num_beams": 1,
|
132 |
+
"pad_token_id": null,
|
133 |
+
"penalty_alpha": null,
|
134 |
+
"remove_invalid_values": false,
|
135 |
+
"renormalize_logits": false,
|
136 |
+
"repetition_penalty": 1.0,
|
137 |
+
"sequence_bias": null,
|
138 |
+
"suppress_tokens": null,
|
139 |
+
"temperature": 0.0,
|
140 |
+
"top_k": 50,
|
141 |
+
"top_p": 1.0,
|
142 |
+
"typical_p": 1.0,
|
143 |
+
"use_cache": true
|
144 |
+
},
|
145 |
+
"hyperopt": null,
|
146 |
+
"input_features": [
|
147 |
+
{
|
148 |
+
"active": true,
|
149 |
+
"column": "instruction",
|
150 |
+
"encoder": {
|
151 |
+
"skip": false,
|
152 |
+
"type": "passthrough"
|
153 |
+
},
|
154 |
+
"name": "instruction",
|
155 |
+
"preprocessing": {
|
156 |
+
"cache_encoder_embeddings": false,
|
157 |
+
"compute_idf": false,
|
158 |
+
"computed_fill_value": "<UNK>",
|
159 |
+
"fill_value": "<UNK>",
|
160 |
+
"lowercase": true,
|
161 |
+
"max_sequence_length": null,
|
162 |
+
"missing_value_strategy": "fill_with_const",
|
163 |
+
"most_common": 20000,
|
164 |
+
"ngram_size": 2,
|
165 |
+
"padding": "left",
|
166 |
+
"padding_symbol": "<PAD>",
|
167 |
+
"pretrained_model_name_or_path": "meta-llama/Llama-2-7b-hf",
|
168 |
+
"sequence_length": null,
|
169 |
+
"tokenizer": "hf_tokenizer",
|
170 |
+
"unknown_symbol": "<UNK>",
|
171 |
+
"vocab_file": null
|
172 |
+
},
|
173 |
+
"proc_column": "instruction_TityHg",
|
174 |
+
"tied": null,
|
175 |
+
"type": "text"
|
176 |
+
}
|
177 |
+
],
|
178 |
+
"ludwig_version": "0.8.6",
|
179 |
+
"model_parameters": null,
|
180 |
+
"model_type": "llm",
|
181 |
+
"output_features": [
|
182 |
+
{
|
183 |
+
"active": true,
|
184 |
+
"class_similarities": null,
|
185 |
+
"column": "output",
|
186 |
+
"decoder": {
|
187 |
+
"fc_activation": "relu",
|
188 |
+
"fc_bias_initializer": "zeros",
|
189 |
+
"fc_dropout": 0.0,
|
190 |
+
"fc_layers": null,
|
191 |
+
"fc_norm": null,
|
192 |
+
"fc_norm_params": null,
|
193 |
+
"fc_output_size": 256,
|
194 |
+
"fc_use_bias": true,
|
195 |
+
"fc_weights_initializer": "xavier_uniform",
|
196 |
+
"input_size": null,
|
197 |
+
"max_new_tokens": 512,
|
198 |
+
"num_fc_layers": 0,
|
199 |
+
"pretrained_model_name_or_path": "meta-llama/Llama-2-7b-hf",
|
200 |
+
"tokenizer": "hf_tokenizer",
|
201 |
+
"type": "text_extractor",
|
202 |
+
"vocab_file": ""
|
203 |
+
},
|
204 |
+
"default_validation_metric": "loss",
|
205 |
+
"dependencies": [],
|
206 |
+
"input_size": null,
|
207 |
+
"loss": {
|
208 |
+
"class_similarities": null,
|
209 |
+
"class_similarities_temperature": 0,
|
210 |
+
"class_weights": null,
|
211 |
+
"confidence_penalty": 0,
|
212 |
+
"robust_lambda": 0,
|
213 |
+
"type": "next_token_softmax_cross_entropy",
|
214 |
+
"unique": false,
|
215 |
+
"weight": 1.0
|
216 |
+
},
|
217 |
+
"name": "output",
|
218 |
+
"num_classes": null,
|
219 |
+
"preprocessing": {
|
220 |
+
"cache_encoder_embeddings": false,
|
221 |
+
"compute_idf": false,
|
222 |
+
"computed_fill_value": "<UNK>",
|
223 |
+
"fill_value": "<UNK>",
|
224 |
+
"lowercase": true,
|
225 |
+
"max_sequence_length": null,
|
226 |
+
"missing_value_strategy": "drop_row",
|
227 |
+
"most_common": 20000,
|
228 |
+
"ngram_size": 2,
|
229 |
+
"padding": "left",
|
230 |
+
"padding_symbol": "<PAD>",
|
231 |
+
"pretrained_model_name_or_path": "meta-llama/Llama-2-7b-hf",
|
232 |
+
"sequence_length": null,
|
233 |
+
"tokenizer": "hf_tokenizer",
|
234 |
+
"unknown_symbol": "<UNK>",
|
235 |
+
"vocab_file": null
|
236 |
+
},
|
237 |
+
"proc_column": "output_9bi87u",
|
238 |
+
"reduce_dependencies": "sum",
|
239 |
+
"reduce_input": "sum",
|
240 |
+
"type": "text"
|
241 |
+
}
|
242 |
+
],
|
243 |
+
"preprocessing": {
|
244 |
+
"global_max_sequence_length": 512,
|
245 |
+
"oversample_minority": null,
|
246 |
+
"sample_ratio": 1.0,
|
247 |
+
"sample_size": null,
|
248 |
+
"split": {
|
249 |
+
"probabilities": [
|
250 |
+
1.0,
|
251 |
+
0.0,
|
252 |
+
0.0
|
253 |
+
],
|
254 |
+
"type": "random"
|
255 |
+
},
|
256 |
+
"undersample_majority": null
|
257 |
+
},
|
258 |
+
"prompt": {
|
259 |
+
"retrieval": {
|
260 |
+
"index_name": null,
|
261 |
+
"k": 0,
|
262 |
+
"model_name": null,
|
263 |
+
"type": null
|
264 |
+
},
|
265 |
+
"task": null,
|
266 |
+
"template": "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n### Instruction:\n### Input: {input}\n### Response:"
|
267 |
+
},
|
268 |
+
"quantization": {
|
269 |
+
"bits": 4,
|
270 |
+
"bnb_4bit_compute_dtype": "float16",
|
271 |
+
"bnb_4bit_quant_type": "nf4",
|
272 |
+
"bnb_4bit_use_double_quant": true,
|
273 |
+
"llm_int8_has_fp16_weight": false,
|
274 |
+
"llm_int8_threshold": 6.0
|
275 |
+
},
|
276 |
+
"trainer": {
|
277 |
+
"base_learning_rate": 0.0,
|
278 |
+
"batch_size": 1,
|
279 |
+
"bucketing_field": null,
|
280 |
+
"checkpoints_per_epoch": 0,
|
281 |
+
"compile": false,
|
282 |
+
"early_stop": 5,
|
283 |
+
"effective_batch_size": "auto",
|
284 |
+
"enable_gradient_checkpointing": false,
|
285 |
+
"enable_profiling": false,
|
286 |
+
"epochs": 6,
|
287 |
+
"eval_batch_size": 2,
|
288 |
+
"evaluate_training_set": false,
|
289 |
+
"gradient_accumulation_steps": 16,
|
290 |
+
"gradient_clipping": {
|
291 |
+
"clipglobalnorm": 0.5,
|
292 |
+
"clipnorm": null,
|
293 |
+
"clipvalue": null
|
294 |
+
},
|
295 |
+
"increase_batch_size_eval_metric": "loss",
|
296 |
+
"increase_batch_size_eval_split": "training",
|
297 |
+
"increase_batch_size_on_plateau": 0,
|
298 |
+
"increase_batch_size_on_plateau_patience": 5,
|
299 |
+
"increase_batch_size_on_plateau_rate": 2.0,
|
300 |
+
"learning_rate": 0.0005,
|
301 |
+
"learning_rate_scaling": "linear",
|
302 |
+
"learning_rate_scheduler": {
|
303 |
+
"decay": null,
|
304 |
+
"decay_rate": 0.96,
|
305 |
+
"decay_steps": 10000,
|
306 |
+
"eta_min": 0,
|
307 |
+
"reduce_eval_metric": "loss",
|
308 |
+
"reduce_eval_split": "training",
|
309 |
+
"reduce_on_plateau": 0,
|
310 |
+
"reduce_on_plateau_patience": 10,
|
311 |
+
"reduce_on_plateau_rate": 0.1,
|
312 |
+
"staircase": false,
|
313 |
+
"t_0": null,
|
314 |
+
"t_mult": 1,
|
315 |
+
"warmup_evaluations": 0,
|
316 |
+
"warmup_fraction": 0.03
|
317 |
+
},
|
318 |
+
"max_batch_size": 1099511627776,
|
319 |
+
"optimizer": {
|
320 |
+
"amsgrad": false,
|
321 |
+
"betas": [
|
322 |
+
0.9,
|
323 |
+
0.999
|
324 |
+
],
|
325 |
+
"eps": 1e-08,
|
326 |
+
"type": "adam",
|
327 |
+
"weight_decay": 0.0
|
328 |
+
},
|
329 |
+
"profiler": {
|
330 |
+
"active": 3,
|
331 |
+
"repeat": 5,
|
332 |
+
"skip_first": 0,
|
333 |
+
"wait": 1,
|
334 |
+
"warmup": 1
|
335 |
+
},
|
336 |
+
"regularization_lambda": 0.0,
|
337 |
+
"regularization_type": "l2",
|
338 |
+
"should_shuffle": true,
|
339 |
+
"skip_all_evaluation": false,
|
340 |
+
"steps_per_checkpoint": 0,
|
341 |
+
"train_steps": null,
|
342 |
+
"type": "finetune",
|
343 |
+
"use_mixed_precision": false,
|
344 |
+
"validation_field": "output",
|
345 |
+
"validation_metric": "loss"
|
346 |
+
}
|
347 |
+
},
|
348 |
+
"data_format": "<class 'pandas.core.frame.DataFrame'>",
|
349 |
+
"ludwig_version": "0.8.6",
|
350 |
+
"random_seed": 42,
|
351 |
+
"torch_version": "2.1.0+cu121"
|
352 |
+
}
|
api_experiment_run/model/logs/training/events.out.tfevents.1700048114.jupyter-carlosruizmoreno.1721.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f5b19c2dc00151ccb14c7dcadd4d768b6df235f7abc8dd34ecee359c0b29b3a1
|
3 |
+
size 1227
|
api_experiment_run/model/model_hyperparameters.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
api_experiment_run/model/training_set_metadata.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
api_experiment_run_0/description.json
ADDED
@@ -0,0 +1,352 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"command": "/opt/conda/lib/python3.10/site-packages/ipykernel_launcher.py -f /home/jovyan/.local/share/jupyter/runtime/kernel-67de967a-aa00-4e36-bdc4-17a4dd6f9d51.json",
|
3 |
+
"compute": {
|
4 |
+
"arch_list": [
|
5 |
+
"sm_50",
|
6 |
+
"sm_60",
|
7 |
+
"sm_70",
|
8 |
+
"sm_75",
|
9 |
+
"sm_80",
|
10 |
+
"sm_86",
|
11 |
+
"sm_90"
|
12 |
+
],
|
13 |
+
"devices": {
|
14 |
+
"0": {
|
15 |
+
"device_capability": [
|
16 |
+
8,
|
17 |
+
0
|
18 |
+
],
|
19 |
+
"device_properties": "_CudaDeviceProperties(name='NVIDIA A100-PCIE-40GB MIG 7g.40gb', major=8, minor=0, total_memory=40339MB, multi_processor_count=98)",
|
20 |
+
"gpu_type": "NVIDIA A100-PCIE-40GB MIG 7g.40gb"
|
21 |
+
}
|
22 |
+
},
|
23 |
+
"gencode_flags": "-gencode compute=compute_50,code=sm_50 -gencode compute=compute_60,code=sm_60 -gencode compute=compute_70,code=sm_70 -gencode compute=compute_75,code=sm_75 -gencode compute=compute_80,code=sm_80 -gencode compute=compute_86,code=sm_86 -gencode compute=compute_90,code=sm_90",
|
24 |
+
"gpus_per_node": 1,
|
25 |
+
"num_nodes": 1
|
26 |
+
},
|
27 |
+
"config": {
|
28 |
+
"adapter": {
|
29 |
+
"alpha": 16,
|
30 |
+
"bias_type": "none",
|
31 |
+
"dropout": 0.05,
|
32 |
+
"postprocessor": {
|
33 |
+
"merge_adapter_into_base_model": false,
|
34 |
+
"progressbar": false
|
35 |
+
},
|
36 |
+
"pretrained_adapter_weights": null,
|
37 |
+
"r": 8,
|
38 |
+
"type": "lora"
|
39 |
+
},
|
40 |
+
"backend": null,
|
41 |
+
"base_model": "meta-llama/Llama-2-7b-hf",
|
42 |
+
"defaults": {
|
43 |
+
"text": {
|
44 |
+
"decoder": {
|
45 |
+
"fc_activation": "relu",
|
46 |
+
"fc_bias_initializer": "zeros",
|
47 |
+
"fc_dropout": 0.0,
|
48 |
+
"fc_layers": null,
|
49 |
+
"fc_norm": null,
|
50 |
+
"fc_norm_params": null,
|
51 |
+
"fc_output_size": 256,
|
52 |
+
"fc_use_bias": true,
|
53 |
+
"fc_weights_initializer": "xavier_uniform",
|
54 |
+
"input_size": null,
|
55 |
+
"max_new_tokens": null,
|
56 |
+
"num_fc_layers": 0,
|
57 |
+
"pretrained_model_name_or_path": "",
|
58 |
+
"tokenizer": "hf_tokenizer",
|
59 |
+
"type": "text_extractor",
|
60 |
+
"vocab_file": ""
|
61 |
+
},
|
62 |
+
"encoder": {
|
63 |
+
"skip": false,
|
64 |
+
"type": "passthrough"
|
65 |
+
},
|
66 |
+
"loss": {
|
67 |
+
"class_similarities": null,
|
68 |
+
"class_similarities_temperature": 0,
|
69 |
+
"class_weights": null,
|
70 |
+
"confidence_penalty": 0,
|
71 |
+
"robust_lambda": 0,
|
72 |
+
"type": "next_token_softmax_cross_entropy",
|
73 |
+
"unique": false,
|
74 |
+
"weight": 1.0
|
75 |
+
},
|
76 |
+
"preprocessing": {
|
77 |
+
"cache_encoder_embeddings": false,
|
78 |
+
"compute_idf": false,
|
79 |
+
"computed_fill_value": "<UNK>",
|
80 |
+
"fill_value": "<UNK>",
|
81 |
+
"lowercase": true,
|
82 |
+
"max_sequence_length": 256,
|
83 |
+
"missing_value_strategy": "fill_with_const",
|
84 |
+
"most_common": 20000,
|
85 |
+
"ngram_size": 2,
|
86 |
+
"padding": "right",
|
87 |
+
"padding_symbol": "<PAD>",
|
88 |
+
"pretrained_model_name_or_path": null,
|
89 |
+
"prompt": {
|
90 |
+
"retrieval": {
|
91 |
+
"index_name": null,
|
92 |
+
"k": 0,
|
93 |
+
"model_name": null,
|
94 |
+
"type": null
|
95 |
+
},
|
96 |
+
"task": null,
|
97 |
+
"template": null
|
98 |
+
},
|
99 |
+
"sequence_length": null,
|
100 |
+
"tokenizer": "space_punct",
|
101 |
+
"unknown_symbol": "<UNK>",
|
102 |
+
"vocab_file": null
|
103 |
+
}
|
104 |
+
}
|
105 |
+
},
|
106 |
+
"generation": {
|
107 |
+
"bad_words_ids": null,
|
108 |
+
"begin_suppress_tokens": null,
|
109 |
+
"bos_token_id": null,
|
110 |
+
"diversity_penalty": 0.0,
|
111 |
+
"do_sample": true,
|
112 |
+
"early_stopping": false,
|
113 |
+
"encoder_repetition_penalty": 1.0,
|
114 |
+
"eos_token_id": null,
|
115 |
+
"epsilon_cutoff": 0.0,
|
116 |
+
"eta_cutoff": 0.0,
|
117 |
+
"exponential_decay_length_penalty": null,
|
118 |
+
"force_words_ids": null,
|
119 |
+
"forced_bos_token_id": null,
|
120 |
+
"forced_decoder_ids": null,
|
121 |
+
"forced_eos_token_id": null,
|
122 |
+
"guidance_scale": null,
|
123 |
+
"length_penalty": 1.0,
|
124 |
+
"max_length": 32,
|
125 |
+
"max_new_tokens": 512,
|
126 |
+
"max_time": null,
|
127 |
+
"min_length": 0,
|
128 |
+
"min_new_tokens": null,
|
129 |
+
"no_repeat_ngram_size": 0,
|
130 |
+
"num_beam_groups": 1,
|
131 |
+
"num_beams": 1,
|
132 |
+
"pad_token_id": null,
|
133 |
+
"penalty_alpha": null,
|
134 |
+
"remove_invalid_values": false,
|
135 |
+
"renormalize_logits": false,
|
136 |
+
"repetition_penalty": 1.0,
|
137 |
+
"sequence_bias": null,
|
138 |
+
"suppress_tokens": null,
|
139 |
+
"temperature": 0.1,
|
140 |
+
"top_k": 50,
|
141 |
+
"top_p": 1.0,
|
142 |
+
"typical_p": 1.0,
|
143 |
+
"use_cache": true
|
144 |
+
},
|
145 |
+
"hyperopt": null,
|
146 |
+
"input_features": [
|
147 |
+
{
|
148 |
+
"active": true,
|
149 |
+
"column": "instruction",
|
150 |
+
"encoder": {
|
151 |
+
"skip": false,
|
152 |
+
"type": "passthrough"
|
153 |
+
},
|
154 |
+
"name": "instruction",
|
155 |
+
"preprocessing": {
|
156 |
+
"cache_encoder_embeddings": false,
|
157 |
+
"compute_idf": false,
|
158 |
+
"computed_fill_value": "<UNK>",
|
159 |
+
"fill_value": "<UNK>",
|
160 |
+
"lowercase": true,
|
161 |
+
"max_sequence_length": null,
|
162 |
+
"missing_value_strategy": "fill_with_const",
|
163 |
+
"most_common": 20000,
|
164 |
+
"ngram_size": 2,
|
165 |
+
"padding": "left",
|
166 |
+
"padding_symbol": "<PAD>",
|
167 |
+
"pretrained_model_name_or_path": "meta-llama/Llama-2-7b-hf",
|
168 |
+
"sequence_length": null,
|
169 |
+
"tokenizer": "hf_tokenizer",
|
170 |
+
"unknown_symbol": "<UNK>",
|
171 |
+
"vocab_file": null
|
172 |
+
},
|
173 |
+
"proc_column": "instruction_TityHg",
|
174 |
+
"tied": null,
|
175 |
+
"type": "text"
|
176 |
+
}
|
177 |
+
],
|
178 |
+
"ludwig_version": "0.8.6",
|
179 |
+
"model_parameters": null,
|
180 |
+
"model_type": "llm",
|
181 |
+
"output_features": [
|
182 |
+
{
|
183 |
+
"active": true,
|
184 |
+
"class_similarities": null,
|
185 |
+
"column": "output",
|
186 |
+
"decoder": {
|
187 |
+
"fc_activation": "relu",
|
188 |
+
"fc_bias_initializer": "zeros",
|
189 |
+
"fc_dropout": 0.0,
|
190 |
+
"fc_layers": null,
|
191 |
+
"fc_norm": null,
|
192 |
+
"fc_norm_params": null,
|
193 |
+
"fc_output_size": 256,
|
194 |
+
"fc_use_bias": true,
|
195 |
+
"fc_weights_initializer": "xavier_uniform",
|
196 |
+
"input_size": null,
|
197 |
+
"max_new_tokens": 512,
|
198 |
+
"num_fc_layers": 0,
|
199 |
+
"pretrained_model_name_or_path": "meta-llama/Llama-2-7b-hf",
|
200 |
+
"tokenizer": "hf_tokenizer",
|
201 |
+
"type": "text_extractor",
|
202 |
+
"vocab_file": ""
|
203 |
+
},
|
204 |
+
"default_validation_metric": "loss",
|
205 |
+
"dependencies": [],
|
206 |
+
"input_size": null,
|
207 |
+
"loss": {
|
208 |
+
"class_similarities": null,
|
209 |
+
"class_similarities_temperature": 0,
|
210 |
+
"class_weights": null,
|
211 |
+
"confidence_penalty": 0,
|
212 |
+
"robust_lambda": 0,
|
213 |
+
"type": "next_token_softmax_cross_entropy",
|
214 |
+
"unique": false,
|
215 |
+
"weight": 1.0
|
216 |
+
},
|
217 |
+
"name": "output",
|
218 |
+
"num_classes": null,
|
219 |
+
"preprocessing": {
|
220 |
+
"cache_encoder_embeddings": false,
|
221 |
+
"compute_idf": false,
|
222 |
+
"computed_fill_value": "<UNK>",
|
223 |
+
"fill_value": "<UNK>",
|
224 |
+
"lowercase": true,
|
225 |
+
"max_sequence_length": null,
|
226 |
+
"missing_value_strategy": "drop_row",
|
227 |
+
"most_common": 20000,
|
228 |
+
"ngram_size": 2,
|
229 |
+
"padding": "left",
|
230 |
+
"padding_symbol": "<PAD>",
|
231 |
+
"pretrained_model_name_or_path": "meta-llama/Llama-2-7b-hf",
|
232 |
+
"sequence_length": null,
|
233 |
+
"tokenizer": "hf_tokenizer",
|
234 |
+
"unknown_symbol": "<UNK>",
|
235 |
+
"vocab_file": null
|
236 |
+
},
|
237 |
+
"proc_column": "output_9bi87u",
|
238 |
+
"reduce_dependencies": "sum",
|
239 |
+
"reduce_input": "sum",
|
240 |
+
"type": "text"
|
241 |
+
}
|
242 |
+
],
|
243 |
+
"preprocessing": {
|
244 |
+
"global_max_sequence_length": 512,
|
245 |
+
"oversample_minority": null,
|
246 |
+
"sample_ratio": 1.0,
|
247 |
+
"sample_size": null,
|
248 |
+
"split": {
|
249 |
+
"probabilities": [
|
250 |
+
1.0,
|
251 |
+
0.0,
|
252 |
+
0.0
|
253 |
+
],
|
254 |
+
"type": "random"
|
255 |
+
},
|
256 |
+
"undersample_majority": null
|
257 |
+
},
|
258 |
+
"prompt": {
|
259 |
+
"retrieval": {
|
260 |
+
"index_name": null,
|
261 |
+
"k": 0,
|
262 |
+
"model_name": null,
|
263 |
+
"type": null
|
264 |
+
},
|
265 |
+
"task": null,
|
266 |
+
"template": "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n### Instruction:\n### Input: {input}\n### Response:"
|
267 |
+
},
|
268 |
+
"quantization": {
|
269 |
+
"bits": 4,
|
270 |
+
"bnb_4bit_compute_dtype": "float16",
|
271 |
+
"bnb_4bit_quant_type": "nf4",
|
272 |
+
"bnb_4bit_use_double_quant": true,
|
273 |
+
"llm_int8_has_fp16_weight": false,
|
274 |
+
"llm_int8_threshold": 6.0
|
275 |
+
},
|
276 |
+
"trainer": {
|
277 |
+
"base_learning_rate": 0.0,
|
278 |
+
"batch_size": 1,
|
279 |
+
"bucketing_field": null,
|
280 |
+
"checkpoints_per_epoch": 0,
|
281 |
+
"compile": false,
|
282 |
+
"early_stop": 5,
|
283 |
+
"effective_batch_size": "auto",
|
284 |
+
"enable_gradient_checkpointing": false,
|
285 |
+
"enable_profiling": false,
|
286 |
+
"epochs": 2,
|
287 |
+
"eval_batch_size": 2,
|
288 |
+
"evaluate_training_set": false,
|
289 |
+
"gradient_accumulation_steps": 16,
|
290 |
+
"gradient_clipping": {
|
291 |
+
"clipglobalnorm": 0.5,
|
292 |
+
"clipnorm": null,
|
293 |
+
"clipvalue": null
|
294 |
+
},
|
295 |
+
"increase_batch_size_eval_metric": "loss",
|
296 |
+
"increase_batch_size_eval_split": "training",
|
297 |
+
"increase_batch_size_on_plateau": 0,
|
298 |
+
"increase_batch_size_on_plateau_patience": 5,
|
299 |
+
"increase_batch_size_on_plateau_rate": 2.0,
|
300 |
+
"learning_rate": 0.0005,
|
301 |
+
"learning_rate_scaling": "linear",
|
302 |
+
"learning_rate_scheduler": {
|
303 |
+
"decay": null,
|
304 |
+
"decay_rate": 0.96,
|
305 |
+
"decay_steps": 10000,
|
306 |
+
"eta_min": 0,
|
307 |
+
"reduce_eval_metric": "loss",
|
308 |
+
"reduce_eval_split": "training",
|
309 |
+
"reduce_on_plateau": 0,
|
310 |
+
"reduce_on_plateau_patience": 10,
|
311 |
+
"reduce_on_plateau_rate": 0.1,
|
312 |
+
"staircase": false,
|
313 |
+
"t_0": null,
|
314 |
+
"t_mult": 1,
|
315 |
+
"warmup_evaluations": 0,
|
316 |
+
"warmup_fraction": 0.03
|
317 |
+
},
|
318 |
+
"max_batch_size": 1099511627776,
|
319 |
+
"optimizer": {
|
320 |
+
"amsgrad": false,
|
321 |
+
"betas": [
|
322 |
+
0.9,
|
323 |
+
0.999
|
324 |
+
],
|
325 |
+
"eps": 1e-08,
|
326 |
+
"type": "adam",
|
327 |
+
"weight_decay": 0.0
|
328 |
+
},
|
329 |
+
"profiler": {
|
330 |
+
"active": 3,
|
331 |
+
"repeat": 5,
|
332 |
+
"skip_first": 0,
|
333 |
+
"wait": 1,
|
334 |
+
"warmup": 1
|
335 |
+
},
|
336 |
+
"regularization_lambda": 0.0,
|
337 |
+
"regularization_type": "l2",
|
338 |
+
"should_shuffle": true,
|
339 |
+
"skip_all_evaluation": false,
|
340 |
+
"steps_per_checkpoint": 0,
|
341 |
+
"train_steps": null,
|
342 |
+
"type": "finetune",
|
343 |
+
"use_mixed_precision": false,
|
344 |
+
"validation_field": "output",
|
345 |
+
"validation_metric": "loss"
|
346 |
+
}
|
347 |
+
},
|
348 |
+
"data_format": "<class 'pandas.core.frame.DataFrame'>",
|
349 |
+
"ludwig_version": "0.8.6",
|
350 |
+
"random_seed": 42,
|
351 |
+
"torch_version": "2.1.0+cu121"
|
352 |
+
}
|
api_experiment_run_0/model/training_set_metadata.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
final_checkpoint/README.md
ADDED
@@ -0,0 +1,257 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
library_name: peft
|
3 |
+
base_model: meta-llama/Llama-2-7b-hf
|
4 |
+
---
|
5 |
+
|
6 |
+
# Model Card for Model ID
|
7 |
+
|
8 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
9 |
+
|
10 |
+
|
11 |
+
|
12 |
+
## Model Details
|
13 |
+
|
14 |
+
### Model Description
|
15 |
+
|
16 |
+
<!-- Provide a longer summary of what this model is. -->
|
17 |
+
|
18 |
+
|
19 |
+
|
20 |
+
- **Developed by:** [More Information Needed]
|
21 |
+
- **Shared by [optional]:** [More Information Needed]
|
22 |
+
- **Model type:** [More Information Needed]
|
23 |
+
- **Language(s) (NLP):** [More Information Needed]
|
24 |
+
- **License:** [More Information Needed]
|
25 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
26 |
+
|
27 |
+
### Model Sources [optional]
|
28 |
+
|
29 |
+
<!-- Provide the basic links for the model. -->
|
30 |
+
|
31 |
+
- **Repository:** [More Information Needed]
|
32 |
+
- **Paper [optional]:** [More Information Needed]
|
33 |
+
- **Demo [optional]:** [More Information Needed]
|
34 |
+
|
35 |
+
## Uses
|
36 |
+
|
37 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
38 |
+
|
39 |
+
### Direct Use
|
40 |
+
|
41 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
42 |
+
|
43 |
+
[More Information Needed]
|
44 |
+
|
45 |
+
### Downstream Use [optional]
|
46 |
+
|
47 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
48 |
+
|
49 |
+
[More Information Needed]
|
50 |
+
|
51 |
+
### Out-of-Scope Use
|
52 |
+
|
53 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
54 |
+
|
55 |
+
[More Information Needed]
|
56 |
+
|
57 |
+
## Bias, Risks, and Limitations
|
58 |
+
|
59 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
60 |
+
|
61 |
+
[More Information Needed]
|
62 |
+
|
63 |
+
### Recommendations
|
64 |
+
|
65 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
66 |
+
|
67 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
68 |
+
|
69 |
+
## How to Get Started with the Model
|
70 |
+
|
71 |
+
Use the code below to get started with the model.
|
72 |
+
|
73 |
+
[More Information Needed]
|
74 |
+
|
75 |
+
## Training Details
|
76 |
+
|
77 |
+
### Training Data
|
78 |
+
|
79 |
+
<!-- This should link to a Data Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
80 |
+
|
81 |
+
[More Information Needed]
|
82 |
+
|
83 |
+
### Training Procedure
|
84 |
+
|
85 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
86 |
+
|
87 |
+
#### Preprocessing [optional]
|
88 |
+
|
89 |
+
[More Information Needed]
|
90 |
+
|
91 |
+
|
92 |
+
#### Training Hyperparameters
|
93 |
+
|
94 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
95 |
+
|
96 |
+
#### Speeds, Sizes, Times [optional]
|
97 |
+
|
98 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
99 |
+
|
100 |
+
[More Information Needed]
|
101 |
+
|
102 |
+
## Evaluation
|
103 |
+
|
104 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
105 |
+
|
106 |
+
### Testing Data, Factors & Metrics
|
107 |
+
|
108 |
+
#### Testing Data
|
109 |
+
|
110 |
+
<!-- This should link to a Data Card if possible. -->
|
111 |
+
|
112 |
+
[More Information Needed]
|
113 |
+
|
114 |
+
#### Factors
|
115 |
+
|
116 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
117 |
+
|
118 |
+
[More Information Needed]
|
119 |
+
|
120 |
+
#### Metrics
|
121 |
+
|
122 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
123 |
+
|
124 |
+
[More Information Needed]
|
125 |
+
|
126 |
+
### Results
|
127 |
+
|
128 |
+
[More Information Needed]
|
129 |
+
|
130 |
+
#### Summary
|
131 |
+
|
132 |
+
|
133 |
+
|
134 |
+
## Model Examination [optional]
|
135 |
+
|
136 |
+
<!-- Relevant interpretability work for the model goes here -->
|
137 |
+
|
138 |
+
[More Information Needed]
|
139 |
+
|
140 |
+
## Environmental Impact
|
141 |
+
|
142 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
143 |
+
|
144 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
145 |
+
|
146 |
+
- **Hardware Type:** [More Information Needed]
|
147 |
+
- **Hours used:** [More Information Needed]
|
148 |
+
- **Cloud Provider:** [More Information Needed]
|
149 |
+
- **Compute Region:** [More Information Needed]
|
150 |
+
- **Carbon Emitted:** [More Information Needed]
|
151 |
+
|
152 |
+
## Technical Specifications [optional]
|
153 |
+
|
154 |
+
### Model Architecture and Objective
|
155 |
+
|
156 |
+
[More Information Needed]
|
157 |
+
|
158 |
+
### Compute Infrastructure
|
159 |
+
|
160 |
+
[More Information Needed]
|
161 |
+
|
162 |
+
#### Hardware
|
163 |
+
|
164 |
+
[More Information Needed]
|
165 |
+
|
166 |
+
#### Software
|
167 |
+
|
168 |
+
[More Information Needed]
|
169 |
+
|
170 |
+
## Citation [optional]
|
171 |
+
|
172 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
173 |
+
|
174 |
+
**BibTeX:**
|
175 |
+
|
176 |
+
[More Information Needed]
|
177 |
+
|
178 |
+
**APA:**
|
179 |
+
|
180 |
+
[More Information Needed]
|
181 |
+
|
182 |
+
## Glossary [optional]
|
183 |
+
|
184 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
185 |
+
|
186 |
+
[More Information Needed]
|
187 |
+
|
188 |
+
## More Information [optional]
|
189 |
+
|
190 |
+
[More Information Needed]
|
191 |
+
|
192 |
+
## Model Card Authors [optional]
|
193 |
+
|
194 |
+
[More Information Needed]
|
195 |
+
|
196 |
+
## Model Card Contact
|
197 |
+
|
198 |
+
[More Information Needed]
|
199 |
+
|
200 |
+
|
201 |
+
## Training procedure
|
202 |
+
|
203 |
+
|
204 |
+
The following `bitsandbytes` quantization config was used during training:
|
205 |
+
- quant_method: bitsandbytes
|
206 |
+
- load_in_8bit: False
|
207 |
+
- load_in_4bit: True
|
208 |
+
- llm_int8_threshold: 6.0
|
209 |
+
- llm_int8_skip_modules: None
|
210 |
+
- llm_int8_enable_fp32_cpu_offload: False
|
211 |
+
- llm_int8_has_fp16_weight: False
|
212 |
+
- bnb_4bit_quant_type: nf4
|
213 |
+
- bnb_4bit_use_double_quant: True
|
214 |
+
- bnb_4bit_compute_dtype: float16
|
215 |
+
|
216 |
+
### Framework versions
|
217 |
+
|
218 |
+
|
219 |
+
- PEFT 0.6.2
|
220 |
+
## Training procedure
|
221 |
+
|
222 |
+
|
223 |
+
The following `bitsandbytes` quantization config was used during training:
|
224 |
+
- quant_method: bitsandbytes
|
225 |
+
- load_in_8bit: False
|
226 |
+
- load_in_4bit: True
|
227 |
+
- llm_int8_threshold: 6.0
|
228 |
+
- llm_int8_skip_modules: None
|
229 |
+
- llm_int8_enable_fp32_cpu_offload: False
|
230 |
+
- llm_int8_has_fp16_weight: False
|
231 |
+
- bnb_4bit_quant_type: nf4
|
232 |
+
- bnb_4bit_use_double_quant: True
|
233 |
+
- bnb_4bit_compute_dtype: float16
|
234 |
+
|
235 |
+
### Framework versions
|
236 |
+
|
237 |
+
|
238 |
+
- PEFT 0.6.2
|
239 |
+
## Training procedure
|
240 |
+
|
241 |
+
|
242 |
+
The following `bitsandbytes` quantization config was used during training:
|
243 |
+
- quant_method: bitsandbytes
|
244 |
+
- load_in_8bit: False
|
245 |
+
- load_in_4bit: True
|
246 |
+
- llm_int8_threshold: 6.0
|
247 |
+
- llm_int8_skip_modules: None
|
248 |
+
- llm_int8_enable_fp32_cpu_offload: False
|
249 |
+
- llm_int8_has_fp16_weight: False
|
250 |
+
- bnb_4bit_quant_type: nf4
|
251 |
+
- bnb_4bit_use_double_quant: True
|
252 |
+
- bnb_4bit_compute_dtype: float16
|
253 |
+
|
254 |
+
### Framework versions
|
255 |
+
|
256 |
+
|
257 |
+
- PEFT 0.6.2
|
final_checkpoint/adapter_config.json
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"alpha_pattern": {},
|
3 |
+
"auto_mapping": null,
|
4 |
+
"base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
|
5 |
+
"bias": "none",
|
6 |
+
"fan_in_fan_out": false,
|
7 |
+
"inference_mode": true,
|
8 |
+
"init_lora_weights": true,
|
9 |
+
"layers_pattern": null,
|
10 |
+
"layers_to_transform": null,
|
11 |
+
"lora_alpha": 16,
|
12 |
+
"lora_dropout": 0.05,
|
13 |
+
"modules_to_save": null,
|
14 |
+
"peft_type": "LORA",
|
15 |
+
"r": 32,
|
16 |
+
"rank_pattern": {},
|
17 |
+
"revision": null,
|
18 |
+
"target_modules": [
|
19 |
+
"v_proj",
|
20 |
+
"q_proj"
|
21 |
+
],
|
22 |
+
"task_type": "CAUSAL_LM"
|
23 |
+
}
|
final_checkpoint/adapter_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ff040de074dd2a7aa9cbd8503ded069b074f1e4f8e3c3f45e0fc3926ac2a8065
|
3 |
+
size 67155338
|
runs/Nov14_21-53-35_jupyter-carlosruizmoreno/events.out.tfevents.1699998818.jupyter-carlosruizmoreno.526.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4cafb8ea2b81e391675d864f788346014c786389890c3dc066cc8cfbc2c8713e
|
3 |
+
size 6453
|
runs/Nov15_09-42-53_jupyter-carlosruizmoreno/events.out.tfevents.1700041375.jupyter-carlosruizmoreno.623.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:845fc7be2377b79155a3a7fae8f8ddcf4300f6320bdae80f4ac9a2db6888a7d7
|
3 |
+
size 9593
|
runs/Nov15_11-40-06_jupyter-carlosruizmoreno/events.out.tfevents.1700048408.jupyter-carlosruizmoreno.623.1
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9260ab564506a4949dbf7b9ccbddfcaa6b5ae7ed4d79e3843f555ac38c7fea61
|
3 |
+
size 40
|
runs/Nov15_11-41-48_jupyter-carlosruizmoreno/events.out.tfevents.1700048511.jupyter-carlosruizmoreno.2192.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e3c3f3c0fa9e343538403f452c37fc2f32f5cd37f1b24355b9fd393de3ab3233
|
3 |
+
size 4565
|
runs/Nov15_11-47-35_jupyter-carlosruizmoreno/events.out.tfevents.1700048867.jupyter-carlosruizmoreno.2530.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:52ece578ceedd5ba2365a6139e707d72ef5d98138d0d25b7ab3c0d536562fd7e
|
3 |
+
size 4565
|
runs/Nov15_11-49-46_jupyter-carlosruizmoreno/events.out.tfevents.1700048989.jupyter-carlosruizmoreno.2530.1
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:02c4a424b803d40dd9586e0c5d81cf9d248301721babb321337d11031b22f730
|
3 |
+
size 4564
|
runs/Nov15_11-50-51_jupyter-carlosruizmoreno/events.out.tfevents.1700049053.jupyter-carlosruizmoreno.2971.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:daccf10988cebaad93dcbb4cdf2167048834486520dbbd767f91588a71d940bd
|
3 |
+
size 8022
|
special_tokens_map.json
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"bos_token": {
|
3 |
+
"content": "<s>",
|
4 |
+
"lstrip": false,
|
5 |
+
"normalized": false,
|
6 |
+
"rstrip": false,
|
7 |
+
"single_word": false
|
8 |
+
},
|
9 |
+
"eos_token": {
|
10 |
+
"content": "</s>",
|
11 |
+
"lstrip": false,
|
12 |
+
"normalized": false,
|
13 |
+
"rstrip": false,
|
14 |
+
"single_word": false
|
15 |
+
},
|
16 |
+
"pad_token": "</s>",
|
17 |
+
"unk_token": {
|
18 |
+
"content": "<unk>",
|
19 |
+
"lstrip": false,
|
20 |
+
"normalized": false,
|
21 |
+
"rstrip": false,
|
22 |
+
"single_word": false
|
23 |
+
}
|
24 |
+
}
|
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
tokenizer.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
|
3 |
+
size 499723
|
tokenizer_config.json
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"added_tokens_decoder": {
|
3 |
+
"0": {
|
4 |
+
"content": "<unk>",
|
5 |
+
"lstrip": false,
|
6 |
+
"normalized": false,
|
7 |
+
"rstrip": false,
|
8 |
+
"single_word": false,
|
9 |
+
"special": true
|
10 |
+
},
|
11 |
+
"1": {
|
12 |
+
"content": "<s>",
|
13 |
+
"lstrip": false,
|
14 |
+
"normalized": false,
|
15 |
+
"rstrip": false,
|
16 |
+
"single_word": false,
|
17 |
+
"special": true
|
18 |
+
},
|
19 |
+
"2": {
|
20 |
+
"content": "</s>",
|
21 |
+
"lstrip": false,
|
22 |
+
"normalized": false,
|
23 |
+
"rstrip": false,
|
24 |
+
"single_word": false,
|
25 |
+
"special": true
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"bos_token": "<s>",
|
29 |
+
"clean_up_tokenization_spaces": false,
|
30 |
+
"eos_token": "</s>",
|
31 |
+
"legacy": false,
|
32 |
+
"model_max_length": 1000000000000000019884624838656,
|
33 |
+
"pad_token": "</s>",
|
34 |
+
"padding_side": "right",
|
35 |
+
"sp_model_kwargs": {},
|
36 |
+
"tokenizer_class": "LlamaTokenizer",
|
37 |
+
"unk_token": "<unk>",
|
38 |
+
"use_default_system_prompt": false
|
39 |
+
}
|
training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0713cc91f7b7cd310bc527e52fc31872b4f8739d95c22b48ad0ca8ce2a7019a4
|
3 |
+
size 4600
|