---
language:
- en
- de
- fr
- it
- pt
- hi
- es
- th
library_name: transformers
pipeline_tag: image-text-to-text
tags:
- meta
- pytorch
- llama
- llama-3
- vision
base_model:
- meta-llama/Llama-3.2-11B-Vision-Instruct
- rombodawg/Llama-3-8B-Instruct-Coder
---

# Lumimimaid v0.2 8B + Llama 3.2 Vision Adapter

This model was created using the script below, which grafts the language-model weights of a text-only Llama model into a Llama Vision model. The script is compatible with:

* Llama 3.1 8B & 70B (text models)
* Llama 3.2 Vision 11B & 90B (the corresponding multimodal models, respectively)

## Merge Script

```python
import os

import torch
from transformers import MllamaForConditionalGeneration, MllamaProcessor, AutoModelForCausalLM

# NOTE: You need enough DRAM to load both models at once (otherwise the
# weights would have to be processed layer by layer, which is not shown here)

multimodal_model_path = "meta-llama/Llama-3.2-11B-Vision-Instruct"  # Original Llama vision model (11B or 90B)
text_model_path = "rombodawg/Llama-3-8B-Instruct-Coder"  # Model to be merged in (8B or 70B)
save_path = "models/merged_model"

multimodal_model = MllamaForConditionalGeneration.from_pretrained(multimodal_model_path, device_map="cpu", torch_dtype=torch.bfloat16)
multimodal_processor = MllamaProcessor.from_pretrained(multimodal_model_path)
text_model = AutoModelForCausalLM.from_pretrained(text_model_path, device_map="cpu", torch_dtype=torch.bfloat16)

state_dict_multimodal = multimodal_model.state_dict()
state_dict_text = text_model.state_dict()

num_decoder_layers_text = text_model.config.num_hidden_layers
num_decoder_layers_vision = multimodal_model.config.text_config.num_hidden_layers

# Find the set of cross-attention layers inserted into the multimodal Llama
inserted_layers = set()
for key_multimodal in state_dict_multimodal.keys():
    if "language_model" in key_multimodal and "cross_attn" in key_multimodal and ".layers." in key_multimodal:
        layer_num_multimodal = int(key_multimodal.split(".layers.")[1].split(".")[0])
        inserted_layers.add(layer_num_multimodal)
# The hard-coded sets of inserted layers, for reference (they should match the set found above):
# inserted_layers = {3, 8, 13, 18, 23, 28, 33, 38, 43, 48, 53, 58, 63, 68, 73, 78, 83, 88, 93, 98}  # For 90B
inserted_layers = {3, 8, 13, 18, 23, 28, 33, 38}  # For 11B

assert len(inserted_layers) == num_decoder_layers_vision - num_decoder_layers_text, "# of added layers does not match"

# Build a decoder-layer map from multimodal layer number to text layer number,
# skipping the layers listed in inserted_layers
layer_map = dict()
layer_num_multimodal = 0
for layer_num_text in range(num_decoder_layers_text):
    while layer_num_multimodal in inserted_layers:
        layer_num_multimodal += 1  # Skip over the inserted cross-attention layers
    layer_map[layer_num_multimodal] = layer_num_text
    layer_num_multimodal += 1

for key_multimodal in state_dict_multimodal.keys():
    if "language_model" not in key_multimodal:
        continue  # A vision-tower param; keep as-is
    if "cross_attn" in key_multimodal:
        continue  # A multimodal-only param; keep as-is
    key_text = key_multimodal.replace("language_model.", "")
    if "embed_tokens.weight" in key_multimodal:  # Handle embed tokens separately
        assert key_text in state_dict_text, f"Key not found: {key_text}"
        # The multimodal vocabulary appends extra special tokens (e.g. the image token),
        # so only the overlapping rows are replaced
        extra_tokens = state_dict_multimodal[key_multimodal].shape[0] - state_dict_text[key_text].shape[0]
        state_dict_multimodal[key_multimodal][:state_dict_text[key_text].shape[0], :].copy_(state_dict_text[key_text])
        print(f"Replaced {key_multimodal} with {key_text} (preserving last {extra_tokens} tokens)")
        continue
    if "lm_head" in key_multimodal or "model.norm.weight" in key_multimodal:  # Handle other non-decoder weights separately
        assert key_text in state_dict_text, f"Key not found: {key_text}"
        state_dict_multimodal[key_multimodal].copy_(state_dict_text[key_text])
        print(f"Replaced {key_multimodal} with {key_text}")
        continue
    layer_num_multimodal = int(key_multimodal.split(".layers.")[1].split(".")[0]) if ".layers." in key_multimodal else None
    assert layer_num_multimodal is not None, f"Unknown non-decoder key encountered: {key_multimodal}"
    if layer_num_multimodal in inserted_layers:
        continue  # Skip the inserted cross-attention layers
    assert layer_num_multimodal in layer_map, f"Layer not found in layer_map: {layer_num_multimodal}"
    layer_num_text = layer_map[layer_num_multimodal]
    key_text = key_text.replace(f".layers.{layer_num_multimodal}.", f".layers.{layer_num_text}.")
    assert key_text in state_dict_text, f"Key not found: {key_text}"
    state_dict_multimodal[key_multimodal].copy_(state_dict_text[key_text])
    print(f"Replaced {key_multimodal} with {key_text}")

print("Merged model successfully. Saving...")
# Apply the changes
multimodal_model.load_state_dict(state_dict_multimodal)

# Create save_path if it does not exist
os.makedirs(save_path, exist_ok=True)
multimodal_model.save_pretrained(save_path, safe_serialization=True, max_shard_size="8192MB")
multimodal_processor.save_pretrained(save_path)
print(f"Model saved to {save_path}")
```
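
For intuition about what `layer_map` ends up containing, here is a minimal standalone sketch of the same mapping logic, using the 11B numbers from the script above (32 text decoder layers, 8 inserted cross-attention layers):

```python
# Standalone sketch: map 32 text decoder layers onto the 40 multimodal
# decoder layers, skipping the 8 inserted cross-attention layers.
inserted_layers = {3, 8, 13, 18, 23, 28, 33, 38}
num_decoder_layers_text = 32

layer_map = {}
layer_num_multimodal = 0
for layer_num_text in range(num_decoder_layers_text):
    while layer_num_multimodal in inserted_layers:
        layer_num_multimodal += 1
    layer_map[layer_num_multimodal] = layer_num_text
    layer_num_multimodal += 1

print(layer_map)
# {0: 0, 1: 1, 2: 2, 4: 3, 5: 4, 6: 5, 7: 6, 9: 7, 10: 8, ...}
```

Every multimodal decoder layer except the eight inserted ones receives weights from the corresponding text layer; the inserted layers (and the whole vision tower) keep their original Llama 3.2 Vision weights.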

## Model Inference

Load the merged model from `save_path` and use it like any Llama 3.2 Vision model:

```python
import requests
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor

model_id = "models/merged_model"  # Path the merge script saved to (or this repo's model id)

model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)

url = "https://example.com/image.jpg"  # Replace with your own image URL
image = Image.open(requests.get(url, stream=True).raw)

messages = [{"role": "user", "content": [
    {"type": "image"},
    {"type": "text", "text": "Describe this image."},
]}]
input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(image, input_text, add_special_tokens=False, return_tensors="pt").to(model.device)

output = model.generate(**inputs, max_new_tokens=128)
print(processor.decode(output[0]))
```
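
Because only the language-model weights are swapped, the merged model should also handle text-only prompts; the inserted cross-attention layers are simply unused when no image is passed. A minimal sketch, continuing from the `model` and `processor` loaded above (the prompt is illustrative):

```python
# Text-only generation sketch: no image in the prompt, so the
# cross-attention layers are not exercised.
messages = [{"role": "user", "content": [
    {"type": "text", "text": "Write a Python function that reverses a string."},
]}]
input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=input_text, add_special_tokens=False, return_tensors="pt").to(model.device)

output = model.generate(**inputs, max_new_tokens=200)
print(processor.decode(output[0]))
```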

## License

This project is licensed under the MIT License.