Support 8B
merge_vision_example.py  CHANGED  (+12 -5)
@@ -4,9 +4,8 @@ from transformers import MllamaForConditionalGeneration, MllamaProcessor, AutoMo
 
 # NOTE: You need sufficient DRAM to load both models at once (otherwise, need to process layer by layer which is not shown here)
 
-multimodal_model_path = "models/meta-llama-Llama-3.2-90B-Vision-Instruct" # Original Llama vision model (90B)
-text_model_path = "models/path_to_Llama3.1_70B" # Model to be merged (70B)
-
+multimodal_model_path = "models/meta-llama-Llama-3.2-90B-Vision-Instruct" # Original Llama vision model (11B or 90B)
+text_model_path = "models/path_to_Llama3.1_70B" # Model to be merged (8B or 70B)
 save_path = "models/merged_model"
 
 multimodal_model = MllamaForConditionalGeneration.from_pretrained(multimodal_model_path, device_map="cpu", torch_dtype=torch.bfloat16)
@@ -19,8 +18,16 @@ state_dict_text = text_model.state_dict()
 num_decoder_layers_text = text_model.config.num_hidden_layers
 num_decoder_layers_vision = multimodal_model.config.text_config.num_hidden_layers
 
-#
-inserted_layers = {3, 8, 13, 18, 23, 28, 33, 38, 43, 48, 53, 58, 63, 68, 73, 78, 83, 88, 93, 98}
+# Find the list of inserted layers in multimodal Llama
+inserted_layers = set()
+for key_multimodal in state_dict_multimodal.keys():
+    if "language_model" in key_multimodal and "cross_attn" in key_multimodal and ".layers." in key_multimodal:
+        layer_num_multimodal = int(key_multimodal.split(".layers.")[1].split(".")[0]) if ".layers." in key_multimodal else None
+        if layer_num_multimodal is not None: inserted_layers.add(layer_num_multimodal)
+# Here are the hard-coded lists of layers added:
+# inserted_layers = {3, 8, 13, 18, 23, 28, 33, 38, 43, 48, 53, 58, 63, 68, 73, 78, 83, 88, 93, 98}  # For 90B
+# inserted_layers = {3, 8, 13, 18, 23, 28, 33, 38}  # For 11B
+
 assert len(inserted_layers) == num_decoder_layers_vision-num_decoder_layers_text, "# of added layers do not match"
 
 # Build decoder layer map from multimodal layer# to text layer#, skipping layers listed in inserted_layers
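The final context line above refers to a decoder-layer map that the rest of merge_vision_example.py builds; that code lies outside this hunk. For orientation, here is a minimal sketch, using a hypothetical helper rather than the script's actual implementation, of how such a map can be derived from inserted_layers: walk the multimodal decoder indices in order and assign the next text-model index to every layer that is not one of the inserted cross-attention layers.

# Sketch only (hypothetical helper, not part of merge_vision_example.py): map each
# multimodal decoder layer index to the text-model layer it should be merged with,
# skipping the cross-attention layers that exist only in the multimodal model.
def build_decoder_layer_map(num_decoder_layers_vision, inserted_layers):
    layer_map = {}  # multimodal layer # -> text layer #
    text_layer = 0
    for vision_layer in range(num_decoder_layers_vision):
        if vision_layer in inserted_layers:
            continue  # inserted cross-attention layer: no counterpart in the text model
        layer_map[vision_layer] = text_layer
        text_layer += 1
    return layer_map

# 11B/8B pairing: 40 multimodal decoder layers, 8 of them inserted, 32 text layers.
layer_map = build_decoder_layer_map(40, {3, 8, 13, 18, 23, 28, 33, 38})
assert len(layer_map) == 32 and layer_map[4] == 3  # layer 3 is skipped, so 4 maps to 3

The assertion in the diff enforces the same bookkeeping: the number of detected cross-attention layers must equal num_decoder_layers_vision - num_decoder_layers_text, i.e. 20 for the 90B/70B pairing and 8 for the 11B/8B pairing.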
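The NOTE at the top of the file warns that both models are loaded into DRAM at once and that a layer-by-layer variant is not shown. For the detection step alone, the full weights are not strictly required: a sharded Hugging Face checkpoint ships a model.safetensors.index.json whose weight_map lists every parameter name, so the same cross-attention scan can run against that file. The snippet below is a sketch under the assumption that the index file uses the same language_model / cross_attn parameter naming as the state dict loaded by the script; it only discovers the inserted layer indices and does not replace the merge itself.

# Sketch (assumptions: the checkpoint is sharded and its index file uses the same
# parameter names as multimodal_model.state_dict()).
import json
import os

multimodal_model_path = "models/meta-llama-Llama-3.2-90B-Vision-Instruct"
index_path = os.path.join(multimodal_model_path, "model.safetensors.index.json")

with open(index_path) as f:
    weight_map = json.load(f)["weight_map"]  # parameter name -> shard file name

inserted_layers = set()
for key in weight_map:
    if "language_model" in key and "cross_attn" in key and ".layers." in key:
        inserted_layers.add(int(key.split(".layers.")[1].split(".")[0]))

print(sorted(inserted_layers))  # expect 3, 8, 13, ..., 98 for the 90B checkpoint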