diff --git a/sequoia/Llama-2-7b-hf_chunk1.mlmodelc/analytics/coremldata.bin b/sequoia/Llama-2-7b-hf_chunk1.mlmodelc/analytics/coremldata.bin
index 15dd0b71b6d13ef3a2902949ba73fd3b01733580..a7a08f33581671f152ae7364261093a9b65b68ac 100644
--- a/sequoia/Llama-2-7b-hf_chunk1.mlmodelc/analytics/coremldata.bin
+++ b/sequoia/Llama-2-7b-hf_chunk1.mlmodelc/analytics/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e729e06a5dac91d54425432e10c01d40645eefd035e7d3569e6aaf5acc4a1493
+oid sha256:e8372d12aa224d728fc434e91b2c1432b7ef69216416bb047c5f7ae2707e4120
 size 243
diff --git a/sequoia/Llama-2-7b-hf_chunk1.mlmodelc/coremldata.bin b/sequoia/Llama-2-7b-hf_chunk1.mlmodelc/coremldata.bin
index 07b8e3785b2dcaee4b07f9e806b6bb84f4c00c7e..7e0e82454afd5db20efc14541476eea12e0c1500 100644
--- a/sequoia/Llama-2-7b-hf_chunk1.mlmodelc/coremldata.bin
+++ b/sequoia/Llama-2-7b-hf_chunk1.mlmodelc/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1a55bcffcb4e191cd6358ad92d705948cd757010e873528f66b6e21943904acd
+oid sha256:9d888daf26172f67d0fb48d9f30faca6f62b348e0e571de6855c2a60530aa2bb
 size 485
diff --git a/sequoia/Llama-2-7b-hf_chunk1.mlmodelc/metadata.json b/sequoia/Llama-2-7b-hf_chunk1.mlmodelc/metadata.json
index d345f6f24057ccf6ac50e7349f16d9bdd578b45e..b6157da7a61a338688c6d104c8b3e9eb27c7e4cf 100644
--- a/sequoia/Llama-2-7b-hf_chunk1.mlmodelc/metadata.json
+++ b/sequoia/Llama-2-7b-hf_chunk1.mlmodelc/metadata.json
@@ -138,9 +138,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Int32",
-            "formattedType" : "MultiArray (Int32 1 × 1)",
+            "formattedType" : "MultiArray (Int32 1 × 4)",
             "shortDescription" : "",
-            "shape" : "[1, 1]",
+            "shape" : "[1, 4]",
             "name" : "input_ids",
             "type" : "MultiArray"
           },
@@ -165,9 +165,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 1)",
+            "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 4)",
             "shortDescription" : "",
-            "shape" : "[1, 4096, 1, 1]",
+            "shape" : "[1, 4096, 1, 4]",
             "name" : "x",
             "type" : "MultiArray"
           },
@@ -175,9 +175,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 128 × 1)",
+            "formattedType" : "MultiArray (Float16 128 × 4)",
             "shortDescription" : "",
-            "shape" : "[128, 1]",
+            "shape" : "[128, 4]",
             "name" : "cos",
             "type" : "MultiArray"
           },
@@ -185,9 +185,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 128 × 1)",
+            "formattedType" : "MultiArray (Float16 128 × 4)",
             "shortDescription" : "",
-            "shape" : "[128, 1]",
+            "shape" : "[128, 4]",
             "name" : "sin",
             "type" : "MultiArray"
           },
@@ -195,23 +195,24 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 1 × 1 × 512)",
+            "formattedType" : "MultiArray (Float16 1 × 1 × 4 × 512)",
             "shortDescription" : "",
-            "shape" : "[1, 1, 1, 512]",
+            "shape" : "[1, 1, 4, 512]",
             "name" : "mask",
             "type" : "MultiArray"
           }
         ],
         "name" : "input_1_context_512",
         "mlProgramOperationTypeHistogram" : {
-          "Select" : 1,
+          "Select" : 2,
           "Ios18.maximum" : 1,
           "Ios18.gather" : 3,
           "Ios18.sub" : 3,
           "Ios18.transpose" : 1,
-          "Ios18.less" : 1,
+          "Ios18.less" : 2,
           "Ios18.cast" : 2,
-          "Ios18.expandDims" : 4
+          "Ios18.expandDims" : 4,
+          "Tile" : 2
         }
       }
     ],
@@ -265,7 +266,7 @@
       }
     ],
     "defaultFunctionName" : "input_512_context_512",
-    "generatedClassName" : "Llama_2_7b_hf_2024_07_02_20_36_17_merged_chunk1",
+    "generatedClassName" : "Llama_2_7b_hf_2024_07_17_19_34_17_merged_chunk1",
     "userDefinedMetadata" : {
 
     },
diff --git a/sequoia/Llama-2-7b-hf_chunk1.mlmodelc/model.mil b/sequoia/Llama-2-7b-hf_chunk1.mlmodelc/model.mil
index 45e824a261a5b64081e85bc7f57f633e23aee323..5ce1b265fe7a055f766aba137694fea08ebe8356 100644
--- a/sequoia/Llama-2-7b-hf_chunk1.mlmodelc/model.mil
+++ b/sequoia/Llama-2-7b-hf_chunk1.mlmodelc/model.mil
@@ -1,49 +1,56 @@
 program(1.3)
-[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3400.34.1"}, {"coremlc-version", "3400.42.1"}})]
+[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3400.42.1"}, {"coremlc-version", "3400.51.1"}})]
 {
-    func input_1_context_512<ios18>(tensor<int32, [1]> full_sequence_length, tensor<int32, [1, 1]> input_ids) {
-            tensor<int32, [1]> T = const()[name = string("T"), val = tensor<int32, [1]>([1])];
+    func input_1_context_512<ios18>(tensor<int32, [1]> full_sequence_length, tensor<int32, [1, 4]> input_ids) {
+            tensor<int32, [1]> T = const()[name = string("T"), val = tensor<int32, [1]>([4])];
             int32 x_axis_0 = const()[name = string("x_axis_0"), val = int32(0)];
             int32 x_batch_dims_0 = const()[name = string("x_batch_dims_0"), val = int32(0)];
             bool x_validate_indices_0 = const()[name = string("x_validate_indices_0"), val = bool(false)];
             tensor<fp16, [32000, 4096]> wte_weight_to_fp16 = const()[name = string("wte_weight_to_fp16"), val = tensor<fp16, [32000, 4096]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64)))];
             string input_ids_to_int16_dtype_0 = const()[name = string("input_ids_to_int16_dtype_0"), val = string("int16")];
-            tensor<int16, [1, 1]> input_ids_to_int16 = cast(dtype = input_ids_to_int16_dtype_0, x = input_ids)[name = string("cast_6")];
-            tensor<fp16, [1, 1, 4096]> x_cast_fp16_cast_uint16 = gather(axis = x_axis_0, batch_dims = x_batch_dims_0, indices = input_ids_to_int16, validate_indices = x_validate_indices_0, x = wte_weight_to_fp16)[name = string("x_cast_fp16_cast_uint16")];
+            tensor<int16, [1, 4]> input_ids_to_int16 = cast(dtype = input_ids_to_int16_dtype_0, x = input_ids)[name = string("cast_6")];
+            tensor<fp16, [1, 4, 4096]> x_cast_fp16_cast_uint16 = gather(axis = x_axis_0, batch_dims = x_batch_dims_0, indices = input_ids_to_int16, validate_indices = x_validate_indices_0, x = wte_weight_to_fp16)[name = string("x_cast_fp16_cast_uint16")];
             tensor<int32, [3]> var_16_perm_0 = const()[name = string("op_16_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
             tensor<int32, [1]> var_18_axes_0 = const()[name = string("op_18_axes_0"), val = tensor<int32, [1]>([2])];
-            tensor<fp16, [1, 4096, 1]> var_16_cast_fp16 = transpose(perm = var_16_perm_0, x = x_cast_fp16_cast_uint16)[name = string("transpose_0")];
-            tensor<fp16, [1, 4096, 1, 1]> x = expand_dims(axes = var_18_axes_0, x = var_16_cast_fp16)[name = string("op_18_cast_fp16")];
+            tensor<fp16, [1, 4096, 4]> var_16_cast_fp16 = transpose(perm = var_16_perm_0, x = x_cast_fp16_cast_uint16)[name = string("transpose_0")];
+            tensor<fp16, [1, 4096, 1, 4]> x = expand_dims(axes = var_18_axes_0, x = var_16_cast_fp16)[name = string("op_18_cast_fp16")];
             tensor<int32, [1]> pos_offset = sub(x = T, y = full_sequence_length)[name = string("pos_offset")];
-            tensor<int32, [1]> var_26 = const()[name = string("op_26"), val = tensor<int32, [1]>([0])];
-            tensor<int32, [1]> input_pos_1 = sub(x = var_26, y = pos_offset)[name = string("input_pos_1")];
-            tensor<int32, [1]> var_34 = const()[name = string("op_34"), val = tensor<int32, [1]>([0])];
-            tensor<int32, [1]> input_pos = maximum(x = input_pos_1, y = var_34)[name = string("input_pos")];
+            tensor<int32, [4]> var_26 = const()[name = string("op_26"), val = tensor<int32, [4]>([0, 1, 2, 3])];
+            tensor<int32, [4]> input_pos_1 = sub(x = var_26, y = pos_offset)[name = string("input_pos_1")];
+            tensor<int32, [4]> var_34 = const()[name = string("op_34"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> input_pos = maximum(x = input_pos_1, y = var_34)[name = string("input_pos")];
             int32 var_45 = const()[name = string("op_45"), val = int32(1)];
             int32 var_46_batch_dims_0 = const()[name = string("op_46_batch_dims_0"), val = int32(0)];
             bool var_46_validate_indices_0 = const()[name = string("op_46_validate_indices_0"), val = bool(false)];
             tensor<fp16, [128, 512]> var_44_to_fp16 = const()[name = string("op_44_to_fp16"), val = tensor<fp16, [128, 512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(262144128)))];
             string input_pos_to_uint16_dtype_0 = const()[name = string("input_pos_to_uint16_dtype_0"), val = string("uint16")];
-            tensor<uint16, [1]> input_pos_to_uint16 = cast(dtype = input_pos_to_uint16_dtype_0, x = input_pos)[name = string("cast_5")];
-            tensor<fp16, [128, 1]> cos = gather(axis = var_45, batch_dims = var_46_batch_dims_0, indices = input_pos_to_uint16, validate_indices = var_46_validate_indices_0, x = var_44_to_fp16)[name = string("op_46_cast_fp16_cast_uint16")];
+            tensor<uint16, [4]> input_pos_to_uint16 = cast(dtype = input_pos_to_uint16_dtype_0, x = input_pos)[name = string("cast_5")];
+            tensor<fp16, [128, 4]> cos = gather(axis = var_45, batch_dims = var_46_batch_dims_0, indices = input_pos_to_uint16, validate_indices = var_46_validate_indices_0, x = var_44_to_fp16)[name = string("op_46_cast_fp16_cast_uint16")];
             int32 var_56 = const()[name = string("op_56"), val = int32(1)];
             int32 var_57_batch_dims_0 = const()[name = string("op_57_batch_dims_0"), val = int32(0)];
             bool var_57_validate_indices_0 = const()[name = string("op_57_validate_indices_0"), val = bool(false)];
             tensor<fp16, [128, 512]> var_55_to_fp16 = const()[name = string("op_55_to_fp16"), val = tensor<fp16, [128, 512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(262275264)))];
-            tensor<fp16, [128, 1]> sin = gather(axis = var_56, batch_dims = var_57_batch_dims_0, indices = input_pos_to_uint16, validate_indices = var_57_validate_indices_0, x = var_55_to_fp16)[name = string("op_57_cast_fp16_cast_uint16")];
-            tensor<int32, [512]> var_104 = const()[name = string("op_104"), val = tensor<int32, [512]>([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511])];
-            int32 var_105 = const()[name = string("op_105"), val = int32(512)];
-            tensor<int32, [1]> var_107 = sub(x = var_105, y = full_sequence_length)[name = string("op_107")];
-            tensor<bool, [512]> var_108 = less(x = var_104, y = var_107)[name = string("op_108")];
+            tensor<fp16, [128, 4]> sin = gather(axis = var_56, batch_dims = var_57_batch_dims_0, indices = input_pos_to_uint16, validate_indices = var_57_validate_indices_0, x = var_55_to_fp16)[name = string("op_57_cast_fp16_cast_uint16")];
+            tensor<int32, [4, 1]> var_92 = const()[name = string("op_92"), val = tensor<int32, [4, 1]>([[0], [1], [2], [3]])];
+            tensor<bool, [4, 1]> var_95 = less(x = var_92, y = pos_offset)[name = string("op_95")];
+            tensor<int32, [2]> var_95_after_broadcast_reps_0 = const()[name = string("op_95_after_broadcast_reps_0"), val = tensor<int32, [2]>([1, 512])];
+            tensor<bool, [4, 512]> var_95_after_broadcast = tile(reps = var_95_after_broadcast_reps_0, x = var_95)[name = string("op_95_after_broadcast")];
+            tensor<fp16, [4, 512]> all_mask_to_fp16 = const()[name = string("all_mask_to_fp16"), val = tensor<fp16, [4, 512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(263455104)))];
+            tensor<fp16, [4, 512]> m_1_to_fp16 = const()[name = string("m_1_to_fp16"), val = tensor<fp16, [4, 512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(263459264)))];
+            tensor<fp16, [4, 512]> m_3_cast_fp16 = select(a = all_mask_to_fp16, b = m_1_to_fp16, cond = var_95_after_broadcast)[name = string("m_3_cast_fp16")];
+            tensor<int32, [512]> var_105 = const()[name = string("op_105"), val = tensor<int32, [512]>([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511])];
+            int32 var_106 = const()[name = string("op_106"), val = int32(512)];
+            tensor<int32, [1]> var_108 = sub(x = var_106, y = full_sequence_length)[name = string("op_108")];
+            tensor<bool, [512]> var_109 = less(x = var_105, y = var_108)[name = string("op_109")];
             tensor<int32, [1]> expand_dims_0_axes_0 = const()[name = string("expand_dims_0_axes_0"), val = tensor<int32, [1]>([0])];
-            tensor<bool, [1, 512]> expand_dims_0 = expand_dims(axes = expand_dims_0_axes_0, x = var_108)[name = string("expand_dims_0")];
-            tensor<fp16, [1, 512]> all_mask_to_fp16 = const()[name = string("all_mask_to_fp16"), val = tensor<fp16, [1, 512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(263455104)))];
-            tensor<fp16, [1, 512]> m_1_to_fp16 = const()[name = string("m_1_to_fp16"), val = tensor<fp16, [1, 512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(263456192)))];
-            tensor<fp16, [1, 512]> m_cast_fp16 = select(a = all_mask_to_fp16, b = m_1_to_fp16, cond = expand_dims_0)[name = string("m_cast_fp16")];
-            tensor<int32, [1]> var_111_axes_0 = const()[name = string("op_111_axes_0"), val = tensor<int32, [1]>([0])];
-            tensor<fp16, [1, 1, 512]> var_111_cast_fp16 = expand_dims(axes = var_111_axes_0, x = m_cast_fp16)[name = string("op_111_cast_fp16")];
-            tensor<int32, [1]> var_113_axes_0 = const()[name = string("op_113_axes_0"), val = tensor<int32, [1]>([0])];
-            tensor<fp16, [1, 1, 1, 512]> mask = expand_dims(axes = var_113_axes_0, x = var_111_cast_fp16)[name = string("op_113_cast_fp16")];
+            tensor<bool, [1, 512]> expand_dims_0 = expand_dims(axes = expand_dims_0_axes_0, x = var_109)[name = string("expand_dims_0")];
+            tensor<int32, [2]> var_109_after_broadcast_reps_0 = const()[name = string("op_109_after_broadcast_reps_0"), val = tensor<int32, [2]>([4, 1])];
+            tensor<bool, [4, 512]> var_109_after_broadcast = tile(reps = var_109_after_broadcast_reps_0, x = expand_dims_0)[name = string("op_109_after_broadcast")];
+            tensor<fp16, [4, 512]> m_cast_fp16 = select(a = all_mask_to_fp16, b = m_3_cast_fp16, cond = var_109_after_broadcast)[name = string("m_cast_fp16")];
+            tensor<int32, [1]> var_112_axes_0 = const()[name = string("op_112_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 4, 512]> var_112_cast_fp16 = expand_dims(axes = var_112_axes_0, x = m_cast_fp16)[name = string("op_112_cast_fp16")];
+            tensor<int32, [1]> var_114_axes_0 = const()[name = string("op_114_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 1, 4, 512]> mask = expand_dims(axes = var_114_axes_0, x = var_112_cast_fp16)[name = string("op_114_cast_fp16")];
         } -> (x, cos, sin, mask);
     func input_512_context_512<ios18>(tensor<int32, [1]> full_sequence_length, tensor<int32, [1, 512]> input_ids) {
             tensor<int32, [1]> T = const()[name = string("T"), val = tensor<int32, [1]>([512])];
diff --git a/sequoia/Llama-2-7b-hf_chunk1.mlmodelc/weights/weight.bin b/sequoia/Llama-2-7b-hf_chunk1.mlmodelc/weights/weight.bin
index 6e8c20873a29a5687fb43ee77417e2eb428089b0..4cf3905f9e89ebc68889571e6f7fe97ee64a4475 100644
--- a/sequoia/Llama-2-7b-hf_chunk1.mlmodelc/weights/weight.bin
+++ b/sequoia/Llama-2-7b-hf_chunk1.mlmodelc/weights/weight.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:63ea75c6154c60560d9edb4d2e2f028afa38a3927bb7277b7d01558bc198e965
-size 263457280
+oid sha256:a66aa1771f06ceee6e578b7f93444d38b2cb55120a2a84494e7649b4e424a176
+size 263463424
diff --git a/sequoia/Llama-2-7b-hf_chunk10.mlmodelc/analytics/coremldata.bin b/sequoia/Llama-2-7b-hf_chunk10.mlmodelc/analytics/coremldata.bin
index e3704b470dad34135a6b5cb4b471021bad5c3ae2..65ae082f31459bf8b09913d8861a15601cc13ad1 100644
--- a/sequoia/Llama-2-7b-hf_chunk10.mlmodelc/analytics/coremldata.bin
+++ b/sequoia/Llama-2-7b-hf_chunk10.mlmodelc/analytics/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:84e317a82cdf4e96f808f63e77f10098844d47ad522545181edfac4d287c9c92
+oid sha256:e69e7ad37dd59e97348d395eec9b4c41b7d3ea44d86f613751ae47803a0a2efe
 size 243
diff --git a/sequoia/Llama-2-7b-hf_chunk10.mlmodelc/coremldata.bin b/sequoia/Llama-2-7b-hf_chunk10.mlmodelc/coremldata.bin
index 8bbc6dd38c74fe23594355e106f197518d41765b..a503721fa97147a06f91e834353053693378b0ad 100644
--- a/sequoia/Llama-2-7b-hf_chunk10.mlmodelc/coremldata.bin
+++ b/sequoia/Llama-2-7b-hf_chunk10.mlmodelc/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e430d0795ff5c384187174f5718a2c13d0070f5d6a811831e18862497865a86d
+oid sha256:d35a0353bcfa501579e07af3718261af3b129b4bec004c1fe6d812a6403a3f5b
 size 1037
diff --git a/sequoia/Llama-2-7b-hf_chunk10.mlmodelc/metadata.json b/sequoia/Llama-2-7b-hf_chunk10.mlmodelc/metadata.json
index 2359abfd7edaf87ca048d7497ab8ddbb6a979e5b..23258cac83ba7eb110d85c01c6c4950230f6af23 100644
--- a/sequoia/Llama-2-7b-hf_chunk10.mlmodelc/metadata.json
+++ b/sequoia/Llama-2-7b-hf_chunk10.mlmodelc/metadata.json
@@ -17,9 +17,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 128, 508]",
         "name" : "new_k_cache_0",
         "type" : "MultiArray"
       },
@@ -27,9 +27,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 128, 508]",
         "name" : "new_k_cache_1",
         "type" : "MultiArray"
       },
@@ -37,9 +37,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 128, 508]",
         "name" : "new_k_cache_2",
         "type" : "MultiArray"
       },
@@ -47,9 +47,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 508, 128]",
         "name" : "new_v_cache_0",
         "type" : "MultiArray"
       },
@@ -57,9 +57,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 508, 128]",
         "name" : "new_v_cache_1",
         "type" : "MultiArray"
       },
@@ -67,9 +67,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 508, 128]",
         "name" : "new_v_cache_2",
         "type" : "MultiArray"
       }
@@ -142,9 +142,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_0",
             "type" : "MultiArray"
           },
@@ -152,9 +152,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_1",
             "type" : "MultiArray"
           },
@@ -162,9 +162,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_2",
             "type" : "MultiArray"
           },
@@ -172,9 +172,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_0",
             "type" : "MultiArray"
           },
@@ -182,9 +182,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_1",
             "type" : "MultiArray"
           },
@@ -192,9 +192,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_2",
             "type" : "MultiArray"
           }
@@ -203,14 +203,15 @@
         "mlProgramOperationTypeHistogram" : {
           "Ios18.constexprLutToDense" : 21,
           "Ios18.conv" : 21,
-          "Ios18.matmul" : 6,
           "Ios18.expandDims" : 6,
-          "Ios18.concat" : 18,
+          "Ios18.matmul" : 6,
+          "Ios18.concat" : 12,
           "Ios18.add" : 15,
           "Ios18.realDiv" : 6,
           "Ios18.silu" : 3,
           "Ios18.softmax" : 3,
           "Ios18.sliceByIndex" : 18,
+          "Ios18.transpose" : 6,
           "Ios16.reduceL2Norm" : 6,
           "Ios18.squeeze" : 6,
           "Ios18.reshape" : 12,
@@ -223,9 +224,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 1)",
+            "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 4)",
             "shortDescription" : "",
-            "shape" : "[1, 4096, 1, 1]",
+            "shape" : "[1, 4096, 1, 4]",
             "name" : "x",
             "type" : "MultiArray"
           },
@@ -233,9 +234,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 128 × 1)",
+            "formattedType" : "MultiArray (Float16 128 × 4)",
             "shortDescription" : "",
-            "shape" : "[128, 1]",
+            "shape" : "[128, 4]",
             "name" : "cos",
             "type" : "MultiArray"
           },
@@ -243,9 +244,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 128 × 1)",
+            "formattedType" : "MultiArray (Float16 128 × 4)",
             "shortDescription" : "",
-            "shape" : "[128, 1]",
+            "shape" : "[128, 4]",
             "name" : "sin",
             "type" : "MultiArray"
           },
@@ -253,9 +254,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 1 × 1 × 512)",
+            "formattedType" : "MultiArray (Float16 1 × 1 × 4 × 512)",
             "shortDescription" : "",
-            "shape" : "[1, 1, 1, 512]",
+            "shape" : "[1, 1, 4, 512]",
             "name" : "mask",
             "type" : "MultiArray"
           },
@@ -263,9 +264,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "k_cache_0",
             "type" : "MultiArray"
           },
@@ -273,9 +274,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "v_cache_0",
             "type" : "MultiArray"
           },
@@ -283,9 +284,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "k_cache_1",
             "type" : "MultiArray"
           },
@@ -293,9 +294,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "v_cache_1",
             "type" : "MultiArray"
           },
@@ -303,9 +304,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "k_cache_2",
             "type" : "MultiArray"
           },
@@ -313,9 +314,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "v_cache_2",
             "type" : "MultiArray"
           }
@@ -330,9 +331,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 1)",
+            "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 4)",
             "shortDescription" : "",
-            "shape" : "[1, 4096, 1, 1]",
+            "shape" : "[1, 4096, 1, 4]",
             "name" : "new_x",
             "type" : "MultiArray"
           },
@@ -340,9 +341,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_0",
             "type" : "MultiArray"
           },
@@ -350,9 +351,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_1",
             "type" : "MultiArray"
           },
@@ -360,9 +361,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_2",
             "type" : "MultiArray"
           },
@@ -370,9 +371,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_0",
             "type" : "MultiArray"
           },
@@ -380,9 +381,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_1",
             "type" : "MultiArray"
           },
@@ -390,9 +391,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_2",
             "type" : "MultiArray"
           }
@@ -401,14 +402,15 @@
         "mlProgramOperationTypeHistogram" : {
           "Ios18.constexprLutToDense" : 21,
           "Ios18.conv" : 21,
-          "Ios18.matmul" : 6,
           "Ios18.expandDims" : 6,
+          "Ios18.matmul" : 6,
           "Ios18.concat" : 18,
           "Ios18.add" : 15,
           "Ios18.realDiv" : 6,
           "Ios18.silu" : 3,
           "Ios18.softmax" : 3,
           "Ios18.sliceByIndex" : 18,
+          "Ios18.transpose" : 6,
           "Ios16.reduceL2Norm" : 6,
           "Ios18.squeeze" : 6,
           "Ios18.reshape" : 12,
@@ -419,14 +421,15 @@
     "mlProgramOperationTypeHistogram" : {
       "Ios18.constexprLutToDense" : 21,
       "Ios18.conv" : 21,
-      "Ios18.matmul" : 6,
       "Ios18.expandDims" : 6,
-      "Ios18.concat" : 18,
+      "Ios18.matmul" : 6,
+      "Ios18.concat" : 12,
       "Ios18.add" : 15,
       "Ios18.realDiv" : 6,
       "Ios18.silu" : 3,
       "Ios18.softmax" : 3,
       "Ios18.sliceByIndex" : 18,
+      "Ios18.transpose" : 6,
       "Ios16.reduceL2Norm" : 6,
       "Ios18.squeeze" : 6,
       "Ios18.reshape" : 12,
@@ -491,7 +494,7 @@
       }
     ],
     "defaultFunctionName" : "input_512_context_512",
-    "generatedClassName" : "Llama_2_7b_hf_2024_07_02_20_36_17_merged_chunk10",
+    "generatedClassName" : "Llama_2_7b_hf_2024_07_17_19_34_17_merged_chunk10",
     "userDefinedMetadata" : {
 
     },
diff --git a/sequoia/Llama-2-7b-hf_chunk10.mlmodelc/model.mil b/sequoia/Llama-2-7b-hf_chunk10.mlmodelc/model.mil
index 5b7b1db6b14fe10ff51aaa601d8484d9ff49576b..85f75a6da9054155e32013d96460963c79fba3f8 100644
--- a/sequoia/Llama-2-7b-hf_chunk10.mlmodelc/model.mil
+++ b/sequoia/Llama-2-7b-hf_chunk10.mlmodelc/model.mil
@@ -1,7 +1,7 @@
 program(1.3)
-[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3400.34.1"}, {"coremlc-version", "3400.42.1"}})]
+[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3400.42.1"}, {"coremlc-version", "3400.51.1"}})]
 {
-    func input_1_context_512<ios18>(tensor<fp16, [128, 1]> cos, tensor<fp16, [1, 32, 128, 511]> k_cache_0, tensor<fp16, [1, 32, 128, 511]> k_cache_1, tensor<fp16, [1, 32, 128, 511]> k_cache_2, tensor<fp16, [1, 1, 1, 512]> mask, tensor<fp16, [128, 1]> sin, tensor<fp16, [1, 32, 128, 511]> v_cache_0, tensor<fp16, [1, 32, 128, 511]> v_cache_1, tensor<fp16, [1, 32, 128, 511]> v_cache_2, tensor<fp16, [1, 4096, 1, 1]> x) [CoreML_InputDefaultValues = dict<string, fp32>({{"k_cache_0", 0}, {"k_cache_1", 0}, {"k_cache_2", 0}, {"v_cache_0", 0}, {"v_cache_1", 0}, {"v_cache_2", 0}})] {
+    func input_1_context_512<ios18>(tensor<fp16, [128, 4]> cos, tensor<fp16, [1, 32, 128, 508]> k_cache_0, tensor<fp16, [1, 32, 128, 508]> k_cache_1, tensor<fp16, [1, 32, 128, 508]> k_cache_2, tensor<fp16, [1, 1, 4, 512]> mask, tensor<fp16, [128, 4]> sin, tensor<fp16, [1, 32, 508, 128]> v_cache_0, tensor<fp16, [1, 32, 508, 128]> v_cache_1, tensor<fp16, [1, 32, 508, 128]> v_cache_2, tensor<fp16, [1, 4096, 1, 4]> x) [CoreML_InputDefaultValues = dict<string, fp32>({{"k_cache_0", 0}, {"k_cache_1", 0}, {"k_cache_2", 0}, {"v_cache_0", 0}, {"v_cache_1", 0}, {"v_cache_2", 0}})] {
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_q_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303873792))))[name = string("blocks_0_attn_q_proj_weight_palettized_cast_fp16")];
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_k_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8388864))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303873920))))[name = string("blocks_0_attn_k_proj_weight_palettized_cast_fp16")];
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_v_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(16777664))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303874048))))[name = string("blocks_0_attn_v_proj_weight_palettized_cast_fp16")];
@@ -29,438 +29,450 @@ program(1.3)
             int32 var_34 = const()[name = string("op_34"), val = int32(-2)];
             bool var_35 = const()[name = string("op_35"), val = bool(true)];
             tensor<int32, [1]> var_53_axes_0 = const()[name = string("op_53_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_53_cast_fp16 = squeeze(axes = var_53_axes_0, x = x)[name = string("op_53_cast_fp16")];
+            tensor<fp16, [1, 4096, 4]> var_53_cast_fp16 = squeeze(axes = var_53_axes_0, x = x)[name = string("op_53_cast_fp16")];
             bool var_55_interleave_0 = const()[name = string("op_55_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_1_to_fp16 = const()[name = string("eps_chan_1_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_55_cast_fp16 = concat(axis = var_31, interleave = var_55_interleave_0, values = (var_53_cast_fp16, eps_chan_1_to_fp16))[name = string("op_55_cast_fp16")];
+            tensor<fp16, [1, 1, 4]> eps_chan_1_to_fp16 = const()[name = string("eps_chan_1_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_55_cast_fp16 = concat(axis = var_31, interleave = var_55_interleave_0, values = (var_53_cast_fp16, eps_chan_1_to_fp16))[name = string("op_55_cast_fp16")];
             tensor<int32, [1]> x_eps_1_axes_0 = const()[name = string("x_eps_1_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_1_cast_fp16 = expand_dims(axes = x_eps_1_axes_0, x = var_55_cast_fp16)[name = string("x_eps_1_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_1_cast_fp16 = expand_dims(axes = x_eps_1_axes_0, x = var_55_cast_fp16)[name = string("x_eps_1_cast_fp16")];
             tensor<int32, [1]> norm_x_1_axes_0 = const()[name = string("norm_x_1_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_1_cast_fp16 = reduce_l2_norm(axes = norm_x_1_axes_0, keep_dims = var_35, x = x_eps_1_cast_fp16)[name = string("norm_x_1_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_1_cast_fp16 = real_div(x = x, y = norm_x_1_cast_fp16)[name = string("x_normed_1_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_1_cast_fp16 = reduce_l2_norm(axes = norm_x_1_axes_0, keep_dims = var_35, x = x_eps_1_cast_fp16)[name = string("norm_x_1_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_1_cast_fp16 = real_div(x = x, y = norm_x_1_cast_fp16)[name = string("x_normed_1_cast_fp16")];
             fp16 var_60_to_fp16 = const()[name = string("op_60_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_60_to_fp16)[name = string("x_normed_3_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_60_to_fp16)[name = string("x_normed_3_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_1_weight_to_fp16 = const()[name = string("blocks_0_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303567936)))];
-            tensor<fp16, [1, 4096, 1, 1]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = string("x_5_cast_fp16")];
-            tensor<int32, [2]> var_72 = const()[name = string("op_72"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_74 = const()[name = string("op_74"), val = tensor<int32, [2]>([1, 1])];
-            string var_76_pad_type_0 = const()[name = string("op_76_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_76_pad_0 = const()[name = string("op_76_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_76_cast_fp16 = conv(dilations = var_74, groups = var_31, pad = var_76_pad_0, pad_type = var_76_pad_type_0, strides = var_72, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_76_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = string("x_5_cast_fp16")];
+            tensor<int32, [2]> var_73 = const()[name = string("op_73"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_75 = const()[name = string("op_75"), val = tensor<int32, [2]>([1, 1])];
+            string var_77_pad_type_0 = const()[name = string("op_77_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_77_pad_0 = const()[name = string("op_77_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_77_cast_fp16 = conv(dilations = var_75, groups = var_31, pad = var_77_pad_0, pad_type = var_77_pad_type_0, strides = var_73, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_77_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303576192)))];
-            tensor<fp16, [1, 4096, 1, 1]> q_1_cast_fp16 = mul(x = var_76_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
-            tensor<int32, [2]> var_80 = const()[name = string("op_80"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_82 = const()[name = string("op_82"), val = tensor<int32, [2]>([1, 1])];
-            string var_84_pad_type_0 = const()[name = string("op_84_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_84_pad_0 = const()[name = string("op_84_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_84_cast_fp16 = conv(dilations = var_82, groups = var_31, pad = var_84_pad_0, pad_type = var_84_pad_type_0, strides = var_80, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_84_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> q_1_cast_fp16 = mul(x = var_77_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
+            tensor<int32, [2]> var_81 = const()[name = string("op_81"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_83 = const()[name = string("op_83"), val = tensor<int32, [2]>([1, 1])];
+            string var_85_pad_type_0 = const()[name = string("op_85_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_85_pad_0 = const()[name = string("op_85_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_85_cast_fp16 = conv(dilations = var_83, groups = var_31, pad = var_85_pad_0, pad_type = var_85_pad_type_0, strides = var_81, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_85_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303584448)))];
-            tensor<fp16, [1, 4096, 1, 1]> k_1_cast_fp16 = mul(x = var_84_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
-            tensor<int32, [2]> var_88 = const()[name = string("op_88"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_90 = const()[name = string("op_90"), val = tensor<int32, [2]>([1, 1])];
-            string var_92_pad_type_0 = const()[name = string("op_92_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_92_pad_0 = const()[name = string("op_92_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_92_cast_fp16 = conv(dilations = var_90, groups = var_31, pad = var_92_pad_0, pad_type = var_92_pad_type_0, strides = var_88, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_92_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> k_1_cast_fp16 = mul(x = var_85_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
+            tensor<int32, [2]> var_89 = const()[name = string("op_89"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_91 = const()[name = string("op_91"), val = tensor<int32, [2]>([1, 1])];
+            string var_93_pad_type_0 = const()[name = string("op_93_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_93_pad_0 = const()[name = string("op_93_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_93_cast_fp16 = conv(dilations = var_91, groups = var_31, pad = var_93_pad_0, pad_type = var_93_pad_type_0, strides = var_89, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_93_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303592704)))];
-            tensor<fp16, [1, 4096, 1, 1]> v_1_cast_fp16 = mul(x = var_92_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
-            tensor<int32, [4]> var_94 = const()[name = string("op_94"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> q_3_cast_fp16 = reshape(shape = var_94, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
-            tensor<int32, [4]> var_96 = const()[name = string("op_96"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> k_3_cast_fp16 = reshape(shape = var_96, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
-            tensor<int32, [4]> var_98 = const()[name = string("op_98"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> v_3_cast_fp16 = reshape(shape = var_98, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
-            tensor<int32, [4]> var_110_begin_0 = const()[name = string("op_110_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_110_end_0 = const()[name = string("op_110_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_110_end_mask_0 = const()[name = string("op_110_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_110_cast_fp16 = slice_by_index(begin = var_110_begin_0, end = var_110_end_0, end_mask = var_110_end_mask_0, x = q_3_cast_fp16)[name = string("op_110_cast_fp16")];
-            tensor<int32, [4]> var_116_begin_0 = const()[name = string("op_116_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_116_end_0 = const()[name = string("op_116_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_116_end_mask_0 = const()[name = string("op_116_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_116_cast_fp16 = slice_by_index(begin = var_116_begin_0, end = var_116_end_0, end_mask = var_116_end_mask_0, x = q_3_cast_fp16)[name = string("op_116_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> v_1_cast_fp16 = mul(x = var_93_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
+            tensor<int32, [4]> var_95 = const()[name = string("op_95"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> q_3_cast_fp16 = reshape(shape = var_95, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
+            tensor<int32, [4]> var_97 = const()[name = string("op_97"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> k_3_cast_fp16 = reshape(shape = var_97, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
+            tensor<int32, [4]> var_99 = const()[name = string("op_99"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> v_3_cast_fp16 = reshape(shape = var_99, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
+            tensor<int32, [4]> var_111_begin_0 = const()[name = string("op_111_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_111_end_0 = const()[name = string("op_111_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_111_end_mask_0 = const()[name = string("op_111_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_111_cast_fp16 = slice_by_index(begin = var_111_begin_0, end = var_111_end_0, end_mask = var_111_end_mask_0, x = q_3_cast_fp16)[name = string("op_111_cast_fp16")];
+            tensor<int32, [4]> var_117_begin_0 = const()[name = string("op_117_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_117_end_0 = const()[name = string("op_117_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_117_end_mask_0 = const()[name = string("op_117_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_117_cast_fp16 = slice_by_index(begin = var_117_begin_0, end = var_117_end_0, end_mask = var_117_end_mask_0, x = q_3_cast_fp16)[name = string("op_117_cast_fp16")];
             fp16 const_6_promoted_to_fp16 = const()[name = string("const_6_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_118_cast_fp16 = mul(x = var_116_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_118_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_119_cast_fp16 = mul(x = var_117_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_119_cast_fp16")];
             bool rotated_1_interleave_0 = const()[name = string("rotated_1_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_1_cast_fp16 = concat(axis = var_34, interleave = rotated_1_interleave_0, values = (var_118_cast_fp16, var_110_cast_fp16))[name = string("rotated_1_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_121_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_121_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_122_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_122_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_1_cast_fp16 = add(x = var_121_cast_fp16, y = var_122_cast_fp16)[name = string("roped_1_cast_fp16")];
-            tensor<int32, [4]> var_135_begin_0 = const()[name = string("op_135_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_135_end_0 = const()[name = string("op_135_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_135_end_mask_0 = const()[name = string("op_135_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_135_cast_fp16 = slice_by_index(begin = var_135_begin_0, end = var_135_end_0, end_mask = var_135_end_mask_0, x = k_3_cast_fp16)[name = string("op_135_cast_fp16")];
-            tensor<int32, [4]> var_141_begin_0 = const()[name = string("op_141_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_141_end_0 = const()[name = string("op_141_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_141_end_mask_0 = const()[name = string("op_141_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_141_cast_fp16 = slice_by_index(begin = var_141_begin_0, end = var_141_end_0, end_mask = var_141_end_mask_0, x = k_3_cast_fp16)[name = string("op_141_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_1_cast_fp16 = concat(axis = var_34, interleave = rotated_1_interleave_0, values = (var_119_cast_fp16, var_111_cast_fp16))[name = string("rotated_1_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_122_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_122_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_123_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_123_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_1_cast_fp16 = add(x = var_122_cast_fp16, y = var_123_cast_fp16)[name = string("roped_1_cast_fp16")];
+            tensor<int32, [4]> var_136_begin_0 = const()[name = string("op_136_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_136_end_0 = const()[name = string("op_136_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_136_end_mask_0 = const()[name = string("op_136_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_136_cast_fp16 = slice_by_index(begin = var_136_begin_0, end = var_136_end_0, end_mask = var_136_end_mask_0, x = k_3_cast_fp16)[name = string("op_136_cast_fp16")];
+            tensor<int32, [4]> var_142_begin_0 = const()[name = string("op_142_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_142_end_0 = const()[name = string("op_142_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_142_end_mask_0 = const()[name = string("op_142_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_142_cast_fp16 = slice_by_index(begin = var_142_begin_0, end = var_142_end_0, end_mask = var_142_end_mask_0, x = k_3_cast_fp16)[name = string("op_142_cast_fp16")];
             fp16 const_8_promoted_to_fp16 = const()[name = string("const_8_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_143_cast_fp16 = mul(x = var_141_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_143_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_144_cast_fp16 = mul(x = var_142_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_144_cast_fp16")];
             bool rotated_3_interleave_0 = const()[name = string("rotated_3_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_3_cast_fp16 = concat(axis = var_34, interleave = rotated_3_interleave_0, values = (var_143_cast_fp16, var_135_cast_fp16))[name = string("rotated_3_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_146_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_146_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_147_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_147_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_3_cast_fp16 = add(x = var_146_cast_fp16, y = var_147_cast_fp16)[name = string("roped_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_3_cast_fp16 = concat(axis = var_34, interleave = rotated_3_interleave_0, values = (var_144_cast_fp16, var_136_cast_fp16))[name = string("rotated_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_147_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_147_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_148_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_148_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_3_cast_fp16 = add(x = var_147_cast_fp16, y = var_148_cast_fp16)[name = string("roped_3_cast_fp16")];
+            tensor<int32, [4]> v_5_perm_0 = const()[name = string("v_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             bool k_7_interleave_0 = const()[name = string("k_7_interleave_0"), val = bool(false)];
             tensor<fp16, [1, 32, 128, 512]> k_7_cast_fp16 = concat(axis = var_22, interleave = k_7_interleave_0, values = (k_cache_0, roped_3_cast_fp16))[name = string("k_7_cast_fp16")];
-            bool v_5_interleave_0 = const()[name = string("v_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> v_5_cast_fp16 = concat(axis = var_22, interleave = v_5_interleave_0, values = (v_cache_0, v_3_cast_fp16))[name = string("v_5_cast_fp16")];
-            tensor<int32, [4]> var_154_begin_0 = const()[name = string("op_154_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_154_end_0 = const()[name = string("op_154_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_154_end_mask_0 = const()[name = string("op_154_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_0 = slice_by_index(begin = var_154_begin_0, end = var_154_end_0, end_mask = var_154_end_mask_0, x = k_7_cast_fp16)[name = string("op_154_cast_fp16")];
-            tensor<int32, [4]> var_155_begin_0 = const()[name = string("op_155_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_155_end_0 = const()[name = string("op_155_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_155_end_mask_0 = const()[name = string("op_155_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_0 = slice_by_index(begin = var_155_begin_0, end = var_155_end_0, end_mask = var_155_end_mask_0, x = v_5_cast_fp16)[name = string("op_155_cast_fp16")];
-            fp16 var_159_to_fp16 = const()[name = string("op_159_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 1]> var_160_cast_fp16 = mul(x = roped_1_cast_fp16, y = var_159_to_fp16)[name = string("op_160_cast_fp16")];
+            bool v_7_interleave_0 = const()[name = string("v_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> v_5_cast_fp16 = transpose(perm = v_5_perm_0, x = v_3_cast_fp16)[name = string("transpose_5")];
+            tensor<fp16, [1, 32, 512, 128]> v_7_cast_fp16 = concat(axis = var_34, interleave = v_7_interleave_0, values = (v_cache_0, v_5_cast_fp16))[name = string("v_7_cast_fp16")];
+            tensor<int32, [4]> var_159_begin_0 = const()[name = string("op_159_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_159_end_0 = const()[name = string("op_159_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_159_end_mask_0 = const()[name = string("op_159_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_0 = slice_by_index(begin = var_159_begin_0, end = var_159_end_0, end_mask = var_159_end_mask_0, x = k_7_cast_fp16)[name = string("op_159_cast_fp16")];
+            tensor<int32, [4]> var_160_begin_0 = const()[name = string("op_160_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_160_end_0 = const()[name = string("op_160_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_160_end_mask_0 = const()[name = string("op_160_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_0 = slice_by_index(begin = var_160_begin_0, end = var_160_end_0, end_mask = var_160_end_mask_0, x = v_7_cast_fp16)[name = string("op_160_cast_fp16")];
+            fp16 var_165_to_fp16 = const()[name = string("op_165_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 32, 128, 4]> var_166_cast_fp16 = mul(x = roped_1_cast_fp16, y = var_165_to_fp16)[name = string("op_166_cast_fp16")];
             bool attn_weights_1_transpose_x_0 = const()[name = string("attn_weights_1_transpose_x_0"), val = bool(true)];
             bool attn_weights_1_transpose_y_0 = const()[name = string("attn_weights_1_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_160_cast_fp16, y = k_7_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = mask)[name = string("attn_weights_3_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> var_168_cast_fp16 = softmax(axis = var_30, x = attn_weights_3_cast_fp16)[name = string("op_168_cast_fp16")];
-            bool attn_1_transpose_x_0 = const()[name = string("attn_1_transpose_x_0"), val = bool(false)];
-            bool attn_1_transpose_y_0 = const()[name = string("attn_1_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 1]> attn_1_cast_fp16 = matmul(transpose_x = attn_1_transpose_x_0, transpose_y = attn_1_transpose_y_0, x = v_5_cast_fp16, y = var_168_cast_fp16)[name = string("attn_1_cast_fp16")];
-            tensor<int32, [4]> var_172 = const()[name = string("op_172"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
-            tensor<fp16, [1, 4096, 1, 1]> input_1_cast_fp16 = reshape(shape = var_172, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
-            tensor<int32, [2]> var_176 = const()[name = string("op_176"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_178 = const()[name = string("op_178"), val = tensor<int32, [2]>([1, 1])];
-            string var_180_pad_type_0 = const()[name = string("op_180_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_180_pad_0 = const()[name = string("op_180_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_180_cast_fp16 = conv(dilations = var_178, groups = var_31, pad = var_180_pad_0, pad_type = var_180_pad_type_0, strides = var_176, weight = blocks_0_attn_proj_weight_palettized_cast_fp16, x = input_1_cast_fp16)[name = string("op_180_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_166_cast_fp16, y = k_7_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = mask)[name = string("attn_weights_3_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_5_cast_fp16 = softmax(axis = var_30, x = attn_weights_3_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
+            bool var_175_transpose_x_0 = const()[name = string("op_175_transpose_x_0"), val = bool(false)];
+            bool var_175_transpose_y_0 = const()[name = string("op_175_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> var_175_cast_fp16 = matmul(transpose_x = var_175_transpose_x_0, transpose_y = var_175_transpose_y_0, x = attn_weights_5_cast_fp16, y = v_7_cast_fp16)[name = string("op_175_cast_fp16")];
+            tensor<int32, [4]> attn_1_perm_0 = const()[name = string("attn_1_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_178 = const()[name = string("op_178"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 4]> attn_1_cast_fp16 = transpose(perm = attn_1_perm_0, x = var_175_cast_fp16)[name = string("transpose_4")];
+            tensor<fp16, [1, 4096, 1, 4]> input_1_cast_fp16 = reshape(shape = var_178, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
+            tensor<int32, [2]> var_182 = const()[name = string("op_182"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_184 = const()[name = string("op_184"), val = tensor<int32, [2]>([1, 1])];
+            string var_186_pad_type_0 = const()[name = string("op_186_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_186_pad_0 = const()[name = string("op_186_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_186_cast_fp16 = conv(dilations = var_184, groups = var_31, pad = var_186_pad_0, pad_type = var_186_pad_type_0, strides = var_182, weight = blocks_0_attn_proj_weight_palettized_cast_fp16, x = input_1_cast_fp16)[name = string("op_186_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303600960)))];
-            tensor<fp16, [1, 4096, 1, 1]> attention_output_1_cast_fp16 = mul(x = var_180_cast_fp16, y = blocks_0_attn_proj_output_scales_to_fp16)[name = string("attention_output_1_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_11_cast_fp16 = add(x = attention_output_1_cast_fp16, y = x)[name = string("x_11_cast_fp16")];
-            tensor<int32, [1]> var_199_axes_0 = const()[name = string("op_199_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_199_cast_fp16 = squeeze(axes = var_199_axes_0, x = x_11_cast_fp16)[name = string("op_199_cast_fp16")];
-            bool var_201_interleave_0 = const()[name = string("op_201_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_3_to_fp16 = const()[name = string("eps_chan_3_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_201_cast_fp16 = concat(axis = var_31, interleave = var_201_interleave_0, values = (var_199_cast_fp16, eps_chan_3_to_fp16))[name = string("op_201_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> attention_output_1_cast_fp16 = mul(x = var_186_cast_fp16, y = blocks_0_attn_proj_output_scales_to_fp16)[name = string("attention_output_1_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_11_cast_fp16 = add(x = attention_output_1_cast_fp16, y = x)[name = string("x_11_cast_fp16")];
+            tensor<int32, [1]> var_205_axes_0 = const()[name = string("op_205_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_205_cast_fp16 = squeeze(axes = var_205_axes_0, x = x_11_cast_fp16)[name = string("op_205_cast_fp16")];
+            bool var_207_interleave_0 = const()[name = string("op_207_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_3_to_fp16 = const()[name = string("eps_chan_3_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_207_cast_fp16 = concat(axis = var_31, interleave = var_207_interleave_0, values = (var_205_cast_fp16, eps_chan_3_to_fp16))[name = string("op_207_cast_fp16")];
             tensor<int32, [1]> x_eps_3_axes_0 = const()[name = string("x_eps_3_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_3_cast_fp16 = expand_dims(axes = x_eps_3_axes_0, x = var_201_cast_fp16)[name = string("x_eps_3_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_3_cast_fp16 = expand_dims(axes = x_eps_3_axes_0, x = var_207_cast_fp16)[name = string("x_eps_3_cast_fp16")];
             tensor<int32, [1]> norm_x_3_axes_0 = const()[name = string("norm_x_3_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_3_cast_fp16 = reduce_l2_norm(axes = norm_x_3_axes_0, keep_dims = var_35, x = x_eps_3_cast_fp16)[name = string("norm_x_3_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_7_cast_fp16 = real_div(x = x_11_cast_fp16, y = norm_x_3_cast_fp16)[name = string("x_normed_7_cast_fp16")];
-            fp16 var_206_to_fp16 = const()[name = string("op_206_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_9_cast_fp16 = mul(x = x_normed_7_cast_fp16, y = var_206_to_fp16)[name = string("x_normed_9_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_3_cast_fp16 = reduce_l2_norm(axes = norm_x_3_axes_0, keep_dims = var_35, x = x_eps_3_cast_fp16)[name = string("norm_x_3_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_7_cast_fp16 = real_div(x = x_11_cast_fp16, y = norm_x_3_cast_fp16)[name = string("x_normed_7_cast_fp16")];
+            fp16 var_212_to_fp16 = const()[name = string("op_212_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_9_cast_fp16 = mul(x = x_normed_7_cast_fp16, y = var_212_to_fp16)[name = string("x_normed_9_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_2_weight_to_fp16 = const()[name = string("blocks_0_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303609216)))];
-            tensor<fp16, [1, 4096, 1, 1]> input_3_cast_fp16 = mul(x = x_normed_9_cast_fp16, y = blocks_0_norm_2_weight_to_fp16)[name = string("input_3_cast_fp16")];
-            tensor<int32, [2]> var_218 = const()[name = string("op_218"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_220 = const()[name = string("op_220"), val = tensor<int32, [2]>([1, 1])];
-            string var_222_pad_type_0 = const()[name = string("op_222_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_222_pad_0 = const()[name = string("op_222_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_222_cast_fp16 = conv(dilations = var_220, groups = var_31, pad = var_222_pad_0, pad_type = var_222_pad_type_0, strides = var_218, weight = blocks_0_mlp_fc_1_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_222_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303617472)))];
-            tensor<fp16, [1, 11008, 1, 1]> input_5_cast_fp16 = mul(x = var_222_cast_fp16, y = blocks_0_mlp_fc_1_output_scales_to_fp16)[name = string("input_5_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> input_3_cast_fp16 = mul(x = x_normed_9_cast_fp16, y = blocks_0_norm_2_weight_to_fp16)[name = string("input_3_cast_fp16")];
+            tensor<int32, [2]> var_224 = const()[name = string("op_224"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_226 = const()[name = string("op_226"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_228 = const()[name = string("op_228"), val = tensor<int32, [2]>([1, 1])];
-            string var_230_pad_type_0 = const()[name = string("op_230_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_230_pad_0 = const()[name = string("op_230_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_230_cast_fp16 = conv(dilations = var_228, groups = var_31, pad = var_230_pad_0, pad_type = var_230_pad_type_0, strides = var_226, weight = blocks_0_mlp_fc_2_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_230_cast_fp16")];
+            string var_228_pad_type_0 = const()[name = string("op_228_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_228_pad_0 = const()[name = string("op_228_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_228_cast_fp16 = conv(dilations = var_226, groups = var_31, pad = var_228_pad_0, pad_type = var_228_pad_type_0, strides = var_224, weight = blocks_0_mlp_fc_1_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_228_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303617472)))];
+            tensor<fp16, [1, 11008, 1, 4]> input_5_cast_fp16 = mul(x = var_228_cast_fp16, y = blocks_0_mlp_fc_1_output_scales_to_fp16)[name = string("input_5_cast_fp16")];
+            tensor<int32, [2]> var_232 = const()[name = string("op_232"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_234 = const()[name = string("op_234"), val = tensor<int32, [2]>([1, 1])];
+            string var_236_pad_type_0 = const()[name = string("op_236_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_236_pad_0 = const()[name = string("op_236_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_236_cast_fp16 = conv(dilations = var_234, groups = var_31, pad = var_236_pad_0, pad_type = var_236_pad_type_0, strides = var_232, weight = blocks_0_mlp_fc_2_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_236_cast_fp16")];
             tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303639552)))];
-            tensor<fp16, [1, 11008, 1, 1]> x_fc_2_1_cast_fp16 = mul(x = var_230_cast_fp16, y = blocks_0_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_1_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> var_232_cast_fp16 = silu(x = input_5_cast_fp16)[name = string("op_232_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> input_7_cast_fp16 = mul(x = var_232_cast_fp16, y = x_fc_2_1_cast_fp16)[name = string("input_7_cast_fp16")];
-            tensor<int32, [2]> var_236 = const()[name = string("op_236"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_238 = const()[name = string("op_238"), val = tensor<int32, [2]>([1, 1])];
-            string var_240_pad_type_0 = const()[name = string("op_240_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_240_pad_0 = const()[name = string("op_240_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_240_cast_fp16 = conv(dilations = var_238, groups = var_31, pad = var_240_pad_0, pad_type = var_240_pad_type_0, strides = var_236, weight = blocks_0_mlp_proj_weight_palettized_cast_fp16, x = input_7_cast_fp16)[name = string("op_240_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> x_fc_2_1_cast_fp16 = mul(x = var_236_cast_fp16, y = blocks_0_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_1_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> var_238_cast_fp16 = silu(x = input_5_cast_fp16)[name = string("op_238_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> input_7_cast_fp16 = mul(x = var_238_cast_fp16, y = x_fc_2_1_cast_fp16)[name = string("input_7_cast_fp16")];
+            tensor<int32, [2]> var_242 = const()[name = string("op_242"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_244 = const()[name = string("op_244"), val = tensor<int32, [2]>([1, 1])];
+            string var_246_pad_type_0 = const()[name = string("op_246_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_246_pad_0 = const()[name = string("op_246_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_246_cast_fp16 = conv(dilations = var_244, groups = var_31, pad = var_246_pad_0, pad_type = var_246_pad_type_0, strides = var_242, weight = blocks_0_mlp_proj_weight_palettized_cast_fp16, x = input_7_cast_fp16)[name = string("op_246_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303661632)))];
-            tensor<fp16, [1, 4096, 1, 1]> var_241_cast_fp16 = mul(x = var_240_cast_fp16, y = blocks_0_mlp_proj_output_scales_to_fp16)[name = string("op_241_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_15_cast_fp16 = add(x = var_241_cast_fp16, y = x_11_cast_fp16)[name = string("x_15_cast_fp16")];
-            int32 var_252 = const()[name = string("op_252"), val = int32(-1)];
-            int32 var_260 = const()[name = string("op_260"), val = int32(3)];
-            int32 var_261 = const()[name = string("op_261"), val = int32(1)];
-            int32 var_264 = const()[name = string("op_264"), val = int32(-2)];
-            bool var_265 = const()[name = string("op_265"), val = bool(true)];
-            tensor<int32, [1]> var_282_axes_0 = const()[name = string("op_282_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_282_cast_fp16 = squeeze(axes = var_282_axes_0, x = x_15_cast_fp16)[name = string("op_282_cast_fp16")];
-            bool var_284_interleave_0 = const()[name = string("op_284_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_5_to_fp16 = const()[name = string("eps_chan_5_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_284_cast_fp16 = concat(axis = var_261, interleave = var_284_interleave_0, values = (var_282_cast_fp16, eps_chan_5_to_fp16))[name = string("op_284_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> var_247_cast_fp16 = mul(x = var_246_cast_fp16, y = blocks_0_mlp_proj_output_scales_to_fp16)[name = string("op_247_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_15_cast_fp16 = add(x = var_247_cast_fp16, y = x_11_cast_fp16)[name = string("x_15_cast_fp16")];
+            int32 var_258 = const()[name = string("op_258"), val = int32(-1)];
+            int32 var_266 = const()[name = string("op_266"), val = int32(3)];
+            int32 var_267 = const()[name = string("op_267"), val = int32(1)];
+            int32 var_270 = const()[name = string("op_270"), val = int32(-2)];
+            bool var_271 = const()[name = string("op_271"), val = bool(true)];
+            tensor<int32, [1]> var_288_axes_0 = const()[name = string("op_288_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_288_cast_fp16 = squeeze(axes = var_288_axes_0, x = x_15_cast_fp16)[name = string("op_288_cast_fp16")];
+            bool var_290_interleave_0 = const()[name = string("op_290_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_5_to_fp16 = const()[name = string("eps_chan_5_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_290_cast_fp16 = concat(axis = var_267, interleave = var_290_interleave_0, values = (var_288_cast_fp16, eps_chan_5_to_fp16))[name = string("op_290_cast_fp16")];
             tensor<int32, [1]> x_eps_5_axes_0 = const()[name = string("x_eps_5_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_5_cast_fp16 = expand_dims(axes = x_eps_5_axes_0, x = var_284_cast_fp16)[name = string("x_eps_5_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_5_cast_fp16 = expand_dims(axes = x_eps_5_axes_0, x = var_290_cast_fp16)[name = string("x_eps_5_cast_fp16")];
             tensor<int32, [1]> norm_x_5_axes_0 = const()[name = string("norm_x_5_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_5_cast_fp16 = reduce_l2_norm(axes = norm_x_5_axes_0, keep_dims = var_265, x = x_eps_5_cast_fp16)[name = string("norm_x_5_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_13_cast_fp16 = real_div(x = x_15_cast_fp16, y = norm_x_5_cast_fp16)[name = string("x_normed_13_cast_fp16")];
-            fp16 var_289_to_fp16 = const()[name = string("op_289_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_289_to_fp16)[name = string("x_normed_15_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_5_cast_fp16 = reduce_l2_norm(axes = norm_x_5_axes_0, keep_dims = var_271, x = x_eps_5_cast_fp16)[name = string("norm_x_5_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_13_cast_fp16 = real_div(x = x_15_cast_fp16, y = norm_x_5_cast_fp16)[name = string("x_normed_13_cast_fp16")];
+            fp16 var_295_to_fp16 = const()[name = string("op_295_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_295_to_fp16)[name = string("x_normed_15_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_1_weight_to_fp16 = const()[name = string("blocks_1_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303669888)))];
-            tensor<fp16, [1, 4096, 1, 1]> x_19_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = string("x_19_cast_fp16")];
-            tensor<int32, [2]> var_304 = const()[name = string("op_304"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_306 = const()[name = string("op_306"), val = tensor<int32, [2]>([1, 1])];
-            string var_308_pad_type_0 = const()[name = string("op_308_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_308_pad_0 = const()[name = string("op_308_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_308_cast_fp16 = conv(dilations = var_306, groups = var_261, pad = var_308_pad_0, pad_type = var_308_pad_type_0, strides = var_304, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_308_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_19_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = string("x_19_cast_fp16")];
+            tensor<int32, [2]> var_311 = const()[name = string("op_311"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_313 = const()[name = string("op_313"), val = tensor<int32, [2]>([1, 1])];
+            string var_315_pad_type_0 = const()[name = string("op_315_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_315_pad_0 = const()[name = string("op_315_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_315_cast_fp16 = conv(dilations = var_313, groups = var_267, pad = var_315_pad_0, pad_type = var_315_pad_type_0, strides = var_311, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_315_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303678144)))];
-            tensor<fp16, [1, 4096, 1, 1]> q_7_cast_fp16 = mul(x = var_308_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
-            tensor<int32, [2]> var_312 = const()[name = string("op_312"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_314 = const()[name = string("op_314"), val = tensor<int32, [2]>([1, 1])];
-            string var_316_pad_type_0 = const()[name = string("op_316_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_316_pad_0 = const()[name = string("op_316_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_316_cast_fp16 = conv(dilations = var_314, groups = var_261, pad = var_316_pad_0, pad_type = var_316_pad_type_0, strides = var_312, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_316_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> q_7_cast_fp16 = mul(x = var_315_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
+            tensor<int32, [2]> var_319 = const()[name = string("op_319"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_321 = const()[name = string("op_321"), val = tensor<int32, [2]>([1, 1])];
+            string var_323_pad_type_0 = const()[name = string("op_323_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_323_pad_0 = const()[name = string("op_323_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_323_cast_fp16 = conv(dilations = var_321, groups = var_267, pad = var_323_pad_0, pad_type = var_323_pad_type_0, strides = var_319, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_323_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303686400)))];
-            tensor<fp16, [1, 4096, 1, 1]> k_9_cast_fp16 = mul(x = var_316_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_9_cast_fp16")];
-            tensor<int32, [2]> var_320 = const()[name = string("op_320"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_322 = const()[name = string("op_322"), val = tensor<int32, [2]>([1, 1])];
-            string var_324_pad_type_0 = const()[name = string("op_324_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_324_pad_0 = const()[name = string("op_324_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_324_cast_fp16 = conv(dilations = var_322, groups = var_261, pad = var_324_pad_0, pad_type = var_324_pad_type_0, strides = var_320, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_324_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> k_9_cast_fp16 = mul(x = var_323_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_9_cast_fp16")];
+            tensor<int32, [2]> var_327 = const()[name = string("op_327"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_329 = const()[name = string("op_329"), val = tensor<int32, [2]>([1, 1])];
+            string var_331_pad_type_0 = const()[name = string("op_331_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_331_pad_0 = const()[name = string("op_331_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_331_cast_fp16 = conv(dilations = var_329, groups = var_267, pad = var_331_pad_0, pad_type = var_331_pad_type_0, strides = var_327, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_331_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303694656)))];
-            tensor<fp16, [1, 4096, 1, 1]> v_7_cast_fp16 = mul(x = var_324_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_7_cast_fp16")];
-            tensor<int32, [4]> var_326 = const()[name = string("op_326"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> q_9_cast_fp16 = reshape(shape = var_326, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
-            tensor<int32, [4]> var_328 = const()[name = string("op_328"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> k_11_cast_fp16 = reshape(shape = var_328, x = k_9_cast_fp16)[name = string("k_11_cast_fp16")];
-            tensor<int32, [4]> var_330 = const()[name = string("op_330"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> v_9_cast_fp16 = reshape(shape = var_330, x = v_7_cast_fp16)[name = string("v_9_cast_fp16")];
-            tensor<int32, [4]> var_342_begin_0 = const()[name = string("op_342_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_342_end_0 = const()[name = string("op_342_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_342_end_mask_0 = const()[name = string("op_342_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_342_cast_fp16 = slice_by_index(begin = var_342_begin_0, end = var_342_end_0, end_mask = var_342_end_mask_0, x = q_9_cast_fp16)[name = string("op_342_cast_fp16")];
-            tensor<int32, [4]> var_348_begin_0 = const()[name = string("op_348_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_348_end_0 = const()[name = string("op_348_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_348_end_mask_0 = const()[name = string("op_348_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_348_cast_fp16 = slice_by_index(begin = var_348_begin_0, end = var_348_end_0, end_mask = var_348_end_mask_0, x = q_9_cast_fp16)[name = string("op_348_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> v_9_cast_fp16 = mul(x = var_331_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_9_cast_fp16")];
+            tensor<int32, [4]> var_333 = const()[name = string("op_333"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> q_9_cast_fp16 = reshape(shape = var_333, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
+            tensor<int32, [4]> var_335 = const()[name = string("op_335"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> k_11_cast_fp16 = reshape(shape = var_335, x = k_9_cast_fp16)[name = string("k_11_cast_fp16")];
+            tensor<int32, [4]> var_337 = const()[name = string("op_337"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> v_11_cast_fp16 = reshape(shape = var_337, x = v_9_cast_fp16)[name = string("v_11_cast_fp16")];
+            tensor<int32, [4]> var_349_begin_0 = const()[name = string("op_349_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_349_end_0 = const()[name = string("op_349_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_349_end_mask_0 = const()[name = string("op_349_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_349_cast_fp16 = slice_by_index(begin = var_349_begin_0, end = var_349_end_0, end_mask = var_349_end_mask_0, x = q_9_cast_fp16)[name = string("op_349_cast_fp16")];
+            tensor<int32, [4]> var_355_begin_0 = const()[name = string("op_355_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_355_end_0 = const()[name = string("op_355_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_355_end_mask_0 = const()[name = string("op_355_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_355_cast_fp16 = slice_by_index(begin = var_355_begin_0, end = var_355_end_0, end_mask = var_355_end_mask_0, x = q_9_cast_fp16)[name = string("op_355_cast_fp16")];
             fp16 const_19_promoted_to_fp16 = const()[name = string("const_19_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_350_cast_fp16 = mul(x = var_348_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_350_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_357_cast_fp16 = mul(x = var_355_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_357_cast_fp16")];
             bool rotated_5_interleave_0 = const()[name = string("rotated_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_5_cast_fp16 = concat(axis = var_264, interleave = rotated_5_interleave_0, values = (var_350_cast_fp16, var_342_cast_fp16))[name = string("rotated_5_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_353_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_353_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_354_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_354_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_5_cast_fp16 = add(x = var_353_cast_fp16, y = var_354_cast_fp16)[name = string("roped_5_cast_fp16")];
-            tensor<int32, [4]> var_367_begin_0 = const()[name = string("op_367_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_367_end_0 = const()[name = string("op_367_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_367_end_mask_0 = const()[name = string("op_367_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_367_cast_fp16 = slice_by_index(begin = var_367_begin_0, end = var_367_end_0, end_mask = var_367_end_mask_0, x = k_11_cast_fp16)[name = string("op_367_cast_fp16")];
-            tensor<int32, [4]> var_373_begin_0 = const()[name = string("op_373_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_373_end_0 = const()[name = string("op_373_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_373_end_mask_0 = const()[name = string("op_373_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_373_cast_fp16 = slice_by_index(begin = var_373_begin_0, end = var_373_end_0, end_mask = var_373_end_mask_0, x = k_11_cast_fp16)[name = string("op_373_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_5_cast_fp16 = concat(axis = var_270, interleave = rotated_5_interleave_0, values = (var_357_cast_fp16, var_349_cast_fp16))[name = string("rotated_5_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_360_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_360_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_361_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_361_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_5_cast_fp16 = add(x = var_360_cast_fp16, y = var_361_cast_fp16)[name = string("roped_5_cast_fp16")];
+            tensor<int32, [4]> var_374_begin_0 = const()[name = string("op_374_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_374_end_0 = const()[name = string("op_374_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_374_end_mask_0 = const()[name = string("op_374_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_374_cast_fp16 = slice_by_index(begin = var_374_begin_0, end = var_374_end_0, end_mask = var_374_end_mask_0, x = k_11_cast_fp16)[name = string("op_374_cast_fp16")];
+            tensor<int32, [4]> var_380_begin_0 = const()[name = string("op_380_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_380_end_0 = const()[name = string("op_380_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_380_end_mask_0 = const()[name = string("op_380_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_380_cast_fp16 = slice_by_index(begin = var_380_begin_0, end = var_380_end_0, end_mask = var_380_end_mask_0, x = k_11_cast_fp16)[name = string("op_380_cast_fp16")];
             fp16 const_21_promoted_to_fp16 = const()[name = string("const_21_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_375_cast_fp16 = mul(x = var_373_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_375_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_382_cast_fp16 = mul(x = var_380_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_382_cast_fp16")];
             bool rotated_7_interleave_0 = const()[name = string("rotated_7_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_7_cast_fp16 = concat(axis = var_264, interleave = rotated_7_interleave_0, values = (var_375_cast_fp16, var_367_cast_fp16))[name = string("rotated_7_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_378_cast_fp16 = mul(x = k_11_cast_fp16, y = cos)[name = string("op_378_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_379_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = string("op_379_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_7_cast_fp16 = add(x = var_378_cast_fp16, y = var_379_cast_fp16)[name = string("roped_7_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_7_cast_fp16 = concat(axis = var_270, interleave = rotated_7_interleave_0, values = (var_382_cast_fp16, var_374_cast_fp16))[name = string("rotated_7_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_385_cast_fp16 = mul(x = k_11_cast_fp16, y = cos)[name = string("op_385_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_386_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = string("op_386_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_7_cast_fp16 = add(x = var_385_cast_fp16, y = var_386_cast_fp16)[name = string("roped_7_cast_fp16")];
+            tensor<int32, [4]> v_13_perm_0 = const()[name = string("v_13_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             bool k_15_interleave_0 = const()[name = string("k_15_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = concat(axis = var_252, interleave = k_15_interleave_0, values = (k_cache_1, roped_7_cast_fp16))[name = string("k_15_cast_fp16")];
-            bool v_11_interleave_0 = const()[name = string("v_11_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> v_11_cast_fp16 = concat(axis = var_252, interleave = v_11_interleave_0, values = (v_cache_1, v_9_cast_fp16))[name = string("v_11_cast_fp16")];
-            tensor<int32, [4]> var_386_begin_0 = const()[name = string("op_386_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_386_end_0 = const()[name = string("op_386_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_386_end_mask_0 = const()[name = string("op_386_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_1 = slice_by_index(begin = var_386_begin_0, end = var_386_end_0, end_mask = var_386_end_mask_0, x = k_15_cast_fp16)[name = string("op_386_cast_fp16")];
-            tensor<int32, [4]> var_387_begin_0 = const()[name = string("op_387_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_387_end_0 = const()[name = string("op_387_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_387_end_mask_0 = const()[name = string("op_387_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_1 = slice_by_index(begin = var_387_begin_0, end = var_387_end_0, end_mask = var_387_end_mask_0, x = v_11_cast_fp16)[name = string("op_387_cast_fp16")];
-            fp16 var_391_to_fp16 = const()[name = string("op_391_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 1]> var_392_cast_fp16 = mul(x = roped_5_cast_fp16, y = var_391_to_fp16)[name = string("op_392_cast_fp16")];
-            bool attn_weights_5_transpose_x_0 = const()[name = string("attn_weights_5_transpose_x_0"), val = bool(true)];
-            bool attn_weights_5_transpose_y_0 = const()[name = string("attn_weights_5_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_5_cast_fp16 = matmul(transpose_x = attn_weights_5_transpose_x_0, transpose_y = attn_weights_5_transpose_y_0, x = var_392_cast_fp16, y = k_15_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_7_cast_fp16 = add(x = attn_weights_5_cast_fp16, y = mask)[name = string("attn_weights_7_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> var_400_cast_fp16 = softmax(axis = var_260, x = attn_weights_7_cast_fp16)[name = string("op_400_cast_fp16")];
-            bool attn_5_transpose_x_0 = const()[name = string("attn_5_transpose_x_0"), val = bool(false)];
-            bool attn_5_transpose_y_0 = const()[name = string("attn_5_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 1]> attn_5_cast_fp16 = matmul(transpose_x = attn_5_transpose_x_0, transpose_y = attn_5_transpose_y_0, x = v_11_cast_fp16, y = var_400_cast_fp16)[name = string("attn_5_cast_fp16")];
-            tensor<int32, [4]> var_404 = const()[name = string("op_404"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
-            tensor<fp16, [1, 4096, 1, 1]> input_9_cast_fp16 = reshape(shape = var_404, x = attn_5_cast_fp16)[name = string("input_9_cast_fp16")];
-            tensor<int32, [2]> var_408 = const()[name = string("op_408"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_410 = const()[name = string("op_410"), val = tensor<int32, [2]>([1, 1])];
-            string var_412_pad_type_0 = const()[name = string("op_412_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_412_pad_0 = const()[name = string("op_412_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_412_cast_fp16 = conv(dilations = var_410, groups = var_261, pad = var_412_pad_0, pad_type = var_412_pad_type_0, strides = var_408, weight = blocks_1_attn_proj_weight_palettized_cast_fp16, x = input_9_cast_fp16)[name = string("op_412_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = concat(axis = var_258, interleave = k_15_interleave_0, values = (k_cache_1, roped_7_cast_fp16))[name = string("k_15_cast_fp16")];
+            bool v_15_interleave_0 = const()[name = string("v_15_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> v_13_cast_fp16 = transpose(perm = v_13_perm_0, x = v_11_cast_fp16)[name = string("transpose_3")];
+            tensor<fp16, [1, 32, 512, 128]> v_15_cast_fp16 = concat(axis = var_270, interleave = v_15_interleave_0, values = (v_cache_1, v_13_cast_fp16))[name = string("v_15_cast_fp16")];
+            tensor<int32, [4]> var_397_begin_0 = const()[name = string("op_397_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_397_end_0 = const()[name = string("op_397_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_397_end_mask_0 = const()[name = string("op_397_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_1 = slice_by_index(begin = var_397_begin_0, end = var_397_end_0, end_mask = var_397_end_mask_0, x = k_15_cast_fp16)[name = string("op_397_cast_fp16")];
+            tensor<int32, [4]> var_398_begin_0 = const()[name = string("op_398_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_398_end_0 = const()[name = string("op_398_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_398_end_mask_0 = const()[name = string("op_398_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_1 = slice_by_index(begin = var_398_begin_0, end = var_398_end_0, end_mask = var_398_end_mask_0, x = v_15_cast_fp16)[name = string("op_398_cast_fp16")];
+            fp16 var_403_to_fp16 = const()[name = string("op_403_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 32, 128, 4]> var_404_cast_fp16 = mul(x = roped_5_cast_fp16, y = var_403_to_fp16)[name = string("op_404_cast_fp16")];
+            bool attn_weights_7_transpose_x_0 = const()[name = string("attn_weights_7_transpose_x_0"), val = bool(true)];
+            bool attn_weights_7_transpose_y_0 = const()[name = string("attn_weights_7_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_7_cast_fp16 = matmul(transpose_x = attn_weights_7_transpose_x_0, transpose_y = attn_weights_7_transpose_y_0, x = var_404_cast_fp16, y = k_15_cast_fp16)[name = string("attn_weights_7_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_9_cast_fp16 = add(x = attn_weights_7_cast_fp16, y = mask)[name = string("attn_weights_9_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_11_cast_fp16 = softmax(axis = var_266, x = attn_weights_9_cast_fp16)[name = string("attn_weights_11_cast_fp16")];
+            bool var_413_transpose_x_0 = const()[name = string("op_413_transpose_x_0"), val = bool(false)];
+            bool var_413_transpose_y_0 = const()[name = string("op_413_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> var_413_cast_fp16 = matmul(transpose_x = var_413_transpose_x_0, transpose_y = var_413_transpose_y_0, x = attn_weights_11_cast_fp16, y = v_15_cast_fp16)[name = string("op_413_cast_fp16")];
+            tensor<int32, [4]> attn_3_perm_0 = const()[name = string("attn_3_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_416 = const()[name = string("op_416"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 4]> attn_3_cast_fp16 = transpose(perm = attn_3_perm_0, x = var_413_cast_fp16)[name = string("transpose_2")];
+            tensor<fp16, [1, 4096, 1, 4]> input_9_cast_fp16 = reshape(shape = var_416, x = attn_3_cast_fp16)[name = string("input_9_cast_fp16")];
+            tensor<int32, [2]> var_420 = const()[name = string("op_420"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_422 = const()[name = string("op_422"), val = tensor<int32, [2]>([1, 1])];
+            string var_424_pad_type_0 = const()[name = string("op_424_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_424_pad_0 = const()[name = string("op_424_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_424_cast_fp16 = conv(dilations = var_422, groups = var_267, pad = var_424_pad_0, pad_type = var_424_pad_type_0, strides = var_420, weight = blocks_1_attn_proj_weight_palettized_cast_fp16, x = input_9_cast_fp16)[name = string("op_424_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303702912)))];
-            tensor<fp16, [1, 4096, 1, 1]> attention_output_3_cast_fp16 = mul(x = var_412_cast_fp16, y = blocks_1_attn_proj_output_scales_to_fp16)[name = string("attention_output_3_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_25_cast_fp16 = add(x = attention_output_3_cast_fp16, y = x_15_cast_fp16)[name = string("x_25_cast_fp16")];
-            tensor<int32, [1]> var_431_axes_0 = const()[name = string("op_431_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_431_cast_fp16 = squeeze(axes = var_431_axes_0, x = x_25_cast_fp16)[name = string("op_431_cast_fp16")];
-            bool var_433_interleave_0 = const()[name = string("op_433_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_7_to_fp16 = const()[name = string("eps_chan_7_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_433_cast_fp16 = concat(axis = var_261, interleave = var_433_interleave_0, values = (var_431_cast_fp16, eps_chan_7_to_fp16))[name = string("op_433_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> attention_output_3_cast_fp16 = mul(x = var_424_cast_fp16, y = blocks_1_attn_proj_output_scales_to_fp16)[name = string("attention_output_3_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_25_cast_fp16 = add(x = attention_output_3_cast_fp16, y = x_15_cast_fp16)[name = string("x_25_cast_fp16")];
+            tensor<int32, [1]> var_443_axes_0 = const()[name = string("op_443_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_443_cast_fp16 = squeeze(axes = var_443_axes_0, x = x_25_cast_fp16)[name = string("op_443_cast_fp16")];
+            bool var_445_interleave_0 = const()[name = string("op_445_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_7_to_fp16 = const()[name = string("eps_chan_7_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_445_cast_fp16 = concat(axis = var_267, interleave = var_445_interleave_0, values = (var_443_cast_fp16, eps_chan_7_to_fp16))[name = string("op_445_cast_fp16")];
             tensor<int32, [1]> x_eps_7_axes_0 = const()[name = string("x_eps_7_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_7_cast_fp16 = expand_dims(axes = x_eps_7_axes_0, x = var_433_cast_fp16)[name = string("x_eps_7_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_7_cast_fp16 = expand_dims(axes = x_eps_7_axes_0, x = var_445_cast_fp16)[name = string("x_eps_7_cast_fp16")];
             tensor<int32, [1]> norm_x_7_axes_0 = const()[name = string("norm_x_7_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_7_cast_fp16 = reduce_l2_norm(axes = norm_x_7_axes_0, keep_dims = var_265, x = x_eps_7_cast_fp16)[name = string("norm_x_7_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_19_cast_fp16 = real_div(x = x_25_cast_fp16, y = norm_x_7_cast_fp16)[name = string("x_normed_19_cast_fp16")];
-            fp16 var_438_to_fp16 = const()[name = string("op_438_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_21_cast_fp16 = mul(x = x_normed_19_cast_fp16, y = var_438_to_fp16)[name = string("x_normed_21_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_7_cast_fp16 = reduce_l2_norm(axes = norm_x_7_axes_0, keep_dims = var_271, x = x_eps_7_cast_fp16)[name = string("norm_x_7_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_19_cast_fp16 = real_div(x = x_25_cast_fp16, y = norm_x_7_cast_fp16)[name = string("x_normed_19_cast_fp16")];
+            fp16 var_450_to_fp16 = const()[name = string("op_450_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_21_cast_fp16 = mul(x = x_normed_19_cast_fp16, y = var_450_to_fp16)[name = string("x_normed_21_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_2_weight_to_fp16 = const()[name = string("blocks_1_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303711168)))];
-            tensor<fp16, [1, 4096, 1, 1]> input_11_cast_fp16 = mul(x = x_normed_21_cast_fp16, y = blocks_1_norm_2_weight_to_fp16)[name = string("input_11_cast_fp16")];
-            tensor<int32, [2]> var_450 = const()[name = string("op_450"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_452 = const()[name = string("op_452"), val = tensor<int32, [2]>([1, 1])];
-            string var_454_pad_type_0 = const()[name = string("op_454_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_454_pad_0 = const()[name = string("op_454_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_454_cast_fp16 = conv(dilations = var_452, groups = var_261, pad = var_454_pad_0, pad_type = var_454_pad_type_0, strides = var_450, weight = blocks_1_mlp_fc_1_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_454_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> input_11_cast_fp16 = mul(x = x_normed_21_cast_fp16, y = blocks_1_norm_2_weight_to_fp16)[name = string("input_11_cast_fp16")];
+            tensor<int32, [2]> var_462 = const()[name = string("op_462"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_464 = const()[name = string("op_464"), val = tensor<int32, [2]>([1, 1])];
+            string var_466_pad_type_0 = const()[name = string("op_466_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_466_pad_0 = const()[name = string("op_466_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_466_cast_fp16 = conv(dilations = var_464, groups = var_267, pad = var_466_pad_0, pad_type = var_466_pad_type_0, strides = var_462, weight = blocks_1_mlp_fc_1_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_466_cast_fp16")];
             tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303719424)))];
-            tensor<fp16, [1, 11008, 1, 1]> input_13_cast_fp16 = mul(x = var_454_cast_fp16, y = blocks_1_mlp_fc_1_output_scales_to_fp16)[name = string("input_13_cast_fp16")];
-            tensor<int32, [2]> var_458 = const()[name = string("op_458"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_460 = const()[name = string("op_460"), val = tensor<int32, [2]>([1, 1])];
-            string var_462_pad_type_0 = const()[name = string("op_462_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_462_pad_0 = const()[name = string("op_462_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_462_cast_fp16 = conv(dilations = var_460, groups = var_261, pad = var_462_pad_0, pad_type = var_462_pad_type_0, strides = var_458, weight = blocks_1_mlp_fc_2_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_462_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303741504)))];
-            tensor<fp16, [1, 11008, 1, 1]> x_fc_2_3_cast_fp16 = mul(x = var_462_cast_fp16, y = blocks_1_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_3_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> var_464_cast_fp16 = silu(x = input_13_cast_fp16)[name = string("op_464_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> input_15_cast_fp16 = mul(x = var_464_cast_fp16, y = x_fc_2_3_cast_fp16)[name = string("input_15_cast_fp16")];
-            tensor<int32, [2]> var_468 = const()[name = string("op_468"), val = tensor<int32, [2]>([1, 1])];
+            tensor<fp16, [1, 11008, 1, 4]> input_13_cast_fp16 = mul(x = var_466_cast_fp16, y = blocks_1_mlp_fc_1_output_scales_to_fp16)[name = string("input_13_cast_fp16")];
             tensor<int32, [2]> var_470 = const()[name = string("op_470"), val = tensor<int32, [2]>([1, 1])];
-            string var_472_pad_type_0 = const()[name = string("op_472_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_472_pad_0 = const()[name = string("op_472_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_472_cast_fp16 = conv(dilations = var_470, groups = var_261, pad = var_472_pad_0, pad_type = var_472_pad_type_0, strides = var_468, weight = blocks_1_mlp_proj_weight_palettized_cast_fp16, x = input_15_cast_fp16)[name = string("op_472_cast_fp16")];
+            tensor<int32, [2]> var_472 = const()[name = string("op_472"), val = tensor<int32, [2]>([1, 1])];
+            string var_474_pad_type_0 = const()[name = string("op_474_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_474_pad_0 = const()[name = string("op_474_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_474_cast_fp16 = conv(dilations = var_472, groups = var_267, pad = var_474_pad_0, pad_type = var_474_pad_type_0, strides = var_470, weight = blocks_1_mlp_fc_2_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_474_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303741504)))];
+            tensor<fp16, [1, 11008, 1, 4]> x_fc_2_3_cast_fp16 = mul(x = var_474_cast_fp16, y = blocks_1_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_3_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> var_476_cast_fp16 = silu(x = input_13_cast_fp16)[name = string("op_476_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> input_15_cast_fp16 = mul(x = var_476_cast_fp16, y = x_fc_2_3_cast_fp16)[name = string("input_15_cast_fp16")];
+            tensor<int32, [2]> var_480 = const()[name = string("op_480"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_482 = const()[name = string("op_482"), val = tensor<int32, [2]>([1, 1])];
+            string var_484_pad_type_0 = const()[name = string("op_484_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_484_pad_0 = const()[name = string("op_484_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_484_cast_fp16 = conv(dilations = var_482, groups = var_267, pad = var_484_pad_0, pad_type = var_484_pad_type_0, strides = var_480, weight = blocks_1_mlp_proj_weight_palettized_cast_fp16, x = input_15_cast_fp16)[name = string("op_484_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303763584)))];
-            tensor<fp16, [1, 4096, 1, 1]> var_473_cast_fp16 = mul(x = var_472_cast_fp16, y = blocks_1_mlp_proj_output_scales_to_fp16)[name = string("op_473_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_29_cast_fp16 = add(x = var_473_cast_fp16, y = x_25_cast_fp16)[name = string("x_29_cast_fp16")];
-            int32 var_484 = const()[name = string("op_484"), val = int32(-1)];
-            int32 var_492 = const()[name = string("op_492"), val = int32(3)];
-            int32 var_493 = const()[name = string("op_493"), val = int32(1)];
-            int32 var_496 = const()[name = string("op_496"), val = int32(-2)];
-            bool var_497 = const()[name = string("op_497"), val = bool(true)];
-            tensor<int32, [1]> var_514_axes_0 = const()[name = string("op_514_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_514_cast_fp16 = squeeze(axes = var_514_axes_0, x = x_29_cast_fp16)[name = string("op_514_cast_fp16")];
-            bool var_516_interleave_0 = const()[name = string("op_516_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_9_to_fp16 = const()[name = string("eps_chan_9_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_516_cast_fp16 = concat(axis = var_493, interleave = var_516_interleave_0, values = (var_514_cast_fp16, eps_chan_9_to_fp16))[name = string("op_516_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> var_485_cast_fp16 = mul(x = var_484_cast_fp16, y = blocks_1_mlp_proj_output_scales_to_fp16)[name = string("op_485_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_29_cast_fp16 = add(x = var_485_cast_fp16, y = x_25_cast_fp16)[name = string("x_29_cast_fp16")];
+            int32 var_496 = const()[name = string("op_496"), val = int32(-1)];
+            int32 var_504 = const()[name = string("op_504"), val = int32(3)];
+            int32 var_505 = const()[name = string("op_505"), val = int32(1)];
+            int32 var_508 = const()[name = string("op_508"), val = int32(-2)];
+            bool var_509 = const()[name = string("op_509"), val = bool(true)];
+            tensor<int32, [1]> var_526_axes_0 = const()[name = string("op_526_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_526_cast_fp16 = squeeze(axes = var_526_axes_0, x = x_29_cast_fp16)[name = string("op_526_cast_fp16")];
+            bool var_528_interleave_0 = const()[name = string("op_528_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_9_to_fp16 = const()[name = string("eps_chan_9_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_528_cast_fp16 = concat(axis = var_505, interleave = var_528_interleave_0, values = (var_526_cast_fp16, eps_chan_9_to_fp16))[name = string("op_528_cast_fp16")];
             tensor<int32, [1]> x_eps_9_axes_0 = const()[name = string("x_eps_9_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_9_cast_fp16 = expand_dims(axes = x_eps_9_axes_0, x = var_516_cast_fp16)[name = string("x_eps_9_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_9_cast_fp16 = expand_dims(axes = x_eps_9_axes_0, x = var_528_cast_fp16)[name = string("x_eps_9_cast_fp16")];
             tensor<int32, [1]> norm_x_9_axes_0 = const()[name = string("norm_x_9_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_9_cast_fp16 = reduce_l2_norm(axes = norm_x_9_axes_0, keep_dims = var_497, x = x_eps_9_cast_fp16)[name = string("norm_x_9_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_25_cast_fp16 = real_div(x = x_29_cast_fp16, y = norm_x_9_cast_fp16)[name = string("x_normed_25_cast_fp16")];
-            fp16 var_521_to_fp16 = const()[name = string("op_521_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_27_cast_fp16 = mul(x = x_normed_25_cast_fp16, y = var_521_to_fp16)[name = string("x_normed_27_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_9_cast_fp16 = reduce_l2_norm(axes = norm_x_9_axes_0, keep_dims = var_509, x = x_eps_9_cast_fp16)[name = string("norm_x_9_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_25_cast_fp16 = real_div(x = x_29_cast_fp16, y = norm_x_9_cast_fp16)[name = string("x_normed_25_cast_fp16")];
+            fp16 var_533_to_fp16 = const()[name = string("op_533_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_27_cast_fp16 = mul(x = x_normed_25_cast_fp16, y = var_533_to_fp16)[name = string("x_normed_27_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_norm_1_weight_to_fp16 = const()[name = string("blocks_2_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303771840)))];
-            tensor<fp16, [1, 4096, 1, 1]> x_33_cast_fp16 = mul(x = x_normed_27_cast_fp16, y = blocks_2_norm_1_weight_to_fp16)[name = string("x_33_cast_fp16")];
-            tensor<int32, [2]> var_536 = const()[name = string("op_536"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_538 = const()[name = string("op_538"), val = tensor<int32, [2]>([1, 1])];
-            string var_540_pad_type_0 = const()[name = string("op_540_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_540_pad_0 = const()[name = string("op_540_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_540_cast_fp16 = conv(dilations = var_538, groups = var_493, pad = var_540_pad_0, pad_type = var_540_pad_type_0, strides = var_536, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_540_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_33_cast_fp16 = mul(x = x_normed_27_cast_fp16, y = blocks_2_norm_1_weight_to_fp16)[name = string("x_33_cast_fp16")];
+            tensor<int32, [2]> var_549 = const()[name = string("op_549"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_551 = const()[name = string("op_551"), val = tensor<int32, [2]>([1, 1])];
+            string var_553_pad_type_0 = const()[name = string("op_553_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_553_pad_0 = const()[name = string("op_553_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_553_cast_fp16 = conv(dilations = var_551, groups = var_505, pad = var_553_pad_0, pad_type = var_553_pad_type_0, strides = var_549, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_553_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303780096)))];
-            tensor<fp16, [1, 4096, 1, 1]> q_13_cast_fp16 = mul(x = var_540_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = string("q_13_cast_fp16")];
-            tensor<int32, [2]> var_544 = const()[name = string("op_544"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_546 = const()[name = string("op_546"), val = tensor<int32, [2]>([1, 1])];
-            string var_548_pad_type_0 = const()[name = string("op_548_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_548_pad_0 = const()[name = string("op_548_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_548_cast_fp16 = conv(dilations = var_546, groups = var_493, pad = var_548_pad_0, pad_type = var_548_pad_type_0, strides = var_544, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_548_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> q_13_cast_fp16 = mul(x = var_553_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = string("q_13_cast_fp16")];
+            tensor<int32, [2]> var_557 = const()[name = string("op_557"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_559 = const()[name = string("op_559"), val = tensor<int32, [2]>([1, 1])];
+            string var_561_pad_type_0 = const()[name = string("op_561_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_561_pad_0 = const()[name = string("op_561_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_561_cast_fp16 = conv(dilations = var_559, groups = var_505, pad = var_561_pad_0, pad_type = var_561_pad_type_0, strides = var_557, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_561_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303788352)))];
-            tensor<fp16, [1, 4096, 1, 1]> k_17_cast_fp16 = mul(x = var_548_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = string("k_17_cast_fp16")];
-            tensor<int32, [2]> var_552 = const()[name = string("op_552"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_554 = const()[name = string("op_554"), val = tensor<int32, [2]>([1, 1])];
-            string var_556_pad_type_0 = const()[name = string("op_556_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_556_pad_0 = const()[name = string("op_556_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_556_cast_fp16 = conv(dilations = var_554, groups = var_493, pad = var_556_pad_0, pad_type = var_556_pad_type_0, strides = var_552, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_556_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> k_17_cast_fp16 = mul(x = var_561_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = string("k_17_cast_fp16")];
+            tensor<int32, [2]> var_565 = const()[name = string("op_565"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_567 = const()[name = string("op_567"), val = tensor<int32, [2]>([1, 1])];
+            string var_569_pad_type_0 = const()[name = string("op_569_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_569_pad_0 = const()[name = string("op_569_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_569_cast_fp16 = conv(dilations = var_567, groups = var_505, pad = var_569_pad_0, pad_type = var_569_pad_type_0, strides = var_565, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_569_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303796608)))];
-            tensor<fp16, [1, 4096, 1, 1]> v_13_cast_fp16 = mul(x = var_556_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = string("v_13_cast_fp16")];
-            tensor<int32, [4]> var_558 = const()[name = string("op_558"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> q_15_cast_fp16 = reshape(shape = var_558, x = q_13_cast_fp16)[name = string("q_15_cast_fp16")];
-            tensor<int32, [4]> var_560 = const()[name = string("op_560"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> k_19_cast_fp16 = reshape(shape = var_560, x = k_17_cast_fp16)[name = string("k_19_cast_fp16")];
-            tensor<int32, [4]> var_562 = const()[name = string("op_562"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> v_15_cast_fp16 = reshape(shape = var_562, x = v_13_cast_fp16)[name = string("v_15_cast_fp16")];
-            tensor<int32, [4]> var_574_begin_0 = const()[name = string("op_574_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_574_end_0 = const()[name = string("op_574_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_574_end_mask_0 = const()[name = string("op_574_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_574_cast_fp16 = slice_by_index(begin = var_574_begin_0, end = var_574_end_0, end_mask = var_574_end_mask_0, x = q_15_cast_fp16)[name = string("op_574_cast_fp16")];
-            tensor<int32, [4]> var_580_begin_0 = const()[name = string("op_580_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_580_end_0 = const()[name = string("op_580_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_580_end_mask_0 = const()[name = string("op_580_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_580_cast_fp16 = slice_by_index(begin = var_580_begin_0, end = var_580_end_0, end_mask = var_580_end_mask_0, x = q_15_cast_fp16)[name = string("op_580_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> v_17_cast_fp16 = mul(x = var_569_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = string("v_17_cast_fp16")];
+            tensor<int32, [4]> var_571 = const()[name = string("op_571"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> q_15_cast_fp16 = reshape(shape = var_571, x = q_13_cast_fp16)[name = string("q_15_cast_fp16")];
+            tensor<int32, [4]> var_573 = const()[name = string("op_573"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> k_19_cast_fp16 = reshape(shape = var_573, x = k_17_cast_fp16)[name = string("k_19_cast_fp16")];
+            tensor<int32, [4]> var_575 = const()[name = string("op_575"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> v_19_cast_fp16 = reshape(shape = var_575, x = v_17_cast_fp16)[name = string("v_19_cast_fp16")];
+            tensor<int32, [4]> var_587_begin_0 = const()[name = string("op_587_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_587_end_0 = const()[name = string("op_587_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_587_end_mask_0 = const()[name = string("op_587_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_587_cast_fp16 = slice_by_index(begin = var_587_begin_0, end = var_587_end_0, end_mask = var_587_end_mask_0, x = q_15_cast_fp16)[name = string("op_587_cast_fp16")];
+            tensor<int32, [4]> var_593_begin_0 = const()[name = string("op_593_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_593_end_0 = const()[name = string("op_593_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_593_end_mask_0 = const()[name = string("op_593_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_593_cast_fp16 = slice_by_index(begin = var_593_begin_0, end = var_593_end_0, end_mask = var_593_end_mask_0, x = q_15_cast_fp16)[name = string("op_593_cast_fp16")];
             fp16 const_32_promoted_to_fp16 = const()[name = string("const_32_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_582_cast_fp16 = mul(x = var_580_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_582_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_595_cast_fp16 = mul(x = var_593_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_595_cast_fp16")];
             bool rotated_9_interleave_0 = const()[name = string("rotated_9_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_9_cast_fp16 = concat(axis = var_496, interleave = rotated_9_interleave_0, values = (var_582_cast_fp16, var_574_cast_fp16))[name = string("rotated_9_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_585_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = string("op_585_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_586_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = string("op_586_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_9_cast_fp16 = add(x = var_585_cast_fp16, y = var_586_cast_fp16)[name = string("roped_9_cast_fp16")];
-            tensor<int32, [4]> var_599_begin_0 = const()[name = string("op_599_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_599_end_0 = const()[name = string("op_599_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_599_end_mask_0 = const()[name = string("op_599_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_599_cast_fp16 = slice_by_index(begin = var_599_begin_0, end = var_599_end_0, end_mask = var_599_end_mask_0, x = k_19_cast_fp16)[name = string("op_599_cast_fp16")];
-            tensor<int32, [4]> var_605_begin_0 = const()[name = string("op_605_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_605_end_0 = const()[name = string("op_605_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_605_end_mask_0 = const()[name = string("op_605_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_605_cast_fp16 = slice_by_index(begin = var_605_begin_0, end = var_605_end_0, end_mask = var_605_end_mask_0, x = k_19_cast_fp16)[name = string("op_605_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_9_cast_fp16 = concat(axis = var_508, interleave = rotated_9_interleave_0, values = (var_595_cast_fp16, var_587_cast_fp16))[name = string("rotated_9_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_598_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = string("op_598_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_599_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = string("op_599_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_9_cast_fp16 = add(x = var_598_cast_fp16, y = var_599_cast_fp16)[name = string("roped_9_cast_fp16")];
+            tensor<int32, [4]> var_612_begin_0 = const()[name = string("op_612_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_612_end_0 = const()[name = string("op_612_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_612_end_mask_0 = const()[name = string("op_612_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_612_cast_fp16 = slice_by_index(begin = var_612_begin_0, end = var_612_end_0, end_mask = var_612_end_mask_0, x = k_19_cast_fp16)[name = string("op_612_cast_fp16")];
+            tensor<int32, [4]> var_618_begin_0 = const()[name = string("op_618_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_618_end_0 = const()[name = string("op_618_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_618_end_mask_0 = const()[name = string("op_618_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_618_cast_fp16 = slice_by_index(begin = var_618_begin_0, end = var_618_end_0, end_mask = var_618_end_mask_0, x = k_19_cast_fp16)[name = string("op_618_cast_fp16")];
             fp16 const_34_promoted_to_fp16 = const()[name = string("const_34_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_607_cast_fp16 = mul(x = var_605_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_607_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_620_cast_fp16 = mul(x = var_618_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_620_cast_fp16")];
             bool rotated_interleave_0 = const()[name = string("rotated_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_cast_fp16 = concat(axis = var_496, interleave = rotated_interleave_0, values = (var_607_cast_fp16, var_599_cast_fp16))[name = string("rotated_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_610_cast_fp16 = mul(x = k_19_cast_fp16, y = cos)[name = string("op_610_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_611_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_611_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_cast_fp16 = add(x = var_610_cast_fp16, y = var_611_cast_fp16)[name = string("roped_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_cast_fp16 = concat(axis = var_508, interleave = rotated_interleave_0, values = (var_620_cast_fp16, var_612_cast_fp16))[name = string("rotated_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_623_cast_fp16 = mul(x = k_19_cast_fp16, y = cos)[name = string("op_623_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_624_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_624_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_cast_fp16 = add(x = var_623_cast_fp16, y = var_624_cast_fp16)[name = string("roped_cast_fp16")];
+            tensor<int32, [4]> v_21_perm_0 = const()[name = string("v_21_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             bool k_interleave_0 = const()[name = string("k_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_cast_fp16 = concat(axis = var_484, interleave = k_interleave_0, values = (k_cache_2, roped_cast_fp16))[name = string("k_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> k_cast_fp16 = concat(axis = var_496, interleave = k_interleave_0, values = (k_cache_2, roped_cast_fp16))[name = string("k_cast_fp16")];
             bool v_interleave_0 = const()[name = string("v_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> v_cast_fp16 = concat(axis = var_484, interleave = v_interleave_0, values = (v_cache_2, v_15_cast_fp16))[name = string("v_cast_fp16")];
-            tensor<int32, [4]> var_618_begin_0 = const()[name = string("op_618_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_618_end_0 = const()[name = string("op_618_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_618_end_mask_0 = const()[name = string("op_618_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_2 = slice_by_index(begin = var_618_begin_0, end = var_618_end_0, end_mask = var_618_end_mask_0, x = k_cast_fp16)[name = string("op_618_cast_fp16")];
-            tensor<int32, [4]> var_619_begin_0 = const()[name = string("op_619_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_619_end_0 = const()[name = string("op_619_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_619_end_mask_0 = const()[name = string("op_619_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_2 = slice_by_index(begin = var_619_begin_0, end = var_619_end_0, end_mask = var_619_end_mask_0, x = v_cast_fp16)[name = string("op_619_cast_fp16")];
-            fp16 var_623_to_fp16 = const()[name = string("op_623_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 1]> var_624_cast_fp16 = mul(x = roped_9_cast_fp16, y = var_623_to_fp16)[name = string("op_624_cast_fp16")];
-            bool attn_weights_9_transpose_x_0 = const()[name = string("attn_weights_9_transpose_x_0"), val = bool(true)];
-            bool attn_weights_9_transpose_y_0 = const()[name = string("attn_weights_9_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_9_cast_fp16 = matmul(transpose_x = attn_weights_9_transpose_x_0, transpose_y = attn_weights_9_transpose_y_0, x = var_624_cast_fp16, y = k_cast_fp16)[name = string("attn_weights_9_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_cast_fp16 = add(x = attn_weights_9_cast_fp16, y = mask)[name = string("attn_weights_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> var_632_cast_fp16 = softmax(axis = var_492, x = attn_weights_cast_fp16)[name = string("op_632_cast_fp16")];
-            bool attn_9_transpose_x_0 = const()[name = string("attn_9_transpose_x_0"), val = bool(false)];
-            bool attn_9_transpose_y_0 = const()[name = string("attn_9_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 1]> attn_9_cast_fp16 = matmul(transpose_x = attn_9_transpose_x_0, transpose_y = attn_9_transpose_y_0, x = v_cast_fp16, y = var_632_cast_fp16)[name = string("attn_9_cast_fp16")];
-            tensor<int32, [4]> var_636 = const()[name = string("op_636"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
-            tensor<fp16, [1, 4096, 1, 1]> input_17_cast_fp16 = reshape(shape = var_636, x = attn_9_cast_fp16)[name = string("input_17_cast_fp16")];
-            tensor<int32, [2]> var_640 = const()[name = string("op_640"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_642 = const()[name = string("op_642"), val = tensor<int32, [2]>([1, 1])];
-            string var_644_pad_type_0 = const()[name = string("op_644_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_644_pad_0 = const()[name = string("op_644_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_644_cast_fp16 = conv(dilations = var_642, groups = var_493, pad = var_644_pad_0, pad_type = var_644_pad_type_0, strides = var_640, weight = blocks_2_attn_proj_weight_palettized_cast_fp16, x = input_17_cast_fp16)[name = string("op_644_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 128]> v_21_cast_fp16 = transpose(perm = v_21_perm_0, x = v_19_cast_fp16)[name = string("transpose_1")];
+            tensor<fp16, [1, 32, 512, 128]> v_cast_fp16 = concat(axis = var_508, interleave = v_interleave_0, values = (v_cache_2, v_21_cast_fp16))[name = string("v_cast_fp16")];
+            tensor<int32, [4]> var_635_begin_0 = const()[name = string("op_635_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_635_end_0 = const()[name = string("op_635_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_635_end_mask_0 = const()[name = string("op_635_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_2 = slice_by_index(begin = var_635_begin_0, end = var_635_end_0, end_mask = var_635_end_mask_0, x = k_cast_fp16)[name = string("op_635_cast_fp16")];
+            tensor<int32, [4]> var_636_begin_0 = const()[name = string("op_636_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_636_end_0 = const()[name = string("op_636_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_636_end_mask_0 = const()[name = string("op_636_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_2 = slice_by_index(begin = var_636_begin_0, end = var_636_end_0, end_mask = var_636_end_mask_0, x = v_cast_fp16)[name = string("op_636_cast_fp16")];
+            fp16 var_641_to_fp16 = const()[name = string("op_641_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 32, 128, 4]> var_642_cast_fp16 = mul(x = roped_9_cast_fp16, y = var_641_to_fp16)[name = string("op_642_cast_fp16")];
+            bool attn_weights_13_transpose_x_0 = const()[name = string("attn_weights_13_transpose_x_0"), val = bool(true)];
+            bool attn_weights_13_transpose_y_0 = const()[name = string("attn_weights_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_13_cast_fp16 = matmul(transpose_x = attn_weights_13_transpose_x_0, transpose_y = attn_weights_13_transpose_y_0, x = var_642_cast_fp16, y = k_cast_fp16)[name = string("attn_weights_13_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_15_cast_fp16 = add(x = attn_weights_13_cast_fp16, y = mask)[name = string("attn_weights_15_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_cast_fp16 = softmax(axis = var_504, x = attn_weights_15_cast_fp16)[name = string("attn_weights_cast_fp16")];
+            bool var_651_transpose_x_0 = const()[name = string("op_651_transpose_x_0"), val = bool(false)];
+            bool var_651_transpose_y_0 = const()[name = string("op_651_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> var_651_cast_fp16 = matmul(transpose_x = var_651_transpose_x_0, transpose_y = var_651_transpose_y_0, x = attn_weights_cast_fp16, y = v_cast_fp16)[name = string("op_651_cast_fp16")];
+            tensor<int32, [4]> attn_5_perm_0 = const()[name = string("attn_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_654 = const()[name = string("op_654"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 4]> attn_5_cast_fp16 = transpose(perm = attn_5_perm_0, x = var_651_cast_fp16)[name = string("transpose_0")];
+            tensor<fp16, [1, 4096, 1, 4]> input_17_cast_fp16 = reshape(shape = var_654, x = attn_5_cast_fp16)[name = string("input_17_cast_fp16")];
+            tensor<int32, [2]> var_658 = const()[name = string("op_658"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_660 = const()[name = string("op_660"), val = tensor<int32, [2]>([1, 1])];
+            string var_662_pad_type_0 = const()[name = string("op_662_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_662_pad_0 = const()[name = string("op_662_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_662_cast_fp16 = conv(dilations = var_660, groups = var_505, pad = var_662_pad_0, pad_type = var_662_pad_type_0, strides = var_658, weight = blocks_2_attn_proj_weight_palettized_cast_fp16, x = input_17_cast_fp16)[name = string("op_662_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303804864)))];
-            tensor<fp16, [1, 4096, 1, 1]> attention_output_cast_fp16 = mul(x = var_644_cast_fp16, y = blocks_2_attn_proj_output_scales_to_fp16)[name = string("attention_output_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_39_cast_fp16 = add(x = attention_output_cast_fp16, y = x_29_cast_fp16)[name = string("x_39_cast_fp16")];
-            tensor<int32, [1]> var_663_axes_0 = const()[name = string("op_663_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_663_cast_fp16 = squeeze(axes = var_663_axes_0, x = x_39_cast_fp16)[name = string("op_663_cast_fp16")];
-            bool var_665_interleave_0 = const()[name = string("op_665_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_to_fp16 = const()[name = string("eps_chan_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_665_cast_fp16 = concat(axis = var_493, interleave = var_665_interleave_0, values = (var_663_cast_fp16, eps_chan_to_fp16))[name = string("op_665_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> attention_output_cast_fp16 = mul(x = var_662_cast_fp16, y = blocks_2_attn_proj_output_scales_to_fp16)[name = string("attention_output_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_39_cast_fp16 = add(x = attention_output_cast_fp16, y = x_29_cast_fp16)[name = string("x_39_cast_fp16")];
+            tensor<int32, [1]> var_681_axes_0 = const()[name = string("op_681_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_681_cast_fp16 = squeeze(axes = var_681_axes_0, x = x_39_cast_fp16)[name = string("op_681_cast_fp16")];
+            bool var_683_interleave_0 = const()[name = string("op_683_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_to_fp16 = const()[name = string("eps_chan_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_683_cast_fp16 = concat(axis = var_505, interleave = var_683_interleave_0, values = (var_681_cast_fp16, eps_chan_to_fp16))[name = string("op_683_cast_fp16")];
             tensor<int32, [1]> x_eps_axes_0 = const()[name = string("x_eps_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_cast_fp16 = expand_dims(axes = x_eps_axes_0, x = var_665_cast_fp16)[name = string("x_eps_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_cast_fp16 = expand_dims(axes = x_eps_axes_0, x = var_683_cast_fp16)[name = string("x_eps_cast_fp16")];
             tensor<int32, [1]> norm_x_axes_0 = const()[name = string("norm_x_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_cast_fp16 = reduce_l2_norm(axes = norm_x_axes_0, keep_dims = var_497, x = x_eps_cast_fp16)[name = string("norm_x_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_31_cast_fp16 = real_div(x = x_39_cast_fp16, y = norm_x_cast_fp16)[name = string("x_normed_31_cast_fp16")];
-            fp16 var_670_to_fp16 = const()[name = string("op_670_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_33_cast_fp16 = mul(x = x_normed_31_cast_fp16, y = var_670_to_fp16)[name = string("x_normed_33_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_cast_fp16 = reduce_l2_norm(axes = norm_x_axes_0, keep_dims = var_509, x = x_eps_cast_fp16)[name = string("norm_x_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_31_cast_fp16 = real_div(x = x_39_cast_fp16, y = norm_x_cast_fp16)[name = string("x_normed_31_cast_fp16")];
+            fp16 var_688_to_fp16 = const()[name = string("op_688_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_33_cast_fp16 = mul(x = x_normed_31_cast_fp16, y = var_688_to_fp16)[name = string("x_normed_33_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_norm_2_weight_to_fp16 = const()[name = string("blocks_2_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303813120)))];
-            tensor<fp16, [1, 4096, 1, 1]> input_19_cast_fp16 = mul(x = x_normed_33_cast_fp16, y = blocks_2_norm_2_weight_to_fp16)[name = string("input_19_cast_fp16")];
-            tensor<int32, [2]> var_682 = const()[name = string("op_682"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_684 = const()[name = string("op_684"), val = tensor<int32, [2]>([1, 1])];
-            string var_686_pad_type_0 = const()[name = string("op_686_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_686_pad_0 = const()[name = string("op_686_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_686_cast_fp16 = conv(dilations = var_684, groups = var_493, pad = var_686_pad_0, pad_type = var_686_pad_type_0, strides = var_682, weight = blocks_2_mlp_fc_1_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_686_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303821376)))];
-            tensor<fp16, [1, 11008, 1, 1]> input_21_cast_fp16 = mul(x = var_686_cast_fp16, y = blocks_2_mlp_fc_1_output_scales_to_fp16)[name = string("input_21_cast_fp16")];
-            tensor<int32, [2]> var_690 = const()[name = string("op_690"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_692 = const()[name = string("op_692"), val = tensor<int32, [2]>([1, 1])];
-            string var_694_pad_type_0 = const()[name = string("op_694_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_694_pad_0 = const()[name = string("op_694_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_694_cast_fp16 = conv(dilations = var_692, groups = var_493, pad = var_694_pad_0, pad_type = var_694_pad_type_0, strides = var_690, weight = blocks_2_mlp_fc_2_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_694_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303843456)))];
-            tensor<fp16, [1, 11008, 1, 1]> x_fc_2_cast_fp16 = mul(x = var_694_cast_fp16, y = blocks_2_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> var_696_cast_fp16 = silu(x = input_21_cast_fp16)[name = string("op_696_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> input_cast_fp16 = mul(x = var_696_cast_fp16, y = x_fc_2_cast_fp16)[name = string("input_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> input_19_cast_fp16 = mul(x = x_normed_33_cast_fp16, y = blocks_2_norm_2_weight_to_fp16)[name = string("input_19_cast_fp16")];
             tensor<int32, [2]> var_700 = const()[name = string("op_700"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_702 = const()[name = string("op_702"), val = tensor<int32, [2]>([1, 1])];
             string var_704_pad_type_0 = const()[name = string("op_704_pad_type_0"), val = string("custom")];
             tensor<int32, [4]> var_704_pad_0 = const()[name = string("op_704_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_704_cast_fp16 = conv(dilations = var_702, groups = var_493, pad = var_704_pad_0, pad_type = var_704_pad_type_0, strides = var_700, weight = blocks_2_mlp_proj_weight_palettized_cast_fp16, x = input_cast_fp16)[name = string("op_704_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> var_704_cast_fp16 = conv(dilations = var_702, groups = var_505, pad = var_704_pad_0, pad_type = var_704_pad_type_0, strides = var_700, weight = blocks_2_mlp_fc_1_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_704_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303821376)))];
+            tensor<fp16, [1, 11008, 1, 4]> input_21_cast_fp16 = mul(x = var_704_cast_fp16, y = blocks_2_mlp_fc_1_output_scales_to_fp16)[name = string("input_21_cast_fp16")];
+            tensor<int32, [2]> var_708 = const()[name = string("op_708"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_710 = const()[name = string("op_710"), val = tensor<int32, [2]>([1, 1])];
+            string var_712_pad_type_0 = const()[name = string("op_712_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_712_pad_0 = const()[name = string("op_712_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_712_cast_fp16 = conv(dilations = var_710, groups = var_505, pad = var_712_pad_0, pad_type = var_712_pad_type_0, strides = var_708, weight = blocks_2_mlp_fc_2_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_712_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303843456)))];
+            tensor<fp16, [1, 11008, 1, 4]> x_fc_2_cast_fp16 = mul(x = var_712_cast_fp16, y = blocks_2_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> var_714_cast_fp16 = silu(x = input_21_cast_fp16)[name = string("op_714_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> input_cast_fp16 = mul(x = var_714_cast_fp16, y = x_fc_2_cast_fp16)[name = string("input_cast_fp16")];
+            tensor<int32, [2]> var_718 = const()[name = string("op_718"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_720 = const()[name = string("op_720"), val = tensor<int32, [2]>([1, 1])];
+            string var_722_pad_type_0 = const()[name = string("op_722_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_722_pad_0 = const()[name = string("op_722_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_722_cast_fp16 = conv(dilations = var_720, groups = var_505, pad = var_722_pad_0, pad_type = var_722_pad_type_0, strides = var_718, weight = blocks_2_mlp_proj_weight_palettized_cast_fp16, x = input_cast_fp16)[name = string("op_722_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303865536)))];
-            tensor<fp16, [1, 4096, 1, 1]> var_705_cast_fp16 = mul(x = var_704_cast_fp16, y = blocks_2_mlp_proj_output_scales_to_fp16)[name = string("op_705_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> new_x = add(x = var_705_cast_fp16, y = x_39_cast_fp16)[name = string("op_706_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> var_723_cast_fp16 = mul(x = var_722_cast_fp16, y = blocks_2_mlp_proj_output_scales_to_fp16)[name = string("op_723_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> new_x = add(x = var_723_cast_fp16, y = x_39_cast_fp16)[name = string("op_724_cast_fp16")];
         } -> (new_x, new_k_cache_0, new_k_cache_1, new_k_cache_2, new_v_cache_0, new_v_cache_1, new_v_cache_2);
     func input_512_context_512<ios18>(tensor<fp16, [128, 512]> cos, tensor<fp16, [1, 1, 512, 512]> mask, tensor<fp16, [128, 512]> sin, tensor<fp16, [1, 4096, 1, 512]> x) {
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_q_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8388736))))[name = string("blocks_0_attn_q_proj_weight_palettized_cast_fp16")];
@@ -502,86 +514,86 @@ program(1.3)
             tensor<fp16, [1, 4096, 1, 512]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_54_to_fp16)[name = string("x_normed_3_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_1_weight_to_fp16 = const()[name = string("blocks_0_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303567936)))];
             tensor<fp16, [1, 4096, 1, 512]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = string("x_5_cast_fp16")];
-            tensor<int32, [2]> var_66 = const()[name = string("op_66"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_68 = const()[name = string("op_68"), val = tensor<int32, [2]>([1, 1])];
-            string var_70_pad_type_0 = const()[name = string("op_70_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_70_pad_0 = const()[name = string("op_70_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_70_cast_fp16 = conv(dilations = var_68, groups = var_25, pad = var_70_pad_0, pad_type = var_70_pad_type_0, strides = var_66, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_70_cast_fp16")];
+            tensor<int32, [2]> var_67 = const()[name = string("op_67"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_69 = const()[name = string("op_69"), val = tensor<int32, [2]>([1, 1])];
+            string var_71_pad_type_0 = const()[name = string("op_71_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_71_pad_0 = const()[name = string("op_71_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_71_cast_fp16 = conv(dilations = var_69, groups = var_25, pad = var_71_pad_0, pad_type = var_71_pad_type_0, strides = var_67, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_71_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303576192)))];
-            tensor<fp16, [1, 4096, 1, 512]> q_1_cast_fp16 = mul(x = var_70_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
-            tensor<int32, [2]> var_74 = const()[name = string("op_74"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_76 = const()[name = string("op_76"), val = tensor<int32, [2]>([1, 1])];
-            string var_78_pad_type_0 = const()[name = string("op_78_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_78_pad_0 = const()[name = string("op_78_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_78_cast_fp16 = conv(dilations = var_76, groups = var_25, pad = var_78_pad_0, pad_type = var_78_pad_type_0, strides = var_74, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_78_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> q_1_cast_fp16 = mul(x = var_71_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
+            tensor<int32, [2]> var_75 = const()[name = string("op_75"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_77 = const()[name = string("op_77"), val = tensor<int32, [2]>([1, 1])];
+            string var_79_pad_type_0 = const()[name = string("op_79_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_79_pad_0 = const()[name = string("op_79_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_79_cast_fp16 = conv(dilations = var_77, groups = var_25, pad = var_79_pad_0, pad_type = var_79_pad_type_0, strides = var_75, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_79_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303584448)))];
-            tensor<fp16, [1, 4096, 1, 512]> k_1_cast_fp16 = mul(x = var_78_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
-            tensor<int32, [2]> var_82 = const()[name = string("op_82"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_84 = const()[name = string("op_84"), val = tensor<int32, [2]>([1, 1])];
-            string var_86_pad_type_0 = const()[name = string("op_86_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_86_pad_0 = const()[name = string("op_86_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_86_cast_fp16 = conv(dilations = var_84, groups = var_25, pad = var_86_pad_0, pad_type = var_86_pad_type_0, strides = var_82, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_86_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> k_1_cast_fp16 = mul(x = var_79_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
+            tensor<int32, [2]> var_83 = const()[name = string("op_83"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_85 = const()[name = string("op_85"), val = tensor<int32, [2]>([1, 1])];
+            string var_87_pad_type_0 = const()[name = string("op_87_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_87_pad_0 = const()[name = string("op_87_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_87_cast_fp16 = conv(dilations = var_85, groups = var_25, pad = var_87_pad_0, pad_type = var_87_pad_type_0, strides = var_83, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_87_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303592704)))];
-            tensor<fp16, [1, 4096, 1, 512]> v_1_cast_fp16 = mul(x = var_86_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
-            tensor<int32, [4]> var_88 = const()[name = string("op_88"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> q_3_cast_fp16 = reshape(shape = var_88, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
-            tensor<int32, [4]> var_90 = const()[name = string("op_90"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> k_3_cast_fp16 = reshape(shape = var_90, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
-            tensor<int32, [4]> var_92 = const()[name = string("op_92"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> v_3_cast_fp16 = reshape(shape = var_92, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
-            tensor<int32, [4]> var_104_begin_0 = const()[name = string("op_104_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_104_end_0 = const()[name = string("op_104_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_104_end_mask_0 = const()[name = string("op_104_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_104_cast_fp16 = slice_by_index(begin = var_104_begin_0, end = var_104_end_0, end_mask = var_104_end_mask_0, x = q_3_cast_fp16)[name = string("op_104_cast_fp16")];
-            tensor<int32, [4]> var_110_begin_0 = const()[name = string("op_110_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_110_end_0 = const()[name = string("op_110_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_110_end_mask_0 = const()[name = string("op_110_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_110_cast_fp16 = slice_by_index(begin = var_110_begin_0, end = var_110_end_0, end_mask = var_110_end_mask_0, x = q_3_cast_fp16)[name = string("op_110_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> v_1_cast_fp16 = mul(x = var_87_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
+            tensor<int32, [4]> var_89 = const()[name = string("op_89"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> q_3_cast_fp16 = reshape(shape = var_89, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
+            tensor<int32, [4]> var_91 = const()[name = string("op_91"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> k_3_cast_fp16 = reshape(shape = var_91, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
+            tensor<int32, [4]> var_93 = const()[name = string("op_93"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> v_3_cast_fp16 = reshape(shape = var_93, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
+            tensor<int32, [4]> var_105_begin_0 = const()[name = string("op_105_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_105_end_0 = const()[name = string("op_105_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_105_end_mask_0 = const()[name = string("op_105_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_105_cast_fp16 = slice_by_index(begin = var_105_begin_0, end = var_105_end_0, end_mask = var_105_end_mask_0, x = q_3_cast_fp16)[name = string("op_105_cast_fp16")];
+            tensor<int32, [4]> var_111_begin_0 = const()[name = string("op_111_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_111_end_0 = const()[name = string("op_111_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_111_end_mask_0 = const()[name = string("op_111_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_111_cast_fp16 = slice_by_index(begin = var_111_begin_0, end = var_111_end_0, end_mask = var_111_end_mask_0, x = q_3_cast_fp16)[name = string("op_111_cast_fp16")];
             fp16 const_6_promoted_to_fp16 = const()[name = string("const_6_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_112_cast_fp16 = mul(x = var_110_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_112_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_113_cast_fp16 = mul(x = var_111_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_113_cast_fp16")];
             bool rotated_1_interleave_0 = const()[name = string("rotated_1_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_1_cast_fp16 = concat(axis = var_28, interleave = rotated_1_interleave_0, values = (var_112_cast_fp16, var_104_cast_fp16))[name = string("rotated_1_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_115_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_115_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_116_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_116_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_1_cast_fp16 = add(x = var_115_cast_fp16, y = var_116_cast_fp16)[name = string("roped_1_cast_fp16")];
-            tensor<int32, [4]> var_129_begin_0 = const()[name = string("op_129_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_129_end_0 = const()[name = string("op_129_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_129_end_mask_0 = const()[name = string("op_129_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_129_cast_fp16 = slice_by_index(begin = var_129_begin_0, end = var_129_end_0, end_mask = var_129_end_mask_0, x = k_3_cast_fp16)[name = string("op_129_cast_fp16")];
-            tensor<int32, [4]> var_135_begin_0 = const()[name = string("op_135_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_135_end_0 = const()[name = string("op_135_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_135_end_mask_0 = const()[name = string("op_135_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_135_cast_fp16 = slice_by_index(begin = var_135_begin_0, end = var_135_end_0, end_mask = var_135_end_mask_0, x = k_3_cast_fp16)[name = string("op_135_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_1_cast_fp16 = concat(axis = var_28, interleave = rotated_1_interleave_0, values = (var_113_cast_fp16, var_105_cast_fp16))[name = string("rotated_1_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_116_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_116_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_117_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_117_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_1_cast_fp16 = add(x = var_116_cast_fp16, y = var_117_cast_fp16)[name = string("roped_1_cast_fp16")];
+            tensor<int32, [4]> var_130_begin_0 = const()[name = string("op_130_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_130_end_0 = const()[name = string("op_130_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_130_end_mask_0 = const()[name = string("op_130_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_130_cast_fp16 = slice_by_index(begin = var_130_begin_0, end = var_130_end_0, end_mask = var_130_end_mask_0, x = k_3_cast_fp16)[name = string("op_130_cast_fp16")];
+            tensor<int32, [4]> var_136_begin_0 = const()[name = string("op_136_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_136_end_0 = const()[name = string("op_136_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_136_end_mask_0 = const()[name = string("op_136_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_136_cast_fp16 = slice_by_index(begin = var_136_begin_0, end = var_136_end_0, end_mask = var_136_end_mask_0, x = k_3_cast_fp16)[name = string("op_136_cast_fp16")];
             fp16 const_8_promoted_to_fp16 = const()[name = string("const_8_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_137_cast_fp16 = mul(x = var_135_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_137_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_138_cast_fp16 = mul(x = var_136_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_138_cast_fp16")];
             bool rotated_3_interleave_0 = const()[name = string("rotated_3_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_3_cast_fp16 = concat(axis = var_28, interleave = rotated_3_interleave_0, values = (var_137_cast_fp16, var_129_cast_fp16))[name = string("rotated_3_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_140_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_140_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_141_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_141_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_3_cast_fp16 = add(x = var_140_cast_fp16, y = var_141_cast_fp16)[name = string("roped_3_cast_fp16")];
-            bool q_5_interleave_0 = const()[name = string("q_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> q_5_cast_fp16 = concat(axis = var_28, interleave = q_5_interleave_0, values = roped_1_cast_fp16)[name = string("q_5_cast_fp16")];
-            bool k_5_interleave_0 = const()[name = string("k_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_5_cast_fp16 = concat(axis = var_28, interleave = k_5_interleave_0, values = roped_3_cast_fp16)[name = string("k_5_cast_fp16")];
-            tensor<int32, [4]> var_156_begin_0 = const()[name = string("op_156_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_156_end_0 = const()[name = string("op_156_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_156_end_mask_0 = const()[name = string("op_156_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_0 = slice_by_index(begin = var_156_begin_0, end = var_156_end_0, end_mask = var_156_end_mask_0, x = k_5_cast_fp16)[name = string("op_156_cast_fp16")];
-            tensor<int32, [4]> var_157_begin_0 = const()[name = string("op_157_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_157_end_0 = const()[name = string("op_157_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_157_end_mask_0 = const()[name = string("op_157_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_0 = slice_by_index(begin = var_157_begin_0, end = var_157_end_0, end_mask = var_157_end_mask_0, x = v_3_cast_fp16)[name = string("op_157_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_3_cast_fp16 = concat(axis = var_28, interleave = rotated_3_interleave_0, values = (var_138_cast_fp16, var_130_cast_fp16))[name = string("rotated_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_141_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_141_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_142_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_142_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_3_cast_fp16 = add(x = var_141_cast_fp16, y = var_142_cast_fp16)[name = string("roped_3_cast_fp16")];
+            tensor<int32, [4]> v_5_perm_0 = const()[name = string("v_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_146_begin_0 = const()[name = string("op_146_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_146_end_0 = const()[name = string("op_146_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_146_end_mask_0 = const()[name = string("op_146_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_0 = slice_by_index(begin = var_146_begin_0, end = var_146_end_0, end_mask = var_146_end_mask_0, x = roped_3_cast_fp16)[name = string("op_146_cast_fp16")];
+            tensor<int32, [4]> var_147_begin_0 = const()[name = string("op_147_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_147_end_0 = const()[name = string("op_147_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_147_end_mask_0 = const()[name = string("op_147_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 512, 128]> v_5_cast_fp16 = transpose(perm = v_5_perm_0, x = v_3_cast_fp16)[name = string("transpose_5")];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_0 = slice_by_index(begin = var_147_begin_0, end = var_147_end_0, end_mask = var_147_end_mask_0, x = v_5_cast_fp16)[name = string("op_147_cast_fp16")];
             fp16 var_161_to_fp16 = const()[name = string("op_161_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 512]> var_162_cast_fp16 = mul(x = q_5_cast_fp16, y = var_161_to_fp16)[name = string("op_162_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_162_cast_fp16 = mul(x = roped_1_cast_fp16, y = var_161_to_fp16)[name = string("op_162_cast_fp16")];
             bool attn_weights_1_transpose_x_0 = const()[name = string("attn_weights_1_transpose_x_0"), val = bool(true)];
             bool attn_weights_1_transpose_y_0 = const()[name = string("attn_weights_1_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_162_cast_fp16, y = k_5_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_162_cast_fp16, y = roped_3_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
             tensor<fp16, [1, 32, 512, 512]> attn_weights_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = mask)[name = string("attn_weights_3_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> var_170_cast_fp16 = softmax(axis = var_24, x = attn_weights_3_cast_fp16)[name = string("op_170_cast_fp16")];
-            bool attn_1_transpose_x_0 = const()[name = string("attn_1_transpose_x_0"), val = bool(false)];
-            bool attn_1_transpose_y_0 = const()[name = string("attn_1_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 512]> attn_1_cast_fp16 = matmul(transpose_x = attn_1_transpose_x_0, transpose_y = attn_1_transpose_y_0, x = v_3_cast_fp16, y = var_170_cast_fp16)[name = string("attn_1_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_5_cast_fp16 = softmax(axis = var_24, x = attn_weights_3_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
+            bool var_171_transpose_x_1 = const()[name = string("op_171_transpose_x_1"), val = bool(false)];
+            bool var_171_transpose_y_1 = const()[name = string("op_171_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 32, 512, 128]> var_171_cast_fp16 = matmul(transpose_x = var_171_transpose_x_1, transpose_y = var_171_transpose_y_1, x = attn_weights_5_cast_fp16, y = v_3_cast_fp16)[name = string("op_171_cast_fp16")];
+            tensor<int32, [4]> attn_1_perm_0 = const()[name = string("attn_1_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             tensor<int32, [4]> var_174 = const()[name = string("op_174"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 512]> attn_1_cast_fp16 = transpose(perm = attn_1_perm_0, x = var_171_cast_fp16)[name = string("transpose_4")];
             tensor<fp16, [1, 4096, 1, 512]> input_1_cast_fp16 = reshape(shape = var_174, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
             tensor<int32, [2]> var_178 = const()[name = string("op_178"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_180 = const()[name = string("op_180"), val = tensor<int32, [2]>([1, 1])];
@@ -645,86 +657,86 @@ program(1.3)
             tensor<fp16, [1, 4096, 1, 512]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_291_to_fp16)[name = string("x_normed_15_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_1_weight_to_fp16 = const()[name = string("blocks_1_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303669888)))];
             tensor<fp16, [1, 4096, 1, 512]> x_19_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = string("x_19_cast_fp16")];
-            tensor<int32, [2]> var_306 = const()[name = string("op_306"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_308 = const()[name = string("op_308"), val = tensor<int32, [2]>([1, 1])];
-            string var_310_pad_type_0 = const()[name = string("op_310_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_310_pad_0 = const()[name = string("op_310_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_310_cast_fp16 = conv(dilations = var_308, groups = var_263, pad = var_310_pad_0, pad_type = var_310_pad_type_0, strides = var_306, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_310_cast_fp16")];
+            tensor<int32, [2]> var_307 = const()[name = string("op_307"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_309 = const()[name = string("op_309"), val = tensor<int32, [2]>([1, 1])];
+            string var_311_pad_type_0 = const()[name = string("op_311_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_311_pad_0 = const()[name = string("op_311_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_311_cast_fp16 = conv(dilations = var_309, groups = var_263, pad = var_311_pad_0, pad_type = var_311_pad_type_0, strides = var_307, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_311_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303678144)))];
-            tensor<fp16, [1, 4096, 1, 512]> q_7_cast_fp16 = mul(x = var_310_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
-            tensor<int32, [2]> var_314 = const()[name = string("op_314"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_316 = const()[name = string("op_316"), val = tensor<int32, [2]>([1, 1])];
-            string var_318_pad_type_0 = const()[name = string("op_318_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_318_pad_0 = const()[name = string("op_318_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_318_cast_fp16 = conv(dilations = var_316, groups = var_263, pad = var_318_pad_0, pad_type = var_318_pad_type_0, strides = var_314, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_318_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> q_7_cast_fp16 = mul(x = var_311_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
+            tensor<int32, [2]> var_315 = const()[name = string("op_315"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_317 = const()[name = string("op_317"), val = tensor<int32, [2]>([1, 1])];
+            string var_319_pad_type_0 = const()[name = string("op_319_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_319_pad_0 = const()[name = string("op_319_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_319_cast_fp16 = conv(dilations = var_317, groups = var_263, pad = var_319_pad_0, pad_type = var_319_pad_type_0, strides = var_315, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_319_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303686400)))];
-            tensor<fp16, [1, 4096, 1, 512]> k_7_cast_fp16 = mul(x = var_318_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_7_cast_fp16")];
-            tensor<int32, [2]> var_322 = const()[name = string("op_322"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_324 = const()[name = string("op_324"), val = tensor<int32, [2]>([1, 1])];
-            string var_326_pad_type_0 = const()[name = string("op_326_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_326_pad_0 = const()[name = string("op_326_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_326_cast_fp16 = conv(dilations = var_324, groups = var_263, pad = var_326_pad_0, pad_type = var_326_pad_type_0, strides = var_322, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_326_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> k_7_cast_fp16 = mul(x = var_319_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_7_cast_fp16")];
+            tensor<int32, [2]> var_323 = const()[name = string("op_323"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_325 = const()[name = string("op_325"), val = tensor<int32, [2]>([1, 1])];
+            string var_327_pad_type_0 = const()[name = string("op_327_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_327_pad_0 = const()[name = string("op_327_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_327_cast_fp16 = conv(dilations = var_325, groups = var_263, pad = var_327_pad_0, pad_type = var_327_pad_type_0, strides = var_323, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_327_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303694656)))];
-            tensor<fp16, [1, 4096, 1, 512]> v_5_cast_fp16 = mul(x = var_326_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_5_cast_fp16")];
-            tensor<int32, [4]> var_328 = const()[name = string("op_328"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> q_9_cast_fp16 = reshape(shape = var_328, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
-            tensor<int32, [4]> var_330 = const()[name = string("op_330"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> k_9_cast_fp16 = reshape(shape = var_330, x = k_7_cast_fp16)[name = string("k_9_cast_fp16")];
-            tensor<int32, [4]> var_332 = const()[name = string("op_332"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> v_7_cast_fp16 = reshape(shape = var_332, x = v_5_cast_fp16)[name = string("v_7_cast_fp16")];
-            tensor<int32, [4]> var_344_begin_0 = const()[name = string("op_344_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_344_end_0 = const()[name = string("op_344_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_344_end_mask_0 = const()[name = string("op_344_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_344_cast_fp16 = slice_by_index(begin = var_344_begin_0, end = var_344_end_0, end_mask = var_344_end_mask_0, x = q_9_cast_fp16)[name = string("op_344_cast_fp16")];
-            tensor<int32, [4]> var_350_begin_0 = const()[name = string("op_350_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_350_end_0 = const()[name = string("op_350_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_350_end_mask_0 = const()[name = string("op_350_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_350_cast_fp16 = slice_by_index(begin = var_350_begin_0, end = var_350_end_0, end_mask = var_350_end_mask_0, x = q_9_cast_fp16)[name = string("op_350_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> v_7_cast_fp16 = mul(x = var_327_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_7_cast_fp16")];
+            tensor<int32, [4]> var_329 = const()[name = string("op_329"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> q_9_cast_fp16 = reshape(shape = var_329, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
+            tensor<int32, [4]> var_331 = const()[name = string("op_331"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> k_9_cast_fp16 = reshape(shape = var_331, x = k_7_cast_fp16)[name = string("k_9_cast_fp16")];
+            tensor<int32, [4]> var_333 = const()[name = string("op_333"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> v_9_cast_fp16 = reshape(shape = var_333, x = v_7_cast_fp16)[name = string("v_9_cast_fp16")];
+            tensor<int32, [4]> var_345_begin_0 = const()[name = string("op_345_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_345_end_0 = const()[name = string("op_345_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_345_end_mask_0 = const()[name = string("op_345_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_345_cast_fp16 = slice_by_index(begin = var_345_begin_0, end = var_345_end_0, end_mask = var_345_end_mask_0, x = q_9_cast_fp16)[name = string("op_345_cast_fp16")];
+            tensor<int32, [4]> var_351_begin_0 = const()[name = string("op_351_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_351_end_0 = const()[name = string("op_351_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_351_end_mask_0 = const()[name = string("op_351_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_351_cast_fp16 = slice_by_index(begin = var_351_begin_0, end = var_351_end_0, end_mask = var_351_end_mask_0, x = q_9_cast_fp16)[name = string("op_351_cast_fp16")];
             fp16 const_19_promoted_to_fp16 = const()[name = string("const_19_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_352_cast_fp16 = mul(x = var_350_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_352_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_353_cast_fp16 = mul(x = var_351_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_353_cast_fp16")];
             bool rotated_5_interleave_0 = const()[name = string("rotated_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_5_cast_fp16 = concat(axis = var_266, interleave = rotated_5_interleave_0, values = (var_352_cast_fp16, var_344_cast_fp16))[name = string("rotated_5_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_355_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_355_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_356_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_356_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_5_cast_fp16 = add(x = var_355_cast_fp16, y = var_356_cast_fp16)[name = string("roped_5_cast_fp16")];
-            tensor<int32, [4]> var_369_begin_0 = const()[name = string("op_369_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_369_end_0 = const()[name = string("op_369_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_369_end_mask_0 = const()[name = string("op_369_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_369_cast_fp16 = slice_by_index(begin = var_369_begin_0, end = var_369_end_0, end_mask = var_369_end_mask_0, x = k_9_cast_fp16)[name = string("op_369_cast_fp16")];
-            tensor<int32, [4]> var_375_begin_0 = const()[name = string("op_375_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_375_end_0 = const()[name = string("op_375_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_375_end_mask_0 = const()[name = string("op_375_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_375_cast_fp16 = slice_by_index(begin = var_375_begin_0, end = var_375_end_0, end_mask = var_375_end_mask_0, x = k_9_cast_fp16)[name = string("op_375_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_5_cast_fp16 = concat(axis = var_266, interleave = rotated_5_interleave_0, values = (var_353_cast_fp16, var_345_cast_fp16))[name = string("rotated_5_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_356_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_356_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_357_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_357_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_5_cast_fp16 = add(x = var_356_cast_fp16, y = var_357_cast_fp16)[name = string("roped_5_cast_fp16")];
+            tensor<int32, [4]> var_370_begin_0 = const()[name = string("op_370_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_370_end_0 = const()[name = string("op_370_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_370_end_mask_0 = const()[name = string("op_370_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_370_cast_fp16 = slice_by_index(begin = var_370_begin_0, end = var_370_end_0, end_mask = var_370_end_mask_0, x = k_9_cast_fp16)[name = string("op_370_cast_fp16")];
+            tensor<int32, [4]> var_376_begin_0 = const()[name = string("op_376_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_376_end_0 = const()[name = string("op_376_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_376_end_mask_0 = const()[name = string("op_376_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_376_cast_fp16 = slice_by_index(begin = var_376_begin_0, end = var_376_end_0, end_mask = var_376_end_mask_0, x = k_9_cast_fp16)[name = string("op_376_cast_fp16")];
             fp16 const_21_promoted_to_fp16 = const()[name = string("const_21_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_377_cast_fp16 = mul(x = var_375_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_377_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_378_cast_fp16 = mul(x = var_376_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_378_cast_fp16")];
             bool rotated_7_interleave_0 = const()[name = string("rotated_7_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_7_cast_fp16 = concat(axis = var_266, interleave = rotated_7_interleave_0, values = (var_377_cast_fp16, var_369_cast_fp16))[name = string("rotated_7_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_380_cast_fp16 = mul(x = k_9_cast_fp16, y = cos)[name = string("op_380_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_381_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = string("op_381_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_7_cast_fp16 = add(x = var_380_cast_fp16, y = var_381_cast_fp16)[name = string("roped_7_cast_fp16")];
-            bool q_11_interleave_0 = const()[name = string("q_11_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> q_11_cast_fp16 = concat(axis = var_266, interleave = q_11_interleave_0, values = roped_5_cast_fp16)[name = string("q_11_cast_fp16")];
-            bool k_11_interleave_0 = const()[name = string("k_11_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_11_cast_fp16 = concat(axis = var_266, interleave = k_11_interleave_0, values = roped_7_cast_fp16)[name = string("k_11_cast_fp16")];
-            tensor<int32, [4]> var_396_begin_0 = const()[name = string("op_396_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_396_end_0 = const()[name = string("op_396_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_396_end_mask_0 = const()[name = string("op_396_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_1 = slice_by_index(begin = var_396_begin_0, end = var_396_end_0, end_mask = var_396_end_mask_0, x = k_11_cast_fp16)[name = string("op_396_cast_fp16")];
-            tensor<int32, [4]> var_397_begin_0 = const()[name = string("op_397_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_397_end_0 = const()[name = string("op_397_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_397_end_mask_0 = const()[name = string("op_397_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_1 = slice_by_index(begin = var_397_begin_0, end = var_397_end_0, end_mask = var_397_end_mask_0, x = v_7_cast_fp16)[name = string("op_397_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_7_cast_fp16 = concat(axis = var_266, interleave = rotated_7_interleave_0, values = (var_378_cast_fp16, var_370_cast_fp16))[name = string("rotated_7_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_381_cast_fp16 = mul(x = k_9_cast_fp16, y = cos)[name = string("op_381_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_382_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = string("op_382_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_7_cast_fp16 = add(x = var_381_cast_fp16, y = var_382_cast_fp16)[name = string("roped_7_cast_fp16")];
+            tensor<int32, [4]> v_11_perm_0 = const()[name = string("v_11_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_386_begin_0 = const()[name = string("op_386_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_386_end_0 = const()[name = string("op_386_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_386_end_mask_0 = const()[name = string("op_386_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_1 = slice_by_index(begin = var_386_begin_0, end = var_386_end_0, end_mask = var_386_end_mask_0, x = roped_7_cast_fp16)[name = string("op_386_cast_fp16")];
+            tensor<int32, [4]> var_387_begin_0 = const()[name = string("op_387_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_387_end_0 = const()[name = string("op_387_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_387_end_mask_0 = const()[name = string("op_387_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 512, 128]> v_11_cast_fp16 = transpose(perm = v_11_perm_0, x = v_9_cast_fp16)[name = string("transpose_3")];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_1 = slice_by_index(begin = var_387_begin_0, end = var_387_end_0, end_mask = var_387_end_mask_0, x = v_11_cast_fp16)[name = string("op_387_cast_fp16")];
             fp16 var_401_to_fp16 = const()[name = string("op_401_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 512]> var_402_cast_fp16 = mul(x = q_11_cast_fp16, y = var_401_to_fp16)[name = string("op_402_cast_fp16")];
-            bool attn_weights_5_transpose_x_0 = const()[name = string("attn_weights_5_transpose_x_0"), val = bool(true)];
-            bool attn_weights_5_transpose_y_0 = const()[name = string("attn_weights_5_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_5_cast_fp16 = matmul(transpose_x = attn_weights_5_transpose_x_0, transpose_y = attn_weights_5_transpose_y_0, x = var_402_cast_fp16, y = k_11_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_7_cast_fp16 = add(x = attn_weights_5_cast_fp16, y = mask)[name = string("attn_weights_7_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> var_410_cast_fp16 = softmax(axis = var_262, x = attn_weights_7_cast_fp16)[name = string("op_410_cast_fp16")];
-            bool attn_3_transpose_x_0 = const()[name = string("attn_3_transpose_x_0"), val = bool(false)];
-            bool attn_3_transpose_y_0 = const()[name = string("attn_3_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 512]> attn_3_cast_fp16 = matmul(transpose_x = attn_3_transpose_x_0, transpose_y = attn_3_transpose_y_0, x = v_7_cast_fp16, y = var_410_cast_fp16)[name = string("attn_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_402_cast_fp16 = mul(x = roped_5_cast_fp16, y = var_401_to_fp16)[name = string("op_402_cast_fp16")];
+            bool attn_weights_7_transpose_x_0 = const()[name = string("attn_weights_7_transpose_x_0"), val = bool(true)];
+            bool attn_weights_7_transpose_y_0 = const()[name = string("attn_weights_7_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_7_cast_fp16 = matmul(transpose_x = attn_weights_7_transpose_x_0, transpose_y = attn_weights_7_transpose_y_0, x = var_402_cast_fp16, y = roped_7_cast_fp16)[name = string("attn_weights_7_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_9_cast_fp16 = add(x = attn_weights_7_cast_fp16, y = mask)[name = string("attn_weights_9_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_11_cast_fp16 = softmax(axis = var_262, x = attn_weights_9_cast_fp16)[name = string("attn_weights_11_cast_fp16")];
+            bool var_411_transpose_x_1 = const()[name = string("op_411_transpose_x_1"), val = bool(false)];
+            bool var_411_transpose_y_1 = const()[name = string("op_411_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 32, 512, 128]> var_411_cast_fp16 = matmul(transpose_x = var_411_transpose_x_1, transpose_y = var_411_transpose_y_1, x = attn_weights_11_cast_fp16, y = v_9_cast_fp16)[name = string("op_411_cast_fp16")];
+            tensor<int32, [4]> attn_3_perm_0 = const()[name = string("attn_3_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             tensor<int32, [4]> var_414 = const()[name = string("op_414"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 512]> attn_3_cast_fp16 = transpose(perm = attn_3_perm_0, x = var_411_cast_fp16)[name = string("transpose_2")];
             tensor<fp16, [1, 4096, 1, 512]> input_9_cast_fp16 = reshape(shape = var_414, x = attn_3_cast_fp16)[name = string("input_9_cast_fp16")];
             tensor<int32, [2]> var_418 = const()[name = string("op_418"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_420 = const()[name = string("op_420"), val = tensor<int32, [2]>([1, 1])];
@@ -788,86 +800,86 @@ program(1.3)
             tensor<fp16, [1, 4096, 1, 512]> x_normed_27_cast_fp16 = mul(x = x_normed_25_cast_fp16, y = var_531_to_fp16)[name = string("x_normed_27_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_norm_1_weight_to_fp16 = const()[name = string("blocks_2_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303771840)))];
             tensor<fp16, [1, 4096, 1, 512]> x_33_cast_fp16 = mul(x = x_normed_27_cast_fp16, y = blocks_2_norm_1_weight_to_fp16)[name = string("x_33_cast_fp16")];
-            tensor<int32, [2]> var_546 = const()[name = string("op_546"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_548 = const()[name = string("op_548"), val = tensor<int32, [2]>([1, 1])];
-            string var_550_pad_type_0 = const()[name = string("op_550_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_550_pad_0 = const()[name = string("op_550_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_550_cast_fp16 = conv(dilations = var_548, groups = var_503, pad = var_550_pad_0, pad_type = var_550_pad_type_0, strides = var_546, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_550_cast_fp16")];
+            tensor<int32, [2]> var_547 = const()[name = string("op_547"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_549 = const()[name = string("op_549"), val = tensor<int32, [2]>([1, 1])];
+            string var_551_pad_type_0 = const()[name = string("op_551_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_551_pad_0 = const()[name = string("op_551_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_551_cast_fp16 = conv(dilations = var_549, groups = var_503, pad = var_551_pad_0, pad_type = var_551_pad_type_0, strides = var_547, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_551_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303780096)))];
-            tensor<fp16, [1, 4096, 1, 512]> q_13_cast_fp16 = mul(x = var_550_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = string("q_13_cast_fp16")];
-            tensor<int32, [2]> var_554 = const()[name = string("op_554"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_556 = const()[name = string("op_556"), val = tensor<int32, [2]>([1, 1])];
-            string var_558_pad_type_0 = const()[name = string("op_558_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_558_pad_0 = const()[name = string("op_558_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_558_cast_fp16 = conv(dilations = var_556, groups = var_503, pad = var_558_pad_0, pad_type = var_558_pad_type_0, strides = var_554, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_558_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> q_13_cast_fp16 = mul(x = var_551_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = string("q_13_cast_fp16")];
+            tensor<int32, [2]> var_555 = const()[name = string("op_555"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_557 = const()[name = string("op_557"), val = tensor<int32, [2]>([1, 1])];
+            string var_559_pad_type_0 = const()[name = string("op_559_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_559_pad_0 = const()[name = string("op_559_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_559_cast_fp16 = conv(dilations = var_557, groups = var_503, pad = var_559_pad_0, pad_type = var_559_pad_type_0, strides = var_555, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_559_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303788352)))];
-            tensor<fp16, [1, 4096, 1, 512]> k_13_cast_fp16 = mul(x = var_558_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = string("k_13_cast_fp16")];
-            tensor<int32, [2]> var_562 = const()[name = string("op_562"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_564 = const()[name = string("op_564"), val = tensor<int32, [2]>([1, 1])];
-            string var_566_pad_type_0 = const()[name = string("op_566_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_566_pad_0 = const()[name = string("op_566_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_566_cast_fp16 = conv(dilations = var_564, groups = var_503, pad = var_566_pad_0, pad_type = var_566_pad_type_0, strides = var_562, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_566_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> k_13_cast_fp16 = mul(x = var_559_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = string("k_13_cast_fp16")];
+            tensor<int32, [2]> var_563 = const()[name = string("op_563"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_565 = const()[name = string("op_565"), val = tensor<int32, [2]>([1, 1])];
+            string var_567_pad_type_0 = const()[name = string("op_567_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_567_pad_0 = const()[name = string("op_567_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_567_cast_fp16 = conv(dilations = var_565, groups = var_503, pad = var_567_pad_0, pad_type = var_567_pad_type_0, strides = var_563, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_567_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303796608)))];
-            tensor<fp16, [1, 4096, 1, 512]> v_9_cast_fp16 = mul(x = var_566_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = string("v_9_cast_fp16")];
-            tensor<int32, [4]> var_568 = const()[name = string("op_568"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> q_15_cast_fp16 = reshape(shape = var_568, x = q_13_cast_fp16)[name = string("q_15_cast_fp16")];
-            tensor<int32, [4]> var_570 = const()[name = string("op_570"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = reshape(shape = var_570, x = k_13_cast_fp16)[name = string("k_15_cast_fp16")];
-            tensor<int32, [4]> var_572 = const()[name = string("op_572"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> v_cast_fp16 = reshape(shape = var_572, x = v_9_cast_fp16)[name = string("v_cast_fp16")];
-            tensor<int32, [4]> var_584_begin_0 = const()[name = string("op_584_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_584_end_0 = const()[name = string("op_584_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_584_end_mask_0 = const()[name = string("op_584_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_584_cast_fp16 = slice_by_index(begin = var_584_begin_0, end = var_584_end_0, end_mask = var_584_end_mask_0, x = q_15_cast_fp16)[name = string("op_584_cast_fp16")];
-            tensor<int32, [4]> var_590_begin_0 = const()[name = string("op_590_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_590_end_0 = const()[name = string("op_590_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_590_end_mask_0 = const()[name = string("op_590_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_590_cast_fp16 = slice_by_index(begin = var_590_begin_0, end = var_590_end_0, end_mask = var_590_end_mask_0, x = q_15_cast_fp16)[name = string("op_590_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> v_13_cast_fp16 = mul(x = var_567_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = string("v_13_cast_fp16")];
+            tensor<int32, [4]> var_569 = const()[name = string("op_569"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> q_15_cast_fp16 = reshape(shape = var_569, x = q_13_cast_fp16)[name = string("q_15_cast_fp16")];
+            tensor<int32, [4]> var_571 = const()[name = string("op_571"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = reshape(shape = var_571, x = k_13_cast_fp16)[name = string("k_15_cast_fp16")];
+            tensor<int32, [4]> var_573 = const()[name = string("op_573"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> v_15_cast_fp16 = reshape(shape = var_573, x = v_13_cast_fp16)[name = string("v_15_cast_fp16")];
+            tensor<int32, [4]> var_585_begin_0 = const()[name = string("op_585_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_585_end_0 = const()[name = string("op_585_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_585_end_mask_0 = const()[name = string("op_585_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_585_cast_fp16 = slice_by_index(begin = var_585_begin_0, end = var_585_end_0, end_mask = var_585_end_mask_0, x = q_15_cast_fp16)[name = string("op_585_cast_fp16")];
+            tensor<int32, [4]> var_591_begin_0 = const()[name = string("op_591_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_591_end_0 = const()[name = string("op_591_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_591_end_mask_0 = const()[name = string("op_591_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_591_cast_fp16 = slice_by_index(begin = var_591_begin_0, end = var_591_end_0, end_mask = var_591_end_mask_0, x = q_15_cast_fp16)[name = string("op_591_cast_fp16")];
             fp16 const_32_promoted_to_fp16 = const()[name = string("const_32_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_592_cast_fp16 = mul(x = var_590_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_592_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_593_cast_fp16 = mul(x = var_591_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_593_cast_fp16")];
             bool rotated_9_interleave_0 = const()[name = string("rotated_9_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_9_cast_fp16 = concat(axis = var_506, interleave = rotated_9_interleave_0, values = (var_592_cast_fp16, var_584_cast_fp16))[name = string("rotated_9_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_595_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = string("op_595_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_596_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = string("op_596_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_9_cast_fp16 = add(x = var_595_cast_fp16, y = var_596_cast_fp16)[name = string("roped_9_cast_fp16")];
-            tensor<int32, [4]> var_609_begin_0 = const()[name = string("op_609_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_609_end_0 = const()[name = string("op_609_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_609_end_mask_0 = const()[name = string("op_609_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_609_cast_fp16 = slice_by_index(begin = var_609_begin_0, end = var_609_end_0, end_mask = var_609_end_mask_0, x = k_15_cast_fp16)[name = string("op_609_cast_fp16")];
-            tensor<int32, [4]> var_615_begin_0 = const()[name = string("op_615_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_615_end_0 = const()[name = string("op_615_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_615_end_mask_0 = const()[name = string("op_615_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_615_cast_fp16 = slice_by_index(begin = var_615_begin_0, end = var_615_end_0, end_mask = var_615_end_mask_0, x = k_15_cast_fp16)[name = string("op_615_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_9_cast_fp16 = concat(axis = var_506, interleave = rotated_9_interleave_0, values = (var_593_cast_fp16, var_585_cast_fp16))[name = string("rotated_9_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_596_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = string("op_596_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_597_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = string("op_597_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_9_cast_fp16 = add(x = var_596_cast_fp16, y = var_597_cast_fp16)[name = string("roped_9_cast_fp16")];
+            tensor<int32, [4]> var_610_begin_0 = const()[name = string("op_610_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_610_end_0 = const()[name = string("op_610_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_610_end_mask_0 = const()[name = string("op_610_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_610_cast_fp16 = slice_by_index(begin = var_610_begin_0, end = var_610_end_0, end_mask = var_610_end_mask_0, x = k_15_cast_fp16)[name = string("op_610_cast_fp16")];
+            tensor<int32, [4]> var_616_begin_0 = const()[name = string("op_616_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_616_end_0 = const()[name = string("op_616_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_616_end_mask_0 = const()[name = string("op_616_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_616_cast_fp16 = slice_by_index(begin = var_616_begin_0, end = var_616_end_0, end_mask = var_616_end_mask_0, x = k_15_cast_fp16)[name = string("op_616_cast_fp16")];
             fp16 const_34_promoted_to_fp16 = const()[name = string("const_34_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_617_cast_fp16 = mul(x = var_615_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_617_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_618_cast_fp16 = mul(x = var_616_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_618_cast_fp16")];
             bool rotated_interleave_0 = const()[name = string("rotated_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_cast_fp16 = concat(axis = var_506, interleave = rotated_interleave_0, values = (var_617_cast_fp16, var_609_cast_fp16))[name = string("rotated_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_620_cast_fp16 = mul(x = k_15_cast_fp16, y = cos)[name = string("op_620_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_621_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_621_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_cast_fp16 = add(x = var_620_cast_fp16, y = var_621_cast_fp16)[name = string("roped_cast_fp16")];
-            bool q_interleave_0 = const()[name = string("q_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> q_cast_fp16 = concat(axis = var_506, interleave = q_interleave_0, values = roped_9_cast_fp16)[name = string("q_cast_fp16")];
-            bool k_interleave_0 = const()[name = string("k_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_cast_fp16 = concat(axis = var_506, interleave = k_interleave_0, values = roped_cast_fp16)[name = string("k_cast_fp16")];
-            tensor<int32, [4]> var_636_begin_0 = const()[name = string("op_636_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_636_end_0 = const()[name = string("op_636_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_636_end_mask_0 = const()[name = string("op_636_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_2 = slice_by_index(begin = var_636_begin_0, end = var_636_end_0, end_mask = var_636_end_mask_0, x = k_cast_fp16)[name = string("op_636_cast_fp16")];
-            tensor<int32, [4]> var_637_begin_0 = const()[name = string("op_637_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_637_end_0 = const()[name = string("op_637_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_637_end_mask_0 = const()[name = string("op_637_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_2 = slice_by_index(begin = var_637_begin_0, end = var_637_end_0, end_mask = var_637_end_mask_0, x = v_cast_fp16)[name = string("op_637_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_cast_fp16 = concat(axis = var_506, interleave = rotated_interleave_0, values = (var_618_cast_fp16, var_610_cast_fp16))[name = string("rotated_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_621_cast_fp16 = mul(x = k_15_cast_fp16, y = cos)[name = string("op_621_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_622_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_622_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_cast_fp16 = add(x = var_621_cast_fp16, y = var_622_cast_fp16)[name = string("roped_cast_fp16")];
+            tensor<int32, [4]> v_perm_0 = const()[name = string("v_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_626_begin_0 = const()[name = string("op_626_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_626_end_0 = const()[name = string("op_626_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_626_end_mask_0 = const()[name = string("op_626_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_2 = slice_by_index(begin = var_626_begin_0, end = var_626_end_0, end_mask = var_626_end_mask_0, x = roped_cast_fp16)[name = string("op_626_cast_fp16")];
+            tensor<int32, [4]> var_627_begin_0 = const()[name = string("op_627_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_627_end_0 = const()[name = string("op_627_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_627_end_mask_0 = const()[name = string("op_627_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 512, 128]> v_cast_fp16 = transpose(perm = v_perm_0, x = v_15_cast_fp16)[name = string("transpose_1")];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_2 = slice_by_index(begin = var_627_begin_0, end = var_627_end_0, end_mask = var_627_end_mask_0, x = v_cast_fp16)[name = string("op_627_cast_fp16")];
             fp16 var_641_to_fp16 = const()[name = string("op_641_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 512]> var_642_cast_fp16 = mul(x = q_cast_fp16, y = var_641_to_fp16)[name = string("op_642_cast_fp16")];
-            bool attn_weights_9_transpose_x_0 = const()[name = string("attn_weights_9_transpose_x_0"), val = bool(true)];
-            bool attn_weights_9_transpose_y_0 = const()[name = string("attn_weights_9_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_9_cast_fp16 = matmul(transpose_x = attn_weights_9_transpose_x_0, transpose_y = attn_weights_9_transpose_y_0, x = var_642_cast_fp16, y = k_cast_fp16)[name = string("attn_weights_9_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_cast_fp16 = add(x = attn_weights_9_cast_fp16, y = mask)[name = string("attn_weights_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> var_650_cast_fp16 = softmax(axis = var_502, x = attn_weights_cast_fp16)[name = string("op_650_cast_fp16")];
-            bool attn_5_transpose_x_0 = const()[name = string("attn_5_transpose_x_0"), val = bool(false)];
-            bool attn_5_transpose_y_0 = const()[name = string("attn_5_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 512]> attn_5_cast_fp16 = matmul(transpose_x = attn_5_transpose_x_0, transpose_y = attn_5_transpose_y_0, x = v_cast_fp16, y = var_650_cast_fp16)[name = string("attn_5_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_642_cast_fp16 = mul(x = roped_9_cast_fp16, y = var_641_to_fp16)[name = string("op_642_cast_fp16")];
+            bool attn_weights_13_transpose_x_0 = const()[name = string("attn_weights_13_transpose_x_0"), val = bool(true)];
+            bool attn_weights_13_transpose_y_0 = const()[name = string("attn_weights_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_13_cast_fp16 = matmul(transpose_x = attn_weights_13_transpose_x_0, transpose_y = attn_weights_13_transpose_y_0, x = var_642_cast_fp16, y = roped_cast_fp16)[name = string("attn_weights_13_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_15_cast_fp16 = add(x = attn_weights_13_cast_fp16, y = mask)[name = string("attn_weights_15_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_cast_fp16 = softmax(axis = var_502, x = attn_weights_15_cast_fp16)[name = string("attn_weights_cast_fp16")];
+            bool var_651_transpose_x_1 = const()[name = string("op_651_transpose_x_1"), val = bool(false)];
+            bool var_651_transpose_y_1 = const()[name = string("op_651_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 32, 512, 128]> var_651_cast_fp16 = matmul(transpose_x = var_651_transpose_x_1, transpose_y = var_651_transpose_y_1, x = attn_weights_cast_fp16, y = v_15_cast_fp16)[name = string("op_651_cast_fp16")];
+            tensor<int32, [4]> attn_5_perm_0 = const()[name = string("attn_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             tensor<int32, [4]> var_654 = const()[name = string("op_654"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 512]> attn_5_cast_fp16 = transpose(perm = attn_5_perm_0, x = var_651_cast_fp16)[name = string("transpose_0")];
             tensor<fp16, [1, 4096, 1, 512]> input_17_cast_fp16 = reshape(shape = var_654, x = attn_5_cast_fp16)[name = string("input_17_cast_fp16")];
             tensor<int32, [2]> var_658 = const()[name = string("op_658"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_660 = const()[name = string("op_660"), val = tensor<int32, [2]>([1, 1])];
diff --git a/sequoia/Llama-2-7b-hf_chunk11.mlmodelc/analytics/coremldata.bin b/sequoia/Llama-2-7b-hf_chunk11.mlmodelc/analytics/coremldata.bin
index e3704b470dad34135a6b5cb4b471021bad5c3ae2..65ae082f31459bf8b09913d8861a15601cc13ad1 100644
--- a/sequoia/Llama-2-7b-hf_chunk11.mlmodelc/analytics/coremldata.bin
+++ b/sequoia/Llama-2-7b-hf_chunk11.mlmodelc/analytics/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:84e317a82cdf4e96f808f63e77f10098844d47ad522545181edfac4d287c9c92
+oid sha256:e69e7ad37dd59e97348d395eec9b4c41b7d3ea44d86f613751ae47803a0a2efe
 size 243
diff --git a/sequoia/Llama-2-7b-hf_chunk11.mlmodelc/coremldata.bin b/sequoia/Llama-2-7b-hf_chunk11.mlmodelc/coremldata.bin
index 8bbc6dd38c74fe23594355e106f197518d41765b..a503721fa97147a06f91e834353053693378b0ad 100644
--- a/sequoia/Llama-2-7b-hf_chunk11.mlmodelc/coremldata.bin
+++ b/sequoia/Llama-2-7b-hf_chunk11.mlmodelc/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e430d0795ff5c384187174f5718a2c13d0070f5d6a811831e18862497865a86d
+oid sha256:d35a0353bcfa501579e07af3718261af3b129b4bec004c1fe6d812a6403a3f5b
 size 1037
diff --git a/sequoia/Llama-2-7b-hf_chunk11.mlmodelc/metadata.json b/sequoia/Llama-2-7b-hf_chunk11.mlmodelc/metadata.json
index e2543d3f3d06f50caf2f0d50170317e9648be15e..10f9ad4cb6b1b8411f8cdfd4a1cffcd37f2b83ef 100644
--- a/sequoia/Llama-2-7b-hf_chunk11.mlmodelc/metadata.json
+++ b/sequoia/Llama-2-7b-hf_chunk11.mlmodelc/metadata.json
@@ -17,9 +17,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 128, 508]",
         "name" : "new_k_cache_0",
         "type" : "MultiArray"
       },
@@ -27,9 +27,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 128, 508]",
         "name" : "new_k_cache_1",
         "type" : "MultiArray"
       },
@@ -37,9 +37,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 128, 508]",
         "name" : "new_k_cache_2",
         "type" : "MultiArray"
       },
@@ -47,9 +47,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 508, 128]",
         "name" : "new_v_cache_0",
         "type" : "MultiArray"
       },
@@ -57,9 +57,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 508, 128]",
         "name" : "new_v_cache_1",
         "type" : "MultiArray"
       },
@@ -67,9 +67,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 508, 128]",
         "name" : "new_v_cache_2",
         "type" : "MultiArray"
       }
@@ -142,9 +142,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_0",
             "type" : "MultiArray"
           },
@@ -152,9 +152,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_1",
             "type" : "MultiArray"
           },
@@ -162,9 +162,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_2",
             "type" : "MultiArray"
           },
@@ -172,9 +172,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_0",
             "type" : "MultiArray"
           },
@@ -182,9 +182,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_1",
             "type" : "MultiArray"
           },
@@ -192,9 +192,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_2",
             "type" : "MultiArray"
           }
@@ -203,14 +203,15 @@
         "mlProgramOperationTypeHistogram" : {
           "Ios18.constexprLutToDense" : 21,
           "Ios18.conv" : 21,
-          "Ios18.matmul" : 6,
           "Ios18.expandDims" : 6,
-          "Ios18.concat" : 18,
+          "Ios18.matmul" : 6,
+          "Ios18.concat" : 12,
           "Ios18.add" : 15,
           "Ios18.realDiv" : 6,
           "Ios18.silu" : 3,
           "Ios18.softmax" : 3,
           "Ios18.sliceByIndex" : 18,
+          "Ios18.transpose" : 6,
           "Ios16.reduceL2Norm" : 6,
           "Ios18.squeeze" : 6,
           "Ios18.reshape" : 12,
@@ -223,9 +224,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 1)",
+            "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 4)",
             "shortDescription" : "",
-            "shape" : "[1, 4096, 1, 1]",
+            "shape" : "[1, 4096, 1, 4]",
             "name" : "x",
             "type" : "MultiArray"
           },
@@ -233,9 +234,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 128 × 1)",
+            "formattedType" : "MultiArray (Float16 128 × 4)",
             "shortDescription" : "",
-            "shape" : "[128, 1]",
+            "shape" : "[128, 4]",
             "name" : "cos",
             "type" : "MultiArray"
           },
@@ -243,9 +244,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 128 × 1)",
+            "formattedType" : "MultiArray (Float16 128 × 4)",
             "shortDescription" : "",
-            "shape" : "[128, 1]",
+            "shape" : "[128, 4]",
             "name" : "sin",
             "type" : "MultiArray"
           },
@@ -253,9 +254,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 1 × 1 × 512)",
+            "formattedType" : "MultiArray (Float16 1 × 1 × 4 × 512)",
             "shortDescription" : "",
-            "shape" : "[1, 1, 1, 512]",
+            "shape" : "[1, 1, 4, 512]",
             "name" : "mask",
             "type" : "MultiArray"
           },
@@ -263,9 +264,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "k_cache_0",
             "type" : "MultiArray"
           },
@@ -273,9 +274,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "v_cache_0",
             "type" : "MultiArray"
           },
@@ -283,9 +284,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "k_cache_1",
             "type" : "MultiArray"
           },
@@ -293,9 +294,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "v_cache_1",
             "type" : "MultiArray"
           },
@@ -303,9 +304,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "k_cache_2",
             "type" : "MultiArray"
           },
@@ -313,9 +314,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "v_cache_2",
             "type" : "MultiArray"
           }
@@ -330,9 +331,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 1)",
+            "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 4)",
             "shortDescription" : "",
-            "shape" : "[1, 4096, 1, 1]",
+            "shape" : "[1, 4096, 1, 4]",
             "name" : "new_x",
             "type" : "MultiArray"
           },
@@ -340,9 +341,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_0",
             "type" : "MultiArray"
           },
@@ -350,9 +351,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_1",
             "type" : "MultiArray"
           },
@@ -360,9 +361,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_2",
             "type" : "MultiArray"
           },
@@ -370,9 +371,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_0",
             "type" : "MultiArray"
           },
@@ -380,9 +381,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_1",
             "type" : "MultiArray"
           },
@@ -390,9 +391,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_2",
             "type" : "MultiArray"
           }
@@ -401,14 +402,15 @@
         "mlProgramOperationTypeHistogram" : {
           "Ios18.constexprLutToDense" : 21,
           "Ios18.conv" : 21,
-          "Ios18.matmul" : 6,
           "Ios18.expandDims" : 6,
+          "Ios18.matmul" : 6,
           "Ios18.concat" : 18,
           "Ios18.add" : 15,
           "Ios18.realDiv" : 6,
           "Ios18.silu" : 3,
           "Ios18.softmax" : 3,
           "Ios18.sliceByIndex" : 18,
+          "Ios18.transpose" : 6,
           "Ios16.reduceL2Norm" : 6,
           "Ios18.squeeze" : 6,
           "Ios18.reshape" : 12,
@@ -419,14 +421,15 @@
     "mlProgramOperationTypeHistogram" : {
       "Ios18.constexprLutToDense" : 21,
       "Ios18.conv" : 21,
-      "Ios18.matmul" : 6,
       "Ios18.expandDims" : 6,
-      "Ios18.concat" : 18,
+      "Ios18.matmul" : 6,
+      "Ios18.concat" : 12,
       "Ios18.add" : 15,
       "Ios18.realDiv" : 6,
       "Ios18.silu" : 3,
       "Ios18.softmax" : 3,
       "Ios18.sliceByIndex" : 18,
+      "Ios18.transpose" : 6,
       "Ios16.reduceL2Norm" : 6,
       "Ios18.squeeze" : 6,
       "Ios18.reshape" : 12,
@@ -491,7 +494,7 @@
       }
     ],
     "defaultFunctionName" : "input_512_context_512",
-    "generatedClassName" : "Llama_2_7b_hf_2024_07_02_20_36_17_merged_chunk11",
+    "generatedClassName" : "Llama_2_7b_hf_2024_07_17_19_34_17_merged_chunk11",
     "userDefinedMetadata" : {
 
     },
diff --git a/sequoia/Llama-2-7b-hf_chunk11.mlmodelc/model.mil b/sequoia/Llama-2-7b-hf_chunk11.mlmodelc/model.mil
index 5b7b1db6b14fe10ff51aaa601d8484d9ff49576b..85f75a6da9054155e32013d96460963c79fba3f8 100644
--- a/sequoia/Llama-2-7b-hf_chunk11.mlmodelc/model.mil
+++ b/sequoia/Llama-2-7b-hf_chunk11.mlmodelc/model.mil
@@ -1,7 +1,7 @@
 program(1.3)
-[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3400.34.1"}, {"coremlc-version", "3400.42.1"}})]
+[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3400.42.1"}, {"coremlc-version", "3400.51.1"}})]
 {
-    func input_1_context_512<ios18>(tensor<fp16, [128, 1]> cos, tensor<fp16, [1, 32, 128, 511]> k_cache_0, tensor<fp16, [1, 32, 128, 511]> k_cache_1, tensor<fp16, [1, 32, 128, 511]> k_cache_2, tensor<fp16, [1, 1, 1, 512]> mask, tensor<fp16, [128, 1]> sin, tensor<fp16, [1, 32, 128, 511]> v_cache_0, tensor<fp16, [1, 32, 128, 511]> v_cache_1, tensor<fp16, [1, 32, 128, 511]> v_cache_2, tensor<fp16, [1, 4096, 1, 1]> x) [CoreML_InputDefaultValues = dict<string, fp32>({{"k_cache_0", 0}, {"k_cache_1", 0}, {"k_cache_2", 0}, {"v_cache_0", 0}, {"v_cache_1", 0}, {"v_cache_2", 0}})] {
+    func input_1_context_512<ios18>(tensor<fp16, [128, 4]> cos, tensor<fp16, [1, 32, 128, 508]> k_cache_0, tensor<fp16, [1, 32, 128, 508]> k_cache_1, tensor<fp16, [1, 32, 128, 508]> k_cache_2, tensor<fp16, [1, 1, 4, 512]> mask, tensor<fp16, [128, 4]> sin, tensor<fp16, [1, 32, 508, 128]> v_cache_0, tensor<fp16, [1, 32, 508, 128]> v_cache_1, tensor<fp16, [1, 32, 508, 128]> v_cache_2, tensor<fp16, [1, 4096, 1, 4]> x) [CoreML_InputDefaultValues = dict<string, fp32>({{"k_cache_0", 0}, {"k_cache_1", 0}, {"k_cache_2", 0}, {"v_cache_0", 0}, {"v_cache_1", 0}, {"v_cache_2", 0}})] {
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_q_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303873792))))[name = string("blocks_0_attn_q_proj_weight_palettized_cast_fp16")];
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_k_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8388864))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303873920))))[name = string("blocks_0_attn_k_proj_weight_palettized_cast_fp16")];
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_v_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(16777664))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303874048))))[name = string("blocks_0_attn_v_proj_weight_palettized_cast_fp16")];
@@ -29,438 +29,450 @@ program(1.3)
             int32 var_34 = const()[name = string("op_34"), val = int32(-2)];
             bool var_35 = const()[name = string("op_35"), val = bool(true)];
             tensor<int32, [1]> var_53_axes_0 = const()[name = string("op_53_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_53_cast_fp16 = squeeze(axes = var_53_axes_0, x = x)[name = string("op_53_cast_fp16")];
+            tensor<fp16, [1, 4096, 4]> var_53_cast_fp16 = squeeze(axes = var_53_axes_0, x = x)[name = string("op_53_cast_fp16")];
             bool var_55_interleave_0 = const()[name = string("op_55_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_1_to_fp16 = const()[name = string("eps_chan_1_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_55_cast_fp16 = concat(axis = var_31, interleave = var_55_interleave_0, values = (var_53_cast_fp16, eps_chan_1_to_fp16))[name = string("op_55_cast_fp16")];
+            tensor<fp16, [1, 1, 4]> eps_chan_1_to_fp16 = const()[name = string("eps_chan_1_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_55_cast_fp16 = concat(axis = var_31, interleave = var_55_interleave_0, values = (var_53_cast_fp16, eps_chan_1_to_fp16))[name = string("op_55_cast_fp16")];
             tensor<int32, [1]> x_eps_1_axes_0 = const()[name = string("x_eps_1_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_1_cast_fp16 = expand_dims(axes = x_eps_1_axes_0, x = var_55_cast_fp16)[name = string("x_eps_1_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_1_cast_fp16 = expand_dims(axes = x_eps_1_axes_0, x = var_55_cast_fp16)[name = string("x_eps_1_cast_fp16")];
             tensor<int32, [1]> norm_x_1_axes_0 = const()[name = string("norm_x_1_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_1_cast_fp16 = reduce_l2_norm(axes = norm_x_1_axes_0, keep_dims = var_35, x = x_eps_1_cast_fp16)[name = string("norm_x_1_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_1_cast_fp16 = real_div(x = x, y = norm_x_1_cast_fp16)[name = string("x_normed_1_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_1_cast_fp16 = reduce_l2_norm(axes = norm_x_1_axes_0, keep_dims = var_35, x = x_eps_1_cast_fp16)[name = string("norm_x_1_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_1_cast_fp16 = real_div(x = x, y = norm_x_1_cast_fp16)[name = string("x_normed_1_cast_fp16")];
             fp16 var_60_to_fp16 = const()[name = string("op_60_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_60_to_fp16)[name = string("x_normed_3_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_60_to_fp16)[name = string("x_normed_3_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_1_weight_to_fp16 = const()[name = string("blocks_0_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303567936)))];
-            tensor<fp16, [1, 4096, 1, 1]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = string("x_5_cast_fp16")];
-            tensor<int32, [2]> var_72 = const()[name = string("op_72"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_74 = const()[name = string("op_74"), val = tensor<int32, [2]>([1, 1])];
-            string var_76_pad_type_0 = const()[name = string("op_76_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_76_pad_0 = const()[name = string("op_76_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_76_cast_fp16 = conv(dilations = var_74, groups = var_31, pad = var_76_pad_0, pad_type = var_76_pad_type_0, strides = var_72, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_76_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = string("x_5_cast_fp16")];
+            tensor<int32, [2]> var_73 = const()[name = string("op_73"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_75 = const()[name = string("op_75"), val = tensor<int32, [2]>([1, 1])];
+            string var_77_pad_type_0 = const()[name = string("op_77_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_77_pad_0 = const()[name = string("op_77_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_77_cast_fp16 = conv(dilations = var_75, groups = var_31, pad = var_77_pad_0, pad_type = var_77_pad_type_0, strides = var_73, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_77_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303576192)))];
-            tensor<fp16, [1, 4096, 1, 1]> q_1_cast_fp16 = mul(x = var_76_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
-            tensor<int32, [2]> var_80 = const()[name = string("op_80"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_82 = const()[name = string("op_82"), val = tensor<int32, [2]>([1, 1])];
-            string var_84_pad_type_0 = const()[name = string("op_84_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_84_pad_0 = const()[name = string("op_84_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_84_cast_fp16 = conv(dilations = var_82, groups = var_31, pad = var_84_pad_0, pad_type = var_84_pad_type_0, strides = var_80, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_84_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> q_1_cast_fp16 = mul(x = var_77_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
+            tensor<int32, [2]> var_81 = const()[name = string("op_81"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_83 = const()[name = string("op_83"), val = tensor<int32, [2]>([1, 1])];
+            string var_85_pad_type_0 = const()[name = string("op_85_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_85_pad_0 = const()[name = string("op_85_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_85_cast_fp16 = conv(dilations = var_83, groups = var_31, pad = var_85_pad_0, pad_type = var_85_pad_type_0, strides = var_81, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_85_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303584448)))];
-            tensor<fp16, [1, 4096, 1, 1]> k_1_cast_fp16 = mul(x = var_84_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
-            tensor<int32, [2]> var_88 = const()[name = string("op_88"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_90 = const()[name = string("op_90"), val = tensor<int32, [2]>([1, 1])];
-            string var_92_pad_type_0 = const()[name = string("op_92_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_92_pad_0 = const()[name = string("op_92_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_92_cast_fp16 = conv(dilations = var_90, groups = var_31, pad = var_92_pad_0, pad_type = var_92_pad_type_0, strides = var_88, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_92_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> k_1_cast_fp16 = mul(x = var_85_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
+            tensor<int32, [2]> var_89 = const()[name = string("op_89"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_91 = const()[name = string("op_91"), val = tensor<int32, [2]>([1, 1])];
+            string var_93_pad_type_0 = const()[name = string("op_93_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_93_pad_0 = const()[name = string("op_93_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_93_cast_fp16 = conv(dilations = var_91, groups = var_31, pad = var_93_pad_0, pad_type = var_93_pad_type_0, strides = var_89, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_93_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303592704)))];
-            tensor<fp16, [1, 4096, 1, 1]> v_1_cast_fp16 = mul(x = var_92_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
-            tensor<int32, [4]> var_94 = const()[name = string("op_94"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> q_3_cast_fp16 = reshape(shape = var_94, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
-            tensor<int32, [4]> var_96 = const()[name = string("op_96"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> k_3_cast_fp16 = reshape(shape = var_96, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
-            tensor<int32, [4]> var_98 = const()[name = string("op_98"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> v_3_cast_fp16 = reshape(shape = var_98, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
-            tensor<int32, [4]> var_110_begin_0 = const()[name = string("op_110_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_110_end_0 = const()[name = string("op_110_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_110_end_mask_0 = const()[name = string("op_110_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_110_cast_fp16 = slice_by_index(begin = var_110_begin_0, end = var_110_end_0, end_mask = var_110_end_mask_0, x = q_3_cast_fp16)[name = string("op_110_cast_fp16")];
-            tensor<int32, [4]> var_116_begin_0 = const()[name = string("op_116_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_116_end_0 = const()[name = string("op_116_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_116_end_mask_0 = const()[name = string("op_116_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_116_cast_fp16 = slice_by_index(begin = var_116_begin_0, end = var_116_end_0, end_mask = var_116_end_mask_0, x = q_3_cast_fp16)[name = string("op_116_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> v_1_cast_fp16 = mul(x = var_93_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
+            tensor<int32, [4]> var_95 = const()[name = string("op_95"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> q_3_cast_fp16 = reshape(shape = var_95, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
+            tensor<int32, [4]> var_97 = const()[name = string("op_97"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> k_3_cast_fp16 = reshape(shape = var_97, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
+            tensor<int32, [4]> var_99 = const()[name = string("op_99"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> v_3_cast_fp16 = reshape(shape = var_99, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
+            tensor<int32, [4]> var_111_begin_0 = const()[name = string("op_111_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_111_end_0 = const()[name = string("op_111_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_111_end_mask_0 = const()[name = string("op_111_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_111_cast_fp16 = slice_by_index(begin = var_111_begin_0, end = var_111_end_0, end_mask = var_111_end_mask_0, x = q_3_cast_fp16)[name = string("op_111_cast_fp16")];
+            tensor<int32, [4]> var_117_begin_0 = const()[name = string("op_117_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_117_end_0 = const()[name = string("op_117_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_117_end_mask_0 = const()[name = string("op_117_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_117_cast_fp16 = slice_by_index(begin = var_117_begin_0, end = var_117_end_0, end_mask = var_117_end_mask_0, x = q_3_cast_fp16)[name = string("op_117_cast_fp16")];
             fp16 const_6_promoted_to_fp16 = const()[name = string("const_6_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_118_cast_fp16 = mul(x = var_116_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_118_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_119_cast_fp16 = mul(x = var_117_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_119_cast_fp16")];
             bool rotated_1_interleave_0 = const()[name = string("rotated_1_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_1_cast_fp16 = concat(axis = var_34, interleave = rotated_1_interleave_0, values = (var_118_cast_fp16, var_110_cast_fp16))[name = string("rotated_1_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_121_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_121_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_122_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_122_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_1_cast_fp16 = add(x = var_121_cast_fp16, y = var_122_cast_fp16)[name = string("roped_1_cast_fp16")];
-            tensor<int32, [4]> var_135_begin_0 = const()[name = string("op_135_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_135_end_0 = const()[name = string("op_135_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_135_end_mask_0 = const()[name = string("op_135_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_135_cast_fp16 = slice_by_index(begin = var_135_begin_0, end = var_135_end_0, end_mask = var_135_end_mask_0, x = k_3_cast_fp16)[name = string("op_135_cast_fp16")];
-            tensor<int32, [4]> var_141_begin_0 = const()[name = string("op_141_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_141_end_0 = const()[name = string("op_141_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_141_end_mask_0 = const()[name = string("op_141_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_141_cast_fp16 = slice_by_index(begin = var_141_begin_0, end = var_141_end_0, end_mask = var_141_end_mask_0, x = k_3_cast_fp16)[name = string("op_141_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_1_cast_fp16 = concat(axis = var_34, interleave = rotated_1_interleave_0, values = (var_119_cast_fp16, var_111_cast_fp16))[name = string("rotated_1_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_122_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_122_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_123_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_123_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_1_cast_fp16 = add(x = var_122_cast_fp16, y = var_123_cast_fp16)[name = string("roped_1_cast_fp16")];
+            tensor<int32, [4]> var_136_begin_0 = const()[name = string("op_136_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_136_end_0 = const()[name = string("op_136_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_136_end_mask_0 = const()[name = string("op_136_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_136_cast_fp16 = slice_by_index(begin = var_136_begin_0, end = var_136_end_0, end_mask = var_136_end_mask_0, x = k_3_cast_fp16)[name = string("op_136_cast_fp16")];
+            tensor<int32, [4]> var_142_begin_0 = const()[name = string("op_142_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_142_end_0 = const()[name = string("op_142_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_142_end_mask_0 = const()[name = string("op_142_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_142_cast_fp16 = slice_by_index(begin = var_142_begin_0, end = var_142_end_0, end_mask = var_142_end_mask_0, x = k_3_cast_fp16)[name = string("op_142_cast_fp16")];
             fp16 const_8_promoted_to_fp16 = const()[name = string("const_8_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_143_cast_fp16 = mul(x = var_141_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_143_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_144_cast_fp16 = mul(x = var_142_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_144_cast_fp16")];
             bool rotated_3_interleave_0 = const()[name = string("rotated_3_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_3_cast_fp16 = concat(axis = var_34, interleave = rotated_3_interleave_0, values = (var_143_cast_fp16, var_135_cast_fp16))[name = string("rotated_3_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_146_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_146_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_147_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_147_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_3_cast_fp16 = add(x = var_146_cast_fp16, y = var_147_cast_fp16)[name = string("roped_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_3_cast_fp16 = concat(axis = var_34, interleave = rotated_3_interleave_0, values = (var_144_cast_fp16, var_136_cast_fp16))[name = string("rotated_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_147_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_147_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_148_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_148_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_3_cast_fp16 = add(x = var_147_cast_fp16, y = var_148_cast_fp16)[name = string("roped_3_cast_fp16")];
+            tensor<int32, [4]> v_5_perm_0 = const()[name = string("v_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             bool k_7_interleave_0 = const()[name = string("k_7_interleave_0"), val = bool(false)];
             tensor<fp16, [1, 32, 128, 512]> k_7_cast_fp16 = concat(axis = var_22, interleave = k_7_interleave_0, values = (k_cache_0, roped_3_cast_fp16))[name = string("k_7_cast_fp16")];
-            bool v_5_interleave_0 = const()[name = string("v_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> v_5_cast_fp16 = concat(axis = var_22, interleave = v_5_interleave_0, values = (v_cache_0, v_3_cast_fp16))[name = string("v_5_cast_fp16")];
-            tensor<int32, [4]> var_154_begin_0 = const()[name = string("op_154_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_154_end_0 = const()[name = string("op_154_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_154_end_mask_0 = const()[name = string("op_154_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_0 = slice_by_index(begin = var_154_begin_0, end = var_154_end_0, end_mask = var_154_end_mask_0, x = k_7_cast_fp16)[name = string("op_154_cast_fp16")];
-            tensor<int32, [4]> var_155_begin_0 = const()[name = string("op_155_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_155_end_0 = const()[name = string("op_155_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_155_end_mask_0 = const()[name = string("op_155_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_0 = slice_by_index(begin = var_155_begin_0, end = var_155_end_0, end_mask = var_155_end_mask_0, x = v_5_cast_fp16)[name = string("op_155_cast_fp16")];
-            fp16 var_159_to_fp16 = const()[name = string("op_159_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 1]> var_160_cast_fp16 = mul(x = roped_1_cast_fp16, y = var_159_to_fp16)[name = string("op_160_cast_fp16")];
+            bool v_7_interleave_0 = const()[name = string("v_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> v_5_cast_fp16 = transpose(perm = v_5_perm_0, x = v_3_cast_fp16)[name = string("transpose_5")];
+            tensor<fp16, [1, 32, 512, 128]> v_7_cast_fp16 = concat(axis = var_34, interleave = v_7_interleave_0, values = (v_cache_0, v_5_cast_fp16))[name = string("v_7_cast_fp16")];
+            tensor<int32, [4]> var_159_begin_0 = const()[name = string("op_159_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_159_end_0 = const()[name = string("op_159_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_159_end_mask_0 = const()[name = string("op_159_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_0 = slice_by_index(begin = var_159_begin_0, end = var_159_end_0, end_mask = var_159_end_mask_0, x = k_7_cast_fp16)[name = string("op_159_cast_fp16")];
+            tensor<int32, [4]> var_160_begin_0 = const()[name = string("op_160_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_160_end_0 = const()[name = string("op_160_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_160_end_mask_0 = const()[name = string("op_160_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_0 = slice_by_index(begin = var_160_begin_0, end = var_160_end_0, end_mask = var_160_end_mask_0, x = v_7_cast_fp16)[name = string("op_160_cast_fp16")];
+            fp16 var_165_to_fp16 = const()[name = string("op_165_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 32, 128, 4]> var_166_cast_fp16 = mul(x = roped_1_cast_fp16, y = var_165_to_fp16)[name = string("op_166_cast_fp16")];
             bool attn_weights_1_transpose_x_0 = const()[name = string("attn_weights_1_transpose_x_0"), val = bool(true)];
             bool attn_weights_1_transpose_y_0 = const()[name = string("attn_weights_1_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_160_cast_fp16, y = k_7_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = mask)[name = string("attn_weights_3_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> var_168_cast_fp16 = softmax(axis = var_30, x = attn_weights_3_cast_fp16)[name = string("op_168_cast_fp16")];
-            bool attn_1_transpose_x_0 = const()[name = string("attn_1_transpose_x_0"), val = bool(false)];
-            bool attn_1_transpose_y_0 = const()[name = string("attn_1_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 1]> attn_1_cast_fp16 = matmul(transpose_x = attn_1_transpose_x_0, transpose_y = attn_1_transpose_y_0, x = v_5_cast_fp16, y = var_168_cast_fp16)[name = string("attn_1_cast_fp16")];
-            tensor<int32, [4]> var_172 = const()[name = string("op_172"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
-            tensor<fp16, [1, 4096, 1, 1]> input_1_cast_fp16 = reshape(shape = var_172, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
-            tensor<int32, [2]> var_176 = const()[name = string("op_176"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_178 = const()[name = string("op_178"), val = tensor<int32, [2]>([1, 1])];
-            string var_180_pad_type_0 = const()[name = string("op_180_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_180_pad_0 = const()[name = string("op_180_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_180_cast_fp16 = conv(dilations = var_178, groups = var_31, pad = var_180_pad_0, pad_type = var_180_pad_type_0, strides = var_176, weight = blocks_0_attn_proj_weight_palettized_cast_fp16, x = input_1_cast_fp16)[name = string("op_180_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_166_cast_fp16, y = k_7_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = mask)[name = string("attn_weights_3_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_5_cast_fp16 = softmax(axis = var_30, x = attn_weights_3_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
+            bool var_175_transpose_x_0 = const()[name = string("op_175_transpose_x_0"), val = bool(false)];
+            bool var_175_transpose_y_0 = const()[name = string("op_175_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> var_175_cast_fp16 = matmul(transpose_x = var_175_transpose_x_0, transpose_y = var_175_transpose_y_0, x = attn_weights_5_cast_fp16, y = v_7_cast_fp16)[name = string("op_175_cast_fp16")];
+            tensor<int32, [4]> attn_1_perm_0 = const()[name = string("attn_1_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_178 = const()[name = string("op_178"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 4]> attn_1_cast_fp16 = transpose(perm = attn_1_perm_0, x = var_175_cast_fp16)[name = string("transpose_4")];
+            tensor<fp16, [1, 4096, 1, 4]> input_1_cast_fp16 = reshape(shape = var_178, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
+            tensor<int32, [2]> var_182 = const()[name = string("op_182"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_184 = const()[name = string("op_184"), val = tensor<int32, [2]>([1, 1])];
+            string var_186_pad_type_0 = const()[name = string("op_186_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_186_pad_0 = const()[name = string("op_186_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_186_cast_fp16 = conv(dilations = var_184, groups = var_31, pad = var_186_pad_0, pad_type = var_186_pad_type_0, strides = var_182, weight = blocks_0_attn_proj_weight_palettized_cast_fp16, x = input_1_cast_fp16)[name = string("op_186_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303600960)))];
-            tensor<fp16, [1, 4096, 1, 1]> attention_output_1_cast_fp16 = mul(x = var_180_cast_fp16, y = blocks_0_attn_proj_output_scales_to_fp16)[name = string("attention_output_1_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_11_cast_fp16 = add(x = attention_output_1_cast_fp16, y = x)[name = string("x_11_cast_fp16")];
-            tensor<int32, [1]> var_199_axes_0 = const()[name = string("op_199_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_199_cast_fp16 = squeeze(axes = var_199_axes_0, x = x_11_cast_fp16)[name = string("op_199_cast_fp16")];
-            bool var_201_interleave_0 = const()[name = string("op_201_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_3_to_fp16 = const()[name = string("eps_chan_3_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_201_cast_fp16 = concat(axis = var_31, interleave = var_201_interleave_0, values = (var_199_cast_fp16, eps_chan_3_to_fp16))[name = string("op_201_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> attention_output_1_cast_fp16 = mul(x = var_186_cast_fp16, y = blocks_0_attn_proj_output_scales_to_fp16)[name = string("attention_output_1_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_11_cast_fp16 = add(x = attention_output_1_cast_fp16, y = x)[name = string("x_11_cast_fp16")];
+            tensor<int32, [1]> var_205_axes_0 = const()[name = string("op_205_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_205_cast_fp16 = squeeze(axes = var_205_axes_0, x = x_11_cast_fp16)[name = string("op_205_cast_fp16")];
+            bool var_207_interleave_0 = const()[name = string("op_207_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_3_to_fp16 = const()[name = string("eps_chan_3_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_207_cast_fp16 = concat(axis = var_31, interleave = var_207_interleave_0, values = (var_205_cast_fp16, eps_chan_3_to_fp16))[name = string("op_207_cast_fp16")];
             tensor<int32, [1]> x_eps_3_axes_0 = const()[name = string("x_eps_3_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_3_cast_fp16 = expand_dims(axes = x_eps_3_axes_0, x = var_201_cast_fp16)[name = string("x_eps_3_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_3_cast_fp16 = expand_dims(axes = x_eps_3_axes_0, x = var_207_cast_fp16)[name = string("x_eps_3_cast_fp16")];
             tensor<int32, [1]> norm_x_3_axes_0 = const()[name = string("norm_x_3_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_3_cast_fp16 = reduce_l2_norm(axes = norm_x_3_axes_0, keep_dims = var_35, x = x_eps_3_cast_fp16)[name = string("norm_x_3_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_7_cast_fp16 = real_div(x = x_11_cast_fp16, y = norm_x_3_cast_fp16)[name = string("x_normed_7_cast_fp16")];
-            fp16 var_206_to_fp16 = const()[name = string("op_206_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_9_cast_fp16 = mul(x = x_normed_7_cast_fp16, y = var_206_to_fp16)[name = string("x_normed_9_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_3_cast_fp16 = reduce_l2_norm(axes = norm_x_3_axes_0, keep_dims = var_35, x = x_eps_3_cast_fp16)[name = string("norm_x_3_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_7_cast_fp16 = real_div(x = x_11_cast_fp16, y = norm_x_3_cast_fp16)[name = string("x_normed_7_cast_fp16")];
+            fp16 var_212_to_fp16 = const()[name = string("op_212_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_9_cast_fp16 = mul(x = x_normed_7_cast_fp16, y = var_212_to_fp16)[name = string("x_normed_9_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_2_weight_to_fp16 = const()[name = string("blocks_0_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303609216)))];
-            tensor<fp16, [1, 4096, 1, 1]> input_3_cast_fp16 = mul(x = x_normed_9_cast_fp16, y = blocks_0_norm_2_weight_to_fp16)[name = string("input_3_cast_fp16")];
-            tensor<int32, [2]> var_218 = const()[name = string("op_218"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_220 = const()[name = string("op_220"), val = tensor<int32, [2]>([1, 1])];
-            string var_222_pad_type_0 = const()[name = string("op_222_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_222_pad_0 = const()[name = string("op_222_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_222_cast_fp16 = conv(dilations = var_220, groups = var_31, pad = var_222_pad_0, pad_type = var_222_pad_type_0, strides = var_218, weight = blocks_0_mlp_fc_1_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_222_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303617472)))];
-            tensor<fp16, [1, 11008, 1, 1]> input_5_cast_fp16 = mul(x = var_222_cast_fp16, y = blocks_0_mlp_fc_1_output_scales_to_fp16)[name = string("input_5_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> input_3_cast_fp16 = mul(x = x_normed_9_cast_fp16, y = blocks_0_norm_2_weight_to_fp16)[name = string("input_3_cast_fp16")];
+            tensor<int32, [2]> var_224 = const()[name = string("op_224"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_226 = const()[name = string("op_226"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_228 = const()[name = string("op_228"), val = tensor<int32, [2]>([1, 1])];
-            string var_230_pad_type_0 = const()[name = string("op_230_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_230_pad_0 = const()[name = string("op_230_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_230_cast_fp16 = conv(dilations = var_228, groups = var_31, pad = var_230_pad_0, pad_type = var_230_pad_type_0, strides = var_226, weight = blocks_0_mlp_fc_2_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_230_cast_fp16")];
+            string var_228_pad_type_0 = const()[name = string("op_228_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_228_pad_0 = const()[name = string("op_228_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_228_cast_fp16 = conv(dilations = var_226, groups = var_31, pad = var_228_pad_0, pad_type = var_228_pad_type_0, strides = var_224, weight = blocks_0_mlp_fc_1_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_228_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303617472)))];
+            tensor<fp16, [1, 11008, 1, 4]> input_5_cast_fp16 = mul(x = var_228_cast_fp16, y = blocks_0_mlp_fc_1_output_scales_to_fp16)[name = string("input_5_cast_fp16")];
+            tensor<int32, [2]> var_232 = const()[name = string("op_232"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_234 = const()[name = string("op_234"), val = tensor<int32, [2]>([1, 1])];
+            string var_236_pad_type_0 = const()[name = string("op_236_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_236_pad_0 = const()[name = string("op_236_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_236_cast_fp16 = conv(dilations = var_234, groups = var_31, pad = var_236_pad_0, pad_type = var_236_pad_type_0, strides = var_232, weight = blocks_0_mlp_fc_2_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_236_cast_fp16")];
             tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303639552)))];
-            tensor<fp16, [1, 11008, 1, 1]> x_fc_2_1_cast_fp16 = mul(x = var_230_cast_fp16, y = blocks_0_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_1_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> var_232_cast_fp16 = silu(x = input_5_cast_fp16)[name = string("op_232_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> input_7_cast_fp16 = mul(x = var_232_cast_fp16, y = x_fc_2_1_cast_fp16)[name = string("input_7_cast_fp16")];
-            tensor<int32, [2]> var_236 = const()[name = string("op_236"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_238 = const()[name = string("op_238"), val = tensor<int32, [2]>([1, 1])];
-            string var_240_pad_type_0 = const()[name = string("op_240_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_240_pad_0 = const()[name = string("op_240_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_240_cast_fp16 = conv(dilations = var_238, groups = var_31, pad = var_240_pad_0, pad_type = var_240_pad_type_0, strides = var_236, weight = blocks_0_mlp_proj_weight_palettized_cast_fp16, x = input_7_cast_fp16)[name = string("op_240_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> x_fc_2_1_cast_fp16 = mul(x = var_236_cast_fp16, y = blocks_0_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_1_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> var_238_cast_fp16 = silu(x = input_5_cast_fp16)[name = string("op_238_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> input_7_cast_fp16 = mul(x = var_238_cast_fp16, y = x_fc_2_1_cast_fp16)[name = string("input_7_cast_fp16")];
+            tensor<int32, [2]> var_242 = const()[name = string("op_242"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_244 = const()[name = string("op_244"), val = tensor<int32, [2]>([1, 1])];
+            string var_246_pad_type_0 = const()[name = string("op_246_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_246_pad_0 = const()[name = string("op_246_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_246_cast_fp16 = conv(dilations = var_244, groups = var_31, pad = var_246_pad_0, pad_type = var_246_pad_type_0, strides = var_242, weight = blocks_0_mlp_proj_weight_palettized_cast_fp16, x = input_7_cast_fp16)[name = string("op_246_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303661632)))];
-            tensor<fp16, [1, 4096, 1, 1]> var_241_cast_fp16 = mul(x = var_240_cast_fp16, y = blocks_0_mlp_proj_output_scales_to_fp16)[name = string("op_241_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_15_cast_fp16 = add(x = var_241_cast_fp16, y = x_11_cast_fp16)[name = string("x_15_cast_fp16")];
-            int32 var_252 = const()[name = string("op_252"), val = int32(-1)];
-            int32 var_260 = const()[name = string("op_260"), val = int32(3)];
-            int32 var_261 = const()[name = string("op_261"), val = int32(1)];
-            int32 var_264 = const()[name = string("op_264"), val = int32(-2)];
-            bool var_265 = const()[name = string("op_265"), val = bool(true)];
-            tensor<int32, [1]> var_282_axes_0 = const()[name = string("op_282_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_282_cast_fp16 = squeeze(axes = var_282_axes_0, x = x_15_cast_fp16)[name = string("op_282_cast_fp16")];
-            bool var_284_interleave_0 = const()[name = string("op_284_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_5_to_fp16 = const()[name = string("eps_chan_5_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_284_cast_fp16 = concat(axis = var_261, interleave = var_284_interleave_0, values = (var_282_cast_fp16, eps_chan_5_to_fp16))[name = string("op_284_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> var_247_cast_fp16 = mul(x = var_246_cast_fp16, y = blocks_0_mlp_proj_output_scales_to_fp16)[name = string("op_247_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_15_cast_fp16 = add(x = var_247_cast_fp16, y = x_11_cast_fp16)[name = string("x_15_cast_fp16")];
+            int32 var_258 = const()[name = string("op_258"), val = int32(-1)];
+            int32 var_266 = const()[name = string("op_266"), val = int32(3)];
+            int32 var_267 = const()[name = string("op_267"), val = int32(1)];
+            int32 var_270 = const()[name = string("op_270"), val = int32(-2)];
+            bool var_271 = const()[name = string("op_271"), val = bool(true)];
+            tensor<int32, [1]> var_288_axes_0 = const()[name = string("op_288_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_288_cast_fp16 = squeeze(axes = var_288_axes_0, x = x_15_cast_fp16)[name = string("op_288_cast_fp16")];
+            bool var_290_interleave_0 = const()[name = string("op_290_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_5_to_fp16 = const()[name = string("eps_chan_5_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_290_cast_fp16 = concat(axis = var_267, interleave = var_290_interleave_0, values = (var_288_cast_fp16, eps_chan_5_to_fp16))[name = string("op_290_cast_fp16")];
             tensor<int32, [1]> x_eps_5_axes_0 = const()[name = string("x_eps_5_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_5_cast_fp16 = expand_dims(axes = x_eps_5_axes_0, x = var_284_cast_fp16)[name = string("x_eps_5_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_5_cast_fp16 = expand_dims(axes = x_eps_5_axes_0, x = var_290_cast_fp16)[name = string("x_eps_5_cast_fp16")];
             tensor<int32, [1]> norm_x_5_axes_0 = const()[name = string("norm_x_5_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_5_cast_fp16 = reduce_l2_norm(axes = norm_x_5_axes_0, keep_dims = var_265, x = x_eps_5_cast_fp16)[name = string("norm_x_5_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_13_cast_fp16 = real_div(x = x_15_cast_fp16, y = norm_x_5_cast_fp16)[name = string("x_normed_13_cast_fp16")];
-            fp16 var_289_to_fp16 = const()[name = string("op_289_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_289_to_fp16)[name = string("x_normed_15_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_5_cast_fp16 = reduce_l2_norm(axes = norm_x_5_axes_0, keep_dims = var_271, x = x_eps_5_cast_fp16)[name = string("norm_x_5_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_13_cast_fp16 = real_div(x = x_15_cast_fp16, y = norm_x_5_cast_fp16)[name = string("x_normed_13_cast_fp16")];
+            fp16 var_295_to_fp16 = const()[name = string("op_295_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_295_to_fp16)[name = string("x_normed_15_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_1_weight_to_fp16 = const()[name = string("blocks_1_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303669888)))];
-            tensor<fp16, [1, 4096, 1, 1]> x_19_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = string("x_19_cast_fp16")];
-            tensor<int32, [2]> var_304 = const()[name = string("op_304"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_306 = const()[name = string("op_306"), val = tensor<int32, [2]>([1, 1])];
-            string var_308_pad_type_0 = const()[name = string("op_308_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_308_pad_0 = const()[name = string("op_308_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_308_cast_fp16 = conv(dilations = var_306, groups = var_261, pad = var_308_pad_0, pad_type = var_308_pad_type_0, strides = var_304, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_308_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_19_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = string("x_19_cast_fp16")];
+            tensor<int32, [2]> var_311 = const()[name = string("op_311"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_313 = const()[name = string("op_313"), val = tensor<int32, [2]>([1, 1])];
+            string var_315_pad_type_0 = const()[name = string("op_315_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_315_pad_0 = const()[name = string("op_315_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_315_cast_fp16 = conv(dilations = var_313, groups = var_267, pad = var_315_pad_0, pad_type = var_315_pad_type_0, strides = var_311, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_315_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303678144)))];
-            tensor<fp16, [1, 4096, 1, 1]> q_7_cast_fp16 = mul(x = var_308_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
-            tensor<int32, [2]> var_312 = const()[name = string("op_312"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_314 = const()[name = string("op_314"), val = tensor<int32, [2]>([1, 1])];
-            string var_316_pad_type_0 = const()[name = string("op_316_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_316_pad_0 = const()[name = string("op_316_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_316_cast_fp16 = conv(dilations = var_314, groups = var_261, pad = var_316_pad_0, pad_type = var_316_pad_type_0, strides = var_312, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_316_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> q_7_cast_fp16 = mul(x = var_315_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
+            tensor<int32, [2]> var_319 = const()[name = string("op_319"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_321 = const()[name = string("op_321"), val = tensor<int32, [2]>([1, 1])];
+            string var_323_pad_type_0 = const()[name = string("op_323_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_323_pad_0 = const()[name = string("op_323_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_323_cast_fp16 = conv(dilations = var_321, groups = var_267, pad = var_323_pad_0, pad_type = var_323_pad_type_0, strides = var_319, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_323_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303686400)))];
-            tensor<fp16, [1, 4096, 1, 1]> k_9_cast_fp16 = mul(x = var_316_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_9_cast_fp16")];
-            tensor<int32, [2]> var_320 = const()[name = string("op_320"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_322 = const()[name = string("op_322"), val = tensor<int32, [2]>([1, 1])];
-            string var_324_pad_type_0 = const()[name = string("op_324_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_324_pad_0 = const()[name = string("op_324_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_324_cast_fp16 = conv(dilations = var_322, groups = var_261, pad = var_324_pad_0, pad_type = var_324_pad_type_0, strides = var_320, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_324_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> k_9_cast_fp16 = mul(x = var_323_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_9_cast_fp16")];
+            tensor<int32, [2]> var_327 = const()[name = string("op_327"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_329 = const()[name = string("op_329"), val = tensor<int32, [2]>([1, 1])];
+            string var_331_pad_type_0 = const()[name = string("op_331_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_331_pad_0 = const()[name = string("op_331_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_331_cast_fp16 = conv(dilations = var_329, groups = var_267, pad = var_331_pad_0, pad_type = var_331_pad_type_0, strides = var_327, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_331_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303694656)))];
-            tensor<fp16, [1, 4096, 1, 1]> v_7_cast_fp16 = mul(x = var_324_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_7_cast_fp16")];
-            tensor<int32, [4]> var_326 = const()[name = string("op_326"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> q_9_cast_fp16 = reshape(shape = var_326, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
-            tensor<int32, [4]> var_328 = const()[name = string("op_328"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> k_11_cast_fp16 = reshape(shape = var_328, x = k_9_cast_fp16)[name = string("k_11_cast_fp16")];
-            tensor<int32, [4]> var_330 = const()[name = string("op_330"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> v_9_cast_fp16 = reshape(shape = var_330, x = v_7_cast_fp16)[name = string("v_9_cast_fp16")];
-            tensor<int32, [4]> var_342_begin_0 = const()[name = string("op_342_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_342_end_0 = const()[name = string("op_342_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_342_end_mask_0 = const()[name = string("op_342_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_342_cast_fp16 = slice_by_index(begin = var_342_begin_0, end = var_342_end_0, end_mask = var_342_end_mask_0, x = q_9_cast_fp16)[name = string("op_342_cast_fp16")];
-            tensor<int32, [4]> var_348_begin_0 = const()[name = string("op_348_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_348_end_0 = const()[name = string("op_348_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_348_end_mask_0 = const()[name = string("op_348_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_348_cast_fp16 = slice_by_index(begin = var_348_begin_0, end = var_348_end_0, end_mask = var_348_end_mask_0, x = q_9_cast_fp16)[name = string("op_348_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> v_9_cast_fp16 = mul(x = var_331_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_9_cast_fp16")];
+            tensor<int32, [4]> var_333 = const()[name = string("op_333"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> q_9_cast_fp16 = reshape(shape = var_333, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
+            tensor<int32, [4]> var_335 = const()[name = string("op_335"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> k_11_cast_fp16 = reshape(shape = var_335, x = k_9_cast_fp16)[name = string("k_11_cast_fp16")];
+            tensor<int32, [4]> var_337 = const()[name = string("op_337"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> v_11_cast_fp16 = reshape(shape = var_337, x = v_9_cast_fp16)[name = string("v_11_cast_fp16")];
+            tensor<int32, [4]> var_349_begin_0 = const()[name = string("op_349_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_349_end_0 = const()[name = string("op_349_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_349_end_mask_0 = const()[name = string("op_349_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_349_cast_fp16 = slice_by_index(begin = var_349_begin_0, end = var_349_end_0, end_mask = var_349_end_mask_0, x = q_9_cast_fp16)[name = string("op_349_cast_fp16")];
+            tensor<int32, [4]> var_355_begin_0 = const()[name = string("op_355_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_355_end_0 = const()[name = string("op_355_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_355_end_mask_0 = const()[name = string("op_355_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_355_cast_fp16 = slice_by_index(begin = var_355_begin_0, end = var_355_end_0, end_mask = var_355_end_mask_0, x = q_9_cast_fp16)[name = string("op_355_cast_fp16")];
             fp16 const_19_promoted_to_fp16 = const()[name = string("const_19_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_350_cast_fp16 = mul(x = var_348_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_350_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_357_cast_fp16 = mul(x = var_355_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_357_cast_fp16")];
             bool rotated_5_interleave_0 = const()[name = string("rotated_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_5_cast_fp16 = concat(axis = var_264, interleave = rotated_5_interleave_0, values = (var_350_cast_fp16, var_342_cast_fp16))[name = string("rotated_5_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_353_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_353_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_354_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_354_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_5_cast_fp16 = add(x = var_353_cast_fp16, y = var_354_cast_fp16)[name = string("roped_5_cast_fp16")];
-            tensor<int32, [4]> var_367_begin_0 = const()[name = string("op_367_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_367_end_0 = const()[name = string("op_367_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_367_end_mask_0 = const()[name = string("op_367_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_367_cast_fp16 = slice_by_index(begin = var_367_begin_0, end = var_367_end_0, end_mask = var_367_end_mask_0, x = k_11_cast_fp16)[name = string("op_367_cast_fp16")];
-            tensor<int32, [4]> var_373_begin_0 = const()[name = string("op_373_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_373_end_0 = const()[name = string("op_373_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_373_end_mask_0 = const()[name = string("op_373_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_373_cast_fp16 = slice_by_index(begin = var_373_begin_0, end = var_373_end_0, end_mask = var_373_end_mask_0, x = k_11_cast_fp16)[name = string("op_373_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_5_cast_fp16 = concat(axis = var_270, interleave = rotated_5_interleave_0, values = (var_357_cast_fp16, var_349_cast_fp16))[name = string("rotated_5_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_360_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_360_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_361_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_361_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_5_cast_fp16 = add(x = var_360_cast_fp16, y = var_361_cast_fp16)[name = string("roped_5_cast_fp16")];
+            tensor<int32, [4]> var_374_begin_0 = const()[name = string("op_374_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_374_end_0 = const()[name = string("op_374_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_374_end_mask_0 = const()[name = string("op_374_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_374_cast_fp16 = slice_by_index(begin = var_374_begin_0, end = var_374_end_0, end_mask = var_374_end_mask_0, x = k_11_cast_fp16)[name = string("op_374_cast_fp16")];
+            tensor<int32, [4]> var_380_begin_0 = const()[name = string("op_380_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_380_end_0 = const()[name = string("op_380_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_380_end_mask_0 = const()[name = string("op_380_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_380_cast_fp16 = slice_by_index(begin = var_380_begin_0, end = var_380_end_0, end_mask = var_380_end_mask_0, x = k_11_cast_fp16)[name = string("op_380_cast_fp16")];
             fp16 const_21_promoted_to_fp16 = const()[name = string("const_21_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_375_cast_fp16 = mul(x = var_373_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_375_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_382_cast_fp16 = mul(x = var_380_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_382_cast_fp16")];
             bool rotated_7_interleave_0 = const()[name = string("rotated_7_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_7_cast_fp16 = concat(axis = var_264, interleave = rotated_7_interleave_0, values = (var_375_cast_fp16, var_367_cast_fp16))[name = string("rotated_7_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_378_cast_fp16 = mul(x = k_11_cast_fp16, y = cos)[name = string("op_378_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_379_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = string("op_379_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_7_cast_fp16 = add(x = var_378_cast_fp16, y = var_379_cast_fp16)[name = string("roped_7_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_7_cast_fp16 = concat(axis = var_270, interleave = rotated_7_interleave_0, values = (var_382_cast_fp16, var_374_cast_fp16))[name = string("rotated_7_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_385_cast_fp16 = mul(x = k_11_cast_fp16, y = cos)[name = string("op_385_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_386_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = string("op_386_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_7_cast_fp16 = add(x = var_385_cast_fp16, y = var_386_cast_fp16)[name = string("roped_7_cast_fp16")];
+            tensor<int32, [4]> v_13_perm_0 = const()[name = string("v_13_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             bool k_15_interleave_0 = const()[name = string("k_15_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = concat(axis = var_252, interleave = k_15_interleave_0, values = (k_cache_1, roped_7_cast_fp16))[name = string("k_15_cast_fp16")];
-            bool v_11_interleave_0 = const()[name = string("v_11_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> v_11_cast_fp16 = concat(axis = var_252, interleave = v_11_interleave_0, values = (v_cache_1, v_9_cast_fp16))[name = string("v_11_cast_fp16")];
-            tensor<int32, [4]> var_386_begin_0 = const()[name = string("op_386_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_386_end_0 = const()[name = string("op_386_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_386_end_mask_0 = const()[name = string("op_386_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_1 = slice_by_index(begin = var_386_begin_0, end = var_386_end_0, end_mask = var_386_end_mask_0, x = k_15_cast_fp16)[name = string("op_386_cast_fp16")];
-            tensor<int32, [4]> var_387_begin_0 = const()[name = string("op_387_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_387_end_0 = const()[name = string("op_387_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_387_end_mask_0 = const()[name = string("op_387_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_1 = slice_by_index(begin = var_387_begin_0, end = var_387_end_0, end_mask = var_387_end_mask_0, x = v_11_cast_fp16)[name = string("op_387_cast_fp16")];
-            fp16 var_391_to_fp16 = const()[name = string("op_391_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 1]> var_392_cast_fp16 = mul(x = roped_5_cast_fp16, y = var_391_to_fp16)[name = string("op_392_cast_fp16")];
-            bool attn_weights_5_transpose_x_0 = const()[name = string("attn_weights_5_transpose_x_0"), val = bool(true)];
-            bool attn_weights_5_transpose_y_0 = const()[name = string("attn_weights_5_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_5_cast_fp16 = matmul(transpose_x = attn_weights_5_transpose_x_0, transpose_y = attn_weights_5_transpose_y_0, x = var_392_cast_fp16, y = k_15_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_7_cast_fp16 = add(x = attn_weights_5_cast_fp16, y = mask)[name = string("attn_weights_7_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> var_400_cast_fp16 = softmax(axis = var_260, x = attn_weights_7_cast_fp16)[name = string("op_400_cast_fp16")];
-            bool attn_5_transpose_x_0 = const()[name = string("attn_5_transpose_x_0"), val = bool(false)];
-            bool attn_5_transpose_y_0 = const()[name = string("attn_5_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 1]> attn_5_cast_fp16 = matmul(transpose_x = attn_5_transpose_x_0, transpose_y = attn_5_transpose_y_0, x = v_11_cast_fp16, y = var_400_cast_fp16)[name = string("attn_5_cast_fp16")];
-            tensor<int32, [4]> var_404 = const()[name = string("op_404"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
-            tensor<fp16, [1, 4096, 1, 1]> input_9_cast_fp16 = reshape(shape = var_404, x = attn_5_cast_fp16)[name = string("input_9_cast_fp16")];
-            tensor<int32, [2]> var_408 = const()[name = string("op_408"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_410 = const()[name = string("op_410"), val = tensor<int32, [2]>([1, 1])];
-            string var_412_pad_type_0 = const()[name = string("op_412_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_412_pad_0 = const()[name = string("op_412_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_412_cast_fp16 = conv(dilations = var_410, groups = var_261, pad = var_412_pad_0, pad_type = var_412_pad_type_0, strides = var_408, weight = blocks_1_attn_proj_weight_palettized_cast_fp16, x = input_9_cast_fp16)[name = string("op_412_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = concat(axis = var_258, interleave = k_15_interleave_0, values = (k_cache_1, roped_7_cast_fp16))[name = string("k_15_cast_fp16")];
+            bool v_15_interleave_0 = const()[name = string("v_15_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> v_13_cast_fp16 = transpose(perm = v_13_perm_0, x = v_11_cast_fp16)[name = string("transpose_3")];
+            tensor<fp16, [1, 32, 512, 128]> v_15_cast_fp16 = concat(axis = var_270, interleave = v_15_interleave_0, values = (v_cache_1, v_13_cast_fp16))[name = string("v_15_cast_fp16")];
+            tensor<int32, [4]> var_397_begin_0 = const()[name = string("op_397_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_397_end_0 = const()[name = string("op_397_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_397_end_mask_0 = const()[name = string("op_397_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_1 = slice_by_index(begin = var_397_begin_0, end = var_397_end_0, end_mask = var_397_end_mask_0, x = k_15_cast_fp16)[name = string("op_397_cast_fp16")];
+            tensor<int32, [4]> var_398_begin_0 = const()[name = string("op_398_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_398_end_0 = const()[name = string("op_398_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_398_end_mask_0 = const()[name = string("op_398_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_1 = slice_by_index(begin = var_398_begin_0, end = var_398_end_0, end_mask = var_398_end_mask_0, x = v_15_cast_fp16)[name = string("op_398_cast_fp16")];
+            fp16 var_403_to_fp16 = const()[name = string("op_403_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 32, 128, 4]> var_404_cast_fp16 = mul(x = roped_5_cast_fp16, y = var_403_to_fp16)[name = string("op_404_cast_fp16")];
+            bool attn_weights_7_transpose_x_0 = const()[name = string("attn_weights_7_transpose_x_0"), val = bool(true)];
+            bool attn_weights_7_transpose_y_0 = const()[name = string("attn_weights_7_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_7_cast_fp16 = matmul(transpose_x = attn_weights_7_transpose_x_0, transpose_y = attn_weights_7_transpose_y_0, x = var_404_cast_fp16, y = k_15_cast_fp16)[name = string("attn_weights_7_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_9_cast_fp16 = add(x = attn_weights_7_cast_fp16, y = mask)[name = string("attn_weights_9_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_11_cast_fp16 = softmax(axis = var_266, x = attn_weights_9_cast_fp16)[name = string("attn_weights_11_cast_fp16")];
+            bool var_413_transpose_x_0 = const()[name = string("op_413_transpose_x_0"), val = bool(false)];
+            bool var_413_transpose_y_0 = const()[name = string("op_413_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> var_413_cast_fp16 = matmul(transpose_x = var_413_transpose_x_0, transpose_y = var_413_transpose_y_0, x = attn_weights_11_cast_fp16, y = v_15_cast_fp16)[name = string("op_413_cast_fp16")];
+            tensor<int32, [4]> attn_3_perm_0 = const()[name = string("attn_3_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_416 = const()[name = string("op_416"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 4]> attn_3_cast_fp16 = transpose(perm = attn_3_perm_0, x = var_413_cast_fp16)[name = string("transpose_2")];
+            tensor<fp16, [1, 4096, 1, 4]> input_9_cast_fp16 = reshape(shape = var_416, x = attn_3_cast_fp16)[name = string("input_9_cast_fp16")];
+            tensor<int32, [2]> var_420 = const()[name = string("op_420"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_422 = const()[name = string("op_422"), val = tensor<int32, [2]>([1, 1])];
+            string var_424_pad_type_0 = const()[name = string("op_424_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_424_pad_0 = const()[name = string("op_424_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_424_cast_fp16 = conv(dilations = var_422, groups = var_267, pad = var_424_pad_0, pad_type = var_424_pad_type_0, strides = var_420, weight = blocks_1_attn_proj_weight_palettized_cast_fp16, x = input_9_cast_fp16)[name = string("op_424_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303702912)))];
-            tensor<fp16, [1, 4096, 1, 1]> attention_output_3_cast_fp16 = mul(x = var_412_cast_fp16, y = blocks_1_attn_proj_output_scales_to_fp16)[name = string("attention_output_3_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_25_cast_fp16 = add(x = attention_output_3_cast_fp16, y = x_15_cast_fp16)[name = string("x_25_cast_fp16")];
-            tensor<int32, [1]> var_431_axes_0 = const()[name = string("op_431_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_431_cast_fp16 = squeeze(axes = var_431_axes_0, x = x_25_cast_fp16)[name = string("op_431_cast_fp16")];
-            bool var_433_interleave_0 = const()[name = string("op_433_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_7_to_fp16 = const()[name = string("eps_chan_7_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_433_cast_fp16 = concat(axis = var_261, interleave = var_433_interleave_0, values = (var_431_cast_fp16, eps_chan_7_to_fp16))[name = string("op_433_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> attention_output_3_cast_fp16 = mul(x = var_424_cast_fp16, y = blocks_1_attn_proj_output_scales_to_fp16)[name = string("attention_output_3_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_25_cast_fp16 = add(x = attention_output_3_cast_fp16, y = x_15_cast_fp16)[name = string("x_25_cast_fp16")];
+            tensor<int32, [1]> var_443_axes_0 = const()[name = string("op_443_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_443_cast_fp16 = squeeze(axes = var_443_axes_0, x = x_25_cast_fp16)[name = string("op_443_cast_fp16")];
+            bool var_445_interleave_0 = const()[name = string("op_445_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_7_to_fp16 = const()[name = string("eps_chan_7_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_445_cast_fp16 = concat(axis = var_267, interleave = var_445_interleave_0, values = (var_443_cast_fp16, eps_chan_7_to_fp16))[name = string("op_445_cast_fp16")];
             tensor<int32, [1]> x_eps_7_axes_0 = const()[name = string("x_eps_7_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_7_cast_fp16 = expand_dims(axes = x_eps_7_axes_0, x = var_433_cast_fp16)[name = string("x_eps_7_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_7_cast_fp16 = expand_dims(axes = x_eps_7_axes_0, x = var_445_cast_fp16)[name = string("x_eps_7_cast_fp16")];
             tensor<int32, [1]> norm_x_7_axes_0 = const()[name = string("norm_x_7_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_7_cast_fp16 = reduce_l2_norm(axes = norm_x_7_axes_0, keep_dims = var_265, x = x_eps_7_cast_fp16)[name = string("norm_x_7_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_19_cast_fp16 = real_div(x = x_25_cast_fp16, y = norm_x_7_cast_fp16)[name = string("x_normed_19_cast_fp16")];
-            fp16 var_438_to_fp16 = const()[name = string("op_438_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_21_cast_fp16 = mul(x = x_normed_19_cast_fp16, y = var_438_to_fp16)[name = string("x_normed_21_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_7_cast_fp16 = reduce_l2_norm(axes = norm_x_7_axes_0, keep_dims = var_271, x = x_eps_7_cast_fp16)[name = string("norm_x_7_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_19_cast_fp16 = real_div(x = x_25_cast_fp16, y = norm_x_7_cast_fp16)[name = string("x_normed_19_cast_fp16")];
+            fp16 var_450_to_fp16 = const()[name = string("op_450_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_21_cast_fp16 = mul(x = x_normed_19_cast_fp16, y = var_450_to_fp16)[name = string("x_normed_21_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_2_weight_to_fp16 = const()[name = string("blocks_1_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303711168)))];
-            tensor<fp16, [1, 4096, 1, 1]> input_11_cast_fp16 = mul(x = x_normed_21_cast_fp16, y = blocks_1_norm_2_weight_to_fp16)[name = string("input_11_cast_fp16")];
-            tensor<int32, [2]> var_450 = const()[name = string("op_450"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_452 = const()[name = string("op_452"), val = tensor<int32, [2]>([1, 1])];
-            string var_454_pad_type_0 = const()[name = string("op_454_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_454_pad_0 = const()[name = string("op_454_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_454_cast_fp16 = conv(dilations = var_452, groups = var_261, pad = var_454_pad_0, pad_type = var_454_pad_type_0, strides = var_450, weight = blocks_1_mlp_fc_1_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_454_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> input_11_cast_fp16 = mul(x = x_normed_21_cast_fp16, y = blocks_1_norm_2_weight_to_fp16)[name = string("input_11_cast_fp16")];
+            tensor<int32, [2]> var_462 = const()[name = string("op_462"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_464 = const()[name = string("op_464"), val = tensor<int32, [2]>([1, 1])];
+            string var_466_pad_type_0 = const()[name = string("op_466_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_466_pad_0 = const()[name = string("op_466_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_466_cast_fp16 = conv(dilations = var_464, groups = var_267, pad = var_466_pad_0, pad_type = var_466_pad_type_0, strides = var_462, weight = blocks_1_mlp_fc_1_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_466_cast_fp16")];
             tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303719424)))];
-            tensor<fp16, [1, 11008, 1, 1]> input_13_cast_fp16 = mul(x = var_454_cast_fp16, y = blocks_1_mlp_fc_1_output_scales_to_fp16)[name = string("input_13_cast_fp16")];
-            tensor<int32, [2]> var_458 = const()[name = string("op_458"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_460 = const()[name = string("op_460"), val = tensor<int32, [2]>([1, 1])];
-            string var_462_pad_type_0 = const()[name = string("op_462_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_462_pad_0 = const()[name = string("op_462_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_462_cast_fp16 = conv(dilations = var_460, groups = var_261, pad = var_462_pad_0, pad_type = var_462_pad_type_0, strides = var_458, weight = blocks_1_mlp_fc_2_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_462_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303741504)))];
-            tensor<fp16, [1, 11008, 1, 1]> x_fc_2_3_cast_fp16 = mul(x = var_462_cast_fp16, y = blocks_1_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_3_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> var_464_cast_fp16 = silu(x = input_13_cast_fp16)[name = string("op_464_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> input_15_cast_fp16 = mul(x = var_464_cast_fp16, y = x_fc_2_3_cast_fp16)[name = string("input_15_cast_fp16")];
-            tensor<int32, [2]> var_468 = const()[name = string("op_468"), val = tensor<int32, [2]>([1, 1])];
+            tensor<fp16, [1, 11008, 1, 4]> input_13_cast_fp16 = mul(x = var_466_cast_fp16, y = blocks_1_mlp_fc_1_output_scales_to_fp16)[name = string("input_13_cast_fp16")];
             tensor<int32, [2]> var_470 = const()[name = string("op_470"), val = tensor<int32, [2]>([1, 1])];
-            string var_472_pad_type_0 = const()[name = string("op_472_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_472_pad_0 = const()[name = string("op_472_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_472_cast_fp16 = conv(dilations = var_470, groups = var_261, pad = var_472_pad_0, pad_type = var_472_pad_type_0, strides = var_468, weight = blocks_1_mlp_proj_weight_palettized_cast_fp16, x = input_15_cast_fp16)[name = string("op_472_cast_fp16")];
+            tensor<int32, [2]> var_472 = const()[name = string("op_472"), val = tensor<int32, [2]>([1, 1])];
+            string var_474_pad_type_0 = const()[name = string("op_474_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_474_pad_0 = const()[name = string("op_474_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_474_cast_fp16 = conv(dilations = var_472, groups = var_267, pad = var_474_pad_0, pad_type = var_474_pad_type_0, strides = var_470, weight = blocks_1_mlp_fc_2_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_474_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303741504)))];
+            tensor<fp16, [1, 11008, 1, 4]> x_fc_2_3_cast_fp16 = mul(x = var_474_cast_fp16, y = blocks_1_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_3_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> var_476_cast_fp16 = silu(x = input_13_cast_fp16)[name = string("op_476_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> input_15_cast_fp16 = mul(x = var_476_cast_fp16, y = x_fc_2_3_cast_fp16)[name = string("input_15_cast_fp16")];
+            tensor<int32, [2]> var_480 = const()[name = string("op_480"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_482 = const()[name = string("op_482"), val = tensor<int32, [2]>([1, 1])];
+            string var_484_pad_type_0 = const()[name = string("op_484_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_484_pad_0 = const()[name = string("op_484_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_484_cast_fp16 = conv(dilations = var_482, groups = var_267, pad = var_484_pad_0, pad_type = var_484_pad_type_0, strides = var_480, weight = blocks_1_mlp_proj_weight_palettized_cast_fp16, x = input_15_cast_fp16)[name = string("op_484_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303763584)))];
-            tensor<fp16, [1, 4096, 1, 1]> var_473_cast_fp16 = mul(x = var_472_cast_fp16, y = blocks_1_mlp_proj_output_scales_to_fp16)[name = string("op_473_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_29_cast_fp16 = add(x = var_473_cast_fp16, y = x_25_cast_fp16)[name = string("x_29_cast_fp16")];
-            int32 var_484 = const()[name = string("op_484"), val = int32(-1)];
-            int32 var_492 = const()[name = string("op_492"), val = int32(3)];
-            int32 var_493 = const()[name = string("op_493"), val = int32(1)];
-            int32 var_496 = const()[name = string("op_496"), val = int32(-2)];
-            bool var_497 = const()[name = string("op_497"), val = bool(true)];
-            tensor<int32, [1]> var_514_axes_0 = const()[name = string("op_514_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_514_cast_fp16 = squeeze(axes = var_514_axes_0, x = x_29_cast_fp16)[name = string("op_514_cast_fp16")];
-            bool var_516_interleave_0 = const()[name = string("op_516_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_9_to_fp16 = const()[name = string("eps_chan_9_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_516_cast_fp16 = concat(axis = var_493, interleave = var_516_interleave_0, values = (var_514_cast_fp16, eps_chan_9_to_fp16))[name = string("op_516_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> var_485_cast_fp16 = mul(x = var_484_cast_fp16, y = blocks_1_mlp_proj_output_scales_to_fp16)[name = string("op_485_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_29_cast_fp16 = add(x = var_485_cast_fp16, y = x_25_cast_fp16)[name = string("x_29_cast_fp16")];
+            int32 var_496 = const()[name = string("op_496"), val = int32(-1)];
+            int32 var_504 = const()[name = string("op_504"), val = int32(3)];
+            int32 var_505 = const()[name = string("op_505"), val = int32(1)];
+            int32 var_508 = const()[name = string("op_508"), val = int32(-2)];
+            bool var_509 = const()[name = string("op_509"), val = bool(true)];
+            tensor<int32, [1]> var_526_axes_0 = const()[name = string("op_526_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_526_cast_fp16 = squeeze(axes = var_526_axes_0, x = x_29_cast_fp16)[name = string("op_526_cast_fp16")];
+            bool var_528_interleave_0 = const()[name = string("op_528_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_9_to_fp16 = const()[name = string("eps_chan_9_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_528_cast_fp16 = concat(axis = var_505, interleave = var_528_interleave_0, values = (var_526_cast_fp16, eps_chan_9_to_fp16))[name = string("op_528_cast_fp16")];
             tensor<int32, [1]> x_eps_9_axes_0 = const()[name = string("x_eps_9_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_9_cast_fp16 = expand_dims(axes = x_eps_9_axes_0, x = var_516_cast_fp16)[name = string("x_eps_9_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_9_cast_fp16 = expand_dims(axes = x_eps_9_axes_0, x = var_528_cast_fp16)[name = string("x_eps_9_cast_fp16")];
             tensor<int32, [1]> norm_x_9_axes_0 = const()[name = string("norm_x_9_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_9_cast_fp16 = reduce_l2_norm(axes = norm_x_9_axes_0, keep_dims = var_497, x = x_eps_9_cast_fp16)[name = string("norm_x_9_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_25_cast_fp16 = real_div(x = x_29_cast_fp16, y = norm_x_9_cast_fp16)[name = string("x_normed_25_cast_fp16")];
-            fp16 var_521_to_fp16 = const()[name = string("op_521_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_27_cast_fp16 = mul(x = x_normed_25_cast_fp16, y = var_521_to_fp16)[name = string("x_normed_27_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_9_cast_fp16 = reduce_l2_norm(axes = norm_x_9_axes_0, keep_dims = var_509, x = x_eps_9_cast_fp16)[name = string("norm_x_9_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_25_cast_fp16 = real_div(x = x_29_cast_fp16, y = norm_x_9_cast_fp16)[name = string("x_normed_25_cast_fp16")];
+            fp16 var_533_to_fp16 = const()[name = string("op_533_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_27_cast_fp16 = mul(x = x_normed_25_cast_fp16, y = var_533_to_fp16)[name = string("x_normed_27_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_norm_1_weight_to_fp16 = const()[name = string("blocks_2_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303771840)))];
-            tensor<fp16, [1, 4096, 1, 1]> x_33_cast_fp16 = mul(x = x_normed_27_cast_fp16, y = blocks_2_norm_1_weight_to_fp16)[name = string("x_33_cast_fp16")];
-            tensor<int32, [2]> var_536 = const()[name = string("op_536"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_538 = const()[name = string("op_538"), val = tensor<int32, [2]>([1, 1])];
-            string var_540_pad_type_0 = const()[name = string("op_540_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_540_pad_0 = const()[name = string("op_540_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_540_cast_fp16 = conv(dilations = var_538, groups = var_493, pad = var_540_pad_0, pad_type = var_540_pad_type_0, strides = var_536, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_540_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_33_cast_fp16 = mul(x = x_normed_27_cast_fp16, y = blocks_2_norm_1_weight_to_fp16)[name = string("x_33_cast_fp16")];
+            tensor<int32, [2]> var_549 = const()[name = string("op_549"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_551 = const()[name = string("op_551"), val = tensor<int32, [2]>([1, 1])];
+            string var_553_pad_type_0 = const()[name = string("op_553_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_553_pad_0 = const()[name = string("op_553_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_553_cast_fp16 = conv(dilations = var_551, groups = var_505, pad = var_553_pad_0, pad_type = var_553_pad_type_0, strides = var_549, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_553_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303780096)))];
-            tensor<fp16, [1, 4096, 1, 1]> q_13_cast_fp16 = mul(x = var_540_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = string("q_13_cast_fp16")];
-            tensor<int32, [2]> var_544 = const()[name = string("op_544"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_546 = const()[name = string("op_546"), val = tensor<int32, [2]>([1, 1])];
-            string var_548_pad_type_0 = const()[name = string("op_548_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_548_pad_0 = const()[name = string("op_548_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_548_cast_fp16 = conv(dilations = var_546, groups = var_493, pad = var_548_pad_0, pad_type = var_548_pad_type_0, strides = var_544, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_548_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> q_13_cast_fp16 = mul(x = var_553_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = string("q_13_cast_fp16")];
+            tensor<int32, [2]> var_557 = const()[name = string("op_557"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_559 = const()[name = string("op_559"), val = tensor<int32, [2]>([1, 1])];
+            string var_561_pad_type_0 = const()[name = string("op_561_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_561_pad_0 = const()[name = string("op_561_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_561_cast_fp16 = conv(dilations = var_559, groups = var_505, pad = var_561_pad_0, pad_type = var_561_pad_type_0, strides = var_557, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_561_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303788352)))];
-            tensor<fp16, [1, 4096, 1, 1]> k_17_cast_fp16 = mul(x = var_548_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = string("k_17_cast_fp16")];
-            tensor<int32, [2]> var_552 = const()[name = string("op_552"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_554 = const()[name = string("op_554"), val = tensor<int32, [2]>([1, 1])];
-            string var_556_pad_type_0 = const()[name = string("op_556_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_556_pad_0 = const()[name = string("op_556_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_556_cast_fp16 = conv(dilations = var_554, groups = var_493, pad = var_556_pad_0, pad_type = var_556_pad_type_0, strides = var_552, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_556_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> k_17_cast_fp16 = mul(x = var_561_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = string("k_17_cast_fp16")];
+            tensor<int32, [2]> var_565 = const()[name = string("op_565"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_567 = const()[name = string("op_567"), val = tensor<int32, [2]>([1, 1])];
+            string var_569_pad_type_0 = const()[name = string("op_569_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_569_pad_0 = const()[name = string("op_569_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_569_cast_fp16 = conv(dilations = var_567, groups = var_505, pad = var_569_pad_0, pad_type = var_569_pad_type_0, strides = var_565, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_569_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303796608)))];
-            tensor<fp16, [1, 4096, 1, 1]> v_13_cast_fp16 = mul(x = var_556_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = string("v_13_cast_fp16")];
-            tensor<int32, [4]> var_558 = const()[name = string("op_558"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> q_15_cast_fp16 = reshape(shape = var_558, x = q_13_cast_fp16)[name = string("q_15_cast_fp16")];
-            tensor<int32, [4]> var_560 = const()[name = string("op_560"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> k_19_cast_fp16 = reshape(shape = var_560, x = k_17_cast_fp16)[name = string("k_19_cast_fp16")];
-            tensor<int32, [4]> var_562 = const()[name = string("op_562"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> v_15_cast_fp16 = reshape(shape = var_562, x = v_13_cast_fp16)[name = string("v_15_cast_fp16")];
-            tensor<int32, [4]> var_574_begin_0 = const()[name = string("op_574_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_574_end_0 = const()[name = string("op_574_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_574_end_mask_0 = const()[name = string("op_574_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_574_cast_fp16 = slice_by_index(begin = var_574_begin_0, end = var_574_end_0, end_mask = var_574_end_mask_0, x = q_15_cast_fp16)[name = string("op_574_cast_fp16")];
-            tensor<int32, [4]> var_580_begin_0 = const()[name = string("op_580_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_580_end_0 = const()[name = string("op_580_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_580_end_mask_0 = const()[name = string("op_580_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_580_cast_fp16 = slice_by_index(begin = var_580_begin_0, end = var_580_end_0, end_mask = var_580_end_mask_0, x = q_15_cast_fp16)[name = string("op_580_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> v_17_cast_fp16 = mul(x = var_569_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = string("v_17_cast_fp16")];
+            tensor<int32, [4]> var_571 = const()[name = string("op_571"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> q_15_cast_fp16 = reshape(shape = var_571, x = q_13_cast_fp16)[name = string("q_15_cast_fp16")];
+            tensor<int32, [4]> var_573 = const()[name = string("op_573"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> k_19_cast_fp16 = reshape(shape = var_573, x = k_17_cast_fp16)[name = string("k_19_cast_fp16")];
+            tensor<int32, [4]> var_575 = const()[name = string("op_575"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> v_19_cast_fp16 = reshape(shape = var_575, x = v_17_cast_fp16)[name = string("v_19_cast_fp16")];
+            tensor<int32, [4]> var_587_begin_0 = const()[name = string("op_587_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_587_end_0 = const()[name = string("op_587_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_587_end_mask_0 = const()[name = string("op_587_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_587_cast_fp16 = slice_by_index(begin = var_587_begin_0, end = var_587_end_0, end_mask = var_587_end_mask_0, x = q_15_cast_fp16)[name = string("op_587_cast_fp16")];
+            tensor<int32, [4]> var_593_begin_0 = const()[name = string("op_593_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_593_end_0 = const()[name = string("op_593_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_593_end_mask_0 = const()[name = string("op_593_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_593_cast_fp16 = slice_by_index(begin = var_593_begin_0, end = var_593_end_0, end_mask = var_593_end_mask_0, x = q_15_cast_fp16)[name = string("op_593_cast_fp16")];
             fp16 const_32_promoted_to_fp16 = const()[name = string("const_32_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_582_cast_fp16 = mul(x = var_580_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_582_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_595_cast_fp16 = mul(x = var_593_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_595_cast_fp16")];
             bool rotated_9_interleave_0 = const()[name = string("rotated_9_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_9_cast_fp16 = concat(axis = var_496, interleave = rotated_9_interleave_0, values = (var_582_cast_fp16, var_574_cast_fp16))[name = string("rotated_9_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_585_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = string("op_585_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_586_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = string("op_586_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_9_cast_fp16 = add(x = var_585_cast_fp16, y = var_586_cast_fp16)[name = string("roped_9_cast_fp16")];
-            tensor<int32, [4]> var_599_begin_0 = const()[name = string("op_599_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_599_end_0 = const()[name = string("op_599_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_599_end_mask_0 = const()[name = string("op_599_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_599_cast_fp16 = slice_by_index(begin = var_599_begin_0, end = var_599_end_0, end_mask = var_599_end_mask_0, x = k_19_cast_fp16)[name = string("op_599_cast_fp16")];
-            tensor<int32, [4]> var_605_begin_0 = const()[name = string("op_605_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_605_end_0 = const()[name = string("op_605_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_605_end_mask_0 = const()[name = string("op_605_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_605_cast_fp16 = slice_by_index(begin = var_605_begin_0, end = var_605_end_0, end_mask = var_605_end_mask_0, x = k_19_cast_fp16)[name = string("op_605_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_9_cast_fp16 = concat(axis = var_508, interleave = rotated_9_interleave_0, values = (var_595_cast_fp16, var_587_cast_fp16))[name = string("rotated_9_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_598_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = string("op_598_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_599_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = string("op_599_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_9_cast_fp16 = add(x = var_598_cast_fp16, y = var_599_cast_fp16)[name = string("roped_9_cast_fp16")];
+            tensor<int32, [4]> var_612_begin_0 = const()[name = string("op_612_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_612_end_0 = const()[name = string("op_612_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_612_end_mask_0 = const()[name = string("op_612_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_612_cast_fp16 = slice_by_index(begin = var_612_begin_0, end = var_612_end_0, end_mask = var_612_end_mask_0, x = k_19_cast_fp16)[name = string("op_612_cast_fp16")];
+            tensor<int32, [4]> var_618_begin_0 = const()[name = string("op_618_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_618_end_0 = const()[name = string("op_618_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_618_end_mask_0 = const()[name = string("op_618_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_618_cast_fp16 = slice_by_index(begin = var_618_begin_0, end = var_618_end_0, end_mask = var_618_end_mask_0, x = k_19_cast_fp16)[name = string("op_618_cast_fp16")];
             fp16 const_34_promoted_to_fp16 = const()[name = string("const_34_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_607_cast_fp16 = mul(x = var_605_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_607_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_620_cast_fp16 = mul(x = var_618_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_620_cast_fp16")];
             bool rotated_interleave_0 = const()[name = string("rotated_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_cast_fp16 = concat(axis = var_496, interleave = rotated_interleave_0, values = (var_607_cast_fp16, var_599_cast_fp16))[name = string("rotated_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_610_cast_fp16 = mul(x = k_19_cast_fp16, y = cos)[name = string("op_610_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_611_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_611_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_cast_fp16 = add(x = var_610_cast_fp16, y = var_611_cast_fp16)[name = string("roped_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_cast_fp16 = concat(axis = var_508, interleave = rotated_interleave_0, values = (var_620_cast_fp16, var_612_cast_fp16))[name = string("rotated_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_623_cast_fp16 = mul(x = k_19_cast_fp16, y = cos)[name = string("op_623_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_624_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_624_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_cast_fp16 = add(x = var_623_cast_fp16, y = var_624_cast_fp16)[name = string("roped_cast_fp16")];
+            tensor<int32, [4]> v_21_perm_0 = const()[name = string("v_21_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             bool k_interleave_0 = const()[name = string("k_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_cast_fp16 = concat(axis = var_484, interleave = k_interleave_0, values = (k_cache_2, roped_cast_fp16))[name = string("k_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> k_cast_fp16 = concat(axis = var_496, interleave = k_interleave_0, values = (k_cache_2, roped_cast_fp16))[name = string("k_cast_fp16")];
             bool v_interleave_0 = const()[name = string("v_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> v_cast_fp16 = concat(axis = var_484, interleave = v_interleave_0, values = (v_cache_2, v_15_cast_fp16))[name = string("v_cast_fp16")];
-            tensor<int32, [4]> var_618_begin_0 = const()[name = string("op_618_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_618_end_0 = const()[name = string("op_618_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_618_end_mask_0 = const()[name = string("op_618_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_2 = slice_by_index(begin = var_618_begin_0, end = var_618_end_0, end_mask = var_618_end_mask_0, x = k_cast_fp16)[name = string("op_618_cast_fp16")];
-            tensor<int32, [4]> var_619_begin_0 = const()[name = string("op_619_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_619_end_0 = const()[name = string("op_619_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_619_end_mask_0 = const()[name = string("op_619_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_2 = slice_by_index(begin = var_619_begin_0, end = var_619_end_0, end_mask = var_619_end_mask_0, x = v_cast_fp16)[name = string("op_619_cast_fp16")];
-            fp16 var_623_to_fp16 = const()[name = string("op_623_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 1]> var_624_cast_fp16 = mul(x = roped_9_cast_fp16, y = var_623_to_fp16)[name = string("op_624_cast_fp16")];
-            bool attn_weights_9_transpose_x_0 = const()[name = string("attn_weights_9_transpose_x_0"), val = bool(true)];
-            bool attn_weights_9_transpose_y_0 = const()[name = string("attn_weights_9_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_9_cast_fp16 = matmul(transpose_x = attn_weights_9_transpose_x_0, transpose_y = attn_weights_9_transpose_y_0, x = var_624_cast_fp16, y = k_cast_fp16)[name = string("attn_weights_9_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_cast_fp16 = add(x = attn_weights_9_cast_fp16, y = mask)[name = string("attn_weights_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> var_632_cast_fp16 = softmax(axis = var_492, x = attn_weights_cast_fp16)[name = string("op_632_cast_fp16")];
-            bool attn_9_transpose_x_0 = const()[name = string("attn_9_transpose_x_0"), val = bool(false)];
-            bool attn_9_transpose_y_0 = const()[name = string("attn_9_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 1]> attn_9_cast_fp16 = matmul(transpose_x = attn_9_transpose_x_0, transpose_y = attn_9_transpose_y_0, x = v_cast_fp16, y = var_632_cast_fp16)[name = string("attn_9_cast_fp16")];
-            tensor<int32, [4]> var_636 = const()[name = string("op_636"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
-            tensor<fp16, [1, 4096, 1, 1]> input_17_cast_fp16 = reshape(shape = var_636, x = attn_9_cast_fp16)[name = string("input_17_cast_fp16")];
-            tensor<int32, [2]> var_640 = const()[name = string("op_640"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_642 = const()[name = string("op_642"), val = tensor<int32, [2]>([1, 1])];
-            string var_644_pad_type_0 = const()[name = string("op_644_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_644_pad_0 = const()[name = string("op_644_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_644_cast_fp16 = conv(dilations = var_642, groups = var_493, pad = var_644_pad_0, pad_type = var_644_pad_type_0, strides = var_640, weight = blocks_2_attn_proj_weight_palettized_cast_fp16, x = input_17_cast_fp16)[name = string("op_644_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 128]> v_21_cast_fp16 = transpose(perm = v_21_perm_0, x = v_19_cast_fp16)[name = string("transpose_1")];
+            tensor<fp16, [1, 32, 512, 128]> v_cast_fp16 = concat(axis = var_508, interleave = v_interleave_0, values = (v_cache_2, v_21_cast_fp16))[name = string("v_cast_fp16")];
+            tensor<int32, [4]> var_635_begin_0 = const()[name = string("op_635_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_635_end_0 = const()[name = string("op_635_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_635_end_mask_0 = const()[name = string("op_635_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_2 = slice_by_index(begin = var_635_begin_0, end = var_635_end_0, end_mask = var_635_end_mask_0, x = k_cast_fp16)[name = string("op_635_cast_fp16")];
+            tensor<int32, [4]> var_636_begin_0 = const()[name = string("op_636_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_636_end_0 = const()[name = string("op_636_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_636_end_mask_0 = const()[name = string("op_636_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_2 = slice_by_index(begin = var_636_begin_0, end = var_636_end_0, end_mask = var_636_end_mask_0, x = v_cast_fp16)[name = string("op_636_cast_fp16")];
+            fp16 var_641_to_fp16 = const()[name = string("op_641_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 32, 128, 4]> var_642_cast_fp16 = mul(x = roped_9_cast_fp16, y = var_641_to_fp16)[name = string("op_642_cast_fp16")];
+            bool attn_weights_13_transpose_x_0 = const()[name = string("attn_weights_13_transpose_x_0"), val = bool(true)];
+            bool attn_weights_13_transpose_y_0 = const()[name = string("attn_weights_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_13_cast_fp16 = matmul(transpose_x = attn_weights_13_transpose_x_0, transpose_y = attn_weights_13_transpose_y_0, x = var_642_cast_fp16, y = k_cast_fp16)[name = string("attn_weights_13_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_15_cast_fp16 = add(x = attn_weights_13_cast_fp16, y = mask)[name = string("attn_weights_15_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_cast_fp16 = softmax(axis = var_504, x = attn_weights_15_cast_fp16)[name = string("attn_weights_cast_fp16")];
+            bool var_651_transpose_x_0 = const()[name = string("op_651_transpose_x_0"), val = bool(false)];
+            bool var_651_transpose_y_0 = const()[name = string("op_651_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> var_651_cast_fp16 = matmul(transpose_x = var_651_transpose_x_0, transpose_y = var_651_transpose_y_0, x = attn_weights_cast_fp16, y = v_cast_fp16)[name = string("op_651_cast_fp16")];
+            tensor<int32, [4]> attn_5_perm_0 = const()[name = string("attn_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_654 = const()[name = string("op_654"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 4]> attn_5_cast_fp16 = transpose(perm = attn_5_perm_0, x = var_651_cast_fp16)[name = string("transpose_0")];
+            tensor<fp16, [1, 4096, 1, 4]> input_17_cast_fp16 = reshape(shape = var_654, x = attn_5_cast_fp16)[name = string("input_17_cast_fp16")];
+            tensor<int32, [2]> var_658 = const()[name = string("op_658"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_660 = const()[name = string("op_660"), val = tensor<int32, [2]>([1, 1])];
+            string var_662_pad_type_0 = const()[name = string("op_662_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_662_pad_0 = const()[name = string("op_662_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_662_cast_fp16 = conv(dilations = var_660, groups = var_505, pad = var_662_pad_0, pad_type = var_662_pad_type_0, strides = var_658, weight = blocks_2_attn_proj_weight_palettized_cast_fp16, x = input_17_cast_fp16)[name = string("op_662_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303804864)))];
-            tensor<fp16, [1, 4096, 1, 1]> attention_output_cast_fp16 = mul(x = var_644_cast_fp16, y = blocks_2_attn_proj_output_scales_to_fp16)[name = string("attention_output_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_39_cast_fp16 = add(x = attention_output_cast_fp16, y = x_29_cast_fp16)[name = string("x_39_cast_fp16")];
-            tensor<int32, [1]> var_663_axes_0 = const()[name = string("op_663_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_663_cast_fp16 = squeeze(axes = var_663_axes_0, x = x_39_cast_fp16)[name = string("op_663_cast_fp16")];
-            bool var_665_interleave_0 = const()[name = string("op_665_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_to_fp16 = const()[name = string("eps_chan_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_665_cast_fp16 = concat(axis = var_493, interleave = var_665_interleave_0, values = (var_663_cast_fp16, eps_chan_to_fp16))[name = string("op_665_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> attention_output_cast_fp16 = mul(x = var_662_cast_fp16, y = blocks_2_attn_proj_output_scales_to_fp16)[name = string("attention_output_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_39_cast_fp16 = add(x = attention_output_cast_fp16, y = x_29_cast_fp16)[name = string("x_39_cast_fp16")];
+            tensor<int32, [1]> var_681_axes_0 = const()[name = string("op_681_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_681_cast_fp16 = squeeze(axes = var_681_axes_0, x = x_39_cast_fp16)[name = string("op_681_cast_fp16")];
+            bool var_683_interleave_0 = const()[name = string("op_683_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_to_fp16 = const()[name = string("eps_chan_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_683_cast_fp16 = concat(axis = var_505, interleave = var_683_interleave_0, values = (var_681_cast_fp16, eps_chan_to_fp16))[name = string("op_683_cast_fp16")];
             tensor<int32, [1]> x_eps_axes_0 = const()[name = string("x_eps_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_cast_fp16 = expand_dims(axes = x_eps_axes_0, x = var_665_cast_fp16)[name = string("x_eps_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_cast_fp16 = expand_dims(axes = x_eps_axes_0, x = var_683_cast_fp16)[name = string("x_eps_cast_fp16")];
             tensor<int32, [1]> norm_x_axes_0 = const()[name = string("norm_x_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_cast_fp16 = reduce_l2_norm(axes = norm_x_axes_0, keep_dims = var_497, x = x_eps_cast_fp16)[name = string("norm_x_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_31_cast_fp16 = real_div(x = x_39_cast_fp16, y = norm_x_cast_fp16)[name = string("x_normed_31_cast_fp16")];
-            fp16 var_670_to_fp16 = const()[name = string("op_670_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_33_cast_fp16 = mul(x = x_normed_31_cast_fp16, y = var_670_to_fp16)[name = string("x_normed_33_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_cast_fp16 = reduce_l2_norm(axes = norm_x_axes_0, keep_dims = var_509, x = x_eps_cast_fp16)[name = string("norm_x_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_31_cast_fp16 = real_div(x = x_39_cast_fp16, y = norm_x_cast_fp16)[name = string("x_normed_31_cast_fp16")];
+            fp16 var_688_to_fp16 = const()[name = string("op_688_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_33_cast_fp16 = mul(x = x_normed_31_cast_fp16, y = var_688_to_fp16)[name = string("x_normed_33_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_norm_2_weight_to_fp16 = const()[name = string("blocks_2_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303813120)))];
-            tensor<fp16, [1, 4096, 1, 1]> input_19_cast_fp16 = mul(x = x_normed_33_cast_fp16, y = blocks_2_norm_2_weight_to_fp16)[name = string("input_19_cast_fp16")];
-            tensor<int32, [2]> var_682 = const()[name = string("op_682"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_684 = const()[name = string("op_684"), val = tensor<int32, [2]>([1, 1])];
-            string var_686_pad_type_0 = const()[name = string("op_686_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_686_pad_0 = const()[name = string("op_686_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_686_cast_fp16 = conv(dilations = var_684, groups = var_493, pad = var_686_pad_0, pad_type = var_686_pad_type_0, strides = var_682, weight = blocks_2_mlp_fc_1_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_686_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303821376)))];
-            tensor<fp16, [1, 11008, 1, 1]> input_21_cast_fp16 = mul(x = var_686_cast_fp16, y = blocks_2_mlp_fc_1_output_scales_to_fp16)[name = string("input_21_cast_fp16")];
-            tensor<int32, [2]> var_690 = const()[name = string("op_690"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_692 = const()[name = string("op_692"), val = tensor<int32, [2]>([1, 1])];
-            string var_694_pad_type_0 = const()[name = string("op_694_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_694_pad_0 = const()[name = string("op_694_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_694_cast_fp16 = conv(dilations = var_692, groups = var_493, pad = var_694_pad_0, pad_type = var_694_pad_type_0, strides = var_690, weight = blocks_2_mlp_fc_2_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_694_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303843456)))];
-            tensor<fp16, [1, 11008, 1, 1]> x_fc_2_cast_fp16 = mul(x = var_694_cast_fp16, y = blocks_2_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> var_696_cast_fp16 = silu(x = input_21_cast_fp16)[name = string("op_696_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> input_cast_fp16 = mul(x = var_696_cast_fp16, y = x_fc_2_cast_fp16)[name = string("input_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> input_19_cast_fp16 = mul(x = x_normed_33_cast_fp16, y = blocks_2_norm_2_weight_to_fp16)[name = string("input_19_cast_fp16")];
             tensor<int32, [2]> var_700 = const()[name = string("op_700"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_702 = const()[name = string("op_702"), val = tensor<int32, [2]>([1, 1])];
             string var_704_pad_type_0 = const()[name = string("op_704_pad_type_0"), val = string("custom")];
             tensor<int32, [4]> var_704_pad_0 = const()[name = string("op_704_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_704_cast_fp16 = conv(dilations = var_702, groups = var_493, pad = var_704_pad_0, pad_type = var_704_pad_type_0, strides = var_700, weight = blocks_2_mlp_proj_weight_palettized_cast_fp16, x = input_cast_fp16)[name = string("op_704_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> var_704_cast_fp16 = conv(dilations = var_702, groups = var_505, pad = var_704_pad_0, pad_type = var_704_pad_type_0, strides = var_700, weight = blocks_2_mlp_fc_1_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_704_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303821376)))];
+            tensor<fp16, [1, 11008, 1, 4]> input_21_cast_fp16 = mul(x = var_704_cast_fp16, y = blocks_2_mlp_fc_1_output_scales_to_fp16)[name = string("input_21_cast_fp16")];
+            tensor<int32, [2]> var_708 = const()[name = string("op_708"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_710 = const()[name = string("op_710"), val = tensor<int32, [2]>([1, 1])];
+            string var_712_pad_type_0 = const()[name = string("op_712_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_712_pad_0 = const()[name = string("op_712_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_712_cast_fp16 = conv(dilations = var_710, groups = var_505, pad = var_712_pad_0, pad_type = var_712_pad_type_0, strides = var_708, weight = blocks_2_mlp_fc_2_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_712_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303843456)))];
+            tensor<fp16, [1, 11008, 1, 4]> x_fc_2_cast_fp16 = mul(x = var_712_cast_fp16, y = blocks_2_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> var_714_cast_fp16 = silu(x = input_21_cast_fp16)[name = string("op_714_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> input_cast_fp16 = mul(x = var_714_cast_fp16, y = x_fc_2_cast_fp16)[name = string("input_cast_fp16")];
+            tensor<int32, [2]> var_718 = const()[name = string("op_718"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_720 = const()[name = string("op_720"), val = tensor<int32, [2]>([1, 1])];
+            string var_722_pad_type_0 = const()[name = string("op_722_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_722_pad_0 = const()[name = string("op_722_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_722_cast_fp16 = conv(dilations = var_720, groups = var_505, pad = var_722_pad_0, pad_type = var_722_pad_type_0, strides = var_718, weight = blocks_2_mlp_proj_weight_palettized_cast_fp16, x = input_cast_fp16)[name = string("op_722_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303865536)))];
-            tensor<fp16, [1, 4096, 1, 1]> var_705_cast_fp16 = mul(x = var_704_cast_fp16, y = blocks_2_mlp_proj_output_scales_to_fp16)[name = string("op_705_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> new_x = add(x = var_705_cast_fp16, y = x_39_cast_fp16)[name = string("op_706_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> var_723_cast_fp16 = mul(x = var_722_cast_fp16, y = blocks_2_mlp_proj_output_scales_to_fp16)[name = string("op_723_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> new_x = add(x = var_723_cast_fp16, y = x_39_cast_fp16)[name = string("op_724_cast_fp16")];
         } -> (new_x, new_k_cache_0, new_k_cache_1, new_k_cache_2, new_v_cache_0, new_v_cache_1, new_v_cache_2);
     func input_512_context_512<ios18>(tensor<fp16, [128, 512]> cos, tensor<fp16, [1, 1, 512, 512]> mask, tensor<fp16, [128, 512]> sin, tensor<fp16, [1, 4096, 1, 512]> x) {
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_q_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8388736))))[name = string("blocks_0_attn_q_proj_weight_palettized_cast_fp16")];
@@ -502,86 +514,86 @@ program(1.3)
             tensor<fp16, [1, 4096, 1, 512]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_54_to_fp16)[name = string("x_normed_3_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_1_weight_to_fp16 = const()[name = string("blocks_0_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303567936)))];
             tensor<fp16, [1, 4096, 1, 512]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = string("x_5_cast_fp16")];
-            tensor<int32, [2]> var_66 = const()[name = string("op_66"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_68 = const()[name = string("op_68"), val = tensor<int32, [2]>([1, 1])];
-            string var_70_pad_type_0 = const()[name = string("op_70_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_70_pad_0 = const()[name = string("op_70_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_70_cast_fp16 = conv(dilations = var_68, groups = var_25, pad = var_70_pad_0, pad_type = var_70_pad_type_0, strides = var_66, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_70_cast_fp16")];
+            tensor<int32, [2]> var_67 = const()[name = string("op_67"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_69 = const()[name = string("op_69"), val = tensor<int32, [2]>([1, 1])];
+            string var_71_pad_type_0 = const()[name = string("op_71_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_71_pad_0 = const()[name = string("op_71_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_71_cast_fp16 = conv(dilations = var_69, groups = var_25, pad = var_71_pad_0, pad_type = var_71_pad_type_0, strides = var_67, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_71_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303576192)))];
-            tensor<fp16, [1, 4096, 1, 512]> q_1_cast_fp16 = mul(x = var_70_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
-            tensor<int32, [2]> var_74 = const()[name = string("op_74"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_76 = const()[name = string("op_76"), val = tensor<int32, [2]>([1, 1])];
-            string var_78_pad_type_0 = const()[name = string("op_78_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_78_pad_0 = const()[name = string("op_78_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_78_cast_fp16 = conv(dilations = var_76, groups = var_25, pad = var_78_pad_0, pad_type = var_78_pad_type_0, strides = var_74, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_78_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> q_1_cast_fp16 = mul(x = var_71_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
+            tensor<int32, [2]> var_75 = const()[name = string("op_75"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_77 = const()[name = string("op_77"), val = tensor<int32, [2]>([1, 1])];
+            string var_79_pad_type_0 = const()[name = string("op_79_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_79_pad_0 = const()[name = string("op_79_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_79_cast_fp16 = conv(dilations = var_77, groups = var_25, pad = var_79_pad_0, pad_type = var_79_pad_type_0, strides = var_75, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_79_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303584448)))];
-            tensor<fp16, [1, 4096, 1, 512]> k_1_cast_fp16 = mul(x = var_78_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
-            tensor<int32, [2]> var_82 = const()[name = string("op_82"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_84 = const()[name = string("op_84"), val = tensor<int32, [2]>([1, 1])];
-            string var_86_pad_type_0 = const()[name = string("op_86_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_86_pad_0 = const()[name = string("op_86_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_86_cast_fp16 = conv(dilations = var_84, groups = var_25, pad = var_86_pad_0, pad_type = var_86_pad_type_0, strides = var_82, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_86_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> k_1_cast_fp16 = mul(x = var_79_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
+            tensor<int32, [2]> var_83 = const()[name = string("op_83"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_85 = const()[name = string("op_85"), val = tensor<int32, [2]>([1, 1])];
+            string var_87_pad_type_0 = const()[name = string("op_87_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_87_pad_0 = const()[name = string("op_87_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_87_cast_fp16 = conv(dilations = var_85, groups = var_25, pad = var_87_pad_0, pad_type = var_87_pad_type_0, strides = var_83, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_87_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303592704)))];
-            tensor<fp16, [1, 4096, 1, 512]> v_1_cast_fp16 = mul(x = var_86_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
-            tensor<int32, [4]> var_88 = const()[name = string("op_88"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> q_3_cast_fp16 = reshape(shape = var_88, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
-            tensor<int32, [4]> var_90 = const()[name = string("op_90"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> k_3_cast_fp16 = reshape(shape = var_90, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
-            tensor<int32, [4]> var_92 = const()[name = string("op_92"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> v_3_cast_fp16 = reshape(shape = var_92, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
-            tensor<int32, [4]> var_104_begin_0 = const()[name = string("op_104_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_104_end_0 = const()[name = string("op_104_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_104_end_mask_0 = const()[name = string("op_104_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_104_cast_fp16 = slice_by_index(begin = var_104_begin_0, end = var_104_end_0, end_mask = var_104_end_mask_0, x = q_3_cast_fp16)[name = string("op_104_cast_fp16")];
-            tensor<int32, [4]> var_110_begin_0 = const()[name = string("op_110_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_110_end_0 = const()[name = string("op_110_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_110_end_mask_0 = const()[name = string("op_110_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_110_cast_fp16 = slice_by_index(begin = var_110_begin_0, end = var_110_end_0, end_mask = var_110_end_mask_0, x = q_3_cast_fp16)[name = string("op_110_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> v_1_cast_fp16 = mul(x = var_87_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
+            tensor<int32, [4]> var_89 = const()[name = string("op_89"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> q_3_cast_fp16 = reshape(shape = var_89, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
+            tensor<int32, [4]> var_91 = const()[name = string("op_91"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> k_3_cast_fp16 = reshape(shape = var_91, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
+            tensor<int32, [4]> var_93 = const()[name = string("op_93"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> v_3_cast_fp16 = reshape(shape = var_93, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
+            tensor<int32, [4]> var_105_begin_0 = const()[name = string("op_105_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_105_end_0 = const()[name = string("op_105_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_105_end_mask_0 = const()[name = string("op_105_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_105_cast_fp16 = slice_by_index(begin = var_105_begin_0, end = var_105_end_0, end_mask = var_105_end_mask_0, x = q_3_cast_fp16)[name = string("op_105_cast_fp16")];
+            tensor<int32, [4]> var_111_begin_0 = const()[name = string("op_111_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_111_end_0 = const()[name = string("op_111_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_111_end_mask_0 = const()[name = string("op_111_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_111_cast_fp16 = slice_by_index(begin = var_111_begin_0, end = var_111_end_0, end_mask = var_111_end_mask_0, x = q_3_cast_fp16)[name = string("op_111_cast_fp16")];
             fp16 const_6_promoted_to_fp16 = const()[name = string("const_6_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_112_cast_fp16 = mul(x = var_110_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_112_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_113_cast_fp16 = mul(x = var_111_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_113_cast_fp16")];
             bool rotated_1_interleave_0 = const()[name = string("rotated_1_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_1_cast_fp16 = concat(axis = var_28, interleave = rotated_1_interleave_0, values = (var_112_cast_fp16, var_104_cast_fp16))[name = string("rotated_1_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_115_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_115_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_116_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_116_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_1_cast_fp16 = add(x = var_115_cast_fp16, y = var_116_cast_fp16)[name = string("roped_1_cast_fp16")];
-            tensor<int32, [4]> var_129_begin_0 = const()[name = string("op_129_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_129_end_0 = const()[name = string("op_129_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_129_end_mask_0 = const()[name = string("op_129_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_129_cast_fp16 = slice_by_index(begin = var_129_begin_0, end = var_129_end_0, end_mask = var_129_end_mask_0, x = k_3_cast_fp16)[name = string("op_129_cast_fp16")];
-            tensor<int32, [4]> var_135_begin_0 = const()[name = string("op_135_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_135_end_0 = const()[name = string("op_135_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_135_end_mask_0 = const()[name = string("op_135_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_135_cast_fp16 = slice_by_index(begin = var_135_begin_0, end = var_135_end_0, end_mask = var_135_end_mask_0, x = k_3_cast_fp16)[name = string("op_135_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_1_cast_fp16 = concat(axis = var_28, interleave = rotated_1_interleave_0, values = (var_113_cast_fp16, var_105_cast_fp16))[name = string("rotated_1_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_116_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_116_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_117_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_117_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_1_cast_fp16 = add(x = var_116_cast_fp16, y = var_117_cast_fp16)[name = string("roped_1_cast_fp16")];
+            tensor<int32, [4]> var_130_begin_0 = const()[name = string("op_130_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_130_end_0 = const()[name = string("op_130_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_130_end_mask_0 = const()[name = string("op_130_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_130_cast_fp16 = slice_by_index(begin = var_130_begin_0, end = var_130_end_0, end_mask = var_130_end_mask_0, x = k_3_cast_fp16)[name = string("op_130_cast_fp16")];
+            tensor<int32, [4]> var_136_begin_0 = const()[name = string("op_136_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_136_end_0 = const()[name = string("op_136_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_136_end_mask_0 = const()[name = string("op_136_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_136_cast_fp16 = slice_by_index(begin = var_136_begin_0, end = var_136_end_0, end_mask = var_136_end_mask_0, x = k_3_cast_fp16)[name = string("op_136_cast_fp16")];
             fp16 const_8_promoted_to_fp16 = const()[name = string("const_8_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_137_cast_fp16 = mul(x = var_135_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_137_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_138_cast_fp16 = mul(x = var_136_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_138_cast_fp16")];
             bool rotated_3_interleave_0 = const()[name = string("rotated_3_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_3_cast_fp16 = concat(axis = var_28, interleave = rotated_3_interleave_0, values = (var_137_cast_fp16, var_129_cast_fp16))[name = string("rotated_3_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_140_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_140_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_141_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_141_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_3_cast_fp16 = add(x = var_140_cast_fp16, y = var_141_cast_fp16)[name = string("roped_3_cast_fp16")];
-            bool q_5_interleave_0 = const()[name = string("q_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> q_5_cast_fp16 = concat(axis = var_28, interleave = q_5_interleave_0, values = roped_1_cast_fp16)[name = string("q_5_cast_fp16")];
-            bool k_5_interleave_0 = const()[name = string("k_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_5_cast_fp16 = concat(axis = var_28, interleave = k_5_interleave_0, values = roped_3_cast_fp16)[name = string("k_5_cast_fp16")];
-            tensor<int32, [4]> var_156_begin_0 = const()[name = string("op_156_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_156_end_0 = const()[name = string("op_156_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_156_end_mask_0 = const()[name = string("op_156_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_0 = slice_by_index(begin = var_156_begin_0, end = var_156_end_0, end_mask = var_156_end_mask_0, x = k_5_cast_fp16)[name = string("op_156_cast_fp16")];
-            tensor<int32, [4]> var_157_begin_0 = const()[name = string("op_157_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_157_end_0 = const()[name = string("op_157_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_157_end_mask_0 = const()[name = string("op_157_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_0 = slice_by_index(begin = var_157_begin_0, end = var_157_end_0, end_mask = var_157_end_mask_0, x = v_3_cast_fp16)[name = string("op_157_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_3_cast_fp16 = concat(axis = var_28, interleave = rotated_3_interleave_0, values = (var_138_cast_fp16, var_130_cast_fp16))[name = string("rotated_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_141_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_141_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_142_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_142_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_3_cast_fp16 = add(x = var_141_cast_fp16, y = var_142_cast_fp16)[name = string("roped_3_cast_fp16")];
+            tensor<int32, [4]> v_5_perm_0 = const()[name = string("v_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_146_begin_0 = const()[name = string("op_146_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_146_end_0 = const()[name = string("op_146_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_146_end_mask_0 = const()[name = string("op_146_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_0 = slice_by_index(begin = var_146_begin_0, end = var_146_end_0, end_mask = var_146_end_mask_0, x = roped_3_cast_fp16)[name = string("op_146_cast_fp16")];
+            tensor<int32, [4]> var_147_begin_0 = const()[name = string("op_147_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_147_end_0 = const()[name = string("op_147_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_147_end_mask_0 = const()[name = string("op_147_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 512, 128]> v_5_cast_fp16 = transpose(perm = v_5_perm_0, x = v_3_cast_fp16)[name = string("transpose_5")];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_0 = slice_by_index(begin = var_147_begin_0, end = var_147_end_0, end_mask = var_147_end_mask_0, x = v_5_cast_fp16)[name = string("op_147_cast_fp16")];
             fp16 var_161_to_fp16 = const()[name = string("op_161_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 512]> var_162_cast_fp16 = mul(x = q_5_cast_fp16, y = var_161_to_fp16)[name = string("op_162_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_162_cast_fp16 = mul(x = roped_1_cast_fp16, y = var_161_to_fp16)[name = string("op_162_cast_fp16")];
             bool attn_weights_1_transpose_x_0 = const()[name = string("attn_weights_1_transpose_x_0"), val = bool(true)];
             bool attn_weights_1_transpose_y_0 = const()[name = string("attn_weights_1_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_162_cast_fp16, y = k_5_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_162_cast_fp16, y = roped_3_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
             tensor<fp16, [1, 32, 512, 512]> attn_weights_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = mask)[name = string("attn_weights_3_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> var_170_cast_fp16 = softmax(axis = var_24, x = attn_weights_3_cast_fp16)[name = string("op_170_cast_fp16")];
-            bool attn_1_transpose_x_0 = const()[name = string("attn_1_transpose_x_0"), val = bool(false)];
-            bool attn_1_transpose_y_0 = const()[name = string("attn_1_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 512]> attn_1_cast_fp16 = matmul(transpose_x = attn_1_transpose_x_0, transpose_y = attn_1_transpose_y_0, x = v_3_cast_fp16, y = var_170_cast_fp16)[name = string("attn_1_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_5_cast_fp16 = softmax(axis = var_24, x = attn_weights_3_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
+            bool var_171_transpose_x_1 = const()[name = string("op_171_transpose_x_1"), val = bool(false)];
+            bool var_171_transpose_y_1 = const()[name = string("op_171_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 32, 512, 128]> var_171_cast_fp16 = matmul(transpose_x = var_171_transpose_x_1, transpose_y = var_171_transpose_y_1, x = attn_weights_5_cast_fp16, y = v_3_cast_fp16)[name = string("op_171_cast_fp16")];
+            tensor<int32, [4]> attn_1_perm_0 = const()[name = string("attn_1_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             tensor<int32, [4]> var_174 = const()[name = string("op_174"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 512]> attn_1_cast_fp16 = transpose(perm = attn_1_perm_0, x = var_171_cast_fp16)[name = string("transpose_4")];
             tensor<fp16, [1, 4096, 1, 512]> input_1_cast_fp16 = reshape(shape = var_174, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
             tensor<int32, [2]> var_178 = const()[name = string("op_178"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_180 = const()[name = string("op_180"), val = tensor<int32, [2]>([1, 1])];
@@ -645,86 +657,86 @@ program(1.3)
             tensor<fp16, [1, 4096, 1, 512]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_291_to_fp16)[name = string("x_normed_15_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_1_weight_to_fp16 = const()[name = string("blocks_1_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303669888)))];
             tensor<fp16, [1, 4096, 1, 512]> x_19_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = string("x_19_cast_fp16")];
-            tensor<int32, [2]> var_306 = const()[name = string("op_306"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_308 = const()[name = string("op_308"), val = tensor<int32, [2]>([1, 1])];
-            string var_310_pad_type_0 = const()[name = string("op_310_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_310_pad_0 = const()[name = string("op_310_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_310_cast_fp16 = conv(dilations = var_308, groups = var_263, pad = var_310_pad_0, pad_type = var_310_pad_type_0, strides = var_306, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_310_cast_fp16")];
+            tensor<int32, [2]> var_307 = const()[name = string("op_307"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_309 = const()[name = string("op_309"), val = tensor<int32, [2]>([1, 1])];
+            string var_311_pad_type_0 = const()[name = string("op_311_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_311_pad_0 = const()[name = string("op_311_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_311_cast_fp16 = conv(dilations = var_309, groups = var_263, pad = var_311_pad_0, pad_type = var_311_pad_type_0, strides = var_307, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_311_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303678144)))];
-            tensor<fp16, [1, 4096, 1, 512]> q_7_cast_fp16 = mul(x = var_310_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
-            tensor<int32, [2]> var_314 = const()[name = string("op_314"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_316 = const()[name = string("op_316"), val = tensor<int32, [2]>([1, 1])];
-            string var_318_pad_type_0 = const()[name = string("op_318_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_318_pad_0 = const()[name = string("op_318_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_318_cast_fp16 = conv(dilations = var_316, groups = var_263, pad = var_318_pad_0, pad_type = var_318_pad_type_0, strides = var_314, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_318_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> q_7_cast_fp16 = mul(x = var_311_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
+            tensor<int32, [2]> var_315 = const()[name = string("op_315"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_317 = const()[name = string("op_317"), val = tensor<int32, [2]>([1, 1])];
+            string var_319_pad_type_0 = const()[name = string("op_319_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_319_pad_0 = const()[name = string("op_319_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_319_cast_fp16 = conv(dilations = var_317, groups = var_263, pad = var_319_pad_0, pad_type = var_319_pad_type_0, strides = var_315, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_319_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303686400)))];
-            tensor<fp16, [1, 4096, 1, 512]> k_7_cast_fp16 = mul(x = var_318_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_7_cast_fp16")];
-            tensor<int32, [2]> var_322 = const()[name = string("op_322"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_324 = const()[name = string("op_324"), val = tensor<int32, [2]>([1, 1])];
-            string var_326_pad_type_0 = const()[name = string("op_326_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_326_pad_0 = const()[name = string("op_326_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_326_cast_fp16 = conv(dilations = var_324, groups = var_263, pad = var_326_pad_0, pad_type = var_326_pad_type_0, strides = var_322, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_326_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> k_7_cast_fp16 = mul(x = var_319_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_7_cast_fp16")];
+            tensor<int32, [2]> var_323 = const()[name = string("op_323"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_325 = const()[name = string("op_325"), val = tensor<int32, [2]>([1, 1])];
+            string var_327_pad_type_0 = const()[name = string("op_327_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_327_pad_0 = const()[name = string("op_327_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_327_cast_fp16 = conv(dilations = var_325, groups = var_263, pad = var_327_pad_0, pad_type = var_327_pad_type_0, strides = var_323, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_327_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303694656)))];
-            tensor<fp16, [1, 4096, 1, 512]> v_5_cast_fp16 = mul(x = var_326_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_5_cast_fp16")];
-            tensor<int32, [4]> var_328 = const()[name = string("op_328"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> q_9_cast_fp16 = reshape(shape = var_328, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
-            tensor<int32, [4]> var_330 = const()[name = string("op_330"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> k_9_cast_fp16 = reshape(shape = var_330, x = k_7_cast_fp16)[name = string("k_9_cast_fp16")];
-            tensor<int32, [4]> var_332 = const()[name = string("op_332"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> v_7_cast_fp16 = reshape(shape = var_332, x = v_5_cast_fp16)[name = string("v_7_cast_fp16")];
-            tensor<int32, [4]> var_344_begin_0 = const()[name = string("op_344_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_344_end_0 = const()[name = string("op_344_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_344_end_mask_0 = const()[name = string("op_344_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_344_cast_fp16 = slice_by_index(begin = var_344_begin_0, end = var_344_end_0, end_mask = var_344_end_mask_0, x = q_9_cast_fp16)[name = string("op_344_cast_fp16")];
-            tensor<int32, [4]> var_350_begin_0 = const()[name = string("op_350_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_350_end_0 = const()[name = string("op_350_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_350_end_mask_0 = const()[name = string("op_350_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_350_cast_fp16 = slice_by_index(begin = var_350_begin_0, end = var_350_end_0, end_mask = var_350_end_mask_0, x = q_9_cast_fp16)[name = string("op_350_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> v_7_cast_fp16 = mul(x = var_327_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_7_cast_fp16")];
+            tensor<int32, [4]> var_329 = const()[name = string("op_329"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> q_9_cast_fp16 = reshape(shape = var_329, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
+            tensor<int32, [4]> var_331 = const()[name = string("op_331"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> k_9_cast_fp16 = reshape(shape = var_331, x = k_7_cast_fp16)[name = string("k_9_cast_fp16")];
+            tensor<int32, [4]> var_333 = const()[name = string("op_333"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> v_9_cast_fp16 = reshape(shape = var_333, x = v_7_cast_fp16)[name = string("v_9_cast_fp16")];
+            tensor<int32, [4]> var_345_begin_0 = const()[name = string("op_345_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_345_end_0 = const()[name = string("op_345_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_345_end_mask_0 = const()[name = string("op_345_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_345_cast_fp16 = slice_by_index(begin = var_345_begin_0, end = var_345_end_0, end_mask = var_345_end_mask_0, x = q_9_cast_fp16)[name = string("op_345_cast_fp16")];
+            tensor<int32, [4]> var_351_begin_0 = const()[name = string("op_351_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_351_end_0 = const()[name = string("op_351_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_351_end_mask_0 = const()[name = string("op_351_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_351_cast_fp16 = slice_by_index(begin = var_351_begin_0, end = var_351_end_0, end_mask = var_351_end_mask_0, x = q_9_cast_fp16)[name = string("op_351_cast_fp16")];
             fp16 const_19_promoted_to_fp16 = const()[name = string("const_19_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_352_cast_fp16 = mul(x = var_350_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_352_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_353_cast_fp16 = mul(x = var_351_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_353_cast_fp16")];
             bool rotated_5_interleave_0 = const()[name = string("rotated_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_5_cast_fp16 = concat(axis = var_266, interleave = rotated_5_interleave_0, values = (var_352_cast_fp16, var_344_cast_fp16))[name = string("rotated_5_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_355_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_355_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_356_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_356_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_5_cast_fp16 = add(x = var_355_cast_fp16, y = var_356_cast_fp16)[name = string("roped_5_cast_fp16")];
-            tensor<int32, [4]> var_369_begin_0 = const()[name = string("op_369_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_369_end_0 = const()[name = string("op_369_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_369_end_mask_0 = const()[name = string("op_369_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_369_cast_fp16 = slice_by_index(begin = var_369_begin_0, end = var_369_end_0, end_mask = var_369_end_mask_0, x = k_9_cast_fp16)[name = string("op_369_cast_fp16")];
-            tensor<int32, [4]> var_375_begin_0 = const()[name = string("op_375_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_375_end_0 = const()[name = string("op_375_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_375_end_mask_0 = const()[name = string("op_375_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_375_cast_fp16 = slice_by_index(begin = var_375_begin_0, end = var_375_end_0, end_mask = var_375_end_mask_0, x = k_9_cast_fp16)[name = string("op_375_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_5_cast_fp16 = concat(axis = var_266, interleave = rotated_5_interleave_0, values = (var_353_cast_fp16, var_345_cast_fp16))[name = string("rotated_5_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_356_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_356_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_357_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_357_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_5_cast_fp16 = add(x = var_356_cast_fp16, y = var_357_cast_fp16)[name = string("roped_5_cast_fp16")];
+            tensor<int32, [4]> var_370_begin_0 = const()[name = string("op_370_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_370_end_0 = const()[name = string("op_370_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_370_end_mask_0 = const()[name = string("op_370_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_370_cast_fp16 = slice_by_index(begin = var_370_begin_0, end = var_370_end_0, end_mask = var_370_end_mask_0, x = k_9_cast_fp16)[name = string("op_370_cast_fp16")];
+            tensor<int32, [4]> var_376_begin_0 = const()[name = string("op_376_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_376_end_0 = const()[name = string("op_376_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_376_end_mask_0 = const()[name = string("op_376_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_376_cast_fp16 = slice_by_index(begin = var_376_begin_0, end = var_376_end_0, end_mask = var_376_end_mask_0, x = k_9_cast_fp16)[name = string("op_376_cast_fp16")];
             fp16 const_21_promoted_to_fp16 = const()[name = string("const_21_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_377_cast_fp16 = mul(x = var_375_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_377_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_378_cast_fp16 = mul(x = var_376_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_378_cast_fp16")];
             bool rotated_7_interleave_0 = const()[name = string("rotated_7_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_7_cast_fp16 = concat(axis = var_266, interleave = rotated_7_interleave_0, values = (var_377_cast_fp16, var_369_cast_fp16))[name = string("rotated_7_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_380_cast_fp16 = mul(x = k_9_cast_fp16, y = cos)[name = string("op_380_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_381_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = string("op_381_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_7_cast_fp16 = add(x = var_380_cast_fp16, y = var_381_cast_fp16)[name = string("roped_7_cast_fp16")];
-            bool q_11_interleave_0 = const()[name = string("q_11_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> q_11_cast_fp16 = concat(axis = var_266, interleave = q_11_interleave_0, values = roped_5_cast_fp16)[name = string("q_11_cast_fp16")];
-            bool k_11_interleave_0 = const()[name = string("k_11_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_11_cast_fp16 = concat(axis = var_266, interleave = k_11_interleave_0, values = roped_7_cast_fp16)[name = string("k_11_cast_fp16")];
-            tensor<int32, [4]> var_396_begin_0 = const()[name = string("op_396_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_396_end_0 = const()[name = string("op_396_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_396_end_mask_0 = const()[name = string("op_396_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_1 = slice_by_index(begin = var_396_begin_0, end = var_396_end_0, end_mask = var_396_end_mask_0, x = k_11_cast_fp16)[name = string("op_396_cast_fp16")];
-            tensor<int32, [4]> var_397_begin_0 = const()[name = string("op_397_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_397_end_0 = const()[name = string("op_397_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_397_end_mask_0 = const()[name = string("op_397_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_1 = slice_by_index(begin = var_397_begin_0, end = var_397_end_0, end_mask = var_397_end_mask_0, x = v_7_cast_fp16)[name = string("op_397_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_7_cast_fp16 = concat(axis = var_266, interleave = rotated_7_interleave_0, values = (var_378_cast_fp16, var_370_cast_fp16))[name = string("rotated_7_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_381_cast_fp16 = mul(x = k_9_cast_fp16, y = cos)[name = string("op_381_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_382_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = string("op_382_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_7_cast_fp16 = add(x = var_381_cast_fp16, y = var_382_cast_fp16)[name = string("roped_7_cast_fp16")];
+            tensor<int32, [4]> v_11_perm_0 = const()[name = string("v_11_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_386_begin_0 = const()[name = string("op_386_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_386_end_0 = const()[name = string("op_386_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_386_end_mask_0 = const()[name = string("op_386_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_1 = slice_by_index(begin = var_386_begin_0, end = var_386_end_0, end_mask = var_386_end_mask_0, x = roped_7_cast_fp16)[name = string("op_386_cast_fp16")];
+            tensor<int32, [4]> var_387_begin_0 = const()[name = string("op_387_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_387_end_0 = const()[name = string("op_387_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_387_end_mask_0 = const()[name = string("op_387_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 512, 128]> v_11_cast_fp16 = transpose(perm = v_11_perm_0, x = v_9_cast_fp16)[name = string("transpose_3")];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_1 = slice_by_index(begin = var_387_begin_0, end = var_387_end_0, end_mask = var_387_end_mask_0, x = v_11_cast_fp16)[name = string("op_387_cast_fp16")];
             fp16 var_401_to_fp16 = const()[name = string("op_401_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 512]> var_402_cast_fp16 = mul(x = q_11_cast_fp16, y = var_401_to_fp16)[name = string("op_402_cast_fp16")];
-            bool attn_weights_5_transpose_x_0 = const()[name = string("attn_weights_5_transpose_x_0"), val = bool(true)];
-            bool attn_weights_5_transpose_y_0 = const()[name = string("attn_weights_5_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_5_cast_fp16 = matmul(transpose_x = attn_weights_5_transpose_x_0, transpose_y = attn_weights_5_transpose_y_0, x = var_402_cast_fp16, y = k_11_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_7_cast_fp16 = add(x = attn_weights_5_cast_fp16, y = mask)[name = string("attn_weights_7_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> var_410_cast_fp16 = softmax(axis = var_262, x = attn_weights_7_cast_fp16)[name = string("op_410_cast_fp16")];
-            bool attn_3_transpose_x_0 = const()[name = string("attn_3_transpose_x_0"), val = bool(false)];
-            bool attn_3_transpose_y_0 = const()[name = string("attn_3_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 512]> attn_3_cast_fp16 = matmul(transpose_x = attn_3_transpose_x_0, transpose_y = attn_3_transpose_y_0, x = v_7_cast_fp16, y = var_410_cast_fp16)[name = string("attn_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_402_cast_fp16 = mul(x = roped_5_cast_fp16, y = var_401_to_fp16)[name = string("op_402_cast_fp16")];
+            bool attn_weights_7_transpose_x_0 = const()[name = string("attn_weights_7_transpose_x_0"), val = bool(true)];
+            bool attn_weights_7_transpose_y_0 = const()[name = string("attn_weights_7_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_7_cast_fp16 = matmul(transpose_x = attn_weights_7_transpose_x_0, transpose_y = attn_weights_7_transpose_y_0, x = var_402_cast_fp16, y = roped_7_cast_fp16)[name = string("attn_weights_7_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_9_cast_fp16 = add(x = attn_weights_7_cast_fp16, y = mask)[name = string("attn_weights_9_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_11_cast_fp16 = softmax(axis = var_262, x = attn_weights_9_cast_fp16)[name = string("attn_weights_11_cast_fp16")];
+            bool var_411_transpose_x_1 = const()[name = string("op_411_transpose_x_1"), val = bool(false)];
+            bool var_411_transpose_y_1 = const()[name = string("op_411_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 32, 512, 128]> var_411_cast_fp16 = matmul(transpose_x = var_411_transpose_x_1, transpose_y = var_411_transpose_y_1, x = attn_weights_11_cast_fp16, y = v_9_cast_fp16)[name = string("op_411_cast_fp16")];
+            tensor<int32, [4]> attn_3_perm_0 = const()[name = string("attn_3_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             tensor<int32, [4]> var_414 = const()[name = string("op_414"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 512]> attn_3_cast_fp16 = transpose(perm = attn_3_perm_0, x = var_411_cast_fp16)[name = string("transpose_2")];
             tensor<fp16, [1, 4096, 1, 512]> input_9_cast_fp16 = reshape(shape = var_414, x = attn_3_cast_fp16)[name = string("input_9_cast_fp16")];
             tensor<int32, [2]> var_418 = const()[name = string("op_418"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_420 = const()[name = string("op_420"), val = tensor<int32, [2]>([1, 1])];
@@ -788,86 +800,86 @@ program(1.3)
             tensor<fp16, [1, 4096, 1, 512]> x_normed_27_cast_fp16 = mul(x = x_normed_25_cast_fp16, y = var_531_to_fp16)[name = string("x_normed_27_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_norm_1_weight_to_fp16 = const()[name = string("blocks_2_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303771840)))];
             tensor<fp16, [1, 4096, 1, 512]> x_33_cast_fp16 = mul(x = x_normed_27_cast_fp16, y = blocks_2_norm_1_weight_to_fp16)[name = string("x_33_cast_fp16")];
-            tensor<int32, [2]> var_546 = const()[name = string("op_546"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_548 = const()[name = string("op_548"), val = tensor<int32, [2]>([1, 1])];
-            string var_550_pad_type_0 = const()[name = string("op_550_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_550_pad_0 = const()[name = string("op_550_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_550_cast_fp16 = conv(dilations = var_548, groups = var_503, pad = var_550_pad_0, pad_type = var_550_pad_type_0, strides = var_546, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_550_cast_fp16")];
+            tensor<int32, [2]> var_547 = const()[name = string("op_547"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_549 = const()[name = string("op_549"), val = tensor<int32, [2]>([1, 1])];
+            string var_551_pad_type_0 = const()[name = string("op_551_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_551_pad_0 = const()[name = string("op_551_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_551_cast_fp16 = conv(dilations = var_549, groups = var_503, pad = var_551_pad_0, pad_type = var_551_pad_type_0, strides = var_547, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_551_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303780096)))];
-            tensor<fp16, [1, 4096, 1, 512]> q_13_cast_fp16 = mul(x = var_550_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = string("q_13_cast_fp16")];
-            tensor<int32, [2]> var_554 = const()[name = string("op_554"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_556 = const()[name = string("op_556"), val = tensor<int32, [2]>([1, 1])];
-            string var_558_pad_type_0 = const()[name = string("op_558_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_558_pad_0 = const()[name = string("op_558_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_558_cast_fp16 = conv(dilations = var_556, groups = var_503, pad = var_558_pad_0, pad_type = var_558_pad_type_0, strides = var_554, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_558_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> q_13_cast_fp16 = mul(x = var_551_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = string("q_13_cast_fp16")];
+            tensor<int32, [2]> var_555 = const()[name = string("op_555"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_557 = const()[name = string("op_557"), val = tensor<int32, [2]>([1, 1])];
+            string var_559_pad_type_0 = const()[name = string("op_559_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_559_pad_0 = const()[name = string("op_559_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_559_cast_fp16 = conv(dilations = var_557, groups = var_503, pad = var_559_pad_0, pad_type = var_559_pad_type_0, strides = var_555, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_559_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303788352)))];
-            tensor<fp16, [1, 4096, 1, 512]> k_13_cast_fp16 = mul(x = var_558_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = string("k_13_cast_fp16")];
-            tensor<int32, [2]> var_562 = const()[name = string("op_562"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_564 = const()[name = string("op_564"), val = tensor<int32, [2]>([1, 1])];
-            string var_566_pad_type_0 = const()[name = string("op_566_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_566_pad_0 = const()[name = string("op_566_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_566_cast_fp16 = conv(dilations = var_564, groups = var_503, pad = var_566_pad_0, pad_type = var_566_pad_type_0, strides = var_562, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_566_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> k_13_cast_fp16 = mul(x = var_559_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = string("k_13_cast_fp16")];
+            tensor<int32, [2]> var_563 = const()[name = string("op_563"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_565 = const()[name = string("op_565"), val = tensor<int32, [2]>([1, 1])];
+            string var_567_pad_type_0 = const()[name = string("op_567_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_567_pad_0 = const()[name = string("op_567_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_567_cast_fp16 = conv(dilations = var_565, groups = var_503, pad = var_567_pad_0, pad_type = var_567_pad_type_0, strides = var_563, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_567_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303796608)))];
-            tensor<fp16, [1, 4096, 1, 512]> v_9_cast_fp16 = mul(x = var_566_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = string("v_9_cast_fp16")];
-            tensor<int32, [4]> var_568 = const()[name = string("op_568"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> q_15_cast_fp16 = reshape(shape = var_568, x = q_13_cast_fp16)[name = string("q_15_cast_fp16")];
-            tensor<int32, [4]> var_570 = const()[name = string("op_570"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = reshape(shape = var_570, x = k_13_cast_fp16)[name = string("k_15_cast_fp16")];
-            tensor<int32, [4]> var_572 = const()[name = string("op_572"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> v_cast_fp16 = reshape(shape = var_572, x = v_9_cast_fp16)[name = string("v_cast_fp16")];
-            tensor<int32, [4]> var_584_begin_0 = const()[name = string("op_584_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_584_end_0 = const()[name = string("op_584_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_584_end_mask_0 = const()[name = string("op_584_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_584_cast_fp16 = slice_by_index(begin = var_584_begin_0, end = var_584_end_0, end_mask = var_584_end_mask_0, x = q_15_cast_fp16)[name = string("op_584_cast_fp16")];
-            tensor<int32, [4]> var_590_begin_0 = const()[name = string("op_590_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_590_end_0 = const()[name = string("op_590_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_590_end_mask_0 = const()[name = string("op_590_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_590_cast_fp16 = slice_by_index(begin = var_590_begin_0, end = var_590_end_0, end_mask = var_590_end_mask_0, x = q_15_cast_fp16)[name = string("op_590_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> v_13_cast_fp16 = mul(x = var_567_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = string("v_13_cast_fp16")];
+            tensor<int32, [4]> var_569 = const()[name = string("op_569"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> q_15_cast_fp16 = reshape(shape = var_569, x = q_13_cast_fp16)[name = string("q_15_cast_fp16")];
+            tensor<int32, [4]> var_571 = const()[name = string("op_571"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = reshape(shape = var_571, x = k_13_cast_fp16)[name = string("k_15_cast_fp16")];
+            tensor<int32, [4]> var_573 = const()[name = string("op_573"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> v_15_cast_fp16 = reshape(shape = var_573, x = v_13_cast_fp16)[name = string("v_15_cast_fp16")];
+            tensor<int32, [4]> var_585_begin_0 = const()[name = string("op_585_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_585_end_0 = const()[name = string("op_585_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_585_end_mask_0 = const()[name = string("op_585_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_585_cast_fp16 = slice_by_index(begin = var_585_begin_0, end = var_585_end_0, end_mask = var_585_end_mask_0, x = q_15_cast_fp16)[name = string("op_585_cast_fp16")];
+            tensor<int32, [4]> var_591_begin_0 = const()[name = string("op_591_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_591_end_0 = const()[name = string("op_591_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_591_end_mask_0 = const()[name = string("op_591_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_591_cast_fp16 = slice_by_index(begin = var_591_begin_0, end = var_591_end_0, end_mask = var_591_end_mask_0, x = q_15_cast_fp16)[name = string("op_591_cast_fp16")];
             fp16 const_32_promoted_to_fp16 = const()[name = string("const_32_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_592_cast_fp16 = mul(x = var_590_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_592_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_593_cast_fp16 = mul(x = var_591_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_593_cast_fp16")];
             bool rotated_9_interleave_0 = const()[name = string("rotated_9_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_9_cast_fp16 = concat(axis = var_506, interleave = rotated_9_interleave_0, values = (var_592_cast_fp16, var_584_cast_fp16))[name = string("rotated_9_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_595_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = string("op_595_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_596_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = string("op_596_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_9_cast_fp16 = add(x = var_595_cast_fp16, y = var_596_cast_fp16)[name = string("roped_9_cast_fp16")];
-            tensor<int32, [4]> var_609_begin_0 = const()[name = string("op_609_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_609_end_0 = const()[name = string("op_609_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_609_end_mask_0 = const()[name = string("op_609_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_609_cast_fp16 = slice_by_index(begin = var_609_begin_0, end = var_609_end_0, end_mask = var_609_end_mask_0, x = k_15_cast_fp16)[name = string("op_609_cast_fp16")];
-            tensor<int32, [4]> var_615_begin_0 = const()[name = string("op_615_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_615_end_0 = const()[name = string("op_615_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_615_end_mask_0 = const()[name = string("op_615_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_615_cast_fp16 = slice_by_index(begin = var_615_begin_0, end = var_615_end_0, end_mask = var_615_end_mask_0, x = k_15_cast_fp16)[name = string("op_615_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_9_cast_fp16 = concat(axis = var_506, interleave = rotated_9_interleave_0, values = (var_593_cast_fp16, var_585_cast_fp16))[name = string("rotated_9_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_596_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = string("op_596_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_597_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = string("op_597_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_9_cast_fp16 = add(x = var_596_cast_fp16, y = var_597_cast_fp16)[name = string("roped_9_cast_fp16")];
+            tensor<int32, [4]> var_610_begin_0 = const()[name = string("op_610_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_610_end_0 = const()[name = string("op_610_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_610_end_mask_0 = const()[name = string("op_610_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_610_cast_fp16 = slice_by_index(begin = var_610_begin_0, end = var_610_end_0, end_mask = var_610_end_mask_0, x = k_15_cast_fp16)[name = string("op_610_cast_fp16")];
+            tensor<int32, [4]> var_616_begin_0 = const()[name = string("op_616_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_616_end_0 = const()[name = string("op_616_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_616_end_mask_0 = const()[name = string("op_616_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_616_cast_fp16 = slice_by_index(begin = var_616_begin_0, end = var_616_end_0, end_mask = var_616_end_mask_0, x = k_15_cast_fp16)[name = string("op_616_cast_fp16")];
             fp16 const_34_promoted_to_fp16 = const()[name = string("const_34_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_617_cast_fp16 = mul(x = var_615_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_617_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_618_cast_fp16 = mul(x = var_616_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_618_cast_fp16")];
             bool rotated_interleave_0 = const()[name = string("rotated_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_cast_fp16 = concat(axis = var_506, interleave = rotated_interleave_0, values = (var_617_cast_fp16, var_609_cast_fp16))[name = string("rotated_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_620_cast_fp16 = mul(x = k_15_cast_fp16, y = cos)[name = string("op_620_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_621_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_621_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_cast_fp16 = add(x = var_620_cast_fp16, y = var_621_cast_fp16)[name = string("roped_cast_fp16")];
-            bool q_interleave_0 = const()[name = string("q_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> q_cast_fp16 = concat(axis = var_506, interleave = q_interleave_0, values = roped_9_cast_fp16)[name = string("q_cast_fp16")];
-            bool k_interleave_0 = const()[name = string("k_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_cast_fp16 = concat(axis = var_506, interleave = k_interleave_0, values = roped_cast_fp16)[name = string("k_cast_fp16")];
-            tensor<int32, [4]> var_636_begin_0 = const()[name = string("op_636_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_636_end_0 = const()[name = string("op_636_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_636_end_mask_0 = const()[name = string("op_636_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_2 = slice_by_index(begin = var_636_begin_0, end = var_636_end_0, end_mask = var_636_end_mask_0, x = k_cast_fp16)[name = string("op_636_cast_fp16")];
-            tensor<int32, [4]> var_637_begin_0 = const()[name = string("op_637_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_637_end_0 = const()[name = string("op_637_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_637_end_mask_0 = const()[name = string("op_637_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_2 = slice_by_index(begin = var_637_begin_0, end = var_637_end_0, end_mask = var_637_end_mask_0, x = v_cast_fp16)[name = string("op_637_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_cast_fp16 = concat(axis = var_506, interleave = rotated_interleave_0, values = (var_618_cast_fp16, var_610_cast_fp16))[name = string("rotated_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_621_cast_fp16 = mul(x = k_15_cast_fp16, y = cos)[name = string("op_621_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_622_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_622_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_cast_fp16 = add(x = var_621_cast_fp16, y = var_622_cast_fp16)[name = string("roped_cast_fp16")];
+            tensor<int32, [4]> v_perm_0 = const()[name = string("v_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_626_begin_0 = const()[name = string("op_626_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_626_end_0 = const()[name = string("op_626_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_626_end_mask_0 = const()[name = string("op_626_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_2 = slice_by_index(begin = var_626_begin_0, end = var_626_end_0, end_mask = var_626_end_mask_0, x = roped_cast_fp16)[name = string("op_626_cast_fp16")];
+            tensor<int32, [4]> var_627_begin_0 = const()[name = string("op_627_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_627_end_0 = const()[name = string("op_627_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_627_end_mask_0 = const()[name = string("op_627_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 512, 128]> v_cast_fp16 = transpose(perm = v_perm_0, x = v_15_cast_fp16)[name = string("transpose_1")];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_2 = slice_by_index(begin = var_627_begin_0, end = var_627_end_0, end_mask = var_627_end_mask_0, x = v_cast_fp16)[name = string("op_627_cast_fp16")];
             fp16 var_641_to_fp16 = const()[name = string("op_641_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 512]> var_642_cast_fp16 = mul(x = q_cast_fp16, y = var_641_to_fp16)[name = string("op_642_cast_fp16")];
-            bool attn_weights_9_transpose_x_0 = const()[name = string("attn_weights_9_transpose_x_0"), val = bool(true)];
-            bool attn_weights_9_transpose_y_0 = const()[name = string("attn_weights_9_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_9_cast_fp16 = matmul(transpose_x = attn_weights_9_transpose_x_0, transpose_y = attn_weights_9_transpose_y_0, x = var_642_cast_fp16, y = k_cast_fp16)[name = string("attn_weights_9_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_cast_fp16 = add(x = attn_weights_9_cast_fp16, y = mask)[name = string("attn_weights_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> var_650_cast_fp16 = softmax(axis = var_502, x = attn_weights_cast_fp16)[name = string("op_650_cast_fp16")];
-            bool attn_5_transpose_x_0 = const()[name = string("attn_5_transpose_x_0"), val = bool(false)];
-            bool attn_5_transpose_y_0 = const()[name = string("attn_5_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 512]> attn_5_cast_fp16 = matmul(transpose_x = attn_5_transpose_x_0, transpose_y = attn_5_transpose_y_0, x = v_cast_fp16, y = var_650_cast_fp16)[name = string("attn_5_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_642_cast_fp16 = mul(x = roped_9_cast_fp16, y = var_641_to_fp16)[name = string("op_642_cast_fp16")];
+            bool attn_weights_13_transpose_x_0 = const()[name = string("attn_weights_13_transpose_x_0"), val = bool(true)];
+            bool attn_weights_13_transpose_y_0 = const()[name = string("attn_weights_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_13_cast_fp16 = matmul(transpose_x = attn_weights_13_transpose_x_0, transpose_y = attn_weights_13_transpose_y_0, x = var_642_cast_fp16, y = roped_cast_fp16)[name = string("attn_weights_13_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_15_cast_fp16 = add(x = attn_weights_13_cast_fp16, y = mask)[name = string("attn_weights_15_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_cast_fp16 = softmax(axis = var_502, x = attn_weights_15_cast_fp16)[name = string("attn_weights_cast_fp16")];
+            bool var_651_transpose_x_1 = const()[name = string("op_651_transpose_x_1"), val = bool(false)];
+            bool var_651_transpose_y_1 = const()[name = string("op_651_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 32, 512, 128]> var_651_cast_fp16 = matmul(transpose_x = var_651_transpose_x_1, transpose_y = var_651_transpose_y_1, x = attn_weights_cast_fp16, y = v_15_cast_fp16)[name = string("op_651_cast_fp16")];
+            tensor<int32, [4]> attn_5_perm_0 = const()[name = string("attn_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             tensor<int32, [4]> var_654 = const()[name = string("op_654"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 512]> attn_5_cast_fp16 = transpose(perm = attn_5_perm_0, x = var_651_cast_fp16)[name = string("transpose_0")];
             tensor<fp16, [1, 4096, 1, 512]> input_17_cast_fp16 = reshape(shape = var_654, x = attn_5_cast_fp16)[name = string("input_17_cast_fp16")];
             tensor<int32, [2]> var_658 = const()[name = string("op_658"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_660 = const()[name = string("op_660"), val = tensor<int32, [2]>([1, 1])];
diff --git a/sequoia/Llama-2-7b-hf_chunk12.mlmodelc/analytics/coremldata.bin b/sequoia/Llama-2-7b-hf_chunk12.mlmodelc/analytics/coremldata.bin
index 256231b5dd45c99911c689627cf8356995cd34e2..948cb9016e7ec43d9c4e2d7a07fbc695dc44c577 100644
--- a/sequoia/Llama-2-7b-hf_chunk12.mlmodelc/analytics/coremldata.bin
+++ b/sequoia/Llama-2-7b-hf_chunk12.mlmodelc/analytics/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:877129a9d42c3d4d9b1b793d51e152d6fed08881a973bbb5ed4a001571623eb0
+oid sha256:ebfac06ad6ea250163afbdb1dcff54d9a4efd5c687a99f836a173d45bba0e7e9
 size 243
diff --git a/sequoia/Llama-2-7b-hf_chunk12.mlmodelc/coremldata.bin b/sequoia/Llama-2-7b-hf_chunk12.mlmodelc/coremldata.bin
index f6543c806dbd1796ed26fc0af1585ceb5b425fdd..bb5791e20329bcaf633f3a83d7b5b8bbaa901259 100644
--- a/sequoia/Llama-2-7b-hf_chunk12.mlmodelc/coremldata.bin
+++ b/sequoia/Llama-2-7b-hf_chunk12.mlmodelc/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7e4186acc6251c3785f2b0af36e33eacfe6b4f78971ae86bda2e885776607d79
+oid sha256:5015e3121f08174cb761ca5facaf3f027bc6be5ee22d02a1c8a820193ae2e978
 size 831
diff --git a/sequoia/Llama-2-7b-hf_chunk12.mlmodelc/metadata.json b/sequoia/Llama-2-7b-hf_chunk12.mlmodelc/metadata.json
index 6a6cf9984f5c7324d020e7307818acce78f35ad5..3468b3f2f024c1d4c321775c578c3626de30c2a5 100644
--- a/sequoia/Llama-2-7b-hf_chunk12.mlmodelc/metadata.json
+++ b/sequoia/Llama-2-7b-hf_chunk12.mlmodelc/metadata.json
@@ -17,9 +17,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 128, 508]",
         "name" : "new_k_cache_0",
         "type" : "MultiArray"
       },
@@ -27,9 +27,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 128, 508]",
         "name" : "new_k_cache_1",
         "type" : "MultiArray"
       },
@@ -37,9 +37,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 508, 128]",
         "name" : "new_v_cache_0",
         "type" : "MultiArray"
       },
@@ -47,9 +47,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 508, 128]",
         "name" : "new_v_cache_1",
         "type" : "MultiArray"
       }
@@ -122,9 +122,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_0",
             "type" : "MultiArray"
           },
@@ -132,9 +132,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_1",
             "type" : "MultiArray"
           },
@@ -142,9 +142,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_0",
             "type" : "MultiArray"
           },
@@ -152,9 +152,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_1",
             "type" : "MultiArray"
           }
@@ -163,15 +163,15 @@
         "mlProgramOperationTypeHistogram" : {
           "Ios18.constexprLutToDense" : 14,
           "Ios18.conv" : 14,
-          "Ios18.matmul" : 6,
           "Ios18.expandDims" : 5,
-          "Ios18.concat" : 14,
+          "Ios18.matmul" : 6,
+          "Ios18.concat" : 10,
           "Ios18.add" : 10,
           "Ios18.realDiv" : 5,
           "Ios18.silu" : 2,
           "Ios18.softmax" : 2,
           "Ios18.sliceByIndex" : 12,
-          "Ios18.transpose" : 1,
+          "Ios18.transpose" : 5,
           "Ios16.reduceL2Norm" : 5,
           "Ios18.squeeze" : 6,
           "Ios18.reshape" : 11,
@@ -184,9 +184,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 1)",
+            "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 4)",
             "shortDescription" : "",
-            "shape" : "[1, 4096, 1, 1]",
+            "shape" : "[1, 4096, 1, 4]",
             "name" : "x",
             "type" : "MultiArray"
           },
@@ -194,9 +194,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 128 × 1)",
+            "formattedType" : "MultiArray (Float16 128 × 4)",
             "shortDescription" : "",
-            "shape" : "[128, 1]",
+            "shape" : "[128, 4]",
             "name" : "cos",
             "type" : "MultiArray"
           },
@@ -204,9 +204,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 128 × 1)",
+            "formattedType" : "MultiArray (Float16 128 × 4)",
             "shortDescription" : "",
-            "shape" : "[128, 1]",
+            "shape" : "[128, 4]",
             "name" : "sin",
             "type" : "MultiArray"
           },
@@ -214,9 +214,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 1 × 1 × 512)",
+            "formattedType" : "MultiArray (Float16 1 × 1 × 4 × 512)",
             "shortDescription" : "",
-            "shape" : "[1, 1, 1, 512]",
+            "shape" : "[1, 1, 4, 512]",
             "name" : "mask",
             "type" : "MultiArray"
           },
@@ -224,9 +224,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "k_cache_0",
             "type" : "MultiArray"
           },
@@ -234,9 +234,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "v_cache_0",
             "type" : "MultiArray"
           },
@@ -244,9 +244,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "k_cache_1",
             "type" : "MultiArray"
           },
@@ -254,9 +254,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "v_cache_1",
             "type" : "MultiArray"
           }
@@ -271,9 +271,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 1 × 32000)",
+            "formattedType" : "MultiArray (Float16 1 × 4 × 32000)",
             "shortDescription" : "",
-            "shape" : "[1, 1, 32000]",
+            "shape" : "[1, 4, 32000]",
             "name" : "logits",
             "type" : "MultiArray"
           },
@@ -281,9 +281,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_0",
             "type" : "MultiArray"
           },
@@ -291,9 +291,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_1",
             "type" : "MultiArray"
           },
@@ -301,9 +301,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_0",
             "type" : "MultiArray"
           },
@@ -311,9 +311,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_1",
             "type" : "MultiArray"
           }
@@ -322,15 +322,15 @@
         "mlProgramOperationTypeHistogram" : {
           "Ios18.constexprLutToDense" : 14,
           "Ios18.conv" : 14,
-          "Ios18.matmul" : 6,
           "Ios18.expandDims" : 5,
+          "Ios18.matmul" : 6,
           "Ios18.concat" : 14,
           "Ios18.add" : 10,
           "Ios18.realDiv" : 5,
           "Ios18.silu" : 2,
           "Ios18.softmax" : 2,
           "Ios18.sliceByIndex" : 12,
-          "Ios18.transpose" : 1,
+          "Ios18.transpose" : 5,
           "Ios16.reduceL2Norm" : 5,
           "Ios18.squeeze" : 6,
           "Ios18.reshape" : 11,
@@ -341,15 +341,15 @@
     "mlProgramOperationTypeHistogram" : {
       "Ios18.constexprLutToDense" : 14,
       "Ios18.conv" : 14,
-      "Ios18.matmul" : 6,
       "Ios18.expandDims" : 5,
-      "Ios18.concat" : 14,
+      "Ios18.matmul" : 6,
+      "Ios18.concat" : 10,
       "Ios18.add" : 10,
       "Ios18.realDiv" : 5,
       "Ios18.silu" : 2,
       "Ios18.softmax" : 2,
       "Ios18.sliceByIndex" : 12,
-      "Ios18.transpose" : 1,
+      "Ios18.transpose" : 5,
       "Ios16.reduceL2Norm" : 5,
       "Ios18.squeeze" : 6,
       "Ios18.reshape" : 11,
@@ -414,7 +414,7 @@
       }
     ],
     "defaultFunctionName" : "input_512_context_512",
-    "generatedClassName" : "Llama_2_7b_hf_2024_07_02_20_36_17_merged_chunk12",
+    "generatedClassName" : "Llama_2_7b_hf_2024_07_17_19_34_17_merged_chunk12",
     "userDefinedMetadata" : {
 
     },
diff --git a/sequoia/Llama-2-7b-hf_chunk12.mlmodelc/model.mil b/sequoia/Llama-2-7b-hf_chunk12.mlmodelc/model.mil
index 6a149cb5c5df1ecde96ec2fda3ea978a8e6afc28..a56af47d37e50f3598690dd468e3fec2798ce91a 100644
--- a/sequoia/Llama-2-7b-hf_chunk12.mlmodelc/model.mil
+++ b/sequoia/Llama-2-7b-hf_chunk12.mlmodelc/model.mil
@@ -1,7 +1,7 @@
 program(1.3)
-[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3400.34.1"}, {"coremlc-version", "3400.42.1"}})]
+[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3400.42.1"}, {"coremlc-version", "3400.51.1"}})]
 {
-    func input_1_context_512<ios18>(tensor<fp16, [128, 1]> cos, tensor<fp16, [1, 32, 128, 511]> k_cache_0, tensor<fp16, [1, 32, 128, 511]> k_cache_1, tensor<fp16, [1, 1, 1, 512]> mask, tensor<fp16, [128, 1]> sin, tensor<fp16, [1, 32, 128, 511]> v_cache_0, tensor<fp16, [1, 32, 128, 511]> v_cache_1, tensor<fp16, [1, 4096, 1, 1]> x) [CoreML_InputDefaultValues = dict<string, fp32>({{"k_cache_0", 0}, {"k_cache_1", 0}, {"v_cache_0", 0}, {"v_cache_1", 0}})] {
+    func input_1_context_512<ios18>(tensor<fp16, [128, 4]> cos, tensor<fp16, [1, 32, 128, 508]> k_cache_0, tensor<fp16, [1, 32, 128, 508]> k_cache_1, tensor<fp16, [1, 1, 4, 512]> mask, tensor<fp16, [128, 4]> sin, tensor<fp16, [1, 32, 508, 128]> v_cache_0, tensor<fp16, [1, 32, 508, 128]> v_cache_1, tensor<fp16, [1, 4096, 1, 4]> x) [CoreML_InputDefaultValues = dict<string, fp32>({{"k_cache_0", 0}, {"k_cache_1", 0}, {"v_cache_0", 0}, {"v_cache_1", 0}})] {
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_q_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(464735296))))[name = string("blocks_0_attn_q_proj_weight_palettized_cast_fp16")];
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_k_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8388864))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(464735424))))[name = string("blocks_0_attn_k_proj_weight_palettized_cast_fp16")];
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_v_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(16777664))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(464735552))))[name = string("blocks_0_attn_v_proj_weight_palettized_cast_fp16")];
@@ -22,329 +22,337 @@ program(1.3)
             int32 var_31 = const()[name = string("op_31"), val = int32(-2)];
             bool var_32 = const()[name = string("op_32"), val = bool(true)];
             tensor<int32, [1]> var_50_axes_0 = const()[name = string("op_50_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_50_cast_fp16 = squeeze(axes = var_50_axes_0, x = x)[name = string("op_50_cast_fp16")];
+            tensor<fp16, [1, 4096, 4]> var_50_cast_fp16 = squeeze(axes = var_50_axes_0, x = x)[name = string("op_50_cast_fp16")];
             bool var_52_interleave_0 = const()[name = string("op_52_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_1_to_fp16 = const()[name = string("eps_chan_1_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_52_cast_fp16 = concat(axis = var_28, interleave = var_52_interleave_0, values = (var_50_cast_fp16, eps_chan_1_to_fp16))[name = string("op_52_cast_fp16")];
+            tensor<fp16, [1, 1, 4]> eps_chan_1_to_fp16 = const()[name = string("eps_chan_1_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_52_cast_fp16 = concat(axis = var_28, interleave = var_52_interleave_0, values = (var_50_cast_fp16, eps_chan_1_to_fp16))[name = string("op_52_cast_fp16")];
             tensor<int32, [1]> x_eps_1_axes_0 = const()[name = string("x_eps_1_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_1_cast_fp16 = expand_dims(axes = x_eps_1_axes_0, x = var_52_cast_fp16)[name = string("x_eps_1_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_1_cast_fp16 = expand_dims(axes = x_eps_1_axes_0, x = var_52_cast_fp16)[name = string("x_eps_1_cast_fp16")];
             tensor<int32, [1]> norm_x_1_axes_0 = const()[name = string("norm_x_1_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_1_cast_fp16 = reduce_l2_norm(axes = norm_x_1_axes_0, keep_dims = var_32, x = x_eps_1_cast_fp16)[name = string("norm_x_1_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_1_cast_fp16 = real_div(x = x, y = norm_x_1_cast_fp16)[name = string("x_normed_1_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_1_cast_fp16 = reduce_l2_norm(axes = norm_x_1_axes_0, keep_dims = var_32, x = x_eps_1_cast_fp16)[name = string("norm_x_1_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_1_cast_fp16 = real_div(x = x, y = norm_x_1_cast_fp16)[name = string("x_normed_1_cast_fp16")];
             fp16 var_57_to_fp16 = const()[name = string("op_57_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_57_to_fp16)[name = string("x_normed_3_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_57_to_fp16)[name = string("x_normed_3_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_1_weight_to_fp16 = const()[name = string("blocks_0_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202379008)))];
-            tensor<fp16, [1, 4096, 1, 1]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = string("x_5_cast_fp16")];
-            tensor<int32, [2]> var_69 = const()[name = string("op_69"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_71 = const()[name = string("op_71"), val = tensor<int32, [2]>([1, 1])];
-            string var_73_pad_type_0 = const()[name = string("op_73_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_73_pad_0 = const()[name = string("op_73_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_73_cast_fp16 = conv(dilations = var_71, groups = var_28, pad = var_73_pad_0, pad_type = var_73_pad_type_0, strides = var_69, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_73_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = string("x_5_cast_fp16")];
+            tensor<int32, [2]> var_70 = const()[name = string("op_70"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_72 = const()[name = string("op_72"), val = tensor<int32, [2]>([1, 1])];
+            string var_74_pad_type_0 = const()[name = string("op_74_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_74_pad_0 = const()[name = string("op_74_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_74_cast_fp16 = conv(dilations = var_72, groups = var_28, pad = var_74_pad_0, pad_type = var_74_pad_type_0, strides = var_70, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_74_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202387264)))];
-            tensor<fp16, [1, 4096, 1, 1]> q_1_cast_fp16 = mul(x = var_73_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
-            tensor<int32, [2]> var_77 = const()[name = string("op_77"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_79 = const()[name = string("op_79"), val = tensor<int32, [2]>([1, 1])];
-            string var_81_pad_type_0 = const()[name = string("op_81_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_81_pad_0 = const()[name = string("op_81_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_81_cast_fp16 = conv(dilations = var_79, groups = var_28, pad = var_81_pad_0, pad_type = var_81_pad_type_0, strides = var_77, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_81_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> q_1_cast_fp16 = mul(x = var_74_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
+            tensor<int32, [2]> var_78 = const()[name = string("op_78"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_80 = const()[name = string("op_80"), val = tensor<int32, [2]>([1, 1])];
+            string var_82_pad_type_0 = const()[name = string("op_82_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_82_pad_0 = const()[name = string("op_82_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_82_cast_fp16 = conv(dilations = var_80, groups = var_28, pad = var_82_pad_0, pad_type = var_82_pad_type_0, strides = var_78, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_82_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202395520)))];
-            tensor<fp16, [1, 4096, 1, 1]> k_1_cast_fp16 = mul(x = var_81_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
-            tensor<int32, [2]> var_85 = const()[name = string("op_85"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_87 = const()[name = string("op_87"), val = tensor<int32, [2]>([1, 1])];
-            string var_89_pad_type_0 = const()[name = string("op_89_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_89_pad_0 = const()[name = string("op_89_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_89_cast_fp16 = conv(dilations = var_87, groups = var_28, pad = var_89_pad_0, pad_type = var_89_pad_type_0, strides = var_85, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_89_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> k_1_cast_fp16 = mul(x = var_82_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
+            tensor<int32, [2]> var_86 = const()[name = string("op_86"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_88 = const()[name = string("op_88"), val = tensor<int32, [2]>([1, 1])];
+            string var_90_pad_type_0 = const()[name = string("op_90_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_90_pad_0 = const()[name = string("op_90_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_90_cast_fp16 = conv(dilations = var_88, groups = var_28, pad = var_90_pad_0, pad_type = var_90_pad_type_0, strides = var_86, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_90_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202403776)))];
-            tensor<fp16, [1, 4096, 1, 1]> v_1_cast_fp16 = mul(x = var_89_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
-            tensor<int32, [4]> var_91 = const()[name = string("op_91"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> q_3_cast_fp16 = reshape(shape = var_91, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
-            tensor<int32, [4]> var_93 = const()[name = string("op_93"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> k_3_cast_fp16 = reshape(shape = var_93, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
-            tensor<int32, [4]> var_95 = const()[name = string("op_95"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> v_3_cast_fp16 = reshape(shape = var_95, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
-            tensor<int32, [4]> var_107_begin_0 = const()[name = string("op_107_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_107_end_0 = const()[name = string("op_107_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_107_end_mask_0 = const()[name = string("op_107_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_107_cast_fp16 = slice_by_index(begin = var_107_begin_0, end = var_107_end_0, end_mask = var_107_end_mask_0, x = q_3_cast_fp16)[name = string("op_107_cast_fp16")];
-            tensor<int32, [4]> var_113_begin_0 = const()[name = string("op_113_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_113_end_0 = const()[name = string("op_113_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_113_end_mask_0 = const()[name = string("op_113_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_113_cast_fp16 = slice_by_index(begin = var_113_begin_0, end = var_113_end_0, end_mask = var_113_end_mask_0, x = q_3_cast_fp16)[name = string("op_113_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> v_1_cast_fp16 = mul(x = var_90_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
+            tensor<int32, [4]> var_92 = const()[name = string("op_92"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> q_3_cast_fp16 = reshape(shape = var_92, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
+            tensor<int32, [4]> var_94 = const()[name = string("op_94"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> k_3_cast_fp16 = reshape(shape = var_94, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
+            tensor<int32, [4]> var_96 = const()[name = string("op_96"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> v_3_cast_fp16 = reshape(shape = var_96, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
+            tensor<int32, [4]> var_108_begin_0 = const()[name = string("op_108_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_108_end_0 = const()[name = string("op_108_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_108_end_mask_0 = const()[name = string("op_108_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_108_cast_fp16 = slice_by_index(begin = var_108_begin_0, end = var_108_end_0, end_mask = var_108_end_mask_0, x = q_3_cast_fp16)[name = string("op_108_cast_fp16")];
+            tensor<int32, [4]> var_114_begin_0 = const()[name = string("op_114_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_114_end_0 = const()[name = string("op_114_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_114_end_mask_0 = const()[name = string("op_114_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_114_cast_fp16 = slice_by_index(begin = var_114_begin_0, end = var_114_end_0, end_mask = var_114_end_mask_0, x = q_3_cast_fp16)[name = string("op_114_cast_fp16")];
             fp16 const_6_promoted_to_fp16 = const()[name = string("const_6_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_115_cast_fp16 = mul(x = var_113_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_115_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_116_cast_fp16 = mul(x = var_114_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_116_cast_fp16")];
             bool rotated_1_interleave_0 = const()[name = string("rotated_1_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_1_cast_fp16 = concat(axis = var_31, interleave = rotated_1_interleave_0, values = (var_115_cast_fp16, var_107_cast_fp16))[name = string("rotated_1_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_118_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_118_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_119_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_119_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_1_cast_fp16 = add(x = var_118_cast_fp16, y = var_119_cast_fp16)[name = string("roped_1_cast_fp16")];
-            tensor<int32, [4]> var_132_begin_0 = const()[name = string("op_132_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_132_end_0 = const()[name = string("op_132_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_132_end_mask_0 = const()[name = string("op_132_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_132_cast_fp16 = slice_by_index(begin = var_132_begin_0, end = var_132_end_0, end_mask = var_132_end_mask_0, x = k_3_cast_fp16)[name = string("op_132_cast_fp16")];
-            tensor<int32, [4]> var_138_begin_0 = const()[name = string("op_138_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_138_end_0 = const()[name = string("op_138_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_138_end_mask_0 = const()[name = string("op_138_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_138_cast_fp16 = slice_by_index(begin = var_138_begin_0, end = var_138_end_0, end_mask = var_138_end_mask_0, x = k_3_cast_fp16)[name = string("op_138_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_1_cast_fp16 = concat(axis = var_31, interleave = rotated_1_interleave_0, values = (var_116_cast_fp16, var_108_cast_fp16))[name = string("rotated_1_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_119_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_119_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_120_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_120_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_1_cast_fp16 = add(x = var_119_cast_fp16, y = var_120_cast_fp16)[name = string("roped_1_cast_fp16")];
+            tensor<int32, [4]> var_133_begin_0 = const()[name = string("op_133_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_133_end_0 = const()[name = string("op_133_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_133_end_mask_0 = const()[name = string("op_133_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_133_cast_fp16 = slice_by_index(begin = var_133_begin_0, end = var_133_end_0, end_mask = var_133_end_mask_0, x = k_3_cast_fp16)[name = string("op_133_cast_fp16")];
+            tensor<int32, [4]> var_139_begin_0 = const()[name = string("op_139_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_139_end_0 = const()[name = string("op_139_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_139_end_mask_0 = const()[name = string("op_139_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_139_cast_fp16 = slice_by_index(begin = var_139_begin_0, end = var_139_end_0, end_mask = var_139_end_mask_0, x = k_3_cast_fp16)[name = string("op_139_cast_fp16")];
             fp16 const_8_promoted_to_fp16 = const()[name = string("const_8_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_140_cast_fp16 = mul(x = var_138_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_140_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_141_cast_fp16 = mul(x = var_139_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_141_cast_fp16")];
             bool rotated_3_interleave_0 = const()[name = string("rotated_3_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_3_cast_fp16 = concat(axis = var_31, interleave = rotated_3_interleave_0, values = (var_140_cast_fp16, var_132_cast_fp16))[name = string("rotated_3_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_143_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_143_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_144_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_144_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_3_cast_fp16 = add(x = var_143_cast_fp16, y = var_144_cast_fp16)[name = string("roped_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_3_cast_fp16 = concat(axis = var_31, interleave = rotated_3_interleave_0, values = (var_141_cast_fp16, var_133_cast_fp16))[name = string("rotated_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_144_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_144_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_145_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_145_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_3_cast_fp16 = add(x = var_144_cast_fp16, y = var_145_cast_fp16)[name = string("roped_3_cast_fp16")];
+            tensor<int32, [4]> v_5_perm_0 = const()[name = string("v_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             bool k_7_interleave_0 = const()[name = string("k_7_interleave_0"), val = bool(false)];
             tensor<fp16, [1, 32, 128, 512]> k_7_cast_fp16 = concat(axis = var_19, interleave = k_7_interleave_0, values = (k_cache_0, roped_3_cast_fp16))[name = string("k_7_cast_fp16")];
-            bool v_5_interleave_0 = const()[name = string("v_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> v_5_cast_fp16 = concat(axis = var_19, interleave = v_5_interleave_0, values = (v_cache_0, v_3_cast_fp16))[name = string("v_5_cast_fp16")];
-            tensor<int32, [4]> var_151_begin_0 = const()[name = string("op_151_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_151_end_0 = const()[name = string("op_151_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_151_end_mask_0 = const()[name = string("op_151_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_0 = slice_by_index(begin = var_151_begin_0, end = var_151_end_0, end_mask = var_151_end_mask_0, x = k_7_cast_fp16)[name = string("op_151_cast_fp16")];
-            tensor<int32, [4]> var_152_begin_0 = const()[name = string("op_152_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_152_end_0 = const()[name = string("op_152_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_152_end_mask_0 = const()[name = string("op_152_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_0 = slice_by_index(begin = var_152_begin_0, end = var_152_end_0, end_mask = var_152_end_mask_0, x = v_5_cast_fp16)[name = string("op_152_cast_fp16")];
-            fp16 var_156_to_fp16 = const()[name = string("op_156_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 1]> var_157_cast_fp16 = mul(x = roped_1_cast_fp16, y = var_156_to_fp16)[name = string("op_157_cast_fp16")];
+            bool v_7_interleave_0 = const()[name = string("v_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> v_5_cast_fp16 = transpose(perm = v_5_perm_0, x = v_3_cast_fp16)[name = string("transpose_8")];
+            tensor<fp16, [1, 32, 512, 128]> v_7_cast_fp16 = concat(axis = var_31, interleave = v_7_interleave_0, values = (v_cache_0, v_5_cast_fp16))[name = string("v_7_cast_fp16")];
+            tensor<int32, [4]> var_156_begin_0 = const()[name = string("op_156_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_156_end_0 = const()[name = string("op_156_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_156_end_mask_0 = const()[name = string("op_156_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_0 = slice_by_index(begin = var_156_begin_0, end = var_156_end_0, end_mask = var_156_end_mask_0, x = k_7_cast_fp16)[name = string("op_156_cast_fp16")];
+            tensor<int32, [4]> var_157_begin_0 = const()[name = string("op_157_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_157_end_0 = const()[name = string("op_157_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_157_end_mask_0 = const()[name = string("op_157_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_0 = slice_by_index(begin = var_157_begin_0, end = var_157_end_0, end_mask = var_157_end_mask_0, x = v_7_cast_fp16)[name = string("op_157_cast_fp16")];
+            fp16 var_162_to_fp16 = const()[name = string("op_162_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 32, 128, 4]> var_163_cast_fp16 = mul(x = roped_1_cast_fp16, y = var_162_to_fp16)[name = string("op_163_cast_fp16")];
             bool attn_weights_1_transpose_x_0 = const()[name = string("attn_weights_1_transpose_x_0"), val = bool(true)];
             bool attn_weights_1_transpose_y_0 = const()[name = string("attn_weights_1_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_157_cast_fp16, y = k_7_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = mask)[name = string("attn_weights_3_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> var_165_cast_fp16 = softmax(axis = var_27, x = attn_weights_3_cast_fp16)[name = string("op_165_cast_fp16")];
-            bool attn_1_transpose_x_0 = const()[name = string("attn_1_transpose_x_0"), val = bool(false)];
-            bool attn_1_transpose_y_0 = const()[name = string("attn_1_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 1]> attn_1_cast_fp16 = matmul(transpose_x = attn_1_transpose_x_0, transpose_y = attn_1_transpose_y_0, x = v_5_cast_fp16, y = var_165_cast_fp16)[name = string("attn_1_cast_fp16")];
-            tensor<int32, [4]> var_169 = const()[name = string("op_169"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
-            tensor<fp16, [1, 4096, 1, 1]> input_1_cast_fp16 = reshape(shape = var_169, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
-            tensor<int32, [2]> var_173 = const()[name = string("op_173"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_175 = const()[name = string("op_175"), val = tensor<int32, [2]>([1, 1])];
-            string var_177_pad_type_0 = const()[name = string("op_177_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_177_pad_0 = const()[name = string("op_177_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_177_cast_fp16 = conv(dilations = var_175, groups = var_28, pad = var_177_pad_0, pad_type = var_177_pad_type_0, strides = var_173, weight = blocks_0_attn_proj_weight_palettized_cast_fp16, x = input_1_cast_fp16)[name = string("op_177_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_163_cast_fp16, y = k_7_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = mask)[name = string("attn_weights_3_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_5_cast_fp16 = softmax(axis = var_27, x = attn_weights_3_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
+            bool var_172_transpose_x_0 = const()[name = string("op_172_transpose_x_0"), val = bool(false)];
+            bool var_172_transpose_y_0 = const()[name = string("op_172_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> var_172_cast_fp16 = matmul(transpose_x = var_172_transpose_x_0, transpose_y = var_172_transpose_y_0, x = attn_weights_5_cast_fp16, y = v_7_cast_fp16)[name = string("op_172_cast_fp16")];
+            tensor<int32, [4]> attn_1_perm_0 = const()[name = string("attn_1_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_175 = const()[name = string("op_175"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 4]> attn_1_cast_fp16 = transpose(perm = attn_1_perm_0, x = var_172_cast_fp16)[name = string("transpose_7")];
+            tensor<fp16, [1, 4096, 1, 4]> input_1_cast_fp16 = reshape(shape = var_175, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
+            tensor<int32, [2]> var_179 = const()[name = string("op_179"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_181 = const()[name = string("op_181"), val = tensor<int32, [2]>([1, 1])];
+            string var_183_pad_type_0 = const()[name = string("op_183_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_183_pad_0 = const()[name = string("op_183_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_183_cast_fp16 = conv(dilations = var_181, groups = var_28, pad = var_183_pad_0, pad_type = var_183_pad_type_0, strides = var_179, weight = blocks_0_attn_proj_weight_palettized_cast_fp16, x = input_1_cast_fp16)[name = string("op_183_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202412032)))];
-            tensor<fp16, [1, 4096, 1, 1]> attention_output_1_cast_fp16 = mul(x = var_177_cast_fp16, y = blocks_0_attn_proj_output_scales_to_fp16)[name = string("attention_output_1_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_11_cast_fp16 = add(x = attention_output_1_cast_fp16, y = x)[name = string("x_11_cast_fp16")];
-            tensor<int32, [1]> var_196_axes_0 = const()[name = string("op_196_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_196_cast_fp16 = squeeze(axes = var_196_axes_0, x = x_11_cast_fp16)[name = string("op_196_cast_fp16")];
-            bool var_198_interleave_0 = const()[name = string("op_198_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_3_to_fp16 = const()[name = string("eps_chan_3_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_198_cast_fp16 = concat(axis = var_28, interleave = var_198_interleave_0, values = (var_196_cast_fp16, eps_chan_3_to_fp16))[name = string("op_198_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> attention_output_1_cast_fp16 = mul(x = var_183_cast_fp16, y = blocks_0_attn_proj_output_scales_to_fp16)[name = string("attention_output_1_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_11_cast_fp16 = add(x = attention_output_1_cast_fp16, y = x)[name = string("x_11_cast_fp16")];
+            tensor<int32, [1]> var_202_axes_0 = const()[name = string("op_202_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_202_cast_fp16 = squeeze(axes = var_202_axes_0, x = x_11_cast_fp16)[name = string("op_202_cast_fp16")];
+            bool var_204_interleave_0 = const()[name = string("op_204_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_3_to_fp16 = const()[name = string("eps_chan_3_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_204_cast_fp16 = concat(axis = var_28, interleave = var_204_interleave_0, values = (var_202_cast_fp16, eps_chan_3_to_fp16))[name = string("op_204_cast_fp16")];
             tensor<int32, [1]> x_eps_3_axes_0 = const()[name = string("x_eps_3_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_3_cast_fp16 = expand_dims(axes = x_eps_3_axes_0, x = var_198_cast_fp16)[name = string("x_eps_3_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_3_cast_fp16 = expand_dims(axes = x_eps_3_axes_0, x = var_204_cast_fp16)[name = string("x_eps_3_cast_fp16")];
             tensor<int32, [1]> norm_x_3_axes_0 = const()[name = string("norm_x_3_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_3_cast_fp16 = reduce_l2_norm(axes = norm_x_3_axes_0, keep_dims = var_32, x = x_eps_3_cast_fp16)[name = string("norm_x_3_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_7_cast_fp16 = real_div(x = x_11_cast_fp16, y = norm_x_3_cast_fp16)[name = string("x_normed_7_cast_fp16")];
-            fp16 var_203_to_fp16 = const()[name = string("op_203_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_9_cast_fp16 = mul(x = x_normed_7_cast_fp16, y = var_203_to_fp16)[name = string("x_normed_9_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_3_cast_fp16 = reduce_l2_norm(axes = norm_x_3_axes_0, keep_dims = var_32, x = x_eps_3_cast_fp16)[name = string("norm_x_3_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_7_cast_fp16 = real_div(x = x_11_cast_fp16, y = norm_x_3_cast_fp16)[name = string("x_normed_7_cast_fp16")];
+            fp16 var_209_to_fp16 = const()[name = string("op_209_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_9_cast_fp16 = mul(x = x_normed_7_cast_fp16, y = var_209_to_fp16)[name = string("x_normed_9_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_2_weight_to_fp16 = const()[name = string("blocks_0_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202420288)))];
-            tensor<fp16, [1, 4096, 1, 1]> input_3_cast_fp16 = mul(x = x_normed_9_cast_fp16, y = blocks_0_norm_2_weight_to_fp16)[name = string("input_3_cast_fp16")];
-            tensor<int32, [2]> var_215 = const()[name = string("op_215"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_217 = const()[name = string("op_217"), val = tensor<int32, [2]>([1, 1])];
-            string var_219_pad_type_0 = const()[name = string("op_219_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_219_pad_0 = const()[name = string("op_219_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_219_cast_fp16 = conv(dilations = var_217, groups = var_28, pad = var_219_pad_0, pad_type = var_219_pad_type_0, strides = var_215, weight = blocks_0_mlp_fc_1_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_219_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202428544)))];
-            tensor<fp16, [1, 11008, 1, 1]> input_5_cast_fp16 = mul(x = var_219_cast_fp16, y = blocks_0_mlp_fc_1_output_scales_to_fp16)[name = string("input_5_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> input_3_cast_fp16 = mul(x = x_normed_9_cast_fp16, y = blocks_0_norm_2_weight_to_fp16)[name = string("input_3_cast_fp16")];
+            tensor<int32, [2]> var_221 = const()[name = string("op_221"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_223 = const()[name = string("op_223"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_225 = const()[name = string("op_225"), val = tensor<int32, [2]>([1, 1])];
-            string var_227_pad_type_0 = const()[name = string("op_227_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_227_pad_0 = const()[name = string("op_227_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_227_cast_fp16 = conv(dilations = var_225, groups = var_28, pad = var_227_pad_0, pad_type = var_227_pad_type_0, strides = var_223, weight = blocks_0_mlp_fc_2_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_227_cast_fp16")];
+            string var_225_pad_type_0 = const()[name = string("op_225_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_225_pad_0 = const()[name = string("op_225_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_225_cast_fp16 = conv(dilations = var_223, groups = var_28, pad = var_225_pad_0, pad_type = var_225_pad_type_0, strides = var_221, weight = blocks_0_mlp_fc_1_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_225_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202428544)))];
+            tensor<fp16, [1, 11008, 1, 4]> input_5_cast_fp16 = mul(x = var_225_cast_fp16, y = blocks_0_mlp_fc_1_output_scales_to_fp16)[name = string("input_5_cast_fp16")];
+            tensor<int32, [2]> var_229 = const()[name = string("op_229"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_231 = const()[name = string("op_231"), val = tensor<int32, [2]>([1, 1])];
+            string var_233_pad_type_0 = const()[name = string("op_233_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_233_pad_0 = const()[name = string("op_233_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_233_cast_fp16 = conv(dilations = var_231, groups = var_28, pad = var_233_pad_0, pad_type = var_233_pad_type_0, strides = var_229, weight = blocks_0_mlp_fc_2_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_233_cast_fp16")];
             tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202450624)))];
-            tensor<fp16, [1, 11008, 1, 1]> x_fc_2_1_cast_fp16 = mul(x = var_227_cast_fp16, y = blocks_0_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_1_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> var_229_cast_fp16 = silu(x = input_5_cast_fp16)[name = string("op_229_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> input_7_cast_fp16 = mul(x = var_229_cast_fp16, y = x_fc_2_1_cast_fp16)[name = string("input_7_cast_fp16")];
-            tensor<int32, [2]> var_233 = const()[name = string("op_233"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_235 = const()[name = string("op_235"), val = tensor<int32, [2]>([1, 1])];
-            string var_237_pad_type_0 = const()[name = string("op_237_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_237_pad_0 = const()[name = string("op_237_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_237_cast_fp16 = conv(dilations = var_235, groups = var_28, pad = var_237_pad_0, pad_type = var_237_pad_type_0, strides = var_233, weight = blocks_0_mlp_proj_weight_palettized_cast_fp16, x = input_7_cast_fp16)[name = string("op_237_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> x_fc_2_1_cast_fp16 = mul(x = var_233_cast_fp16, y = blocks_0_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_1_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> var_235_cast_fp16 = silu(x = input_5_cast_fp16)[name = string("op_235_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> input_7_cast_fp16 = mul(x = var_235_cast_fp16, y = x_fc_2_1_cast_fp16)[name = string("input_7_cast_fp16")];
+            tensor<int32, [2]> var_239 = const()[name = string("op_239"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_241 = const()[name = string("op_241"), val = tensor<int32, [2]>([1, 1])];
+            string var_243_pad_type_0 = const()[name = string("op_243_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_243_pad_0 = const()[name = string("op_243_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_243_cast_fp16 = conv(dilations = var_241, groups = var_28, pad = var_243_pad_0, pad_type = var_243_pad_type_0, strides = var_239, weight = blocks_0_mlp_proj_weight_palettized_cast_fp16, x = input_7_cast_fp16)[name = string("op_243_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202472704)))];
-            tensor<fp16, [1, 4096, 1, 1]> var_238_cast_fp16 = mul(x = var_237_cast_fp16, y = blocks_0_mlp_proj_output_scales_to_fp16)[name = string("op_238_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_15_cast_fp16 = add(x = var_238_cast_fp16, y = x_11_cast_fp16)[name = string("x_15_cast_fp16")];
-            int32 var_249 = const()[name = string("op_249"), val = int32(-1)];
-            int32 var_257 = const()[name = string("op_257"), val = int32(3)];
-            int32 var_258 = const()[name = string("op_258"), val = int32(1)];
-            int32 var_261 = const()[name = string("op_261"), val = int32(-2)];
-            bool var_262 = const()[name = string("op_262"), val = bool(true)];
-            tensor<int32, [1]> var_279_axes_0 = const()[name = string("op_279_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_279_cast_fp16 = squeeze(axes = var_279_axes_0, x = x_15_cast_fp16)[name = string("op_279_cast_fp16")];
-            bool var_281_interleave_0 = const()[name = string("op_281_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_5_to_fp16 = const()[name = string("eps_chan_5_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_281_cast_fp16 = concat(axis = var_258, interleave = var_281_interleave_0, values = (var_279_cast_fp16, eps_chan_5_to_fp16))[name = string("op_281_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> var_244_cast_fp16 = mul(x = var_243_cast_fp16, y = blocks_0_mlp_proj_output_scales_to_fp16)[name = string("op_244_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_15_cast_fp16 = add(x = var_244_cast_fp16, y = x_11_cast_fp16)[name = string("x_15_cast_fp16")];
+            int32 var_255 = const()[name = string("op_255"), val = int32(-1)];
+            int32 var_263 = const()[name = string("op_263"), val = int32(3)];
+            int32 var_264 = const()[name = string("op_264"), val = int32(1)];
+            int32 var_267 = const()[name = string("op_267"), val = int32(-2)];
+            bool var_268 = const()[name = string("op_268"), val = bool(true)];
+            tensor<int32, [1]> var_285_axes_0 = const()[name = string("op_285_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_285_cast_fp16 = squeeze(axes = var_285_axes_0, x = x_15_cast_fp16)[name = string("op_285_cast_fp16")];
+            bool var_287_interleave_0 = const()[name = string("op_287_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_5_to_fp16 = const()[name = string("eps_chan_5_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_287_cast_fp16 = concat(axis = var_264, interleave = var_287_interleave_0, values = (var_285_cast_fp16, eps_chan_5_to_fp16))[name = string("op_287_cast_fp16")];
             tensor<int32, [1]> x_eps_5_axes_0 = const()[name = string("x_eps_5_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_5_cast_fp16 = expand_dims(axes = x_eps_5_axes_0, x = var_281_cast_fp16)[name = string("x_eps_5_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_5_cast_fp16 = expand_dims(axes = x_eps_5_axes_0, x = var_287_cast_fp16)[name = string("x_eps_5_cast_fp16")];
             tensor<int32, [1]> norm_x_5_axes_0 = const()[name = string("norm_x_5_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_5_cast_fp16 = reduce_l2_norm(axes = norm_x_5_axes_0, keep_dims = var_262, x = x_eps_5_cast_fp16)[name = string("norm_x_5_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_13_cast_fp16 = real_div(x = x_15_cast_fp16, y = norm_x_5_cast_fp16)[name = string("x_normed_13_cast_fp16")];
-            fp16 var_286_to_fp16 = const()[name = string("op_286_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_286_to_fp16)[name = string("x_normed_15_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_5_cast_fp16 = reduce_l2_norm(axes = norm_x_5_axes_0, keep_dims = var_268, x = x_eps_5_cast_fp16)[name = string("norm_x_5_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_13_cast_fp16 = real_div(x = x_15_cast_fp16, y = norm_x_5_cast_fp16)[name = string("x_normed_13_cast_fp16")];
+            fp16 var_292_to_fp16 = const()[name = string("op_292_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_292_to_fp16)[name = string("x_normed_15_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_1_weight_to_fp16 = const()[name = string("blocks_1_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202480960)))];
-            tensor<fp16, [1, 4096, 1, 1]> x_19_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = string("x_19_cast_fp16")];
-            tensor<int32, [2]> var_301 = const()[name = string("op_301"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_303 = const()[name = string("op_303"), val = tensor<int32, [2]>([1, 1])];
-            string var_305_pad_type_0 = const()[name = string("op_305_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_305_pad_0 = const()[name = string("op_305_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_305_cast_fp16 = conv(dilations = var_303, groups = var_258, pad = var_305_pad_0, pad_type = var_305_pad_type_0, strides = var_301, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_305_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_19_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = string("x_19_cast_fp16")];
+            tensor<int32, [2]> var_308 = const()[name = string("op_308"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_310 = const()[name = string("op_310"), val = tensor<int32, [2]>([1, 1])];
+            string var_312_pad_type_0 = const()[name = string("op_312_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_312_pad_0 = const()[name = string("op_312_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_312_cast_fp16 = conv(dilations = var_310, groups = var_264, pad = var_312_pad_0, pad_type = var_312_pad_type_0, strides = var_308, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_312_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202489216)))];
-            tensor<fp16, [1, 4096, 1, 1]> q_7_cast_fp16 = mul(x = var_305_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
-            tensor<int32, [2]> var_309 = const()[name = string("op_309"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_311 = const()[name = string("op_311"), val = tensor<int32, [2]>([1, 1])];
-            string var_313_pad_type_0 = const()[name = string("op_313_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_313_pad_0 = const()[name = string("op_313_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_313_cast_fp16 = conv(dilations = var_311, groups = var_258, pad = var_313_pad_0, pad_type = var_313_pad_type_0, strides = var_309, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_313_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> q_7_cast_fp16 = mul(x = var_312_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
+            tensor<int32, [2]> var_316 = const()[name = string("op_316"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_318 = const()[name = string("op_318"), val = tensor<int32, [2]>([1, 1])];
+            string var_320_pad_type_0 = const()[name = string("op_320_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_320_pad_0 = const()[name = string("op_320_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_320_cast_fp16 = conv(dilations = var_318, groups = var_264, pad = var_320_pad_0, pad_type = var_320_pad_type_0, strides = var_316, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_320_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202497472)))];
-            tensor<fp16, [1, 4096, 1, 1]> k_9_cast_fp16 = mul(x = var_313_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_9_cast_fp16")];
-            tensor<int32, [2]> var_317 = const()[name = string("op_317"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_319 = const()[name = string("op_319"), val = tensor<int32, [2]>([1, 1])];
-            string var_321_pad_type_0 = const()[name = string("op_321_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_321_pad_0 = const()[name = string("op_321_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_321_cast_fp16 = conv(dilations = var_319, groups = var_258, pad = var_321_pad_0, pad_type = var_321_pad_type_0, strides = var_317, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_321_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> k_9_cast_fp16 = mul(x = var_320_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_9_cast_fp16")];
+            tensor<int32, [2]> var_324 = const()[name = string("op_324"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_326 = const()[name = string("op_326"), val = tensor<int32, [2]>([1, 1])];
+            string var_328_pad_type_0 = const()[name = string("op_328_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_328_pad_0 = const()[name = string("op_328_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_328_cast_fp16 = conv(dilations = var_326, groups = var_264, pad = var_328_pad_0, pad_type = var_328_pad_type_0, strides = var_324, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_328_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202505728)))];
-            tensor<fp16, [1, 4096, 1, 1]> v_7_cast_fp16 = mul(x = var_321_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_7_cast_fp16")];
-            tensor<int32, [4]> var_323 = const()[name = string("op_323"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> q_9_cast_fp16 = reshape(shape = var_323, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
-            tensor<int32, [4]> var_325 = const()[name = string("op_325"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> k_11_cast_fp16 = reshape(shape = var_325, x = k_9_cast_fp16)[name = string("k_11_cast_fp16")];
-            tensor<int32, [4]> var_327 = const()[name = string("op_327"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> v_9_cast_fp16 = reshape(shape = var_327, x = v_7_cast_fp16)[name = string("v_9_cast_fp16")];
-            tensor<int32, [4]> var_339_begin_0 = const()[name = string("op_339_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_339_end_0 = const()[name = string("op_339_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_339_end_mask_0 = const()[name = string("op_339_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_339_cast_fp16 = slice_by_index(begin = var_339_begin_0, end = var_339_end_0, end_mask = var_339_end_mask_0, x = q_9_cast_fp16)[name = string("op_339_cast_fp16")];
-            tensor<int32, [4]> var_345_begin_0 = const()[name = string("op_345_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_345_end_0 = const()[name = string("op_345_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_345_end_mask_0 = const()[name = string("op_345_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_345_cast_fp16 = slice_by_index(begin = var_345_begin_0, end = var_345_end_0, end_mask = var_345_end_mask_0, x = q_9_cast_fp16)[name = string("op_345_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> v_9_cast_fp16 = mul(x = var_328_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_9_cast_fp16")];
+            tensor<int32, [4]> var_330 = const()[name = string("op_330"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> q_9_cast_fp16 = reshape(shape = var_330, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
+            tensor<int32, [4]> var_332 = const()[name = string("op_332"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> k_11_cast_fp16 = reshape(shape = var_332, x = k_9_cast_fp16)[name = string("k_11_cast_fp16")];
+            tensor<int32, [4]> var_334 = const()[name = string("op_334"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> v_11_cast_fp16 = reshape(shape = var_334, x = v_9_cast_fp16)[name = string("v_11_cast_fp16")];
+            tensor<int32, [4]> var_346_begin_0 = const()[name = string("op_346_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_346_end_0 = const()[name = string("op_346_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_346_end_mask_0 = const()[name = string("op_346_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_346_cast_fp16 = slice_by_index(begin = var_346_begin_0, end = var_346_end_0, end_mask = var_346_end_mask_0, x = q_9_cast_fp16)[name = string("op_346_cast_fp16")];
+            tensor<int32, [4]> var_352_begin_0 = const()[name = string("op_352_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_352_end_0 = const()[name = string("op_352_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_352_end_mask_0 = const()[name = string("op_352_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_352_cast_fp16 = slice_by_index(begin = var_352_begin_0, end = var_352_end_0, end_mask = var_352_end_mask_0, x = q_9_cast_fp16)[name = string("op_352_cast_fp16")];
             fp16 const_19_promoted_to_fp16 = const()[name = string("const_19_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_347_cast_fp16 = mul(x = var_345_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_347_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_354_cast_fp16 = mul(x = var_352_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_354_cast_fp16")];
             bool rotated_5_interleave_0 = const()[name = string("rotated_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_5_cast_fp16 = concat(axis = var_261, interleave = rotated_5_interleave_0, values = (var_347_cast_fp16, var_339_cast_fp16))[name = string("rotated_5_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_350_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_350_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_351_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_351_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_5_cast_fp16 = add(x = var_350_cast_fp16, y = var_351_cast_fp16)[name = string("roped_5_cast_fp16")];
-            tensor<int32, [4]> var_364_begin_0 = const()[name = string("op_364_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_364_end_0 = const()[name = string("op_364_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_364_end_mask_0 = const()[name = string("op_364_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_364_cast_fp16 = slice_by_index(begin = var_364_begin_0, end = var_364_end_0, end_mask = var_364_end_mask_0, x = k_11_cast_fp16)[name = string("op_364_cast_fp16")];
-            tensor<int32, [4]> var_370_begin_0 = const()[name = string("op_370_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_370_end_0 = const()[name = string("op_370_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_370_end_mask_0 = const()[name = string("op_370_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_370_cast_fp16 = slice_by_index(begin = var_370_begin_0, end = var_370_end_0, end_mask = var_370_end_mask_0, x = k_11_cast_fp16)[name = string("op_370_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_5_cast_fp16 = concat(axis = var_267, interleave = rotated_5_interleave_0, values = (var_354_cast_fp16, var_346_cast_fp16))[name = string("rotated_5_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_357_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_357_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_358_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_358_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_5_cast_fp16 = add(x = var_357_cast_fp16, y = var_358_cast_fp16)[name = string("roped_5_cast_fp16")];
+            tensor<int32, [4]> var_371_begin_0 = const()[name = string("op_371_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_371_end_0 = const()[name = string("op_371_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_371_end_mask_0 = const()[name = string("op_371_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_371_cast_fp16 = slice_by_index(begin = var_371_begin_0, end = var_371_end_0, end_mask = var_371_end_mask_0, x = k_11_cast_fp16)[name = string("op_371_cast_fp16")];
+            tensor<int32, [4]> var_377_begin_0 = const()[name = string("op_377_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_377_end_0 = const()[name = string("op_377_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_377_end_mask_0 = const()[name = string("op_377_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_377_cast_fp16 = slice_by_index(begin = var_377_begin_0, end = var_377_end_0, end_mask = var_377_end_mask_0, x = k_11_cast_fp16)[name = string("op_377_cast_fp16")];
             fp16 const_21_promoted_to_fp16 = const()[name = string("const_21_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_372_cast_fp16 = mul(x = var_370_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_372_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_379_cast_fp16 = mul(x = var_377_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_379_cast_fp16")];
             bool rotated_interleave_0 = const()[name = string("rotated_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_cast_fp16 = concat(axis = var_261, interleave = rotated_interleave_0, values = (var_372_cast_fp16, var_364_cast_fp16))[name = string("rotated_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_375_cast_fp16 = mul(x = k_11_cast_fp16, y = cos)[name = string("op_375_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_376_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_376_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_cast_fp16 = add(x = var_375_cast_fp16, y = var_376_cast_fp16)[name = string("roped_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_cast_fp16 = concat(axis = var_267, interleave = rotated_interleave_0, values = (var_379_cast_fp16, var_371_cast_fp16))[name = string("rotated_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_382_cast_fp16 = mul(x = k_11_cast_fp16, y = cos)[name = string("op_382_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_383_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_383_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_cast_fp16 = add(x = var_382_cast_fp16, y = var_383_cast_fp16)[name = string("roped_cast_fp16")];
+            tensor<int32, [4]> v_13_perm_0 = const()[name = string("v_13_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             bool k_interleave_0 = const()[name = string("k_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_cast_fp16 = concat(axis = var_249, interleave = k_interleave_0, values = (k_cache_1, roped_cast_fp16))[name = string("k_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> k_cast_fp16 = concat(axis = var_255, interleave = k_interleave_0, values = (k_cache_1, roped_cast_fp16))[name = string("k_cast_fp16")];
             bool v_interleave_0 = const()[name = string("v_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> v_cast_fp16 = concat(axis = var_249, interleave = v_interleave_0, values = (v_cache_1, v_9_cast_fp16))[name = string("v_cast_fp16")];
-            tensor<int32, [4]> var_383_begin_0 = const()[name = string("op_383_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_383_end_0 = const()[name = string("op_383_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_383_end_mask_0 = const()[name = string("op_383_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_1 = slice_by_index(begin = var_383_begin_0, end = var_383_end_0, end_mask = var_383_end_mask_0, x = k_cast_fp16)[name = string("op_383_cast_fp16")];
-            tensor<int32, [4]> var_384_begin_0 = const()[name = string("op_384_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_384_end_0 = const()[name = string("op_384_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_384_end_mask_0 = const()[name = string("op_384_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_1 = slice_by_index(begin = var_384_begin_0, end = var_384_end_0, end_mask = var_384_end_mask_0, x = v_cast_fp16)[name = string("op_384_cast_fp16")];
-            fp16 var_388_to_fp16 = const()[name = string("op_388_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 1]> var_389_cast_fp16 = mul(x = roped_5_cast_fp16, y = var_388_to_fp16)[name = string("op_389_cast_fp16")];
-            bool attn_weights_5_transpose_x_0 = const()[name = string("attn_weights_5_transpose_x_0"), val = bool(true)];
-            bool attn_weights_5_transpose_y_0 = const()[name = string("attn_weights_5_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_5_cast_fp16 = matmul(transpose_x = attn_weights_5_transpose_x_0, transpose_y = attn_weights_5_transpose_y_0, x = var_389_cast_fp16, y = k_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_cast_fp16 = add(x = attn_weights_5_cast_fp16, y = mask)[name = string("attn_weights_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> var_397_cast_fp16 = softmax(axis = var_257, x = attn_weights_cast_fp16)[name = string("op_397_cast_fp16")];
-            bool attn_5_transpose_x_0 = const()[name = string("attn_5_transpose_x_0"), val = bool(false)];
-            bool attn_5_transpose_y_0 = const()[name = string("attn_5_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 1]> attn_5_cast_fp16 = matmul(transpose_x = attn_5_transpose_x_0, transpose_y = attn_5_transpose_y_0, x = v_cast_fp16, y = var_397_cast_fp16)[name = string("attn_5_cast_fp16")];
-            tensor<int32, [4]> var_401 = const()[name = string("op_401"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
-            tensor<fp16, [1, 4096, 1, 1]> input_9_cast_fp16 = reshape(shape = var_401, x = attn_5_cast_fp16)[name = string("input_9_cast_fp16")];
-            tensor<int32, [2]> var_405 = const()[name = string("op_405"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_407 = const()[name = string("op_407"), val = tensor<int32, [2]>([1, 1])];
-            string var_409_pad_type_0 = const()[name = string("op_409_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_409_pad_0 = const()[name = string("op_409_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_409_cast_fp16 = conv(dilations = var_407, groups = var_258, pad = var_409_pad_0, pad_type = var_409_pad_type_0, strides = var_405, weight = blocks_1_attn_proj_weight_palettized_cast_fp16, x = input_9_cast_fp16)[name = string("op_409_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 128]> v_13_cast_fp16 = transpose(perm = v_13_perm_0, x = v_11_cast_fp16)[name = string("transpose_6")];
+            tensor<fp16, [1, 32, 512, 128]> v_cast_fp16 = concat(axis = var_267, interleave = v_interleave_0, values = (v_cache_1, v_13_cast_fp16))[name = string("v_cast_fp16")];
+            tensor<int32, [4]> var_394_begin_0 = const()[name = string("op_394_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_394_end_0 = const()[name = string("op_394_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_394_end_mask_0 = const()[name = string("op_394_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_1 = slice_by_index(begin = var_394_begin_0, end = var_394_end_0, end_mask = var_394_end_mask_0, x = k_cast_fp16)[name = string("op_394_cast_fp16")];
+            tensor<int32, [4]> var_395_begin_0 = const()[name = string("op_395_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_395_end_0 = const()[name = string("op_395_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_395_end_mask_0 = const()[name = string("op_395_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_1 = slice_by_index(begin = var_395_begin_0, end = var_395_end_0, end_mask = var_395_end_mask_0, x = v_cast_fp16)[name = string("op_395_cast_fp16")];
+            fp16 var_400_to_fp16 = const()[name = string("op_400_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 32, 128, 4]> var_401_cast_fp16 = mul(x = roped_5_cast_fp16, y = var_400_to_fp16)[name = string("op_401_cast_fp16")];
+            bool attn_weights_7_transpose_x_0 = const()[name = string("attn_weights_7_transpose_x_0"), val = bool(true)];
+            bool attn_weights_7_transpose_y_0 = const()[name = string("attn_weights_7_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_7_cast_fp16 = matmul(transpose_x = attn_weights_7_transpose_x_0, transpose_y = attn_weights_7_transpose_y_0, x = var_401_cast_fp16, y = k_cast_fp16)[name = string("attn_weights_7_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_9_cast_fp16 = add(x = attn_weights_7_cast_fp16, y = mask)[name = string("attn_weights_9_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_cast_fp16 = softmax(axis = var_263, x = attn_weights_9_cast_fp16)[name = string("attn_weights_cast_fp16")];
+            bool var_410_transpose_x_0 = const()[name = string("op_410_transpose_x_0"), val = bool(false)];
+            bool var_410_transpose_y_0 = const()[name = string("op_410_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> var_410_cast_fp16 = matmul(transpose_x = var_410_transpose_x_0, transpose_y = var_410_transpose_y_0, x = attn_weights_cast_fp16, y = v_cast_fp16)[name = string("op_410_cast_fp16")];
+            tensor<int32, [4]> attn_3_perm_0 = const()[name = string("attn_3_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_413 = const()[name = string("op_413"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 4]> attn_3_cast_fp16 = transpose(perm = attn_3_perm_0, x = var_410_cast_fp16)[name = string("transpose_5")];
+            tensor<fp16, [1, 4096, 1, 4]> input_9_cast_fp16 = reshape(shape = var_413, x = attn_3_cast_fp16)[name = string("input_9_cast_fp16")];
+            tensor<int32, [2]> var_417 = const()[name = string("op_417"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_419 = const()[name = string("op_419"), val = tensor<int32, [2]>([1, 1])];
+            string var_421_pad_type_0 = const()[name = string("op_421_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_421_pad_0 = const()[name = string("op_421_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_421_cast_fp16 = conv(dilations = var_419, groups = var_264, pad = var_421_pad_0, pad_type = var_421_pad_type_0, strides = var_417, weight = blocks_1_attn_proj_weight_palettized_cast_fp16, x = input_9_cast_fp16)[name = string("op_421_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202513984)))];
-            tensor<fp16, [1, 4096, 1, 1]> attention_output_cast_fp16 = mul(x = var_409_cast_fp16, y = blocks_1_attn_proj_output_scales_to_fp16)[name = string("attention_output_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_25_cast_fp16 = add(x = attention_output_cast_fp16, y = x_15_cast_fp16)[name = string("x_25_cast_fp16")];
-            tensor<int32, [1]> var_428_axes_0 = const()[name = string("op_428_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_428_cast_fp16 = squeeze(axes = var_428_axes_0, x = x_25_cast_fp16)[name = string("op_428_cast_fp16")];
-            bool var_430_interleave_0 = const()[name = string("op_430_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_7_to_fp16 = const()[name = string("eps_chan_7_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_430_cast_fp16 = concat(axis = var_258, interleave = var_430_interleave_0, values = (var_428_cast_fp16, eps_chan_7_to_fp16))[name = string("op_430_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> attention_output_cast_fp16 = mul(x = var_421_cast_fp16, y = blocks_1_attn_proj_output_scales_to_fp16)[name = string("attention_output_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_25_cast_fp16 = add(x = attention_output_cast_fp16, y = x_15_cast_fp16)[name = string("x_25_cast_fp16")];
+            tensor<int32, [1]> var_440_axes_0 = const()[name = string("op_440_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_440_cast_fp16 = squeeze(axes = var_440_axes_0, x = x_25_cast_fp16)[name = string("op_440_cast_fp16")];
+            bool var_442_interleave_0 = const()[name = string("op_442_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_7_to_fp16 = const()[name = string("eps_chan_7_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_442_cast_fp16 = concat(axis = var_264, interleave = var_442_interleave_0, values = (var_440_cast_fp16, eps_chan_7_to_fp16))[name = string("op_442_cast_fp16")];
             tensor<int32, [1]> x_eps_7_axes_0 = const()[name = string("x_eps_7_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_7_cast_fp16 = expand_dims(axes = x_eps_7_axes_0, x = var_430_cast_fp16)[name = string("x_eps_7_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_7_cast_fp16 = expand_dims(axes = x_eps_7_axes_0, x = var_442_cast_fp16)[name = string("x_eps_7_cast_fp16")];
             tensor<int32, [1]> norm_x_7_axes_0 = const()[name = string("norm_x_7_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_7_cast_fp16 = reduce_l2_norm(axes = norm_x_7_axes_0, keep_dims = var_262, x = x_eps_7_cast_fp16)[name = string("norm_x_7_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_19_cast_fp16 = real_div(x = x_25_cast_fp16, y = norm_x_7_cast_fp16)[name = string("x_normed_19_cast_fp16")];
-            fp16 var_435_to_fp16 = const()[name = string("op_435_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_21_cast_fp16 = mul(x = x_normed_19_cast_fp16, y = var_435_to_fp16)[name = string("x_normed_21_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_7_cast_fp16 = reduce_l2_norm(axes = norm_x_7_axes_0, keep_dims = var_268, x = x_eps_7_cast_fp16)[name = string("norm_x_7_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_19_cast_fp16 = real_div(x = x_25_cast_fp16, y = norm_x_7_cast_fp16)[name = string("x_normed_19_cast_fp16")];
+            fp16 var_447_to_fp16 = const()[name = string("op_447_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_21_cast_fp16 = mul(x = x_normed_19_cast_fp16, y = var_447_to_fp16)[name = string("x_normed_21_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_2_weight_to_fp16 = const()[name = string("blocks_1_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202522240)))];
-            tensor<fp16, [1, 4096, 1, 1]> input_11_cast_fp16 = mul(x = x_normed_21_cast_fp16, y = blocks_1_norm_2_weight_to_fp16)[name = string("input_11_cast_fp16")];
-            tensor<int32, [2]> var_447 = const()[name = string("op_447"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_449 = const()[name = string("op_449"), val = tensor<int32, [2]>([1, 1])];
-            string var_451_pad_type_0 = const()[name = string("op_451_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_451_pad_0 = const()[name = string("op_451_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_451_cast_fp16 = conv(dilations = var_449, groups = var_258, pad = var_451_pad_0, pad_type = var_451_pad_type_0, strides = var_447, weight = blocks_1_mlp_fc_1_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_451_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> input_11_cast_fp16 = mul(x = x_normed_21_cast_fp16, y = blocks_1_norm_2_weight_to_fp16)[name = string("input_11_cast_fp16")];
+            tensor<int32, [2]> var_459 = const()[name = string("op_459"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_461 = const()[name = string("op_461"), val = tensor<int32, [2]>([1, 1])];
+            string var_463_pad_type_0 = const()[name = string("op_463_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_463_pad_0 = const()[name = string("op_463_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_463_cast_fp16 = conv(dilations = var_461, groups = var_264, pad = var_463_pad_0, pad_type = var_463_pad_type_0, strides = var_459, weight = blocks_1_mlp_fc_1_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_463_cast_fp16")];
             tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202530496)))];
-            tensor<fp16, [1, 11008, 1, 1]> input_13_cast_fp16 = mul(x = var_451_cast_fp16, y = blocks_1_mlp_fc_1_output_scales_to_fp16)[name = string("input_13_cast_fp16")];
-            tensor<int32, [2]> var_455 = const()[name = string("op_455"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_457 = const()[name = string("op_457"), val = tensor<int32, [2]>([1, 1])];
-            string var_459_pad_type_0 = const()[name = string("op_459_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_459_pad_0 = const()[name = string("op_459_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_459_cast_fp16 = conv(dilations = var_457, groups = var_258, pad = var_459_pad_0, pad_type = var_459_pad_type_0, strides = var_455, weight = blocks_1_mlp_fc_2_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_459_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202552576)))];
-            tensor<fp16, [1, 11008, 1, 1]> x_fc_2_cast_fp16 = mul(x = var_459_cast_fp16, y = blocks_1_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> var_461_cast_fp16 = silu(x = input_13_cast_fp16)[name = string("op_461_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> input_cast_fp16 = mul(x = var_461_cast_fp16, y = x_fc_2_cast_fp16)[name = string("input_cast_fp16")];
-            tensor<int32, [2]> var_465 = const()[name = string("op_465"), val = tensor<int32, [2]>([1, 1])];
+            tensor<fp16, [1, 11008, 1, 4]> input_13_cast_fp16 = mul(x = var_463_cast_fp16, y = blocks_1_mlp_fc_1_output_scales_to_fp16)[name = string("input_13_cast_fp16")];
             tensor<int32, [2]> var_467 = const()[name = string("op_467"), val = tensor<int32, [2]>([1, 1])];
-            string var_469_pad_type_0 = const()[name = string("op_469_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_469_pad_0 = const()[name = string("op_469_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_469_cast_fp16 = conv(dilations = var_467, groups = var_258, pad = var_469_pad_0, pad_type = var_469_pad_type_0, strides = var_465, weight = blocks_1_mlp_proj_weight_palettized_cast_fp16, x = input_cast_fp16)[name = string("op_469_cast_fp16")];
+            tensor<int32, [2]> var_469 = const()[name = string("op_469"), val = tensor<int32, [2]>([1, 1])];
+            string var_471_pad_type_0 = const()[name = string("op_471_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_471_pad_0 = const()[name = string("op_471_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_471_cast_fp16 = conv(dilations = var_469, groups = var_264, pad = var_471_pad_0, pad_type = var_471_pad_type_0, strides = var_467, weight = blocks_1_mlp_fc_2_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_471_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202552576)))];
+            tensor<fp16, [1, 11008, 1, 4]> x_fc_2_cast_fp16 = mul(x = var_471_cast_fp16, y = blocks_1_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> var_473_cast_fp16 = silu(x = input_13_cast_fp16)[name = string("op_473_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> input_cast_fp16 = mul(x = var_473_cast_fp16, y = x_fc_2_cast_fp16)[name = string("input_cast_fp16")];
+            tensor<int32, [2]> var_477 = const()[name = string("op_477"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_479 = const()[name = string("op_479"), val = tensor<int32, [2]>([1, 1])];
+            string var_481_pad_type_0 = const()[name = string("op_481_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_481_pad_0 = const()[name = string("op_481_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_481_cast_fp16 = conv(dilations = var_479, groups = var_264, pad = var_481_pad_0, pad_type = var_481_pad_type_0, strides = var_477, weight = blocks_1_mlp_proj_weight_palettized_cast_fp16, x = input_cast_fp16)[name = string("op_481_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202574656)))];
-            tensor<fp16, [1, 4096, 1, 1]> var_470_cast_fp16 = mul(x = var_469_cast_fp16, y = blocks_1_mlp_proj_output_scales_to_fp16)[name = string("op_470_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_29_cast_fp16 = add(x = var_470_cast_fp16, y = x_25_cast_fp16)[name = string("x_29_cast_fp16")];
-            int32 var_476 = const()[name = string("op_476"), val = int32(-1)];
-            int32 var_485 = const()[name = string("op_485"), val = int32(1)];
-            bool var_489 = const()[name = string("op_489"), val = bool(true)];
-            tensor<int32, [1]> var_505_axes_0 = const()[name = string("op_505_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_505_cast_fp16 = squeeze(axes = var_505_axes_0, x = x_29_cast_fp16)[name = string("op_505_cast_fp16")];
-            bool var_507_interleave_0 = const()[name = string("op_507_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_to_fp16 = const()[name = string("eps_chan_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_507_cast_fp16 = concat(axis = var_485, interleave = var_507_interleave_0, values = (var_505_cast_fp16, eps_chan_to_fp16))[name = string("op_507_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> var_482_cast_fp16 = mul(x = var_481_cast_fp16, y = blocks_1_mlp_proj_output_scales_to_fp16)[name = string("op_482_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_29_cast_fp16 = add(x = var_482_cast_fp16, y = x_25_cast_fp16)[name = string("x_29_cast_fp16")];
+            int32 var_488 = const()[name = string("op_488"), val = int32(-1)];
+            int32 var_497 = const()[name = string("op_497"), val = int32(1)];
+            bool var_501 = const()[name = string("op_501"), val = bool(true)];
+            tensor<int32, [1]> var_517_axes_0 = const()[name = string("op_517_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_517_cast_fp16 = squeeze(axes = var_517_axes_0, x = x_29_cast_fp16)[name = string("op_517_cast_fp16")];
+            bool var_519_interleave_0 = const()[name = string("op_519_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_to_fp16 = const()[name = string("eps_chan_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_519_cast_fp16 = concat(axis = var_497, interleave = var_519_interleave_0, values = (var_517_cast_fp16, eps_chan_to_fp16))[name = string("op_519_cast_fp16")];
             tensor<int32, [1]> x_eps_axes_0 = const()[name = string("x_eps_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_cast_fp16 = expand_dims(axes = x_eps_axes_0, x = var_507_cast_fp16)[name = string("x_eps_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_cast_fp16 = expand_dims(axes = x_eps_axes_0, x = var_519_cast_fp16)[name = string("x_eps_cast_fp16")];
             tensor<int32, [1]> norm_x_axes_0 = const()[name = string("norm_x_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_cast_fp16 = reduce_l2_norm(axes = norm_x_axes_0, keep_dims = var_489, x = x_eps_cast_fp16)[name = string("norm_x_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_25_cast_fp16 = real_div(x = x_29_cast_fp16, y = norm_x_cast_fp16)[name = string("x_normed_25_cast_fp16")];
-            fp16 var_512_to_fp16 = const()[name = string("op_512_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_27_cast_fp16 = mul(x = x_normed_25_cast_fp16, y = var_512_to_fp16)[name = string("x_normed_27_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_cast_fp16 = reduce_l2_norm(axes = norm_x_axes_0, keep_dims = var_501, x = x_eps_cast_fp16)[name = string("norm_x_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_25_cast_fp16 = real_div(x = x_29_cast_fp16, y = norm_x_cast_fp16)[name = string("x_normed_25_cast_fp16")];
+            fp16 var_524_to_fp16 = const()[name = string("op_524_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_27_cast_fp16 = mul(x = x_normed_25_cast_fp16, y = var_524_to_fp16)[name = string("x_normed_27_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> post_block_ln_f_weight_to_fp16 = const()[name = string("post_block_ln_f_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202582912)))];
-            tensor<fp16, [1, 4096, 1, 1]> x_cast_fp16 = mul(x = x_normed_27_cast_fp16, y = post_block_ln_f_weight_to_fp16)[name = string("x_cast_fp16")];
-            tensor<int32, [1]> var_516_axes_0 = const()[name = string("op_516_axes_0"), val = tensor<int32, [1]>([2])];
-            tensor<fp16, [1, 4096, 1]> var_516_cast_fp16 = squeeze(axes = var_516_axes_0, x = x_cast_fp16)[name = string("op_516_cast_fp16")];
-            tensor<int32, [3]> var_517_perm_0 = const()[name = string("op_517_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [2]> concat_4 = const()[name = string("concat_4"), val = tensor<int32, [2]>([1, 4096])];
-            tensor<fp16, [1, 1, 4096]> var_517_cast_fp16 = transpose(perm = var_517_perm_0, x = var_516_cast_fp16)[name = string("transpose_4")];
-            tensor<fp16, [1, 4096]> reshape_0_cast_fp16 = reshape(shape = concat_4, x = var_517_cast_fp16)[name = string("reshape_0_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_cast_fp16 = mul(x = x_normed_27_cast_fp16, y = post_block_ln_f_weight_to_fp16)[name = string("x_cast_fp16")];
+            tensor<int32, [1]> var_528_axes_0 = const()[name = string("op_528_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 4096, 4]> var_528_cast_fp16 = squeeze(axes = var_528_axes_0, x = x_cast_fp16)[name = string("op_528_cast_fp16")];
+            tensor<int32, [3]> var_529_perm_0 = const()[name = string("op_529_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [2]> concat_4 = const()[name = string("concat_4"), val = tensor<int32, [2]>([4, 4096])];
+            tensor<fp16, [1, 4, 4096]> var_529_cast_fp16 = transpose(perm = var_529_perm_0, x = var_528_cast_fp16)[name = string("transpose_4")];
+            tensor<fp16, [4, 4096]> reshape_0_cast_fp16 = reshape(shape = concat_4, x = var_529_cast_fp16)[name = string("reshape_0_cast_fp16")];
             bool matmul_0_transpose_x_0 = const()[name = string("matmul_0_transpose_x_0"), val = bool(false)];
             bool matmul_0_transpose_y_0 = const()[name = string("matmul_0_transpose_y_0"), val = bool(false)];
             tensor<fp16, [4096, 16384]> transpose_1_to_fp16 = const()[name = string("transpose_1_to_fp16"), val = tensor<fp16, [4096, 16384]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202591168)))];
-            tensor<fp16, [1, 16384]> matmul_0_cast_fp16 = matmul(transpose_x = matmul_0_transpose_x_0, transpose_y = matmul_0_transpose_y_0, x = reshape_0_cast_fp16, y = transpose_1_to_fp16)[name = string("matmul_0_cast_fp16")];
-            tensor<int32, [3]> concat_8 = const()[name = string("concat_8"), val = tensor<int32, [3]>([1, 1, 16384])];
-            tensor<fp16, [1, 1, 16384]> reshape_2_cast_fp16 = reshape(shape = concat_8, x = matmul_0_cast_fp16)[name = string("reshape_2_cast_fp16")];
+            tensor<fp16, [4, 16384]> matmul_0_cast_fp16 = matmul(transpose_x = matmul_0_transpose_x_0, transpose_y = matmul_0_transpose_y_0, x = reshape_0_cast_fp16, y = transpose_1_to_fp16)[name = string("matmul_0_cast_fp16")];
+            tensor<int32, [3]> concat_8 = const()[name = string("concat_8"), val = tensor<int32, [3]>([1, 4, 16384])];
+            tensor<fp16, [1, 4, 16384]> reshape_2_cast_fp16 = reshape(shape = concat_8, x = matmul_0_cast_fp16)[name = string("reshape_2_cast_fp16")];
             bool matmul_1_transpose_x_0 = const()[name = string("matmul_1_transpose_x_0"), val = bool(false)];
             bool matmul_1_transpose_y_0 = const()[name = string("matmul_1_transpose_y_0"), val = bool(false)];
             tensor<fp16, [4096, 15616]> transpose_3_to_fp16 = const()[name = string("transpose_3_to_fp16"), val = tensor<fp16, [4096, 15616]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(336808960)))];
-            tensor<fp16, [1, 15616]> matmul_1_cast_fp16 = matmul(transpose_x = matmul_1_transpose_x_0, transpose_y = matmul_1_transpose_y_0, x = reshape_0_cast_fp16, y = transpose_3_to_fp16)[name = string("matmul_1_cast_fp16")];
-            tensor<int32, [3]> concat_16 = const()[name = string("concat_16"), val = tensor<int32, [3]>([1, 1, 15616])];
-            tensor<fp16, [1, 1, 15616]> reshape_5_cast_fp16 = reshape(shape = concat_16, x = matmul_1_cast_fp16)[name = string("reshape_5_cast_fp16")];
-            bool var_526_interleave_0 = const()[name = string("op_526_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 32000]> logits = concat(axis = var_476, interleave = var_526_interleave_0, values = (reshape_2_cast_fp16, reshape_5_cast_fp16))[name = string("op_526_cast_fp16")];
+            tensor<fp16, [4, 15616]> matmul_1_cast_fp16 = matmul(transpose_x = matmul_1_transpose_x_0, transpose_y = matmul_1_transpose_y_0, x = reshape_0_cast_fp16, y = transpose_3_to_fp16)[name = string("matmul_1_cast_fp16")];
+            tensor<int32, [3]> concat_16 = const()[name = string("concat_16"), val = tensor<int32, [3]>([1, 4, 15616])];
+            tensor<fp16, [1, 4, 15616]> reshape_5_cast_fp16 = reshape(shape = concat_16, x = matmul_1_cast_fp16)[name = string("reshape_5_cast_fp16")];
+            bool var_538_interleave_0 = const()[name = string("op_538_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 4, 32000]> logits = concat(axis = var_488, interleave = var_538_interleave_0, values = (reshape_2_cast_fp16, reshape_5_cast_fp16))[name = string("op_538_cast_fp16")];
         } -> (logits, new_k_cache_0, new_k_cache_1, new_v_cache_0, new_v_cache_1);
     func input_512_context_512<ios18>(tensor<fp16, [128, 512]> cos, tensor<fp16, [1, 1, 512, 512]> mask, tensor<fp16, [128, 512]> sin, tensor<fp16, [1, 4096, 1, 512]> x) {
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_q_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8388736))))[name = string("blocks_0_attn_q_proj_weight_palettized_cast_fp16")];
@@ -379,86 +387,86 @@ program(1.3)
             tensor<fp16, [1, 4096, 1, 512]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_53_to_fp16)[name = string("x_normed_3_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_1_weight_to_fp16 = const()[name = string("blocks_0_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202379008)))];
             tensor<fp16, [1, 4096, 1, 512]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = string("x_5_cast_fp16")];
-            tensor<int32, [2]> var_65 = const()[name = string("op_65"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_67 = const()[name = string("op_67"), val = tensor<int32, [2]>([1, 1])];
-            string var_69_pad_type_0 = const()[name = string("op_69_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_69_pad_0 = const()[name = string("op_69_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_69_cast_fp16 = conv(dilations = var_67, groups = var_24, pad = var_69_pad_0, pad_type = var_69_pad_type_0, strides = var_65, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_69_cast_fp16")];
+            tensor<int32, [2]> var_66 = const()[name = string("op_66"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_68 = const()[name = string("op_68"), val = tensor<int32, [2]>([1, 1])];
+            string var_70_pad_type_0 = const()[name = string("op_70_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_70_pad_0 = const()[name = string("op_70_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_70_cast_fp16 = conv(dilations = var_68, groups = var_24, pad = var_70_pad_0, pad_type = var_70_pad_type_0, strides = var_66, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_70_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202387264)))];
-            tensor<fp16, [1, 4096, 1, 512]> q_1_cast_fp16 = mul(x = var_69_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
-            tensor<int32, [2]> var_73 = const()[name = string("op_73"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_75 = const()[name = string("op_75"), val = tensor<int32, [2]>([1, 1])];
-            string var_77_pad_type_0 = const()[name = string("op_77_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_77_pad_0 = const()[name = string("op_77_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_77_cast_fp16 = conv(dilations = var_75, groups = var_24, pad = var_77_pad_0, pad_type = var_77_pad_type_0, strides = var_73, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_77_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> q_1_cast_fp16 = mul(x = var_70_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
+            tensor<int32, [2]> var_74 = const()[name = string("op_74"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_76 = const()[name = string("op_76"), val = tensor<int32, [2]>([1, 1])];
+            string var_78_pad_type_0 = const()[name = string("op_78_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_78_pad_0 = const()[name = string("op_78_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_78_cast_fp16 = conv(dilations = var_76, groups = var_24, pad = var_78_pad_0, pad_type = var_78_pad_type_0, strides = var_74, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_78_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202395520)))];
-            tensor<fp16, [1, 4096, 1, 512]> k_1_cast_fp16 = mul(x = var_77_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
-            tensor<int32, [2]> var_81 = const()[name = string("op_81"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_83 = const()[name = string("op_83"), val = tensor<int32, [2]>([1, 1])];
-            string var_85_pad_type_0 = const()[name = string("op_85_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_85_pad_0 = const()[name = string("op_85_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_85_cast_fp16 = conv(dilations = var_83, groups = var_24, pad = var_85_pad_0, pad_type = var_85_pad_type_0, strides = var_81, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_85_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> k_1_cast_fp16 = mul(x = var_78_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
+            tensor<int32, [2]> var_82 = const()[name = string("op_82"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_84 = const()[name = string("op_84"), val = tensor<int32, [2]>([1, 1])];
+            string var_86_pad_type_0 = const()[name = string("op_86_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_86_pad_0 = const()[name = string("op_86_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_86_cast_fp16 = conv(dilations = var_84, groups = var_24, pad = var_86_pad_0, pad_type = var_86_pad_type_0, strides = var_82, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_86_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202403776)))];
-            tensor<fp16, [1, 4096, 1, 512]> v_1_cast_fp16 = mul(x = var_85_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
-            tensor<int32, [4]> var_87 = const()[name = string("op_87"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> q_3_cast_fp16 = reshape(shape = var_87, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
-            tensor<int32, [4]> var_89 = const()[name = string("op_89"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> k_3_cast_fp16 = reshape(shape = var_89, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
-            tensor<int32, [4]> var_91 = const()[name = string("op_91"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> v_3_cast_fp16 = reshape(shape = var_91, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
-            tensor<int32, [4]> var_103_begin_0 = const()[name = string("op_103_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_103_end_0 = const()[name = string("op_103_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_103_end_mask_0 = const()[name = string("op_103_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_103_cast_fp16 = slice_by_index(begin = var_103_begin_0, end = var_103_end_0, end_mask = var_103_end_mask_0, x = q_3_cast_fp16)[name = string("op_103_cast_fp16")];
-            tensor<int32, [4]> var_109_begin_0 = const()[name = string("op_109_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_109_end_0 = const()[name = string("op_109_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_109_end_mask_0 = const()[name = string("op_109_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_109_cast_fp16 = slice_by_index(begin = var_109_begin_0, end = var_109_end_0, end_mask = var_109_end_mask_0, x = q_3_cast_fp16)[name = string("op_109_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> v_1_cast_fp16 = mul(x = var_86_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
+            tensor<int32, [4]> var_88 = const()[name = string("op_88"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> q_3_cast_fp16 = reshape(shape = var_88, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
+            tensor<int32, [4]> var_90 = const()[name = string("op_90"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> k_3_cast_fp16 = reshape(shape = var_90, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
+            tensor<int32, [4]> var_92 = const()[name = string("op_92"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> v_3_cast_fp16 = reshape(shape = var_92, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
+            tensor<int32, [4]> var_104_begin_0 = const()[name = string("op_104_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_104_end_0 = const()[name = string("op_104_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_104_end_mask_0 = const()[name = string("op_104_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_104_cast_fp16 = slice_by_index(begin = var_104_begin_0, end = var_104_end_0, end_mask = var_104_end_mask_0, x = q_3_cast_fp16)[name = string("op_104_cast_fp16")];
+            tensor<int32, [4]> var_110_begin_0 = const()[name = string("op_110_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_110_end_0 = const()[name = string("op_110_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_110_end_mask_0 = const()[name = string("op_110_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_110_cast_fp16 = slice_by_index(begin = var_110_begin_0, end = var_110_end_0, end_mask = var_110_end_mask_0, x = q_3_cast_fp16)[name = string("op_110_cast_fp16")];
             fp16 const_6_promoted_to_fp16 = const()[name = string("const_6_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_111_cast_fp16 = mul(x = var_109_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_111_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_112_cast_fp16 = mul(x = var_110_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_112_cast_fp16")];
             bool rotated_1_interleave_0 = const()[name = string("rotated_1_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_1_cast_fp16 = concat(axis = var_27, interleave = rotated_1_interleave_0, values = (var_111_cast_fp16, var_103_cast_fp16))[name = string("rotated_1_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_114_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_114_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_115_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_115_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_1_cast_fp16 = add(x = var_114_cast_fp16, y = var_115_cast_fp16)[name = string("roped_1_cast_fp16")];
-            tensor<int32, [4]> var_128_begin_0 = const()[name = string("op_128_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_128_end_0 = const()[name = string("op_128_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_128_end_mask_0 = const()[name = string("op_128_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_128_cast_fp16 = slice_by_index(begin = var_128_begin_0, end = var_128_end_0, end_mask = var_128_end_mask_0, x = k_3_cast_fp16)[name = string("op_128_cast_fp16")];
-            tensor<int32, [4]> var_134_begin_0 = const()[name = string("op_134_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_134_end_0 = const()[name = string("op_134_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_134_end_mask_0 = const()[name = string("op_134_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_134_cast_fp16 = slice_by_index(begin = var_134_begin_0, end = var_134_end_0, end_mask = var_134_end_mask_0, x = k_3_cast_fp16)[name = string("op_134_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_1_cast_fp16 = concat(axis = var_27, interleave = rotated_1_interleave_0, values = (var_112_cast_fp16, var_104_cast_fp16))[name = string("rotated_1_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_115_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_115_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_116_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_116_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_1_cast_fp16 = add(x = var_115_cast_fp16, y = var_116_cast_fp16)[name = string("roped_1_cast_fp16")];
+            tensor<int32, [4]> var_129_begin_0 = const()[name = string("op_129_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_129_end_0 = const()[name = string("op_129_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_129_end_mask_0 = const()[name = string("op_129_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_129_cast_fp16 = slice_by_index(begin = var_129_begin_0, end = var_129_end_0, end_mask = var_129_end_mask_0, x = k_3_cast_fp16)[name = string("op_129_cast_fp16")];
+            tensor<int32, [4]> var_135_begin_0 = const()[name = string("op_135_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_135_end_0 = const()[name = string("op_135_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_135_end_mask_0 = const()[name = string("op_135_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_135_cast_fp16 = slice_by_index(begin = var_135_begin_0, end = var_135_end_0, end_mask = var_135_end_mask_0, x = k_3_cast_fp16)[name = string("op_135_cast_fp16")];
             fp16 const_8_promoted_to_fp16 = const()[name = string("const_8_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_136_cast_fp16 = mul(x = var_134_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_136_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_137_cast_fp16 = mul(x = var_135_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_137_cast_fp16")];
             bool rotated_3_interleave_0 = const()[name = string("rotated_3_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_3_cast_fp16 = concat(axis = var_27, interleave = rotated_3_interleave_0, values = (var_136_cast_fp16, var_128_cast_fp16))[name = string("rotated_3_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_139_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_139_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_140_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_140_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_3_cast_fp16 = add(x = var_139_cast_fp16, y = var_140_cast_fp16)[name = string("roped_3_cast_fp16")];
-            bool q_5_interleave_0 = const()[name = string("q_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> q_5_cast_fp16 = concat(axis = var_27, interleave = q_5_interleave_0, values = roped_1_cast_fp16)[name = string("q_5_cast_fp16")];
-            bool k_5_interleave_0 = const()[name = string("k_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_5_cast_fp16 = concat(axis = var_27, interleave = k_5_interleave_0, values = roped_3_cast_fp16)[name = string("k_5_cast_fp16")];
-            tensor<int32, [4]> var_155_begin_0 = const()[name = string("op_155_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_155_end_0 = const()[name = string("op_155_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_155_end_mask_0 = const()[name = string("op_155_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_0 = slice_by_index(begin = var_155_begin_0, end = var_155_end_0, end_mask = var_155_end_mask_0, x = k_5_cast_fp16)[name = string("op_155_cast_fp16")];
-            tensor<int32, [4]> var_156_begin_0 = const()[name = string("op_156_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_156_end_0 = const()[name = string("op_156_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_156_end_mask_0 = const()[name = string("op_156_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_0 = slice_by_index(begin = var_156_begin_0, end = var_156_end_0, end_mask = var_156_end_mask_0, x = v_3_cast_fp16)[name = string("op_156_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_3_cast_fp16 = concat(axis = var_27, interleave = rotated_3_interleave_0, values = (var_137_cast_fp16, var_129_cast_fp16))[name = string("rotated_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_140_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_140_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_141_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_141_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_3_cast_fp16 = add(x = var_140_cast_fp16, y = var_141_cast_fp16)[name = string("roped_3_cast_fp16")];
+            tensor<int32, [4]> v_5_perm_0 = const()[name = string("v_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_145_begin_0 = const()[name = string("op_145_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_145_end_0 = const()[name = string("op_145_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_145_end_mask_0 = const()[name = string("op_145_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_0 = slice_by_index(begin = var_145_begin_0, end = var_145_end_0, end_mask = var_145_end_mask_0, x = roped_3_cast_fp16)[name = string("op_145_cast_fp16")];
+            tensor<int32, [4]> var_146_begin_0 = const()[name = string("op_146_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_146_end_0 = const()[name = string("op_146_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_146_end_mask_0 = const()[name = string("op_146_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 512, 128]> v_5_cast_fp16 = transpose(perm = v_5_perm_0, x = v_3_cast_fp16)[name = string("transpose_8")];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_0 = slice_by_index(begin = var_146_begin_0, end = var_146_end_0, end_mask = var_146_end_mask_0, x = v_5_cast_fp16)[name = string("op_146_cast_fp16")];
             fp16 var_160_to_fp16 = const()[name = string("op_160_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 512]> var_161_cast_fp16 = mul(x = q_5_cast_fp16, y = var_160_to_fp16)[name = string("op_161_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_161_cast_fp16 = mul(x = roped_1_cast_fp16, y = var_160_to_fp16)[name = string("op_161_cast_fp16")];
             bool attn_weights_1_transpose_x_0 = const()[name = string("attn_weights_1_transpose_x_0"), val = bool(true)];
             bool attn_weights_1_transpose_y_0 = const()[name = string("attn_weights_1_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_161_cast_fp16, y = k_5_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_161_cast_fp16, y = roped_3_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
             tensor<fp16, [1, 32, 512, 512]> attn_weights_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = mask)[name = string("attn_weights_3_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> var_169_cast_fp16 = softmax(axis = var_23, x = attn_weights_3_cast_fp16)[name = string("op_169_cast_fp16")];
-            bool attn_1_transpose_x_0 = const()[name = string("attn_1_transpose_x_0"), val = bool(false)];
-            bool attn_1_transpose_y_0 = const()[name = string("attn_1_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 512]> attn_1_cast_fp16 = matmul(transpose_x = attn_1_transpose_x_0, transpose_y = attn_1_transpose_y_0, x = v_3_cast_fp16, y = var_169_cast_fp16)[name = string("attn_1_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_5_cast_fp16 = softmax(axis = var_23, x = attn_weights_3_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
+            bool var_170_transpose_x_1 = const()[name = string("op_170_transpose_x_1"), val = bool(false)];
+            bool var_170_transpose_y_1 = const()[name = string("op_170_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 32, 512, 128]> var_170_cast_fp16 = matmul(transpose_x = var_170_transpose_x_1, transpose_y = var_170_transpose_y_1, x = attn_weights_5_cast_fp16, y = v_3_cast_fp16)[name = string("op_170_cast_fp16")];
+            tensor<int32, [4]> attn_1_perm_0 = const()[name = string("attn_1_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             tensor<int32, [4]> var_173 = const()[name = string("op_173"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 512]> attn_1_cast_fp16 = transpose(perm = attn_1_perm_0, x = var_170_cast_fp16)[name = string("transpose_7")];
             tensor<fp16, [1, 4096, 1, 512]> input_1_cast_fp16 = reshape(shape = var_173, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
             tensor<int32, [2]> var_177 = const()[name = string("op_177"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_179 = const()[name = string("op_179"), val = tensor<int32, [2]>([1, 1])];
@@ -522,86 +530,86 @@ program(1.3)
             tensor<fp16, [1, 4096, 1, 512]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_290_to_fp16)[name = string("x_normed_15_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_1_weight_to_fp16 = const()[name = string("blocks_1_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202480960)))];
             tensor<fp16, [1, 4096, 1, 512]> x_19_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = string("x_19_cast_fp16")];
-            tensor<int32, [2]> var_305 = const()[name = string("op_305"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_307 = const()[name = string("op_307"), val = tensor<int32, [2]>([1, 1])];
-            string var_309_pad_type_0 = const()[name = string("op_309_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_309_pad_0 = const()[name = string("op_309_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_309_cast_fp16 = conv(dilations = var_307, groups = var_262, pad = var_309_pad_0, pad_type = var_309_pad_type_0, strides = var_305, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_309_cast_fp16")];
+            tensor<int32, [2]> var_306 = const()[name = string("op_306"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_308 = const()[name = string("op_308"), val = tensor<int32, [2]>([1, 1])];
+            string var_310_pad_type_0 = const()[name = string("op_310_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_310_pad_0 = const()[name = string("op_310_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_310_cast_fp16 = conv(dilations = var_308, groups = var_262, pad = var_310_pad_0, pad_type = var_310_pad_type_0, strides = var_306, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_310_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202489216)))];
-            tensor<fp16, [1, 4096, 1, 512]> q_7_cast_fp16 = mul(x = var_309_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
-            tensor<int32, [2]> var_313 = const()[name = string("op_313"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_315 = const()[name = string("op_315"), val = tensor<int32, [2]>([1, 1])];
-            string var_317_pad_type_0 = const()[name = string("op_317_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_317_pad_0 = const()[name = string("op_317_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_317_cast_fp16 = conv(dilations = var_315, groups = var_262, pad = var_317_pad_0, pad_type = var_317_pad_type_0, strides = var_313, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_317_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> q_7_cast_fp16 = mul(x = var_310_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
+            tensor<int32, [2]> var_314 = const()[name = string("op_314"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_316 = const()[name = string("op_316"), val = tensor<int32, [2]>([1, 1])];
+            string var_318_pad_type_0 = const()[name = string("op_318_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_318_pad_0 = const()[name = string("op_318_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_318_cast_fp16 = conv(dilations = var_316, groups = var_262, pad = var_318_pad_0, pad_type = var_318_pad_type_0, strides = var_314, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_318_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202497472)))];
-            tensor<fp16, [1, 4096, 1, 512]> k_7_cast_fp16 = mul(x = var_317_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_7_cast_fp16")];
-            tensor<int32, [2]> var_321 = const()[name = string("op_321"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_323 = const()[name = string("op_323"), val = tensor<int32, [2]>([1, 1])];
-            string var_325_pad_type_0 = const()[name = string("op_325_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_325_pad_0 = const()[name = string("op_325_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_325_cast_fp16 = conv(dilations = var_323, groups = var_262, pad = var_325_pad_0, pad_type = var_325_pad_type_0, strides = var_321, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_325_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> k_7_cast_fp16 = mul(x = var_318_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_7_cast_fp16")];
+            tensor<int32, [2]> var_322 = const()[name = string("op_322"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_324 = const()[name = string("op_324"), val = tensor<int32, [2]>([1, 1])];
+            string var_326_pad_type_0 = const()[name = string("op_326_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_326_pad_0 = const()[name = string("op_326_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_326_cast_fp16 = conv(dilations = var_324, groups = var_262, pad = var_326_pad_0, pad_type = var_326_pad_type_0, strides = var_322, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_326_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202505728)))];
-            tensor<fp16, [1, 4096, 1, 512]> v_5_cast_fp16 = mul(x = var_325_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_5_cast_fp16")];
-            tensor<int32, [4]> var_327 = const()[name = string("op_327"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> q_9_cast_fp16 = reshape(shape = var_327, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
-            tensor<int32, [4]> var_329 = const()[name = string("op_329"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> k_9_cast_fp16 = reshape(shape = var_329, x = k_7_cast_fp16)[name = string("k_9_cast_fp16")];
-            tensor<int32, [4]> var_331 = const()[name = string("op_331"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> v_cast_fp16 = reshape(shape = var_331, x = v_5_cast_fp16)[name = string("v_cast_fp16")];
-            tensor<int32, [4]> var_343_begin_0 = const()[name = string("op_343_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_343_end_0 = const()[name = string("op_343_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_343_end_mask_0 = const()[name = string("op_343_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_343_cast_fp16 = slice_by_index(begin = var_343_begin_0, end = var_343_end_0, end_mask = var_343_end_mask_0, x = q_9_cast_fp16)[name = string("op_343_cast_fp16")];
-            tensor<int32, [4]> var_349_begin_0 = const()[name = string("op_349_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_349_end_0 = const()[name = string("op_349_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_349_end_mask_0 = const()[name = string("op_349_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_349_cast_fp16 = slice_by_index(begin = var_349_begin_0, end = var_349_end_0, end_mask = var_349_end_mask_0, x = q_9_cast_fp16)[name = string("op_349_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> v_7_cast_fp16 = mul(x = var_326_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_7_cast_fp16")];
+            tensor<int32, [4]> var_328 = const()[name = string("op_328"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> q_9_cast_fp16 = reshape(shape = var_328, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
+            tensor<int32, [4]> var_330 = const()[name = string("op_330"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> k_9_cast_fp16 = reshape(shape = var_330, x = k_7_cast_fp16)[name = string("k_9_cast_fp16")];
+            tensor<int32, [4]> var_332 = const()[name = string("op_332"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> v_9_cast_fp16 = reshape(shape = var_332, x = v_7_cast_fp16)[name = string("v_9_cast_fp16")];
+            tensor<int32, [4]> var_344_begin_0 = const()[name = string("op_344_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_344_end_0 = const()[name = string("op_344_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_344_end_mask_0 = const()[name = string("op_344_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_344_cast_fp16 = slice_by_index(begin = var_344_begin_0, end = var_344_end_0, end_mask = var_344_end_mask_0, x = q_9_cast_fp16)[name = string("op_344_cast_fp16")];
+            tensor<int32, [4]> var_350_begin_0 = const()[name = string("op_350_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_350_end_0 = const()[name = string("op_350_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_350_end_mask_0 = const()[name = string("op_350_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_350_cast_fp16 = slice_by_index(begin = var_350_begin_0, end = var_350_end_0, end_mask = var_350_end_mask_0, x = q_9_cast_fp16)[name = string("op_350_cast_fp16")];
             fp16 const_19_promoted_to_fp16 = const()[name = string("const_19_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_351_cast_fp16 = mul(x = var_349_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_351_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_352_cast_fp16 = mul(x = var_350_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_352_cast_fp16")];
             bool rotated_5_interleave_0 = const()[name = string("rotated_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_5_cast_fp16 = concat(axis = var_265, interleave = rotated_5_interleave_0, values = (var_351_cast_fp16, var_343_cast_fp16))[name = string("rotated_5_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_354_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_354_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_355_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_355_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_5_cast_fp16 = add(x = var_354_cast_fp16, y = var_355_cast_fp16)[name = string("roped_5_cast_fp16")];
-            tensor<int32, [4]> var_368_begin_0 = const()[name = string("op_368_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_368_end_0 = const()[name = string("op_368_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_368_end_mask_0 = const()[name = string("op_368_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_368_cast_fp16 = slice_by_index(begin = var_368_begin_0, end = var_368_end_0, end_mask = var_368_end_mask_0, x = k_9_cast_fp16)[name = string("op_368_cast_fp16")];
-            tensor<int32, [4]> var_374_begin_0 = const()[name = string("op_374_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_374_end_0 = const()[name = string("op_374_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_374_end_mask_0 = const()[name = string("op_374_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_374_cast_fp16 = slice_by_index(begin = var_374_begin_0, end = var_374_end_0, end_mask = var_374_end_mask_0, x = k_9_cast_fp16)[name = string("op_374_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_5_cast_fp16 = concat(axis = var_265, interleave = rotated_5_interleave_0, values = (var_352_cast_fp16, var_344_cast_fp16))[name = string("rotated_5_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_355_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_355_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_356_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_356_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_5_cast_fp16 = add(x = var_355_cast_fp16, y = var_356_cast_fp16)[name = string("roped_5_cast_fp16")];
+            tensor<int32, [4]> var_369_begin_0 = const()[name = string("op_369_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_369_end_0 = const()[name = string("op_369_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_369_end_mask_0 = const()[name = string("op_369_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_369_cast_fp16 = slice_by_index(begin = var_369_begin_0, end = var_369_end_0, end_mask = var_369_end_mask_0, x = k_9_cast_fp16)[name = string("op_369_cast_fp16")];
+            tensor<int32, [4]> var_375_begin_0 = const()[name = string("op_375_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_375_end_0 = const()[name = string("op_375_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_375_end_mask_0 = const()[name = string("op_375_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_375_cast_fp16 = slice_by_index(begin = var_375_begin_0, end = var_375_end_0, end_mask = var_375_end_mask_0, x = k_9_cast_fp16)[name = string("op_375_cast_fp16")];
             fp16 const_21_promoted_to_fp16 = const()[name = string("const_21_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_376_cast_fp16 = mul(x = var_374_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_376_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_377_cast_fp16 = mul(x = var_375_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_377_cast_fp16")];
             bool rotated_interleave_0 = const()[name = string("rotated_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_cast_fp16 = concat(axis = var_265, interleave = rotated_interleave_0, values = (var_376_cast_fp16, var_368_cast_fp16))[name = string("rotated_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_379_cast_fp16 = mul(x = k_9_cast_fp16, y = cos)[name = string("op_379_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_380_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_380_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_cast_fp16 = add(x = var_379_cast_fp16, y = var_380_cast_fp16)[name = string("roped_cast_fp16")];
-            bool q_interleave_0 = const()[name = string("q_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> q_cast_fp16 = concat(axis = var_265, interleave = q_interleave_0, values = roped_5_cast_fp16)[name = string("q_cast_fp16")];
-            bool k_interleave_0 = const()[name = string("k_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_cast_fp16 = concat(axis = var_265, interleave = k_interleave_0, values = roped_cast_fp16)[name = string("k_cast_fp16")];
-            tensor<int32, [4]> var_395_begin_0 = const()[name = string("op_395_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_395_end_0 = const()[name = string("op_395_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_395_end_mask_0 = const()[name = string("op_395_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_1 = slice_by_index(begin = var_395_begin_0, end = var_395_end_0, end_mask = var_395_end_mask_0, x = k_cast_fp16)[name = string("op_395_cast_fp16")];
-            tensor<int32, [4]> var_396_begin_0 = const()[name = string("op_396_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_396_end_0 = const()[name = string("op_396_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_396_end_mask_0 = const()[name = string("op_396_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_1 = slice_by_index(begin = var_396_begin_0, end = var_396_end_0, end_mask = var_396_end_mask_0, x = v_cast_fp16)[name = string("op_396_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_cast_fp16 = concat(axis = var_265, interleave = rotated_interleave_0, values = (var_377_cast_fp16, var_369_cast_fp16))[name = string("rotated_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_380_cast_fp16 = mul(x = k_9_cast_fp16, y = cos)[name = string("op_380_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_381_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_381_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_cast_fp16 = add(x = var_380_cast_fp16, y = var_381_cast_fp16)[name = string("roped_cast_fp16")];
+            tensor<int32, [4]> v_perm_0 = const()[name = string("v_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_385_begin_0 = const()[name = string("op_385_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_385_end_0 = const()[name = string("op_385_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_385_end_mask_0 = const()[name = string("op_385_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_1 = slice_by_index(begin = var_385_begin_0, end = var_385_end_0, end_mask = var_385_end_mask_0, x = roped_cast_fp16)[name = string("op_385_cast_fp16")];
+            tensor<int32, [4]> var_386_begin_0 = const()[name = string("op_386_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_386_end_0 = const()[name = string("op_386_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_386_end_mask_0 = const()[name = string("op_386_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 512, 128]> v_cast_fp16 = transpose(perm = v_perm_0, x = v_9_cast_fp16)[name = string("transpose_6")];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_1 = slice_by_index(begin = var_386_begin_0, end = var_386_end_0, end_mask = var_386_end_mask_0, x = v_cast_fp16)[name = string("op_386_cast_fp16")];
             fp16 var_400_to_fp16 = const()[name = string("op_400_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 512]> var_401_cast_fp16 = mul(x = q_cast_fp16, y = var_400_to_fp16)[name = string("op_401_cast_fp16")];
-            bool attn_weights_5_transpose_x_0 = const()[name = string("attn_weights_5_transpose_x_0"), val = bool(true)];
-            bool attn_weights_5_transpose_y_0 = const()[name = string("attn_weights_5_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_5_cast_fp16 = matmul(transpose_x = attn_weights_5_transpose_x_0, transpose_y = attn_weights_5_transpose_y_0, x = var_401_cast_fp16, y = k_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_cast_fp16 = add(x = attn_weights_5_cast_fp16, y = mask)[name = string("attn_weights_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> var_409_cast_fp16 = softmax(axis = var_261, x = attn_weights_cast_fp16)[name = string("op_409_cast_fp16")];
-            bool attn_3_transpose_x_0 = const()[name = string("attn_3_transpose_x_0"), val = bool(false)];
-            bool attn_3_transpose_y_0 = const()[name = string("attn_3_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 512]> attn_3_cast_fp16 = matmul(transpose_x = attn_3_transpose_x_0, transpose_y = attn_3_transpose_y_0, x = v_cast_fp16, y = var_409_cast_fp16)[name = string("attn_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_401_cast_fp16 = mul(x = roped_5_cast_fp16, y = var_400_to_fp16)[name = string("op_401_cast_fp16")];
+            bool attn_weights_7_transpose_x_0 = const()[name = string("attn_weights_7_transpose_x_0"), val = bool(true)];
+            bool attn_weights_7_transpose_y_0 = const()[name = string("attn_weights_7_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_7_cast_fp16 = matmul(transpose_x = attn_weights_7_transpose_x_0, transpose_y = attn_weights_7_transpose_y_0, x = var_401_cast_fp16, y = roped_cast_fp16)[name = string("attn_weights_7_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_9_cast_fp16 = add(x = attn_weights_7_cast_fp16, y = mask)[name = string("attn_weights_9_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_cast_fp16 = softmax(axis = var_261, x = attn_weights_9_cast_fp16)[name = string("attn_weights_cast_fp16")];
+            bool var_410_transpose_x_1 = const()[name = string("op_410_transpose_x_1"), val = bool(false)];
+            bool var_410_transpose_y_1 = const()[name = string("op_410_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 32, 512, 128]> var_410_cast_fp16 = matmul(transpose_x = var_410_transpose_x_1, transpose_y = var_410_transpose_y_1, x = attn_weights_cast_fp16, y = v_9_cast_fp16)[name = string("op_410_cast_fp16")];
+            tensor<int32, [4]> attn_3_perm_0 = const()[name = string("attn_3_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             tensor<int32, [4]> var_413 = const()[name = string("op_413"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 512]> attn_3_cast_fp16 = transpose(perm = attn_3_perm_0, x = var_410_cast_fp16)[name = string("transpose_5")];
             tensor<fp16, [1, 4096, 1, 512]> input_9_cast_fp16 = reshape(shape = var_413, x = attn_3_cast_fp16)[name = string("input_9_cast_fp16")];
             tensor<int32, [2]> var_417 = const()[name = string("op_417"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_419 = const()[name = string("op_419"), val = tensor<int32, [2]>([1, 1])];
@@ -667,21 +675,21 @@ program(1.3)
             tensor<int32, [1]> var_528_axes_0 = const()[name = string("op_528_axes_0"), val = tensor<int32, [1]>([2])];
             tensor<fp16, [1, 4096, 512]> var_528_cast_fp16 = squeeze(axes = var_528_axes_0, x = x_cast_fp16)[name = string("op_528_cast_fp16")];
             tensor<int32, [3]> var_529_perm_0 = const()[name = string("op_529_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-            tensor<int32, [2]> concat_4 = const()[name = string("concat_4"), val = tensor<int32, [2]>([512, 4096])];
+            tensor<int32, [2]> concat_8 = const()[name = string("concat_8"), val = tensor<int32, [2]>([512, 4096])];
             tensor<fp16, [1, 512, 4096]> var_529_cast_fp16 = transpose(perm = var_529_perm_0, x = var_528_cast_fp16)[name = string("transpose_4")];
-            tensor<fp16, [512, 4096]> reshape_0_cast_fp16 = reshape(shape = concat_4, x = var_529_cast_fp16)[name = string("reshape_0_cast_fp16")];
+            tensor<fp16, [512, 4096]> reshape_0_cast_fp16 = reshape(shape = concat_8, x = var_529_cast_fp16)[name = string("reshape_0_cast_fp16")];
             bool matmul_0_transpose_x_0 = const()[name = string("matmul_0_transpose_x_0"), val = bool(false)];
             bool matmul_0_transpose_y_0 = const()[name = string("matmul_0_transpose_y_0"), val = bool(false)];
             tensor<fp16, [4096, 16384]> transpose_1_to_fp16 = const()[name = string("transpose_1_to_fp16"), val = tensor<fp16, [4096, 16384]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202591168)))];
             tensor<fp16, [512, 16384]> matmul_0_cast_fp16 = matmul(transpose_x = matmul_0_transpose_x_0, transpose_y = matmul_0_transpose_y_0, x = reshape_0_cast_fp16, y = transpose_1_to_fp16)[name = string("matmul_0_cast_fp16")];
-            tensor<int32, [3]> concat_8 = const()[name = string("concat_8"), val = tensor<int32, [3]>([1, 512, 16384])];
-            tensor<fp16, [1, 512, 16384]> reshape_2_cast_fp16 = reshape(shape = concat_8, x = matmul_0_cast_fp16)[name = string("reshape_2_cast_fp16")];
+            tensor<int32, [3]> concat_12 = const()[name = string("concat_12"), val = tensor<int32, [3]>([1, 512, 16384])];
+            tensor<fp16, [1, 512, 16384]> reshape_2_cast_fp16 = reshape(shape = concat_12, x = matmul_0_cast_fp16)[name = string("reshape_2_cast_fp16")];
             bool matmul_1_transpose_x_0 = const()[name = string("matmul_1_transpose_x_0"), val = bool(false)];
             bool matmul_1_transpose_y_0 = const()[name = string("matmul_1_transpose_y_0"), val = bool(false)];
             tensor<fp16, [4096, 15616]> transpose_3_to_fp16 = const()[name = string("transpose_3_to_fp16"), val = tensor<fp16, [4096, 15616]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(336808960)))];
             tensor<fp16, [512, 15616]> matmul_1_cast_fp16 = matmul(transpose_x = matmul_1_transpose_x_0, transpose_y = matmul_1_transpose_y_0, x = reshape_0_cast_fp16, y = transpose_3_to_fp16)[name = string("matmul_1_cast_fp16")];
-            tensor<int32, [3]> concat_16 = const()[name = string("concat_16"), val = tensor<int32, [3]>([1, 512, 15616])];
-            tensor<fp16, [1, 512, 15616]> reshape_5_cast_fp16 = reshape(shape = concat_16, x = matmul_1_cast_fp16)[name = string("reshape_5_cast_fp16")];
+            tensor<int32, [3]> concat_20 = const()[name = string("concat_20"), val = tensor<int32, [3]>([1, 512, 15616])];
+            tensor<fp16, [1, 512, 15616]> reshape_5_cast_fp16 = reshape(shape = concat_20, x = matmul_1_cast_fp16)[name = string("reshape_5_cast_fp16")];
             bool var_538_interleave_0 = const()[name = string("op_538_interleave_0"), val = bool(false)];
             tensor<fp16, [1, 512, 32000]> logits = concat(axis = var_488, interleave = var_538_interleave_0, values = (reshape_2_cast_fp16, reshape_5_cast_fp16))[name = string("op_538_cast_fp16")];
         } -> (logits, new_k_cache_0, new_k_cache_1, new_v_cache_0, new_v_cache_1);
diff --git a/sequoia/Llama-2-7b-hf_chunk2.mlmodelc/analytics/coremldata.bin b/sequoia/Llama-2-7b-hf_chunk2.mlmodelc/analytics/coremldata.bin
index e3704b470dad34135a6b5cb4b471021bad5c3ae2..65ae082f31459bf8b09913d8861a15601cc13ad1 100644
--- a/sequoia/Llama-2-7b-hf_chunk2.mlmodelc/analytics/coremldata.bin
+++ b/sequoia/Llama-2-7b-hf_chunk2.mlmodelc/analytics/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:84e317a82cdf4e96f808f63e77f10098844d47ad522545181edfac4d287c9c92
+oid sha256:e69e7ad37dd59e97348d395eec9b4c41b7d3ea44d86f613751ae47803a0a2efe
 size 243
diff --git a/sequoia/Llama-2-7b-hf_chunk2.mlmodelc/coremldata.bin b/sequoia/Llama-2-7b-hf_chunk2.mlmodelc/coremldata.bin
index 8bbc6dd38c74fe23594355e106f197518d41765b..a503721fa97147a06f91e834353053693378b0ad 100644
--- a/sequoia/Llama-2-7b-hf_chunk2.mlmodelc/coremldata.bin
+++ b/sequoia/Llama-2-7b-hf_chunk2.mlmodelc/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e430d0795ff5c384187174f5718a2c13d0070f5d6a811831e18862497865a86d
+oid sha256:d35a0353bcfa501579e07af3718261af3b129b4bec004c1fe6d812a6403a3f5b
 size 1037
diff --git a/sequoia/Llama-2-7b-hf_chunk2.mlmodelc/metadata.json b/sequoia/Llama-2-7b-hf_chunk2.mlmodelc/metadata.json
index b176ade4ff3b6ebd4932d3b2a360613286106d22..108f63f4b3ba800f21f9424d869622a87ab30d2e 100644
--- a/sequoia/Llama-2-7b-hf_chunk2.mlmodelc/metadata.json
+++ b/sequoia/Llama-2-7b-hf_chunk2.mlmodelc/metadata.json
@@ -17,9 +17,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 128, 508]",
         "name" : "new_k_cache_0",
         "type" : "MultiArray"
       },
@@ -27,9 +27,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 128, 508]",
         "name" : "new_k_cache_1",
         "type" : "MultiArray"
       },
@@ -37,9 +37,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 128, 508]",
         "name" : "new_k_cache_2",
         "type" : "MultiArray"
       },
@@ -47,9 +47,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 508, 128]",
         "name" : "new_v_cache_0",
         "type" : "MultiArray"
       },
@@ -57,9 +57,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 508, 128]",
         "name" : "new_v_cache_1",
         "type" : "MultiArray"
       },
@@ -67,9 +67,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 508, 128]",
         "name" : "new_v_cache_2",
         "type" : "MultiArray"
       }
@@ -142,9 +142,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_0",
             "type" : "MultiArray"
           },
@@ -152,9 +152,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_1",
             "type" : "MultiArray"
           },
@@ -162,9 +162,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_2",
             "type" : "MultiArray"
           },
@@ -172,9 +172,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_0",
             "type" : "MultiArray"
           },
@@ -182,9 +182,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_1",
             "type" : "MultiArray"
           },
@@ -192,9 +192,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_2",
             "type" : "MultiArray"
           }
@@ -203,14 +203,15 @@
         "mlProgramOperationTypeHistogram" : {
           "Ios18.constexprLutToDense" : 21,
           "Ios18.conv" : 21,
-          "Ios18.matmul" : 6,
           "Ios18.expandDims" : 6,
-          "Ios18.concat" : 18,
+          "Ios18.matmul" : 6,
+          "Ios18.concat" : 12,
           "Ios18.add" : 15,
           "Ios18.realDiv" : 6,
           "Ios18.silu" : 3,
           "Ios18.softmax" : 3,
           "Ios18.sliceByIndex" : 18,
+          "Ios18.transpose" : 6,
           "Ios16.reduceL2Norm" : 6,
           "Ios18.squeeze" : 6,
           "Ios18.reshape" : 12,
@@ -223,9 +224,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 1)",
+            "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 4)",
             "shortDescription" : "",
-            "shape" : "[1, 4096, 1, 1]",
+            "shape" : "[1, 4096, 1, 4]",
             "name" : "x",
             "type" : "MultiArray"
           },
@@ -233,9 +234,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 128 × 1)",
+            "formattedType" : "MultiArray (Float16 128 × 4)",
             "shortDescription" : "",
-            "shape" : "[128, 1]",
+            "shape" : "[128, 4]",
             "name" : "cos",
             "type" : "MultiArray"
           },
@@ -243,9 +244,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 128 × 1)",
+            "formattedType" : "MultiArray (Float16 128 × 4)",
             "shortDescription" : "",
-            "shape" : "[128, 1]",
+            "shape" : "[128, 4]",
             "name" : "sin",
             "type" : "MultiArray"
           },
@@ -253,9 +254,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 1 × 1 × 512)",
+            "formattedType" : "MultiArray (Float16 1 × 1 × 4 × 512)",
             "shortDescription" : "",
-            "shape" : "[1, 1, 1, 512]",
+            "shape" : "[1, 1, 4, 512]",
             "name" : "mask",
             "type" : "MultiArray"
           },
@@ -263,9 +264,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "k_cache_0",
             "type" : "MultiArray"
           },
@@ -273,9 +274,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "v_cache_0",
             "type" : "MultiArray"
           },
@@ -283,9 +284,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "k_cache_1",
             "type" : "MultiArray"
           },
@@ -293,9 +294,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "v_cache_1",
             "type" : "MultiArray"
           },
@@ -303,9 +304,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "k_cache_2",
             "type" : "MultiArray"
           },
@@ -313,9 +314,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "v_cache_2",
             "type" : "MultiArray"
           }
@@ -330,9 +331,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 1)",
+            "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 4)",
             "shortDescription" : "",
-            "shape" : "[1, 4096, 1, 1]",
+            "shape" : "[1, 4096, 1, 4]",
             "name" : "new_x",
             "type" : "MultiArray"
           },
@@ -340,9 +341,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_0",
             "type" : "MultiArray"
           },
@@ -350,9 +351,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_1",
             "type" : "MultiArray"
           },
@@ -360,9 +361,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_2",
             "type" : "MultiArray"
           },
@@ -370,9 +371,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_0",
             "type" : "MultiArray"
           },
@@ -380,9 +381,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_1",
             "type" : "MultiArray"
           },
@@ -390,9 +391,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_2",
             "type" : "MultiArray"
           }
@@ -401,14 +402,15 @@
         "mlProgramOperationTypeHistogram" : {
           "Ios18.constexprLutToDense" : 21,
           "Ios18.conv" : 21,
-          "Ios18.matmul" : 6,
           "Ios18.expandDims" : 6,
+          "Ios18.matmul" : 6,
           "Ios18.concat" : 18,
           "Ios18.add" : 15,
           "Ios18.realDiv" : 6,
           "Ios18.silu" : 3,
           "Ios18.softmax" : 3,
           "Ios18.sliceByIndex" : 18,
+          "Ios18.transpose" : 6,
           "Ios16.reduceL2Norm" : 6,
           "Ios18.squeeze" : 6,
           "Ios18.reshape" : 12,
@@ -419,14 +421,15 @@
     "mlProgramOperationTypeHistogram" : {
       "Ios18.constexprLutToDense" : 21,
       "Ios18.conv" : 21,
-      "Ios18.matmul" : 6,
       "Ios18.expandDims" : 6,
-      "Ios18.concat" : 18,
+      "Ios18.matmul" : 6,
+      "Ios18.concat" : 12,
       "Ios18.add" : 15,
       "Ios18.realDiv" : 6,
       "Ios18.silu" : 3,
       "Ios18.softmax" : 3,
       "Ios18.sliceByIndex" : 18,
+      "Ios18.transpose" : 6,
       "Ios16.reduceL2Norm" : 6,
       "Ios18.squeeze" : 6,
       "Ios18.reshape" : 12,
@@ -491,7 +494,7 @@
       }
     ],
     "defaultFunctionName" : "input_512_context_512",
-    "generatedClassName" : "Llama_2_7b_hf_2024_07_02_20_36_17_merged_chunk2",
+    "generatedClassName" : "Llama_2_7b_hf_2024_07_17_19_34_17_merged_chunk2",
     "userDefinedMetadata" : {
 
     },
diff --git a/sequoia/Llama-2-7b-hf_chunk2.mlmodelc/model.mil b/sequoia/Llama-2-7b-hf_chunk2.mlmodelc/model.mil
index 5b7b1db6b14fe10ff51aaa601d8484d9ff49576b..85f75a6da9054155e32013d96460963c79fba3f8 100644
--- a/sequoia/Llama-2-7b-hf_chunk2.mlmodelc/model.mil
+++ b/sequoia/Llama-2-7b-hf_chunk2.mlmodelc/model.mil
@@ -1,7 +1,7 @@
 program(1.3)
-[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3400.34.1"}, {"coremlc-version", "3400.42.1"}})]
+[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3400.42.1"}, {"coremlc-version", "3400.51.1"}})]
 {
-    func input_1_context_512<ios18>(tensor<fp16, [128, 1]> cos, tensor<fp16, [1, 32, 128, 511]> k_cache_0, tensor<fp16, [1, 32, 128, 511]> k_cache_1, tensor<fp16, [1, 32, 128, 511]> k_cache_2, tensor<fp16, [1, 1, 1, 512]> mask, tensor<fp16, [128, 1]> sin, tensor<fp16, [1, 32, 128, 511]> v_cache_0, tensor<fp16, [1, 32, 128, 511]> v_cache_1, tensor<fp16, [1, 32, 128, 511]> v_cache_2, tensor<fp16, [1, 4096, 1, 1]> x) [CoreML_InputDefaultValues = dict<string, fp32>({{"k_cache_0", 0}, {"k_cache_1", 0}, {"k_cache_2", 0}, {"v_cache_0", 0}, {"v_cache_1", 0}, {"v_cache_2", 0}})] {
+    func input_1_context_512<ios18>(tensor<fp16, [128, 4]> cos, tensor<fp16, [1, 32, 128, 508]> k_cache_0, tensor<fp16, [1, 32, 128, 508]> k_cache_1, tensor<fp16, [1, 32, 128, 508]> k_cache_2, tensor<fp16, [1, 1, 4, 512]> mask, tensor<fp16, [128, 4]> sin, tensor<fp16, [1, 32, 508, 128]> v_cache_0, tensor<fp16, [1, 32, 508, 128]> v_cache_1, tensor<fp16, [1, 32, 508, 128]> v_cache_2, tensor<fp16, [1, 4096, 1, 4]> x) [CoreML_InputDefaultValues = dict<string, fp32>({{"k_cache_0", 0}, {"k_cache_1", 0}, {"k_cache_2", 0}, {"v_cache_0", 0}, {"v_cache_1", 0}, {"v_cache_2", 0}})] {
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_q_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303873792))))[name = string("blocks_0_attn_q_proj_weight_palettized_cast_fp16")];
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_k_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8388864))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303873920))))[name = string("blocks_0_attn_k_proj_weight_palettized_cast_fp16")];
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_v_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(16777664))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303874048))))[name = string("blocks_0_attn_v_proj_weight_palettized_cast_fp16")];
@@ -29,438 +29,450 @@ program(1.3)
             int32 var_34 = const()[name = string("op_34"), val = int32(-2)];
             bool var_35 = const()[name = string("op_35"), val = bool(true)];
             tensor<int32, [1]> var_53_axes_0 = const()[name = string("op_53_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_53_cast_fp16 = squeeze(axes = var_53_axes_0, x = x)[name = string("op_53_cast_fp16")];
+            tensor<fp16, [1, 4096, 4]> var_53_cast_fp16 = squeeze(axes = var_53_axes_0, x = x)[name = string("op_53_cast_fp16")];
             bool var_55_interleave_0 = const()[name = string("op_55_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_1_to_fp16 = const()[name = string("eps_chan_1_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_55_cast_fp16 = concat(axis = var_31, interleave = var_55_interleave_0, values = (var_53_cast_fp16, eps_chan_1_to_fp16))[name = string("op_55_cast_fp16")];
+            tensor<fp16, [1, 1, 4]> eps_chan_1_to_fp16 = const()[name = string("eps_chan_1_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_55_cast_fp16 = concat(axis = var_31, interleave = var_55_interleave_0, values = (var_53_cast_fp16, eps_chan_1_to_fp16))[name = string("op_55_cast_fp16")];
             tensor<int32, [1]> x_eps_1_axes_0 = const()[name = string("x_eps_1_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_1_cast_fp16 = expand_dims(axes = x_eps_1_axes_0, x = var_55_cast_fp16)[name = string("x_eps_1_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_1_cast_fp16 = expand_dims(axes = x_eps_1_axes_0, x = var_55_cast_fp16)[name = string("x_eps_1_cast_fp16")];
             tensor<int32, [1]> norm_x_1_axes_0 = const()[name = string("norm_x_1_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_1_cast_fp16 = reduce_l2_norm(axes = norm_x_1_axes_0, keep_dims = var_35, x = x_eps_1_cast_fp16)[name = string("norm_x_1_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_1_cast_fp16 = real_div(x = x, y = norm_x_1_cast_fp16)[name = string("x_normed_1_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_1_cast_fp16 = reduce_l2_norm(axes = norm_x_1_axes_0, keep_dims = var_35, x = x_eps_1_cast_fp16)[name = string("norm_x_1_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_1_cast_fp16 = real_div(x = x, y = norm_x_1_cast_fp16)[name = string("x_normed_1_cast_fp16")];
             fp16 var_60_to_fp16 = const()[name = string("op_60_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_60_to_fp16)[name = string("x_normed_3_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_60_to_fp16)[name = string("x_normed_3_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_1_weight_to_fp16 = const()[name = string("blocks_0_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303567936)))];
-            tensor<fp16, [1, 4096, 1, 1]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = string("x_5_cast_fp16")];
-            tensor<int32, [2]> var_72 = const()[name = string("op_72"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_74 = const()[name = string("op_74"), val = tensor<int32, [2]>([1, 1])];
-            string var_76_pad_type_0 = const()[name = string("op_76_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_76_pad_0 = const()[name = string("op_76_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_76_cast_fp16 = conv(dilations = var_74, groups = var_31, pad = var_76_pad_0, pad_type = var_76_pad_type_0, strides = var_72, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_76_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = string("x_5_cast_fp16")];
+            tensor<int32, [2]> var_73 = const()[name = string("op_73"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_75 = const()[name = string("op_75"), val = tensor<int32, [2]>([1, 1])];
+            string var_77_pad_type_0 = const()[name = string("op_77_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_77_pad_0 = const()[name = string("op_77_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_77_cast_fp16 = conv(dilations = var_75, groups = var_31, pad = var_77_pad_0, pad_type = var_77_pad_type_0, strides = var_73, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_77_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303576192)))];
-            tensor<fp16, [1, 4096, 1, 1]> q_1_cast_fp16 = mul(x = var_76_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
-            tensor<int32, [2]> var_80 = const()[name = string("op_80"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_82 = const()[name = string("op_82"), val = tensor<int32, [2]>([1, 1])];
-            string var_84_pad_type_0 = const()[name = string("op_84_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_84_pad_0 = const()[name = string("op_84_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_84_cast_fp16 = conv(dilations = var_82, groups = var_31, pad = var_84_pad_0, pad_type = var_84_pad_type_0, strides = var_80, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_84_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> q_1_cast_fp16 = mul(x = var_77_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
+            tensor<int32, [2]> var_81 = const()[name = string("op_81"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_83 = const()[name = string("op_83"), val = tensor<int32, [2]>([1, 1])];
+            string var_85_pad_type_0 = const()[name = string("op_85_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_85_pad_0 = const()[name = string("op_85_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_85_cast_fp16 = conv(dilations = var_83, groups = var_31, pad = var_85_pad_0, pad_type = var_85_pad_type_0, strides = var_81, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_85_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303584448)))];
-            tensor<fp16, [1, 4096, 1, 1]> k_1_cast_fp16 = mul(x = var_84_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
-            tensor<int32, [2]> var_88 = const()[name = string("op_88"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_90 = const()[name = string("op_90"), val = tensor<int32, [2]>([1, 1])];
-            string var_92_pad_type_0 = const()[name = string("op_92_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_92_pad_0 = const()[name = string("op_92_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_92_cast_fp16 = conv(dilations = var_90, groups = var_31, pad = var_92_pad_0, pad_type = var_92_pad_type_0, strides = var_88, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_92_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> k_1_cast_fp16 = mul(x = var_85_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
+            tensor<int32, [2]> var_89 = const()[name = string("op_89"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_91 = const()[name = string("op_91"), val = tensor<int32, [2]>([1, 1])];
+            string var_93_pad_type_0 = const()[name = string("op_93_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_93_pad_0 = const()[name = string("op_93_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_93_cast_fp16 = conv(dilations = var_91, groups = var_31, pad = var_93_pad_0, pad_type = var_93_pad_type_0, strides = var_89, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_93_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303592704)))];
-            tensor<fp16, [1, 4096, 1, 1]> v_1_cast_fp16 = mul(x = var_92_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
-            tensor<int32, [4]> var_94 = const()[name = string("op_94"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> q_3_cast_fp16 = reshape(shape = var_94, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
-            tensor<int32, [4]> var_96 = const()[name = string("op_96"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> k_3_cast_fp16 = reshape(shape = var_96, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
-            tensor<int32, [4]> var_98 = const()[name = string("op_98"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> v_3_cast_fp16 = reshape(shape = var_98, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
-            tensor<int32, [4]> var_110_begin_0 = const()[name = string("op_110_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_110_end_0 = const()[name = string("op_110_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_110_end_mask_0 = const()[name = string("op_110_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_110_cast_fp16 = slice_by_index(begin = var_110_begin_0, end = var_110_end_0, end_mask = var_110_end_mask_0, x = q_3_cast_fp16)[name = string("op_110_cast_fp16")];
-            tensor<int32, [4]> var_116_begin_0 = const()[name = string("op_116_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_116_end_0 = const()[name = string("op_116_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_116_end_mask_0 = const()[name = string("op_116_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_116_cast_fp16 = slice_by_index(begin = var_116_begin_0, end = var_116_end_0, end_mask = var_116_end_mask_0, x = q_3_cast_fp16)[name = string("op_116_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> v_1_cast_fp16 = mul(x = var_93_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
+            tensor<int32, [4]> var_95 = const()[name = string("op_95"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> q_3_cast_fp16 = reshape(shape = var_95, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
+            tensor<int32, [4]> var_97 = const()[name = string("op_97"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> k_3_cast_fp16 = reshape(shape = var_97, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
+            tensor<int32, [4]> var_99 = const()[name = string("op_99"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> v_3_cast_fp16 = reshape(shape = var_99, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
+            tensor<int32, [4]> var_111_begin_0 = const()[name = string("op_111_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_111_end_0 = const()[name = string("op_111_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_111_end_mask_0 = const()[name = string("op_111_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_111_cast_fp16 = slice_by_index(begin = var_111_begin_0, end = var_111_end_0, end_mask = var_111_end_mask_0, x = q_3_cast_fp16)[name = string("op_111_cast_fp16")];
+            tensor<int32, [4]> var_117_begin_0 = const()[name = string("op_117_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_117_end_0 = const()[name = string("op_117_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_117_end_mask_0 = const()[name = string("op_117_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_117_cast_fp16 = slice_by_index(begin = var_117_begin_0, end = var_117_end_0, end_mask = var_117_end_mask_0, x = q_3_cast_fp16)[name = string("op_117_cast_fp16")];
             fp16 const_6_promoted_to_fp16 = const()[name = string("const_6_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_118_cast_fp16 = mul(x = var_116_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_118_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_119_cast_fp16 = mul(x = var_117_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_119_cast_fp16")];
             bool rotated_1_interleave_0 = const()[name = string("rotated_1_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_1_cast_fp16 = concat(axis = var_34, interleave = rotated_1_interleave_0, values = (var_118_cast_fp16, var_110_cast_fp16))[name = string("rotated_1_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_121_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_121_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_122_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_122_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_1_cast_fp16 = add(x = var_121_cast_fp16, y = var_122_cast_fp16)[name = string("roped_1_cast_fp16")];
-            tensor<int32, [4]> var_135_begin_0 = const()[name = string("op_135_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_135_end_0 = const()[name = string("op_135_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_135_end_mask_0 = const()[name = string("op_135_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_135_cast_fp16 = slice_by_index(begin = var_135_begin_0, end = var_135_end_0, end_mask = var_135_end_mask_0, x = k_3_cast_fp16)[name = string("op_135_cast_fp16")];
-            tensor<int32, [4]> var_141_begin_0 = const()[name = string("op_141_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_141_end_0 = const()[name = string("op_141_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_141_end_mask_0 = const()[name = string("op_141_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_141_cast_fp16 = slice_by_index(begin = var_141_begin_0, end = var_141_end_0, end_mask = var_141_end_mask_0, x = k_3_cast_fp16)[name = string("op_141_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_1_cast_fp16 = concat(axis = var_34, interleave = rotated_1_interleave_0, values = (var_119_cast_fp16, var_111_cast_fp16))[name = string("rotated_1_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_122_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_122_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_123_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_123_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_1_cast_fp16 = add(x = var_122_cast_fp16, y = var_123_cast_fp16)[name = string("roped_1_cast_fp16")];
+            tensor<int32, [4]> var_136_begin_0 = const()[name = string("op_136_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_136_end_0 = const()[name = string("op_136_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_136_end_mask_0 = const()[name = string("op_136_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_136_cast_fp16 = slice_by_index(begin = var_136_begin_0, end = var_136_end_0, end_mask = var_136_end_mask_0, x = k_3_cast_fp16)[name = string("op_136_cast_fp16")];
+            tensor<int32, [4]> var_142_begin_0 = const()[name = string("op_142_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_142_end_0 = const()[name = string("op_142_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_142_end_mask_0 = const()[name = string("op_142_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_142_cast_fp16 = slice_by_index(begin = var_142_begin_0, end = var_142_end_0, end_mask = var_142_end_mask_0, x = k_3_cast_fp16)[name = string("op_142_cast_fp16")];
             fp16 const_8_promoted_to_fp16 = const()[name = string("const_8_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_143_cast_fp16 = mul(x = var_141_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_143_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_144_cast_fp16 = mul(x = var_142_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_144_cast_fp16")];
             bool rotated_3_interleave_0 = const()[name = string("rotated_3_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_3_cast_fp16 = concat(axis = var_34, interleave = rotated_3_interleave_0, values = (var_143_cast_fp16, var_135_cast_fp16))[name = string("rotated_3_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_146_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_146_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_147_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_147_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_3_cast_fp16 = add(x = var_146_cast_fp16, y = var_147_cast_fp16)[name = string("roped_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_3_cast_fp16 = concat(axis = var_34, interleave = rotated_3_interleave_0, values = (var_144_cast_fp16, var_136_cast_fp16))[name = string("rotated_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_147_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_147_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_148_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_148_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_3_cast_fp16 = add(x = var_147_cast_fp16, y = var_148_cast_fp16)[name = string("roped_3_cast_fp16")];
+            tensor<int32, [4]> v_5_perm_0 = const()[name = string("v_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             bool k_7_interleave_0 = const()[name = string("k_7_interleave_0"), val = bool(false)];
             tensor<fp16, [1, 32, 128, 512]> k_7_cast_fp16 = concat(axis = var_22, interleave = k_7_interleave_0, values = (k_cache_0, roped_3_cast_fp16))[name = string("k_7_cast_fp16")];
-            bool v_5_interleave_0 = const()[name = string("v_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> v_5_cast_fp16 = concat(axis = var_22, interleave = v_5_interleave_0, values = (v_cache_0, v_3_cast_fp16))[name = string("v_5_cast_fp16")];
-            tensor<int32, [4]> var_154_begin_0 = const()[name = string("op_154_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_154_end_0 = const()[name = string("op_154_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_154_end_mask_0 = const()[name = string("op_154_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_0 = slice_by_index(begin = var_154_begin_0, end = var_154_end_0, end_mask = var_154_end_mask_0, x = k_7_cast_fp16)[name = string("op_154_cast_fp16")];
-            tensor<int32, [4]> var_155_begin_0 = const()[name = string("op_155_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_155_end_0 = const()[name = string("op_155_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_155_end_mask_0 = const()[name = string("op_155_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_0 = slice_by_index(begin = var_155_begin_0, end = var_155_end_0, end_mask = var_155_end_mask_0, x = v_5_cast_fp16)[name = string("op_155_cast_fp16")];
-            fp16 var_159_to_fp16 = const()[name = string("op_159_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 1]> var_160_cast_fp16 = mul(x = roped_1_cast_fp16, y = var_159_to_fp16)[name = string("op_160_cast_fp16")];
+            bool v_7_interleave_0 = const()[name = string("v_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> v_5_cast_fp16 = transpose(perm = v_5_perm_0, x = v_3_cast_fp16)[name = string("transpose_5")];
+            tensor<fp16, [1, 32, 512, 128]> v_7_cast_fp16 = concat(axis = var_34, interleave = v_7_interleave_0, values = (v_cache_0, v_5_cast_fp16))[name = string("v_7_cast_fp16")];
+            tensor<int32, [4]> var_159_begin_0 = const()[name = string("op_159_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_159_end_0 = const()[name = string("op_159_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_159_end_mask_0 = const()[name = string("op_159_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_0 = slice_by_index(begin = var_159_begin_0, end = var_159_end_0, end_mask = var_159_end_mask_0, x = k_7_cast_fp16)[name = string("op_159_cast_fp16")];
+            tensor<int32, [4]> var_160_begin_0 = const()[name = string("op_160_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_160_end_0 = const()[name = string("op_160_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_160_end_mask_0 = const()[name = string("op_160_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_0 = slice_by_index(begin = var_160_begin_0, end = var_160_end_0, end_mask = var_160_end_mask_0, x = v_7_cast_fp16)[name = string("op_160_cast_fp16")];
+            fp16 var_165_to_fp16 = const()[name = string("op_165_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 32, 128, 4]> var_166_cast_fp16 = mul(x = roped_1_cast_fp16, y = var_165_to_fp16)[name = string("op_166_cast_fp16")];
             bool attn_weights_1_transpose_x_0 = const()[name = string("attn_weights_1_transpose_x_0"), val = bool(true)];
             bool attn_weights_1_transpose_y_0 = const()[name = string("attn_weights_1_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_160_cast_fp16, y = k_7_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = mask)[name = string("attn_weights_3_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> var_168_cast_fp16 = softmax(axis = var_30, x = attn_weights_3_cast_fp16)[name = string("op_168_cast_fp16")];
-            bool attn_1_transpose_x_0 = const()[name = string("attn_1_transpose_x_0"), val = bool(false)];
-            bool attn_1_transpose_y_0 = const()[name = string("attn_1_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 1]> attn_1_cast_fp16 = matmul(transpose_x = attn_1_transpose_x_0, transpose_y = attn_1_transpose_y_0, x = v_5_cast_fp16, y = var_168_cast_fp16)[name = string("attn_1_cast_fp16")];
-            tensor<int32, [4]> var_172 = const()[name = string("op_172"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
-            tensor<fp16, [1, 4096, 1, 1]> input_1_cast_fp16 = reshape(shape = var_172, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
-            tensor<int32, [2]> var_176 = const()[name = string("op_176"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_178 = const()[name = string("op_178"), val = tensor<int32, [2]>([1, 1])];
-            string var_180_pad_type_0 = const()[name = string("op_180_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_180_pad_0 = const()[name = string("op_180_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_180_cast_fp16 = conv(dilations = var_178, groups = var_31, pad = var_180_pad_0, pad_type = var_180_pad_type_0, strides = var_176, weight = blocks_0_attn_proj_weight_palettized_cast_fp16, x = input_1_cast_fp16)[name = string("op_180_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_166_cast_fp16, y = k_7_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = mask)[name = string("attn_weights_3_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_5_cast_fp16 = softmax(axis = var_30, x = attn_weights_3_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
+            bool var_175_transpose_x_0 = const()[name = string("op_175_transpose_x_0"), val = bool(false)];
+            bool var_175_transpose_y_0 = const()[name = string("op_175_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> var_175_cast_fp16 = matmul(transpose_x = var_175_transpose_x_0, transpose_y = var_175_transpose_y_0, x = attn_weights_5_cast_fp16, y = v_7_cast_fp16)[name = string("op_175_cast_fp16")];
+            tensor<int32, [4]> attn_1_perm_0 = const()[name = string("attn_1_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_178 = const()[name = string("op_178"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 4]> attn_1_cast_fp16 = transpose(perm = attn_1_perm_0, x = var_175_cast_fp16)[name = string("transpose_4")];
+            tensor<fp16, [1, 4096, 1, 4]> input_1_cast_fp16 = reshape(shape = var_178, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
+            tensor<int32, [2]> var_182 = const()[name = string("op_182"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_184 = const()[name = string("op_184"), val = tensor<int32, [2]>([1, 1])];
+            string var_186_pad_type_0 = const()[name = string("op_186_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_186_pad_0 = const()[name = string("op_186_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_186_cast_fp16 = conv(dilations = var_184, groups = var_31, pad = var_186_pad_0, pad_type = var_186_pad_type_0, strides = var_182, weight = blocks_0_attn_proj_weight_palettized_cast_fp16, x = input_1_cast_fp16)[name = string("op_186_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303600960)))];
-            tensor<fp16, [1, 4096, 1, 1]> attention_output_1_cast_fp16 = mul(x = var_180_cast_fp16, y = blocks_0_attn_proj_output_scales_to_fp16)[name = string("attention_output_1_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_11_cast_fp16 = add(x = attention_output_1_cast_fp16, y = x)[name = string("x_11_cast_fp16")];
-            tensor<int32, [1]> var_199_axes_0 = const()[name = string("op_199_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_199_cast_fp16 = squeeze(axes = var_199_axes_0, x = x_11_cast_fp16)[name = string("op_199_cast_fp16")];
-            bool var_201_interleave_0 = const()[name = string("op_201_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_3_to_fp16 = const()[name = string("eps_chan_3_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_201_cast_fp16 = concat(axis = var_31, interleave = var_201_interleave_0, values = (var_199_cast_fp16, eps_chan_3_to_fp16))[name = string("op_201_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> attention_output_1_cast_fp16 = mul(x = var_186_cast_fp16, y = blocks_0_attn_proj_output_scales_to_fp16)[name = string("attention_output_1_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_11_cast_fp16 = add(x = attention_output_1_cast_fp16, y = x)[name = string("x_11_cast_fp16")];
+            tensor<int32, [1]> var_205_axes_0 = const()[name = string("op_205_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_205_cast_fp16 = squeeze(axes = var_205_axes_0, x = x_11_cast_fp16)[name = string("op_205_cast_fp16")];
+            bool var_207_interleave_0 = const()[name = string("op_207_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_3_to_fp16 = const()[name = string("eps_chan_3_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_207_cast_fp16 = concat(axis = var_31, interleave = var_207_interleave_0, values = (var_205_cast_fp16, eps_chan_3_to_fp16))[name = string("op_207_cast_fp16")];
             tensor<int32, [1]> x_eps_3_axes_0 = const()[name = string("x_eps_3_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_3_cast_fp16 = expand_dims(axes = x_eps_3_axes_0, x = var_201_cast_fp16)[name = string("x_eps_3_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_3_cast_fp16 = expand_dims(axes = x_eps_3_axes_0, x = var_207_cast_fp16)[name = string("x_eps_3_cast_fp16")];
             tensor<int32, [1]> norm_x_3_axes_0 = const()[name = string("norm_x_3_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_3_cast_fp16 = reduce_l2_norm(axes = norm_x_3_axes_0, keep_dims = var_35, x = x_eps_3_cast_fp16)[name = string("norm_x_3_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_7_cast_fp16 = real_div(x = x_11_cast_fp16, y = norm_x_3_cast_fp16)[name = string("x_normed_7_cast_fp16")];
-            fp16 var_206_to_fp16 = const()[name = string("op_206_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_9_cast_fp16 = mul(x = x_normed_7_cast_fp16, y = var_206_to_fp16)[name = string("x_normed_9_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_3_cast_fp16 = reduce_l2_norm(axes = norm_x_3_axes_0, keep_dims = var_35, x = x_eps_3_cast_fp16)[name = string("norm_x_3_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_7_cast_fp16 = real_div(x = x_11_cast_fp16, y = norm_x_3_cast_fp16)[name = string("x_normed_7_cast_fp16")];
+            fp16 var_212_to_fp16 = const()[name = string("op_212_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_9_cast_fp16 = mul(x = x_normed_7_cast_fp16, y = var_212_to_fp16)[name = string("x_normed_9_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_2_weight_to_fp16 = const()[name = string("blocks_0_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303609216)))];
-            tensor<fp16, [1, 4096, 1, 1]> input_3_cast_fp16 = mul(x = x_normed_9_cast_fp16, y = blocks_0_norm_2_weight_to_fp16)[name = string("input_3_cast_fp16")];
-            tensor<int32, [2]> var_218 = const()[name = string("op_218"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_220 = const()[name = string("op_220"), val = tensor<int32, [2]>([1, 1])];
-            string var_222_pad_type_0 = const()[name = string("op_222_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_222_pad_0 = const()[name = string("op_222_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_222_cast_fp16 = conv(dilations = var_220, groups = var_31, pad = var_222_pad_0, pad_type = var_222_pad_type_0, strides = var_218, weight = blocks_0_mlp_fc_1_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_222_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303617472)))];
-            tensor<fp16, [1, 11008, 1, 1]> input_5_cast_fp16 = mul(x = var_222_cast_fp16, y = blocks_0_mlp_fc_1_output_scales_to_fp16)[name = string("input_5_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> input_3_cast_fp16 = mul(x = x_normed_9_cast_fp16, y = blocks_0_norm_2_weight_to_fp16)[name = string("input_3_cast_fp16")];
+            tensor<int32, [2]> var_224 = const()[name = string("op_224"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_226 = const()[name = string("op_226"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_228 = const()[name = string("op_228"), val = tensor<int32, [2]>([1, 1])];
-            string var_230_pad_type_0 = const()[name = string("op_230_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_230_pad_0 = const()[name = string("op_230_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_230_cast_fp16 = conv(dilations = var_228, groups = var_31, pad = var_230_pad_0, pad_type = var_230_pad_type_0, strides = var_226, weight = blocks_0_mlp_fc_2_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_230_cast_fp16")];
+            string var_228_pad_type_0 = const()[name = string("op_228_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_228_pad_0 = const()[name = string("op_228_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_228_cast_fp16 = conv(dilations = var_226, groups = var_31, pad = var_228_pad_0, pad_type = var_228_pad_type_0, strides = var_224, weight = blocks_0_mlp_fc_1_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_228_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303617472)))];
+            tensor<fp16, [1, 11008, 1, 4]> input_5_cast_fp16 = mul(x = var_228_cast_fp16, y = blocks_0_mlp_fc_1_output_scales_to_fp16)[name = string("input_5_cast_fp16")];
+            tensor<int32, [2]> var_232 = const()[name = string("op_232"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_234 = const()[name = string("op_234"), val = tensor<int32, [2]>([1, 1])];
+            string var_236_pad_type_0 = const()[name = string("op_236_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_236_pad_0 = const()[name = string("op_236_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_236_cast_fp16 = conv(dilations = var_234, groups = var_31, pad = var_236_pad_0, pad_type = var_236_pad_type_0, strides = var_232, weight = blocks_0_mlp_fc_2_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_236_cast_fp16")];
             tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303639552)))];
-            tensor<fp16, [1, 11008, 1, 1]> x_fc_2_1_cast_fp16 = mul(x = var_230_cast_fp16, y = blocks_0_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_1_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> var_232_cast_fp16 = silu(x = input_5_cast_fp16)[name = string("op_232_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> input_7_cast_fp16 = mul(x = var_232_cast_fp16, y = x_fc_2_1_cast_fp16)[name = string("input_7_cast_fp16")];
-            tensor<int32, [2]> var_236 = const()[name = string("op_236"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_238 = const()[name = string("op_238"), val = tensor<int32, [2]>([1, 1])];
-            string var_240_pad_type_0 = const()[name = string("op_240_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_240_pad_0 = const()[name = string("op_240_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_240_cast_fp16 = conv(dilations = var_238, groups = var_31, pad = var_240_pad_0, pad_type = var_240_pad_type_0, strides = var_236, weight = blocks_0_mlp_proj_weight_palettized_cast_fp16, x = input_7_cast_fp16)[name = string("op_240_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> x_fc_2_1_cast_fp16 = mul(x = var_236_cast_fp16, y = blocks_0_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_1_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> var_238_cast_fp16 = silu(x = input_5_cast_fp16)[name = string("op_238_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> input_7_cast_fp16 = mul(x = var_238_cast_fp16, y = x_fc_2_1_cast_fp16)[name = string("input_7_cast_fp16")];
+            tensor<int32, [2]> var_242 = const()[name = string("op_242"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_244 = const()[name = string("op_244"), val = tensor<int32, [2]>([1, 1])];
+            string var_246_pad_type_0 = const()[name = string("op_246_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_246_pad_0 = const()[name = string("op_246_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_246_cast_fp16 = conv(dilations = var_244, groups = var_31, pad = var_246_pad_0, pad_type = var_246_pad_type_0, strides = var_242, weight = blocks_0_mlp_proj_weight_palettized_cast_fp16, x = input_7_cast_fp16)[name = string("op_246_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303661632)))];
-            tensor<fp16, [1, 4096, 1, 1]> var_241_cast_fp16 = mul(x = var_240_cast_fp16, y = blocks_0_mlp_proj_output_scales_to_fp16)[name = string("op_241_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_15_cast_fp16 = add(x = var_241_cast_fp16, y = x_11_cast_fp16)[name = string("x_15_cast_fp16")];
-            int32 var_252 = const()[name = string("op_252"), val = int32(-1)];
-            int32 var_260 = const()[name = string("op_260"), val = int32(3)];
-            int32 var_261 = const()[name = string("op_261"), val = int32(1)];
-            int32 var_264 = const()[name = string("op_264"), val = int32(-2)];
-            bool var_265 = const()[name = string("op_265"), val = bool(true)];
-            tensor<int32, [1]> var_282_axes_0 = const()[name = string("op_282_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_282_cast_fp16 = squeeze(axes = var_282_axes_0, x = x_15_cast_fp16)[name = string("op_282_cast_fp16")];
-            bool var_284_interleave_0 = const()[name = string("op_284_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_5_to_fp16 = const()[name = string("eps_chan_5_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_284_cast_fp16 = concat(axis = var_261, interleave = var_284_interleave_0, values = (var_282_cast_fp16, eps_chan_5_to_fp16))[name = string("op_284_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> var_247_cast_fp16 = mul(x = var_246_cast_fp16, y = blocks_0_mlp_proj_output_scales_to_fp16)[name = string("op_247_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_15_cast_fp16 = add(x = var_247_cast_fp16, y = x_11_cast_fp16)[name = string("x_15_cast_fp16")];
+            int32 var_258 = const()[name = string("op_258"), val = int32(-1)];
+            int32 var_266 = const()[name = string("op_266"), val = int32(3)];
+            int32 var_267 = const()[name = string("op_267"), val = int32(1)];
+            int32 var_270 = const()[name = string("op_270"), val = int32(-2)];
+            bool var_271 = const()[name = string("op_271"), val = bool(true)];
+            tensor<int32, [1]> var_288_axes_0 = const()[name = string("op_288_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_288_cast_fp16 = squeeze(axes = var_288_axes_0, x = x_15_cast_fp16)[name = string("op_288_cast_fp16")];
+            bool var_290_interleave_0 = const()[name = string("op_290_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_5_to_fp16 = const()[name = string("eps_chan_5_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_290_cast_fp16 = concat(axis = var_267, interleave = var_290_interleave_0, values = (var_288_cast_fp16, eps_chan_5_to_fp16))[name = string("op_290_cast_fp16")];
             tensor<int32, [1]> x_eps_5_axes_0 = const()[name = string("x_eps_5_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_5_cast_fp16 = expand_dims(axes = x_eps_5_axes_0, x = var_284_cast_fp16)[name = string("x_eps_5_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_5_cast_fp16 = expand_dims(axes = x_eps_5_axes_0, x = var_290_cast_fp16)[name = string("x_eps_5_cast_fp16")];
             tensor<int32, [1]> norm_x_5_axes_0 = const()[name = string("norm_x_5_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_5_cast_fp16 = reduce_l2_norm(axes = norm_x_5_axes_0, keep_dims = var_265, x = x_eps_5_cast_fp16)[name = string("norm_x_5_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_13_cast_fp16 = real_div(x = x_15_cast_fp16, y = norm_x_5_cast_fp16)[name = string("x_normed_13_cast_fp16")];
-            fp16 var_289_to_fp16 = const()[name = string("op_289_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_289_to_fp16)[name = string("x_normed_15_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_5_cast_fp16 = reduce_l2_norm(axes = norm_x_5_axes_0, keep_dims = var_271, x = x_eps_5_cast_fp16)[name = string("norm_x_5_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_13_cast_fp16 = real_div(x = x_15_cast_fp16, y = norm_x_5_cast_fp16)[name = string("x_normed_13_cast_fp16")];
+            fp16 var_295_to_fp16 = const()[name = string("op_295_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_295_to_fp16)[name = string("x_normed_15_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_1_weight_to_fp16 = const()[name = string("blocks_1_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303669888)))];
-            tensor<fp16, [1, 4096, 1, 1]> x_19_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = string("x_19_cast_fp16")];
-            tensor<int32, [2]> var_304 = const()[name = string("op_304"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_306 = const()[name = string("op_306"), val = tensor<int32, [2]>([1, 1])];
-            string var_308_pad_type_0 = const()[name = string("op_308_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_308_pad_0 = const()[name = string("op_308_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_308_cast_fp16 = conv(dilations = var_306, groups = var_261, pad = var_308_pad_0, pad_type = var_308_pad_type_0, strides = var_304, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_308_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_19_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = string("x_19_cast_fp16")];
+            tensor<int32, [2]> var_311 = const()[name = string("op_311"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_313 = const()[name = string("op_313"), val = tensor<int32, [2]>([1, 1])];
+            string var_315_pad_type_0 = const()[name = string("op_315_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_315_pad_0 = const()[name = string("op_315_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_315_cast_fp16 = conv(dilations = var_313, groups = var_267, pad = var_315_pad_0, pad_type = var_315_pad_type_0, strides = var_311, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_315_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303678144)))];
-            tensor<fp16, [1, 4096, 1, 1]> q_7_cast_fp16 = mul(x = var_308_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
-            tensor<int32, [2]> var_312 = const()[name = string("op_312"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_314 = const()[name = string("op_314"), val = tensor<int32, [2]>([1, 1])];
-            string var_316_pad_type_0 = const()[name = string("op_316_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_316_pad_0 = const()[name = string("op_316_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_316_cast_fp16 = conv(dilations = var_314, groups = var_261, pad = var_316_pad_0, pad_type = var_316_pad_type_0, strides = var_312, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_316_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> q_7_cast_fp16 = mul(x = var_315_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
+            tensor<int32, [2]> var_319 = const()[name = string("op_319"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_321 = const()[name = string("op_321"), val = tensor<int32, [2]>([1, 1])];
+            string var_323_pad_type_0 = const()[name = string("op_323_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_323_pad_0 = const()[name = string("op_323_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_323_cast_fp16 = conv(dilations = var_321, groups = var_267, pad = var_323_pad_0, pad_type = var_323_pad_type_0, strides = var_319, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_323_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303686400)))];
-            tensor<fp16, [1, 4096, 1, 1]> k_9_cast_fp16 = mul(x = var_316_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_9_cast_fp16")];
-            tensor<int32, [2]> var_320 = const()[name = string("op_320"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_322 = const()[name = string("op_322"), val = tensor<int32, [2]>([1, 1])];
-            string var_324_pad_type_0 = const()[name = string("op_324_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_324_pad_0 = const()[name = string("op_324_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_324_cast_fp16 = conv(dilations = var_322, groups = var_261, pad = var_324_pad_0, pad_type = var_324_pad_type_0, strides = var_320, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_324_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> k_9_cast_fp16 = mul(x = var_323_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_9_cast_fp16")];
+            tensor<int32, [2]> var_327 = const()[name = string("op_327"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_329 = const()[name = string("op_329"), val = tensor<int32, [2]>([1, 1])];
+            string var_331_pad_type_0 = const()[name = string("op_331_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_331_pad_0 = const()[name = string("op_331_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_331_cast_fp16 = conv(dilations = var_329, groups = var_267, pad = var_331_pad_0, pad_type = var_331_pad_type_0, strides = var_327, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_331_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303694656)))];
-            tensor<fp16, [1, 4096, 1, 1]> v_7_cast_fp16 = mul(x = var_324_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_7_cast_fp16")];
-            tensor<int32, [4]> var_326 = const()[name = string("op_326"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> q_9_cast_fp16 = reshape(shape = var_326, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
-            tensor<int32, [4]> var_328 = const()[name = string("op_328"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> k_11_cast_fp16 = reshape(shape = var_328, x = k_9_cast_fp16)[name = string("k_11_cast_fp16")];
-            tensor<int32, [4]> var_330 = const()[name = string("op_330"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> v_9_cast_fp16 = reshape(shape = var_330, x = v_7_cast_fp16)[name = string("v_9_cast_fp16")];
-            tensor<int32, [4]> var_342_begin_0 = const()[name = string("op_342_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_342_end_0 = const()[name = string("op_342_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_342_end_mask_0 = const()[name = string("op_342_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_342_cast_fp16 = slice_by_index(begin = var_342_begin_0, end = var_342_end_0, end_mask = var_342_end_mask_0, x = q_9_cast_fp16)[name = string("op_342_cast_fp16")];
-            tensor<int32, [4]> var_348_begin_0 = const()[name = string("op_348_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_348_end_0 = const()[name = string("op_348_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_348_end_mask_0 = const()[name = string("op_348_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_348_cast_fp16 = slice_by_index(begin = var_348_begin_0, end = var_348_end_0, end_mask = var_348_end_mask_0, x = q_9_cast_fp16)[name = string("op_348_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> v_9_cast_fp16 = mul(x = var_331_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_9_cast_fp16")];
+            tensor<int32, [4]> var_333 = const()[name = string("op_333"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> q_9_cast_fp16 = reshape(shape = var_333, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
+            tensor<int32, [4]> var_335 = const()[name = string("op_335"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> k_11_cast_fp16 = reshape(shape = var_335, x = k_9_cast_fp16)[name = string("k_11_cast_fp16")];
+            tensor<int32, [4]> var_337 = const()[name = string("op_337"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> v_11_cast_fp16 = reshape(shape = var_337, x = v_9_cast_fp16)[name = string("v_11_cast_fp16")];
+            tensor<int32, [4]> var_349_begin_0 = const()[name = string("op_349_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_349_end_0 = const()[name = string("op_349_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_349_end_mask_0 = const()[name = string("op_349_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_349_cast_fp16 = slice_by_index(begin = var_349_begin_0, end = var_349_end_0, end_mask = var_349_end_mask_0, x = q_9_cast_fp16)[name = string("op_349_cast_fp16")];
+            tensor<int32, [4]> var_355_begin_0 = const()[name = string("op_355_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_355_end_0 = const()[name = string("op_355_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_355_end_mask_0 = const()[name = string("op_355_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_355_cast_fp16 = slice_by_index(begin = var_355_begin_0, end = var_355_end_0, end_mask = var_355_end_mask_0, x = q_9_cast_fp16)[name = string("op_355_cast_fp16")];
             fp16 const_19_promoted_to_fp16 = const()[name = string("const_19_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_350_cast_fp16 = mul(x = var_348_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_350_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_357_cast_fp16 = mul(x = var_355_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_357_cast_fp16")];
             bool rotated_5_interleave_0 = const()[name = string("rotated_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_5_cast_fp16 = concat(axis = var_264, interleave = rotated_5_interleave_0, values = (var_350_cast_fp16, var_342_cast_fp16))[name = string("rotated_5_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_353_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_353_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_354_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_354_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_5_cast_fp16 = add(x = var_353_cast_fp16, y = var_354_cast_fp16)[name = string("roped_5_cast_fp16")];
-            tensor<int32, [4]> var_367_begin_0 = const()[name = string("op_367_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_367_end_0 = const()[name = string("op_367_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_367_end_mask_0 = const()[name = string("op_367_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_367_cast_fp16 = slice_by_index(begin = var_367_begin_0, end = var_367_end_0, end_mask = var_367_end_mask_0, x = k_11_cast_fp16)[name = string("op_367_cast_fp16")];
-            tensor<int32, [4]> var_373_begin_0 = const()[name = string("op_373_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_373_end_0 = const()[name = string("op_373_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_373_end_mask_0 = const()[name = string("op_373_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_373_cast_fp16 = slice_by_index(begin = var_373_begin_0, end = var_373_end_0, end_mask = var_373_end_mask_0, x = k_11_cast_fp16)[name = string("op_373_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_5_cast_fp16 = concat(axis = var_270, interleave = rotated_5_interleave_0, values = (var_357_cast_fp16, var_349_cast_fp16))[name = string("rotated_5_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_360_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_360_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_361_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_361_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_5_cast_fp16 = add(x = var_360_cast_fp16, y = var_361_cast_fp16)[name = string("roped_5_cast_fp16")];
+            tensor<int32, [4]> var_374_begin_0 = const()[name = string("op_374_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_374_end_0 = const()[name = string("op_374_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_374_end_mask_0 = const()[name = string("op_374_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_374_cast_fp16 = slice_by_index(begin = var_374_begin_0, end = var_374_end_0, end_mask = var_374_end_mask_0, x = k_11_cast_fp16)[name = string("op_374_cast_fp16")];
+            tensor<int32, [4]> var_380_begin_0 = const()[name = string("op_380_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_380_end_0 = const()[name = string("op_380_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_380_end_mask_0 = const()[name = string("op_380_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_380_cast_fp16 = slice_by_index(begin = var_380_begin_0, end = var_380_end_0, end_mask = var_380_end_mask_0, x = k_11_cast_fp16)[name = string("op_380_cast_fp16")];
             fp16 const_21_promoted_to_fp16 = const()[name = string("const_21_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_375_cast_fp16 = mul(x = var_373_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_375_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_382_cast_fp16 = mul(x = var_380_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_382_cast_fp16")];
             bool rotated_7_interleave_0 = const()[name = string("rotated_7_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_7_cast_fp16 = concat(axis = var_264, interleave = rotated_7_interleave_0, values = (var_375_cast_fp16, var_367_cast_fp16))[name = string("rotated_7_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_378_cast_fp16 = mul(x = k_11_cast_fp16, y = cos)[name = string("op_378_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_379_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = string("op_379_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_7_cast_fp16 = add(x = var_378_cast_fp16, y = var_379_cast_fp16)[name = string("roped_7_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_7_cast_fp16 = concat(axis = var_270, interleave = rotated_7_interleave_0, values = (var_382_cast_fp16, var_374_cast_fp16))[name = string("rotated_7_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_385_cast_fp16 = mul(x = k_11_cast_fp16, y = cos)[name = string("op_385_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_386_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = string("op_386_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_7_cast_fp16 = add(x = var_385_cast_fp16, y = var_386_cast_fp16)[name = string("roped_7_cast_fp16")];
+            tensor<int32, [4]> v_13_perm_0 = const()[name = string("v_13_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             bool k_15_interleave_0 = const()[name = string("k_15_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = concat(axis = var_252, interleave = k_15_interleave_0, values = (k_cache_1, roped_7_cast_fp16))[name = string("k_15_cast_fp16")];
-            bool v_11_interleave_0 = const()[name = string("v_11_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> v_11_cast_fp16 = concat(axis = var_252, interleave = v_11_interleave_0, values = (v_cache_1, v_9_cast_fp16))[name = string("v_11_cast_fp16")];
-            tensor<int32, [4]> var_386_begin_0 = const()[name = string("op_386_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_386_end_0 = const()[name = string("op_386_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_386_end_mask_0 = const()[name = string("op_386_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_1 = slice_by_index(begin = var_386_begin_0, end = var_386_end_0, end_mask = var_386_end_mask_0, x = k_15_cast_fp16)[name = string("op_386_cast_fp16")];
-            tensor<int32, [4]> var_387_begin_0 = const()[name = string("op_387_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_387_end_0 = const()[name = string("op_387_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_387_end_mask_0 = const()[name = string("op_387_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_1 = slice_by_index(begin = var_387_begin_0, end = var_387_end_0, end_mask = var_387_end_mask_0, x = v_11_cast_fp16)[name = string("op_387_cast_fp16")];
-            fp16 var_391_to_fp16 = const()[name = string("op_391_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 1]> var_392_cast_fp16 = mul(x = roped_5_cast_fp16, y = var_391_to_fp16)[name = string("op_392_cast_fp16")];
-            bool attn_weights_5_transpose_x_0 = const()[name = string("attn_weights_5_transpose_x_0"), val = bool(true)];
-            bool attn_weights_5_transpose_y_0 = const()[name = string("attn_weights_5_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_5_cast_fp16 = matmul(transpose_x = attn_weights_5_transpose_x_0, transpose_y = attn_weights_5_transpose_y_0, x = var_392_cast_fp16, y = k_15_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_7_cast_fp16 = add(x = attn_weights_5_cast_fp16, y = mask)[name = string("attn_weights_7_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> var_400_cast_fp16 = softmax(axis = var_260, x = attn_weights_7_cast_fp16)[name = string("op_400_cast_fp16")];
-            bool attn_5_transpose_x_0 = const()[name = string("attn_5_transpose_x_0"), val = bool(false)];
-            bool attn_5_transpose_y_0 = const()[name = string("attn_5_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 1]> attn_5_cast_fp16 = matmul(transpose_x = attn_5_transpose_x_0, transpose_y = attn_5_transpose_y_0, x = v_11_cast_fp16, y = var_400_cast_fp16)[name = string("attn_5_cast_fp16")];
-            tensor<int32, [4]> var_404 = const()[name = string("op_404"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
-            tensor<fp16, [1, 4096, 1, 1]> input_9_cast_fp16 = reshape(shape = var_404, x = attn_5_cast_fp16)[name = string("input_9_cast_fp16")];
-            tensor<int32, [2]> var_408 = const()[name = string("op_408"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_410 = const()[name = string("op_410"), val = tensor<int32, [2]>([1, 1])];
-            string var_412_pad_type_0 = const()[name = string("op_412_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_412_pad_0 = const()[name = string("op_412_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_412_cast_fp16 = conv(dilations = var_410, groups = var_261, pad = var_412_pad_0, pad_type = var_412_pad_type_0, strides = var_408, weight = blocks_1_attn_proj_weight_palettized_cast_fp16, x = input_9_cast_fp16)[name = string("op_412_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = concat(axis = var_258, interleave = k_15_interleave_0, values = (k_cache_1, roped_7_cast_fp16))[name = string("k_15_cast_fp16")];
+            bool v_15_interleave_0 = const()[name = string("v_15_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> v_13_cast_fp16 = transpose(perm = v_13_perm_0, x = v_11_cast_fp16)[name = string("transpose_3")];
+            tensor<fp16, [1, 32, 512, 128]> v_15_cast_fp16 = concat(axis = var_270, interleave = v_15_interleave_0, values = (v_cache_1, v_13_cast_fp16))[name = string("v_15_cast_fp16")];
+            tensor<int32, [4]> var_397_begin_0 = const()[name = string("op_397_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_397_end_0 = const()[name = string("op_397_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_397_end_mask_0 = const()[name = string("op_397_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_1 = slice_by_index(begin = var_397_begin_0, end = var_397_end_0, end_mask = var_397_end_mask_0, x = k_15_cast_fp16)[name = string("op_397_cast_fp16")];
+            tensor<int32, [4]> var_398_begin_0 = const()[name = string("op_398_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_398_end_0 = const()[name = string("op_398_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_398_end_mask_0 = const()[name = string("op_398_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_1 = slice_by_index(begin = var_398_begin_0, end = var_398_end_0, end_mask = var_398_end_mask_0, x = v_15_cast_fp16)[name = string("op_398_cast_fp16")];
+            fp16 var_403_to_fp16 = const()[name = string("op_403_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 32, 128, 4]> var_404_cast_fp16 = mul(x = roped_5_cast_fp16, y = var_403_to_fp16)[name = string("op_404_cast_fp16")];
+            bool attn_weights_7_transpose_x_0 = const()[name = string("attn_weights_7_transpose_x_0"), val = bool(true)];
+            bool attn_weights_7_transpose_y_0 = const()[name = string("attn_weights_7_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_7_cast_fp16 = matmul(transpose_x = attn_weights_7_transpose_x_0, transpose_y = attn_weights_7_transpose_y_0, x = var_404_cast_fp16, y = k_15_cast_fp16)[name = string("attn_weights_7_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_9_cast_fp16 = add(x = attn_weights_7_cast_fp16, y = mask)[name = string("attn_weights_9_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_11_cast_fp16 = softmax(axis = var_266, x = attn_weights_9_cast_fp16)[name = string("attn_weights_11_cast_fp16")];
+            bool var_413_transpose_x_0 = const()[name = string("op_413_transpose_x_0"), val = bool(false)];
+            bool var_413_transpose_y_0 = const()[name = string("op_413_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> var_413_cast_fp16 = matmul(transpose_x = var_413_transpose_x_0, transpose_y = var_413_transpose_y_0, x = attn_weights_11_cast_fp16, y = v_15_cast_fp16)[name = string("op_413_cast_fp16")];
+            tensor<int32, [4]> attn_3_perm_0 = const()[name = string("attn_3_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_416 = const()[name = string("op_416"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 4]> attn_3_cast_fp16 = transpose(perm = attn_3_perm_0, x = var_413_cast_fp16)[name = string("transpose_2")];
+            tensor<fp16, [1, 4096, 1, 4]> input_9_cast_fp16 = reshape(shape = var_416, x = attn_3_cast_fp16)[name = string("input_9_cast_fp16")];
+            tensor<int32, [2]> var_420 = const()[name = string("op_420"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_422 = const()[name = string("op_422"), val = tensor<int32, [2]>([1, 1])];
+            string var_424_pad_type_0 = const()[name = string("op_424_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_424_pad_0 = const()[name = string("op_424_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_424_cast_fp16 = conv(dilations = var_422, groups = var_267, pad = var_424_pad_0, pad_type = var_424_pad_type_0, strides = var_420, weight = blocks_1_attn_proj_weight_palettized_cast_fp16, x = input_9_cast_fp16)[name = string("op_424_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303702912)))];
-            tensor<fp16, [1, 4096, 1, 1]> attention_output_3_cast_fp16 = mul(x = var_412_cast_fp16, y = blocks_1_attn_proj_output_scales_to_fp16)[name = string("attention_output_3_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_25_cast_fp16 = add(x = attention_output_3_cast_fp16, y = x_15_cast_fp16)[name = string("x_25_cast_fp16")];
-            tensor<int32, [1]> var_431_axes_0 = const()[name = string("op_431_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_431_cast_fp16 = squeeze(axes = var_431_axes_0, x = x_25_cast_fp16)[name = string("op_431_cast_fp16")];
-            bool var_433_interleave_0 = const()[name = string("op_433_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_7_to_fp16 = const()[name = string("eps_chan_7_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_433_cast_fp16 = concat(axis = var_261, interleave = var_433_interleave_0, values = (var_431_cast_fp16, eps_chan_7_to_fp16))[name = string("op_433_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> attention_output_3_cast_fp16 = mul(x = var_424_cast_fp16, y = blocks_1_attn_proj_output_scales_to_fp16)[name = string("attention_output_3_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_25_cast_fp16 = add(x = attention_output_3_cast_fp16, y = x_15_cast_fp16)[name = string("x_25_cast_fp16")];
+            tensor<int32, [1]> var_443_axes_0 = const()[name = string("op_443_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_443_cast_fp16 = squeeze(axes = var_443_axes_0, x = x_25_cast_fp16)[name = string("op_443_cast_fp16")];
+            bool var_445_interleave_0 = const()[name = string("op_445_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_7_to_fp16 = const()[name = string("eps_chan_7_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_445_cast_fp16 = concat(axis = var_267, interleave = var_445_interleave_0, values = (var_443_cast_fp16, eps_chan_7_to_fp16))[name = string("op_445_cast_fp16")];
             tensor<int32, [1]> x_eps_7_axes_0 = const()[name = string("x_eps_7_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_7_cast_fp16 = expand_dims(axes = x_eps_7_axes_0, x = var_433_cast_fp16)[name = string("x_eps_7_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_7_cast_fp16 = expand_dims(axes = x_eps_7_axes_0, x = var_445_cast_fp16)[name = string("x_eps_7_cast_fp16")];
             tensor<int32, [1]> norm_x_7_axes_0 = const()[name = string("norm_x_7_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_7_cast_fp16 = reduce_l2_norm(axes = norm_x_7_axes_0, keep_dims = var_265, x = x_eps_7_cast_fp16)[name = string("norm_x_7_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_19_cast_fp16 = real_div(x = x_25_cast_fp16, y = norm_x_7_cast_fp16)[name = string("x_normed_19_cast_fp16")];
-            fp16 var_438_to_fp16 = const()[name = string("op_438_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_21_cast_fp16 = mul(x = x_normed_19_cast_fp16, y = var_438_to_fp16)[name = string("x_normed_21_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_7_cast_fp16 = reduce_l2_norm(axes = norm_x_7_axes_0, keep_dims = var_271, x = x_eps_7_cast_fp16)[name = string("norm_x_7_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_19_cast_fp16 = real_div(x = x_25_cast_fp16, y = norm_x_7_cast_fp16)[name = string("x_normed_19_cast_fp16")];
+            fp16 var_450_to_fp16 = const()[name = string("op_450_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_21_cast_fp16 = mul(x = x_normed_19_cast_fp16, y = var_450_to_fp16)[name = string("x_normed_21_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_2_weight_to_fp16 = const()[name = string("blocks_1_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303711168)))];
-            tensor<fp16, [1, 4096, 1, 1]> input_11_cast_fp16 = mul(x = x_normed_21_cast_fp16, y = blocks_1_norm_2_weight_to_fp16)[name = string("input_11_cast_fp16")];
-            tensor<int32, [2]> var_450 = const()[name = string("op_450"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_452 = const()[name = string("op_452"), val = tensor<int32, [2]>([1, 1])];
-            string var_454_pad_type_0 = const()[name = string("op_454_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_454_pad_0 = const()[name = string("op_454_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_454_cast_fp16 = conv(dilations = var_452, groups = var_261, pad = var_454_pad_0, pad_type = var_454_pad_type_0, strides = var_450, weight = blocks_1_mlp_fc_1_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_454_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> input_11_cast_fp16 = mul(x = x_normed_21_cast_fp16, y = blocks_1_norm_2_weight_to_fp16)[name = string("input_11_cast_fp16")];
+            tensor<int32, [2]> var_462 = const()[name = string("op_462"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_464 = const()[name = string("op_464"), val = tensor<int32, [2]>([1, 1])];
+            string var_466_pad_type_0 = const()[name = string("op_466_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_466_pad_0 = const()[name = string("op_466_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_466_cast_fp16 = conv(dilations = var_464, groups = var_267, pad = var_466_pad_0, pad_type = var_466_pad_type_0, strides = var_462, weight = blocks_1_mlp_fc_1_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_466_cast_fp16")];
             tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303719424)))];
-            tensor<fp16, [1, 11008, 1, 1]> input_13_cast_fp16 = mul(x = var_454_cast_fp16, y = blocks_1_mlp_fc_1_output_scales_to_fp16)[name = string("input_13_cast_fp16")];
-            tensor<int32, [2]> var_458 = const()[name = string("op_458"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_460 = const()[name = string("op_460"), val = tensor<int32, [2]>([1, 1])];
-            string var_462_pad_type_0 = const()[name = string("op_462_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_462_pad_0 = const()[name = string("op_462_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_462_cast_fp16 = conv(dilations = var_460, groups = var_261, pad = var_462_pad_0, pad_type = var_462_pad_type_0, strides = var_458, weight = blocks_1_mlp_fc_2_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_462_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303741504)))];
-            tensor<fp16, [1, 11008, 1, 1]> x_fc_2_3_cast_fp16 = mul(x = var_462_cast_fp16, y = blocks_1_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_3_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> var_464_cast_fp16 = silu(x = input_13_cast_fp16)[name = string("op_464_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> input_15_cast_fp16 = mul(x = var_464_cast_fp16, y = x_fc_2_3_cast_fp16)[name = string("input_15_cast_fp16")];
-            tensor<int32, [2]> var_468 = const()[name = string("op_468"), val = tensor<int32, [2]>([1, 1])];
+            tensor<fp16, [1, 11008, 1, 4]> input_13_cast_fp16 = mul(x = var_466_cast_fp16, y = blocks_1_mlp_fc_1_output_scales_to_fp16)[name = string("input_13_cast_fp16")];
             tensor<int32, [2]> var_470 = const()[name = string("op_470"), val = tensor<int32, [2]>([1, 1])];
-            string var_472_pad_type_0 = const()[name = string("op_472_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_472_pad_0 = const()[name = string("op_472_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_472_cast_fp16 = conv(dilations = var_470, groups = var_261, pad = var_472_pad_0, pad_type = var_472_pad_type_0, strides = var_468, weight = blocks_1_mlp_proj_weight_palettized_cast_fp16, x = input_15_cast_fp16)[name = string("op_472_cast_fp16")];
+            tensor<int32, [2]> var_472 = const()[name = string("op_472"), val = tensor<int32, [2]>([1, 1])];
+            string var_474_pad_type_0 = const()[name = string("op_474_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_474_pad_0 = const()[name = string("op_474_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_474_cast_fp16 = conv(dilations = var_472, groups = var_267, pad = var_474_pad_0, pad_type = var_474_pad_type_0, strides = var_470, weight = blocks_1_mlp_fc_2_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_474_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303741504)))];
+            tensor<fp16, [1, 11008, 1, 4]> x_fc_2_3_cast_fp16 = mul(x = var_474_cast_fp16, y = blocks_1_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_3_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> var_476_cast_fp16 = silu(x = input_13_cast_fp16)[name = string("op_476_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> input_15_cast_fp16 = mul(x = var_476_cast_fp16, y = x_fc_2_3_cast_fp16)[name = string("input_15_cast_fp16")];
+            tensor<int32, [2]> var_480 = const()[name = string("op_480"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_482 = const()[name = string("op_482"), val = tensor<int32, [2]>([1, 1])];
+            string var_484_pad_type_0 = const()[name = string("op_484_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_484_pad_0 = const()[name = string("op_484_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_484_cast_fp16 = conv(dilations = var_482, groups = var_267, pad = var_484_pad_0, pad_type = var_484_pad_type_0, strides = var_480, weight = blocks_1_mlp_proj_weight_palettized_cast_fp16, x = input_15_cast_fp16)[name = string("op_484_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303763584)))];
-            tensor<fp16, [1, 4096, 1, 1]> var_473_cast_fp16 = mul(x = var_472_cast_fp16, y = blocks_1_mlp_proj_output_scales_to_fp16)[name = string("op_473_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_29_cast_fp16 = add(x = var_473_cast_fp16, y = x_25_cast_fp16)[name = string("x_29_cast_fp16")];
-            int32 var_484 = const()[name = string("op_484"), val = int32(-1)];
-            int32 var_492 = const()[name = string("op_492"), val = int32(3)];
-            int32 var_493 = const()[name = string("op_493"), val = int32(1)];
-            int32 var_496 = const()[name = string("op_496"), val = int32(-2)];
-            bool var_497 = const()[name = string("op_497"), val = bool(true)];
-            tensor<int32, [1]> var_514_axes_0 = const()[name = string("op_514_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_514_cast_fp16 = squeeze(axes = var_514_axes_0, x = x_29_cast_fp16)[name = string("op_514_cast_fp16")];
-            bool var_516_interleave_0 = const()[name = string("op_516_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_9_to_fp16 = const()[name = string("eps_chan_9_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_516_cast_fp16 = concat(axis = var_493, interleave = var_516_interleave_0, values = (var_514_cast_fp16, eps_chan_9_to_fp16))[name = string("op_516_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> var_485_cast_fp16 = mul(x = var_484_cast_fp16, y = blocks_1_mlp_proj_output_scales_to_fp16)[name = string("op_485_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_29_cast_fp16 = add(x = var_485_cast_fp16, y = x_25_cast_fp16)[name = string("x_29_cast_fp16")];
+            int32 var_496 = const()[name = string("op_496"), val = int32(-1)];
+            int32 var_504 = const()[name = string("op_504"), val = int32(3)];
+            int32 var_505 = const()[name = string("op_505"), val = int32(1)];
+            int32 var_508 = const()[name = string("op_508"), val = int32(-2)];
+            bool var_509 = const()[name = string("op_509"), val = bool(true)];
+            tensor<int32, [1]> var_526_axes_0 = const()[name = string("op_526_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_526_cast_fp16 = squeeze(axes = var_526_axes_0, x = x_29_cast_fp16)[name = string("op_526_cast_fp16")];
+            bool var_528_interleave_0 = const()[name = string("op_528_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_9_to_fp16 = const()[name = string("eps_chan_9_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_528_cast_fp16 = concat(axis = var_505, interleave = var_528_interleave_0, values = (var_526_cast_fp16, eps_chan_9_to_fp16))[name = string("op_528_cast_fp16")];
             tensor<int32, [1]> x_eps_9_axes_0 = const()[name = string("x_eps_9_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_9_cast_fp16 = expand_dims(axes = x_eps_9_axes_0, x = var_516_cast_fp16)[name = string("x_eps_9_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_9_cast_fp16 = expand_dims(axes = x_eps_9_axes_0, x = var_528_cast_fp16)[name = string("x_eps_9_cast_fp16")];
             tensor<int32, [1]> norm_x_9_axes_0 = const()[name = string("norm_x_9_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_9_cast_fp16 = reduce_l2_norm(axes = norm_x_9_axes_0, keep_dims = var_497, x = x_eps_9_cast_fp16)[name = string("norm_x_9_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_25_cast_fp16 = real_div(x = x_29_cast_fp16, y = norm_x_9_cast_fp16)[name = string("x_normed_25_cast_fp16")];
-            fp16 var_521_to_fp16 = const()[name = string("op_521_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_27_cast_fp16 = mul(x = x_normed_25_cast_fp16, y = var_521_to_fp16)[name = string("x_normed_27_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_9_cast_fp16 = reduce_l2_norm(axes = norm_x_9_axes_0, keep_dims = var_509, x = x_eps_9_cast_fp16)[name = string("norm_x_9_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_25_cast_fp16 = real_div(x = x_29_cast_fp16, y = norm_x_9_cast_fp16)[name = string("x_normed_25_cast_fp16")];
+            fp16 var_533_to_fp16 = const()[name = string("op_533_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_27_cast_fp16 = mul(x = x_normed_25_cast_fp16, y = var_533_to_fp16)[name = string("x_normed_27_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_norm_1_weight_to_fp16 = const()[name = string("blocks_2_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303771840)))];
-            tensor<fp16, [1, 4096, 1, 1]> x_33_cast_fp16 = mul(x = x_normed_27_cast_fp16, y = blocks_2_norm_1_weight_to_fp16)[name = string("x_33_cast_fp16")];
-            tensor<int32, [2]> var_536 = const()[name = string("op_536"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_538 = const()[name = string("op_538"), val = tensor<int32, [2]>([1, 1])];
-            string var_540_pad_type_0 = const()[name = string("op_540_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_540_pad_0 = const()[name = string("op_540_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_540_cast_fp16 = conv(dilations = var_538, groups = var_493, pad = var_540_pad_0, pad_type = var_540_pad_type_0, strides = var_536, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_540_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_33_cast_fp16 = mul(x = x_normed_27_cast_fp16, y = blocks_2_norm_1_weight_to_fp16)[name = string("x_33_cast_fp16")];
+            tensor<int32, [2]> var_549 = const()[name = string("op_549"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_551 = const()[name = string("op_551"), val = tensor<int32, [2]>([1, 1])];
+            string var_553_pad_type_0 = const()[name = string("op_553_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_553_pad_0 = const()[name = string("op_553_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_553_cast_fp16 = conv(dilations = var_551, groups = var_505, pad = var_553_pad_0, pad_type = var_553_pad_type_0, strides = var_549, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_553_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303780096)))];
-            tensor<fp16, [1, 4096, 1, 1]> q_13_cast_fp16 = mul(x = var_540_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = string("q_13_cast_fp16")];
-            tensor<int32, [2]> var_544 = const()[name = string("op_544"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_546 = const()[name = string("op_546"), val = tensor<int32, [2]>([1, 1])];
-            string var_548_pad_type_0 = const()[name = string("op_548_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_548_pad_0 = const()[name = string("op_548_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_548_cast_fp16 = conv(dilations = var_546, groups = var_493, pad = var_548_pad_0, pad_type = var_548_pad_type_0, strides = var_544, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_548_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> q_13_cast_fp16 = mul(x = var_553_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = string("q_13_cast_fp16")];
+            tensor<int32, [2]> var_557 = const()[name = string("op_557"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_559 = const()[name = string("op_559"), val = tensor<int32, [2]>([1, 1])];
+            string var_561_pad_type_0 = const()[name = string("op_561_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_561_pad_0 = const()[name = string("op_561_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_561_cast_fp16 = conv(dilations = var_559, groups = var_505, pad = var_561_pad_0, pad_type = var_561_pad_type_0, strides = var_557, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_561_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303788352)))];
-            tensor<fp16, [1, 4096, 1, 1]> k_17_cast_fp16 = mul(x = var_548_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = string("k_17_cast_fp16")];
-            tensor<int32, [2]> var_552 = const()[name = string("op_552"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_554 = const()[name = string("op_554"), val = tensor<int32, [2]>([1, 1])];
-            string var_556_pad_type_0 = const()[name = string("op_556_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_556_pad_0 = const()[name = string("op_556_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_556_cast_fp16 = conv(dilations = var_554, groups = var_493, pad = var_556_pad_0, pad_type = var_556_pad_type_0, strides = var_552, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_556_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> k_17_cast_fp16 = mul(x = var_561_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = string("k_17_cast_fp16")];
+            tensor<int32, [2]> var_565 = const()[name = string("op_565"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_567 = const()[name = string("op_567"), val = tensor<int32, [2]>([1, 1])];
+            string var_569_pad_type_0 = const()[name = string("op_569_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_569_pad_0 = const()[name = string("op_569_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_569_cast_fp16 = conv(dilations = var_567, groups = var_505, pad = var_569_pad_0, pad_type = var_569_pad_type_0, strides = var_565, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_569_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303796608)))];
-            tensor<fp16, [1, 4096, 1, 1]> v_13_cast_fp16 = mul(x = var_556_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = string("v_13_cast_fp16")];
-            tensor<int32, [4]> var_558 = const()[name = string("op_558"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> q_15_cast_fp16 = reshape(shape = var_558, x = q_13_cast_fp16)[name = string("q_15_cast_fp16")];
-            tensor<int32, [4]> var_560 = const()[name = string("op_560"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> k_19_cast_fp16 = reshape(shape = var_560, x = k_17_cast_fp16)[name = string("k_19_cast_fp16")];
-            tensor<int32, [4]> var_562 = const()[name = string("op_562"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> v_15_cast_fp16 = reshape(shape = var_562, x = v_13_cast_fp16)[name = string("v_15_cast_fp16")];
-            tensor<int32, [4]> var_574_begin_0 = const()[name = string("op_574_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_574_end_0 = const()[name = string("op_574_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_574_end_mask_0 = const()[name = string("op_574_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_574_cast_fp16 = slice_by_index(begin = var_574_begin_0, end = var_574_end_0, end_mask = var_574_end_mask_0, x = q_15_cast_fp16)[name = string("op_574_cast_fp16")];
-            tensor<int32, [4]> var_580_begin_0 = const()[name = string("op_580_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_580_end_0 = const()[name = string("op_580_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_580_end_mask_0 = const()[name = string("op_580_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_580_cast_fp16 = slice_by_index(begin = var_580_begin_0, end = var_580_end_0, end_mask = var_580_end_mask_0, x = q_15_cast_fp16)[name = string("op_580_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> v_17_cast_fp16 = mul(x = var_569_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = string("v_17_cast_fp16")];
+            tensor<int32, [4]> var_571 = const()[name = string("op_571"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> q_15_cast_fp16 = reshape(shape = var_571, x = q_13_cast_fp16)[name = string("q_15_cast_fp16")];
+            tensor<int32, [4]> var_573 = const()[name = string("op_573"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> k_19_cast_fp16 = reshape(shape = var_573, x = k_17_cast_fp16)[name = string("k_19_cast_fp16")];
+            tensor<int32, [4]> var_575 = const()[name = string("op_575"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> v_19_cast_fp16 = reshape(shape = var_575, x = v_17_cast_fp16)[name = string("v_19_cast_fp16")];
+            tensor<int32, [4]> var_587_begin_0 = const()[name = string("op_587_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_587_end_0 = const()[name = string("op_587_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_587_end_mask_0 = const()[name = string("op_587_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_587_cast_fp16 = slice_by_index(begin = var_587_begin_0, end = var_587_end_0, end_mask = var_587_end_mask_0, x = q_15_cast_fp16)[name = string("op_587_cast_fp16")];
+            tensor<int32, [4]> var_593_begin_0 = const()[name = string("op_593_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_593_end_0 = const()[name = string("op_593_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_593_end_mask_0 = const()[name = string("op_593_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_593_cast_fp16 = slice_by_index(begin = var_593_begin_0, end = var_593_end_0, end_mask = var_593_end_mask_0, x = q_15_cast_fp16)[name = string("op_593_cast_fp16")];
             fp16 const_32_promoted_to_fp16 = const()[name = string("const_32_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_582_cast_fp16 = mul(x = var_580_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_582_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_595_cast_fp16 = mul(x = var_593_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_595_cast_fp16")];
             bool rotated_9_interleave_0 = const()[name = string("rotated_9_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_9_cast_fp16 = concat(axis = var_496, interleave = rotated_9_interleave_0, values = (var_582_cast_fp16, var_574_cast_fp16))[name = string("rotated_9_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_585_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = string("op_585_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_586_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = string("op_586_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_9_cast_fp16 = add(x = var_585_cast_fp16, y = var_586_cast_fp16)[name = string("roped_9_cast_fp16")];
-            tensor<int32, [4]> var_599_begin_0 = const()[name = string("op_599_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_599_end_0 = const()[name = string("op_599_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_599_end_mask_0 = const()[name = string("op_599_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_599_cast_fp16 = slice_by_index(begin = var_599_begin_0, end = var_599_end_0, end_mask = var_599_end_mask_0, x = k_19_cast_fp16)[name = string("op_599_cast_fp16")];
-            tensor<int32, [4]> var_605_begin_0 = const()[name = string("op_605_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_605_end_0 = const()[name = string("op_605_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_605_end_mask_0 = const()[name = string("op_605_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_605_cast_fp16 = slice_by_index(begin = var_605_begin_0, end = var_605_end_0, end_mask = var_605_end_mask_0, x = k_19_cast_fp16)[name = string("op_605_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_9_cast_fp16 = concat(axis = var_508, interleave = rotated_9_interleave_0, values = (var_595_cast_fp16, var_587_cast_fp16))[name = string("rotated_9_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_598_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = string("op_598_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_599_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = string("op_599_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_9_cast_fp16 = add(x = var_598_cast_fp16, y = var_599_cast_fp16)[name = string("roped_9_cast_fp16")];
+            tensor<int32, [4]> var_612_begin_0 = const()[name = string("op_612_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_612_end_0 = const()[name = string("op_612_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_612_end_mask_0 = const()[name = string("op_612_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_612_cast_fp16 = slice_by_index(begin = var_612_begin_0, end = var_612_end_0, end_mask = var_612_end_mask_0, x = k_19_cast_fp16)[name = string("op_612_cast_fp16")];
+            tensor<int32, [4]> var_618_begin_0 = const()[name = string("op_618_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_618_end_0 = const()[name = string("op_618_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_618_end_mask_0 = const()[name = string("op_618_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_618_cast_fp16 = slice_by_index(begin = var_618_begin_0, end = var_618_end_0, end_mask = var_618_end_mask_0, x = k_19_cast_fp16)[name = string("op_618_cast_fp16")];
             fp16 const_34_promoted_to_fp16 = const()[name = string("const_34_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_607_cast_fp16 = mul(x = var_605_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_607_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_620_cast_fp16 = mul(x = var_618_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_620_cast_fp16")];
             bool rotated_interleave_0 = const()[name = string("rotated_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_cast_fp16 = concat(axis = var_496, interleave = rotated_interleave_0, values = (var_607_cast_fp16, var_599_cast_fp16))[name = string("rotated_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_610_cast_fp16 = mul(x = k_19_cast_fp16, y = cos)[name = string("op_610_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_611_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_611_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_cast_fp16 = add(x = var_610_cast_fp16, y = var_611_cast_fp16)[name = string("roped_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_cast_fp16 = concat(axis = var_508, interleave = rotated_interleave_0, values = (var_620_cast_fp16, var_612_cast_fp16))[name = string("rotated_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_623_cast_fp16 = mul(x = k_19_cast_fp16, y = cos)[name = string("op_623_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_624_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_624_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_cast_fp16 = add(x = var_623_cast_fp16, y = var_624_cast_fp16)[name = string("roped_cast_fp16")];
+            tensor<int32, [4]> v_21_perm_0 = const()[name = string("v_21_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             bool k_interleave_0 = const()[name = string("k_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_cast_fp16 = concat(axis = var_484, interleave = k_interleave_0, values = (k_cache_2, roped_cast_fp16))[name = string("k_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> k_cast_fp16 = concat(axis = var_496, interleave = k_interleave_0, values = (k_cache_2, roped_cast_fp16))[name = string("k_cast_fp16")];
             bool v_interleave_0 = const()[name = string("v_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> v_cast_fp16 = concat(axis = var_484, interleave = v_interleave_0, values = (v_cache_2, v_15_cast_fp16))[name = string("v_cast_fp16")];
-            tensor<int32, [4]> var_618_begin_0 = const()[name = string("op_618_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_618_end_0 = const()[name = string("op_618_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_618_end_mask_0 = const()[name = string("op_618_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_2 = slice_by_index(begin = var_618_begin_0, end = var_618_end_0, end_mask = var_618_end_mask_0, x = k_cast_fp16)[name = string("op_618_cast_fp16")];
-            tensor<int32, [4]> var_619_begin_0 = const()[name = string("op_619_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_619_end_0 = const()[name = string("op_619_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_619_end_mask_0 = const()[name = string("op_619_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_2 = slice_by_index(begin = var_619_begin_0, end = var_619_end_0, end_mask = var_619_end_mask_0, x = v_cast_fp16)[name = string("op_619_cast_fp16")];
-            fp16 var_623_to_fp16 = const()[name = string("op_623_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 1]> var_624_cast_fp16 = mul(x = roped_9_cast_fp16, y = var_623_to_fp16)[name = string("op_624_cast_fp16")];
-            bool attn_weights_9_transpose_x_0 = const()[name = string("attn_weights_9_transpose_x_0"), val = bool(true)];
-            bool attn_weights_9_transpose_y_0 = const()[name = string("attn_weights_9_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_9_cast_fp16 = matmul(transpose_x = attn_weights_9_transpose_x_0, transpose_y = attn_weights_9_transpose_y_0, x = var_624_cast_fp16, y = k_cast_fp16)[name = string("attn_weights_9_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_cast_fp16 = add(x = attn_weights_9_cast_fp16, y = mask)[name = string("attn_weights_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> var_632_cast_fp16 = softmax(axis = var_492, x = attn_weights_cast_fp16)[name = string("op_632_cast_fp16")];
-            bool attn_9_transpose_x_0 = const()[name = string("attn_9_transpose_x_0"), val = bool(false)];
-            bool attn_9_transpose_y_0 = const()[name = string("attn_9_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 1]> attn_9_cast_fp16 = matmul(transpose_x = attn_9_transpose_x_0, transpose_y = attn_9_transpose_y_0, x = v_cast_fp16, y = var_632_cast_fp16)[name = string("attn_9_cast_fp16")];
-            tensor<int32, [4]> var_636 = const()[name = string("op_636"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
-            tensor<fp16, [1, 4096, 1, 1]> input_17_cast_fp16 = reshape(shape = var_636, x = attn_9_cast_fp16)[name = string("input_17_cast_fp16")];
-            tensor<int32, [2]> var_640 = const()[name = string("op_640"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_642 = const()[name = string("op_642"), val = tensor<int32, [2]>([1, 1])];
-            string var_644_pad_type_0 = const()[name = string("op_644_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_644_pad_0 = const()[name = string("op_644_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_644_cast_fp16 = conv(dilations = var_642, groups = var_493, pad = var_644_pad_0, pad_type = var_644_pad_type_0, strides = var_640, weight = blocks_2_attn_proj_weight_palettized_cast_fp16, x = input_17_cast_fp16)[name = string("op_644_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 128]> v_21_cast_fp16 = transpose(perm = v_21_perm_0, x = v_19_cast_fp16)[name = string("transpose_1")];
+            tensor<fp16, [1, 32, 512, 128]> v_cast_fp16 = concat(axis = var_508, interleave = v_interleave_0, values = (v_cache_2, v_21_cast_fp16))[name = string("v_cast_fp16")];
+            tensor<int32, [4]> var_635_begin_0 = const()[name = string("op_635_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_635_end_0 = const()[name = string("op_635_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_635_end_mask_0 = const()[name = string("op_635_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_2 = slice_by_index(begin = var_635_begin_0, end = var_635_end_0, end_mask = var_635_end_mask_0, x = k_cast_fp16)[name = string("op_635_cast_fp16")];
+            tensor<int32, [4]> var_636_begin_0 = const()[name = string("op_636_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_636_end_0 = const()[name = string("op_636_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_636_end_mask_0 = const()[name = string("op_636_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_2 = slice_by_index(begin = var_636_begin_0, end = var_636_end_0, end_mask = var_636_end_mask_0, x = v_cast_fp16)[name = string("op_636_cast_fp16")];
+            fp16 var_641_to_fp16 = const()[name = string("op_641_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 32, 128, 4]> var_642_cast_fp16 = mul(x = roped_9_cast_fp16, y = var_641_to_fp16)[name = string("op_642_cast_fp16")];
+            bool attn_weights_13_transpose_x_0 = const()[name = string("attn_weights_13_transpose_x_0"), val = bool(true)];
+            bool attn_weights_13_transpose_y_0 = const()[name = string("attn_weights_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_13_cast_fp16 = matmul(transpose_x = attn_weights_13_transpose_x_0, transpose_y = attn_weights_13_transpose_y_0, x = var_642_cast_fp16, y = k_cast_fp16)[name = string("attn_weights_13_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_15_cast_fp16 = add(x = attn_weights_13_cast_fp16, y = mask)[name = string("attn_weights_15_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_cast_fp16 = softmax(axis = var_504, x = attn_weights_15_cast_fp16)[name = string("attn_weights_cast_fp16")];
+            bool var_651_transpose_x_0 = const()[name = string("op_651_transpose_x_0"), val = bool(false)];
+            bool var_651_transpose_y_0 = const()[name = string("op_651_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> var_651_cast_fp16 = matmul(transpose_x = var_651_transpose_x_0, transpose_y = var_651_transpose_y_0, x = attn_weights_cast_fp16, y = v_cast_fp16)[name = string("op_651_cast_fp16")];
+            tensor<int32, [4]> attn_5_perm_0 = const()[name = string("attn_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_654 = const()[name = string("op_654"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 4]> attn_5_cast_fp16 = transpose(perm = attn_5_perm_0, x = var_651_cast_fp16)[name = string("transpose_0")];
+            tensor<fp16, [1, 4096, 1, 4]> input_17_cast_fp16 = reshape(shape = var_654, x = attn_5_cast_fp16)[name = string("input_17_cast_fp16")];
+            tensor<int32, [2]> var_658 = const()[name = string("op_658"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_660 = const()[name = string("op_660"), val = tensor<int32, [2]>([1, 1])];
+            string var_662_pad_type_0 = const()[name = string("op_662_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_662_pad_0 = const()[name = string("op_662_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_662_cast_fp16 = conv(dilations = var_660, groups = var_505, pad = var_662_pad_0, pad_type = var_662_pad_type_0, strides = var_658, weight = blocks_2_attn_proj_weight_palettized_cast_fp16, x = input_17_cast_fp16)[name = string("op_662_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303804864)))];
-            tensor<fp16, [1, 4096, 1, 1]> attention_output_cast_fp16 = mul(x = var_644_cast_fp16, y = blocks_2_attn_proj_output_scales_to_fp16)[name = string("attention_output_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_39_cast_fp16 = add(x = attention_output_cast_fp16, y = x_29_cast_fp16)[name = string("x_39_cast_fp16")];
-            tensor<int32, [1]> var_663_axes_0 = const()[name = string("op_663_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_663_cast_fp16 = squeeze(axes = var_663_axes_0, x = x_39_cast_fp16)[name = string("op_663_cast_fp16")];
-            bool var_665_interleave_0 = const()[name = string("op_665_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_to_fp16 = const()[name = string("eps_chan_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_665_cast_fp16 = concat(axis = var_493, interleave = var_665_interleave_0, values = (var_663_cast_fp16, eps_chan_to_fp16))[name = string("op_665_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> attention_output_cast_fp16 = mul(x = var_662_cast_fp16, y = blocks_2_attn_proj_output_scales_to_fp16)[name = string("attention_output_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_39_cast_fp16 = add(x = attention_output_cast_fp16, y = x_29_cast_fp16)[name = string("x_39_cast_fp16")];
+            tensor<int32, [1]> var_681_axes_0 = const()[name = string("op_681_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_681_cast_fp16 = squeeze(axes = var_681_axes_0, x = x_39_cast_fp16)[name = string("op_681_cast_fp16")];
+            bool var_683_interleave_0 = const()[name = string("op_683_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_to_fp16 = const()[name = string("eps_chan_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_683_cast_fp16 = concat(axis = var_505, interleave = var_683_interleave_0, values = (var_681_cast_fp16, eps_chan_to_fp16))[name = string("op_683_cast_fp16")];
             tensor<int32, [1]> x_eps_axes_0 = const()[name = string("x_eps_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_cast_fp16 = expand_dims(axes = x_eps_axes_0, x = var_665_cast_fp16)[name = string("x_eps_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_cast_fp16 = expand_dims(axes = x_eps_axes_0, x = var_683_cast_fp16)[name = string("x_eps_cast_fp16")];
             tensor<int32, [1]> norm_x_axes_0 = const()[name = string("norm_x_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_cast_fp16 = reduce_l2_norm(axes = norm_x_axes_0, keep_dims = var_497, x = x_eps_cast_fp16)[name = string("norm_x_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_31_cast_fp16 = real_div(x = x_39_cast_fp16, y = norm_x_cast_fp16)[name = string("x_normed_31_cast_fp16")];
-            fp16 var_670_to_fp16 = const()[name = string("op_670_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_33_cast_fp16 = mul(x = x_normed_31_cast_fp16, y = var_670_to_fp16)[name = string("x_normed_33_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_cast_fp16 = reduce_l2_norm(axes = norm_x_axes_0, keep_dims = var_509, x = x_eps_cast_fp16)[name = string("norm_x_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_31_cast_fp16 = real_div(x = x_39_cast_fp16, y = norm_x_cast_fp16)[name = string("x_normed_31_cast_fp16")];
+            fp16 var_688_to_fp16 = const()[name = string("op_688_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_33_cast_fp16 = mul(x = x_normed_31_cast_fp16, y = var_688_to_fp16)[name = string("x_normed_33_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_norm_2_weight_to_fp16 = const()[name = string("blocks_2_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303813120)))];
-            tensor<fp16, [1, 4096, 1, 1]> input_19_cast_fp16 = mul(x = x_normed_33_cast_fp16, y = blocks_2_norm_2_weight_to_fp16)[name = string("input_19_cast_fp16")];
-            tensor<int32, [2]> var_682 = const()[name = string("op_682"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_684 = const()[name = string("op_684"), val = tensor<int32, [2]>([1, 1])];
-            string var_686_pad_type_0 = const()[name = string("op_686_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_686_pad_0 = const()[name = string("op_686_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_686_cast_fp16 = conv(dilations = var_684, groups = var_493, pad = var_686_pad_0, pad_type = var_686_pad_type_0, strides = var_682, weight = blocks_2_mlp_fc_1_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_686_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303821376)))];
-            tensor<fp16, [1, 11008, 1, 1]> input_21_cast_fp16 = mul(x = var_686_cast_fp16, y = blocks_2_mlp_fc_1_output_scales_to_fp16)[name = string("input_21_cast_fp16")];
-            tensor<int32, [2]> var_690 = const()[name = string("op_690"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_692 = const()[name = string("op_692"), val = tensor<int32, [2]>([1, 1])];
-            string var_694_pad_type_0 = const()[name = string("op_694_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_694_pad_0 = const()[name = string("op_694_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_694_cast_fp16 = conv(dilations = var_692, groups = var_493, pad = var_694_pad_0, pad_type = var_694_pad_type_0, strides = var_690, weight = blocks_2_mlp_fc_2_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_694_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303843456)))];
-            tensor<fp16, [1, 11008, 1, 1]> x_fc_2_cast_fp16 = mul(x = var_694_cast_fp16, y = blocks_2_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> var_696_cast_fp16 = silu(x = input_21_cast_fp16)[name = string("op_696_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> input_cast_fp16 = mul(x = var_696_cast_fp16, y = x_fc_2_cast_fp16)[name = string("input_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> input_19_cast_fp16 = mul(x = x_normed_33_cast_fp16, y = blocks_2_norm_2_weight_to_fp16)[name = string("input_19_cast_fp16")];
             tensor<int32, [2]> var_700 = const()[name = string("op_700"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_702 = const()[name = string("op_702"), val = tensor<int32, [2]>([1, 1])];
             string var_704_pad_type_0 = const()[name = string("op_704_pad_type_0"), val = string("custom")];
             tensor<int32, [4]> var_704_pad_0 = const()[name = string("op_704_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_704_cast_fp16 = conv(dilations = var_702, groups = var_493, pad = var_704_pad_0, pad_type = var_704_pad_type_0, strides = var_700, weight = blocks_2_mlp_proj_weight_palettized_cast_fp16, x = input_cast_fp16)[name = string("op_704_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> var_704_cast_fp16 = conv(dilations = var_702, groups = var_505, pad = var_704_pad_0, pad_type = var_704_pad_type_0, strides = var_700, weight = blocks_2_mlp_fc_1_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_704_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303821376)))];
+            tensor<fp16, [1, 11008, 1, 4]> input_21_cast_fp16 = mul(x = var_704_cast_fp16, y = blocks_2_mlp_fc_1_output_scales_to_fp16)[name = string("input_21_cast_fp16")];
+            tensor<int32, [2]> var_708 = const()[name = string("op_708"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_710 = const()[name = string("op_710"), val = tensor<int32, [2]>([1, 1])];
+            string var_712_pad_type_0 = const()[name = string("op_712_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_712_pad_0 = const()[name = string("op_712_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_712_cast_fp16 = conv(dilations = var_710, groups = var_505, pad = var_712_pad_0, pad_type = var_712_pad_type_0, strides = var_708, weight = blocks_2_mlp_fc_2_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_712_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303843456)))];
+            tensor<fp16, [1, 11008, 1, 4]> x_fc_2_cast_fp16 = mul(x = var_712_cast_fp16, y = blocks_2_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> var_714_cast_fp16 = silu(x = input_21_cast_fp16)[name = string("op_714_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> input_cast_fp16 = mul(x = var_714_cast_fp16, y = x_fc_2_cast_fp16)[name = string("input_cast_fp16")];
+            tensor<int32, [2]> var_718 = const()[name = string("op_718"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_720 = const()[name = string("op_720"), val = tensor<int32, [2]>([1, 1])];
+            string var_722_pad_type_0 = const()[name = string("op_722_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_722_pad_0 = const()[name = string("op_722_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_722_cast_fp16 = conv(dilations = var_720, groups = var_505, pad = var_722_pad_0, pad_type = var_722_pad_type_0, strides = var_718, weight = blocks_2_mlp_proj_weight_palettized_cast_fp16, x = input_cast_fp16)[name = string("op_722_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303865536)))];
-            tensor<fp16, [1, 4096, 1, 1]> var_705_cast_fp16 = mul(x = var_704_cast_fp16, y = blocks_2_mlp_proj_output_scales_to_fp16)[name = string("op_705_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> new_x = add(x = var_705_cast_fp16, y = x_39_cast_fp16)[name = string("op_706_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> var_723_cast_fp16 = mul(x = var_722_cast_fp16, y = blocks_2_mlp_proj_output_scales_to_fp16)[name = string("op_723_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> new_x = add(x = var_723_cast_fp16, y = x_39_cast_fp16)[name = string("op_724_cast_fp16")];
         } -> (new_x, new_k_cache_0, new_k_cache_1, new_k_cache_2, new_v_cache_0, new_v_cache_1, new_v_cache_2);
     func input_512_context_512<ios18>(tensor<fp16, [128, 512]> cos, tensor<fp16, [1, 1, 512, 512]> mask, tensor<fp16, [128, 512]> sin, tensor<fp16, [1, 4096, 1, 512]> x) {
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_q_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8388736))))[name = string("blocks_0_attn_q_proj_weight_palettized_cast_fp16")];
@@ -502,86 +514,86 @@ program(1.3)
             tensor<fp16, [1, 4096, 1, 512]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_54_to_fp16)[name = string("x_normed_3_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_1_weight_to_fp16 = const()[name = string("blocks_0_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303567936)))];
             tensor<fp16, [1, 4096, 1, 512]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = string("x_5_cast_fp16")];
-            tensor<int32, [2]> var_66 = const()[name = string("op_66"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_68 = const()[name = string("op_68"), val = tensor<int32, [2]>([1, 1])];
-            string var_70_pad_type_0 = const()[name = string("op_70_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_70_pad_0 = const()[name = string("op_70_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_70_cast_fp16 = conv(dilations = var_68, groups = var_25, pad = var_70_pad_0, pad_type = var_70_pad_type_0, strides = var_66, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_70_cast_fp16")];
+            tensor<int32, [2]> var_67 = const()[name = string("op_67"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_69 = const()[name = string("op_69"), val = tensor<int32, [2]>([1, 1])];
+            string var_71_pad_type_0 = const()[name = string("op_71_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_71_pad_0 = const()[name = string("op_71_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_71_cast_fp16 = conv(dilations = var_69, groups = var_25, pad = var_71_pad_0, pad_type = var_71_pad_type_0, strides = var_67, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_71_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303576192)))];
-            tensor<fp16, [1, 4096, 1, 512]> q_1_cast_fp16 = mul(x = var_70_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
-            tensor<int32, [2]> var_74 = const()[name = string("op_74"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_76 = const()[name = string("op_76"), val = tensor<int32, [2]>([1, 1])];
-            string var_78_pad_type_0 = const()[name = string("op_78_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_78_pad_0 = const()[name = string("op_78_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_78_cast_fp16 = conv(dilations = var_76, groups = var_25, pad = var_78_pad_0, pad_type = var_78_pad_type_0, strides = var_74, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_78_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> q_1_cast_fp16 = mul(x = var_71_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
+            tensor<int32, [2]> var_75 = const()[name = string("op_75"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_77 = const()[name = string("op_77"), val = tensor<int32, [2]>([1, 1])];
+            string var_79_pad_type_0 = const()[name = string("op_79_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_79_pad_0 = const()[name = string("op_79_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_79_cast_fp16 = conv(dilations = var_77, groups = var_25, pad = var_79_pad_0, pad_type = var_79_pad_type_0, strides = var_75, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_79_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303584448)))];
-            tensor<fp16, [1, 4096, 1, 512]> k_1_cast_fp16 = mul(x = var_78_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
-            tensor<int32, [2]> var_82 = const()[name = string("op_82"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_84 = const()[name = string("op_84"), val = tensor<int32, [2]>([1, 1])];
-            string var_86_pad_type_0 = const()[name = string("op_86_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_86_pad_0 = const()[name = string("op_86_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_86_cast_fp16 = conv(dilations = var_84, groups = var_25, pad = var_86_pad_0, pad_type = var_86_pad_type_0, strides = var_82, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_86_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> k_1_cast_fp16 = mul(x = var_79_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
+            tensor<int32, [2]> var_83 = const()[name = string("op_83"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_85 = const()[name = string("op_85"), val = tensor<int32, [2]>([1, 1])];
+            string var_87_pad_type_0 = const()[name = string("op_87_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_87_pad_0 = const()[name = string("op_87_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_87_cast_fp16 = conv(dilations = var_85, groups = var_25, pad = var_87_pad_0, pad_type = var_87_pad_type_0, strides = var_83, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_87_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303592704)))];
-            tensor<fp16, [1, 4096, 1, 512]> v_1_cast_fp16 = mul(x = var_86_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
-            tensor<int32, [4]> var_88 = const()[name = string("op_88"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> q_3_cast_fp16 = reshape(shape = var_88, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
-            tensor<int32, [4]> var_90 = const()[name = string("op_90"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> k_3_cast_fp16 = reshape(shape = var_90, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
-            tensor<int32, [4]> var_92 = const()[name = string("op_92"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> v_3_cast_fp16 = reshape(shape = var_92, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
-            tensor<int32, [4]> var_104_begin_0 = const()[name = string("op_104_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_104_end_0 = const()[name = string("op_104_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_104_end_mask_0 = const()[name = string("op_104_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_104_cast_fp16 = slice_by_index(begin = var_104_begin_0, end = var_104_end_0, end_mask = var_104_end_mask_0, x = q_3_cast_fp16)[name = string("op_104_cast_fp16")];
-            tensor<int32, [4]> var_110_begin_0 = const()[name = string("op_110_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_110_end_0 = const()[name = string("op_110_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_110_end_mask_0 = const()[name = string("op_110_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_110_cast_fp16 = slice_by_index(begin = var_110_begin_0, end = var_110_end_0, end_mask = var_110_end_mask_0, x = q_3_cast_fp16)[name = string("op_110_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> v_1_cast_fp16 = mul(x = var_87_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
+            tensor<int32, [4]> var_89 = const()[name = string("op_89"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> q_3_cast_fp16 = reshape(shape = var_89, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
+            tensor<int32, [4]> var_91 = const()[name = string("op_91"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> k_3_cast_fp16 = reshape(shape = var_91, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
+            tensor<int32, [4]> var_93 = const()[name = string("op_93"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> v_3_cast_fp16 = reshape(shape = var_93, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
+            tensor<int32, [4]> var_105_begin_0 = const()[name = string("op_105_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_105_end_0 = const()[name = string("op_105_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_105_end_mask_0 = const()[name = string("op_105_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_105_cast_fp16 = slice_by_index(begin = var_105_begin_0, end = var_105_end_0, end_mask = var_105_end_mask_0, x = q_3_cast_fp16)[name = string("op_105_cast_fp16")];
+            tensor<int32, [4]> var_111_begin_0 = const()[name = string("op_111_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_111_end_0 = const()[name = string("op_111_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_111_end_mask_0 = const()[name = string("op_111_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_111_cast_fp16 = slice_by_index(begin = var_111_begin_0, end = var_111_end_0, end_mask = var_111_end_mask_0, x = q_3_cast_fp16)[name = string("op_111_cast_fp16")];
             fp16 const_6_promoted_to_fp16 = const()[name = string("const_6_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_112_cast_fp16 = mul(x = var_110_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_112_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_113_cast_fp16 = mul(x = var_111_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_113_cast_fp16")];
             bool rotated_1_interleave_0 = const()[name = string("rotated_1_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_1_cast_fp16 = concat(axis = var_28, interleave = rotated_1_interleave_0, values = (var_112_cast_fp16, var_104_cast_fp16))[name = string("rotated_1_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_115_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_115_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_116_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_116_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_1_cast_fp16 = add(x = var_115_cast_fp16, y = var_116_cast_fp16)[name = string("roped_1_cast_fp16")];
-            tensor<int32, [4]> var_129_begin_0 = const()[name = string("op_129_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_129_end_0 = const()[name = string("op_129_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_129_end_mask_0 = const()[name = string("op_129_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_129_cast_fp16 = slice_by_index(begin = var_129_begin_0, end = var_129_end_0, end_mask = var_129_end_mask_0, x = k_3_cast_fp16)[name = string("op_129_cast_fp16")];
-            tensor<int32, [4]> var_135_begin_0 = const()[name = string("op_135_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_135_end_0 = const()[name = string("op_135_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_135_end_mask_0 = const()[name = string("op_135_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_135_cast_fp16 = slice_by_index(begin = var_135_begin_0, end = var_135_end_0, end_mask = var_135_end_mask_0, x = k_3_cast_fp16)[name = string("op_135_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_1_cast_fp16 = concat(axis = var_28, interleave = rotated_1_interleave_0, values = (var_113_cast_fp16, var_105_cast_fp16))[name = string("rotated_1_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_116_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_116_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_117_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_117_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_1_cast_fp16 = add(x = var_116_cast_fp16, y = var_117_cast_fp16)[name = string("roped_1_cast_fp16")];
+            tensor<int32, [4]> var_130_begin_0 = const()[name = string("op_130_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_130_end_0 = const()[name = string("op_130_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_130_end_mask_0 = const()[name = string("op_130_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_130_cast_fp16 = slice_by_index(begin = var_130_begin_0, end = var_130_end_0, end_mask = var_130_end_mask_0, x = k_3_cast_fp16)[name = string("op_130_cast_fp16")];
+            tensor<int32, [4]> var_136_begin_0 = const()[name = string("op_136_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_136_end_0 = const()[name = string("op_136_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_136_end_mask_0 = const()[name = string("op_136_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_136_cast_fp16 = slice_by_index(begin = var_136_begin_0, end = var_136_end_0, end_mask = var_136_end_mask_0, x = k_3_cast_fp16)[name = string("op_136_cast_fp16")];
             fp16 const_8_promoted_to_fp16 = const()[name = string("const_8_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_137_cast_fp16 = mul(x = var_135_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_137_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_138_cast_fp16 = mul(x = var_136_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_138_cast_fp16")];
             bool rotated_3_interleave_0 = const()[name = string("rotated_3_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_3_cast_fp16 = concat(axis = var_28, interleave = rotated_3_interleave_0, values = (var_137_cast_fp16, var_129_cast_fp16))[name = string("rotated_3_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_140_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_140_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_141_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_141_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_3_cast_fp16 = add(x = var_140_cast_fp16, y = var_141_cast_fp16)[name = string("roped_3_cast_fp16")];
-            bool q_5_interleave_0 = const()[name = string("q_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> q_5_cast_fp16 = concat(axis = var_28, interleave = q_5_interleave_0, values = roped_1_cast_fp16)[name = string("q_5_cast_fp16")];
-            bool k_5_interleave_0 = const()[name = string("k_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_5_cast_fp16 = concat(axis = var_28, interleave = k_5_interleave_0, values = roped_3_cast_fp16)[name = string("k_5_cast_fp16")];
-            tensor<int32, [4]> var_156_begin_0 = const()[name = string("op_156_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_156_end_0 = const()[name = string("op_156_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_156_end_mask_0 = const()[name = string("op_156_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_0 = slice_by_index(begin = var_156_begin_0, end = var_156_end_0, end_mask = var_156_end_mask_0, x = k_5_cast_fp16)[name = string("op_156_cast_fp16")];
-            tensor<int32, [4]> var_157_begin_0 = const()[name = string("op_157_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_157_end_0 = const()[name = string("op_157_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_157_end_mask_0 = const()[name = string("op_157_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_0 = slice_by_index(begin = var_157_begin_0, end = var_157_end_0, end_mask = var_157_end_mask_0, x = v_3_cast_fp16)[name = string("op_157_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_3_cast_fp16 = concat(axis = var_28, interleave = rotated_3_interleave_0, values = (var_138_cast_fp16, var_130_cast_fp16))[name = string("rotated_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_141_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_141_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_142_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_142_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_3_cast_fp16 = add(x = var_141_cast_fp16, y = var_142_cast_fp16)[name = string("roped_3_cast_fp16")];
+            tensor<int32, [4]> v_5_perm_0 = const()[name = string("v_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_146_begin_0 = const()[name = string("op_146_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_146_end_0 = const()[name = string("op_146_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_146_end_mask_0 = const()[name = string("op_146_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_0 = slice_by_index(begin = var_146_begin_0, end = var_146_end_0, end_mask = var_146_end_mask_0, x = roped_3_cast_fp16)[name = string("op_146_cast_fp16")];
+            tensor<int32, [4]> var_147_begin_0 = const()[name = string("op_147_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_147_end_0 = const()[name = string("op_147_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_147_end_mask_0 = const()[name = string("op_147_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 512, 128]> v_5_cast_fp16 = transpose(perm = v_5_perm_0, x = v_3_cast_fp16)[name = string("transpose_5")];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_0 = slice_by_index(begin = var_147_begin_0, end = var_147_end_0, end_mask = var_147_end_mask_0, x = v_5_cast_fp16)[name = string("op_147_cast_fp16")];
             fp16 var_161_to_fp16 = const()[name = string("op_161_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 512]> var_162_cast_fp16 = mul(x = q_5_cast_fp16, y = var_161_to_fp16)[name = string("op_162_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_162_cast_fp16 = mul(x = roped_1_cast_fp16, y = var_161_to_fp16)[name = string("op_162_cast_fp16")];
             bool attn_weights_1_transpose_x_0 = const()[name = string("attn_weights_1_transpose_x_0"), val = bool(true)];
             bool attn_weights_1_transpose_y_0 = const()[name = string("attn_weights_1_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_162_cast_fp16, y = k_5_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_162_cast_fp16, y = roped_3_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
             tensor<fp16, [1, 32, 512, 512]> attn_weights_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = mask)[name = string("attn_weights_3_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> var_170_cast_fp16 = softmax(axis = var_24, x = attn_weights_3_cast_fp16)[name = string("op_170_cast_fp16")];
-            bool attn_1_transpose_x_0 = const()[name = string("attn_1_transpose_x_0"), val = bool(false)];
-            bool attn_1_transpose_y_0 = const()[name = string("attn_1_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 512]> attn_1_cast_fp16 = matmul(transpose_x = attn_1_transpose_x_0, transpose_y = attn_1_transpose_y_0, x = v_3_cast_fp16, y = var_170_cast_fp16)[name = string("attn_1_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_5_cast_fp16 = softmax(axis = var_24, x = attn_weights_3_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
+            bool var_171_transpose_x_1 = const()[name = string("op_171_transpose_x_1"), val = bool(false)];
+            bool var_171_transpose_y_1 = const()[name = string("op_171_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 32, 512, 128]> var_171_cast_fp16 = matmul(transpose_x = var_171_transpose_x_1, transpose_y = var_171_transpose_y_1, x = attn_weights_5_cast_fp16, y = v_3_cast_fp16)[name = string("op_171_cast_fp16")];
+            tensor<int32, [4]> attn_1_perm_0 = const()[name = string("attn_1_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             tensor<int32, [4]> var_174 = const()[name = string("op_174"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 512]> attn_1_cast_fp16 = transpose(perm = attn_1_perm_0, x = var_171_cast_fp16)[name = string("transpose_4")];
             tensor<fp16, [1, 4096, 1, 512]> input_1_cast_fp16 = reshape(shape = var_174, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
             tensor<int32, [2]> var_178 = const()[name = string("op_178"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_180 = const()[name = string("op_180"), val = tensor<int32, [2]>([1, 1])];
@@ -645,86 +657,86 @@ program(1.3)
             tensor<fp16, [1, 4096, 1, 512]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_291_to_fp16)[name = string("x_normed_15_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_1_weight_to_fp16 = const()[name = string("blocks_1_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303669888)))];
             tensor<fp16, [1, 4096, 1, 512]> x_19_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = string("x_19_cast_fp16")];
-            tensor<int32, [2]> var_306 = const()[name = string("op_306"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_308 = const()[name = string("op_308"), val = tensor<int32, [2]>([1, 1])];
-            string var_310_pad_type_0 = const()[name = string("op_310_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_310_pad_0 = const()[name = string("op_310_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_310_cast_fp16 = conv(dilations = var_308, groups = var_263, pad = var_310_pad_0, pad_type = var_310_pad_type_0, strides = var_306, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_310_cast_fp16")];
+            tensor<int32, [2]> var_307 = const()[name = string("op_307"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_309 = const()[name = string("op_309"), val = tensor<int32, [2]>([1, 1])];
+            string var_311_pad_type_0 = const()[name = string("op_311_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_311_pad_0 = const()[name = string("op_311_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_311_cast_fp16 = conv(dilations = var_309, groups = var_263, pad = var_311_pad_0, pad_type = var_311_pad_type_0, strides = var_307, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_311_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303678144)))];
-            tensor<fp16, [1, 4096, 1, 512]> q_7_cast_fp16 = mul(x = var_310_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
-            tensor<int32, [2]> var_314 = const()[name = string("op_314"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_316 = const()[name = string("op_316"), val = tensor<int32, [2]>([1, 1])];
-            string var_318_pad_type_0 = const()[name = string("op_318_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_318_pad_0 = const()[name = string("op_318_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_318_cast_fp16 = conv(dilations = var_316, groups = var_263, pad = var_318_pad_0, pad_type = var_318_pad_type_0, strides = var_314, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_318_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> q_7_cast_fp16 = mul(x = var_311_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
+            tensor<int32, [2]> var_315 = const()[name = string("op_315"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_317 = const()[name = string("op_317"), val = tensor<int32, [2]>([1, 1])];
+            string var_319_pad_type_0 = const()[name = string("op_319_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_319_pad_0 = const()[name = string("op_319_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_319_cast_fp16 = conv(dilations = var_317, groups = var_263, pad = var_319_pad_0, pad_type = var_319_pad_type_0, strides = var_315, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_319_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303686400)))];
-            tensor<fp16, [1, 4096, 1, 512]> k_7_cast_fp16 = mul(x = var_318_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_7_cast_fp16")];
-            tensor<int32, [2]> var_322 = const()[name = string("op_322"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_324 = const()[name = string("op_324"), val = tensor<int32, [2]>([1, 1])];
-            string var_326_pad_type_0 = const()[name = string("op_326_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_326_pad_0 = const()[name = string("op_326_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_326_cast_fp16 = conv(dilations = var_324, groups = var_263, pad = var_326_pad_0, pad_type = var_326_pad_type_0, strides = var_322, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_326_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> k_7_cast_fp16 = mul(x = var_319_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_7_cast_fp16")];
+            tensor<int32, [2]> var_323 = const()[name = string("op_323"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_325 = const()[name = string("op_325"), val = tensor<int32, [2]>([1, 1])];
+            string var_327_pad_type_0 = const()[name = string("op_327_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_327_pad_0 = const()[name = string("op_327_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_327_cast_fp16 = conv(dilations = var_325, groups = var_263, pad = var_327_pad_0, pad_type = var_327_pad_type_0, strides = var_323, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_327_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303694656)))];
-            tensor<fp16, [1, 4096, 1, 512]> v_5_cast_fp16 = mul(x = var_326_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_5_cast_fp16")];
-            tensor<int32, [4]> var_328 = const()[name = string("op_328"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> q_9_cast_fp16 = reshape(shape = var_328, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
-            tensor<int32, [4]> var_330 = const()[name = string("op_330"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> k_9_cast_fp16 = reshape(shape = var_330, x = k_7_cast_fp16)[name = string("k_9_cast_fp16")];
-            tensor<int32, [4]> var_332 = const()[name = string("op_332"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> v_7_cast_fp16 = reshape(shape = var_332, x = v_5_cast_fp16)[name = string("v_7_cast_fp16")];
-            tensor<int32, [4]> var_344_begin_0 = const()[name = string("op_344_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_344_end_0 = const()[name = string("op_344_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_344_end_mask_0 = const()[name = string("op_344_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_344_cast_fp16 = slice_by_index(begin = var_344_begin_0, end = var_344_end_0, end_mask = var_344_end_mask_0, x = q_9_cast_fp16)[name = string("op_344_cast_fp16")];
-            tensor<int32, [4]> var_350_begin_0 = const()[name = string("op_350_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_350_end_0 = const()[name = string("op_350_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_350_end_mask_0 = const()[name = string("op_350_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_350_cast_fp16 = slice_by_index(begin = var_350_begin_0, end = var_350_end_0, end_mask = var_350_end_mask_0, x = q_9_cast_fp16)[name = string("op_350_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> v_7_cast_fp16 = mul(x = var_327_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_7_cast_fp16")];
+            tensor<int32, [4]> var_329 = const()[name = string("op_329"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> q_9_cast_fp16 = reshape(shape = var_329, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
+            tensor<int32, [4]> var_331 = const()[name = string("op_331"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> k_9_cast_fp16 = reshape(shape = var_331, x = k_7_cast_fp16)[name = string("k_9_cast_fp16")];
+            tensor<int32, [4]> var_333 = const()[name = string("op_333"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> v_9_cast_fp16 = reshape(shape = var_333, x = v_7_cast_fp16)[name = string("v_9_cast_fp16")];
+            tensor<int32, [4]> var_345_begin_0 = const()[name = string("op_345_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_345_end_0 = const()[name = string("op_345_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_345_end_mask_0 = const()[name = string("op_345_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_345_cast_fp16 = slice_by_index(begin = var_345_begin_0, end = var_345_end_0, end_mask = var_345_end_mask_0, x = q_9_cast_fp16)[name = string("op_345_cast_fp16")];
+            tensor<int32, [4]> var_351_begin_0 = const()[name = string("op_351_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_351_end_0 = const()[name = string("op_351_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_351_end_mask_0 = const()[name = string("op_351_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_351_cast_fp16 = slice_by_index(begin = var_351_begin_0, end = var_351_end_0, end_mask = var_351_end_mask_0, x = q_9_cast_fp16)[name = string("op_351_cast_fp16")];
             fp16 const_19_promoted_to_fp16 = const()[name = string("const_19_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_352_cast_fp16 = mul(x = var_350_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_352_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_353_cast_fp16 = mul(x = var_351_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_353_cast_fp16")];
             bool rotated_5_interleave_0 = const()[name = string("rotated_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_5_cast_fp16 = concat(axis = var_266, interleave = rotated_5_interleave_0, values = (var_352_cast_fp16, var_344_cast_fp16))[name = string("rotated_5_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_355_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_355_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_356_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_356_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_5_cast_fp16 = add(x = var_355_cast_fp16, y = var_356_cast_fp16)[name = string("roped_5_cast_fp16")];
-            tensor<int32, [4]> var_369_begin_0 = const()[name = string("op_369_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_369_end_0 = const()[name = string("op_369_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_369_end_mask_0 = const()[name = string("op_369_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_369_cast_fp16 = slice_by_index(begin = var_369_begin_0, end = var_369_end_0, end_mask = var_369_end_mask_0, x = k_9_cast_fp16)[name = string("op_369_cast_fp16")];
-            tensor<int32, [4]> var_375_begin_0 = const()[name = string("op_375_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_375_end_0 = const()[name = string("op_375_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_375_end_mask_0 = const()[name = string("op_375_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_375_cast_fp16 = slice_by_index(begin = var_375_begin_0, end = var_375_end_0, end_mask = var_375_end_mask_0, x = k_9_cast_fp16)[name = string("op_375_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_5_cast_fp16 = concat(axis = var_266, interleave = rotated_5_interleave_0, values = (var_353_cast_fp16, var_345_cast_fp16))[name = string("rotated_5_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_356_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_356_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_357_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_357_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_5_cast_fp16 = add(x = var_356_cast_fp16, y = var_357_cast_fp16)[name = string("roped_5_cast_fp16")];
+            tensor<int32, [4]> var_370_begin_0 = const()[name = string("op_370_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_370_end_0 = const()[name = string("op_370_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_370_end_mask_0 = const()[name = string("op_370_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_370_cast_fp16 = slice_by_index(begin = var_370_begin_0, end = var_370_end_0, end_mask = var_370_end_mask_0, x = k_9_cast_fp16)[name = string("op_370_cast_fp16")];
+            tensor<int32, [4]> var_376_begin_0 = const()[name = string("op_376_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_376_end_0 = const()[name = string("op_376_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_376_end_mask_0 = const()[name = string("op_376_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_376_cast_fp16 = slice_by_index(begin = var_376_begin_0, end = var_376_end_0, end_mask = var_376_end_mask_0, x = k_9_cast_fp16)[name = string("op_376_cast_fp16")];
             fp16 const_21_promoted_to_fp16 = const()[name = string("const_21_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_377_cast_fp16 = mul(x = var_375_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_377_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_378_cast_fp16 = mul(x = var_376_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_378_cast_fp16")];
             bool rotated_7_interleave_0 = const()[name = string("rotated_7_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_7_cast_fp16 = concat(axis = var_266, interleave = rotated_7_interleave_0, values = (var_377_cast_fp16, var_369_cast_fp16))[name = string("rotated_7_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_380_cast_fp16 = mul(x = k_9_cast_fp16, y = cos)[name = string("op_380_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_381_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = string("op_381_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_7_cast_fp16 = add(x = var_380_cast_fp16, y = var_381_cast_fp16)[name = string("roped_7_cast_fp16")];
-            bool q_11_interleave_0 = const()[name = string("q_11_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> q_11_cast_fp16 = concat(axis = var_266, interleave = q_11_interleave_0, values = roped_5_cast_fp16)[name = string("q_11_cast_fp16")];
-            bool k_11_interleave_0 = const()[name = string("k_11_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_11_cast_fp16 = concat(axis = var_266, interleave = k_11_interleave_0, values = roped_7_cast_fp16)[name = string("k_11_cast_fp16")];
-            tensor<int32, [4]> var_396_begin_0 = const()[name = string("op_396_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_396_end_0 = const()[name = string("op_396_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_396_end_mask_0 = const()[name = string("op_396_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_1 = slice_by_index(begin = var_396_begin_0, end = var_396_end_0, end_mask = var_396_end_mask_0, x = k_11_cast_fp16)[name = string("op_396_cast_fp16")];
-            tensor<int32, [4]> var_397_begin_0 = const()[name = string("op_397_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_397_end_0 = const()[name = string("op_397_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_397_end_mask_0 = const()[name = string("op_397_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_1 = slice_by_index(begin = var_397_begin_0, end = var_397_end_0, end_mask = var_397_end_mask_0, x = v_7_cast_fp16)[name = string("op_397_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_7_cast_fp16 = concat(axis = var_266, interleave = rotated_7_interleave_0, values = (var_378_cast_fp16, var_370_cast_fp16))[name = string("rotated_7_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_381_cast_fp16 = mul(x = k_9_cast_fp16, y = cos)[name = string("op_381_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_382_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = string("op_382_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_7_cast_fp16 = add(x = var_381_cast_fp16, y = var_382_cast_fp16)[name = string("roped_7_cast_fp16")];
+            tensor<int32, [4]> v_11_perm_0 = const()[name = string("v_11_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_386_begin_0 = const()[name = string("op_386_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_386_end_0 = const()[name = string("op_386_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_386_end_mask_0 = const()[name = string("op_386_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_1 = slice_by_index(begin = var_386_begin_0, end = var_386_end_0, end_mask = var_386_end_mask_0, x = roped_7_cast_fp16)[name = string("op_386_cast_fp16")];
+            tensor<int32, [4]> var_387_begin_0 = const()[name = string("op_387_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_387_end_0 = const()[name = string("op_387_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_387_end_mask_0 = const()[name = string("op_387_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 512, 128]> v_11_cast_fp16 = transpose(perm = v_11_perm_0, x = v_9_cast_fp16)[name = string("transpose_3")];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_1 = slice_by_index(begin = var_387_begin_0, end = var_387_end_0, end_mask = var_387_end_mask_0, x = v_11_cast_fp16)[name = string("op_387_cast_fp16")];
             fp16 var_401_to_fp16 = const()[name = string("op_401_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 512]> var_402_cast_fp16 = mul(x = q_11_cast_fp16, y = var_401_to_fp16)[name = string("op_402_cast_fp16")];
-            bool attn_weights_5_transpose_x_0 = const()[name = string("attn_weights_5_transpose_x_0"), val = bool(true)];
-            bool attn_weights_5_transpose_y_0 = const()[name = string("attn_weights_5_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_5_cast_fp16 = matmul(transpose_x = attn_weights_5_transpose_x_0, transpose_y = attn_weights_5_transpose_y_0, x = var_402_cast_fp16, y = k_11_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_7_cast_fp16 = add(x = attn_weights_5_cast_fp16, y = mask)[name = string("attn_weights_7_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> var_410_cast_fp16 = softmax(axis = var_262, x = attn_weights_7_cast_fp16)[name = string("op_410_cast_fp16")];
-            bool attn_3_transpose_x_0 = const()[name = string("attn_3_transpose_x_0"), val = bool(false)];
-            bool attn_3_transpose_y_0 = const()[name = string("attn_3_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 512]> attn_3_cast_fp16 = matmul(transpose_x = attn_3_transpose_x_0, transpose_y = attn_3_transpose_y_0, x = v_7_cast_fp16, y = var_410_cast_fp16)[name = string("attn_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_402_cast_fp16 = mul(x = roped_5_cast_fp16, y = var_401_to_fp16)[name = string("op_402_cast_fp16")];
+            bool attn_weights_7_transpose_x_0 = const()[name = string("attn_weights_7_transpose_x_0"), val = bool(true)];
+            bool attn_weights_7_transpose_y_0 = const()[name = string("attn_weights_7_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_7_cast_fp16 = matmul(transpose_x = attn_weights_7_transpose_x_0, transpose_y = attn_weights_7_transpose_y_0, x = var_402_cast_fp16, y = roped_7_cast_fp16)[name = string("attn_weights_7_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_9_cast_fp16 = add(x = attn_weights_7_cast_fp16, y = mask)[name = string("attn_weights_9_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_11_cast_fp16 = softmax(axis = var_262, x = attn_weights_9_cast_fp16)[name = string("attn_weights_11_cast_fp16")];
+            bool var_411_transpose_x_1 = const()[name = string("op_411_transpose_x_1"), val = bool(false)];
+            bool var_411_transpose_y_1 = const()[name = string("op_411_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 32, 512, 128]> var_411_cast_fp16 = matmul(transpose_x = var_411_transpose_x_1, transpose_y = var_411_transpose_y_1, x = attn_weights_11_cast_fp16, y = v_9_cast_fp16)[name = string("op_411_cast_fp16")];
+            tensor<int32, [4]> attn_3_perm_0 = const()[name = string("attn_3_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             tensor<int32, [4]> var_414 = const()[name = string("op_414"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 512]> attn_3_cast_fp16 = transpose(perm = attn_3_perm_0, x = var_411_cast_fp16)[name = string("transpose_2")];
             tensor<fp16, [1, 4096, 1, 512]> input_9_cast_fp16 = reshape(shape = var_414, x = attn_3_cast_fp16)[name = string("input_9_cast_fp16")];
             tensor<int32, [2]> var_418 = const()[name = string("op_418"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_420 = const()[name = string("op_420"), val = tensor<int32, [2]>([1, 1])];
@@ -788,86 +800,86 @@ program(1.3)
             tensor<fp16, [1, 4096, 1, 512]> x_normed_27_cast_fp16 = mul(x = x_normed_25_cast_fp16, y = var_531_to_fp16)[name = string("x_normed_27_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_norm_1_weight_to_fp16 = const()[name = string("blocks_2_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303771840)))];
             tensor<fp16, [1, 4096, 1, 512]> x_33_cast_fp16 = mul(x = x_normed_27_cast_fp16, y = blocks_2_norm_1_weight_to_fp16)[name = string("x_33_cast_fp16")];
-            tensor<int32, [2]> var_546 = const()[name = string("op_546"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_548 = const()[name = string("op_548"), val = tensor<int32, [2]>([1, 1])];
-            string var_550_pad_type_0 = const()[name = string("op_550_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_550_pad_0 = const()[name = string("op_550_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_550_cast_fp16 = conv(dilations = var_548, groups = var_503, pad = var_550_pad_0, pad_type = var_550_pad_type_0, strides = var_546, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_550_cast_fp16")];
+            tensor<int32, [2]> var_547 = const()[name = string("op_547"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_549 = const()[name = string("op_549"), val = tensor<int32, [2]>([1, 1])];
+            string var_551_pad_type_0 = const()[name = string("op_551_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_551_pad_0 = const()[name = string("op_551_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_551_cast_fp16 = conv(dilations = var_549, groups = var_503, pad = var_551_pad_0, pad_type = var_551_pad_type_0, strides = var_547, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_551_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303780096)))];
-            tensor<fp16, [1, 4096, 1, 512]> q_13_cast_fp16 = mul(x = var_550_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = string("q_13_cast_fp16")];
-            tensor<int32, [2]> var_554 = const()[name = string("op_554"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_556 = const()[name = string("op_556"), val = tensor<int32, [2]>([1, 1])];
-            string var_558_pad_type_0 = const()[name = string("op_558_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_558_pad_0 = const()[name = string("op_558_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_558_cast_fp16 = conv(dilations = var_556, groups = var_503, pad = var_558_pad_0, pad_type = var_558_pad_type_0, strides = var_554, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_558_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> q_13_cast_fp16 = mul(x = var_551_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = string("q_13_cast_fp16")];
+            tensor<int32, [2]> var_555 = const()[name = string("op_555"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_557 = const()[name = string("op_557"), val = tensor<int32, [2]>([1, 1])];
+            string var_559_pad_type_0 = const()[name = string("op_559_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_559_pad_0 = const()[name = string("op_559_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_559_cast_fp16 = conv(dilations = var_557, groups = var_503, pad = var_559_pad_0, pad_type = var_559_pad_type_0, strides = var_555, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_559_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303788352)))];
-            tensor<fp16, [1, 4096, 1, 512]> k_13_cast_fp16 = mul(x = var_558_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = string("k_13_cast_fp16")];
-            tensor<int32, [2]> var_562 = const()[name = string("op_562"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_564 = const()[name = string("op_564"), val = tensor<int32, [2]>([1, 1])];
-            string var_566_pad_type_0 = const()[name = string("op_566_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_566_pad_0 = const()[name = string("op_566_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_566_cast_fp16 = conv(dilations = var_564, groups = var_503, pad = var_566_pad_0, pad_type = var_566_pad_type_0, strides = var_562, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_566_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> k_13_cast_fp16 = mul(x = var_559_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = string("k_13_cast_fp16")];
+            tensor<int32, [2]> var_563 = const()[name = string("op_563"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_565 = const()[name = string("op_565"), val = tensor<int32, [2]>([1, 1])];
+            string var_567_pad_type_0 = const()[name = string("op_567_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_567_pad_0 = const()[name = string("op_567_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_567_cast_fp16 = conv(dilations = var_565, groups = var_503, pad = var_567_pad_0, pad_type = var_567_pad_type_0, strides = var_563, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_567_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303796608)))];
-            tensor<fp16, [1, 4096, 1, 512]> v_9_cast_fp16 = mul(x = var_566_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = string("v_9_cast_fp16")];
-            tensor<int32, [4]> var_568 = const()[name = string("op_568"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> q_15_cast_fp16 = reshape(shape = var_568, x = q_13_cast_fp16)[name = string("q_15_cast_fp16")];
-            tensor<int32, [4]> var_570 = const()[name = string("op_570"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = reshape(shape = var_570, x = k_13_cast_fp16)[name = string("k_15_cast_fp16")];
-            tensor<int32, [4]> var_572 = const()[name = string("op_572"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> v_cast_fp16 = reshape(shape = var_572, x = v_9_cast_fp16)[name = string("v_cast_fp16")];
-            tensor<int32, [4]> var_584_begin_0 = const()[name = string("op_584_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_584_end_0 = const()[name = string("op_584_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_584_end_mask_0 = const()[name = string("op_584_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_584_cast_fp16 = slice_by_index(begin = var_584_begin_0, end = var_584_end_0, end_mask = var_584_end_mask_0, x = q_15_cast_fp16)[name = string("op_584_cast_fp16")];
-            tensor<int32, [4]> var_590_begin_0 = const()[name = string("op_590_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_590_end_0 = const()[name = string("op_590_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_590_end_mask_0 = const()[name = string("op_590_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_590_cast_fp16 = slice_by_index(begin = var_590_begin_0, end = var_590_end_0, end_mask = var_590_end_mask_0, x = q_15_cast_fp16)[name = string("op_590_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> v_13_cast_fp16 = mul(x = var_567_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = string("v_13_cast_fp16")];
+            tensor<int32, [4]> var_569 = const()[name = string("op_569"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> q_15_cast_fp16 = reshape(shape = var_569, x = q_13_cast_fp16)[name = string("q_15_cast_fp16")];
+            tensor<int32, [4]> var_571 = const()[name = string("op_571"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = reshape(shape = var_571, x = k_13_cast_fp16)[name = string("k_15_cast_fp16")];
+            tensor<int32, [4]> var_573 = const()[name = string("op_573"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> v_15_cast_fp16 = reshape(shape = var_573, x = v_13_cast_fp16)[name = string("v_15_cast_fp16")];
+            tensor<int32, [4]> var_585_begin_0 = const()[name = string("op_585_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_585_end_0 = const()[name = string("op_585_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_585_end_mask_0 = const()[name = string("op_585_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_585_cast_fp16 = slice_by_index(begin = var_585_begin_0, end = var_585_end_0, end_mask = var_585_end_mask_0, x = q_15_cast_fp16)[name = string("op_585_cast_fp16")];
+            tensor<int32, [4]> var_591_begin_0 = const()[name = string("op_591_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_591_end_0 = const()[name = string("op_591_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_591_end_mask_0 = const()[name = string("op_591_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_591_cast_fp16 = slice_by_index(begin = var_591_begin_0, end = var_591_end_0, end_mask = var_591_end_mask_0, x = q_15_cast_fp16)[name = string("op_591_cast_fp16")];
             fp16 const_32_promoted_to_fp16 = const()[name = string("const_32_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_592_cast_fp16 = mul(x = var_590_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_592_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_593_cast_fp16 = mul(x = var_591_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_593_cast_fp16")];
             bool rotated_9_interleave_0 = const()[name = string("rotated_9_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_9_cast_fp16 = concat(axis = var_506, interleave = rotated_9_interleave_0, values = (var_592_cast_fp16, var_584_cast_fp16))[name = string("rotated_9_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_595_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = string("op_595_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_596_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = string("op_596_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_9_cast_fp16 = add(x = var_595_cast_fp16, y = var_596_cast_fp16)[name = string("roped_9_cast_fp16")];
-            tensor<int32, [4]> var_609_begin_0 = const()[name = string("op_609_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_609_end_0 = const()[name = string("op_609_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_609_end_mask_0 = const()[name = string("op_609_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_609_cast_fp16 = slice_by_index(begin = var_609_begin_0, end = var_609_end_0, end_mask = var_609_end_mask_0, x = k_15_cast_fp16)[name = string("op_609_cast_fp16")];
-            tensor<int32, [4]> var_615_begin_0 = const()[name = string("op_615_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_615_end_0 = const()[name = string("op_615_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_615_end_mask_0 = const()[name = string("op_615_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_615_cast_fp16 = slice_by_index(begin = var_615_begin_0, end = var_615_end_0, end_mask = var_615_end_mask_0, x = k_15_cast_fp16)[name = string("op_615_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_9_cast_fp16 = concat(axis = var_506, interleave = rotated_9_interleave_0, values = (var_593_cast_fp16, var_585_cast_fp16))[name = string("rotated_9_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_596_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = string("op_596_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_597_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = string("op_597_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_9_cast_fp16 = add(x = var_596_cast_fp16, y = var_597_cast_fp16)[name = string("roped_9_cast_fp16")];
+            tensor<int32, [4]> var_610_begin_0 = const()[name = string("op_610_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_610_end_0 = const()[name = string("op_610_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_610_end_mask_0 = const()[name = string("op_610_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_610_cast_fp16 = slice_by_index(begin = var_610_begin_0, end = var_610_end_0, end_mask = var_610_end_mask_0, x = k_15_cast_fp16)[name = string("op_610_cast_fp16")];
+            tensor<int32, [4]> var_616_begin_0 = const()[name = string("op_616_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_616_end_0 = const()[name = string("op_616_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_616_end_mask_0 = const()[name = string("op_616_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_616_cast_fp16 = slice_by_index(begin = var_616_begin_0, end = var_616_end_0, end_mask = var_616_end_mask_0, x = k_15_cast_fp16)[name = string("op_616_cast_fp16")];
             fp16 const_34_promoted_to_fp16 = const()[name = string("const_34_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_617_cast_fp16 = mul(x = var_615_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_617_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_618_cast_fp16 = mul(x = var_616_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_618_cast_fp16")];
             bool rotated_interleave_0 = const()[name = string("rotated_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_cast_fp16 = concat(axis = var_506, interleave = rotated_interleave_0, values = (var_617_cast_fp16, var_609_cast_fp16))[name = string("rotated_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_620_cast_fp16 = mul(x = k_15_cast_fp16, y = cos)[name = string("op_620_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_621_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_621_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_cast_fp16 = add(x = var_620_cast_fp16, y = var_621_cast_fp16)[name = string("roped_cast_fp16")];
-            bool q_interleave_0 = const()[name = string("q_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> q_cast_fp16 = concat(axis = var_506, interleave = q_interleave_0, values = roped_9_cast_fp16)[name = string("q_cast_fp16")];
-            bool k_interleave_0 = const()[name = string("k_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_cast_fp16 = concat(axis = var_506, interleave = k_interleave_0, values = roped_cast_fp16)[name = string("k_cast_fp16")];
-            tensor<int32, [4]> var_636_begin_0 = const()[name = string("op_636_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_636_end_0 = const()[name = string("op_636_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_636_end_mask_0 = const()[name = string("op_636_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_2 = slice_by_index(begin = var_636_begin_0, end = var_636_end_0, end_mask = var_636_end_mask_0, x = k_cast_fp16)[name = string("op_636_cast_fp16")];
-            tensor<int32, [4]> var_637_begin_0 = const()[name = string("op_637_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_637_end_0 = const()[name = string("op_637_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_637_end_mask_0 = const()[name = string("op_637_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_2 = slice_by_index(begin = var_637_begin_0, end = var_637_end_0, end_mask = var_637_end_mask_0, x = v_cast_fp16)[name = string("op_637_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_cast_fp16 = concat(axis = var_506, interleave = rotated_interleave_0, values = (var_618_cast_fp16, var_610_cast_fp16))[name = string("rotated_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_621_cast_fp16 = mul(x = k_15_cast_fp16, y = cos)[name = string("op_621_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_622_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_622_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_cast_fp16 = add(x = var_621_cast_fp16, y = var_622_cast_fp16)[name = string("roped_cast_fp16")];
+            tensor<int32, [4]> v_perm_0 = const()[name = string("v_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_626_begin_0 = const()[name = string("op_626_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_626_end_0 = const()[name = string("op_626_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_626_end_mask_0 = const()[name = string("op_626_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_2 = slice_by_index(begin = var_626_begin_0, end = var_626_end_0, end_mask = var_626_end_mask_0, x = roped_cast_fp16)[name = string("op_626_cast_fp16")];
+            tensor<int32, [4]> var_627_begin_0 = const()[name = string("op_627_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_627_end_0 = const()[name = string("op_627_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_627_end_mask_0 = const()[name = string("op_627_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 512, 128]> v_cast_fp16 = transpose(perm = v_perm_0, x = v_15_cast_fp16)[name = string("transpose_1")];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_2 = slice_by_index(begin = var_627_begin_0, end = var_627_end_0, end_mask = var_627_end_mask_0, x = v_cast_fp16)[name = string("op_627_cast_fp16")];
             fp16 var_641_to_fp16 = const()[name = string("op_641_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 512]> var_642_cast_fp16 = mul(x = q_cast_fp16, y = var_641_to_fp16)[name = string("op_642_cast_fp16")];
-            bool attn_weights_9_transpose_x_0 = const()[name = string("attn_weights_9_transpose_x_0"), val = bool(true)];
-            bool attn_weights_9_transpose_y_0 = const()[name = string("attn_weights_9_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_9_cast_fp16 = matmul(transpose_x = attn_weights_9_transpose_x_0, transpose_y = attn_weights_9_transpose_y_0, x = var_642_cast_fp16, y = k_cast_fp16)[name = string("attn_weights_9_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_cast_fp16 = add(x = attn_weights_9_cast_fp16, y = mask)[name = string("attn_weights_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> var_650_cast_fp16 = softmax(axis = var_502, x = attn_weights_cast_fp16)[name = string("op_650_cast_fp16")];
-            bool attn_5_transpose_x_0 = const()[name = string("attn_5_transpose_x_0"), val = bool(false)];
-            bool attn_5_transpose_y_0 = const()[name = string("attn_5_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 512]> attn_5_cast_fp16 = matmul(transpose_x = attn_5_transpose_x_0, transpose_y = attn_5_transpose_y_0, x = v_cast_fp16, y = var_650_cast_fp16)[name = string("attn_5_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_642_cast_fp16 = mul(x = roped_9_cast_fp16, y = var_641_to_fp16)[name = string("op_642_cast_fp16")];
+            bool attn_weights_13_transpose_x_0 = const()[name = string("attn_weights_13_transpose_x_0"), val = bool(true)];
+            bool attn_weights_13_transpose_y_0 = const()[name = string("attn_weights_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_13_cast_fp16 = matmul(transpose_x = attn_weights_13_transpose_x_0, transpose_y = attn_weights_13_transpose_y_0, x = var_642_cast_fp16, y = roped_cast_fp16)[name = string("attn_weights_13_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_15_cast_fp16 = add(x = attn_weights_13_cast_fp16, y = mask)[name = string("attn_weights_15_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_cast_fp16 = softmax(axis = var_502, x = attn_weights_15_cast_fp16)[name = string("attn_weights_cast_fp16")];
+            bool var_651_transpose_x_1 = const()[name = string("op_651_transpose_x_1"), val = bool(false)];
+            bool var_651_transpose_y_1 = const()[name = string("op_651_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 32, 512, 128]> var_651_cast_fp16 = matmul(transpose_x = var_651_transpose_x_1, transpose_y = var_651_transpose_y_1, x = attn_weights_cast_fp16, y = v_15_cast_fp16)[name = string("op_651_cast_fp16")];
+            tensor<int32, [4]> attn_5_perm_0 = const()[name = string("attn_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             tensor<int32, [4]> var_654 = const()[name = string("op_654"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 512]> attn_5_cast_fp16 = transpose(perm = attn_5_perm_0, x = var_651_cast_fp16)[name = string("transpose_0")];
             tensor<fp16, [1, 4096, 1, 512]> input_17_cast_fp16 = reshape(shape = var_654, x = attn_5_cast_fp16)[name = string("input_17_cast_fp16")];
             tensor<int32, [2]> var_658 = const()[name = string("op_658"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_660 = const()[name = string("op_660"), val = tensor<int32, [2]>([1, 1])];
diff --git a/sequoia/Llama-2-7b-hf_chunk3.mlmodelc/analytics/coremldata.bin b/sequoia/Llama-2-7b-hf_chunk3.mlmodelc/analytics/coremldata.bin
index e3704b470dad34135a6b5cb4b471021bad5c3ae2..65ae082f31459bf8b09913d8861a15601cc13ad1 100644
--- a/sequoia/Llama-2-7b-hf_chunk3.mlmodelc/analytics/coremldata.bin
+++ b/sequoia/Llama-2-7b-hf_chunk3.mlmodelc/analytics/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:84e317a82cdf4e96f808f63e77f10098844d47ad522545181edfac4d287c9c92
+oid sha256:e69e7ad37dd59e97348d395eec9b4c41b7d3ea44d86f613751ae47803a0a2efe
 size 243
diff --git a/sequoia/Llama-2-7b-hf_chunk3.mlmodelc/coremldata.bin b/sequoia/Llama-2-7b-hf_chunk3.mlmodelc/coremldata.bin
index 8bbc6dd38c74fe23594355e106f197518d41765b..a503721fa97147a06f91e834353053693378b0ad 100644
--- a/sequoia/Llama-2-7b-hf_chunk3.mlmodelc/coremldata.bin
+++ b/sequoia/Llama-2-7b-hf_chunk3.mlmodelc/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e430d0795ff5c384187174f5718a2c13d0070f5d6a811831e18862497865a86d
+oid sha256:d35a0353bcfa501579e07af3718261af3b129b4bec004c1fe6d812a6403a3f5b
 size 1037
diff --git a/sequoia/Llama-2-7b-hf_chunk3.mlmodelc/metadata.json b/sequoia/Llama-2-7b-hf_chunk3.mlmodelc/metadata.json
index a00ed61aae1b0f3f07b32505d2fe4a01ad146f0e..f9ae2cf0c6299d4ec971d4326762c70a4b255665 100644
--- a/sequoia/Llama-2-7b-hf_chunk3.mlmodelc/metadata.json
+++ b/sequoia/Llama-2-7b-hf_chunk3.mlmodelc/metadata.json
@@ -17,9 +17,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 128, 508]",
         "name" : "new_k_cache_0",
         "type" : "MultiArray"
       },
@@ -27,9 +27,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 128, 508]",
         "name" : "new_k_cache_1",
         "type" : "MultiArray"
       },
@@ -37,9 +37,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 128, 508]",
         "name" : "new_k_cache_2",
         "type" : "MultiArray"
       },
@@ -47,9 +47,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 508, 128]",
         "name" : "new_v_cache_0",
         "type" : "MultiArray"
       },
@@ -57,9 +57,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 508, 128]",
         "name" : "new_v_cache_1",
         "type" : "MultiArray"
       },
@@ -67,9 +67,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 508, 128]",
         "name" : "new_v_cache_2",
         "type" : "MultiArray"
       }
@@ -142,9 +142,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_0",
             "type" : "MultiArray"
           },
@@ -152,9 +152,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_1",
             "type" : "MultiArray"
           },
@@ -162,9 +162,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_2",
             "type" : "MultiArray"
           },
@@ -172,9 +172,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_0",
             "type" : "MultiArray"
           },
@@ -182,9 +182,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_1",
             "type" : "MultiArray"
           },
@@ -192,9 +192,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_2",
             "type" : "MultiArray"
           }
@@ -203,14 +203,15 @@
         "mlProgramOperationTypeHistogram" : {
           "Ios18.constexprLutToDense" : 21,
           "Ios18.conv" : 21,
-          "Ios18.matmul" : 6,
           "Ios18.expandDims" : 6,
-          "Ios18.concat" : 18,
+          "Ios18.matmul" : 6,
+          "Ios18.concat" : 12,
           "Ios18.add" : 15,
           "Ios18.realDiv" : 6,
           "Ios18.silu" : 3,
           "Ios18.softmax" : 3,
           "Ios18.sliceByIndex" : 18,
+          "Ios18.transpose" : 6,
           "Ios16.reduceL2Norm" : 6,
           "Ios18.squeeze" : 6,
           "Ios18.reshape" : 12,
@@ -223,9 +224,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 1)",
+            "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 4)",
             "shortDescription" : "",
-            "shape" : "[1, 4096, 1, 1]",
+            "shape" : "[1, 4096, 1, 4]",
             "name" : "x",
             "type" : "MultiArray"
           },
@@ -233,9 +234,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 128 × 1)",
+            "formattedType" : "MultiArray (Float16 128 × 4)",
             "shortDescription" : "",
-            "shape" : "[128, 1]",
+            "shape" : "[128, 4]",
             "name" : "cos",
             "type" : "MultiArray"
           },
@@ -243,9 +244,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 128 × 1)",
+            "formattedType" : "MultiArray (Float16 128 × 4)",
             "shortDescription" : "",
-            "shape" : "[128, 1]",
+            "shape" : "[128, 4]",
             "name" : "sin",
             "type" : "MultiArray"
           },
@@ -253,9 +254,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 1 × 1 × 512)",
+            "formattedType" : "MultiArray (Float16 1 × 1 × 4 × 512)",
             "shortDescription" : "",
-            "shape" : "[1, 1, 1, 512]",
+            "shape" : "[1, 1, 4, 512]",
             "name" : "mask",
             "type" : "MultiArray"
           },
@@ -263,9 +264,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "k_cache_0",
             "type" : "MultiArray"
           },
@@ -273,9 +274,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "v_cache_0",
             "type" : "MultiArray"
           },
@@ -283,9 +284,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "k_cache_1",
             "type" : "MultiArray"
           },
@@ -293,9 +294,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "v_cache_1",
             "type" : "MultiArray"
           },
@@ -303,9 +304,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "k_cache_2",
             "type" : "MultiArray"
           },
@@ -313,9 +314,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "v_cache_2",
             "type" : "MultiArray"
           }
@@ -330,9 +331,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 1)",
+            "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 4)",
             "shortDescription" : "",
-            "shape" : "[1, 4096, 1, 1]",
+            "shape" : "[1, 4096, 1, 4]",
             "name" : "new_x",
             "type" : "MultiArray"
           },
@@ -340,9 +341,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_0",
             "type" : "MultiArray"
           },
@@ -350,9 +351,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_1",
             "type" : "MultiArray"
           },
@@ -360,9 +361,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_2",
             "type" : "MultiArray"
           },
@@ -370,9 +371,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_0",
             "type" : "MultiArray"
           },
@@ -380,9 +381,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_1",
             "type" : "MultiArray"
           },
@@ -390,9 +391,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_2",
             "type" : "MultiArray"
           }
@@ -401,14 +402,15 @@
         "mlProgramOperationTypeHistogram" : {
           "Ios18.constexprLutToDense" : 21,
           "Ios18.conv" : 21,
-          "Ios18.matmul" : 6,
           "Ios18.expandDims" : 6,
+          "Ios18.matmul" : 6,
           "Ios18.concat" : 18,
           "Ios18.add" : 15,
           "Ios18.realDiv" : 6,
           "Ios18.silu" : 3,
           "Ios18.softmax" : 3,
           "Ios18.sliceByIndex" : 18,
+          "Ios18.transpose" : 6,
           "Ios16.reduceL2Norm" : 6,
           "Ios18.squeeze" : 6,
           "Ios18.reshape" : 12,
@@ -419,14 +421,15 @@
     "mlProgramOperationTypeHistogram" : {
       "Ios18.constexprLutToDense" : 21,
       "Ios18.conv" : 21,
-      "Ios18.matmul" : 6,
       "Ios18.expandDims" : 6,
-      "Ios18.concat" : 18,
+      "Ios18.matmul" : 6,
+      "Ios18.concat" : 12,
       "Ios18.add" : 15,
       "Ios18.realDiv" : 6,
       "Ios18.silu" : 3,
       "Ios18.softmax" : 3,
       "Ios18.sliceByIndex" : 18,
+      "Ios18.transpose" : 6,
       "Ios16.reduceL2Norm" : 6,
       "Ios18.squeeze" : 6,
       "Ios18.reshape" : 12,
@@ -491,7 +494,7 @@
       }
     ],
     "defaultFunctionName" : "input_512_context_512",
-    "generatedClassName" : "Llama_2_7b_hf_2024_07_02_20_36_17_merged_chunk3",
+    "generatedClassName" : "Llama_2_7b_hf_2024_07_17_19_34_17_merged_chunk3",
     "userDefinedMetadata" : {
 
     },
diff --git a/sequoia/Llama-2-7b-hf_chunk3.mlmodelc/model.mil b/sequoia/Llama-2-7b-hf_chunk3.mlmodelc/model.mil
index 5b7b1db6b14fe10ff51aaa601d8484d9ff49576b..85f75a6da9054155e32013d96460963c79fba3f8 100644
--- a/sequoia/Llama-2-7b-hf_chunk3.mlmodelc/model.mil
+++ b/sequoia/Llama-2-7b-hf_chunk3.mlmodelc/model.mil
@@ -1,7 +1,7 @@
 program(1.3)
-[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3400.34.1"}, {"coremlc-version", "3400.42.1"}})]
+[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3400.42.1"}, {"coremlc-version", "3400.51.1"}})]
 {
-    func input_1_context_512<ios18>(tensor<fp16, [128, 1]> cos, tensor<fp16, [1, 32, 128, 511]> k_cache_0, tensor<fp16, [1, 32, 128, 511]> k_cache_1, tensor<fp16, [1, 32, 128, 511]> k_cache_2, tensor<fp16, [1, 1, 1, 512]> mask, tensor<fp16, [128, 1]> sin, tensor<fp16, [1, 32, 128, 511]> v_cache_0, tensor<fp16, [1, 32, 128, 511]> v_cache_1, tensor<fp16, [1, 32, 128, 511]> v_cache_2, tensor<fp16, [1, 4096, 1, 1]> x) [CoreML_InputDefaultValues = dict<string, fp32>({{"k_cache_0", 0}, {"k_cache_1", 0}, {"k_cache_2", 0}, {"v_cache_0", 0}, {"v_cache_1", 0}, {"v_cache_2", 0}})] {
+    func input_1_context_512<ios18>(tensor<fp16, [128, 4]> cos, tensor<fp16, [1, 32, 128, 508]> k_cache_0, tensor<fp16, [1, 32, 128, 508]> k_cache_1, tensor<fp16, [1, 32, 128, 508]> k_cache_2, tensor<fp16, [1, 1, 4, 512]> mask, tensor<fp16, [128, 4]> sin, tensor<fp16, [1, 32, 508, 128]> v_cache_0, tensor<fp16, [1, 32, 508, 128]> v_cache_1, tensor<fp16, [1, 32, 508, 128]> v_cache_2, tensor<fp16, [1, 4096, 1, 4]> x) [CoreML_InputDefaultValues = dict<string, fp32>({{"k_cache_0", 0}, {"k_cache_1", 0}, {"k_cache_2", 0}, {"v_cache_0", 0}, {"v_cache_1", 0}, {"v_cache_2", 0}})] {
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_q_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303873792))))[name = string("blocks_0_attn_q_proj_weight_palettized_cast_fp16")];
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_k_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8388864))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303873920))))[name = string("blocks_0_attn_k_proj_weight_palettized_cast_fp16")];
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_v_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(16777664))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303874048))))[name = string("blocks_0_attn_v_proj_weight_palettized_cast_fp16")];
@@ -29,438 +29,450 @@ program(1.3)
             int32 var_34 = const()[name = string("op_34"), val = int32(-2)];
             bool var_35 = const()[name = string("op_35"), val = bool(true)];
             tensor<int32, [1]> var_53_axes_0 = const()[name = string("op_53_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_53_cast_fp16 = squeeze(axes = var_53_axes_0, x = x)[name = string("op_53_cast_fp16")];
+            tensor<fp16, [1, 4096, 4]> var_53_cast_fp16 = squeeze(axes = var_53_axes_0, x = x)[name = string("op_53_cast_fp16")];
             bool var_55_interleave_0 = const()[name = string("op_55_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_1_to_fp16 = const()[name = string("eps_chan_1_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_55_cast_fp16 = concat(axis = var_31, interleave = var_55_interleave_0, values = (var_53_cast_fp16, eps_chan_1_to_fp16))[name = string("op_55_cast_fp16")];
+            tensor<fp16, [1, 1, 4]> eps_chan_1_to_fp16 = const()[name = string("eps_chan_1_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_55_cast_fp16 = concat(axis = var_31, interleave = var_55_interleave_0, values = (var_53_cast_fp16, eps_chan_1_to_fp16))[name = string("op_55_cast_fp16")];
             tensor<int32, [1]> x_eps_1_axes_0 = const()[name = string("x_eps_1_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_1_cast_fp16 = expand_dims(axes = x_eps_1_axes_0, x = var_55_cast_fp16)[name = string("x_eps_1_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_1_cast_fp16 = expand_dims(axes = x_eps_1_axes_0, x = var_55_cast_fp16)[name = string("x_eps_1_cast_fp16")];
             tensor<int32, [1]> norm_x_1_axes_0 = const()[name = string("norm_x_1_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_1_cast_fp16 = reduce_l2_norm(axes = norm_x_1_axes_0, keep_dims = var_35, x = x_eps_1_cast_fp16)[name = string("norm_x_1_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_1_cast_fp16 = real_div(x = x, y = norm_x_1_cast_fp16)[name = string("x_normed_1_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_1_cast_fp16 = reduce_l2_norm(axes = norm_x_1_axes_0, keep_dims = var_35, x = x_eps_1_cast_fp16)[name = string("norm_x_1_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_1_cast_fp16 = real_div(x = x, y = norm_x_1_cast_fp16)[name = string("x_normed_1_cast_fp16")];
             fp16 var_60_to_fp16 = const()[name = string("op_60_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_60_to_fp16)[name = string("x_normed_3_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_60_to_fp16)[name = string("x_normed_3_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_1_weight_to_fp16 = const()[name = string("blocks_0_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303567936)))];
-            tensor<fp16, [1, 4096, 1, 1]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = string("x_5_cast_fp16")];
-            tensor<int32, [2]> var_72 = const()[name = string("op_72"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_74 = const()[name = string("op_74"), val = tensor<int32, [2]>([1, 1])];
-            string var_76_pad_type_0 = const()[name = string("op_76_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_76_pad_0 = const()[name = string("op_76_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_76_cast_fp16 = conv(dilations = var_74, groups = var_31, pad = var_76_pad_0, pad_type = var_76_pad_type_0, strides = var_72, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_76_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = string("x_5_cast_fp16")];
+            tensor<int32, [2]> var_73 = const()[name = string("op_73"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_75 = const()[name = string("op_75"), val = tensor<int32, [2]>([1, 1])];
+            string var_77_pad_type_0 = const()[name = string("op_77_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_77_pad_0 = const()[name = string("op_77_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_77_cast_fp16 = conv(dilations = var_75, groups = var_31, pad = var_77_pad_0, pad_type = var_77_pad_type_0, strides = var_73, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_77_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303576192)))];
-            tensor<fp16, [1, 4096, 1, 1]> q_1_cast_fp16 = mul(x = var_76_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
-            tensor<int32, [2]> var_80 = const()[name = string("op_80"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_82 = const()[name = string("op_82"), val = tensor<int32, [2]>([1, 1])];
-            string var_84_pad_type_0 = const()[name = string("op_84_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_84_pad_0 = const()[name = string("op_84_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_84_cast_fp16 = conv(dilations = var_82, groups = var_31, pad = var_84_pad_0, pad_type = var_84_pad_type_0, strides = var_80, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_84_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> q_1_cast_fp16 = mul(x = var_77_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
+            tensor<int32, [2]> var_81 = const()[name = string("op_81"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_83 = const()[name = string("op_83"), val = tensor<int32, [2]>([1, 1])];
+            string var_85_pad_type_0 = const()[name = string("op_85_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_85_pad_0 = const()[name = string("op_85_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_85_cast_fp16 = conv(dilations = var_83, groups = var_31, pad = var_85_pad_0, pad_type = var_85_pad_type_0, strides = var_81, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_85_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303584448)))];
-            tensor<fp16, [1, 4096, 1, 1]> k_1_cast_fp16 = mul(x = var_84_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
-            tensor<int32, [2]> var_88 = const()[name = string("op_88"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_90 = const()[name = string("op_90"), val = tensor<int32, [2]>([1, 1])];
-            string var_92_pad_type_0 = const()[name = string("op_92_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_92_pad_0 = const()[name = string("op_92_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_92_cast_fp16 = conv(dilations = var_90, groups = var_31, pad = var_92_pad_0, pad_type = var_92_pad_type_0, strides = var_88, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_92_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> k_1_cast_fp16 = mul(x = var_85_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
+            tensor<int32, [2]> var_89 = const()[name = string("op_89"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_91 = const()[name = string("op_91"), val = tensor<int32, [2]>([1, 1])];
+            string var_93_pad_type_0 = const()[name = string("op_93_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_93_pad_0 = const()[name = string("op_93_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_93_cast_fp16 = conv(dilations = var_91, groups = var_31, pad = var_93_pad_0, pad_type = var_93_pad_type_0, strides = var_89, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_93_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303592704)))];
-            tensor<fp16, [1, 4096, 1, 1]> v_1_cast_fp16 = mul(x = var_92_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
-            tensor<int32, [4]> var_94 = const()[name = string("op_94"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> q_3_cast_fp16 = reshape(shape = var_94, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
-            tensor<int32, [4]> var_96 = const()[name = string("op_96"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> k_3_cast_fp16 = reshape(shape = var_96, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
-            tensor<int32, [4]> var_98 = const()[name = string("op_98"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> v_3_cast_fp16 = reshape(shape = var_98, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
-            tensor<int32, [4]> var_110_begin_0 = const()[name = string("op_110_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_110_end_0 = const()[name = string("op_110_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_110_end_mask_0 = const()[name = string("op_110_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_110_cast_fp16 = slice_by_index(begin = var_110_begin_0, end = var_110_end_0, end_mask = var_110_end_mask_0, x = q_3_cast_fp16)[name = string("op_110_cast_fp16")];
-            tensor<int32, [4]> var_116_begin_0 = const()[name = string("op_116_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_116_end_0 = const()[name = string("op_116_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_116_end_mask_0 = const()[name = string("op_116_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_116_cast_fp16 = slice_by_index(begin = var_116_begin_0, end = var_116_end_0, end_mask = var_116_end_mask_0, x = q_3_cast_fp16)[name = string("op_116_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> v_1_cast_fp16 = mul(x = var_93_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
+            tensor<int32, [4]> var_95 = const()[name = string("op_95"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> q_3_cast_fp16 = reshape(shape = var_95, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
+            tensor<int32, [4]> var_97 = const()[name = string("op_97"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> k_3_cast_fp16 = reshape(shape = var_97, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
+            tensor<int32, [4]> var_99 = const()[name = string("op_99"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> v_3_cast_fp16 = reshape(shape = var_99, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
+            tensor<int32, [4]> var_111_begin_0 = const()[name = string("op_111_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_111_end_0 = const()[name = string("op_111_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_111_end_mask_0 = const()[name = string("op_111_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_111_cast_fp16 = slice_by_index(begin = var_111_begin_0, end = var_111_end_0, end_mask = var_111_end_mask_0, x = q_3_cast_fp16)[name = string("op_111_cast_fp16")];
+            tensor<int32, [4]> var_117_begin_0 = const()[name = string("op_117_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_117_end_0 = const()[name = string("op_117_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_117_end_mask_0 = const()[name = string("op_117_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_117_cast_fp16 = slice_by_index(begin = var_117_begin_0, end = var_117_end_0, end_mask = var_117_end_mask_0, x = q_3_cast_fp16)[name = string("op_117_cast_fp16")];
             fp16 const_6_promoted_to_fp16 = const()[name = string("const_6_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_118_cast_fp16 = mul(x = var_116_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_118_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_119_cast_fp16 = mul(x = var_117_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_119_cast_fp16")];
             bool rotated_1_interleave_0 = const()[name = string("rotated_1_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_1_cast_fp16 = concat(axis = var_34, interleave = rotated_1_interleave_0, values = (var_118_cast_fp16, var_110_cast_fp16))[name = string("rotated_1_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_121_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_121_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_122_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_122_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_1_cast_fp16 = add(x = var_121_cast_fp16, y = var_122_cast_fp16)[name = string("roped_1_cast_fp16")];
-            tensor<int32, [4]> var_135_begin_0 = const()[name = string("op_135_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_135_end_0 = const()[name = string("op_135_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_135_end_mask_0 = const()[name = string("op_135_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_135_cast_fp16 = slice_by_index(begin = var_135_begin_0, end = var_135_end_0, end_mask = var_135_end_mask_0, x = k_3_cast_fp16)[name = string("op_135_cast_fp16")];
-            tensor<int32, [4]> var_141_begin_0 = const()[name = string("op_141_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_141_end_0 = const()[name = string("op_141_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_141_end_mask_0 = const()[name = string("op_141_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_141_cast_fp16 = slice_by_index(begin = var_141_begin_0, end = var_141_end_0, end_mask = var_141_end_mask_0, x = k_3_cast_fp16)[name = string("op_141_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_1_cast_fp16 = concat(axis = var_34, interleave = rotated_1_interleave_0, values = (var_119_cast_fp16, var_111_cast_fp16))[name = string("rotated_1_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_122_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_122_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_123_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_123_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_1_cast_fp16 = add(x = var_122_cast_fp16, y = var_123_cast_fp16)[name = string("roped_1_cast_fp16")];
+            tensor<int32, [4]> var_136_begin_0 = const()[name = string("op_136_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_136_end_0 = const()[name = string("op_136_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_136_end_mask_0 = const()[name = string("op_136_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_136_cast_fp16 = slice_by_index(begin = var_136_begin_0, end = var_136_end_0, end_mask = var_136_end_mask_0, x = k_3_cast_fp16)[name = string("op_136_cast_fp16")];
+            tensor<int32, [4]> var_142_begin_0 = const()[name = string("op_142_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_142_end_0 = const()[name = string("op_142_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_142_end_mask_0 = const()[name = string("op_142_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_142_cast_fp16 = slice_by_index(begin = var_142_begin_0, end = var_142_end_0, end_mask = var_142_end_mask_0, x = k_3_cast_fp16)[name = string("op_142_cast_fp16")];
             fp16 const_8_promoted_to_fp16 = const()[name = string("const_8_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_143_cast_fp16 = mul(x = var_141_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_143_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_144_cast_fp16 = mul(x = var_142_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_144_cast_fp16")];
             bool rotated_3_interleave_0 = const()[name = string("rotated_3_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_3_cast_fp16 = concat(axis = var_34, interleave = rotated_3_interleave_0, values = (var_143_cast_fp16, var_135_cast_fp16))[name = string("rotated_3_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_146_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_146_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_147_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_147_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_3_cast_fp16 = add(x = var_146_cast_fp16, y = var_147_cast_fp16)[name = string("roped_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_3_cast_fp16 = concat(axis = var_34, interleave = rotated_3_interleave_0, values = (var_144_cast_fp16, var_136_cast_fp16))[name = string("rotated_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_147_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_147_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_148_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_148_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_3_cast_fp16 = add(x = var_147_cast_fp16, y = var_148_cast_fp16)[name = string("roped_3_cast_fp16")];
+            tensor<int32, [4]> v_5_perm_0 = const()[name = string("v_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             bool k_7_interleave_0 = const()[name = string("k_7_interleave_0"), val = bool(false)];
             tensor<fp16, [1, 32, 128, 512]> k_7_cast_fp16 = concat(axis = var_22, interleave = k_7_interleave_0, values = (k_cache_0, roped_3_cast_fp16))[name = string("k_7_cast_fp16")];
-            bool v_5_interleave_0 = const()[name = string("v_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> v_5_cast_fp16 = concat(axis = var_22, interleave = v_5_interleave_0, values = (v_cache_0, v_3_cast_fp16))[name = string("v_5_cast_fp16")];
-            tensor<int32, [4]> var_154_begin_0 = const()[name = string("op_154_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_154_end_0 = const()[name = string("op_154_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_154_end_mask_0 = const()[name = string("op_154_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_0 = slice_by_index(begin = var_154_begin_0, end = var_154_end_0, end_mask = var_154_end_mask_0, x = k_7_cast_fp16)[name = string("op_154_cast_fp16")];
-            tensor<int32, [4]> var_155_begin_0 = const()[name = string("op_155_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_155_end_0 = const()[name = string("op_155_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_155_end_mask_0 = const()[name = string("op_155_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_0 = slice_by_index(begin = var_155_begin_0, end = var_155_end_0, end_mask = var_155_end_mask_0, x = v_5_cast_fp16)[name = string("op_155_cast_fp16")];
-            fp16 var_159_to_fp16 = const()[name = string("op_159_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 1]> var_160_cast_fp16 = mul(x = roped_1_cast_fp16, y = var_159_to_fp16)[name = string("op_160_cast_fp16")];
+            bool v_7_interleave_0 = const()[name = string("v_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> v_5_cast_fp16 = transpose(perm = v_5_perm_0, x = v_3_cast_fp16)[name = string("transpose_5")];
+            tensor<fp16, [1, 32, 512, 128]> v_7_cast_fp16 = concat(axis = var_34, interleave = v_7_interleave_0, values = (v_cache_0, v_5_cast_fp16))[name = string("v_7_cast_fp16")];
+            tensor<int32, [4]> var_159_begin_0 = const()[name = string("op_159_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_159_end_0 = const()[name = string("op_159_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_159_end_mask_0 = const()[name = string("op_159_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_0 = slice_by_index(begin = var_159_begin_0, end = var_159_end_0, end_mask = var_159_end_mask_0, x = k_7_cast_fp16)[name = string("op_159_cast_fp16")];
+            tensor<int32, [4]> var_160_begin_0 = const()[name = string("op_160_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_160_end_0 = const()[name = string("op_160_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_160_end_mask_0 = const()[name = string("op_160_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_0 = slice_by_index(begin = var_160_begin_0, end = var_160_end_0, end_mask = var_160_end_mask_0, x = v_7_cast_fp16)[name = string("op_160_cast_fp16")];
+            fp16 var_165_to_fp16 = const()[name = string("op_165_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 32, 128, 4]> var_166_cast_fp16 = mul(x = roped_1_cast_fp16, y = var_165_to_fp16)[name = string("op_166_cast_fp16")];
             bool attn_weights_1_transpose_x_0 = const()[name = string("attn_weights_1_transpose_x_0"), val = bool(true)];
             bool attn_weights_1_transpose_y_0 = const()[name = string("attn_weights_1_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_160_cast_fp16, y = k_7_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = mask)[name = string("attn_weights_3_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> var_168_cast_fp16 = softmax(axis = var_30, x = attn_weights_3_cast_fp16)[name = string("op_168_cast_fp16")];
-            bool attn_1_transpose_x_0 = const()[name = string("attn_1_transpose_x_0"), val = bool(false)];
-            bool attn_1_transpose_y_0 = const()[name = string("attn_1_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 1]> attn_1_cast_fp16 = matmul(transpose_x = attn_1_transpose_x_0, transpose_y = attn_1_transpose_y_0, x = v_5_cast_fp16, y = var_168_cast_fp16)[name = string("attn_1_cast_fp16")];
-            tensor<int32, [4]> var_172 = const()[name = string("op_172"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
-            tensor<fp16, [1, 4096, 1, 1]> input_1_cast_fp16 = reshape(shape = var_172, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
-            tensor<int32, [2]> var_176 = const()[name = string("op_176"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_178 = const()[name = string("op_178"), val = tensor<int32, [2]>([1, 1])];
-            string var_180_pad_type_0 = const()[name = string("op_180_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_180_pad_0 = const()[name = string("op_180_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_180_cast_fp16 = conv(dilations = var_178, groups = var_31, pad = var_180_pad_0, pad_type = var_180_pad_type_0, strides = var_176, weight = blocks_0_attn_proj_weight_palettized_cast_fp16, x = input_1_cast_fp16)[name = string("op_180_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_166_cast_fp16, y = k_7_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = mask)[name = string("attn_weights_3_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_5_cast_fp16 = softmax(axis = var_30, x = attn_weights_3_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
+            bool var_175_transpose_x_0 = const()[name = string("op_175_transpose_x_0"), val = bool(false)];
+            bool var_175_transpose_y_0 = const()[name = string("op_175_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> var_175_cast_fp16 = matmul(transpose_x = var_175_transpose_x_0, transpose_y = var_175_transpose_y_0, x = attn_weights_5_cast_fp16, y = v_7_cast_fp16)[name = string("op_175_cast_fp16")];
+            tensor<int32, [4]> attn_1_perm_0 = const()[name = string("attn_1_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_178 = const()[name = string("op_178"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 4]> attn_1_cast_fp16 = transpose(perm = attn_1_perm_0, x = var_175_cast_fp16)[name = string("transpose_4")];
+            tensor<fp16, [1, 4096, 1, 4]> input_1_cast_fp16 = reshape(shape = var_178, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
+            tensor<int32, [2]> var_182 = const()[name = string("op_182"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_184 = const()[name = string("op_184"), val = tensor<int32, [2]>([1, 1])];
+            string var_186_pad_type_0 = const()[name = string("op_186_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_186_pad_0 = const()[name = string("op_186_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_186_cast_fp16 = conv(dilations = var_184, groups = var_31, pad = var_186_pad_0, pad_type = var_186_pad_type_0, strides = var_182, weight = blocks_0_attn_proj_weight_palettized_cast_fp16, x = input_1_cast_fp16)[name = string("op_186_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303600960)))];
-            tensor<fp16, [1, 4096, 1, 1]> attention_output_1_cast_fp16 = mul(x = var_180_cast_fp16, y = blocks_0_attn_proj_output_scales_to_fp16)[name = string("attention_output_1_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_11_cast_fp16 = add(x = attention_output_1_cast_fp16, y = x)[name = string("x_11_cast_fp16")];
-            tensor<int32, [1]> var_199_axes_0 = const()[name = string("op_199_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_199_cast_fp16 = squeeze(axes = var_199_axes_0, x = x_11_cast_fp16)[name = string("op_199_cast_fp16")];
-            bool var_201_interleave_0 = const()[name = string("op_201_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_3_to_fp16 = const()[name = string("eps_chan_3_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_201_cast_fp16 = concat(axis = var_31, interleave = var_201_interleave_0, values = (var_199_cast_fp16, eps_chan_3_to_fp16))[name = string("op_201_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> attention_output_1_cast_fp16 = mul(x = var_186_cast_fp16, y = blocks_0_attn_proj_output_scales_to_fp16)[name = string("attention_output_1_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_11_cast_fp16 = add(x = attention_output_1_cast_fp16, y = x)[name = string("x_11_cast_fp16")];
+            tensor<int32, [1]> var_205_axes_0 = const()[name = string("op_205_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_205_cast_fp16 = squeeze(axes = var_205_axes_0, x = x_11_cast_fp16)[name = string("op_205_cast_fp16")];
+            bool var_207_interleave_0 = const()[name = string("op_207_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_3_to_fp16 = const()[name = string("eps_chan_3_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_207_cast_fp16 = concat(axis = var_31, interleave = var_207_interleave_0, values = (var_205_cast_fp16, eps_chan_3_to_fp16))[name = string("op_207_cast_fp16")];
             tensor<int32, [1]> x_eps_3_axes_0 = const()[name = string("x_eps_3_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_3_cast_fp16 = expand_dims(axes = x_eps_3_axes_0, x = var_201_cast_fp16)[name = string("x_eps_3_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_3_cast_fp16 = expand_dims(axes = x_eps_3_axes_0, x = var_207_cast_fp16)[name = string("x_eps_3_cast_fp16")];
             tensor<int32, [1]> norm_x_3_axes_0 = const()[name = string("norm_x_3_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_3_cast_fp16 = reduce_l2_norm(axes = norm_x_3_axes_0, keep_dims = var_35, x = x_eps_3_cast_fp16)[name = string("norm_x_3_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_7_cast_fp16 = real_div(x = x_11_cast_fp16, y = norm_x_3_cast_fp16)[name = string("x_normed_7_cast_fp16")];
-            fp16 var_206_to_fp16 = const()[name = string("op_206_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_9_cast_fp16 = mul(x = x_normed_7_cast_fp16, y = var_206_to_fp16)[name = string("x_normed_9_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_3_cast_fp16 = reduce_l2_norm(axes = norm_x_3_axes_0, keep_dims = var_35, x = x_eps_3_cast_fp16)[name = string("norm_x_3_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_7_cast_fp16 = real_div(x = x_11_cast_fp16, y = norm_x_3_cast_fp16)[name = string("x_normed_7_cast_fp16")];
+            fp16 var_212_to_fp16 = const()[name = string("op_212_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_9_cast_fp16 = mul(x = x_normed_7_cast_fp16, y = var_212_to_fp16)[name = string("x_normed_9_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_2_weight_to_fp16 = const()[name = string("blocks_0_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303609216)))];
-            tensor<fp16, [1, 4096, 1, 1]> input_3_cast_fp16 = mul(x = x_normed_9_cast_fp16, y = blocks_0_norm_2_weight_to_fp16)[name = string("input_3_cast_fp16")];
-            tensor<int32, [2]> var_218 = const()[name = string("op_218"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_220 = const()[name = string("op_220"), val = tensor<int32, [2]>([1, 1])];
-            string var_222_pad_type_0 = const()[name = string("op_222_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_222_pad_0 = const()[name = string("op_222_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_222_cast_fp16 = conv(dilations = var_220, groups = var_31, pad = var_222_pad_0, pad_type = var_222_pad_type_0, strides = var_218, weight = blocks_0_mlp_fc_1_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_222_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303617472)))];
-            tensor<fp16, [1, 11008, 1, 1]> input_5_cast_fp16 = mul(x = var_222_cast_fp16, y = blocks_0_mlp_fc_1_output_scales_to_fp16)[name = string("input_5_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> input_3_cast_fp16 = mul(x = x_normed_9_cast_fp16, y = blocks_0_norm_2_weight_to_fp16)[name = string("input_3_cast_fp16")];
+            tensor<int32, [2]> var_224 = const()[name = string("op_224"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_226 = const()[name = string("op_226"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_228 = const()[name = string("op_228"), val = tensor<int32, [2]>([1, 1])];
-            string var_230_pad_type_0 = const()[name = string("op_230_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_230_pad_0 = const()[name = string("op_230_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_230_cast_fp16 = conv(dilations = var_228, groups = var_31, pad = var_230_pad_0, pad_type = var_230_pad_type_0, strides = var_226, weight = blocks_0_mlp_fc_2_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_230_cast_fp16")];
+            string var_228_pad_type_0 = const()[name = string("op_228_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_228_pad_0 = const()[name = string("op_228_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_228_cast_fp16 = conv(dilations = var_226, groups = var_31, pad = var_228_pad_0, pad_type = var_228_pad_type_0, strides = var_224, weight = blocks_0_mlp_fc_1_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_228_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303617472)))];
+            tensor<fp16, [1, 11008, 1, 4]> input_5_cast_fp16 = mul(x = var_228_cast_fp16, y = blocks_0_mlp_fc_1_output_scales_to_fp16)[name = string("input_5_cast_fp16")];
+            tensor<int32, [2]> var_232 = const()[name = string("op_232"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_234 = const()[name = string("op_234"), val = tensor<int32, [2]>([1, 1])];
+            string var_236_pad_type_0 = const()[name = string("op_236_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_236_pad_0 = const()[name = string("op_236_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_236_cast_fp16 = conv(dilations = var_234, groups = var_31, pad = var_236_pad_0, pad_type = var_236_pad_type_0, strides = var_232, weight = blocks_0_mlp_fc_2_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_236_cast_fp16")];
             tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303639552)))];
-            tensor<fp16, [1, 11008, 1, 1]> x_fc_2_1_cast_fp16 = mul(x = var_230_cast_fp16, y = blocks_0_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_1_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> var_232_cast_fp16 = silu(x = input_5_cast_fp16)[name = string("op_232_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> input_7_cast_fp16 = mul(x = var_232_cast_fp16, y = x_fc_2_1_cast_fp16)[name = string("input_7_cast_fp16")];
-            tensor<int32, [2]> var_236 = const()[name = string("op_236"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_238 = const()[name = string("op_238"), val = tensor<int32, [2]>([1, 1])];
-            string var_240_pad_type_0 = const()[name = string("op_240_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_240_pad_0 = const()[name = string("op_240_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_240_cast_fp16 = conv(dilations = var_238, groups = var_31, pad = var_240_pad_0, pad_type = var_240_pad_type_0, strides = var_236, weight = blocks_0_mlp_proj_weight_palettized_cast_fp16, x = input_7_cast_fp16)[name = string("op_240_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> x_fc_2_1_cast_fp16 = mul(x = var_236_cast_fp16, y = blocks_0_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_1_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> var_238_cast_fp16 = silu(x = input_5_cast_fp16)[name = string("op_238_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> input_7_cast_fp16 = mul(x = var_238_cast_fp16, y = x_fc_2_1_cast_fp16)[name = string("input_7_cast_fp16")];
+            tensor<int32, [2]> var_242 = const()[name = string("op_242"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_244 = const()[name = string("op_244"), val = tensor<int32, [2]>([1, 1])];
+            string var_246_pad_type_0 = const()[name = string("op_246_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_246_pad_0 = const()[name = string("op_246_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_246_cast_fp16 = conv(dilations = var_244, groups = var_31, pad = var_246_pad_0, pad_type = var_246_pad_type_0, strides = var_242, weight = blocks_0_mlp_proj_weight_palettized_cast_fp16, x = input_7_cast_fp16)[name = string("op_246_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303661632)))];
-            tensor<fp16, [1, 4096, 1, 1]> var_241_cast_fp16 = mul(x = var_240_cast_fp16, y = blocks_0_mlp_proj_output_scales_to_fp16)[name = string("op_241_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_15_cast_fp16 = add(x = var_241_cast_fp16, y = x_11_cast_fp16)[name = string("x_15_cast_fp16")];
-            int32 var_252 = const()[name = string("op_252"), val = int32(-1)];
-            int32 var_260 = const()[name = string("op_260"), val = int32(3)];
-            int32 var_261 = const()[name = string("op_261"), val = int32(1)];
-            int32 var_264 = const()[name = string("op_264"), val = int32(-2)];
-            bool var_265 = const()[name = string("op_265"), val = bool(true)];
-            tensor<int32, [1]> var_282_axes_0 = const()[name = string("op_282_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_282_cast_fp16 = squeeze(axes = var_282_axes_0, x = x_15_cast_fp16)[name = string("op_282_cast_fp16")];
-            bool var_284_interleave_0 = const()[name = string("op_284_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_5_to_fp16 = const()[name = string("eps_chan_5_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_284_cast_fp16 = concat(axis = var_261, interleave = var_284_interleave_0, values = (var_282_cast_fp16, eps_chan_5_to_fp16))[name = string("op_284_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> var_247_cast_fp16 = mul(x = var_246_cast_fp16, y = blocks_0_mlp_proj_output_scales_to_fp16)[name = string("op_247_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_15_cast_fp16 = add(x = var_247_cast_fp16, y = x_11_cast_fp16)[name = string("x_15_cast_fp16")];
+            int32 var_258 = const()[name = string("op_258"), val = int32(-1)];
+            int32 var_266 = const()[name = string("op_266"), val = int32(3)];
+            int32 var_267 = const()[name = string("op_267"), val = int32(1)];
+            int32 var_270 = const()[name = string("op_270"), val = int32(-2)];
+            bool var_271 = const()[name = string("op_271"), val = bool(true)];
+            tensor<int32, [1]> var_288_axes_0 = const()[name = string("op_288_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_288_cast_fp16 = squeeze(axes = var_288_axes_0, x = x_15_cast_fp16)[name = string("op_288_cast_fp16")];
+            bool var_290_interleave_0 = const()[name = string("op_290_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_5_to_fp16 = const()[name = string("eps_chan_5_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_290_cast_fp16 = concat(axis = var_267, interleave = var_290_interleave_0, values = (var_288_cast_fp16, eps_chan_5_to_fp16))[name = string("op_290_cast_fp16")];
             tensor<int32, [1]> x_eps_5_axes_0 = const()[name = string("x_eps_5_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_5_cast_fp16 = expand_dims(axes = x_eps_5_axes_0, x = var_284_cast_fp16)[name = string("x_eps_5_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_5_cast_fp16 = expand_dims(axes = x_eps_5_axes_0, x = var_290_cast_fp16)[name = string("x_eps_5_cast_fp16")];
             tensor<int32, [1]> norm_x_5_axes_0 = const()[name = string("norm_x_5_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_5_cast_fp16 = reduce_l2_norm(axes = norm_x_5_axes_0, keep_dims = var_265, x = x_eps_5_cast_fp16)[name = string("norm_x_5_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_13_cast_fp16 = real_div(x = x_15_cast_fp16, y = norm_x_5_cast_fp16)[name = string("x_normed_13_cast_fp16")];
-            fp16 var_289_to_fp16 = const()[name = string("op_289_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_289_to_fp16)[name = string("x_normed_15_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_5_cast_fp16 = reduce_l2_norm(axes = norm_x_5_axes_0, keep_dims = var_271, x = x_eps_5_cast_fp16)[name = string("norm_x_5_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_13_cast_fp16 = real_div(x = x_15_cast_fp16, y = norm_x_5_cast_fp16)[name = string("x_normed_13_cast_fp16")];
+            fp16 var_295_to_fp16 = const()[name = string("op_295_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_295_to_fp16)[name = string("x_normed_15_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_1_weight_to_fp16 = const()[name = string("blocks_1_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303669888)))];
-            tensor<fp16, [1, 4096, 1, 1]> x_19_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = string("x_19_cast_fp16")];
-            tensor<int32, [2]> var_304 = const()[name = string("op_304"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_306 = const()[name = string("op_306"), val = tensor<int32, [2]>([1, 1])];
-            string var_308_pad_type_0 = const()[name = string("op_308_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_308_pad_0 = const()[name = string("op_308_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_308_cast_fp16 = conv(dilations = var_306, groups = var_261, pad = var_308_pad_0, pad_type = var_308_pad_type_0, strides = var_304, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_308_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_19_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = string("x_19_cast_fp16")];
+            tensor<int32, [2]> var_311 = const()[name = string("op_311"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_313 = const()[name = string("op_313"), val = tensor<int32, [2]>([1, 1])];
+            string var_315_pad_type_0 = const()[name = string("op_315_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_315_pad_0 = const()[name = string("op_315_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_315_cast_fp16 = conv(dilations = var_313, groups = var_267, pad = var_315_pad_0, pad_type = var_315_pad_type_0, strides = var_311, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_315_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303678144)))];
-            tensor<fp16, [1, 4096, 1, 1]> q_7_cast_fp16 = mul(x = var_308_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
-            tensor<int32, [2]> var_312 = const()[name = string("op_312"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_314 = const()[name = string("op_314"), val = tensor<int32, [2]>([1, 1])];
-            string var_316_pad_type_0 = const()[name = string("op_316_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_316_pad_0 = const()[name = string("op_316_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_316_cast_fp16 = conv(dilations = var_314, groups = var_261, pad = var_316_pad_0, pad_type = var_316_pad_type_0, strides = var_312, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_316_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> q_7_cast_fp16 = mul(x = var_315_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
+            tensor<int32, [2]> var_319 = const()[name = string("op_319"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_321 = const()[name = string("op_321"), val = tensor<int32, [2]>([1, 1])];
+            string var_323_pad_type_0 = const()[name = string("op_323_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_323_pad_0 = const()[name = string("op_323_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_323_cast_fp16 = conv(dilations = var_321, groups = var_267, pad = var_323_pad_0, pad_type = var_323_pad_type_0, strides = var_319, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_323_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303686400)))];
-            tensor<fp16, [1, 4096, 1, 1]> k_9_cast_fp16 = mul(x = var_316_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_9_cast_fp16")];
-            tensor<int32, [2]> var_320 = const()[name = string("op_320"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_322 = const()[name = string("op_322"), val = tensor<int32, [2]>([1, 1])];
-            string var_324_pad_type_0 = const()[name = string("op_324_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_324_pad_0 = const()[name = string("op_324_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_324_cast_fp16 = conv(dilations = var_322, groups = var_261, pad = var_324_pad_0, pad_type = var_324_pad_type_0, strides = var_320, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_324_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> k_9_cast_fp16 = mul(x = var_323_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_9_cast_fp16")];
+            tensor<int32, [2]> var_327 = const()[name = string("op_327"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_329 = const()[name = string("op_329"), val = tensor<int32, [2]>([1, 1])];
+            string var_331_pad_type_0 = const()[name = string("op_331_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_331_pad_0 = const()[name = string("op_331_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_331_cast_fp16 = conv(dilations = var_329, groups = var_267, pad = var_331_pad_0, pad_type = var_331_pad_type_0, strides = var_327, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_331_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303694656)))];
-            tensor<fp16, [1, 4096, 1, 1]> v_7_cast_fp16 = mul(x = var_324_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_7_cast_fp16")];
-            tensor<int32, [4]> var_326 = const()[name = string("op_326"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> q_9_cast_fp16 = reshape(shape = var_326, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
-            tensor<int32, [4]> var_328 = const()[name = string("op_328"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> k_11_cast_fp16 = reshape(shape = var_328, x = k_9_cast_fp16)[name = string("k_11_cast_fp16")];
-            tensor<int32, [4]> var_330 = const()[name = string("op_330"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> v_9_cast_fp16 = reshape(shape = var_330, x = v_7_cast_fp16)[name = string("v_9_cast_fp16")];
-            tensor<int32, [4]> var_342_begin_0 = const()[name = string("op_342_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_342_end_0 = const()[name = string("op_342_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_342_end_mask_0 = const()[name = string("op_342_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_342_cast_fp16 = slice_by_index(begin = var_342_begin_0, end = var_342_end_0, end_mask = var_342_end_mask_0, x = q_9_cast_fp16)[name = string("op_342_cast_fp16")];
-            tensor<int32, [4]> var_348_begin_0 = const()[name = string("op_348_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_348_end_0 = const()[name = string("op_348_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_348_end_mask_0 = const()[name = string("op_348_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_348_cast_fp16 = slice_by_index(begin = var_348_begin_0, end = var_348_end_0, end_mask = var_348_end_mask_0, x = q_9_cast_fp16)[name = string("op_348_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> v_9_cast_fp16 = mul(x = var_331_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_9_cast_fp16")];
+            tensor<int32, [4]> var_333 = const()[name = string("op_333"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> q_9_cast_fp16 = reshape(shape = var_333, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
+            tensor<int32, [4]> var_335 = const()[name = string("op_335"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> k_11_cast_fp16 = reshape(shape = var_335, x = k_9_cast_fp16)[name = string("k_11_cast_fp16")];
+            tensor<int32, [4]> var_337 = const()[name = string("op_337"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> v_11_cast_fp16 = reshape(shape = var_337, x = v_9_cast_fp16)[name = string("v_11_cast_fp16")];
+            tensor<int32, [4]> var_349_begin_0 = const()[name = string("op_349_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_349_end_0 = const()[name = string("op_349_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_349_end_mask_0 = const()[name = string("op_349_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_349_cast_fp16 = slice_by_index(begin = var_349_begin_0, end = var_349_end_0, end_mask = var_349_end_mask_0, x = q_9_cast_fp16)[name = string("op_349_cast_fp16")];
+            tensor<int32, [4]> var_355_begin_0 = const()[name = string("op_355_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_355_end_0 = const()[name = string("op_355_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_355_end_mask_0 = const()[name = string("op_355_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_355_cast_fp16 = slice_by_index(begin = var_355_begin_0, end = var_355_end_0, end_mask = var_355_end_mask_0, x = q_9_cast_fp16)[name = string("op_355_cast_fp16")];
             fp16 const_19_promoted_to_fp16 = const()[name = string("const_19_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_350_cast_fp16 = mul(x = var_348_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_350_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_357_cast_fp16 = mul(x = var_355_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_357_cast_fp16")];
             bool rotated_5_interleave_0 = const()[name = string("rotated_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_5_cast_fp16 = concat(axis = var_264, interleave = rotated_5_interleave_0, values = (var_350_cast_fp16, var_342_cast_fp16))[name = string("rotated_5_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_353_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_353_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_354_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_354_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_5_cast_fp16 = add(x = var_353_cast_fp16, y = var_354_cast_fp16)[name = string("roped_5_cast_fp16")];
-            tensor<int32, [4]> var_367_begin_0 = const()[name = string("op_367_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_367_end_0 = const()[name = string("op_367_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_367_end_mask_0 = const()[name = string("op_367_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_367_cast_fp16 = slice_by_index(begin = var_367_begin_0, end = var_367_end_0, end_mask = var_367_end_mask_0, x = k_11_cast_fp16)[name = string("op_367_cast_fp16")];
-            tensor<int32, [4]> var_373_begin_0 = const()[name = string("op_373_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_373_end_0 = const()[name = string("op_373_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_373_end_mask_0 = const()[name = string("op_373_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_373_cast_fp16 = slice_by_index(begin = var_373_begin_0, end = var_373_end_0, end_mask = var_373_end_mask_0, x = k_11_cast_fp16)[name = string("op_373_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_5_cast_fp16 = concat(axis = var_270, interleave = rotated_5_interleave_0, values = (var_357_cast_fp16, var_349_cast_fp16))[name = string("rotated_5_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_360_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_360_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_361_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_361_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_5_cast_fp16 = add(x = var_360_cast_fp16, y = var_361_cast_fp16)[name = string("roped_5_cast_fp16")];
+            tensor<int32, [4]> var_374_begin_0 = const()[name = string("op_374_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_374_end_0 = const()[name = string("op_374_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_374_end_mask_0 = const()[name = string("op_374_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_374_cast_fp16 = slice_by_index(begin = var_374_begin_0, end = var_374_end_0, end_mask = var_374_end_mask_0, x = k_11_cast_fp16)[name = string("op_374_cast_fp16")];
+            tensor<int32, [4]> var_380_begin_0 = const()[name = string("op_380_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_380_end_0 = const()[name = string("op_380_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_380_end_mask_0 = const()[name = string("op_380_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_380_cast_fp16 = slice_by_index(begin = var_380_begin_0, end = var_380_end_0, end_mask = var_380_end_mask_0, x = k_11_cast_fp16)[name = string("op_380_cast_fp16")];
             fp16 const_21_promoted_to_fp16 = const()[name = string("const_21_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_375_cast_fp16 = mul(x = var_373_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_375_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_382_cast_fp16 = mul(x = var_380_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_382_cast_fp16")];
             bool rotated_7_interleave_0 = const()[name = string("rotated_7_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_7_cast_fp16 = concat(axis = var_264, interleave = rotated_7_interleave_0, values = (var_375_cast_fp16, var_367_cast_fp16))[name = string("rotated_7_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_378_cast_fp16 = mul(x = k_11_cast_fp16, y = cos)[name = string("op_378_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_379_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = string("op_379_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_7_cast_fp16 = add(x = var_378_cast_fp16, y = var_379_cast_fp16)[name = string("roped_7_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_7_cast_fp16 = concat(axis = var_270, interleave = rotated_7_interleave_0, values = (var_382_cast_fp16, var_374_cast_fp16))[name = string("rotated_7_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_385_cast_fp16 = mul(x = k_11_cast_fp16, y = cos)[name = string("op_385_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_386_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = string("op_386_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_7_cast_fp16 = add(x = var_385_cast_fp16, y = var_386_cast_fp16)[name = string("roped_7_cast_fp16")];
+            tensor<int32, [4]> v_13_perm_0 = const()[name = string("v_13_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             bool k_15_interleave_0 = const()[name = string("k_15_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = concat(axis = var_252, interleave = k_15_interleave_0, values = (k_cache_1, roped_7_cast_fp16))[name = string("k_15_cast_fp16")];
-            bool v_11_interleave_0 = const()[name = string("v_11_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> v_11_cast_fp16 = concat(axis = var_252, interleave = v_11_interleave_0, values = (v_cache_1, v_9_cast_fp16))[name = string("v_11_cast_fp16")];
-            tensor<int32, [4]> var_386_begin_0 = const()[name = string("op_386_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_386_end_0 = const()[name = string("op_386_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_386_end_mask_0 = const()[name = string("op_386_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_1 = slice_by_index(begin = var_386_begin_0, end = var_386_end_0, end_mask = var_386_end_mask_0, x = k_15_cast_fp16)[name = string("op_386_cast_fp16")];
-            tensor<int32, [4]> var_387_begin_0 = const()[name = string("op_387_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_387_end_0 = const()[name = string("op_387_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_387_end_mask_0 = const()[name = string("op_387_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_1 = slice_by_index(begin = var_387_begin_0, end = var_387_end_0, end_mask = var_387_end_mask_0, x = v_11_cast_fp16)[name = string("op_387_cast_fp16")];
-            fp16 var_391_to_fp16 = const()[name = string("op_391_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 1]> var_392_cast_fp16 = mul(x = roped_5_cast_fp16, y = var_391_to_fp16)[name = string("op_392_cast_fp16")];
-            bool attn_weights_5_transpose_x_0 = const()[name = string("attn_weights_5_transpose_x_0"), val = bool(true)];
-            bool attn_weights_5_transpose_y_0 = const()[name = string("attn_weights_5_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_5_cast_fp16 = matmul(transpose_x = attn_weights_5_transpose_x_0, transpose_y = attn_weights_5_transpose_y_0, x = var_392_cast_fp16, y = k_15_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_7_cast_fp16 = add(x = attn_weights_5_cast_fp16, y = mask)[name = string("attn_weights_7_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> var_400_cast_fp16 = softmax(axis = var_260, x = attn_weights_7_cast_fp16)[name = string("op_400_cast_fp16")];
-            bool attn_5_transpose_x_0 = const()[name = string("attn_5_transpose_x_0"), val = bool(false)];
-            bool attn_5_transpose_y_0 = const()[name = string("attn_5_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 1]> attn_5_cast_fp16 = matmul(transpose_x = attn_5_transpose_x_0, transpose_y = attn_5_transpose_y_0, x = v_11_cast_fp16, y = var_400_cast_fp16)[name = string("attn_5_cast_fp16")];
-            tensor<int32, [4]> var_404 = const()[name = string("op_404"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
-            tensor<fp16, [1, 4096, 1, 1]> input_9_cast_fp16 = reshape(shape = var_404, x = attn_5_cast_fp16)[name = string("input_9_cast_fp16")];
-            tensor<int32, [2]> var_408 = const()[name = string("op_408"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_410 = const()[name = string("op_410"), val = tensor<int32, [2]>([1, 1])];
-            string var_412_pad_type_0 = const()[name = string("op_412_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_412_pad_0 = const()[name = string("op_412_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_412_cast_fp16 = conv(dilations = var_410, groups = var_261, pad = var_412_pad_0, pad_type = var_412_pad_type_0, strides = var_408, weight = blocks_1_attn_proj_weight_palettized_cast_fp16, x = input_9_cast_fp16)[name = string("op_412_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = concat(axis = var_258, interleave = k_15_interleave_0, values = (k_cache_1, roped_7_cast_fp16))[name = string("k_15_cast_fp16")];
+            bool v_15_interleave_0 = const()[name = string("v_15_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> v_13_cast_fp16 = transpose(perm = v_13_perm_0, x = v_11_cast_fp16)[name = string("transpose_3")];
+            tensor<fp16, [1, 32, 512, 128]> v_15_cast_fp16 = concat(axis = var_270, interleave = v_15_interleave_0, values = (v_cache_1, v_13_cast_fp16))[name = string("v_15_cast_fp16")];
+            tensor<int32, [4]> var_397_begin_0 = const()[name = string("op_397_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_397_end_0 = const()[name = string("op_397_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_397_end_mask_0 = const()[name = string("op_397_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_1 = slice_by_index(begin = var_397_begin_0, end = var_397_end_0, end_mask = var_397_end_mask_0, x = k_15_cast_fp16)[name = string("op_397_cast_fp16")];
+            tensor<int32, [4]> var_398_begin_0 = const()[name = string("op_398_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_398_end_0 = const()[name = string("op_398_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_398_end_mask_0 = const()[name = string("op_398_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_1 = slice_by_index(begin = var_398_begin_0, end = var_398_end_0, end_mask = var_398_end_mask_0, x = v_15_cast_fp16)[name = string("op_398_cast_fp16")];
+            fp16 var_403_to_fp16 = const()[name = string("op_403_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 32, 128, 4]> var_404_cast_fp16 = mul(x = roped_5_cast_fp16, y = var_403_to_fp16)[name = string("op_404_cast_fp16")];
+            bool attn_weights_7_transpose_x_0 = const()[name = string("attn_weights_7_transpose_x_0"), val = bool(true)];
+            bool attn_weights_7_transpose_y_0 = const()[name = string("attn_weights_7_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_7_cast_fp16 = matmul(transpose_x = attn_weights_7_transpose_x_0, transpose_y = attn_weights_7_transpose_y_0, x = var_404_cast_fp16, y = k_15_cast_fp16)[name = string("attn_weights_7_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_9_cast_fp16 = add(x = attn_weights_7_cast_fp16, y = mask)[name = string("attn_weights_9_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_11_cast_fp16 = softmax(axis = var_266, x = attn_weights_9_cast_fp16)[name = string("attn_weights_11_cast_fp16")];
+            bool var_413_transpose_x_0 = const()[name = string("op_413_transpose_x_0"), val = bool(false)];
+            bool var_413_transpose_y_0 = const()[name = string("op_413_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> var_413_cast_fp16 = matmul(transpose_x = var_413_transpose_x_0, transpose_y = var_413_transpose_y_0, x = attn_weights_11_cast_fp16, y = v_15_cast_fp16)[name = string("op_413_cast_fp16")];
+            tensor<int32, [4]> attn_3_perm_0 = const()[name = string("attn_3_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_416 = const()[name = string("op_416"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 4]> attn_3_cast_fp16 = transpose(perm = attn_3_perm_0, x = var_413_cast_fp16)[name = string("transpose_2")];
+            tensor<fp16, [1, 4096, 1, 4]> input_9_cast_fp16 = reshape(shape = var_416, x = attn_3_cast_fp16)[name = string("input_9_cast_fp16")];
+            tensor<int32, [2]> var_420 = const()[name = string("op_420"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_422 = const()[name = string("op_422"), val = tensor<int32, [2]>([1, 1])];
+            string var_424_pad_type_0 = const()[name = string("op_424_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_424_pad_0 = const()[name = string("op_424_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_424_cast_fp16 = conv(dilations = var_422, groups = var_267, pad = var_424_pad_0, pad_type = var_424_pad_type_0, strides = var_420, weight = blocks_1_attn_proj_weight_palettized_cast_fp16, x = input_9_cast_fp16)[name = string("op_424_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303702912)))];
-            tensor<fp16, [1, 4096, 1, 1]> attention_output_3_cast_fp16 = mul(x = var_412_cast_fp16, y = blocks_1_attn_proj_output_scales_to_fp16)[name = string("attention_output_3_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_25_cast_fp16 = add(x = attention_output_3_cast_fp16, y = x_15_cast_fp16)[name = string("x_25_cast_fp16")];
-            tensor<int32, [1]> var_431_axes_0 = const()[name = string("op_431_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_431_cast_fp16 = squeeze(axes = var_431_axes_0, x = x_25_cast_fp16)[name = string("op_431_cast_fp16")];
-            bool var_433_interleave_0 = const()[name = string("op_433_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_7_to_fp16 = const()[name = string("eps_chan_7_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_433_cast_fp16 = concat(axis = var_261, interleave = var_433_interleave_0, values = (var_431_cast_fp16, eps_chan_7_to_fp16))[name = string("op_433_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> attention_output_3_cast_fp16 = mul(x = var_424_cast_fp16, y = blocks_1_attn_proj_output_scales_to_fp16)[name = string("attention_output_3_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_25_cast_fp16 = add(x = attention_output_3_cast_fp16, y = x_15_cast_fp16)[name = string("x_25_cast_fp16")];
+            tensor<int32, [1]> var_443_axes_0 = const()[name = string("op_443_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_443_cast_fp16 = squeeze(axes = var_443_axes_0, x = x_25_cast_fp16)[name = string("op_443_cast_fp16")];
+            bool var_445_interleave_0 = const()[name = string("op_445_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_7_to_fp16 = const()[name = string("eps_chan_7_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_445_cast_fp16 = concat(axis = var_267, interleave = var_445_interleave_0, values = (var_443_cast_fp16, eps_chan_7_to_fp16))[name = string("op_445_cast_fp16")];
             tensor<int32, [1]> x_eps_7_axes_0 = const()[name = string("x_eps_7_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_7_cast_fp16 = expand_dims(axes = x_eps_7_axes_0, x = var_433_cast_fp16)[name = string("x_eps_7_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_7_cast_fp16 = expand_dims(axes = x_eps_7_axes_0, x = var_445_cast_fp16)[name = string("x_eps_7_cast_fp16")];
             tensor<int32, [1]> norm_x_7_axes_0 = const()[name = string("norm_x_7_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_7_cast_fp16 = reduce_l2_norm(axes = norm_x_7_axes_0, keep_dims = var_265, x = x_eps_7_cast_fp16)[name = string("norm_x_7_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_19_cast_fp16 = real_div(x = x_25_cast_fp16, y = norm_x_7_cast_fp16)[name = string("x_normed_19_cast_fp16")];
-            fp16 var_438_to_fp16 = const()[name = string("op_438_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_21_cast_fp16 = mul(x = x_normed_19_cast_fp16, y = var_438_to_fp16)[name = string("x_normed_21_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_7_cast_fp16 = reduce_l2_norm(axes = norm_x_7_axes_0, keep_dims = var_271, x = x_eps_7_cast_fp16)[name = string("norm_x_7_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_19_cast_fp16 = real_div(x = x_25_cast_fp16, y = norm_x_7_cast_fp16)[name = string("x_normed_19_cast_fp16")];
+            fp16 var_450_to_fp16 = const()[name = string("op_450_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_21_cast_fp16 = mul(x = x_normed_19_cast_fp16, y = var_450_to_fp16)[name = string("x_normed_21_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_2_weight_to_fp16 = const()[name = string("blocks_1_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303711168)))];
-            tensor<fp16, [1, 4096, 1, 1]> input_11_cast_fp16 = mul(x = x_normed_21_cast_fp16, y = blocks_1_norm_2_weight_to_fp16)[name = string("input_11_cast_fp16")];
-            tensor<int32, [2]> var_450 = const()[name = string("op_450"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_452 = const()[name = string("op_452"), val = tensor<int32, [2]>([1, 1])];
-            string var_454_pad_type_0 = const()[name = string("op_454_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_454_pad_0 = const()[name = string("op_454_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_454_cast_fp16 = conv(dilations = var_452, groups = var_261, pad = var_454_pad_0, pad_type = var_454_pad_type_0, strides = var_450, weight = blocks_1_mlp_fc_1_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_454_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> input_11_cast_fp16 = mul(x = x_normed_21_cast_fp16, y = blocks_1_norm_2_weight_to_fp16)[name = string("input_11_cast_fp16")];
+            tensor<int32, [2]> var_462 = const()[name = string("op_462"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_464 = const()[name = string("op_464"), val = tensor<int32, [2]>([1, 1])];
+            string var_466_pad_type_0 = const()[name = string("op_466_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_466_pad_0 = const()[name = string("op_466_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_466_cast_fp16 = conv(dilations = var_464, groups = var_267, pad = var_466_pad_0, pad_type = var_466_pad_type_0, strides = var_462, weight = blocks_1_mlp_fc_1_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_466_cast_fp16")];
             tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303719424)))];
-            tensor<fp16, [1, 11008, 1, 1]> input_13_cast_fp16 = mul(x = var_454_cast_fp16, y = blocks_1_mlp_fc_1_output_scales_to_fp16)[name = string("input_13_cast_fp16")];
-            tensor<int32, [2]> var_458 = const()[name = string("op_458"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_460 = const()[name = string("op_460"), val = tensor<int32, [2]>([1, 1])];
-            string var_462_pad_type_0 = const()[name = string("op_462_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_462_pad_0 = const()[name = string("op_462_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_462_cast_fp16 = conv(dilations = var_460, groups = var_261, pad = var_462_pad_0, pad_type = var_462_pad_type_0, strides = var_458, weight = blocks_1_mlp_fc_2_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_462_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303741504)))];
-            tensor<fp16, [1, 11008, 1, 1]> x_fc_2_3_cast_fp16 = mul(x = var_462_cast_fp16, y = blocks_1_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_3_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> var_464_cast_fp16 = silu(x = input_13_cast_fp16)[name = string("op_464_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> input_15_cast_fp16 = mul(x = var_464_cast_fp16, y = x_fc_2_3_cast_fp16)[name = string("input_15_cast_fp16")];
-            tensor<int32, [2]> var_468 = const()[name = string("op_468"), val = tensor<int32, [2]>([1, 1])];
+            tensor<fp16, [1, 11008, 1, 4]> input_13_cast_fp16 = mul(x = var_466_cast_fp16, y = blocks_1_mlp_fc_1_output_scales_to_fp16)[name = string("input_13_cast_fp16")];
             tensor<int32, [2]> var_470 = const()[name = string("op_470"), val = tensor<int32, [2]>([1, 1])];
-            string var_472_pad_type_0 = const()[name = string("op_472_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_472_pad_0 = const()[name = string("op_472_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_472_cast_fp16 = conv(dilations = var_470, groups = var_261, pad = var_472_pad_0, pad_type = var_472_pad_type_0, strides = var_468, weight = blocks_1_mlp_proj_weight_palettized_cast_fp16, x = input_15_cast_fp16)[name = string("op_472_cast_fp16")];
+            tensor<int32, [2]> var_472 = const()[name = string("op_472"), val = tensor<int32, [2]>([1, 1])];
+            string var_474_pad_type_0 = const()[name = string("op_474_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_474_pad_0 = const()[name = string("op_474_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_474_cast_fp16 = conv(dilations = var_472, groups = var_267, pad = var_474_pad_0, pad_type = var_474_pad_type_0, strides = var_470, weight = blocks_1_mlp_fc_2_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_474_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303741504)))];
+            tensor<fp16, [1, 11008, 1, 4]> x_fc_2_3_cast_fp16 = mul(x = var_474_cast_fp16, y = blocks_1_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_3_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> var_476_cast_fp16 = silu(x = input_13_cast_fp16)[name = string("op_476_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> input_15_cast_fp16 = mul(x = var_476_cast_fp16, y = x_fc_2_3_cast_fp16)[name = string("input_15_cast_fp16")];
+            tensor<int32, [2]> var_480 = const()[name = string("op_480"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_482 = const()[name = string("op_482"), val = tensor<int32, [2]>([1, 1])];
+            string var_484_pad_type_0 = const()[name = string("op_484_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_484_pad_0 = const()[name = string("op_484_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_484_cast_fp16 = conv(dilations = var_482, groups = var_267, pad = var_484_pad_0, pad_type = var_484_pad_type_0, strides = var_480, weight = blocks_1_mlp_proj_weight_palettized_cast_fp16, x = input_15_cast_fp16)[name = string("op_484_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303763584)))];
-            tensor<fp16, [1, 4096, 1, 1]> var_473_cast_fp16 = mul(x = var_472_cast_fp16, y = blocks_1_mlp_proj_output_scales_to_fp16)[name = string("op_473_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_29_cast_fp16 = add(x = var_473_cast_fp16, y = x_25_cast_fp16)[name = string("x_29_cast_fp16")];
-            int32 var_484 = const()[name = string("op_484"), val = int32(-1)];
-            int32 var_492 = const()[name = string("op_492"), val = int32(3)];
-            int32 var_493 = const()[name = string("op_493"), val = int32(1)];
-            int32 var_496 = const()[name = string("op_496"), val = int32(-2)];
-            bool var_497 = const()[name = string("op_497"), val = bool(true)];
-            tensor<int32, [1]> var_514_axes_0 = const()[name = string("op_514_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_514_cast_fp16 = squeeze(axes = var_514_axes_0, x = x_29_cast_fp16)[name = string("op_514_cast_fp16")];
-            bool var_516_interleave_0 = const()[name = string("op_516_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_9_to_fp16 = const()[name = string("eps_chan_9_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_516_cast_fp16 = concat(axis = var_493, interleave = var_516_interleave_0, values = (var_514_cast_fp16, eps_chan_9_to_fp16))[name = string("op_516_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> var_485_cast_fp16 = mul(x = var_484_cast_fp16, y = blocks_1_mlp_proj_output_scales_to_fp16)[name = string("op_485_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_29_cast_fp16 = add(x = var_485_cast_fp16, y = x_25_cast_fp16)[name = string("x_29_cast_fp16")];
+            int32 var_496 = const()[name = string("op_496"), val = int32(-1)];
+            int32 var_504 = const()[name = string("op_504"), val = int32(3)];
+            int32 var_505 = const()[name = string("op_505"), val = int32(1)];
+            int32 var_508 = const()[name = string("op_508"), val = int32(-2)];
+            bool var_509 = const()[name = string("op_509"), val = bool(true)];
+            tensor<int32, [1]> var_526_axes_0 = const()[name = string("op_526_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_526_cast_fp16 = squeeze(axes = var_526_axes_0, x = x_29_cast_fp16)[name = string("op_526_cast_fp16")];
+            bool var_528_interleave_0 = const()[name = string("op_528_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_9_to_fp16 = const()[name = string("eps_chan_9_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_528_cast_fp16 = concat(axis = var_505, interleave = var_528_interleave_0, values = (var_526_cast_fp16, eps_chan_9_to_fp16))[name = string("op_528_cast_fp16")];
             tensor<int32, [1]> x_eps_9_axes_0 = const()[name = string("x_eps_9_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_9_cast_fp16 = expand_dims(axes = x_eps_9_axes_0, x = var_516_cast_fp16)[name = string("x_eps_9_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_9_cast_fp16 = expand_dims(axes = x_eps_9_axes_0, x = var_528_cast_fp16)[name = string("x_eps_9_cast_fp16")];
             tensor<int32, [1]> norm_x_9_axes_0 = const()[name = string("norm_x_9_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_9_cast_fp16 = reduce_l2_norm(axes = norm_x_9_axes_0, keep_dims = var_497, x = x_eps_9_cast_fp16)[name = string("norm_x_9_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_25_cast_fp16 = real_div(x = x_29_cast_fp16, y = norm_x_9_cast_fp16)[name = string("x_normed_25_cast_fp16")];
-            fp16 var_521_to_fp16 = const()[name = string("op_521_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_27_cast_fp16 = mul(x = x_normed_25_cast_fp16, y = var_521_to_fp16)[name = string("x_normed_27_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_9_cast_fp16 = reduce_l2_norm(axes = norm_x_9_axes_0, keep_dims = var_509, x = x_eps_9_cast_fp16)[name = string("norm_x_9_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_25_cast_fp16 = real_div(x = x_29_cast_fp16, y = norm_x_9_cast_fp16)[name = string("x_normed_25_cast_fp16")];
+            fp16 var_533_to_fp16 = const()[name = string("op_533_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_27_cast_fp16 = mul(x = x_normed_25_cast_fp16, y = var_533_to_fp16)[name = string("x_normed_27_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_norm_1_weight_to_fp16 = const()[name = string("blocks_2_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303771840)))];
-            tensor<fp16, [1, 4096, 1, 1]> x_33_cast_fp16 = mul(x = x_normed_27_cast_fp16, y = blocks_2_norm_1_weight_to_fp16)[name = string("x_33_cast_fp16")];
-            tensor<int32, [2]> var_536 = const()[name = string("op_536"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_538 = const()[name = string("op_538"), val = tensor<int32, [2]>([1, 1])];
-            string var_540_pad_type_0 = const()[name = string("op_540_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_540_pad_0 = const()[name = string("op_540_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_540_cast_fp16 = conv(dilations = var_538, groups = var_493, pad = var_540_pad_0, pad_type = var_540_pad_type_0, strides = var_536, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_540_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_33_cast_fp16 = mul(x = x_normed_27_cast_fp16, y = blocks_2_norm_1_weight_to_fp16)[name = string("x_33_cast_fp16")];
+            tensor<int32, [2]> var_549 = const()[name = string("op_549"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_551 = const()[name = string("op_551"), val = tensor<int32, [2]>([1, 1])];
+            string var_553_pad_type_0 = const()[name = string("op_553_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_553_pad_0 = const()[name = string("op_553_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_553_cast_fp16 = conv(dilations = var_551, groups = var_505, pad = var_553_pad_0, pad_type = var_553_pad_type_0, strides = var_549, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_553_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303780096)))];
-            tensor<fp16, [1, 4096, 1, 1]> q_13_cast_fp16 = mul(x = var_540_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = string("q_13_cast_fp16")];
-            tensor<int32, [2]> var_544 = const()[name = string("op_544"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_546 = const()[name = string("op_546"), val = tensor<int32, [2]>([1, 1])];
-            string var_548_pad_type_0 = const()[name = string("op_548_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_548_pad_0 = const()[name = string("op_548_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_548_cast_fp16 = conv(dilations = var_546, groups = var_493, pad = var_548_pad_0, pad_type = var_548_pad_type_0, strides = var_544, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_548_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> q_13_cast_fp16 = mul(x = var_553_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = string("q_13_cast_fp16")];
+            tensor<int32, [2]> var_557 = const()[name = string("op_557"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_559 = const()[name = string("op_559"), val = tensor<int32, [2]>([1, 1])];
+            string var_561_pad_type_0 = const()[name = string("op_561_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_561_pad_0 = const()[name = string("op_561_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_561_cast_fp16 = conv(dilations = var_559, groups = var_505, pad = var_561_pad_0, pad_type = var_561_pad_type_0, strides = var_557, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_561_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303788352)))];
-            tensor<fp16, [1, 4096, 1, 1]> k_17_cast_fp16 = mul(x = var_548_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = string("k_17_cast_fp16")];
-            tensor<int32, [2]> var_552 = const()[name = string("op_552"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_554 = const()[name = string("op_554"), val = tensor<int32, [2]>([1, 1])];
-            string var_556_pad_type_0 = const()[name = string("op_556_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_556_pad_0 = const()[name = string("op_556_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_556_cast_fp16 = conv(dilations = var_554, groups = var_493, pad = var_556_pad_0, pad_type = var_556_pad_type_0, strides = var_552, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_556_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> k_17_cast_fp16 = mul(x = var_561_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = string("k_17_cast_fp16")];
+            tensor<int32, [2]> var_565 = const()[name = string("op_565"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_567 = const()[name = string("op_567"), val = tensor<int32, [2]>([1, 1])];
+            string var_569_pad_type_0 = const()[name = string("op_569_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_569_pad_0 = const()[name = string("op_569_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_569_cast_fp16 = conv(dilations = var_567, groups = var_505, pad = var_569_pad_0, pad_type = var_569_pad_type_0, strides = var_565, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_569_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303796608)))];
-            tensor<fp16, [1, 4096, 1, 1]> v_13_cast_fp16 = mul(x = var_556_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = string("v_13_cast_fp16")];
-            tensor<int32, [4]> var_558 = const()[name = string("op_558"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> q_15_cast_fp16 = reshape(shape = var_558, x = q_13_cast_fp16)[name = string("q_15_cast_fp16")];
-            tensor<int32, [4]> var_560 = const()[name = string("op_560"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> k_19_cast_fp16 = reshape(shape = var_560, x = k_17_cast_fp16)[name = string("k_19_cast_fp16")];
-            tensor<int32, [4]> var_562 = const()[name = string("op_562"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> v_15_cast_fp16 = reshape(shape = var_562, x = v_13_cast_fp16)[name = string("v_15_cast_fp16")];
-            tensor<int32, [4]> var_574_begin_0 = const()[name = string("op_574_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_574_end_0 = const()[name = string("op_574_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_574_end_mask_0 = const()[name = string("op_574_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_574_cast_fp16 = slice_by_index(begin = var_574_begin_0, end = var_574_end_0, end_mask = var_574_end_mask_0, x = q_15_cast_fp16)[name = string("op_574_cast_fp16")];
-            tensor<int32, [4]> var_580_begin_0 = const()[name = string("op_580_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_580_end_0 = const()[name = string("op_580_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_580_end_mask_0 = const()[name = string("op_580_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_580_cast_fp16 = slice_by_index(begin = var_580_begin_0, end = var_580_end_0, end_mask = var_580_end_mask_0, x = q_15_cast_fp16)[name = string("op_580_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> v_17_cast_fp16 = mul(x = var_569_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = string("v_17_cast_fp16")];
+            tensor<int32, [4]> var_571 = const()[name = string("op_571"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> q_15_cast_fp16 = reshape(shape = var_571, x = q_13_cast_fp16)[name = string("q_15_cast_fp16")];
+            tensor<int32, [4]> var_573 = const()[name = string("op_573"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> k_19_cast_fp16 = reshape(shape = var_573, x = k_17_cast_fp16)[name = string("k_19_cast_fp16")];
+            tensor<int32, [4]> var_575 = const()[name = string("op_575"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> v_19_cast_fp16 = reshape(shape = var_575, x = v_17_cast_fp16)[name = string("v_19_cast_fp16")];
+            tensor<int32, [4]> var_587_begin_0 = const()[name = string("op_587_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_587_end_0 = const()[name = string("op_587_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_587_end_mask_0 = const()[name = string("op_587_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_587_cast_fp16 = slice_by_index(begin = var_587_begin_0, end = var_587_end_0, end_mask = var_587_end_mask_0, x = q_15_cast_fp16)[name = string("op_587_cast_fp16")];
+            tensor<int32, [4]> var_593_begin_0 = const()[name = string("op_593_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_593_end_0 = const()[name = string("op_593_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_593_end_mask_0 = const()[name = string("op_593_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_593_cast_fp16 = slice_by_index(begin = var_593_begin_0, end = var_593_end_0, end_mask = var_593_end_mask_0, x = q_15_cast_fp16)[name = string("op_593_cast_fp16")];
             fp16 const_32_promoted_to_fp16 = const()[name = string("const_32_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_582_cast_fp16 = mul(x = var_580_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_582_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_595_cast_fp16 = mul(x = var_593_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_595_cast_fp16")];
             bool rotated_9_interleave_0 = const()[name = string("rotated_9_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_9_cast_fp16 = concat(axis = var_496, interleave = rotated_9_interleave_0, values = (var_582_cast_fp16, var_574_cast_fp16))[name = string("rotated_9_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_585_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = string("op_585_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_586_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = string("op_586_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_9_cast_fp16 = add(x = var_585_cast_fp16, y = var_586_cast_fp16)[name = string("roped_9_cast_fp16")];
-            tensor<int32, [4]> var_599_begin_0 = const()[name = string("op_599_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_599_end_0 = const()[name = string("op_599_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_599_end_mask_0 = const()[name = string("op_599_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_599_cast_fp16 = slice_by_index(begin = var_599_begin_0, end = var_599_end_0, end_mask = var_599_end_mask_0, x = k_19_cast_fp16)[name = string("op_599_cast_fp16")];
-            tensor<int32, [4]> var_605_begin_0 = const()[name = string("op_605_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_605_end_0 = const()[name = string("op_605_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_605_end_mask_0 = const()[name = string("op_605_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_605_cast_fp16 = slice_by_index(begin = var_605_begin_0, end = var_605_end_0, end_mask = var_605_end_mask_0, x = k_19_cast_fp16)[name = string("op_605_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_9_cast_fp16 = concat(axis = var_508, interleave = rotated_9_interleave_0, values = (var_595_cast_fp16, var_587_cast_fp16))[name = string("rotated_9_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_598_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = string("op_598_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_599_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = string("op_599_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_9_cast_fp16 = add(x = var_598_cast_fp16, y = var_599_cast_fp16)[name = string("roped_9_cast_fp16")];
+            tensor<int32, [4]> var_612_begin_0 = const()[name = string("op_612_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_612_end_0 = const()[name = string("op_612_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_612_end_mask_0 = const()[name = string("op_612_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_612_cast_fp16 = slice_by_index(begin = var_612_begin_0, end = var_612_end_0, end_mask = var_612_end_mask_0, x = k_19_cast_fp16)[name = string("op_612_cast_fp16")];
+            tensor<int32, [4]> var_618_begin_0 = const()[name = string("op_618_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_618_end_0 = const()[name = string("op_618_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_618_end_mask_0 = const()[name = string("op_618_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_618_cast_fp16 = slice_by_index(begin = var_618_begin_0, end = var_618_end_0, end_mask = var_618_end_mask_0, x = k_19_cast_fp16)[name = string("op_618_cast_fp16")];
             fp16 const_34_promoted_to_fp16 = const()[name = string("const_34_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_607_cast_fp16 = mul(x = var_605_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_607_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_620_cast_fp16 = mul(x = var_618_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_620_cast_fp16")];
             bool rotated_interleave_0 = const()[name = string("rotated_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_cast_fp16 = concat(axis = var_496, interleave = rotated_interleave_0, values = (var_607_cast_fp16, var_599_cast_fp16))[name = string("rotated_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_610_cast_fp16 = mul(x = k_19_cast_fp16, y = cos)[name = string("op_610_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_611_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_611_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_cast_fp16 = add(x = var_610_cast_fp16, y = var_611_cast_fp16)[name = string("roped_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_cast_fp16 = concat(axis = var_508, interleave = rotated_interleave_0, values = (var_620_cast_fp16, var_612_cast_fp16))[name = string("rotated_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_623_cast_fp16 = mul(x = k_19_cast_fp16, y = cos)[name = string("op_623_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_624_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_624_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_cast_fp16 = add(x = var_623_cast_fp16, y = var_624_cast_fp16)[name = string("roped_cast_fp16")];
+            tensor<int32, [4]> v_21_perm_0 = const()[name = string("v_21_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             bool k_interleave_0 = const()[name = string("k_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_cast_fp16 = concat(axis = var_484, interleave = k_interleave_0, values = (k_cache_2, roped_cast_fp16))[name = string("k_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> k_cast_fp16 = concat(axis = var_496, interleave = k_interleave_0, values = (k_cache_2, roped_cast_fp16))[name = string("k_cast_fp16")];
             bool v_interleave_0 = const()[name = string("v_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> v_cast_fp16 = concat(axis = var_484, interleave = v_interleave_0, values = (v_cache_2, v_15_cast_fp16))[name = string("v_cast_fp16")];
-            tensor<int32, [4]> var_618_begin_0 = const()[name = string("op_618_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_618_end_0 = const()[name = string("op_618_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_618_end_mask_0 = const()[name = string("op_618_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_2 = slice_by_index(begin = var_618_begin_0, end = var_618_end_0, end_mask = var_618_end_mask_0, x = k_cast_fp16)[name = string("op_618_cast_fp16")];
-            tensor<int32, [4]> var_619_begin_0 = const()[name = string("op_619_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_619_end_0 = const()[name = string("op_619_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_619_end_mask_0 = const()[name = string("op_619_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_2 = slice_by_index(begin = var_619_begin_0, end = var_619_end_0, end_mask = var_619_end_mask_0, x = v_cast_fp16)[name = string("op_619_cast_fp16")];
-            fp16 var_623_to_fp16 = const()[name = string("op_623_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 1]> var_624_cast_fp16 = mul(x = roped_9_cast_fp16, y = var_623_to_fp16)[name = string("op_624_cast_fp16")];
-            bool attn_weights_9_transpose_x_0 = const()[name = string("attn_weights_9_transpose_x_0"), val = bool(true)];
-            bool attn_weights_9_transpose_y_0 = const()[name = string("attn_weights_9_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_9_cast_fp16 = matmul(transpose_x = attn_weights_9_transpose_x_0, transpose_y = attn_weights_9_transpose_y_0, x = var_624_cast_fp16, y = k_cast_fp16)[name = string("attn_weights_9_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_cast_fp16 = add(x = attn_weights_9_cast_fp16, y = mask)[name = string("attn_weights_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> var_632_cast_fp16 = softmax(axis = var_492, x = attn_weights_cast_fp16)[name = string("op_632_cast_fp16")];
-            bool attn_9_transpose_x_0 = const()[name = string("attn_9_transpose_x_0"), val = bool(false)];
-            bool attn_9_transpose_y_0 = const()[name = string("attn_9_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 1]> attn_9_cast_fp16 = matmul(transpose_x = attn_9_transpose_x_0, transpose_y = attn_9_transpose_y_0, x = v_cast_fp16, y = var_632_cast_fp16)[name = string("attn_9_cast_fp16")];
-            tensor<int32, [4]> var_636 = const()[name = string("op_636"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
-            tensor<fp16, [1, 4096, 1, 1]> input_17_cast_fp16 = reshape(shape = var_636, x = attn_9_cast_fp16)[name = string("input_17_cast_fp16")];
-            tensor<int32, [2]> var_640 = const()[name = string("op_640"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_642 = const()[name = string("op_642"), val = tensor<int32, [2]>([1, 1])];
-            string var_644_pad_type_0 = const()[name = string("op_644_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_644_pad_0 = const()[name = string("op_644_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_644_cast_fp16 = conv(dilations = var_642, groups = var_493, pad = var_644_pad_0, pad_type = var_644_pad_type_0, strides = var_640, weight = blocks_2_attn_proj_weight_palettized_cast_fp16, x = input_17_cast_fp16)[name = string("op_644_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 128]> v_21_cast_fp16 = transpose(perm = v_21_perm_0, x = v_19_cast_fp16)[name = string("transpose_1")];
+            tensor<fp16, [1, 32, 512, 128]> v_cast_fp16 = concat(axis = var_508, interleave = v_interleave_0, values = (v_cache_2, v_21_cast_fp16))[name = string("v_cast_fp16")];
+            tensor<int32, [4]> var_635_begin_0 = const()[name = string("op_635_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_635_end_0 = const()[name = string("op_635_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_635_end_mask_0 = const()[name = string("op_635_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_2 = slice_by_index(begin = var_635_begin_0, end = var_635_end_0, end_mask = var_635_end_mask_0, x = k_cast_fp16)[name = string("op_635_cast_fp16")];
+            tensor<int32, [4]> var_636_begin_0 = const()[name = string("op_636_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_636_end_0 = const()[name = string("op_636_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_636_end_mask_0 = const()[name = string("op_636_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_2 = slice_by_index(begin = var_636_begin_0, end = var_636_end_0, end_mask = var_636_end_mask_0, x = v_cast_fp16)[name = string("op_636_cast_fp16")];
+            fp16 var_641_to_fp16 = const()[name = string("op_641_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 32, 128, 4]> var_642_cast_fp16 = mul(x = roped_9_cast_fp16, y = var_641_to_fp16)[name = string("op_642_cast_fp16")];
+            bool attn_weights_13_transpose_x_0 = const()[name = string("attn_weights_13_transpose_x_0"), val = bool(true)];
+            bool attn_weights_13_transpose_y_0 = const()[name = string("attn_weights_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_13_cast_fp16 = matmul(transpose_x = attn_weights_13_transpose_x_0, transpose_y = attn_weights_13_transpose_y_0, x = var_642_cast_fp16, y = k_cast_fp16)[name = string("attn_weights_13_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_15_cast_fp16 = add(x = attn_weights_13_cast_fp16, y = mask)[name = string("attn_weights_15_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_cast_fp16 = softmax(axis = var_504, x = attn_weights_15_cast_fp16)[name = string("attn_weights_cast_fp16")];
+            bool var_651_transpose_x_0 = const()[name = string("op_651_transpose_x_0"), val = bool(false)];
+            bool var_651_transpose_y_0 = const()[name = string("op_651_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> var_651_cast_fp16 = matmul(transpose_x = var_651_transpose_x_0, transpose_y = var_651_transpose_y_0, x = attn_weights_cast_fp16, y = v_cast_fp16)[name = string("op_651_cast_fp16")];
+            tensor<int32, [4]> attn_5_perm_0 = const()[name = string("attn_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_654 = const()[name = string("op_654"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 4]> attn_5_cast_fp16 = transpose(perm = attn_5_perm_0, x = var_651_cast_fp16)[name = string("transpose_0")];
+            tensor<fp16, [1, 4096, 1, 4]> input_17_cast_fp16 = reshape(shape = var_654, x = attn_5_cast_fp16)[name = string("input_17_cast_fp16")];
+            tensor<int32, [2]> var_658 = const()[name = string("op_658"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_660 = const()[name = string("op_660"), val = tensor<int32, [2]>([1, 1])];
+            string var_662_pad_type_0 = const()[name = string("op_662_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_662_pad_0 = const()[name = string("op_662_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_662_cast_fp16 = conv(dilations = var_660, groups = var_505, pad = var_662_pad_0, pad_type = var_662_pad_type_0, strides = var_658, weight = blocks_2_attn_proj_weight_palettized_cast_fp16, x = input_17_cast_fp16)[name = string("op_662_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303804864)))];
-            tensor<fp16, [1, 4096, 1, 1]> attention_output_cast_fp16 = mul(x = var_644_cast_fp16, y = blocks_2_attn_proj_output_scales_to_fp16)[name = string("attention_output_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_39_cast_fp16 = add(x = attention_output_cast_fp16, y = x_29_cast_fp16)[name = string("x_39_cast_fp16")];
-            tensor<int32, [1]> var_663_axes_0 = const()[name = string("op_663_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_663_cast_fp16 = squeeze(axes = var_663_axes_0, x = x_39_cast_fp16)[name = string("op_663_cast_fp16")];
-            bool var_665_interleave_0 = const()[name = string("op_665_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_to_fp16 = const()[name = string("eps_chan_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_665_cast_fp16 = concat(axis = var_493, interleave = var_665_interleave_0, values = (var_663_cast_fp16, eps_chan_to_fp16))[name = string("op_665_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> attention_output_cast_fp16 = mul(x = var_662_cast_fp16, y = blocks_2_attn_proj_output_scales_to_fp16)[name = string("attention_output_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_39_cast_fp16 = add(x = attention_output_cast_fp16, y = x_29_cast_fp16)[name = string("x_39_cast_fp16")];
+            tensor<int32, [1]> var_681_axes_0 = const()[name = string("op_681_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_681_cast_fp16 = squeeze(axes = var_681_axes_0, x = x_39_cast_fp16)[name = string("op_681_cast_fp16")];
+            bool var_683_interleave_0 = const()[name = string("op_683_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_to_fp16 = const()[name = string("eps_chan_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_683_cast_fp16 = concat(axis = var_505, interleave = var_683_interleave_0, values = (var_681_cast_fp16, eps_chan_to_fp16))[name = string("op_683_cast_fp16")];
             tensor<int32, [1]> x_eps_axes_0 = const()[name = string("x_eps_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_cast_fp16 = expand_dims(axes = x_eps_axes_0, x = var_665_cast_fp16)[name = string("x_eps_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_cast_fp16 = expand_dims(axes = x_eps_axes_0, x = var_683_cast_fp16)[name = string("x_eps_cast_fp16")];
             tensor<int32, [1]> norm_x_axes_0 = const()[name = string("norm_x_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_cast_fp16 = reduce_l2_norm(axes = norm_x_axes_0, keep_dims = var_497, x = x_eps_cast_fp16)[name = string("norm_x_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_31_cast_fp16 = real_div(x = x_39_cast_fp16, y = norm_x_cast_fp16)[name = string("x_normed_31_cast_fp16")];
-            fp16 var_670_to_fp16 = const()[name = string("op_670_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_33_cast_fp16 = mul(x = x_normed_31_cast_fp16, y = var_670_to_fp16)[name = string("x_normed_33_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_cast_fp16 = reduce_l2_norm(axes = norm_x_axes_0, keep_dims = var_509, x = x_eps_cast_fp16)[name = string("norm_x_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_31_cast_fp16 = real_div(x = x_39_cast_fp16, y = norm_x_cast_fp16)[name = string("x_normed_31_cast_fp16")];
+            fp16 var_688_to_fp16 = const()[name = string("op_688_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_33_cast_fp16 = mul(x = x_normed_31_cast_fp16, y = var_688_to_fp16)[name = string("x_normed_33_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_norm_2_weight_to_fp16 = const()[name = string("blocks_2_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303813120)))];
-            tensor<fp16, [1, 4096, 1, 1]> input_19_cast_fp16 = mul(x = x_normed_33_cast_fp16, y = blocks_2_norm_2_weight_to_fp16)[name = string("input_19_cast_fp16")];
-            tensor<int32, [2]> var_682 = const()[name = string("op_682"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_684 = const()[name = string("op_684"), val = tensor<int32, [2]>([1, 1])];
-            string var_686_pad_type_0 = const()[name = string("op_686_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_686_pad_0 = const()[name = string("op_686_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_686_cast_fp16 = conv(dilations = var_684, groups = var_493, pad = var_686_pad_0, pad_type = var_686_pad_type_0, strides = var_682, weight = blocks_2_mlp_fc_1_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_686_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303821376)))];
-            tensor<fp16, [1, 11008, 1, 1]> input_21_cast_fp16 = mul(x = var_686_cast_fp16, y = blocks_2_mlp_fc_1_output_scales_to_fp16)[name = string("input_21_cast_fp16")];
-            tensor<int32, [2]> var_690 = const()[name = string("op_690"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_692 = const()[name = string("op_692"), val = tensor<int32, [2]>([1, 1])];
-            string var_694_pad_type_0 = const()[name = string("op_694_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_694_pad_0 = const()[name = string("op_694_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_694_cast_fp16 = conv(dilations = var_692, groups = var_493, pad = var_694_pad_0, pad_type = var_694_pad_type_0, strides = var_690, weight = blocks_2_mlp_fc_2_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_694_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303843456)))];
-            tensor<fp16, [1, 11008, 1, 1]> x_fc_2_cast_fp16 = mul(x = var_694_cast_fp16, y = blocks_2_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> var_696_cast_fp16 = silu(x = input_21_cast_fp16)[name = string("op_696_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> input_cast_fp16 = mul(x = var_696_cast_fp16, y = x_fc_2_cast_fp16)[name = string("input_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> input_19_cast_fp16 = mul(x = x_normed_33_cast_fp16, y = blocks_2_norm_2_weight_to_fp16)[name = string("input_19_cast_fp16")];
             tensor<int32, [2]> var_700 = const()[name = string("op_700"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_702 = const()[name = string("op_702"), val = tensor<int32, [2]>([1, 1])];
             string var_704_pad_type_0 = const()[name = string("op_704_pad_type_0"), val = string("custom")];
             tensor<int32, [4]> var_704_pad_0 = const()[name = string("op_704_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_704_cast_fp16 = conv(dilations = var_702, groups = var_493, pad = var_704_pad_0, pad_type = var_704_pad_type_0, strides = var_700, weight = blocks_2_mlp_proj_weight_palettized_cast_fp16, x = input_cast_fp16)[name = string("op_704_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> var_704_cast_fp16 = conv(dilations = var_702, groups = var_505, pad = var_704_pad_0, pad_type = var_704_pad_type_0, strides = var_700, weight = blocks_2_mlp_fc_1_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_704_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303821376)))];
+            tensor<fp16, [1, 11008, 1, 4]> input_21_cast_fp16 = mul(x = var_704_cast_fp16, y = blocks_2_mlp_fc_1_output_scales_to_fp16)[name = string("input_21_cast_fp16")];
+            tensor<int32, [2]> var_708 = const()[name = string("op_708"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_710 = const()[name = string("op_710"), val = tensor<int32, [2]>([1, 1])];
+            string var_712_pad_type_0 = const()[name = string("op_712_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_712_pad_0 = const()[name = string("op_712_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_712_cast_fp16 = conv(dilations = var_710, groups = var_505, pad = var_712_pad_0, pad_type = var_712_pad_type_0, strides = var_708, weight = blocks_2_mlp_fc_2_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_712_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303843456)))];
+            tensor<fp16, [1, 11008, 1, 4]> x_fc_2_cast_fp16 = mul(x = var_712_cast_fp16, y = blocks_2_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> var_714_cast_fp16 = silu(x = input_21_cast_fp16)[name = string("op_714_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> input_cast_fp16 = mul(x = var_714_cast_fp16, y = x_fc_2_cast_fp16)[name = string("input_cast_fp16")];
+            tensor<int32, [2]> var_718 = const()[name = string("op_718"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_720 = const()[name = string("op_720"), val = tensor<int32, [2]>([1, 1])];
+            string var_722_pad_type_0 = const()[name = string("op_722_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_722_pad_0 = const()[name = string("op_722_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_722_cast_fp16 = conv(dilations = var_720, groups = var_505, pad = var_722_pad_0, pad_type = var_722_pad_type_0, strides = var_718, weight = blocks_2_mlp_proj_weight_palettized_cast_fp16, x = input_cast_fp16)[name = string("op_722_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303865536)))];
-            tensor<fp16, [1, 4096, 1, 1]> var_705_cast_fp16 = mul(x = var_704_cast_fp16, y = blocks_2_mlp_proj_output_scales_to_fp16)[name = string("op_705_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> new_x = add(x = var_705_cast_fp16, y = x_39_cast_fp16)[name = string("op_706_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> var_723_cast_fp16 = mul(x = var_722_cast_fp16, y = blocks_2_mlp_proj_output_scales_to_fp16)[name = string("op_723_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> new_x = add(x = var_723_cast_fp16, y = x_39_cast_fp16)[name = string("op_724_cast_fp16")];
         } -> (new_x, new_k_cache_0, new_k_cache_1, new_k_cache_2, new_v_cache_0, new_v_cache_1, new_v_cache_2);
     func input_512_context_512<ios18>(tensor<fp16, [128, 512]> cos, tensor<fp16, [1, 1, 512, 512]> mask, tensor<fp16, [128, 512]> sin, tensor<fp16, [1, 4096, 1, 512]> x) {
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_q_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8388736))))[name = string("blocks_0_attn_q_proj_weight_palettized_cast_fp16")];
@@ -502,86 +514,86 @@ program(1.3)
             tensor<fp16, [1, 4096, 1, 512]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_54_to_fp16)[name = string("x_normed_3_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_1_weight_to_fp16 = const()[name = string("blocks_0_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303567936)))];
             tensor<fp16, [1, 4096, 1, 512]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = string("x_5_cast_fp16")];
-            tensor<int32, [2]> var_66 = const()[name = string("op_66"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_68 = const()[name = string("op_68"), val = tensor<int32, [2]>([1, 1])];
-            string var_70_pad_type_0 = const()[name = string("op_70_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_70_pad_0 = const()[name = string("op_70_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_70_cast_fp16 = conv(dilations = var_68, groups = var_25, pad = var_70_pad_0, pad_type = var_70_pad_type_0, strides = var_66, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_70_cast_fp16")];
+            tensor<int32, [2]> var_67 = const()[name = string("op_67"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_69 = const()[name = string("op_69"), val = tensor<int32, [2]>([1, 1])];
+            string var_71_pad_type_0 = const()[name = string("op_71_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_71_pad_0 = const()[name = string("op_71_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_71_cast_fp16 = conv(dilations = var_69, groups = var_25, pad = var_71_pad_0, pad_type = var_71_pad_type_0, strides = var_67, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_71_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303576192)))];
-            tensor<fp16, [1, 4096, 1, 512]> q_1_cast_fp16 = mul(x = var_70_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
-            tensor<int32, [2]> var_74 = const()[name = string("op_74"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_76 = const()[name = string("op_76"), val = tensor<int32, [2]>([1, 1])];
-            string var_78_pad_type_0 = const()[name = string("op_78_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_78_pad_0 = const()[name = string("op_78_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_78_cast_fp16 = conv(dilations = var_76, groups = var_25, pad = var_78_pad_0, pad_type = var_78_pad_type_0, strides = var_74, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_78_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> q_1_cast_fp16 = mul(x = var_71_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
+            tensor<int32, [2]> var_75 = const()[name = string("op_75"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_77 = const()[name = string("op_77"), val = tensor<int32, [2]>([1, 1])];
+            string var_79_pad_type_0 = const()[name = string("op_79_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_79_pad_0 = const()[name = string("op_79_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_79_cast_fp16 = conv(dilations = var_77, groups = var_25, pad = var_79_pad_0, pad_type = var_79_pad_type_0, strides = var_75, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_79_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303584448)))];
-            tensor<fp16, [1, 4096, 1, 512]> k_1_cast_fp16 = mul(x = var_78_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
-            tensor<int32, [2]> var_82 = const()[name = string("op_82"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_84 = const()[name = string("op_84"), val = tensor<int32, [2]>([1, 1])];
-            string var_86_pad_type_0 = const()[name = string("op_86_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_86_pad_0 = const()[name = string("op_86_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_86_cast_fp16 = conv(dilations = var_84, groups = var_25, pad = var_86_pad_0, pad_type = var_86_pad_type_0, strides = var_82, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_86_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> k_1_cast_fp16 = mul(x = var_79_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
+            tensor<int32, [2]> var_83 = const()[name = string("op_83"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_85 = const()[name = string("op_85"), val = tensor<int32, [2]>([1, 1])];
+            string var_87_pad_type_0 = const()[name = string("op_87_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_87_pad_0 = const()[name = string("op_87_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_87_cast_fp16 = conv(dilations = var_85, groups = var_25, pad = var_87_pad_0, pad_type = var_87_pad_type_0, strides = var_83, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_87_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303592704)))];
-            tensor<fp16, [1, 4096, 1, 512]> v_1_cast_fp16 = mul(x = var_86_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
-            tensor<int32, [4]> var_88 = const()[name = string("op_88"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> q_3_cast_fp16 = reshape(shape = var_88, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
-            tensor<int32, [4]> var_90 = const()[name = string("op_90"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> k_3_cast_fp16 = reshape(shape = var_90, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
-            tensor<int32, [4]> var_92 = const()[name = string("op_92"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> v_3_cast_fp16 = reshape(shape = var_92, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
-            tensor<int32, [4]> var_104_begin_0 = const()[name = string("op_104_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_104_end_0 = const()[name = string("op_104_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_104_end_mask_0 = const()[name = string("op_104_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_104_cast_fp16 = slice_by_index(begin = var_104_begin_0, end = var_104_end_0, end_mask = var_104_end_mask_0, x = q_3_cast_fp16)[name = string("op_104_cast_fp16")];
-            tensor<int32, [4]> var_110_begin_0 = const()[name = string("op_110_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_110_end_0 = const()[name = string("op_110_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_110_end_mask_0 = const()[name = string("op_110_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_110_cast_fp16 = slice_by_index(begin = var_110_begin_0, end = var_110_end_0, end_mask = var_110_end_mask_0, x = q_3_cast_fp16)[name = string("op_110_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> v_1_cast_fp16 = mul(x = var_87_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
+            tensor<int32, [4]> var_89 = const()[name = string("op_89"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> q_3_cast_fp16 = reshape(shape = var_89, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
+            tensor<int32, [4]> var_91 = const()[name = string("op_91"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> k_3_cast_fp16 = reshape(shape = var_91, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
+            tensor<int32, [4]> var_93 = const()[name = string("op_93"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> v_3_cast_fp16 = reshape(shape = var_93, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
+            tensor<int32, [4]> var_105_begin_0 = const()[name = string("op_105_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_105_end_0 = const()[name = string("op_105_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_105_end_mask_0 = const()[name = string("op_105_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_105_cast_fp16 = slice_by_index(begin = var_105_begin_0, end = var_105_end_0, end_mask = var_105_end_mask_0, x = q_3_cast_fp16)[name = string("op_105_cast_fp16")];
+            tensor<int32, [4]> var_111_begin_0 = const()[name = string("op_111_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_111_end_0 = const()[name = string("op_111_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_111_end_mask_0 = const()[name = string("op_111_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_111_cast_fp16 = slice_by_index(begin = var_111_begin_0, end = var_111_end_0, end_mask = var_111_end_mask_0, x = q_3_cast_fp16)[name = string("op_111_cast_fp16")];
             fp16 const_6_promoted_to_fp16 = const()[name = string("const_6_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_112_cast_fp16 = mul(x = var_110_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_112_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_113_cast_fp16 = mul(x = var_111_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_113_cast_fp16")];
             bool rotated_1_interleave_0 = const()[name = string("rotated_1_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_1_cast_fp16 = concat(axis = var_28, interleave = rotated_1_interleave_0, values = (var_112_cast_fp16, var_104_cast_fp16))[name = string("rotated_1_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_115_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_115_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_116_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_116_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_1_cast_fp16 = add(x = var_115_cast_fp16, y = var_116_cast_fp16)[name = string("roped_1_cast_fp16")];
-            tensor<int32, [4]> var_129_begin_0 = const()[name = string("op_129_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_129_end_0 = const()[name = string("op_129_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_129_end_mask_0 = const()[name = string("op_129_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_129_cast_fp16 = slice_by_index(begin = var_129_begin_0, end = var_129_end_0, end_mask = var_129_end_mask_0, x = k_3_cast_fp16)[name = string("op_129_cast_fp16")];
-            tensor<int32, [4]> var_135_begin_0 = const()[name = string("op_135_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_135_end_0 = const()[name = string("op_135_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_135_end_mask_0 = const()[name = string("op_135_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_135_cast_fp16 = slice_by_index(begin = var_135_begin_0, end = var_135_end_0, end_mask = var_135_end_mask_0, x = k_3_cast_fp16)[name = string("op_135_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_1_cast_fp16 = concat(axis = var_28, interleave = rotated_1_interleave_0, values = (var_113_cast_fp16, var_105_cast_fp16))[name = string("rotated_1_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_116_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_116_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_117_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_117_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_1_cast_fp16 = add(x = var_116_cast_fp16, y = var_117_cast_fp16)[name = string("roped_1_cast_fp16")];
+            tensor<int32, [4]> var_130_begin_0 = const()[name = string("op_130_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_130_end_0 = const()[name = string("op_130_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_130_end_mask_0 = const()[name = string("op_130_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_130_cast_fp16 = slice_by_index(begin = var_130_begin_0, end = var_130_end_0, end_mask = var_130_end_mask_0, x = k_3_cast_fp16)[name = string("op_130_cast_fp16")];
+            tensor<int32, [4]> var_136_begin_0 = const()[name = string("op_136_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_136_end_0 = const()[name = string("op_136_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_136_end_mask_0 = const()[name = string("op_136_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_136_cast_fp16 = slice_by_index(begin = var_136_begin_0, end = var_136_end_0, end_mask = var_136_end_mask_0, x = k_3_cast_fp16)[name = string("op_136_cast_fp16")];
             fp16 const_8_promoted_to_fp16 = const()[name = string("const_8_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_137_cast_fp16 = mul(x = var_135_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_137_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_138_cast_fp16 = mul(x = var_136_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_138_cast_fp16")];
             bool rotated_3_interleave_0 = const()[name = string("rotated_3_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_3_cast_fp16 = concat(axis = var_28, interleave = rotated_3_interleave_0, values = (var_137_cast_fp16, var_129_cast_fp16))[name = string("rotated_3_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_140_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_140_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_141_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_141_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_3_cast_fp16 = add(x = var_140_cast_fp16, y = var_141_cast_fp16)[name = string("roped_3_cast_fp16")];
-            bool q_5_interleave_0 = const()[name = string("q_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> q_5_cast_fp16 = concat(axis = var_28, interleave = q_5_interleave_0, values = roped_1_cast_fp16)[name = string("q_5_cast_fp16")];
-            bool k_5_interleave_0 = const()[name = string("k_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_5_cast_fp16 = concat(axis = var_28, interleave = k_5_interleave_0, values = roped_3_cast_fp16)[name = string("k_5_cast_fp16")];
-            tensor<int32, [4]> var_156_begin_0 = const()[name = string("op_156_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_156_end_0 = const()[name = string("op_156_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_156_end_mask_0 = const()[name = string("op_156_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_0 = slice_by_index(begin = var_156_begin_0, end = var_156_end_0, end_mask = var_156_end_mask_0, x = k_5_cast_fp16)[name = string("op_156_cast_fp16")];
-            tensor<int32, [4]> var_157_begin_0 = const()[name = string("op_157_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_157_end_0 = const()[name = string("op_157_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_157_end_mask_0 = const()[name = string("op_157_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_0 = slice_by_index(begin = var_157_begin_0, end = var_157_end_0, end_mask = var_157_end_mask_0, x = v_3_cast_fp16)[name = string("op_157_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_3_cast_fp16 = concat(axis = var_28, interleave = rotated_3_interleave_0, values = (var_138_cast_fp16, var_130_cast_fp16))[name = string("rotated_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_141_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_141_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_142_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_142_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_3_cast_fp16 = add(x = var_141_cast_fp16, y = var_142_cast_fp16)[name = string("roped_3_cast_fp16")];
+            tensor<int32, [4]> v_5_perm_0 = const()[name = string("v_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_146_begin_0 = const()[name = string("op_146_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_146_end_0 = const()[name = string("op_146_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_146_end_mask_0 = const()[name = string("op_146_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_0 = slice_by_index(begin = var_146_begin_0, end = var_146_end_0, end_mask = var_146_end_mask_0, x = roped_3_cast_fp16)[name = string("op_146_cast_fp16")];
+            tensor<int32, [4]> var_147_begin_0 = const()[name = string("op_147_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_147_end_0 = const()[name = string("op_147_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_147_end_mask_0 = const()[name = string("op_147_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 512, 128]> v_5_cast_fp16 = transpose(perm = v_5_perm_0, x = v_3_cast_fp16)[name = string("transpose_5")];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_0 = slice_by_index(begin = var_147_begin_0, end = var_147_end_0, end_mask = var_147_end_mask_0, x = v_5_cast_fp16)[name = string("op_147_cast_fp16")];
             fp16 var_161_to_fp16 = const()[name = string("op_161_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 512]> var_162_cast_fp16 = mul(x = q_5_cast_fp16, y = var_161_to_fp16)[name = string("op_162_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_162_cast_fp16 = mul(x = roped_1_cast_fp16, y = var_161_to_fp16)[name = string("op_162_cast_fp16")];
             bool attn_weights_1_transpose_x_0 = const()[name = string("attn_weights_1_transpose_x_0"), val = bool(true)];
             bool attn_weights_1_transpose_y_0 = const()[name = string("attn_weights_1_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_162_cast_fp16, y = k_5_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_162_cast_fp16, y = roped_3_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
             tensor<fp16, [1, 32, 512, 512]> attn_weights_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = mask)[name = string("attn_weights_3_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> var_170_cast_fp16 = softmax(axis = var_24, x = attn_weights_3_cast_fp16)[name = string("op_170_cast_fp16")];
-            bool attn_1_transpose_x_0 = const()[name = string("attn_1_transpose_x_0"), val = bool(false)];
-            bool attn_1_transpose_y_0 = const()[name = string("attn_1_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 512]> attn_1_cast_fp16 = matmul(transpose_x = attn_1_transpose_x_0, transpose_y = attn_1_transpose_y_0, x = v_3_cast_fp16, y = var_170_cast_fp16)[name = string("attn_1_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_5_cast_fp16 = softmax(axis = var_24, x = attn_weights_3_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
+            bool var_171_transpose_x_1 = const()[name = string("op_171_transpose_x_1"), val = bool(false)];
+            bool var_171_transpose_y_1 = const()[name = string("op_171_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 32, 512, 128]> var_171_cast_fp16 = matmul(transpose_x = var_171_transpose_x_1, transpose_y = var_171_transpose_y_1, x = attn_weights_5_cast_fp16, y = v_3_cast_fp16)[name = string("op_171_cast_fp16")];
+            tensor<int32, [4]> attn_1_perm_0 = const()[name = string("attn_1_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             tensor<int32, [4]> var_174 = const()[name = string("op_174"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 512]> attn_1_cast_fp16 = transpose(perm = attn_1_perm_0, x = var_171_cast_fp16)[name = string("transpose_4")];
             tensor<fp16, [1, 4096, 1, 512]> input_1_cast_fp16 = reshape(shape = var_174, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
             tensor<int32, [2]> var_178 = const()[name = string("op_178"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_180 = const()[name = string("op_180"), val = tensor<int32, [2]>([1, 1])];
@@ -645,86 +657,86 @@ program(1.3)
             tensor<fp16, [1, 4096, 1, 512]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_291_to_fp16)[name = string("x_normed_15_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_1_weight_to_fp16 = const()[name = string("blocks_1_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303669888)))];
             tensor<fp16, [1, 4096, 1, 512]> x_19_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = string("x_19_cast_fp16")];
-            tensor<int32, [2]> var_306 = const()[name = string("op_306"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_308 = const()[name = string("op_308"), val = tensor<int32, [2]>([1, 1])];
-            string var_310_pad_type_0 = const()[name = string("op_310_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_310_pad_0 = const()[name = string("op_310_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_310_cast_fp16 = conv(dilations = var_308, groups = var_263, pad = var_310_pad_0, pad_type = var_310_pad_type_0, strides = var_306, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_310_cast_fp16")];
+            tensor<int32, [2]> var_307 = const()[name = string("op_307"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_309 = const()[name = string("op_309"), val = tensor<int32, [2]>([1, 1])];
+            string var_311_pad_type_0 = const()[name = string("op_311_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_311_pad_0 = const()[name = string("op_311_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_311_cast_fp16 = conv(dilations = var_309, groups = var_263, pad = var_311_pad_0, pad_type = var_311_pad_type_0, strides = var_307, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_311_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303678144)))];
-            tensor<fp16, [1, 4096, 1, 512]> q_7_cast_fp16 = mul(x = var_310_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
-            tensor<int32, [2]> var_314 = const()[name = string("op_314"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_316 = const()[name = string("op_316"), val = tensor<int32, [2]>([1, 1])];
-            string var_318_pad_type_0 = const()[name = string("op_318_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_318_pad_0 = const()[name = string("op_318_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_318_cast_fp16 = conv(dilations = var_316, groups = var_263, pad = var_318_pad_0, pad_type = var_318_pad_type_0, strides = var_314, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_318_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> q_7_cast_fp16 = mul(x = var_311_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
+            tensor<int32, [2]> var_315 = const()[name = string("op_315"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_317 = const()[name = string("op_317"), val = tensor<int32, [2]>([1, 1])];
+            string var_319_pad_type_0 = const()[name = string("op_319_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_319_pad_0 = const()[name = string("op_319_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_319_cast_fp16 = conv(dilations = var_317, groups = var_263, pad = var_319_pad_0, pad_type = var_319_pad_type_0, strides = var_315, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_319_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303686400)))];
-            tensor<fp16, [1, 4096, 1, 512]> k_7_cast_fp16 = mul(x = var_318_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_7_cast_fp16")];
-            tensor<int32, [2]> var_322 = const()[name = string("op_322"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_324 = const()[name = string("op_324"), val = tensor<int32, [2]>([1, 1])];
-            string var_326_pad_type_0 = const()[name = string("op_326_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_326_pad_0 = const()[name = string("op_326_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_326_cast_fp16 = conv(dilations = var_324, groups = var_263, pad = var_326_pad_0, pad_type = var_326_pad_type_0, strides = var_322, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_326_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> k_7_cast_fp16 = mul(x = var_319_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_7_cast_fp16")];
+            tensor<int32, [2]> var_323 = const()[name = string("op_323"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_325 = const()[name = string("op_325"), val = tensor<int32, [2]>([1, 1])];
+            string var_327_pad_type_0 = const()[name = string("op_327_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_327_pad_0 = const()[name = string("op_327_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_327_cast_fp16 = conv(dilations = var_325, groups = var_263, pad = var_327_pad_0, pad_type = var_327_pad_type_0, strides = var_323, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_327_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303694656)))];
-            tensor<fp16, [1, 4096, 1, 512]> v_5_cast_fp16 = mul(x = var_326_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_5_cast_fp16")];
-            tensor<int32, [4]> var_328 = const()[name = string("op_328"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> q_9_cast_fp16 = reshape(shape = var_328, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
-            tensor<int32, [4]> var_330 = const()[name = string("op_330"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> k_9_cast_fp16 = reshape(shape = var_330, x = k_7_cast_fp16)[name = string("k_9_cast_fp16")];
-            tensor<int32, [4]> var_332 = const()[name = string("op_332"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> v_7_cast_fp16 = reshape(shape = var_332, x = v_5_cast_fp16)[name = string("v_7_cast_fp16")];
-            tensor<int32, [4]> var_344_begin_0 = const()[name = string("op_344_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_344_end_0 = const()[name = string("op_344_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_344_end_mask_0 = const()[name = string("op_344_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_344_cast_fp16 = slice_by_index(begin = var_344_begin_0, end = var_344_end_0, end_mask = var_344_end_mask_0, x = q_9_cast_fp16)[name = string("op_344_cast_fp16")];
-            tensor<int32, [4]> var_350_begin_0 = const()[name = string("op_350_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_350_end_0 = const()[name = string("op_350_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_350_end_mask_0 = const()[name = string("op_350_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_350_cast_fp16 = slice_by_index(begin = var_350_begin_0, end = var_350_end_0, end_mask = var_350_end_mask_0, x = q_9_cast_fp16)[name = string("op_350_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> v_7_cast_fp16 = mul(x = var_327_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_7_cast_fp16")];
+            tensor<int32, [4]> var_329 = const()[name = string("op_329"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> q_9_cast_fp16 = reshape(shape = var_329, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
+            tensor<int32, [4]> var_331 = const()[name = string("op_331"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> k_9_cast_fp16 = reshape(shape = var_331, x = k_7_cast_fp16)[name = string("k_9_cast_fp16")];
+            tensor<int32, [4]> var_333 = const()[name = string("op_333"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> v_9_cast_fp16 = reshape(shape = var_333, x = v_7_cast_fp16)[name = string("v_9_cast_fp16")];
+            tensor<int32, [4]> var_345_begin_0 = const()[name = string("op_345_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_345_end_0 = const()[name = string("op_345_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_345_end_mask_0 = const()[name = string("op_345_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_345_cast_fp16 = slice_by_index(begin = var_345_begin_0, end = var_345_end_0, end_mask = var_345_end_mask_0, x = q_9_cast_fp16)[name = string("op_345_cast_fp16")];
+            tensor<int32, [4]> var_351_begin_0 = const()[name = string("op_351_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_351_end_0 = const()[name = string("op_351_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_351_end_mask_0 = const()[name = string("op_351_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_351_cast_fp16 = slice_by_index(begin = var_351_begin_0, end = var_351_end_0, end_mask = var_351_end_mask_0, x = q_9_cast_fp16)[name = string("op_351_cast_fp16")];
             fp16 const_19_promoted_to_fp16 = const()[name = string("const_19_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_352_cast_fp16 = mul(x = var_350_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_352_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_353_cast_fp16 = mul(x = var_351_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_353_cast_fp16")];
             bool rotated_5_interleave_0 = const()[name = string("rotated_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_5_cast_fp16 = concat(axis = var_266, interleave = rotated_5_interleave_0, values = (var_352_cast_fp16, var_344_cast_fp16))[name = string("rotated_5_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_355_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_355_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_356_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_356_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_5_cast_fp16 = add(x = var_355_cast_fp16, y = var_356_cast_fp16)[name = string("roped_5_cast_fp16")];
-            tensor<int32, [4]> var_369_begin_0 = const()[name = string("op_369_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_369_end_0 = const()[name = string("op_369_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_369_end_mask_0 = const()[name = string("op_369_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_369_cast_fp16 = slice_by_index(begin = var_369_begin_0, end = var_369_end_0, end_mask = var_369_end_mask_0, x = k_9_cast_fp16)[name = string("op_369_cast_fp16")];
-            tensor<int32, [4]> var_375_begin_0 = const()[name = string("op_375_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_375_end_0 = const()[name = string("op_375_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_375_end_mask_0 = const()[name = string("op_375_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_375_cast_fp16 = slice_by_index(begin = var_375_begin_0, end = var_375_end_0, end_mask = var_375_end_mask_0, x = k_9_cast_fp16)[name = string("op_375_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_5_cast_fp16 = concat(axis = var_266, interleave = rotated_5_interleave_0, values = (var_353_cast_fp16, var_345_cast_fp16))[name = string("rotated_5_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_356_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_356_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_357_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_357_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_5_cast_fp16 = add(x = var_356_cast_fp16, y = var_357_cast_fp16)[name = string("roped_5_cast_fp16")];
+            tensor<int32, [4]> var_370_begin_0 = const()[name = string("op_370_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_370_end_0 = const()[name = string("op_370_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_370_end_mask_0 = const()[name = string("op_370_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_370_cast_fp16 = slice_by_index(begin = var_370_begin_0, end = var_370_end_0, end_mask = var_370_end_mask_0, x = k_9_cast_fp16)[name = string("op_370_cast_fp16")];
+            tensor<int32, [4]> var_376_begin_0 = const()[name = string("op_376_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_376_end_0 = const()[name = string("op_376_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_376_end_mask_0 = const()[name = string("op_376_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_376_cast_fp16 = slice_by_index(begin = var_376_begin_0, end = var_376_end_0, end_mask = var_376_end_mask_0, x = k_9_cast_fp16)[name = string("op_376_cast_fp16")];
             fp16 const_21_promoted_to_fp16 = const()[name = string("const_21_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_377_cast_fp16 = mul(x = var_375_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_377_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_378_cast_fp16 = mul(x = var_376_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_378_cast_fp16")];
             bool rotated_7_interleave_0 = const()[name = string("rotated_7_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_7_cast_fp16 = concat(axis = var_266, interleave = rotated_7_interleave_0, values = (var_377_cast_fp16, var_369_cast_fp16))[name = string("rotated_7_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_380_cast_fp16 = mul(x = k_9_cast_fp16, y = cos)[name = string("op_380_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_381_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = string("op_381_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_7_cast_fp16 = add(x = var_380_cast_fp16, y = var_381_cast_fp16)[name = string("roped_7_cast_fp16")];
-            bool q_11_interleave_0 = const()[name = string("q_11_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> q_11_cast_fp16 = concat(axis = var_266, interleave = q_11_interleave_0, values = roped_5_cast_fp16)[name = string("q_11_cast_fp16")];
-            bool k_11_interleave_0 = const()[name = string("k_11_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_11_cast_fp16 = concat(axis = var_266, interleave = k_11_interleave_0, values = roped_7_cast_fp16)[name = string("k_11_cast_fp16")];
-            tensor<int32, [4]> var_396_begin_0 = const()[name = string("op_396_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_396_end_0 = const()[name = string("op_396_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_396_end_mask_0 = const()[name = string("op_396_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_1 = slice_by_index(begin = var_396_begin_0, end = var_396_end_0, end_mask = var_396_end_mask_0, x = k_11_cast_fp16)[name = string("op_396_cast_fp16")];
-            tensor<int32, [4]> var_397_begin_0 = const()[name = string("op_397_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_397_end_0 = const()[name = string("op_397_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_397_end_mask_0 = const()[name = string("op_397_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_1 = slice_by_index(begin = var_397_begin_0, end = var_397_end_0, end_mask = var_397_end_mask_0, x = v_7_cast_fp16)[name = string("op_397_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_7_cast_fp16 = concat(axis = var_266, interleave = rotated_7_interleave_0, values = (var_378_cast_fp16, var_370_cast_fp16))[name = string("rotated_7_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_381_cast_fp16 = mul(x = k_9_cast_fp16, y = cos)[name = string("op_381_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_382_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = string("op_382_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_7_cast_fp16 = add(x = var_381_cast_fp16, y = var_382_cast_fp16)[name = string("roped_7_cast_fp16")];
+            tensor<int32, [4]> v_11_perm_0 = const()[name = string("v_11_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_386_begin_0 = const()[name = string("op_386_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_386_end_0 = const()[name = string("op_386_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_386_end_mask_0 = const()[name = string("op_386_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_1 = slice_by_index(begin = var_386_begin_0, end = var_386_end_0, end_mask = var_386_end_mask_0, x = roped_7_cast_fp16)[name = string("op_386_cast_fp16")];
+            tensor<int32, [4]> var_387_begin_0 = const()[name = string("op_387_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_387_end_0 = const()[name = string("op_387_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_387_end_mask_0 = const()[name = string("op_387_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 512, 128]> v_11_cast_fp16 = transpose(perm = v_11_perm_0, x = v_9_cast_fp16)[name = string("transpose_3")];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_1 = slice_by_index(begin = var_387_begin_0, end = var_387_end_0, end_mask = var_387_end_mask_0, x = v_11_cast_fp16)[name = string("op_387_cast_fp16")];
             fp16 var_401_to_fp16 = const()[name = string("op_401_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 512]> var_402_cast_fp16 = mul(x = q_11_cast_fp16, y = var_401_to_fp16)[name = string("op_402_cast_fp16")];
-            bool attn_weights_5_transpose_x_0 = const()[name = string("attn_weights_5_transpose_x_0"), val = bool(true)];
-            bool attn_weights_5_transpose_y_0 = const()[name = string("attn_weights_5_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_5_cast_fp16 = matmul(transpose_x = attn_weights_5_transpose_x_0, transpose_y = attn_weights_5_transpose_y_0, x = var_402_cast_fp16, y = k_11_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_7_cast_fp16 = add(x = attn_weights_5_cast_fp16, y = mask)[name = string("attn_weights_7_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> var_410_cast_fp16 = softmax(axis = var_262, x = attn_weights_7_cast_fp16)[name = string("op_410_cast_fp16")];
-            bool attn_3_transpose_x_0 = const()[name = string("attn_3_transpose_x_0"), val = bool(false)];
-            bool attn_3_transpose_y_0 = const()[name = string("attn_3_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 512]> attn_3_cast_fp16 = matmul(transpose_x = attn_3_transpose_x_0, transpose_y = attn_3_transpose_y_0, x = v_7_cast_fp16, y = var_410_cast_fp16)[name = string("attn_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_402_cast_fp16 = mul(x = roped_5_cast_fp16, y = var_401_to_fp16)[name = string("op_402_cast_fp16")];
+            bool attn_weights_7_transpose_x_0 = const()[name = string("attn_weights_7_transpose_x_0"), val = bool(true)];
+            bool attn_weights_7_transpose_y_0 = const()[name = string("attn_weights_7_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_7_cast_fp16 = matmul(transpose_x = attn_weights_7_transpose_x_0, transpose_y = attn_weights_7_transpose_y_0, x = var_402_cast_fp16, y = roped_7_cast_fp16)[name = string("attn_weights_7_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_9_cast_fp16 = add(x = attn_weights_7_cast_fp16, y = mask)[name = string("attn_weights_9_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_11_cast_fp16 = softmax(axis = var_262, x = attn_weights_9_cast_fp16)[name = string("attn_weights_11_cast_fp16")];
+            bool var_411_transpose_x_1 = const()[name = string("op_411_transpose_x_1"), val = bool(false)];
+            bool var_411_transpose_y_1 = const()[name = string("op_411_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 32, 512, 128]> var_411_cast_fp16 = matmul(transpose_x = var_411_transpose_x_1, transpose_y = var_411_transpose_y_1, x = attn_weights_11_cast_fp16, y = v_9_cast_fp16)[name = string("op_411_cast_fp16")];
+            tensor<int32, [4]> attn_3_perm_0 = const()[name = string("attn_3_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             tensor<int32, [4]> var_414 = const()[name = string("op_414"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 512]> attn_3_cast_fp16 = transpose(perm = attn_3_perm_0, x = var_411_cast_fp16)[name = string("transpose_2")];
             tensor<fp16, [1, 4096, 1, 512]> input_9_cast_fp16 = reshape(shape = var_414, x = attn_3_cast_fp16)[name = string("input_9_cast_fp16")];
             tensor<int32, [2]> var_418 = const()[name = string("op_418"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_420 = const()[name = string("op_420"), val = tensor<int32, [2]>([1, 1])];
@@ -788,86 +800,86 @@ program(1.3)
             tensor<fp16, [1, 4096, 1, 512]> x_normed_27_cast_fp16 = mul(x = x_normed_25_cast_fp16, y = var_531_to_fp16)[name = string("x_normed_27_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_norm_1_weight_to_fp16 = const()[name = string("blocks_2_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303771840)))];
             tensor<fp16, [1, 4096, 1, 512]> x_33_cast_fp16 = mul(x = x_normed_27_cast_fp16, y = blocks_2_norm_1_weight_to_fp16)[name = string("x_33_cast_fp16")];
-            tensor<int32, [2]> var_546 = const()[name = string("op_546"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_548 = const()[name = string("op_548"), val = tensor<int32, [2]>([1, 1])];
-            string var_550_pad_type_0 = const()[name = string("op_550_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_550_pad_0 = const()[name = string("op_550_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_550_cast_fp16 = conv(dilations = var_548, groups = var_503, pad = var_550_pad_0, pad_type = var_550_pad_type_0, strides = var_546, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_550_cast_fp16")];
+            tensor<int32, [2]> var_547 = const()[name = string("op_547"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_549 = const()[name = string("op_549"), val = tensor<int32, [2]>([1, 1])];
+            string var_551_pad_type_0 = const()[name = string("op_551_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_551_pad_0 = const()[name = string("op_551_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_551_cast_fp16 = conv(dilations = var_549, groups = var_503, pad = var_551_pad_0, pad_type = var_551_pad_type_0, strides = var_547, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_551_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303780096)))];
-            tensor<fp16, [1, 4096, 1, 512]> q_13_cast_fp16 = mul(x = var_550_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = string("q_13_cast_fp16")];
-            tensor<int32, [2]> var_554 = const()[name = string("op_554"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_556 = const()[name = string("op_556"), val = tensor<int32, [2]>([1, 1])];
-            string var_558_pad_type_0 = const()[name = string("op_558_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_558_pad_0 = const()[name = string("op_558_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_558_cast_fp16 = conv(dilations = var_556, groups = var_503, pad = var_558_pad_0, pad_type = var_558_pad_type_0, strides = var_554, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_558_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> q_13_cast_fp16 = mul(x = var_551_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = string("q_13_cast_fp16")];
+            tensor<int32, [2]> var_555 = const()[name = string("op_555"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_557 = const()[name = string("op_557"), val = tensor<int32, [2]>([1, 1])];
+            string var_559_pad_type_0 = const()[name = string("op_559_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_559_pad_0 = const()[name = string("op_559_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_559_cast_fp16 = conv(dilations = var_557, groups = var_503, pad = var_559_pad_0, pad_type = var_559_pad_type_0, strides = var_555, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_559_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303788352)))];
-            tensor<fp16, [1, 4096, 1, 512]> k_13_cast_fp16 = mul(x = var_558_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = string("k_13_cast_fp16")];
-            tensor<int32, [2]> var_562 = const()[name = string("op_562"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_564 = const()[name = string("op_564"), val = tensor<int32, [2]>([1, 1])];
-            string var_566_pad_type_0 = const()[name = string("op_566_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_566_pad_0 = const()[name = string("op_566_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_566_cast_fp16 = conv(dilations = var_564, groups = var_503, pad = var_566_pad_0, pad_type = var_566_pad_type_0, strides = var_562, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_566_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> k_13_cast_fp16 = mul(x = var_559_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = string("k_13_cast_fp16")];
+            tensor<int32, [2]> var_563 = const()[name = string("op_563"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_565 = const()[name = string("op_565"), val = tensor<int32, [2]>([1, 1])];
+            string var_567_pad_type_0 = const()[name = string("op_567_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_567_pad_0 = const()[name = string("op_567_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_567_cast_fp16 = conv(dilations = var_565, groups = var_503, pad = var_567_pad_0, pad_type = var_567_pad_type_0, strides = var_563, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_567_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303796608)))];
-            tensor<fp16, [1, 4096, 1, 512]> v_9_cast_fp16 = mul(x = var_566_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = string("v_9_cast_fp16")];
-            tensor<int32, [4]> var_568 = const()[name = string("op_568"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> q_15_cast_fp16 = reshape(shape = var_568, x = q_13_cast_fp16)[name = string("q_15_cast_fp16")];
-            tensor<int32, [4]> var_570 = const()[name = string("op_570"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = reshape(shape = var_570, x = k_13_cast_fp16)[name = string("k_15_cast_fp16")];
-            tensor<int32, [4]> var_572 = const()[name = string("op_572"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> v_cast_fp16 = reshape(shape = var_572, x = v_9_cast_fp16)[name = string("v_cast_fp16")];
-            tensor<int32, [4]> var_584_begin_0 = const()[name = string("op_584_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_584_end_0 = const()[name = string("op_584_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_584_end_mask_0 = const()[name = string("op_584_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_584_cast_fp16 = slice_by_index(begin = var_584_begin_0, end = var_584_end_0, end_mask = var_584_end_mask_0, x = q_15_cast_fp16)[name = string("op_584_cast_fp16")];
-            tensor<int32, [4]> var_590_begin_0 = const()[name = string("op_590_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_590_end_0 = const()[name = string("op_590_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_590_end_mask_0 = const()[name = string("op_590_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_590_cast_fp16 = slice_by_index(begin = var_590_begin_0, end = var_590_end_0, end_mask = var_590_end_mask_0, x = q_15_cast_fp16)[name = string("op_590_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> v_13_cast_fp16 = mul(x = var_567_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = string("v_13_cast_fp16")];
+            tensor<int32, [4]> var_569 = const()[name = string("op_569"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> q_15_cast_fp16 = reshape(shape = var_569, x = q_13_cast_fp16)[name = string("q_15_cast_fp16")];
+            tensor<int32, [4]> var_571 = const()[name = string("op_571"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = reshape(shape = var_571, x = k_13_cast_fp16)[name = string("k_15_cast_fp16")];
+            tensor<int32, [4]> var_573 = const()[name = string("op_573"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> v_15_cast_fp16 = reshape(shape = var_573, x = v_13_cast_fp16)[name = string("v_15_cast_fp16")];
+            tensor<int32, [4]> var_585_begin_0 = const()[name = string("op_585_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_585_end_0 = const()[name = string("op_585_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_585_end_mask_0 = const()[name = string("op_585_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_585_cast_fp16 = slice_by_index(begin = var_585_begin_0, end = var_585_end_0, end_mask = var_585_end_mask_0, x = q_15_cast_fp16)[name = string("op_585_cast_fp16")];
+            tensor<int32, [4]> var_591_begin_0 = const()[name = string("op_591_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_591_end_0 = const()[name = string("op_591_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_591_end_mask_0 = const()[name = string("op_591_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_591_cast_fp16 = slice_by_index(begin = var_591_begin_0, end = var_591_end_0, end_mask = var_591_end_mask_0, x = q_15_cast_fp16)[name = string("op_591_cast_fp16")];
             fp16 const_32_promoted_to_fp16 = const()[name = string("const_32_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_592_cast_fp16 = mul(x = var_590_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_592_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_593_cast_fp16 = mul(x = var_591_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_593_cast_fp16")];
             bool rotated_9_interleave_0 = const()[name = string("rotated_9_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_9_cast_fp16 = concat(axis = var_506, interleave = rotated_9_interleave_0, values = (var_592_cast_fp16, var_584_cast_fp16))[name = string("rotated_9_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_595_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = string("op_595_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_596_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = string("op_596_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_9_cast_fp16 = add(x = var_595_cast_fp16, y = var_596_cast_fp16)[name = string("roped_9_cast_fp16")];
-            tensor<int32, [4]> var_609_begin_0 = const()[name = string("op_609_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_609_end_0 = const()[name = string("op_609_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_609_end_mask_0 = const()[name = string("op_609_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_609_cast_fp16 = slice_by_index(begin = var_609_begin_0, end = var_609_end_0, end_mask = var_609_end_mask_0, x = k_15_cast_fp16)[name = string("op_609_cast_fp16")];
-            tensor<int32, [4]> var_615_begin_0 = const()[name = string("op_615_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_615_end_0 = const()[name = string("op_615_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_615_end_mask_0 = const()[name = string("op_615_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_615_cast_fp16 = slice_by_index(begin = var_615_begin_0, end = var_615_end_0, end_mask = var_615_end_mask_0, x = k_15_cast_fp16)[name = string("op_615_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_9_cast_fp16 = concat(axis = var_506, interleave = rotated_9_interleave_0, values = (var_593_cast_fp16, var_585_cast_fp16))[name = string("rotated_9_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_596_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = string("op_596_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_597_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = string("op_597_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_9_cast_fp16 = add(x = var_596_cast_fp16, y = var_597_cast_fp16)[name = string("roped_9_cast_fp16")];
+            tensor<int32, [4]> var_610_begin_0 = const()[name = string("op_610_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_610_end_0 = const()[name = string("op_610_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_610_end_mask_0 = const()[name = string("op_610_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_610_cast_fp16 = slice_by_index(begin = var_610_begin_0, end = var_610_end_0, end_mask = var_610_end_mask_0, x = k_15_cast_fp16)[name = string("op_610_cast_fp16")];
+            tensor<int32, [4]> var_616_begin_0 = const()[name = string("op_616_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_616_end_0 = const()[name = string("op_616_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_616_end_mask_0 = const()[name = string("op_616_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_616_cast_fp16 = slice_by_index(begin = var_616_begin_0, end = var_616_end_0, end_mask = var_616_end_mask_0, x = k_15_cast_fp16)[name = string("op_616_cast_fp16")];
             fp16 const_34_promoted_to_fp16 = const()[name = string("const_34_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_617_cast_fp16 = mul(x = var_615_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_617_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_618_cast_fp16 = mul(x = var_616_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_618_cast_fp16")];
             bool rotated_interleave_0 = const()[name = string("rotated_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_cast_fp16 = concat(axis = var_506, interleave = rotated_interleave_0, values = (var_617_cast_fp16, var_609_cast_fp16))[name = string("rotated_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_620_cast_fp16 = mul(x = k_15_cast_fp16, y = cos)[name = string("op_620_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_621_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_621_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_cast_fp16 = add(x = var_620_cast_fp16, y = var_621_cast_fp16)[name = string("roped_cast_fp16")];
-            bool q_interleave_0 = const()[name = string("q_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> q_cast_fp16 = concat(axis = var_506, interleave = q_interleave_0, values = roped_9_cast_fp16)[name = string("q_cast_fp16")];
-            bool k_interleave_0 = const()[name = string("k_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_cast_fp16 = concat(axis = var_506, interleave = k_interleave_0, values = roped_cast_fp16)[name = string("k_cast_fp16")];
-            tensor<int32, [4]> var_636_begin_0 = const()[name = string("op_636_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_636_end_0 = const()[name = string("op_636_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_636_end_mask_0 = const()[name = string("op_636_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_2 = slice_by_index(begin = var_636_begin_0, end = var_636_end_0, end_mask = var_636_end_mask_0, x = k_cast_fp16)[name = string("op_636_cast_fp16")];
-            tensor<int32, [4]> var_637_begin_0 = const()[name = string("op_637_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_637_end_0 = const()[name = string("op_637_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_637_end_mask_0 = const()[name = string("op_637_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_2 = slice_by_index(begin = var_637_begin_0, end = var_637_end_0, end_mask = var_637_end_mask_0, x = v_cast_fp16)[name = string("op_637_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_cast_fp16 = concat(axis = var_506, interleave = rotated_interleave_0, values = (var_618_cast_fp16, var_610_cast_fp16))[name = string("rotated_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_621_cast_fp16 = mul(x = k_15_cast_fp16, y = cos)[name = string("op_621_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_622_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_622_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_cast_fp16 = add(x = var_621_cast_fp16, y = var_622_cast_fp16)[name = string("roped_cast_fp16")];
+            tensor<int32, [4]> v_perm_0 = const()[name = string("v_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_626_begin_0 = const()[name = string("op_626_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_626_end_0 = const()[name = string("op_626_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_626_end_mask_0 = const()[name = string("op_626_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_2 = slice_by_index(begin = var_626_begin_0, end = var_626_end_0, end_mask = var_626_end_mask_0, x = roped_cast_fp16)[name = string("op_626_cast_fp16")];
+            tensor<int32, [4]> var_627_begin_0 = const()[name = string("op_627_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_627_end_0 = const()[name = string("op_627_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_627_end_mask_0 = const()[name = string("op_627_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 512, 128]> v_cast_fp16 = transpose(perm = v_perm_0, x = v_15_cast_fp16)[name = string("transpose_1")];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_2 = slice_by_index(begin = var_627_begin_0, end = var_627_end_0, end_mask = var_627_end_mask_0, x = v_cast_fp16)[name = string("op_627_cast_fp16")];
             fp16 var_641_to_fp16 = const()[name = string("op_641_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 512]> var_642_cast_fp16 = mul(x = q_cast_fp16, y = var_641_to_fp16)[name = string("op_642_cast_fp16")];
-            bool attn_weights_9_transpose_x_0 = const()[name = string("attn_weights_9_transpose_x_0"), val = bool(true)];
-            bool attn_weights_9_transpose_y_0 = const()[name = string("attn_weights_9_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_9_cast_fp16 = matmul(transpose_x = attn_weights_9_transpose_x_0, transpose_y = attn_weights_9_transpose_y_0, x = var_642_cast_fp16, y = k_cast_fp16)[name = string("attn_weights_9_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_cast_fp16 = add(x = attn_weights_9_cast_fp16, y = mask)[name = string("attn_weights_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> var_650_cast_fp16 = softmax(axis = var_502, x = attn_weights_cast_fp16)[name = string("op_650_cast_fp16")];
-            bool attn_5_transpose_x_0 = const()[name = string("attn_5_transpose_x_0"), val = bool(false)];
-            bool attn_5_transpose_y_0 = const()[name = string("attn_5_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 512]> attn_5_cast_fp16 = matmul(transpose_x = attn_5_transpose_x_0, transpose_y = attn_5_transpose_y_0, x = v_cast_fp16, y = var_650_cast_fp16)[name = string("attn_5_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_642_cast_fp16 = mul(x = roped_9_cast_fp16, y = var_641_to_fp16)[name = string("op_642_cast_fp16")];
+            bool attn_weights_13_transpose_x_0 = const()[name = string("attn_weights_13_transpose_x_0"), val = bool(true)];
+            bool attn_weights_13_transpose_y_0 = const()[name = string("attn_weights_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_13_cast_fp16 = matmul(transpose_x = attn_weights_13_transpose_x_0, transpose_y = attn_weights_13_transpose_y_0, x = var_642_cast_fp16, y = roped_cast_fp16)[name = string("attn_weights_13_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_15_cast_fp16 = add(x = attn_weights_13_cast_fp16, y = mask)[name = string("attn_weights_15_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_cast_fp16 = softmax(axis = var_502, x = attn_weights_15_cast_fp16)[name = string("attn_weights_cast_fp16")];
+            bool var_651_transpose_x_1 = const()[name = string("op_651_transpose_x_1"), val = bool(false)];
+            bool var_651_transpose_y_1 = const()[name = string("op_651_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 32, 512, 128]> var_651_cast_fp16 = matmul(transpose_x = var_651_transpose_x_1, transpose_y = var_651_transpose_y_1, x = attn_weights_cast_fp16, y = v_15_cast_fp16)[name = string("op_651_cast_fp16")];
+            tensor<int32, [4]> attn_5_perm_0 = const()[name = string("attn_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             tensor<int32, [4]> var_654 = const()[name = string("op_654"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 512]> attn_5_cast_fp16 = transpose(perm = attn_5_perm_0, x = var_651_cast_fp16)[name = string("transpose_0")];
             tensor<fp16, [1, 4096, 1, 512]> input_17_cast_fp16 = reshape(shape = var_654, x = attn_5_cast_fp16)[name = string("input_17_cast_fp16")];
             tensor<int32, [2]> var_658 = const()[name = string("op_658"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_660 = const()[name = string("op_660"), val = tensor<int32, [2]>([1, 1])];
diff --git a/sequoia/Llama-2-7b-hf_chunk4.mlmodelc/analytics/coremldata.bin b/sequoia/Llama-2-7b-hf_chunk4.mlmodelc/analytics/coremldata.bin
index e3704b470dad34135a6b5cb4b471021bad5c3ae2..65ae082f31459bf8b09913d8861a15601cc13ad1 100644
--- a/sequoia/Llama-2-7b-hf_chunk4.mlmodelc/analytics/coremldata.bin
+++ b/sequoia/Llama-2-7b-hf_chunk4.mlmodelc/analytics/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:84e317a82cdf4e96f808f63e77f10098844d47ad522545181edfac4d287c9c92
+oid sha256:e69e7ad37dd59e97348d395eec9b4c41b7d3ea44d86f613751ae47803a0a2efe
 size 243
diff --git a/sequoia/Llama-2-7b-hf_chunk4.mlmodelc/coremldata.bin b/sequoia/Llama-2-7b-hf_chunk4.mlmodelc/coremldata.bin
index 8bbc6dd38c74fe23594355e106f197518d41765b..a503721fa97147a06f91e834353053693378b0ad 100644
--- a/sequoia/Llama-2-7b-hf_chunk4.mlmodelc/coremldata.bin
+++ b/sequoia/Llama-2-7b-hf_chunk4.mlmodelc/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e430d0795ff5c384187174f5718a2c13d0070f5d6a811831e18862497865a86d
+oid sha256:d35a0353bcfa501579e07af3718261af3b129b4bec004c1fe6d812a6403a3f5b
 size 1037
diff --git a/sequoia/Llama-2-7b-hf_chunk4.mlmodelc/metadata.json b/sequoia/Llama-2-7b-hf_chunk4.mlmodelc/metadata.json
index f23c9816ec121e35f03e1e496987c8aac8786cf7..6d41cea5d008ffae289b9102a8322d6c5933c102 100644
--- a/sequoia/Llama-2-7b-hf_chunk4.mlmodelc/metadata.json
+++ b/sequoia/Llama-2-7b-hf_chunk4.mlmodelc/metadata.json
@@ -17,9 +17,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 128, 508]",
         "name" : "new_k_cache_0",
         "type" : "MultiArray"
       },
@@ -27,9 +27,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 128, 508]",
         "name" : "new_k_cache_1",
         "type" : "MultiArray"
       },
@@ -37,9 +37,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 128, 508]",
         "name" : "new_k_cache_2",
         "type" : "MultiArray"
       },
@@ -47,9 +47,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 508, 128]",
         "name" : "new_v_cache_0",
         "type" : "MultiArray"
       },
@@ -57,9 +57,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 508, 128]",
         "name" : "new_v_cache_1",
         "type" : "MultiArray"
       },
@@ -67,9 +67,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 508, 128]",
         "name" : "new_v_cache_2",
         "type" : "MultiArray"
       }
@@ -142,9 +142,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_0",
             "type" : "MultiArray"
           },
@@ -152,9 +152,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_1",
             "type" : "MultiArray"
           },
@@ -162,9 +162,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_2",
             "type" : "MultiArray"
           },
@@ -172,9 +172,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_0",
             "type" : "MultiArray"
           },
@@ -182,9 +182,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_1",
             "type" : "MultiArray"
           },
@@ -192,9 +192,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_2",
             "type" : "MultiArray"
           }
@@ -203,14 +203,15 @@
         "mlProgramOperationTypeHistogram" : {
           "Ios18.constexprLutToDense" : 21,
           "Ios18.conv" : 21,
-          "Ios18.matmul" : 6,
           "Ios18.expandDims" : 6,
-          "Ios18.concat" : 18,
+          "Ios18.matmul" : 6,
+          "Ios18.concat" : 12,
           "Ios18.add" : 15,
           "Ios18.realDiv" : 6,
           "Ios18.silu" : 3,
           "Ios18.softmax" : 3,
           "Ios18.sliceByIndex" : 18,
+          "Ios18.transpose" : 6,
           "Ios16.reduceL2Norm" : 6,
           "Ios18.squeeze" : 6,
           "Ios18.reshape" : 12,
@@ -223,9 +224,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 1)",
+            "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 4)",
             "shortDescription" : "",
-            "shape" : "[1, 4096, 1, 1]",
+            "shape" : "[1, 4096, 1, 4]",
             "name" : "x",
             "type" : "MultiArray"
           },
@@ -233,9 +234,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 128 × 1)",
+            "formattedType" : "MultiArray (Float16 128 × 4)",
             "shortDescription" : "",
-            "shape" : "[128, 1]",
+            "shape" : "[128, 4]",
             "name" : "cos",
             "type" : "MultiArray"
           },
@@ -243,9 +244,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 128 × 1)",
+            "formattedType" : "MultiArray (Float16 128 × 4)",
             "shortDescription" : "",
-            "shape" : "[128, 1]",
+            "shape" : "[128, 4]",
             "name" : "sin",
             "type" : "MultiArray"
           },
@@ -253,9 +254,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 1 × 1 × 512)",
+            "formattedType" : "MultiArray (Float16 1 × 1 × 4 × 512)",
             "shortDescription" : "",
-            "shape" : "[1, 1, 1, 512]",
+            "shape" : "[1, 1, 4, 512]",
             "name" : "mask",
             "type" : "MultiArray"
           },
@@ -263,9 +264,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "k_cache_0",
             "type" : "MultiArray"
           },
@@ -273,9 +274,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "v_cache_0",
             "type" : "MultiArray"
           },
@@ -283,9 +284,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "k_cache_1",
             "type" : "MultiArray"
           },
@@ -293,9 +294,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "v_cache_1",
             "type" : "MultiArray"
           },
@@ -303,9 +304,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "k_cache_2",
             "type" : "MultiArray"
           },
@@ -313,9 +314,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "v_cache_2",
             "type" : "MultiArray"
           }
@@ -330,9 +331,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 1)",
+            "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 4)",
             "shortDescription" : "",
-            "shape" : "[1, 4096, 1, 1]",
+            "shape" : "[1, 4096, 1, 4]",
             "name" : "new_x",
             "type" : "MultiArray"
           },
@@ -340,9 +341,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_0",
             "type" : "MultiArray"
           },
@@ -350,9 +351,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_1",
             "type" : "MultiArray"
           },
@@ -360,9 +361,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_2",
             "type" : "MultiArray"
           },
@@ -370,9 +371,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_0",
             "type" : "MultiArray"
           },
@@ -380,9 +381,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_1",
             "type" : "MultiArray"
           },
@@ -390,9 +391,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_2",
             "type" : "MultiArray"
           }
@@ -401,14 +402,15 @@
         "mlProgramOperationTypeHistogram" : {
           "Ios18.constexprLutToDense" : 21,
           "Ios18.conv" : 21,
-          "Ios18.matmul" : 6,
           "Ios18.expandDims" : 6,
+          "Ios18.matmul" : 6,
           "Ios18.concat" : 18,
           "Ios18.add" : 15,
           "Ios18.realDiv" : 6,
           "Ios18.silu" : 3,
           "Ios18.softmax" : 3,
           "Ios18.sliceByIndex" : 18,
+          "Ios18.transpose" : 6,
           "Ios16.reduceL2Norm" : 6,
           "Ios18.squeeze" : 6,
           "Ios18.reshape" : 12,
@@ -419,14 +421,15 @@
     "mlProgramOperationTypeHistogram" : {
       "Ios18.constexprLutToDense" : 21,
       "Ios18.conv" : 21,
-      "Ios18.matmul" : 6,
       "Ios18.expandDims" : 6,
-      "Ios18.concat" : 18,
+      "Ios18.matmul" : 6,
+      "Ios18.concat" : 12,
       "Ios18.add" : 15,
       "Ios18.realDiv" : 6,
       "Ios18.silu" : 3,
       "Ios18.softmax" : 3,
       "Ios18.sliceByIndex" : 18,
+      "Ios18.transpose" : 6,
       "Ios16.reduceL2Norm" : 6,
       "Ios18.squeeze" : 6,
       "Ios18.reshape" : 12,
@@ -491,7 +494,7 @@
       }
     ],
     "defaultFunctionName" : "input_512_context_512",
-    "generatedClassName" : "Llama_2_7b_hf_2024_07_02_20_36_17_merged_chunk4",
+    "generatedClassName" : "Llama_2_7b_hf_2024_07_17_19_34_17_merged_chunk4",
     "userDefinedMetadata" : {
 
     },
diff --git a/sequoia/Llama-2-7b-hf_chunk4.mlmodelc/model.mil b/sequoia/Llama-2-7b-hf_chunk4.mlmodelc/model.mil
index 5b7b1db6b14fe10ff51aaa601d8484d9ff49576b..85f75a6da9054155e32013d96460963c79fba3f8 100644
--- a/sequoia/Llama-2-7b-hf_chunk4.mlmodelc/model.mil
+++ b/sequoia/Llama-2-7b-hf_chunk4.mlmodelc/model.mil
@@ -1,7 +1,7 @@
 program(1.3)
-[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3400.34.1"}, {"coremlc-version", "3400.42.1"}})]
+[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3400.42.1"}, {"coremlc-version", "3400.51.1"}})]
 {
-    func input_1_context_512<ios18>(tensor<fp16, [128, 1]> cos, tensor<fp16, [1, 32, 128, 511]> k_cache_0, tensor<fp16, [1, 32, 128, 511]> k_cache_1, tensor<fp16, [1, 32, 128, 511]> k_cache_2, tensor<fp16, [1, 1, 1, 512]> mask, tensor<fp16, [128, 1]> sin, tensor<fp16, [1, 32, 128, 511]> v_cache_0, tensor<fp16, [1, 32, 128, 511]> v_cache_1, tensor<fp16, [1, 32, 128, 511]> v_cache_2, tensor<fp16, [1, 4096, 1, 1]> x) [CoreML_InputDefaultValues = dict<string, fp32>({{"k_cache_0", 0}, {"k_cache_1", 0}, {"k_cache_2", 0}, {"v_cache_0", 0}, {"v_cache_1", 0}, {"v_cache_2", 0}})] {
+    func input_1_context_512<ios18>(tensor<fp16, [128, 4]> cos, tensor<fp16, [1, 32, 128, 508]> k_cache_0, tensor<fp16, [1, 32, 128, 508]> k_cache_1, tensor<fp16, [1, 32, 128, 508]> k_cache_2, tensor<fp16, [1, 1, 4, 512]> mask, tensor<fp16, [128, 4]> sin, tensor<fp16, [1, 32, 508, 128]> v_cache_0, tensor<fp16, [1, 32, 508, 128]> v_cache_1, tensor<fp16, [1, 32, 508, 128]> v_cache_2, tensor<fp16, [1, 4096, 1, 4]> x) [CoreML_InputDefaultValues = dict<string, fp32>({{"k_cache_0", 0}, {"k_cache_1", 0}, {"k_cache_2", 0}, {"v_cache_0", 0}, {"v_cache_1", 0}, {"v_cache_2", 0}})] {
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_q_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303873792))))[name = string("blocks_0_attn_q_proj_weight_palettized_cast_fp16")];
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_k_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8388864))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303873920))))[name = string("blocks_0_attn_k_proj_weight_palettized_cast_fp16")];
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_v_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(16777664))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303874048))))[name = string("blocks_0_attn_v_proj_weight_palettized_cast_fp16")];
@@ -29,438 +29,450 @@ program(1.3)
             int32 var_34 = const()[name = string("op_34"), val = int32(-2)];
             bool var_35 = const()[name = string("op_35"), val = bool(true)];
             tensor<int32, [1]> var_53_axes_0 = const()[name = string("op_53_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_53_cast_fp16 = squeeze(axes = var_53_axes_0, x = x)[name = string("op_53_cast_fp16")];
+            tensor<fp16, [1, 4096, 4]> var_53_cast_fp16 = squeeze(axes = var_53_axes_0, x = x)[name = string("op_53_cast_fp16")];
             bool var_55_interleave_0 = const()[name = string("op_55_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_1_to_fp16 = const()[name = string("eps_chan_1_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_55_cast_fp16 = concat(axis = var_31, interleave = var_55_interleave_0, values = (var_53_cast_fp16, eps_chan_1_to_fp16))[name = string("op_55_cast_fp16")];
+            tensor<fp16, [1, 1, 4]> eps_chan_1_to_fp16 = const()[name = string("eps_chan_1_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_55_cast_fp16 = concat(axis = var_31, interleave = var_55_interleave_0, values = (var_53_cast_fp16, eps_chan_1_to_fp16))[name = string("op_55_cast_fp16")];
             tensor<int32, [1]> x_eps_1_axes_0 = const()[name = string("x_eps_1_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_1_cast_fp16 = expand_dims(axes = x_eps_1_axes_0, x = var_55_cast_fp16)[name = string("x_eps_1_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_1_cast_fp16 = expand_dims(axes = x_eps_1_axes_0, x = var_55_cast_fp16)[name = string("x_eps_1_cast_fp16")];
             tensor<int32, [1]> norm_x_1_axes_0 = const()[name = string("norm_x_1_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_1_cast_fp16 = reduce_l2_norm(axes = norm_x_1_axes_0, keep_dims = var_35, x = x_eps_1_cast_fp16)[name = string("norm_x_1_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_1_cast_fp16 = real_div(x = x, y = norm_x_1_cast_fp16)[name = string("x_normed_1_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_1_cast_fp16 = reduce_l2_norm(axes = norm_x_1_axes_0, keep_dims = var_35, x = x_eps_1_cast_fp16)[name = string("norm_x_1_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_1_cast_fp16 = real_div(x = x, y = norm_x_1_cast_fp16)[name = string("x_normed_1_cast_fp16")];
             fp16 var_60_to_fp16 = const()[name = string("op_60_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_60_to_fp16)[name = string("x_normed_3_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_60_to_fp16)[name = string("x_normed_3_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_1_weight_to_fp16 = const()[name = string("blocks_0_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303567936)))];
-            tensor<fp16, [1, 4096, 1, 1]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = string("x_5_cast_fp16")];
-            tensor<int32, [2]> var_72 = const()[name = string("op_72"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_74 = const()[name = string("op_74"), val = tensor<int32, [2]>([1, 1])];
-            string var_76_pad_type_0 = const()[name = string("op_76_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_76_pad_0 = const()[name = string("op_76_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_76_cast_fp16 = conv(dilations = var_74, groups = var_31, pad = var_76_pad_0, pad_type = var_76_pad_type_0, strides = var_72, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_76_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = string("x_5_cast_fp16")];
+            tensor<int32, [2]> var_73 = const()[name = string("op_73"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_75 = const()[name = string("op_75"), val = tensor<int32, [2]>([1, 1])];
+            string var_77_pad_type_0 = const()[name = string("op_77_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_77_pad_0 = const()[name = string("op_77_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_77_cast_fp16 = conv(dilations = var_75, groups = var_31, pad = var_77_pad_0, pad_type = var_77_pad_type_0, strides = var_73, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_77_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303576192)))];
-            tensor<fp16, [1, 4096, 1, 1]> q_1_cast_fp16 = mul(x = var_76_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
-            tensor<int32, [2]> var_80 = const()[name = string("op_80"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_82 = const()[name = string("op_82"), val = tensor<int32, [2]>([1, 1])];
-            string var_84_pad_type_0 = const()[name = string("op_84_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_84_pad_0 = const()[name = string("op_84_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_84_cast_fp16 = conv(dilations = var_82, groups = var_31, pad = var_84_pad_0, pad_type = var_84_pad_type_0, strides = var_80, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_84_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> q_1_cast_fp16 = mul(x = var_77_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
+            tensor<int32, [2]> var_81 = const()[name = string("op_81"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_83 = const()[name = string("op_83"), val = tensor<int32, [2]>([1, 1])];
+            string var_85_pad_type_0 = const()[name = string("op_85_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_85_pad_0 = const()[name = string("op_85_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_85_cast_fp16 = conv(dilations = var_83, groups = var_31, pad = var_85_pad_0, pad_type = var_85_pad_type_0, strides = var_81, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_85_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303584448)))];
-            tensor<fp16, [1, 4096, 1, 1]> k_1_cast_fp16 = mul(x = var_84_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
-            tensor<int32, [2]> var_88 = const()[name = string("op_88"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_90 = const()[name = string("op_90"), val = tensor<int32, [2]>([1, 1])];
-            string var_92_pad_type_0 = const()[name = string("op_92_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_92_pad_0 = const()[name = string("op_92_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_92_cast_fp16 = conv(dilations = var_90, groups = var_31, pad = var_92_pad_0, pad_type = var_92_pad_type_0, strides = var_88, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_92_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> k_1_cast_fp16 = mul(x = var_85_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
+            tensor<int32, [2]> var_89 = const()[name = string("op_89"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_91 = const()[name = string("op_91"), val = tensor<int32, [2]>([1, 1])];
+            string var_93_pad_type_0 = const()[name = string("op_93_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_93_pad_0 = const()[name = string("op_93_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_93_cast_fp16 = conv(dilations = var_91, groups = var_31, pad = var_93_pad_0, pad_type = var_93_pad_type_0, strides = var_89, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_93_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303592704)))];
-            tensor<fp16, [1, 4096, 1, 1]> v_1_cast_fp16 = mul(x = var_92_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
-            tensor<int32, [4]> var_94 = const()[name = string("op_94"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> q_3_cast_fp16 = reshape(shape = var_94, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
-            tensor<int32, [4]> var_96 = const()[name = string("op_96"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> k_3_cast_fp16 = reshape(shape = var_96, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
-            tensor<int32, [4]> var_98 = const()[name = string("op_98"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> v_3_cast_fp16 = reshape(shape = var_98, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
-            tensor<int32, [4]> var_110_begin_0 = const()[name = string("op_110_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_110_end_0 = const()[name = string("op_110_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_110_end_mask_0 = const()[name = string("op_110_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_110_cast_fp16 = slice_by_index(begin = var_110_begin_0, end = var_110_end_0, end_mask = var_110_end_mask_0, x = q_3_cast_fp16)[name = string("op_110_cast_fp16")];
-            tensor<int32, [4]> var_116_begin_0 = const()[name = string("op_116_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_116_end_0 = const()[name = string("op_116_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_116_end_mask_0 = const()[name = string("op_116_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_116_cast_fp16 = slice_by_index(begin = var_116_begin_0, end = var_116_end_0, end_mask = var_116_end_mask_0, x = q_3_cast_fp16)[name = string("op_116_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> v_1_cast_fp16 = mul(x = var_93_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
+            tensor<int32, [4]> var_95 = const()[name = string("op_95"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> q_3_cast_fp16 = reshape(shape = var_95, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
+            tensor<int32, [4]> var_97 = const()[name = string("op_97"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> k_3_cast_fp16 = reshape(shape = var_97, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
+            tensor<int32, [4]> var_99 = const()[name = string("op_99"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> v_3_cast_fp16 = reshape(shape = var_99, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
+            tensor<int32, [4]> var_111_begin_0 = const()[name = string("op_111_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_111_end_0 = const()[name = string("op_111_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_111_end_mask_0 = const()[name = string("op_111_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_111_cast_fp16 = slice_by_index(begin = var_111_begin_0, end = var_111_end_0, end_mask = var_111_end_mask_0, x = q_3_cast_fp16)[name = string("op_111_cast_fp16")];
+            tensor<int32, [4]> var_117_begin_0 = const()[name = string("op_117_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_117_end_0 = const()[name = string("op_117_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_117_end_mask_0 = const()[name = string("op_117_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_117_cast_fp16 = slice_by_index(begin = var_117_begin_0, end = var_117_end_0, end_mask = var_117_end_mask_0, x = q_3_cast_fp16)[name = string("op_117_cast_fp16")];
             fp16 const_6_promoted_to_fp16 = const()[name = string("const_6_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_118_cast_fp16 = mul(x = var_116_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_118_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_119_cast_fp16 = mul(x = var_117_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_119_cast_fp16")];
             bool rotated_1_interleave_0 = const()[name = string("rotated_1_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_1_cast_fp16 = concat(axis = var_34, interleave = rotated_1_interleave_0, values = (var_118_cast_fp16, var_110_cast_fp16))[name = string("rotated_1_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_121_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_121_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_122_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_122_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_1_cast_fp16 = add(x = var_121_cast_fp16, y = var_122_cast_fp16)[name = string("roped_1_cast_fp16")];
-            tensor<int32, [4]> var_135_begin_0 = const()[name = string("op_135_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_135_end_0 = const()[name = string("op_135_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_135_end_mask_0 = const()[name = string("op_135_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_135_cast_fp16 = slice_by_index(begin = var_135_begin_0, end = var_135_end_0, end_mask = var_135_end_mask_0, x = k_3_cast_fp16)[name = string("op_135_cast_fp16")];
-            tensor<int32, [4]> var_141_begin_0 = const()[name = string("op_141_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_141_end_0 = const()[name = string("op_141_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_141_end_mask_0 = const()[name = string("op_141_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_141_cast_fp16 = slice_by_index(begin = var_141_begin_0, end = var_141_end_0, end_mask = var_141_end_mask_0, x = k_3_cast_fp16)[name = string("op_141_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_1_cast_fp16 = concat(axis = var_34, interleave = rotated_1_interleave_0, values = (var_119_cast_fp16, var_111_cast_fp16))[name = string("rotated_1_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_122_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_122_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_123_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_123_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_1_cast_fp16 = add(x = var_122_cast_fp16, y = var_123_cast_fp16)[name = string("roped_1_cast_fp16")];
+            tensor<int32, [4]> var_136_begin_0 = const()[name = string("op_136_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_136_end_0 = const()[name = string("op_136_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_136_end_mask_0 = const()[name = string("op_136_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_136_cast_fp16 = slice_by_index(begin = var_136_begin_0, end = var_136_end_0, end_mask = var_136_end_mask_0, x = k_3_cast_fp16)[name = string("op_136_cast_fp16")];
+            tensor<int32, [4]> var_142_begin_0 = const()[name = string("op_142_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_142_end_0 = const()[name = string("op_142_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_142_end_mask_0 = const()[name = string("op_142_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_142_cast_fp16 = slice_by_index(begin = var_142_begin_0, end = var_142_end_0, end_mask = var_142_end_mask_0, x = k_3_cast_fp16)[name = string("op_142_cast_fp16")];
             fp16 const_8_promoted_to_fp16 = const()[name = string("const_8_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_143_cast_fp16 = mul(x = var_141_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_143_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_144_cast_fp16 = mul(x = var_142_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_144_cast_fp16")];
             bool rotated_3_interleave_0 = const()[name = string("rotated_3_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_3_cast_fp16 = concat(axis = var_34, interleave = rotated_3_interleave_0, values = (var_143_cast_fp16, var_135_cast_fp16))[name = string("rotated_3_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_146_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_146_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_147_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_147_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_3_cast_fp16 = add(x = var_146_cast_fp16, y = var_147_cast_fp16)[name = string("roped_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_3_cast_fp16 = concat(axis = var_34, interleave = rotated_3_interleave_0, values = (var_144_cast_fp16, var_136_cast_fp16))[name = string("rotated_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_147_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_147_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_148_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_148_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_3_cast_fp16 = add(x = var_147_cast_fp16, y = var_148_cast_fp16)[name = string("roped_3_cast_fp16")];
+            tensor<int32, [4]> v_5_perm_0 = const()[name = string("v_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             bool k_7_interleave_0 = const()[name = string("k_7_interleave_0"), val = bool(false)];
             tensor<fp16, [1, 32, 128, 512]> k_7_cast_fp16 = concat(axis = var_22, interleave = k_7_interleave_0, values = (k_cache_0, roped_3_cast_fp16))[name = string("k_7_cast_fp16")];
-            bool v_5_interleave_0 = const()[name = string("v_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> v_5_cast_fp16 = concat(axis = var_22, interleave = v_5_interleave_0, values = (v_cache_0, v_3_cast_fp16))[name = string("v_5_cast_fp16")];
-            tensor<int32, [4]> var_154_begin_0 = const()[name = string("op_154_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_154_end_0 = const()[name = string("op_154_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_154_end_mask_0 = const()[name = string("op_154_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_0 = slice_by_index(begin = var_154_begin_0, end = var_154_end_0, end_mask = var_154_end_mask_0, x = k_7_cast_fp16)[name = string("op_154_cast_fp16")];
-            tensor<int32, [4]> var_155_begin_0 = const()[name = string("op_155_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_155_end_0 = const()[name = string("op_155_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_155_end_mask_0 = const()[name = string("op_155_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_0 = slice_by_index(begin = var_155_begin_0, end = var_155_end_0, end_mask = var_155_end_mask_0, x = v_5_cast_fp16)[name = string("op_155_cast_fp16")];
-            fp16 var_159_to_fp16 = const()[name = string("op_159_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 1]> var_160_cast_fp16 = mul(x = roped_1_cast_fp16, y = var_159_to_fp16)[name = string("op_160_cast_fp16")];
+            bool v_7_interleave_0 = const()[name = string("v_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> v_5_cast_fp16 = transpose(perm = v_5_perm_0, x = v_3_cast_fp16)[name = string("transpose_5")];
+            tensor<fp16, [1, 32, 512, 128]> v_7_cast_fp16 = concat(axis = var_34, interleave = v_7_interleave_0, values = (v_cache_0, v_5_cast_fp16))[name = string("v_7_cast_fp16")];
+            tensor<int32, [4]> var_159_begin_0 = const()[name = string("op_159_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_159_end_0 = const()[name = string("op_159_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_159_end_mask_0 = const()[name = string("op_159_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_0 = slice_by_index(begin = var_159_begin_0, end = var_159_end_0, end_mask = var_159_end_mask_0, x = k_7_cast_fp16)[name = string("op_159_cast_fp16")];
+            tensor<int32, [4]> var_160_begin_0 = const()[name = string("op_160_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_160_end_0 = const()[name = string("op_160_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_160_end_mask_0 = const()[name = string("op_160_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_0 = slice_by_index(begin = var_160_begin_0, end = var_160_end_0, end_mask = var_160_end_mask_0, x = v_7_cast_fp16)[name = string("op_160_cast_fp16")];
+            fp16 var_165_to_fp16 = const()[name = string("op_165_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 32, 128, 4]> var_166_cast_fp16 = mul(x = roped_1_cast_fp16, y = var_165_to_fp16)[name = string("op_166_cast_fp16")];
             bool attn_weights_1_transpose_x_0 = const()[name = string("attn_weights_1_transpose_x_0"), val = bool(true)];
             bool attn_weights_1_transpose_y_0 = const()[name = string("attn_weights_1_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_160_cast_fp16, y = k_7_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = mask)[name = string("attn_weights_3_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> var_168_cast_fp16 = softmax(axis = var_30, x = attn_weights_3_cast_fp16)[name = string("op_168_cast_fp16")];
-            bool attn_1_transpose_x_0 = const()[name = string("attn_1_transpose_x_0"), val = bool(false)];
-            bool attn_1_transpose_y_0 = const()[name = string("attn_1_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 1]> attn_1_cast_fp16 = matmul(transpose_x = attn_1_transpose_x_0, transpose_y = attn_1_transpose_y_0, x = v_5_cast_fp16, y = var_168_cast_fp16)[name = string("attn_1_cast_fp16")];
-            tensor<int32, [4]> var_172 = const()[name = string("op_172"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
-            tensor<fp16, [1, 4096, 1, 1]> input_1_cast_fp16 = reshape(shape = var_172, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
-            tensor<int32, [2]> var_176 = const()[name = string("op_176"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_178 = const()[name = string("op_178"), val = tensor<int32, [2]>([1, 1])];
-            string var_180_pad_type_0 = const()[name = string("op_180_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_180_pad_0 = const()[name = string("op_180_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_180_cast_fp16 = conv(dilations = var_178, groups = var_31, pad = var_180_pad_0, pad_type = var_180_pad_type_0, strides = var_176, weight = blocks_0_attn_proj_weight_palettized_cast_fp16, x = input_1_cast_fp16)[name = string("op_180_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_166_cast_fp16, y = k_7_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = mask)[name = string("attn_weights_3_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_5_cast_fp16 = softmax(axis = var_30, x = attn_weights_3_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
+            bool var_175_transpose_x_0 = const()[name = string("op_175_transpose_x_0"), val = bool(false)];
+            bool var_175_transpose_y_0 = const()[name = string("op_175_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> var_175_cast_fp16 = matmul(transpose_x = var_175_transpose_x_0, transpose_y = var_175_transpose_y_0, x = attn_weights_5_cast_fp16, y = v_7_cast_fp16)[name = string("op_175_cast_fp16")];
+            tensor<int32, [4]> attn_1_perm_0 = const()[name = string("attn_1_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_178 = const()[name = string("op_178"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 4]> attn_1_cast_fp16 = transpose(perm = attn_1_perm_0, x = var_175_cast_fp16)[name = string("transpose_4")];
+            tensor<fp16, [1, 4096, 1, 4]> input_1_cast_fp16 = reshape(shape = var_178, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
+            tensor<int32, [2]> var_182 = const()[name = string("op_182"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_184 = const()[name = string("op_184"), val = tensor<int32, [2]>([1, 1])];
+            string var_186_pad_type_0 = const()[name = string("op_186_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_186_pad_0 = const()[name = string("op_186_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_186_cast_fp16 = conv(dilations = var_184, groups = var_31, pad = var_186_pad_0, pad_type = var_186_pad_type_0, strides = var_182, weight = blocks_0_attn_proj_weight_palettized_cast_fp16, x = input_1_cast_fp16)[name = string("op_186_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303600960)))];
-            tensor<fp16, [1, 4096, 1, 1]> attention_output_1_cast_fp16 = mul(x = var_180_cast_fp16, y = blocks_0_attn_proj_output_scales_to_fp16)[name = string("attention_output_1_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_11_cast_fp16 = add(x = attention_output_1_cast_fp16, y = x)[name = string("x_11_cast_fp16")];
-            tensor<int32, [1]> var_199_axes_0 = const()[name = string("op_199_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_199_cast_fp16 = squeeze(axes = var_199_axes_0, x = x_11_cast_fp16)[name = string("op_199_cast_fp16")];
-            bool var_201_interleave_0 = const()[name = string("op_201_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_3_to_fp16 = const()[name = string("eps_chan_3_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_201_cast_fp16 = concat(axis = var_31, interleave = var_201_interleave_0, values = (var_199_cast_fp16, eps_chan_3_to_fp16))[name = string("op_201_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> attention_output_1_cast_fp16 = mul(x = var_186_cast_fp16, y = blocks_0_attn_proj_output_scales_to_fp16)[name = string("attention_output_1_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_11_cast_fp16 = add(x = attention_output_1_cast_fp16, y = x)[name = string("x_11_cast_fp16")];
+            tensor<int32, [1]> var_205_axes_0 = const()[name = string("op_205_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_205_cast_fp16 = squeeze(axes = var_205_axes_0, x = x_11_cast_fp16)[name = string("op_205_cast_fp16")];
+            bool var_207_interleave_0 = const()[name = string("op_207_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_3_to_fp16 = const()[name = string("eps_chan_3_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_207_cast_fp16 = concat(axis = var_31, interleave = var_207_interleave_0, values = (var_205_cast_fp16, eps_chan_3_to_fp16))[name = string("op_207_cast_fp16")];
             tensor<int32, [1]> x_eps_3_axes_0 = const()[name = string("x_eps_3_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_3_cast_fp16 = expand_dims(axes = x_eps_3_axes_0, x = var_201_cast_fp16)[name = string("x_eps_3_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_3_cast_fp16 = expand_dims(axes = x_eps_3_axes_0, x = var_207_cast_fp16)[name = string("x_eps_3_cast_fp16")];
             tensor<int32, [1]> norm_x_3_axes_0 = const()[name = string("norm_x_3_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_3_cast_fp16 = reduce_l2_norm(axes = norm_x_3_axes_0, keep_dims = var_35, x = x_eps_3_cast_fp16)[name = string("norm_x_3_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_7_cast_fp16 = real_div(x = x_11_cast_fp16, y = norm_x_3_cast_fp16)[name = string("x_normed_7_cast_fp16")];
-            fp16 var_206_to_fp16 = const()[name = string("op_206_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_9_cast_fp16 = mul(x = x_normed_7_cast_fp16, y = var_206_to_fp16)[name = string("x_normed_9_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_3_cast_fp16 = reduce_l2_norm(axes = norm_x_3_axes_0, keep_dims = var_35, x = x_eps_3_cast_fp16)[name = string("norm_x_3_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_7_cast_fp16 = real_div(x = x_11_cast_fp16, y = norm_x_3_cast_fp16)[name = string("x_normed_7_cast_fp16")];
+            fp16 var_212_to_fp16 = const()[name = string("op_212_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_9_cast_fp16 = mul(x = x_normed_7_cast_fp16, y = var_212_to_fp16)[name = string("x_normed_9_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_2_weight_to_fp16 = const()[name = string("blocks_0_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303609216)))];
-            tensor<fp16, [1, 4096, 1, 1]> input_3_cast_fp16 = mul(x = x_normed_9_cast_fp16, y = blocks_0_norm_2_weight_to_fp16)[name = string("input_3_cast_fp16")];
-            tensor<int32, [2]> var_218 = const()[name = string("op_218"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_220 = const()[name = string("op_220"), val = tensor<int32, [2]>([1, 1])];
-            string var_222_pad_type_0 = const()[name = string("op_222_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_222_pad_0 = const()[name = string("op_222_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_222_cast_fp16 = conv(dilations = var_220, groups = var_31, pad = var_222_pad_0, pad_type = var_222_pad_type_0, strides = var_218, weight = blocks_0_mlp_fc_1_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_222_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303617472)))];
-            tensor<fp16, [1, 11008, 1, 1]> input_5_cast_fp16 = mul(x = var_222_cast_fp16, y = blocks_0_mlp_fc_1_output_scales_to_fp16)[name = string("input_5_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> input_3_cast_fp16 = mul(x = x_normed_9_cast_fp16, y = blocks_0_norm_2_weight_to_fp16)[name = string("input_3_cast_fp16")];
+            tensor<int32, [2]> var_224 = const()[name = string("op_224"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_226 = const()[name = string("op_226"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_228 = const()[name = string("op_228"), val = tensor<int32, [2]>([1, 1])];
-            string var_230_pad_type_0 = const()[name = string("op_230_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_230_pad_0 = const()[name = string("op_230_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_230_cast_fp16 = conv(dilations = var_228, groups = var_31, pad = var_230_pad_0, pad_type = var_230_pad_type_0, strides = var_226, weight = blocks_0_mlp_fc_2_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_230_cast_fp16")];
+            string var_228_pad_type_0 = const()[name = string("op_228_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_228_pad_0 = const()[name = string("op_228_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_228_cast_fp16 = conv(dilations = var_226, groups = var_31, pad = var_228_pad_0, pad_type = var_228_pad_type_0, strides = var_224, weight = blocks_0_mlp_fc_1_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_228_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303617472)))];
+            tensor<fp16, [1, 11008, 1, 4]> input_5_cast_fp16 = mul(x = var_228_cast_fp16, y = blocks_0_mlp_fc_1_output_scales_to_fp16)[name = string("input_5_cast_fp16")];
+            tensor<int32, [2]> var_232 = const()[name = string("op_232"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_234 = const()[name = string("op_234"), val = tensor<int32, [2]>([1, 1])];
+            string var_236_pad_type_0 = const()[name = string("op_236_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_236_pad_0 = const()[name = string("op_236_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_236_cast_fp16 = conv(dilations = var_234, groups = var_31, pad = var_236_pad_0, pad_type = var_236_pad_type_0, strides = var_232, weight = blocks_0_mlp_fc_2_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_236_cast_fp16")];
             tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303639552)))];
-            tensor<fp16, [1, 11008, 1, 1]> x_fc_2_1_cast_fp16 = mul(x = var_230_cast_fp16, y = blocks_0_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_1_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> var_232_cast_fp16 = silu(x = input_5_cast_fp16)[name = string("op_232_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> input_7_cast_fp16 = mul(x = var_232_cast_fp16, y = x_fc_2_1_cast_fp16)[name = string("input_7_cast_fp16")];
-            tensor<int32, [2]> var_236 = const()[name = string("op_236"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_238 = const()[name = string("op_238"), val = tensor<int32, [2]>([1, 1])];
-            string var_240_pad_type_0 = const()[name = string("op_240_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_240_pad_0 = const()[name = string("op_240_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_240_cast_fp16 = conv(dilations = var_238, groups = var_31, pad = var_240_pad_0, pad_type = var_240_pad_type_0, strides = var_236, weight = blocks_0_mlp_proj_weight_palettized_cast_fp16, x = input_7_cast_fp16)[name = string("op_240_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> x_fc_2_1_cast_fp16 = mul(x = var_236_cast_fp16, y = blocks_0_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_1_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> var_238_cast_fp16 = silu(x = input_5_cast_fp16)[name = string("op_238_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> input_7_cast_fp16 = mul(x = var_238_cast_fp16, y = x_fc_2_1_cast_fp16)[name = string("input_7_cast_fp16")];
+            tensor<int32, [2]> var_242 = const()[name = string("op_242"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_244 = const()[name = string("op_244"), val = tensor<int32, [2]>([1, 1])];
+            string var_246_pad_type_0 = const()[name = string("op_246_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_246_pad_0 = const()[name = string("op_246_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_246_cast_fp16 = conv(dilations = var_244, groups = var_31, pad = var_246_pad_0, pad_type = var_246_pad_type_0, strides = var_242, weight = blocks_0_mlp_proj_weight_palettized_cast_fp16, x = input_7_cast_fp16)[name = string("op_246_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303661632)))];
-            tensor<fp16, [1, 4096, 1, 1]> var_241_cast_fp16 = mul(x = var_240_cast_fp16, y = blocks_0_mlp_proj_output_scales_to_fp16)[name = string("op_241_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_15_cast_fp16 = add(x = var_241_cast_fp16, y = x_11_cast_fp16)[name = string("x_15_cast_fp16")];
-            int32 var_252 = const()[name = string("op_252"), val = int32(-1)];
-            int32 var_260 = const()[name = string("op_260"), val = int32(3)];
-            int32 var_261 = const()[name = string("op_261"), val = int32(1)];
-            int32 var_264 = const()[name = string("op_264"), val = int32(-2)];
-            bool var_265 = const()[name = string("op_265"), val = bool(true)];
-            tensor<int32, [1]> var_282_axes_0 = const()[name = string("op_282_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_282_cast_fp16 = squeeze(axes = var_282_axes_0, x = x_15_cast_fp16)[name = string("op_282_cast_fp16")];
-            bool var_284_interleave_0 = const()[name = string("op_284_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_5_to_fp16 = const()[name = string("eps_chan_5_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_284_cast_fp16 = concat(axis = var_261, interleave = var_284_interleave_0, values = (var_282_cast_fp16, eps_chan_5_to_fp16))[name = string("op_284_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> var_247_cast_fp16 = mul(x = var_246_cast_fp16, y = blocks_0_mlp_proj_output_scales_to_fp16)[name = string("op_247_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_15_cast_fp16 = add(x = var_247_cast_fp16, y = x_11_cast_fp16)[name = string("x_15_cast_fp16")];
+            int32 var_258 = const()[name = string("op_258"), val = int32(-1)];
+            int32 var_266 = const()[name = string("op_266"), val = int32(3)];
+            int32 var_267 = const()[name = string("op_267"), val = int32(1)];
+            int32 var_270 = const()[name = string("op_270"), val = int32(-2)];
+            bool var_271 = const()[name = string("op_271"), val = bool(true)];
+            tensor<int32, [1]> var_288_axes_0 = const()[name = string("op_288_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_288_cast_fp16 = squeeze(axes = var_288_axes_0, x = x_15_cast_fp16)[name = string("op_288_cast_fp16")];
+            bool var_290_interleave_0 = const()[name = string("op_290_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_5_to_fp16 = const()[name = string("eps_chan_5_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_290_cast_fp16 = concat(axis = var_267, interleave = var_290_interleave_0, values = (var_288_cast_fp16, eps_chan_5_to_fp16))[name = string("op_290_cast_fp16")];
             tensor<int32, [1]> x_eps_5_axes_0 = const()[name = string("x_eps_5_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_5_cast_fp16 = expand_dims(axes = x_eps_5_axes_0, x = var_284_cast_fp16)[name = string("x_eps_5_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_5_cast_fp16 = expand_dims(axes = x_eps_5_axes_0, x = var_290_cast_fp16)[name = string("x_eps_5_cast_fp16")];
             tensor<int32, [1]> norm_x_5_axes_0 = const()[name = string("norm_x_5_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_5_cast_fp16 = reduce_l2_norm(axes = norm_x_5_axes_0, keep_dims = var_265, x = x_eps_5_cast_fp16)[name = string("norm_x_5_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_13_cast_fp16 = real_div(x = x_15_cast_fp16, y = norm_x_5_cast_fp16)[name = string("x_normed_13_cast_fp16")];
-            fp16 var_289_to_fp16 = const()[name = string("op_289_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_289_to_fp16)[name = string("x_normed_15_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_5_cast_fp16 = reduce_l2_norm(axes = norm_x_5_axes_0, keep_dims = var_271, x = x_eps_5_cast_fp16)[name = string("norm_x_5_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_13_cast_fp16 = real_div(x = x_15_cast_fp16, y = norm_x_5_cast_fp16)[name = string("x_normed_13_cast_fp16")];
+            fp16 var_295_to_fp16 = const()[name = string("op_295_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_295_to_fp16)[name = string("x_normed_15_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_1_weight_to_fp16 = const()[name = string("blocks_1_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303669888)))];
-            tensor<fp16, [1, 4096, 1, 1]> x_19_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = string("x_19_cast_fp16")];
-            tensor<int32, [2]> var_304 = const()[name = string("op_304"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_306 = const()[name = string("op_306"), val = tensor<int32, [2]>([1, 1])];
-            string var_308_pad_type_0 = const()[name = string("op_308_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_308_pad_0 = const()[name = string("op_308_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_308_cast_fp16 = conv(dilations = var_306, groups = var_261, pad = var_308_pad_0, pad_type = var_308_pad_type_0, strides = var_304, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_308_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_19_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = string("x_19_cast_fp16")];
+            tensor<int32, [2]> var_311 = const()[name = string("op_311"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_313 = const()[name = string("op_313"), val = tensor<int32, [2]>([1, 1])];
+            string var_315_pad_type_0 = const()[name = string("op_315_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_315_pad_0 = const()[name = string("op_315_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_315_cast_fp16 = conv(dilations = var_313, groups = var_267, pad = var_315_pad_0, pad_type = var_315_pad_type_0, strides = var_311, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_315_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303678144)))];
-            tensor<fp16, [1, 4096, 1, 1]> q_7_cast_fp16 = mul(x = var_308_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
-            tensor<int32, [2]> var_312 = const()[name = string("op_312"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_314 = const()[name = string("op_314"), val = tensor<int32, [2]>([1, 1])];
-            string var_316_pad_type_0 = const()[name = string("op_316_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_316_pad_0 = const()[name = string("op_316_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_316_cast_fp16 = conv(dilations = var_314, groups = var_261, pad = var_316_pad_0, pad_type = var_316_pad_type_0, strides = var_312, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_316_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> q_7_cast_fp16 = mul(x = var_315_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
+            tensor<int32, [2]> var_319 = const()[name = string("op_319"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_321 = const()[name = string("op_321"), val = tensor<int32, [2]>([1, 1])];
+            string var_323_pad_type_0 = const()[name = string("op_323_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_323_pad_0 = const()[name = string("op_323_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_323_cast_fp16 = conv(dilations = var_321, groups = var_267, pad = var_323_pad_0, pad_type = var_323_pad_type_0, strides = var_319, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_323_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303686400)))];
-            tensor<fp16, [1, 4096, 1, 1]> k_9_cast_fp16 = mul(x = var_316_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_9_cast_fp16")];
-            tensor<int32, [2]> var_320 = const()[name = string("op_320"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_322 = const()[name = string("op_322"), val = tensor<int32, [2]>([1, 1])];
-            string var_324_pad_type_0 = const()[name = string("op_324_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_324_pad_0 = const()[name = string("op_324_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_324_cast_fp16 = conv(dilations = var_322, groups = var_261, pad = var_324_pad_0, pad_type = var_324_pad_type_0, strides = var_320, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_324_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> k_9_cast_fp16 = mul(x = var_323_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_9_cast_fp16")];
+            tensor<int32, [2]> var_327 = const()[name = string("op_327"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_329 = const()[name = string("op_329"), val = tensor<int32, [2]>([1, 1])];
+            string var_331_pad_type_0 = const()[name = string("op_331_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_331_pad_0 = const()[name = string("op_331_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_331_cast_fp16 = conv(dilations = var_329, groups = var_267, pad = var_331_pad_0, pad_type = var_331_pad_type_0, strides = var_327, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_331_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303694656)))];
-            tensor<fp16, [1, 4096, 1, 1]> v_7_cast_fp16 = mul(x = var_324_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_7_cast_fp16")];
-            tensor<int32, [4]> var_326 = const()[name = string("op_326"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> q_9_cast_fp16 = reshape(shape = var_326, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
-            tensor<int32, [4]> var_328 = const()[name = string("op_328"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> k_11_cast_fp16 = reshape(shape = var_328, x = k_9_cast_fp16)[name = string("k_11_cast_fp16")];
-            tensor<int32, [4]> var_330 = const()[name = string("op_330"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> v_9_cast_fp16 = reshape(shape = var_330, x = v_7_cast_fp16)[name = string("v_9_cast_fp16")];
-            tensor<int32, [4]> var_342_begin_0 = const()[name = string("op_342_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_342_end_0 = const()[name = string("op_342_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_342_end_mask_0 = const()[name = string("op_342_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_342_cast_fp16 = slice_by_index(begin = var_342_begin_0, end = var_342_end_0, end_mask = var_342_end_mask_0, x = q_9_cast_fp16)[name = string("op_342_cast_fp16")];
-            tensor<int32, [4]> var_348_begin_0 = const()[name = string("op_348_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_348_end_0 = const()[name = string("op_348_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_348_end_mask_0 = const()[name = string("op_348_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_348_cast_fp16 = slice_by_index(begin = var_348_begin_0, end = var_348_end_0, end_mask = var_348_end_mask_0, x = q_9_cast_fp16)[name = string("op_348_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> v_9_cast_fp16 = mul(x = var_331_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_9_cast_fp16")];
+            tensor<int32, [4]> var_333 = const()[name = string("op_333"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> q_9_cast_fp16 = reshape(shape = var_333, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
+            tensor<int32, [4]> var_335 = const()[name = string("op_335"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> k_11_cast_fp16 = reshape(shape = var_335, x = k_9_cast_fp16)[name = string("k_11_cast_fp16")];
+            tensor<int32, [4]> var_337 = const()[name = string("op_337"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> v_11_cast_fp16 = reshape(shape = var_337, x = v_9_cast_fp16)[name = string("v_11_cast_fp16")];
+            tensor<int32, [4]> var_349_begin_0 = const()[name = string("op_349_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_349_end_0 = const()[name = string("op_349_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_349_end_mask_0 = const()[name = string("op_349_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_349_cast_fp16 = slice_by_index(begin = var_349_begin_0, end = var_349_end_0, end_mask = var_349_end_mask_0, x = q_9_cast_fp16)[name = string("op_349_cast_fp16")];
+            tensor<int32, [4]> var_355_begin_0 = const()[name = string("op_355_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_355_end_0 = const()[name = string("op_355_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_355_end_mask_0 = const()[name = string("op_355_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_355_cast_fp16 = slice_by_index(begin = var_355_begin_0, end = var_355_end_0, end_mask = var_355_end_mask_0, x = q_9_cast_fp16)[name = string("op_355_cast_fp16")];
             fp16 const_19_promoted_to_fp16 = const()[name = string("const_19_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_350_cast_fp16 = mul(x = var_348_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_350_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_357_cast_fp16 = mul(x = var_355_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_357_cast_fp16")];
             bool rotated_5_interleave_0 = const()[name = string("rotated_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_5_cast_fp16 = concat(axis = var_264, interleave = rotated_5_interleave_0, values = (var_350_cast_fp16, var_342_cast_fp16))[name = string("rotated_5_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_353_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_353_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_354_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_354_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_5_cast_fp16 = add(x = var_353_cast_fp16, y = var_354_cast_fp16)[name = string("roped_5_cast_fp16")];
-            tensor<int32, [4]> var_367_begin_0 = const()[name = string("op_367_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_367_end_0 = const()[name = string("op_367_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_367_end_mask_0 = const()[name = string("op_367_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_367_cast_fp16 = slice_by_index(begin = var_367_begin_0, end = var_367_end_0, end_mask = var_367_end_mask_0, x = k_11_cast_fp16)[name = string("op_367_cast_fp16")];
-            tensor<int32, [4]> var_373_begin_0 = const()[name = string("op_373_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_373_end_0 = const()[name = string("op_373_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_373_end_mask_0 = const()[name = string("op_373_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_373_cast_fp16 = slice_by_index(begin = var_373_begin_0, end = var_373_end_0, end_mask = var_373_end_mask_0, x = k_11_cast_fp16)[name = string("op_373_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_5_cast_fp16 = concat(axis = var_270, interleave = rotated_5_interleave_0, values = (var_357_cast_fp16, var_349_cast_fp16))[name = string("rotated_5_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_360_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_360_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_361_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_361_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_5_cast_fp16 = add(x = var_360_cast_fp16, y = var_361_cast_fp16)[name = string("roped_5_cast_fp16")];
+            tensor<int32, [4]> var_374_begin_0 = const()[name = string("op_374_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_374_end_0 = const()[name = string("op_374_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_374_end_mask_0 = const()[name = string("op_374_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_374_cast_fp16 = slice_by_index(begin = var_374_begin_0, end = var_374_end_0, end_mask = var_374_end_mask_0, x = k_11_cast_fp16)[name = string("op_374_cast_fp16")];
+            tensor<int32, [4]> var_380_begin_0 = const()[name = string("op_380_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_380_end_0 = const()[name = string("op_380_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_380_end_mask_0 = const()[name = string("op_380_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_380_cast_fp16 = slice_by_index(begin = var_380_begin_0, end = var_380_end_0, end_mask = var_380_end_mask_0, x = k_11_cast_fp16)[name = string("op_380_cast_fp16")];
             fp16 const_21_promoted_to_fp16 = const()[name = string("const_21_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_375_cast_fp16 = mul(x = var_373_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_375_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_382_cast_fp16 = mul(x = var_380_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_382_cast_fp16")];
             bool rotated_7_interleave_0 = const()[name = string("rotated_7_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_7_cast_fp16 = concat(axis = var_264, interleave = rotated_7_interleave_0, values = (var_375_cast_fp16, var_367_cast_fp16))[name = string("rotated_7_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_378_cast_fp16 = mul(x = k_11_cast_fp16, y = cos)[name = string("op_378_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_379_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = string("op_379_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_7_cast_fp16 = add(x = var_378_cast_fp16, y = var_379_cast_fp16)[name = string("roped_7_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_7_cast_fp16 = concat(axis = var_270, interleave = rotated_7_interleave_0, values = (var_382_cast_fp16, var_374_cast_fp16))[name = string("rotated_7_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_385_cast_fp16 = mul(x = k_11_cast_fp16, y = cos)[name = string("op_385_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_386_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = string("op_386_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_7_cast_fp16 = add(x = var_385_cast_fp16, y = var_386_cast_fp16)[name = string("roped_7_cast_fp16")];
+            tensor<int32, [4]> v_13_perm_0 = const()[name = string("v_13_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             bool k_15_interleave_0 = const()[name = string("k_15_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = concat(axis = var_252, interleave = k_15_interleave_0, values = (k_cache_1, roped_7_cast_fp16))[name = string("k_15_cast_fp16")];
-            bool v_11_interleave_0 = const()[name = string("v_11_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> v_11_cast_fp16 = concat(axis = var_252, interleave = v_11_interleave_0, values = (v_cache_1, v_9_cast_fp16))[name = string("v_11_cast_fp16")];
-            tensor<int32, [4]> var_386_begin_0 = const()[name = string("op_386_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_386_end_0 = const()[name = string("op_386_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_386_end_mask_0 = const()[name = string("op_386_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_1 = slice_by_index(begin = var_386_begin_0, end = var_386_end_0, end_mask = var_386_end_mask_0, x = k_15_cast_fp16)[name = string("op_386_cast_fp16")];
-            tensor<int32, [4]> var_387_begin_0 = const()[name = string("op_387_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_387_end_0 = const()[name = string("op_387_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_387_end_mask_0 = const()[name = string("op_387_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_1 = slice_by_index(begin = var_387_begin_0, end = var_387_end_0, end_mask = var_387_end_mask_0, x = v_11_cast_fp16)[name = string("op_387_cast_fp16")];
-            fp16 var_391_to_fp16 = const()[name = string("op_391_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 1]> var_392_cast_fp16 = mul(x = roped_5_cast_fp16, y = var_391_to_fp16)[name = string("op_392_cast_fp16")];
-            bool attn_weights_5_transpose_x_0 = const()[name = string("attn_weights_5_transpose_x_0"), val = bool(true)];
-            bool attn_weights_5_transpose_y_0 = const()[name = string("attn_weights_5_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_5_cast_fp16 = matmul(transpose_x = attn_weights_5_transpose_x_0, transpose_y = attn_weights_5_transpose_y_0, x = var_392_cast_fp16, y = k_15_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_7_cast_fp16 = add(x = attn_weights_5_cast_fp16, y = mask)[name = string("attn_weights_7_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> var_400_cast_fp16 = softmax(axis = var_260, x = attn_weights_7_cast_fp16)[name = string("op_400_cast_fp16")];
-            bool attn_5_transpose_x_0 = const()[name = string("attn_5_transpose_x_0"), val = bool(false)];
-            bool attn_5_transpose_y_0 = const()[name = string("attn_5_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 1]> attn_5_cast_fp16 = matmul(transpose_x = attn_5_transpose_x_0, transpose_y = attn_5_transpose_y_0, x = v_11_cast_fp16, y = var_400_cast_fp16)[name = string("attn_5_cast_fp16")];
-            tensor<int32, [4]> var_404 = const()[name = string("op_404"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
-            tensor<fp16, [1, 4096, 1, 1]> input_9_cast_fp16 = reshape(shape = var_404, x = attn_5_cast_fp16)[name = string("input_9_cast_fp16")];
-            tensor<int32, [2]> var_408 = const()[name = string("op_408"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_410 = const()[name = string("op_410"), val = tensor<int32, [2]>([1, 1])];
-            string var_412_pad_type_0 = const()[name = string("op_412_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_412_pad_0 = const()[name = string("op_412_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_412_cast_fp16 = conv(dilations = var_410, groups = var_261, pad = var_412_pad_0, pad_type = var_412_pad_type_0, strides = var_408, weight = blocks_1_attn_proj_weight_palettized_cast_fp16, x = input_9_cast_fp16)[name = string("op_412_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = concat(axis = var_258, interleave = k_15_interleave_0, values = (k_cache_1, roped_7_cast_fp16))[name = string("k_15_cast_fp16")];
+            bool v_15_interleave_0 = const()[name = string("v_15_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> v_13_cast_fp16 = transpose(perm = v_13_perm_0, x = v_11_cast_fp16)[name = string("transpose_3")];
+            tensor<fp16, [1, 32, 512, 128]> v_15_cast_fp16 = concat(axis = var_270, interleave = v_15_interleave_0, values = (v_cache_1, v_13_cast_fp16))[name = string("v_15_cast_fp16")];
+            tensor<int32, [4]> var_397_begin_0 = const()[name = string("op_397_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_397_end_0 = const()[name = string("op_397_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_397_end_mask_0 = const()[name = string("op_397_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_1 = slice_by_index(begin = var_397_begin_0, end = var_397_end_0, end_mask = var_397_end_mask_0, x = k_15_cast_fp16)[name = string("op_397_cast_fp16")];
+            tensor<int32, [4]> var_398_begin_0 = const()[name = string("op_398_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_398_end_0 = const()[name = string("op_398_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_398_end_mask_0 = const()[name = string("op_398_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_1 = slice_by_index(begin = var_398_begin_0, end = var_398_end_0, end_mask = var_398_end_mask_0, x = v_15_cast_fp16)[name = string("op_398_cast_fp16")];
+            fp16 var_403_to_fp16 = const()[name = string("op_403_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 32, 128, 4]> var_404_cast_fp16 = mul(x = roped_5_cast_fp16, y = var_403_to_fp16)[name = string("op_404_cast_fp16")];
+            bool attn_weights_7_transpose_x_0 = const()[name = string("attn_weights_7_transpose_x_0"), val = bool(true)];
+            bool attn_weights_7_transpose_y_0 = const()[name = string("attn_weights_7_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_7_cast_fp16 = matmul(transpose_x = attn_weights_7_transpose_x_0, transpose_y = attn_weights_7_transpose_y_0, x = var_404_cast_fp16, y = k_15_cast_fp16)[name = string("attn_weights_7_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_9_cast_fp16 = add(x = attn_weights_7_cast_fp16, y = mask)[name = string("attn_weights_9_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_11_cast_fp16 = softmax(axis = var_266, x = attn_weights_9_cast_fp16)[name = string("attn_weights_11_cast_fp16")];
+            bool var_413_transpose_x_0 = const()[name = string("op_413_transpose_x_0"), val = bool(false)];
+            bool var_413_transpose_y_0 = const()[name = string("op_413_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> var_413_cast_fp16 = matmul(transpose_x = var_413_transpose_x_0, transpose_y = var_413_transpose_y_0, x = attn_weights_11_cast_fp16, y = v_15_cast_fp16)[name = string("op_413_cast_fp16")];
+            tensor<int32, [4]> attn_3_perm_0 = const()[name = string("attn_3_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_416 = const()[name = string("op_416"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 4]> attn_3_cast_fp16 = transpose(perm = attn_3_perm_0, x = var_413_cast_fp16)[name = string("transpose_2")];
+            tensor<fp16, [1, 4096, 1, 4]> input_9_cast_fp16 = reshape(shape = var_416, x = attn_3_cast_fp16)[name = string("input_9_cast_fp16")];
+            tensor<int32, [2]> var_420 = const()[name = string("op_420"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_422 = const()[name = string("op_422"), val = tensor<int32, [2]>([1, 1])];
+            string var_424_pad_type_0 = const()[name = string("op_424_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_424_pad_0 = const()[name = string("op_424_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_424_cast_fp16 = conv(dilations = var_422, groups = var_267, pad = var_424_pad_0, pad_type = var_424_pad_type_0, strides = var_420, weight = blocks_1_attn_proj_weight_palettized_cast_fp16, x = input_9_cast_fp16)[name = string("op_424_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303702912)))];
-            tensor<fp16, [1, 4096, 1, 1]> attention_output_3_cast_fp16 = mul(x = var_412_cast_fp16, y = blocks_1_attn_proj_output_scales_to_fp16)[name = string("attention_output_3_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_25_cast_fp16 = add(x = attention_output_3_cast_fp16, y = x_15_cast_fp16)[name = string("x_25_cast_fp16")];
-            tensor<int32, [1]> var_431_axes_0 = const()[name = string("op_431_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_431_cast_fp16 = squeeze(axes = var_431_axes_0, x = x_25_cast_fp16)[name = string("op_431_cast_fp16")];
-            bool var_433_interleave_0 = const()[name = string("op_433_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_7_to_fp16 = const()[name = string("eps_chan_7_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_433_cast_fp16 = concat(axis = var_261, interleave = var_433_interleave_0, values = (var_431_cast_fp16, eps_chan_7_to_fp16))[name = string("op_433_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> attention_output_3_cast_fp16 = mul(x = var_424_cast_fp16, y = blocks_1_attn_proj_output_scales_to_fp16)[name = string("attention_output_3_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_25_cast_fp16 = add(x = attention_output_3_cast_fp16, y = x_15_cast_fp16)[name = string("x_25_cast_fp16")];
+            tensor<int32, [1]> var_443_axes_0 = const()[name = string("op_443_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_443_cast_fp16 = squeeze(axes = var_443_axes_0, x = x_25_cast_fp16)[name = string("op_443_cast_fp16")];
+            bool var_445_interleave_0 = const()[name = string("op_445_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_7_to_fp16 = const()[name = string("eps_chan_7_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_445_cast_fp16 = concat(axis = var_267, interleave = var_445_interleave_0, values = (var_443_cast_fp16, eps_chan_7_to_fp16))[name = string("op_445_cast_fp16")];
             tensor<int32, [1]> x_eps_7_axes_0 = const()[name = string("x_eps_7_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_7_cast_fp16 = expand_dims(axes = x_eps_7_axes_0, x = var_433_cast_fp16)[name = string("x_eps_7_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_7_cast_fp16 = expand_dims(axes = x_eps_7_axes_0, x = var_445_cast_fp16)[name = string("x_eps_7_cast_fp16")];
             tensor<int32, [1]> norm_x_7_axes_0 = const()[name = string("norm_x_7_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_7_cast_fp16 = reduce_l2_norm(axes = norm_x_7_axes_0, keep_dims = var_265, x = x_eps_7_cast_fp16)[name = string("norm_x_7_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_19_cast_fp16 = real_div(x = x_25_cast_fp16, y = norm_x_7_cast_fp16)[name = string("x_normed_19_cast_fp16")];
-            fp16 var_438_to_fp16 = const()[name = string("op_438_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_21_cast_fp16 = mul(x = x_normed_19_cast_fp16, y = var_438_to_fp16)[name = string("x_normed_21_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_7_cast_fp16 = reduce_l2_norm(axes = norm_x_7_axes_0, keep_dims = var_271, x = x_eps_7_cast_fp16)[name = string("norm_x_7_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_19_cast_fp16 = real_div(x = x_25_cast_fp16, y = norm_x_7_cast_fp16)[name = string("x_normed_19_cast_fp16")];
+            fp16 var_450_to_fp16 = const()[name = string("op_450_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_21_cast_fp16 = mul(x = x_normed_19_cast_fp16, y = var_450_to_fp16)[name = string("x_normed_21_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_2_weight_to_fp16 = const()[name = string("blocks_1_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303711168)))];
-            tensor<fp16, [1, 4096, 1, 1]> input_11_cast_fp16 = mul(x = x_normed_21_cast_fp16, y = blocks_1_norm_2_weight_to_fp16)[name = string("input_11_cast_fp16")];
-            tensor<int32, [2]> var_450 = const()[name = string("op_450"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_452 = const()[name = string("op_452"), val = tensor<int32, [2]>([1, 1])];
-            string var_454_pad_type_0 = const()[name = string("op_454_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_454_pad_0 = const()[name = string("op_454_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_454_cast_fp16 = conv(dilations = var_452, groups = var_261, pad = var_454_pad_0, pad_type = var_454_pad_type_0, strides = var_450, weight = blocks_1_mlp_fc_1_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_454_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> input_11_cast_fp16 = mul(x = x_normed_21_cast_fp16, y = blocks_1_norm_2_weight_to_fp16)[name = string("input_11_cast_fp16")];
+            tensor<int32, [2]> var_462 = const()[name = string("op_462"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_464 = const()[name = string("op_464"), val = tensor<int32, [2]>([1, 1])];
+            string var_466_pad_type_0 = const()[name = string("op_466_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_466_pad_0 = const()[name = string("op_466_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_466_cast_fp16 = conv(dilations = var_464, groups = var_267, pad = var_466_pad_0, pad_type = var_466_pad_type_0, strides = var_462, weight = blocks_1_mlp_fc_1_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_466_cast_fp16")];
             tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303719424)))];
-            tensor<fp16, [1, 11008, 1, 1]> input_13_cast_fp16 = mul(x = var_454_cast_fp16, y = blocks_1_mlp_fc_1_output_scales_to_fp16)[name = string("input_13_cast_fp16")];
-            tensor<int32, [2]> var_458 = const()[name = string("op_458"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_460 = const()[name = string("op_460"), val = tensor<int32, [2]>([1, 1])];
-            string var_462_pad_type_0 = const()[name = string("op_462_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_462_pad_0 = const()[name = string("op_462_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_462_cast_fp16 = conv(dilations = var_460, groups = var_261, pad = var_462_pad_0, pad_type = var_462_pad_type_0, strides = var_458, weight = blocks_1_mlp_fc_2_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_462_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303741504)))];
-            tensor<fp16, [1, 11008, 1, 1]> x_fc_2_3_cast_fp16 = mul(x = var_462_cast_fp16, y = blocks_1_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_3_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> var_464_cast_fp16 = silu(x = input_13_cast_fp16)[name = string("op_464_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> input_15_cast_fp16 = mul(x = var_464_cast_fp16, y = x_fc_2_3_cast_fp16)[name = string("input_15_cast_fp16")];
-            tensor<int32, [2]> var_468 = const()[name = string("op_468"), val = tensor<int32, [2]>([1, 1])];
+            tensor<fp16, [1, 11008, 1, 4]> input_13_cast_fp16 = mul(x = var_466_cast_fp16, y = blocks_1_mlp_fc_1_output_scales_to_fp16)[name = string("input_13_cast_fp16")];
             tensor<int32, [2]> var_470 = const()[name = string("op_470"), val = tensor<int32, [2]>([1, 1])];
-            string var_472_pad_type_0 = const()[name = string("op_472_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_472_pad_0 = const()[name = string("op_472_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_472_cast_fp16 = conv(dilations = var_470, groups = var_261, pad = var_472_pad_0, pad_type = var_472_pad_type_0, strides = var_468, weight = blocks_1_mlp_proj_weight_palettized_cast_fp16, x = input_15_cast_fp16)[name = string("op_472_cast_fp16")];
+            tensor<int32, [2]> var_472 = const()[name = string("op_472"), val = tensor<int32, [2]>([1, 1])];
+            string var_474_pad_type_0 = const()[name = string("op_474_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_474_pad_0 = const()[name = string("op_474_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_474_cast_fp16 = conv(dilations = var_472, groups = var_267, pad = var_474_pad_0, pad_type = var_474_pad_type_0, strides = var_470, weight = blocks_1_mlp_fc_2_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_474_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303741504)))];
+            tensor<fp16, [1, 11008, 1, 4]> x_fc_2_3_cast_fp16 = mul(x = var_474_cast_fp16, y = blocks_1_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_3_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> var_476_cast_fp16 = silu(x = input_13_cast_fp16)[name = string("op_476_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> input_15_cast_fp16 = mul(x = var_476_cast_fp16, y = x_fc_2_3_cast_fp16)[name = string("input_15_cast_fp16")];
+            tensor<int32, [2]> var_480 = const()[name = string("op_480"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_482 = const()[name = string("op_482"), val = tensor<int32, [2]>([1, 1])];
+            string var_484_pad_type_0 = const()[name = string("op_484_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_484_pad_0 = const()[name = string("op_484_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_484_cast_fp16 = conv(dilations = var_482, groups = var_267, pad = var_484_pad_0, pad_type = var_484_pad_type_0, strides = var_480, weight = blocks_1_mlp_proj_weight_palettized_cast_fp16, x = input_15_cast_fp16)[name = string("op_484_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303763584)))];
-            tensor<fp16, [1, 4096, 1, 1]> var_473_cast_fp16 = mul(x = var_472_cast_fp16, y = blocks_1_mlp_proj_output_scales_to_fp16)[name = string("op_473_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_29_cast_fp16 = add(x = var_473_cast_fp16, y = x_25_cast_fp16)[name = string("x_29_cast_fp16")];
-            int32 var_484 = const()[name = string("op_484"), val = int32(-1)];
-            int32 var_492 = const()[name = string("op_492"), val = int32(3)];
-            int32 var_493 = const()[name = string("op_493"), val = int32(1)];
-            int32 var_496 = const()[name = string("op_496"), val = int32(-2)];
-            bool var_497 = const()[name = string("op_497"), val = bool(true)];
-            tensor<int32, [1]> var_514_axes_0 = const()[name = string("op_514_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_514_cast_fp16 = squeeze(axes = var_514_axes_0, x = x_29_cast_fp16)[name = string("op_514_cast_fp16")];
-            bool var_516_interleave_0 = const()[name = string("op_516_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_9_to_fp16 = const()[name = string("eps_chan_9_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_516_cast_fp16 = concat(axis = var_493, interleave = var_516_interleave_0, values = (var_514_cast_fp16, eps_chan_9_to_fp16))[name = string("op_516_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> var_485_cast_fp16 = mul(x = var_484_cast_fp16, y = blocks_1_mlp_proj_output_scales_to_fp16)[name = string("op_485_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_29_cast_fp16 = add(x = var_485_cast_fp16, y = x_25_cast_fp16)[name = string("x_29_cast_fp16")];
+            int32 var_496 = const()[name = string("op_496"), val = int32(-1)];
+            int32 var_504 = const()[name = string("op_504"), val = int32(3)];
+            int32 var_505 = const()[name = string("op_505"), val = int32(1)];
+            int32 var_508 = const()[name = string("op_508"), val = int32(-2)];
+            bool var_509 = const()[name = string("op_509"), val = bool(true)];
+            tensor<int32, [1]> var_526_axes_0 = const()[name = string("op_526_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_526_cast_fp16 = squeeze(axes = var_526_axes_0, x = x_29_cast_fp16)[name = string("op_526_cast_fp16")];
+            bool var_528_interleave_0 = const()[name = string("op_528_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_9_to_fp16 = const()[name = string("eps_chan_9_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_528_cast_fp16 = concat(axis = var_505, interleave = var_528_interleave_0, values = (var_526_cast_fp16, eps_chan_9_to_fp16))[name = string("op_528_cast_fp16")];
             tensor<int32, [1]> x_eps_9_axes_0 = const()[name = string("x_eps_9_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_9_cast_fp16 = expand_dims(axes = x_eps_9_axes_0, x = var_516_cast_fp16)[name = string("x_eps_9_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_9_cast_fp16 = expand_dims(axes = x_eps_9_axes_0, x = var_528_cast_fp16)[name = string("x_eps_9_cast_fp16")];
             tensor<int32, [1]> norm_x_9_axes_0 = const()[name = string("norm_x_9_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_9_cast_fp16 = reduce_l2_norm(axes = norm_x_9_axes_0, keep_dims = var_497, x = x_eps_9_cast_fp16)[name = string("norm_x_9_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_25_cast_fp16 = real_div(x = x_29_cast_fp16, y = norm_x_9_cast_fp16)[name = string("x_normed_25_cast_fp16")];
-            fp16 var_521_to_fp16 = const()[name = string("op_521_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_27_cast_fp16 = mul(x = x_normed_25_cast_fp16, y = var_521_to_fp16)[name = string("x_normed_27_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_9_cast_fp16 = reduce_l2_norm(axes = norm_x_9_axes_0, keep_dims = var_509, x = x_eps_9_cast_fp16)[name = string("norm_x_9_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_25_cast_fp16 = real_div(x = x_29_cast_fp16, y = norm_x_9_cast_fp16)[name = string("x_normed_25_cast_fp16")];
+            fp16 var_533_to_fp16 = const()[name = string("op_533_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_27_cast_fp16 = mul(x = x_normed_25_cast_fp16, y = var_533_to_fp16)[name = string("x_normed_27_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_norm_1_weight_to_fp16 = const()[name = string("blocks_2_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303771840)))];
-            tensor<fp16, [1, 4096, 1, 1]> x_33_cast_fp16 = mul(x = x_normed_27_cast_fp16, y = blocks_2_norm_1_weight_to_fp16)[name = string("x_33_cast_fp16")];
-            tensor<int32, [2]> var_536 = const()[name = string("op_536"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_538 = const()[name = string("op_538"), val = tensor<int32, [2]>([1, 1])];
-            string var_540_pad_type_0 = const()[name = string("op_540_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_540_pad_0 = const()[name = string("op_540_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_540_cast_fp16 = conv(dilations = var_538, groups = var_493, pad = var_540_pad_0, pad_type = var_540_pad_type_0, strides = var_536, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_540_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_33_cast_fp16 = mul(x = x_normed_27_cast_fp16, y = blocks_2_norm_1_weight_to_fp16)[name = string("x_33_cast_fp16")];
+            tensor<int32, [2]> var_549 = const()[name = string("op_549"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_551 = const()[name = string("op_551"), val = tensor<int32, [2]>([1, 1])];
+            string var_553_pad_type_0 = const()[name = string("op_553_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_553_pad_0 = const()[name = string("op_553_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_553_cast_fp16 = conv(dilations = var_551, groups = var_505, pad = var_553_pad_0, pad_type = var_553_pad_type_0, strides = var_549, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_553_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303780096)))];
-            tensor<fp16, [1, 4096, 1, 1]> q_13_cast_fp16 = mul(x = var_540_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = string("q_13_cast_fp16")];
-            tensor<int32, [2]> var_544 = const()[name = string("op_544"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_546 = const()[name = string("op_546"), val = tensor<int32, [2]>([1, 1])];
-            string var_548_pad_type_0 = const()[name = string("op_548_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_548_pad_0 = const()[name = string("op_548_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_548_cast_fp16 = conv(dilations = var_546, groups = var_493, pad = var_548_pad_0, pad_type = var_548_pad_type_0, strides = var_544, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_548_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> q_13_cast_fp16 = mul(x = var_553_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = string("q_13_cast_fp16")];
+            tensor<int32, [2]> var_557 = const()[name = string("op_557"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_559 = const()[name = string("op_559"), val = tensor<int32, [2]>([1, 1])];
+            string var_561_pad_type_0 = const()[name = string("op_561_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_561_pad_0 = const()[name = string("op_561_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_561_cast_fp16 = conv(dilations = var_559, groups = var_505, pad = var_561_pad_0, pad_type = var_561_pad_type_0, strides = var_557, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_561_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303788352)))];
-            tensor<fp16, [1, 4096, 1, 1]> k_17_cast_fp16 = mul(x = var_548_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = string("k_17_cast_fp16")];
-            tensor<int32, [2]> var_552 = const()[name = string("op_552"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_554 = const()[name = string("op_554"), val = tensor<int32, [2]>([1, 1])];
-            string var_556_pad_type_0 = const()[name = string("op_556_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_556_pad_0 = const()[name = string("op_556_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_556_cast_fp16 = conv(dilations = var_554, groups = var_493, pad = var_556_pad_0, pad_type = var_556_pad_type_0, strides = var_552, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_556_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> k_17_cast_fp16 = mul(x = var_561_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = string("k_17_cast_fp16")];
+            tensor<int32, [2]> var_565 = const()[name = string("op_565"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_567 = const()[name = string("op_567"), val = tensor<int32, [2]>([1, 1])];
+            string var_569_pad_type_0 = const()[name = string("op_569_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_569_pad_0 = const()[name = string("op_569_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_569_cast_fp16 = conv(dilations = var_567, groups = var_505, pad = var_569_pad_0, pad_type = var_569_pad_type_0, strides = var_565, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_569_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303796608)))];
-            tensor<fp16, [1, 4096, 1, 1]> v_13_cast_fp16 = mul(x = var_556_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = string("v_13_cast_fp16")];
-            tensor<int32, [4]> var_558 = const()[name = string("op_558"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> q_15_cast_fp16 = reshape(shape = var_558, x = q_13_cast_fp16)[name = string("q_15_cast_fp16")];
-            tensor<int32, [4]> var_560 = const()[name = string("op_560"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> k_19_cast_fp16 = reshape(shape = var_560, x = k_17_cast_fp16)[name = string("k_19_cast_fp16")];
-            tensor<int32, [4]> var_562 = const()[name = string("op_562"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> v_15_cast_fp16 = reshape(shape = var_562, x = v_13_cast_fp16)[name = string("v_15_cast_fp16")];
-            tensor<int32, [4]> var_574_begin_0 = const()[name = string("op_574_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_574_end_0 = const()[name = string("op_574_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_574_end_mask_0 = const()[name = string("op_574_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_574_cast_fp16 = slice_by_index(begin = var_574_begin_0, end = var_574_end_0, end_mask = var_574_end_mask_0, x = q_15_cast_fp16)[name = string("op_574_cast_fp16")];
-            tensor<int32, [4]> var_580_begin_0 = const()[name = string("op_580_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_580_end_0 = const()[name = string("op_580_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_580_end_mask_0 = const()[name = string("op_580_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_580_cast_fp16 = slice_by_index(begin = var_580_begin_0, end = var_580_end_0, end_mask = var_580_end_mask_0, x = q_15_cast_fp16)[name = string("op_580_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> v_17_cast_fp16 = mul(x = var_569_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = string("v_17_cast_fp16")];
+            tensor<int32, [4]> var_571 = const()[name = string("op_571"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> q_15_cast_fp16 = reshape(shape = var_571, x = q_13_cast_fp16)[name = string("q_15_cast_fp16")];
+            tensor<int32, [4]> var_573 = const()[name = string("op_573"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> k_19_cast_fp16 = reshape(shape = var_573, x = k_17_cast_fp16)[name = string("k_19_cast_fp16")];
+            tensor<int32, [4]> var_575 = const()[name = string("op_575"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> v_19_cast_fp16 = reshape(shape = var_575, x = v_17_cast_fp16)[name = string("v_19_cast_fp16")];
+            tensor<int32, [4]> var_587_begin_0 = const()[name = string("op_587_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_587_end_0 = const()[name = string("op_587_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_587_end_mask_0 = const()[name = string("op_587_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_587_cast_fp16 = slice_by_index(begin = var_587_begin_0, end = var_587_end_0, end_mask = var_587_end_mask_0, x = q_15_cast_fp16)[name = string("op_587_cast_fp16")];
+            tensor<int32, [4]> var_593_begin_0 = const()[name = string("op_593_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_593_end_0 = const()[name = string("op_593_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_593_end_mask_0 = const()[name = string("op_593_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_593_cast_fp16 = slice_by_index(begin = var_593_begin_0, end = var_593_end_0, end_mask = var_593_end_mask_0, x = q_15_cast_fp16)[name = string("op_593_cast_fp16")];
             fp16 const_32_promoted_to_fp16 = const()[name = string("const_32_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_582_cast_fp16 = mul(x = var_580_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_582_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_595_cast_fp16 = mul(x = var_593_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_595_cast_fp16")];
             bool rotated_9_interleave_0 = const()[name = string("rotated_9_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_9_cast_fp16 = concat(axis = var_496, interleave = rotated_9_interleave_0, values = (var_582_cast_fp16, var_574_cast_fp16))[name = string("rotated_9_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_585_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = string("op_585_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_586_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = string("op_586_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_9_cast_fp16 = add(x = var_585_cast_fp16, y = var_586_cast_fp16)[name = string("roped_9_cast_fp16")];
-            tensor<int32, [4]> var_599_begin_0 = const()[name = string("op_599_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_599_end_0 = const()[name = string("op_599_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_599_end_mask_0 = const()[name = string("op_599_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_599_cast_fp16 = slice_by_index(begin = var_599_begin_0, end = var_599_end_0, end_mask = var_599_end_mask_0, x = k_19_cast_fp16)[name = string("op_599_cast_fp16")];
-            tensor<int32, [4]> var_605_begin_0 = const()[name = string("op_605_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_605_end_0 = const()[name = string("op_605_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_605_end_mask_0 = const()[name = string("op_605_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_605_cast_fp16 = slice_by_index(begin = var_605_begin_0, end = var_605_end_0, end_mask = var_605_end_mask_0, x = k_19_cast_fp16)[name = string("op_605_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_9_cast_fp16 = concat(axis = var_508, interleave = rotated_9_interleave_0, values = (var_595_cast_fp16, var_587_cast_fp16))[name = string("rotated_9_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_598_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = string("op_598_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_599_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = string("op_599_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_9_cast_fp16 = add(x = var_598_cast_fp16, y = var_599_cast_fp16)[name = string("roped_9_cast_fp16")];
+            tensor<int32, [4]> var_612_begin_0 = const()[name = string("op_612_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_612_end_0 = const()[name = string("op_612_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_612_end_mask_0 = const()[name = string("op_612_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_612_cast_fp16 = slice_by_index(begin = var_612_begin_0, end = var_612_end_0, end_mask = var_612_end_mask_0, x = k_19_cast_fp16)[name = string("op_612_cast_fp16")];
+            tensor<int32, [4]> var_618_begin_0 = const()[name = string("op_618_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_618_end_0 = const()[name = string("op_618_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_618_end_mask_0 = const()[name = string("op_618_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_618_cast_fp16 = slice_by_index(begin = var_618_begin_0, end = var_618_end_0, end_mask = var_618_end_mask_0, x = k_19_cast_fp16)[name = string("op_618_cast_fp16")];
             fp16 const_34_promoted_to_fp16 = const()[name = string("const_34_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_607_cast_fp16 = mul(x = var_605_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_607_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_620_cast_fp16 = mul(x = var_618_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_620_cast_fp16")];
             bool rotated_interleave_0 = const()[name = string("rotated_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_cast_fp16 = concat(axis = var_496, interleave = rotated_interleave_0, values = (var_607_cast_fp16, var_599_cast_fp16))[name = string("rotated_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_610_cast_fp16 = mul(x = k_19_cast_fp16, y = cos)[name = string("op_610_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_611_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_611_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_cast_fp16 = add(x = var_610_cast_fp16, y = var_611_cast_fp16)[name = string("roped_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_cast_fp16 = concat(axis = var_508, interleave = rotated_interleave_0, values = (var_620_cast_fp16, var_612_cast_fp16))[name = string("rotated_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_623_cast_fp16 = mul(x = k_19_cast_fp16, y = cos)[name = string("op_623_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_624_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_624_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_cast_fp16 = add(x = var_623_cast_fp16, y = var_624_cast_fp16)[name = string("roped_cast_fp16")];
+            tensor<int32, [4]> v_21_perm_0 = const()[name = string("v_21_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             bool k_interleave_0 = const()[name = string("k_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_cast_fp16 = concat(axis = var_484, interleave = k_interleave_0, values = (k_cache_2, roped_cast_fp16))[name = string("k_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> k_cast_fp16 = concat(axis = var_496, interleave = k_interleave_0, values = (k_cache_2, roped_cast_fp16))[name = string("k_cast_fp16")];
             bool v_interleave_0 = const()[name = string("v_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> v_cast_fp16 = concat(axis = var_484, interleave = v_interleave_0, values = (v_cache_2, v_15_cast_fp16))[name = string("v_cast_fp16")];
-            tensor<int32, [4]> var_618_begin_0 = const()[name = string("op_618_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_618_end_0 = const()[name = string("op_618_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_618_end_mask_0 = const()[name = string("op_618_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_2 = slice_by_index(begin = var_618_begin_0, end = var_618_end_0, end_mask = var_618_end_mask_0, x = k_cast_fp16)[name = string("op_618_cast_fp16")];
-            tensor<int32, [4]> var_619_begin_0 = const()[name = string("op_619_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_619_end_0 = const()[name = string("op_619_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_619_end_mask_0 = const()[name = string("op_619_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_2 = slice_by_index(begin = var_619_begin_0, end = var_619_end_0, end_mask = var_619_end_mask_0, x = v_cast_fp16)[name = string("op_619_cast_fp16")];
-            fp16 var_623_to_fp16 = const()[name = string("op_623_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 1]> var_624_cast_fp16 = mul(x = roped_9_cast_fp16, y = var_623_to_fp16)[name = string("op_624_cast_fp16")];
-            bool attn_weights_9_transpose_x_0 = const()[name = string("attn_weights_9_transpose_x_0"), val = bool(true)];
-            bool attn_weights_9_transpose_y_0 = const()[name = string("attn_weights_9_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_9_cast_fp16 = matmul(transpose_x = attn_weights_9_transpose_x_0, transpose_y = attn_weights_9_transpose_y_0, x = var_624_cast_fp16, y = k_cast_fp16)[name = string("attn_weights_9_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_cast_fp16 = add(x = attn_weights_9_cast_fp16, y = mask)[name = string("attn_weights_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> var_632_cast_fp16 = softmax(axis = var_492, x = attn_weights_cast_fp16)[name = string("op_632_cast_fp16")];
-            bool attn_9_transpose_x_0 = const()[name = string("attn_9_transpose_x_0"), val = bool(false)];
-            bool attn_9_transpose_y_0 = const()[name = string("attn_9_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 1]> attn_9_cast_fp16 = matmul(transpose_x = attn_9_transpose_x_0, transpose_y = attn_9_transpose_y_0, x = v_cast_fp16, y = var_632_cast_fp16)[name = string("attn_9_cast_fp16")];
-            tensor<int32, [4]> var_636 = const()[name = string("op_636"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
-            tensor<fp16, [1, 4096, 1, 1]> input_17_cast_fp16 = reshape(shape = var_636, x = attn_9_cast_fp16)[name = string("input_17_cast_fp16")];
-            tensor<int32, [2]> var_640 = const()[name = string("op_640"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_642 = const()[name = string("op_642"), val = tensor<int32, [2]>([1, 1])];
-            string var_644_pad_type_0 = const()[name = string("op_644_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_644_pad_0 = const()[name = string("op_644_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_644_cast_fp16 = conv(dilations = var_642, groups = var_493, pad = var_644_pad_0, pad_type = var_644_pad_type_0, strides = var_640, weight = blocks_2_attn_proj_weight_palettized_cast_fp16, x = input_17_cast_fp16)[name = string("op_644_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 128]> v_21_cast_fp16 = transpose(perm = v_21_perm_0, x = v_19_cast_fp16)[name = string("transpose_1")];
+            tensor<fp16, [1, 32, 512, 128]> v_cast_fp16 = concat(axis = var_508, interleave = v_interleave_0, values = (v_cache_2, v_21_cast_fp16))[name = string("v_cast_fp16")];
+            tensor<int32, [4]> var_635_begin_0 = const()[name = string("op_635_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_635_end_0 = const()[name = string("op_635_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_635_end_mask_0 = const()[name = string("op_635_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_2 = slice_by_index(begin = var_635_begin_0, end = var_635_end_0, end_mask = var_635_end_mask_0, x = k_cast_fp16)[name = string("op_635_cast_fp16")];
+            tensor<int32, [4]> var_636_begin_0 = const()[name = string("op_636_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_636_end_0 = const()[name = string("op_636_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_636_end_mask_0 = const()[name = string("op_636_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_2 = slice_by_index(begin = var_636_begin_0, end = var_636_end_0, end_mask = var_636_end_mask_0, x = v_cast_fp16)[name = string("op_636_cast_fp16")];
+            fp16 var_641_to_fp16 = const()[name = string("op_641_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 32, 128, 4]> var_642_cast_fp16 = mul(x = roped_9_cast_fp16, y = var_641_to_fp16)[name = string("op_642_cast_fp16")];
+            bool attn_weights_13_transpose_x_0 = const()[name = string("attn_weights_13_transpose_x_0"), val = bool(true)];
+            bool attn_weights_13_transpose_y_0 = const()[name = string("attn_weights_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_13_cast_fp16 = matmul(transpose_x = attn_weights_13_transpose_x_0, transpose_y = attn_weights_13_transpose_y_0, x = var_642_cast_fp16, y = k_cast_fp16)[name = string("attn_weights_13_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_15_cast_fp16 = add(x = attn_weights_13_cast_fp16, y = mask)[name = string("attn_weights_15_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_cast_fp16 = softmax(axis = var_504, x = attn_weights_15_cast_fp16)[name = string("attn_weights_cast_fp16")];
+            bool var_651_transpose_x_0 = const()[name = string("op_651_transpose_x_0"), val = bool(false)];
+            bool var_651_transpose_y_0 = const()[name = string("op_651_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> var_651_cast_fp16 = matmul(transpose_x = var_651_transpose_x_0, transpose_y = var_651_transpose_y_0, x = attn_weights_cast_fp16, y = v_cast_fp16)[name = string("op_651_cast_fp16")];
+            tensor<int32, [4]> attn_5_perm_0 = const()[name = string("attn_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_654 = const()[name = string("op_654"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 4]> attn_5_cast_fp16 = transpose(perm = attn_5_perm_0, x = var_651_cast_fp16)[name = string("transpose_0")];
+            tensor<fp16, [1, 4096, 1, 4]> input_17_cast_fp16 = reshape(shape = var_654, x = attn_5_cast_fp16)[name = string("input_17_cast_fp16")];
+            tensor<int32, [2]> var_658 = const()[name = string("op_658"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_660 = const()[name = string("op_660"), val = tensor<int32, [2]>([1, 1])];
+            string var_662_pad_type_0 = const()[name = string("op_662_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_662_pad_0 = const()[name = string("op_662_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_662_cast_fp16 = conv(dilations = var_660, groups = var_505, pad = var_662_pad_0, pad_type = var_662_pad_type_0, strides = var_658, weight = blocks_2_attn_proj_weight_palettized_cast_fp16, x = input_17_cast_fp16)[name = string("op_662_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303804864)))];
-            tensor<fp16, [1, 4096, 1, 1]> attention_output_cast_fp16 = mul(x = var_644_cast_fp16, y = blocks_2_attn_proj_output_scales_to_fp16)[name = string("attention_output_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_39_cast_fp16 = add(x = attention_output_cast_fp16, y = x_29_cast_fp16)[name = string("x_39_cast_fp16")];
-            tensor<int32, [1]> var_663_axes_0 = const()[name = string("op_663_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_663_cast_fp16 = squeeze(axes = var_663_axes_0, x = x_39_cast_fp16)[name = string("op_663_cast_fp16")];
-            bool var_665_interleave_0 = const()[name = string("op_665_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_to_fp16 = const()[name = string("eps_chan_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_665_cast_fp16 = concat(axis = var_493, interleave = var_665_interleave_0, values = (var_663_cast_fp16, eps_chan_to_fp16))[name = string("op_665_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> attention_output_cast_fp16 = mul(x = var_662_cast_fp16, y = blocks_2_attn_proj_output_scales_to_fp16)[name = string("attention_output_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_39_cast_fp16 = add(x = attention_output_cast_fp16, y = x_29_cast_fp16)[name = string("x_39_cast_fp16")];
+            tensor<int32, [1]> var_681_axes_0 = const()[name = string("op_681_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_681_cast_fp16 = squeeze(axes = var_681_axes_0, x = x_39_cast_fp16)[name = string("op_681_cast_fp16")];
+            bool var_683_interleave_0 = const()[name = string("op_683_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_to_fp16 = const()[name = string("eps_chan_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_683_cast_fp16 = concat(axis = var_505, interleave = var_683_interleave_0, values = (var_681_cast_fp16, eps_chan_to_fp16))[name = string("op_683_cast_fp16")];
             tensor<int32, [1]> x_eps_axes_0 = const()[name = string("x_eps_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_cast_fp16 = expand_dims(axes = x_eps_axes_0, x = var_665_cast_fp16)[name = string("x_eps_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_cast_fp16 = expand_dims(axes = x_eps_axes_0, x = var_683_cast_fp16)[name = string("x_eps_cast_fp16")];
             tensor<int32, [1]> norm_x_axes_0 = const()[name = string("norm_x_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_cast_fp16 = reduce_l2_norm(axes = norm_x_axes_0, keep_dims = var_497, x = x_eps_cast_fp16)[name = string("norm_x_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_31_cast_fp16 = real_div(x = x_39_cast_fp16, y = norm_x_cast_fp16)[name = string("x_normed_31_cast_fp16")];
-            fp16 var_670_to_fp16 = const()[name = string("op_670_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_33_cast_fp16 = mul(x = x_normed_31_cast_fp16, y = var_670_to_fp16)[name = string("x_normed_33_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_cast_fp16 = reduce_l2_norm(axes = norm_x_axes_0, keep_dims = var_509, x = x_eps_cast_fp16)[name = string("norm_x_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_31_cast_fp16 = real_div(x = x_39_cast_fp16, y = norm_x_cast_fp16)[name = string("x_normed_31_cast_fp16")];
+            fp16 var_688_to_fp16 = const()[name = string("op_688_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_33_cast_fp16 = mul(x = x_normed_31_cast_fp16, y = var_688_to_fp16)[name = string("x_normed_33_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_norm_2_weight_to_fp16 = const()[name = string("blocks_2_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303813120)))];
-            tensor<fp16, [1, 4096, 1, 1]> input_19_cast_fp16 = mul(x = x_normed_33_cast_fp16, y = blocks_2_norm_2_weight_to_fp16)[name = string("input_19_cast_fp16")];
-            tensor<int32, [2]> var_682 = const()[name = string("op_682"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_684 = const()[name = string("op_684"), val = tensor<int32, [2]>([1, 1])];
-            string var_686_pad_type_0 = const()[name = string("op_686_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_686_pad_0 = const()[name = string("op_686_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_686_cast_fp16 = conv(dilations = var_684, groups = var_493, pad = var_686_pad_0, pad_type = var_686_pad_type_0, strides = var_682, weight = blocks_2_mlp_fc_1_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_686_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303821376)))];
-            tensor<fp16, [1, 11008, 1, 1]> input_21_cast_fp16 = mul(x = var_686_cast_fp16, y = blocks_2_mlp_fc_1_output_scales_to_fp16)[name = string("input_21_cast_fp16")];
-            tensor<int32, [2]> var_690 = const()[name = string("op_690"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_692 = const()[name = string("op_692"), val = tensor<int32, [2]>([1, 1])];
-            string var_694_pad_type_0 = const()[name = string("op_694_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_694_pad_0 = const()[name = string("op_694_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_694_cast_fp16 = conv(dilations = var_692, groups = var_493, pad = var_694_pad_0, pad_type = var_694_pad_type_0, strides = var_690, weight = blocks_2_mlp_fc_2_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_694_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303843456)))];
-            tensor<fp16, [1, 11008, 1, 1]> x_fc_2_cast_fp16 = mul(x = var_694_cast_fp16, y = blocks_2_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> var_696_cast_fp16 = silu(x = input_21_cast_fp16)[name = string("op_696_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> input_cast_fp16 = mul(x = var_696_cast_fp16, y = x_fc_2_cast_fp16)[name = string("input_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> input_19_cast_fp16 = mul(x = x_normed_33_cast_fp16, y = blocks_2_norm_2_weight_to_fp16)[name = string("input_19_cast_fp16")];
             tensor<int32, [2]> var_700 = const()[name = string("op_700"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_702 = const()[name = string("op_702"), val = tensor<int32, [2]>([1, 1])];
             string var_704_pad_type_0 = const()[name = string("op_704_pad_type_0"), val = string("custom")];
             tensor<int32, [4]> var_704_pad_0 = const()[name = string("op_704_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_704_cast_fp16 = conv(dilations = var_702, groups = var_493, pad = var_704_pad_0, pad_type = var_704_pad_type_0, strides = var_700, weight = blocks_2_mlp_proj_weight_palettized_cast_fp16, x = input_cast_fp16)[name = string("op_704_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> var_704_cast_fp16 = conv(dilations = var_702, groups = var_505, pad = var_704_pad_0, pad_type = var_704_pad_type_0, strides = var_700, weight = blocks_2_mlp_fc_1_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_704_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303821376)))];
+            tensor<fp16, [1, 11008, 1, 4]> input_21_cast_fp16 = mul(x = var_704_cast_fp16, y = blocks_2_mlp_fc_1_output_scales_to_fp16)[name = string("input_21_cast_fp16")];
+            tensor<int32, [2]> var_708 = const()[name = string("op_708"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_710 = const()[name = string("op_710"), val = tensor<int32, [2]>([1, 1])];
+            string var_712_pad_type_0 = const()[name = string("op_712_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_712_pad_0 = const()[name = string("op_712_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_712_cast_fp16 = conv(dilations = var_710, groups = var_505, pad = var_712_pad_0, pad_type = var_712_pad_type_0, strides = var_708, weight = blocks_2_mlp_fc_2_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_712_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303843456)))];
+            tensor<fp16, [1, 11008, 1, 4]> x_fc_2_cast_fp16 = mul(x = var_712_cast_fp16, y = blocks_2_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> var_714_cast_fp16 = silu(x = input_21_cast_fp16)[name = string("op_714_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> input_cast_fp16 = mul(x = var_714_cast_fp16, y = x_fc_2_cast_fp16)[name = string("input_cast_fp16")];
+            tensor<int32, [2]> var_718 = const()[name = string("op_718"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_720 = const()[name = string("op_720"), val = tensor<int32, [2]>([1, 1])];
+            string var_722_pad_type_0 = const()[name = string("op_722_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_722_pad_0 = const()[name = string("op_722_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_722_cast_fp16 = conv(dilations = var_720, groups = var_505, pad = var_722_pad_0, pad_type = var_722_pad_type_0, strides = var_718, weight = blocks_2_mlp_proj_weight_palettized_cast_fp16, x = input_cast_fp16)[name = string("op_722_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303865536)))];
-            tensor<fp16, [1, 4096, 1, 1]> var_705_cast_fp16 = mul(x = var_704_cast_fp16, y = blocks_2_mlp_proj_output_scales_to_fp16)[name = string("op_705_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> new_x = add(x = var_705_cast_fp16, y = x_39_cast_fp16)[name = string("op_706_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> var_723_cast_fp16 = mul(x = var_722_cast_fp16, y = blocks_2_mlp_proj_output_scales_to_fp16)[name = string("op_723_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> new_x = add(x = var_723_cast_fp16, y = x_39_cast_fp16)[name = string("op_724_cast_fp16")];
         } -> (new_x, new_k_cache_0, new_k_cache_1, new_k_cache_2, new_v_cache_0, new_v_cache_1, new_v_cache_2);
     func input_512_context_512<ios18>(tensor<fp16, [128, 512]> cos, tensor<fp16, [1, 1, 512, 512]> mask, tensor<fp16, [128, 512]> sin, tensor<fp16, [1, 4096, 1, 512]> x) {
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_q_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8388736))))[name = string("blocks_0_attn_q_proj_weight_palettized_cast_fp16")];
@@ -502,86 +514,86 @@ program(1.3)
             tensor<fp16, [1, 4096, 1, 512]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_54_to_fp16)[name = string("x_normed_3_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_1_weight_to_fp16 = const()[name = string("blocks_0_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303567936)))];
             tensor<fp16, [1, 4096, 1, 512]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = string("x_5_cast_fp16")];
-            tensor<int32, [2]> var_66 = const()[name = string("op_66"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_68 = const()[name = string("op_68"), val = tensor<int32, [2]>([1, 1])];
-            string var_70_pad_type_0 = const()[name = string("op_70_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_70_pad_0 = const()[name = string("op_70_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_70_cast_fp16 = conv(dilations = var_68, groups = var_25, pad = var_70_pad_0, pad_type = var_70_pad_type_0, strides = var_66, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_70_cast_fp16")];
+            tensor<int32, [2]> var_67 = const()[name = string("op_67"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_69 = const()[name = string("op_69"), val = tensor<int32, [2]>([1, 1])];
+            string var_71_pad_type_0 = const()[name = string("op_71_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_71_pad_0 = const()[name = string("op_71_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_71_cast_fp16 = conv(dilations = var_69, groups = var_25, pad = var_71_pad_0, pad_type = var_71_pad_type_0, strides = var_67, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_71_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303576192)))];
-            tensor<fp16, [1, 4096, 1, 512]> q_1_cast_fp16 = mul(x = var_70_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
-            tensor<int32, [2]> var_74 = const()[name = string("op_74"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_76 = const()[name = string("op_76"), val = tensor<int32, [2]>([1, 1])];
-            string var_78_pad_type_0 = const()[name = string("op_78_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_78_pad_0 = const()[name = string("op_78_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_78_cast_fp16 = conv(dilations = var_76, groups = var_25, pad = var_78_pad_0, pad_type = var_78_pad_type_0, strides = var_74, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_78_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> q_1_cast_fp16 = mul(x = var_71_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
+            tensor<int32, [2]> var_75 = const()[name = string("op_75"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_77 = const()[name = string("op_77"), val = tensor<int32, [2]>([1, 1])];
+            string var_79_pad_type_0 = const()[name = string("op_79_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_79_pad_0 = const()[name = string("op_79_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_79_cast_fp16 = conv(dilations = var_77, groups = var_25, pad = var_79_pad_0, pad_type = var_79_pad_type_0, strides = var_75, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_79_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303584448)))];
-            tensor<fp16, [1, 4096, 1, 512]> k_1_cast_fp16 = mul(x = var_78_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
-            tensor<int32, [2]> var_82 = const()[name = string("op_82"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_84 = const()[name = string("op_84"), val = tensor<int32, [2]>([1, 1])];
-            string var_86_pad_type_0 = const()[name = string("op_86_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_86_pad_0 = const()[name = string("op_86_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_86_cast_fp16 = conv(dilations = var_84, groups = var_25, pad = var_86_pad_0, pad_type = var_86_pad_type_0, strides = var_82, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_86_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> k_1_cast_fp16 = mul(x = var_79_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
+            tensor<int32, [2]> var_83 = const()[name = string("op_83"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_85 = const()[name = string("op_85"), val = tensor<int32, [2]>([1, 1])];
+            string var_87_pad_type_0 = const()[name = string("op_87_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_87_pad_0 = const()[name = string("op_87_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_87_cast_fp16 = conv(dilations = var_85, groups = var_25, pad = var_87_pad_0, pad_type = var_87_pad_type_0, strides = var_83, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_87_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303592704)))];
-            tensor<fp16, [1, 4096, 1, 512]> v_1_cast_fp16 = mul(x = var_86_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
-            tensor<int32, [4]> var_88 = const()[name = string("op_88"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> q_3_cast_fp16 = reshape(shape = var_88, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
-            tensor<int32, [4]> var_90 = const()[name = string("op_90"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> k_3_cast_fp16 = reshape(shape = var_90, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
-            tensor<int32, [4]> var_92 = const()[name = string("op_92"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> v_3_cast_fp16 = reshape(shape = var_92, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
-            tensor<int32, [4]> var_104_begin_0 = const()[name = string("op_104_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_104_end_0 = const()[name = string("op_104_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_104_end_mask_0 = const()[name = string("op_104_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_104_cast_fp16 = slice_by_index(begin = var_104_begin_0, end = var_104_end_0, end_mask = var_104_end_mask_0, x = q_3_cast_fp16)[name = string("op_104_cast_fp16")];
-            tensor<int32, [4]> var_110_begin_0 = const()[name = string("op_110_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_110_end_0 = const()[name = string("op_110_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_110_end_mask_0 = const()[name = string("op_110_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_110_cast_fp16 = slice_by_index(begin = var_110_begin_0, end = var_110_end_0, end_mask = var_110_end_mask_0, x = q_3_cast_fp16)[name = string("op_110_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> v_1_cast_fp16 = mul(x = var_87_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
+            tensor<int32, [4]> var_89 = const()[name = string("op_89"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> q_3_cast_fp16 = reshape(shape = var_89, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
+            tensor<int32, [4]> var_91 = const()[name = string("op_91"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> k_3_cast_fp16 = reshape(shape = var_91, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
+            tensor<int32, [4]> var_93 = const()[name = string("op_93"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> v_3_cast_fp16 = reshape(shape = var_93, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
+            tensor<int32, [4]> var_105_begin_0 = const()[name = string("op_105_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_105_end_0 = const()[name = string("op_105_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_105_end_mask_0 = const()[name = string("op_105_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_105_cast_fp16 = slice_by_index(begin = var_105_begin_0, end = var_105_end_0, end_mask = var_105_end_mask_0, x = q_3_cast_fp16)[name = string("op_105_cast_fp16")];
+            tensor<int32, [4]> var_111_begin_0 = const()[name = string("op_111_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_111_end_0 = const()[name = string("op_111_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_111_end_mask_0 = const()[name = string("op_111_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_111_cast_fp16 = slice_by_index(begin = var_111_begin_0, end = var_111_end_0, end_mask = var_111_end_mask_0, x = q_3_cast_fp16)[name = string("op_111_cast_fp16")];
             fp16 const_6_promoted_to_fp16 = const()[name = string("const_6_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_112_cast_fp16 = mul(x = var_110_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_112_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_113_cast_fp16 = mul(x = var_111_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_113_cast_fp16")];
             bool rotated_1_interleave_0 = const()[name = string("rotated_1_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_1_cast_fp16 = concat(axis = var_28, interleave = rotated_1_interleave_0, values = (var_112_cast_fp16, var_104_cast_fp16))[name = string("rotated_1_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_115_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_115_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_116_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_116_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_1_cast_fp16 = add(x = var_115_cast_fp16, y = var_116_cast_fp16)[name = string("roped_1_cast_fp16")];
-            tensor<int32, [4]> var_129_begin_0 = const()[name = string("op_129_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_129_end_0 = const()[name = string("op_129_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_129_end_mask_0 = const()[name = string("op_129_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_129_cast_fp16 = slice_by_index(begin = var_129_begin_0, end = var_129_end_0, end_mask = var_129_end_mask_0, x = k_3_cast_fp16)[name = string("op_129_cast_fp16")];
-            tensor<int32, [4]> var_135_begin_0 = const()[name = string("op_135_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_135_end_0 = const()[name = string("op_135_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_135_end_mask_0 = const()[name = string("op_135_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_135_cast_fp16 = slice_by_index(begin = var_135_begin_0, end = var_135_end_0, end_mask = var_135_end_mask_0, x = k_3_cast_fp16)[name = string("op_135_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_1_cast_fp16 = concat(axis = var_28, interleave = rotated_1_interleave_0, values = (var_113_cast_fp16, var_105_cast_fp16))[name = string("rotated_1_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_116_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_116_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_117_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_117_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_1_cast_fp16 = add(x = var_116_cast_fp16, y = var_117_cast_fp16)[name = string("roped_1_cast_fp16")];
+            tensor<int32, [4]> var_130_begin_0 = const()[name = string("op_130_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_130_end_0 = const()[name = string("op_130_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_130_end_mask_0 = const()[name = string("op_130_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_130_cast_fp16 = slice_by_index(begin = var_130_begin_0, end = var_130_end_0, end_mask = var_130_end_mask_0, x = k_3_cast_fp16)[name = string("op_130_cast_fp16")];
+            tensor<int32, [4]> var_136_begin_0 = const()[name = string("op_136_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_136_end_0 = const()[name = string("op_136_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_136_end_mask_0 = const()[name = string("op_136_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_136_cast_fp16 = slice_by_index(begin = var_136_begin_0, end = var_136_end_0, end_mask = var_136_end_mask_0, x = k_3_cast_fp16)[name = string("op_136_cast_fp16")];
             fp16 const_8_promoted_to_fp16 = const()[name = string("const_8_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_137_cast_fp16 = mul(x = var_135_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_137_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_138_cast_fp16 = mul(x = var_136_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_138_cast_fp16")];
             bool rotated_3_interleave_0 = const()[name = string("rotated_3_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_3_cast_fp16 = concat(axis = var_28, interleave = rotated_3_interleave_0, values = (var_137_cast_fp16, var_129_cast_fp16))[name = string("rotated_3_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_140_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_140_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_141_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_141_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_3_cast_fp16 = add(x = var_140_cast_fp16, y = var_141_cast_fp16)[name = string("roped_3_cast_fp16")];
-            bool q_5_interleave_0 = const()[name = string("q_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> q_5_cast_fp16 = concat(axis = var_28, interleave = q_5_interleave_0, values = roped_1_cast_fp16)[name = string("q_5_cast_fp16")];
-            bool k_5_interleave_0 = const()[name = string("k_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_5_cast_fp16 = concat(axis = var_28, interleave = k_5_interleave_0, values = roped_3_cast_fp16)[name = string("k_5_cast_fp16")];
-            tensor<int32, [4]> var_156_begin_0 = const()[name = string("op_156_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_156_end_0 = const()[name = string("op_156_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_156_end_mask_0 = const()[name = string("op_156_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_0 = slice_by_index(begin = var_156_begin_0, end = var_156_end_0, end_mask = var_156_end_mask_0, x = k_5_cast_fp16)[name = string("op_156_cast_fp16")];
-            tensor<int32, [4]> var_157_begin_0 = const()[name = string("op_157_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_157_end_0 = const()[name = string("op_157_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_157_end_mask_0 = const()[name = string("op_157_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_0 = slice_by_index(begin = var_157_begin_0, end = var_157_end_0, end_mask = var_157_end_mask_0, x = v_3_cast_fp16)[name = string("op_157_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_3_cast_fp16 = concat(axis = var_28, interleave = rotated_3_interleave_0, values = (var_138_cast_fp16, var_130_cast_fp16))[name = string("rotated_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_141_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_141_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_142_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_142_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_3_cast_fp16 = add(x = var_141_cast_fp16, y = var_142_cast_fp16)[name = string("roped_3_cast_fp16")];
+            tensor<int32, [4]> v_5_perm_0 = const()[name = string("v_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_146_begin_0 = const()[name = string("op_146_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_146_end_0 = const()[name = string("op_146_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_146_end_mask_0 = const()[name = string("op_146_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_0 = slice_by_index(begin = var_146_begin_0, end = var_146_end_0, end_mask = var_146_end_mask_0, x = roped_3_cast_fp16)[name = string("op_146_cast_fp16")];
+            tensor<int32, [4]> var_147_begin_0 = const()[name = string("op_147_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_147_end_0 = const()[name = string("op_147_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_147_end_mask_0 = const()[name = string("op_147_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 512, 128]> v_5_cast_fp16 = transpose(perm = v_5_perm_0, x = v_3_cast_fp16)[name = string("transpose_5")];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_0 = slice_by_index(begin = var_147_begin_0, end = var_147_end_0, end_mask = var_147_end_mask_0, x = v_5_cast_fp16)[name = string("op_147_cast_fp16")];
             fp16 var_161_to_fp16 = const()[name = string("op_161_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 512]> var_162_cast_fp16 = mul(x = q_5_cast_fp16, y = var_161_to_fp16)[name = string("op_162_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_162_cast_fp16 = mul(x = roped_1_cast_fp16, y = var_161_to_fp16)[name = string("op_162_cast_fp16")];
             bool attn_weights_1_transpose_x_0 = const()[name = string("attn_weights_1_transpose_x_0"), val = bool(true)];
             bool attn_weights_1_transpose_y_0 = const()[name = string("attn_weights_1_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_162_cast_fp16, y = k_5_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_162_cast_fp16, y = roped_3_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
             tensor<fp16, [1, 32, 512, 512]> attn_weights_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = mask)[name = string("attn_weights_3_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> var_170_cast_fp16 = softmax(axis = var_24, x = attn_weights_3_cast_fp16)[name = string("op_170_cast_fp16")];
-            bool attn_1_transpose_x_0 = const()[name = string("attn_1_transpose_x_0"), val = bool(false)];
-            bool attn_1_transpose_y_0 = const()[name = string("attn_1_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 512]> attn_1_cast_fp16 = matmul(transpose_x = attn_1_transpose_x_0, transpose_y = attn_1_transpose_y_0, x = v_3_cast_fp16, y = var_170_cast_fp16)[name = string("attn_1_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_5_cast_fp16 = softmax(axis = var_24, x = attn_weights_3_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
+            bool var_171_transpose_x_1 = const()[name = string("op_171_transpose_x_1"), val = bool(false)];
+            bool var_171_transpose_y_1 = const()[name = string("op_171_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 32, 512, 128]> var_171_cast_fp16 = matmul(transpose_x = var_171_transpose_x_1, transpose_y = var_171_transpose_y_1, x = attn_weights_5_cast_fp16, y = v_3_cast_fp16)[name = string("op_171_cast_fp16")];
+            tensor<int32, [4]> attn_1_perm_0 = const()[name = string("attn_1_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             tensor<int32, [4]> var_174 = const()[name = string("op_174"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 512]> attn_1_cast_fp16 = transpose(perm = attn_1_perm_0, x = var_171_cast_fp16)[name = string("transpose_4")];
             tensor<fp16, [1, 4096, 1, 512]> input_1_cast_fp16 = reshape(shape = var_174, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
             tensor<int32, [2]> var_178 = const()[name = string("op_178"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_180 = const()[name = string("op_180"), val = tensor<int32, [2]>([1, 1])];
@@ -645,86 +657,86 @@ program(1.3)
             tensor<fp16, [1, 4096, 1, 512]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_291_to_fp16)[name = string("x_normed_15_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_1_weight_to_fp16 = const()[name = string("blocks_1_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303669888)))];
             tensor<fp16, [1, 4096, 1, 512]> x_19_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = string("x_19_cast_fp16")];
-            tensor<int32, [2]> var_306 = const()[name = string("op_306"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_308 = const()[name = string("op_308"), val = tensor<int32, [2]>([1, 1])];
-            string var_310_pad_type_0 = const()[name = string("op_310_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_310_pad_0 = const()[name = string("op_310_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_310_cast_fp16 = conv(dilations = var_308, groups = var_263, pad = var_310_pad_0, pad_type = var_310_pad_type_0, strides = var_306, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_310_cast_fp16")];
+            tensor<int32, [2]> var_307 = const()[name = string("op_307"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_309 = const()[name = string("op_309"), val = tensor<int32, [2]>([1, 1])];
+            string var_311_pad_type_0 = const()[name = string("op_311_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_311_pad_0 = const()[name = string("op_311_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_311_cast_fp16 = conv(dilations = var_309, groups = var_263, pad = var_311_pad_0, pad_type = var_311_pad_type_0, strides = var_307, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_311_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303678144)))];
-            tensor<fp16, [1, 4096, 1, 512]> q_7_cast_fp16 = mul(x = var_310_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
-            tensor<int32, [2]> var_314 = const()[name = string("op_314"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_316 = const()[name = string("op_316"), val = tensor<int32, [2]>([1, 1])];
-            string var_318_pad_type_0 = const()[name = string("op_318_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_318_pad_0 = const()[name = string("op_318_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_318_cast_fp16 = conv(dilations = var_316, groups = var_263, pad = var_318_pad_0, pad_type = var_318_pad_type_0, strides = var_314, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_318_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> q_7_cast_fp16 = mul(x = var_311_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
+            tensor<int32, [2]> var_315 = const()[name = string("op_315"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_317 = const()[name = string("op_317"), val = tensor<int32, [2]>([1, 1])];
+            string var_319_pad_type_0 = const()[name = string("op_319_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_319_pad_0 = const()[name = string("op_319_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_319_cast_fp16 = conv(dilations = var_317, groups = var_263, pad = var_319_pad_0, pad_type = var_319_pad_type_0, strides = var_315, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_319_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303686400)))];
-            tensor<fp16, [1, 4096, 1, 512]> k_7_cast_fp16 = mul(x = var_318_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_7_cast_fp16")];
-            tensor<int32, [2]> var_322 = const()[name = string("op_322"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_324 = const()[name = string("op_324"), val = tensor<int32, [2]>([1, 1])];
-            string var_326_pad_type_0 = const()[name = string("op_326_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_326_pad_0 = const()[name = string("op_326_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_326_cast_fp16 = conv(dilations = var_324, groups = var_263, pad = var_326_pad_0, pad_type = var_326_pad_type_0, strides = var_322, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_326_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> k_7_cast_fp16 = mul(x = var_319_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_7_cast_fp16")];
+            tensor<int32, [2]> var_323 = const()[name = string("op_323"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_325 = const()[name = string("op_325"), val = tensor<int32, [2]>([1, 1])];
+            string var_327_pad_type_0 = const()[name = string("op_327_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_327_pad_0 = const()[name = string("op_327_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_327_cast_fp16 = conv(dilations = var_325, groups = var_263, pad = var_327_pad_0, pad_type = var_327_pad_type_0, strides = var_323, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_327_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303694656)))];
-            tensor<fp16, [1, 4096, 1, 512]> v_5_cast_fp16 = mul(x = var_326_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_5_cast_fp16")];
-            tensor<int32, [4]> var_328 = const()[name = string("op_328"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> q_9_cast_fp16 = reshape(shape = var_328, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
-            tensor<int32, [4]> var_330 = const()[name = string("op_330"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> k_9_cast_fp16 = reshape(shape = var_330, x = k_7_cast_fp16)[name = string("k_9_cast_fp16")];
-            tensor<int32, [4]> var_332 = const()[name = string("op_332"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> v_7_cast_fp16 = reshape(shape = var_332, x = v_5_cast_fp16)[name = string("v_7_cast_fp16")];
-            tensor<int32, [4]> var_344_begin_0 = const()[name = string("op_344_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_344_end_0 = const()[name = string("op_344_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_344_end_mask_0 = const()[name = string("op_344_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_344_cast_fp16 = slice_by_index(begin = var_344_begin_0, end = var_344_end_0, end_mask = var_344_end_mask_0, x = q_9_cast_fp16)[name = string("op_344_cast_fp16")];
-            tensor<int32, [4]> var_350_begin_0 = const()[name = string("op_350_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_350_end_0 = const()[name = string("op_350_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_350_end_mask_0 = const()[name = string("op_350_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_350_cast_fp16 = slice_by_index(begin = var_350_begin_0, end = var_350_end_0, end_mask = var_350_end_mask_0, x = q_9_cast_fp16)[name = string("op_350_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> v_7_cast_fp16 = mul(x = var_327_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_7_cast_fp16")];
+            tensor<int32, [4]> var_329 = const()[name = string("op_329"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> q_9_cast_fp16 = reshape(shape = var_329, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
+            tensor<int32, [4]> var_331 = const()[name = string("op_331"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> k_9_cast_fp16 = reshape(shape = var_331, x = k_7_cast_fp16)[name = string("k_9_cast_fp16")];
+            tensor<int32, [4]> var_333 = const()[name = string("op_333"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> v_9_cast_fp16 = reshape(shape = var_333, x = v_7_cast_fp16)[name = string("v_9_cast_fp16")];
+            tensor<int32, [4]> var_345_begin_0 = const()[name = string("op_345_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_345_end_0 = const()[name = string("op_345_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_345_end_mask_0 = const()[name = string("op_345_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_345_cast_fp16 = slice_by_index(begin = var_345_begin_0, end = var_345_end_0, end_mask = var_345_end_mask_0, x = q_9_cast_fp16)[name = string("op_345_cast_fp16")];
+            tensor<int32, [4]> var_351_begin_0 = const()[name = string("op_351_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_351_end_0 = const()[name = string("op_351_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_351_end_mask_0 = const()[name = string("op_351_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_351_cast_fp16 = slice_by_index(begin = var_351_begin_0, end = var_351_end_0, end_mask = var_351_end_mask_0, x = q_9_cast_fp16)[name = string("op_351_cast_fp16")];
             fp16 const_19_promoted_to_fp16 = const()[name = string("const_19_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_352_cast_fp16 = mul(x = var_350_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_352_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_353_cast_fp16 = mul(x = var_351_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_353_cast_fp16")];
             bool rotated_5_interleave_0 = const()[name = string("rotated_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_5_cast_fp16 = concat(axis = var_266, interleave = rotated_5_interleave_0, values = (var_352_cast_fp16, var_344_cast_fp16))[name = string("rotated_5_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_355_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_355_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_356_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_356_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_5_cast_fp16 = add(x = var_355_cast_fp16, y = var_356_cast_fp16)[name = string("roped_5_cast_fp16")];
-            tensor<int32, [4]> var_369_begin_0 = const()[name = string("op_369_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_369_end_0 = const()[name = string("op_369_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_369_end_mask_0 = const()[name = string("op_369_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_369_cast_fp16 = slice_by_index(begin = var_369_begin_0, end = var_369_end_0, end_mask = var_369_end_mask_0, x = k_9_cast_fp16)[name = string("op_369_cast_fp16")];
-            tensor<int32, [4]> var_375_begin_0 = const()[name = string("op_375_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_375_end_0 = const()[name = string("op_375_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_375_end_mask_0 = const()[name = string("op_375_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_375_cast_fp16 = slice_by_index(begin = var_375_begin_0, end = var_375_end_0, end_mask = var_375_end_mask_0, x = k_9_cast_fp16)[name = string("op_375_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_5_cast_fp16 = concat(axis = var_266, interleave = rotated_5_interleave_0, values = (var_353_cast_fp16, var_345_cast_fp16))[name = string("rotated_5_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_356_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_356_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_357_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_357_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_5_cast_fp16 = add(x = var_356_cast_fp16, y = var_357_cast_fp16)[name = string("roped_5_cast_fp16")];
+            tensor<int32, [4]> var_370_begin_0 = const()[name = string("op_370_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_370_end_0 = const()[name = string("op_370_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_370_end_mask_0 = const()[name = string("op_370_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_370_cast_fp16 = slice_by_index(begin = var_370_begin_0, end = var_370_end_0, end_mask = var_370_end_mask_0, x = k_9_cast_fp16)[name = string("op_370_cast_fp16")];
+            tensor<int32, [4]> var_376_begin_0 = const()[name = string("op_376_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_376_end_0 = const()[name = string("op_376_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_376_end_mask_0 = const()[name = string("op_376_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_376_cast_fp16 = slice_by_index(begin = var_376_begin_0, end = var_376_end_0, end_mask = var_376_end_mask_0, x = k_9_cast_fp16)[name = string("op_376_cast_fp16")];
             fp16 const_21_promoted_to_fp16 = const()[name = string("const_21_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_377_cast_fp16 = mul(x = var_375_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_377_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_378_cast_fp16 = mul(x = var_376_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_378_cast_fp16")];
             bool rotated_7_interleave_0 = const()[name = string("rotated_7_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_7_cast_fp16 = concat(axis = var_266, interleave = rotated_7_interleave_0, values = (var_377_cast_fp16, var_369_cast_fp16))[name = string("rotated_7_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_380_cast_fp16 = mul(x = k_9_cast_fp16, y = cos)[name = string("op_380_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_381_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = string("op_381_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_7_cast_fp16 = add(x = var_380_cast_fp16, y = var_381_cast_fp16)[name = string("roped_7_cast_fp16")];
-            bool q_11_interleave_0 = const()[name = string("q_11_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> q_11_cast_fp16 = concat(axis = var_266, interleave = q_11_interleave_0, values = roped_5_cast_fp16)[name = string("q_11_cast_fp16")];
-            bool k_11_interleave_0 = const()[name = string("k_11_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_11_cast_fp16 = concat(axis = var_266, interleave = k_11_interleave_0, values = roped_7_cast_fp16)[name = string("k_11_cast_fp16")];
-            tensor<int32, [4]> var_396_begin_0 = const()[name = string("op_396_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_396_end_0 = const()[name = string("op_396_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_396_end_mask_0 = const()[name = string("op_396_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_1 = slice_by_index(begin = var_396_begin_0, end = var_396_end_0, end_mask = var_396_end_mask_0, x = k_11_cast_fp16)[name = string("op_396_cast_fp16")];
-            tensor<int32, [4]> var_397_begin_0 = const()[name = string("op_397_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_397_end_0 = const()[name = string("op_397_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_397_end_mask_0 = const()[name = string("op_397_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_1 = slice_by_index(begin = var_397_begin_0, end = var_397_end_0, end_mask = var_397_end_mask_0, x = v_7_cast_fp16)[name = string("op_397_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_7_cast_fp16 = concat(axis = var_266, interleave = rotated_7_interleave_0, values = (var_378_cast_fp16, var_370_cast_fp16))[name = string("rotated_7_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_381_cast_fp16 = mul(x = k_9_cast_fp16, y = cos)[name = string("op_381_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_382_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = string("op_382_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_7_cast_fp16 = add(x = var_381_cast_fp16, y = var_382_cast_fp16)[name = string("roped_7_cast_fp16")];
+            tensor<int32, [4]> v_11_perm_0 = const()[name = string("v_11_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_386_begin_0 = const()[name = string("op_386_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_386_end_0 = const()[name = string("op_386_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_386_end_mask_0 = const()[name = string("op_386_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_1 = slice_by_index(begin = var_386_begin_0, end = var_386_end_0, end_mask = var_386_end_mask_0, x = roped_7_cast_fp16)[name = string("op_386_cast_fp16")];
+            tensor<int32, [4]> var_387_begin_0 = const()[name = string("op_387_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_387_end_0 = const()[name = string("op_387_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_387_end_mask_0 = const()[name = string("op_387_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 512, 128]> v_11_cast_fp16 = transpose(perm = v_11_perm_0, x = v_9_cast_fp16)[name = string("transpose_3")];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_1 = slice_by_index(begin = var_387_begin_0, end = var_387_end_0, end_mask = var_387_end_mask_0, x = v_11_cast_fp16)[name = string("op_387_cast_fp16")];
             fp16 var_401_to_fp16 = const()[name = string("op_401_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 512]> var_402_cast_fp16 = mul(x = q_11_cast_fp16, y = var_401_to_fp16)[name = string("op_402_cast_fp16")];
-            bool attn_weights_5_transpose_x_0 = const()[name = string("attn_weights_5_transpose_x_0"), val = bool(true)];
-            bool attn_weights_5_transpose_y_0 = const()[name = string("attn_weights_5_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_5_cast_fp16 = matmul(transpose_x = attn_weights_5_transpose_x_0, transpose_y = attn_weights_5_transpose_y_0, x = var_402_cast_fp16, y = k_11_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_7_cast_fp16 = add(x = attn_weights_5_cast_fp16, y = mask)[name = string("attn_weights_7_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> var_410_cast_fp16 = softmax(axis = var_262, x = attn_weights_7_cast_fp16)[name = string("op_410_cast_fp16")];
-            bool attn_3_transpose_x_0 = const()[name = string("attn_3_transpose_x_0"), val = bool(false)];
-            bool attn_3_transpose_y_0 = const()[name = string("attn_3_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 512]> attn_3_cast_fp16 = matmul(transpose_x = attn_3_transpose_x_0, transpose_y = attn_3_transpose_y_0, x = v_7_cast_fp16, y = var_410_cast_fp16)[name = string("attn_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_402_cast_fp16 = mul(x = roped_5_cast_fp16, y = var_401_to_fp16)[name = string("op_402_cast_fp16")];
+            bool attn_weights_7_transpose_x_0 = const()[name = string("attn_weights_7_transpose_x_0"), val = bool(true)];
+            bool attn_weights_7_transpose_y_0 = const()[name = string("attn_weights_7_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_7_cast_fp16 = matmul(transpose_x = attn_weights_7_transpose_x_0, transpose_y = attn_weights_7_transpose_y_0, x = var_402_cast_fp16, y = roped_7_cast_fp16)[name = string("attn_weights_7_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_9_cast_fp16 = add(x = attn_weights_7_cast_fp16, y = mask)[name = string("attn_weights_9_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_11_cast_fp16 = softmax(axis = var_262, x = attn_weights_9_cast_fp16)[name = string("attn_weights_11_cast_fp16")];
+            bool var_411_transpose_x_1 = const()[name = string("op_411_transpose_x_1"), val = bool(false)];
+            bool var_411_transpose_y_1 = const()[name = string("op_411_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 32, 512, 128]> var_411_cast_fp16 = matmul(transpose_x = var_411_transpose_x_1, transpose_y = var_411_transpose_y_1, x = attn_weights_11_cast_fp16, y = v_9_cast_fp16)[name = string("op_411_cast_fp16")];
+            tensor<int32, [4]> attn_3_perm_0 = const()[name = string("attn_3_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             tensor<int32, [4]> var_414 = const()[name = string("op_414"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 512]> attn_3_cast_fp16 = transpose(perm = attn_3_perm_0, x = var_411_cast_fp16)[name = string("transpose_2")];
             tensor<fp16, [1, 4096, 1, 512]> input_9_cast_fp16 = reshape(shape = var_414, x = attn_3_cast_fp16)[name = string("input_9_cast_fp16")];
             tensor<int32, [2]> var_418 = const()[name = string("op_418"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_420 = const()[name = string("op_420"), val = tensor<int32, [2]>([1, 1])];
@@ -788,86 +800,86 @@ program(1.3)
             tensor<fp16, [1, 4096, 1, 512]> x_normed_27_cast_fp16 = mul(x = x_normed_25_cast_fp16, y = var_531_to_fp16)[name = string("x_normed_27_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_norm_1_weight_to_fp16 = const()[name = string("blocks_2_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303771840)))];
             tensor<fp16, [1, 4096, 1, 512]> x_33_cast_fp16 = mul(x = x_normed_27_cast_fp16, y = blocks_2_norm_1_weight_to_fp16)[name = string("x_33_cast_fp16")];
-            tensor<int32, [2]> var_546 = const()[name = string("op_546"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_548 = const()[name = string("op_548"), val = tensor<int32, [2]>([1, 1])];
-            string var_550_pad_type_0 = const()[name = string("op_550_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_550_pad_0 = const()[name = string("op_550_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_550_cast_fp16 = conv(dilations = var_548, groups = var_503, pad = var_550_pad_0, pad_type = var_550_pad_type_0, strides = var_546, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_550_cast_fp16")];
+            tensor<int32, [2]> var_547 = const()[name = string("op_547"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_549 = const()[name = string("op_549"), val = tensor<int32, [2]>([1, 1])];
+            string var_551_pad_type_0 = const()[name = string("op_551_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_551_pad_0 = const()[name = string("op_551_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_551_cast_fp16 = conv(dilations = var_549, groups = var_503, pad = var_551_pad_0, pad_type = var_551_pad_type_0, strides = var_547, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_551_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303780096)))];
-            tensor<fp16, [1, 4096, 1, 512]> q_13_cast_fp16 = mul(x = var_550_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = string("q_13_cast_fp16")];
-            tensor<int32, [2]> var_554 = const()[name = string("op_554"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_556 = const()[name = string("op_556"), val = tensor<int32, [2]>([1, 1])];
-            string var_558_pad_type_0 = const()[name = string("op_558_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_558_pad_0 = const()[name = string("op_558_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_558_cast_fp16 = conv(dilations = var_556, groups = var_503, pad = var_558_pad_0, pad_type = var_558_pad_type_0, strides = var_554, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_558_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> q_13_cast_fp16 = mul(x = var_551_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = string("q_13_cast_fp16")];
+            tensor<int32, [2]> var_555 = const()[name = string("op_555"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_557 = const()[name = string("op_557"), val = tensor<int32, [2]>([1, 1])];
+            string var_559_pad_type_0 = const()[name = string("op_559_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_559_pad_0 = const()[name = string("op_559_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_559_cast_fp16 = conv(dilations = var_557, groups = var_503, pad = var_559_pad_0, pad_type = var_559_pad_type_0, strides = var_555, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_559_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303788352)))];
-            tensor<fp16, [1, 4096, 1, 512]> k_13_cast_fp16 = mul(x = var_558_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = string("k_13_cast_fp16")];
-            tensor<int32, [2]> var_562 = const()[name = string("op_562"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_564 = const()[name = string("op_564"), val = tensor<int32, [2]>([1, 1])];
-            string var_566_pad_type_0 = const()[name = string("op_566_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_566_pad_0 = const()[name = string("op_566_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_566_cast_fp16 = conv(dilations = var_564, groups = var_503, pad = var_566_pad_0, pad_type = var_566_pad_type_0, strides = var_562, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_566_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> k_13_cast_fp16 = mul(x = var_559_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = string("k_13_cast_fp16")];
+            tensor<int32, [2]> var_563 = const()[name = string("op_563"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_565 = const()[name = string("op_565"), val = tensor<int32, [2]>([1, 1])];
+            string var_567_pad_type_0 = const()[name = string("op_567_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_567_pad_0 = const()[name = string("op_567_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_567_cast_fp16 = conv(dilations = var_565, groups = var_503, pad = var_567_pad_0, pad_type = var_567_pad_type_0, strides = var_563, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_567_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303796608)))];
-            tensor<fp16, [1, 4096, 1, 512]> v_9_cast_fp16 = mul(x = var_566_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = string("v_9_cast_fp16")];
-            tensor<int32, [4]> var_568 = const()[name = string("op_568"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> q_15_cast_fp16 = reshape(shape = var_568, x = q_13_cast_fp16)[name = string("q_15_cast_fp16")];
-            tensor<int32, [4]> var_570 = const()[name = string("op_570"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = reshape(shape = var_570, x = k_13_cast_fp16)[name = string("k_15_cast_fp16")];
-            tensor<int32, [4]> var_572 = const()[name = string("op_572"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> v_cast_fp16 = reshape(shape = var_572, x = v_9_cast_fp16)[name = string("v_cast_fp16")];
-            tensor<int32, [4]> var_584_begin_0 = const()[name = string("op_584_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_584_end_0 = const()[name = string("op_584_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_584_end_mask_0 = const()[name = string("op_584_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_584_cast_fp16 = slice_by_index(begin = var_584_begin_0, end = var_584_end_0, end_mask = var_584_end_mask_0, x = q_15_cast_fp16)[name = string("op_584_cast_fp16")];
-            tensor<int32, [4]> var_590_begin_0 = const()[name = string("op_590_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_590_end_0 = const()[name = string("op_590_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_590_end_mask_0 = const()[name = string("op_590_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_590_cast_fp16 = slice_by_index(begin = var_590_begin_0, end = var_590_end_0, end_mask = var_590_end_mask_0, x = q_15_cast_fp16)[name = string("op_590_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> v_13_cast_fp16 = mul(x = var_567_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = string("v_13_cast_fp16")];
+            tensor<int32, [4]> var_569 = const()[name = string("op_569"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> q_15_cast_fp16 = reshape(shape = var_569, x = q_13_cast_fp16)[name = string("q_15_cast_fp16")];
+            tensor<int32, [4]> var_571 = const()[name = string("op_571"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = reshape(shape = var_571, x = k_13_cast_fp16)[name = string("k_15_cast_fp16")];
+            tensor<int32, [4]> var_573 = const()[name = string("op_573"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> v_15_cast_fp16 = reshape(shape = var_573, x = v_13_cast_fp16)[name = string("v_15_cast_fp16")];
+            tensor<int32, [4]> var_585_begin_0 = const()[name = string("op_585_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_585_end_0 = const()[name = string("op_585_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_585_end_mask_0 = const()[name = string("op_585_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_585_cast_fp16 = slice_by_index(begin = var_585_begin_0, end = var_585_end_0, end_mask = var_585_end_mask_0, x = q_15_cast_fp16)[name = string("op_585_cast_fp16")];
+            tensor<int32, [4]> var_591_begin_0 = const()[name = string("op_591_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_591_end_0 = const()[name = string("op_591_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_591_end_mask_0 = const()[name = string("op_591_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_591_cast_fp16 = slice_by_index(begin = var_591_begin_0, end = var_591_end_0, end_mask = var_591_end_mask_0, x = q_15_cast_fp16)[name = string("op_591_cast_fp16")];
             fp16 const_32_promoted_to_fp16 = const()[name = string("const_32_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_592_cast_fp16 = mul(x = var_590_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_592_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_593_cast_fp16 = mul(x = var_591_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_593_cast_fp16")];
             bool rotated_9_interleave_0 = const()[name = string("rotated_9_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_9_cast_fp16 = concat(axis = var_506, interleave = rotated_9_interleave_0, values = (var_592_cast_fp16, var_584_cast_fp16))[name = string("rotated_9_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_595_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = string("op_595_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_596_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = string("op_596_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_9_cast_fp16 = add(x = var_595_cast_fp16, y = var_596_cast_fp16)[name = string("roped_9_cast_fp16")];
-            tensor<int32, [4]> var_609_begin_0 = const()[name = string("op_609_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_609_end_0 = const()[name = string("op_609_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_609_end_mask_0 = const()[name = string("op_609_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_609_cast_fp16 = slice_by_index(begin = var_609_begin_0, end = var_609_end_0, end_mask = var_609_end_mask_0, x = k_15_cast_fp16)[name = string("op_609_cast_fp16")];
-            tensor<int32, [4]> var_615_begin_0 = const()[name = string("op_615_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_615_end_0 = const()[name = string("op_615_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_615_end_mask_0 = const()[name = string("op_615_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_615_cast_fp16 = slice_by_index(begin = var_615_begin_0, end = var_615_end_0, end_mask = var_615_end_mask_0, x = k_15_cast_fp16)[name = string("op_615_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_9_cast_fp16 = concat(axis = var_506, interleave = rotated_9_interleave_0, values = (var_593_cast_fp16, var_585_cast_fp16))[name = string("rotated_9_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_596_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = string("op_596_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_597_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = string("op_597_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_9_cast_fp16 = add(x = var_596_cast_fp16, y = var_597_cast_fp16)[name = string("roped_9_cast_fp16")];
+            tensor<int32, [4]> var_610_begin_0 = const()[name = string("op_610_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_610_end_0 = const()[name = string("op_610_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_610_end_mask_0 = const()[name = string("op_610_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_610_cast_fp16 = slice_by_index(begin = var_610_begin_0, end = var_610_end_0, end_mask = var_610_end_mask_0, x = k_15_cast_fp16)[name = string("op_610_cast_fp16")];
+            tensor<int32, [4]> var_616_begin_0 = const()[name = string("op_616_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_616_end_0 = const()[name = string("op_616_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_616_end_mask_0 = const()[name = string("op_616_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_616_cast_fp16 = slice_by_index(begin = var_616_begin_0, end = var_616_end_0, end_mask = var_616_end_mask_0, x = k_15_cast_fp16)[name = string("op_616_cast_fp16")];
             fp16 const_34_promoted_to_fp16 = const()[name = string("const_34_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_617_cast_fp16 = mul(x = var_615_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_617_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_618_cast_fp16 = mul(x = var_616_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_618_cast_fp16")];
             bool rotated_interleave_0 = const()[name = string("rotated_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_cast_fp16 = concat(axis = var_506, interleave = rotated_interleave_0, values = (var_617_cast_fp16, var_609_cast_fp16))[name = string("rotated_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_620_cast_fp16 = mul(x = k_15_cast_fp16, y = cos)[name = string("op_620_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_621_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_621_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_cast_fp16 = add(x = var_620_cast_fp16, y = var_621_cast_fp16)[name = string("roped_cast_fp16")];
-            bool q_interleave_0 = const()[name = string("q_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> q_cast_fp16 = concat(axis = var_506, interleave = q_interleave_0, values = roped_9_cast_fp16)[name = string("q_cast_fp16")];
-            bool k_interleave_0 = const()[name = string("k_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_cast_fp16 = concat(axis = var_506, interleave = k_interleave_0, values = roped_cast_fp16)[name = string("k_cast_fp16")];
-            tensor<int32, [4]> var_636_begin_0 = const()[name = string("op_636_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_636_end_0 = const()[name = string("op_636_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_636_end_mask_0 = const()[name = string("op_636_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_2 = slice_by_index(begin = var_636_begin_0, end = var_636_end_0, end_mask = var_636_end_mask_0, x = k_cast_fp16)[name = string("op_636_cast_fp16")];
-            tensor<int32, [4]> var_637_begin_0 = const()[name = string("op_637_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_637_end_0 = const()[name = string("op_637_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_637_end_mask_0 = const()[name = string("op_637_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_2 = slice_by_index(begin = var_637_begin_0, end = var_637_end_0, end_mask = var_637_end_mask_0, x = v_cast_fp16)[name = string("op_637_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_cast_fp16 = concat(axis = var_506, interleave = rotated_interleave_0, values = (var_618_cast_fp16, var_610_cast_fp16))[name = string("rotated_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_621_cast_fp16 = mul(x = k_15_cast_fp16, y = cos)[name = string("op_621_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_622_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_622_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_cast_fp16 = add(x = var_621_cast_fp16, y = var_622_cast_fp16)[name = string("roped_cast_fp16")];
+            tensor<int32, [4]> v_perm_0 = const()[name = string("v_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_626_begin_0 = const()[name = string("op_626_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_626_end_0 = const()[name = string("op_626_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_626_end_mask_0 = const()[name = string("op_626_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_2 = slice_by_index(begin = var_626_begin_0, end = var_626_end_0, end_mask = var_626_end_mask_0, x = roped_cast_fp16)[name = string("op_626_cast_fp16")];
+            tensor<int32, [4]> var_627_begin_0 = const()[name = string("op_627_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_627_end_0 = const()[name = string("op_627_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_627_end_mask_0 = const()[name = string("op_627_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 512, 128]> v_cast_fp16 = transpose(perm = v_perm_0, x = v_15_cast_fp16)[name = string("transpose_1")];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_2 = slice_by_index(begin = var_627_begin_0, end = var_627_end_0, end_mask = var_627_end_mask_0, x = v_cast_fp16)[name = string("op_627_cast_fp16")];
             fp16 var_641_to_fp16 = const()[name = string("op_641_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 512]> var_642_cast_fp16 = mul(x = q_cast_fp16, y = var_641_to_fp16)[name = string("op_642_cast_fp16")];
-            bool attn_weights_9_transpose_x_0 = const()[name = string("attn_weights_9_transpose_x_0"), val = bool(true)];
-            bool attn_weights_9_transpose_y_0 = const()[name = string("attn_weights_9_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_9_cast_fp16 = matmul(transpose_x = attn_weights_9_transpose_x_0, transpose_y = attn_weights_9_transpose_y_0, x = var_642_cast_fp16, y = k_cast_fp16)[name = string("attn_weights_9_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_cast_fp16 = add(x = attn_weights_9_cast_fp16, y = mask)[name = string("attn_weights_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> var_650_cast_fp16 = softmax(axis = var_502, x = attn_weights_cast_fp16)[name = string("op_650_cast_fp16")];
-            bool attn_5_transpose_x_0 = const()[name = string("attn_5_transpose_x_0"), val = bool(false)];
-            bool attn_5_transpose_y_0 = const()[name = string("attn_5_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 512]> attn_5_cast_fp16 = matmul(transpose_x = attn_5_transpose_x_0, transpose_y = attn_5_transpose_y_0, x = v_cast_fp16, y = var_650_cast_fp16)[name = string("attn_5_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_642_cast_fp16 = mul(x = roped_9_cast_fp16, y = var_641_to_fp16)[name = string("op_642_cast_fp16")];
+            bool attn_weights_13_transpose_x_0 = const()[name = string("attn_weights_13_transpose_x_0"), val = bool(true)];
+            bool attn_weights_13_transpose_y_0 = const()[name = string("attn_weights_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_13_cast_fp16 = matmul(transpose_x = attn_weights_13_transpose_x_0, transpose_y = attn_weights_13_transpose_y_0, x = var_642_cast_fp16, y = roped_cast_fp16)[name = string("attn_weights_13_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_15_cast_fp16 = add(x = attn_weights_13_cast_fp16, y = mask)[name = string("attn_weights_15_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_cast_fp16 = softmax(axis = var_502, x = attn_weights_15_cast_fp16)[name = string("attn_weights_cast_fp16")];
+            bool var_651_transpose_x_1 = const()[name = string("op_651_transpose_x_1"), val = bool(false)];
+            bool var_651_transpose_y_1 = const()[name = string("op_651_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 32, 512, 128]> var_651_cast_fp16 = matmul(transpose_x = var_651_transpose_x_1, transpose_y = var_651_transpose_y_1, x = attn_weights_cast_fp16, y = v_15_cast_fp16)[name = string("op_651_cast_fp16")];
+            tensor<int32, [4]> attn_5_perm_0 = const()[name = string("attn_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             tensor<int32, [4]> var_654 = const()[name = string("op_654"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 512]> attn_5_cast_fp16 = transpose(perm = attn_5_perm_0, x = var_651_cast_fp16)[name = string("transpose_0")];
             tensor<fp16, [1, 4096, 1, 512]> input_17_cast_fp16 = reshape(shape = var_654, x = attn_5_cast_fp16)[name = string("input_17_cast_fp16")];
             tensor<int32, [2]> var_658 = const()[name = string("op_658"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_660 = const()[name = string("op_660"), val = tensor<int32, [2]>([1, 1])];
diff --git a/sequoia/Llama-2-7b-hf_chunk5.mlmodelc/analytics/coremldata.bin b/sequoia/Llama-2-7b-hf_chunk5.mlmodelc/analytics/coremldata.bin
index e3704b470dad34135a6b5cb4b471021bad5c3ae2..65ae082f31459bf8b09913d8861a15601cc13ad1 100644
--- a/sequoia/Llama-2-7b-hf_chunk5.mlmodelc/analytics/coremldata.bin
+++ b/sequoia/Llama-2-7b-hf_chunk5.mlmodelc/analytics/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:84e317a82cdf4e96f808f63e77f10098844d47ad522545181edfac4d287c9c92
+oid sha256:e69e7ad37dd59e97348d395eec9b4c41b7d3ea44d86f613751ae47803a0a2efe
 size 243
diff --git a/sequoia/Llama-2-7b-hf_chunk5.mlmodelc/coremldata.bin b/sequoia/Llama-2-7b-hf_chunk5.mlmodelc/coremldata.bin
index 8bbc6dd38c74fe23594355e106f197518d41765b..a503721fa97147a06f91e834353053693378b0ad 100644
--- a/sequoia/Llama-2-7b-hf_chunk5.mlmodelc/coremldata.bin
+++ b/sequoia/Llama-2-7b-hf_chunk5.mlmodelc/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e430d0795ff5c384187174f5718a2c13d0070f5d6a811831e18862497865a86d
+oid sha256:d35a0353bcfa501579e07af3718261af3b129b4bec004c1fe6d812a6403a3f5b
 size 1037
diff --git a/sequoia/Llama-2-7b-hf_chunk5.mlmodelc/metadata.json b/sequoia/Llama-2-7b-hf_chunk5.mlmodelc/metadata.json
index 668220ce266937fdc4046c5a8f99a9094e5797e5..8ef77004c6ea88d6883928475b9c132955a56a9f 100644
--- a/sequoia/Llama-2-7b-hf_chunk5.mlmodelc/metadata.json
+++ b/sequoia/Llama-2-7b-hf_chunk5.mlmodelc/metadata.json
@@ -17,9 +17,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 128, 508]",
         "name" : "new_k_cache_0",
         "type" : "MultiArray"
       },
@@ -27,9 +27,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 128, 508]",
         "name" : "new_k_cache_1",
         "type" : "MultiArray"
       },
@@ -37,9 +37,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 128, 508]",
         "name" : "new_k_cache_2",
         "type" : "MultiArray"
       },
@@ -47,9 +47,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 508, 128]",
         "name" : "new_v_cache_0",
         "type" : "MultiArray"
       },
@@ -57,9 +57,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 508, 128]",
         "name" : "new_v_cache_1",
         "type" : "MultiArray"
       },
@@ -67,9 +67,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 508, 128]",
         "name" : "new_v_cache_2",
         "type" : "MultiArray"
       }
@@ -142,9 +142,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_0",
             "type" : "MultiArray"
           },
@@ -152,9 +152,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_1",
             "type" : "MultiArray"
           },
@@ -162,9 +162,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_2",
             "type" : "MultiArray"
           },
@@ -172,9 +172,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_0",
             "type" : "MultiArray"
           },
@@ -182,9 +182,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_1",
             "type" : "MultiArray"
           },
@@ -192,9 +192,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_2",
             "type" : "MultiArray"
           }
@@ -203,14 +203,15 @@
         "mlProgramOperationTypeHistogram" : {
           "Ios18.constexprLutToDense" : 21,
           "Ios18.conv" : 21,
-          "Ios18.matmul" : 6,
           "Ios18.expandDims" : 6,
-          "Ios18.concat" : 18,
+          "Ios18.matmul" : 6,
+          "Ios18.concat" : 12,
           "Ios18.add" : 15,
           "Ios18.realDiv" : 6,
           "Ios18.silu" : 3,
           "Ios18.softmax" : 3,
           "Ios18.sliceByIndex" : 18,
+          "Ios18.transpose" : 6,
           "Ios16.reduceL2Norm" : 6,
           "Ios18.squeeze" : 6,
           "Ios18.reshape" : 12,
@@ -223,9 +224,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 1)",
+            "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 4)",
             "shortDescription" : "",
-            "shape" : "[1, 4096, 1, 1]",
+            "shape" : "[1, 4096, 1, 4]",
             "name" : "x",
             "type" : "MultiArray"
           },
@@ -233,9 +234,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 128 × 1)",
+            "formattedType" : "MultiArray (Float16 128 × 4)",
             "shortDescription" : "",
-            "shape" : "[128, 1]",
+            "shape" : "[128, 4]",
             "name" : "cos",
             "type" : "MultiArray"
           },
@@ -243,9 +244,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 128 × 1)",
+            "formattedType" : "MultiArray (Float16 128 × 4)",
             "shortDescription" : "",
-            "shape" : "[128, 1]",
+            "shape" : "[128, 4]",
             "name" : "sin",
             "type" : "MultiArray"
           },
@@ -253,9 +254,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 1 × 1 × 512)",
+            "formattedType" : "MultiArray (Float16 1 × 1 × 4 × 512)",
             "shortDescription" : "",
-            "shape" : "[1, 1, 1, 512]",
+            "shape" : "[1, 1, 4, 512]",
             "name" : "mask",
             "type" : "MultiArray"
           },
@@ -263,9 +264,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "k_cache_0",
             "type" : "MultiArray"
           },
@@ -273,9 +274,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "v_cache_0",
             "type" : "MultiArray"
           },
@@ -283,9 +284,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "k_cache_1",
             "type" : "MultiArray"
           },
@@ -293,9 +294,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "v_cache_1",
             "type" : "MultiArray"
           },
@@ -303,9 +304,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "k_cache_2",
             "type" : "MultiArray"
           },
@@ -313,9 +314,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "v_cache_2",
             "type" : "MultiArray"
           }
@@ -330,9 +331,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 1)",
+            "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 4)",
             "shortDescription" : "",
-            "shape" : "[1, 4096, 1, 1]",
+            "shape" : "[1, 4096, 1, 4]",
             "name" : "new_x",
             "type" : "MultiArray"
           },
@@ -340,9 +341,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_0",
             "type" : "MultiArray"
           },
@@ -350,9 +351,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_1",
             "type" : "MultiArray"
           },
@@ -360,9 +361,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_2",
             "type" : "MultiArray"
           },
@@ -370,9 +371,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_0",
             "type" : "MultiArray"
           },
@@ -380,9 +381,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_1",
             "type" : "MultiArray"
           },
@@ -390,9 +391,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_2",
             "type" : "MultiArray"
           }
@@ -401,14 +402,15 @@
         "mlProgramOperationTypeHistogram" : {
           "Ios18.constexprLutToDense" : 21,
           "Ios18.conv" : 21,
-          "Ios18.matmul" : 6,
           "Ios18.expandDims" : 6,
+          "Ios18.matmul" : 6,
           "Ios18.concat" : 18,
           "Ios18.add" : 15,
           "Ios18.realDiv" : 6,
           "Ios18.silu" : 3,
           "Ios18.softmax" : 3,
           "Ios18.sliceByIndex" : 18,
+          "Ios18.transpose" : 6,
           "Ios16.reduceL2Norm" : 6,
           "Ios18.squeeze" : 6,
           "Ios18.reshape" : 12,
@@ -419,14 +421,15 @@
     "mlProgramOperationTypeHistogram" : {
       "Ios18.constexprLutToDense" : 21,
       "Ios18.conv" : 21,
-      "Ios18.matmul" : 6,
       "Ios18.expandDims" : 6,
-      "Ios18.concat" : 18,
+      "Ios18.matmul" : 6,
+      "Ios18.concat" : 12,
       "Ios18.add" : 15,
       "Ios18.realDiv" : 6,
       "Ios18.silu" : 3,
       "Ios18.softmax" : 3,
       "Ios18.sliceByIndex" : 18,
+      "Ios18.transpose" : 6,
       "Ios16.reduceL2Norm" : 6,
       "Ios18.squeeze" : 6,
       "Ios18.reshape" : 12,
@@ -491,7 +494,7 @@
       }
     ],
     "defaultFunctionName" : "input_512_context_512",
-    "generatedClassName" : "Llama_2_7b_hf_2024_07_02_20_36_17_merged_chunk5",
+    "generatedClassName" : "Llama_2_7b_hf_2024_07_17_19_34_17_merged_chunk5",
     "userDefinedMetadata" : {
 
     },
diff --git a/sequoia/Llama-2-7b-hf_chunk5.mlmodelc/model.mil b/sequoia/Llama-2-7b-hf_chunk5.mlmodelc/model.mil
index 5b7b1db6b14fe10ff51aaa601d8484d9ff49576b..85f75a6da9054155e32013d96460963c79fba3f8 100644
--- a/sequoia/Llama-2-7b-hf_chunk5.mlmodelc/model.mil
+++ b/sequoia/Llama-2-7b-hf_chunk5.mlmodelc/model.mil
@@ -1,7 +1,7 @@
 program(1.3)
-[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3400.34.1"}, {"coremlc-version", "3400.42.1"}})]
+[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3400.42.1"}, {"coremlc-version", "3400.51.1"}})]
 {
-    func input_1_context_512<ios18>(tensor<fp16, [128, 1]> cos, tensor<fp16, [1, 32, 128, 511]> k_cache_0, tensor<fp16, [1, 32, 128, 511]> k_cache_1, tensor<fp16, [1, 32, 128, 511]> k_cache_2, tensor<fp16, [1, 1, 1, 512]> mask, tensor<fp16, [128, 1]> sin, tensor<fp16, [1, 32, 128, 511]> v_cache_0, tensor<fp16, [1, 32, 128, 511]> v_cache_1, tensor<fp16, [1, 32, 128, 511]> v_cache_2, tensor<fp16, [1, 4096, 1, 1]> x) [CoreML_InputDefaultValues = dict<string, fp32>({{"k_cache_0", 0}, {"k_cache_1", 0}, {"k_cache_2", 0}, {"v_cache_0", 0}, {"v_cache_1", 0}, {"v_cache_2", 0}})] {
+    func input_1_context_512<ios18>(tensor<fp16, [128, 4]> cos, tensor<fp16, [1, 32, 128, 508]> k_cache_0, tensor<fp16, [1, 32, 128, 508]> k_cache_1, tensor<fp16, [1, 32, 128, 508]> k_cache_2, tensor<fp16, [1, 1, 4, 512]> mask, tensor<fp16, [128, 4]> sin, tensor<fp16, [1, 32, 508, 128]> v_cache_0, tensor<fp16, [1, 32, 508, 128]> v_cache_1, tensor<fp16, [1, 32, 508, 128]> v_cache_2, tensor<fp16, [1, 4096, 1, 4]> x) [CoreML_InputDefaultValues = dict<string, fp32>({{"k_cache_0", 0}, {"k_cache_1", 0}, {"k_cache_2", 0}, {"v_cache_0", 0}, {"v_cache_1", 0}, {"v_cache_2", 0}})] {
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_q_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303873792))))[name = string("blocks_0_attn_q_proj_weight_palettized_cast_fp16")];
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_k_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8388864))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303873920))))[name = string("blocks_0_attn_k_proj_weight_palettized_cast_fp16")];
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_v_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(16777664))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303874048))))[name = string("blocks_0_attn_v_proj_weight_palettized_cast_fp16")];
@@ -29,438 +29,450 @@ program(1.3)
             int32 var_34 = const()[name = string("op_34"), val = int32(-2)];
             bool var_35 = const()[name = string("op_35"), val = bool(true)];
             tensor<int32, [1]> var_53_axes_0 = const()[name = string("op_53_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_53_cast_fp16 = squeeze(axes = var_53_axes_0, x = x)[name = string("op_53_cast_fp16")];
+            tensor<fp16, [1, 4096, 4]> var_53_cast_fp16 = squeeze(axes = var_53_axes_0, x = x)[name = string("op_53_cast_fp16")];
             bool var_55_interleave_0 = const()[name = string("op_55_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_1_to_fp16 = const()[name = string("eps_chan_1_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_55_cast_fp16 = concat(axis = var_31, interleave = var_55_interleave_0, values = (var_53_cast_fp16, eps_chan_1_to_fp16))[name = string("op_55_cast_fp16")];
+            tensor<fp16, [1, 1, 4]> eps_chan_1_to_fp16 = const()[name = string("eps_chan_1_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_55_cast_fp16 = concat(axis = var_31, interleave = var_55_interleave_0, values = (var_53_cast_fp16, eps_chan_1_to_fp16))[name = string("op_55_cast_fp16")];
             tensor<int32, [1]> x_eps_1_axes_0 = const()[name = string("x_eps_1_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_1_cast_fp16 = expand_dims(axes = x_eps_1_axes_0, x = var_55_cast_fp16)[name = string("x_eps_1_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_1_cast_fp16 = expand_dims(axes = x_eps_1_axes_0, x = var_55_cast_fp16)[name = string("x_eps_1_cast_fp16")];
             tensor<int32, [1]> norm_x_1_axes_0 = const()[name = string("norm_x_1_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_1_cast_fp16 = reduce_l2_norm(axes = norm_x_1_axes_0, keep_dims = var_35, x = x_eps_1_cast_fp16)[name = string("norm_x_1_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_1_cast_fp16 = real_div(x = x, y = norm_x_1_cast_fp16)[name = string("x_normed_1_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_1_cast_fp16 = reduce_l2_norm(axes = norm_x_1_axes_0, keep_dims = var_35, x = x_eps_1_cast_fp16)[name = string("norm_x_1_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_1_cast_fp16 = real_div(x = x, y = norm_x_1_cast_fp16)[name = string("x_normed_1_cast_fp16")];
             fp16 var_60_to_fp16 = const()[name = string("op_60_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_60_to_fp16)[name = string("x_normed_3_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_60_to_fp16)[name = string("x_normed_3_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_1_weight_to_fp16 = const()[name = string("blocks_0_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303567936)))];
-            tensor<fp16, [1, 4096, 1, 1]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = string("x_5_cast_fp16")];
-            tensor<int32, [2]> var_72 = const()[name = string("op_72"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_74 = const()[name = string("op_74"), val = tensor<int32, [2]>([1, 1])];
-            string var_76_pad_type_0 = const()[name = string("op_76_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_76_pad_0 = const()[name = string("op_76_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_76_cast_fp16 = conv(dilations = var_74, groups = var_31, pad = var_76_pad_0, pad_type = var_76_pad_type_0, strides = var_72, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_76_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = string("x_5_cast_fp16")];
+            tensor<int32, [2]> var_73 = const()[name = string("op_73"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_75 = const()[name = string("op_75"), val = tensor<int32, [2]>([1, 1])];
+            string var_77_pad_type_0 = const()[name = string("op_77_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_77_pad_0 = const()[name = string("op_77_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_77_cast_fp16 = conv(dilations = var_75, groups = var_31, pad = var_77_pad_0, pad_type = var_77_pad_type_0, strides = var_73, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_77_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303576192)))];
-            tensor<fp16, [1, 4096, 1, 1]> q_1_cast_fp16 = mul(x = var_76_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
-            tensor<int32, [2]> var_80 = const()[name = string("op_80"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_82 = const()[name = string("op_82"), val = tensor<int32, [2]>([1, 1])];
-            string var_84_pad_type_0 = const()[name = string("op_84_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_84_pad_0 = const()[name = string("op_84_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_84_cast_fp16 = conv(dilations = var_82, groups = var_31, pad = var_84_pad_0, pad_type = var_84_pad_type_0, strides = var_80, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_84_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> q_1_cast_fp16 = mul(x = var_77_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
+            tensor<int32, [2]> var_81 = const()[name = string("op_81"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_83 = const()[name = string("op_83"), val = tensor<int32, [2]>([1, 1])];
+            string var_85_pad_type_0 = const()[name = string("op_85_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_85_pad_0 = const()[name = string("op_85_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_85_cast_fp16 = conv(dilations = var_83, groups = var_31, pad = var_85_pad_0, pad_type = var_85_pad_type_0, strides = var_81, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_85_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303584448)))];
-            tensor<fp16, [1, 4096, 1, 1]> k_1_cast_fp16 = mul(x = var_84_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
-            tensor<int32, [2]> var_88 = const()[name = string("op_88"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_90 = const()[name = string("op_90"), val = tensor<int32, [2]>([1, 1])];
-            string var_92_pad_type_0 = const()[name = string("op_92_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_92_pad_0 = const()[name = string("op_92_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_92_cast_fp16 = conv(dilations = var_90, groups = var_31, pad = var_92_pad_0, pad_type = var_92_pad_type_0, strides = var_88, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_92_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> k_1_cast_fp16 = mul(x = var_85_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
+            tensor<int32, [2]> var_89 = const()[name = string("op_89"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_91 = const()[name = string("op_91"), val = tensor<int32, [2]>([1, 1])];
+            string var_93_pad_type_0 = const()[name = string("op_93_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_93_pad_0 = const()[name = string("op_93_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_93_cast_fp16 = conv(dilations = var_91, groups = var_31, pad = var_93_pad_0, pad_type = var_93_pad_type_0, strides = var_89, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_93_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303592704)))];
-            tensor<fp16, [1, 4096, 1, 1]> v_1_cast_fp16 = mul(x = var_92_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
-            tensor<int32, [4]> var_94 = const()[name = string("op_94"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> q_3_cast_fp16 = reshape(shape = var_94, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
-            tensor<int32, [4]> var_96 = const()[name = string("op_96"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> k_3_cast_fp16 = reshape(shape = var_96, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
-            tensor<int32, [4]> var_98 = const()[name = string("op_98"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> v_3_cast_fp16 = reshape(shape = var_98, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
-            tensor<int32, [4]> var_110_begin_0 = const()[name = string("op_110_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_110_end_0 = const()[name = string("op_110_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_110_end_mask_0 = const()[name = string("op_110_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_110_cast_fp16 = slice_by_index(begin = var_110_begin_0, end = var_110_end_0, end_mask = var_110_end_mask_0, x = q_3_cast_fp16)[name = string("op_110_cast_fp16")];
-            tensor<int32, [4]> var_116_begin_0 = const()[name = string("op_116_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_116_end_0 = const()[name = string("op_116_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_116_end_mask_0 = const()[name = string("op_116_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_116_cast_fp16 = slice_by_index(begin = var_116_begin_0, end = var_116_end_0, end_mask = var_116_end_mask_0, x = q_3_cast_fp16)[name = string("op_116_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> v_1_cast_fp16 = mul(x = var_93_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
+            tensor<int32, [4]> var_95 = const()[name = string("op_95"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> q_3_cast_fp16 = reshape(shape = var_95, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
+            tensor<int32, [4]> var_97 = const()[name = string("op_97"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> k_3_cast_fp16 = reshape(shape = var_97, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
+            tensor<int32, [4]> var_99 = const()[name = string("op_99"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> v_3_cast_fp16 = reshape(shape = var_99, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
+            tensor<int32, [4]> var_111_begin_0 = const()[name = string("op_111_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_111_end_0 = const()[name = string("op_111_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_111_end_mask_0 = const()[name = string("op_111_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_111_cast_fp16 = slice_by_index(begin = var_111_begin_0, end = var_111_end_0, end_mask = var_111_end_mask_0, x = q_3_cast_fp16)[name = string("op_111_cast_fp16")];
+            tensor<int32, [4]> var_117_begin_0 = const()[name = string("op_117_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_117_end_0 = const()[name = string("op_117_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_117_end_mask_0 = const()[name = string("op_117_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_117_cast_fp16 = slice_by_index(begin = var_117_begin_0, end = var_117_end_0, end_mask = var_117_end_mask_0, x = q_3_cast_fp16)[name = string("op_117_cast_fp16")];
             fp16 const_6_promoted_to_fp16 = const()[name = string("const_6_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_118_cast_fp16 = mul(x = var_116_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_118_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_119_cast_fp16 = mul(x = var_117_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_119_cast_fp16")];
             bool rotated_1_interleave_0 = const()[name = string("rotated_1_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_1_cast_fp16 = concat(axis = var_34, interleave = rotated_1_interleave_0, values = (var_118_cast_fp16, var_110_cast_fp16))[name = string("rotated_1_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_121_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_121_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_122_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_122_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_1_cast_fp16 = add(x = var_121_cast_fp16, y = var_122_cast_fp16)[name = string("roped_1_cast_fp16")];
-            tensor<int32, [4]> var_135_begin_0 = const()[name = string("op_135_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_135_end_0 = const()[name = string("op_135_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_135_end_mask_0 = const()[name = string("op_135_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_135_cast_fp16 = slice_by_index(begin = var_135_begin_0, end = var_135_end_0, end_mask = var_135_end_mask_0, x = k_3_cast_fp16)[name = string("op_135_cast_fp16")];
-            tensor<int32, [4]> var_141_begin_0 = const()[name = string("op_141_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_141_end_0 = const()[name = string("op_141_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_141_end_mask_0 = const()[name = string("op_141_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_141_cast_fp16 = slice_by_index(begin = var_141_begin_0, end = var_141_end_0, end_mask = var_141_end_mask_0, x = k_3_cast_fp16)[name = string("op_141_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_1_cast_fp16 = concat(axis = var_34, interleave = rotated_1_interleave_0, values = (var_119_cast_fp16, var_111_cast_fp16))[name = string("rotated_1_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_122_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_122_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_123_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_123_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_1_cast_fp16 = add(x = var_122_cast_fp16, y = var_123_cast_fp16)[name = string("roped_1_cast_fp16")];
+            tensor<int32, [4]> var_136_begin_0 = const()[name = string("op_136_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_136_end_0 = const()[name = string("op_136_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_136_end_mask_0 = const()[name = string("op_136_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_136_cast_fp16 = slice_by_index(begin = var_136_begin_0, end = var_136_end_0, end_mask = var_136_end_mask_0, x = k_3_cast_fp16)[name = string("op_136_cast_fp16")];
+            tensor<int32, [4]> var_142_begin_0 = const()[name = string("op_142_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_142_end_0 = const()[name = string("op_142_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_142_end_mask_0 = const()[name = string("op_142_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_142_cast_fp16 = slice_by_index(begin = var_142_begin_0, end = var_142_end_0, end_mask = var_142_end_mask_0, x = k_3_cast_fp16)[name = string("op_142_cast_fp16")];
             fp16 const_8_promoted_to_fp16 = const()[name = string("const_8_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_143_cast_fp16 = mul(x = var_141_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_143_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_144_cast_fp16 = mul(x = var_142_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_144_cast_fp16")];
             bool rotated_3_interleave_0 = const()[name = string("rotated_3_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_3_cast_fp16 = concat(axis = var_34, interleave = rotated_3_interleave_0, values = (var_143_cast_fp16, var_135_cast_fp16))[name = string("rotated_3_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_146_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_146_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_147_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_147_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_3_cast_fp16 = add(x = var_146_cast_fp16, y = var_147_cast_fp16)[name = string("roped_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_3_cast_fp16 = concat(axis = var_34, interleave = rotated_3_interleave_0, values = (var_144_cast_fp16, var_136_cast_fp16))[name = string("rotated_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_147_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_147_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_148_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_148_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_3_cast_fp16 = add(x = var_147_cast_fp16, y = var_148_cast_fp16)[name = string("roped_3_cast_fp16")];
+            tensor<int32, [4]> v_5_perm_0 = const()[name = string("v_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             bool k_7_interleave_0 = const()[name = string("k_7_interleave_0"), val = bool(false)];
             tensor<fp16, [1, 32, 128, 512]> k_7_cast_fp16 = concat(axis = var_22, interleave = k_7_interleave_0, values = (k_cache_0, roped_3_cast_fp16))[name = string("k_7_cast_fp16")];
-            bool v_5_interleave_0 = const()[name = string("v_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> v_5_cast_fp16 = concat(axis = var_22, interleave = v_5_interleave_0, values = (v_cache_0, v_3_cast_fp16))[name = string("v_5_cast_fp16")];
-            tensor<int32, [4]> var_154_begin_0 = const()[name = string("op_154_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_154_end_0 = const()[name = string("op_154_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_154_end_mask_0 = const()[name = string("op_154_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_0 = slice_by_index(begin = var_154_begin_0, end = var_154_end_0, end_mask = var_154_end_mask_0, x = k_7_cast_fp16)[name = string("op_154_cast_fp16")];
-            tensor<int32, [4]> var_155_begin_0 = const()[name = string("op_155_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_155_end_0 = const()[name = string("op_155_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_155_end_mask_0 = const()[name = string("op_155_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_0 = slice_by_index(begin = var_155_begin_0, end = var_155_end_0, end_mask = var_155_end_mask_0, x = v_5_cast_fp16)[name = string("op_155_cast_fp16")];
-            fp16 var_159_to_fp16 = const()[name = string("op_159_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 1]> var_160_cast_fp16 = mul(x = roped_1_cast_fp16, y = var_159_to_fp16)[name = string("op_160_cast_fp16")];
+            bool v_7_interleave_0 = const()[name = string("v_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> v_5_cast_fp16 = transpose(perm = v_5_perm_0, x = v_3_cast_fp16)[name = string("transpose_5")];
+            tensor<fp16, [1, 32, 512, 128]> v_7_cast_fp16 = concat(axis = var_34, interleave = v_7_interleave_0, values = (v_cache_0, v_5_cast_fp16))[name = string("v_7_cast_fp16")];
+            tensor<int32, [4]> var_159_begin_0 = const()[name = string("op_159_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_159_end_0 = const()[name = string("op_159_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_159_end_mask_0 = const()[name = string("op_159_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_0 = slice_by_index(begin = var_159_begin_0, end = var_159_end_0, end_mask = var_159_end_mask_0, x = k_7_cast_fp16)[name = string("op_159_cast_fp16")];
+            tensor<int32, [4]> var_160_begin_0 = const()[name = string("op_160_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_160_end_0 = const()[name = string("op_160_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_160_end_mask_0 = const()[name = string("op_160_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_0 = slice_by_index(begin = var_160_begin_0, end = var_160_end_0, end_mask = var_160_end_mask_0, x = v_7_cast_fp16)[name = string("op_160_cast_fp16")];
+            fp16 var_165_to_fp16 = const()[name = string("op_165_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 32, 128, 4]> var_166_cast_fp16 = mul(x = roped_1_cast_fp16, y = var_165_to_fp16)[name = string("op_166_cast_fp16")];
             bool attn_weights_1_transpose_x_0 = const()[name = string("attn_weights_1_transpose_x_0"), val = bool(true)];
             bool attn_weights_1_transpose_y_0 = const()[name = string("attn_weights_1_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_160_cast_fp16, y = k_7_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = mask)[name = string("attn_weights_3_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> var_168_cast_fp16 = softmax(axis = var_30, x = attn_weights_3_cast_fp16)[name = string("op_168_cast_fp16")];
-            bool attn_1_transpose_x_0 = const()[name = string("attn_1_transpose_x_0"), val = bool(false)];
-            bool attn_1_transpose_y_0 = const()[name = string("attn_1_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 1]> attn_1_cast_fp16 = matmul(transpose_x = attn_1_transpose_x_0, transpose_y = attn_1_transpose_y_0, x = v_5_cast_fp16, y = var_168_cast_fp16)[name = string("attn_1_cast_fp16")];
-            tensor<int32, [4]> var_172 = const()[name = string("op_172"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
-            tensor<fp16, [1, 4096, 1, 1]> input_1_cast_fp16 = reshape(shape = var_172, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
-            tensor<int32, [2]> var_176 = const()[name = string("op_176"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_178 = const()[name = string("op_178"), val = tensor<int32, [2]>([1, 1])];
-            string var_180_pad_type_0 = const()[name = string("op_180_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_180_pad_0 = const()[name = string("op_180_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_180_cast_fp16 = conv(dilations = var_178, groups = var_31, pad = var_180_pad_0, pad_type = var_180_pad_type_0, strides = var_176, weight = blocks_0_attn_proj_weight_palettized_cast_fp16, x = input_1_cast_fp16)[name = string("op_180_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_166_cast_fp16, y = k_7_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = mask)[name = string("attn_weights_3_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_5_cast_fp16 = softmax(axis = var_30, x = attn_weights_3_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
+            bool var_175_transpose_x_0 = const()[name = string("op_175_transpose_x_0"), val = bool(false)];
+            bool var_175_transpose_y_0 = const()[name = string("op_175_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> var_175_cast_fp16 = matmul(transpose_x = var_175_transpose_x_0, transpose_y = var_175_transpose_y_0, x = attn_weights_5_cast_fp16, y = v_7_cast_fp16)[name = string("op_175_cast_fp16")];
+            tensor<int32, [4]> attn_1_perm_0 = const()[name = string("attn_1_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_178 = const()[name = string("op_178"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 4]> attn_1_cast_fp16 = transpose(perm = attn_1_perm_0, x = var_175_cast_fp16)[name = string("transpose_4")];
+            tensor<fp16, [1, 4096, 1, 4]> input_1_cast_fp16 = reshape(shape = var_178, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
+            tensor<int32, [2]> var_182 = const()[name = string("op_182"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_184 = const()[name = string("op_184"), val = tensor<int32, [2]>([1, 1])];
+            string var_186_pad_type_0 = const()[name = string("op_186_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_186_pad_0 = const()[name = string("op_186_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_186_cast_fp16 = conv(dilations = var_184, groups = var_31, pad = var_186_pad_0, pad_type = var_186_pad_type_0, strides = var_182, weight = blocks_0_attn_proj_weight_palettized_cast_fp16, x = input_1_cast_fp16)[name = string("op_186_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303600960)))];
-            tensor<fp16, [1, 4096, 1, 1]> attention_output_1_cast_fp16 = mul(x = var_180_cast_fp16, y = blocks_0_attn_proj_output_scales_to_fp16)[name = string("attention_output_1_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_11_cast_fp16 = add(x = attention_output_1_cast_fp16, y = x)[name = string("x_11_cast_fp16")];
-            tensor<int32, [1]> var_199_axes_0 = const()[name = string("op_199_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_199_cast_fp16 = squeeze(axes = var_199_axes_0, x = x_11_cast_fp16)[name = string("op_199_cast_fp16")];
-            bool var_201_interleave_0 = const()[name = string("op_201_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_3_to_fp16 = const()[name = string("eps_chan_3_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_201_cast_fp16 = concat(axis = var_31, interleave = var_201_interleave_0, values = (var_199_cast_fp16, eps_chan_3_to_fp16))[name = string("op_201_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> attention_output_1_cast_fp16 = mul(x = var_186_cast_fp16, y = blocks_0_attn_proj_output_scales_to_fp16)[name = string("attention_output_1_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_11_cast_fp16 = add(x = attention_output_1_cast_fp16, y = x)[name = string("x_11_cast_fp16")];
+            tensor<int32, [1]> var_205_axes_0 = const()[name = string("op_205_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_205_cast_fp16 = squeeze(axes = var_205_axes_0, x = x_11_cast_fp16)[name = string("op_205_cast_fp16")];
+            bool var_207_interleave_0 = const()[name = string("op_207_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_3_to_fp16 = const()[name = string("eps_chan_3_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_207_cast_fp16 = concat(axis = var_31, interleave = var_207_interleave_0, values = (var_205_cast_fp16, eps_chan_3_to_fp16))[name = string("op_207_cast_fp16")];
             tensor<int32, [1]> x_eps_3_axes_0 = const()[name = string("x_eps_3_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_3_cast_fp16 = expand_dims(axes = x_eps_3_axes_0, x = var_201_cast_fp16)[name = string("x_eps_3_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_3_cast_fp16 = expand_dims(axes = x_eps_3_axes_0, x = var_207_cast_fp16)[name = string("x_eps_3_cast_fp16")];
             tensor<int32, [1]> norm_x_3_axes_0 = const()[name = string("norm_x_3_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_3_cast_fp16 = reduce_l2_norm(axes = norm_x_3_axes_0, keep_dims = var_35, x = x_eps_3_cast_fp16)[name = string("norm_x_3_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_7_cast_fp16 = real_div(x = x_11_cast_fp16, y = norm_x_3_cast_fp16)[name = string("x_normed_7_cast_fp16")];
-            fp16 var_206_to_fp16 = const()[name = string("op_206_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_9_cast_fp16 = mul(x = x_normed_7_cast_fp16, y = var_206_to_fp16)[name = string("x_normed_9_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_3_cast_fp16 = reduce_l2_norm(axes = norm_x_3_axes_0, keep_dims = var_35, x = x_eps_3_cast_fp16)[name = string("norm_x_3_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_7_cast_fp16 = real_div(x = x_11_cast_fp16, y = norm_x_3_cast_fp16)[name = string("x_normed_7_cast_fp16")];
+            fp16 var_212_to_fp16 = const()[name = string("op_212_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_9_cast_fp16 = mul(x = x_normed_7_cast_fp16, y = var_212_to_fp16)[name = string("x_normed_9_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_2_weight_to_fp16 = const()[name = string("blocks_0_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303609216)))];
-            tensor<fp16, [1, 4096, 1, 1]> input_3_cast_fp16 = mul(x = x_normed_9_cast_fp16, y = blocks_0_norm_2_weight_to_fp16)[name = string("input_3_cast_fp16")];
-            tensor<int32, [2]> var_218 = const()[name = string("op_218"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_220 = const()[name = string("op_220"), val = tensor<int32, [2]>([1, 1])];
-            string var_222_pad_type_0 = const()[name = string("op_222_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_222_pad_0 = const()[name = string("op_222_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_222_cast_fp16 = conv(dilations = var_220, groups = var_31, pad = var_222_pad_0, pad_type = var_222_pad_type_0, strides = var_218, weight = blocks_0_mlp_fc_1_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_222_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303617472)))];
-            tensor<fp16, [1, 11008, 1, 1]> input_5_cast_fp16 = mul(x = var_222_cast_fp16, y = blocks_0_mlp_fc_1_output_scales_to_fp16)[name = string("input_5_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> input_3_cast_fp16 = mul(x = x_normed_9_cast_fp16, y = blocks_0_norm_2_weight_to_fp16)[name = string("input_3_cast_fp16")];
+            tensor<int32, [2]> var_224 = const()[name = string("op_224"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_226 = const()[name = string("op_226"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_228 = const()[name = string("op_228"), val = tensor<int32, [2]>([1, 1])];
-            string var_230_pad_type_0 = const()[name = string("op_230_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_230_pad_0 = const()[name = string("op_230_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_230_cast_fp16 = conv(dilations = var_228, groups = var_31, pad = var_230_pad_0, pad_type = var_230_pad_type_0, strides = var_226, weight = blocks_0_mlp_fc_2_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_230_cast_fp16")];
+            string var_228_pad_type_0 = const()[name = string("op_228_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_228_pad_0 = const()[name = string("op_228_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_228_cast_fp16 = conv(dilations = var_226, groups = var_31, pad = var_228_pad_0, pad_type = var_228_pad_type_0, strides = var_224, weight = blocks_0_mlp_fc_1_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_228_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303617472)))];
+            tensor<fp16, [1, 11008, 1, 4]> input_5_cast_fp16 = mul(x = var_228_cast_fp16, y = blocks_0_mlp_fc_1_output_scales_to_fp16)[name = string("input_5_cast_fp16")];
+            tensor<int32, [2]> var_232 = const()[name = string("op_232"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_234 = const()[name = string("op_234"), val = tensor<int32, [2]>([1, 1])];
+            string var_236_pad_type_0 = const()[name = string("op_236_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_236_pad_0 = const()[name = string("op_236_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_236_cast_fp16 = conv(dilations = var_234, groups = var_31, pad = var_236_pad_0, pad_type = var_236_pad_type_0, strides = var_232, weight = blocks_0_mlp_fc_2_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_236_cast_fp16")];
             tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303639552)))];
-            tensor<fp16, [1, 11008, 1, 1]> x_fc_2_1_cast_fp16 = mul(x = var_230_cast_fp16, y = blocks_0_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_1_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> var_232_cast_fp16 = silu(x = input_5_cast_fp16)[name = string("op_232_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> input_7_cast_fp16 = mul(x = var_232_cast_fp16, y = x_fc_2_1_cast_fp16)[name = string("input_7_cast_fp16")];
-            tensor<int32, [2]> var_236 = const()[name = string("op_236"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_238 = const()[name = string("op_238"), val = tensor<int32, [2]>([1, 1])];
-            string var_240_pad_type_0 = const()[name = string("op_240_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_240_pad_0 = const()[name = string("op_240_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_240_cast_fp16 = conv(dilations = var_238, groups = var_31, pad = var_240_pad_0, pad_type = var_240_pad_type_0, strides = var_236, weight = blocks_0_mlp_proj_weight_palettized_cast_fp16, x = input_7_cast_fp16)[name = string("op_240_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> x_fc_2_1_cast_fp16 = mul(x = var_236_cast_fp16, y = blocks_0_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_1_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> var_238_cast_fp16 = silu(x = input_5_cast_fp16)[name = string("op_238_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> input_7_cast_fp16 = mul(x = var_238_cast_fp16, y = x_fc_2_1_cast_fp16)[name = string("input_7_cast_fp16")];
+            tensor<int32, [2]> var_242 = const()[name = string("op_242"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_244 = const()[name = string("op_244"), val = tensor<int32, [2]>([1, 1])];
+            string var_246_pad_type_0 = const()[name = string("op_246_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_246_pad_0 = const()[name = string("op_246_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_246_cast_fp16 = conv(dilations = var_244, groups = var_31, pad = var_246_pad_0, pad_type = var_246_pad_type_0, strides = var_242, weight = blocks_0_mlp_proj_weight_palettized_cast_fp16, x = input_7_cast_fp16)[name = string("op_246_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303661632)))];
-            tensor<fp16, [1, 4096, 1, 1]> var_241_cast_fp16 = mul(x = var_240_cast_fp16, y = blocks_0_mlp_proj_output_scales_to_fp16)[name = string("op_241_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_15_cast_fp16 = add(x = var_241_cast_fp16, y = x_11_cast_fp16)[name = string("x_15_cast_fp16")];
-            int32 var_252 = const()[name = string("op_252"), val = int32(-1)];
-            int32 var_260 = const()[name = string("op_260"), val = int32(3)];
-            int32 var_261 = const()[name = string("op_261"), val = int32(1)];
-            int32 var_264 = const()[name = string("op_264"), val = int32(-2)];
-            bool var_265 = const()[name = string("op_265"), val = bool(true)];
-            tensor<int32, [1]> var_282_axes_0 = const()[name = string("op_282_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_282_cast_fp16 = squeeze(axes = var_282_axes_0, x = x_15_cast_fp16)[name = string("op_282_cast_fp16")];
-            bool var_284_interleave_0 = const()[name = string("op_284_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_5_to_fp16 = const()[name = string("eps_chan_5_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_284_cast_fp16 = concat(axis = var_261, interleave = var_284_interleave_0, values = (var_282_cast_fp16, eps_chan_5_to_fp16))[name = string("op_284_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> var_247_cast_fp16 = mul(x = var_246_cast_fp16, y = blocks_0_mlp_proj_output_scales_to_fp16)[name = string("op_247_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_15_cast_fp16 = add(x = var_247_cast_fp16, y = x_11_cast_fp16)[name = string("x_15_cast_fp16")];
+            int32 var_258 = const()[name = string("op_258"), val = int32(-1)];
+            int32 var_266 = const()[name = string("op_266"), val = int32(3)];
+            int32 var_267 = const()[name = string("op_267"), val = int32(1)];
+            int32 var_270 = const()[name = string("op_270"), val = int32(-2)];
+            bool var_271 = const()[name = string("op_271"), val = bool(true)];
+            tensor<int32, [1]> var_288_axes_0 = const()[name = string("op_288_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_288_cast_fp16 = squeeze(axes = var_288_axes_0, x = x_15_cast_fp16)[name = string("op_288_cast_fp16")];
+            bool var_290_interleave_0 = const()[name = string("op_290_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_5_to_fp16 = const()[name = string("eps_chan_5_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_290_cast_fp16 = concat(axis = var_267, interleave = var_290_interleave_0, values = (var_288_cast_fp16, eps_chan_5_to_fp16))[name = string("op_290_cast_fp16")];
             tensor<int32, [1]> x_eps_5_axes_0 = const()[name = string("x_eps_5_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_5_cast_fp16 = expand_dims(axes = x_eps_5_axes_0, x = var_284_cast_fp16)[name = string("x_eps_5_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_5_cast_fp16 = expand_dims(axes = x_eps_5_axes_0, x = var_290_cast_fp16)[name = string("x_eps_5_cast_fp16")];
             tensor<int32, [1]> norm_x_5_axes_0 = const()[name = string("norm_x_5_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_5_cast_fp16 = reduce_l2_norm(axes = norm_x_5_axes_0, keep_dims = var_265, x = x_eps_5_cast_fp16)[name = string("norm_x_5_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_13_cast_fp16 = real_div(x = x_15_cast_fp16, y = norm_x_5_cast_fp16)[name = string("x_normed_13_cast_fp16")];
-            fp16 var_289_to_fp16 = const()[name = string("op_289_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_289_to_fp16)[name = string("x_normed_15_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_5_cast_fp16 = reduce_l2_norm(axes = norm_x_5_axes_0, keep_dims = var_271, x = x_eps_5_cast_fp16)[name = string("norm_x_5_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_13_cast_fp16 = real_div(x = x_15_cast_fp16, y = norm_x_5_cast_fp16)[name = string("x_normed_13_cast_fp16")];
+            fp16 var_295_to_fp16 = const()[name = string("op_295_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_295_to_fp16)[name = string("x_normed_15_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_1_weight_to_fp16 = const()[name = string("blocks_1_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303669888)))];
-            tensor<fp16, [1, 4096, 1, 1]> x_19_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = string("x_19_cast_fp16")];
-            tensor<int32, [2]> var_304 = const()[name = string("op_304"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_306 = const()[name = string("op_306"), val = tensor<int32, [2]>([1, 1])];
-            string var_308_pad_type_0 = const()[name = string("op_308_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_308_pad_0 = const()[name = string("op_308_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_308_cast_fp16 = conv(dilations = var_306, groups = var_261, pad = var_308_pad_0, pad_type = var_308_pad_type_0, strides = var_304, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_308_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_19_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = string("x_19_cast_fp16")];
+            tensor<int32, [2]> var_311 = const()[name = string("op_311"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_313 = const()[name = string("op_313"), val = tensor<int32, [2]>([1, 1])];
+            string var_315_pad_type_0 = const()[name = string("op_315_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_315_pad_0 = const()[name = string("op_315_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_315_cast_fp16 = conv(dilations = var_313, groups = var_267, pad = var_315_pad_0, pad_type = var_315_pad_type_0, strides = var_311, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_315_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303678144)))];
-            tensor<fp16, [1, 4096, 1, 1]> q_7_cast_fp16 = mul(x = var_308_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
-            tensor<int32, [2]> var_312 = const()[name = string("op_312"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_314 = const()[name = string("op_314"), val = tensor<int32, [2]>([1, 1])];
-            string var_316_pad_type_0 = const()[name = string("op_316_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_316_pad_0 = const()[name = string("op_316_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_316_cast_fp16 = conv(dilations = var_314, groups = var_261, pad = var_316_pad_0, pad_type = var_316_pad_type_0, strides = var_312, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_316_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> q_7_cast_fp16 = mul(x = var_315_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
+            tensor<int32, [2]> var_319 = const()[name = string("op_319"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_321 = const()[name = string("op_321"), val = tensor<int32, [2]>([1, 1])];
+            string var_323_pad_type_0 = const()[name = string("op_323_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_323_pad_0 = const()[name = string("op_323_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_323_cast_fp16 = conv(dilations = var_321, groups = var_267, pad = var_323_pad_0, pad_type = var_323_pad_type_0, strides = var_319, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_323_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303686400)))];
-            tensor<fp16, [1, 4096, 1, 1]> k_9_cast_fp16 = mul(x = var_316_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_9_cast_fp16")];
-            tensor<int32, [2]> var_320 = const()[name = string("op_320"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_322 = const()[name = string("op_322"), val = tensor<int32, [2]>([1, 1])];
-            string var_324_pad_type_0 = const()[name = string("op_324_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_324_pad_0 = const()[name = string("op_324_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_324_cast_fp16 = conv(dilations = var_322, groups = var_261, pad = var_324_pad_0, pad_type = var_324_pad_type_0, strides = var_320, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_324_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> k_9_cast_fp16 = mul(x = var_323_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_9_cast_fp16")];
+            tensor<int32, [2]> var_327 = const()[name = string("op_327"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_329 = const()[name = string("op_329"), val = tensor<int32, [2]>([1, 1])];
+            string var_331_pad_type_0 = const()[name = string("op_331_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_331_pad_0 = const()[name = string("op_331_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_331_cast_fp16 = conv(dilations = var_329, groups = var_267, pad = var_331_pad_0, pad_type = var_331_pad_type_0, strides = var_327, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_331_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303694656)))];
-            tensor<fp16, [1, 4096, 1, 1]> v_7_cast_fp16 = mul(x = var_324_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_7_cast_fp16")];
-            tensor<int32, [4]> var_326 = const()[name = string("op_326"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> q_9_cast_fp16 = reshape(shape = var_326, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
-            tensor<int32, [4]> var_328 = const()[name = string("op_328"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> k_11_cast_fp16 = reshape(shape = var_328, x = k_9_cast_fp16)[name = string("k_11_cast_fp16")];
-            tensor<int32, [4]> var_330 = const()[name = string("op_330"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> v_9_cast_fp16 = reshape(shape = var_330, x = v_7_cast_fp16)[name = string("v_9_cast_fp16")];
-            tensor<int32, [4]> var_342_begin_0 = const()[name = string("op_342_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_342_end_0 = const()[name = string("op_342_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_342_end_mask_0 = const()[name = string("op_342_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_342_cast_fp16 = slice_by_index(begin = var_342_begin_0, end = var_342_end_0, end_mask = var_342_end_mask_0, x = q_9_cast_fp16)[name = string("op_342_cast_fp16")];
-            tensor<int32, [4]> var_348_begin_0 = const()[name = string("op_348_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_348_end_0 = const()[name = string("op_348_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_348_end_mask_0 = const()[name = string("op_348_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_348_cast_fp16 = slice_by_index(begin = var_348_begin_0, end = var_348_end_0, end_mask = var_348_end_mask_0, x = q_9_cast_fp16)[name = string("op_348_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> v_9_cast_fp16 = mul(x = var_331_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_9_cast_fp16")];
+            tensor<int32, [4]> var_333 = const()[name = string("op_333"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> q_9_cast_fp16 = reshape(shape = var_333, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
+            tensor<int32, [4]> var_335 = const()[name = string("op_335"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> k_11_cast_fp16 = reshape(shape = var_335, x = k_9_cast_fp16)[name = string("k_11_cast_fp16")];
+            tensor<int32, [4]> var_337 = const()[name = string("op_337"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> v_11_cast_fp16 = reshape(shape = var_337, x = v_9_cast_fp16)[name = string("v_11_cast_fp16")];
+            tensor<int32, [4]> var_349_begin_0 = const()[name = string("op_349_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_349_end_0 = const()[name = string("op_349_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_349_end_mask_0 = const()[name = string("op_349_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_349_cast_fp16 = slice_by_index(begin = var_349_begin_0, end = var_349_end_0, end_mask = var_349_end_mask_0, x = q_9_cast_fp16)[name = string("op_349_cast_fp16")];
+            tensor<int32, [4]> var_355_begin_0 = const()[name = string("op_355_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_355_end_0 = const()[name = string("op_355_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_355_end_mask_0 = const()[name = string("op_355_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_355_cast_fp16 = slice_by_index(begin = var_355_begin_0, end = var_355_end_0, end_mask = var_355_end_mask_0, x = q_9_cast_fp16)[name = string("op_355_cast_fp16")];
             fp16 const_19_promoted_to_fp16 = const()[name = string("const_19_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_350_cast_fp16 = mul(x = var_348_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_350_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_357_cast_fp16 = mul(x = var_355_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_357_cast_fp16")];
             bool rotated_5_interleave_0 = const()[name = string("rotated_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_5_cast_fp16 = concat(axis = var_264, interleave = rotated_5_interleave_0, values = (var_350_cast_fp16, var_342_cast_fp16))[name = string("rotated_5_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_353_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_353_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_354_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_354_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_5_cast_fp16 = add(x = var_353_cast_fp16, y = var_354_cast_fp16)[name = string("roped_5_cast_fp16")];
-            tensor<int32, [4]> var_367_begin_0 = const()[name = string("op_367_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_367_end_0 = const()[name = string("op_367_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_367_end_mask_0 = const()[name = string("op_367_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_367_cast_fp16 = slice_by_index(begin = var_367_begin_0, end = var_367_end_0, end_mask = var_367_end_mask_0, x = k_11_cast_fp16)[name = string("op_367_cast_fp16")];
-            tensor<int32, [4]> var_373_begin_0 = const()[name = string("op_373_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_373_end_0 = const()[name = string("op_373_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_373_end_mask_0 = const()[name = string("op_373_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_373_cast_fp16 = slice_by_index(begin = var_373_begin_0, end = var_373_end_0, end_mask = var_373_end_mask_0, x = k_11_cast_fp16)[name = string("op_373_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_5_cast_fp16 = concat(axis = var_270, interleave = rotated_5_interleave_0, values = (var_357_cast_fp16, var_349_cast_fp16))[name = string("rotated_5_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_360_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_360_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_361_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_361_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_5_cast_fp16 = add(x = var_360_cast_fp16, y = var_361_cast_fp16)[name = string("roped_5_cast_fp16")];
+            tensor<int32, [4]> var_374_begin_0 = const()[name = string("op_374_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_374_end_0 = const()[name = string("op_374_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_374_end_mask_0 = const()[name = string("op_374_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_374_cast_fp16 = slice_by_index(begin = var_374_begin_0, end = var_374_end_0, end_mask = var_374_end_mask_0, x = k_11_cast_fp16)[name = string("op_374_cast_fp16")];
+            tensor<int32, [4]> var_380_begin_0 = const()[name = string("op_380_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_380_end_0 = const()[name = string("op_380_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_380_end_mask_0 = const()[name = string("op_380_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_380_cast_fp16 = slice_by_index(begin = var_380_begin_0, end = var_380_end_0, end_mask = var_380_end_mask_0, x = k_11_cast_fp16)[name = string("op_380_cast_fp16")];
             fp16 const_21_promoted_to_fp16 = const()[name = string("const_21_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_375_cast_fp16 = mul(x = var_373_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_375_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_382_cast_fp16 = mul(x = var_380_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_382_cast_fp16")];
             bool rotated_7_interleave_0 = const()[name = string("rotated_7_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_7_cast_fp16 = concat(axis = var_264, interleave = rotated_7_interleave_0, values = (var_375_cast_fp16, var_367_cast_fp16))[name = string("rotated_7_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_378_cast_fp16 = mul(x = k_11_cast_fp16, y = cos)[name = string("op_378_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_379_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = string("op_379_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_7_cast_fp16 = add(x = var_378_cast_fp16, y = var_379_cast_fp16)[name = string("roped_7_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_7_cast_fp16 = concat(axis = var_270, interleave = rotated_7_interleave_0, values = (var_382_cast_fp16, var_374_cast_fp16))[name = string("rotated_7_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_385_cast_fp16 = mul(x = k_11_cast_fp16, y = cos)[name = string("op_385_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_386_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = string("op_386_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_7_cast_fp16 = add(x = var_385_cast_fp16, y = var_386_cast_fp16)[name = string("roped_7_cast_fp16")];
+            tensor<int32, [4]> v_13_perm_0 = const()[name = string("v_13_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             bool k_15_interleave_0 = const()[name = string("k_15_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = concat(axis = var_252, interleave = k_15_interleave_0, values = (k_cache_1, roped_7_cast_fp16))[name = string("k_15_cast_fp16")];
-            bool v_11_interleave_0 = const()[name = string("v_11_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> v_11_cast_fp16 = concat(axis = var_252, interleave = v_11_interleave_0, values = (v_cache_1, v_9_cast_fp16))[name = string("v_11_cast_fp16")];
-            tensor<int32, [4]> var_386_begin_0 = const()[name = string("op_386_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_386_end_0 = const()[name = string("op_386_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_386_end_mask_0 = const()[name = string("op_386_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_1 = slice_by_index(begin = var_386_begin_0, end = var_386_end_0, end_mask = var_386_end_mask_0, x = k_15_cast_fp16)[name = string("op_386_cast_fp16")];
-            tensor<int32, [4]> var_387_begin_0 = const()[name = string("op_387_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_387_end_0 = const()[name = string("op_387_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_387_end_mask_0 = const()[name = string("op_387_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_1 = slice_by_index(begin = var_387_begin_0, end = var_387_end_0, end_mask = var_387_end_mask_0, x = v_11_cast_fp16)[name = string("op_387_cast_fp16")];
-            fp16 var_391_to_fp16 = const()[name = string("op_391_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 1]> var_392_cast_fp16 = mul(x = roped_5_cast_fp16, y = var_391_to_fp16)[name = string("op_392_cast_fp16")];
-            bool attn_weights_5_transpose_x_0 = const()[name = string("attn_weights_5_transpose_x_0"), val = bool(true)];
-            bool attn_weights_5_transpose_y_0 = const()[name = string("attn_weights_5_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_5_cast_fp16 = matmul(transpose_x = attn_weights_5_transpose_x_0, transpose_y = attn_weights_5_transpose_y_0, x = var_392_cast_fp16, y = k_15_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_7_cast_fp16 = add(x = attn_weights_5_cast_fp16, y = mask)[name = string("attn_weights_7_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> var_400_cast_fp16 = softmax(axis = var_260, x = attn_weights_7_cast_fp16)[name = string("op_400_cast_fp16")];
-            bool attn_5_transpose_x_0 = const()[name = string("attn_5_transpose_x_0"), val = bool(false)];
-            bool attn_5_transpose_y_0 = const()[name = string("attn_5_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 1]> attn_5_cast_fp16 = matmul(transpose_x = attn_5_transpose_x_0, transpose_y = attn_5_transpose_y_0, x = v_11_cast_fp16, y = var_400_cast_fp16)[name = string("attn_5_cast_fp16")];
-            tensor<int32, [4]> var_404 = const()[name = string("op_404"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
-            tensor<fp16, [1, 4096, 1, 1]> input_9_cast_fp16 = reshape(shape = var_404, x = attn_5_cast_fp16)[name = string("input_9_cast_fp16")];
-            tensor<int32, [2]> var_408 = const()[name = string("op_408"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_410 = const()[name = string("op_410"), val = tensor<int32, [2]>([1, 1])];
-            string var_412_pad_type_0 = const()[name = string("op_412_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_412_pad_0 = const()[name = string("op_412_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_412_cast_fp16 = conv(dilations = var_410, groups = var_261, pad = var_412_pad_0, pad_type = var_412_pad_type_0, strides = var_408, weight = blocks_1_attn_proj_weight_palettized_cast_fp16, x = input_9_cast_fp16)[name = string("op_412_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = concat(axis = var_258, interleave = k_15_interleave_0, values = (k_cache_1, roped_7_cast_fp16))[name = string("k_15_cast_fp16")];
+            bool v_15_interleave_0 = const()[name = string("v_15_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> v_13_cast_fp16 = transpose(perm = v_13_perm_0, x = v_11_cast_fp16)[name = string("transpose_3")];
+            tensor<fp16, [1, 32, 512, 128]> v_15_cast_fp16 = concat(axis = var_270, interleave = v_15_interleave_0, values = (v_cache_1, v_13_cast_fp16))[name = string("v_15_cast_fp16")];
+            tensor<int32, [4]> var_397_begin_0 = const()[name = string("op_397_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_397_end_0 = const()[name = string("op_397_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_397_end_mask_0 = const()[name = string("op_397_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_1 = slice_by_index(begin = var_397_begin_0, end = var_397_end_0, end_mask = var_397_end_mask_0, x = k_15_cast_fp16)[name = string("op_397_cast_fp16")];
+            tensor<int32, [4]> var_398_begin_0 = const()[name = string("op_398_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_398_end_0 = const()[name = string("op_398_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_398_end_mask_0 = const()[name = string("op_398_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_1 = slice_by_index(begin = var_398_begin_0, end = var_398_end_0, end_mask = var_398_end_mask_0, x = v_15_cast_fp16)[name = string("op_398_cast_fp16")];
+            fp16 var_403_to_fp16 = const()[name = string("op_403_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 32, 128, 4]> var_404_cast_fp16 = mul(x = roped_5_cast_fp16, y = var_403_to_fp16)[name = string("op_404_cast_fp16")];
+            bool attn_weights_7_transpose_x_0 = const()[name = string("attn_weights_7_transpose_x_0"), val = bool(true)];
+            bool attn_weights_7_transpose_y_0 = const()[name = string("attn_weights_7_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_7_cast_fp16 = matmul(transpose_x = attn_weights_7_transpose_x_0, transpose_y = attn_weights_7_transpose_y_0, x = var_404_cast_fp16, y = k_15_cast_fp16)[name = string("attn_weights_7_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_9_cast_fp16 = add(x = attn_weights_7_cast_fp16, y = mask)[name = string("attn_weights_9_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_11_cast_fp16 = softmax(axis = var_266, x = attn_weights_9_cast_fp16)[name = string("attn_weights_11_cast_fp16")];
+            bool var_413_transpose_x_0 = const()[name = string("op_413_transpose_x_0"), val = bool(false)];
+            bool var_413_transpose_y_0 = const()[name = string("op_413_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> var_413_cast_fp16 = matmul(transpose_x = var_413_transpose_x_0, transpose_y = var_413_transpose_y_0, x = attn_weights_11_cast_fp16, y = v_15_cast_fp16)[name = string("op_413_cast_fp16")];
+            tensor<int32, [4]> attn_3_perm_0 = const()[name = string("attn_3_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_416 = const()[name = string("op_416"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 4]> attn_3_cast_fp16 = transpose(perm = attn_3_perm_0, x = var_413_cast_fp16)[name = string("transpose_2")];
+            tensor<fp16, [1, 4096, 1, 4]> input_9_cast_fp16 = reshape(shape = var_416, x = attn_3_cast_fp16)[name = string("input_9_cast_fp16")];
+            tensor<int32, [2]> var_420 = const()[name = string("op_420"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_422 = const()[name = string("op_422"), val = tensor<int32, [2]>([1, 1])];
+            string var_424_pad_type_0 = const()[name = string("op_424_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_424_pad_0 = const()[name = string("op_424_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_424_cast_fp16 = conv(dilations = var_422, groups = var_267, pad = var_424_pad_0, pad_type = var_424_pad_type_0, strides = var_420, weight = blocks_1_attn_proj_weight_palettized_cast_fp16, x = input_9_cast_fp16)[name = string("op_424_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303702912)))];
-            tensor<fp16, [1, 4096, 1, 1]> attention_output_3_cast_fp16 = mul(x = var_412_cast_fp16, y = blocks_1_attn_proj_output_scales_to_fp16)[name = string("attention_output_3_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_25_cast_fp16 = add(x = attention_output_3_cast_fp16, y = x_15_cast_fp16)[name = string("x_25_cast_fp16")];
-            tensor<int32, [1]> var_431_axes_0 = const()[name = string("op_431_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_431_cast_fp16 = squeeze(axes = var_431_axes_0, x = x_25_cast_fp16)[name = string("op_431_cast_fp16")];
-            bool var_433_interleave_0 = const()[name = string("op_433_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_7_to_fp16 = const()[name = string("eps_chan_7_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_433_cast_fp16 = concat(axis = var_261, interleave = var_433_interleave_0, values = (var_431_cast_fp16, eps_chan_7_to_fp16))[name = string("op_433_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> attention_output_3_cast_fp16 = mul(x = var_424_cast_fp16, y = blocks_1_attn_proj_output_scales_to_fp16)[name = string("attention_output_3_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_25_cast_fp16 = add(x = attention_output_3_cast_fp16, y = x_15_cast_fp16)[name = string("x_25_cast_fp16")];
+            tensor<int32, [1]> var_443_axes_0 = const()[name = string("op_443_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_443_cast_fp16 = squeeze(axes = var_443_axes_0, x = x_25_cast_fp16)[name = string("op_443_cast_fp16")];
+            bool var_445_interleave_0 = const()[name = string("op_445_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_7_to_fp16 = const()[name = string("eps_chan_7_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_445_cast_fp16 = concat(axis = var_267, interleave = var_445_interleave_0, values = (var_443_cast_fp16, eps_chan_7_to_fp16))[name = string("op_445_cast_fp16")];
             tensor<int32, [1]> x_eps_7_axes_0 = const()[name = string("x_eps_7_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_7_cast_fp16 = expand_dims(axes = x_eps_7_axes_0, x = var_433_cast_fp16)[name = string("x_eps_7_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_7_cast_fp16 = expand_dims(axes = x_eps_7_axes_0, x = var_445_cast_fp16)[name = string("x_eps_7_cast_fp16")];
             tensor<int32, [1]> norm_x_7_axes_0 = const()[name = string("norm_x_7_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_7_cast_fp16 = reduce_l2_norm(axes = norm_x_7_axes_0, keep_dims = var_265, x = x_eps_7_cast_fp16)[name = string("norm_x_7_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_19_cast_fp16 = real_div(x = x_25_cast_fp16, y = norm_x_7_cast_fp16)[name = string("x_normed_19_cast_fp16")];
-            fp16 var_438_to_fp16 = const()[name = string("op_438_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_21_cast_fp16 = mul(x = x_normed_19_cast_fp16, y = var_438_to_fp16)[name = string("x_normed_21_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_7_cast_fp16 = reduce_l2_norm(axes = norm_x_7_axes_0, keep_dims = var_271, x = x_eps_7_cast_fp16)[name = string("norm_x_7_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_19_cast_fp16 = real_div(x = x_25_cast_fp16, y = norm_x_7_cast_fp16)[name = string("x_normed_19_cast_fp16")];
+            fp16 var_450_to_fp16 = const()[name = string("op_450_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_21_cast_fp16 = mul(x = x_normed_19_cast_fp16, y = var_450_to_fp16)[name = string("x_normed_21_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_2_weight_to_fp16 = const()[name = string("blocks_1_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303711168)))];
-            tensor<fp16, [1, 4096, 1, 1]> input_11_cast_fp16 = mul(x = x_normed_21_cast_fp16, y = blocks_1_norm_2_weight_to_fp16)[name = string("input_11_cast_fp16")];
-            tensor<int32, [2]> var_450 = const()[name = string("op_450"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_452 = const()[name = string("op_452"), val = tensor<int32, [2]>([1, 1])];
-            string var_454_pad_type_0 = const()[name = string("op_454_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_454_pad_0 = const()[name = string("op_454_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_454_cast_fp16 = conv(dilations = var_452, groups = var_261, pad = var_454_pad_0, pad_type = var_454_pad_type_0, strides = var_450, weight = blocks_1_mlp_fc_1_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_454_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> input_11_cast_fp16 = mul(x = x_normed_21_cast_fp16, y = blocks_1_norm_2_weight_to_fp16)[name = string("input_11_cast_fp16")];
+            tensor<int32, [2]> var_462 = const()[name = string("op_462"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_464 = const()[name = string("op_464"), val = tensor<int32, [2]>([1, 1])];
+            string var_466_pad_type_0 = const()[name = string("op_466_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_466_pad_0 = const()[name = string("op_466_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_466_cast_fp16 = conv(dilations = var_464, groups = var_267, pad = var_466_pad_0, pad_type = var_466_pad_type_0, strides = var_462, weight = blocks_1_mlp_fc_1_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_466_cast_fp16")];
             tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303719424)))];
-            tensor<fp16, [1, 11008, 1, 1]> input_13_cast_fp16 = mul(x = var_454_cast_fp16, y = blocks_1_mlp_fc_1_output_scales_to_fp16)[name = string("input_13_cast_fp16")];
-            tensor<int32, [2]> var_458 = const()[name = string("op_458"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_460 = const()[name = string("op_460"), val = tensor<int32, [2]>([1, 1])];
-            string var_462_pad_type_0 = const()[name = string("op_462_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_462_pad_0 = const()[name = string("op_462_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_462_cast_fp16 = conv(dilations = var_460, groups = var_261, pad = var_462_pad_0, pad_type = var_462_pad_type_0, strides = var_458, weight = blocks_1_mlp_fc_2_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_462_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303741504)))];
-            tensor<fp16, [1, 11008, 1, 1]> x_fc_2_3_cast_fp16 = mul(x = var_462_cast_fp16, y = blocks_1_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_3_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> var_464_cast_fp16 = silu(x = input_13_cast_fp16)[name = string("op_464_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> input_15_cast_fp16 = mul(x = var_464_cast_fp16, y = x_fc_2_3_cast_fp16)[name = string("input_15_cast_fp16")];
-            tensor<int32, [2]> var_468 = const()[name = string("op_468"), val = tensor<int32, [2]>([1, 1])];
+            tensor<fp16, [1, 11008, 1, 4]> input_13_cast_fp16 = mul(x = var_466_cast_fp16, y = blocks_1_mlp_fc_1_output_scales_to_fp16)[name = string("input_13_cast_fp16")];
             tensor<int32, [2]> var_470 = const()[name = string("op_470"), val = tensor<int32, [2]>([1, 1])];
-            string var_472_pad_type_0 = const()[name = string("op_472_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_472_pad_0 = const()[name = string("op_472_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_472_cast_fp16 = conv(dilations = var_470, groups = var_261, pad = var_472_pad_0, pad_type = var_472_pad_type_0, strides = var_468, weight = blocks_1_mlp_proj_weight_palettized_cast_fp16, x = input_15_cast_fp16)[name = string("op_472_cast_fp16")];
+            tensor<int32, [2]> var_472 = const()[name = string("op_472"), val = tensor<int32, [2]>([1, 1])];
+            string var_474_pad_type_0 = const()[name = string("op_474_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_474_pad_0 = const()[name = string("op_474_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_474_cast_fp16 = conv(dilations = var_472, groups = var_267, pad = var_474_pad_0, pad_type = var_474_pad_type_0, strides = var_470, weight = blocks_1_mlp_fc_2_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_474_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303741504)))];
+            tensor<fp16, [1, 11008, 1, 4]> x_fc_2_3_cast_fp16 = mul(x = var_474_cast_fp16, y = blocks_1_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_3_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> var_476_cast_fp16 = silu(x = input_13_cast_fp16)[name = string("op_476_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> input_15_cast_fp16 = mul(x = var_476_cast_fp16, y = x_fc_2_3_cast_fp16)[name = string("input_15_cast_fp16")];
+            tensor<int32, [2]> var_480 = const()[name = string("op_480"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_482 = const()[name = string("op_482"), val = tensor<int32, [2]>([1, 1])];
+            string var_484_pad_type_0 = const()[name = string("op_484_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_484_pad_0 = const()[name = string("op_484_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_484_cast_fp16 = conv(dilations = var_482, groups = var_267, pad = var_484_pad_0, pad_type = var_484_pad_type_0, strides = var_480, weight = blocks_1_mlp_proj_weight_palettized_cast_fp16, x = input_15_cast_fp16)[name = string("op_484_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303763584)))];
-            tensor<fp16, [1, 4096, 1, 1]> var_473_cast_fp16 = mul(x = var_472_cast_fp16, y = blocks_1_mlp_proj_output_scales_to_fp16)[name = string("op_473_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_29_cast_fp16 = add(x = var_473_cast_fp16, y = x_25_cast_fp16)[name = string("x_29_cast_fp16")];
-            int32 var_484 = const()[name = string("op_484"), val = int32(-1)];
-            int32 var_492 = const()[name = string("op_492"), val = int32(3)];
-            int32 var_493 = const()[name = string("op_493"), val = int32(1)];
-            int32 var_496 = const()[name = string("op_496"), val = int32(-2)];
-            bool var_497 = const()[name = string("op_497"), val = bool(true)];
-            tensor<int32, [1]> var_514_axes_0 = const()[name = string("op_514_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_514_cast_fp16 = squeeze(axes = var_514_axes_0, x = x_29_cast_fp16)[name = string("op_514_cast_fp16")];
-            bool var_516_interleave_0 = const()[name = string("op_516_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_9_to_fp16 = const()[name = string("eps_chan_9_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_516_cast_fp16 = concat(axis = var_493, interleave = var_516_interleave_0, values = (var_514_cast_fp16, eps_chan_9_to_fp16))[name = string("op_516_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> var_485_cast_fp16 = mul(x = var_484_cast_fp16, y = blocks_1_mlp_proj_output_scales_to_fp16)[name = string("op_485_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_29_cast_fp16 = add(x = var_485_cast_fp16, y = x_25_cast_fp16)[name = string("x_29_cast_fp16")];
+            int32 var_496 = const()[name = string("op_496"), val = int32(-1)];
+            int32 var_504 = const()[name = string("op_504"), val = int32(3)];
+            int32 var_505 = const()[name = string("op_505"), val = int32(1)];
+            int32 var_508 = const()[name = string("op_508"), val = int32(-2)];
+            bool var_509 = const()[name = string("op_509"), val = bool(true)];
+            tensor<int32, [1]> var_526_axes_0 = const()[name = string("op_526_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_526_cast_fp16 = squeeze(axes = var_526_axes_0, x = x_29_cast_fp16)[name = string("op_526_cast_fp16")];
+            bool var_528_interleave_0 = const()[name = string("op_528_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_9_to_fp16 = const()[name = string("eps_chan_9_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_528_cast_fp16 = concat(axis = var_505, interleave = var_528_interleave_0, values = (var_526_cast_fp16, eps_chan_9_to_fp16))[name = string("op_528_cast_fp16")];
             tensor<int32, [1]> x_eps_9_axes_0 = const()[name = string("x_eps_9_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_9_cast_fp16 = expand_dims(axes = x_eps_9_axes_0, x = var_516_cast_fp16)[name = string("x_eps_9_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_9_cast_fp16 = expand_dims(axes = x_eps_9_axes_0, x = var_528_cast_fp16)[name = string("x_eps_9_cast_fp16")];
             tensor<int32, [1]> norm_x_9_axes_0 = const()[name = string("norm_x_9_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_9_cast_fp16 = reduce_l2_norm(axes = norm_x_9_axes_0, keep_dims = var_497, x = x_eps_9_cast_fp16)[name = string("norm_x_9_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_25_cast_fp16 = real_div(x = x_29_cast_fp16, y = norm_x_9_cast_fp16)[name = string("x_normed_25_cast_fp16")];
-            fp16 var_521_to_fp16 = const()[name = string("op_521_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_27_cast_fp16 = mul(x = x_normed_25_cast_fp16, y = var_521_to_fp16)[name = string("x_normed_27_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_9_cast_fp16 = reduce_l2_norm(axes = norm_x_9_axes_0, keep_dims = var_509, x = x_eps_9_cast_fp16)[name = string("norm_x_9_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_25_cast_fp16 = real_div(x = x_29_cast_fp16, y = norm_x_9_cast_fp16)[name = string("x_normed_25_cast_fp16")];
+            fp16 var_533_to_fp16 = const()[name = string("op_533_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_27_cast_fp16 = mul(x = x_normed_25_cast_fp16, y = var_533_to_fp16)[name = string("x_normed_27_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_norm_1_weight_to_fp16 = const()[name = string("blocks_2_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303771840)))];
-            tensor<fp16, [1, 4096, 1, 1]> x_33_cast_fp16 = mul(x = x_normed_27_cast_fp16, y = blocks_2_norm_1_weight_to_fp16)[name = string("x_33_cast_fp16")];
-            tensor<int32, [2]> var_536 = const()[name = string("op_536"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_538 = const()[name = string("op_538"), val = tensor<int32, [2]>([1, 1])];
-            string var_540_pad_type_0 = const()[name = string("op_540_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_540_pad_0 = const()[name = string("op_540_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_540_cast_fp16 = conv(dilations = var_538, groups = var_493, pad = var_540_pad_0, pad_type = var_540_pad_type_0, strides = var_536, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_540_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_33_cast_fp16 = mul(x = x_normed_27_cast_fp16, y = blocks_2_norm_1_weight_to_fp16)[name = string("x_33_cast_fp16")];
+            tensor<int32, [2]> var_549 = const()[name = string("op_549"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_551 = const()[name = string("op_551"), val = tensor<int32, [2]>([1, 1])];
+            string var_553_pad_type_0 = const()[name = string("op_553_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_553_pad_0 = const()[name = string("op_553_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_553_cast_fp16 = conv(dilations = var_551, groups = var_505, pad = var_553_pad_0, pad_type = var_553_pad_type_0, strides = var_549, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_553_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303780096)))];
-            tensor<fp16, [1, 4096, 1, 1]> q_13_cast_fp16 = mul(x = var_540_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = string("q_13_cast_fp16")];
-            tensor<int32, [2]> var_544 = const()[name = string("op_544"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_546 = const()[name = string("op_546"), val = tensor<int32, [2]>([1, 1])];
-            string var_548_pad_type_0 = const()[name = string("op_548_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_548_pad_0 = const()[name = string("op_548_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_548_cast_fp16 = conv(dilations = var_546, groups = var_493, pad = var_548_pad_0, pad_type = var_548_pad_type_0, strides = var_544, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_548_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> q_13_cast_fp16 = mul(x = var_553_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = string("q_13_cast_fp16")];
+            tensor<int32, [2]> var_557 = const()[name = string("op_557"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_559 = const()[name = string("op_559"), val = tensor<int32, [2]>([1, 1])];
+            string var_561_pad_type_0 = const()[name = string("op_561_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_561_pad_0 = const()[name = string("op_561_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_561_cast_fp16 = conv(dilations = var_559, groups = var_505, pad = var_561_pad_0, pad_type = var_561_pad_type_0, strides = var_557, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_561_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303788352)))];
-            tensor<fp16, [1, 4096, 1, 1]> k_17_cast_fp16 = mul(x = var_548_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = string("k_17_cast_fp16")];
-            tensor<int32, [2]> var_552 = const()[name = string("op_552"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_554 = const()[name = string("op_554"), val = tensor<int32, [2]>([1, 1])];
-            string var_556_pad_type_0 = const()[name = string("op_556_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_556_pad_0 = const()[name = string("op_556_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_556_cast_fp16 = conv(dilations = var_554, groups = var_493, pad = var_556_pad_0, pad_type = var_556_pad_type_0, strides = var_552, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_556_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> k_17_cast_fp16 = mul(x = var_561_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = string("k_17_cast_fp16")];
+            tensor<int32, [2]> var_565 = const()[name = string("op_565"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_567 = const()[name = string("op_567"), val = tensor<int32, [2]>([1, 1])];
+            string var_569_pad_type_0 = const()[name = string("op_569_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_569_pad_0 = const()[name = string("op_569_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_569_cast_fp16 = conv(dilations = var_567, groups = var_505, pad = var_569_pad_0, pad_type = var_569_pad_type_0, strides = var_565, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_569_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303796608)))];
-            tensor<fp16, [1, 4096, 1, 1]> v_13_cast_fp16 = mul(x = var_556_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = string("v_13_cast_fp16")];
-            tensor<int32, [4]> var_558 = const()[name = string("op_558"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> q_15_cast_fp16 = reshape(shape = var_558, x = q_13_cast_fp16)[name = string("q_15_cast_fp16")];
-            tensor<int32, [4]> var_560 = const()[name = string("op_560"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> k_19_cast_fp16 = reshape(shape = var_560, x = k_17_cast_fp16)[name = string("k_19_cast_fp16")];
-            tensor<int32, [4]> var_562 = const()[name = string("op_562"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> v_15_cast_fp16 = reshape(shape = var_562, x = v_13_cast_fp16)[name = string("v_15_cast_fp16")];
-            tensor<int32, [4]> var_574_begin_0 = const()[name = string("op_574_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_574_end_0 = const()[name = string("op_574_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_574_end_mask_0 = const()[name = string("op_574_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_574_cast_fp16 = slice_by_index(begin = var_574_begin_0, end = var_574_end_0, end_mask = var_574_end_mask_0, x = q_15_cast_fp16)[name = string("op_574_cast_fp16")];
-            tensor<int32, [4]> var_580_begin_0 = const()[name = string("op_580_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_580_end_0 = const()[name = string("op_580_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_580_end_mask_0 = const()[name = string("op_580_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_580_cast_fp16 = slice_by_index(begin = var_580_begin_0, end = var_580_end_0, end_mask = var_580_end_mask_0, x = q_15_cast_fp16)[name = string("op_580_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> v_17_cast_fp16 = mul(x = var_569_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = string("v_17_cast_fp16")];
+            tensor<int32, [4]> var_571 = const()[name = string("op_571"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> q_15_cast_fp16 = reshape(shape = var_571, x = q_13_cast_fp16)[name = string("q_15_cast_fp16")];
+            tensor<int32, [4]> var_573 = const()[name = string("op_573"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> k_19_cast_fp16 = reshape(shape = var_573, x = k_17_cast_fp16)[name = string("k_19_cast_fp16")];
+            tensor<int32, [4]> var_575 = const()[name = string("op_575"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> v_19_cast_fp16 = reshape(shape = var_575, x = v_17_cast_fp16)[name = string("v_19_cast_fp16")];
+            tensor<int32, [4]> var_587_begin_0 = const()[name = string("op_587_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_587_end_0 = const()[name = string("op_587_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_587_end_mask_0 = const()[name = string("op_587_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_587_cast_fp16 = slice_by_index(begin = var_587_begin_0, end = var_587_end_0, end_mask = var_587_end_mask_0, x = q_15_cast_fp16)[name = string("op_587_cast_fp16")];
+            tensor<int32, [4]> var_593_begin_0 = const()[name = string("op_593_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_593_end_0 = const()[name = string("op_593_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_593_end_mask_0 = const()[name = string("op_593_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_593_cast_fp16 = slice_by_index(begin = var_593_begin_0, end = var_593_end_0, end_mask = var_593_end_mask_0, x = q_15_cast_fp16)[name = string("op_593_cast_fp16")];
             fp16 const_32_promoted_to_fp16 = const()[name = string("const_32_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_582_cast_fp16 = mul(x = var_580_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_582_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_595_cast_fp16 = mul(x = var_593_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_595_cast_fp16")];
             bool rotated_9_interleave_0 = const()[name = string("rotated_9_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_9_cast_fp16 = concat(axis = var_496, interleave = rotated_9_interleave_0, values = (var_582_cast_fp16, var_574_cast_fp16))[name = string("rotated_9_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_585_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = string("op_585_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_586_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = string("op_586_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_9_cast_fp16 = add(x = var_585_cast_fp16, y = var_586_cast_fp16)[name = string("roped_9_cast_fp16")];
-            tensor<int32, [4]> var_599_begin_0 = const()[name = string("op_599_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_599_end_0 = const()[name = string("op_599_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_599_end_mask_0 = const()[name = string("op_599_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_599_cast_fp16 = slice_by_index(begin = var_599_begin_0, end = var_599_end_0, end_mask = var_599_end_mask_0, x = k_19_cast_fp16)[name = string("op_599_cast_fp16")];
-            tensor<int32, [4]> var_605_begin_0 = const()[name = string("op_605_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_605_end_0 = const()[name = string("op_605_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_605_end_mask_0 = const()[name = string("op_605_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_605_cast_fp16 = slice_by_index(begin = var_605_begin_0, end = var_605_end_0, end_mask = var_605_end_mask_0, x = k_19_cast_fp16)[name = string("op_605_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_9_cast_fp16 = concat(axis = var_508, interleave = rotated_9_interleave_0, values = (var_595_cast_fp16, var_587_cast_fp16))[name = string("rotated_9_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_598_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = string("op_598_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_599_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = string("op_599_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_9_cast_fp16 = add(x = var_598_cast_fp16, y = var_599_cast_fp16)[name = string("roped_9_cast_fp16")];
+            tensor<int32, [4]> var_612_begin_0 = const()[name = string("op_612_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_612_end_0 = const()[name = string("op_612_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_612_end_mask_0 = const()[name = string("op_612_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_612_cast_fp16 = slice_by_index(begin = var_612_begin_0, end = var_612_end_0, end_mask = var_612_end_mask_0, x = k_19_cast_fp16)[name = string("op_612_cast_fp16")];
+            tensor<int32, [4]> var_618_begin_0 = const()[name = string("op_618_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_618_end_0 = const()[name = string("op_618_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_618_end_mask_0 = const()[name = string("op_618_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_618_cast_fp16 = slice_by_index(begin = var_618_begin_0, end = var_618_end_0, end_mask = var_618_end_mask_0, x = k_19_cast_fp16)[name = string("op_618_cast_fp16")];
             fp16 const_34_promoted_to_fp16 = const()[name = string("const_34_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_607_cast_fp16 = mul(x = var_605_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_607_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_620_cast_fp16 = mul(x = var_618_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_620_cast_fp16")];
             bool rotated_interleave_0 = const()[name = string("rotated_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_cast_fp16 = concat(axis = var_496, interleave = rotated_interleave_0, values = (var_607_cast_fp16, var_599_cast_fp16))[name = string("rotated_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_610_cast_fp16 = mul(x = k_19_cast_fp16, y = cos)[name = string("op_610_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_611_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_611_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_cast_fp16 = add(x = var_610_cast_fp16, y = var_611_cast_fp16)[name = string("roped_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_cast_fp16 = concat(axis = var_508, interleave = rotated_interleave_0, values = (var_620_cast_fp16, var_612_cast_fp16))[name = string("rotated_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_623_cast_fp16 = mul(x = k_19_cast_fp16, y = cos)[name = string("op_623_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_624_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_624_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_cast_fp16 = add(x = var_623_cast_fp16, y = var_624_cast_fp16)[name = string("roped_cast_fp16")];
+            tensor<int32, [4]> v_21_perm_0 = const()[name = string("v_21_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             bool k_interleave_0 = const()[name = string("k_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_cast_fp16 = concat(axis = var_484, interleave = k_interleave_0, values = (k_cache_2, roped_cast_fp16))[name = string("k_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> k_cast_fp16 = concat(axis = var_496, interleave = k_interleave_0, values = (k_cache_2, roped_cast_fp16))[name = string("k_cast_fp16")];
             bool v_interleave_0 = const()[name = string("v_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> v_cast_fp16 = concat(axis = var_484, interleave = v_interleave_0, values = (v_cache_2, v_15_cast_fp16))[name = string("v_cast_fp16")];
-            tensor<int32, [4]> var_618_begin_0 = const()[name = string("op_618_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_618_end_0 = const()[name = string("op_618_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_618_end_mask_0 = const()[name = string("op_618_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_2 = slice_by_index(begin = var_618_begin_0, end = var_618_end_0, end_mask = var_618_end_mask_0, x = k_cast_fp16)[name = string("op_618_cast_fp16")];
-            tensor<int32, [4]> var_619_begin_0 = const()[name = string("op_619_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_619_end_0 = const()[name = string("op_619_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_619_end_mask_0 = const()[name = string("op_619_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_2 = slice_by_index(begin = var_619_begin_0, end = var_619_end_0, end_mask = var_619_end_mask_0, x = v_cast_fp16)[name = string("op_619_cast_fp16")];
-            fp16 var_623_to_fp16 = const()[name = string("op_623_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 1]> var_624_cast_fp16 = mul(x = roped_9_cast_fp16, y = var_623_to_fp16)[name = string("op_624_cast_fp16")];
-            bool attn_weights_9_transpose_x_0 = const()[name = string("attn_weights_9_transpose_x_0"), val = bool(true)];
-            bool attn_weights_9_transpose_y_0 = const()[name = string("attn_weights_9_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_9_cast_fp16 = matmul(transpose_x = attn_weights_9_transpose_x_0, transpose_y = attn_weights_9_transpose_y_0, x = var_624_cast_fp16, y = k_cast_fp16)[name = string("attn_weights_9_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_cast_fp16 = add(x = attn_weights_9_cast_fp16, y = mask)[name = string("attn_weights_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> var_632_cast_fp16 = softmax(axis = var_492, x = attn_weights_cast_fp16)[name = string("op_632_cast_fp16")];
-            bool attn_9_transpose_x_0 = const()[name = string("attn_9_transpose_x_0"), val = bool(false)];
-            bool attn_9_transpose_y_0 = const()[name = string("attn_9_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 1]> attn_9_cast_fp16 = matmul(transpose_x = attn_9_transpose_x_0, transpose_y = attn_9_transpose_y_0, x = v_cast_fp16, y = var_632_cast_fp16)[name = string("attn_9_cast_fp16")];
-            tensor<int32, [4]> var_636 = const()[name = string("op_636"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
-            tensor<fp16, [1, 4096, 1, 1]> input_17_cast_fp16 = reshape(shape = var_636, x = attn_9_cast_fp16)[name = string("input_17_cast_fp16")];
-            tensor<int32, [2]> var_640 = const()[name = string("op_640"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_642 = const()[name = string("op_642"), val = tensor<int32, [2]>([1, 1])];
-            string var_644_pad_type_0 = const()[name = string("op_644_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_644_pad_0 = const()[name = string("op_644_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_644_cast_fp16 = conv(dilations = var_642, groups = var_493, pad = var_644_pad_0, pad_type = var_644_pad_type_0, strides = var_640, weight = blocks_2_attn_proj_weight_palettized_cast_fp16, x = input_17_cast_fp16)[name = string("op_644_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 128]> v_21_cast_fp16 = transpose(perm = v_21_perm_0, x = v_19_cast_fp16)[name = string("transpose_1")];
+            tensor<fp16, [1, 32, 512, 128]> v_cast_fp16 = concat(axis = var_508, interleave = v_interleave_0, values = (v_cache_2, v_21_cast_fp16))[name = string("v_cast_fp16")];
+            tensor<int32, [4]> var_635_begin_0 = const()[name = string("op_635_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_635_end_0 = const()[name = string("op_635_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_635_end_mask_0 = const()[name = string("op_635_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_2 = slice_by_index(begin = var_635_begin_0, end = var_635_end_0, end_mask = var_635_end_mask_0, x = k_cast_fp16)[name = string("op_635_cast_fp16")];
+            tensor<int32, [4]> var_636_begin_0 = const()[name = string("op_636_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_636_end_0 = const()[name = string("op_636_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_636_end_mask_0 = const()[name = string("op_636_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_2 = slice_by_index(begin = var_636_begin_0, end = var_636_end_0, end_mask = var_636_end_mask_0, x = v_cast_fp16)[name = string("op_636_cast_fp16")];
+            fp16 var_641_to_fp16 = const()[name = string("op_641_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 32, 128, 4]> var_642_cast_fp16 = mul(x = roped_9_cast_fp16, y = var_641_to_fp16)[name = string("op_642_cast_fp16")];
+            bool attn_weights_13_transpose_x_0 = const()[name = string("attn_weights_13_transpose_x_0"), val = bool(true)];
+            bool attn_weights_13_transpose_y_0 = const()[name = string("attn_weights_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_13_cast_fp16 = matmul(transpose_x = attn_weights_13_transpose_x_0, transpose_y = attn_weights_13_transpose_y_0, x = var_642_cast_fp16, y = k_cast_fp16)[name = string("attn_weights_13_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_15_cast_fp16 = add(x = attn_weights_13_cast_fp16, y = mask)[name = string("attn_weights_15_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_cast_fp16 = softmax(axis = var_504, x = attn_weights_15_cast_fp16)[name = string("attn_weights_cast_fp16")];
+            bool var_651_transpose_x_0 = const()[name = string("op_651_transpose_x_0"), val = bool(false)];
+            bool var_651_transpose_y_0 = const()[name = string("op_651_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> var_651_cast_fp16 = matmul(transpose_x = var_651_transpose_x_0, transpose_y = var_651_transpose_y_0, x = attn_weights_cast_fp16, y = v_cast_fp16)[name = string("op_651_cast_fp16")];
+            tensor<int32, [4]> attn_5_perm_0 = const()[name = string("attn_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_654 = const()[name = string("op_654"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 4]> attn_5_cast_fp16 = transpose(perm = attn_5_perm_0, x = var_651_cast_fp16)[name = string("transpose_0")];
+            tensor<fp16, [1, 4096, 1, 4]> input_17_cast_fp16 = reshape(shape = var_654, x = attn_5_cast_fp16)[name = string("input_17_cast_fp16")];
+            tensor<int32, [2]> var_658 = const()[name = string("op_658"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_660 = const()[name = string("op_660"), val = tensor<int32, [2]>([1, 1])];
+            string var_662_pad_type_0 = const()[name = string("op_662_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_662_pad_0 = const()[name = string("op_662_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_662_cast_fp16 = conv(dilations = var_660, groups = var_505, pad = var_662_pad_0, pad_type = var_662_pad_type_0, strides = var_658, weight = blocks_2_attn_proj_weight_palettized_cast_fp16, x = input_17_cast_fp16)[name = string("op_662_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303804864)))];
-            tensor<fp16, [1, 4096, 1, 1]> attention_output_cast_fp16 = mul(x = var_644_cast_fp16, y = blocks_2_attn_proj_output_scales_to_fp16)[name = string("attention_output_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_39_cast_fp16 = add(x = attention_output_cast_fp16, y = x_29_cast_fp16)[name = string("x_39_cast_fp16")];
-            tensor<int32, [1]> var_663_axes_0 = const()[name = string("op_663_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_663_cast_fp16 = squeeze(axes = var_663_axes_0, x = x_39_cast_fp16)[name = string("op_663_cast_fp16")];
-            bool var_665_interleave_0 = const()[name = string("op_665_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_to_fp16 = const()[name = string("eps_chan_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_665_cast_fp16 = concat(axis = var_493, interleave = var_665_interleave_0, values = (var_663_cast_fp16, eps_chan_to_fp16))[name = string("op_665_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> attention_output_cast_fp16 = mul(x = var_662_cast_fp16, y = blocks_2_attn_proj_output_scales_to_fp16)[name = string("attention_output_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_39_cast_fp16 = add(x = attention_output_cast_fp16, y = x_29_cast_fp16)[name = string("x_39_cast_fp16")];
+            tensor<int32, [1]> var_681_axes_0 = const()[name = string("op_681_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_681_cast_fp16 = squeeze(axes = var_681_axes_0, x = x_39_cast_fp16)[name = string("op_681_cast_fp16")];
+            bool var_683_interleave_0 = const()[name = string("op_683_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_to_fp16 = const()[name = string("eps_chan_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_683_cast_fp16 = concat(axis = var_505, interleave = var_683_interleave_0, values = (var_681_cast_fp16, eps_chan_to_fp16))[name = string("op_683_cast_fp16")];
             tensor<int32, [1]> x_eps_axes_0 = const()[name = string("x_eps_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_cast_fp16 = expand_dims(axes = x_eps_axes_0, x = var_665_cast_fp16)[name = string("x_eps_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_cast_fp16 = expand_dims(axes = x_eps_axes_0, x = var_683_cast_fp16)[name = string("x_eps_cast_fp16")];
             tensor<int32, [1]> norm_x_axes_0 = const()[name = string("norm_x_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_cast_fp16 = reduce_l2_norm(axes = norm_x_axes_0, keep_dims = var_497, x = x_eps_cast_fp16)[name = string("norm_x_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_31_cast_fp16 = real_div(x = x_39_cast_fp16, y = norm_x_cast_fp16)[name = string("x_normed_31_cast_fp16")];
-            fp16 var_670_to_fp16 = const()[name = string("op_670_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_33_cast_fp16 = mul(x = x_normed_31_cast_fp16, y = var_670_to_fp16)[name = string("x_normed_33_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_cast_fp16 = reduce_l2_norm(axes = norm_x_axes_0, keep_dims = var_509, x = x_eps_cast_fp16)[name = string("norm_x_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_31_cast_fp16 = real_div(x = x_39_cast_fp16, y = norm_x_cast_fp16)[name = string("x_normed_31_cast_fp16")];
+            fp16 var_688_to_fp16 = const()[name = string("op_688_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_33_cast_fp16 = mul(x = x_normed_31_cast_fp16, y = var_688_to_fp16)[name = string("x_normed_33_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_norm_2_weight_to_fp16 = const()[name = string("blocks_2_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303813120)))];
-            tensor<fp16, [1, 4096, 1, 1]> input_19_cast_fp16 = mul(x = x_normed_33_cast_fp16, y = blocks_2_norm_2_weight_to_fp16)[name = string("input_19_cast_fp16")];
-            tensor<int32, [2]> var_682 = const()[name = string("op_682"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_684 = const()[name = string("op_684"), val = tensor<int32, [2]>([1, 1])];
-            string var_686_pad_type_0 = const()[name = string("op_686_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_686_pad_0 = const()[name = string("op_686_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_686_cast_fp16 = conv(dilations = var_684, groups = var_493, pad = var_686_pad_0, pad_type = var_686_pad_type_0, strides = var_682, weight = blocks_2_mlp_fc_1_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_686_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303821376)))];
-            tensor<fp16, [1, 11008, 1, 1]> input_21_cast_fp16 = mul(x = var_686_cast_fp16, y = blocks_2_mlp_fc_1_output_scales_to_fp16)[name = string("input_21_cast_fp16")];
-            tensor<int32, [2]> var_690 = const()[name = string("op_690"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_692 = const()[name = string("op_692"), val = tensor<int32, [2]>([1, 1])];
-            string var_694_pad_type_0 = const()[name = string("op_694_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_694_pad_0 = const()[name = string("op_694_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_694_cast_fp16 = conv(dilations = var_692, groups = var_493, pad = var_694_pad_0, pad_type = var_694_pad_type_0, strides = var_690, weight = blocks_2_mlp_fc_2_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_694_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303843456)))];
-            tensor<fp16, [1, 11008, 1, 1]> x_fc_2_cast_fp16 = mul(x = var_694_cast_fp16, y = blocks_2_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> var_696_cast_fp16 = silu(x = input_21_cast_fp16)[name = string("op_696_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> input_cast_fp16 = mul(x = var_696_cast_fp16, y = x_fc_2_cast_fp16)[name = string("input_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> input_19_cast_fp16 = mul(x = x_normed_33_cast_fp16, y = blocks_2_norm_2_weight_to_fp16)[name = string("input_19_cast_fp16")];
             tensor<int32, [2]> var_700 = const()[name = string("op_700"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_702 = const()[name = string("op_702"), val = tensor<int32, [2]>([1, 1])];
             string var_704_pad_type_0 = const()[name = string("op_704_pad_type_0"), val = string("custom")];
             tensor<int32, [4]> var_704_pad_0 = const()[name = string("op_704_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_704_cast_fp16 = conv(dilations = var_702, groups = var_493, pad = var_704_pad_0, pad_type = var_704_pad_type_0, strides = var_700, weight = blocks_2_mlp_proj_weight_palettized_cast_fp16, x = input_cast_fp16)[name = string("op_704_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> var_704_cast_fp16 = conv(dilations = var_702, groups = var_505, pad = var_704_pad_0, pad_type = var_704_pad_type_0, strides = var_700, weight = blocks_2_mlp_fc_1_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_704_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303821376)))];
+            tensor<fp16, [1, 11008, 1, 4]> input_21_cast_fp16 = mul(x = var_704_cast_fp16, y = blocks_2_mlp_fc_1_output_scales_to_fp16)[name = string("input_21_cast_fp16")];
+            tensor<int32, [2]> var_708 = const()[name = string("op_708"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_710 = const()[name = string("op_710"), val = tensor<int32, [2]>([1, 1])];
+            string var_712_pad_type_0 = const()[name = string("op_712_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_712_pad_0 = const()[name = string("op_712_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_712_cast_fp16 = conv(dilations = var_710, groups = var_505, pad = var_712_pad_0, pad_type = var_712_pad_type_0, strides = var_708, weight = blocks_2_mlp_fc_2_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_712_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303843456)))];
+            tensor<fp16, [1, 11008, 1, 4]> x_fc_2_cast_fp16 = mul(x = var_712_cast_fp16, y = blocks_2_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> var_714_cast_fp16 = silu(x = input_21_cast_fp16)[name = string("op_714_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> input_cast_fp16 = mul(x = var_714_cast_fp16, y = x_fc_2_cast_fp16)[name = string("input_cast_fp16")];
+            tensor<int32, [2]> var_718 = const()[name = string("op_718"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_720 = const()[name = string("op_720"), val = tensor<int32, [2]>([1, 1])];
+            string var_722_pad_type_0 = const()[name = string("op_722_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_722_pad_0 = const()[name = string("op_722_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_722_cast_fp16 = conv(dilations = var_720, groups = var_505, pad = var_722_pad_0, pad_type = var_722_pad_type_0, strides = var_718, weight = blocks_2_mlp_proj_weight_palettized_cast_fp16, x = input_cast_fp16)[name = string("op_722_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303865536)))];
-            tensor<fp16, [1, 4096, 1, 1]> var_705_cast_fp16 = mul(x = var_704_cast_fp16, y = blocks_2_mlp_proj_output_scales_to_fp16)[name = string("op_705_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> new_x = add(x = var_705_cast_fp16, y = x_39_cast_fp16)[name = string("op_706_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> var_723_cast_fp16 = mul(x = var_722_cast_fp16, y = blocks_2_mlp_proj_output_scales_to_fp16)[name = string("op_723_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> new_x = add(x = var_723_cast_fp16, y = x_39_cast_fp16)[name = string("op_724_cast_fp16")];
         } -> (new_x, new_k_cache_0, new_k_cache_1, new_k_cache_2, new_v_cache_0, new_v_cache_1, new_v_cache_2);
     func input_512_context_512<ios18>(tensor<fp16, [128, 512]> cos, tensor<fp16, [1, 1, 512, 512]> mask, tensor<fp16, [128, 512]> sin, tensor<fp16, [1, 4096, 1, 512]> x) {
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_q_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8388736))))[name = string("blocks_0_attn_q_proj_weight_palettized_cast_fp16")];
@@ -502,86 +514,86 @@ program(1.3)
             tensor<fp16, [1, 4096, 1, 512]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_54_to_fp16)[name = string("x_normed_3_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_1_weight_to_fp16 = const()[name = string("blocks_0_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303567936)))];
             tensor<fp16, [1, 4096, 1, 512]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = string("x_5_cast_fp16")];
-            tensor<int32, [2]> var_66 = const()[name = string("op_66"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_68 = const()[name = string("op_68"), val = tensor<int32, [2]>([1, 1])];
-            string var_70_pad_type_0 = const()[name = string("op_70_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_70_pad_0 = const()[name = string("op_70_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_70_cast_fp16 = conv(dilations = var_68, groups = var_25, pad = var_70_pad_0, pad_type = var_70_pad_type_0, strides = var_66, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_70_cast_fp16")];
+            tensor<int32, [2]> var_67 = const()[name = string("op_67"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_69 = const()[name = string("op_69"), val = tensor<int32, [2]>([1, 1])];
+            string var_71_pad_type_0 = const()[name = string("op_71_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_71_pad_0 = const()[name = string("op_71_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_71_cast_fp16 = conv(dilations = var_69, groups = var_25, pad = var_71_pad_0, pad_type = var_71_pad_type_0, strides = var_67, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_71_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303576192)))];
-            tensor<fp16, [1, 4096, 1, 512]> q_1_cast_fp16 = mul(x = var_70_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
-            tensor<int32, [2]> var_74 = const()[name = string("op_74"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_76 = const()[name = string("op_76"), val = tensor<int32, [2]>([1, 1])];
-            string var_78_pad_type_0 = const()[name = string("op_78_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_78_pad_0 = const()[name = string("op_78_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_78_cast_fp16 = conv(dilations = var_76, groups = var_25, pad = var_78_pad_0, pad_type = var_78_pad_type_0, strides = var_74, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_78_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> q_1_cast_fp16 = mul(x = var_71_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
+            tensor<int32, [2]> var_75 = const()[name = string("op_75"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_77 = const()[name = string("op_77"), val = tensor<int32, [2]>([1, 1])];
+            string var_79_pad_type_0 = const()[name = string("op_79_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_79_pad_0 = const()[name = string("op_79_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_79_cast_fp16 = conv(dilations = var_77, groups = var_25, pad = var_79_pad_0, pad_type = var_79_pad_type_0, strides = var_75, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_79_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303584448)))];
-            tensor<fp16, [1, 4096, 1, 512]> k_1_cast_fp16 = mul(x = var_78_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
-            tensor<int32, [2]> var_82 = const()[name = string("op_82"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_84 = const()[name = string("op_84"), val = tensor<int32, [2]>([1, 1])];
-            string var_86_pad_type_0 = const()[name = string("op_86_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_86_pad_0 = const()[name = string("op_86_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_86_cast_fp16 = conv(dilations = var_84, groups = var_25, pad = var_86_pad_0, pad_type = var_86_pad_type_0, strides = var_82, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_86_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> k_1_cast_fp16 = mul(x = var_79_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
+            tensor<int32, [2]> var_83 = const()[name = string("op_83"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_85 = const()[name = string("op_85"), val = tensor<int32, [2]>([1, 1])];
+            string var_87_pad_type_0 = const()[name = string("op_87_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_87_pad_0 = const()[name = string("op_87_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_87_cast_fp16 = conv(dilations = var_85, groups = var_25, pad = var_87_pad_0, pad_type = var_87_pad_type_0, strides = var_83, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_87_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303592704)))];
-            tensor<fp16, [1, 4096, 1, 512]> v_1_cast_fp16 = mul(x = var_86_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
-            tensor<int32, [4]> var_88 = const()[name = string("op_88"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> q_3_cast_fp16 = reshape(shape = var_88, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
-            tensor<int32, [4]> var_90 = const()[name = string("op_90"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> k_3_cast_fp16 = reshape(shape = var_90, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
-            tensor<int32, [4]> var_92 = const()[name = string("op_92"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> v_3_cast_fp16 = reshape(shape = var_92, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
-            tensor<int32, [4]> var_104_begin_0 = const()[name = string("op_104_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_104_end_0 = const()[name = string("op_104_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_104_end_mask_0 = const()[name = string("op_104_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_104_cast_fp16 = slice_by_index(begin = var_104_begin_0, end = var_104_end_0, end_mask = var_104_end_mask_0, x = q_3_cast_fp16)[name = string("op_104_cast_fp16")];
-            tensor<int32, [4]> var_110_begin_0 = const()[name = string("op_110_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_110_end_0 = const()[name = string("op_110_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_110_end_mask_0 = const()[name = string("op_110_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_110_cast_fp16 = slice_by_index(begin = var_110_begin_0, end = var_110_end_0, end_mask = var_110_end_mask_0, x = q_3_cast_fp16)[name = string("op_110_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> v_1_cast_fp16 = mul(x = var_87_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
+            tensor<int32, [4]> var_89 = const()[name = string("op_89"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> q_3_cast_fp16 = reshape(shape = var_89, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
+            tensor<int32, [4]> var_91 = const()[name = string("op_91"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> k_3_cast_fp16 = reshape(shape = var_91, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
+            tensor<int32, [4]> var_93 = const()[name = string("op_93"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> v_3_cast_fp16 = reshape(shape = var_93, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
+            tensor<int32, [4]> var_105_begin_0 = const()[name = string("op_105_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_105_end_0 = const()[name = string("op_105_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_105_end_mask_0 = const()[name = string("op_105_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_105_cast_fp16 = slice_by_index(begin = var_105_begin_0, end = var_105_end_0, end_mask = var_105_end_mask_0, x = q_3_cast_fp16)[name = string("op_105_cast_fp16")];
+            tensor<int32, [4]> var_111_begin_0 = const()[name = string("op_111_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_111_end_0 = const()[name = string("op_111_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_111_end_mask_0 = const()[name = string("op_111_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_111_cast_fp16 = slice_by_index(begin = var_111_begin_0, end = var_111_end_0, end_mask = var_111_end_mask_0, x = q_3_cast_fp16)[name = string("op_111_cast_fp16")];
             fp16 const_6_promoted_to_fp16 = const()[name = string("const_6_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_112_cast_fp16 = mul(x = var_110_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_112_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_113_cast_fp16 = mul(x = var_111_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_113_cast_fp16")];
             bool rotated_1_interleave_0 = const()[name = string("rotated_1_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_1_cast_fp16 = concat(axis = var_28, interleave = rotated_1_interleave_0, values = (var_112_cast_fp16, var_104_cast_fp16))[name = string("rotated_1_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_115_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_115_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_116_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_116_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_1_cast_fp16 = add(x = var_115_cast_fp16, y = var_116_cast_fp16)[name = string("roped_1_cast_fp16")];
-            tensor<int32, [4]> var_129_begin_0 = const()[name = string("op_129_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_129_end_0 = const()[name = string("op_129_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_129_end_mask_0 = const()[name = string("op_129_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_129_cast_fp16 = slice_by_index(begin = var_129_begin_0, end = var_129_end_0, end_mask = var_129_end_mask_0, x = k_3_cast_fp16)[name = string("op_129_cast_fp16")];
-            tensor<int32, [4]> var_135_begin_0 = const()[name = string("op_135_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_135_end_0 = const()[name = string("op_135_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_135_end_mask_0 = const()[name = string("op_135_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_135_cast_fp16 = slice_by_index(begin = var_135_begin_0, end = var_135_end_0, end_mask = var_135_end_mask_0, x = k_3_cast_fp16)[name = string("op_135_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_1_cast_fp16 = concat(axis = var_28, interleave = rotated_1_interleave_0, values = (var_113_cast_fp16, var_105_cast_fp16))[name = string("rotated_1_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_116_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_116_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_117_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_117_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_1_cast_fp16 = add(x = var_116_cast_fp16, y = var_117_cast_fp16)[name = string("roped_1_cast_fp16")];
+            tensor<int32, [4]> var_130_begin_0 = const()[name = string("op_130_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_130_end_0 = const()[name = string("op_130_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_130_end_mask_0 = const()[name = string("op_130_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_130_cast_fp16 = slice_by_index(begin = var_130_begin_0, end = var_130_end_0, end_mask = var_130_end_mask_0, x = k_3_cast_fp16)[name = string("op_130_cast_fp16")];
+            tensor<int32, [4]> var_136_begin_0 = const()[name = string("op_136_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_136_end_0 = const()[name = string("op_136_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_136_end_mask_0 = const()[name = string("op_136_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_136_cast_fp16 = slice_by_index(begin = var_136_begin_0, end = var_136_end_0, end_mask = var_136_end_mask_0, x = k_3_cast_fp16)[name = string("op_136_cast_fp16")];
             fp16 const_8_promoted_to_fp16 = const()[name = string("const_8_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_137_cast_fp16 = mul(x = var_135_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_137_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_138_cast_fp16 = mul(x = var_136_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_138_cast_fp16")];
             bool rotated_3_interleave_0 = const()[name = string("rotated_3_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_3_cast_fp16 = concat(axis = var_28, interleave = rotated_3_interleave_0, values = (var_137_cast_fp16, var_129_cast_fp16))[name = string("rotated_3_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_140_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_140_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_141_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_141_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_3_cast_fp16 = add(x = var_140_cast_fp16, y = var_141_cast_fp16)[name = string("roped_3_cast_fp16")];
-            bool q_5_interleave_0 = const()[name = string("q_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> q_5_cast_fp16 = concat(axis = var_28, interleave = q_5_interleave_0, values = roped_1_cast_fp16)[name = string("q_5_cast_fp16")];
-            bool k_5_interleave_0 = const()[name = string("k_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_5_cast_fp16 = concat(axis = var_28, interleave = k_5_interleave_0, values = roped_3_cast_fp16)[name = string("k_5_cast_fp16")];
-            tensor<int32, [4]> var_156_begin_0 = const()[name = string("op_156_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_156_end_0 = const()[name = string("op_156_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_156_end_mask_0 = const()[name = string("op_156_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_0 = slice_by_index(begin = var_156_begin_0, end = var_156_end_0, end_mask = var_156_end_mask_0, x = k_5_cast_fp16)[name = string("op_156_cast_fp16")];
-            tensor<int32, [4]> var_157_begin_0 = const()[name = string("op_157_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_157_end_0 = const()[name = string("op_157_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_157_end_mask_0 = const()[name = string("op_157_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_0 = slice_by_index(begin = var_157_begin_0, end = var_157_end_0, end_mask = var_157_end_mask_0, x = v_3_cast_fp16)[name = string("op_157_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_3_cast_fp16 = concat(axis = var_28, interleave = rotated_3_interleave_0, values = (var_138_cast_fp16, var_130_cast_fp16))[name = string("rotated_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_141_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_141_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_142_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_142_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_3_cast_fp16 = add(x = var_141_cast_fp16, y = var_142_cast_fp16)[name = string("roped_3_cast_fp16")];
+            tensor<int32, [4]> v_5_perm_0 = const()[name = string("v_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_146_begin_0 = const()[name = string("op_146_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_146_end_0 = const()[name = string("op_146_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_146_end_mask_0 = const()[name = string("op_146_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_0 = slice_by_index(begin = var_146_begin_0, end = var_146_end_0, end_mask = var_146_end_mask_0, x = roped_3_cast_fp16)[name = string("op_146_cast_fp16")];
+            tensor<int32, [4]> var_147_begin_0 = const()[name = string("op_147_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_147_end_0 = const()[name = string("op_147_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_147_end_mask_0 = const()[name = string("op_147_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 512, 128]> v_5_cast_fp16 = transpose(perm = v_5_perm_0, x = v_3_cast_fp16)[name = string("transpose_5")];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_0 = slice_by_index(begin = var_147_begin_0, end = var_147_end_0, end_mask = var_147_end_mask_0, x = v_5_cast_fp16)[name = string("op_147_cast_fp16")];
             fp16 var_161_to_fp16 = const()[name = string("op_161_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 512]> var_162_cast_fp16 = mul(x = q_5_cast_fp16, y = var_161_to_fp16)[name = string("op_162_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_162_cast_fp16 = mul(x = roped_1_cast_fp16, y = var_161_to_fp16)[name = string("op_162_cast_fp16")];
             bool attn_weights_1_transpose_x_0 = const()[name = string("attn_weights_1_transpose_x_0"), val = bool(true)];
             bool attn_weights_1_transpose_y_0 = const()[name = string("attn_weights_1_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_162_cast_fp16, y = k_5_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_162_cast_fp16, y = roped_3_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
             tensor<fp16, [1, 32, 512, 512]> attn_weights_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = mask)[name = string("attn_weights_3_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> var_170_cast_fp16 = softmax(axis = var_24, x = attn_weights_3_cast_fp16)[name = string("op_170_cast_fp16")];
-            bool attn_1_transpose_x_0 = const()[name = string("attn_1_transpose_x_0"), val = bool(false)];
-            bool attn_1_transpose_y_0 = const()[name = string("attn_1_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 512]> attn_1_cast_fp16 = matmul(transpose_x = attn_1_transpose_x_0, transpose_y = attn_1_transpose_y_0, x = v_3_cast_fp16, y = var_170_cast_fp16)[name = string("attn_1_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_5_cast_fp16 = softmax(axis = var_24, x = attn_weights_3_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
+            bool var_171_transpose_x_1 = const()[name = string("op_171_transpose_x_1"), val = bool(false)];
+            bool var_171_transpose_y_1 = const()[name = string("op_171_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 32, 512, 128]> var_171_cast_fp16 = matmul(transpose_x = var_171_transpose_x_1, transpose_y = var_171_transpose_y_1, x = attn_weights_5_cast_fp16, y = v_3_cast_fp16)[name = string("op_171_cast_fp16")];
+            tensor<int32, [4]> attn_1_perm_0 = const()[name = string("attn_1_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             tensor<int32, [4]> var_174 = const()[name = string("op_174"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 512]> attn_1_cast_fp16 = transpose(perm = attn_1_perm_0, x = var_171_cast_fp16)[name = string("transpose_4")];
             tensor<fp16, [1, 4096, 1, 512]> input_1_cast_fp16 = reshape(shape = var_174, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
             tensor<int32, [2]> var_178 = const()[name = string("op_178"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_180 = const()[name = string("op_180"), val = tensor<int32, [2]>([1, 1])];
@@ -645,86 +657,86 @@ program(1.3)
             tensor<fp16, [1, 4096, 1, 512]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_291_to_fp16)[name = string("x_normed_15_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_1_weight_to_fp16 = const()[name = string("blocks_1_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303669888)))];
             tensor<fp16, [1, 4096, 1, 512]> x_19_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = string("x_19_cast_fp16")];
-            tensor<int32, [2]> var_306 = const()[name = string("op_306"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_308 = const()[name = string("op_308"), val = tensor<int32, [2]>([1, 1])];
-            string var_310_pad_type_0 = const()[name = string("op_310_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_310_pad_0 = const()[name = string("op_310_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_310_cast_fp16 = conv(dilations = var_308, groups = var_263, pad = var_310_pad_0, pad_type = var_310_pad_type_0, strides = var_306, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_310_cast_fp16")];
+            tensor<int32, [2]> var_307 = const()[name = string("op_307"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_309 = const()[name = string("op_309"), val = tensor<int32, [2]>([1, 1])];
+            string var_311_pad_type_0 = const()[name = string("op_311_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_311_pad_0 = const()[name = string("op_311_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_311_cast_fp16 = conv(dilations = var_309, groups = var_263, pad = var_311_pad_0, pad_type = var_311_pad_type_0, strides = var_307, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_311_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303678144)))];
-            tensor<fp16, [1, 4096, 1, 512]> q_7_cast_fp16 = mul(x = var_310_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
-            tensor<int32, [2]> var_314 = const()[name = string("op_314"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_316 = const()[name = string("op_316"), val = tensor<int32, [2]>([1, 1])];
-            string var_318_pad_type_0 = const()[name = string("op_318_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_318_pad_0 = const()[name = string("op_318_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_318_cast_fp16 = conv(dilations = var_316, groups = var_263, pad = var_318_pad_0, pad_type = var_318_pad_type_0, strides = var_314, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_318_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> q_7_cast_fp16 = mul(x = var_311_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
+            tensor<int32, [2]> var_315 = const()[name = string("op_315"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_317 = const()[name = string("op_317"), val = tensor<int32, [2]>([1, 1])];
+            string var_319_pad_type_0 = const()[name = string("op_319_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_319_pad_0 = const()[name = string("op_319_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_319_cast_fp16 = conv(dilations = var_317, groups = var_263, pad = var_319_pad_0, pad_type = var_319_pad_type_0, strides = var_315, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_319_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303686400)))];
-            tensor<fp16, [1, 4096, 1, 512]> k_7_cast_fp16 = mul(x = var_318_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_7_cast_fp16")];
-            tensor<int32, [2]> var_322 = const()[name = string("op_322"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_324 = const()[name = string("op_324"), val = tensor<int32, [2]>([1, 1])];
-            string var_326_pad_type_0 = const()[name = string("op_326_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_326_pad_0 = const()[name = string("op_326_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_326_cast_fp16 = conv(dilations = var_324, groups = var_263, pad = var_326_pad_0, pad_type = var_326_pad_type_0, strides = var_322, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_326_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> k_7_cast_fp16 = mul(x = var_319_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_7_cast_fp16")];
+            tensor<int32, [2]> var_323 = const()[name = string("op_323"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_325 = const()[name = string("op_325"), val = tensor<int32, [2]>([1, 1])];
+            string var_327_pad_type_0 = const()[name = string("op_327_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_327_pad_0 = const()[name = string("op_327_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_327_cast_fp16 = conv(dilations = var_325, groups = var_263, pad = var_327_pad_0, pad_type = var_327_pad_type_0, strides = var_323, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_327_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303694656)))];
-            tensor<fp16, [1, 4096, 1, 512]> v_5_cast_fp16 = mul(x = var_326_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_5_cast_fp16")];
-            tensor<int32, [4]> var_328 = const()[name = string("op_328"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> q_9_cast_fp16 = reshape(shape = var_328, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
-            tensor<int32, [4]> var_330 = const()[name = string("op_330"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> k_9_cast_fp16 = reshape(shape = var_330, x = k_7_cast_fp16)[name = string("k_9_cast_fp16")];
-            tensor<int32, [4]> var_332 = const()[name = string("op_332"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> v_7_cast_fp16 = reshape(shape = var_332, x = v_5_cast_fp16)[name = string("v_7_cast_fp16")];
-            tensor<int32, [4]> var_344_begin_0 = const()[name = string("op_344_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_344_end_0 = const()[name = string("op_344_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_344_end_mask_0 = const()[name = string("op_344_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_344_cast_fp16 = slice_by_index(begin = var_344_begin_0, end = var_344_end_0, end_mask = var_344_end_mask_0, x = q_9_cast_fp16)[name = string("op_344_cast_fp16")];
-            tensor<int32, [4]> var_350_begin_0 = const()[name = string("op_350_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_350_end_0 = const()[name = string("op_350_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_350_end_mask_0 = const()[name = string("op_350_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_350_cast_fp16 = slice_by_index(begin = var_350_begin_0, end = var_350_end_0, end_mask = var_350_end_mask_0, x = q_9_cast_fp16)[name = string("op_350_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> v_7_cast_fp16 = mul(x = var_327_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_7_cast_fp16")];
+            tensor<int32, [4]> var_329 = const()[name = string("op_329"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> q_9_cast_fp16 = reshape(shape = var_329, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
+            tensor<int32, [4]> var_331 = const()[name = string("op_331"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> k_9_cast_fp16 = reshape(shape = var_331, x = k_7_cast_fp16)[name = string("k_9_cast_fp16")];
+            tensor<int32, [4]> var_333 = const()[name = string("op_333"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> v_9_cast_fp16 = reshape(shape = var_333, x = v_7_cast_fp16)[name = string("v_9_cast_fp16")];
+            tensor<int32, [4]> var_345_begin_0 = const()[name = string("op_345_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_345_end_0 = const()[name = string("op_345_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_345_end_mask_0 = const()[name = string("op_345_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_345_cast_fp16 = slice_by_index(begin = var_345_begin_0, end = var_345_end_0, end_mask = var_345_end_mask_0, x = q_9_cast_fp16)[name = string("op_345_cast_fp16")];
+            tensor<int32, [4]> var_351_begin_0 = const()[name = string("op_351_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_351_end_0 = const()[name = string("op_351_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_351_end_mask_0 = const()[name = string("op_351_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_351_cast_fp16 = slice_by_index(begin = var_351_begin_0, end = var_351_end_0, end_mask = var_351_end_mask_0, x = q_9_cast_fp16)[name = string("op_351_cast_fp16")];
             fp16 const_19_promoted_to_fp16 = const()[name = string("const_19_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_352_cast_fp16 = mul(x = var_350_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_352_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_353_cast_fp16 = mul(x = var_351_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_353_cast_fp16")];
             bool rotated_5_interleave_0 = const()[name = string("rotated_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_5_cast_fp16 = concat(axis = var_266, interleave = rotated_5_interleave_0, values = (var_352_cast_fp16, var_344_cast_fp16))[name = string("rotated_5_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_355_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_355_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_356_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_356_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_5_cast_fp16 = add(x = var_355_cast_fp16, y = var_356_cast_fp16)[name = string("roped_5_cast_fp16")];
-            tensor<int32, [4]> var_369_begin_0 = const()[name = string("op_369_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_369_end_0 = const()[name = string("op_369_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_369_end_mask_0 = const()[name = string("op_369_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_369_cast_fp16 = slice_by_index(begin = var_369_begin_0, end = var_369_end_0, end_mask = var_369_end_mask_0, x = k_9_cast_fp16)[name = string("op_369_cast_fp16")];
-            tensor<int32, [4]> var_375_begin_0 = const()[name = string("op_375_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_375_end_0 = const()[name = string("op_375_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_375_end_mask_0 = const()[name = string("op_375_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_375_cast_fp16 = slice_by_index(begin = var_375_begin_0, end = var_375_end_0, end_mask = var_375_end_mask_0, x = k_9_cast_fp16)[name = string("op_375_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_5_cast_fp16 = concat(axis = var_266, interleave = rotated_5_interleave_0, values = (var_353_cast_fp16, var_345_cast_fp16))[name = string("rotated_5_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_356_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_356_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_357_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_357_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_5_cast_fp16 = add(x = var_356_cast_fp16, y = var_357_cast_fp16)[name = string("roped_5_cast_fp16")];
+            tensor<int32, [4]> var_370_begin_0 = const()[name = string("op_370_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_370_end_0 = const()[name = string("op_370_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_370_end_mask_0 = const()[name = string("op_370_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_370_cast_fp16 = slice_by_index(begin = var_370_begin_0, end = var_370_end_0, end_mask = var_370_end_mask_0, x = k_9_cast_fp16)[name = string("op_370_cast_fp16")];
+            tensor<int32, [4]> var_376_begin_0 = const()[name = string("op_376_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_376_end_0 = const()[name = string("op_376_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_376_end_mask_0 = const()[name = string("op_376_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_376_cast_fp16 = slice_by_index(begin = var_376_begin_0, end = var_376_end_0, end_mask = var_376_end_mask_0, x = k_9_cast_fp16)[name = string("op_376_cast_fp16")];
             fp16 const_21_promoted_to_fp16 = const()[name = string("const_21_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_377_cast_fp16 = mul(x = var_375_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_377_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_378_cast_fp16 = mul(x = var_376_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_378_cast_fp16")];
             bool rotated_7_interleave_0 = const()[name = string("rotated_7_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_7_cast_fp16 = concat(axis = var_266, interleave = rotated_7_interleave_0, values = (var_377_cast_fp16, var_369_cast_fp16))[name = string("rotated_7_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_380_cast_fp16 = mul(x = k_9_cast_fp16, y = cos)[name = string("op_380_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_381_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = string("op_381_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_7_cast_fp16 = add(x = var_380_cast_fp16, y = var_381_cast_fp16)[name = string("roped_7_cast_fp16")];
-            bool q_11_interleave_0 = const()[name = string("q_11_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> q_11_cast_fp16 = concat(axis = var_266, interleave = q_11_interleave_0, values = roped_5_cast_fp16)[name = string("q_11_cast_fp16")];
-            bool k_11_interleave_0 = const()[name = string("k_11_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_11_cast_fp16 = concat(axis = var_266, interleave = k_11_interleave_0, values = roped_7_cast_fp16)[name = string("k_11_cast_fp16")];
-            tensor<int32, [4]> var_396_begin_0 = const()[name = string("op_396_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_396_end_0 = const()[name = string("op_396_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_396_end_mask_0 = const()[name = string("op_396_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_1 = slice_by_index(begin = var_396_begin_0, end = var_396_end_0, end_mask = var_396_end_mask_0, x = k_11_cast_fp16)[name = string("op_396_cast_fp16")];
-            tensor<int32, [4]> var_397_begin_0 = const()[name = string("op_397_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_397_end_0 = const()[name = string("op_397_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_397_end_mask_0 = const()[name = string("op_397_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_1 = slice_by_index(begin = var_397_begin_0, end = var_397_end_0, end_mask = var_397_end_mask_0, x = v_7_cast_fp16)[name = string("op_397_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_7_cast_fp16 = concat(axis = var_266, interleave = rotated_7_interleave_0, values = (var_378_cast_fp16, var_370_cast_fp16))[name = string("rotated_7_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_381_cast_fp16 = mul(x = k_9_cast_fp16, y = cos)[name = string("op_381_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_382_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = string("op_382_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_7_cast_fp16 = add(x = var_381_cast_fp16, y = var_382_cast_fp16)[name = string("roped_7_cast_fp16")];
+            tensor<int32, [4]> v_11_perm_0 = const()[name = string("v_11_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_386_begin_0 = const()[name = string("op_386_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_386_end_0 = const()[name = string("op_386_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_386_end_mask_0 = const()[name = string("op_386_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_1 = slice_by_index(begin = var_386_begin_0, end = var_386_end_0, end_mask = var_386_end_mask_0, x = roped_7_cast_fp16)[name = string("op_386_cast_fp16")];
+            tensor<int32, [4]> var_387_begin_0 = const()[name = string("op_387_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_387_end_0 = const()[name = string("op_387_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_387_end_mask_0 = const()[name = string("op_387_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 512, 128]> v_11_cast_fp16 = transpose(perm = v_11_perm_0, x = v_9_cast_fp16)[name = string("transpose_3")];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_1 = slice_by_index(begin = var_387_begin_0, end = var_387_end_0, end_mask = var_387_end_mask_0, x = v_11_cast_fp16)[name = string("op_387_cast_fp16")];
             fp16 var_401_to_fp16 = const()[name = string("op_401_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 512]> var_402_cast_fp16 = mul(x = q_11_cast_fp16, y = var_401_to_fp16)[name = string("op_402_cast_fp16")];
-            bool attn_weights_5_transpose_x_0 = const()[name = string("attn_weights_5_transpose_x_0"), val = bool(true)];
-            bool attn_weights_5_transpose_y_0 = const()[name = string("attn_weights_5_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_5_cast_fp16 = matmul(transpose_x = attn_weights_5_transpose_x_0, transpose_y = attn_weights_5_transpose_y_0, x = var_402_cast_fp16, y = k_11_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_7_cast_fp16 = add(x = attn_weights_5_cast_fp16, y = mask)[name = string("attn_weights_7_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> var_410_cast_fp16 = softmax(axis = var_262, x = attn_weights_7_cast_fp16)[name = string("op_410_cast_fp16")];
-            bool attn_3_transpose_x_0 = const()[name = string("attn_3_transpose_x_0"), val = bool(false)];
-            bool attn_3_transpose_y_0 = const()[name = string("attn_3_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 512]> attn_3_cast_fp16 = matmul(transpose_x = attn_3_transpose_x_0, transpose_y = attn_3_transpose_y_0, x = v_7_cast_fp16, y = var_410_cast_fp16)[name = string("attn_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_402_cast_fp16 = mul(x = roped_5_cast_fp16, y = var_401_to_fp16)[name = string("op_402_cast_fp16")];
+            bool attn_weights_7_transpose_x_0 = const()[name = string("attn_weights_7_transpose_x_0"), val = bool(true)];
+            bool attn_weights_7_transpose_y_0 = const()[name = string("attn_weights_7_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_7_cast_fp16 = matmul(transpose_x = attn_weights_7_transpose_x_0, transpose_y = attn_weights_7_transpose_y_0, x = var_402_cast_fp16, y = roped_7_cast_fp16)[name = string("attn_weights_7_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_9_cast_fp16 = add(x = attn_weights_7_cast_fp16, y = mask)[name = string("attn_weights_9_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_11_cast_fp16 = softmax(axis = var_262, x = attn_weights_9_cast_fp16)[name = string("attn_weights_11_cast_fp16")];
+            bool var_411_transpose_x_1 = const()[name = string("op_411_transpose_x_1"), val = bool(false)];
+            bool var_411_transpose_y_1 = const()[name = string("op_411_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 32, 512, 128]> var_411_cast_fp16 = matmul(transpose_x = var_411_transpose_x_1, transpose_y = var_411_transpose_y_1, x = attn_weights_11_cast_fp16, y = v_9_cast_fp16)[name = string("op_411_cast_fp16")];
+            tensor<int32, [4]> attn_3_perm_0 = const()[name = string("attn_3_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             tensor<int32, [4]> var_414 = const()[name = string("op_414"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 512]> attn_3_cast_fp16 = transpose(perm = attn_3_perm_0, x = var_411_cast_fp16)[name = string("transpose_2")];
             tensor<fp16, [1, 4096, 1, 512]> input_9_cast_fp16 = reshape(shape = var_414, x = attn_3_cast_fp16)[name = string("input_9_cast_fp16")];
             tensor<int32, [2]> var_418 = const()[name = string("op_418"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_420 = const()[name = string("op_420"), val = tensor<int32, [2]>([1, 1])];
@@ -788,86 +800,86 @@ program(1.3)
             tensor<fp16, [1, 4096, 1, 512]> x_normed_27_cast_fp16 = mul(x = x_normed_25_cast_fp16, y = var_531_to_fp16)[name = string("x_normed_27_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_norm_1_weight_to_fp16 = const()[name = string("blocks_2_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303771840)))];
             tensor<fp16, [1, 4096, 1, 512]> x_33_cast_fp16 = mul(x = x_normed_27_cast_fp16, y = blocks_2_norm_1_weight_to_fp16)[name = string("x_33_cast_fp16")];
-            tensor<int32, [2]> var_546 = const()[name = string("op_546"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_548 = const()[name = string("op_548"), val = tensor<int32, [2]>([1, 1])];
-            string var_550_pad_type_0 = const()[name = string("op_550_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_550_pad_0 = const()[name = string("op_550_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_550_cast_fp16 = conv(dilations = var_548, groups = var_503, pad = var_550_pad_0, pad_type = var_550_pad_type_0, strides = var_546, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_550_cast_fp16")];
+            tensor<int32, [2]> var_547 = const()[name = string("op_547"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_549 = const()[name = string("op_549"), val = tensor<int32, [2]>([1, 1])];
+            string var_551_pad_type_0 = const()[name = string("op_551_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_551_pad_0 = const()[name = string("op_551_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_551_cast_fp16 = conv(dilations = var_549, groups = var_503, pad = var_551_pad_0, pad_type = var_551_pad_type_0, strides = var_547, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_551_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303780096)))];
-            tensor<fp16, [1, 4096, 1, 512]> q_13_cast_fp16 = mul(x = var_550_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = string("q_13_cast_fp16")];
-            tensor<int32, [2]> var_554 = const()[name = string("op_554"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_556 = const()[name = string("op_556"), val = tensor<int32, [2]>([1, 1])];
-            string var_558_pad_type_0 = const()[name = string("op_558_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_558_pad_0 = const()[name = string("op_558_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_558_cast_fp16 = conv(dilations = var_556, groups = var_503, pad = var_558_pad_0, pad_type = var_558_pad_type_0, strides = var_554, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_558_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> q_13_cast_fp16 = mul(x = var_551_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = string("q_13_cast_fp16")];
+            tensor<int32, [2]> var_555 = const()[name = string("op_555"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_557 = const()[name = string("op_557"), val = tensor<int32, [2]>([1, 1])];
+            string var_559_pad_type_0 = const()[name = string("op_559_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_559_pad_0 = const()[name = string("op_559_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_559_cast_fp16 = conv(dilations = var_557, groups = var_503, pad = var_559_pad_0, pad_type = var_559_pad_type_0, strides = var_555, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_559_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303788352)))];
-            tensor<fp16, [1, 4096, 1, 512]> k_13_cast_fp16 = mul(x = var_558_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = string("k_13_cast_fp16")];
-            tensor<int32, [2]> var_562 = const()[name = string("op_562"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_564 = const()[name = string("op_564"), val = tensor<int32, [2]>([1, 1])];
-            string var_566_pad_type_0 = const()[name = string("op_566_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_566_pad_0 = const()[name = string("op_566_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_566_cast_fp16 = conv(dilations = var_564, groups = var_503, pad = var_566_pad_0, pad_type = var_566_pad_type_0, strides = var_562, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_566_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> k_13_cast_fp16 = mul(x = var_559_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = string("k_13_cast_fp16")];
+            tensor<int32, [2]> var_563 = const()[name = string("op_563"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_565 = const()[name = string("op_565"), val = tensor<int32, [2]>([1, 1])];
+            string var_567_pad_type_0 = const()[name = string("op_567_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_567_pad_0 = const()[name = string("op_567_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_567_cast_fp16 = conv(dilations = var_565, groups = var_503, pad = var_567_pad_0, pad_type = var_567_pad_type_0, strides = var_563, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_567_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303796608)))];
-            tensor<fp16, [1, 4096, 1, 512]> v_9_cast_fp16 = mul(x = var_566_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = string("v_9_cast_fp16")];
-            tensor<int32, [4]> var_568 = const()[name = string("op_568"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> q_15_cast_fp16 = reshape(shape = var_568, x = q_13_cast_fp16)[name = string("q_15_cast_fp16")];
-            tensor<int32, [4]> var_570 = const()[name = string("op_570"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = reshape(shape = var_570, x = k_13_cast_fp16)[name = string("k_15_cast_fp16")];
-            tensor<int32, [4]> var_572 = const()[name = string("op_572"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> v_cast_fp16 = reshape(shape = var_572, x = v_9_cast_fp16)[name = string("v_cast_fp16")];
-            tensor<int32, [4]> var_584_begin_0 = const()[name = string("op_584_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_584_end_0 = const()[name = string("op_584_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_584_end_mask_0 = const()[name = string("op_584_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_584_cast_fp16 = slice_by_index(begin = var_584_begin_0, end = var_584_end_0, end_mask = var_584_end_mask_0, x = q_15_cast_fp16)[name = string("op_584_cast_fp16")];
-            tensor<int32, [4]> var_590_begin_0 = const()[name = string("op_590_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_590_end_0 = const()[name = string("op_590_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_590_end_mask_0 = const()[name = string("op_590_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_590_cast_fp16 = slice_by_index(begin = var_590_begin_0, end = var_590_end_0, end_mask = var_590_end_mask_0, x = q_15_cast_fp16)[name = string("op_590_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> v_13_cast_fp16 = mul(x = var_567_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = string("v_13_cast_fp16")];
+            tensor<int32, [4]> var_569 = const()[name = string("op_569"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> q_15_cast_fp16 = reshape(shape = var_569, x = q_13_cast_fp16)[name = string("q_15_cast_fp16")];
+            tensor<int32, [4]> var_571 = const()[name = string("op_571"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = reshape(shape = var_571, x = k_13_cast_fp16)[name = string("k_15_cast_fp16")];
+            tensor<int32, [4]> var_573 = const()[name = string("op_573"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> v_15_cast_fp16 = reshape(shape = var_573, x = v_13_cast_fp16)[name = string("v_15_cast_fp16")];
+            tensor<int32, [4]> var_585_begin_0 = const()[name = string("op_585_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_585_end_0 = const()[name = string("op_585_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_585_end_mask_0 = const()[name = string("op_585_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_585_cast_fp16 = slice_by_index(begin = var_585_begin_0, end = var_585_end_0, end_mask = var_585_end_mask_0, x = q_15_cast_fp16)[name = string("op_585_cast_fp16")];
+            tensor<int32, [4]> var_591_begin_0 = const()[name = string("op_591_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_591_end_0 = const()[name = string("op_591_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_591_end_mask_0 = const()[name = string("op_591_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_591_cast_fp16 = slice_by_index(begin = var_591_begin_0, end = var_591_end_0, end_mask = var_591_end_mask_0, x = q_15_cast_fp16)[name = string("op_591_cast_fp16")];
             fp16 const_32_promoted_to_fp16 = const()[name = string("const_32_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_592_cast_fp16 = mul(x = var_590_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_592_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_593_cast_fp16 = mul(x = var_591_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_593_cast_fp16")];
             bool rotated_9_interleave_0 = const()[name = string("rotated_9_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_9_cast_fp16 = concat(axis = var_506, interleave = rotated_9_interleave_0, values = (var_592_cast_fp16, var_584_cast_fp16))[name = string("rotated_9_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_595_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = string("op_595_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_596_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = string("op_596_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_9_cast_fp16 = add(x = var_595_cast_fp16, y = var_596_cast_fp16)[name = string("roped_9_cast_fp16")];
-            tensor<int32, [4]> var_609_begin_0 = const()[name = string("op_609_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_609_end_0 = const()[name = string("op_609_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_609_end_mask_0 = const()[name = string("op_609_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_609_cast_fp16 = slice_by_index(begin = var_609_begin_0, end = var_609_end_0, end_mask = var_609_end_mask_0, x = k_15_cast_fp16)[name = string("op_609_cast_fp16")];
-            tensor<int32, [4]> var_615_begin_0 = const()[name = string("op_615_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_615_end_0 = const()[name = string("op_615_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_615_end_mask_0 = const()[name = string("op_615_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_615_cast_fp16 = slice_by_index(begin = var_615_begin_0, end = var_615_end_0, end_mask = var_615_end_mask_0, x = k_15_cast_fp16)[name = string("op_615_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_9_cast_fp16 = concat(axis = var_506, interleave = rotated_9_interleave_0, values = (var_593_cast_fp16, var_585_cast_fp16))[name = string("rotated_9_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_596_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = string("op_596_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_597_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = string("op_597_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_9_cast_fp16 = add(x = var_596_cast_fp16, y = var_597_cast_fp16)[name = string("roped_9_cast_fp16")];
+            tensor<int32, [4]> var_610_begin_0 = const()[name = string("op_610_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_610_end_0 = const()[name = string("op_610_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_610_end_mask_0 = const()[name = string("op_610_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_610_cast_fp16 = slice_by_index(begin = var_610_begin_0, end = var_610_end_0, end_mask = var_610_end_mask_0, x = k_15_cast_fp16)[name = string("op_610_cast_fp16")];
+            tensor<int32, [4]> var_616_begin_0 = const()[name = string("op_616_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_616_end_0 = const()[name = string("op_616_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_616_end_mask_0 = const()[name = string("op_616_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_616_cast_fp16 = slice_by_index(begin = var_616_begin_0, end = var_616_end_0, end_mask = var_616_end_mask_0, x = k_15_cast_fp16)[name = string("op_616_cast_fp16")];
             fp16 const_34_promoted_to_fp16 = const()[name = string("const_34_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_617_cast_fp16 = mul(x = var_615_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_617_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_618_cast_fp16 = mul(x = var_616_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_618_cast_fp16")];
             bool rotated_interleave_0 = const()[name = string("rotated_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_cast_fp16 = concat(axis = var_506, interleave = rotated_interleave_0, values = (var_617_cast_fp16, var_609_cast_fp16))[name = string("rotated_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_620_cast_fp16 = mul(x = k_15_cast_fp16, y = cos)[name = string("op_620_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_621_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_621_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_cast_fp16 = add(x = var_620_cast_fp16, y = var_621_cast_fp16)[name = string("roped_cast_fp16")];
-            bool q_interleave_0 = const()[name = string("q_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> q_cast_fp16 = concat(axis = var_506, interleave = q_interleave_0, values = roped_9_cast_fp16)[name = string("q_cast_fp16")];
-            bool k_interleave_0 = const()[name = string("k_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_cast_fp16 = concat(axis = var_506, interleave = k_interleave_0, values = roped_cast_fp16)[name = string("k_cast_fp16")];
-            tensor<int32, [4]> var_636_begin_0 = const()[name = string("op_636_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_636_end_0 = const()[name = string("op_636_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_636_end_mask_0 = const()[name = string("op_636_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_2 = slice_by_index(begin = var_636_begin_0, end = var_636_end_0, end_mask = var_636_end_mask_0, x = k_cast_fp16)[name = string("op_636_cast_fp16")];
-            tensor<int32, [4]> var_637_begin_0 = const()[name = string("op_637_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_637_end_0 = const()[name = string("op_637_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_637_end_mask_0 = const()[name = string("op_637_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_2 = slice_by_index(begin = var_637_begin_0, end = var_637_end_0, end_mask = var_637_end_mask_0, x = v_cast_fp16)[name = string("op_637_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_cast_fp16 = concat(axis = var_506, interleave = rotated_interleave_0, values = (var_618_cast_fp16, var_610_cast_fp16))[name = string("rotated_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_621_cast_fp16 = mul(x = k_15_cast_fp16, y = cos)[name = string("op_621_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_622_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_622_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_cast_fp16 = add(x = var_621_cast_fp16, y = var_622_cast_fp16)[name = string("roped_cast_fp16")];
+            tensor<int32, [4]> v_perm_0 = const()[name = string("v_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_626_begin_0 = const()[name = string("op_626_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_626_end_0 = const()[name = string("op_626_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_626_end_mask_0 = const()[name = string("op_626_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_2 = slice_by_index(begin = var_626_begin_0, end = var_626_end_0, end_mask = var_626_end_mask_0, x = roped_cast_fp16)[name = string("op_626_cast_fp16")];
+            tensor<int32, [4]> var_627_begin_0 = const()[name = string("op_627_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_627_end_0 = const()[name = string("op_627_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_627_end_mask_0 = const()[name = string("op_627_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 512, 128]> v_cast_fp16 = transpose(perm = v_perm_0, x = v_15_cast_fp16)[name = string("transpose_1")];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_2 = slice_by_index(begin = var_627_begin_0, end = var_627_end_0, end_mask = var_627_end_mask_0, x = v_cast_fp16)[name = string("op_627_cast_fp16")];
             fp16 var_641_to_fp16 = const()[name = string("op_641_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 512]> var_642_cast_fp16 = mul(x = q_cast_fp16, y = var_641_to_fp16)[name = string("op_642_cast_fp16")];
-            bool attn_weights_9_transpose_x_0 = const()[name = string("attn_weights_9_transpose_x_0"), val = bool(true)];
-            bool attn_weights_9_transpose_y_0 = const()[name = string("attn_weights_9_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_9_cast_fp16 = matmul(transpose_x = attn_weights_9_transpose_x_0, transpose_y = attn_weights_9_transpose_y_0, x = var_642_cast_fp16, y = k_cast_fp16)[name = string("attn_weights_9_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_cast_fp16 = add(x = attn_weights_9_cast_fp16, y = mask)[name = string("attn_weights_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> var_650_cast_fp16 = softmax(axis = var_502, x = attn_weights_cast_fp16)[name = string("op_650_cast_fp16")];
-            bool attn_5_transpose_x_0 = const()[name = string("attn_5_transpose_x_0"), val = bool(false)];
-            bool attn_5_transpose_y_0 = const()[name = string("attn_5_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 512]> attn_5_cast_fp16 = matmul(transpose_x = attn_5_transpose_x_0, transpose_y = attn_5_transpose_y_0, x = v_cast_fp16, y = var_650_cast_fp16)[name = string("attn_5_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_642_cast_fp16 = mul(x = roped_9_cast_fp16, y = var_641_to_fp16)[name = string("op_642_cast_fp16")];
+            bool attn_weights_13_transpose_x_0 = const()[name = string("attn_weights_13_transpose_x_0"), val = bool(true)];
+            bool attn_weights_13_transpose_y_0 = const()[name = string("attn_weights_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_13_cast_fp16 = matmul(transpose_x = attn_weights_13_transpose_x_0, transpose_y = attn_weights_13_transpose_y_0, x = var_642_cast_fp16, y = roped_cast_fp16)[name = string("attn_weights_13_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_15_cast_fp16 = add(x = attn_weights_13_cast_fp16, y = mask)[name = string("attn_weights_15_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_cast_fp16 = softmax(axis = var_502, x = attn_weights_15_cast_fp16)[name = string("attn_weights_cast_fp16")];
+            bool var_651_transpose_x_1 = const()[name = string("op_651_transpose_x_1"), val = bool(false)];
+            bool var_651_transpose_y_1 = const()[name = string("op_651_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 32, 512, 128]> var_651_cast_fp16 = matmul(transpose_x = var_651_transpose_x_1, transpose_y = var_651_transpose_y_1, x = attn_weights_cast_fp16, y = v_15_cast_fp16)[name = string("op_651_cast_fp16")];
+            tensor<int32, [4]> attn_5_perm_0 = const()[name = string("attn_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             tensor<int32, [4]> var_654 = const()[name = string("op_654"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 512]> attn_5_cast_fp16 = transpose(perm = attn_5_perm_0, x = var_651_cast_fp16)[name = string("transpose_0")];
             tensor<fp16, [1, 4096, 1, 512]> input_17_cast_fp16 = reshape(shape = var_654, x = attn_5_cast_fp16)[name = string("input_17_cast_fp16")];
             tensor<int32, [2]> var_658 = const()[name = string("op_658"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_660 = const()[name = string("op_660"), val = tensor<int32, [2]>([1, 1])];
diff --git a/sequoia/Llama-2-7b-hf_chunk6.mlmodelc/analytics/coremldata.bin b/sequoia/Llama-2-7b-hf_chunk6.mlmodelc/analytics/coremldata.bin
index e3704b470dad34135a6b5cb4b471021bad5c3ae2..65ae082f31459bf8b09913d8861a15601cc13ad1 100644
--- a/sequoia/Llama-2-7b-hf_chunk6.mlmodelc/analytics/coremldata.bin
+++ b/sequoia/Llama-2-7b-hf_chunk6.mlmodelc/analytics/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:84e317a82cdf4e96f808f63e77f10098844d47ad522545181edfac4d287c9c92
+oid sha256:e69e7ad37dd59e97348d395eec9b4c41b7d3ea44d86f613751ae47803a0a2efe
 size 243
diff --git a/sequoia/Llama-2-7b-hf_chunk6.mlmodelc/coremldata.bin b/sequoia/Llama-2-7b-hf_chunk6.mlmodelc/coremldata.bin
index 8bbc6dd38c74fe23594355e106f197518d41765b..a503721fa97147a06f91e834353053693378b0ad 100644
--- a/sequoia/Llama-2-7b-hf_chunk6.mlmodelc/coremldata.bin
+++ b/sequoia/Llama-2-7b-hf_chunk6.mlmodelc/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e430d0795ff5c384187174f5718a2c13d0070f5d6a811831e18862497865a86d
+oid sha256:d35a0353bcfa501579e07af3718261af3b129b4bec004c1fe6d812a6403a3f5b
 size 1037
diff --git a/sequoia/Llama-2-7b-hf_chunk6.mlmodelc/metadata.json b/sequoia/Llama-2-7b-hf_chunk6.mlmodelc/metadata.json
index bac0e059681cee38b3a98d1363eaac66a6e236a2..db087b49964fb45eaef4b25975e37d372e4d16c9 100644
--- a/sequoia/Llama-2-7b-hf_chunk6.mlmodelc/metadata.json
+++ b/sequoia/Llama-2-7b-hf_chunk6.mlmodelc/metadata.json
@@ -17,9 +17,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 128, 508]",
         "name" : "new_k_cache_0",
         "type" : "MultiArray"
       },
@@ -27,9 +27,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 128, 508]",
         "name" : "new_k_cache_1",
         "type" : "MultiArray"
       },
@@ -37,9 +37,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 128, 508]",
         "name" : "new_k_cache_2",
         "type" : "MultiArray"
       },
@@ -47,9 +47,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 508, 128]",
         "name" : "new_v_cache_0",
         "type" : "MultiArray"
       },
@@ -57,9 +57,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 508, 128]",
         "name" : "new_v_cache_1",
         "type" : "MultiArray"
       },
@@ -67,9 +67,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 508, 128]",
         "name" : "new_v_cache_2",
         "type" : "MultiArray"
       }
@@ -142,9 +142,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_0",
             "type" : "MultiArray"
           },
@@ -152,9 +152,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_1",
             "type" : "MultiArray"
           },
@@ -162,9 +162,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_2",
             "type" : "MultiArray"
           },
@@ -172,9 +172,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_0",
             "type" : "MultiArray"
           },
@@ -182,9 +182,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_1",
             "type" : "MultiArray"
           },
@@ -192,9 +192,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_2",
             "type" : "MultiArray"
           }
@@ -203,14 +203,15 @@
         "mlProgramOperationTypeHistogram" : {
           "Ios18.constexprLutToDense" : 21,
           "Ios18.conv" : 21,
-          "Ios18.matmul" : 6,
           "Ios18.expandDims" : 6,
-          "Ios18.concat" : 18,
+          "Ios18.matmul" : 6,
+          "Ios18.concat" : 12,
           "Ios18.add" : 15,
           "Ios18.realDiv" : 6,
           "Ios18.silu" : 3,
           "Ios18.softmax" : 3,
           "Ios18.sliceByIndex" : 18,
+          "Ios18.transpose" : 6,
           "Ios16.reduceL2Norm" : 6,
           "Ios18.squeeze" : 6,
           "Ios18.reshape" : 12,
@@ -223,9 +224,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 1)",
+            "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 4)",
             "shortDescription" : "",
-            "shape" : "[1, 4096, 1, 1]",
+            "shape" : "[1, 4096, 1, 4]",
             "name" : "x",
             "type" : "MultiArray"
           },
@@ -233,9 +234,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 128 × 1)",
+            "formattedType" : "MultiArray (Float16 128 × 4)",
             "shortDescription" : "",
-            "shape" : "[128, 1]",
+            "shape" : "[128, 4]",
             "name" : "cos",
             "type" : "MultiArray"
           },
@@ -243,9 +244,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 128 × 1)",
+            "formattedType" : "MultiArray (Float16 128 × 4)",
             "shortDescription" : "",
-            "shape" : "[128, 1]",
+            "shape" : "[128, 4]",
             "name" : "sin",
             "type" : "MultiArray"
           },
@@ -253,9 +254,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 1 × 1 × 512)",
+            "formattedType" : "MultiArray (Float16 1 × 1 × 4 × 512)",
             "shortDescription" : "",
-            "shape" : "[1, 1, 1, 512]",
+            "shape" : "[1, 1, 4, 512]",
             "name" : "mask",
             "type" : "MultiArray"
           },
@@ -263,9 +264,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "k_cache_0",
             "type" : "MultiArray"
           },
@@ -273,9 +274,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "v_cache_0",
             "type" : "MultiArray"
           },
@@ -283,9 +284,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "k_cache_1",
             "type" : "MultiArray"
           },
@@ -293,9 +294,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "v_cache_1",
             "type" : "MultiArray"
           },
@@ -303,9 +304,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "k_cache_2",
             "type" : "MultiArray"
           },
@@ -313,9 +314,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "v_cache_2",
             "type" : "MultiArray"
           }
@@ -330,9 +331,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 1)",
+            "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 4)",
             "shortDescription" : "",
-            "shape" : "[1, 4096, 1, 1]",
+            "shape" : "[1, 4096, 1, 4]",
             "name" : "new_x",
             "type" : "MultiArray"
           },
@@ -340,9 +341,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_0",
             "type" : "MultiArray"
           },
@@ -350,9 +351,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_1",
             "type" : "MultiArray"
           },
@@ -360,9 +361,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_2",
             "type" : "MultiArray"
           },
@@ -370,9 +371,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_0",
             "type" : "MultiArray"
           },
@@ -380,9 +381,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_1",
             "type" : "MultiArray"
           },
@@ -390,9 +391,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_2",
             "type" : "MultiArray"
           }
@@ -401,14 +402,15 @@
         "mlProgramOperationTypeHistogram" : {
           "Ios18.constexprLutToDense" : 21,
           "Ios18.conv" : 21,
-          "Ios18.matmul" : 6,
           "Ios18.expandDims" : 6,
+          "Ios18.matmul" : 6,
           "Ios18.concat" : 18,
           "Ios18.add" : 15,
           "Ios18.realDiv" : 6,
           "Ios18.silu" : 3,
           "Ios18.softmax" : 3,
           "Ios18.sliceByIndex" : 18,
+          "Ios18.transpose" : 6,
           "Ios16.reduceL2Norm" : 6,
           "Ios18.squeeze" : 6,
           "Ios18.reshape" : 12,
@@ -419,14 +421,15 @@
     "mlProgramOperationTypeHistogram" : {
       "Ios18.constexprLutToDense" : 21,
       "Ios18.conv" : 21,
-      "Ios18.matmul" : 6,
       "Ios18.expandDims" : 6,
-      "Ios18.concat" : 18,
+      "Ios18.matmul" : 6,
+      "Ios18.concat" : 12,
       "Ios18.add" : 15,
       "Ios18.realDiv" : 6,
       "Ios18.silu" : 3,
       "Ios18.softmax" : 3,
       "Ios18.sliceByIndex" : 18,
+      "Ios18.transpose" : 6,
       "Ios16.reduceL2Norm" : 6,
       "Ios18.squeeze" : 6,
       "Ios18.reshape" : 12,
@@ -491,7 +494,7 @@
       }
     ],
     "defaultFunctionName" : "input_512_context_512",
-    "generatedClassName" : "Llama_2_7b_hf_2024_07_02_20_36_17_merged_chunk6",
+    "generatedClassName" : "Llama_2_7b_hf_2024_07_17_19_34_17_merged_chunk6",
     "userDefinedMetadata" : {
 
     },
diff --git a/sequoia/Llama-2-7b-hf_chunk6.mlmodelc/model.mil b/sequoia/Llama-2-7b-hf_chunk6.mlmodelc/model.mil
index 5b7b1db6b14fe10ff51aaa601d8484d9ff49576b..85f75a6da9054155e32013d96460963c79fba3f8 100644
--- a/sequoia/Llama-2-7b-hf_chunk6.mlmodelc/model.mil
+++ b/sequoia/Llama-2-7b-hf_chunk6.mlmodelc/model.mil
@@ -1,7 +1,7 @@
 program(1.3)
-[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3400.34.1"}, {"coremlc-version", "3400.42.1"}})]
+[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3400.42.1"}, {"coremlc-version", "3400.51.1"}})]
 {
-    func input_1_context_512<ios18>(tensor<fp16, [128, 1]> cos, tensor<fp16, [1, 32, 128, 511]> k_cache_0, tensor<fp16, [1, 32, 128, 511]> k_cache_1, tensor<fp16, [1, 32, 128, 511]> k_cache_2, tensor<fp16, [1, 1, 1, 512]> mask, tensor<fp16, [128, 1]> sin, tensor<fp16, [1, 32, 128, 511]> v_cache_0, tensor<fp16, [1, 32, 128, 511]> v_cache_1, tensor<fp16, [1, 32, 128, 511]> v_cache_2, tensor<fp16, [1, 4096, 1, 1]> x) [CoreML_InputDefaultValues = dict<string, fp32>({{"k_cache_0", 0}, {"k_cache_1", 0}, {"k_cache_2", 0}, {"v_cache_0", 0}, {"v_cache_1", 0}, {"v_cache_2", 0}})] {
+    func input_1_context_512<ios18>(tensor<fp16, [128, 4]> cos, tensor<fp16, [1, 32, 128, 508]> k_cache_0, tensor<fp16, [1, 32, 128, 508]> k_cache_1, tensor<fp16, [1, 32, 128, 508]> k_cache_2, tensor<fp16, [1, 1, 4, 512]> mask, tensor<fp16, [128, 4]> sin, tensor<fp16, [1, 32, 508, 128]> v_cache_0, tensor<fp16, [1, 32, 508, 128]> v_cache_1, tensor<fp16, [1, 32, 508, 128]> v_cache_2, tensor<fp16, [1, 4096, 1, 4]> x) [CoreML_InputDefaultValues = dict<string, fp32>({{"k_cache_0", 0}, {"k_cache_1", 0}, {"k_cache_2", 0}, {"v_cache_0", 0}, {"v_cache_1", 0}, {"v_cache_2", 0}})] {
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_q_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303873792))))[name = string("blocks_0_attn_q_proj_weight_palettized_cast_fp16")];
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_k_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8388864))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303873920))))[name = string("blocks_0_attn_k_proj_weight_palettized_cast_fp16")];
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_v_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(16777664))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303874048))))[name = string("blocks_0_attn_v_proj_weight_palettized_cast_fp16")];
@@ -29,438 +29,450 @@ program(1.3)
             int32 var_34 = const()[name = string("op_34"), val = int32(-2)];
             bool var_35 = const()[name = string("op_35"), val = bool(true)];
             tensor<int32, [1]> var_53_axes_0 = const()[name = string("op_53_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_53_cast_fp16 = squeeze(axes = var_53_axes_0, x = x)[name = string("op_53_cast_fp16")];
+            tensor<fp16, [1, 4096, 4]> var_53_cast_fp16 = squeeze(axes = var_53_axes_0, x = x)[name = string("op_53_cast_fp16")];
             bool var_55_interleave_0 = const()[name = string("op_55_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_1_to_fp16 = const()[name = string("eps_chan_1_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_55_cast_fp16 = concat(axis = var_31, interleave = var_55_interleave_0, values = (var_53_cast_fp16, eps_chan_1_to_fp16))[name = string("op_55_cast_fp16")];
+            tensor<fp16, [1, 1, 4]> eps_chan_1_to_fp16 = const()[name = string("eps_chan_1_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_55_cast_fp16 = concat(axis = var_31, interleave = var_55_interleave_0, values = (var_53_cast_fp16, eps_chan_1_to_fp16))[name = string("op_55_cast_fp16")];
             tensor<int32, [1]> x_eps_1_axes_0 = const()[name = string("x_eps_1_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_1_cast_fp16 = expand_dims(axes = x_eps_1_axes_0, x = var_55_cast_fp16)[name = string("x_eps_1_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_1_cast_fp16 = expand_dims(axes = x_eps_1_axes_0, x = var_55_cast_fp16)[name = string("x_eps_1_cast_fp16")];
             tensor<int32, [1]> norm_x_1_axes_0 = const()[name = string("norm_x_1_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_1_cast_fp16 = reduce_l2_norm(axes = norm_x_1_axes_0, keep_dims = var_35, x = x_eps_1_cast_fp16)[name = string("norm_x_1_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_1_cast_fp16 = real_div(x = x, y = norm_x_1_cast_fp16)[name = string("x_normed_1_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_1_cast_fp16 = reduce_l2_norm(axes = norm_x_1_axes_0, keep_dims = var_35, x = x_eps_1_cast_fp16)[name = string("norm_x_1_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_1_cast_fp16 = real_div(x = x, y = norm_x_1_cast_fp16)[name = string("x_normed_1_cast_fp16")];
             fp16 var_60_to_fp16 = const()[name = string("op_60_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_60_to_fp16)[name = string("x_normed_3_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_60_to_fp16)[name = string("x_normed_3_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_1_weight_to_fp16 = const()[name = string("blocks_0_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303567936)))];
-            tensor<fp16, [1, 4096, 1, 1]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = string("x_5_cast_fp16")];
-            tensor<int32, [2]> var_72 = const()[name = string("op_72"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_74 = const()[name = string("op_74"), val = tensor<int32, [2]>([1, 1])];
-            string var_76_pad_type_0 = const()[name = string("op_76_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_76_pad_0 = const()[name = string("op_76_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_76_cast_fp16 = conv(dilations = var_74, groups = var_31, pad = var_76_pad_0, pad_type = var_76_pad_type_0, strides = var_72, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_76_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = string("x_5_cast_fp16")];
+            tensor<int32, [2]> var_73 = const()[name = string("op_73"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_75 = const()[name = string("op_75"), val = tensor<int32, [2]>([1, 1])];
+            string var_77_pad_type_0 = const()[name = string("op_77_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_77_pad_0 = const()[name = string("op_77_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_77_cast_fp16 = conv(dilations = var_75, groups = var_31, pad = var_77_pad_0, pad_type = var_77_pad_type_0, strides = var_73, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_77_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303576192)))];
-            tensor<fp16, [1, 4096, 1, 1]> q_1_cast_fp16 = mul(x = var_76_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
-            tensor<int32, [2]> var_80 = const()[name = string("op_80"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_82 = const()[name = string("op_82"), val = tensor<int32, [2]>([1, 1])];
-            string var_84_pad_type_0 = const()[name = string("op_84_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_84_pad_0 = const()[name = string("op_84_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_84_cast_fp16 = conv(dilations = var_82, groups = var_31, pad = var_84_pad_0, pad_type = var_84_pad_type_0, strides = var_80, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_84_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> q_1_cast_fp16 = mul(x = var_77_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
+            tensor<int32, [2]> var_81 = const()[name = string("op_81"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_83 = const()[name = string("op_83"), val = tensor<int32, [2]>([1, 1])];
+            string var_85_pad_type_0 = const()[name = string("op_85_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_85_pad_0 = const()[name = string("op_85_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_85_cast_fp16 = conv(dilations = var_83, groups = var_31, pad = var_85_pad_0, pad_type = var_85_pad_type_0, strides = var_81, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_85_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303584448)))];
-            tensor<fp16, [1, 4096, 1, 1]> k_1_cast_fp16 = mul(x = var_84_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
-            tensor<int32, [2]> var_88 = const()[name = string("op_88"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_90 = const()[name = string("op_90"), val = tensor<int32, [2]>([1, 1])];
-            string var_92_pad_type_0 = const()[name = string("op_92_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_92_pad_0 = const()[name = string("op_92_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_92_cast_fp16 = conv(dilations = var_90, groups = var_31, pad = var_92_pad_0, pad_type = var_92_pad_type_0, strides = var_88, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_92_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> k_1_cast_fp16 = mul(x = var_85_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
+            tensor<int32, [2]> var_89 = const()[name = string("op_89"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_91 = const()[name = string("op_91"), val = tensor<int32, [2]>([1, 1])];
+            string var_93_pad_type_0 = const()[name = string("op_93_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_93_pad_0 = const()[name = string("op_93_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_93_cast_fp16 = conv(dilations = var_91, groups = var_31, pad = var_93_pad_0, pad_type = var_93_pad_type_0, strides = var_89, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_93_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303592704)))];
-            tensor<fp16, [1, 4096, 1, 1]> v_1_cast_fp16 = mul(x = var_92_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
-            tensor<int32, [4]> var_94 = const()[name = string("op_94"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> q_3_cast_fp16 = reshape(shape = var_94, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
-            tensor<int32, [4]> var_96 = const()[name = string("op_96"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> k_3_cast_fp16 = reshape(shape = var_96, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
-            tensor<int32, [4]> var_98 = const()[name = string("op_98"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> v_3_cast_fp16 = reshape(shape = var_98, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
-            tensor<int32, [4]> var_110_begin_0 = const()[name = string("op_110_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_110_end_0 = const()[name = string("op_110_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_110_end_mask_0 = const()[name = string("op_110_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_110_cast_fp16 = slice_by_index(begin = var_110_begin_0, end = var_110_end_0, end_mask = var_110_end_mask_0, x = q_3_cast_fp16)[name = string("op_110_cast_fp16")];
-            tensor<int32, [4]> var_116_begin_0 = const()[name = string("op_116_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_116_end_0 = const()[name = string("op_116_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_116_end_mask_0 = const()[name = string("op_116_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_116_cast_fp16 = slice_by_index(begin = var_116_begin_0, end = var_116_end_0, end_mask = var_116_end_mask_0, x = q_3_cast_fp16)[name = string("op_116_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> v_1_cast_fp16 = mul(x = var_93_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
+            tensor<int32, [4]> var_95 = const()[name = string("op_95"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> q_3_cast_fp16 = reshape(shape = var_95, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
+            tensor<int32, [4]> var_97 = const()[name = string("op_97"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> k_3_cast_fp16 = reshape(shape = var_97, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
+            tensor<int32, [4]> var_99 = const()[name = string("op_99"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> v_3_cast_fp16 = reshape(shape = var_99, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
+            tensor<int32, [4]> var_111_begin_0 = const()[name = string("op_111_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_111_end_0 = const()[name = string("op_111_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_111_end_mask_0 = const()[name = string("op_111_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_111_cast_fp16 = slice_by_index(begin = var_111_begin_0, end = var_111_end_0, end_mask = var_111_end_mask_0, x = q_3_cast_fp16)[name = string("op_111_cast_fp16")];
+            tensor<int32, [4]> var_117_begin_0 = const()[name = string("op_117_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_117_end_0 = const()[name = string("op_117_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_117_end_mask_0 = const()[name = string("op_117_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_117_cast_fp16 = slice_by_index(begin = var_117_begin_0, end = var_117_end_0, end_mask = var_117_end_mask_0, x = q_3_cast_fp16)[name = string("op_117_cast_fp16")];
             fp16 const_6_promoted_to_fp16 = const()[name = string("const_6_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_118_cast_fp16 = mul(x = var_116_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_118_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_119_cast_fp16 = mul(x = var_117_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_119_cast_fp16")];
             bool rotated_1_interleave_0 = const()[name = string("rotated_1_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_1_cast_fp16 = concat(axis = var_34, interleave = rotated_1_interleave_0, values = (var_118_cast_fp16, var_110_cast_fp16))[name = string("rotated_1_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_121_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_121_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_122_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_122_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_1_cast_fp16 = add(x = var_121_cast_fp16, y = var_122_cast_fp16)[name = string("roped_1_cast_fp16")];
-            tensor<int32, [4]> var_135_begin_0 = const()[name = string("op_135_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_135_end_0 = const()[name = string("op_135_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_135_end_mask_0 = const()[name = string("op_135_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_135_cast_fp16 = slice_by_index(begin = var_135_begin_0, end = var_135_end_0, end_mask = var_135_end_mask_0, x = k_3_cast_fp16)[name = string("op_135_cast_fp16")];
-            tensor<int32, [4]> var_141_begin_0 = const()[name = string("op_141_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_141_end_0 = const()[name = string("op_141_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_141_end_mask_0 = const()[name = string("op_141_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_141_cast_fp16 = slice_by_index(begin = var_141_begin_0, end = var_141_end_0, end_mask = var_141_end_mask_0, x = k_3_cast_fp16)[name = string("op_141_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_1_cast_fp16 = concat(axis = var_34, interleave = rotated_1_interleave_0, values = (var_119_cast_fp16, var_111_cast_fp16))[name = string("rotated_1_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_122_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_122_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_123_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_123_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_1_cast_fp16 = add(x = var_122_cast_fp16, y = var_123_cast_fp16)[name = string("roped_1_cast_fp16")];
+            tensor<int32, [4]> var_136_begin_0 = const()[name = string("op_136_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_136_end_0 = const()[name = string("op_136_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_136_end_mask_0 = const()[name = string("op_136_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_136_cast_fp16 = slice_by_index(begin = var_136_begin_0, end = var_136_end_0, end_mask = var_136_end_mask_0, x = k_3_cast_fp16)[name = string("op_136_cast_fp16")];
+            tensor<int32, [4]> var_142_begin_0 = const()[name = string("op_142_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_142_end_0 = const()[name = string("op_142_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_142_end_mask_0 = const()[name = string("op_142_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_142_cast_fp16 = slice_by_index(begin = var_142_begin_0, end = var_142_end_0, end_mask = var_142_end_mask_0, x = k_3_cast_fp16)[name = string("op_142_cast_fp16")];
             fp16 const_8_promoted_to_fp16 = const()[name = string("const_8_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_143_cast_fp16 = mul(x = var_141_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_143_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_144_cast_fp16 = mul(x = var_142_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_144_cast_fp16")];
             bool rotated_3_interleave_0 = const()[name = string("rotated_3_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_3_cast_fp16 = concat(axis = var_34, interleave = rotated_3_interleave_0, values = (var_143_cast_fp16, var_135_cast_fp16))[name = string("rotated_3_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_146_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_146_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_147_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_147_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_3_cast_fp16 = add(x = var_146_cast_fp16, y = var_147_cast_fp16)[name = string("roped_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_3_cast_fp16 = concat(axis = var_34, interleave = rotated_3_interleave_0, values = (var_144_cast_fp16, var_136_cast_fp16))[name = string("rotated_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_147_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_147_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_148_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_148_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_3_cast_fp16 = add(x = var_147_cast_fp16, y = var_148_cast_fp16)[name = string("roped_3_cast_fp16")];
+            tensor<int32, [4]> v_5_perm_0 = const()[name = string("v_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             bool k_7_interleave_0 = const()[name = string("k_7_interleave_0"), val = bool(false)];
             tensor<fp16, [1, 32, 128, 512]> k_7_cast_fp16 = concat(axis = var_22, interleave = k_7_interleave_0, values = (k_cache_0, roped_3_cast_fp16))[name = string("k_7_cast_fp16")];
-            bool v_5_interleave_0 = const()[name = string("v_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> v_5_cast_fp16 = concat(axis = var_22, interleave = v_5_interleave_0, values = (v_cache_0, v_3_cast_fp16))[name = string("v_5_cast_fp16")];
-            tensor<int32, [4]> var_154_begin_0 = const()[name = string("op_154_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_154_end_0 = const()[name = string("op_154_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_154_end_mask_0 = const()[name = string("op_154_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_0 = slice_by_index(begin = var_154_begin_0, end = var_154_end_0, end_mask = var_154_end_mask_0, x = k_7_cast_fp16)[name = string("op_154_cast_fp16")];
-            tensor<int32, [4]> var_155_begin_0 = const()[name = string("op_155_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_155_end_0 = const()[name = string("op_155_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_155_end_mask_0 = const()[name = string("op_155_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_0 = slice_by_index(begin = var_155_begin_0, end = var_155_end_0, end_mask = var_155_end_mask_0, x = v_5_cast_fp16)[name = string("op_155_cast_fp16")];
-            fp16 var_159_to_fp16 = const()[name = string("op_159_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 1]> var_160_cast_fp16 = mul(x = roped_1_cast_fp16, y = var_159_to_fp16)[name = string("op_160_cast_fp16")];
+            bool v_7_interleave_0 = const()[name = string("v_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> v_5_cast_fp16 = transpose(perm = v_5_perm_0, x = v_3_cast_fp16)[name = string("transpose_5")];
+            tensor<fp16, [1, 32, 512, 128]> v_7_cast_fp16 = concat(axis = var_34, interleave = v_7_interleave_0, values = (v_cache_0, v_5_cast_fp16))[name = string("v_7_cast_fp16")];
+            tensor<int32, [4]> var_159_begin_0 = const()[name = string("op_159_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_159_end_0 = const()[name = string("op_159_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_159_end_mask_0 = const()[name = string("op_159_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_0 = slice_by_index(begin = var_159_begin_0, end = var_159_end_0, end_mask = var_159_end_mask_0, x = k_7_cast_fp16)[name = string("op_159_cast_fp16")];
+            tensor<int32, [4]> var_160_begin_0 = const()[name = string("op_160_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_160_end_0 = const()[name = string("op_160_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_160_end_mask_0 = const()[name = string("op_160_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_0 = slice_by_index(begin = var_160_begin_0, end = var_160_end_0, end_mask = var_160_end_mask_0, x = v_7_cast_fp16)[name = string("op_160_cast_fp16")];
+            fp16 var_165_to_fp16 = const()[name = string("op_165_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 32, 128, 4]> var_166_cast_fp16 = mul(x = roped_1_cast_fp16, y = var_165_to_fp16)[name = string("op_166_cast_fp16")];
             bool attn_weights_1_transpose_x_0 = const()[name = string("attn_weights_1_transpose_x_0"), val = bool(true)];
             bool attn_weights_1_transpose_y_0 = const()[name = string("attn_weights_1_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_160_cast_fp16, y = k_7_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = mask)[name = string("attn_weights_3_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> var_168_cast_fp16 = softmax(axis = var_30, x = attn_weights_3_cast_fp16)[name = string("op_168_cast_fp16")];
-            bool attn_1_transpose_x_0 = const()[name = string("attn_1_transpose_x_0"), val = bool(false)];
-            bool attn_1_transpose_y_0 = const()[name = string("attn_1_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 1]> attn_1_cast_fp16 = matmul(transpose_x = attn_1_transpose_x_0, transpose_y = attn_1_transpose_y_0, x = v_5_cast_fp16, y = var_168_cast_fp16)[name = string("attn_1_cast_fp16")];
-            tensor<int32, [4]> var_172 = const()[name = string("op_172"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
-            tensor<fp16, [1, 4096, 1, 1]> input_1_cast_fp16 = reshape(shape = var_172, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
-            tensor<int32, [2]> var_176 = const()[name = string("op_176"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_178 = const()[name = string("op_178"), val = tensor<int32, [2]>([1, 1])];
-            string var_180_pad_type_0 = const()[name = string("op_180_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_180_pad_0 = const()[name = string("op_180_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_180_cast_fp16 = conv(dilations = var_178, groups = var_31, pad = var_180_pad_0, pad_type = var_180_pad_type_0, strides = var_176, weight = blocks_0_attn_proj_weight_palettized_cast_fp16, x = input_1_cast_fp16)[name = string("op_180_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_166_cast_fp16, y = k_7_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = mask)[name = string("attn_weights_3_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_5_cast_fp16 = softmax(axis = var_30, x = attn_weights_3_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
+            bool var_175_transpose_x_0 = const()[name = string("op_175_transpose_x_0"), val = bool(false)];
+            bool var_175_transpose_y_0 = const()[name = string("op_175_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> var_175_cast_fp16 = matmul(transpose_x = var_175_transpose_x_0, transpose_y = var_175_transpose_y_0, x = attn_weights_5_cast_fp16, y = v_7_cast_fp16)[name = string("op_175_cast_fp16")];
+            tensor<int32, [4]> attn_1_perm_0 = const()[name = string("attn_1_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_178 = const()[name = string("op_178"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 4]> attn_1_cast_fp16 = transpose(perm = attn_1_perm_0, x = var_175_cast_fp16)[name = string("transpose_4")];
+            tensor<fp16, [1, 4096, 1, 4]> input_1_cast_fp16 = reshape(shape = var_178, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
+            tensor<int32, [2]> var_182 = const()[name = string("op_182"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_184 = const()[name = string("op_184"), val = tensor<int32, [2]>([1, 1])];
+            string var_186_pad_type_0 = const()[name = string("op_186_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_186_pad_0 = const()[name = string("op_186_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_186_cast_fp16 = conv(dilations = var_184, groups = var_31, pad = var_186_pad_0, pad_type = var_186_pad_type_0, strides = var_182, weight = blocks_0_attn_proj_weight_palettized_cast_fp16, x = input_1_cast_fp16)[name = string("op_186_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303600960)))];
-            tensor<fp16, [1, 4096, 1, 1]> attention_output_1_cast_fp16 = mul(x = var_180_cast_fp16, y = blocks_0_attn_proj_output_scales_to_fp16)[name = string("attention_output_1_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_11_cast_fp16 = add(x = attention_output_1_cast_fp16, y = x)[name = string("x_11_cast_fp16")];
-            tensor<int32, [1]> var_199_axes_0 = const()[name = string("op_199_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_199_cast_fp16 = squeeze(axes = var_199_axes_0, x = x_11_cast_fp16)[name = string("op_199_cast_fp16")];
-            bool var_201_interleave_0 = const()[name = string("op_201_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_3_to_fp16 = const()[name = string("eps_chan_3_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_201_cast_fp16 = concat(axis = var_31, interleave = var_201_interleave_0, values = (var_199_cast_fp16, eps_chan_3_to_fp16))[name = string("op_201_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> attention_output_1_cast_fp16 = mul(x = var_186_cast_fp16, y = blocks_0_attn_proj_output_scales_to_fp16)[name = string("attention_output_1_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_11_cast_fp16 = add(x = attention_output_1_cast_fp16, y = x)[name = string("x_11_cast_fp16")];
+            tensor<int32, [1]> var_205_axes_0 = const()[name = string("op_205_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_205_cast_fp16 = squeeze(axes = var_205_axes_0, x = x_11_cast_fp16)[name = string("op_205_cast_fp16")];
+            bool var_207_interleave_0 = const()[name = string("op_207_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_3_to_fp16 = const()[name = string("eps_chan_3_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_207_cast_fp16 = concat(axis = var_31, interleave = var_207_interleave_0, values = (var_205_cast_fp16, eps_chan_3_to_fp16))[name = string("op_207_cast_fp16")];
             tensor<int32, [1]> x_eps_3_axes_0 = const()[name = string("x_eps_3_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_3_cast_fp16 = expand_dims(axes = x_eps_3_axes_0, x = var_201_cast_fp16)[name = string("x_eps_3_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_3_cast_fp16 = expand_dims(axes = x_eps_3_axes_0, x = var_207_cast_fp16)[name = string("x_eps_3_cast_fp16")];
             tensor<int32, [1]> norm_x_3_axes_0 = const()[name = string("norm_x_3_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_3_cast_fp16 = reduce_l2_norm(axes = norm_x_3_axes_0, keep_dims = var_35, x = x_eps_3_cast_fp16)[name = string("norm_x_3_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_7_cast_fp16 = real_div(x = x_11_cast_fp16, y = norm_x_3_cast_fp16)[name = string("x_normed_7_cast_fp16")];
-            fp16 var_206_to_fp16 = const()[name = string("op_206_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_9_cast_fp16 = mul(x = x_normed_7_cast_fp16, y = var_206_to_fp16)[name = string("x_normed_9_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_3_cast_fp16 = reduce_l2_norm(axes = norm_x_3_axes_0, keep_dims = var_35, x = x_eps_3_cast_fp16)[name = string("norm_x_3_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_7_cast_fp16 = real_div(x = x_11_cast_fp16, y = norm_x_3_cast_fp16)[name = string("x_normed_7_cast_fp16")];
+            fp16 var_212_to_fp16 = const()[name = string("op_212_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_9_cast_fp16 = mul(x = x_normed_7_cast_fp16, y = var_212_to_fp16)[name = string("x_normed_9_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_2_weight_to_fp16 = const()[name = string("blocks_0_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303609216)))];
-            tensor<fp16, [1, 4096, 1, 1]> input_3_cast_fp16 = mul(x = x_normed_9_cast_fp16, y = blocks_0_norm_2_weight_to_fp16)[name = string("input_3_cast_fp16")];
-            tensor<int32, [2]> var_218 = const()[name = string("op_218"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_220 = const()[name = string("op_220"), val = tensor<int32, [2]>([1, 1])];
-            string var_222_pad_type_0 = const()[name = string("op_222_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_222_pad_0 = const()[name = string("op_222_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_222_cast_fp16 = conv(dilations = var_220, groups = var_31, pad = var_222_pad_0, pad_type = var_222_pad_type_0, strides = var_218, weight = blocks_0_mlp_fc_1_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_222_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303617472)))];
-            tensor<fp16, [1, 11008, 1, 1]> input_5_cast_fp16 = mul(x = var_222_cast_fp16, y = blocks_0_mlp_fc_1_output_scales_to_fp16)[name = string("input_5_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> input_3_cast_fp16 = mul(x = x_normed_9_cast_fp16, y = blocks_0_norm_2_weight_to_fp16)[name = string("input_3_cast_fp16")];
+            tensor<int32, [2]> var_224 = const()[name = string("op_224"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_226 = const()[name = string("op_226"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_228 = const()[name = string("op_228"), val = tensor<int32, [2]>([1, 1])];
-            string var_230_pad_type_0 = const()[name = string("op_230_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_230_pad_0 = const()[name = string("op_230_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_230_cast_fp16 = conv(dilations = var_228, groups = var_31, pad = var_230_pad_0, pad_type = var_230_pad_type_0, strides = var_226, weight = blocks_0_mlp_fc_2_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_230_cast_fp16")];
+            string var_228_pad_type_0 = const()[name = string("op_228_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_228_pad_0 = const()[name = string("op_228_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_228_cast_fp16 = conv(dilations = var_226, groups = var_31, pad = var_228_pad_0, pad_type = var_228_pad_type_0, strides = var_224, weight = blocks_0_mlp_fc_1_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_228_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303617472)))];
+            tensor<fp16, [1, 11008, 1, 4]> input_5_cast_fp16 = mul(x = var_228_cast_fp16, y = blocks_0_mlp_fc_1_output_scales_to_fp16)[name = string("input_5_cast_fp16")];
+            tensor<int32, [2]> var_232 = const()[name = string("op_232"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_234 = const()[name = string("op_234"), val = tensor<int32, [2]>([1, 1])];
+            string var_236_pad_type_0 = const()[name = string("op_236_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_236_pad_0 = const()[name = string("op_236_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_236_cast_fp16 = conv(dilations = var_234, groups = var_31, pad = var_236_pad_0, pad_type = var_236_pad_type_0, strides = var_232, weight = blocks_0_mlp_fc_2_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_236_cast_fp16")];
             tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303639552)))];
-            tensor<fp16, [1, 11008, 1, 1]> x_fc_2_1_cast_fp16 = mul(x = var_230_cast_fp16, y = blocks_0_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_1_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> var_232_cast_fp16 = silu(x = input_5_cast_fp16)[name = string("op_232_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> input_7_cast_fp16 = mul(x = var_232_cast_fp16, y = x_fc_2_1_cast_fp16)[name = string("input_7_cast_fp16")];
-            tensor<int32, [2]> var_236 = const()[name = string("op_236"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_238 = const()[name = string("op_238"), val = tensor<int32, [2]>([1, 1])];
-            string var_240_pad_type_0 = const()[name = string("op_240_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_240_pad_0 = const()[name = string("op_240_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_240_cast_fp16 = conv(dilations = var_238, groups = var_31, pad = var_240_pad_0, pad_type = var_240_pad_type_0, strides = var_236, weight = blocks_0_mlp_proj_weight_palettized_cast_fp16, x = input_7_cast_fp16)[name = string("op_240_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> x_fc_2_1_cast_fp16 = mul(x = var_236_cast_fp16, y = blocks_0_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_1_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> var_238_cast_fp16 = silu(x = input_5_cast_fp16)[name = string("op_238_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> input_7_cast_fp16 = mul(x = var_238_cast_fp16, y = x_fc_2_1_cast_fp16)[name = string("input_7_cast_fp16")];
+            tensor<int32, [2]> var_242 = const()[name = string("op_242"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_244 = const()[name = string("op_244"), val = tensor<int32, [2]>([1, 1])];
+            string var_246_pad_type_0 = const()[name = string("op_246_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_246_pad_0 = const()[name = string("op_246_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_246_cast_fp16 = conv(dilations = var_244, groups = var_31, pad = var_246_pad_0, pad_type = var_246_pad_type_0, strides = var_242, weight = blocks_0_mlp_proj_weight_palettized_cast_fp16, x = input_7_cast_fp16)[name = string("op_246_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303661632)))];
-            tensor<fp16, [1, 4096, 1, 1]> var_241_cast_fp16 = mul(x = var_240_cast_fp16, y = blocks_0_mlp_proj_output_scales_to_fp16)[name = string("op_241_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_15_cast_fp16 = add(x = var_241_cast_fp16, y = x_11_cast_fp16)[name = string("x_15_cast_fp16")];
-            int32 var_252 = const()[name = string("op_252"), val = int32(-1)];
-            int32 var_260 = const()[name = string("op_260"), val = int32(3)];
-            int32 var_261 = const()[name = string("op_261"), val = int32(1)];
-            int32 var_264 = const()[name = string("op_264"), val = int32(-2)];
-            bool var_265 = const()[name = string("op_265"), val = bool(true)];
-            tensor<int32, [1]> var_282_axes_0 = const()[name = string("op_282_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_282_cast_fp16 = squeeze(axes = var_282_axes_0, x = x_15_cast_fp16)[name = string("op_282_cast_fp16")];
-            bool var_284_interleave_0 = const()[name = string("op_284_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_5_to_fp16 = const()[name = string("eps_chan_5_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_284_cast_fp16 = concat(axis = var_261, interleave = var_284_interleave_0, values = (var_282_cast_fp16, eps_chan_5_to_fp16))[name = string("op_284_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> var_247_cast_fp16 = mul(x = var_246_cast_fp16, y = blocks_0_mlp_proj_output_scales_to_fp16)[name = string("op_247_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_15_cast_fp16 = add(x = var_247_cast_fp16, y = x_11_cast_fp16)[name = string("x_15_cast_fp16")];
+            int32 var_258 = const()[name = string("op_258"), val = int32(-1)];
+            int32 var_266 = const()[name = string("op_266"), val = int32(3)];
+            int32 var_267 = const()[name = string("op_267"), val = int32(1)];
+            int32 var_270 = const()[name = string("op_270"), val = int32(-2)];
+            bool var_271 = const()[name = string("op_271"), val = bool(true)];
+            tensor<int32, [1]> var_288_axes_0 = const()[name = string("op_288_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_288_cast_fp16 = squeeze(axes = var_288_axes_0, x = x_15_cast_fp16)[name = string("op_288_cast_fp16")];
+            bool var_290_interleave_0 = const()[name = string("op_290_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_5_to_fp16 = const()[name = string("eps_chan_5_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_290_cast_fp16 = concat(axis = var_267, interleave = var_290_interleave_0, values = (var_288_cast_fp16, eps_chan_5_to_fp16))[name = string("op_290_cast_fp16")];
             tensor<int32, [1]> x_eps_5_axes_0 = const()[name = string("x_eps_5_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_5_cast_fp16 = expand_dims(axes = x_eps_5_axes_0, x = var_284_cast_fp16)[name = string("x_eps_5_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_5_cast_fp16 = expand_dims(axes = x_eps_5_axes_0, x = var_290_cast_fp16)[name = string("x_eps_5_cast_fp16")];
             tensor<int32, [1]> norm_x_5_axes_0 = const()[name = string("norm_x_5_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_5_cast_fp16 = reduce_l2_norm(axes = norm_x_5_axes_0, keep_dims = var_265, x = x_eps_5_cast_fp16)[name = string("norm_x_5_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_13_cast_fp16 = real_div(x = x_15_cast_fp16, y = norm_x_5_cast_fp16)[name = string("x_normed_13_cast_fp16")];
-            fp16 var_289_to_fp16 = const()[name = string("op_289_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_289_to_fp16)[name = string("x_normed_15_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_5_cast_fp16 = reduce_l2_norm(axes = norm_x_5_axes_0, keep_dims = var_271, x = x_eps_5_cast_fp16)[name = string("norm_x_5_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_13_cast_fp16 = real_div(x = x_15_cast_fp16, y = norm_x_5_cast_fp16)[name = string("x_normed_13_cast_fp16")];
+            fp16 var_295_to_fp16 = const()[name = string("op_295_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_295_to_fp16)[name = string("x_normed_15_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_1_weight_to_fp16 = const()[name = string("blocks_1_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303669888)))];
-            tensor<fp16, [1, 4096, 1, 1]> x_19_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = string("x_19_cast_fp16")];
-            tensor<int32, [2]> var_304 = const()[name = string("op_304"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_306 = const()[name = string("op_306"), val = tensor<int32, [2]>([1, 1])];
-            string var_308_pad_type_0 = const()[name = string("op_308_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_308_pad_0 = const()[name = string("op_308_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_308_cast_fp16 = conv(dilations = var_306, groups = var_261, pad = var_308_pad_0, pad_type = var_308_pad_type_0, strides = var_304, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_308_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_19_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = string("x_19_cast_fp16")];
+            tensor<int32, [2]> var_311 = const()[name = string("op_311"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_313 = const()[name = string("op_313"), val = tensor<int32, [2]>([1, 1])];
+            string var_315_pad_type_0 = const()[name = string("op_315_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_315_pad_0 = const()[name = string("op_315_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_315_cast_fp16 = conv(dilations = var_313, groups = var_267, pad = var_315_pad_0, pad_type = var_315_pad_type_0, strides = var_311, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_315_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303678144)))];
-            tensor<fp16, [1, 4096, 1, 1]> q_7_cast_fp16 = mul(x = var_308_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
-            tensor<int32, [2]> var_312 = const()[name = string("op_312"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_314 = const()[name = string("op_314"), val = tensor<int32, [2]>([1, 1])];
-            string var_316_pad_type_0 = const()[name = string("op_316_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_316_pad_0 = const()[name = string("op_316_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_316_cast_fp16 = conv(dilations = var_314, groups = var_261, pad = var_316_pad_0, pad_type = var_316_pad_type_0, strides = var_312, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_316_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> q_7_cast_fp16 = mul(x = var_315_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
+            tensor<int32, [2]> var_319 = const()[name = string("op_319"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_321 = const()[name = string("op_321"), val = tensor<int32, [2]>([1, 1])];
+            string var_323_pad_type_0 = const()[name = string("op_323_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_323_pad_0 = const()[name = string("op_323_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_323_cast_fp16 = conv(dilations = var_321, groups = var_267, pad = var_323_pad_0, pad_type = var_323_pad_type_0, strides = var_319, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_323_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303686400)))];
-            tensor<fp16, [1, 4096, 1, 1]> k_9_cast_fp16 = mul(x = var_316_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_9_cast_fp16")];
-            tensor<int32, [2]> var_320 = const()[name = string("op_320"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_322 = const()[name = string("op_322"), val = tensor<int32, [2]>([1, 1])];
-            string var_324_pad_type_0 = const()[name = string("op_324_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_324_pad_0 = const()[name = string("op_324_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_324_cast_fp16 = conv(dilations = var_322, groups = var_261, pad = var_324_pad_0, pad_type = var_324_pad_type_0, strides = var_320, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_324_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> k_9_cast_fp16 = mul(x = var_323_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_9_cast_fp16")];
+            tensor<int32, [2]> var_327 = const()[name = string("op_327"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_329 = const()[name = string("op_329"), val = tensor<int32, [2]>([1, 1])];
+            string var_331_pad_type_0 = const()[name = string("op_331_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_331_pad_0 = const()[name = string("op_331_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_331_cast_fp16 = conv(dilations = var_329, groups = var_267, pad = var_331_pad_0, pad_type = var_331_pad_type_0, strides = var_327, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_331_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303694656)))];
-            tensor<fp16, [1, 4096, 1, 1]> v_7_cast_fp16 = mul(x = var_324_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_7_cast_fp16")];
-            tensor<int32, [4]> var_326 = const()[name = string("op_326"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> q_9_cast_fp16 = reshape(shape = var_326, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
-            tensor<int32, [4]> var_328 = const()[name = string("op_328"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> k_11_cast_fp16 = reshape(shape = var_328, x = k_9_cast_fp16)[name = string("k_11_cast_fp16")];
-            tensor<int32, [4]> var_330 = const()[name = string("op_330"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> v_9_cast_fp16 = reshape(shape = var_330, x = v_7_cast_fp16)[name = string("v_9_cast_fp16")];
-            tensor<int32, [4]> var_342_begin_0 = const()[name = string("op_342_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_342_end_0 = const()[name = string("op_342_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_342_end_mask_0 = const()[name = string("op_342_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_342_cast_fp16 = slice_by_index(begin = var_342_begin_0, end = var_342_end_0, end_mask = var_342_end_mask_0, x = q_9_cast_fp16)[name = string("op_342_cast_fp16")];
-            tensor<int32, [4]> var_348_begin_0 = const()[name = string("op_348_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_348_end_0 = const()[name = string("op_348_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_348_end_mask_0 = const()[name = string("op_348_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_348_cast_fp16 = slice_by_index(begin = var_348_begin_0, end = var_348_end_0, end_mask = var_348_end_mask_0, x = q_9_cast_fp16)[name = string("op_348_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> v_9_cast_fp16 = mul(x = var_331_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_9_cast_fp16")];
+            tensor<int32, [4]> var_333 = const()[name = string("op_333"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> q_9_cast_fp16 = reshape(shape = var_333, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
+            tensor<int32, [4]> var_335 = const()[name = string("op_335"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> k_11_cast_fp16 = reshape(shape = var_335, x = k_9_cast_fp16)[name = string("k_11_cast_fp16")];
+            tensor<int32, [4]> var_337 = const()[name = string("op_337"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> v_11_cast_fp16 = reshape(shape = var_337, x = v_9_cast_fp16)[name = string("v_11_cast_fp16")];
+            tensor<int32, [4]> var_349_begin_0 = const()[name = string("op_349_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_349_end_0 = const()[name = string("op_349_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_349_end_mask_0 = const()[name = string("op_349_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_349_cast_fp16 = slice_by_index(begin = var_349_begin_0, end = var_349_end_0, end_mask = var_349_end_mask_0, x = q_9_cast_fp16)[name = string("op_349_cast_fp16")];
+            tensor<int32, [4]> var_355_begin_0 = const()[name = string("op_355_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_355_end_0 = const()[name = string("op_355_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_355_end_mask_0 = const()[name = string("op_355_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_355_cast_fp16 = slice_by_index(begin = var_355_begin_0, end = var_355_end_0, end_mask = var_355_end_mask_0, x = q_9_cast_fp16)[name = string("op_355_cast_fp16")];
             fp16 const_19_promoted_to_fp16 = const()[name = string("const_19_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_350_cast_fp16 = mul(x = var_348_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_350_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_357_cast_fp16 = mul(x = var_355_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_357_cast_fp16")];
             bool rotated_5_interleave_0 = const()[name = string("rotated_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_5_cast_fp16 = concat(axis = var_264, interleave = rotated_5_interleave_0, values = (var_350_cast_fp16, var_342_cast_fp16))[name = string("rotated_5_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_353_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_353_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_354_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_354_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_5_cast_fp16 = add(x = var_353_cast_fp16, y = var_354_cast_fp16)[name = string("roped_5_cast_fp16")];
-            tensor<int32, [4]> var_367_begin_0 = const()[name = string("op_367_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_367_end_0 = const()[name = string("op_367_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_367_end_mask_0 = const()[name = string("op_367_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_367_cast_fp16 = slice_by_index(begin = var_367_begin_0, end = var_367_end_0, end_mask = var_367_end_mask_0, x = k_11_cast_fp16)[name = string("op_367_cast_fp16")];
-            tensor<int32, [4]> var_373_begin_0 = const()[name = string("op_373_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_373_end_0 = const()[name = string("op_373_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_373_end_mask_0 = const()[name = string("op_373_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_373_cast_fp16 = slice_by_index(begin = var_373_begin_0, end = var_373_end_0, end_mask = var_373_end_mask_0, x = k_11_cast_fp16)[name = string("op_373_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_5_cast_fp16 = concat(axis = var_270, interleave = rotated_5_interleave_0, values = (var_357_cast_fp16, var_349_cast_fp16))[name = string("rotated_5_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_360_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_360_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_361_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_361_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_5_cast_fp16 = add(x = var_360_cast_fp16, y = var_361_cast_fp16)[name = string("roped_5_cast_fp16")];
+            tensor<int32, [4]> var_374_begin_0 = const()[name = string("op_374_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_374_end_0 = const()[name = string("op_374_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_374_end_mask_0 = const()[name = string("op_374_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_374_cast_fp16 = slice_by_index(begin = var_374_begin_0, end = var_374_end_0, end_mask = var_374_end_mask_0, x = k_11_cast_fp16)[name = string("op_374_cast_fp16")];
+            tensor<int32, [4]> var_380_begin_0 = const()[name = string("op_380_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_380_end_0 = const()[name = string("op_380_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_380_end_mask_0 = const()[name = string("op_380_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_380_cast_fp16 = slice_by_index(begin = var_380_begin_0, end = var_380_end_0, end_mask = var_380_end_mask_0, x = k_11_cast_fp16)[name = string("op_380_cast_fp16")];
             fp16 const_21_promoted_to_fp16 = const()[name = string("const_21_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_375_cast_fp16 = mul(x = var_373_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_375_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_382_cast_fp16 = mul(x = var_380_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_382_cast_fp16")];
             bool rotated_7_interleave_0 = const()[name = string("rotated_7_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_7_cast_fp16 = concat(axis = var_264, interleave = rotated_7_interleave_0, values = (var_375_cast_fp16, var_367_cast_fp16))[name = string("rotated_7_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_378_cast_fp16 = mul(x = k_11_cast_fp16, y = cos)[name = string("op_378_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_379_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = string("op_379_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_7_cast_fp16 = add(x = var_378_cast_fp16, y = var_379_cast_fp16)[name = string("roped_7_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_7_cast_fp16 = concat(axis = var_270, interleave = rotated_7_interleave_0, values = (var_382_cast_fp16, var_374_cast_fp16))[name = string("rotated_7_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_385_cast_fp16 = mul(x = k_11_cast_fp16, y = cos)[name = string("op_385_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_386_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = string("op_386_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_7_cast_fp16 = add(x = var_385_cast_fp16, y = var_386_cast_fp16)[name = string("roped_7_cast_fp16")];
+            tensor<int32, [4]> v_13_perm_0 = const()[name = string("v_13_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             bool k_15_interleave_0 = const()[name = string("k_15_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = concat(axis = var_252, interleave = k_15_interleave_0, values = (k_cache_1, roped_7_cast_fp16))[name = string("k_15_cast_fp16")];
-            bool v_11_interleave_0 = const()[name = string("v_11_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> v_11_cast_fp16 = concat(axis = var_252, interleave = v_11_interleave_0, values = (v_cache_1, v_9_cast_fp16))[name = string("v_11_cast_fp16")];
-            tensor<int32, [4]> var_386_begin_0 = const()[name = string("op_386_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_386_end_0 = const()[name = string("op_386_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_386_end_mask_0 = const()[name = string("op_386_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_1 = slice_by_index(begin = var_386_begin_0, end = var_386_end_0, end_mask = var_386_end_mask_0, x = k_15_cast_fp16)[name = string("op_386_cast_fp16")];
-            tensor<int32, [4]> var_387_begin_0 = const()[name = string("op_387_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_387_end_0 = const()[name = string("op_387_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_387_end_mask_0 = const()[name = string("op_387_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_1 = slice_by_index(begin = var_387_begin_0, end = var_387_end_0, end_mask = var_387_end_mask_0, x = v_11_cast_fp16)[name = string("op_387_cast_fp16")];
-            fp16 var_391_to_fp16 = const()[name = string("op_391_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 1]> var_392_cast_fp16 = mul(x = roped_5_cast_fp16, y = var_391_to_fp16)[name = string("op_392_cast_fp16")];
-            bool attn_weights_5_transpose_x_0 = const()[name = string("attn_weights_5_transpose_x_0"), val = bool(true)];
-            bool attn_weights_5_transpose_y_0 = const()[name = string("attn_weights_5_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_5_cast_fp16 = matmul(transpose_x = attn_weights_5_transpose_x_0, transpose_y = attn_weights_5_transpose_y_0, x = var_392_cast_fp16, y = k_15_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_7_cast_fp16 = add(x = attn_weights_5_cast_fp16, y = mask)[name = string("attn_weights_7_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> var_400_cast_fp16 = softmax(axis = var_260, x = attn_weights_7_cast_fp16)[name = string("op_400_cast_fp16")];
-            bool attn_5_transpose_x_0 = const()[name = string("attn_5_transpose_x_0"), val = bool(false)];
-            bool attn_5_transpose_y_0 = const()[name = string("attn_5_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 1]> attn_5_cast_fp16 = matmul(transpose_x = attn_5_transpose_x_0, transpose_y = attn_5_transpose_y_0, x = v_11_cast_fp16, y = var_400_cast_fp16)[name = string("attn_5_cast_fp16")];
-            tensor<int32, [4]> var_404 = const()[name = string("op_404"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
-            tensor<fp16, [1, 4096, 1, 1]> input_9_cast_fp16 = reshape(shape = var_404, x = attn_5_cast_fp16)[name = string("input_9_cast_fp16")];
-            tensor<int32, [2]> var_408 = const()[name = string("op_408"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_410 = const()[name = string("op_410"), val = tensor<int32, [2]>([1, 1])];
-            string var_412_pad_type_0 = const()[name = string("op_412_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_412_pad_0 = const()[name = string("op_412_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_412_cast_fp16 = conv(dilations = var_410, groups = var_261, pad = var_412_pad_0, pad_type = var_412_pad_type_0, strides = var_408, weight = blocks_1_attn_proj_weight_palettized_cast_fp16, x = input_9_cast_fp16)[name = string("op_412_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = concat(axis = var_258, interleave = k_15_interleave_0, values = (k_cache_1, roped_7_cast_fp16))[name = string("k_15_cast_fp16")];
+            bool v_15_interleave_0 = const()[name = string("v_15_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> v_13_cast_fp16 = transpose(perm = v_13_perm_0, x = v_11_cast_fp16)[name = string("transpose_3")];
+            tensor<fp16, [1, 32, 512, 128]> v_15_cast_fp16 = concat(axis = var_270, interleave = v_15_interleave_0, values = (v_cache_1, v_13_cast_fp16))[name = string("v_15_cast_fp16")];
+            tensor<int32, [4]> var_397_begin_0 = const()[name = string("op_397_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_397_end_0 = const()[name = string("op_397_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_397_end_mask_0 = const()[name = string("op_397_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_1 = slice_by_index(begin = var_397_begin_0, end = var_397_end_0, end_mask = var_397_end_mask_0, x = k_15_cast_fp16)[name = string("op_397_cast_fp16")];
+            tensor<int32, [4]> var_398_begin_0 = const()[name = string("op_398_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_398_end_0 = const()[name = string("op_398_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_398_end_mask_0 = const()[name = string("op_398_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_1 = slice_by_index(begin = var_398_begin_0, end = var_398_end_0, end_mask = var_398_end_mask_0, x = v_15_cast_fp16)[name = string("op_398_cast_fp16")];
+            fp16 var_403_to_fp16 = const()[name = string("op_403_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 32, 128, 4]> var_404_cast_fp16 = mul(x = roped_5_cast_fp16, y = var_403_to_fp16)[name = string("op_404_cast_fp16")];
+            bool attn_weights_7_transpose_x_0 = const()[name = string("attn_weights_7_transpose_x_0"), val = bool(true)];
+            bool attn_weights_7_transpose_y_0 = const()[name = string("attn_weights_7_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_7_cast_fp16 = matmul(transpose_x = attn_weights_7_transpose_x_0, transpose_y = attn_weights_7_transpose_y_0, x = var_404_cast_fp16, y = k_15_cast_fp16)[name = string("attn_weights_7_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_9_cast_fp16 = add(x = attn_weights_7_cast_fp16, y = mask)[name = string("attn_weights_9_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_11_cast_fp16 = softmax(axis = var_266, x = attn_weights_9_cast_fp16)[name = string("attn_weights_11_cast_fp16")];
+            bool var_413_transpose_x_0 = const()[name = string("op_413_transpose_x_0"), val = bool(false)];
+            bool var_413_transpose_y_0 = const()[name = string("op_413_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> var_413_cast_fp16 = matmul(transpose_x = var_413_transpose_x_0, transpose_y = var_413_transpose_y_0, x = attn_weights_11_cast_fp16, y = v_15_cast_fp16)[name = string("op_413_cast_fp16")];
+            tensor<int32, [4]> attn_3_perm_0 = const()[name = string("attn_3_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_416 = const()[name = string("op_416"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 4]> attn_3_cast_fp16 = transpose(perm = attn_3_perm_0, x = var_413_cast_fp16)[name = string("transpose_2")];
+            tensor<fp16, [1, 4096, 1, 4]> input_9_cast_fp16 = reshape(shape = var_416, x = attn_3_cast_fp16)[name = string("input_9_cast_fp16")];
+            tensor<int32, [2]> var_420 = const()[name = string("op_420"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_422 = const()[name = string("op_422"), val = tensor<int32, [2]>([1, 1])];
+            string var_424_pad_type_0 = const()[name = string("op_424_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_424_pad_0 = const()[name = string("op_424_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_424_cast_fp16 = conv(dilations = var_422, groups = var_267, pad = var_424_pad_0, pad_type = var_424_pad_type_0, strides = var_420, weight = blocks_1_attn_proj_weight_palettized_cast_fp16, x = input_9_cast_fp16)[name = string("op_424_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303702912)))];
-            tensor<fp16, [1, 4096, 1, 1]> attention_output_3_cast_fp16 = mul(x = var_412_cast_fp16, y = blocks_1_attn_proj_output_scales_to_fp16)[name = string("attention_output_3_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_25_cast_fp16 = add(x = attention_output_3_cast_fp16, y = x_15_cast_fp16)[name = string("x_25_cast_fp16")];
-            tensor<int32, [1]> var_431_axes_0 = const()[name = string("op_431_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_431_cast_fp16 = squeeze(axes = var_431_axes_0, x = x_25_cast_fp16)[name = string("op_431_cast_fp16")];
-            bool var_433_interleave_0 = const()[name = string("op_433_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_7_to_fp16 = const()[name = string("eps_chan_7_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_433_cast_fp16 = concat(axis = var_261, interleave = var_433_interleave_0, values = (var_431_cast_fp16, eps_chan_7_to_fp16))[name = string("op_433_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> attention_output_3_cast_fp16 = mul(x = var_424_cast_fp16, y = blocks_1_attn_proj_output_scales_to_fp16)[name = string("attention_output_3_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_25_cast_fp16 = add(x = attention_output_3_cast_fp16, y = x_15_cast_fp16)[name = string("x_25_cast_fp16")];
+            tensor<int32, [1]> var_443_axes_0 = const()[name = string("op_443_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_443_cast_fp16 = squeeze(axes = var_443_axes_0, x = x_25_cast_fp16)[name = string("op_443_cast_fp16")];
+            bool var_445_interleave_0 = const()[name = string("op_445_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_7_to_fp16 = const()[name = string("eps_chan_7_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_445_cast_fp16 = concat(axis = var_267, interleave = var_445_interleave_0, values = (var_443_cast_fp16, eps_chan_7_to_fp16))[name = string("op_445_cast_fp16")];
             tensor<int32, [1]> x_eps_7_axes_0 = const()[name = string("x_eps_7_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_7_cast_fp16 = expand_dims(axes = x_eps_7_axes_0, x = var_433_cast_fp16)[name = string("x_eps_7_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_7_cast_fp16 = expand_dims(axes = x_eps_7_axes_0, x = var_445_cast_fp16)[name = string("x_eps_7_cast_fp16")];
             tensor<int32, [1]> norm_x_7_axes_0 = const()[name = string("norm_x_7_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_7_cast_fp16 = reduce_l2_norm(axes = norm_x_7_axes_0, keep_dims = var_265, x = x_eps_7_cast_fp16)[name = string("norm_x_7_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_19_cast_fp16 = real_div(x = x_25_cast_fp16, y = norm_x_7_cast_fp16)[name = string("x_normed_19_cast_fp16")];
-            fp16 var_438_to_fp16 = const()[name = string("op_438_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_21_cast_fp16 = mul(x = x_normed_19_cast_fp16, y = var_438_to_fp16)[name = string("x_normed_21_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_7_cast_fp16 = reduce_l2_norm(axes = norm_x_7_axes_0, keep_dims = var_271, x = x_eps_7_cast_fp16)[name = string("norm_x_7_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_19_cast_fp16 = real_div(x = x_25_cast_fp16, y = norm_x_7_cast_fp16)[name = string("x_normed_19_cast_fp16")];
+            fp16 var_450_to_fp16 = const()[name = string("op_450_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_21_cast_fp16 = mul(x = x_normed_19_cast_fp16, y = var_450_to_fp16)[name = string("x_normed_21_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_2_weight_to_fp16 = const()[name = string("blocks_1_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303711168)))];
-            tensor<fp16, [1, 4096, 1, 1]> input_11_cast_fp16 = mul(x = x_normed_21_cast_fp16, y = blocks_1_norm_2_weight_to_fp16)[name = string("input_11_cast_fp16")];
-            tensor<int32, [2]> var_450 = const()[name = string("op_450"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_452 = const()[name = string("op_452"), val = tensor<int32, [2]>([1, 1])];
-            string var_454_pad_type_0 = const()[name = string("op_454_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_454_pad_0 = const()[name = string("op_454_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_454_cast_fp16 = conv(dilations = var_452, groups = var_261, pad = var_454_pad_0, pad_type = var_454_pad_type_0, strides = var_450, weight = blocks_1_mlp_fc_1_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_454_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> input_11_cast_fp16 = mul(x = x_normed_21_cast_fp16, y = blocks_1_norm_2_weight_to_fp16)[name = string("input_11_cast_fp16")];
+            tensor<int32, [2]> var_462 = const()[name = string("op_462"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_464 = const()[name = string("op_464"), val = tensor<int32, [2]>([1, 1])];
+            string var_466_pad_type_0 = const()[name = string("op_466_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_466_pad_0 = const()[name = string("op_466_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_466_cast_fp16 = conv(dilations = var_464, groups = var_267, pad = var_466_pad_0, pad_type = var_466_pad_type_0, strides = var_462, weight = blocks_1_mlp_fc_1_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_466_cast_fp16")];
             tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303719424)))];
-            tensor<fp16, [1, 11008, 1, 1]> input_13_cast_fp16 = mul(x = var_454_cast_fp16, y = blocks_1_mlp_fc_1_output_scales_to_fp16)[name = string("input_13_cast_fp16")];
-            tensor<int32, [2]> var_458 = const()[name = string("op_458"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_460 = const()[name = string("op_460"), val = tensor<int32, [2]>([1, 1])];
-            string var_462_pad_type_0 = const()[name = string("op_462_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_462_pad_0 = const()[name = string("op_462_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_462_cast_fp16 = conv(dilations = var_460, groups = var_261, pad = var_462_pad_0, pad_type = var_462_pad_type_0, strides = var_458, weight = blocks_1_mlp_fc_2_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_462_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303741504)))];
-            tensor<fp16, [1, 11008, 1, 1]> x_fc_2_3_cast_fp16 = mul(x = var_462_cast_fp16, y = blocks_1_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_3_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> var_464_cast_fp16 = silu(x = input_13_cast_fp16)[name = string("op_464_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> input_15_cast_fp16 = mul(x = var_464_cast_fp16, y = x_fc_2_3_cast_fp16)[name = string("input_15_cast_fp16")];
-            tensor<int32, [2]> var_468 = const()[name = string("op_468"), val = tensor<int32, [2]>([1, 1])];
+            tensor<fp16, [1, 11008, 1, 4]> input_13_cast_fp16 = mul(x = var_466_cast_fp16, y = blocks_1_mlp_fc_1_output_scales_to_fp16)[name = string("input_13_cast_fp16")];
             tensor<int32, [2]> var_470 = const()[name = string("op_470"), val = tensor<int32, [2]>([1, 1])];
-            string var_472_pad_type_0 = const()[name = string("op_472_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_472_pad_0 = const()[name = string("op_472_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_472_cast_fp16 = conv(dilations = var_470, groups = var_261, pad = var_472_pad_0, pad_type = var_472_pad_type_0, strides = var_468, weight = blocks_1_mlp_proj_weight_palettized_cast_fp16, x = input_15_cast_fp16)[name = string("op_472_cast_fp16")];
+            tensor<int32, [2]> var_472 = const()[name = string("op_472"), val = tensor<int32, [2]>([1, 1])];
+            string var_474_pad_type_0 = const()[name = string("op_474_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_474_pad_0 = const()[name = string("op_474_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_474_cast_fp16 = conv(dilations = var_472, groups = var_267, pad = var_474_pad_0, pad_type = var_474_pad_type_0, strides = var_470, weight = blocks_1_mlp_fc_2_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_474_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303741504)))];
+            tensor<fp16, [1, 11008, 1, 4]> x_fc_2_3_cast_fp16 = mul(x = var_474_cast_fp16, y = blocks_1_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_3_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> var_476_cast_fp16 = silu(x = input_13_cast_fp16)[name = string("op_476_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> input_15_cast_fp16 = mul(x = var_476_cast_fp16, y = x_fc_2_3_cast_fp16)[name = string("input_15_cast_fp16")];
+            tensor<int32, [2]> var_480 = const()[name = string("op_480"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_482 = const()[name = string("op_482"), val = tensor<int32, [2]>([1, 1])];
+            string var_484_pad_type_0 = const()[name = string("op_484_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_484_pad_0 = const()[name = string("op_484_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_484_cast_fp16 = conv(dilations = var_482, groups = var_267, pad = var_484_pad_0, pad_type = var_484_pad_type_0, strides = var_480, weight = blocks_1_mlp_proj_weight_palettized_cast_fp16, x = input_15_cast_fp16)[name = string("op_484_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303763584)))];
-            tensor<fp16, [1, 4096, 1, 1]> var_473_cast_fp16 = mul(x = var_472_cast_fp16, y = blocks_1_mlp_proj_output_scales_to_fp16)[name = string("op_473_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_29_cast_fp16 = add(x = var_473_cast_fp16, y = x_25_cast_fp16)[name = string("x_29_cast_fp16")];
-            int32 var_484 = const()[name = string("op_484"), val = int32(-1)];
-            int32 var_492 = const()[name = string("op_492"), val = int32(3)];
-            int32 var_493 = const()[name = string("op_493"), val = int32(1)];
-            int32 var_496 = const()[name = string("op_496"), val = int32(-2)];
-            bool var_497 = const()[name = string("op_497"), val = bool(true)];
-            tensor<int32, [1]> var_514_axes_0 = const()[name = string("op_514_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_514_cast_fp16 = squeeze(axes = var_514_axes_0, x = x_29_cast_fp16)[name = string("op_514_cast_fp16")];
-            bool var_516_interleave_0 = const()[name = string("op_516_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_9_to_fp16 = const()[name = string("eps_chan_9_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_516_cast_fp16 = concat(axis = var_493, interleave = var_516_interleave_0, values = (var_514_cast_fp16, eps_chan_9_to_fp16))[name = string("op_516_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> var_485_cast_fp16 = mul(x = var_484_cast_fp16, y = blocks_1_mlp_proj_output_scales_to_fp16)[name = string("op_485_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_29_cast_fp16 = add(x = var_485_cast_fp16, y = x_25_cast_fp16)[name = string("x_29_cast_fp16")];
+            int32 var_496 = const()[name = string("op_496"), val = int32(-1)];
+            int32 var_504 = const()[name = string("op_504"), val = int32(3)];
+            int32 var_505 = const()[name = string("op_505"), val = int32(1)];
+            int32 var_508 = const()[name = string("op_508"), val = int32(-2)];
+            bool var_509 = const()[name = string("op_509"), val = bool(true)];
+            tensor<int32, [1]> var_526_axes_0 = const()[name = string("op_526_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_526_cast_fp16 = squeeze(axes = var_526_axes_0, x = x_29_cast_fp16)[name = string("op_526_cast_fp16")];
+            bool var_528_interleave_0 = const()[name = string("op_528_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_9_to_fp16 = const()[name = string("eps_chan_9_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_528_cast_fp16 = concat(axis = var_505, interleave = var_528_interleave_0, values = (var_526_cast_fp16, eps_chan_9_to_fp16))[name = string("op_528_cast_fp16")];
             tensor<int32, [1]> x_eps_9_axes_0 = const()[name = string("x_eps_9_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_9_cast_fp16 = expand_dims(axes = x_eps_9_axes_0, x = var_516_cast_fp16)[name = string("x_eps_9_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_9_cast_fp16 = expand_dims(axes = x_eps_9_axes_0, x = var_528_cast_fp16)[name = string("x_eps_9_cast_fp16")];
             tensor<int32, [1]> norm_x_9_axes_0 = const()[name = string("norm_x_9_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_9_cast_fp16 = reduce_l2_norm(axes = norm_x_9_axes_0, keep_dims = var_497, x = x_eps_9_cast_fp16)[name = string("norm_x_9_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_25_cast_fp16 = real_div(x = x_29_cast_fp16, y = norm_x_9_cast_fp16)[name = string("x_normed_25_cast_fp16")];
-            fp16 var_521_to_fp16 = const()[name = string("op_521_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_27_cast_fp16 = mul(x = x_normed_25_cast_fp16, y = var_521_to_fp16)[name = string("x_normed_27_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_9_cast_fp16 = reduce_l2_norm(axes = norm_x_9_axes_0, keep_dims = var_509, x = x_eps_9_cast_fp16)[name = string("norm_x_9_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_25_cast_fp16 = real_div(x = x_29_cast_fp16, y = norm_x_9_cast_fp16)[name = string("x_normed_25_cast_fp16")];
+            fp16 var_533_to_fp16 = const()[name = string("op_533_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_27_cast_fp16 = mul(x = x_normed_25_cast_fp16, y = var_533_to_fp16)[name = string("x_normed_27_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_norm_1_weight_to_fp16 = const()[name = string("blocks_2_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303771840)))];
-            tensor<fp16, [1, 4096, 1, 1]> x_33_cast_fp16 = mul(x = x_normed_27_cast_fp16, y = blocks_2_norm_1_weight_to_fp16)[name = string("x_33_cast_fp16")];
-            tensor<int32, [2]> var_536 = const()[name = string("op_536"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_538 = const()[name = string("op_538"), val = tensor<int32, [2]>([1, 1])];
-            string var_540_pad_type_0 = const()[name = string("op_540_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_540_pad_0 = const()[name = string("op_540_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_540_cast_fp16 = conv(dilations = var_538, groups = var_493, pad = var_540_pad_0, pad_type = var_540_pad_type_0, strides = var_536, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_540_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_33_cast_fp16 = mul(x = x_normed_27_cast_fp16, y = blocks_2_norm_1_weight_to_fp16)[name = string("x_33_cast_fp16")];
+            tensor<int32, [2]> var_549 = const()[name = string("op_549"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_551 = const()[name = string("op_551"), val = tensor<int32, [2]>([1, 1])];
+            string var_553_pad_type_0 = const()[name = string("op_553_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_553_pad_0 = const()[name = string("op_553_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_553_cast_fp16 = conv(dilations = var_551, groups = var_505, pad = var_553_pad_0, pad_type = var_553_pad_type_0, strides = var_549, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_553_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303780096)))];
-            tensor<fp16, [1, 4096, 1, 1]> q_13_cast_fp16 = mul(x = var_540_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = string("q_13_cast_fp16")];
-            tensor<int32, [2]> var_544 = const()[name = string("op_544"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_546 = const()[name = string("op_546"), val = tensor<int32, [2]>([1, 1])];
-            string var_548_pad_type_0 = const()[name = string("op_548_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_548_pad_0 = const()[name = string("op_548_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_548_cast_fp16 = conv(dilations = var_546, groups = var_493, pad = var_548_pad_0, pad_type = var_548_pad_type_0, strides = var_544, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_548_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> q_13_cast_fp16 = mul(x = var_553_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = string("q_13_cast_fp16")];
+            tensor<int32, [2]> var_557 = const()[name = string("op_557"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_559 = const()[name = string("op_559"), val = tensor<int32, [2]>([1, 1])];
+            string var_561_pad_type_0 = const()[name = string("op_561_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_561_pad_0 = const()[name = string("op_561_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_561_cast_fp16 = conv(dilations = var_559, groups = var_505, pad = var_561_pad_0, pad_type = var_561_pad_type_0, strides = var_557, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_561_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303788352)))];
-            tensor<fp16, [1, 4096, 1, 1]> k_17_cast_fp16 = mul(x = var_548_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = string("k_17_cast_fp16")];
-            tensor<int32, [2]> var_552 = const()[name = string("op_552"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_554 = const()[name = string("op_554"), val = tensor<int32, [2]>([1, 1])];
-            string var_556_pad_type_0 = const()[name = string("op_556_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_556_pad_0 = const()[name = string("op_556_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_556_cast_fp16 = conv(dilations = var_554, groups = var_493, pad = var_556_pad_0, pad_type = var_556_pad_type_0, strides = var_552, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_556_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> k_17_cast_fp16 = mul(x = var_561_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = string("k_17_cast_fp16")];
+            tensor<int32, [2]> var_565 = const()[name = string("op_565"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_567 = const()[name = string("op_567"), val = tensor<int32, [2]>([1, 1])];
+            string var_569_pad_type_0 = const()[name = string("op_569_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_569_pad_0 = const()[name = string("op_569_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_569_cast_fp16 = conv(dilations = var_567, groups = var_505, pad = var_569_pad_0, pad_type = var_569_pad_type_0, strides = var_565, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_569_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303796608)))];
-            tensor<fp16, [1, 4096, 1, 1]> v_13_cast_fp16 = mul(x = var_556_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = string("v_13_cast_fp16")];
-            tensor<int32, [4]> var_558 = const()[name = string("op_558"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> q_15_cast_fp16 = reshape(shape = var_558, x = q_13_cast_fp16)[name = string("q_15_cast_fp16")];
-            tensor<int32, [4]> var_560 = const()[name = string("op_560"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> k_19_cast_fp16 = reshape(shape = var_560, x = k_17_cast_fp16)[name = string("k_19_cast_fp16")];
-            tensor<int32, [4]> var_562 = const()[name = string("op_562"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> v_15_cast_fp16 = reshape(shape = var_562, x = v_13_cast_fp16)[name = string("v_15_cast_fp16")];
-            tensor<int32, [4]> var_574_begin_0 = const()[name = string("op_574_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_574_end_0 = const()[name = string("op_574_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_574_end_mask_0 = const()[name = string("op_574_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_574_cast_fp16 = slice_by_index(begin = var_574_begin_0, end = var_574_end_0, end_mask = var_574_end_mask_0, x = q_15_cast_fp16)[name = string("op_574_cast_fp16")];
-            tensor<int32, [4]> var_580_begin_0 = const()[name = string("op_580_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_580_end_0 = const()[name = string("op_580_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_580_end_mask_0 = const()[name = string("op_580_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_580_cast_fp16 = slice_by_index(begin = var_580_begin_0, end = var_580_end_0, end_mask = var_580_end_mask_0, x = q_15_cast_fp16)[name = string("op_580_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> v_17_cast_fp16 = mul(x = var_569_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = string("v_17_cast_fp16")];
+            tensor<int32, [4]> var_571 = const()[name = string("op_571"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> q_15_cast_fp16 = reshape(shape = var_571, x = q_13_cast_fp16)[name = string("q_15_cast_fp16")];
+            tensor<int32, [4]> var_573 = const()[name = string("op_573"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> k_19_cast_fp16 = reshape(shape = var_573, x = k_17_cast_fp16)[name = string("k_19_cast_fp16")];
+            tensor<int32, [4]> var_575 = const()[name = string("op_575"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> v_19_cast_fp16 = reshape(shape = var_575, x = v_17_cast_fp16)[name = string("v_19_cast_fp16")];
+            tensor<int32, [4]> var_587_begin_0 = const()[name = string("op_587_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_587_end_0 = const()[name = string("op_587_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_587_end_mask_0 = const()[name = string("op_587_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_587_cast_fp16 = slice_by_index(begin = var_587_begin_0, end = var_587_end_0, end_mask = var_587_end_mask_0, x = q_15_cast_fp16)[name = string("op_587_cast_fp16")];
+            tensor<int32, [4]> var_593_begin_0 = const()[name = string("op_593_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_593_end_0 = const()[name = string("op_593_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_593_end_mask_0 = const()[name = string("op_593_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_593_cast_fp16 = slice_by_index(begin = var_593_begin_0, end = var_593_end_0, end_mask = var_593_end_mask_0, x = q_15_cast_fp16)[name = string("op_593_cast_fp16")];
             fp16 const_32_promoted_to_fp16 = const()[name = string("const_32_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_582_cast_fp16 = mul(x = var_580_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_582_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_595_cast_fp16 = mul(x = var_593_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_595_cast_fp16")];
             bool rotated_9_interleave_0 = const()[name = string("rotated_9_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_9_cast_fp16 = concat(axis = var_496, interleave = rotated_9_interleave_0, values = (var_582_cast_fp16, var_574_cast_fp16))[name = string("rotated_9_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_585_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = string("op_585_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_586_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = string("op_586_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_9_cast_fp16 = add(x = var_585_cast_fp16, y = var_586_cast_fp16)[name = string("roped_9_cast_fp16")];
-            tensor<int32, [4]> var_599_begin_0 = const()[name = string("op_599_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_599_end_0 = const()[name = string("op_599_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_599_end_mask_0 = const()[name = string("op_599_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_599_cast_fp16 = slice_by_index(begin = var_599_begin_0, end = var_599_end_0, end_mask = var_599_end_mask_0, x = k_19_cast_fp16)[name = string("op_599_cast_fp16")];
-            tensor<int32, [4]> var_605_begin_0 = const()[name = string("op_605_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_605_end_0 = const()[name = string("op_605_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_605_end_mask_0 = const()[name = string("op_605_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_605_cast_fp16 = slice_by_index(begin = var_605_begin_0, end = var_605_end_0, end_mask = var_605_end_mask_0, x = k_19_cast_fp16)[name = string("op_605_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_9_cast_fp16 = concat(axis = var_508, interleave = rotated_9_interleave_0, values = (var_595_cast_fp16, var_587_cast_fp16))[name = string("rotated_9_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_598_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = string("op_598_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_599_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = string("op_599_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_9_cast_fp16 = add(x = var_598_cast_fp16, y = var_599_cast_fp16)[name = string("roped_9_cast_fp16")];
+            tensor<int32, [4]> var_612_begin_0 = const()[name = string("op_612_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_612_end_0 = const()[name = string("op_612_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_612_end_mask_0 = const()[name = string("op_612_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_612_cast_fp16 = slice_by_index(begin = var_612_begin_0, end = var_612_end_0, end_mask = var_612_end_mask_0, x = k_19_cast_fp16)[name = string("op_612_cast_fp16")];
+            tensor<int32, [4]> var_618_begin_0 = const()[name = string("op_618_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_618_end_0 = const()[name = string("op_618_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_618_end_mask_0 = const()[name = string("op_618_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_618_cast_fp16 = slice_by_index(begin = var_618_begin_0, end = var_618_end_0, end_mask = var_618_end_mask_0, x = k_19_cast_fp16)[name = string("op_618_cast_fp16")];
             fp16 const_34_promoted_to_fp16 = const()[name = string("const_34_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_607_cast_fp16 = mul(x = var_605_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_607_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_620_cast_fp16 = mul(x = var_618_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_620_cast_fp16")];
             bool rotated_interleave_0 = const()[name = string("rotated_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_cast_fp16 = concat(axis = var_496, interleave = rotated_interleave_0, values = (var_607_cast_fp16, var_599_cast_fp16))[name = string("rotated_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_610_cast_fp16 = mul(x = k_19_cast_fp16, y = cos)[name = string("op_610_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_611_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_611_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_cast_fp16 = add(x = var_610_cast_fp16, y = var_611_cast_fp16)[name = string("roped_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_cast_fp16 = concat(axis = var_508, interleave = rotated_interleave_0, values = (var_620_cast_fp16, var_612_cast_fp16))[name = string("rotated_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_623_cast_fp16 = mul(x = k_19_cast_fp16, y = cos)[name = string("op_623_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_624_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_624_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_cast_fp16 = add(x = var_623_cast_fp16, y = var_624_cast_fp16)[name = string("roped_cast_fp16")];
+            tensor<int32, [4]> v_21_perm_0 = const()[name = string("v_21_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             bool k_interleave_0 = const()[name = string("k_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_cast_fp16 = concat(axis = var_484, interleave = k_interleave_0, values = (k_cache_2, roped_cast_fp16))[name = string("k_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> k_cast_fp16 = concat(axis = var_496, interleave = k_interleave_0, values = (k_cache_2, roped_cast_fp16))[name = string("k_cast_fp16")];
             bool v_interleave_0 = const()[name = string("v_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> v_cast_fp16 = concat(axis = var_484, interleave = v_interleave_0, values = (v_cache_2, v_15_cast_fp16))[name = string("v_cast_fp16")];
-            tensor<int32, [4]> var_618_begin_0 = const()[name = string("op_618_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_618_end_0 = const()[name = string("op_618_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_618_end_mask_0 = const()[name = string("op_618_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_2 = slice_by_index(begin = var_618_begin_0, end = var_618_end_0, end_mask = var_618_end_mask_0, x = k_cast_fp16)[name = string("op_618_cast_fp16")];
-            tensor<int32, [4]> var_619_begin_0 = const()[name = string("op_619_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_619_end_0 = const()[name = string("op_619_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_619_end_mask_0 = const()[name = string("op_619_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_2 = slice_by_index(begin = var_619_begin_0, end = var_619_end_0, end_mask = var_619_end_mask_0, x = v_cast_fp16)[name = string("op_619_cast_fp16")];
-            fp16 var_623_to_fp16 = const()[name = string("op_623_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 1]> var_624_cast_fp16 = mul(x = roped_9_cast_fp16, y = var_623_to_fp16)[name = string("op_624_cast_fp16")];
-            bool attn_weights_9_transpose_x_0 = const()[name = string("attn_weights_9_transpose_x_0"), val = bool(true)];
-            bool attn_weights_9_transpose_y_0 = const()[name = string("attn_weights_9_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_9_cast_fp16 = matmul(transpose_x = attn_weights_9_transpose_x_0, transpose_y = attn_weights_9_transpose_y_0, x = var_624_cast_fp16, y = k_cast_fp16)[name = string("attn_weights_9_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_cast_fp16 = add(x = attn_weights_9_cast_fp16, y = mask)[name = string("attn_weights_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> var_632_cast_fp16 = softmax(axis = var_492, x = attn_weights_cast_fp16)[name = string("op_632_cast_fp16")];
-            bool attn_9_transpose_x_0 = const()[name = string("attn_9_transpose_x_0"), val = bool(false)];
-            bool attn_9_transpose_y_0 = const()[name = string("attn_9_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 1]> attn_9_cast_fp16 = matmul(transpose_x = attn_9_transpose_x_0, transpose_y = attn_9_transpose_y_0, x = v_cast_fp16, y = var_632_cast_fp16)[name = string("attn_9_cast_fp16")];
-            tensor<int32, [4]> var_636 = const()[name = string("op_636"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
-            tensor<fp16, [1, 4096, 1, 1]> input_17_cast_fp16 = reshape(shape = var_636, x = attn_9_cast_fp16)[name = string("input_17_cast_fp16")];
-            tensor<int32, [2]> var_640 = const()[name = string("op_640"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_642 = const()[name = string("op_642"), val = tensor<int32, [2]>([1, 1])];
-            string var_644_pad_type_0 = const()[name = string("op_644_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_644_pad_0 = const()[name = string("op_644_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_644_cast_fp16 = conv(dilations = var_642, groups = var_493, pad = var_644_pad_0, pad_type = var_644_pad_type_0, strides = var_640, weight = blocks_2_attn_proj_weight_palettized_cast_fp16, x = input_17_cast_fp16)[name = string("op_644_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 128]> v_21_cast_fp16 = transpose(perm = v_21_perm_0, x = v_19_cast_fp16)[name = string("transpose_1")];
+            tensor<fp16, [1, 32, 512, 128]> v_cast_fp16 = concat(axis = var_508, interleave = v_interleave_0, values = (v_cache_2, v_21_cast_fp16))[name = string("v_cast_fp16")];
+            tensor<int32, [4]> var_635_begin_0 = const()[name = string("op_635_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_635_end_0 = const()[name = string("op_635_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_635_end_mask_0 = const()[name = string("op_635_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_2 = slice_by_index(begin = var_635_begin_0, end = var_635_end_0, end_mask = var_635_end_mask_0, x = k_cast_fp16)[name = string("op_635_cast_fp16")];
+            tensor<int32, [4]> var_636_begin_0 = const()[name = string("op_636_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_636_end_0 = const()[name = string("op_636_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_636_end_mask_0 = const()[name = string("op_636_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_2 = slice_by_index(begin = var_636_begin_0, end = var_636_end_0, end_mask = var_636_end_mask_0, x = v_cast_fp16)[name = string("op_636_cast_fp16")];
+            fp16 var_641_to_fp16 = const()[name = string("op_641_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 32, 128, 4]> var_642_cast_fp16 = mul(x = roped_9_cast_fp16, y = var_641_to_fp16)[name = string("op_642_cast_fp16")];
+            bool attn_weights_13_transpose_x_0 = const()[name = string("attn_weights_13_transpose_x_0"), val = bool(true)];
+            bool attn_weights_13_transpose_y_0 = const()[name = string("attn_weights_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_13_cast_fp16 = matmul(transpose_x = attn_weights_13_transpose_x_0, transpose_y = attn_weights_13_transpose_y_0, x = var_642_cast_fp16, y = k_cast_fp16)[name = string("attn_weights_13_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_15_cast_fp16 = add(x = attn_weights_13_cast_fp16, y = mask)[name = string("attn_weights_15_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_cast_fp16 = softmax(axis = var_504, x = attn_weights_15_cast_fp16)[name = string("attn_weights_cast_fp16")];
+            bool var_651_transpose_x_0 = const()[name = string("op_651_transpose_x_0"), val = bool(false)];
+            bool var_651_transpose_y_0 = const()[name = string("op_651_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> var_651_cast_fp16 = matmul(transpose_x = var_651_transpose_x_0, transpose_y = var_651_transpose_y_0, x = attn_weights_cast_fp16, y = v_cast_fp16)[name = string("op_651_cast_fp16")];
+            tensor<int32, [4]> attn_5_perm_0 = const()[name = string("attn_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_654 = const()[name = string("op_654"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 4]> attn_5_cast_fp16 = transpose(perm = attn_5_perm_0, x = var_651_cast_fp16)[name = string("transpose_0")];
+            tensor<fp16, [1, 4096, 1, 4]> input_17_cast_fp16 = reshape(shape = var_654, x = attn_5_cast_fp16)[name = string("input_17_cast_fp16")];
+            tensor<int32, [2]> var_658 = const()[name = string("op_658"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_660 = const()[name = string("op_660"), val = tensor<int32, [2]>([1, 1])];
+            string var_662_pad_type_0 = const()[name = string("op_662_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_662_pad_0 = const()[name = string("op_662_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_662_cast_fp16 = conv(dilations = var_660, groups = var_505, pad = var_662_pad_0, pad_type = var_662_pad_type_0, strides = var_658, weight = blocks_2_attn_proj_weight_palettized_cast_fp16, x = input_17_cast_fp16)[name = string("op_662_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303804864)))];
-            tensor<fp16, [1, 4096, 1, 1]> attention_output_cast_fp16 = mul(x = var_644_cast_fp16, y = blocks_2_attn_proj_output_scales_to_fp16)[name = string("attention_output_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_39_cast_fp16 = add(x = attention_output_cast_fp16, y = x_29_cast_fp16)[name = string("x_39_cast_fp16")];
-            tensor<int32, [1]> var_663_axes_0 = const()[name = string("op_663_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_663_cast_fp16 = squeeze(axes = var_663_axes_0, x = x_39_cast_fp16)[name = string("op_663_cast_fp16")];
-            bool var_665_interleave_0 = const()[name = string("op_665_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_to_fp16 = const()[name = string("eps_chan_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_665_cast_fp16 = concat(axis = var_493, interleave = var_665_interleave_0, values = (var_663_cast_fp16, eps_chan_to_fp16))[name = string("op_665_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> attention_output_cast_fp16 = mul(x = var_662_cast_fp16, y = blocks_2_attn_proj_output_scales_to_fp16)[name = string("attention_output_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_39_cast_fp16 = add(x = attention_output_cast_fp16, y = x_29_cast_fp16)[name = string("x_39_cast_fp16")];
+            tensor<int32, [1]> var_681_axes_0 = const()[name = string("op_681_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_681_cast_fp16 = squeeze(axes = var_681_axes_0, x = x_39_cast_fp16)[name = string("op_681_cast_fp16")];
+            bool var_683_interleave_0 = const()[name = string("op_683_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_to_fp16 = const()[name = string("eps_chan_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_683_cast_fp16 = concat(axis = var_505, interleave = var_683_interleave_0, values = (var_681_cast_fp16, eps_chan_to_fp16))[name = string("op_683_cast_fp16")];
             tensor<int32, [1]> x_eps_axes_0 = const()[name = string("x_eps_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_cast_fp16 = expand_dims(axes = x_eps_axes_0, x = var_665_cast_fp16)[name = string("x_eps_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_cast_fp16 = expand_dims(axes = x_eps_axes_0, x = var_683_cast_fp16)[name = string("x_eps_cast_fp16")];
             tensor<int32, [1]> norm_x_axes_0 = const()[name = string("norm_x_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_cast_fp16 = reduce_l2_norm(axes = norm_x_axes_0, keep_dims = var_497, x = x_eps_cast_fp16)[name = string("norm_x_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_31_cast_fp16 = real_div(x = x_39_cast_fp16, y = norm_x_cast_fp16)[name = string("x_normed_31_cast_fp16")];
-            fp16 var_670_to_fp16 = const()[name = string("op_670_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_33_cast_fp16 = mul(x = x_normed_31_cast_fp16, y = var_670_to_fp16)[name = string("x_normed_33_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_cast_fp16 = reduce_l2_norm(axes = norm_x_axes_0, keep_dims = var_509, x = x_eps_cast_fp16)[name = string("norm_x_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_31_cast_fp16 = real_div(x = x_39_cast_fp16, y = norm_x_cast_fp16)[name = string("x_normed_31_cast_fp16")];
+            fp16 var_688_to_fp16 = const()[name = string("op_688_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_33_cast_fp16 = mul(x = x_normed_31_cast_fp16, y = var_688_to_fp16)[name = string("x_normed_33_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_norm_2_weight_to_fp16 = const()[name = string("blocks_2_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303813120)))];
-            tensor<fp16, [1, 4096, 1, 1]> input_19_cast_fp16 = mul(x = x_normed_33_cast_fp16, y = blocks_2_norm_2_weight_to_fp16)[name = string("input_19_cast_fp16")];
-            tensor<int32, [2]> var_682 = const()[name = string("op_682"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_684 = const()[name = string("op_684"), val = tensor<int32, [2]>([1, 1])];
-            string var_686_pad_type_0 = const()[name = string("op_686_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_686_pad_0 = const()[name = string("op_686_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_686_cast_fp16 = conv(dilations = var_684, groups = var_493, pad = var_686_pad_0, pad_type = var_686_pad_type_0, strides = var_682, weight = blocks_2_mlp_fc_1_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_686_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303821376)))];
-            tensor<fp16, [1, 11008, 1, 1]> input_21_cast_fp16 = mul(x = var_686_cast_fp16, y = blocks_2_mlp_fc_1_output_scales_to_fp16)[name = string("input_21_cast_fp16")];
-            tensor<int32, [2]> var_690 = const()[name = string("op_690"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_692 = const()[name = string("op_692"), val = tensor<int32, [2]>([1, 1])];
-            string var_694_pad_type_0 = const()[name = string("op_694_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_694_pad_0 = const()[name = string("op_694_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_694_cast_fp16 = conv(dilations = var_692, groups = var_493, pad = var_694_pad_0, pad_type = var_694_pad_type_0, strides = var_690, weight = blocks_2_mlp_fc_2_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_694_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303843456)))];
-            tensor<fp16, [1, 11008, 1, 1]> x_fc_2_cast_fp16 = mul(x = var_694_cast_fp16, y = blocks_2_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> var_696_cast_fp16 = silu(x = input_21_cast_fp16)[name = string("op_696_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> input_cast_fp16 = mul(x = var_696_cast_fp16, y = x_fc_2_cast_fp16)[name = string("input_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> input_19_cast_fp16 = mul(x = x_normed_33_cast_fp16, y = blocks_2_norm_2_weight_to_fp16)[name = string("input_19_cast_fp16")];
             tensor<int32, [2]> var_700 = const()[name = string("op_700"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_702 = const()[name = string("op_702"), val = tensor<int32, [2]>([1, 1])];
             string var_704_pad_type_0 = const()[name = string("op_704_pad_type_0"), val = string("custom")];
             tensor<int32, [4]> var_704_pad_0 = const()[name = string("op_704_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_704_cast_fp16 = conv(dilations = var_702, groups = var_493, pad = var_704_pad_0, pad_type = var_704_pad_type_0, strides = var_700, weight = blocks_2_mlp_proj_weight_palettized_cast_fp16, x = input_cast_fp16)[name = string("op_704_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> var_704_cast_fp16 = conv(dilations = var_702, groups = var_505, pad = var_704_pad_0, pad_type = var_704_pad_type_0, strides = var_700, weight = blocks_2_mlp_fc_1_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_704_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303821376)))];
+            tensor<fp16, [1, 11008, 1, 4]> input_21_cast_fp16 = mul(x = var_704_cast_fp16, y = blocks_2_mlp_fc_1_output_scales_to_fp16)[name = string("input_21_cast_fp16")];
+            tensor<int32, [2]> var_708 = const()[name = string("op_708"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_710 = const()[name = string("op_710"), val = tensor<int32, [2]>([1, 1])];
+            string var_712_pad_type_0 = const()[name = string("op_712_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_712_pad_0 = const()[name = string("op_712_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_712_cast_fp16 = conv(dilations = var_710, groups = var_505, pad = var_712_pad_0, pad_type = var_712_pad_type_0, strides = var_708, weight = blocks_2_mlp_fc_2_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_712_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303843456)))];
+            tensor<fp16, [1, 11008, 1, 4]> x_fc_2_cast_fp16 = mul(x = var_712_cast_fp16, y = blocks_2_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> var_714_cast_fp16 = silu(x = input_21_cast_fp16)[name = string("op_714_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> input_cast_fp16 = mul(x = var_714_cast_fp16, y = x_fc_2_cast_fp16)[name = string("input_cast_fp16")];
+            tensor<int32, [2]> var_718 = const()[name = string("op_718"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_720 = const()[name = string("op_720"), val = tensor<int32, [2]>([1, 1])];
+            string var_722_pad_type_0 = const()[name = string("op_722_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_722_pad_0 = const()[name = string("op_722_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_722_cast_fp16 = conv(dilations = var_720, groups = var_505, pad = var_722_pad_0, pad_type = var_722_pad_type_0, strides = var_718, weight = blocks_2_mlp_proj_weight_palettized_cast_fp16, x = input_cast_fp16)[name = string("op_722_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303865536)))];
-            tensor<fp16, [1, 4096, 1, 1]> var_705_cast_fp16 = mul(x = var_704_cast_fp16, y = blocks_2_mlp_proj_output_scales_to_fp16)[name = string("op_705_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> new_x = add(x = var_705_cast_fp16, y = x_39_cast_fp16)[name = string("op_706_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> var_723_cast_fp16 = mul(x = var_722_cast_fp16, y = blocks_2_mlp_proj_output_scales_to_fp16)[name = string("op_723_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> new_x = add(x = var_723_cast_fp16, y = x_39_cast_fp16)[name = string("op_724_cast_fp16")];
         } -> (new_x, new_k_cache_0, new_k_cache_1, new_k_cache_2, new_v_cache_0, new_v_cache_1, new_v_cache_2);
     func input_512_context_512<ios18>(tensor<fp16, [128, 512]> cos, tensor<fp16, [1, 1, 512, 512]> mask, tensor<fp16, [128, 512]> sin, tensor<fp16, [1, 4096, 1, 512]> x) {
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_q_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8388736))))[name = string("blocks_0_attn_q_proj_weight_palettized_cast_fp16")];
@@ -502,86 +514,86 @@ program(1.3)
             tensor<fp16, [1, 4096, 1, 512]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_54_to_fp16)[name = string("x_normed_3_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_1_weight_to_fp16 = const()[name = string("blocks_0_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303567936)))];
             tensor<fp16, [1, 4096, 1, 512]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = string("x_5_cast_fp16")];
-            tensor<int32, [2]> var_66 = const()[name = string("op_66"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_68 = const()[name = string("op_68"), val = tensor<int32, [2]>([1, 1])];
-            string var_70_pad_type_0 = const()[name = string("op_70_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_70_pad_0 = const()[name = string("op_70_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_70_cast_fp16 = conv(dilations = var_68, groups = var_25, pad = var_70_pad_0, pad_type = var_70_pad_type_0, strides = var_66, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_70_cast_fp16")];
+            tensor<int32, [2]> var_67 = const()[name = string("op_67"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_69 = const()[name = string("op_69"), val = tensor<int32, [2]>([1, 1])];
+            string var_71_pad_type_0 = const()[name = string("op_71_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_71_pad_0 = const()[name = string("op_71_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_71_cast_fp16 = conv(dilations = var_69, groups = var_25, pad = var_71_pad_0, pad_type = var_71_pad_type_0, strides = var_67, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_71_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303576192)))];
-            tensor<fp16, [1, 4096, 1, 512]> q_1_cast_fp16 = mul(x = var_70_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
-            tensor<int32, [2]> var_74 = const()[name = string("op_74"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_76 = const()[name = string("op_76"), val = tensor<int32, [2]>([1, 1])];
-            string var_78_pad_type_0 = const()[name = string("op_78_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_78_pad_0 = const()[name = string("op_78_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_78_cast_fp16 = conv(dilations = var_76, groups = var_25, pad = var_78_pad_0, pad_type = var_78_pad_type_0, strides = var_74, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_78_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> q_1_cast_fp16 = mul(x = var_71_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
+            tensor<int32, [2]> var_75 = const()[name = string("op_75"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_77 = const()[name = string("op_77"), val = tensor<int32, [2]>([1, 1])];
+            string var_79_pad_type_0 = const()[name = string("op_79_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_79_pad_0 = const()[name = string("op_79_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_79_cast_fp16 = conv(dilations = var_77, groups = var_25, pad = var_79_pad_0, pad_type = var_79_pad_type_0, strides = var_75, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_79_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303584448)))];
-            tensor<fp16, [1, 4096, 1, 512]> k_1_cast_fp16 = mul(x = var_78_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
-            tensor<int32, [2]> var_82 = const()[name = string("op_82"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_84 = const()[name = string("op_84"), val = tensor<int32, [2]>([1, 1])];
-            string var_86_pad_type_0 = const()[name = string("op_86_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_86_pad_0 = const()[name = string("op_86_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_86_cast_fp16 = conv(dilations = var_84, groups = var_25, pad = var_86_pad_0, pad_type = var_86_pad_type_0, strides = var_82, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_86_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> k_1_cast_fp16 = mul(x = var_79_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
+            tensor<int32, [2]> var_83 = const()[name = string("op_83"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_85 = const()[name = string("op_85"), val = tensor<int32, [2]>([1, 1])];
+            string var_87_pad_type_0 = const()[name = string("op_87_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_87_pad_0 = const()[name = string("op_87_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_87_cast_fp16 = conv(dilations = var_85, groups = var_25, pad = var_87_pad_0, pad_type = var_87_pad_type_0, strides = var_83, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_87_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303592704)))];
-            tensor<fp16, [1, 4096, 1, 512]> v_1_cast_fp16 = mul(x = var_86_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
-            tensor<int32, [4]> var_88 = const()[name = string("op_88"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> q_3_cast_fp16 = reshape(shape = var_88, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
-            tensor<int32, [4]> var_90 = const()[name = string("op_90"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> k_3_cast_fp16 = reshape(shape = var_90, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
-            tensor<int32, [4]> var_92 = const()[name = string("op_92"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> v_3_cast_fp16 = reshape(shape = var_92, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
-            tensor<int32, [4]> var_104_begin_0 = const()[name = string("op_104_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_104_end_0 = const()[name = string("op_104_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_104_end_mask_0 = const()[name = string("op_104_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_104_cast_fp16 = slice_by_index(begin = var_104_begin_0, end = var_104_end_0, end_mask = var_104_end_mask_0, x = q_3_cast_fp16)[name = string("op_104_cast_fp16")];
-            tensor<int32, [4]> var_110_begin_0 = const()[name = string("op_110_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_110_end_0 = const()[name = string("op_110_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_110_end_mask_0 = const()[name = string("op_110_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_110_cast_fp16 = slice_by_index(begin = var_110_begin_0, end = var_110_end_0, end_mask = var_110_end_mask_0, x = q_3_cast_fp16)[name = string("op_110_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> v_1_cast_fp16 = mul(x = var_87_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
+            tensor<int32, [4]> var_89 = const()[name = string("op_89"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> q_3_cast_fp16 = reshape(shape = var_89, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
+            tensor<int32, [4]> var_91 = const()[name = string("op_91"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> k_3_cast_fp16 = reshape(shape = var_91, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
+            tensor<int32, [4]> var_93 = const()[name = string("op_93"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> v_3_cast_fp16 = reshape(shape = var_93, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
+            tensor<int32, [4]> var_105_begin_0 = const()[name = string("op_105_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_105_end_0 = const()[name = string("op_105_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_105_end_mask_0 = const()[name = string("op_105_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_105_cast_fp16 = slice_by_index(begin = var_105_begin_0, end = var_105_end_0, end_mask = var_105_end_mask_0, x = q_3_cast_fp16)[name = string("op_105_cast_fp16")];
+            tensor<int32, [4]> var_111_begin_0 = const()[name = string("op_111_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_111_end_0 = const()[name = string("op_111_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_111_end_mask_0 = const()[name = string("op_111_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_111_cast_fp16 = slice_by_index(begin = var_111_begin_0, end = var_111_end_0, end_mask = var_111_end_mask_0, x = q_3_cast_fp16)[name = string("op_111_cast_fp16")];
             fp16 const_6_promoted_to_fp16 = const()[name = string("const_6_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_112_cast_fp16 = mul(x = var_110_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_112_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_113_cast_fp16 = mul(x = var_111_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_113_cast_fp16")];
             bool rotated_1_interleave_0 = const()[name = string("rotated_1_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_1_cast_fp16 = concat(axis = var_28, interleave = rotated_1_interleave_0, values = (var_112_cast_fp16, var_104_cast_fp16))[name = string("rotated_1_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_115_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_115_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_116_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_116_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_1_cast_fp16 = add(x = var_115_cast_fp16, y = var_116_cast_fp16)[name = string("roped_1_cast_fp16")];
-            tensor<int32, [4]> var_129_begin_0 = const()[name = string("op_129_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_129_end_0 = const()[name = string("op_129_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_129_end_mask_0 = const()[name = string("op_129_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_129_cast_fp16 = slice_by_index(begin = var_129_begin_0, end = var_129_end_0, end_mask = var_129_end_mask_0, x = k_3_cast_fp16)[name = string("op_129_cast_fp16")];
-            tensor<int32, [4]> var_135_begin_0 = const()[name = string("op_135_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_135_end_0 = const()[name = string("op_135_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_135_end_mask_0 = const()[name = string("op_135_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_135_cast_fp16 = slice_by_index(begin = var_135_begin_0, end = var_135_end_0, end_mask = var_135_end_mask_0, x = k_3_cast_fp16)[name = string("op_135_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_1_cast_fp16 = concat(axis = var_28, interleave = rotated_1_interleave_0, values = (var_113_cast_fp16, var_105_cast_fp16))[name = string("rotated_1_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_116_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_116_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_117_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_117_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_1_cast_fp16 = add(x = var_116_cast_fp16, y = var_117_cast_fp16)[name = string("roped_1_cast_fp16")];
+            tensor<int32, [4]> var_130_begin_0 = const()[name = string("op_130_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_130_end_0 = const()[name = string("op_130_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_130_end_mask_0 = const()[name = string("op_130_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_130_cast_fp16 = slice_by_index(begin = var_130_begin_0, end = var_130_end_0, end_mask = var_130_end_mask_0, x = k_3_cast_fp16)[name = string("op_130_cast_fp16")];
+            tensor<int32, [4]> var_136_begin_0 = const()[name = string("op_136_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_136_end_0 = const()[name = string("op_136_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_136_end_mask_0 = const()[name = string("op_136_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_136_cast_fp16 = slice_by_index(begin = var_136_begin_0, end = var_136_end_0, end_mask = var_136_end_mask_0, x = k_3_cast_fp16)[name = string("op_136_cast_fp16")];
             fp16 const_8_promoted_to_fp16 = const()[name = string("const_8_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_137_cast_fp16 = mul(x = var_135_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_137_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_138_cast_fp16 = mul(x = var_136_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_138_cast_fp16")];
             bool rotated_3_interleave_0 = const()[name = string("rotated_3_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_3_cast_fp16 = concat(axis = var_28, interleave = rotated_3_interleave_0, values = (var_137_cast_fp16, var_129_cast_fp16))[name = string("rotated_3_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_140_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_140_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_141_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_141_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_3_cast_fp16 = add(x = var_140_cast_fp16, y = var_141_cast_fp16)[name = string("roped_3_cast_fp16")];
-            bool q_5_interleave_0 = const()[name = string("q_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> q_5_cast_fp16 = concat(axis = var_28, interleave = q_5_interleave_0, values = roped_1_cast_fp16)[name = string("q_5_cast_fp16")];
-            bool k_5_interleave_0 = const()[name = string("k_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_5_cast_fp16 = concat(axis = var_28, interleave = k_5_interleave_0, values = roped_3_cast_fp16)[name = string("k_5_cast_fp16")];
-            tensor<int32, [4]> var_156_begin_0 = const()[name = string("op_156_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_156_end_0 = const()[name = string("op_156_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_156_end_mask_0 = const()[name = string("op_156_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_0 = slice_by_index(begin = var_156_begin_0, end = var_156_end_0, end_mask = var_156_end_mask_0, x = k_5_cast_fp16)[name = string("op_156_cast_fp16")];
-            tensor<int32, [4]> var_157_begin_0 = const()[name = string("op_157_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_157_end_0 = const()[name = string("op_157_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_157_end_mask_0 = const()[name = string("op_157_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_0 = slice_by_index(begin = var_157_begin_0, end = var_157_end_0, end_mask = var_157_end_mask_0, x = v_3_cast_fp16)[name = string("op_157_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_3_cast_fp16 = concat(axis = var_28, interleave = rotated_3_interleave_0, values = (var_138_cast_fp16, var_130_cast_fp16))[name = string("rotated_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_141_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_141_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_142_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_142_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_3_cast_fp16 = add(x = var_141_cast_fp16, y = var_142_cast_fp16)[name = string("roped_3_cast_fp16")];
+            tensor<int32, [4]> v_5_perm_0 = const()[name = string("v_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_146_begin_0 = const()[name = string("op_146_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_146_end_0 = const()[name = string("op_146_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_146_end_mask_0 = const()[name = string("op_146_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_0 = slice_by_index(begin = var_146_begin_0, end = var_146_end_0, end_mask = var_146_end_mask_0, x = roped_3_cast_fp16)[name = string("op_146_cast_fp16")];
+            tensor<int32, [4]> var_147_begin_0 = const()[name = string("op_147_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_147_end_0 = const()[name = string("op_147_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_147_end_mask_0 = const()[name = string("op_147_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 512, 128]> v_5_cast_fp16 = transpose(perm = v_5_perm_0, x = v_3_cast_fp16)[name = string("transpose_5")];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_0 = slice_by_index(begin = var_147_begin_0, end = var_147_end_0, end_mask = var_147_end_mask_0, x = v_5_cast_fp16)[name = string("op_147_cast_fp16")];
             fp16 var_161_to_fp16 = const()[name = string("op_161_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 512]> var_162_cast_fp16 = mul(x = q_5_cast_fp16, y = var_161_to_fp16)[name = string("op_162_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_162_cast_fp16 = mul(x = roped_1_cast_fp16, y = var_161_to_fp16)[name = string("op_162_cast_fp16")];
             bool attn_weights_1_transpose_x_0 = const()[name = string("attn_weights_1_transpose_x_0"), val = bool(true)];
             bool attn_weights_1_transpose_y_0 = const()[name = string("attn_weights_1_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_162_cast_fp16, y = k_5_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_162_cast_fp16, y = roped_3_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
             tensor<fp16, [1, 32, 512, 512]> attn_weights_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = mask)[name = string("attn_weights_3_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> var_170_cast_fp16 = softmax(axis = var_24, x = attn_weights_3_cast_fp16)[name = string("op_170_cast_fp16")];
-            bool attn_1_transpose_x_0 = const()[name = string("attn_1_transpose_x_0"), val = bool(false)];
-            bool attn_1_transpose_y_0 = const()[name = string("attn_1_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 512]> attn_1_cast_fp16 = matmul(transpose_x = attn_1_transpose_x_0, transpose_y = attn_1_transpose_y_0, x = v_3_cast_fp16, y = var_170_cast_fp16)[name = string("attn_1_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_5_cast_fp16 = softmax(axis = var_24, x = attn_weights_3_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
+            bool var_171_transpose_x_1 = const()[name = string("op_171_transpose_x_1"), val = bool(false)];
+            bool var_171_transpose_y_1 = const()[name = string("op_171_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 32, 512, 128]> var_171_cast_fp16 = matmul(transpose_x = var_171_transpose_x_1, transpose_y = var_171_transpose_y_1, x = attn_weights_5_cast_fp16, y = v_3_cast_fp16)[name = string("op_171_cast_fp16")];
+            tensor<int32, [4]> attn_1_perm_0 = const()[name = string("attn_1_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             tensor<int32, [4]> var_174 = const()[name = string("op_174"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 512]> attn_1_cast_fp16 = transpose(perm = attn_1_perm_0, x = var_171_cast_fp16)[name = string("transpose_4")];
             tensor<fp16, [1, 4096, 1, 512]> input_1_cast_fp16 = reshape(shape = var_174, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
             tensor<int32, [2]> var_178 = const()[name = string("op_178"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_180 = const()[name = string("op_180"), val = tensor<int32, [2]>([1, 1])];
@@ -645,86 +657,86 @@ program(1.3)
             tensor<fp16, [1, 4096, 1, 512]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_291_to_fp16)[name = string("x_normed_15_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_1_weight_to_fp16 = const()[name = string("blocks_1_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303669888)))];
             tensor<fp16, [1, 4096, 1, 512]> x_19_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = string("x_19_cast_fp16")];
-            tensor<int32, [2]> var_306 = const()[name = string("op_306"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_308 = const()[name = string("op_308"), val = tensor<int32, [2]>([1, 1])];
-            string var_310_pad_type_0 = const()[name = string("op_310_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_310_pad_0 = const()[name = string("op_310_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_310_cast_fp16 = conv(dilations = var_308, groups = var_263, pad = var_310_pad_0, pad_type = var_310_pad_type_0, strides = var_306, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_310_cast_fp16")];
+            tensor<int32, [2]> var_307 = const()[name = string("op_307"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_309 = const()[name = string("op_309"), val = tensor<int32, [2]>([1, 1])];
+            string var_311_pad_type_0 = const()[name = string("op_311_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_311_pad_0 = const()[name = string("op_311_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_311_cast_fp16 = conv(dilations = var_309, groups = var_263, pad = var_311_pad_0, pad_type = var_311_pad_type_0, strides = var_307, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_311_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303678144)))];
-            tensor<fp16, [1, 4096, 1, 512]> q_7_cast_fp16 = mul(x = var_310_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
-            tensor<int32, [2]> var_314 = const()[name = string("op_314"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_316 = const()[name = string("op_316"), val = tensor<int32, [2]>([1, 1])];
-            string var_318_pad_type_0 = const()[name = string("op_318_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_318_pad_0 = const()[name = string("op_318_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_318_cast_fp16 = conv(dilations = var_316, groups = var_263, pad = var_318_pad_0, pad_type = var_318_pad_type_0, strides = var_314, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_318_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> q_7_cast_fp16 = mul(x = var_311_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
+            tensor<int32, [2]> var_315 = const()[name = string("op_315"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_317 = const()[name = string("op_317"), val = tensor<int32, [2]>([1, 1])];
+            string var_319_pad_type_0 = const()[name = string("op_319_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_319_pad_0 = const()[name = string("op_319_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_319_cast_fp16 = conv(dilations = var_317, groups = var_263, pad = var_319_pad_0, pad_type = var_319_pad_type_0, strides = var_315, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_319_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303686400)))];
-            tensor<fp16, [1, 4096, 1, 512]> k_7_cast_fp16 = mul(x = var_318_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_7_cast_fp16")];
-            tensor<int32, [2]> var_322 = const()[name = string("op_322"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_324 = const()[name = string("op_324"), val = tensor<int32, [2]>([1, 1])];
-            string var_326_pad_type_0 = const()[name = string("op_326_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_326_pad_0 = const()[name = string("op_326_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_326_cast_fp16 = conv(dilations = var_324, groups = var_263, pad = var_326_pad_0, pad_type = var_326_pad_type_0, strides = var_322, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_326_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> k_7_cast_fp16 = mul(x = var_319_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_7_cast_fp16")];
+            tensor<int32, [2]> var_323 = const()[name = string("op_323"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_325 = const()[name = string("op_325"), val = tensor<int32, [2]>([1, 1])];
+            string var_327_pad_type_0 = const()[name = string("op_327_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_327_pad_0 = const()[name = string("op_327_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_327_cast_fp16 = conv(dilations = var_325, groups = var_263, pad = var_327_pad_0, pad_type = var_327_pad_type_0, strides = var_323, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_327_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303694656)))];
-            tensor<fp16, [1, 4096, 1, 512]> v_5_cast_fp16 = mul(x = var_326_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_5_cast_fp16")];
-            tensor<int32, [4]> var_328 = const()[name = string("op_328"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> q_9_cast_fp16 = reshape(shape = var_328, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
-            tensor<int32, [4]> var_330 = const()[name = string("op_330"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> k_9_cast_fp16 = reshape(shape = var_330, x = k_7_cast_fp16)[name = string("k_9_cast_fp16")];
-            tensor<int32, [4]> var_332 = const()[name = string("op_332"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> v_7_cast_fp16 = reshape(shape = var_332, x = v_5_cast_fp16)[name = string("v_7_cast_fp16")];
-            tensor<int32, [4]> var_344_begin_0 = const()[name = string("op_344_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_344_end_0 = const()[name = string("op_344_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_344_end_mask_0 = const()[name = string("op_344_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_344_cast_fp16 = slice_by_index(begin = var_344_begin_0, end = var_344_end_0, end_mask = var_344_end_mask_0, x = q_9_cast_fp16)[name = string("op_344_cast_fp16")];
-            tensor<int32, [4]> var_350_begin_0 = const()[name = string("op_350_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_350_end_0 = const()[name = string("op_350_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_350_end_mask_0 = const()[name = string("op_350_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_350_cast_fp16 = slice_by_index(begin = var_350_begin_0, end = var_350_end_0, end_mask = var_350_end_mask_0, x = q_9_cast_fp16)[name = string("op_350_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> v_7_cast_fp16 = mul(x = var_327_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_7_cast_fp16")];
+            tensor<int32, [4]> var_329 = const()[name = string("op_329"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> q_9_cast_fp16 = reshape(shape = var_329, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
+            tensor<int32, [4]> var_331 = const()[name = string("op_331"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> k_9_cast_fp16 = reshape(shape = var_331, x = k_7_cast_fp16)[name = string("k_9_cast_fp16")];
+            tensor<int32, [4]> var_333 = const()[name = string("op_333"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> v_9_cast_fp16 = reshape(shape = var_333, x = v_7_cast_fp16)[name = string("v_9_cast_fp16")];
+            tensor<int32, [4]> var_345_begin_0 = const()[name = string("op_345_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_345_end_0 = const()[name = string("op_345_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_345_end_mask_0 = const()[name = string("op_345_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_345_cast_fp16 = slice_by_index(begin = var_345_begin_0, end = var_345_end_0, end_mask = var_345_end_mask_0, x = q_9_cast_fp16)[name = string("op_345_cast_fp16")];
+            tensor<int32, [4]> var_351_begin_0 = const()[name = string("op_351_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_351_end_0 = const()[name = string("op_351_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_351_end_mask_0 = const()[name = string("op_351_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_351_cast_fp16 = slice_by_index(begin = var_351_begin_0, end = var_351_end_0, end_mask = var_351_end_mask_0, x = q_9_cast_fp16)[name = string("op_351_cast_fp16")];
             fp16 const_19_promoted_to_fp16 = const()[name = string("const_19_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_352_cast_fp16 = mul(x = var_350_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_352_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_353_cast_fp16 = mul(x = var_351_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_353_cast_fp16")];
             bool rotated_5_interleave_0 = const()[name = string("rotated_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_5_cast_fp16 = concat(axis = var_266, interleave = rotated_5_interleave_0, values = (var_352_cast_fp16, var_344_cast_fp16))[name = string("rotated_5_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_355_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_355_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_356_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_356_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_5_cast_fp16 = add(x = var_355_cast_fp16, y = var_356_cast_fp16)[name = string("roped_5_cast_fp16")];
-            tensor<int32, [4]> var_369_begin_0 = const()[name = string("op_369_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_369_end_0 = const()[name = string("op_369_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_369_end_mask_0 = const()[name = string("op_369_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_369_cast_fp16 = slice_by_index(begin = var_369_begin_0, end = var_369_end_0, end_mask = var_369_end_mask_0, x = k_9_cast_fp16)[name = string("op_369_cast_fp16")];
-            tensor<int32, [4]> var_375_begin_0 = const()[name = string("op_375_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_375_end_0 = const()[name = string("op_375_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_375_end_mask_0 = const()[name = string("op_375_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_375_cast_fp16 = slice_by_index(begin = var_375_begin_0, end = var_375_end_0, end_mask = var_375_end_mask_0, x = k_9_cast_fp16)[name = string("op_375_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_5_cast_fp16 = concat(axis = var_266, interleave = rotated_5_interleave_0, values = (var_353_cast_fp16, var_345_cast_fp16))[name = string("rotated_5_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_356_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_356_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_357_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_357_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_5_cast_fp16 = add(x = var_356_cast_fp16, y = var_357_cast_fp16)[name = string("roped_5_cast_fp16")];
+            tensor<int32, [4]> var_370_begin_0 = const()[name = string("op_370_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_370_end_0 = const()[name = string("op_370_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_370_end_mask_0 = const()[name = string("op_370_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_370_cast_fp16 = slice_by_index(begin = var_370_begin_0, end = var_370_end_0, end_mask = var_370_end_mask_0, x = k_9_cast_fp16)[name = string("op_370_cast_fp16")];
+            tensor<int32, [4]> var_376_begin_0 = const()[name = string("op_376_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_376_end_0 = const()[name = string("op_376_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_376_end_mask_0 = const()[name = string("op_376_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_376_cast_fp16 = slice_by_index(begin = var_376_begin_0, end = var_376_end_0, end_mask = var_376_end_mask_0, x = k_9_cast_fp16)[name = string("op_376_cast_fp16")];
             fp16 const_21_promoted_to_fp16 = const()[name = string("const_21_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_377_cast_fp16 = mul(x = var_375_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_377_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_378_cast_fp16 = mul(x = var_376_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_378_cast_fp16")];
             bool rotated_7_interleave_0 = const()[name = string("rotated_7_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_7_cast_fp16 = concat(axis = var_266, interleave = rotated_7_interleave_0, values = (var_377_cast_fp16, var_369_cast_fp16))[name = string("rotated_7_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_380_cast_fp16 = mul(x = k_9_cast_fp16, y = cos)[name = string("op_380_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_381_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = string("op_381_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_7_cast_fp16 = add(x = var_380_cast_fp16, y = var_381_cast_fp16)[name = string("roped_7_cast_fp16")];
-            bool q_11_interleave_0 = const()[name = string("q_11_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> q_11_cast_fp16 = concat(axis = var_266, interleave = q_11_interleave_0, values = roped_5_cast_fp16)[name = string("q_11_cast_fp16")];
-            bool k_11_interleave_0 = const()[name = string("k_11_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_11_cast_fp16 = concat(axis = var_266, interleave = k_11_interleave_0, values = roped_7_cast_fp16)[name = string("k_11_cast_fp16")];
-            tensor<int32, [4]> var_396_begin_0 = const()[name = string("op_396_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_396_end_0 = const()[name = string("op_396_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_396_end_mask_0 = const()[name = string("op_396_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_1 = slice_by_index(begin = var_396_begin_0, end = var_396_end_0, end_mask = var_396_end_mask_0, x = k_11_cast_fp16)[name = string("op_396_cast_fp16")];
-            tensor<int32, [4]> var_397_begin_0 = const()[name = string("op_397_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_397_end_0 = const()[name = string("op_397_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_397_end_mask_0 = const()[name = string("op_397_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_1 = slice_by_index(begin = var_397_begin_0, end = var_397_end_0, end_mask = var_397_end_mask_0, x = v_7_cast_fp16)[name = string("op_397_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_7_cast_fp16 = concat(axis = var_266, interleave = rotated_7_interleave_0, values = (var_378_cast_fp16, var_370_cast_fp16))[name = string("rotated_7_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_381_cast_fp16 = mul(x = k_9_cast_fp16, y = cos)[name = string("op_381_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_382_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = string("op_382_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_7_cast_fp16 = add(x = var_381_cast_fp16, y = var_382_cast_fp16)[name = string("roped_7_cast_fp16")];
+            tensor<int32, [4]> v_11_perm_0 = const()[name = string("v_11_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_386_begin_0 = const()[name = string("op_386_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_386_end_0 = const()[name = string("op_386_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_386_end_mask_0 = const()[name = string("op_386_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_1 = slice_by_index(begin = var_386_begin_0, end = var_386_end_0, end_mask = var_386_end_mask_0, x = roped_7_cast_fp16)[name = string("op_386_cast_fp16")];
+            tensor<int32, [4]> var_387_begin_0 = const()[name = string("op_387_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_387_end_0 = const()[name = string("op_387_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_387_end_mask_0 = const()[name = string("op_387_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 512, 128]> v_11_cast_fp16 = transpose(perm = v_11_perm_0, x = v_9_cast_fp16)[name = string("transpose_3")];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_1 = slice_by_index(begin = var_387_begin_0, end = var_387_end_0, end_mask = var_387_end_mask_0, x = v_11_cast_fp16)[name = string("op_387_cast_fp16")];
             fp16 var_401_to_fp16 = const()[name = string("op_401_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 512]> var_402_cast_fp16 = mul(x = q_11_cast_fp16, y = var_401_to_fp16)[name = string("op_402_cast_fp16")];
-            bool attn_weights_5_transpose_x_0 = const()[name = string("attn_weights_5_transpose_x_0"), val = bool(true)];
-            bool attn_weights_5_transpose_y_0 = const()[name = string("attn_weights_5_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_5_cast_fp16 = matmul(transpose_x = attn_weights_5_transpose_x_0, transpose_y = attn_weights_5_transpose_y_0, x = var_402_cast_fp16, y = k_11_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_7_cast_fp16 = add(x = attn_weights_5_cast_fp16, y = mask)[name = string("attn_weights_7_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> var_410_cast_fp16 = softmax(axis = var_262, x = attn_weights_7_cast_fp16)[name = string("op_410_cast_fp16")];
-            bool attn_3_transpose_x_0 = const()[name = string("attn_3_transpose_x_0"), val = bool(false)];
-            bool attn_3_transpose_y_0 = const()[name = string("attn_3_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 512]> attn_3_cast_fp16 = matmul(transpose_x = attn_3_transpose_x_0, transpose_y = attn_3_transpose_y_0, x = v_7_cast_fp16, y = var_410_cast_fp16)[name = string("attn_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_402_cast_fp16 = mul(x = roped_5_cast_fp16, y = var_401_to_fp16)[name = string("op_402_cast_fp16")];
+            bool attn_weights_7_transpose_x_0 = const()[name = string("attn_weights_7_transpose_x_0"), val = bool(true)];
+            bool attn_weights_7_transpose_y_0 = const()[name = string("attn_weights_7_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_7_cast_fp16 = matmul(transpose_x = attn_weights_7_transpose_x_0, transpose_y = attn_weights_7_transpose_y_0, x = var_402_cast_fp16, y = roped_7_cast_fp16)[name = string("attn_weights_7_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_9_cast_fp16 = add(x = attn_weights_7_cast_fp16, y = mask)[name = string("attn_weights_9_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_11_cast_fp16 = softmax(axis = var_262, x = attn_weights_9_cast_fp16)[name = string("attn_weights_11_cast_fp16")];
+            bool var_411_transpose_x_1 = const()[name = string("op_411_transpose_x_1"), val = bool(false)];
+            bool var_411_transpose_y_1 = const()[name = string("op_411_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 32, 512, 128]> var_411_cast_fp16 = matmul(transpose_x = var_411_transpose_x_1, transpose_y = var_411_transpose_y_1, x = attn_weights_11_cast_fp16, y = v_9_cast_fp16)[name = string("op_411_cast_fp16")];
+            tensor<int32, [4]> attn_3_perm_0 = const()[name = string("attn_3_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             tensor<int32, [4]> var_414 = const()[name = string("op_414"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 512]> attn_3_cast_fp16 = transpose(perm = attn_3_perm_0, x = var_411_cast_fp16)[name = string("transpose_2")];
             tensor<fp16, [1, 4096, 1, 512]> input_9_cast_fp16 = reshape(shape = var_414, x = attn_3_cast_fp16)[name = string("input_9_cast_fp16")];
             tensor<int32, [2]> var_418 = const()[name = string("op_418"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_420 = const()[name = string("op_420"), val = tensor<int32, [2]>([1, 1])];
@@ -788,86 +800,86 @@ program(1.3)
             tensor<fp16, [1, 4096, 1, 512]> x_normed_27_cast_fp16 = mul(x = x_normed_25_cast_fp16, y = var_531_to_fp16)[name = string("x_normed_27_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_norm_1_weight_to_fp16 = const()[name = string("blocks_2_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303771840)))];
             tensor<fp16, [1, 4096, 1, 512]> x_33_cast_fp16 = mul(x = x_normed_27_cast_fp16, y = blocks_2_norm_1_weight_to_fp16)[name = string("x_33_cast_fp16")];
-            tensor<int32, [2]> var_546 = const()[name = string("op_546"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_548 = const()[name = string("op_548"), val = tensor<int32, [2]>([1, 1])];
-            string var_550_pad_type_0 = const()[name = string("op_550_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_550_pad_0 = const()[name = string("op_550_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_550_cast_fp16 = conv(dilations = var_548, groups = var_503, pad = var_550_pad_0, pad_type = var_550_pad_type_0, strides = var_546, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_550_cast_fp16")];
+            tensor<int32, [2]> var_547 = const()[name = string("op_547"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_549 = const()[name = string("op_549"), val = tensor<int32, [2]>([1, 1])];
+            string var_551_pad_type_0 = const()[name = string("op_551_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_551_pad_0 = const()[name = string("op_551_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_551_cast_fp16 = conv(dilations = var_549, groups = var_503, pad = var_551_pad_0, pad_type = var_551_pad_type_0, strides = var_547, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_551_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303780096)))];
-            tensor<fp16, [1, 4096, 1, 512]> q_13_cast_fp16 = mul(x = var_550_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = string("q_13_cast_fp16")];
-            tensor<int32, [2]> var_554 = const()[name = string("op_554"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_556 = const()[name = string("op_556"), val = tensor<int32, [2]>([1, 1])];
-            string var_558_pad_type_0 = const()[name = string("op_558_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_558_pad_0 = const()[name = string("op_558_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_558_cast_fp16 = conv(dilations = var_556, groups = var_503, pad = var_558_pad_0, pad_type = var_558_pad_type_0, strides = var_554, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_558_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> q_13_cast_fp16 = mul(x = var_551_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = string("q_13_cast_fp16")];
+            tensor<int32, [2]> var_555 = const()[name = string("op_555"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_557 = const()[name = string("op_557"), val = tensor<int32, [2]>([1, 1])];
+            string var_559_pad_type_0 = const()[name = string("op_559_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_559_pad_0 = const()[name = string("op_559_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_559_cast_fp16 = conv(dilations = var_557, groups = var_503, pad = var_559_pad_0, pad_type = var_559_pad_type_0, strides = var_555, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_559_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303788352)))];
-            tensor<fp16, [1, 4096, 1, 512]> k_13_cast_fp16 = mul(x = var_558_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = string("k_13_cast_fp16")];
-            tensor<int32, [2]> var_562 = const()[name = string("op_562"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_564 = const()[name = string("op_564"), val = tensor<int32, [2]>([1, 1])];
-            string var_566_pad_type_0 = const()[name = string("op_566_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_566_pad_0 = const()[name = string("op_566_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_566_cast_fp16 = conv(dilations = var_564, groups = var_503, pad = var_566_pad_0, pad_type = var_566_pad_type_0, strides = var_562, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_566_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> k_13_cast_fp16 = mul(x = var_559_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = string("k_13_cast_fp16")];
+            tensor<int32, [2]> var_563 = const()[name = string("op_563"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_565 = const()[name = string("op_565"), val = tensor<int32, [2]>([1, 1])];
+            string var_567_pad_type_0 = const()[name = string("op_567_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_567_pad_0 = const()[name = string("op_567_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_567_cast_fp16 = conv(dilations = var_565, groups = var_503, pad = var_567_pad_0, pad_type = var_567_pad_type_0, strides = var_563, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_567_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303796608)))];
-            tensor<fp16, [1, 4096, 1, 512]> v_9_cast_fp16 = mul(x = var_566_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = string("v_9_cast_fp16")];
-            tensor<int32, [4]> var_568 = const()[name = string("op_568"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> q_15_cast_fp16 = reshape(shape = var_568, x = q_13_cast_fp16)[name = string("q_15_cast_fp16")];
-            tensor<int32, [4]> var_570 = const()[name = string("op_570"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = reshape(shape = var_570, x = k_13_cast_fp16)[name = string("k_15_cast_fp16")];
-            tensor<int32, [4]> var_572 = const()[name = string("op_572"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> v_cast_fp16 = reshape(shape = var_572, x = v_9_cast_fp16)[name = string("v_cast_fp16")];
-            tensor<int32, [4]> var_584_begin_0 = const()[name = string("op_584_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_584_end_0 = const()[name = string("op_584_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_584_end_mask_0 = const()[name = string("op_584_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_584_cast_fp16 = slice_by_index(begin = var_584_begin_0, end = var_584_end_0, end_mask = var_584_end_mask_0, x = q_15_cast_fp16)[name = string("op_584_cast_fp16")];
-            tensor<int32, [4]> var_590_begin_0 = const()[name = string("op_590_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_590_end_0 = const()[name = string("op_590_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_590_end_mask_0 = const()[name = string("op_590_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_590_cast_fp16 = slice_by_index(begin = var_590_begin_0, end = var_590_end_0, end_mask = var_590_end_mask_0, x = q_15_cast_fp16)[name = string("op_590_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> v_13_cast_fp16 = mul(x = var_567_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = string("v_13_cast_fp16")];
+            tensor<int32, [4]> var_569 = const()[name = string("op_569"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> q_15_cast_fp16 = reshape(shape = var_569, x = q_13_cast_fp16)[name = string("q_15_cast_fp16")];
+            tensor<int32, [4]> var_571 = const()[name = string("op_571"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = reshape(shape = var_571, x = k_13_cast_fp16)[name = string("k_15_cast_fp16")];
+            tensor<int32, [4]> var_573 = const()[name = string("op_573"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> v_15_cast_fp16 = reshape(shape = var_573, x = v_13_cast_fp16)[name = string("v_15_cast_fp16")];
+            tensor<int32, [4]> var_585_begin_0 = const()[name = string("op_585_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_585_end_0 = const()[name = string("op_585_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_585_end_mask_0 = const()[name = string("op_585_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_585_cast_fp16 = slice_by_index(begin = var_585_begin_0, end = var_585_end_0, end_mask = var_585_end_mask_0, x = q_15_cast_fp16)[name = string("op_585_cast_fp16")];
+            tensor<int32, [4]> var_591_begin_0 = const()[name = string("op_591_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_591_end_0 = const()[name = string("op_591_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_591_end_mask_0 = const()[name = string("op_591_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_591_cast_fp16 = slice_by_index(begin = var_591_begin_0, end = var_591_end_0, end_mask = var_591_end_mask_0, x = q_15_cast_fp16)[name = string("op_591_cast_fp16")];
             fp16 const_32_promoted_to_fp16 = const()[name = string("const_32_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_592_cast_fp16 = mul(x = var_590_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_592_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_593_cast_fp16 = mul(x = var_591_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_593_cast_fp16")];
             bool rotated_9_interleave_0 = const()[name = string("rotated_9_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_9_cast_fp16 = concat(axis = var_506, interleave = rotated_9_interleave_0, values = (var_592_cast_fp16, var_584_cast_fp16))[name = string("rotated_9_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_595_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = string("op_595_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_596_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = string("op_596_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_9_cast_fp16 = add(x = var_595_cast_fp16, y = var_596_cast_fp16)[name = string("roped_9_cast_fp16")];
-            tensor<int32, [4]> var_609_begin_0 = const()[name = string("op_609_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_609_end_0 = const()[name = string("op_609_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_609_end_mask_0 = const()[name = string("op_609_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_609_cast_fp16 = slice_by_index(begin = var_609_begin_0, end = var_609_end_0, end_mask = var_609_end_mask_0, x = k_15_cast_fp16)[name = string("op_609_cast_fp16")];
-            tensor<int32, [4]> var_615_begin_0 = const()[name = string("op_615_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_615_end_0 = const()[name = string("op_615_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_615_end_mask_0 = const()[name = string("op_615_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_615_cast_fp16 = slice_by_index(begin = var_615_begin_0, end = var_615_end_0, end_mask = var_615_end_mask_0, x = k_15_cast_fp16)[name = string("op_615_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_9_cast_fp16 = concat(axis = var_506, interleave = rotated_9_interleave_0, values = (var_593_cast_fp16, var_585_cast_fp16))[name = string("rotated_9_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_596_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = string("op_596_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_597_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = string("op_597_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_9_cast_fp16 = add(x = var_596_cast_fp16, y = var_597_cast_fp16)[name = string("roped_9_cast_fp16")];
+            tensor<int32, [4]> var_610_begin_0 = const()[name = string("op_610_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_610_end_0 = const()[name = string("op_610_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_610_end_mask_0 = const()[name = string("op_610_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_610_cast_fp16 = slice_by_index(begin = var_610_begin_0, end = var_610_end_0, end_mask = var_610_end_mask_0, x = k_15_cast_fp16)[name = string("op_610_cast_fp16")];
+            tensor<int32, [4]> var_616_begin_0 = const()[name = string("op_616_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_616_end_0 = const()[name = string("op_616_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_616_end_mask_0 = const()[name = string("op_616_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_616_cast_fp16 = slice_by_index(begin = var_616_begin_0, end = var_616_end_0, end_mask = var_616_end_mask_0, x = k_15_cast_fp16)[name = string("op_616_cast_fp16")];
             fp16 const_34_promoted_to_fp16 = const()[name = string("const_34_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_617_cast_fp16 = mul(x = var_615_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_617_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_618_cast_fp16 = mul(x = var_616_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_618_cast_fp16")];
             bool rotated_interleave_0 = const()[name = string("rotated_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_cast_fp16 = concat(axis = var_506, interleave = rotated_interleave_0, values = (var_617_cast_fp16, var_609_cast_fp16))[name = string("rotated_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_620_cast_fp16 = mul(x = k_15_cast_fp16, y = cos)[name = string("op_620_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_621_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_621_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_cast_fp16 = add(x = var_620_cast_fp16, y = var_621_cast_fp16)[name = string("roped_cast_fp16")];
-            bool q_interleave_0 = const()[name = string("q_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> q_cast_fp16 = concat(axis = var_506, interleave = q_interleave_0, values = roped_9_cast_fp16)[name = string("q_cast_fp16")];
-            bool k_interleave_0 = const()[name = string("k_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_cast_fp16 = concat(axis = var_506, interleave = k_interleave_0, values = roped_cast_fp16)[name = string("k_cast_fp16")];
-            tensor<int32, [4]> var_636_begin_0 = const()[name = string("op_636_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_636_end_0 = const()[name = string("op_636_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_636_end_mask_0 = const()[name = string("op_636_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_2 = slice_by_index(begin = var_636_begin_0, end = var_636_end_0, end_mask = var_636_end_mask_0, x = k_cast_fp16)[name = string("op_636_cast_fp16")];
-            tensor<int32, [4]> var_637_begin_0 = const()[name = string("op_637_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_637_end_0 = const()[name = string("op_637_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_637_end_mask_0 = const()[name = string("op_637_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_2 = slice_by_index(begin = var_637_begin_0, end = var_637_end_0, end_mask = var_637_end_mask_0, x = v_cast_fp16)[name = string("op_637_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_cast_fp16 = concat(axis = var_506, interleave = rotated_interleave_0, values = (var_618_cast_fp16, var_610_cast_fp16))[name = string("rotated_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_621_cast_fp16 = mul(x = k_15_cast_fp16, y = cos)[name = string("op_621_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_622_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_622_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_cast_fp16 = add(x = var_621_cast_fp16, y = var_622_cast_fp16)[name = string("roped_cast_fp16")];
+            tensor<int32, [4]> v_perm_0 = const()[name = string("v_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_626_begin_0 = const()[name = string("op_626_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_626_end_0 = const()[name = string("op_626_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_626_end_mask_0 = const()[name = string("op_626_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_2 = slice_by_index(begin = var_626_begin_0, end = var_626_end_0, end_mask = var_626_end_mask_0, x = roped_cast_fp16)[name = string("op_626_cast_fp16")];
+            tensor<int32, [4]> var_627_begin_0 = const()[name = string("op_627_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_627_end_0 = const()[name = string("op_627_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_627_end_mask_0 = const()[name = string("op_627_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 512, 128]> v_cast_fp16 = transpose(perm = v_perm_0, x = v_15_cast_fp16)[name = string("transpose_1")];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_2 = slice_by_index(begin = var_627_begin_0, end = var_627_end_0, end_mask = var_627_end_mask_0, x = v_cast_fp16)[name = string("op_627_cast_fp16")];
             fp16 var_641_to_fp16 = const()[name = string("op_641_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 512]> var_642_cast_fp16 = mul(x = q_cast_fp16, y = var_641_to_fp16)[name = string("op_642_cast_fp16")];
-            bool attn_weights_9_transpose_x_0 = const()[name = string("attn_weights_9_transpose_x_0"), val = bool(true)];
-            bool attn_weights_9_transpose_y_0 = const()[name = string("attn_weights_9_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_9_cast_fp16 = matmul(transpose_x = attn_weights_9_transpose_x_0, transpose_y = attn_weights_9_transpose_y_0, x = var_642_cast_fp16, y = k_cast_fp16)[name = string("attn_weights_9_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_cast_fp16 = add(x = attn_weights_9_cast_fp16, y = mask)[name = string("attn_weights_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> var_650_cast_fp16 = softmax(axis = var_502, x = attn_weights_cast_fp16)[name = string("op_650_cast_fp16")];
-            bool attn_5_transpose_x_0 = const()[name = string("attn_5_transpose_x_0"), val = bool(false)];
-            bool attn_5_transpose_y_0 = const()[name = string("attn_5_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 512]> attn_5_cast_fp16 = matmul(transpose_x = attn_5_transpose_x_0, transpose_y = attn_5_transpose_y_0, x = v_cast_fp16, y = var_650_cast_fp16)[name = string("attn_5_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_642_cast_fp16 = mul(x = roped_9_cast_fp16, y = var_641_to_fp16)[name = string("op_642_cast_fp16")];
+            bool attn_weights_13_transpose_x_0 = const()[name = string("attn_weights_13_transpose_x_0"), val = bool(true)];
+            bool attn_weights_13_transpose_y_0 = const()[name = string("attn_weights_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_13_cast_fp16 = matmul(transpose_x = attn_weights_13_transpose_x_0, transpose_y = attn_weights_13_transpose_y_0, x = var_642_cast_fp16, y = roped_cast_fp16)[name = string("attn_weights_13_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_15_cast_fp16 = add(x = attn_weights_13_cast_fp16, y = mask)[name = string("attn_weights_15_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_cast_fp16 = softmax(axis = var_502, x = attn_weights_15_cast_fp16)[name = string("attn_weights_cast_fp16")];
+            bool var_651_transpose_x_1 = const()[name = string("op_651_transpose_x_1"), val = bool(false)];
+            bool var_651_transpose_y_1 = const()[name = string("op_651_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 32, 512, 128]> var_651_cast_fp16 = matmul(transpose_x = var_651_transpose_x_1, transpose_y = var_651_transpose_y_1, x = attn_weights_cast_fp16, y = v_15_cast_fp16)[name = string("op_651_cast_fp16")];
+            tensor<int32, [4]> attn_5_perm_0 = const()[name = string("attn_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             tensor<int32, [4]> var_654 = const()[name = string("op_654"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 512]> attn_5_cast_fp16 = transpose(perm = attn_5_perm_0, x = var_651_cast_fp16)[name = string("transpose_0")];
             tensor<fp16, [1, 4096, 1, 512]> input_17_cast_fp16 = reshape(shape = var_654, x = attn_5_cast_fp16)[name = string("input_17_cast_fp16")];
             tensor<int32, [2]> var_658 = const()[name = string("op_658"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_660 = const()[name = string("op_660"), val = tensor<int32, [2]>([1, 1])];
diff --git a/sequoia/Llama-2-7b-hf_chunk7.mlmodelc/analytics/coremldata.bin b/sequoia/Llama-2-7b-hf_chunk7.mlmodelc/analytics/coremldata.bin
index e3704b470dad34135a6b5cb4b471021bad5c3ae2..65ae082f31459bf8b09913d8861a15601cc13ad1 100644
--- a/sequoia/Llama-2-7b-hf_chunk7.mlmodelc/analytics/coremldata.bin
+++ b/sequoia/Llama-2-7b-hf_chunk7.mlmodelc/analytics/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:84e317a82cdf4e96f808f63e77f10098844d47ad522545181edfac4d287c9c92
+oid sha256:e69e7ad37dd59e97348d395eec9b4c41b7d3ea44d86f613751ae47803a0a2efe
 size 243
diff --git a/sequoia/Llama-2-7b-hf_chunk7.mlmodelc/coremldata.bin b/sequoia/Llama-2-7b-hf_chunk7.mlmodelc/coremldata.bin
index 8bbc6dd38c74fe23594355e106f197518d41765b..a503721fa97147a06f91e834353053693378b0ad 100644
--- a/sequoia/Llama-2-7b-hf_chunk7.mlmodelc/coremldata.bin
+++ b/sequoia/Llama-2-7b-hf_chunk7.mlmodelc/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e430d0795ff5c384187174f5718a2c13d0070f5d6a811831e18862497865a86d
+oid sha256:d35a0353bcfa501579e07af3718261af3b129b4bec004c1fe6d812a6403a3f5b
 size 1037
diff --git a/sequoia/Llama-2-7b-hf_chunk7.mlmodelc/metadata.json b/sequoia/Llama-2-7b-hf_chunk7.mlmodelc/metadata.json
index 601d1a92197d6041b5c6fe3c8916d026d2ded34f..8b8d5afbc6ac3505ccb60036ba5dc31e68fc298c 100644
--- a/sequoia/Llama-2-7b-hf_chunk7.mlmodelc/metadata.json
+++ b/sequoia/Llama-2-7b-hf_chunk7.mlmodelc/metadata.json
@@ -17,9 +17,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 128, 508]",
         "name" : "new_k_cache_0",
         "type" : "MultiArray"
       },
@@ -27,9 +27,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 128, 508]",
         "name" : "new_k_cache_1",
         "type" : "MultiArray"
       },
@@ -37,9 +37,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 128, 508]",
         "name" : "new_k_cache_2",
         "type" : "MultiArray"
       },
@@ -47,9 +47,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 508, 128]",
         "name" : "new_v_cache_0",
         "type" : "MultiArray"
       },
@@ -57,9 +57,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 508, 128]",
         "name" : "new_v_cache_1",
         "type" : "MultiArray"
       },
@@ -67,9 +67,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 508, 128]",
         "name" : "new_v_cache_2",
         "type" : "MultiArray"
       }
@@ -142,9 +142,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_0",
             "type" : "MultiArray"
           },
@@ -152,9 +152,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_1",
             "type" : "MultiArray"
           },
@@ -162,9 +162,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_2",
             "type" : "MultiArray"
           },
@@ -172,9 +172,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_0",
             "type" : "MultiArray"
           },
@@ -182,9 +182,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_1",
             "type" : "MultiArray"
           },
@@ -192,9 +192,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_2",
             "type" : "MultiArray"
           }
@@ -203,14 +203,15 @@
         "mlProgramOperationTypeHistogram" : {
           "Ios18.constexprLutToDense" : 21,
           "Ios18.conv" : 21,
-          "Ios18.matmul" : 6,
           "Ios18.expandDims" : 6,
-          "Ios18.concat" : 18,
+          "Ios18.matmul" : 6,
+          "Ios18.concat" : 12,
           "Ios18.add" : 15,
           "Ios18.realDiv" : 6,
           "Ios18.silu" : 3,
           "Ios18.softmax" : 3,
           "Ios18.sliceByIndex" : 18,
+          "Ios18.transpose" : 6,
           "Ios16.reduceL2Norm" : 6,
           "Ios18.squeeze" : 6,
           "Ios18.reshape" : 12,
@@ -223,9 +224,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 1)",
+            "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 4)",
             "shortDescription" : "",
-            "shape" : "[1, 4096, 1, 1]",
+            "shape" : "[1, 4096, 1, 4]",
             "name" : "x",
             "type" : "MultiArray"
           },
@@ -233,9 +234,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 128 × 1)",
+            "formattedType" : "MultiArray (Float16 128 × 4)",
             "shortDescription" : "",
-            "shape" : "[128, 1]",
+            "shape" : "[128, 4]",
             "name" : "cos",
             "type" : "MultiArray"
           },
@@ -243,9 +244,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 128 × 1)",
+            "formattedType" : "MultiArray (Float16 128 × 4)",
             "shortDescription" : "",
-            "shape" : "[128, 1]",
+            "shape" : "[128, 4]",
             "name" : "sin",
             "type" : "MultiArray"
           },
@@ -253,9 +254,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 1 × 1 × 512)",
+            "formattedType" : "MultiArray (Float16 1 × 1 × 4 × 512)",
             "shortDescription" : "",
-            "shape" : "[1, 1, 1, 512]",
+            "shape" : "[1, 1, 4, 512]",
             "name" : "mask",
             "type" : "MultiArray"
           },
@@ -263,9 +264,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "k_cache_0",
             "type" : "MultiArray"
           },
@@ -273,9 +274,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "v_cache_0",
             "type" : "MultiArray"
           },
@@ -283,9 +284,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "k_cache_1",
             "type" : "MultiArray"
           },
@@ -293,9 +294,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "v_cache_1",
             "type" : "MultiArray"
           },
@@ -303,9 +304,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "k_cache_2",
             "type" : "MultiArray"
           },
@@ -313,9 +314,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "v_cache_2",
             "type" : "MultiArray"
           }
@@ -330,9 +331,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 1)",
+            "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 4)",
             "shortDescription" : "",
-            "shape" : "[1, 4096, 1, 1]",
+            "shape" : "[1, 4096, 1, 4]",
             "name" : "new_x",
             "type" : "MultiArray"
           },
@@ -340,9 +341,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_0",
             "type" : "MultiArray"
           },
@@ -350,9 +351,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_1",
             "type" : "MultiArray"
           },
@@ -360,9 +361,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_2",
             "type" : "MultiArray"
           },
@@ -370,9 +371,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_0",
             "type" : "MultiArray"
           },
@@ -380,9 +381,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_1",
             "type" : "MultiArray"
           },
@@ -390,9 +391,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_2",
             "type" : "MultiArray"
           }
@@ -401,14 +402,15 @@
         "mlProgramOperationTypeHistogram" : {
           "Ios18.constexprLutToDense" : 21,
           "Ios18.conv" : 21,
-          "Ios18.matmul" : 6,
           "Ios18.expandDims" : 6,
+          "Ios18.matmul" : 6,
           "Ios18.concat" : 18,
           "Ios18.add" : 15,
           "Ios18.realDiv" : 6,
           "Ios18.silu" : 3,
           "Ios18.softmax" : 3,
           "Ios18.sliceByIndex" : 18,
+          "Ios18.transpose" : 6,
           "Ios16.reduceL2Norm" : 6,
           "Ios18.squeeze" : 6,
           "Ios18.reshape" : 12,
@@ -419,14 +421,15 @@
     "mlProgramOperationTypeHistogram" : {
       "Ios18.constexprLutToDense" : 21,
       "Ios18.conv" : 21,
-      "Ios18.matmul" : 6,
       "Ios18.expandDims" : 6,
-      "Ios18.concat" : 18,
+      "Ios18.matmul" : 6,
+      "Ios18.concat" : 12,
       "Ios18.add" : 15,
       "Ios18.realDiv" : 6,
       "Ios18.silu" : 3,
       "Ios18.softmax" : 3,
       "Ios18.sliceByIndex" : 18,
+      "Ios18.transpose" : 6,
       "Ios16.reduceL2Norm" : 6,
       "Ios18.squeeze" : 6,
       "Ios18.reshape" : 12,
@@ -491,7 +494,7 @@
       }
     ],
     "defaultFunctionName" : "input_512_context_512",
-    "generatedClassName" : "Llama_2_7b_hf_2024_07_02_20_36_17_merged_chunk7",
+    "generatedClassName" : "Llama_2_7b_hf_2024_07_17_19_34_17_merged_chunk7",
     "userDefinedMetadata" : {
 
     },
diff --git a/sequoia/Llama-2-7b-hf_chunk7.mlmodelc/model.mil b/sequoia/Llama-2-7b-hf_chunk7.mlmodelc/model.mil
index 5b7b1db6b14fe10ff51aaa601d8484d9ff49576b..85f75a6da9054155e32013d96460963c79fba3f8 100644
--- a/sequoia/Llama-2-7b-hf_chunk7.mlmodelc/model.mil
+++ b/sequoia/Llama-2-7b-hf_chunk7.mlmodelc/model.mil
@@ -1,7 +1,7 @@
 program(1.3)
-[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3400.34.1"}, {"coremlc-version", "3400.42.1"}})]
+[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3400.42.1"}, {"coremlc-version", "3400.51.1"}})]
 {
-    func input_1_context_512<ios18>(tensor<fp16, [128, 1]> cos, tensor<fp16, [1, 32, 128, 511]> k_cache_0, tensor<fp16, [1, 32, 128, 511]> k_cache_1, tensor<fp16, [1, 32, 128, 511]> k_cache_2, tensor<fp16, [1, 1, 1, 512]> mask, tensor<fp16, [128, 1]> sin, tensor<fp16, [1, 32, 128, 511]> v_cache_0, tensor<fp16, [1, 32, 128, 511]> v_cache_1, tensor<fp16, [1, 32, 128, 511]> v_cache_2, tensor<fp16, [1, 4096, 1, 1]> x) [CoreML_InputDefaultValues = dict<string, fp32>({{"k_cache_0", 0}, {"k_cache_1", 0}, {"k_cache_2", 0}, {"v_cache_0", 0}, {"v_cache_1", 0}, {"v_cache_2", 0}})] {
+    func input_1_context_512<ios18>(tensor<fp16, [128, 4]> cos, tensor<fp16, [1, 32, 128, 508]> k_cache_0, tensor<fp16, [1, 32, 128, 508]> k_cache_1, tensor<fp16, [1, 32, 128, 508]> k_cache_2, tensor<fp16, [1, 1, 4, 512]> mask, tensor<fp16, [128, 4]> sin, tensor<fp16, [1, 32, 508, 128]> v_cache_0, tensor<fp16, [1, 32, 508, 128]> v_cache_1, tensor<fp16, [1, 32, 508, 128]> v_cache_2, tensor<fp16, [1, 4096, 1, 4]> x) [CoreML_InputDefaultValues = dict<string, fp32>({{"k_cache_0", 0}, {"k_cache_1", 0}, {"k_cache_2", 0}, {"v_cache_0", 0}, {"v_cache_1", 0}, {"v_cache_2", 0}})] {
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_q_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303873792))))[name = string("blocks_0_attn_q_proj_weight_palettized_cast_fp16")];
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_k_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8388864))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303873920))))[name = string("blocks_0_attn_k_proj_weight_palettized_cast_fp16")];
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_v_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(16777664))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303874048))))[name = string("blocks_0_attn_v_proj_weight_palettized_cast_fp16")];
@@ -29,438 +29,450 @@ program(1.3)
             int32 var_34 = const()[name = string("op_34"), val = int32(-2)];
             bool var_35 = const()[name = string("op_35"), val = bool(true)];
             tensor<int32, [1]> var_53_axes_0 = const()[name = string("op_53_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_53_cast_fp16 = squeeze(axes = var_53_axes_0, x = x)[name = string("op_53_cast_fp16")];
+            tensor<fp16, [1, 4096, 4]> var_53_cast_fp16 = squeeze(axes = var_53_axes_0, x = x)[name = string("op_53_cast_fp16")];
             bool var_55_interleave_0 = const()[name = string("op_55_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_1_to_fp16 = const()[name = string("eps_chan_1_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_55_cast_fp16 = concat(axis = var_31, interleave = var_55_interleave_0, values = (var_53_cast_fp16, eps_chan_1_to_fp16))[name = string("op_55_cast_fp16")];
+            tensor<fp16, [1, 1, 4]> eps_chan_1_to_fp16 = const()[name = string("eps_chan_1_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_55_cast_fp16 = concat(axis = var_31, interleave = var_55_interleave_0, values = (var_53_cast_fp16, eps_chan_1_to_fp16))[name = string("op_55_cast_fp16")];
             tensor<int32, [1]> x_eps_1_axes_0 = const()[name = string("x_eps_1_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_1_cast_fp16 = expand_dims(axes = x_eps_1_axes_0, x = var_55_cast_fp16)[name = string("x_eps_1_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_1_cast_fp16 = expand_dims(axes = x_eps_1_axes_0, x = var_55_cast_fp16)[name = string("x_eps_1_cast_fp16")];
             tensor<int32, [1]> norm_x_1_axes_0 = const()[name = string("norm_x_1_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_1_cast_fp16 = reduce_l2_norm(axes = norm_x_1_axes_0, keep_dims = var_35, x = x_eps_1_cast_fp16)[name = string("norm_x_1_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_1_cast_fp16 = real_div(x = x, y = norm_x_1_cast_fp16)[name = string("x_normed_1_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_1_cast_fp16 = reduce_l2_norm(axes = norm_x_1_axes_0, keep_dims = var_35, x = x_eps_1_cast_fp16)[name = string("norm_x_1_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_1_cast_fp16 = real_div(x = x, y = norm_x_1_cast_fp16)[name = string("x_normed_1_cast_fp16")];
             fp16 var_60_to_fp16 = const()[name = string("op_60_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_60_to_fp16)[name = string("x_normed_3_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_60_to_fp16)[name = string("x_normed_3_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_1_weight_to_fp16 = const()[name = string("blocks_0_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303567936)))];
-            tensor<fp16, [1, 4096, 1, 1]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = string("x_5_cast_fp16")];
-            tensor<int32, [2]> var_72 = const()[name = string("op_72"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_74 = const()[name = string("op_74"), val = tensor<int32, [2]>([1, 1])];
-            string var_76_pad_type_0 = const()[name = string("op_76_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_76_pad_0 = const()[name = string("op_76_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_76_cast_fp16 = conv(dilations = var_74, groups = var_31, pad = var_76_pad_0, pad_type = var_76_pad_type_0, strides = var_72, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_76_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = string("x_5_cast_fp16")];
+            tensor<int32, [2]> var_73 = const()[name = string("op_73"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_75 = const()[name = string("op_75"), val = tensor<int32, [2]>([1, 1])];
+            string var_77_pad_type_0 = const()[name = string("op_77_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_77_pad_0 = const()[name = string("op_77_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_77_cast_fp16 = conv(dilations = var_75, groups = var_31, pad = var_77_pad_0, pad_type = var_77_pad_type_0, strides = var_73, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_77_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303576192)))];
-            tensor<fp16, [1, 4096, 1, 1]> q_1_cast_fp16 = mul(x = var_76_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
-            tensor<int32, [2]> var_80 = const()[name = string("op_80"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_82 = const()[name = string("op_82"), val = tensor<int32, [2]>([1, 1])];
-            string var_84_pad_type_0 = const()[name = string("op_84_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_84_pad_0 = const()[name = string("op_84_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_84_cast_fp16 = conv(dilations = var_82, groups = var_31, pad = var_84_pad_0, pad_type = var_84_pad_type_0, strides = var_80, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_84_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> q_1_cast_fp16 = mul(x = var_77_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
+            tensor<int32, [2]> var_81 = const()[name = string("op_81"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_83 = const()[name = string("op_83"), val = tensor<int32, [2]>([1, 1])];
+            string var_85_pad_type_0 = const()[name = string("op_85_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_85_pad_0 = const()[name = string("op_85_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_85_cast_fp16 = conv(dilations = var_83, groups = var_31, pad = var_85_pad_0, pad_type = var_85_pad_type_0, strides = var_81, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_85_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303584448)))];
-            tensor<fp16, [1, 4096, 1, 1]> k_1_cast_fp16 = mul(x = var_84_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
-            tensor<int32, [2]> var_88 = const()[name = string("op_88"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_90 = const()[name = string("op_90"), val = tensor<int32, [2]>([1, 1])];
-            string var_92_pad_type_0 = const()[name = string("op_92_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_92_pad_0 = const()[name = string("op_92_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_92_cast_fp16 = conv(dilations = var_90, groups = var_31, pad = var_92_pad_0, pad_type = var_92_pad_type_0, strides = var_88, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_92_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> k_1_cast_fp16 = mul(x = var_85_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
+            tensor<int32, [2]> var_89 = const()[name = string("op_89"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_91 = const()[name = string("op_91"), val = tensor<int32, [2]>([1, 1])];
+            string var_93_pad_type_0 = const()[name = string("op_93_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_93_pad_0 = const()[name = string("op_93_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_93_cast_fp16 = conv(dilations = var_91, groups = var_31, pad = var_93_pad_0, pad_type = var_93_pad_type_0, strides = var_89, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_93_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303592704)))];
-            tensor<fp16, [1, 4096, 1, 1]> v_1_cast_fp16 = mul(x = var_92_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
-            tensor<int32, [4]> var_94 = const()[name = string("op_94"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> q_3_cast_fp16 = reshape(shape = var_94, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
-            tensor<int32, [4]> var_96 = const()[name = string("op_96"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> k_3_cast_fp16 = reshape(shape = var_96, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
-            tensor<int32, [4]> var_98 = const()[name = string("op_98"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> v_3_cast_fp16 = reshape(shape = var_98, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
-            tensor<int32, [4]> var_110_begin_0 = const()[name = string("op_110_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_110_end_0 = const()[name = string("op_110_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_110_end_mask_0 = const()[name = string("op_110_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_110_cast_fp16 = slice_by_index(begin = var_110_begin_0, end = var_110_end_0, end_mask = var_110_end_mask_0, x = q_3_cast_fp16)[name = string("op_110_cast_fp16")];
-            tensor<int32, [4]> var_116_begin_0 = const()[name = string("op_116_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_116_end_0 = const()[name = string("op_116_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_116_end_mask_0 = const()[name = string("op_116_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_116_cast_fp16 = slice_by_index(begin = var_116_begin_0, end = var_116_end_0, end_mask = var_116_end_mask_0, x = q_3_cast_fp16)[name = string("op_116_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> v_1_cast_fp16 = mul(x = var_93_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
+            tensor<int32, [4]> var_95 = const()[name = string("op_95"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> q_3_cast_fp16 = reshape(shape = var_95, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
+            tensor<int32, [4]> var_97 = const()[name = string("op_97"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> k_3_cast_fp16 = reshape(shape = var_97, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
+            tensor<int32, [4]> var_99 = const()[name = string("op_99"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> v_3_cast_fp16 = reshape(shape = var_99, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
+            tensor<int32, [4]> var_111_begin_0 = const()[name = string("op_111_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_111_end_0 = const()[name = string("op_111_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_111_end_mask_0 = const()[name = string("op_111_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_111_cast_fp16 = slice_by_index(begin = var_111_begin_0, end = var_111_end_0, end_mask = var_111_end_mask_0, x = q_3_cast_fp16)[name = string("op_111_cast_fp16")];
+            tensor<int32, [4]> var_117_begin_0 = const()[name = string("op_117_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_117_end_0 = const()[name = string("op_117_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_117_end_mask_0 = const()[name = string("op_117_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_117_cast_fp16 = slice_by_index(begin = var_117_begin_0, end = var_117_end_0, end_mask = var_117_end_mask_0, x = q_3_cast_fp16)[name = string("op_117_cast_fp16")];
             fp16 const_6_promoted_to_fp16 = const()[name = string("const_6_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_118_cast_fp16 = mul(x = var_116_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_118_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_119_cast_fp16 = mul(x = var_117_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_119_cast_fp16")];
             bool rotated_1_interleave_0 = const()[name = string("rotated_1_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_1_cast_fp16 = concat(axis = var_34, interleave = rotated_1_interleave_0, values = (var_118_cast_fp16, var_110_cast_fp16))[name = string("rotated_1_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_121_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_121_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_122_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_122_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_1_cast_fp16 = add(x = var_121_cast_fp16, y = var_122_cast_fp16)[name = string("roped_1_cast_fp16")];
-            tensor<int32, [4]> var_135_begin_0 = const()[name = string("op_135_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_135_end_0 = const()[name = string("op_135_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_135_end_mask_0 = const()[name = string("op_135_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_135_cast_fp16 = slice_by_index(begin = var_135_begin_0, end = var_135_end_0, end_mask = var_135_end_mask_0, x = k_3_cast_fp16)[name = string("op_135_cast_fp16")];
-            tensor<int32, [4]> var_141_begin_0 = const()[name = string("op_141_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_141_end_0 = const()[name = string("op_141_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_141_end_mask_0 = const()[name = string("op_141_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_141_cast_fp16 = slice_by_index(begin = var_141_begin_0, end = var_141_end_0, end_mask = var_141_end_mask_0, x = k_3_cast_fp16)[name = string("op_141_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_1_cast_fp16 = concat(axis = var_34, interleave = rotated_1_interleave_0, values = (var_119_cast_fp16, var_111_cast_fp16))[name = string("rotated_1_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_122_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_122_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_123_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_123_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_1_cast_fp16 = add(x = var_122_cast_fp16, y = var_123_cast_fp16)[name = string("roped_1_cast_fp16")];
+            tensor<int32, [4]> var_136_begin_0 = const()[name = string("op_136_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_136_end_0 = const()[name = string("op_136_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_136_end_mask_0 = const()[name = string("op_136_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_136_cast_fp16 = slice_by_index(begin = var_136_begin_0, end = var_136_end_0, end_mask = var_136_end_mask_0, x = k_3_cast_fp16)[name = string("op_136_cast_fp16")];
+            tensor<int32, [4]> var_142_begin_0 = const()[name = string("op_142_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_142_end_0 = const()[name = string("op_142_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_142_end_mask_0 = const()[name = string("op_142_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_142_cast_fp16 = slice_by_index(begin = var_142_begin_0, end = var_142_end_0, end_mask = var_142_end_mask_0, x = k_3_cast_fp16)[name = string("op_142_cast_fp16")];
             fp16 const_8_promoted_to_fp16 = const()[name = string("const_8_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_143_cast_fp16 = mul(x = var_141_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_143_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_144_cast_fp16 = mul(x = var_142_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_144_cast_fp16")];
             bool rotated_3_interleave_0 = const()[name = string("rotated_3_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_3_cast_fp16 = concat(axis = var_34, interleave = rotated_3_interleave_0, values = (var_143_cast_fp16, var_135_cast_fp16))[name = string("rotated_3_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_146_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_146_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_147_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_147_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_3_cast_fp16 = add(x = var_146_cast_fp16, y = var_147_cast_fp16)[name = string("roped_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_3_cast_fp16 = concat(axis = var_34, interleave = rotated_3_interleave_0, values = (var_144_cast_fp16, var_136_cast_fp16))[name = string("rotated_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_147_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_147_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_148_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_148_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_3_cast_fp16 = add(x = var_147_cast_fp16, y = var_148_cast_fp16)[name = string("roped_3_cast_fp16")];
+            tensor<int32, [4]> v_5_perm_0 = const()[name = string("v_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             bool k_7_interleave_0 = const()[name = string("k_7_interleave_0"), val = bool(false)];
             tensor<fp16, [1, 32, 128, 512]> k_7_cast_fp16 = concat(axis = var_22, interleave = k_7_interleave_0, values = (k_cache_0, roped_3_cast_fp16))[name = string("k_7_cast_fp16")];
-            bool v_5_interleave_0 = const()[name = string("v_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> v_5_cast_fp16 = concat(axis = var_22, interleave = v_5_interleave_0, values = (v_cache_0, v_3_cast_fp16))[name = string("v_5_cast_fp16")];
-            tensor<int32, [4]> var_154_begin_0 = const()[name = string("op_154_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_154_end_0 = const()[name = string("op_154_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_154_end_mask_0 = const()[name = string("op_154_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_0 = slice_by_index(begin = var_154_begin_0, end = var_154_end_0, end_mask = var_154_end_mask_0, x = k_7_cast_fp16)[name = string("op_154_cast_fp16")];
-            tensor<int32, [4]> var_155_begin_0 = const()[name = string("op_155_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_155_end_0 = const()[name = string("op_155_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_155_end_mask_0 = const()[name = string("op_155_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_0 = slice_by_index(begin = var_155_begin_0, end = var_155_end_0, end_mask = var_155_end_mask_0, x = v_5_cast_fp16)[name = string("op_155_cast_fp16")];
-            fp16 var_159_to_fp16 = const()[name = string("op_159_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 1]> var_160_cast_fp16 = mul(x = roped_1_cast_fp16, y = var_159_to_fp16)[name = string("op_160_cast_fp16")];
+            bool v_7_interleave_0 = const()[name = string("v_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> v_5_cast_fp16 = transpose(perm = v_5_perm_0, x = v_3_cast_fp16)[name = string("transpose_5")];
+            tensor<fp16, [1, 32, 512, 128]> v_7_cast_fp16 = concat(axis = var_34, interleave = v_7_interleave_0, values = (v_cache_0, v_5_cast_fp16))[name = string("v_7_cast_fp16")];
+            tensor<int32, [4]> var_159_begin_0 = const()[name = string("op_159_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_159_end_0 = const()[name = string("op_159_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_159_end_mask_0 = const()[name = string("op_159_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_0 = slice_by_index(begin = var_159_begin_0, end = var_159_end_0, end_mask = var_159_end_mask_0, x = k_7_cast_fp16)[name = string("op_159_cast_fp16")];
+            tensor<int32, [4]> var_160_begin_0 = const()[name = string("op_160_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_160_end_0 = const()[name = string("op_160_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_160_end_mask_0 = const()[name = string("op_160_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_0 = slice_by_index(begin = var_160_begin_0, end = var_160_end_0, end_mask = var_160_end_mask_0, x = v_7_cast_fp16)[name = string("op_160_cast_fp16")];
+            fp16 var_165_to_fp16 = const()[name = string("op_165_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 32, 128, 4]> var_166_cast_fp16 = mul(x = roped_1_cast_fp16, y = var_165_to_fp16)[name = string("op_166_cast_fp16")];
             bool attn_weights_1_transpose_x_0 = const()[name = string("attn_weights_1_transpose_x_0"), val = bool(true)];
             bool attn_weights_1_transpose_y_0 = const()[name = string("attn_weights_1_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_160_cast_fp16, y = k_7_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = mask)[name = string("attn_weights_3_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> var_168_cast_fp16 = softmax(axis = var_30, x = attn_weights_3_cast_fp16)[name = string("op_168_cast_fp16")];
-            bool attn_1_transpose_x_0 = const()[name = string("attn_1_transpose_x_0"), val = bool(false)];
-            bool attn_1_transpose_y_0 = const()[name = string("attn_1_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 1]> attn_1_cast_fp16 = matmul(transpose_x = attn_1_transpose_x_0, transpose_y = attn_1_transpose_y_0, x = v_5_cast_fp16, y = var_168_cast_fp16)[name = string("attn_1_cast_fp16")];
-            tensor<int32, [4]> var_172 = const()[name = string("op_172"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
-            tensor<fp16, [1, 4096, 1, 1]> input_1_cast_fp16 = reshape(shape = var_172, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
-            tensor<int32, [2]> var_176 = const()[name = string("op_176"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_178 = const()[name = string("op_178"), val = tensor<int32, [2]>([1, 1])];
-            string var_180_pad_type_0 = const()[name = string("op_180_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_180_pad_0 = const()[name = string("op_180_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_180_cast_fp16 = conv(dilations = var_178, groups = var_31, pad = var_180_pad_0, pad_type = var_180_pad_type_0, strides = var_176, weight = blocks_0_attn_proj_weight_palettized_cast_fp16, x = input_1_cast_fp16)[name = string("op_180_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_166_cast_fp16, y = k_7_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = mask)[name = string("attn_weights_3_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_5_cast_fp16 = softmax(axis = var_30, x = attn_weights_3_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
+            bool var_175_transpose_x_0 = const()[name = string("op_175_transpose_x_0"), val = bool(false)];
+            bool var_175_transpose_y_0 = const()[name = string("op_175_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> var_175_cast_fp16 = matmul(transpose_x = var_175_transpose_x_0, transpose_y = var_175_transpose_y_0, x = attn_weights_5_cast_fp16, y = v_7_cast_fp16)[name = string("op_175_cast_fp16")];
+            tensor<int32, [4]> attn_1_perm_0 = const()[name = string("attn_1_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_178 = const()[name = string("op_178"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 4]> attn_1_cast_fp16 = transpose(perm = attn_1_perm_0, x = var_175_cast_fp16)[name = string("transpose_4")];
+            tensor<fp16, [1, 4096, 1, 4]> input_1_cast_fp16 = reshape(shape = var_178, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
+            tensor<int32, [2]> var_182 = const()[name = string("op_182"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_184 = const()[name = string("op_184"), val = tensor<int32, [2]>([1, 1])];
+            string var_186_pad_type_0 = const()[name = string("op_186_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_186_pad_0 = const()[name = string("op_186_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_186_cast_fp16 = conv(dilations = var_184, groups = var_31, pad = var_186_pad_0, pad_type = var_186_pad_type_0, strides = var_182, weight = blocks_0_attn_proj_weight_palettized_cast_fp16, x = input_1_cast_fp16)[name = string("op_186_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303600960)))];
-            tensor<fp16, [1, 4096, 1, 1]> attention_output_1_cast_fp16 = mul(x = var_180_cast_fp16, y = blocks_0_attn_proj_output_scales_to_fp16)[name = string("attention_output_1_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_11_cast_fp16 = add(x = attention_output_1_cast_fp16, y = x)[name = string("x_11_cast_fp16")];
-            tensor<int32, [1]> var_199_axes_0 = const()[name = string("op_199_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_199_cast_fp16 = squeeze(axes = var_199_axes_0, x = x_11_cast_fp16)[name = string("op_199_cast_fp16")];
-            bool var_201_interleave_0 = const()[name = string("op_201_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_3_to_fp16 = const()[name = string("eps_chan_3_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_201_cast_fp16 = concat(axis = var_31, interleave = var_201_interleave_0, values = (var_199_cast_fp16, eps_chan_3_to_fp16))[name = string("op_201_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> attention_output_1_cast_fp16 = mul(x = var_186_cast_fp16, y = blocks_0_attn_proj_output_scales_to_fp16)[name = string("attention_output_1_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_11_cast_fp16 = add(x = attention_output_1_cast_fp16, y = x)[name = string("x_11_cast_fp16")];
+            tensor<int32, [1]> var_205_axes_0 = const()[name = string("op_205_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_205_cast_fp16 = squeeze(axes = var_205_axes_0, x = x_11_cast_fp16)[name = string("op_205_cast_fp16")];
+            bool var_207_interleave_0 = const()[name = string("op_207_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_3_to_fp16 = const()[name = string("eps_chan_3_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_207_cast_fp16 = concat(axis = var_31, interleave = var_207_interleave_0, values = (var_205_cast_fp16, eps_chan_3_to_fp16))[name = string("op_207_cast_fp16")];
             tensor<int32, [1]> x_eps_3_axes_0 = const()[name = string("x_eps_3_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_3_cast_fp16 = expand_dims(axes = x_eps_3_axes_0, x = var_201_cast_fp16)[name = string("x_eps_3_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_3_cast_fp16 = expand_dims(axes = x_eps_3_axes_0, x = var_207_cast_fp16)[name = string("x_eps_3_cast_fp16")];
             tensor<int32, [1]> norm_x_3_axes_0 = const()[name = string("norm_x_3_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_3_cast_fp16 = reduce_l2_norm(axes = norm_x_3_axes_0, keep_dims = var_35, x = x_eps_3_cast_fp16)[name = string("norm_x_3_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_7_cast_fp16 = real_div(x = x_11_cast_fp16, y = norm_x_3_cast_fp16)[name = string("x_normed_7_cast_fp16")];
-            fp16 var_206_to_fp16 = const()[name = string("op_206_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_9_cast_fp16 = mul(x = x_normed_7_cast_fp16, y = var_206_to_fp16)[name = string("x_normed_9_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_3_cast_fp16 = reduce_l2_norm(axes = norm_x_3_axes_0, keep_dims = var_35, x = x_eps_3_cast_fp16)[name = string("norm_x_3_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_7_cast_fp16 = real_div(x = x_11_cast_fp16, y = norm_x_3_cast_fp16)[name = string("x_normed_7_cast_fp16")];
+            fp16 var_212_to_fp16 = const()[name = string("op_212_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_9_cast_fp16 = mul(x = x_normed_7_cast_fp16, y = var_212_to_fp16)[name = string("x_normed_9_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_2_weight_to_fp16 = const()[name = string("blocks_0_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303609216)))];
-            tensor<fp16, [1, 4096, 1, 1]> input_3_cast_fp16 = mul(x = x_normed_9_cast_fp16, y = blocks_0_norm_2_weight_to_fp16)[name = string("input_3_cast_fp16")];
-            tensor<int32, [2]> var_218 = const()[name = string("op_218"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_220 = const()[name = string("op_220"), val = tensor<int32, [2]>([1, 1])];
-            string var_222_pad_type_0 = const()[name = string("op_222_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_222_pad_0 = const()[name = string("op_222_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_222_cast_fp16 = conv(dilations = var_220, groups = var_31, pad = var_222_pad_0, pad_type = var_222_pad_type_0, strides = var_218, weight = blocks_0_mlp_fc_1_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_222_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303617472)))];
-            tensor<fp16, [1, 11008, 1, 1]> input_5_cast_fp16 = mul(x = var_222_cast_fp16, y = blocks_0_mlp_fc_1_output_scales_to_fp16)[name = string("input_5_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> input_3_cast_fp16 = mul(x = x_normed_9_cast_fp16, y = blocks_0_norm_2_weight_to_fp16)[name = string("input_3_cast_fp16")];
+            tensor<int32, [2]> var_224 = const()[name = string("op_224"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_226 = const()[name = string("op_226"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_228 = const()[name = string("op_228"), val = tensor<int32, [2]>([1, 1])];
-            string var_230_pad_type_0 = const()[name = string("op_230_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_230_pad_0 = const()[name = string("op_230_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_230_cast_fp16 = conv(dilations = var_228, groups = var_31, pad = var_230_pad_0, pad_type = var_230_pad_type_0, strides = var_226, weight = blocks_0_mlp_fc_2_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_230_cast_fp16")];
+            string var_228_pad_type_0 = const()[name = string("op_228_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_228_pad_0 = const()[name = string("op_228_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_228_cast_fp16 = conv(dilations = var_226, groups = var_31, pad = var_228_pad_0, pad_type = var_228_pad_type_0, strides = var_224, weight = blocks_0_mlp_fc_1_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_228_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303617472)))];
+            tensor<fp16, [1, 11008, 1, 4]> input_5_cast_fp16 = mul(x = var_228_cast_fp16, y = blocks_0_mlp_fc_1_output_scales_to_fp16)[name = string("input_5_cast_fp16")];
+            tensor<int32, [2]> var_232 = const()[name = string("op_232"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_234 = const()[name = string("op_234"), val = tensor<int32, [2]>([1, 1])];
+            string var_236_pad_type_0 = const()[name = string("op_236_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_236_pad_0 = const()[name = string("op_236_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_236_cast_fp16 = conv(dilations = var_234, groups = var_31, pad = var_236_pad_0, pad_type = var_236_pad_type_0, strides = var_232, weight = blocks_0_mlp_fc_2_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_236_cast_fp16")];
             tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303639552)))];
-            tensor<fp16, [1, 11008, 1, 1]> x_fc_2_1_cast_fp16 = mul(x = var_230_cast_fp16, y = blocks_0_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_1_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> var_232_cast_fp16 = silu(x = input_5_cast_fp16)[name = string("op_232_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> input_7_cast_fp16 = mul(x = var_232_cast_fp16, y = x_fc_2_1_cast_fp16)[name = string("input_7_cast_fp16")];
-            tensor<int32, [2]> var_236 = const()[name = string("op_236"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_238 = const()[name = string("op_238"), val = tensor<int32, [2]>([1, 1])];
-            string var_240_pad_type_0 = const()[name = string("op_240_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_240_pad_0 = const()[name = string("op_240_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_240_cast_fp16 = conv(dilations = var_238, groups = var_31, pad = var_240_pad_0, pad_type = var_240_pad_type_0, strides = var_236, weight = blocks_0_mlp_proj_weight_palettized_cast_fp16, x = input_7_cast_fp16)[name = string("op_240_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> x_fc_2_1_cast_fp16 = mul(x = var_236_cast_fp16, y = blocks_0_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_1_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> var_238_cast_fp16 = silu(x = input_5_cast_fp16)[name = string("op_238_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> input_7_cast_fp16 = mul(x = var_238_cast_fp16, y = x_fc_2_1_cast_fp16)[name = string("input_7_cast_fp16")];
+            tensor<int32, [2]> var_242 = const()[name = string("op_242"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_244 = const()[name = string("op_244"), val = tensor<int32, [2]>([1, 1])];
+            string var_246_pad_type_0 = const()[name = string("op_246_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_246_pad_0 = const()[name = string("op_246_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_246_cast_fp16 = conv(dilations = var_244, groups = var_31, pad = var_246_pad_0, pad_type = var_246_pad_type_0, strides = var_242, weight = blocks_0_mlp_proj_weight_palettized_cast_fp16, x = input_7_cast_fp16)[name = string("op_246_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303661632)))];
-            tensor<fp16, [1, 4096, 1, 1]> var_241_cast_fp16 = mul(x = var_240_cast_fp16, y = blocks_0_mlp_proj_output_scales_to_fp16)[name = string("op_241_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_15_cast_fp16 = add(x = var_241_cast_fp16, y = x_11_cast_fp16)[name = string("x_15_cast_fp16")];
-            int32 var_252 = const()[name = string("op_252"), val = int32(-1)];
-            int32 var_260 = const()[name = string("op_260"), val = int32(3)];
-            int32 var_261 = const()[name = string("op_261"), val = int32(1)];
-            int32 var_264 = const()[name = string("op_264"), val = int32(-2)];
-            bool var_265 = const()[name = string("op_265"), val = bool(true)];
-            tensor<int32, [1]> var_282_axes_0 = const()[name = string("op_282_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_282_cast_fp16 = squeeze(axes = var_282_axes_0, x = x_15_cast_fp16)[name = string("op_282_cast_fp16")];
-            bool var_284_interleave_0 = const()[name = string("op_284_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_5_to_fp16 = const()[name = string("eps_chan_5_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_284_cast_fp16 = concat(axis = var_261, interleave = var_284_interleave_0, values = (var_282_cast_fp16, eps_chan_5_to_fp16))[name = string("op_284_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> var_247_cast_fp16 = mul(x = var_246_cast_fp16, y = blocks_0_mlp_proj_output_scales_to_fp16)[name = string("op_247_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_15_cast_fp16 = add(x = var_247_cast_fp16, y = x_11_cast_fp16)[name = string("x_15_cast_fp16")];
+            int32 var_258 = const()[name = string("op_258"), val = int32(-1)];
+            int32 var_266 = const()[name = string("op_266"), val = int32(3)];
+            int32 var_267 = const()[name = string("op_267"), val = int32(1)];
+            int32 var_270 = const()[name = string("op_270"), val = int32(-2)];
+            bool var_271 = const()[name = string("op_271"), val = bool(true)];
+            tensor<int32, [1]> var_288_axes_0 = const()[name = string("op_288_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_288_cast_fp16 = squeeze(axes = var_288_axes_0, x = x_15_cast_fp16)[name = string("op_288_cast_fp16")];
+            bool var_290_interleave_0 = const()[name = string("op_290_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_5_to_fp16 = const()[name = string("eps_chan_5_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_290_cast_fp16 = concat(axis = var_267, interleave = var_290_interleave_0, values = (var_288_cast_fp16, eps_chan_5_to_fp16))[name = string("op_290_cast_fp16")];
             tensor<int32, [1]> x_eps_5_axes_0 = const()[name = string("x_eps_5_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_5_cast_fp16 = expand_dims(axes = x_eps_5_axes_0, x = var_284_cast_fp16)[name = string("x_eps_5_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_5_cast_fp16 = expand_dims(axes = x_eps_5_axes_0, x = var_290_cast_fp16)[name = string("x_eps_5_cast_fp16")];
             tensor<int32, [1]> norm_x_5_axes_0 = const()[name = string("norm_x_5_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_5_cast_fp16 = reduce_l2_norm(axes = norm_x_5_axes_0, keep_dims = var_265, x = x_eps_5_cast_fp16)[name = string("norm_x_5_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_13_cast_fp16 = real_div(x = x_15_cast_fp16, y = norm_x_5_cast_fp16)[name = string("x_normed_13_cast_fp16")];
-            fp16 var_289_to_fp16 = const()[name = string("op_289_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_289_to_fp16)[name = string("x_normed_15_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_5_cast_fp16 = reduce_l2_norm(axes = norm_x_5_axes_0, keep_dims = var_271, x = x_eps_5_cast_fp16)[name = string("norm_x_5_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_13_cast_fp16 = real_div(x = x_15_cast_fp16, y = norm_x_5_cast_fp16)[name = string("x_normed_13_cast_fp16")];
+            fp16 var_295_to_fp16 = const()[name = string("op_295_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_295_to_fp16)[name = string("x_normed_15_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_1_weight_to_fp16 = const()[name = string("blocks_1_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303669888)))];
-            tensor<fp16, [1, 4096, 1, 1]> x_19_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = string("x_19_cast_fp16")];
-            tensor<int32, [2]> var_304 = const()[name = string("op_304"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_306 = const()[name = string("op_306"), val = tensor<int32, [2]>([1, 1])];
-            string var_308_pad_type_0 = const()[name = string("op_308_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_308_pad_0 = const()[name = string("op_308_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_308_cast_fp16 = conv(dilations = var_306, groups = var_261, pad = var_308_pad_0, pad_type = var_308_pad_type_0, strides = var_304, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_308_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_19_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = string("x_19_cast_fp16")];
+            tensor<int32, [2]> var_311 = const()[name = string("op_311"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_313 = const()[name = string("op_313"), val = tensor<int32, [2]>([1, 1])];
+            string var_315_pad_type_0 = const()[name = string("op_315_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_315_pad_0 = const()[name = string("op_315_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_315_cast_fp16 = conv(dilations = var_313, groups = var_267, pad = var_315_pad_0, pad_type = var_315_pad_type_0, strides = var_311, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_315_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303678144)))];
-            tensor<fp16, [1, 4096, 1, 1]> q_7_cast_fp16 = mul(x = var_308_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
-            tensor<int32, [2]> var_312 = const()[name = string("op_312"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_314 = const()[name = string("op_314"), val = tensor<int32, [2]>([1, 1])];
-            string var_316_pad_type_0 = const()[name = string("op_316_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_316_pad_0 = const()[name = string("op_316_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_316_cast_fp16 = conv(dilations = var_314, groups = var_261, pad = var_316_pad_0, pad_type = var_316_pad_type_0, strides = var_312, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_316_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> q_7_cast_fp16 = mul(x = var_315_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
+            tensor<int32, [2]> var_319 = const()[name = string("op_319"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_321 = const()[name = string("op_321"), val = tensor<int32, [2]>([1, 1])];
+            string var_323_pad_type_0 = const()[name = string("op_323_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_323_pad_0 = const()[name = string("op_323_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_323_cast_fp16 = conv(dilations = var_321, groups = var_267, pad = var_323_pad_0, pad_type = var_323_pad_type_0, strides = var_319, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_323_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303686400)))];
-            tensor<fp16, [1, 4096, 1, 1]> k_9_cast_fp16 = mul(x = var_316_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_9_cast_fp16")];
-            tensor<int32, [2]> var_320 = const()[name = string("op_320"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_322 = const()[name = string("op_322"), val = tensor<int32, [2]>([1, 1])];
-            string var_324_pad_type_0 = const()[name = string("op_324_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_324_pad_0 = const()[name = string("op_324_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_324_cast_fp16 = conv(dilations = var_322, groups = var_261, pad = var_324_pad_0, pad_type = var_324_pad_type_0, strides = var_320, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_324_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> k_9_cast_fp16 = mul(x = var_323_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_9_cast_fp16")];
+            tensor<int32, [2]> var_327 = const()[name = string("op_327"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_329 = const()[name = string("op_329"), val = tensor<int32, [2]>([1, 1])];
+            string var_331_pad_type_0 = const()[name = string("op_331_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_331_pad_0 = const()[name = string("op_331_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_331_cast_fp16 = conv(dilations = var_329, groups = var_267, pad = var_331_pad_0, pad_type = var_331_pad_type_0, strides = var_327, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_331_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303694656)))];
-            tensor<fp16, [1, 4096, 1, 1]> v_7_cast_fp16 = mul(x = var_324_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_7_cast_fp16")];
-            tensor<int32, [4]> var_326 = const()[name = string("op_326"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> q_9_cast_fp16 = reshape(shape = var_326, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
-            tensor<int32, [4]> var_328 = const()[name = string("op_328"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> k_11_cast_fp16 = reshape(shape = var_328, x = k_9_cast_fp16)[name = string("k_11_cast_fp16")];
-            tensor<int32, [4]> var_330 = const()[name = string("op_330"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> v_9_cast_fp16 = reshape(shape = var_330, x = v_7_cast_fp16)[name = string("v_9_cast_fp16")];
-            tensor<int32, [4]> var_342_begin_0 = const()[name = string("op_342_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_342_end_0 = const()[name = string("op_342_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_342_end_mask_0 = const()[name = string("op_342_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_342_cast_fp16 = slice_by_index(begin = var_342_begin_0, end = var_342_end_0, end_mask = var_342_end_mask_0, x = q_9_cast_fp16)[name = string("op_342_cast_fp16")];
-            tensor<int32, [4]> var_348_begin_0 = const()[name = string("op_348_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_348_end_0 = const()[name = string("op_348_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_348_end_mask_0 = const()[name = string("op_348_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_348_cast_fp16 = slice_by_index(begin = var_348_begin_0, end = var_348_end_0, end_mask = var_348_end_mask_0, x = q_9_cast_fp16)[name = string("op_348_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> v_9_cast_fp16 = mul(x = var_331_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_9_cast_fp16")];
+            tensor<int32, [4]> var_333 = const()[name = string("op_333"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> q_9_cast_fp16 = reshape(shape = var_333, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
+            tensor<int32, [4]> var_335 = const()[name = string("op_335"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> k_11_cast_fp16 = reshape(shape = var_335, x = k_9_cast_fp16)[name = string("k_11_cast_fp16")];
+            tensor<int32, [4]> var_337 = const()[name = string("op_337"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> v_11_cast_fp16 = reshape(shape = var_337, x = v_9_cast_fp16)[name = string("v_11_cast_fp16")];
+            tensor<int32, [4]> var_349_begin_0 = const()[name = string("op_349_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_349_end_0 = const()[name = string("op_349_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_349_end_mask_0 = const()[name = string("op_349_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_349_cast_fp16 = slice_by_index(begin = var_349_begin_0, end = var_349_end_0, end_mask = var_349_end_mask_0, x = q_9_cast_fp16)[name = string("op_349_cast_fp16")];
+            tensor<int32, [4]> var_355_begin_0 = const()[name = string("op_355_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_355_end_0 = const()[name = string("op_355_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_355_end_mask_0 = const()[name = string("op_355_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_355_cast_fp16 = slice_by_index(begin = var_355_begin_0, end = var_355_end_0, end_mask = var_355_end_mask_0, x = q_9_cast_fp16)[name = string("op_355_cast_fp16")];
             fp16 const_19_promoted_to_fp16 = const()[name = string("const_19_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_350_cast_fp16 = mul(x = var_348_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_350_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_357_cast_fp16 = mul(x = var_355_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_357_cast_fp16")];
             bool rotated_5_interleave_0 = const()[name = string("rotated_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_5_cast_fp16 = concat(axis = var_264, interleave = rotated_5_interleave_0, values = (var_350_cast_fp16, var_342_cast_fp16))[name = string("rotated_5_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_353_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_353_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_354_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_354_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_5_cast_fp16 = add(x = var_353_cast_fp16, y = var_354_cast_fp16)[name = string("roped_5_cast_fp16")];
-            tensor<int32, [4]> var_367_begin_0 = const()[name = string("op_367_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_367_end_0 = const()[name = string("op_367_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_367_end_mask_0 = const()[name = string("op_367_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_367_cast_fp16 = slice_by_index(begin = var_367_begin_0, end = var_367_end_0, end_mask = var_367_end_mask_0, x = k_11_cast_fp16)[name = string("op_367_cast_fp16")];
-            tensor<int32, [4]> var_373_begin_0 = const()[name = string("op_373_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_373_end_0 = const()[name = string("op_373_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_373_end_mask_0 = const()[name = string("op_373_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_373_cast_fp16 = slice_by_index(begin = var_373_begin_0, end = var_373_end_0, end_mask = var_373_end_mask_0, x = k_11_cast_fp16)[name = string("op_373_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_5_cast_fp16 = concat(axis = var_270, interleave = rotated_5_interleave_0, values = (var_357_cast_fp16, var_349_cast_fp16))[name = string("rotated_5_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_360_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_360_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_361_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_361_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_5_cast_fp16 = add(x = var_360_cast_fp16, y = var_361_cast_fp16)[name = string("roped_5_cast_fp16")];
+            tensor<int32, [4]> var_374_begin_0 = const()[name = string("op_374_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_374_end_0 = const()[name = string("op_374_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_374_end_mask_0 = const()[name = string("op_374_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_374_cast_fp16 = slice_by_index(begin = var_374_begin_0, end = var_374_end_0, end_mask = var_374_end_mask_0, x = k_11_cast_fp16)[name = string("op_374_cast_fp16")];
+            tensor<int32, [4]> var_380_begin_0 = const()[name = string("op_380_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_380_end_0 = const()[name = string("op_380_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_380_end_mask_0 = const()[name = string("op_380_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_380_cast_fp16 = slice_by_index(begin = var_380_begin_0, end = var_380_end_0, end_mask = var_380_end_mask_0, x = k_11_cast_fp16)[name = string("op_380_cast_fp16")];
             fp16 const_21_promoted_to_fp16 = const()[name = string("const_21_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_375_cast_fp16 = mul(x = var_373_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_375_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_382_cast_fp16 = mul(x = var_380_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_382_cast_fp16")];
             bool rotated_7_interleave_0 = const()[name = string("rotated_7_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_7_cast_fp16 = concat(axis = var_264, interleave = rotated_7_interleave_0, values = (var_375_cast_fp16, var_367_cast_fp16))[name = string("rotated_7_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_378_cast_fp16 = mul(x = k_11_cast_fp16, y = cos)[name = string("op_378_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_379_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = string("op_379_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_7_cast_fp16 = add(x = var_378_cast_fp16, y = var_379_cast_fp16)[name = string("roped_7_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_7_cast_fp16 = concat(axis = var_270, interleave = rotated_7_interleave_0, values = (var_382_cast_fp16, var_374_cast_fp16))[name = string("rotated_7_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_385_cast_fp16 = mul(x = k_11_cast_fp16, y = cos)[name = string("op_385_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_386_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = string("op_386_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_7_cast_fp16 = add(x = var_385_cast_fp16, y = var_386_cast_fp16)[name = string("roped_7_cast_fp16")];
+            tensor<int32, [4]> v_13_perm_0 = const()[name = string("v_13_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             bool k_15_interleave_0 = const()[name = string("k_15_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = concat(axis = var_252, interleave = k_15_interleave_0, values = (k_cache_1, roped_7_cast_fp16))[name = string("k_15_cast_fp16")];
-            bool v_11_interleave_0 = const()[name = string("v_11_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> v_11_cast_fp16 = concat(axis = var_252, interleave = v_11_interleave_0, values = (v_cache_1, v_9_cast_fp16))[name = string("v_11_cast_fp16")];
-            tensor<int32, [4]> var_386_begin_0 = const()[name = string("op_386_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_386_end_0 = const()[name = string("op_386_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_386_end_mask_0 = const()[name = string("op_386_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_1 = slice_by_index(begin = var_386_begin_0, end = var_386_end_0, end_mask = var_386_end_mask_0, x = k_15_cast_fp16)[name = string("op_386_cast_fp16")];
-            tensor<int32, [4]> var_387_begin_0 = const()[name = string("op_387_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_387_end_0 = const()[name = string("op_387_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_387_end_mask_0 = const()[name = string("op_387_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_1 = slice_by_index(begin = var_387_begin_0, end = var_387_end_0, end_mask = var_387_end_mask_0, x = v_11_cast_fp16)[name = string("op_387_cast_fp16")];
-            fp16 var_391_to_fp16 = const()[name = string("op_391_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 1]> var_392_cast_fp16 = mul(x = roped_5_cast_fp16, y = var_391_to_fp16)[name = string("op_392_cast_fp16")];
-            bool attn_weights_5_transpose_x_0 = const()[name = string("attn_weights_5_transpose_x_0"), val = bool(true)];
-            bool attn_weights_5_transpose_y_0 = const()[name = string("attn_weights_5_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_5_cast_fp16 = matmul(transpose_x = attn_weights_5_transpose_x_0, transpose_y = attn_weights_5_transpose_y_0, x = var_392_cast_fp16, y = k_15_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_7_cast_fp16 = add(x = attn_weights_5_cast_fp16, y = mask)[name = string("attn_weights_7_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> var_400_cast_fp16 = softmax(axis = var_260, x = attn_weights_7_cast_fp16)[name = string("op_400_cast_fp16")];
-            bool attn_5_transpose_x_0 = const()[name = string("attn_5_transpose_x_0"), val = bool(false)];
-            bool attn_5_transpose_y_0 = const()[name = string("attn_5_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 1]> attn_5_cast_fp16 = matmul(transpose_x = attn_5_transpose_x_0, transpose_y = attn_5_transpose_y_0, x = v_11_cast_fp16, y = var_400_cast_fp16)[name = string("attn_5_cast_fp16")];
-            tensor<int32, [4]> var_404 = const()[name = string("op_404"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
-            tensor<fp16, [1, 4096, 1, 1]> input_9_cast_fp16 = reshape(shape = var_404, x = attn_5_cast_fp16)[name = string("input_9_cast_fp16")];
-            tensor<int32, [2]> var_408 = const()[name = string("op_408"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_410 = const()[name = string("op_410"), val = tensor<int32, [2]>([1, 1])];
-            string var_412_pad_type_0 = const()[name = string("op_412_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_412_pad_0 = const()[name = string("op_412_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_412_cast_fp16 = conv(dilations = var_410, groups = var_261, pad = var_412_pad_0, pad_type = var_412_pad_type_0, strides = var_408, weight = blocks_1_attn_proj_weight_palettized_cast_fp16, x = input_9_cast_fp16)[name = string("op_412_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = concat(axis = var_258, interleave = k_15_interleave_0, values = (k_cache_1, roped_7_cast_fp16))[name = string("k_15_cast_fp16")];
+            bool v_15_interleave_0 = const()[name = string("v_15_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> v_13_cast_fp16 = transpose(perm = v_13_perm_0, x = v_11_cast_fp16)[name = string("transpose_3")];
+            tensor<fp16, [1, 32, 512, 128]> v_15_cast_fp16 = concat(axis = var_270, interleave = v_15_interleave_0, values = (v_cache_1, v_13_cast_fp16))[name = string("v_15_cast_fp16")];
+            tensor<int32, [4]> var_397_begin_0 = const()[name = string("op_397_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_397_end_0 = const()[name = string("op_397_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_397_end_mask_0 = const()[name = string("op_397_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_1 = slice_by_index(begin = var_397_begin_0, end = var_397_end_0, end_mask = var_397_end_mask_0, x = k_15_cast_fp16)[name = string("op_397_cast_fp16")];
+            tensor<int32, [4]> var_398_begin_0 = const()[name = string("op_398_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_398_end_0 = const()[name = string("op_398_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_398_end_mask_0 = const()[name = string("op_398_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_1 = slice_by_index(begin = var_398_begin_0, end = var_398_end_0, end_mask = var_398_end_mask_0, x = v_15_cast_fp16)[name = string("op_398_cast_fp16")];
+            fp16 var_403_to_fp16 = const()[name = string("op_403_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 32, 128, 4]> var_404_cast_fp16 = mul(x = roped_5_cast_fp16, y = var_403_to_fp16)[name = string("op_404_cast_fp16")];
+            bool attn_weights_7_transpose_x_0 = const()[name = string("attn_weights_7_transpose_x_0"), val = bool(true)];
+            bool attn_weights_7_transpose_y_0 = const()[name = string("attn_weights_7_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_7_cast_fp16 = matmul(transpose_x = attn_weights_7_transpose_x_0, transpose_y = attn_weights_7_transpose_y_0, x = var_404_cast_fp16, y = k_15_cast_fp16)[name = string("attn_weights_7_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_9_cast_fp16 = add(x = attn_weights_7_cast_fp16, y = mask)[name = string("attn_weights_9_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_11_cast_fp16 = softmax(axis = var_266, x = attn_weights_9_cast_fp16)[name = string("attn_weights_11_cast_fp16")];
+            bool var_413_transpose_x_0 = const()[name = string("op_413_transpose_x_0"), val = bool(false)];
+            bool var_413_transpose_y_0 = const()[name = string("op_413_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> var_413_cast_fp16 = matmul(transpose_x = var_413_transpose_x_0, transpose_y = var_413_transpose_y_0, x = attn_weights_11_cast_fp16, y = v_15_cast_fp16)[name = string("op_413_cast_fp16")];
+            tensor<int32, [4]> attn_3_perm_0 = const()[name = string("attn_3_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_416 = const()[name = string("op_416"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 4]> attn_3_cast_fp16 = transpose(perm = attn_3_perm_0, x = var_413_cast_fp16)[name = string("transpose_2")];
+            tensor<fp16, [1, 4096, 1, 4]> input_9_cast_fp16 = reshape(shape = var_416, x = attn_3_cast_fp16)[name = string("input_9_cast_fp16")];
+            tensor<int32, [2]> var_420 = const()[name = string("op_420"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_422 = const()[name = string("op_422"), val = tensor<int32, [2]>([1, 1])];
+            string var_424_pad_type_0 = const()[name = string("op_424_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_424_pad_0 = const()[name = string("op_424_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_424_cast_fp16 = conv(dilations = var_422, groups = var_267, pad = var_424_pad_0, pad_type = var_424_pad_type_0, strides = var_420, weight = blocks_1_attn_proj_weight_palettized_cast_fp16, x = input_9_cast_fp16)[name = string("op_424_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303702912)))];
-            tensor<fp16, [1, 4096, 1, 1]> attention_output_3_cast_fp16 = mul(x = var_412_cast_fp16, y = blocks_1_attn_proj_output_scales_to_fp16)[name = string("attention_output_3_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_25_cast_fp16 = add(x = attention_output_3_cast_fp16, y = x_15_cast_fp16)[name = string("x_25_cast_fp16")];
-            tensor<int32, [1]> var_431_axes_0 = const()[name = string("op_431_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_431_cast_fp16 = squeeze(axes = var_431_axes_0, x = x_25_cast_fp16)[name = string("op_431_cast_fp16")];
-            bool var_433_interleave_0 = const()[name = string("op_433_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_7_to_fp16 = const()[name = string("eps_chan_7_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_433_cast_fp16 = concat(axis = var_261, interleave = var_433_interleave_0, values = (var_431_cast_fp16, eps_chan_7_to_fp16))[name = string("op_433_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> attention_output_3_cast_fp16 = mul(x = var_424_cast_fp16, y = blocks_1_attn_proj_output_scales_to_fp16)[name = string("attention_output_3_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_25_cast_fp16 = add(x = attention_output_3_cast_fp16, y = x_15_cast_fp16)[name = string("x_25_cast_fp16")];
+            tensor<int32, [1]> var_443_axes_0 = const()[name = string("op_443_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_443_cast_fp16 = squeeze(axes = var_443_axes_0, x = x_25_cast_fp16)[name = string("op_443_cast_fp16")];
+            bool var_445_interleave_0 = const()[name = string("op_445_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_7_to_fp16 = const()[name = string("eps_chan_7_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_445_cast_fp16 = concat(axis = var_267, interleave = var_445_interleave_0, values = (var_443_cast_fp16, eps_chan_7_to_fp16))[name = string("op_445_cast_fp16")];
             tensor<int32, [1]> x_eps_7_axes_0 = const()[name = string("x_eps_7_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_7_cast_fp16 = expand_dims(axes = x_eps_7_axes_0, x = var_433_cast_fp16)[name = string("x_eps_7_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_7_cast_fp16 = expand_dims(axes = x_eps_7_axes_0, x = var_445_cast_fp16)[name = string("x_eps_7_cast_fp16")];
             tensor<int32, [1]> norm_x_7_axes_0 = const()[name = string("norm_x_7_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_7_cast_fp16 = reduce_l2_norm(axes = norm_x_7_axes_0, keep_dims = var_265, x = x_eps_7_cast_fp16)[name = string("norm_x_7_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_19_cast_fp16 = real_div(x = x_25_cast_fp16, y = norm_x_7_cast_fp16)[name = string("x_normed_19_cast_fp16")];
-            fp16 var_438_to_fp16 = const()[name = string("op_438_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_21_cast_fp16 = mul(x = x_normed_19_cast_fp16, y = var_438_to_fp16)[name = string("x_normed_21_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_7_cast_fp16 = reduce_l2_norm(axes = norm_x_7_axes_0, keep_dims = var_271, x = x_eps_7_cast_fp16)[name = string("norm_x_7_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_19_cast_fp16 = real_div(x = x_25_cast_fp16, y = norm_x_7_cast_fp16)[name = string("x_normed_19_cast_fp16")];
+            fp16 var_450_to_fp16 = const()[name = string("op_450_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_21_cast_fp16 = mul(x = x_normed_19_cast_fp16, y = var_450_to_fp16)[name = string("x_normed_21_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_2_weight_to_fp16 = const()[name = string("blocks_1_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303711168)))];
-            tensor<fp16, [1, 4096, 1, 1]> input_11_cast_fp16 = mul(x = x_normed_21_cast_fp16, y = blocks_1_norm_2_weight_to_fp16)[name = string("input_11_cast_fp16")];
-            tensor<int32, [2]> var_450 = const()[name = string("op_450"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_452 = const()[name = string("op_452"), val = tensor<int32, [2]>([1, 1])];
-            string var_454_pad_type_0 = const()[name = string("op_454_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_454_pad_0 = const()[name = string("op_454_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_454_cast_fp16 = conv(dilations = var_452, groups = var_261, pad = var_454_pad_0, pad_type = var_454_pad_type_0, strides = var_450, weight = blocks_1_mlp_fc_1_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_454_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> input_11_cast_fp16 = mul(x = x_normed_21_cast_fp16, y = blocks_1_norm_2_weight_to_fp16)[name = string("input_11_cast_fp16")];
+            tensor<int32, [2]> var_462 = const()[name = string("op_462"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_464 = const()[name = string("op_464"), val = tensor<int32, [2]>([1, 1])];
+            string var_466_pad_type_0 = const()[name = string("op_466_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_466_pad_0 = const()[name = string("op_466_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_466_cast_fp16 = conv(dilations = var_464, groups = var_267, pad = var_466_pad_0, pad_type = var_466_pad_type_0, strides = var_462, weight = blocks_1_mlp_fc_1_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_466_cast_fp16")];
             tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303719424)))];
-            tensor<fp16, [1, 11008, 1, 1]> input_13_cast_fp16 = mul(x = var_454_cast_fp16, y = blocks_1_mlp_fc_1_output_scales_to_fp16)[name = string("input_13_cast_fp16")];
-            tensor<int32, [2]> var_458 = const()[name = string("op_458"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_460 = const()[name = string("op_460"), val = tensor<int32, [2]>([1, 1])];
-            string var_462_pad_type_0 = const()[name = string("op_462_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_462_pad_0 = const()[name = string("op_462_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_462_cast_fp16 = conv(dilations = var_460, groups = var_261, pad = var_462_pad_0, pad_type = var_462_pad_type_0, strides = var_458, weight = blocks_1_mlp_fc_2_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_462_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303741504)))];
-            tensor<fp16, [1, 11008, 1, 1]> x_fc_2_3_cast_fp16 = mul(x = var_462_cast_fp16, y = blocks_1_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_3_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> var_464_cast_fp16 = silu(x = input_13_cast_fp16)[name = string("op_464_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> input_15_cast_fp16 = mul(x = var_464_cast_fp16, y = x_fc_2_3_cast_fp16)[name = string("input_15_cast_fp16")];
-            tensor<int32, [2]> var_468 = const()[name = string("op_468"), val = tensor<int32, [2]>([1, 1])];
+            tensor<fp16, [1, 11008, 1, 4]> input_13_cast_fp16 = mul(x = var_466_cast_fp16, y = blocks_1_mlp_fc_1_output_scales_to_fp16)[name = string("input_13_cast_fp16")];
             tensor<int32, [2]> var_470 = const()[name = string("op_470"), val = tensor<int32, [2]>([1, 1])];
-            string var_472_pad_type_0 = const()[name = string("op_472_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_472_pad_0 = const()[name = string("op_472_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_472_cast_fp16 = conv(dilations = var_470, groups = var_261, pad = var_472_pad_0, pad_type = var_472_pad_type_0, strides = var_468, weight = blocks_1_mlp_proj_weight_palettized_cast_fp16, x = input_15_cast_fp16)[name = string("op_472_cast_fp16")];
+            tensor<int32, [2]> var_472 = const()[name = string("op_472"), val = tensor<int32, [2]>([1, 1])];
+            string var_474_pad_type_0 = const()[name = string("op_474_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_474_pad_0 = const()[name = string("op_474_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_474_cast_fp16 = conv(dilations = var_472, groups = var_267, pad = var_474_pad_0, pad_type = var_474_pad_type_0, strides = var_470, weight = blocks_1_mlp_fc_2_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_474_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303741504)))];
+            tensor<fp16, [1, 11008, 1, 4]> x_fc_2_3_cast_fp16 = mul(x = var_474_cast_fp16, y = blocks_1_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_3_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> var_476_cast_fp16 = silu(x = input_13_cast_fp16)[name = string("op_476_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> input_15_cast_fp16 = mul(x = var_476_cast_fp16, y = x_fc_2_3_cast_fp16)[name = string("input_15_cast_fp16")];
+            tensor<int32, [2]> var_480 = const()[name = string("op_480"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_482 = const()[name = string("op_482"), val = tensor<int32, [2]>([1, 1])];
+            string var_484_pad_type_0 = const()[name = string("op_484_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_484_pad_0 = const()[name = string("op_484_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_484_cast_fp16 = conv(dilations = var_482, groups = var_267, pad = var_484_pad_0, pad_type = var_484_pad_type_0, strides = var_480, weight = blocks_1_mlp_proj_weight_palettized_cast_fp16, x = input_15_cast_fp16)[name = string("op_484_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303763584)))];
-            tensor<fp16, [1, 4096, 1, 1]> var_473_cast_fp16 = mul(x = var_472_cast_fp16, y = blocks_1_mlp_proj_output_scales_to_fp16)[name = string("op_473_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_29_cast_fp16 = add(x = var_473_cast_fp16, y = x_25_cast_fp16)[name = string("x_29_cast_fp16")];
-            int32 var_484 = const()[name = string("op_484"), val = int32(-1)];
-            int32 var_492 = const()[name = string("op_492"), val = int32(3)];
-            int32 var_493 = const()[name = string("op_493"), val = int32(1)];
-            int32 var_496 = const()[name = string("op_496"), val = int32(-2)];
-            bool var_497 = const()[name = string("op_497"), val = bool(true)];
-            tensor<int32, [1]> var_514_axes_0 = const()[name = string("op_514_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_514_cast_fp16 = squeeze(axes = var_514_axes_0, x = x_29_cast_fp16)[name = string("op_514_cast_fp16")];
-            bool var_516_interleave_0 = const()[name = string("op_516_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_9_to_fp16 = const()[name = string("eps_chan_9_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_516_cast_fp16 = concat(axis = var_493, interleave = var_516_interleave_0, values = (var_514_cast_fp16, eps_chan_9_to_fp16))[name = string("op_516_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> var_485_cast_fp16 = mul(x = var_484_cast_fp16, y = blocks_1_mlp_proj_output_scales_to_fp16)[name = string("op_485_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_29_cast_fp16 = add(x = var_485_cast_fp16, y = x_25_cast_fp16)[name = string("x_29_cast_fp16")];
+            int32 var_496 = const()[name = string("op_496"), val = int32(-1)];
+            int32 var_504 = const()[name = string("op_504"), val = int32(3)];
+            int32 var_505 = const()[name = string("op_505"), val = int32(1)];
+            int32 var_508 = const()[name = string("op_508"), val = int32(-2)];
+            bool var_509 = const()[name = string("op_509"), val = bool(true)];
+            tensor<int32, [1]> var_526_axes_0 = const()[name = string("op_526_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_526_cast_fp16 = squeeze(axes = var_526_axes_0, x = x_29_cast_fp16)[name = string("op_526_cast_fp16")];
+            bool var_528_interleave_0 = const()[name = string("op_528_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_9_to_fp16 = const()[name = string("eps_chan_9_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_528_cast_fp16 = concat(axis = var_505, interleave = var_528_interleave_0, values = (var_526_cast_fp16, eps_chan_9_to_fp16))[name = string("op_528_cast_fp16")];
             tensor<int32, [1]> x_eps_9_axes_0 = const()[name = string("x_eps_9_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_9_cast_fp16 = expand_dims(axes = x_eps_9_axes_0, x = var_516_cast_fp16)[name = string("x_eps_9_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_9_cast_fp16 = expand_dims(axes = x_eps_9_axes_0, x = var_528_cast_fp16)[name = string("x_eps_9_cast_fp16")];
             tensor<int32, [1]> norm_x_9_axes_0 = const()[name = string("norm_x_9_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_9_cast_fp16 = reduce_l2_norm(axes = norm_x_9_axes_0, keep_dims = var_497, x = x_eps_9_cast_fp16)[name = string("norm_x_9_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_25_cast_fp16 = real_div(x = x_29_cast_fp16, y = norm_x_9_cast_fp16)[name = string("x_normed_25_cast_fp16")];
-            fp16 var_521_to_fp16 = const()[name = string("op_521_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_27_cast_fp16 = mul(x = x_normed_25_cast_fp16, y = var_521_to_fp16)[name = string("x_normed_27_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_9_cast_fp16 = reduce_l2_norm(axes = norm_x_9_axes_0, keep_dims = var_509, x = x_eps_9_cast_fp16)[name = string("norm_x_9_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_25_cast_fp16 = real_div(x = x_29_cast_fp16, y = norm_x_9_cast_fp16)[name = string("x_normed_25_cast_fp16")];
+            fp16 var_533_to_fp16 = const()[name = string("op_533_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_27_cast_fp16 = mul(x = x_normed_25_cast_fp16, y = var_533_to_fp16)[name = string("x_normed_27_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_norm_1_weight_to_fp16 = const()[name = string("blocks_2_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303771840)))];
-            tensor<fp16, [1, 4096, 1, 1]> x_33_cast_fp16 = mul(x = x_normed_27_cast_fp16, y = blocks_2_norm_1_weight_to_fp16)[name = string("x_33_cast_fp16")];
-            tensor<int32, [2]> var_536 = const()[name = string("op_536"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_538 = const()[name = string("op_538"), val = tensor<int32, [2]>([1, 1])];
-            string var_540_pad_type_0 = const()[name = string("op_540_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_540_pad_0 = const()[name = string("op_540_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_540_cast_fp16 = conv(dilations = var_538, groups = var_493, pad = var_540_pad_0, pad_type = var_540_pad_type_0, strides = var_536, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_540_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_33_cast_fp16 = mul(x = x_normed_27_cast_fp16, y = blocks_2_norm_1_weight_to_fp16)[name = string("x_33_cast_fp16")];
+            tensor<int32, [2]> var_549 = const()[name = string("op_549"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_551 = const()[name = string("op_551"), val = tensor<int32, [2]>([1, 1])];
+            string var_553_pad_type_0 = const()[name = string("op_553_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_553_pad_0 = const()[name = string("op_553_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_553_cast_fp16 = conv(dilations = var_551, groups = var_505, pad = var_553_pad_0, pad_type = var_553_pad_type_0, strides = var_549, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_553_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303780096)))];
-            tensor<fp16, [1, 4096, 1, 1]> q_13_cast_fp16 = mul(x = var_540_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = string("q_13_cast_fp16")];
-            tensor<int32, [2]> var_544 = const()[name = string("op_544"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_546 = const()[name = string("op_546"), val = tensor<int32, [2]>([1, 1])];
-            string var_548_pad_type_0 = const()[name = string("op_548_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_548_pad_0 = const()[name = string("op_548_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_548_cast_fp16 = conv(dilations = var_546, groups = var_493, pad = var_548_pad_0, pad_type = var_548_pad_type_0, strides = var_544, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_548_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> q_13_cast_fp16 = mul(x = var_553_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = string("q_13_cast_fp16")];
+            tensor<int32, [2]> var_557 = const()[name = string("op_557"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_559 = const()[name = string("op_559"), val = tensor<int32, [2]>([1, 1])];
+            string var_561_pad_type_0 = const()[name = string("op_561_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_561_pad_0 = const()[name = string("op_561_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_561_cast_fp16 = conv(dilations = var_559, groups = var_505, pad = var_561_pad_0, pad_type = var_561_pad_type_0, strides = var_557, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_561_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303788352)))];
-            tensor<fp16, [1, 4096, 1, 1]> k_17_cast_fp16 = mul(x = var_548_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = string("k_17_cast_fp16")];
-            tensor<int32, [2]> var_552 = const()[name = string("op_552"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_554 = const()[name = string("op_554"), val = tensor<int32, [2]>([1, 1])];
-            string var_556_pad_type_0 = const()[name = string("op_556_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_556_pad_0 = const()[name = string("op_556_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_556_cast_fp16 = conv(dilations = var_554, groups = var_493, pad = var_556_pad_0, pad_type = var_556_pad_type_0, strides = var_552, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_556_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> k_17_cast_fp16 = mul(x = var_561_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = string("k_17_cast_fp16")];
+            tensor<int32, [2]> var_565 = const()[name = string("op_565"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_567 = const()[name = string("op_567"), val = tensor<int32, [2]>([1, 1])];
+            string var_569_pad_type_0 = const()[name = string("op_569_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_569_pad_0 = const()[name = string("op_569_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_569_cast_fp16 = conv(dilations = var_567, groups = var_505, pad = var_569_pad_0, pad_type = var_569_pad_type_0, strides = var_565, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_569_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303796608)))];
-            tensor<fp16, [1, 4096, 1, 1]> v_13_cast_fp16 = mul(x = var_556_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = string("v_13_cast_fp16")];
-            tensor<int32, [4]> var_558 = const()[name = string("op_558"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> q_15_cast_fp16 = reshape(shape = var_558, x = q_13_cast_fp16)[name = string("q_15_cast_fp16")];
-            tensor<int32, [4]> var_560 = const()[name = string("op_560"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> k_19_cast_fp16 = reshape(shape = var_560, x = k_17_cast_fp16)[name = string("k_19_cast_fp16")];
-            tensor<int32, [4]> var_562 = const()[name = string("op_562"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> v_15_cast_fp16 = reshape(shape = var_562, x = v_13_cast_fp16)[name = string("v_15_cast_fp16")];
-            tensor<int32, [4]> var_574_begin_0 = const()[name = string("op_574_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_574_end_0 = const()[name = string("op_574_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_574_end_mask_0 = const()[name = string("op_574_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_574_cast_fp16 = slice_by_index(begin = var_574_begin_0, end = var_574_end_0, end_mask = var_574_end_mask_0, x = q_15_cast_fp16)[name = string("op_574_cast_fp16")];
-            tensor<int32, [4]> var_580_begin_0 = const()[name = string("op_580_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_580_end_0 = const()[name = string("op_580_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_580_end_mask_0 = const()[name = string("op_580_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_580_cast_fp16 = slice_by_index(begin = var_580_begin_0, end = var_580_end_0, end_mask = var_580_end_mask_0, x = q_15_cast_fp16)[name = string("op_580_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> v_17_cast_fp16 = mul(x = var_569_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = string("v_17_cast_fp16")];
+            tensor<int32, [4]> var_571 = const()[name = string("op_571"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> q_15_cast_fp16 = reshape(shape = var_571, x = q_13_cast_fp16)[name = string("q_15_cast_fp16")];
+            tensor<int32, [4]> var_573 = const()[name = string("op_573"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> k_19_cast_fp16 = reshape(shape = var_573, x = k_17_cast_fp16)[name = string("k_19_cast_fp16")];
+            tensor<int32, [4]> var_575 = const()[name = string("op_575"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> v_19_cast_fp16 = reshape(shape = var_575, x = v_17_cast_fp16)[name = string("v_19_cast_fp16")];
+            tensor<int32, [4]> var_587_begin_0 = const()[name = string("op_587_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_587_end_0 = const()[name = string("op_587_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_587_end_mask_0 = const()[name = string("op_587_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_587_cast_fp16 = slice_by_index(begin = var_587_begin_0, end = var_587_end_0, end_mask = var_587_end_mask_0, x = q_15_cast_fp16)[name = string("op_587_cast_fp16")];
+            tensor<int32, [4]> var_593_begin_0 = const()[name = string("op_593_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_593_end_0 = const()[name = string("op_593_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_593_end_mask_0 = const()[name = string("op_593_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_593_cast_fp16 = slice_by_index(begin = var_593_begin_0, end = var_593_end_0, end_mask = var_593_end_mask_0, x = q_15_cast_fp16)[name = string("op_593_cast_fp16")];
             fp16 const_32_promoted_to_fp16 = const()[name = string("const_32_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_582_cast_fp16 = mul(x = var_580_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_582_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_595_cast_fp16 = mul(x = var_593_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_595_cast_fp16")];
             bool rotated_9_interleave_0 = const()[name = string("rotated_9_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_9_cast_fp16 = concat(axis = var_496, interleave = rotated_9_interleave_0, values = (var_582_cast_fp16, var_574_cast_fp16))[name = string("rotated_9_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_585_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = string("op_585_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_586_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = string("op_586_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_9_cast_fp16 = add(x = var_585_cast_fp16, y = var_586_cast_fp16)[name = string("roped_9_cast_fp16")];
-            tensor<int32, [4]> var_599_begin_0 = const()[name = string("op_599_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_599_end_0 = const()[name = string("op_599_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_599_end_mask_0 = const()[name = string("op_599_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_599_cast_fp16 = slice_by_index(begin = var_599_begin_0, end = var_599_end_0, end_mask = var_599_end_mask_0, x = k_19_cast_fp16)[name = string("op_599_cast_fp16")];
-            tensor<int32, [4]> var_605_begin_0 = const()[name = string("op_605_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_605_end_0 = const()[name = string("op_605_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_605_end_mask_0 = const()[name = string("op_605_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_605_cast_fp16 = slice_by_index(begin = var_605_begin_0, end = var_605_end_0, end_mask = var_605_end_mask_0, x = k_19_cast_fp16)[name = string("op_605_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_9_cast_fp16 = concat(axis = var_508, interleave = rotated_9_interleave_0, values = (var_595_cast_fp16, var_587_cast_fp16))[name = string("rotated_9_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_598_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = string("op_598_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_599_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = string("op_599_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_9_cast_fp16 = add(x = var_598_cast_fp16, y = var_599_cast_fp16)[name = string("roped_9_cast_fp16")];
+            tensor<int32, [4]> var_612_begin_0 = const()[name = string("op_612_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_612_end_0 = const()[name = string("op_612_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_612_end_mask_0 = const()[name = string("op_612_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_612_cast_fp16 = slice_by_index(begin = var_612_begin_0, end = var_612_end_0, end_mask = var_612_end_mask_0, x = k_19_cast_fp16)[name = string("op_612_cast_fp16")];
+            tensor<int32, [4]> var_618_begin_0 = const()[name = string("op_618_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_618_end_0 = const()[name = string("op_618_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_618_end_mask_0 = const()[name = string("op_618_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_618_cast_fp16 = slice_by_index(begin = var_618_begin_0, end = var_618_end_0, end_mask = var_618_end_mask_0, x = k_19_cast_fp16)[name = string("op_618_cast_fp16")];
             fp16 const_34_promoted_to_fp16 = const()[name = string("const_34_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_607_cast_fp16 = mul(x = var_605_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_607_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_620_cast_fp16 = mul(x = var_618_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_620_cast_fp16")];
             bool rotated_interleave_0 = const()[name = string("rotated_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_cast_fp16 = concat(axis = var_496, interleave = rotated_interleave_0, values = (var_607_cast_fp16, var_599_cast_fp16))[name = string("rotated_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_610_cast_fp16 = mul(x = k_19_cast_fp16, y = cos)[name = string("op_610_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_611_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_611_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_cast_fp16 = add(x = var_610_cast_fp16, y = var_611_cast_fp16)[name = string("roped_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_cast_fp16 = concat(axis = var_508, interleave = rotated_interleave_0, values = (var_620_cast_fp16, var_612_cast_fp16))[name = string("rotated_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_623_cast_fp16 = mul(x = k_19_cast_fp16, y = cos)[name = string("op_623_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_624_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_624_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_cast_fp16 = add(x = var_623_cast_fp16, y = var_624_cast_fp16)[name = string("roped_cast_fp16")];
+            tensor<int32, [4]> v_21_perm_0 = const()[name = string("v_21_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             bool k_interleave_0 = const()[name = string("k_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_cast_fp16 = concat(axis = var_484, interleave = k_interleave_0, values = (k_cache_2, roped_cast_fp16))[name = string("k_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> k_cast_fp16 = concat(axis = var_496, interleave = k_interleave_0, values = (k_cache_2, roped_cast_fp16))[name = string("k_cast_fp16")];
             bool v_interleave_0 = const()[name = string("v_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> v_cast_fp16 = concat(axis = var_484, interleave = v_interleave_0, values = (v_cache_2, v_15_cast_fp16))[name = string("v_cast_fp16")];
-            tensor<int32, [4]> var_618_begin_0 = const()[name = string("op_618_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_618_end_0 = const()[name = string("op_618_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_618_end_mask_0 = const()[name = string("op_618_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_2 = slice_by_index(begin = var_618_begin_0, end = var_618_end_0, end_mask = var_618_end_mask_0, x = k_cast_fp16)[name = string("op_618_cast_fp16")];
-            tensor<int32, [4]> var_619_begin_0 = const()[name = string("op_619_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_619_end_0 = const()[name = string("op_619_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_619_end_mask_0 = const()[name = string("op_619_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_2 = slice_by_index(begin = var_619_begin_0, end = var_619_end_0, end_mask = var_619_end_mask_0, x = v_cast_fp16)[name = string("op_619_cast_fp16")];
-            fp16 var_623_to_fp16 = const()[name = string("op_623_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 1]> var_624_cast_fp16 = mul(x = roped_9_cast_fp16, y = var_623_to_fp16)[name = string("op_624_cast_fp16")];
-            bool attn_weights_9_transpose_x_0 = const()[name = string("attn_weights_9_transpose_x_0"), val = bool(true)];
-            bool attn_weights_9_transpose_y_0 = const()[name = string("attn_weights_9_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_9_cast_fp16 = matmul(transpose_x = attn_weights_9_transpose_x_0, transpose_y = attn_weights_9_transpose_y_0, x = var_624_cast_fp16, y = k_cast_fp16)[name = string("attn_weights_9_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_cast_fp16 = add(x = attn_weights_9_cast_fp16, y = mask)[name = string("attn_weights_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> var_632_cast_fp16 = softmax(axis = var_492, x = attn_weights_cast_fp16)[name = string("op_632_cast_fp16")];
-            bool attn_9_transpose_x_0 = const()[name = string("attn_9_transpose_x_0"), val = bool(false)];
-            bool attn_9_transpose_y_0 = const()[name = string("attn_9_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 1]> attn_9_cast_fp16 = matmul(transpose_x = attn_9_transpose_x_0, transpose_y = attn_9_transpose_y_0, x = v_cast_fp16, y = var_632_cast_fp16)[name = string("attn_9_cast_fp16")];
-            tensor<int32, [4]> var_636 = const()[name = string("op_636"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
-            tensor<fp16, [1, 4096, 1, 1]> input_17_cast_fp16 = reshape(shape = var_636, x = attn_9_cast_fp16)[name = string("input_17_cast_fp16")];
-            tensor<int32, [2]> var_640 = const()[name = string("op_640"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_642 = const()[name = string("op_642"), val = tensor<int32, [2]>([1, 1])];
-            string var_644_pad_type_0 = const()[name = string("op_644_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_644_pad_0 = const()[name = string("op_644_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_644_cast_fp16 = conv(dilations = var_642, groups = var_493, pad = var_644_pad_0, pad_type = var_644_pad_type_0, strides = var_640, weight = blocks_2_attn_proj_weight_palettized_cast_fp16, x = input_17_cast_fp16)[name = string("op_644_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 128]> v_21_cast_fp16 = transpose(perm = v_21_perm_0, x = v_19_cast_fp16)[name = string("transpose_1")];
+            tensor<fp16, [1, 32, 512, 128]> v_cast_fp16 = concat(axis = var_508, interleave = v_interleave_0, values = (v_cache_2, v_21_cast_fp16))[name = string("v_cast_fp16")];
+            tensor<int32, [4]> var_635_begin_0 = const()[name = string("op_635_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_635_end_0 = const()[name = string("op_635_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_635_end_mask_0 = const()[name = string("op_635_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_2 = slice_by_index(begin = var_635_begin_0, end = var_635_end_0, end_mask = var_635_end_mask_0, x = k_cast_fp16)[name = string("op_635_cast_fp16")];
+            tensor<int32, [4]> var_636_begin_0 = const()[name = string("op_636_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_636_end_0 = const()[name = string("op_636_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_636_end_mask_0 = const()[name = string("op_636_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_2 = slice_by_index(begin = var_636_begin_0, end = var_636_end_0, end_mask = var_636_end_mask_0, x = v_cast_fp16)[name = string("op_636_cast_fp16")];
+            fp16 var_641_to_fp16 = const()[name = string("op_641_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 32, 128, 4]> var_642_cast_fp16 = mul(x = roped_9_cast_fp16, y = var_641_to_fp16)[name = string("op_642_cast_fp16")];
+            bool attn_weights_13_transpose_x_0 = const()[name = string("attn_weights_13_transpose_x_0"), val = bool(true)];
+            bool attn_weights_13_transpose_y_0 = const()[name = string("attn_weights_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_13_cast_fp16 = matmul(transpose_x = attn_weights_13_transpose_x_0, transpose_y = attn_weights_13_transpose_y_0, x = var_642_cast_fp16, y = k_cast_fp16)[name = string("attn_weights_13_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_15_cast_fp16 = add(x = attn_weights_13_cast_fp16, y = mask)[name = string("attn_weights_15_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_cast_fp16 = softmax(axis = var_504, x = attn_weights_15_cast_fp16)[name = string("attn_weights_cast_fp16")];
+            bool var_651_transpose_x_0 = const()[name = string("op_651_transpose_x_0"), val = bool(false)];
+            bool var_651_transpose_y_0 = const()[name = string("op_651_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> var_651_cast_fp16 = matmul(transpose_x = var_651_transpose_x_0, transpose_y = var_651_transpose_y_0, x = attn_weights_cast_fp16, y = v_cast_fp16)[name = string("op_651_cast_fp16")];
+            tensor<int32, [4]> attn_5_perm_0 = const()[name = string("attn_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_654 = const()[name = string("op_654"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 4]> attn_5_cast_fp16 = transpose(perm = attn_5_perm_0, x = var_651_cast_fp16)[name = string("transpose_0")];
+            tensor<fp16, [1, 4096, 1, 4]> input_17_cast_fp16 = reshape(shape = var_654, x = attn_5_cast_fp16)[name = string("input_17_cast_fp16")];
+            tensor<int32, [2]> var_658 = const()[name = string("op_658"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_660 = const()[name = string("op_660"), val = tensor<int32, [2]>([1, 1])];
+            string var_662_pad_type_0 = const()[name = string("op_662_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_662_pad_0 = const()[name = string("op_662_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_662_cast_fp16 = conv(dilations = var_660, groups = var_505, pad = var_662_pad_0, pad_type = var_662_pad_type_0, strides = var_658, weight = blocks_2_attn_proj_weight_palettized_cast_fp16, x = input_17_cast_fp16)[name = string("op_662_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303804864)))];
-            tensor<fp16, [1, 4096, 1, 1]> attention_output_cast_fp16 = mul(x = var_644_cast_fp16, y = blocks_2_attn_proj_output_scales_to_fp16)[name = string("attention_output_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_39_cast_fp16 = add(x = attention_output_cast_fp16, y = x_29_cast_fp16)[name = string("x_39_cast_fp16")];
-            tensor<int32, [1]> var_663_axes_0 = const()[name = string("op_663_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_663_cast_fp16 = squeeze(axes = var_663_axes_0, x = x_39_cast_fp16)[name = string("op_663_cast_fp16")];
-            bool var_665_interleave_0 = const()[name = string("op_665_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_to_fp16 = const()[name = string("eps_chan_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_665_cast_fp16 = concat(axis = var_493, interleave = var_665_interleave_0, values = (var_663_cast_fp16, eps_chan_to_fp16))[name = string("op_665_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> attention_output_cast_fp16 = mul(x = var_662_cast_fp16, y = blocks_2_attn_proj_output_scales_to_fp16)[name = string("attention_output_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_39_cast_fp16 = add(x = attention_output_cast_fp16, y = x_29_cast_fp16)[name = string("x_39_cast_fp16")];
+            tensor<int32, [1]> var_681_axes_0 = const()[name = string("op_681_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_681_cast_fp16 = squeeze(axes = var_681_axes_0, x = x_39_cast_fp16)[name = string("op_681_cast_fp16")];
+            bool var_683_interleave_0 = const()[name = string("op_683_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_to_fp16 = const()[name = string("eps_chan_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_683_cast_fp16 = concat(axis = var_505, interleave = var_683_interleave_0, values = (var_681_cast_fp16, eps_chan_to_fp16))[name = string("op_683_cast_fp16")];
             tensor<int32, [1]> x_eps_axes_0 = const()[name = string("x_eps_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_cast_fp16 = expand_dims(axes = x_eps_axes_0, x = var_665_cast_fp16)[name = string("x_eps_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_cast_fp16 = expand_dims(axes = x_eps_axes_0, x = var_683_cast_fp16)[name = string("x_eps_cast_fp16")];
             tensor<int32, [1]> norm_x_axes_0 = const()[name = string("norm_x_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_cast_fp16 = reduce_l2_norm(axes = norm_x_axes_0, keep_dims = var_497, x = x_eps_cast_fp16)[name = string("norm_x_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_31_cast_fp16 = real_div(x = x_39_cast_fp16, y = norm_x_cast_fp16)[name = string("x_normed_31_cast_fp16")];
-            fp16 var_670_to_fp16 = const()[name = string("op_670_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_33_cast_fp16 = mul(x = x_normed_31_cast_fp16, y = var_670_to_fp16)[name = string("x_normed_33_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_cast_fp16 = reduce_l2_norm(axes = norm_x_axes_0, keep_dims = var_509, x = x_eps_cast_fp16)[name = string("norm_x_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_31_cast_fp16 = real_div(x = x_39_cast_fp16, y = norm_x_cast_fp16)[name = string("x_normed_31_cast_fp16")];
+            fp16 var_688_to_fp16 = const()[name = string("op_688_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_33_cast_fp16 = mul(x = x_normed_31_cast_fp16, y = var_688_to_fp16)[name = string("x_normed_33_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_norm_2_weight_to_fp16 = const()[name = string("blocks_2_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303813120)))];
-            tensor<fp16, [1, 4096, 1, 1]> input_19_cast_fp16 = mul(x = x_normed_33_cast_fp16, y = blocks_2_norm_2_weight_to_fp16)[name = string("input_19_cast_fp16")];
-            tensor<int32, [2]> var_682 = const()[name = string("op_682"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_684 = const()[name = string("op_684"), val = tensor<int32, [2]>([1, 1])];
-            string var_686_pad_type_0 = const()[name = string("op_686_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_686_pad_0 = const()[name = string("op_686_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_686_cast_fp16 = conv(dilations = var_684, groups = var_493, pad = var_686_pad_0, pad_type = var_686_pad_type_0, strides = var_682, weight = blocks_2_mlp_fc_1_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_686_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303821376)))];
-            tensor<fp16, [1, 11008, 1, 1]> input_21_cast_fp16 = mul(x = var_686_cast_fp16, y = blocks_2_mlp_fc_1_output_scales_to_fp16)[name = string("input_21_cast_fp16")];
-            tensor<int32, [2]> var_690 = const()[name = string("op_690"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_692 = const()[name = string("op_692"), val = tensor<int32, [2]>([1, 1])];
-            string var_694_pad_type_0 = const()[name = string("op_694_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_694_pad_0 = const()[name = string("op_694_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_694_cast_fp16 = conv(dilations = var_692, groups = var_493, pad = var_694_pad_0, pad_type = var_694_pad_type_0, strides = var_690, weight = blocks_2_mlp_fc_2_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_694_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303843456)))];
-            tensor<fp16, [1, 11008, 1, 1]> x_fc_2_cast_fp16 = mul(x = var_694_cast_fp16, y = blocks_2_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> var_696_cast_fp16 = silu(x = input_21_cast_fp16)[name = string("op_696_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> input_cast_fp16 = mul(x = var_696_cast_fp16, y = x_fc_2_cast_fp16)[name = string("input_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> input_19_cast_fp16 = mul(x = x_normed_33_cast_fp16, y = blocks_2_norm_2_weight_to_fp16)[name = string("input_19_cast_fp16")];
             tensor<int32, [2]> var_700 = const()[name = string("op_700"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_702 = const()[name = string("op_702"), val = tensor<int32, [2]>([1, 1])];
             string var_704_pad_type_0 = const()[name = string("op_704_pad_type_0"), val = string("custom")];
             tensor<int32, [4]> var_704_pad_0 = const()[name = string("op_704_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_704_cast_fp16 = conv(dilations = var_702, groups = var_493, pad = var_704_pad_0, pad_type = var_704_pad_type_0, strides = var_700, weight = blocks_2_mlp_proj_weight_palettized_cast_fp16, x = input_cast_fp16)[name = string("op_704_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> var_704_cast_fp16 = conv(dilations = var_702, groups = var_505, pad = var_704_pad_0, pad_type = var_704_pad_type_0, strides = var_700, weight = blocks_2_mlp_fc_1_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_704_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303821376)))];
+            tensor<fp16, [1, 11008, 1, 4]> input_21_cast_fp16 = mul(x = var_704_cast_fp16, y = blocks_2_mlp_fc_1_output_scales_to_fp16)[name = string("input_21_cast_fp16")];
+            tensor<int32, [2]> var_708 = const()[name = string("op_708"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_710 = const()[name = string("op_710"), val = tensor<int32, [2]>([1, 1])];
+            string var_712_pad_type_0 = const()[name = string("op_712_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_712_pad_0 = const()[name = string("op_712_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_712_cast_fp16 = conv(dilations = var_710, groups = var_505, pad = var_712_pad_0, pad_type = var_712_pad_type_0, strides = var_708, weight = blocks_2_mlp_fc_2_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_712_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303843456)))];
+            tensor<fp16, [1, 11008, 1, 4]> x_fc_2_cast_fp16 = mul(x = var_712_cast_fp16, y = blocks_2_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> var_714_cast_fp16 = silu(x = input_21_cast_fp16)[name = string("op_714_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> input_cast_fp16 = mul(x = var_714_cast_fp16, y = x_fc_2_cast_fp16)[name = string("input_cast_fp16")];
+            tensor<int32, [2]> var_718 = const()[name = string("op_718"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_720 = const()[name = string("op_720"), val = tensor<int32, [2]>([1, 1])];
+            string var_722_pad_type_0 = const()[name = string("op_722_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_722_pad_0 = const()[name = string("op_722_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_722_cast_fp16 = conv(dilations = var_720, groups = var_505, pad = var_722_pad_0, pad_type = var_722_pad_type_0, strides = var_718, weight = blocks_2_mlp_proj_weight_palettized_cast_fp16, x = input_cast_fp16)[name = string("op_722_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303865536)))];
-            tensor<fp16, [1, 4096, 1, 1]> var_705_cast_fp16 = mul(x = var_704_cast_fp16, y = blocks_2_mlp_proj_output_scales_to_fp16)[name = string("op_705_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> new_x = add(x = var_705_cast_fp16, y = x_39_cast_fp16)[name = string("op_706_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> var_723_cast_fp16 = mul(x = var_722_cast_fp16, y = blocks_2_mlp_proj_output_scales_to_fp16)[name = string("op_723_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> new_x = add(x = var_723_cast_fp16, y = x_39_cast_fp16)[name = string("op_724_cast_fp16")];
         } -> (new_x, new_k_cache_0, new_k_cache_1, new_k_cache_2, new_v_cache_0, new_v_cache_1, new_v_cache_2);
     func input_512_context_512<ios18>(tensor<fp16, [128, 512]> cos, tensor<fp16, [1, 1, 512, 512]> mask, tensor<fp16, [128, 512]> sin, tensor<fp16, [1, 4096, 1, 512]> x) {
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_q_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8388736))))[name = string("blocks_0_attn_q_proj_weight_palettized_cast_fp16")];
@@ -502,86 +514,86 @@ program(1.3)
             tensor<fp16, [1, 4096, 1, 512]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_54_to_fp16)[name = string("x_normed_3_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_1_weight_to_fp16 = const()[name = string("blocks_0_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303567936)))];
             tensor<fp16, [1, 4096, 1, 512]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = string("x_5_cast_fp16")];
-            tensor<int32, [2]> var_66 = const()[name = string("op_66"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_68 = const()[name = string("op_68"), val = tensor<int32, [2]>([1, 1])];
-            string var_70_pad_type_0 = const()[name = string("op_70_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_70_pad_0 = const()[name = string("op_70_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_70_cast_fp16 = conv(dilations = var_68, groups = var_25, pad = var_70_pad_0, pad_type = var_70_pad_type_0, strides = var_66, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_70_cast_fp16")];
+            tensor<int32, [2]> var_67 = const()[name = string("op_67"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_69 = const()[name = string("op_69"), val = tensor<int32, [2]>([1, 1])];
+            string var_71_pad_type_0 = const()[name = string("op_71_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_71_pad_0 = const()[name = string("op_71_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_71_cast_fp16 = conv(dilations = var_69, groups = var_25, pad = var_71_pad_0, pad_type = var_71_pad_type_0, strides = var_67, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_71_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303576192)))];
-            tensor<fp16, [1, 4096, 1, 512]> q_1_cast_fp16 = mul(x = var_70_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
-            tensor<int32, [2]> var_74 = const()[name = string("op_74"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_76 = const()[name = string("op_76"), val = tensor<int32, [2]>([1, 1])];
-            string var_78_pad_type_0 = const()[name = string("op_78_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_78_pad_0 = const()[name = string("op_78_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_78_cast_fp16 = conv(dilations = var_76, groups = var_25, pad = var_78_pad_0, pad_type = var_78_pad_type_0, strides = var_74, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_78_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> q_1_cast_fp16 = mul(x = var_71_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
+            tensor<int32, [2]> var_75 = const()[name = string("op_75"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_77 = const()[name = string("op_77"), val = tensor<int32, [2]>([1, 1])];
+            string var_79_pad_type_0 = const()[name = string("op_79_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_79_pad_0 = const()[name = string("op_79_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_79_cast_fp16 = conv(dilations = var_77, groups = var_25, pad = var_79_pad_0, pad_type = var_79_pad_type_0, strides = var_75, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_79_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303584448)))];
-            tensor<fp16, [1, 4096, 1, 512]> k_1_cast_fp16 = mul(x = var_78_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
-            tensor<int32, [2]> var_82 = const()[name = string("op_82"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_84 = const()[name = string("op_84"), val = tensor<int32, [2]>([1, 1])];
-            string var_86_pad_type_0 = const()[name = string("op_86_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_86_pad_0 = const()[name = string("op_86_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_86_cast_fp16 = conv(dilations = var_84, groups = var_25, pad = var_86_pad_0, pad_type = var_86_pad_type_0, strides = var_82, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_86_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> k_1_cast_fp16 = mul(x = var_79_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
+            tensor<int32, [2]> var_83 = const()[name = string("op_83"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_85 = const()[name = string("op_85"), val = tensor<int32, [2]>([1, 1])];
+            string var_87_pad_type_0 = const()[name = string("op_87_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_87_pad_0 = const()[name = string("op_87_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_87_cast_fp16 = conv(dilations = var_85, groups = var_25, pad = var_87_pad_0, pad_type = var_87_pad_type_0, strides = var_83, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_87_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303592704)))];
-            tensor<fp16, [1, 4096, 1, 512]> v_1_cast_fp16 = mul(x = var_86_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
-            tensor<int32, [4]> var_88 = const()[name = string("op_88"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> q_3_cast_fp16 = reshape(shape = var_88, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
-            tensor<int32, [4]> var_90 = const()[name = string("op_90"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> k_3_cast_fp16 = reshape(shape = var_90, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
-            tensor<int32, [4]> var_92 = const()[name = string("op_92"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> v_3_cast_fp16 = reshape(shape = var_92, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
-            tensor<int32, [4]> var_104_begin_0 = const()[name = string("op_104_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_104_end_0 = const()[name = string("op_104_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_104_end_mask_0 = const()[name = string("op_104_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_104_cast_fp16 = slice_by_index(begin = var_104_begin_0, end = var_104_end_0, end_mask = var_104_end_mask_0, x = q_3_cast_fp16)[name = string("op_104_cast_fp16")];
-            tensor<int32, [4]> var_110_begin_0 = const()[name = string("op_110_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_110_end_0 = const()[name = string("op_110_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_110_end_mask_0 = const()[name = string("op_110_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_110_cast_fp16 = slice_by_index(begin = var_110_begin_0, end = var_110_end_0, end_mask = var_110_end_mask_0, x = q_3_cast_fp16)[name = string("op_110_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> v_1_cast_fp16 = mul(x = var_87_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
+            tensor<int32, [4]> var_89 = const()[name = string("op_89"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> q_3_cast_fp16 = reshape(shape = var_89, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
+            tensor<int32, [4]> var_91 = const()[name = string("op_91"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> k_3_cast_fp16 = reshape(shape = var_91, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
+            tensor<int32, [4]> var_93 = const()[name = string("op_93"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> v_3_cast_fp16 = reshape(shape = var_93, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
+            tensor<int32, [4]> var_105_begin_0 = const()[name = string("op_105_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_105_end_0 = const()[name = string("op_105_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_105_end_mask_0 = const()[name = string("op_105_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_105_cast_fp16 = slice_by_index(begin = var_105_begin_0, end = var_105_end_0, end_mask = var_105_end_mask_0, x = q_3_cast_fp16)[name = string("op_105_cast_fp16")];
+            tensor<int32, [4]> var_111_begin_0 = const()[name = string("op_111_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_111_end_0 = const()[name = string("op_111_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_111_end_mask_0 = const()[name = string("op_111_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_111_cast_fp16 = slice_by_index(begin = var_111_begin_0, end = var_111_end_0, end_mask = var_111_end_mask_0, x = q_3_cast_fp16)[name = string("op_111_cast_fp16")];
             fp16 const_6_promoted_to_fp16 = const()[name = string("const_6_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_112_cast_fp16 = mul(x = var_110_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_112_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_113_cast_fp16 = mul(x = var_111_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_113_cast_fp16")];
             bool rotated_1_interleave_0 = const()[name = string("rotated_1_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_1_cast_fp16 = concat(axis = var_28, interleave = rotated_1_interleave_0, values = (var_112_cast_fp16, var_104_cast_fp16))[name = string("rotated_1_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_115_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_115_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_116_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_116_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_1_cast_fp16 = add(x = var_115_cast_fp16, y = var_116_cast_fp16)[name = string("roped_1_cast_fp16")];
-            tensor<int32, [4]> var_129_begin_0 = const()[name = string("op_129_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_129_end_0 = const()[name = string("op_129_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_129_end_mask_0 = const()[name = string("op_129_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_129_cast_fp16 = slice_by_index(begin = var_129_begin_0, end = var_129_end_0, end_mask = var_129_end_mask_0, x = k_3_cast_fp16)[name = string("op_129_cast_fp16")];
-            tensor<int32, [4]> var_135_begin_0 = const()[name = string("op_135_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_135_end_0 = const()[name = string("op_135_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_135_end_mask_0 = const()[name = string("op_135_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_135_cast_fp16 = slice_by_index(begin = var_135_begin_0, end = var_135_end_0, end_mask = var_135_end_mask_0, x = k_3_cast_fp16)[name = string("op_135_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_1_cast_fp16 = concat(axis = var_28, interleave = rotated_1_interleave_0, values = (var_113_cast_fp16, var_105_cast_fp16))[name = string("rotated_1_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_116_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_116_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_117_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_117_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_1_cast_fp16 = add(x = var_116_cast_fp16, y = var_117_cast_fp16)[name = string("roped_1_cast_fp16")];
+            tensor<int32, [4]> var_130_begin_0 = const()[name = string("op_130_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_130_end_0 = const()[name = string("op_130_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_130_end_mask_0 = const()[name = string("op_130_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_130_cast_fp16 = slice_by_index(begin = var_130_begin_0, end = var_130_end_0, end_mask = var_130_end_mask_0, x = k_3_cast_fp16)[name = string("op_130_cast_fp16")];
+            tensor<int32, [4]> var_136_begin_0 = const()[name = string("op_136_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_136_end_0 = const()[name = string("op_136_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_136_end_mask_0 = const()[name = string("op_136_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_136_cast_fp16 = slice_by_index(begin = var_136_begin_0, end = var_136_end_0, end_mask = var_136_end_mask_0, x = k_3_cast_fp16)[name = string("op_136_cast_fp16")];
             fp16 const_8_promoted_to_fp16 = const()[name = string("const_8_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_137_cast_fp16 = mul(x = var_135_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_137_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_138_cast_fp16 = mul(x = var_136_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_138_cast_fp16")];
             bool rotated_3_interleave_0 = const()[name = string("rotated_3_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_3_cast_fp16 = concat(axis = var_28, interleave = rotated_3_interleave_0, values = (var_137_cast_fp16, var_129_cast_fp16))[name = string("rotated_3_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_140_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_140_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_141_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_141_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_3_cast_fp16 = add(x = var_140_cast_fp16, y = var_141_cast_fp16)[name = string("roped_3_cast_fp16")];
-            bool q_5_interleave_0 = const()[name = string("q_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> q_5_cast_fp16 = concat(axis = var_28, interleave = q_5_interleave_0, values = roped_1_cast_fp16)[name = string("q_5_cast_fp16")];
-            bool k_5_interleave_0 = const()[name = string("k_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_5_cast_fp16 = concat(axis = var_28, interleave = k_5_interleave_0, values = roped_3_cast_fp16)[name = string("k_5_cast_fp16")];
-            tensor<int32, [4]> var_156_begin_0 = const()[name = string("op_156_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_156_end_0 = const()[name = string("op_156_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_156_end_mask_0 = const()[name = string("op_156_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_0 = slice_by_index(begin = var_156_begin_0, end = var_156_end_0, end_mask = var_156_end_mask_0, x = k_5_cast_fp16)[name = string("op_156_cast_fp16")];
-            tensor<int32, [4]> var_157_begin_0 = const()[name = string("op_157_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_157_end_0 = const()[name = string("op_157_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_157_end_mask_0 = const()[name = string("op_157_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_0 = slice_by_index(begin = var_157_begin_0, end = var_157_end_0, end_mask = var_157_end_mask_0, x = v_3_cast_fp16)[name = string("op_157_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_3_cast_fp16 = concat(axis = var_28, interleave = rotated_3_interleave_0, values = (var_138_cast_fp16, var_130_cast_fp16))[name = string("rotated_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_141_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_141_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_142_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_142_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_3_cast_fp16 = add(x = var_141_cast_fp16, y = var_142_cast_fp16)[name = string("roped_3_cast_fp16")];
+            tensor<int32, [4]> v_5_perm_0 = const()[name = string("v_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_146_begin_0 = const()[name = string("op_146_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_146_end_0 = const()[name = string("op_146_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_146_end_mask_0 = const()[name = string("op_146_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_0 = slice_by_index(begin = var_146_begin_0, end = var_146_end_0, end_mask = var_146_end_mask_0, x = roped_3_cast_fp16)[name = string("op_146_cast_fp16")];
+            tensor<int32, [4]> var_147_begin_0 = const()[name = string("op_147_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_147_end_0 = const()[name = string("op_147_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_147_end_mask_0 = const()[name = string("op_147_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 512, 128]> v_5_cast_fp16 = transpose(perm = v_5_perm_0, x = v_3_cast_fp16)[name = string("transpose_5")];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_0 = slice_by_index(begin = var_147_begin_0, end = var_147_end_0, end_mask = var_147_end_mask_0, x = v_5_cast_fp16)[name = string("op_147_cast_fp16")];
             fp16 var_161_to_fp16 = const()[name = string("op_161_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 512]> var_162_cast_fp16 = mul(x = q_5_cast_fp16, y = var_161_to_fp16)[name = string("op_162_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_162_cast_fp16 = mul(x = roped_1_cast_fp16, y = var_161_to_fp16)[name = string("op_162_cast_fp16")];
             bool attn_weights_1_transpose_x_0 = const()[name = string("attn_weights_1_transpose_x_0"), val = bool(true)];
             bool attn_weights_1_transpose_y_0 = const()[name = string("attn_weights_1_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_162_cast_fp16, y = k_5_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_162_cast_fp16, y = roped_3_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
             tensor<fp16, [1, 32, 512, 512]> attn_weights_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = mask)[name = string("attn_weights_3_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> var_170_cast_fp16 = softmax(axis = var_24, x = attn_weights_3_cast_fp16)[name = string("op_170_cast_fp16")];
-            bool attn_1_transpose_x_0 = const()[name = string("attn_1_transpose_x_0"), val = bool(false)];
-            bool attn_1_transpose_y_0 = const()[name = string("attn_1_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 512]> attn_1_cast_fp16 = matmul(transpose_x = attn_1_transpose_x_0, transpose_y = attn_1_transpose_y_0, x = v_3_cast_fp16, y = var_170_cast_fp16)[name = string("attn_1_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_5_cast_fp16 = softmax(axis = var_24, x = attn_weights_3_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
+            bool var_171_transpose_x_1 = const()[name = string("op_171_transpose_x_1"), val = bool(false)];
+            bool var_171_transpose_y_1 = const()[name = string("op_171_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 32, 512, 128]> var_171_cast_fp16 = matmul(transpose_x = var_171_transpose_x_1, transpose_y = var_171_transpose_y_1, x = attn_weights_5_cast_fp16, y = v_3_cast_fp16)[name = string("op_171_cast_fp16")];
+            tensor<int32, [4]> attn_1_perm_0 = const()[name = string("attn_1_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             tensor<int32, [4]> var_174 = const()[name = string("op_174"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 512]> attn_1_cast_fp16 = transpose(perm = attn_1_perm_0, x = var_171_cast_fp16)[name = string("transpose_4")];
             tensor<fp16, [1, 4096, 1, 512]> input_1_cast_fp16 = reshape(shape = var_174, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
             tensor<int32, [2]> var_178 = const()[name = string("op_178"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_180 = const()[name = string("op_180"), val = tensor<int32, [2]>([1, 1])];
@@ -645,86 +657,86 @@ program(1.3)
             tensor<fp16, [1, 4096, 1, 512]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_291_to_fp16)[name = string("x_normed_15_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_1_weight_to_fp16 = const()[name = string("blocks_1_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303669888)))];
             tensor<fp16, [1, 4096, 1, 512]> x_19_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = string("x_19_cast_fp16")];
-            tensor<int32, [2]> var_306 = const()[name = string("op_306"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_308 = const()[name = string("op_308"), val = tensor<int32, [2]>([1, 1])];
-            string var_310_pad_type_0 = const()[name = string("op_310_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_310_pad_0 = const()[name = string("op_310_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_310_cast_fp16 = conv(dilations = var_308, groups = var_263, pad = var_310_pad_0, pad_type = var_310_pad_type_0, strides = var_306, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_310_cast_fp16")];
+            tensor<int32, [2]> var_307 = const()[name = string("op_307"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_309 = const()[name = string("op_309"), val = tensor<int32, [2]>([1, 1])];
+            string var_311_pad_type_0 = const()[name = string("op_311_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_311_pad_0 = const()[name = string("op_311_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_311_cast_fp16 = conv(dilations = var_309, groups = var_263, pad = var_311_pad_0, pad_type = var_311_pad_type_0, strides = var_307, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_311_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303678144)))];
-            tensor<fp16, [1, 4096, 1, 512]> q_7_cast_fp16 = mul(x = var_310_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
-            tensor<int32, [2]> var_314 = const()[name = string("op_314"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_316 = const()[name = string("op_316"), val = tensor<int32, [2]>([1, 1])];
-            string var_318_pad_type_0 = const()[name = string("op_318_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_318_pad_0 = const()[name = string("op_318_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_318_cast_fp16 = conv(dilations = var_316, groups = var_263, pad = var_318_pad_0, pad_type = var_318_pad_type_0, strides = var_314, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_318_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> q_7_cast_fp16 = mul(x = var_311_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
+            tensor<int32, [2]> var_315 = const()[name = string("op_315"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_317 = const()[name = string("op_317"), val = tensor<int32, [2]>([1, 1])];
+            string var_319_pad_type_0 = const()[name = string("op_319_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_319_pad_0 = const()[name = string("op_319_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_319_cast_fp16 = conv(dilations = var_317, groups = var_263, pad = var_319_pad_0, pad_type = var_319_pad_type_0, strides = var_315, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_319_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303686400)))];
-            tensor<fp16, [1, 4096, 1, 512]> k_7_cast_fp16 = mul(x = var_318_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_7_cast_fp16")];
-            tensor<int32, [2]> var_322 = const()[name = string("op_322"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_324 = const()[name = string("op_324"), val = tensor<int32, [2]>([1, 1])];
-            string var_326_pad_type_0 = const()[name = string("op_326_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_326_pad_0 = const()[name = string("op_326_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_326_cast_fp16 = conv(dilations = var_324, groups = var_263, pad = var_326_pad_0, pad_type = var_326_pad_type_0, strides = var_322, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_326_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> k_7_cast_fp16 = mul(x = var_319_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_7_cast_fp16")];
+            tensor<int32, [2]> var_323 = const()[name = string("op_323"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_325 = const()[name = string("op_325"), val = tensor<int32, [2]>([1, 1])];
+            string var_327_pad_type_0 = const()[name = string("op_327_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_327_pad_0 = const()[name = string("op_327_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_327_cast_fp16 = conv(dilations = var_325, groups = var_263, pad = var_327_pad_0, pad_type = var_327_pad_type_0, strides = var_323, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_327_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303694656)))];
-            tensor<fp16, [1, 4096, 1, 512]> v_5_cast_fp16 = mul(x = var_326_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_5_cast_fp16")];
-            tensor<int32, [4]> var_328 = const()[name = string("op_328"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> q_9_cast_fp16 = reshape(shape = var_328, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
-            tensor<int32, [4]> var_330 = const()[name = string("op_330"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> k_9_cast_fp16 = reshape(shape = var_330, x = k_7_cast_fp16)[name = string("k_9_cast_fp16")];
-            tensor<int32, [4]> var_332 = const()[name = string("op_332"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> v_7_cast_fp16 = reshape(shape = var_332, x = v_5_cast_fp16)[name = string("v_7_cast_fp16")];
-            tensor<int32, [4]> var_344_begin_0 = const()[name = string("op_344_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_344_end_0 = const()[name = string("op_344_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_344_end_mask_0 = const()[name = string("op_344_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_344_cast_fp16 = slice_by_index(begin = var_344_begin_0, end = var_344_end_0, end_mask = var_344_end_mask_0, x = q_9_cast_fp16)[name = string("op_344_cast_fp16")];
-            tensor<int32, [4]> var_350_begin_0 = const()[name = string("op_350_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_350_end_0 = const()[name = string("op_350_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_350_end_mask_0 = const()[name = string("op_350_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_350_cast_fp16 = slice_by_index(begin = var_350_begin_0, end = var_350_end_0, end_mask = var_350_end_mask_0, x = q_9_cast_fp16)[name = string("op_350_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> v_7_cast_fp16 = mul(x = var_327_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_7_cast_fp16")];
+            tensor<int32, [4]> var_329 = const()[name = string("op_329"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> q_9_cast_fp16 = reshape(shape = var_329, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
+            tensor<int32, [4]> var_331 = const()[name = string("op_331"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> k_9_cast_fp16 = reshape(shape = var_331, x = k_7_cast_fp16)[name = string("k_9_cast_fp16")];
+            tensor<int32, [4]> var_333 = const()[name = string("op_333"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> v_9_cast_fp16 = reshape(shape = var_333, x = v_7_cast_fp16)[name = string("v_9_cast_fp16")];
+            tensor<int32, [4]> var_345_begin_0 = const()[name = string("op_345_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_345_end_0 = const()[name = string("op_345_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_345_end_mask_0 = const()[name = string("op_345_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_345_cast_fp16 = slice_by_index(begin = var_345_begin_0, end = var_345_end_0, end_mask = var_345_end_mask_0, x = q_9_cast_fp16)[name = string("op_345_cast_fp16")];
+            tensor<int32, [4]> var_351_begin_0 = const()[name = string("op_351_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_351_end_0 = const()[name = string("op_351_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_351_end_mask_0 = const()[name = string("op_351_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_351_cast_fp16 = slice_by_index(begin = var_351_begin_0, end = var_351_end_0, end_mask = var_351_end_mask_0, x = q_9_cast_fp16)[name = string("op_351_cast_fp16")];
             fp16 const_19_promoted_to_fp16 = const()[name = string("const_19_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_352_cast_fp16 = mul(x = var_350_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_352_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_353_cast_fp16 = mul(x = var_351_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_353_cast_fp16")];
             bool rotated_5_interleave_0 = const()[name = string("rotated_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_5_cast_fp16 = concat(axis = var_266, interleave = rotated_5_interleave_0, values = (var_352_cast_fp16, var_344_cast_fp16))[name = string("rotated_5_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_355_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_355_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_356_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_356_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_5_cast_fp16 = add(x = var_355_cast_fp16, y = var_356_cast_fp16)[name = string("roped_5_cast_fp16")];
-            tensor<int32, [4]> var_369_begin_0 = const()[name = string("op_369_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_369_end_0 = const()[name = string("op_369_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_369_end_mask_0 = const()[name = string("op_369_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_369_cast_fp16 = slice_by_index(begin = var_369_begin_0, end = var_369_end_0, end_mask = var_369_end_mask_0, x = k_9_cast_fp16)[name = string("op_369_cast_fp16")];
-            tensor<int32, [4]> var_375_begin_0 = const()[name = string("op_375_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_375_end_0 = const()[name = string("op_375_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_375_end_mask_0 = const()[name = string("op_375_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_375_cast_fp16 = slice_by_index(begin = var_375_begin_0, end = var_375_end_0, end_mask = var_375_end_mask_0, x = k_9_cast_fp16)[name = string("op_375_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_5_cast_fp16 = concat(axis = var_266, interleave = rotated_5_interleave_0, values = (var_353_cast_fp16, var_345_cast_fp16))[name = string("rotated_5_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_356_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_356_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_357_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_357_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_5_cast_fp16 = add(x = var_356_cast_fp16, y = var_357_cast_fp16)[name = string("roped_5_cast_fp16")];
+            tensor<int32, [4]> var_370_begin_0 = const()[name = string("op_370_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_370_end_0 = const()[name = string("op_370_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_370_end_mask_0 = const()[name = string("op_370_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_370_cast_fp16 = slice_by_index(begin = var_370_begin_0, end = var_370_end_0, end_mask = var_370_end_mask_0, x = k_9_cast_fp16)[name = string("op_370_cast_fp16")];
+            tensor<int32, [4]> var_376_begin_0 = const()[name = string("op_376_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_376_end_0 = const()[name = string("op_376_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_376_end_mask_0 = const()[name = string("op_376_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_376_cast_fp16 = slice_by_index(begin = var_376_begin_0, end = var_376_end_0, end_mask = var_376_end_mask_0, x = k_9_cast_fp16)[name = string("op_376_cast_fp16")];
             fp16 const_21_promoted_to_fp16 = const()[name = string("const_21_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_377_cast_fp16 = mul(x = var_375_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_377_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_378_cast_fp16 = mul(x = var_376_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_378_cast_fp16")];
             bool rotated_7_interleave_0 = const()[name = string("rotated_7_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_7_cast_fp16 = concat(axis = var_266, interleave = rotated_7_interleave_0, values = (var_377_cast_fp16, var_369_cast_fp16))[name = string("rotated_7_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_380_cast_fp16 = mul(x = k_9_cast_fp16, y = cos)[name = string("op_380_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_381_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = string("op_381_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_7_cast_fp16 = add(x = var_380_cast_fp16, y = var_381_cast_fp16)[name = string("roped_7_cast_fp16")];
-            bool q_11_interleave_0 = const()[name = string("q_11_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> q_11_cast_fp16 = concat(axis = var_266, interleave = q_11_interleave_0, values = roped_5_cast_fp16)[name = string("q_11_cast_fp16")];
-            bool k_11_interleave_0 = const()[name = string("k_11_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_11_cast_fp16 = concat(axis = var_266, interleave = k_11_interleave_0, values = roped_7_cast_fp16)[name = string("k_11_cast_fp16")];
-            tensor<int32, [4]> var_396_begin_0 = const()[name = string("op_396_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_396_end_0 = const()[name = string("op_396_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_396_end_mask_0 = const()[name = string("op_396_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_1 = slice_by_index(begin = var_396_begin_0, end = var_396_end_0, end_mask = var_396_end_mask_0, x = k_11_cast_fp16)[name = string("op_396_cast_fp16")];
-            tensor<int32, [4]> var_397_begin_0 = const()[name = string("op_397_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_397_end_0 = const()[name = string("op_397_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_397_end_mask_0 = const()[name = string("op_397_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_1 = slice_by_index(begin = var_397_begin_0, end = var_397_end_0, end_mask = var_397_end_mask_0, x = v_7_cast_fp16)[name = string("op_397_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_7_cast_fp16 = concat(axis = var_266, interleave = rotated_7_interleave_0, values = (var_378_cast_fp16, var_370_cast_fp16))[name = string("rotated_7_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_381_cast_fp16 = mul(x = k_9_cast_fp16, y = cos)[name = string("op_381_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_382_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = string("op_382_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_7_cast_fp16 = add(x = var_381_cast_fp16, y = var_382_cast_fp16)[name = string("roped_7_cast_fp16")];
+            tensor<int32, [4]> v_11_perm_0 = const()[name = string("v_11_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_386_begin_0 = const()[name = string("op_386_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_386_end_0 = const()[name = string("op_386_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_386_end_mask_0 = const()[name = string("op_386_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_1 = slice_by_index(begin = var_386_begin_0, end = var_386_end_0, end_mask = var_386_end_mask_0, x = roped_7_cast_fp16)[name = string("op_386_cast_fp16")];
+            tensor<int32, [4]> var_387_begin_0 = const()[name = string("op_387_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_387_end_0 = const()[name = string("op_387_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_387_end_mask_0 = const()[name = string("op_387_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 512, 128]> v_11_cast_fp16 = transpose(perm = v_11_perm_0, x = v_9_cast_fp16)[name = string("transpose_3")];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_1 = slice_by_index(begin = var_387_begin_0, end = var_387_end_0, end_mask = var_387_end_mask_0, x = v_11_cast_fp16)[name = string("op_387_cast_fp16")];
             fp16 var_401_to_fp16 = const()[name = string("op_401_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 512]> var_402_cast_fp16 = mul(x = q_11_cast_fp16, y = var_401_to_fp16)[name = string("op_402_cast_fp16")];
-            bool attn_weights_5_transpose_x_0 = const()[name = string("attn_weights_5_transpose_x_0"), val = bool(true)];
-            bool attn_weights_5_transpose_y_0 = const()[name = string("attn_weights_5_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_5_cast_fp16 = matmul(transpose_x = attn_weights_5_transpose_x_0, transpose_y = attn_weights_5_transpose_y_0, x = var_402_cast_fp16, y = k_11_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_7_cast_fp16 = add(x = attn_weights_5_cast_fp16, y = mask)[name = string("attn_weights_7_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> var_410_cast_fp16 = softmax(axis = var_262, x = attn_weights_7_cast_fp16)[name = string("op_410_cast_fp16")];
-            bool attn_3_transpose_x_0 = const()[name = string("attn_3_transpose_x_0"), val = bool(false)];
-            bool attn_3_transpose_y_0 = const()[name = string("attn_3_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 512]> attn_3_cast_fp16 = matmul(transpose_x = attn_3_transpose_x_0, transpose_y = attn_3_transpose_y_0, x = v_7_cast_fp16, y = var_410_cast_fp16)[name = string("attn_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_402_cast_fp16 = mul(x = roped_5_cast_fp16, y = var_401_to_fp16)[name = string("op_402_cast_fp16")];
+            bool attn_weights_7_transpose_x_0 = const()[name = string("attn_weights_7_transpose_x_0"), val = bool(true)];
+            bool attn_weights_7_transpose_y_0 = const()[name = string("attn_weights_7_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_7_cast_fp16 = matmul(transpose_x = attn_weights_7_transpose_x_0, transpose_y = attn_weights_7_transpose_y_0, x = var_402_cast_fp16, y = roped_7_cast_fp16)[name = string("attn_weights_7_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_9_cast_fp16 = add(x = attn_weights_7_cast_fp16, y = mask)[name = string("attn_weights_9_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_11_cast_fp16 = softmax(axis = var_262, x = attn_weights_9_cast_fp16)[name = string("attn_weights_11_cast_fp16")];
+            bool var_411_transpose_x_1 = const()[name = string("op_411_transpose_x_1"), val = bool(false)];
+            bool var_411_transpose_y_1 = const()[name = string("op_411_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 32, 512, 128]> var_411_cast_fp16 = matmul(transpose_x = var_411_transpose_x_1, transpose_y = var_411_transpose_y_1, x = attn_weights_11_cast_fp16, y = v_9_cast_fp16)[name = string("op_411_cast_fp16")];
+            tensor<int32, [4]> attn_3_perm_0 = const()[name = string("attn_3_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             tensor<int32, [4]> var_414 = const()[name = string("op_414"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 512]> attn_3_cast_fp16 = transpose(perm = attn_3_perm_0, x = var_411_cast_fp16)[name = string("transpose_2")];
             tensor<fp16, [1, 4096, 1, 512]> input_9_cast_fp16 = reshape(shape = var_414, x = attn_3_cast_fp16)[name = string("input_9_cast_fp16")];
             tensor<int32, [2]> var_418 = const()[name = string("op_418"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_420 = const()[name = string("op_420"), val = tensor<int32, [2]>([1, 1])];
@@ -788,86 +800,86 @@ program(1.3)
             tensor<fp16, [1, 4096, 1, 512]> x_normed_27_cast_fp16 = mul(x = x_normed_25_cast_fp16, y = var_531_to_fp16)[name = string("x_normed_27_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_norm_1_weight_to_fp16 = const()[name = string("blocks_2_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303771840)))];
             tensor<fp16, [1, 4096, 1, 512]> x_33_cast_fp16 = mul(x = x_normed_27_cast_fp16, y = blocks_2_norm_1_weight_to_fp16)[name = string("x_33_cast_fp16")];
-            tensor<int32, [2]> var_546 = const()[name = string("op_546"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_548 = const()[name = string("op_548"), val = tensor<int32, [2]>([1, 1])];
-            string var_550_pad_type_0 = const()[name = string("op_550_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_550_pad_0 = const()[name = string("op_550_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_550_cast_fp16 = conv(dilations = var_548, groups = var_503, pad = var_550_pad_0, pad_type = var_550_pad_type_0, strides = var_546, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_550_cast_fp16")];
+            tensor<int32, [2]> var_547 = const()[name = string("op_547"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_549 = const()[name = string("op_549"), val = tensor<int32, [2]>([1, 1])];
+            string var_551_pad_type_0 = const()[name = string("op_551_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_551_pad_0 = const()[name = string("op_551_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_551_cast_fp16 = conv(dilations = var_549, groups = var_503, pad = var_551_pad_0, pad_type = var_551_pad_type_0, strides = var_547, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_551_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303780096)))];
-            tensor<fp16, [1, 4096, 1, 512]> q_13_cast_fp16 = mul(x = var_550_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = string("q_13_cast_fp16")];
-            tensor<int32, [2]> var_554 = const()[name = string("op_554"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_556 = const()[name = string("op_556"), val = tensor<int32, [2]>([1, 1])];
-            string var_558_pad_type_0 = const()[name = string("op_558_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_558_pad_0 = const()[name = string("op_558_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_558_cast_fp16 = conv(dilations = var_556, groups = var_503, pad = var_558_pad_0, pad_type = var_558_pad_type_0, strides = var_554, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_558_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> q_13_cast_fp16 = mul(x = var_551_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = string("q_13_cast_fp16")];
+            tensor<int32, [2]> var_555 = const()[name = string("op_555"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_557 = const()[name = string("op_557"), val = tensor<int32, [2]>([1, 1])];
+            string var_559_pad_type_0 = const()[name = string("op_559_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_559_pad_0 = const()[name = string("op_559_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_559_cast_fp16 = conv(dilations = var_557, groups = var_503, pad = var_559_pad_0, pad_type = var_559_pad_type_0, strides = var_555, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_559_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303788352)))];
-            tensor<fp16, [1, 4096, 1, 512]> k_13_cast_fp16 = mul(x = var_558_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = string("k_13_cast_fp16")];
-            tensor<int32, [2]> var_562 = const()[name = string("op_562"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_564 = const()[name = string("op_564"), val = tensor<int32, [2]>([1, 1])];
-            string var_566_pad_type_0 = const()[name = string("op_566_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_566_pad_0 = const()[name = string("op_566_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_566_cast_fp16 = conv(dilations = var_564, groups = var_503, pad = var_566_pad_0, pad_type = var_566_pad_type_0, strides = var_562, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_566_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> k_13_cast_fp16 = mul(x = var_559_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = string("k_13_cast_fp16")];
+            tensor<int32, [2]> var_563 = const()[name = string("op_563"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_565 = const()[name = string("op_565"), val = tensor<int32, [2]>([1, 1])];
+            string var_567_pad_type_0 = const()[name = string("op_567_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_567_pad_0 = const()[name = string("op_567_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_567_cast_fp16 = conv(dilations = var_565, groups = var_503, pad = var_567_pad_0, pad_type = var_567_pad_type_0, strides = var_563, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_567_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303796608)))];
-            tensor<fp16, [1, 4096, 1, 512]> v_9_cast_fp16 = mul(x = var_566_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = string("v_9_cast_fp16")];
-            tensor<int32, [4]> var_568 = const()[name = string("op_568"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> q_15_cast_fp16 = reshape(shape = var_568, x = q_13_cast_fp16)[name = string("q_15_cast_fp16")];
-            tensor<int32, [4]> var_570 = const()[name = string("op_570"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = reshape(shape = var_570, x = k_13_cast_fp16)[name = string("k_15_cast_fp16")];
-            tensor<int32, [4]> var_572 = const()[name = string("op_572"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> v_cast_fp16 = reshape(shape = var_572, x = v_9_cast_fp16)[name = string("v_cast_fp16")];
-            tensor<int32, [4]> var_584_begin_0 = const()[name = string("op_584_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_584_end_0 = const()[name = string("op_584_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_584_end_mask_0 = const()[name = string("op_584_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_584_cast_fp16 = slice_by_index(begin = var_584_begin_0, end = var_584_end_0, end_mask = var_584_end_mask_0, x = q_15_cast_fp16)[name = string("op_584_cast_fp16")];
-            tensor<int32, [4]> var_590_begin_0 = const()[name = string("op_590_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_590_end_0 = const()[name = string("op_590_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_590_end_mask_0 = const()[name = string("op_590_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_590_cast_fp16 = slice_by_index(begin = var_590_begin_0, end = var_590_end_0, end_mask = var_590_end_mask_0, x = q_15_cast_fp16)[name = string("op_590_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> v_13_cast_fp16 = mul(x = var_567_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = string("v_13_cast_fp16")];
+            tensor<int32, [4]> var_569 = const()[name = string("op_569"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> q_15_cast_fp16 = reshape(shape = var_569, x = q_13_cast_fp16)[name = string("q_15_cast_fp16")];
+            tensor<int32, [4]> var_571 = const()[name = string("op_571"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = reshape(shape = var_571, x = k_13_cast_fp16)[name = string("k_15_cast_fp16")];
+            tensor<int32, [4]> var_573 = const()[name = string("op_573"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> v_15_cast_fp16 = reshape(shape = var_573, x = v_13_cast_fp16)[name = string("v_15_cast_fp16")];
+            tensor<int32, [4]> var_585_begin_0 = const()[name = string("op_585_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_585_end_0 = const()[name = string("op_585_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_585_end_mask_0 = const()[name = string("op_585_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_585_cast_fp16 = slice_by_index(begin = var_585_begin_0, end = var_585_end_0, end_mask = var_585_end_mask_0, x = q_15_cast_fp16)[name = string("op_585_cast_fp16")];
+            tensor<int32, [4]> var_591_begin_0 = const()[name = string("op_591_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_591_end_0 = const()[name = string("op_591_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_591_end_mask_0 = const()[name = string("op_591_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_591_cast_fp16 = slice_by_index(begin = var_591_begin_0, end = var_591_end_0, end_mask = var_591_end_mask_0, x = q_15_cast_fp16)[name = string("op_591_cast_fp16")];
             fp16 const_32_promoted_to_fp16 = const()[name = string("const_32_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_592_cast_fp16 = mul(x = var_590_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_592_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_593_cast_fp16 = mul(x = var_591_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_593_cast_fp16")];
             bool rotated_9_interleave_0 = const()[name = string("rotated_9_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_9_cast_fp16 = concat(axis = var_506, interleave = rotated_9_interleave_0, values = (var_592_cast_fp16, var_584_cast_fp16))[name = string("rotated_9_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_595_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = string("op_595_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_596_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = string("op_596_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_9_cast_fp16 = add(x = var_595_cast_fp16, y = var_596_cast_fp16)[name = string("roped_9_cast_fp16")];
-            tensor<int32, [4]> var_609_begin_0 = const()[name = string("op_609_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_609_end_0 = const()[name = string("op_609_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_609_end_mask_0 = const()[name = string("op_609_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_609_cast_fp16 = slice_by_index(begin = var_609_begin_0, end = var_609_end_0, end_mask = var_609_end_mask_0, x = k_15_cast_fp16)[name = string("op_609_cast_fp16")];
-            tensor<int32, [4]> var_615_begin_0 = const()[name = string("op_615_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_615_end_0 = const()[name = string("op_615_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_615_end_mask_0 = const()[name = string("op_615_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_615_cast_fp16 = slice_by_index(begin = var_615_begin_0, end = var_615_end_0, end_mask = var_615_end_mask_0, x = k_15_cast_fp16)[name = string("op_615_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_9_cast_fp16 = concat(axis = var_506, interleave = rotated_9_interleave_0, values = (var_593_cast_fp16, var_585_cast_fp16))[name = string("rotated_9_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_596_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = string("op_596_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_597_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = string("op_597_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_9_cast_fp16 = add(x = var_596_cast_fp16, y = var_597_cast_fp16)[name = string("roped_9_cast_fp16")];
+            tensor<int32, [4]> var_610_begin_0 = const()[name = string("op_610_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_610_end_0 = const()[name = string("op_610_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_610_end_mask_0 = const()[name = string("op_610_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_610_cast_fp16 = slice_by_index(begin = var_610_begin_0, end = var_610_end_0, end_mask = var_610_end_mask_0, x = k_15_cast_fp16)[name = string("op_610_cast_fp16")];
+            tensor<int32, [4]> var_616_begin_0 = const()[name = string("op_616_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_616_end_0 = const()[name = string("op_616_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_616_end_mask_0 = const()[name = string("op_616_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_616_cast_fp16 = slice_by_index(begin = var_616_begin_0, end = var_616_end_0, end_mask = var_616_end_mask_0, x = k_15_cast_fp16)[name = string("op_616_cast_fp16")];
             fp16 const_34_promoted_to_fp16 = const()[name = string("const_34_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_617_cast_fp16 = mul(x = var_615_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_617_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_618_cast_fp16 = mul(x = var_616_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_618_cast_fp16")];
             bool rotated_interleave_0 = const()[name = string("rotated_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_cast_fp16 = concat(axis = var_506, interleave = rotated_interleave_0, values = (var_617_cast_fp16, var_609_cast_fp16))[name = string("rotated_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_620_cast_fp16 = mul(x = k_15_cast_fp16, y = cos)[name = string("op_620_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_621_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_621_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_cast_fp16 = add(x = var_620_cast_fp16, y = var_621_cast_fp16)[name = string("roped_cast_fp16")];
-            bool q_interleave_0 = const()[name = string("q_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> q_cast_fp16 = concat(axis = var_506, interleave = q_interleave_0, values = roped_9_cast_fp16)[name = string("q_cast_fp16")];
-            bool k_interleave_0 = const()[name = string("k_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_cast_fp16 = concat(axis = var_506, interleave = k_interleave_0, values = roped_cast_fp16)[name = string("k_cast_fp16")];
-            tensor<int32, [4]> var_636_begin_0 = const()[name = string("op_636_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_636_end_0 = const()[name = string("op_636_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_636_end_mask_0 = const()[name = string("op_636_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_2 = slice_by_index(begin = var_636_begin_0, end = var_636_end_0, end_mask = var_636_end_mask_0, x = k_cast_fp16)[name = string("op_636_cast_fp16")];
-            tensor<int32, [4]> var_637_begin_0 = const()[name = string("op_637_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_637_end_0 = const()[name = string("op_637_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_637_end_mask_0 = const()[name = string("op_637_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_2 = slice_by_index(begin = var_637_begin_0, end = var_637_end_0, end_mask = var_637_end_mask_0, x = v_cast_fp16)[name = string("op_637_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_cast_fp16 = concat(axis = var_506, interleave = rotated_interleave_0, values = (var_618_cast_fp16, var_610_cast_fp16))[name = string("rotated_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_621_cast_fp16 = mul(x = k_15_cast_fp16, y = cos)[name = string("op_621_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_622_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_622_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_cast_fp16 = add(x = var_621_cast_fp16, y = var_622_cast_fp16)[name = string("roped_cast_fp16")];
+            tensor<int32, [4]> v_perm_0 = const()[name = string("v_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_626_begin_0 = const()[name = string("op_626_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_626_end_0 = const()[name = string("op_626_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_626_end_mask_0 = const()[name = string("op_626_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_2 = slice_by_index(begin = var_626_begin_0, end = var_626_end_0, end_mask = var_626_end_mask_0, x = roped_cast_fp16)[name = string("op_626_cast_fp16")];
+            tensor<int32, [4]> var_627_begin_0 = const()[name = string("op_627_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_627_end_0 = const()[name = string("op_627_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_627_end_mask_0 = const()[name = string("op_627_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 512, 128]> v_cast_fp16 = transpose(perm = v_perm_0, x = v_15_cast_fp16)[name = string("transpose_1")];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_2 = slice_by_index(begin = var_627_begin_0, end = var_627_end_0, end_mask = var_627_end_mask_0, x = v_cast_fp16)[name = string("op_627_cast_fp16")];
             fp16 var_641_to_fp16 = const()[name = string("op_641_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 512]> var_642_cast_fp16 = mul(x = q_cast_fp16, y = var_641_to_fp16)[name = string("op_642_cast_fp16")];
-            bool attn_weights_9_transpose_x_0 = const()[name = string("attn_weights_9_transpose_x_0"), val = bool(true)];
-            bool attn_weights_9_transpose_y_0 = const()[name = string("attn_weights_9_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_9_cast_fp16 = matmul(transpose_x = attn_weights_9_transpose_x_0, transpose_y = attn_weights_9_transpose_y_0, x = var_642_cast_fp16, y = k_cast_fp16)[name = string("attn_weights_9_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_cast_fp16 = add(x = attn_weights_9_cast_fp16, y = mask)[name = string("attn_weights_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> var_650_cast_fp16 = softmax(axis = var_502, x = attn_weights_cast_fp16)[name = string("op_650_cast_fp16")];
-            bool attn_5_transpose_x_0 = const()[name = string("attn_5_transpose_x_0"), val = bool(false)];
-            bool attn_5_transpose_y_0 = const()[name = string("attn_5_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 512]> attn_5_cast_fp16 = matmul(transpose_x = attn_5_transpose_x_0, transpose_y = attn_5_transpose_y_0, x = v_cast_fp16, y = var_650_cast_fp16)[name = string("attn_5_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_642_cast_fp16 = mul(x = roped_9_cast_fp16, y = var_641_to_fp16)[name = string("op_642_cast_fp16")];
+            bool attn_weights_13_transpose_x_0 = const()[name = string("attn_weights_13_transpose_x_0"), val = bool(true)];
+            bool attn_weights_13_transpose_y_0 = const()[name = string("attn_weights_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_13_cast_fp16 = matmul(transpose_x = attn_weights_13_transpose_x_0, transpose_y = attn_weights_13_transpose_y_0, x = var_642_cast_fp16, y = roped_cast_fp16)[name = string("attn_weights_13_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_15_cast_fp16 = add(x = attn_weights_13_cast_fp16, y = mask)[name = string("attn_weights_15_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_cast_fp16 = softmax(axis = var_502, x = attn_weights_15_cast_fp16)[name = string("attn_weights_cast_fp16")];
+            bool var_651_transpose_x_1 = const()[name = string("op_651_transpose_x_1"), val = bool(false)];
+            bool var_651_transpose_y_1 = const()[name = string("op_651_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 32, 512, 128]> var_651_cast_fp16 = matmul(transpose_x = var_651_transpose_x_1, transpose_y = var_651_transpose_y_1, x = attn_weights_cast_fp16, y = v_15_cast_fp16)[name = string("op_651_cast_fp16")];
+            tensor<int32, [4]> attn_5_perm_0 = const()[name = string("attn_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             tensor<int32, [4]> var_654 = const()[name = string("op_654"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 512]> attn_5_cast_fp16 = transpose(perm = attn_5_perm_0, x = var_651_cast_fp16)[name = string("transpose_0")];
             tensor<fp16, [1, 4096, 1, 512]> input_17_cast_fp16 = reshape(shape = var_654, x = attn_5_cast_fp16)[name = string("input_17_cast_fp16")];
             tensor<int32, [2]> var_658 = const()[name = string("op_658"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_660 = const()[name = string("op_660"), val = tensor<int32, [2]>([1, 1])];
diff --git a/sequoia/Llama-2-7b-hf_chunk8.mlmodelc/analytics/coremldata.bin b/sequoia/Llama-2-7b-hf_chunk8.mlmodelc/analytics/coremldata.bin
index e3704b470dad34135a6b5cb4b471021bad5c3ae2..65ae082f31459bf8b09913d8861a15601cc13ad1 100644
--- a/sequoia/Llama-2-7b-hf_chunk8.mlmodelc/analytics/coremldata.bin
+++ b/sequoia/Llama-2-7b-hf_chunk8.mlmodelc/analytics/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:84e317a82cdf4e96f808f63e77f10098844d47ad522545181edfac4d287c9c92
+oid sha256:e69e7ad37dd59e97348d395eec9b4c41b7d3ea44d86f613751ae47803a0a2efe
 size 243
diff --git a/sequoia/Llama-2-7b-hf_chunk8.mlmodelc/coremldata.bin b/sequoia/Llama-2-7b-hf_chunk8.mlmodelc/coremldata.bin
index 8bbc6dd38c74fe23594355e106f197518d41765b..a503721fa97147a06f91e834353053693378b0ad 100644
--- a/sequoia/Llama-2-7b-hf_chunk8.mlmodelc/coremldata.bin
+++ b/sequoia/Llama-2-7b-hf_chunk8.mlmodelc/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e430d0795ff5c384187174f5718a2c13d0070f5d6a811831e18862497865a86d
+oid sha256:d35a0353bcfa501579e07af3718261af3b129b4bec004c1fe6d812a6403a3f5b
 size 1037
diff --git a/sequoia/Llama-2-7b-hf_chunk8.mlmodelc/metadata.json b/sequoia/Llama-2-7b-hf_chunk8.mlmodelc/metadata.json
index f04c031c840fcecb970c6f0963320f07715efc51..2f40b2faa023a4355807b2af5aa5cc9dfcc7915c 100644
--- a/sequoia/Llama-2-7b-hf_chunk8.mlmodelc/metadata.json
+++ b/sequoia/Llama-2-7b-hf_chunk8.mlmodelc/metadata.json
@@ -17,9 +17,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 128, 508]",
         "name" : "new_k_cache_0",
         "type" : "MultiArray"
       },
@@ -27,9 +27,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 128, 508]",
         "name" : "new_k_cache_1",
         "type" : "MultiArray"
       },
@@ -37,9 +37,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 128, 508]",
         "name" : "new_k_cache_2",
         "type" : "MultiArray"
       },
@@ -47,9 +47,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 508, 128]",
         "name" : "new_v_cache_0",
         "type" : "MultiArray"
       },
@@ -57,9 +57,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 508, 128]",
         "name" : "new_v_cache_1",
         "type" : "MultiArray"
       },
@@ -67,9 +67,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 508, 128]",
         "name" : "new_v_cache_2",
         "type" : "MultiArray"
       }
@@ -142,9 +142,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_0",
             "type" : "MultiArray"
           },
@@ -152,9 +152,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_1",
             "type" : "MultiArray"
           },
@@ -162,9 +162,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_2",
             "type" : "MultiArray"
           },
@@ -172,9 +172,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_0",
             "type" : "MultiArray"
           },
@@ -182,9 +182,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_1",
             "type" : "MultiArray"
           },
@@ -192,9 +192,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_2",
             "type" : "MultiArray"
           }
@@ -203,14 +203,15 @@
         "mlProgramOperationTypeHistogram" : {
           "Ios18.constexprLutToDense" : 21,
           "Ios18.conv" : 21,
-          "Ios18.matmul" : 6,
           "Ios18.expandDims" : 6,
-          "Ios18.concat" : 18,
+          "Ios18.matmul" : 6,
+          "Ios18.concat" : 12,
           "Ios18.add" : 15,
           "Ios18.realDiv" : 6,
           "Ios18.silu" : 3,
           "Ios18.softmax" : 3,
           "Ios18.sliceByIndex" : 18,
+          "Ios18.transpose" : 6,
           "Ios16.reduceL2Norm" : 6,
           "Ios18.squeeze" : 6,
           "Ios18.reshape" : 12,
@@ -223,9 +224,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 1)",
+            "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 4)",
             "shortDescription" : "",
-            "shape" : "[1, 4096, 1, 1]",
+            "shape" : "[1, 4096, 1, 4]",
             "name" : "x",
             "type" : "MultiArray"
           },
@@ -233,9 +234,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 128 × 1)",
+            "formattedType" : "MultiArray (Float16 128 × 4)",
             "shortDescription" : "",
-            "shape" : "[128, 1]",
+            "shape" : "[128, 4]",
             "name" : "cos",
             "type" : "MultiArray"
           },
@@ -243,9 +244,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 128 × 1)",
+            "formattedType" : "MultiArray (Float16 128 × 4)",
             "shortDescription" : "",
-            "shape" : "[128, 1]",
+            "shape" : "[128, 4]",
             "name" : "sin",
             "type" : "MultiArray"
           },
@@ -253,9 +254,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 1 × 1 × 512)",
+            "formattedType" : "MultiArray (Float16 1 × 1 × 4 × 512)",
             "shortDescription" : "",
-            "shape" : "[1, 1, 1, 512]",
+            "shape" : "[1, 1, 4, 512]",
             "name" : "mask",
             "type" : "MultiArray"
           },
@@ -263,9 +264,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "k_cache_0",
             "type" : "MultiArray"
           },
@@ -273,9 +274,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "v_cache_0",
             "type" : "MultiArray"
           },
@@ -283,9 +284,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "k_cache_1",
             "type" : "MultiArray"
           },
@@ -293,9 +294,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "v_cache_1",
             "type" : "MultiArray"
           },
@@ -303,9 +304,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "k_cache_2",
             "type" : "MultiArray"
           },
@@ -313,9 +314,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "v_cache_2",
             "type" : "MultiArray"
           }
@@ -330,9 +331,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 1)",
+            "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 4)",
             "shortDescription" : "",
-            "shape" : "[1, 4096, 1, 1]",
+            "shape" : "[1, 4096, 1, 4]",
             "name" : "new_x",
             "type" : "MultiArray"
           },
@@ -340,9 +341,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_0",
             "type" : "MultiArray"
           },
@@ -350,9 +351,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_1",
             "type" : "MultiArray"
           },
@@ -360,9 +361,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_2",
             "type" : "MultiArray"
           },
@@ -370,9 +371,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_0",
             "type" : "MultiArray"
           },
@@ -380,9 +381,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_1",
             "type" : "MultiArray"
           },
@@ -390,9 +391,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_2",
             "type" : "MultiArray"
           }
@@ -401,14 +402,15 @@
         "mlProgramOperationTypeHistogram" : {
           "Ios18.constexprLutToDense" : 21,
           "Ios18.conv" : 21,
-          "Ios18.matmul" : 6,
           "Ios18.expandDims" : 6,
+          "Ios18.matmul" : 6,
           "Ios18.concat" : 18,
           "Ios18.add" : 15,
           "Ios18.realDiv" : 6,
           "Ios18.silu" : 3,
           "Ios18.softmax" : 3,
           "Ios18.sliceByIndex" : 18,
+          "Ios18.transpose" : 6,
           "Ios16.reduceL2Norm" : 6,
           "Ios18.squeeze" : 6,
           "Ios18.reshape" : 12,
@@ -419,14 +421,15 @@
     "mlProgramOperationTypeHistogram" : {
       "Ios18.constexprLutToDense" : 21,
       "Ios18.conv" : 21,
-      "Ios18.matmul" : 6,
       "Ios18.expandDims" : 6,
-      "Ios18.concat" : 18,
+      "Ios18.matmul" : 6,
+      "Ios18.concat" : 12,
       "Ios18.add" : 15,
       "Ios18.realDiv" : 6,
       "Ios18.silu" : 3,
       "Ios18.softmax" : 3,
       "Ios18.sliceByIndex" : 18,
+      "Ios18.transpose" : 6,
       "Ios16.reduceL2Norm" : 6,
       "Ios18.squeeze" : 6,
       "Ios18.reshape" : 12,
@@ -491,7 +494,7 @@
       }
     ],
     "defaultFunctionName" : "input_512_context_512",
-    "generatedClassName" : "Llama_2_7b_hf_2024_07_02_20_36_17_merged_chunk8",
+    "generatedClassName" : "Llama_2_7b_hf_2024_07_17_19_34_17_merged_chunk8",
     "userDefinedMetadata" : {
 
     },
diff --git a/sequoia/Llama-2-7b-hf_chunk8.mlmodelc/model.mil b/sequoia/Llama-2-7b-hf_chunk8.mlmodelc/model.mil
index 5b7b1db6b14fe10ff51aaa601d8484d9ff49576b..85f75a6da9054155e32013d96460963c79fba3f8 100644
--- a/sequoia/Llama-2-7b-hf_chunk8.mlmodelc/model.mil
+++ b/sequoia/Llama-2-7b-hf_chunk8.mlmodelc/model.mil
@@ -1,7 +1,7 @@
 program(1.3)
-[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3400.34.1"}, {"coremlc-version", "3400.42.1"}})]
+[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3400.42.1"}, {"coremlc-version", "3400.51.1"}})]
 {
-    func input_1_context_512<ios18>(tensor<fp16, [128, 1]> cos, tensor<fp16, [1, 32, 128, 511]> k_cache_0, tensor<fp16, [1, 32, 128, 511]> k_cache_1, tensor<fp16, [1, 32, 128, 511]> k_cache_2, tensor<fp16, [1, 1, 1, 512]> mask, tensor<fp16, [128, 1]> sin, tensor<fp16, [1, 32, 128, 511]> v_cache_0, tensor<fp16, [1, 32, 128, 511]> v_cache_1, tensor<fp16, [1, 32, 128, 511]> v_cache_2, tensor<fp16, [1, 4096, 1, 1]> x) [CoreML_InputDefaultValues = dict<string, fp32>({{"k_cache_0", 0}, {"k_cache_1", 0}, {"k_cache_2", 0}, {"v_cache_0", 0}, {"v_cache_1", 0}, {"v_cache_2", 0}})] {
+    func input_1_context_512<ios18>(tensor<fp16, [128, 4]> cos, tensor<fp16, [1, 32, 128, 508]> k_cache_0, tensor<fp16, [1, 32, 128, 508]> k_cache_1, tensor<fp16, [1, 32, 128, 508]> k_cache_2, tensor<fp16, [1, 1, 4, 512]> mask, tensor<fp16, [128, 4]> sin, tensor<fp16, [1, 32, 508, 128]> v_cache_0, tensor<fp16, [1, 32, 508, 128]> v_cache_1, tensor<fp16, [1, 32, 508, 128]> v_cache_2, tensor<fp16, [1, 4096, 1, 4]> x) [CoreML_InputDefaultValues = dict<string, fp32>({{"k_cache_0", 0}, {"k_cache_1", 0}, {"k_cache_2", 0}, {"v_cache_0", 0}, {"v_cache_1", 0}, {"v_cache_2", 0}})] {
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_q_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303873792))))[name = string("blocks_0_attn_q_proj_weight_palettized_cast_fp16")];
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_k_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8388864))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303873920))))[name = string("blocks_0_attn_k_proj_weight_palettized_cast_fp16")];
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_v_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(16777664))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303874048))))[name = string("blocks_0_attn_v_proj_weight_palettized_cast_fp16")];
@@ -29,438 +29,450 @@ program(1.3)
             int32 var_34 = const()[name = string("op_34"), val = int32(-2)];
             bool var_35 = const()[name = string("op_35"), val = bool(true)];
             tensor<int32, [1]> var_53_axes_0 = const()[name = string("op_53_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_53_cast_fp16 = squeeze(axes = var_53_axes_0, x = x)[name = string("op_53_cast_fp16")];
+            tensor<fp16, [1, 4096, 4]> var_53_cast_fp16 = squeeze(axes = var_53_axes_0, x = x)[name = string("op_53_cast_fp16")];
             bool var_55_interleave_0 = const()[name = string("op_55_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_1_to_fp16 = const()[name = string("eps_chan_1_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_55_cast_fp16 = concat(axis = var_31, interleave = var_55_interleave_0, values = (var_53_cast_fp16, eps_chan_1_to_fp16))[name = string("op_55_cast_fp16")];
+            tensor<fp16, [1, 1, 4]> eps_chan_1_to_fp16 = const()[name = string("eps_chan_1_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_55_cast_fp16 = concat(axis = var_31, interleave = var_55_interleave_0, values = (var_53_cast_fp16, eps_chan_1_to_fp16))[name = string("op_55_cast_fp16")];
             tensor<int32, [1]> x_eps_1_axes_0 = const()[name = string("x_eps_1_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_1_cast_fp16 = expand_dims(axes = x_eps_1_axes_0, x = var_55_cast_fp16)[name = string("x_eps_1_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_1_cast_fp16 = expand_dims(axes = x_eps_1_axes_0, x = var_55_cast_fp16)[name = string("x_eps_1_cast_fp16")];
             tensor<int32, [1]> norm_x_1_axes_0 = const()[name = string("norm_x_1_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_1_cast_fp16 = reduce_l2_norm(axes = norm_x_1_axes_0, keep_dims = var_35, x = x_eps_1_cast_fp16)[name = string("norm_x_1_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_1_cast_fp16 = real_div(x = x, y = norm_x_1_cast_fp16)[name = string("x_normed_1_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_1_cast_fp16 = reduce_l2_norm(axes = norm_x_1_axes_0, keep_dims = var_35, x = x_eps_1_cast_fp16)[name = string("norm_x_1_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_1_cast_fp16 = real_div(x = x, y = norm_x_1_cast_fp16)[name = string("x_normed_1_cast_fp16")];
             fp16 var_60_to_fp16 = const()[name = string("op_60_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_60_to_fp16)[name = string("x_normed_3_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_60_to_fp16)[name = string("x_normed_3_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_1_weight_to_fp16 = const()[name = string("blocks_0_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303567936)))];
-            tensor<fp16, [1, 4096, 1, 1]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = string("x_5_cast_fp16")];
-            tensor<int32, [2]> var_72 = const()[name = string("op_72"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_74 = const()[name = string("op_74"), val = tensor<int32, [2]>([1, 1])];
-            string var_76_pad_type_0 = const()[name = string("op_76_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_76_pad_0 = const()[name = string("op_76_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_76_cast_fp16 = conv(dilations = var_74, groups = var_31, pad = var_76_pad_0, pad_type = var_76_pad_type_0, strides = var_72, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_76_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = string("x_5_cast_fp16")];
+            tensor<int32, [2]> var_73 = const()[name = string("op_73"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_75 = const()[name = string("op_75"), val = tensor<int32, [2]>([1, 1])];
+            string var_77_pad_type_0 = const()[name = string("op_77_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_77_pad_0 = const()[name = string("op_77_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_77_cast_fp16 = conv(dilations = var_75, groups = var_31, pad = var_77_pad_0, pad_type = var_77_pad_type_0, strides = var_73, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_77_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303576192)))];
-            tensor<fp16, [1, 4096, 1, 1]> q_1_cast_fp16 = mul(x = var_76_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
-            tensor<int32, [2]> var_80 = const()[name = string("op_80"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_82 = const()[name = string("op_82"), val = tensor<int32, [2]>([1, 1])];
-            string var_84_pad_type_0 = const()[name = string("op_84_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_84_pad_0 = const()[name = string("op_84_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_84_cast_fp16 = conv(dilations = var_82, groups = var_31, pad = var_84_pad_0, pad_type = var_84_pad_type_0, strides = var_80, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_84_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> q_1_cast_fp16 = mul(x = var_77_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
+            tensor<int32, [2]> var_81 = const()[name = string("op_81"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_83 = const()[name = string("op_83"), val = tensor<int32, [2]>([1, 1])];
+            string var_85_pad_type_0 = const()[name = string("op_85_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_85_pad_0 = const()[name = string("op_85_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_85_cast_fp16 = conv(dilations = var_83, groups = var_31, pad = var_85_pad_0, pad_type = var_85_pad_type_0, strides = var_81, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_85_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303584448)))];
-            tensor<fp16, [1, 4096, 1, 1]> k_1_cast_fp16 = mul(x = var_84_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
-            tensor<int32, [2]> var_88 = const()[name = string("op_88"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_90 = const()[name = string("op_90"), val = tensor<int32, [2]>([1, 1])];
-            string var_92_pad_type_0 = const()[name = string("op_92_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_92_pad_0 = const()[name = string("op_92_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_92_cast_fp16 = conv(dilations = var_90, groups = var_31, pad = var_92_pad_0, pad_type = var_92_pad_type_0, strides = var_88, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_92_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> k_1_cast_fp16 = mul(x = var_85_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
+            tensor<int32, [2]> var_89 = const()[name = string("op_89"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_91 = const()[name = string("op_91"), val = tensor<int32, [2]>([1, 1])];
+            string var_93_pad_type_0 = const()[name = string("op_93_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_93_pad_0 = const()[name = string("op_93_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_93_cast_fp16 = conv(dilations = var_91, groups = var_31, pad = var_93_pad_0, pad_type = var_93_pad_type_0, strides = var_89, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_93_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303592704)))];
-            tensor<fp16, [1, 4096, 1, 1]> v_1_cast_fp16 = mul(x = var_92_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
-            tensor<int32, [4]> var_94 = const()[name = string("op_94"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> q_3_cast_fp16 = reshape(shape = var_94, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
-            tensor<int32, [4]> var_96 = const()[name = string("op_96"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> k_3_cast_fp16 = reshape(shape = var_96, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
-            tensor<int32, [4]> var_98 = const()[name = string("op_98"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> v_3_cast_fp16 = reshape(shape = var_98, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
-            tensor<int32, [4]> var_110_begin_0 = const()[name = string("op_110_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_110_end_0 = const()[name = string("op_110_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_110_end_mask_0 = const()[name = string("op_110_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_110_cast_fp16 = slice_by_index(begin = var_110_begin_0, end = var_110_end_0, end_mask = var_110_end_mask_0, x = q_3_cast_fp16)[name = string("op_110_cast_fp16")];
-            tensor<int32, [4]> var_116_begin_0 = const()[name = string("op_116_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_116_end_0 = const()[name = string("op_116_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_116_end_mask_0 = const()[name = string("op_116_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_116_cast_fp16 = slice_by_index(begin = var_116_begin_0, end = var_116_end_0, end_mask = var_116_end_mask_0, x = q_3_cast_fp16)[name = string("op_116_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> v_1_cast_fp16 = mul(x = var_93_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
+            tensor<int32, [4]> var_95 = const()[name = string("op_95"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> q_3_cast_fp16 = reshape(shape = var_95, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
+            tensor<int32, [4]> var_97 = const()[name = string("op_97"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> k_3_cast_fp16 = reshape(shape = var_97, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
+            tensor<int32, [4]> var_99 = const()[name = string("op_99"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> v_3_cast_fp16 = reshape(shape = var_99, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
+            tensor<int32, [4]> var_111_begin_0 = const()[name = string("op_111_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_111_end_0 = const()[name = string("op_111_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_111_end_mask_0 = const()[name = string("op_111_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_111_cast_fp16 = slice_by_index(begin = var_111_begin_0, end = var_111_end_0, end_mask = var_111_end_mask_0, x = q_3_cast_fp16)[name = string("op_111_cast_fp16")];
+            tensor<int32, [4]> var_117_begin_0 = const()[name = string("op_117_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_117_end_0 = const()[name = string("op_117_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_117_end_mask_0 = const()[name = string("op_117_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_117_cast_fp16 = slice_by_index(begin = var_117_begin_0, end = var_117_end_0, end_mask = var_117_end_mask_0, x = q_3_cast_fp16)[name = string("op_117_cast_fp16")];
             fp16 const_6_promoted_to_fp16 = const()[name = string("const_6_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_118_cast_fp16 = mul(x = var_116_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_118_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_119_cast_fp16 = mul(x = var_117_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_119_cast_fp16")];
             bool rotated_1_interleave_0 = const()[name = string("rotated_1_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_1_cast_fp16 = concat(axis = var_34, interleave = rotated_1_interleave_0, values = (var_118_cast_fp16, var_110_cast_fp16))[name = string("rotated_1_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_121_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_121_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_122_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_122_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_1_cast_fp16 = add(x = var_121_cast_fp16, y = var_122_cast_fp16)[name = string("roped_1_cast_fp16")];
-            tensor<int32, [4]> var_135_begin_0 = const()[name = string("op_135_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_135_end_0 = const()[name = string("op_135_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_135_end_mask_0 = const()[name = string("op_135_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_135_cast_fp16 = slice_by_index(begin = var_135_begin_0, end = var_135_end_0, end_mask = var_135_end_mask_0, x = k_3_cast_fp16)[name = string("op_135_cast_fp16")];
-            tensor<int32, [4]> var_141_begin_0 = const()[name = string("op_141_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_141_end_0 = const()[name = string("op_141_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_141_end_mask_0 = const()[name = string("op_141_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_141_cast_fp16 = slice_by_index(begin = var_141_begin_0, end = var_141_end_0, end_mask = var_141_end_mask_0, x = k_3_cast_fp16)[name = string("op_141_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_1_cast_fp16 = concat(axis = var_34, interleave = rotated_1_interleave_0, values = (var_119_cast_fp16, var_111_cast_fp16))[name = string("rotated_1_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_122_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_122_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_123_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_123_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_1_cast_fp16 = add(x = var_122_cast_fp16, y = var_123_cast_fp16)[name = string("roped_1_cast_fp16")];
+            tensor<int32, [4]> var_136_begin_0 = const()[name = string("op_136_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_136_end_0 = const()[name = string("op_136_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_136_end_mask_0 = const()[name = string("op_136_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_136_cast_fp16 = slice_by_index(begin = var_136_begin_0, end = var_136_end_0, end_mask = var_136_end_mask_0, x = k_3_cast_fp16)[name = string("op_136_cast_fp16")];
+            tensor<int32, [4]> var_142_begin_0 = const()[name = string("op_142_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_142_end_0 = const()[name = string("op_142_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_142_end_mask_0 = const()[name = string("op_142_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_142_cast_fp16 = slice_by_index(begin = var_142_begin_0, end = var_142_end_0, end_mask = var_142_end_mask_0, x = k_3_cast_fp16)[name = string("op_142_cast_fp16")];
             fp16 const_8_promoted_to_fp16 = const()[name = string("const_8_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_143_cast_fp16 = mul(x = var_141_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_143_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_144_cast_fp16 = mul(x = var_142_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_144_cast_fp16")];
             bool rotated_3_interleave_0 = const()[name = string("rotated_3_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_3_cast_fp16 = concat(axis = var_34, interleave = rotated_3_interleave_0, values = (var_143_cast_fp16, var_135_cast_fp16))[name = string("rotated_3_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_146_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_146_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_147_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_147_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_3_cast_fp16 = add(x = var_146_cast_fp16, y = var_147_cast_fp16)[name = string("roped_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_3_cast_fp16 = concat(axis = var_34, interleave = rotated_3_interleave_0, values = (var_144_cast_fp16, var_136_cast_fp16))[name = string("rotated_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_147_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_147_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_148_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_148_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_3_cast_fp16 = add(x = var_147_cast_fp16, y = var_148_cast_fp16)[name = string("roped_3_cast_fp16")];
+            tensor<int32, [4]> v_5_perm_0 = const()[name = string("v_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             bool k_7_interleave_0 = const()[name = string("k_7_interleave_0"), val = bool(false)];
             tensor<fp16, [1, 32, 128, 512]> k_7_cast_fp16 = concat(axis = var_22, interleave = k_7_interleave_0, values = (k_cache_0, roped_3_cast_fp16))[name = string("k_7_cast_fp16")];
-            bool v_5_interleave_0 = const()[name = string("v_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> v_5_cast_fp16 = concat(axis = var_22, interleave = v_5_interleave_0, values = (v_cache_0, v_3_cast_fp16))[name = string("v_5_cast_fp16")];
-            tensor<int32, [4]> var_154_begin_0 = const()[name = string("op_154_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_154_end_0 = const()[name = string("op_154_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_154_end_mask_0 = const()[name = string("op_154_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_0 = slice_by_index(begin = var_154_begin_0, end = var_154_end_0, end_mask = var_154_end_mask_0, x = k_7_cast_fp16)[name = string("op_154_cast_fp16")];
-            tensor<int32, [4]> var_155_begin_0 = const()[name = string("op_155_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_155_end_0 = const()[name = string("op_155_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_155_end_mask_0 = const()[name = string("op_155_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_0 = slice_by_index(begin = var_155_begin_0, end = var_155_end_0, end_mask = var_155_end_mask_0, x = v_5_cast_fp16)[name = string("op_155_cast_fp16")];
-            fp16 var_159_to_fp16 = const()[name = string("op_159_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 1]> var_160_cast_fp16 = mul(x = roped_1_cast_fp16, y = var_159_to_fp16)[name = string("op_160_cast_fp16")];
+            bool v_7_interleave_0 = const()[name = string("v_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> v_5_cast_fp16 = transpose(perm = v_5_perm_0, x = v_3_cast_fp16)[name = string("transpose_5")];
+            tensor<fp16, [1, 32, 512, 128]> v_7_cast_fp16 = concat(axis = var_34, interleave = v_7_interleave_0, values = (v_cache_0, v_5_cast_fp16))[name = string("v_7_cast_fp16")];
+            tensor<int32, [4]> var_159_begin_0 = const()[name = string("op_159_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_159_end_0 = const()[name = string("op_159_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_159_end_mask_0 = const()[name = string("op_159_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_0 = slice_by_index(begin = var_159_begin_0, end = var_159_end_0, end_mask = var_159_end_mask_0, x = k_7_cast_fp16)[name = string("op_159_cast_fp16")];
+            tensor<int32, [4]> var_160_begin_0 = const()[name = string("op_160_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_160_end_0 = const()[name = string("op_160_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_160_end_mask_0 = const()[name = string("op_160_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_0 = slice_by_index(begin = var_160_begin_0, end = var_160_end_0, end_mask = var_160_end_mask_0, x = v_7_cast_fp16)[name = string("op_160_cast_fp16")];
+            fp16 var_165_to_fp16 = const()[name = string("op_165_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 32, 128, 4]> var_166_cast_fp16 = mul(x = roped_1_cast_fp16, y = var_165_to_fp16)[name = string("op_166_cast_fp16")];
             bool attn_weights_1_transpose_x_0 = const()[name = string("attn_weights_1_transpose_x_0"), val = bool(true)];
             bool attn_weights_1_transpose_y_0 = const()[name = string("attn_weights_1_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_160_cast_fp16, y = k_7_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = mask)[name = string("attn_weights_3_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> var_168_cast_fp16 = softmax(axis = var_30, x = attn_weights_3_cast_fp16)[name = string("op_168_cast_fp16")];
-            bool attn_1_transpose_x_0 = const()[name = string("attn_1_transpose_x_0"), val = bool(false)];
-            bool attn_1_transpose_y_0 = const()[name = string("attn_1_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 1]> attn_1_cast_fp16 = matmul(transpose_x = attn_1_transpose_x_0, transpose_y = attn_1_transpose_y_0, x = v_5_cast_fp16, y = var_168_cast_fp16)[name = string("attn_1_cast_fp16")];
-            tensor<int32, [4]> var_172 = const()[name = string("op_172"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
-            tensor<fp16, [1, 4096, 1, 1]> input_1_cast_fp16 = reshape(shape = var_172, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
-            tensor<int32, [2]> var_176 = const()[name = string("op_176"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_178 = const()[name = string("op_178"), val = tensor<int32, [2]>([1, 1])];
-            string var_180_pad_type_0 = const()[name = string("op_180_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_180_pad_0 = const()[name = string("op_180_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_180_cast_fp16 = conv(dilations = var_178, groups = var_31, pad = var_180_pad_0, pad_type = var_180_pad_type_0, strides = var_176, weight = blocks_0_attn_proj_weight_palettized_cast_fp16, x = input_1_cast_fp16)[name = string("op_180_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_166_cast_fp16, y = k_7_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = mask)[name = string("attn_weights_3_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_5_cast_fp16 = softmax(axis = var_30, x = attn_weights_3_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
+            bool var_175_transpose_x_0 = const()[name = string("op_175_transpose_x_0"), val = bool(false)];
+            bool var_175_transpose_y_0 = const()[name = string("op_175_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> var_175_cast_fp16 = matmul(transpose_x = var_175_transpose_x_0, transpose_y = var_175_transpose_y_0, x = attn_weights_5_cast_fp16, y = v_7_cast_fp16)[name = string("op_175_cast_fp16")];
+            tensor<int32, [4]> attn_1_perm_0 = const()[name = string("attn_1_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_178 = const()[name = string("op_178"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 4]> attn_1_cast_fp16 = transpose(perm = attn_1_perm_0, x = var_175_cast_fp16)[name = string("transpose_4")];
+            tensor<fp16, [1, 4096, 1, 4]> input_1_cast_fp16 = reshape(shape = var_178, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
+            tensor<int32, [2]> var_182 = const()[name = string("op_182"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_184 = const()[name = string("op_184"), val = tensor<int32, [2]>([1, 1])];
+            string var_186_pad_type_0 = const()[name = string("op_186_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_186_pad_0 = const()[name = string("op_186_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_186_cast_fp16 = conv(dilations = var_184, groups = var_31, pad = var_186_pad_0, pad_type = var_186_pad_type_0, strides = var_182, weight = blocks_0_attn_proj_weight_palettized_cast_fp16, x = input_1_cast_fp16)[name = string("op_186_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303600960)))];
-            tensor<fp16, [1, 4096, 1, 1]> attention_output_1_cast_fp16 = mul(x = var_180_cast_fp16, y = blocks_0_attn_proj_output_scales_to_fp16)[name = string("attention_output_1_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_11_cast_fp16 = add(x = attention_output_1_cast_fp16, y = x)[name = string("x_11_cast_fp16")];
-            tensor<int32, [1]> var_199_axes_0 = const()[name = string("op_199_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_199_cast_fp16 = squeeze(axes = var_199_axes_0, x = x_11_cast_fp16)[name = string("op_199_cast_fp16")];
-            bool var_201_interleave_0 = const()[name = string("op_201_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_3_to_fp16 = const()[name = string("eps_chan_3_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_201_cast_fp16 = concat(axis = var_31, interleave = var_201_interleave_0, values = (var_199_cast_fp16, eps_chan_3_to_fp16))[name = string("op_201_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> attention_output_1_cast_fp16 = mul(x = var_186_cast_fp16, y = blocks_0_attn_proj_output_scales_to_fp16)[name = string("attention_output_1_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_11_cast_fp16 = add(x = attention_output_1_cast_fp16, y = x)[name = string("x_11_cast_fp16")];
+            tensor<int32, [1]> var_205_axes_0 = const()[name = string("op_205_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_205_cast_fp16 = squeeze(axes = var_205_axes_0, x = x_11_cast_fp16)[name = string("op_205_cast_fp16")];
+            bool var_207_interleave_0 = const()[name = string("op_207_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_3_to_fp16 = const()[name = string("eps_chan_3_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_207_cast_fp16 = concat(axis = var_31, interleave = var_207_interleave_0, values = (var_205_cast_fp16, eps_chan_3_to_fp16))[name = string("op_207_cast_fp16")];
             tensor<int32, [1]> x_eps_3_axes_0 = const()[name = string("x_eps_3_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_3_cast_fp16 = expand_dims(axes = x_eps_3_axes_0, x = var_201_cast_fp16)[name = string("x_eps_3_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_3_cast_fp16 = expand_dims(axes = x_eps_3_axes_0, x = var_207_cast_fp16)[name = string("x_eps_3_cast_fp16")];
             tensor<int32, [1]> norm_x_3_axes_0 = const()[name = string("norm_x_3_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_3_cast_fp16 = reduce_l2_norm(axes = norm_x_3_axes_0, keep_dims = var_35, x = x_eps_3_cast_fp16)[name = string("norm_x_3_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_7_cast_fp16 = real_div(x = x_11_cast_fp16, y = norm_x_3_cast_fp16)[name = string("x_normed_7_cast_fp16")];
-            fp16 var_206_to_fp16 = const()[name = string("op_206_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_9_cast_fp16 = mul(x = x_normed_7_cast_fp16, y = var_206_to_fp16)[name = string("x_normed_9_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_3_cast_fp16 = reduce_l2_norm(axes = norm_x_3_axes_0, keep_dims = var_35, x = x_eps_3_cast_fp16)[name = string("norm_x_3_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_7_cast_fp16 = real_div(x = x_11_cast_fp16, y = norm_x_3_cast_fp16)[name = string("x_normed_7_cast_fp16")];
+            fp16 var_212_to_fp16 = const()[name = string("op_212_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_9_cast_fp16 = mul(x = x_normed_7_cast_fp16, y = var_212_to_fp16)[name = string("x_normed_9_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_2_weight_to_fp16 = const()[name = string("blocks_0_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303609216)))];
-            tensor<fp16, [1, 4096, 1, 1]> input_3_cast_fp16 = mul(x = x_normed_9_cast_fp16, y = blocks_0_norm_2_weight_to_fp16)[name = string("input_3_cast_fp16")];
-            tensor<int32, [2]> var_218 = const()[name = string("op_218"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_220 = const()[name = string("op_220"), val = tensor<int32, [2]>([1, 1])];
-            string var_222_pad_type_0 = const()[name = string("op_222_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_222_pad_0 = const()[name = string("op_222_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_222_cast_fp16 = conv(dilations = var_220, groups = var_31, pad = var_222_pad_0, pad_type = var_222_pad_type_0, strides = var_218, weight = blocks_0_mlp_fc_1_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_222_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303617472)))];
-            tensor<fp16, [1, 11008, 1, 1]> input_5_cast_fp16 = mul(x = var_222_cast_fp16, y = blocks_0_mlp_fc_1_output_scales_to_fp16)[name = string("input_5_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> input_3_cast_fp16 = mul(x = x_normed_9_cast_fp16, y = blocks_0_norm_2_weight_to_fp16)[name = string("input_3_cast_fp16")];
+            tensor<int32, [2]> var_224 = const()[name = string("op_224"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_226 = const()[name = string("op_226"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_228 = const()[name = string("op_228"), val = tensor<int32, [2]>([1, 1])];
-            string var_230_pad_type_0 = const()[name = string("op_230_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_230_pad_0 = const()[name = string("op_230_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_230_cast_fp16 = conv(dilations = var_228, groups = var_31, pad = var_230_pad_0, pad_type = var_230_pad_type_0, strides = var_226, weight = blocks_0_mlp_fc_2_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_230_cast_fp16")];
+            string var_228_pad_type_0 = const()[name = string("op_228_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_228_pad_0 = const()[name = string("op_228_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_228_cast_fp16 = conv(dilations = var_226, groups = var_31, pad = var_228_pad_0, pad_type = var_228_pad_type_0, strides = var_224, weight = blocks_0_mlp_fc_1_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_228_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303617472)))];
+            tensor<fp16, [1, 11008, 1, 4]> input_5_cast_fp16 = mul(x = var_228_cast_fp16, y = blocks_0_mlp_fc_1_output_scales_to_fp16)[name = string("input_5_cast_fp16")];
+            tensor<int32, [2]> var_232 = const()[name = string("op_232"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_234 = const()[name = string("op_234"), val = tensor<int32, [2]>([1, 1])];
+            string var_236_pad_type_0 = const()[name = string("op_236_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_236_pad_0 = const()[name = string("op_236_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_236_cast_fp16 = conv(dilations = var_234, groups = var_31, pad = var_236_pad_0, pad_type = var_236_pad_type_0, strides = var_232, weight = blocks_0_mlp_fc_2_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_236_cast_fp16")];
             tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303639552)))];
-            tensor<fp16, [1, 11008, 1, 1]> x_fc_2_1_cast_fp16 = mul(x = var_230_cast_fp16, y = blocks_0_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_1_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> var_232_cast_fp16 = silu(x = input_5_cast_fp16)[name = string("op_232_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> input_7_cast_fp16 = mul(x = var_232_cast_fp16, y = x_fc_2_1_cast_fp16)[name = string("input_7_cast_fp16")];
-            tensor<int32, [2]> var_236 = const()[name = string("op_236"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_238 = const()[name = string("op_238"), val = tensor<int32, [2]>([1, 1])];
-            string var_240_pad_type_0 = const()[name = string("op_240_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_240_pad_0 = const()[name = string("op_240_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_240_cast_fp16 = conv(dilations = var_238, groups = var_31, pad = var_240_pad_0, pad_type = var_240_pad_type_0, strides = var_236, weight = blocks_0_mlp_proj_weight_palettized_cast_fp16, x = input_7_cast_fp16)[name = string("op_240_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> x_fc_2_1_cast_fp16 = mul(x = var_236_cast_fp16, y = blocks_0_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_1_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> var_238_cast_fp16 = silu(x = input_5_cast_fp16)[name = string("op_238_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> input_7_cast_fp16 = mul(x = var_238_cast_fp16, y = x_fc_2_1_cast_fp16)[name = string("input_7_cast_fp16")];
+            tensor<int32, [2]> var_242 = const()[name = string("op_242"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_244 = const()[name = string("op_244"), val = tensor<int32, [2]>([1, 1])];
+            string var_246_pad_type_0 = const()[name = string("op_246_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_246_pad_0 = const()[name = string("op_246_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_246_cast_fp16 = conv(dilations = var_244, groups = var_31, pad = var_246_pad_0, pad_type = var_246_pad_type_0, strides = var_242, weight = blocks_0_mlp_proj_weight_palettized_cast_fp16, x = input_7_cast_fp16)[name = string("op_246_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303661632)))];
-            tensor<fp16, [1, 4096, 1, 1]> var_241_cast_fp16 = mul(x = var_240_cast_fp16, y = blocks_0_mlp_proj_output_scales_to_fp16)[name = string("op_241_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_15_cast_fp16 = add(x = var_241_cast_fp16, y = x_11_cast_fp16)[name = string("x_15_cast_fp16")];
-            int32 var_252 = const()[name = string("op_252"), val = int32(-1)];
-            int32 var_260 = const()[name = string("op_260"), val = int32(3)];
-            int32 var_261 = const()[name = string("op_261"), val = int32(1)];
-            int32 var_264 = const()[name = string("op_264"), val = int32(-2)];
-            bool var_265 = const()[name = string("op_265"), val = bool(true)];
-            tensor<int32, [1]> var_282_axes_0 = const()[name = string("op_282_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_282_cast_fp16 = squeeze(axes = var_282_axes_0, x = x_15_cast_fp16)[name = string("op_282_cast_fp16")];
-            bool var_284_interleave_0 = const()[name = string("op_284_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_5_to_fp16 = const()[name = string("eps_chan_5_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_284_cast_fp16 = concat(axis = var_261, interleave = var_284_interleave_0, values = (var_282_cast_fp16, eps_chan_5_to_fp16))[name = string("op_284_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> var_247_cast_fp16 = mul(x = var_246_cast_fp16, y = blocks_0_mlp_proj_output_scales_to_fp16)[name = string("op_247_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_15_cast_fp16 = add(x = var_247_cast_fp16, y = x_11_cast_fp16)[name = string("x_15_cast_fp16")];
+            int32 var_258 = const()[name = string("op_258"), val = int32(-1)];
+            int32 var_266 = const()[name = string("op_266"), val = int32(3)];
+            int32 var_267 = const()[name = string("op_267"), val = int32(1)];
+            int32 var_270 = const()[name = string("op_270"), val = int32(-2)];
+            bool var_271 = const()[name = string("op_271"), val = bool(true)];
+            tensor<int32, [1]> var_288_axes_0 = const()[name = string("op_288_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_288_cast_fp16 = squeeze(axes = var_288_axes_0, x = x_15_cast_fp16)[name = string("op_288_cast_fp16")];
+            bool var_290_interleave_0 = const()[name = string("op_290_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_5_to_fp16 = const()[name = string("eps_chan_5_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_290_cast_fp16 = concat(axis = var_267, interleave = var_290_interleave_0, values = (var_288_cast_fp16, eps_chan_5_to_fp16))[name = string("op_290_cast_fp16")];
             tensor<int32, [1]> x_eps_5_axes_0 = const()[name = string("x_eps_5_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_5_cast_fp16 = expand_dims(axes = x_eps_5_axes_0, x = var_284_cast_fp16)[name = string("x_eps_5_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_5_cast_fp16 = expand_dims(axes = x_eps_5_axes_0, x = var_290_cast_fp16)[name = string("x_eps_5_cast_fp16")];
             tensor<int32, [1]> norm_x_5_axes_0 = const()[name = string("norm_x_5_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_5_cast_fp16 = reduce_l2_norm(axes = norm_x_5_axes_0, keep_dims = var_265, x = x_eps_5_cast_fp16)[name = string("norm_x_5_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_13_cast_fp16 = real_div(x = x_15_cast_fp16, y = norm_x_5_cast_fp16)[name = string("x_normed_13_cast_fp16")];
-            fp16 var_289_to_fp16 = const()[name = string("op_289_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_289_to_fp16)[name = string("x_normed_15_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_5_cast_fp16 = reduce_l2_norm(axes = norm_x_5_axes_0, keep_dims = var_271, x = x_eps_5_cast_fp16)[name = string("norm_x_5_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_13_cast_fp16 = real_div(x = x_15_cast_fp16, y = norm_x_5_cast_fp16)[name = string("x_normed_13_cast_fp16")];
+            fp16 var_295_to_fp16 = const()[name = string("op_295_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_295_to_fp16)[name = string("x_normed_15_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_1_weight_to_fp16 = const()[name = string("blocks_1_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303669888)))];
-            tensor<fp16, [1, 4096, 1, 1]> x_19_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = string("x_19_cast_fp16")];
-            tensor<int32, [2]> var_304 = const()[name = string("op_304"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_306 = const()[name = string("op_306"), val = tensor<int32, [2]>([1, 1])];
-            string var_308_pad_type_0 = const()[name = string("op_308_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_308_pad_0 = const()[name = string("op_308_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_308_cast_fp16 = conv(dilations = var_306, groups = var_261, pad = var_308_pad_0, pad_type = var_308_pad_type_0, strides = var_304, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_308_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_19_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = string("x_19_cast_fp16")];
+            tensor<int32, [2]> var_311 = const()[name = string("op_311"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_313 = const()[name = string("op_313"), val = tensor<int32, [2]>([1, 1])];
+            string var_315_pad_type_0 = const()[name = string("op_315_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_315_pad_0 = const()[name = string("op_315_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_315_cast_fp16 = conv(dilations = var_313, groups = var_267, pad = var_315_pad_0, pad_type = var_315_pad_type_0, strides = var_311, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_315_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303678144)))];
-            tensor<fp16, [1, 4096, 1, 1]> q_7_cast_fp16 = mul(x = var_308_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
-            tensor<int32, [2]> var_312 = const()[name = string("op_312"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_314 = const()[name = string("op_314"), val = tensor<int32, [2]>([1, 1])];
-            string var_316_pad_type_0 = const()[name = string("op_316_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_316_pad_0 = const()[name = string("op_316_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_316_cast_fp16 = conv(dilations = var_314, groups = var_261, pad = var_316_pad_0, pad_type = var_316_pad_type_0, strides = var_312, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_316_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> q_7_cast_fp16 = mul(x = var_315_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
+            tensor<int32, [2]> var_319 = const()[name = string("op_319"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_321 = const()[name = string("op_321"), val = tensor<int32, [2]>([1, 1])];
+            string var_323_pad_type_0 = const()[name = string("op_323_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_323_pad_0 = const()[name = string("op_323_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_323_cast_fp16 = conv(dilations = var_321, groups = var_267, pad = var_323_pad_0, pad_type = var_323_pad_type_0, strides = var_319, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_323_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303686400)))];
-            tensor<fp16, [1, 4096, 1, 1]> k_9_cast_fp16 = mul(x = var_316_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_9_cast_fp16")];
-            tensor<int32, [2]> var_320 = const()[name = string("op_320"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_322 = const()[name = string("op_322"), val = tensor<int32, [2]>([1, 1])];
-            string var_324_pad_type_0 = const()[name = string("op_324_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_324_pad_0 = const()[name = string("op_324_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_324_cast_fp16 = conv(dilations = var_322, groups = var_261, pad = var_324_pad_0, pad_type = var_324_pad_type_0, strides = var_320, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_324_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> k_9_cast_fp16 = mul(x = var_323_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_9_cast_fp16")];
+            tensor<int32, [2]> var_327 = const()[name = string("op_327"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_329 = const()[name = string("op_329"), val = tensor<int32, [2]>([1, 1])];
+            string var_331_pad_type_0 = const()[name = string("op_331_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_331_pad_0 = const()[name = string("op_331_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_331_cast_fp16 = conv(dilations = var_329, groups = var_267, pad = var_331_pad_0, pad_type = var_331_pad_type_0, strides = var_327, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_331_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303694656)))];
-            tensor<fp16, [1, 4096, 1, 1]> v_7_cast_fp16 = mul(x = var_324_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_7_cast_fp16")];
-            tensor<int32, [4]> var_326 = const()[name = string("op_326"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> q_9_cast_fp16 = reshape(shape = var_326, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
-            tensor<int32, [4]> var_328 = const()[name = string("op_328"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> k_11_cast_fp16 = reshape(shape = var_328, x = k_9_cast_fp16)[name = string("k_11_cast_fp16")];
-            tensor<int32, [4]> var_330 = const()[name = string("op_330"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> v_9_cast_fp16 = reshape(shape = var_330, x = v_7_cast_fp16)[name = string("v_9_cast_fp16")];
-            tensor<int32, [4]> var_342_begin_0 = const()[name = string("op_342_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_342_end_0 = const()[name = string("op_342_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_342_end_mask_0 = const()[name = string("op_342_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_342_cast_fp16 = slice_by_index(begin = var_342_begin_0, end = var_342_end_0, end_mask = var_342_end_mask_0, x = q_9_cast_fp16)[name = string("op_342_cast_fp16")];
-            tensor<int32, [4]> var_348_begin_0 = const()[name = string("op_348_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_348_end_0 = const()[name = string("op_348_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_348_end_mask_0 = const()[name = string("op_348_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_348_cast_fp16 = slice_by_index(begin = var_348_begin_0, end = var_348_end_0, end_mask = var_348_end_mask_0, x = q_9_cast_fp16)[name = string("op_348_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> v_9_cast_fp16 = mul(x = var_331_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_9_cast_fp16")];
+            tensor<int32, [4]> var_333 = const()[name = string("op_333"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> q_9_cast_fp16 = reshape(shape = var_333, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
+            tensor<int32, [4]> var_335 = const()[name = string("op_335"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> k_11_cast_fp16 = reshape(shape = var_335, x = k_9_cast_fp16)[name = string("k_11_cast_fp16")];
+            tensor<int32, [4]> var_337 = const()[name = string("op_337"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> v_11_cast_fp16 = reshape(shape = var_337, x = v_9_cast_fp16)[name = string("v_11_cast_fp16")];
+            tensor<int32, [4]> var_349_begin_0 = const()[name = string("op_349_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_349_end_0 = const()[name = string("op_349_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_349_end_mask_0 = const()[name = string("op_349_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_349_cast_fp16 = slice_by_index(begin = var_349_begin_0, end = var_349_end_0, end_mask = var_349_end_mask_0, x = q_9_cast_fp16)[name = string("op_349_cast_fp16")];
+            tensor<int32, [4]> var_355_begin_0 = const()[name = string("op_355_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_355_end_0 = const()[name = string("op_355_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_355_end_mask_0 = const()[name = string("op_355_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_355_cast_fp16 = slice_by_index(begin = var_355_begin_0, end = var_355_end_0, end_mask = var_355_end_mask_0, x = q_9_cast_fp16)[name = string("op_355_cast_fp16")];
             fp16 const_19_promoted_to_fp16 = const()[name = string("const_19_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_350_cast_fp16 = mul(x = var_348_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_350_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_357_cast_fp16 = mul(x = var_355_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_357_cast_fp16")];
             bool rotated_5_interleave_0 = const()[name = string("rotated_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_5_cast_fp16 = concat(axis = var_264, interleave = rotated_5_interleave_0, values = (var_350_cast_fp16, var_342_cast_fp16))[name = string("rotated_5_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_353_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_353_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_354_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_354_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_5_cast_fp16 = add(x = var_353_cast_fp16, y = var_354_cast_fp16)[name = string("roped_5_cast_fp16")];
-            tensor<int32, [4]> var_367_begin_0 = const()[name = string("op_367_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_367_end_0 = const()[name = string("op_367_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_367_end_mask_0 = const()[name = string("op_367_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_367_cast_fp16 = slice_by_index(begin = var_367_begin_0, end = var_367_end_0, end_mask = var_367_end_mask_0, x = k_11_cast_fp16)[name = string("op_367_cast_fp16")];
-            tensor<int32, [4]> var_373_begin_0 = const()[name = string("op_373_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_373_end_0 = const()[name = string("op_373_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_373_end_mask_0 = const()[name = string("op_373_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_373_cast_fp16 = slice_by_index(begin = var_373_begin_0, end = var_373_end_0, end_mask = var_373_end_mask_0, x = k_11_cast_fp16)[name = string("op_373_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_5_cast_fp16 = concat(axis = var_270, interleave = rotated_5_interleave_0, values = (var_357_cast_fp16, var_349_cast_fp16))[name = string("rotated_5_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_360_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_360_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_361_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_361_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_5_cast_fp16 = add(x = var_360_cast_fp16, y = var_361_cast_fp16)[name = string("roped_5_cast_fp16")];
+            tensor<int32, [4]> var_374_begin_0 = const()[name = string("op_374_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_374_end_0 = const()[name = string("op_374_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_374_end_mask_0 = const()[name = string("op_374_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_374_cast_fp16 = slice_by_index(begin = var_374_begin_0, end = var_374_end_0, end_mask = var_374_end_mask_0, x = k_11_cast_fp16)[name = string("op_374_cast_fp16")];
+            tensor<int32, [4]> var_380_begin_0 = const()[name = string("op_380_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_380_end_0 = const()[name = string("op_380_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_380_end_mask_0 = const()[name = string("op_380_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_380_cast_fp16 = slice_by_index(begin = var_380_begin_0, end = var_380_end_0, end_mask = var_380_end_mask_0, x = k_11_cast_fp16)[name = string("op_380_cast_fp16")];
             fp16 const_21_promoted_to_fp16 = const()[name = string("const_21_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_375_cast_fp16 = mul(x = var_373_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_375_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_382_cast_fp16 = mul(x = var_380_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_382_cast_fp16")];
             bool rotated_7_interleave_0 = const()[name = string("rotated_7_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_7_cast_fp16 = concat(axis = var_264, interleave = rotated_7_interleave_0, values = (var_375_cast_fp16, var_367_cast_fp16))[name = string("rotated_7_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_378_cast_fp16 = mul(x = k_11_cast_fp16, y = cos)[name = string("op_378_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_379_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = string("op_379_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_7_cast_fp16 = add(x = var_378_cast_fp16, y = var_379_cast_fp16)[name = string("roped_7_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_7_cast_fp16 = concat(axis = var_270, interleave = rotated_7_interleave_0, values = (var_382_cast_fp16, var_374_cast_fp16))[name = string("rotated_7_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_385_cast_fp16 = mul(x = k_11_cast_fp16, y = cos)[name = string("op_385_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_386_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = string("op_386_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_7_cast_fp16 = add(x = var_385_cast_fp16, y = var_386_cast_fp16)[name = string("roped_7_cast_fp16")];
+            tensor<int32, [4]> v_13_perm_0 = const()[name = string("v_13_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             bool k_15_interleave_0 = const()[name = string("k_15_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = concat(axis = var_252, interleave = k_15_interleave_0, values = (k_cache_1, roped_7_cast_fp16))[name = string("k_15_cast_fp16")];
-            bool v_11_interleave_0 = const()[name = string("v_11_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> v_11_cast_fp16 = concat(axis = var_252, interleave = v_11_interleave_0, values = (v_cache_1, v_9_cast_fp16))[name = string("v_11_cast_fp16")];
-            tensor<int32, [4]> var_386_begin_0 = const()[name = string("op_386_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_386_end_0 = const()[name = string("op_386_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_386_end_mask_0 = const()[name = string("op_386_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_1 = slice_by_index(begin = var_386_begin_0, end = var_386_end_0, end_mask = var_386_end_mask_0, x = k_15_cast_fp16)[name = string("op_386_cast_fp16")];
-            tensor<int32, [4]> var_387_begin_0 = const()[name = string("op_387_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_387_end_0 = const()[name = string("op_387_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_387_end_mask_0 = const()[name = string("op_387_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_1 = slice_by_index(begin = var_387_begin_0, end = var_387_end_0, end_mask = var_387_end_mask_0, x = v_11_cast_fp16)[name = string("op_387_cast_fp16")];
-            fp16 var_391_to_fp16 = const()[name = string("op_391_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 1]> var_392_cast_fp16 = mul(x = roped_5_cast_fp16, y = var_391_to_fp16)[name = string("op_392_cast_fp16")];
-            bool attn_weights_5_transpose_x_0 = const()[name = string("attn_weights_5_transpose_x_0"), val = bool(true)];
-            bool attn_weights_5_transpose_y_0 = const()[name = string("attn_weights_5_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_5_cast_fp16 = matmul(transpose_x = attn_weights_5_transpose_x_0, transpose_y = attn_weights_5_transpose_y_0, x = var_392_cast_fp16, y = k_15_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_7_cast_fp16 = add(x = attn_weights_5_cast_fp16, y = mask)[name = string("attn_weights_7_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> var_400_cast_fp16 = softmax(axis = var_260, x = attn_weights_7_cast_fp16)[name = string("op_400_cast_fp16")];
-            bool attn_5_transpose_x_0 = const()[name = string("attn_5_transpose_x_0"), val = bool(false)];
-            bool attn_5_transpose_y_0 = const()[name = string("attn_5_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 1]> attn_5_cast_fp16 = matmul(transpose_x = attn_5_transpose_x_0, transpose_y = attn_5_transpose_y_0, x = v_11_cast_fp16, y = var_400_cast_fp16)[name = string("attn_5_cast_fp16")];
-            tensor<int32, [4]> var_404 = const()[name = string("op_404"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
-            tensor<fp16, [1, 4096, 1, 1]> input_9_cast_fp16 = reshape(shape = var_404, x = attn_5_cast_fp16)[name = string("input_9_cast_fp16")];
-            tensor<int32, [2]> var_408 = const()[name = string("op_408"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_410 = const()[name = string("op_410"), val = tensor<int32, [2]>([1, 1])];
-            string var_412_pad_type_0 = const()[name = string("op_412_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_412_pad_0 = const()[name = string("op_412_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_412_cast_fp16 = conv(dilations = var_410, groups = var_261, pad = var_412_pad_0, pad_type = var_412_pad_type_0, strides = var_408, weight = blocks_1_attn_proj_weight_palettized_cast_fp16, x = input_9_cast_fp16)[name = string("op_412_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = concat(axis = var_258, interleave = k_15_interleave_0, values = (k_cache_1, roped_7_cast_fp16))[name = string("k_15_cast_fp16")];
+            bool v_15_interleave_0 = const()[name = string("v_15_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> v_13_cast_fp16 = transpose(perm = v_13_perm_0, x = v_11_cast_fp16)[name = string("transpose_3")];
+            tensor<fp16, [1, 32, 512, 128]> v_15_cast_fp16 = concat(axis = var_270, interleave = v_15_interleave_0, values = (v_cache_1, v_13_cast_fp16))[name = string("v_15_cast_fp16")];
+            tensor<int32, [4]> var_397_begin_0 = const()[name = string("op_397_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_397_end_0 = const()[name = string("op_397_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_397_end_mask_0 = const()[name = string("op_397_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_1 = slice_by_index(begin = var_397_begin_0, end = var_397_end_0, end_mask = var_397_end_mask_0, x = k_15_cast_fp16)[name = string("op_397_cast_fp16")];
+            tensor<int32, [4]> var_398_begin_0 = const()[name = string("op_398_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_398_end_0 = const()[name = string("op_398_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_398_end_mask_0 = const()[name = string("op_398_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_1 = slice_by_index(begin = var_398_begin_0, end = var_398_end_0, end_mask = var_398_end_mask_0, x = v_15_cast_fp16)[name = string("op_398_cast_fp16")];
+            fp16 var_403_to_fp16 = const()[name = string("op_403_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 32, 128, 4]> var_404_cast_fp16 = mul(x = roped_5_cast_fp16, y = var_403_to_fp16)[name = string("op_404_cast_fp16")];
+            bool attn_weights_7_transpose_x_0 = const()[name = string("attn_weights_7_transpose_x_0"), val = bool(true)];
+            bool attn_weights_7_transpose_y_0 = const()[name = string("attn_weights_7_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_7_cast_fp16 = matmul(transpose_x = attn_weights_7_transpose_x_0, transpose_y = attn_weights_7_transpose_y_0, x = var_404_cast_fp16, y = k_15_cast_fp16)[name = string("attn_weights_7_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_9_cast_fp16 = add(x = attn_weights_7_cast_fp16, y = mask)[name = string("attn_weights_9_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_11_cast_fp16 = softmax(axis = var_266, x = attn_weights_9_cast_fp16)[name = string("attn_weights_11_cast_fp16")];
+            bool var_413_transpose_x_0 = const()[name = string("op_413_transpose_x_0"), val = bool(false)];
+            bool var_413_transpose_y_0 = const()[name = string("op_413_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> var_413_cast_fp16 = matmul(transpose_x = var_413_transpose_x_0, transpose_y = var_413_transpose_y_0, x = attn_weights_11_cast_fp16, y = v_15_cast_fp16)[name = string("op_413_cast_fp16")];
+            tensor<int32, [4]> attn_3_perm_0 = const()[name = string("attn_3_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_416 = const()[name = string("op_416"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 4]> attn_3_cast_fp16 = transpose(perm = attn_3_perm_0, x = var_413_cast_fp16)[name = string("transpose_2")];
+            tensor<fp16, [1, 4096, 1, 4]> input_9_cast_fp16 = reshape(shape = var_416, x = attn_3_cast_fp16)[name = string("input_9_cast_fp16")];
+            tensor<int32, [2]> var_420 = const()[name = string("op_420"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_422 = const()[name = string("op_422"), val = tensor<int32, [2]>([1, 1])];
+            string var_424_pad_type_0 = const()[name = string("op_424_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_424_pad_0 = const()[name = string("op_424_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_424_cast_fp16 = conv(dilations = var_422, groups = var_267, pad = var_424_pad_0, pad_type = var_424_pad_type_0, strides = var_420, weight = blocks_1_attn_proj_weight_palettized_cast_fp16, x = input_9_cast_fp16)[name = string("op_424_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303702912)))];
-            tensor<fp16, [1, 4096, 1, 1]> attention_output_3_cast_fp16 = mul(x = var_412_cast_fp16, y = blocks_1_attn_proj_output_scales_to_fp16)[name = string("attention_output_3_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_25_cast_fp16 = add(x = attention_output_3_cast_fp16, y = x_15_cast_fp16)[name = string("x_25_cast_fp16")];
-            tensor<int32, [1]> var_431_axes_0 = const()[name = string("op_431_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_431_cast_fp16 = squeeze(axes = var_431_axes_0, x = x_25_cast_fp16)[name = string("op_431_cast_fp16")];
-            bool var_433_interleave_0 = const()[name = string("op_433_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_7_to_fp16 = const()[name = string("eps_chan_7_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_433_cast_fp16 = concat(axis = var_261, interleave = var_433_interleave_0, values = (var_431_cast_fp16, eps_chan_7_to_fp16))[name = string("op_433_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> attention_output_3_cast_fp16 = mul(x = var_424_cast_fp16, y = blocks_1_attn_proj_output_scales_to_fp16)[name = string("attention_output_3_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_25_cast_fp16 = add(x = attention_output_3_cast_fp16, y = x_15_cast_fp16)[name = string("x_25_cast_fp16")];
+            tensor<int32, [1]> var_443_axes_0 = const()[name = string("op_443_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_443_cast_fp16 = squeeze(axes = var_443_axes_0, x = x_25_cast_fp16)[name = string("op_443_cast_fp16")];
+            bool var_445_interleave_0 = const()[name = string("op_445_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_7_to_fp16 = const()[name = string("eps_chan_7_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_445_cast_fp16 = concat(axis = var_267, interleave = var_445_interleave_0, values = (var_443_cast_fp16, eps_chan_7_to_fp16))[name = string("op_445_cast_fp16")];
             tensor<int32, [1]> x_eps_7_axes_0 = const()[name = string("x_eps_7_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_7_cast_fp16 = expand_dims(axes = x_eps_7_axes_0, x = var_433_cast_fp16)[name = string("x_eps_7_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_7_cast_fp16 = expand_dims(axes = x_eps_7_axes_0, x = var_445_cast_fp16)[name = string("x_eps_7_cast_fp16")];
             tensor<int32, [1]> norm_x_7_axes_0 = const()[name = string("norm_x_7_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_7_cast_fp16 = reduce_l2_norm(axes = norm_x_7_axes_0, keep_dims = var_265, x = x_eps_7_cast_fp16)[name = string("norm_x_7_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_19_cast_fp16 = real_div(x = x_25_cast_fp16, y = norm_x_7_cast_fp16)[name = string("x_normed_19_cast_fp16")];
-            fp16 var_438_to_fp16 = const()[name = string("op_438_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_21_cast_fp16 = mul(x = x_normed_19_cast_fp16, y = var_438_to_fp16)[name = string("x_normed_21_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_7_cast_fp16 = reduce_l2_norm(axes = norm_x_7_axes_0, keep_dims = var_271, x = x_eps_7_cast_fp16)[name = string("norm_x_7_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_19_cast_fp16 = real_div(x = x_25_cast_fp16, y = norm_x_7_cast_fp16)[name = string("x_normed_19_cast_fp16")];
+            fp16 var_450_to_fp16 = const()[name = string("op_450_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_21_cast_fp16 = mul(x = x_normed_19_cast_fp16, y = var_450_to_fp16)[name = string("x_normed_21_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_2_weight_to_fp16 = const()[name = string("blocks_1_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303711168)))];
-            tensor<fp16, [1, 4096, 1, 1]> input_11_cast_fp16 = mul(x = x_normed_21_cast_fp16, y = blocks_1_norm_2_weight_to_fp16)[name = string("input_11_cast_fp16")];
-            tensor<int32, [2]> var_450 = const()[name = string("op_450"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_452 = const()[name = string("op_452"), val = tensor<int32, [2]>([1, 1])];
-            string var_454_pad_type_0 = const()[name = string("op_454_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_454_pad_0 = const()[name = string("op_454_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_454_cast_fp16 = conv(dilations = var_452, groups = var_261, pad = var_454_pad_0, pad_type = var_454_pad_type_0, strides = var_450, weight = blocks_1_mlp_fc_1_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_454_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> input_11_cast_fp16 = mul(x = x_normed_21_cast_fp16, y = blocks_1_norm_2_weight_to_fp16)[name = string("input_11_cast_fp16")];
+            tensor<int32, [2]> var_462 = const()[name = string("op_462"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_464 = const()[name = string("op_464"), val = tensor<int32, [2]>([1, 1])];
+            string var_466_pad_type_0 = const()[name = string("op_466_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_466_pad_0 = const()[name = string("op_466_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_466_cast_fp16 = conv(dilations = var_464, groups = var_267, pad = var_466_pad_0, pad_type = var_466_pad_type_0, strides = var_462, weight = blocks_1_mlp_fc_1_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_466_cast_fp16")];
             tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303719424)))];
-            tensor<fp16, [1, 11008, 1, 1]> input_13_cast_fp16 = mul(x = var_454_cast_fp16, y = blocks_1_mlp_fc_1_output_scales_to_fp16)[name = string("input_13_cast_fp16")];
-            tensor<int32, [2]> var_458 = const()[name = string("op_458"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_460 = const()[name = string("op_460"), val = tensor<int32, [2]>([1, 1])];
-            string var_462_pad_type_0 = const()[name = string("op_462_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_462_pad_0 = const()[name = string("op_462_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_462_cast_fp16 = conv(dilations = var_460, groups = var_261, pad = var_462_pad_0, pad_type = var_462_pad_type_0, strides = var_458, weight = blocks_1_mlp_fc_2_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_462_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303741504)))];
-            tensor<fp16, [1, 11008, 1, 1]> x_fc_2_3_cast_fp16 = mul(x = var_462_cast_fp16, y = blocks_1_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_3_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> var_464_cast_fp16 = silu(x = input_13_cast_fp16)[name = string("op_464_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> input_15_cast_fp16 = mul(x = var_464_cast_fp16, y = x_fc_2_3_cast_fp16)[name = string("input_15_cast_fp16")];
-            tensor<int32, [2]> var_468 = const()[name = string("op_468"), val = tensor<int32, [2]>([1, 1])];
+            tensor<fp16, [1, 11008, 1, 4]> input_13_cast_fp16 = mul(x = var_466_cast_fp16, y = blocks_1_mlp_fc_1_output_scales_to_fp16)[name = string("input_13_cast_fp16")];
             tensor<int32, [2]> var_470 = const()[name = string("op_470"), val = tensor<int32, [2]>([1, 1])];
-            string var_472_pad_type_0 = const()[name = string("op_472_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_472_pad_0 = const()[name = string("op_472_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_472_cast_fp16 = conv(dilations = var_470, groups = var_261, pad = var_472_pad_0, pad_type = var_472_pad_type_0, strides = var_468, weight = blocks_1_mlp_proj_weight_palettized_cast_fp16, x = input_15_cast_fp16)[name = string("op_472_cast_fp16")];
+            tensor<int32, [2]> var_472 = const()[name = string("op_472"), val = tensor<int32, [2]>([1, 1])];
+            string var_474_pad_type_0 = const()[name = string("op_474_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_474_pad_0 = const()[name = string("op_474_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_474_cast_fp16 = conv(dilations = var_472, groups = var_267, pad = var_474_pad_0, pad_type = var_474_pad_type_0, strides = var_470, weight = blocks_1_mlp_fc_2_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_474_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303741504)))];
+            tensor<fp16, [1, 11008, 1, 4]> x_fc_2_3_cast_fp16 = mul(x = var_474_cast_fp16, y = blocks_1_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_3_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> var_476_cast_fp16 = silu(x = input_13_cast_fp16)[name = string("op_476_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> input_15_cast_fp16 = mul(x = var_476_cast_fp16, y = x_fc_2_3_cast_fp16)[name = string("input_15_cast_fp16")];
+            tensor<int32, [2]> var_480 = const()[name = string("op_480"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_482 = const()[name = string("op_482"), val = tensor<int32, [2]>([1, 1])];
+            string var_484_pad_type_0 = const()[name = string("op_484_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_484_pad_0 = const()[name = string("op_484_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_484_cast_fp16 = conv(dilations = var_482, groups = var_267, pad = var_484_pad_0, pad_type = var_484_pad_type_0, strides = var_480, weight = blocks_1_mlp_proj_weight_palettized_cast_fp16, x = input_15_cast_fp16)[name = string("op_484_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303763584)))];
-            tensor<fp16, [1, 4096, 1, 1]> var_473_cast_fp16 = mul(x = var_472_cast_fp16, y = blocks_1_mlp_proj_output_scales_to_fp16)[name = string("op_473_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_29_cast_fp16 = add(x = var_473_cast_fp16, y = x_25_cast_fp16)[name = string("x_29_cast_fp16")];
-            int32 var_484 = const()[name = string("op_484"), val = int32(-1)];
-            int32 var_492 = const()[name = string("op_492"), val = int32(3)];
-            int32 var_493 = const()[name = string("op_493"), val = int32(1)];
-            int32 var_496 = const()[name = string("op_496"), val = int32(-2)];
-            bool var_497 = const()[name = string("op_497"), val = bool(true)];
-            tensor<int32, [1]> var_514_axes_0 = const()[name = string("op_514_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_514_cast_fp16 = squeeze(axes = var_514_axes_0, x = x_29_cast_fp16)[name = string("op_514_cast_fp16")];
-            bool var_516_interleave_0 = const()[name = string("op_516_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_9_to_fp16 = const()[name = string("eps_chan_9_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_516_cast_fp16 = concat(axis = var_493, interleave = var_516_interleave_0, values = (var_514_cast_fp16, eps_chan_9_to_fp16))[name = string("op_516_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> var_485_cast_fp16 = mul(x = var_484_cast_fp16, y = blocks_1_mlp_proj_output_scales_to_fp16)[name = string("op_485_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_29_cast_fp16 = add(x = var_485_cast_fp16, y = x_25_cast_fp16)[name = string("x_29_cast_fp16")];
+            int32 var_496 = const()[name = string("op_496"), val = int32(-1)];
+            int32 var_504 = const()[name = string("op_504"), val = int32(3)];
+            int32 var_505 = const()[name = string("op_505"), val = int32(1)];
+            int32 var_508 = const()[name = string("op_508"), val = int32(-2)];
+            bool var_509 = const()[name = string("op_509"), val = bool(true)];
+            tensor<int32, [1]> var_526_axes_0 = const()[name = string("op_526_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_526_cast_fp16 = squeeze(axes = var_526_axes_0, x = x_29_cast_fp16)[name = string("op_526_cast_fp16")];
+            bool var_528_interleave_0 = const()[name = string("op_528_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_9_to_fp16 = const()[name = string("eps_chan_9_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_528_cast_fp16 = concat(axis = var_505, interleave = var_528_interleave_0, values = (var_526_cast_fp16, eps_chan_9_to_fp16))[name = string("op_528_cast_fp16")];
             tensor<int32, [1]> x_eps_9_axes_0 = const()[name = string("x_eps_9_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_9_cast_fp16 = expand_dims(axes = x_eps_9_axes_0, x = var_516_cast_fp16)[name = string("x_eps_9_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_9_cast_fp16 = expand_dims(axes = x_eps_9_axes_0, x = var_528_cast_fp16)[name = string("x_eps_9_cast_fp16")];
             tensor<int32, [1]> norm_x_9_axes_0 = const()[name = string("norm_x_9_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_9_cast_fp16 = reduce_l2_norm(axes = norm_x_9_axes_0, keep_dims = var_497, x = x_eps_9_cast_fp16)[name = string("norm_x_9_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_25_cast_fp16 = real_div(x = x_29_cast_fp16, y = norm_x_9_cast_fp16)[name = string("x_normed_25_cast_fp16")];
-            fp16 var_521_to_fp16 = const()[name = string("op_521_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_27_cast_fp16 = mul(x = x_normed_25_cast_fp16, y = var_521_to_fp16)[name = string("x_normed_27_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_9_cast_fp16 = reduce_l2_norm(axes = norm_x_9_axes_0, keep_dims = var_509, x = x_eps_9_cast_fp16)[name = string("norm_x_9_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_25_cast_fp16 = real_div(x = x_29_cast_fp16, y = norm_x_9_cast_fp16)[name = string("x_normed_25_cast_fp16")];
+            fp16 var_533_to_fp16 = const()[name = string("op_533_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_27_cast_fp16 = mul(x = x_normed_25_cast_fp16, y = var_533_to_fp16)[name = string("x_normed_27_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_norm_1_weight_to_fp16 = const()[name = string("blocks_2_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303771840)))];
-            tensor<fp16, [1, 4096, 1, 1]> x_33_cast_fp16 = mul(x = x_normed_27_cast_fp16, y = blocks_2_norm_1_weight_to_fp16)[name = string("x_33_cast_fp16")];
-            tensor<int32, [2]> var_536 = const()[name = string("op_536"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_538 = const()[name = string("op_538"), val = tensor<int32, [2]>([1, 1])];
-            string var_540_pad_type_0 = const()[name = string("op_540_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_540_pad_0 = const()[name = string("op_540_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_540_cast_fp16 = conv(dilations = var_538, groups = var_493, pad = var_540_pad_0, pad_type = var_540_pad_type_0, strides = var_536, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_540_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_33_cast_fp16 = mul(x = x_normed_27_cast_fp16, y = blocks_2_norm_1_weight_to_fp16)[name = string("x_33_cast_fp16")];
+            tensor<int32, [2]> var_549 = const()[name = string("op_549"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_551 = const()[name = string("op_551"), val = tensor<int32, [2]>([1, 1])];
+            string var_553_pad_type_0 = const()[name = string("op_553_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_553_pad_0 = const()[name = string("op_553_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_553_cast_fp16 = conv(dilations = var_551, groups = var_505, pad = var_553_pad_0, pad_type = var_553_pad_type_0, strides = var_549, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_553_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303780096)))];
-            tensor<fp16, [1, 4096, 1, 1]> q_13_cast_fp16 = mul(x = var_540_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = string("q_13_cast_fp16")];
-            tensor<int32, [2]> var_544 = const()[name = string("op_544"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_546 = const()[name = string("op_546"), val = tensor<int32, [2]>([1, 1])];
-            string var_548_pad_type_0 = const()[name = string("op_548_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_548_pad_0 = const()[name = string("op_548_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_548_cast_fp16 = conv(dilations = var_546, groups = var_493, pad = var_548_pad_0, pad_type = var_548_pad_type_0, strides = var_544, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_548_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> q_13_cast_fp16 = mul(x = var_553_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = string("q_13_cast_fp16")];
+            tensor<int32, [2]> var_557 = const()[name = string("op_557"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_559 = const()[name = string("op_559"), val = tensor<int32, [2]>([1, 1])];
+            string var_561_pad_type_0 = const()[name = string("op_561_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_561_pad_0 = const()[name = string("op_561_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_561_cast_fp16 = conv(dilations = var_559, groups = var_505, pad = var_561_pad_0, pad_type = var_561_pad_type_0, strides = var_557, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_561_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303788352)))];
-            tensor<fp16, [1, 4096, 1, 1]> k_17_cast_fp16 = mul(x = var_548_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = string("k_17_cast_fp16")];
-            tensor<int32, [2]> var_552 = const()[name = string("op_552"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_554 = const()[name = string("op_554"), val = tensor<int32, [2]>([1, 1])];
-            string var_556_pad_type_0 = const()[name = string("op_556_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_556_pad_0 = const()[name = string("op_556_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_556_cast_fp16 = conv(dilations = var_554, groups = var_493, pad = var_556_pad_0, pad_type = var_556_pad_type_0, strides = var_552, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_556_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> k_17_cast_fp16 = mul(x = var_561_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = string("k_17_cast_fp16")];
+            tensor<int32, [2]> var_565 = const()[name = string("op_565"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_567 = const()[name = string("op_567"), val = tensor<int32, [2]>([1, 1])];
+            string var_569_pad_type_0 = const()[name = string("op_569_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_569_pad_0 = const()[name = string("op_569_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_569_cast_fp16 = conv(dilations = var_567, groups = var_505, pad = var_569_pad_0, pad_type = var_569_pad_type_0, strides = var_565, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_569_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303796608)))];
-            tensor<fp16, [1, 4096, 1, 1]> v_13_cast_fp16 = mul(x = var_556_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = string("v_13_cast_fp16")];
-            tensor<int32, [4]> var_558 = const()[name = string("op_558"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> q_15_cast_fp16 = reshape(shape = var_558, x = q_13_cast_fp16)[name = string("q_15_cast_fp16")];
-            tensor<int32, [4]> var_560 = const()[name = string("op_560"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> k_19_cast_fp16 = reshape(shape = var_560, x = k_17_cast_fp16)[name = string("k_19_cast_fp16")];
-            tensor<int32, [4]> var_562 = const()[name = string("op_562"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> v_15_cast_fp16 = reshape(shape = var_562, x = v_13_cast_fp16)[name = string("v_15_cast_fp16")];
-            tensor<int32, [4]> var_574_begin_0 = const()[name = string("op_574_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_574_end_0 = const()[name = string("op_574_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_574_end_mask_0 = const()[name = string("op_574_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_574_cast_fp16 = slice_by_index(begin = var_574_begin_0, end = var_574_end_0, end_mask = var_574_end_mask_0, x = q_15_cast_fp16)[name = string("op_574_cast_fp16")];
-            tensor<int32, [4]> var_580_begin_0 = const()[name = string("op_580_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_580_end_0 = const()[name = string("op_580_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_580_end_mask_0 = const()[name = string("op_580_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_580_cast_fp16 = slice_by_index(begin = var_580_begin_0, end = var_580_end_0, end_mask = var_580_end_mask_0, x = q_15_cast_fp16)[name = string("op_580_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> v_17_cast_fp16 = mul(x = var_569_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = string("v_17_cast_fp16")];
+            tensor<int32, [4]> var_571 = const()[name = string("op_571"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> q_15_cast_fp16 = reshape(shape = var_571, x = q_13_cast_fp16)[name = string("q_15_cast_fp16")];
+            tensor<int32, [4]> var_573 = const()[name = string("op_573"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> k_19_cast_fp16 = reshape(shape = var_573, x = k_17_cast_fp16)[name = string("k_19_cast_fp16")];
+            tensor<int32, [4]> var_575 = const()[name = string("op_575"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> v_19_cast_fp16 = reshape(shape = var_575, x = v_17_cast_fp16)[name = string("v_19_cast_fp16")];
+            tensor<int32, [4]> var_587_begin_0 = const()[name = string("op_587_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_587_end_0 = const()[name = string("op_587_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_587_end_mask_0 = const()[name = string("op_587_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_587_cast_fp16 = slice_by_index(begin = var_587_begin_0, end = var_587_end_0, end_mask = var_587_end_mask_0, x = q_15_cast_fp16)[name = string("op_587_cast_fp16")];
+            tensor<int32, [4]> var_593_begin_0 = const()[name = string("op_593_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_593_end_0 = const()[name = string("op_593_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_593_end_mask_0 = const()[name = string("op_593_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_593_cast_fp16 = slice_by_index(begin = var_593_begin_0, end = var_593_end_0, end_mask = var_593_end_mask_0, x = q_15_cast_fp16)[name = string("op_593_cast_fp16")];
             fp16 const_32_promoted_to_fp16 = const()[name = string("const_32_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_582_cast_fp16 = mul(x = var_580_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_582_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_595_cast_fp16 = mul(x = var_593_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_595_cast_fp16")];
             bool rotated_9_interleave_0 = const()[name = string("rotated_9_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_9_cast_fp16 = concat(axis = var_496, interleave = rotated_9_interleave_0, values = (var_582_cast_fp16, var_574_cast_fp16))[name = string("rotated_9_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_585_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = string("op_585_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_586_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = string("op_586_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_9_cast_fp16 = add(x = var_585_cast_fp16, y = var_586_cast_fp16)[name = string("roped_9_cast_fp16")];
-            tensor<int32, [4]> var_599_begin_0 = const()[name = string("op_599_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_599_end_0 = const()[name = string("op_599_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_599_end_mask_0 = const()[name = string("op_599_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_599_cast_fp16 = slice_by_index(begin = var_599_begin_0, end = var_599_end_0, end_mask = var_599_end_mask_0, x = k_19_cast_fp16)[name = string("op_599_cast_fp16")];
-            tensor<int32, [4]> var_605_begin_0 = const()[name = string("op_605_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_605_end_0 = const()[name = string("op_605_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_605_end_mask_0 = const()[name = string("op_605_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_605_cast_fp16 = slice_by_index(begin = var_605_begin_0, end = var_605_end_0, end_mask = var_605_end_mask_0, x = k_19_cast_fp16)[name = string("op_605_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_9_cast_fp16 = concat(axis = var_508, interleave = rotated_9_interleave_0, values = (var_595_cast_fp16, var_587_cast_fp16))[name = string("rotated_9_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_598_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = string("op_598_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_599_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = string("op_599_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_9_cast_fp16 = add(x = var_598_cast_fp16, y = var_599_cast_fp16)[name = string("roped_9_cast_fp16")];
+            tensor<int32, [4]> var_612_begin_0 = const()[name = string("op_612_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_612_end_0 = const()[name = string("op_612_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_612_end_mask_0 = const()[name = string("op_612_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_612_cast_fp16 = slice_by_index(begin = var_612_begin_0, end = var_612_end_0, end_mask = var_612_end_mask_0, x = k_19_cast_fp16)[name = string("op_612_cast_fp16")];
+            tensor<int32, [4]> var_618_begin_0 = const()[name = string("op_618_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_618_end_0 = const()[name = string("op_618_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_618_end_mask_0 = const()[name = string("op_618_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_618_cast_fp16 = slice_by_index(begin = var_618_begin_0, end = var_618_end_0, end_mask = var_618_end_mask_0, x = k_19_cast_fp16)[name = string("op_618_cast_fp16")];
             fp16 const_34_promoted_to_fp16 = const()[name = string("const_34_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_607_cast_fp16 = mul(x = var_605_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_607_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_620_cast_fp16 = mul(x = var_618_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_620_cast_fp16")];
             bool rotated_interleave_0 = const()[name = string("rotated_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_cast_fp16 = concat(axis = var_496, interleave = rotated_interleave_0, values = (var_607_cast_fp16, var_599_cast_fp16))[name = string("rotated_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_610_cast_fp16 = mul(x = k_19_cast_fp16, y = cos)[name = string("op_610_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_611_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_611_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_cast_fp16 = add(x = var_610_cast_fp16, y = var_611_cast_fp16)[name = string("roped_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_cast_fp16 = concat(axis = var_508, interleave = rotated_interleave_0, values = (var_620_cast_fp16, var_612_cast_fp16))[name = string("rotated_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_623_cast_fp16 = mul(x = k_19_cast_fp16, y = cos)[name = string("op_623_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_624_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_624_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_cast_fp16 = add(x = var_623_cast_fp16, y = var_624_cast_fp16)[name = string("roped_cast_fp16")];
+            tensor<int32, [4]> v_21_perm_0 = const()[name = string("v_21_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             bool k_interleave_0 = const()[name = string("k_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_cast_fp16 = concat(axis = var_484, interleave = k_interleave_0, values = (k_cache_2, roped_cast_fp16))[name = string("k_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> k_cast_fp16 = concat(axis = var_496, interleave = k_interleave_0, values = (k_cache_2, roped_cast_fp16))[name = string("k_cast_fp16")];
             bool v_interleave_0 = const()[name = string("v_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> v_cast_fp16 = concat(axis = var_484, interleave = v_interleave_0, values = (v_cache_2, v_15_cast_fp16))[name = string("v_cast_fp16")];
-            tensor<int32, [4]> var_618_begin_0 = const()[name = string("op_618_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_618_end_0 = const()[name = string("op_618_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_618_end_mask_0 = const()[name = string("op_618_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_2 = slice_by_index(begin = var_618_begin_0, end = var_618_end_0, end_mask = var_618_end_mask_0, x = k_cast_fp16)[name = string("op_618_cast_fp16")];
-            tensor<int32, [4]> var_619_begin_0 = const()[name = string("op_619_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_619_end_0 = const()[name = string("op_619_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_619_end_mask_0 = const()[name = string("op_619_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_2 = slice_by_index(begin = var_619_begin_0, end = var_619_end_0, end_mask = var_619_end_mask_0, x = v_cast_fp16)[name = string("op_619_cast_fp16")];
-            fp16 var_623_to_fp16 = const()[name = string("op_623_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 1]> var_624_cast_fp16 = mul(x = roped_9_cast_fp16, y = var_623_to_fp16)[name = string("op_624_cast_fp16")];
-            bool attn_weights_9_transpose_x_0 = const()[name = string("attn_weights_9_transpose_x_0"), val = bool(true)];
-            bool attn_weights_9_transpose_y_0 = const()[name = string("attn_weights_9_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_9_cast_fp16 = matmul(transpose_x = attn_weights_9_transpose_x_0, transpose_y = attn_weights_9_transpose_y_0, x = var_624_cast_fp16, y = k_cast_fp16)[name = string("attn_weights_9_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_cast_fp16 = add(x = attn_weights_9_cast_fp16, y = mask)[name = string("attn_weights_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> var_632_cast_fp16 = softmax(axis = var_492, x = attn_weights_cast_fp16)[name = string("op_632_cast_fp16")];
-            bool attn_9_transpose_x_0 = const()[name = string("attn_9_transpose_x_0"), val = bool(false)];
-            bool attn_9_transpose_y_0 = const()[name = string("attn_9_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 1]> attn_9_cast_fp16 = matmul(transpose_x = attn_9_transpose_x_0, transpose_y = attn_9_transpose_y_0, x = v_cast_fp16, y = var_632_cast_fp16)[name = string("attn_9_cast_fp16")];
-            tensor<int32, [4]> var_636 = const()[name = string("op_636"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
-            tensor<fp16, [1, 4096, 1, 1]> input_17_cast_fp16 = reshape(shape = var_636, x = attn_9_cast_fp16)[name = string("input_17_cast_fp16")];
-            tensor<int32, [2]> var_640 = const()[name = string("op_640"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_642 = const()[name = string("op_642"), val = tensor<int32, [2]>([1, 1])];
-            string var_644_pad_type_0 = const()[name = string("op_644_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_644_pad_0 = const()[name = string("op_644_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_644_cast_fp16 = conv(dilations = var_642, groups = var_493, pad = var_644_pad_0, pad_type = var_644_pad_type_0, strides = var_640, weight = blocks_2_attn_proj_weight_palettized_cast_fp16, x = input_17_cast_fp16)[name = string("op_644_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 128]> v_21_cast_fp16 = transpose(perm = v_21_perm_0, x = v_19_cast_fp16)[name = string("transpose_1")];
+            tensor<fp16, [1, 32, 512, 128]> v_cast_fp16 = concat(axis = var_508, interleave = v_interleave_0, values = (v_cache_2, v_21_cast_fp16))[name = string("v_cast_fp16")];
+            tensor<int32, [4]> var_635_begin_0 = const()[name = string("op_635_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_635_end_0 = const()[name = string("op_635_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_635_end_mask_0 = const()[name = string("op_635_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_2 = slice_by_index(begin = var_635_begin_0, end = var_635_end_0, end_mask = var_635_end_mask_0, x = k_cast_fp16)[name = string("op_635_cast_fp16")];
+            tensor<int32, [4]> var_636_begin_0 = const()[name = string("op_636_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_636_end_0 = const()[name = string("op_636_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_636_end_mask_0 = const()[name = string("op_636_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_2 = slice_by_index(begin = var_636_begin_0, end = var_636_end_0, end_mask = var_636_end_mask_0, x = v_cast_fp16)[name = string("op_636_cast_fp16")];
+            fp16 var_641_to_fp16 = const()[name = string("op_641_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 32, 128, 4]> var_642_cast_fp16 = mul(x = roped_9_cast_fp16, y = var_641_to_fp16)[name = string("op_642_cast_fp16")];
+            bool attn_weights_13_transpose_x_0 = const()[name = string("attn_weights_13_transpose_x_0"), val = bool(true)];
+            bool attn_weights_13_transpose_y_0 = const()[name = string("attn_weights_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_13_cast_fp16 = matmul(transpose_x = attn_weights_13_transpose_x_0, transpose_y = attn_weights_13_transpose_y_0, x = var_642_cast_fp16, y = k_cast_fp16)[name = string("attn_weights_13_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_15_cast_fp16 = add(x = attn_weights_13_cast_fp16, y = mask)[name = string("attn_weights_15_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_cast_fp16 = softmax(axis = var_504, x = attn_weights_15_cast_fp16)[name = string("attn_weights_cast_fp16")];
+            bool var_651_transpose_x_0 = const()[name = string("op_651_transpose_x_0"), val = bool(false)];
+            bool var_651_transpose_y_0 = const()[name = string("op_651_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> var_651_cast_fp16 = matmul(transpose_x = var_651_transpose_x_0, transpose_y = var_651_transpose_y_0, x = attn_weights_cast_fp16, y = v_cast_fp16)[name = string("op_651_cast_fp16")];
+            tensor<int32, [4]> attn_5_perm_0 = const()[name = string("attn_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_654 = const()[name = string("op_654"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 4]> attn_5_cast_fp16 = transpose(perm = attn_5_perm_0, x = var_651_cast_fp16)[name = string("transpose_0")];
+            tensor<fp16, [1, 4096, 1, 4]> input_17_cast_fp16 = reshape(shape = var_654, x = attn_5_cast_fp16)[name = string("input_17_cast_fp16")];
+            tensor<int32, [2]> var_658 = const()[name = string("op_658"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_660 = const()[name = string("op_660"), val = tensor<int32, [2]>([1, 1])];
+            string var_662_pad_type_0 = const()[name = string("op_662_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_662_pad_0 = const()[name = string("op_662_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_662_cast_fp16 = conv(dilations = var_660, groups = var_505, pad = var_662_pad_0, pad_type = var_662_pad_type_0, strides = var_658, weight = blocks_2_attn_proj_weight_palettized_cast_fp16, x = input_17_cast_fp16)[name = string("op_662_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303804864)))];
-            tensor<fp16, [1, 4096, 1, 1]> attention_output_cast_fp16 = mul(x = var_644_cast_fp16, y = blocks_2_attn_proj_output_scales_to_fp16)[name = string("attention_output_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_39_cast_fp16 = add(x = attention_output_cast_fp16, y = x_29_cast_fp16)[name = string("x_39_cast_fp16")];
-            tensor<int32, [1]> var_663_axes_0 = const()[name = string("op_663_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_663_cast_fp16 = squeeze(axes = var_663_axes_0, x = x_39_cast_fp16)[name = string("op_663_cast_fp16")];
-            bool var_665_interleave_0 = const()[name = string("op_665_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_to_fp16 = const()[name = string("eps_chan_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_665_cast_fp16 = concat(axis = var_493, interleave = var_665_interleave_0, values = (var_663_cast_fp16, eps_chan_to_fp16))[name = string("op_665_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> attention_output_cast_fp16 = mul(x = var_662_cast_fp16, y = blocks_2_attn_proj_output_scales_to_fp16)[name = string("attention_output_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_39_cast_fp16 = add(x = attention_output_cast_fp16, y = x_29_cast_fp16)[name = string("x_39_cast_fp16")];
+            tensor<int32, [1]> var_681_axes_0 = const()[name = string("op_681_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_681_cast_fp16 = squeeze(axes = var_681_axes_0, x = x_39_cast_fp16)[name = string("op_681_cast_fp16")];
+            bool var_683_interleave_0 = const()[name = string("op_683_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_to_fp16 = const()[name = string("eps_chan_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_683_cast_fp16 = concat(axis = var_505, interleave = var_683_interleave_0, values = (var_681_cast_fp16, eps_chan_to_fp16))[name = string("op_683_cast_fp16")];
             tensor<int32, [1]> x_eps_axes_0 = const()[name = string("x_eps_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_cast_fp16 = expand_dims(axes = x_eps_axes_0, x = var_665_cast_fp16)[name = string("x_eps_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_cast_fp16 = expand_dims(axes = x_eps_axes_0, x = var_683_cast_fp16)[name = string("x_eps_cast_fp16")];
             tensor<int32, [1]> norm_x_axes_0 = const()[name = string("norm_x_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_cast_fp16 = reduce_l2_norm(axes = norm_x_axes_0, keep_dims = var_497, x = x_eps_cast_fp16)[name = string("norm_x_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_31_cast_fp16 = real_div(x = x_39_cast_fp16, y = norm_x_cast_fp16)[name = string("x_normed_31_cast_fp16")];
-            fp16 var_670_to_fp16 = const()[name = string("op_670_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_33_cast_fp16 = mul(x = x_normed_31_cast_fp16, y = var_670_to_fp16)[name = string("x_normed_33_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_cast_fp16 = reduce_l2_norm(axes = norm_x_axes_0, keep_dims = var_509, x = x_eps_cast_fp16)[name = string("norm_x_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_31_cast_fp16 = real_div(x = x_39_cast_fp16, y = norm_x_cast_fp16)[name = string("x_normed_31_cast_fp16")];
+            fp16 var_688_to_fp16 = const()[name = string("op_688_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_33_cast_fp16 = mul(x = x_normed_31_cast_fp16, y = var_688_to_fp16)[name = string("x_normed_33_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_norm_2_weight_to_fp16 = const()[name = string("blocks_2_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303813120)))];
-            tensor<fp16, [1, 4096, 1, 1]> input_19_cast_fp16 = mul(x = x_normed_33_cast_fp16, y = blocks_2_norm_2_weight_to_fp16)[name = string("input_19_cast_fp16")];
-            tensor<int32, [2]> var_682 = const()[name = string("op_682"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_684 = const()[name = string("op_684"), val = tensor<int32, [2]>([1, 1])];
-            string var_686_pad_type_0 = const()[name = string("op_686_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_686_pad_0 = const()[name = string("op_686_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_686_cast_fp16 = conv(dilations = var_684, groups = var_493, pad = var_686_pad_0, pad_type = var_686_pad_type_0, strides = var_682, weight = blocks_2_mlp_fc_1_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_686_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303821376)))];
-            tensor<fp16, [1, 11008, 1, 1]> input_21_cast_fp16 = mul(x = var_686_cast_fp16, y = blocks_2_mlp_fc_1_output_scales_to_fp16)[name = string("input_21_cast_fp16")];
-            tensor<int32, [2]> var_690 = const()[name = string("op_690"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_692 = const()[name = string("op_692"), val = tensor<int32, [2]>([1, 1])];
-            string var_694_pad_type_0 = const()[name = string("op_694_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_694_pad_0 = const()[name = string("op_694_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_694_cast_fp16 = conv(dilations = var_692, groups = var_493, pad = var_694_pad_0, pad_type = var_694_pad_type_0, strides = var_690, weight = blocks_2_mlp_fc_2_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_694_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303843456)))];
-            tensor<fp16, [1, 11008, 1, 1]> x_fc_2_cast_fp16 = mul(x = var_694_cast_fp16, y = blocks_2_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> var_696_cast_fp16 = silu(x = input_21_cast_fp16)[name = string("op_696_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> input_cast_fp16 = mul(x = var_696_cast_fp16, y = x_fc_2_cast_fp16)[name = string("input_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> input_19_cast_fp16 = mul(x = x_normed_33_cast_fp16, y = blocks_2_norm_2_weight_to_fp16)[name = string("input_19_cast_fp16")];
             tensor<int32, [2]> var_700 = const()[name = string("op_700"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_702 = const()[name = string("op_702"), val = tensor<int32, [2]>([1, 1])];
             string var_704_pad_type_0 = const()[name = string("op_704_pad_type_0"), val = string("custom")];
             tensor<int32, [4]> var_704_pad_0 = const()[name = string("op_704_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_704_cast_fp16 = conv(dilations = var_702, groups = var_493, pad = var_704_pad_0, pad_type = var_704_pad_type_0, strides = var_700, weight = blocks_2_mlp_proj_weight_palettized_cast_fp16, x = input_cast_fp16)[name = string("op_704_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> var_704_cast_fp16 = conv(dilations = var_702, groups = var_505, pad = var_704_pad_0, pad_type = var_704_pad_type_0, strides = var_700, weight = blocks_2_mlp_fc_1_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_704_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303821376)))];
+            tensor<fp16, [1, 11008, 1, 4]> input_21_cast_fp16 = mul(x = var_704_cast_fp16, y = blocks_2_mlp_fc_1_output_scales_to_fp16)[name = string("input_21_cast_fp16")];
+            tensor<int32, [2]> var_708 = const()[name = string("op_708"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_710 = const()[name = string("op_710"), val = tensor<int32, [2]>([1, 1])];
+            string var_712_pad_type_0 = const()[name = string("op_712_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_712_pad_0 = const()[name = string("op_712_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_712_cast_fp16 = conv(dilations = var_710, groups = var_505, pad = var_712_pad_0, pad_type = var_712_pad_type_0, strides = var_708, weight = blocks_2_mlp_fc_2_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_712_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303843456)))];
+            tensor<fp16, [1, 11008, 1, 4]> x_fc_2_cast_fp16 = mul(x = var_712_cast_fp16, y = blocks_2_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> var_714_cast_fp16 = silu(x = input_21_cast_fp16)[name = string("op_714_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> input_cast_fp16 = mul(x = var_714_cast_fp16, y = x_fc_2_cast_fp16)[name = string("input_cast_fp16")];
+            tensor<int32, [2]> var_718 = const()[name = string("op_718"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_720 = const()[name = string("op_720"), val = tensor<int32, [2]>([1, 1])];
+            string var_722_pad_type_0 = const()[name = string("op_722_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_722_pad_0 = const()[name = string("op_722_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_722_cast_fp16 = conv(dilations = var_720, groups = var_505, pad = var_722_pad_0, pad_type = var_722_pad_type_0, strides = var_718, weight = blocks_2_mlp_proj_weight_palettized_cast_fp16, x = input_cast_fp16)[name = string("op_722_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303865536)))];
-            tensor<fp16, [1, 4096, 1, 1]> var_705_cast_fp16 = mul(x = var_704_cast_fp16, y = blocks_2_mlp_proj_output_scales_to_fp16)[name = string("op_705_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> new_x = add(x = var_705_cast_fp16, y = x_39_cast_fp16)[name = string("op_706_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> var_723_cast_fp16 = mul(x = var_722_cast_fp16, y = blocks_2_mlp_proj_output_scales_to_fp16)[name = string("op_723_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> new_x = add(x = var_723_cast_fp16, y = x_39_cast_fp16)[name = string("op_724_cast_fp16")];
         } -> (new_x, new_k_cache_0, new_k_cache_1, new_k_cache_2, new_v_cache_0, new_v_cache_1, new_v_cache_2);
     func input_512_context_512<ios18>(tensor<fp16, [128, 512]> cos, tensor<fp16, [1, 1, 512, 512]> mask, tensor<fp16, [128, 512]> sin, tensor<fp16, [1, 4096, 1, 512]> x) {
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_q_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8388736))))[name = string("blocks_0_attn_q_proj_weight_palettized_cast_fp16")];
@@ -502,86 +514,86 @@ program(1.3)
             tensor<fp16, [1, 4096, 1, 512]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_54_to_fp16)[name = string("x_normed_3_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_1_weight_to_fp16 = const()[name = string("blocks_0_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303567936)))];
             tensor<fp16, [1, 4096, 1, 512]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = string("x_5_cast_fp16")];
-            tensor<int32, [2]> var_66 = const()[name = string("op_66"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_68 = const()[name = string("op_68"), val = tensor<int32, [2]>([1, 1])];
-            string var_70_pad_type_0 = const()[name = string("op_70_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_70_pad_0 = const()[name = string("op_70_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_70_cast_fp16 = conv(dilations = var_68, groups = var_25, pad = var_70_pad_0, pad_type = var_70_pad_type_0, strides = var_66, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_70_cast_fp16")];
+            tensor<int32, [2]> var_67 = const()[name = string("op_67"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_69 = const()[name = string("op_69"), val = tensor<int32, [2]>([1, 1])];
+            string var_71_pad_type_0 = const()[name = string("op_71_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_71_pad_0 = const()[name = string("op_71_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_71_cast_fp16 = conv(dilations = var_69, groups = var_25, pad = var_71_pad_0, pad_type = var_71_pad_type_0, strides = var_67, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_71_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303576192)))];
-            tensor<fp16, [1, 4096, 1, 512]> q_1_cast_fp16 = mul(x = var_70_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
-            tensor<int32, [2]> var_74 = const()[name = string("op_74"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_76 = const()[name = string("op_76"), val = tensor<int32, [2]>([1, 1])];
-            string var_78_pad_type_0 = const()[name = string("op_78_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_78_pad_0 = const()[name = string("op_78_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_78_cast_fp16 = conv(dilations = var_76, groups = var_25, pad = var_78_pad_0, pad_type = var_78_pad_type_0, strides = var_74, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_78_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> q_1_cast_fp16 = mul(x = var_71_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
+            tensor<int32, [2]> var_75 = const()[name = string("op_75"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_77 = const()[name = string("op_77"), val = tensor<int32, [2]>([1, 1])];
+            string var_79_pad_type_0 = const()[name = string("op_79_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_79_pad_0 = const()[name = string("op_79_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_79_cast_fp16 = conv(dilations = var_77, groups = var_25, pad = var_79_pad_0, pad_type = var_79_pad_type_0, strides = var_75, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_79_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303584448)))];
-            tensor<fp16, [1, 4096, 1, 512]> k_1_cast_fp16 = mul(x = var_78_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
-            tensor<int32, [2]> var_82 = const()[name = string("op_82"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_84 = const()[name = string("op_84"), val = tensor<int32, [2]>([1, 1])];
-            string var_86_pad_type_0 = const()[name = string("op_86_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_86_pad_0 = const()[name = string("op_86_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_86_cast_fp16 = conv(dilations = var_84, groups = var_25, pad = var_86_pad_0, pad_type = var_86_pad_type_0, strides = var_82, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_86_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> k_1_cast_fp16 = mul(x = var_79_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
+            tensor<int32, [2]> var_83 = const()[name = string("op_83"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_85 = const()[name = string("op_85"), val = tensor<int32, [2]>([1, 1])];
+            string var_87_pad_type_0 = const()[name = string("op_87_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_87_pad_0 = const()[name = string("op_87_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_87_cast_fp16 = conv(dilations = var_85, groups = var_25, pad = var_87_pad_0, pad_type = var_87_pad_type_0, strides = var_83, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_87_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303592704)))];
-            tensor<fp16, [1, 4096, 1, 512]> v_1_cast_fp16 = mul(x = var_86_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
-            tensor<int32, [4]> var_88 = const()[name = string("op_88"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> q_3_cast_fp16 = reshape(shape = var_88, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
-            tensor<int32, [4]> var_90 = const()[name = string("op_90"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> k_3_cast_fp16 = reshape(shape = var_90, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
-            tensor<int32, [4]> var_92 = const()[name = string("op_92"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> v_3_cast_fp16 = reshape(shape = var_92, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
-            tensor<int32, [4]> var_104_begin_0 = const()[name = string("op_104_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_104_end_0 = const()[name = string("op_104_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_104_end_mask_0 = const()[name = string("op_104_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_104_cast_fp16 = slice_by_index(begin = var_104_begin_0, end = var_104_end_0, end_mask = var_104_end_mask_0, x = q_3_cast_fp16)[name = string("op_104_cast_fp16")];
-            tensor<int32, [4]> var_110_begin_0 = const()[name = string("op_110_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_110_end_0 = const()[name = string("op_110_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_110_end_mask_0 = const()[name = string("op_110_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_110_cast_fp16 = slice_by_index(begin = var_110_begin_0, end = var_110_end_0, end_mask = var_110_end_mask_0, x = q_3_cast_fp16)[name = string("op_110_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> v_1_cast_fp16 = mul(x = var_87_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
+            tensor<int32, [4]> var_89 = const()[name = string("op_89"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> q_3_cast_fp16 = reshape(shape = var_89, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
+            tensor<int32, [4]> var_91 = const()[name = string("op_91"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> k_3_cast_fp16 = reshape(shape = var_91, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
+            tensor<int32, [4]> var_93 = const()[name = string("op_93"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> v_3_cast_fp16 = reshape(shape = var_93, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
+            tensor<int32, [4]> var_105_begin_0 = const()[name = string("op_105_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_105_end_0 = const()[name = string("op_105_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_105_end_mask_0 = const()[name = string("op_105_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_105_cast_fp16 = slice_by_index(begin = var_105_begin_0, end = var_105_end_0, end_mask = var_105_end_mask_0, x = q_3_cast_fp16)[name = string("op_105_cast_fp16")];
+            tensor<int32, [4]> var_111_begin_0 = const()[name = string("op_111_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_111_end_0 = const()[name = string("op_111_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_111_end_mask_0 = const()[name = string("op_111_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_111_cast_fp16 = slice_by_index(begin = var_111_begin_0, end = var_111_end_0, end_mask = var_111_end_mask_0, x = q_3_cast_fp16)[name = string("op_111_cast_fp16")];
             fp16 const_6_promoted_to_fp16 = const()[name = string("const_6_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_112_cast_fp16 = mul(x = var_110_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_112_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_113_cast_fp16 = mul(x = var_111_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_113_cast_fp16")];
             bool rotated_1_interleave_0 = const()[name = string("rotated_1_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_1_cast_fp16 = concat(axis = var_28, interleave = rotated_1_interleave_0, values = (var_112_cast_fp16, var_104_cast_fp16))[name = string("rotated_1_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_115_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_115_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_116_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_116_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_1_cast_fp16 = add(x = var_115_cast_fp16, y = var_116_cast_fp16)[name = string("roped_1_cast_fp16")];
-            tensor<int32, [4]> var_129_begin_0 = const()[name = string("op_129_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_129_end_0 = const()[name = string("op_129_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_129_end_mask_0 = const()[name = string("op_129_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_129_cast_fp16 = slice_by_index(begin = var_129_begin_0, end = var_129_end_0, end_mask = var_129_end_mask_0, x = k_3_cast_fp16)[name = string("op_129_cast_fp16")];
-            tensor<int32, [4]> var_135_begin_0 = const()[name = string("op_135_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_135_end_0 = const()[name = string("op_135_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_135_end_mask_0 = const()[name = string("op_135_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_135_cast_fp16 = slice_by_index(begin = var_135_begin_0, end = var_135_end_0, end_mask = var_135_end_mask_0, x = k_3_cast_fp16)[name = string("op_135_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_1_cast_fp16 = concat(axis = var_28, interleave = rotated_1_interleave_0, values = (var_113_cast_fp16, var_105_cast_fp16))[name = string("rotated_1_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_116_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_116_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_117_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_117_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_1_cast_fp16 = add(x = var_116_cast_fp16, y = var_117_cast_fp16)[name = string("roped_1_cast_fp16")];
+            tensor<int32, [4]> var_130_begin_0 = const()[name = string("op_130_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_130_end_0 = const()[name = string("op_130_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_130_end_mask_0 = const()[name = string("op_130_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_130_cast_fp16 = slice_by_index(begin = var_130_begin_0, end = var_130_end_0, end_mask = var_130_end_mask_0, x = k_3_cast_fp16)[name = string("op_130_cast_fp16")];
+            tensor<int32, [4]> var_136_begin_0 = const()[name = string("op_136_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_136_end_0 = const()[name = string("op_136_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_136_end_mask_0 = const()[name = string("op_136_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_136_cast_fp16 = slice_by_index(begin = var_136_begin_0, end = var_136_end_0, end_mask = var_136_end_mask_0, x = k_3_cast_fp16)[name = string("op_136_cast_fp16")];
             fp16 const_8_promoted_to_fp16 = const()[name = string("const_8_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_137_cast_fp16 = mul(x = var_135_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_137_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_138_cast_fp16 = mul(x = var_136_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_138_cast_fp16")];
             bool rotated_3_interleave_0 = const()[name = string("rotated_3_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_3_cast_fp16 = concat(axis = var_28, interleave = rotated_3_interleave_0, values = (var_137_cast_fp16, var_129_cast_fp16))[name = string("rotated_3_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_140_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_140_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_141_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_141_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_3_cast_fp16 = add(x = var_140_cast_fp16, y = var_141_cast_fp16)[name = string("roped_3_cast_fp16")];
-            bool q_5_interleave_0 = const()[name = string("q_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> q_5_cast_fp16 = concat(axis = var_28, interleave = q_5_interleave_0, values = roped_1_cast_fp16)[name = string("q_5_cast_fp16")];
-            bool k_5_interleave_0 = const()[name = string("k_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_5_cast_fp16 = concat(axis = var_28, interleave = k_5_interleave_0, values = roped_3_cast_fp16)[name = string("k_5_cast_fp16")];
-            tensor<int32, [4]> var_156_begin_0 = const()[name = string("op_156_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_156_end_0 = const()[name = string("op_156_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_156_end_mask_0 = const()[name = string("op_156_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_0 = slice_by_index(begin = var_156_begin_0, end = var_156_end_0, end_mask = var_156_end_mask_0, x = k_5_cast_fp16)[name = string("op_156_cast_fp16")];
-            tensor<int32, [4]> var_157_begin_0 = const()[name = string("op_157_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_157_end_0 = const()[name = string("op_157_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_157_end_mask_0 = const()[name = string("op_157_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_0 = slice_by_index(begin = var_157_begin_0, end = var_157_end_0, end_mask = var_157_end_mask_0, x = v_3_cast_fp16)[name = string("op_157_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_3_cast_fp16 = concat(axis = var_28, interleave = rotated_3_interleave_0, values = (var_138_cast_fp16, var_130_cast_fp16))[name = string("rotated_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_141_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_141_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_142_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_142_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_3_cast_fp16 = add(x = var_141_cast_fp16, y = var_142_cast_fp16)[name = string("roped_3_cast_fp16")];
+            tensor<int32, [4]> v_5_perm_0 = const()[name = string("v_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_146_begin_0 = const()[name = string("op_146_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_146_end_0 = const()[name = string("op_146_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_146_end_mask_0 = const()[name = string("op_146_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_0 = slice_by_index(begin = var_146_begin_0, end = var_146_end_0, end_mask = var_146_end_mask_0, x = roped_3_cast_fp16)[name = string("op_146_cast_fp16")];
+            tensor<int32, [4]> var_147_begin_0 = const()[name = string("op_147_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_147_end_0 = const()[name = string("op_147_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_147_end_mask_0 = const()[name = string("op_147_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 512, 128]> v_5_cast_fp16 = transpose(perm = v_5_perm_0, x = v_3_cast_fp16)[name = string("transpose_5")];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_0 = slice_by_index(begin = var_147_begin_0, end = var_147_end_0, end_mask = var_147_end_mask_0, x = v_5_cast_fp16)[name = string("op_147_cast_fp16")];
             fp16 var_161_to_fp16 = const()[name = string("op_161_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 512]> var_162_cast_fp16 = mul(x = q_5_cast_fp16, y = var_161_to_fp16)[name = string("op_162_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_162_cast_fp16 = mul(x = roped_1_cast_fp16, y = var_161_to_fp16)[name = string("op_162_cast_fp16")];
             bool attn_weights_1_transpose_x_0 = const()[name = string("attn_weights_1_transpose_x_0"), val = bool(true)];
             bool attn_weights_1_transpose_y_0 = const()[name = string("attn_weights_1_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_162_cast_fp16, y = k_5_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_162_cast_fp16, y = roped_3_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
             tensor<fp16, [1, 32, 512, 512]> attn_weights_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = mask)[name = string("attn_weights_3_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> var_170_cast_fp16 = softmax(axis = var_24, x = attn_weights_3_cast_fp16)[name = string("op_170_cast_fp16")];
-            bool attn_1_transpose_x_0 = const()[name = string("attn_1_transpose_x_0"), val = bool(false)];
-            bool attn_1_transpose_y_0 = const()[name = string("attn_1_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 512]> attn_1_cast_fp16 = matmul(transpose_x = attn_1_transpose_x_0, transpose_y = attn_1_transpose_y_0, x = v_3_cast_fp16, y = var_170_cast_fp16)[name = string("attn_1_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_5_cast_fp16 = softmax(axis = var_24, x = attn_weights_3_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
+            bool var_171_transpose_x_1 = const()[name = string("op_171_transpose_x_1"), val = bool(false)];
+            bool var_171_transpose_y_1 = const()[name = string("op_171_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 32, 512, 128]> var_171_cast_fp16 = matmul(transpose_x = var_171_transpose_x_1, transpose_y = var_171_transpose_y_1, x = attn_weights_5_cast_fp16, y = v_3_cast_fp16)[name = string("op_171_cast_fp16")];
+            tensor<int32, [4]> attn_1_perm_0 = const()[name = string("attn_1_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             tensor<int32, [4]> var_174 = const()[name = string("op_174"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 512]> attn_1_cast_fp16 = transpose(perm = attn_1_perm_0, x = var_171_cast_fp16)[name = string("transpose_4")];
             tensor<fp16, [1, 4096, 1, 512]> input_1_cast_fp16 = reshape(shape = var_174, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
             tensor<int32, [2]> var_178 = const()[name = string("op_178"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_180 = const()[name = string("op_180"), val = tensor<int32, [2]>([1, 1])];
@@ -645,86 +657,86 @@ program(1.3)
             tensor<fp16, [1, 4096, 1, 512]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_291_to_fp16)[name = string("x_normed_15_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_1_weight_to_fp16 = const()[name = string("blocks_1_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303669888)))];
             tensor<fp16, [1, 4096, 1, 512]> x_19_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = string("x_19_cast_fp16")];
-            tensor<int32, [2]> var_306 = const()[name = string("op_306"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_308 = const()[name = string("op_308"), val = tensor<int32, [2]>([1, 1])];
-            string var_310_pad_type_0 = const()[name = string("op_310_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_310_pad_0 = const()[name = string("op_310_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_310_cast_fp16 = conv(dilations = var_308, groups = var_263, pad = var_310_pad_0, pad_type = var_310_pad_type_0, strides = var_306, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_310_cast_fp16")];
+            tensor<int32, [2]> var_307 = const()[name = string("op_307"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_309 = const()[name = string("op_309"), val = tensor<int32, [2]>([1, 1])];
+            string var_311_pad_type_0 = const()[name = string("op_311_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_311_pad_0 = const()[name = string("op_311_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_311_cast_fp16 = conv(dilations = var_309, groups = var_263, pad = var_311_pad_0, pad_type = var_311_pad_type_0, strides = var_307, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_311_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303678144)))];
-            tensor<fp16, [1, 4096, 1, 512]> q_7_cast_fp16 = mul(x = var_310_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
-            tensor<int32, [2]> var_314 = const()[name = string("op_314"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_316 = const()[name = string("op_316"), val = tensor<int32, [2]>([1, 1])];
-            string var_318_pad_type_0 = const()[name = string("op_318_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_318_pad_0 = const()[name = string("op_318_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_318_cast_fp16 = conv(dilations = var_316, groups = var_263, pad = var_318_pad_0, pad_type = var_318_pad_type_0, strides = var_314, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_318_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> q_7_cast_fp16 = mul(x = var_311_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
+            tensor<int32, [2]> var_315 = const()[name = string("op_315"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_317 = const()[name = string("op_317"), val = tensor<int32, [2]>([1, 1])];
+            string var_319_pad_type_0 = const()[name = string("op_319_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_319_pad_0 = const()[name = string("op_319_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_319_cast_fp16 = conv(dilations = var_317, groups = var_263, pad = var_319_pad_0, pad_type = var_319_pad_type_0, strides = var_315, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_319_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303686400)))];
-            tensor<fp16, [1, 4096, 1, 512]> k_7_cast_fp16 = mul(x = var_318_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_7_cast_fp16")];
-            tensor<int32, [2]> var_322 = const()[name = string("op_322"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_324 = const()[name = string("op_324"), val = tensor<int32, [2]>([1, 1])];
-            string var_326_pad_type_0 = const()[name = string("op_326_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_326_pad_0 = const()[name = string("op_326_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_326_cast_fp16 = conv(dilations = var_324, groups = var_263, pad = var_326_pad_0, pad_type = var_326_pad_type_0, strides = var_322, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_326_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> k_7_cast_fp16 = mul(x = var_319_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_7_cast_fp16")];
+            tensor<int32, [2]> var_323 = const()[name = string("op_323"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_325 = const()[name = string("op_325"), val = tensor<int32, [2]>([1, 1])];
+            string var_327_pad_type_0 = const()[name = string("op_327_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_327_pad_0 = const()[name = string("op_327_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_327_cast_fp16 = conv(dilations = var_325, groups = var_263, pad = var_327_pad_0, pad_type = var_327_pad_type_0, strides = var_323, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_327_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303694656)))];
-            tensor<fp16, [1, 4096, 1, 512]> v_5_cast_fp16 = mul(x = var_326_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_5_cast_fp16")];
-            tensor<int32, [4]> var_328 = const()[name = string("op_328"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> q_9_cast_fp16 = reshape(shape = var_328, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
-            tensor<int32, [4]> var_330 = const()[name = string("op_330"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> k_9_cast_fp16 = reshape(shape = var_330, x = k_7_cast_fp16)[name = string("k_9_cast_fp16")];
-            tensor<int32, [4]> var_332 = const()[name = string("op_332"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> v_7_cast_fp16 = reshape(shape = var_332, x = v_5_cast_fp16)[name = string("v_7_cast_fp16")];
-            tensor<int32, [4]> var_344_begin_0 = const()[name = string("op_344_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_344_end_0 = const()[name = string("op_344_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_344_end_mask_0 = const()[name = string("op_344_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_344_cast_fp16 = slice_by_index(begin = var_344_begin_0, end = var_344_end_0, end_mask = var_344_end_mask_0, x = q_9_cast_fp16)[name = string("op_344_cast_fp16")];
-            tensor<int32, [4]> var_350_begin_0 = const()[name = string("op_350_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_350_end_0 = const()[name = string("op_350_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_350_end_mask_0 = const()[name = string("op_350_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_350_cast_fp16 = slice_by_index(begin = var_350_begin_0, end = var_350_end_0, end_mask = var_350_end_mask_0, x = q_9_cast_fp16)[name = string("op_350_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> v_7_cast_fp16 = mul(x = var_327_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_7_cast_fp16")];
+            tensor<int32, [4]> var_329 = const()[name = string("op_329"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> q_9_cast_fp16 = reshape(shape = var_329, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
+            tensor<int32, [4]> var_331 = const()[name = string("op_331"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> k_9_cast_fp16 = reshape(shape = var_331, x = k_7_cast_fp16)[name = string("k_9_cast_fp16")];
+            tensor<int32, [4]> var_333 = const()[name = string("op_333"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> v_9_cast_fp16 = reshape(shape = var_333, x = v_7_cast_fp16)[name = string("v_9_cast_fp16")];
+            tensor<int32, [4]> var_345_begin_0 = const()[name = string("op_345_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_345_end_0 = const()[name = string("op_345_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_345_end_mask_0 = const()[name = string("op_345_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_345_cast_fp16 = slice_by_index(begin = var_345_begin_0, end = var_345_end_0, end_mask = var_345_end_mask_0, x = q_9_cast_fp16)[name = string("op_345_cast_fp16")];
+            tensor<int32, [4]> var_351_begin_0 = const()[name = string("op_351_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_351_end_0 = const()[name = string("op_351_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_351_end_mask_0 = const()[name = string("op_351_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_351_cast_fp16 = slice_by_index(begin = var_351_begin_0, end = var_351_end_0, end_mask = var_351_end_mask_0, x = q_9_cast_fp16)[name = string("op_351_cast_fp16")];
             fp16 const_19_promoted_to_fp16 = const()[name = string("const_19_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_352_cast_fp16 = mul(x = var_350_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_352_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_353_cast_fp16 = mul(x = var_351_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_353_cast_fp16")];
             bool rotated_5_interleave_0 = const()[name = string("rotated_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_5_cast_fp16 = concat(axis = var_266, interleave = rotated_5_interleave_0, values = (var_352_cast_fp16, var_344_cast_fp16))[name = string("rotated_5_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_355_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_355_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_356_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_356_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_5_cast_fp16 = add(x = var_355_cast_fp16, y = var_356_cast_fp16)[name = string("roped_5_cast_fp16")];
-            tensor<int32, [4]> var_369_begin_0 = const()[name = string("op_369_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_369_end_0 = const()[name = string("op_369_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_369_end_mask_0 = const()[name = string("op_369_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_369_cast_fp16 = slice_by_index(begin = var_369_begin_0, end = var_369_end_0, end_mask = var_369_end_mask_0, x = k_9_cast_fp16)[name = string("op_369_cast_fp16")];
-            tensor<int32, [4]> var_375_begin_0 = const()[name = string("op_375_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_375_end_0 = const()[name = string("op_375_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_375_end_mask_0 = const()[name = string("op_375_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_375_cast_fp16 = slice_by_index(begin = var_375_begin_0, end = var_375_end_0, end_mask = var_375_end_mask_0, x = k_9_cast_fp16)[name = string("op_375_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_5_cast_fp16 = concat(axis = var_266, interleave = rotated_5_interleave_0, values = (var_353_cast_fp16, var_345_cast_fp16))[name = string("rotated_5_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_356_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_356_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_357_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_357_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_5_cast_fp16 = add(x = var_356_cast_fp16, y = var_357_cast_fp16)[name = string("roped_5_cast_fp16")];
+            tensor<int32, [4]> var_370_begin_0 = const()[name = string("op_370_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_370_end_0 = const()[name = string("op_370_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_370_end_mask_0 = const()[name = string("op_370_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_370_cast_fp16 = slice_by_index(begin = var_370_begin_0, end = var_370_end_0, end_mask = var_370_end_mask_0, x = k_9_cast_fp16)[name = string("op_370_cast_fp16")];
+            tensor<int32, [4]> var_376_begin_0 = const()[name = string("op_376_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_376_end_0 = const()[name = string("op_376_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_376_end_mask_0 = const()[name = string("op_376_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_376_cast_fp16 = slice_by_index(begin = var_376_begin_0, end = var_376_end_0, end_mask = var_376_end_mask_0, x = k_9_cast_fp16)[name = string("op_376_cast_fp16")];
             fp16 const_21_promoted_to_fp16 = const()[name = string("const_21_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_377_cast_fp16 = mul(x = var_375_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_377_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_378_cast_fp16 = mul(x = var_376_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_378_cast_fp16")];
             bool rotated_7_interleave_0 = const()[name = string("rotated_7_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_7_cast_fp16 = concat(axis = var_266, interleave = rotated_7_interleave_0, values = (var_377_cast_fp16, var_369_cast_fp16))[name = string("rotated_7_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_380_cast_fp16 = mul(x = k_9_cast_fp16, y = cos)[name = string("op_380_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_381_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = string("op_381_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_7_cast_fp16 = add(x = var_380_cast_fp16, y = var_381_cast_fp16)[name = string("roped_7_cast_fp16")];
-            bool q_11_interleave_0 = const()[name = string("q_11_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> q_11_cast_fp16 = concat(axis = var_266, interleave = q_11_interleave_0, values = roped_5_cast_fp16)[name = string("q_11_cast_fp16")];
-            bool k_11_interleave_0 = const()[name = string("k_11_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_11_cast_fp16 = concat(axis = var_266, interleave = k_11_interleave_0, values = roped_7_cast_fp16)[name = string("k_11_cast_fp16")];
-            tensor<int32, [4]> var_396_begin_0 = const()[name = string("op_396_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_396_end_0 = const()[name = string("op_396_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_396_end_mask_0 = const()[name = string("op_396_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_1 = slice_by_index(begin = var_396_begin_0, end = var_396_end_0, end_mask = var_396_end_mask_0, x = k_11_cast_fp16)[name = string("op_396_cast_fp16")];
-            tensor<int32, [4]> var_397_begin_0 = const()[name = string("op_397_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_397_end_0 = const()[name = string("op_397_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_397_end_mask_0 = const()[name = string("op_397_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_1 = slice_by_index(begin = var_397_begin_0, end = var_397_end_0, end_mask = var_397_end_mask_0, x = v_7_cast_fp16)[name = string("op_397_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_7_cast_fp16 = concat(axis = var_266, interleave = rotated_7_interleave_0, values = (var_378_cast_fp16, var_370_cast_fp16))[name = string("rotated_7_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_381_cast_fp16 = mul(x = k_9_cast_fp16, y = cos)[name = string("op_381_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_382_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = string("op_382_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_7_cast_fp16 = add(x = var_381_cast_fp16, y = var_382_cast_fp16)[name = string("roped_7_cast_fp16")];
+            tensor<int32, [4]> v_11_perm_0 = const()[name = string("v_11_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_386_begin_0 = const()[name = string("op_386_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_386_end_0 = const()[name = string("op_386_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_386_end_mask_0 = const()[name = string("op_386_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_1 = slice_by_index(begin = var_386_begin_0, end = var_386_end_0, end_mask = var_386_end_mask_0, x = roped_7_cast_fp16)[name = string("op_386_cast_fp16")];
+            tensor<int32, [4]> var_387_begin_0 = const()[name = string("op_387_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_387_end_0 = const()[name = string("op_387_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_387_end_mask_0 = const()[name = string("op_387_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 512, 128]> v_11_cast_fp16 = transpose(perm = v_11_perm_0, x = v_9_cast_fp16)[name = string("transpose_3")];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_1 = slice_by_index(begin = var_387_begin_0, end = var_387_end_0, end_mask = var_387_end_mask_0, x = v_11_cast_fp16)[name = string("op_387_cast_fp16")];
             fp16 var_401_to_fp16 = const()[name = string("op_401_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 512]> var_402_cast_fp16 = mul(x = q_11_cast_fp16, y = var_401_to_fp16)[name = string("op_402_cast_fp16")];
-            bool attn_weights_5_transpose_x_0 = const()[name = string("attn_weights_5_transpose_x_0"), val = bool(true)];
-            bool attn_weights_5_transpose_y_0 = const()[name = string("attn_weights_5_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_5_cast_fp16 = matmul(transpose_x = attn_weights_5_transpose_x_0, transpose_y = attn_weights_5_transpose_y_0, x = var_402_cast_fp16, y = k_11_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_7_cast_fp16 = add(x = attn_weights_5_cast_fp16, y = mask)[name = string("attn_weights_7_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> var_410_cast_fp16 = softmax(axis = var_262, x = attn_weights_7_cast_fp16)[name = string("op_410_cast_fp16")];
-            bool attn_3_transpose_x_0 = const()[name = string("attn_3_transpose_x_0"), val = bool(false)];
-            bool attn_3_transpose_y_0 = const()[name = string("attn_3_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 512]> attn_3_cast_fp16 = matmul(transpose_x = attn_3_transpose_x_0, transpose_y = attn_3_transpose_y_0, x = v_7_cast_fp16, y = var_410_cast_fp16)[name = string("attn_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_402_cast_fp16 = mul(x = roped_5_cast_fp16, y = var_401_to_fp16)[name = string("op_402_cast_fp16")];
+            bool attn_weights_7_transpose_x_0 = const()[name = string("attn_weights_7_transpose_x_0"), val = bool(true)];
+            bool attn_weights_7_transpose_y_0 = const()[name = string("attn_weights_7_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_7_cast_fp16 = matmul(transpose_x = attn_weights_7_transpose_x_0, transpose_y = attn_weights_7_transpose_y_0, x = var_402_cast_fp16, y = roped_7_cast_fp16)[name = string("attn_weights_7_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_9_cast_fp16 = add(x = attn_weights_7_cast_fp16, y = mask)[name = string("attn_weights_9_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_11_cast_fp16 = softmax(axis = var_262, x = attn_weights_9_cast_fp16)[name = string("attn_weights_11_cast_fp16")];
+            bool var_411_transpose_x_1 = const()[name = string("op_411_transpose_x_1"), val = bool(false)];
+            bool var_411_transpose_y_1 = const()[name = string("op_411_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 32, 512, 128]> var_411_cast_fp16 = matmul(transpose_x = var_411_transpose_x_1, transpose_y = var_411_transpose_y_1, x = attn_weights_11_cast_fp16, y = v_9_cast_fp16)[name = string("op_411_cast_fp16")];
+            tensor<int32, [4]> attn_3_perm_0 = const()[name = string("attn_3_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             tensor<int32, [4]> var_414 = const()[name = string("op_414"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 512]> attn_3_cast_fp16 = transpose(perm = attn_3_perm_0, x = var_411_cast_fp16)[name = string("transpose_2")];
             tensor<fp16, [1, 4096, 1, 512]> input_9_cast_fp16 = reshape(shape = var_414, x = attn_3_cast_fp16)[name = string("input_9_cast_fp16")];
             tensor<int32, [2]> var_418 = const()[name = string("op_418"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_420 = const()[name = string("op_420"), val = tensor<int32, [2]>([1, 1])];
@@ -788,86 +800,86 @@ program(1.3)
             tensor<fp16, [1, 4096, 1, 512]> x_normed_27_cast_fp16 = mul(x = x_normed_25_cast_fp16, y = var_531_to_fp16)[name = string("x_normed_27_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_norm_1_weight_to_fp16 = const()[name = string("blocks_2_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303771840)))];
             tensor<fp16, [1, 4096, 1, 512]> x_33_cast_fp16 = mul(x = x_normed_27_cast_fp16, y = blocks_2_norm_1_weight_to_fp16)[name = string("x_33_cast_fp16")];
-            tensor<int32, [2]> var_546 = const()[name = string("op_546"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_548 = const()[name = string("op_548"), val = tensor<int32, [2]>([1, 1])];
-            string var_550_pad_type_0 = const()[name = string("op_550_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_550_pad_0 = const()[name = string("op_550_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_550_cast_fp16 = conv(dilations = var_548, groups = var_503, pad = var_550_pad_0, pad_type = var_550_pad_type_0, strides = var_546, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_550_cast_fp16")];
+            tensor<int32, [2]> var_547 = const()[name = string("op_547"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_549 = const()[name = string("op_549"), val = tensor<int32, [2]>([1, 1])];
+            string var_551_pad_type_0 = const()[name = string("op_551_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_551_pad_0 = const()[name = string("op_551_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_551_cast_fp16 = conv(dilations = var_549, groups = var_503, pad = var_551_pad_0, pad_type = var_551_pad_type_0, strides = var_547, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_551_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303780096)))];
-            tensor<fp16, [1, 4096, 1, 512]> q_13_cast_fp16 = mul(x = var_550_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = string("q_13_cast_fp16")];
-            tensor<int32, [2]> var_554 = const()[name = string("op_554"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_556 = const()[name = string("op_556"), val = tensor<int32, [2]>([1, 1])];
-            string var_558_pad_type_0 = const()[name = string("op_558_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_558_pad_0 = const()[name = string("op_558_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_558_cast_fp16 = conv(dilations = var_556, groups = var_503, pad = var_558_pad_0, pad_type = var_558_pad_type_0, strides = var_554, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_558_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> q_13_cast_fp16 = mul(x = var_551_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = string("q_13_cast_fp16")];
+            tensor<int32, [2]> var_555 = const()[name = string("op_555"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_557 = const()[name = string("op_557"), val = tensor<int32, [2]>([1, 1])];
+            string var_559_pad_type_0 = const()[name = string("op_559_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_559_pad_0 = const()[name = string("op_559_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_559_cast_fp16 = conv(dilations = var_557, groups = var_503, pad = var_559_pad_0, pad_type = var_559_pad_type_0, strides = var_555, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_559_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303788352)))];
-            tensor<fp16, [1, 4096, 1, 512]> k_13_cast_fp16 = mul(x = var_558_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = string("k_13_cast_fp16")];
-            tensor<int32, [2]> var_562 = const()[name = string("op_562"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_564 = const()[name = string("op_564"), val = tensor<int32, [2]>([1, 1])];
-            string var_566_pad_type_0 = const()[name = string("op_566_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_566_pad_0 = const()[name = string("op_566_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_566_cast_fp16 = conv(dilations = var_564, groups = var_503, pad = var_566_pad_0, pad_type = var_566_pad_type_0, strides = var_562, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_566_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> k_13_cast_fp16 = mul(x = var_559_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = string("k_13_cast_fp16")];
+            tensor<int32, [2]> var_563 = const()[name = string("op_563"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_565 = const()[name = string("op_565"), val = tensor<int32, [2]>([1, 1])];
+            string var_567_pad_type_0 = const()[name = string("op_567_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_567_pad_0 = const()[name = string("op_567_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_567_cast_fp16 = conv(dilations = var_565, groups = var_503, pad = var_567_pad_0, pad_type = var_567_pad_type_0, strides = var_563, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_567_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303796608)))];
-            tensor<fp16, [1, 4096, 1, 512]> v_9_cast_fp16 = mul(x = var_566_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = string("v_9_cast_fp16")];
-            tensor<int32, [4]> var_568 = const()[name = string("op_568"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> q_15_cast_fp16 = reshape(shape = var_568, x = q_13_cast_fp16)[name = string("q_15_cast_fp16")];
-            tensor<int32, [4]> var_570 = const()[name = string("op_570"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = reshape(shape = var_570, x = k_13_cast_fp16)[name = string("k_15_cast_fp16")];
-            tensor<int32, [4]> var_572 = const()[name = string("op_572"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> v_cast_fp16 = reshape(shape = var_572, x = v_9_cast_fp16)[name = string("v_cast_fp16")];
-            tensor<int32, [4]> var_584_begin_0 = const()[name = string("op_584_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_584_end_0 = const()[name = string("op_584_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_584_end_mask_0 = const()[name = string("op_584_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_584_cast_fp16 = slice_by_index(begin = var_584_begin_0, end = var_584_end_0, end_mask = var_584_end_mask_0, x = q_15_cast_fp16)[name = string("op_584_cast_fp16")];
-            tensor<int32, [4]> var_590_begin_0 = const()[name = string("op_590_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_590_end_0 = const()[name = string("op_590_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_590_end_mask_0 = const()[name = string("op_590_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_590_cast_fp16 = slice_by_index(begin = var_590_begin_0, end = var_590_end_0, end_mask = var_590_end_mask_0, x = q_15_cast_fp16)[name = string("op_590_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> v_13_cast_fp16 = mul(x = var_567_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = string("v_13_cast_fp16")];
+            tensor<int32, [4]> var_569 = const()[name = string("op_569"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> q_15_cast_fp16 = reshape(shape = var_569, x = q_13_cast_fp16)[name = string("q_15_cast_fp16")];
+            tensor<int32, [4]> var_571 = const()[name = string("op_571"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = reshape(shape = var_571, x = k_13_cast_fp16)[name = string("k_15_cast_fp16")];
+            tensor<int32, [4]> var_573 = const()[name = string("op_573"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> v_15_cast_fp16 = reshape(shape = var_573, x = v_13_cast_fp16)[name = string("v_15_cast_fp16")];
+            tensor<int32, [4]> var_585_begin_0 = const()[name = string("op_585_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_585_end_0 = const()[name = string("op_585_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_585_end_mask_0 = const()[name = string("op_585_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_585_cast_fp16 = slice_by_index(begin = var_585_begin_0, end = var_585_end_0, end_mask = var_585_end_mask_0, x = q_15_cast_fp16)[name = string("op_585_cast_fp16")];
+            tensor<int32, [4]> var_591_begin_0 = const()[name = string("op_591_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_591_end_0 = const()[name = string("op_591_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_591_end_mask_0 = const()[name = string("op_591_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_591_cast_fp16 = slice_by_index(begin = var_591_begin_0, end = var_591_end_0, end_mask = var_591_end_mask_0, x = q_15_cast_fp16)[name = string("op_591_cast_fp16")];
             fp16 const_32_promoted_to_fp16 = const()[name = string("const_32_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_592_cast_fp16 = mul(x = var_590_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_592_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_593_cast_fp16 = mul(x = var_591_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_593_cast_fp16")];
             bool rotated_9_interleave_0 = const()[name = string("rotated_9_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_9_cast_fp16 = concat(axis = var_506, interleave = rotated_9_interleave_0, values = (var_592_cast_fp16, var_584_cast_fp16))[name = string("rotated_9_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_595_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = string("op_595_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_596_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = string("op_596_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_9_cast_fp16 = add(x = var_595_cast_fp16, y = var_596_cast_fp16)[name = string("roped_9_cast_fp16")];
-            tensor<int32, [4]> var_609_begin_0 = const()[name = string("op_609_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_609_end_0 = const()[name = string("op_609_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_609_end_mask_0 = const()[name = string("op_609_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_609_cast_fp16 = slice_by_index(begin = var_609_begin_0, end = var_609_end_0, end_mask = var_609_end_mask_0, x = k_15_cast_fp16)[name = string("op_609_cast_fp16")];
-            tensor<int32, [4]> var_615_begin_0 = const()[name = string("op_615_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_615_end_0 = const()[name = string("op_615_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_615_end_mask_0 = const()[name = string("op_615_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_615_cast_fp16 = slice_by_index(begin = var_615_begin_0, end = var_615_end_0, end_mask = var_615_end_mask_0, x = k_15_cast_fp16)[name = string("op_615_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_9_cast_fp16 = concat(axis = var_506, interleave = rotated_9_interleave_0, values = (var_593_cast_fp16, var_585_cast_fp16))[name = string("rotated_9_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_596_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = string("op_596_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_597_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = string("op_597_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_9_cast_fp16 = add(x = var_596_cast_fp16, y = var_597_cast_fp16)[name = string("roped_9_cast_fp16")];
+            tensor<int32, [4]> var_610_begin_0 = const()[name = string("op_610_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_610_end_0 = const()[name = string("op_610_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_610_end_mask_0 = const()[name = string("op_610_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_610_cast_fp16 = slice_by_index(begin = var_610_begin_0, end = var_610_end_0, end_mask = var_610_end_mask_0, x = k_15_cast_fp16)[name = string("op_610_cast_fp16")];
+            tensor<int32, [4]> var_616_begin_0 = const()[name = string("op_616_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_616_end_0 = const()[name = string("op_616_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_616_end_mask_0 = const()[name = string("op_616_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_616_cast_fp16 = slice_by_index(begin = var_616_begin_0, end = var_616_end_0, end_mask = var_616_end_mask_0, x = k_15_cast_fp16)[name = string("op_616_cast_fp16")];
             fp16 const_34_promoted_to_fp16 = const()[name = string("const_34_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_617_cast_fp16 = mul(x = var_615_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_617_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_618_cast_fp16 = mul(x = var_616_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_618_cast_fp16")];
             bool rotated_interleave_0 = const()[name = string("rotated_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_cast_fp16 = concat(axis = var_506, interleave = rotated_interleave_0, values = (var_617_cast_fp16, var_609_cast_fp16))[name = string("rotated_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_620_cast_fp16 = mul(x = k_15_cast_fp16, y = cos)[name = string("op_620_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_621_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_621_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_cast_fp16 = add(x = var_620_cast_fp16, y = var_621_cast_fp16)[name = string("roped_cast_fp16")];
-            bool q_interleave_0 = const()[name = string("q_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> q_cast_fp16 = concat(axis = var_506, interleave = q_interleave_0, values = roped_9_cast_fp16)[name = string("q_cast_fp16")];
-            bool k_interleave_0 = const()[name = string("k_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_cast_fp16 = concat(axis = var_506, interleave = k_interleave_0, values = roped_cast_fp16)[name = string("k_cast_fp16")];
-            tensor<int32, [4]> var_636_begin_0 = const()[name = string("op_636_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_636_end_0 = const()[name = string("op_636_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_636_end_mask_0 = const()[name = string("op_636_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_2 = slice_by_index(begin = var_636_begin_0, end = var_636_end_0, end_mask = var_636_end_mask_0, x = k_cast_fp16)[name = string("op_636_cast_fp16")];
-            tensor<int32, [4]> var_637_begin_0 = const()[name = string("op_637_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_637_end_0 = const()[name = string("op_637_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_637_end_mask_0 = const()[name = string("op_637_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_2 = slice_by_index(begin = var_637_begin_0, end = var_637_end_0, end_mask = var_637_end_mask_0, x = v_cast_fp16)[name = string("op_637_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_cast_fp16 = concat(axis = var_506, interleave = rotated_interleave_0, values = (var_618_cast_fp16, var_610_cast_fp16))[name = string("rotated_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_621_cast_fp16 = mul(x = k_15_cast_fp16, y = cos)[name = string("op_621_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_622_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_622_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_cast_fp16 = add(x = var_621_cast_fp16, y = var_622_cast_fp16)[name = string("roped_cast_fp16")];
+            tensor<int32, [4]> v_perm_0 = const()[name = string("v_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_626_begin_0 = const()[name = string("op_626_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_626_end_0 = const()[name = string("op_626_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_626_end_mask_0 = const()[name = string("op_626_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_2 = slice_by_index(begin = var_626_begin_0, end = var_626_end_0, end_mask = var_626_end_mask_0, x = roped_cast_fp16)[name = string("op_626_cast_fp16")];
+            tensor<int32, [4]> var_627_begin_0 = const()[name = string("op_627_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_627_end_0 = const()[name = string("op_627_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_627_end_mask_0 = const()[name = string("op_627_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 512, 128]> v_cast_fp16 = transpose(perm = v_perm_0, x = v_15_cast_fp16)[name = string("transpose_1")];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_2 = slice_by_index(begin = var_627_begin_0, end = var_627_end_0, end_mask = var_627_end_mask_0, x = v_cast_fp16)[name = string("op_627_cast_fp16")];
             fp16 var_641_to_fp16 = const()[name = string("op_641_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 512]> var_642_cast_fp16 = mul(x = q_cast_fp16, y = var_641_to_fp16)[name = string("op_642_cast_fp16")];
-            bool attn_weights_9_transpose_x_0 = const()[name = string("attn_weights_9_transpose_x_0"), val = bool(true)];
-            bool attn_weights_9_transpose_y_0 = const()[name = string("attn_weights_9_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_9_cast_fp16 = matmul(transpose_x = attn_weights_9_transpose_x_0, transpose_y = attn_weights_9_transpose_y_0, x = var_642_cast_fp16, y = k_cast_fp16)[name = string("attn_weights_9_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_cast_fp16 = add(x = attn_weights_9_cast_fp16, y = mask)[name = string("attn_weights_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> var_650_cast_fp16 = softmax(axis = var_502, x = attn_weights_cast_fp16)[name = string("op_650_cast_fp16")];
-            bool attn_5_transpose_x_0 = const()[name = string("attn_5_transpose_x_0"), val = bool(false)];
-            bool attn_5_transpose_y_0 = const()[name = string("attn_5_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 512]> attn_5_cast_fp16 = matmul(transpose_x = attn_5_transpose_x_0, transpose_y = attn_5_transpose_y_0, x = v_cast_fp16, y = var_650_cast_fp16)[name = string("attn_5_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_642_cast_fp16 = mul(x = roped_9_cast_fp16, y = var_641_to_fp16)[name = string("op_642_cast_fp16")];
+            bool attn_weights_13_transpose_x_0 = const()[name = string("attn_weights_13_transpose_x_0"), val = bool(true)];
+            bool attn_weights_13_transpose_y_0 = const()[name = string("attn_weights_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_13_cast_fp16 = matmul(transpose_x = attn_weights_13_transpose_x_0, transpose_y = attn_weights_13_transpose_y_0, x = var_642_cast_fp16, y = roped_cast_fp16)[name = string("attn_weights_13_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_15_cast_fp16 = add(x = attn_weights_13_cast_fp16, y = mask)[name = string("attn_weights_15_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_cast_fp16 = softmax(axis = var_502, x = attn_weights_15_cast_fp16)[name = string("attn_weights_cast_fp16")];
+            bool var_651_transpose_x_1 = const()[name = string("op_651_transpose_x_1"), val = bool(false)];
+            bool var_651_transpose_y_1 = const()[name = string("op_651_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 32, 512, 128]> var_651_cast_fp16 = matmul(transpose_x = var_651_transpose_x_1, transpose_y = var_651_transpose_y_1, x = attn_weights_cast_fp16, y = v_15_cast_fp16)[name = string("op_651_cast_fp16")];
+            tensor<int32, [4]> attn_5_perm_0 = const()[name = string("attn_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             tensor<int32, [4]> var_654 = const()[name = string("op_654"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 512]> attn_5_cast_fp16 = transpose(perm = attn_5_perm_0, x = var_651_cast_fp16)[name = string("transpose_0")];
             tensor<fp16, [1, 4096, 1, 512]> input_17_cast_fp16 = reshape(shape = var_654, x = attn_5_cast_fp16)[name = string("input_17_cast_fp16")];
             tensor<int32, [2]> var_658 = const()[name = string("op_658"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_660 = const()[name = string("op_660"), val = tensor<int32, [2]>([1, 1])];
diff --git a/sequoia/Llama-2-7b-hf_chunk9.mlmodelc/analytics/coremldata.bin b/sequoia/Llama-2-7b-hf_chunk9.mlmodelc/analytics/coremldata.bin
index e3704b470dad34135a6b5cb4b471021bad5c3ae2..65ae082f31459bf8b09913d8861a15601cc13ad1 100644
--- a/sequoia/Llama-2-7b-hf_chunk9.mlmodelc/analytics/coremldata.bin
+++ b/sequoia/Llama-2-7b-hf_chunk9.mlmodelc/analytics/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:84e317a82cdf4e96f808f63e77f10098844d47ad522545181edfac4d287c9c92
+oid sha256:e69e7ad37dd59e97348d395eec9b4c41b7d3ea44d86f613751ae47803a0a2efe
 size 243
diff --git a/sequoia/Llama-2-7b-hf_chunk9.mlmodelc/coremldata.bin b/sequoia/Llama-2-7b-hf_chunk9.mlmodelc/coremldata.bin
index 8bbc6dd38c74fe23594355e106f197518d41765b..a503721fa97147a06f91e834353053693378b0ad 100644
--- a/sequoia/Llama-2-7b-hf_chunk9.mlmodelc/coremldata.bin
+++ b/sequoia/Llama-2-7b-hf_chunk9.mlmodelc/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e430d0795ff5c384187174f5718a2c13d0070f5d6a811831e18862497865a86d
+oid sha256:d35a0353bcfa501579e07af3718261af3b129b4bec004c1fe6d812a6403a3f5b
 size 1037
diff --git a/sequoia/Llama-2-7b-hf_chunk9.mlmodelc/metadata.json b/sequoia/Llama-2-7b-hf_chunk9.mlmodelc/metadata.json
index 5b684c1f9cbaff97fda44561e0197dd6cbecfe82..3dc0428c338648533489510935fe541d11d8534a 100644
--- a/sequoia/Llama-2-7b-hf_chunk9.mlmodelc/metadata.json
+++ b/sequoia/Llama-2-7b-hf_chunk9.mlmodelc/metadata.json
@@ -17,9 +17,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 128, 508]",
         "name" : "new_k_cache_0",
         "type" : "MultiArray"
       },
@@ -27,9 +27,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 128, 508]",
         "name" : "new_k_cache_1",
         "type" : "MultiArray"
       },
@@ -37,9 +37,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 128, 508]",
         "name" : "new_k_cache_2",
         "type" : "MultiArray"
       },
@@ -47,9 +47,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 508, 128]",
         "name" : "new_v_cache_0",
         "type" : "MultiArray"
       },
@@ -57,9 +57,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 508, 128]",
         "name" : "new_v_cache_1",
         "type" : "MultiArray"
       },
@@ -67,9 +67,9 @@
         "hasShapeFlexibility" : "0",
         "isOptional" : "0",
         "dataType" : "Float16",
-        "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+        "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
         "shortDescription" : "",
-        "shape" : "[1, 32, 128, 511]",
+        "shape" : "[1, 32, 508, 128]",
         "name" : "new_v_cache_2",
         "type" : "MultiArray"
       }
@@ -142,9 +142,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_0",
             "type" : "MultiArray"
           },
@@ -152,9 +152,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_1",
             "type" : "MultiArray"
           },
@@ -162,9 +162,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_2",
             "type" : "MultiArray"
           },
@@ -172,9 +172,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_0",
             "type" : "MultiArray"
           },
@@ -182,9 +182,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_1",
             "type" : "MultiArray"
           },
@@ -192,9 +192,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_2",
             "type" : "MultiArray"
           }
@@ -203,14 +203,15 @@
         "mlProgramOperationTypeHistogram" : {
           "Ios18.constexprLutToDense" : 21,
           "Ios18.conv" : 21,
-          "Ios18.matmul" : 6,
           "Ios18.expandDims" : 6,
-          "Ios18.concat" : 18,
+          "Ios18.matmul" : 6,
+          "Ios18.concat" : 12,
           "Ios18.add" : 15,
           "Ios18.realDiv" : 6,
           "Ios18.silu" : 3,
           "Ios18.softmax" : 3,
           "Ios18.sliceByIndex" : 18,
+          "Ios18.transpose" : 6,
           "Ios16.reduceL2Norm" : 6,
           "Ios18.squeeze" : 6,
           "Ios18.reshape" : 12,
@@ -223,9 +224,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 1)",
+            "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 4)",
             "shortDescription" : "",
-            "shape" : "[1, 4096, 1, 1]",
+            "shape" : "[1, 4096, 1, 4]",
             "name" : "x",
             "type" : "MultiArray"
           },
@@ -233,9 +234,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 128 × 1)",
+            "formattedType" : "MultiArray (Float16 128 × 4)",
             "shortDescription" : "",
-            "shape" : "[128, 1]",
+            "shape" : "[128, 4]",
             "name" : "cos",
             "type" : "MultiArray"
           },
@@ -243,9 +244,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 128 × 1)",
+            "formattedType" : "MultiArray (Float16 128 × 4)",
             "shortDescription" : "",
-            "shape" : "[128, 1]",
+            "shape" : "[128, 4]",
             "name" : "sin",
             "type" : "MultiArray"
           },
@@ -253,9 +254,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 1 × 1 × 512)",
+            "formattedType" : "MultiArray (Float16 1 × 1 × 4 × 512)",
             "shortDescription" : "",
-            "shape" : "[1, 1, 1, 512]",
+            "shape" : "[1, 1, 4, 512]",
             "name" : "mask",
             "type" : "MultiArray"
           },
@@ -263,9 +264,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "k_cache_0",
             "type" : "MultiArray"
           },
@@ -273,9 +274,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "v_cache_0",
             "type" : "MultiArray"
           },
@@ -283,9 +284,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "k_cache_1",
             "type" : "MultiArray"
           },
@@ -293,9 +294,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "v_cache_1",
             "type" : "MultiArray"
           },
@@ -303,9 +304,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "k_cache_2",
             "type" : "MultiArray"
           },
@@ -313,9 +314,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "1",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)?",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)?",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "v_cache_2",
             "type" : "MultiArray"
           }
@@ -330,9 +331,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 1)",
+            "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 4)",
             "shortDescription" : "",
-            "shape" : "[1, 4096, 1, 1]",
+            "shape" : "[1, 4096, 1, 4]",
             "name" : "new_x",
             "type" : "MultiArray"
           },
@@ -340,9 +341,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_0",
             "type" : "MultiArray"
           },
@@ -350,9 +351,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_1",
             "type" : "MultiArray"
           },
@@ -360,9 +361,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 508)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 128, 508]",
             "name" : "new_k_cache_2",
             "type" : "MultiArray"
           },
@@ -370,9 +371,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_0",
             "type" : "MultiArray"
           },
@@ -380,9 +381,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_1",
             "type" : "MultiArray"
           },
@@ -390,9 +391,9 @@
             "hasShapeFlexibility" : "0",
             "isOptional" : "0",
             "dataType" : "Float16",
-            "formattedType" : "MultiArray (Float16 1 × 32 × 128 × 511)",
+            "formattedType" : "MultiArray (Float16 1 × 32 × 508 × 128)",
             "shortDescription" : "",
-            "shape" : "[1, 32, 128, 511]",
+            "shape" : "[1, 32, 508, 128]",
             "name" : "new_v_cache_2",
             "type" : "MultiArray"
           }
@@ -401,14 +402,15 @@
         "mlProgramOperationTypeHistogram" : {
           "Ios18.constexprLutToDense" : 21,
           "Ios18.conv" : 21,
-          "Ios18.matmul" : 6,
           "Ios18.expandDims" : 6,
+          "Ios18.matmul" : 6,
           "Ios18.concat" : 18,
           "Ios18.add" : 15,
           "Ios18.realDiv" : 6,
           "Ios18.silu" : 3,
           "Ios18.softmax" : 3,
           "Ios18.sliceByIndex" : 18,
+          "Ios18.transpose" : 6,
           "Ios16.reduceL2Norm" : 6,
           "Ios18.squeeze" : 6,
           "Ios18.reshape" : 12,
@@ -419,14 +421,15 @@
     "mlProgramOperationTypeHistogram" : {
       "Ios18.constexprLutToDense" : 21,
       "Ios18.conv" : 21,
-      "Ios18.matmul" : 6,
       "Ios18.expandDims" : 6,
-      "Ios18.concat" : 18,
+      "Ios18.matmul" : 6,
+      "Ios18.concat" : 12,
       "Ios18.add" : 15,
       "Ios18.realDiv" : 6,
       "Ios18.silu" : 3,
       "Ios18.softmax" : 3,
       "Ios18.sliceByIndex" : 18,
+      "Ios18.transpose" : 6,
       "Ios16.reduceL2Norm" : 6,
       "Ios18.squeeze" : 6,
       "Ios18.reshape" : 12,
@@ -491,7 +494,7 @@
       }
     ],
     "defaultFunctionName" : "input_512_context_512",
-    "generatedClassName" : "Llama_2_7b_hf_2024_07_02_20_36_17_merged_chunk9",
+    "generatedClassName" : "Llama_2_7b_hf_2024_07_17_19_34_17_merged_chunk9",
     "userDefinedMetadata" : {
 
     },
diff --git a/sequoia/Llama-2-7b-hf_chunk9.mlmodelc/model.mil b/sequoia/Llama-2-7b-hf_chunk9.mlmodelc/model.mil
index 5b7b1db6b14fe10ff51aaa601d8484d9ff49576b..85f75a6da9054155e32013d96460963c79fba3f8 100644
--- a/sequoia/Llama-2-7b-hf_chunk9.mlmodelc/model.mil
+++ b/sequoia/Llama-2-7b-hf_chunk9.mlmodelc/model.mil
@@ -1,7 +1,7 @@
 program(1.3)
-[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3400.34.1"}, {"coremlc-version", "3400.42.1"}})]
+[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3400.42.1"}, {"coremlc-version", "3400.51.1"}})]
 {
-    func input_1_context_512<ios18>(tensor<fp16, [128, 1]> cos, tensor<fp16, [1, 32, 128, 511]> k_cache_0, tensor<fp16, [1, 32, 128, 511]> k_cache_1, tensor<fp16, [1, 32, 128, 511]> k_cache_2, tensor<fp16, [1, 1, 1, 512]> mask, tensor<fp16, [128, 1]> sin, tensor<fp16, [1, 32, 128, 511]> v_cache_0, tensor<fp16, [1, 32, 128, 511]> v_cache_1, tensor<fp16, [1, 32, 128, 511]> v_cache_2, tensor<fp16, [1, 4096, 1, 1]> x) [CoreML_InputDefaultValues = dict<string, fp32>({{"k_cache_0", 0}, {"k_cache_1", 0}, {"k_cache_2", 0}, {"v_cache_0", 0}, {"v_cache_1", 0}, {"v_cache_2", 0}})] {
+    func input_1_context_512<ios18>(tensor<fp16, [128, 4]> cos, tensor<fp16, [1, 32, 128, 508]> k_cache_0, tensor<fp16, [1, 32, 128, 508]> k_cache_1, tensor<fp16, [1, 32, 128, 508]> k_cache_2, tensor<fp16, [1, 1, 4, 512]> mask, tensor<fp16, [128, 4]> sin, tensor<fp16, [1, 32, 508, 128]> v_cache_0, tensor<fp16, [1, 32, 508, 128]> v_cache_1, tensor<fp16, [1, 32, 508, 128]> v_cache_2, tensor<fp16, [1, 4096, 1, 4]> x) [CoreML_InputDefaultValues = dict<string, fp32>({{"k_cache_0", 0}, {"k_cache_1", 0}, {"k_cache_2", 0}, {"v_cache_0", 0}, {"v_cache_1", 0}, {"v_cache_2", 0}})] {
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_q_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303873792))))[name = string("blocks_0_attn_q_proj_weight_palettized_cast_fp16")];
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_k_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8388864))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303873920))))[name = string("blocks_0_attn_k_proj_weight_palettized_cast_fp16")];
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_v_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(16777664))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303874048))))[name = string("blocks_0_attn_v_proj_weight_palettized_cast_fp16")];
@@ -29,438 +29,450 @@ program(1.3)
             int32 var_34 = const()[name = string("op_34"), val = int32(-2)];
             bool var_35 = const()[name = string("op_35"), val = bool(true)];
             tensor<int32, [1]> var_53_axes_0 = const()[name = string("op_53_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_53_cast_fp16 = squeeze(axes = var_53_axes_0, x = x)[name = string("op_53_cast_fp16")];
+            tensor<fp16, [1, 4096, 4]> var_53_cast_fp16 = squeeze(axes = var_53_axes_0, x = x)[name = string("op_53_cast_fp16")];
             bool var_55_interleave_0 = const()[name = string("op_55_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_1_to_fp16 = const()[name = string("eps_chan_1_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_55_cast_fp16 = concat(axis = var_31, interleave = var_55_interleave_0, values = (var_53_cast_fp16, eps_chan_1_to_fp16))[name = string("op_55_cast_fp16")];
+            tensor<fp16, [1, 1, 4]> eps_chan_1_to_fp16 = const()[name = string("eps_chan_1_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_55_cast_fp16 = concat(axis = var_31, interleave = var_55_interleave_0, values = (var_53_cast_fp16, eps_chan_1_to_fp16))[name = string("op_55_cast_fp16")];
             tensor<int32, [1]> x_eps_1_axes_0 = const()[name = string("x_eps_1_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_1_cast_fp16 = expand_dims(axes = x_eps_1_axes_0, x = var_55_cast_fp16)[name = string("x_eps_1_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_1_cast_fp16 = expand_dims(axes = x_eps_1_axes_0, x = var_55_cast_fp16)[name = string("x_eps_1_cast_fp16")];
             tensor<int32, [1]> norm_x_1_axes_0 = const()[name = string("norm_x_1_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_1_cast_fp16 = reduce_l2_norm(axes = norm_x_1_axes_0, keep_dims = var_35, x = x_eps_1_cast_fp16)[name = string("norm_x_1_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_1_cast_fp16 = real_div(x = x, y = norm_x_1_cast_fp16)[name = string("x_normed_1_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_1_cast_fp16 = reduce_l2_norm(axes = norm_x_1_axes_0, keep_dims = var_35, x = x_eps_1_cast_fp16)[name = string("norm_x_1_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_1_cast_fp16 = real_div(x = x, y = norm_x_1_cast_fp16)[name = string("x_normed_1_cast_fp16")];
             fp16 var_60_to_fp16 = const()[name = string("op_60_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_60_to_fp16)[name = string("x_normed_3_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_60_to_fp16)[name = string("x_normed_3_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_1_weight_to_fp16 = const()[name = string("blocks_0_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303567936)))];
-            tensor<fp16, [1, 4096, 1, 1]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = string("x_5_cast_fp16")];
-            tensor<int32, [2]> var_72 = const()[name = string("op_72"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_74 = const()[name = string("op_74"), val = tensor<int32, [2]>([1, 1])];
-            string var_76_pad_type_0 = const()[name = string("op_76_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_76_pad_0 = const()[name = string("op_76_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_76_cast_fp16 = conv(dilations = var_74, groups = var_31, pad = var_76_pad_0, pad_type = var_76_pad_type_0, strides = var_72, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_76_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = string("x_5_cast_fp16")];
+            tensor<int32, [2]> var_73 = const()[name = string("op_73"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_75 = const()[name = string("op_75"), val = tensor<int32, [2]>([1, 1])];
+            string var_77_pad_type_0 = const()[name = string("op_77_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_77_pad_0 = const()[name = string("op_77_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_77_cast_fp16 = conv(dilations = var_75, groups = var_31, pad = var_77_pad_0, pad_type = var_77_pad_type_0, strides = var_73, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_77_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303576192)))];
-            tensor<fp16, [1, 4096, 1, 1]> q_1_cast_fp16 = mul(x = var_76_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
-            tensor<int32, [2]> var_80 = const()[name = string("op_80"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_82 = const()[name = string("op_82"), val = tensor<int32, [2]>([1, 1])];
-            string var_84_pad_type_0 = const()[name = string("op_84_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_84_pad_0 = const()[name = string("op_84_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_84_cast_fp16 = conv(dilations = var_82, groups = var_31, pad = var_84_pad_0, pad_type = var_84_pad_type_0, strides = var_80, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_84_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> q_1_cast_fp16 = mul(x = var_77_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
+            tensor<int32, [2]> var_81 = const()[name = string("op_81"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_83 = const()[name = string("op_83"), val = tensor<int32, [2]>([1, 1])];
+            string var_85_pad_type_0 = const()[name = string("op_85_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_85_pad_0 = const()[name = string("op_85_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_85_cast_fp16 = conv(dilations = var_83, groups = var_31, pad = var_85_pad_0, pad_type = var_85_pad_type_0, strides = var_81, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_85_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303584448)))];
-            tensor<fp16, [1, 4096, 1, 1]> k_1_cast_fp16 = mul(x = var_84_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
-            tensor<int32, [2]> var_88 = const()[name = string("op_88"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_90 = const()[name = string("op_90"), val = tensor<int32, [2]>([1, 1])];
-            string var_92_pad_type_0 = const()[name = string("op_92_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_92_pad_0 = const()[name = string("op_92_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_92_cast_fp16 = conv(dilations = var_90, groups = var_31, pad = var_92_pad_0, pad_type = var_92_pad_type_0, strides = var_88, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_92_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> k_1_cast_fp16 = mul(x = var_85_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
+            tensor<int32, [2]> var_89 = const()[name = string("op_89"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_91 = const()[name = string("op_91"), val = tensor<int32, [2]>([1, 1])];
+            string var_93_pad_type_0 = const()[name = string("op_93_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_93_pad_0 = const()[name = string("op_93_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_93_cast_fp16 = conv(dilations = var_91, groups = var_31, pad = var_93_pad_0, pad_type = var_93_pad_type_0, strides = var_89, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_93_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303592704)))];
-            tensor<fp16, [1, 4096, 1, 1]> v_1_cast_fp16 = mul(x = var_92_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
-            tensor<int32, [4]> var_94 = const()[name = string("op_94"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> q_3_cast_fp16 = reshape(shape = var_94, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
-            tensor<int32, [4]> var_96 = const()[name = string("op_96"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> k_3_cast_fp16 = reshape(shape = var_96, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
-            tensor<int32, [4]> var_98 = const()[name = string("op_98"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> v_3_cast_fp16 = reshape(shape = var_98, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
-            tensor<int32, [4]> var_110_begin_0 = const()[name = string("op_110_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_110_end_0 = const()[name = string("op_110_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_110_end_mask_0 = const()[name = string("op_110_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_110_cast_fp16 = slice_by_index(begin = var_110_begin_0, end = var_110_end_0, end_mask = var_110_end_mask_0, x = q_3_cast_fp16)[name = string("op_110_cast_fp16")];
-            tensor<int32, [4]> var_116_begin_0 = const()[name = string("op_116_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_116_end_0 = const()[name = string("op_116_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_116_end_mask_0 = const()[name = string("op_116_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_116_cast_fp16 = slice_by_index(begin = var_116_begin_0, end = var_116_end_0, end_mask = var_116_end_mask_0, x = q_3_cast_fp16)[name = string("op_116_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> v_1_cast_fp16 = mul(x = var_93_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
+            tensor<int32, [4]> var_95 = const()[name = string("op_95"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> q_3_cast_fp16 = reshape(shape = var_95, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
+            tensor<int32, [4]> var_97 = const()[name = string("op_97"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> k_3_cast_fp16 = reshape(shape = var_97, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
+            tensor<int32, [4]> var_99 = const()[name = string("op_99"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> v_3_cast_fp16 = reshape(shape = var_99, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
+            tensor<int32, [4]> var_111_begin_0 = const()[name = string("op_111_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_111_end_0 = const()[name = string("op_111_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_111_end_mask_0 = const()[name = string("op_111_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_111_cast_fp16 = slice_by_index(begin = var_111_begin_0, end = var_111_end_0, end_mask = var_111_end_mask_0, x = q_3_cast_fp16)[name = string("op_111_cast_fp16")];
+            tensor<int32, [4]> var_117_begin_0 = const()[name = string("op_117_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_117_end_0 = const()[name = string("op_117_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_117_end_mask_0 = const()[name = string("op_117_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_117_cast_fp16 = slice_by_index(begin = var_117_begin_0, end = var_117_end_0, end_mask = var_117_end_mask_0, x = q_3_cast_fp16)[name = string("op_117_cast_fp16")];
             fp16 const_6_promoted_to_fp16 = const()[name = string("const_6_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_118_cast_fp16 = mul(x = var_116_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_118_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_119_cast_fp16 = mul(x = var_117_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_119_cast_fp16")];
             bool rotated_1_interleave_0 = const()[name = string("rotated_1_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_1_cast_fp16 = concat(axis = var_34, interleave = rotated_1_interleave_0, values = (var_118_cast_fp16, var_110_cast_fp16))[name = string("rotated_1_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_121_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_121_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_122_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_122_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_1_cast_fp16 = add(x = var_121_cast_fp16, y = var_122_cast_fp16)[name = string("roped_1_cast_fp16")];
-            tensor<int32, [4]> var_135_begin_0 = const()[name = string("op_135_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_135_end_0 = const()[name = string("op_135_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_135_end_mask_0 = const()[name = string("op_135_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_135_cast_fp16 = slice_by_index(begin = var_135_begin_0, end = var_135_end_0, end_mask = var_135_end_mask_0, x = k_3_cast_fp16)[name = string("op_135_cast_fp16")];
-            tensor<int32, [4]> var_141_begin_0 = const()[name = string("op_141_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_141_end_0 = const()[name = string("op_141_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_141_end_mask_0 = const()[name = string("op_141_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_141_cast_fp16 = slice_by_index(begin = var_141_begin_0, end = var_141_end_0, end_mask = var_141_end_mask_0, x = k_3_cast_fp16)[name = string("op_141_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_1_cast_fp16 = concat(axis = var_34, interleave = rotated_1_interleave_0, values = (var_119_cast_fp16, var_111_cast_fp16))[name = string("rotated_1_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_122_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_122_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_123_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_123_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_1_cast_fp16 = add(x = var_122_cast_fp16, y = var_123_cast_fp16)[name = string("roped_1_cast_fp16")];
+            tensor<int32, [4]> var_136_begin_0 = const()[name = string("op_136_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_136_end_0 = const()[name = string("op_136_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_136_end_mask_0 = const()[name = string("op_136_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_136_cast_fp16 = slice_by_index(begin = var_136_begin_0, end = var_136_end_0, end_mask = var_136_end_mask_0, x = k_3_cast_fp16)[name = string("op_136_cast_fp16")];
+            tensor<int32, [4]> var_142_begin_0 = const()[name = string("op_142_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_142_end_0 = const()[name = string("op_142_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_142_end_mask_0 = const()[name = string("op_142_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_142_cast_fp16 = slice_by_index(begin = var_142_begin_0, end = var_142_end_0, end_mask = var_142_end_mask_0, x = k_3_cast_fp16)[name = string("op_142_cast_fp16")];
             fp16 const_8_promoted_to_fp16 = const()[name = string("const_8_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_143_cast_fp16 = mul(x = var_141_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_143_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_144_cast_fp16 = mul(x = var_142_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_144_cast_fp16")];
             bool rotated_3_interleave_0 = const()[name = string("rotated_3_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_3_cast_fp16 = concat(axis = var_34, interleave = rotated_3_interleave_0, values = (var_143_cast_fp16, var_135_cast_fp16))[name = string("rotated_3_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_146_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_146_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_147_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_147_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_3_cast_fp16 = add(x = var_146_cast_fp16, y = var_147_cast_fp16)[name = string("roped_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_3_cast_fp16 = concat(axis = var_34, interleave = rotated_3_interleave_0, values = (var_144_cast_fp16, var_136_cast_fp16))[name = string("rotated_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_147_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_147_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_148_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_148_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_3_cast_fp16 = add(x = var_147_cast_fp16, y = var_148_cast_fp16)[name = string("roped_3_cast_fp16")];
+            tensor<int32, [4]> v_5_perm_0 = const()[name = string("v_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             bool k_7_interleave_0 = const()[name = string("k_7_interleave_0"), val = bool(false)];
             tensor<fp16, [1, 32, 128, 512]> k_7_cast_fp16 = concat(axis = var_22, interleave = k_7_interleave_0, values = (k_cache_0, roped_3_cast_fp16))[name = string("k_7_cast_fp16")];
-            bool v_5_interleave_0 = const()[name = string("v_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> v_5_cast_fp16 = concat(axis = var_22, interleave = v_5_interleave_0, values = (v_cache_0, v_3_cast_fp16))[name = string("v_5_cast_fp16")];
-            tensor<int32, [4]> var_154_begin_0 = const()[name = string("op_154_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_154_end_0 = const()[name = string("op_154_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_154_end_mask_0 = const()[name = string("op_154_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_0 = slice_by_index(begin = var_154_begin_0, end = var_154_end_0, end_mask = var_154_end_mask_0, x = k_7_cast_fp16)[name = string("op_154_cast_fp16")];
-            tensor<int32, [4]> var_155_begin_0 = const()[name = string("op_155_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_155_end_0 = const()[name = string("op_155_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_155_end_mask_0 = const()[name = string("op_155_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_0 = slice_by_index(begin = var_155_begin_0, end = var_155_end_0, end_mask = var_155_end_mask_0, x = v_5_cast_fp16)[name = string("op_155_cast_fp16")];
-            fp16 var_159_to_fp16 = const()[name = string("op_159_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 1]> var_160_cast_fp16 = mul(x = roped_1_cast_fp16, y = var_159_to_fp16)[name = string("op_160_cast_fp16")];
+            bool v_7_interleave_0 = const()[name = string("v_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> v_5_cast_fp16 = transpose(perm = v_5_perm_0, x = v_3_cast_fp16)[name = string("transpose_5")];
+            tensor<fp16, [1, 32, 512, 128]> v_7_cast_fp16 = concat(axis = var_34, interleave = v_7_interleave_0, values = (v_cache_0, v_5_cast_fp16))[name = string("v_7_cast_fp16")];
+            tensor<int32, [4]> var_159_begin_0 = const()[name = string("op_159_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_159_end_0 = const()[name = string("op_159_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_159_end_mask_0 = const()[name = string("op_159_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_0 = slice_by_index(begin = var_159_begin_0, end = var_159_end_0, end_mask = var_159_end_mask_0, x = k_7_cast_fp16)[name = string("op_159_cast_fp16")];
+            tensor<int32, [4]> var_160_begin_0 = const()[name = string("op_160_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_160_end_0 = const()[name = string("op_160_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_160_end_mask_0 = const()[name = string("op_160_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_0 = slice_by_index(begin = var_160_begin_0, end = var_160_end_0, end_mask = var_160_end_mask_0, x = v_7_cast_fp16)[name = string("op_160_cast_fp16")];
+            fp16 var_165_to_fp16 = const()[name = string("op_165_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 32, 128, 4]> var_166_cast_fp16 = mul(x = roped_1_cast_fp16, y = var_165_to_fp16)[name = string("op_166_cast_fp16")];
             bool attn_weights_1_transpose_x_0 = const()[name = string("attn_weights_1_transpose_x_0"), val = bool(true)];
             bool attn_weights_1_transpose_y_0 = const()[name = string("attn_weights_1_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_160_cast_fp16, y = k_7_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = mask)[name = string("attn_weights_3_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> var_168_cast_fp16 = softmax(axis = var_30, x = attn_weights_3_cast_fp16)[name = string("op_168_cast_fp16")];
-            bool attn_1_transpose_x_0 = const()[name = string("attn_1_transpose_x_0"), val = bool(false)];
-            bool attn_1_transpose_y_0 = const()[name = string("attn_1_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 1]> attn_1_cast_fp16 = matmul(transpose_x = attn_1_transpose_x_0, transpose_y = attn_1_transpose_y_0, x = v_5_cast_fp16, y = var_168_cast_fp16)[name = string("attn_1_cast_fp16")];
-            tensor<int32, [4]> var_172 = const()[name = string("op_172"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
-            tensor<fp16, [1, 4096, 1, 1]> input_1_cast_fp16 = reshape(shape = var_172, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
-            tensor<int32, [2]> var_176 = const()[name = string("op_176"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_178 = const()[name = string("op_178"), val = tensor<int32, [2]>([1, 1])];
-            string var_180_pad_type_0 = const()[name = string("op_180_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_180_pad_0 = const()[name = string("op_180_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_180_cast_fp16 = conv(dilations = var_178, groups = var_31, pad = var_180_pad_0, pad_type = var_180_pad_type_0, strides = var_176, weight = blocks_0_attn_proj_weight_palettized_cast_fp16, x = input_1_cast_fp16)[name = string("op_180_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_166_cast_fp16, y = k_7_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = mask)[name = string("attn_weights_3_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_5_cast_fp16 = softmax(axis = var_30, x = attn_weights_3_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
+            bool var_175_transpose_x_0 = const()[name = string("op_175_transpose_x_0"), val = bool(false)];
+            bool var_175_transpose_y_0 = const()[name = string("op_175_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> var_175_cast_fp16 = matmul(transpose_x = var_175_transpose_x_0, transpose_y = var_175_transpose_y_0, x = attn_weights_5_cast_fp16, y = v_7_cast_fp16)[name = string("op_175_cast_fp16")];
+            tensor<int32, [4]> attn_1_perm_0 = const()[name = string("attn_1_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_178 = const()[name = string("op_178"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 4]> attn_1_cast_fp16 = transpose(perm = attn_1_perm_0, x = var_175_cast_fp16)[name = string("transpose_4")];
+            tensor<fp16, [1, 4096, 1, 4]> input_1_cast_fp16 = reshape(shape = var_178, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
+            tensor<int32, [2]> var_182 = const()[name = string("op_182"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_184 = const()[name = string("op_184"), val = tensor<int32, [2]>([1, 1])];
+            string var_186_pad_type_0 = const()[name = string("op_186_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_186_pad_0 = const()[name = string("op_186_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_186_cast_fp16 = conv(dilations = var_184, groups = var_31, pad = var_186_pad_0, pad_type = var_186_pad_type_0, strides = var_182, weight = blocks_0_attn_proj_weight_palettized_cast_fp16, x = input_1_cast_fp16)[name = string("op_186_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303600960)))];
-            tensor<fp16, [1, 4096, 1, 1]> attention_output_1_cast_fp16 = mul(x = var_180_cast_fp16, y = blocks_0_attn_proj_output_scales_to_fp16)[name = string("attention_output_1_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_11_cast_fp16 = add(x = attention_output_1_cast_fp16, y = x)[name = string("x_11_cast_fp16")];
-            tensor<int32, [1]> var_199_axes_0 = const()[name = string("op_199_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_199_cast_fp16 = squeeze(axes = var_199_axes_0, x = x_11_cast_fp16)[name = string("op_199_cast_fp16")];
-            bool var_201_interleave_0 = const()[name = string("op_201_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_3_to_fp16 = const()[name = string("eps_chan_3_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_201_cast_fp16 = concat(axis = var_31, interleave = var_201_interleave_0, values = (var_199_cast_fp16, eps_chan_3_to_fp16))[name = string("op_201_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> attention_output_1_cast_fp16 = mul(x = var_186_cast_fp16, y = blocks_0_attn_proj_output_scales_to_fp16)[name = string("attention_output_1_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_11_cast_fp16 = add(x = attention_output_1_cast_fp16, y = x)[name = string("x_11_cast_fp16")];
+            tensor<int32, [1]> var_205_axes_0 = const()[name = string("op_205_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_205_cast_fp16 = squeeze(axes = var_205_axes_0, x = x_11_cast_fp16)[name = string("op_205_cast_fp16")];
+            bool var_207_interleave_0 = const()[name = string("op_207_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_3_to_fp16 = const()[name = string("eps_chan_3_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_207_cast_fp16 = concat(axis = var_31, interleave = var_207_interleave_0, values = (var_205_cast_fp16, eps_chan_3_to_fp16))[name = string("op_207_cast_fp16")];
             tensor<int32, [1]> x_eps_3_axes_0 = const()[name = string("x_eps_3_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_3_cast_fp16 = expand_dims(axes = x_eps_3_axes_0, x = var_201_cast_fp16)[name = string("x_eps_3_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_3_cast_fp16 = expand_dims(axes = x_eps_3_axes_0, x = var_207_cast_fp16)[name = string("x_eps_3_cast_fp16")];
             tensor<int32, [1]> norm_x_3_axes_0 = const()[name = string("norm_x_3_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_3_cast_fp16 = reduce_l2_norm(axes = norm_x_3_axes_0, keep_dims = var_35, x = x_eps_3_cast_fp16)[name = string("norm_x_3_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_7_cast_fp16 = real_div(x = x_11_cast_fp16, y = norm_x_3_cast_fp16)[name = string("x_normed_7_cast_fp16")];
-            fp16 var_206_to_fp16 = const()[name = string("op_206_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_9_cast_fp16 = mul(x = x_normed_7_cast_fp16, y = var_206_to_fp16)[name = string("x_normed_9_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_3_cast_fp16 = reduce_l2_norm(axes = norm_x_3_axes_0, keep_dims = var_35, x = x_eps_3_cast_fp16)[name = string("norm_x_3_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_7_cast_fp16 = real_div(x = x_11_cast_fp16, y = norm_x_3_cast_fp16)[name = string("x_normed_7_cast_fp16")];
+            fp16 var_212_to_fp16 = const()[name = string("op_212_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_9_cast_fp16 = mul(x = x_normed_7_cast_fp16, y = var_212_to_fp16)[name = string("x_normed_9_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_2_weight_to_fp16 = const()[name = string("blocks_0_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303609216)))];
-            tensor<fp16, [1, 4096, 1, 1]> input_3_cast_fp16 = mul(x = x_normed_9_cast_fp16, y = blocks_0_norm_2_weight_to_fp16)[name = string("input_3_cast_fp16")];
-            tensor<int32, [2]> var_218 = const()[name = string("op_218"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_220 = const()[name = string("op_220"), val = tensor<int32, [2]>([1, 1])];
-            string var_222_pad_type_0 = const()[name = string("op_222_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_222_pad_0 = const()[name = string("op_222_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_222_cast_fp16 = conv(dilations = var_220, groups = var_31, pad = var_222_pad_0, pad_type = var_222_pad_type_0, strides = var_218, weight = blocks_0_mlp_fc_1_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_222_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303617472)))];
-            tensor<fp16, [1, 11008, 1, 1]> input_5_cast_fp16 = mul(x = var_222_cast_fp16, y = blocks_0_mlp_fc_1_output_scales_to_fp16)[name = string("input_5_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> input_3_cast_fp16 = mul(x = x_normed_9_cast_fp16, y = blocks_0_norm_2_weight_to_fp16)[name = string("input_3_cast_fp16")];
+            tensor<int32, [2]> var_224 = const()[name = string("op_224"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_226 = const()[name = string("op_226"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_228 = const()[name = string("op_228"), val = tensor<int32, [2]>([1, 1])];
-            string var_230_pad_type_0 = const()[name = string("op_230_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_230_pad_0 = const()[name = string("op_230_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_230_cast_fp16 = conv(dilations = var_228, groups = var_31, pad = var_230_pad_0, pad_type = var_230_pad_type_0, strides = var_226, weight = blocks_0_mlp_fc_2_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_230_cast_fp16")];
+            string var_228_pad_type_0 = const()[name = string("op_228_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_228_pad_0 = const()[name = string("op_228_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_228_cast_fp16 = conv(dilations = var_226, groups = var_31, pad = var_228_pad_0, pad_type = var_228_pad_type_0, strides = var_224, weight = blocks_0_mlp_fc_1_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_228_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303617472)))];
+            tensor<fp16, [1, 11008, 1, 4]> input_5_cast_fp16 = mul(x = var_228_cast_fp16, y = blocks_0_mlp_fc_1_output_scales_to_fp16)[name = string("input_5_cast_fp16")];
+            tensor<int32, [2]> var_232 = const()[name = string("op_232"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_234 = const()[name = string("op_234"), val = tensor<int32, [2]>([1, 1])];
+            string var_236_pad_type_0 = const()[name = string("op_236_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_236_pad_0 = const()[name = string("op_236_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_236_cast_fp16 = conv(dilations = var_234, groups = var_31, pad = var_236_pad_0, pad_type = var_236_pad_type_0, strides = var_232, weight = blocks_0_mlp_fc_2_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_236_cast_fp16")];
             tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303639552)))];
-            tensor<fp16, [1, 11008, 1, 1]> x_fc_2_1_cast_fp16 = mul(x = var_230_cast_fp16, y = blocks_0_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_1_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> var_232_cast_fp16 = silu(x = input_5_cast_fp16)[name = string("op_232_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> input_7_cast_fp16 = mul(x = var_232_cast_fp16, y = x_fc_2_1_cast_fp16)[name = string("input_7_cast_fp16")];
-            tensor<int32, [2]> var_236 = const()[name = string("op_236"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_238 = const()[name = string("op_238"), val = tensor<int32, [2]>([1, 1])];
-            string var_240_pad_type_0 = const()[name = string("op_240_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_240_pad_0 = const()[name = string("op_240_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_240_cast_fp16 = conv(dilations = var_238, groups = var_31, pad = var_240_pad_0, pad_type = var_240_pad_type_0, strides = var_236, weight = blocks_0_mlp_proj_weight_palettized_cast_fp16, x = input_7_cast_fp16)[name = string("op_240_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> x_fc_2_1_cast_fp16 = mul(x = var_236_cast_fp16, y = blocks_0_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_1_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> var_238_cast_fp16 = silu(x = input_5_cast_fp16)[name = string("op_238_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> input_7_cast_fp16 = mul(x = var_238_cast_fp16, y = x_fc_2_1_cast_fp16)[name = string("input_7_cast_fp16")];
+            tensor<int32, [2]> var_242 = const()[name = string("op_242"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_244 = const()[name = string("op_244"), val = tensor<int32, [2]>([1, 1])];
+            string var_246_pad_type_0 = const()[name = string("op_246_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_246_pad_0 = const()[name = string("op_246_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_246_cast_fp16 = conv(dilations = var_244, groups = var_31, pad = var_246_pad_0, pad_type = var_246_pad_type_0, strides = var_242, weight = blocks_0_mlp_proj_weight_palettized_cast_fp16, x = input_7_cast_fp16)[name = string("op_246_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303661632)))];
-            tensor<fp16, [1, 4096, 1, 1]> var_241_cast_fp16 = mul(x = var_240_cast_fp16, y = blocks_0_mlp_proj_output_scales_to_fp16)[name = string("op_241_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_15_cast_fp16 = add(x = var_241_cast_fp16, y = x_11_cast_fp16)[name = string("x_15_cast_fp16")];
-            int32 var_252 = const()[name = string("op_252"), val = int32(-1)];
-            int32 var_260 = const()[name = string("op_260"), val = int32(3)];
-            int32 var_261 = const()[name = string("op_261"), val = int32(1)];
-            int32 var_264 = const()[name = string("op_264"), val = int32(-2)];
-            bool var_265 = const()[name = string("op_265"), val = bool(true)];
-            tensor<int32, [1]> var_282_axes_0 = const()[name = string("op_282_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_282_cast_fp16 = squeeze(axes = var_282_axes_0, x = x_15_cast_fp16)[name = string("op_282_cast_fp16")];
-            bool var_284_interleave_0 = const()[name = string("op_284_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_5_to_fp16 = const()[name = string("eps_chan_5_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_284_cast_fp16 = concat(axis = var_261, interleave = var_284_interleave_0, values = (var_282_cast_fp16, eps_chan_5_to_fp16))[name = string("op_284_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> var_247_cast_fp16 = mul(x = var_246_cast_fp16, y = blocks_0_mlp_proj_output_scales_to_fp16)[name = string("op_247_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_15_cast_fp16 = add(x = var_247_cast_fp16, y = x_11_cast_fp16)[name = string("x_15_cast_fp16")];
+            int32 var_258 = const()[name = string("op_258"), val = int32(-1)];
+            int32 var_266 = const()[name = string("op_266"), val = int32(3)];
+            int32 var_267 = const()[name = string("op_267"), val = int32(1)];
+            int32 var_270 = const()[name = string("op_270"), val = int32(-2)];
+            bool var_271 = const()[name = string("op_271"), val = bool(true)];
+            tensor<int32, [1]> var_288_axes_0 = const()[name = string("op_288_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_288_cast_fp16 = squeeze(axes = var_288_axes_0, x = x_15_cast_fp16)[name = string("op_288_cast_fp16")];
+            bool var_290_interleave_0 = const()[name = string("op_290_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_5_to_fp16 = const()[name = string("eps_chan_5_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_290_cast_fp16 = concat(axis = var_267, interleave = var_290_interleave_0, values = (var_288_cast_fp16, eps_chan_5_to_fp16))[name = string("op_290_cast_fp16")];
             tensor<int32, [1]> x_eps_5_axes_0 = const()[name = string("x_eps_5_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_5_cast_fp16 = expand_dims(axes = x_eps_5_axes_0, x = var_284_cast_fp16)[name = string("x_eps_5_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_5_cast_fp16 = expand_dims(axes = x_eps_5_axes_0, x = var_290_cast_fp16)[name = string("x_eps_5_cast_fp16")];
             tensor<int32, [1]> norm_x_5_axes_0 = const()[name = string("norm_x_5_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_5_cast_fp16 = reduce_l2_norm(axes = norm_x_5_axes_0, keep_dims = var_265, x = x_eps_5_cast_fp16)[name = string("norm_x_5_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_13_cast_fp16 = real_div(x = x_15_cast_fp16, y = norm_x_5_cast_fp16)[name = string("x_normed_13_cast_fp16")];
-            fp16 var_289_to_fp16 = const()[name = string("op_289_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_289_to_fp16)[name = string("x_normed_15_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_5_cast_fp16 = reduce_l2_norm(axes = norm_x_5_axes_0, keep_dims = var_271, x = x_eps_5_cast_fp16)[name = string("norm_x_5_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_13_cast_fp16 = real_div(x = x_15_cast_fp16, y = norm_x_5_cast_fp16)[name = string("x_normed_13_cast_fp16")];
+            fp16 var_295_to_fp16 = const()[name = string("op_295_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_295_to_fp16)[name = string("x_normed_15_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_1_weight_to_fp16 = const()[name = string("blocks_1_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303669888)))];
-            tensor<fp16, [1, 4096, 1, 1]> x_19_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = string("x_19_cast_fp16")];
-            tensor<int32, [2]> var_304 = const()[name = string("op_304"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_306 = const()[name = string("op_306"), val = tensor<int32, [2]>([1, 1])];
-            string var_308_pad_type_0 = const()[name = string("op_308_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_308_pad_0 = const()[name = string("op_308_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_308_cast_fp16 = conv(dilations = var_306, groups = var_261, pad = var_308_pad_0, pad_type = var_308_pad_type_0, strides = var_304, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_308_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_19_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = string("x_19_cast_fp16")];
+            tensor<int32, [2]> var_311 = const()[name = string("op_311"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_313 = const()[name = string("op_313"), val = tensor<int32, [2]>([1, 1])];
+            string var_315_pad_type_0 = const()[name = string("op_315_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_315_pad_0 = const()[name = string("op_315_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_315_cast_fp16 = conv(dilations = var_313, groups = var_267, pad = var_315_pad_0, pad_type = var_315_pad_type_0, strides = var_311, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_315_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303678144)))];
-            tensor<fp16, [1, 4096, 1, 1]> q_7_cast_fp16 = mul(x = var_308_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
-            tensor<int32, [2]> var_312 = const()[name = string("op_312"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_314 = const()[name = string("op_314"), val = tensor<int32, [2]>([1, 1])];
-            string var_316_pad_type_0 = const()[name = string("op_316_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_316_pad_0 = const()[name = string("op_316_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_316_cast_fp16 = conv(dilations = var_314, groups = var_261, pad = var_316_pad_0, pad_type = var_316_pad_type_0, strides = var_312, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_316_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> q_7_cast_fp16 = mul(x = var_315_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
+            tensor<int32, [2]> var_319 = const()[name = string("op_319"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_321 = const()[name = string("op_321"), val = tensor<int32, [2]>([1, 1])];
+            string var_323_pad_type_0 = const()[name = string("op_323_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_323_pad_0 = const()[name = string("op_323_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_323_cast_fp16 = conv(dilations = var_321, groups = var_267, pad = var_323_pad_0, pad_type = var_323_pad_type_0, strides = var_319, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_323_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303686400)))];
-            tensor<fp16, [1, 4096, 1, 1]> k_9_cast_fp16 = mul(x = var_316_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_9_cast_fp16")];
-            tensor<int32, [2]> var_320 = const()[name = string("op_320"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_322 = const()[name = string("op_322"), val = tensor<int32, [2]>([1, 1])];
-            string var_324_pad_type_0 = const()[name = string("op_324_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_324_pad_0 = const()[name = string("op_324_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_324_cast_fp16 = conv(dilations = var_322, groups = var_261, pad = var_324_pad_0, pad_type = var_324_pad_type_0, strides = var_320, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_324_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> k_9_cast_fp16 = mul(x = var_323_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_9_cast_fp16")];
+            tensor<int32, [2]> var_327 = const()[name = string("op_327"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_329 = const()[name = string("op_329"), val = tensor<int32, [2]>([1, 1])];
+            string var_331_pad_type_0 = const()[name = string("op_331_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_331_pad_0 = const()[name = string("op_331_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_331_cast_fp16 = conv(dilations = var_329, groups = var_267, pad = var_331_pad_0, pad_type = var_331_pad_type_0, strides = var_327, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_331_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303694656)))];
-            tensor<fp16, [1, 4096, 1, 1]> v_7_cast_fp16 = mul(x = var_324_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_7_cast_fp16")];
-            tensor<int32, [4]> var_326 = const()[name = string("op_326"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> q_9_cast_fp16 = reshape(shape = var_326, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
-            tensor<int32, [4]> var_328 = const()[name = string("op_328"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> k_11_cast_fp16 = reshape(shape = var_328, x = k_9_cast_fp16)[name = string("k_11_cast_fp16")];
-            tensor<int32, [4]> var_330 = const()[name = string("op_330"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> v_9_cast_fp16 = reshape(shape = var_330, x = v_7_cast_fp16)[name = string("v_9_cast_fp16")];
-            tensor<int32, [4]> var_342_begin_0 = const()[name = string("op_342_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_342_end_0 = const()[name = string("op_342_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_342_end_mask_0 = const()[name = string("op_342_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_342_cast_fp16 = slice_by_index(begin = var_342_begin_0, end = var_342_end_0, end_mask = var_342_end_mask_0, x = q_9_cast_fp16)[name = string("op_342_cast_fp16")];
-            tensor<int32, [4]> var_348_begin_0 = const()[name = string("op_348_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_348_end_0 = const()[name = string("op_348_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_348_end_mask_0 = const()[name = string("op_348_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_348_cast_fp16 = slice_by_index(begin = var_348_begin_0, end = var_348_end_0, end_mask = var_348_end_mask_0, x = q_9_cast_fp16)[name = string("op_348_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> v_9_cast_fp16 = mul(x = var_331_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_9_cast_fp16")];
+            tensor<int32, [4]> var_333 = const()[name = string("op_333"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> q_9_cast_fp16 = reshape(shape = var_333, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
+            tensor<int32, [4]> var_335 = const()[name = string("op_335"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> k_11_cast_fp16 = reshape(shape = var_335, x = k_9_cast_fp16)[name = string("k_11_cast_fp16")];
+            tensor<int32, [4]> var_337 = const()[name = string("op_337"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> v_11_cast_fp16 = reshape(shape = var_337, x = v_9_cast_fp16)[name = string("v_11_cast_fp16")];
+            tensor<int32, [4]> var_349_begin_0 = const()[name = string("op_349_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_349_end_0 = const()[name = string("op_349_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_349_end_mask_0 = const()[name = string("op_349_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_349_cast_fp16 = slice_by_index(begin = var_349_begin_0, end = var_349_end_0, end_mask = var_349_end_mask_0, x = q_9_cast_fp16)[name = string("op_349_cast_fp16")];
+            tensor<int32, [4]> var_355_begin_0 = const()[name = string("op_355_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_355_end_0 = const()[name = string("op_355_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_355_end_mask_0 = const()[name = string("op_355_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_355_cast_fp16 = slice_by_index(begin = var_355_begin_0, end = var_355_end_0, end_mask = var_355_end_mask_0, x = q_9_cast_fp16)[name = string("op_355_cast_fp16")];
             fp16 const_19_promoted_to_fp16 = const()[name = string("const_19_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_350_cast_fp16 = mul(x = var_348_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_350_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_357_cast_fp16 = mul(x = var_355_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_357_cast_fp16")];
             bool rotated_5_interleave_0 = const()[name = string("rotated_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_5_cast_fp16 = concat(axis = var_264, interleave = rotated_5_interleave_0, values = (var_350_cast_fp16, var_342_cast_fp16))[name = string("rotated_5_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_353_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_353_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_354_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_354_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_5_cast_fp16 = add(x = var_353_cast_fp16, y = var_354_cast_fp16)[name = string("roped_5_cast_fp16")];
-            tensor<int32, [4]> var_367_begin_0 = const()[name = string("op_367_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_367_end_0 = const()[name = string("op_367_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_367_end_mask_0 = const()[name = string("op_367_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_367_cast_fp16 = slice_by_index(begin = var_367_begin_0, end = var_367_end_0, end_mask = var_367_end_mask_0, x = k_11_cast_fp16)[name = string("op_367_cast_fp16")];
-            tensor<int32, [4]> var_373_begin_0 = const()[name = string("op_373_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_373_end_0 = const()[name = string("op_373_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_373_end_mask_0 = const()[name = string("op_373_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_373_cast_fp16 = slice_by_index(begin = var_373_begin_0, end = var_373_end_0, end_mask = var_373_end_mask_0, x = k_11_cast_fp16)[name = string("op_373_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_5_cast_fp16 = concat(axis = var_270, interleave = rotated_5_interleave_0, values = (var_357_cast_fp16, var_349_cast_fp16))[name = string("rotated_5_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_360_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_360_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_361_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_361_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_5_cast_fp16 = add(x = var_360_cast_fp16, y = var_361_cast_fp16)[name = string("roped_5_cast_fp16")];
+            tensor<int32, [4]> var_374_begin_0 = const()[name = string("op_374_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_374_end_0 = const()[name = string("op_374_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_374_end_mask_0 = const()[name = string("op_374_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_374_cast_fp16 = slice_by_index(begin = var_374_begin_0, end = var_374_end_0, end_mask = var_374_end_mask_0, x = k_11_cast_fp16)[name = string("op_374_cast_fp16")];
+            tensor<int32, [4]> var_380_begin_0 = const()[name = string("op_380_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_380_end_0 = const()[name = string("op_380_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_380_end_mask_0 = const()[name = string("op_380_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_380_cast_fp16 = slice_by_index(begin = var_380_begin_0, end = var_380_end_0, end_mask = var_380_end_mask_0, x = k_11_cast_fp16)[name = string("op_380_cast_fp16")];
             fp16 const_21_promoted_to_fp16 = const()[name = string("const_21_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_375_cast_fp16 = mul(x = var_373_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_375_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_382_cast_fp16 = mul(x = var_380_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_382_cast_fp16")];
             bool rotated_7_interleave_0 = const()[name = string("rotated_7_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_7_cast_fp16 = concat(axis = var_264, interleave = rotated_7_interleave_0, values = (var_375_cast_fp16, var_367_cast_fp16))[name = string("rotated_7_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_378_cast_fp16 = mul(x = k_11_cast_fp16, y = cos)[name = string("op_378_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_379_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = string("op_379_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_7_cast_fp16 = add(x = var_378_cast_fp16, y = var_379_cast_fp16)[name = string("roped_7_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_7_cast_fp16 = concat(axis = var_270, interleave = rotated_7_interleave_0, values = (var_382_cast_fp16, var_374_cast_fp16))[name = string("rotated_7_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_385_cast_fp16 = mul(x = k_11_cast_fp16, y = cos)[name = string("op_385_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_386_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = string("op_386_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_7_cast_fp16 = add(x = var_385_cast_fp16, y = var_386_cast_fp16)[name = string("roped_7_cast_fp16")];
+            tensor<int32, [4]> v_13_perm_0 = const()[name = string("v_13_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             bool k_15_interleave_0 = const()[name = string("k_15_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = concat(axis = var_252, interleave = k_15_interleave_0, values = (k_cache_1, roped_7_cast_fp16))[name = string("k_15_cast_fp16")];
-            bool v_11_interleave_0 = const()[name = string("v_11_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> v_11_cast_fp16 = concat(axis = var_252, interleave = v_11_interleave_0, values = (v_cache_1, v_9_cast_fp16))[name = string("v_11_cast_fp16")];
-            tensor<int32, [4]> var_386_begin_0 = const()[name = string("op_386_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_386_end_0 = const()[name = string("op_386_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_386_end_mask_0 = const()[name = string("op_386_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_1 = slice_by_index(begin = var_386_begin_0, end = var_386_end_0, end_mask = var_386_end_mask_0, x = k_15_cast_fp16)[name = string("op_386_cast_fp16")];
-            tensor<int32, [4]> var_387_begin_0 = const()[name = string("op_387_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_387_end_0 = const()[name = string("op_387_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_387_end_mask_0 = const()[name = string("op_387_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_1 = slice_by_index(begin = var_387_begin_0, end = var_387_end_0, end_mask = var_387_end_mask_0, x = v_11_cast_fp16)[name = string("op_387_cast_fp16")];
-            fp16 var_391_to_fp16 = const()[name = string("op_391_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 1]> var_392_cast_fp16 = mul(x = roped_5_cast_fp16, y = var_391_to_fp16)[name = string("op_392_cast_fp16")];
-            bool attn_weights_5_transpose_x_0 = const()[name = string("attn_weights_5_transpose_x_0"), val = bool(true)];
-            bool attn_weights_5_transpose_y_0 = const()[name = string("attn_weights_5_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_5_cast_fp16 = matmul(transpose_x = attn_weights_5_transpose_x_0, transpose_y = attn_weights_5_transpose_y_0, x = var_392_cast_fp16, y = k_15_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_7_cast_fp16 = add(x = attn_weights_5_cast_fp16, y = mask)[name = string("attn_weights_7_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> var_400_cast_fp16 = softmax(axis = var_260, x = attn_weights_7_cast_fp16)[name = string("op_400_cast_fp16")];
-            bool attn_5_transpose_x_0 = const()[name = string("attn_5_transpose_x_0"), val = bool(false)];
-            bool attn_5_transpose_y_0 = const()[name = string("attn_5_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 1]> attn_5_cast_fp16 = matmul(transpose_x = attn_5_transpose_x_0, transpose_y = attn_5_transpose_y_0, x = v_11_cast_fp16, y = var_400_cast_fp16)[name = string("attn_5_cast_fp16")];
-            tensor<int32, [4]> var_404 = const()[name = string("op_404"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
-            tensor<fp16, [1, 4096, 1, 1]> input_9_cast_fp16 = reshape(shape = var_404, x = attn_5_cast_fp16)[name = string("input_9_cast_fp16")];
-            tensor<int32, [2]> var_408 = const()[name = string("op_408"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_410 = const()[name = string("op_410"), val = tensor<int32, [2]>([1, 1])];
-            string var_412_pad_type_0 = const()[name = string("op_412_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_412_pad_0 = const()[name = string("op_412_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_412_cast_fp16 = conv(dilations = var_410, groups = var_261, pad = var_412_pad_0, pad_type = var_412_pad_type_0, strides = var_408, weight = blocks_1_attn_proj_weight_palettized_cast_fp16, x = input_9_cast_fp16)[name = string("op_412_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = concat(axis = var_258, interleave = k_15_interleave_0, values = (k_cache_1, roped_7_cast_fp16))[name = string("k_15_cast_fp16")];
+            bool v_15_interleave_0 = const()[name = string("v_15_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> v_13_cast_fp16 = transpose(perm = v_13_perm_0, x = v_11_cast_fp16)[name = string("transpose_3")];
+            tensor<fp16, [1, 32, 512, 128]> v_15_cast_fp16 = concat(axis = var_270, interleave = v_15_interleave_0, values = (v_cache_1, v_13_cast_fp16))[name = string("v_15_cast_fp16")];
+            tensor<int32, [4]> var_397_begin_0 = const()[name = string("op_397_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_397_end_0 = const()[name = string("op_397_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_397_end_mask_0 = const()[name = string("op_397_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_1 = slice_by_index(begin = var_397_begin_0, end = var_397_end_0, end_mask = var_397_end_mask_0, x = k_15_cast_fp16)[name = string("op_397_cast_fp16")];
+            tensor<int32, [4]> var_398_begin_0 = const()[name = string("op_398_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_398_end_0 = const()[name = string("op_398_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_398_end_mask_0 = const()[name = string("op_398_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_1 = slice_by_index(begin = var_398_begin_0, end = var_398_end_0, end_mask = var_398_end_mask_0, x = v_15_cast_fp16)[name = string("op_398_cast_fp16")];
+            fp16 var_403_to_fp16 = const()[name = string("op_403_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 32, 128, 4]> var_404_cast_fp16 = mul(x = roped_5_cast_fp16, y = var_403_to_fp16)[name = string("op_404_cast_fp16")];
+            bool attn_weights_7_transpose_x_0 = const()[name = string("attn_weights_7_transpose_x_0"), val = bool(true)];
+            bool attn_weights_7_transpose_y_0 = const()[name = string("attn_weights_7_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_7_cast_fp16 = matmul(transpose_x = attn_weights_7_transpose_x_0, transpose_y = attn_weights_7_transpose_y_0, x = var_404_cast_fp16, y = k_15_cast_fp16)[name = string("attn_weights_7_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_9_cast_fp16 = add(x = attn_weights_7_cast_fp16, y = mask)[name = string("attn_weights_9_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_11_cast_fp16 = softmax(axis = var_266, x = attn_weights_9_cast_fp16)[name = string("attn_weights_11_cast_fp16")];
+            bool var_413_transpose_x_0 = const()[name = string("op_413_transpose_x_0"), val = bool(false)];
+            bool var_413_transpose_y_0 = const()[name = string("op_413_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> var_413_cast_fp16 = matmul(transpose_x = var_413_transpose_x_0, transpose_y = var_413_transpose_y_0, x = attn_weights_11_cast_fp16, y = v_15_cast_fp16)[name = string("op_413_cast_fp16")];
+            tensor<int32, [4]> attn_3_perm_0 = const()[name = string("attn_3_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_416 = const()[name = string("op_416"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 4]> attn_3_cast_fp16 = transpose(perm = attn_3_perm_0, x = var_413_cast_fp16)[name = string("transpose_2")];
+            tensor<fp16, [1, 4096, 1, 4]> input_9_cast_fp16 = reshape(shape = var_416, x = attn_3_cast_fp16)[name = string("input_9_cast_fp16")];
+            tensor<int32, [2]> var_420 = const()[name = string("op_420"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_422 = const()[name = string("op_422"), val = tensor<int32, [2]>([1, 1])];
+            string var_424_pad_type_0 = const()[name = string("op_424_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_424_pad_0 = const()[name = string("op_424_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_424_cast_fp16 = conv(dilations = var_422, groups = var_267, pad = var_424_pad_0, pad_type = var_424_pad_type_0, strides = var_420, weight = blocks_1_attn_proj_weight_palettized_cast_fp16, x = input_9_cast_fp16)[name = string("op_424_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303702912)))];
-            tensor<fp16, [1, 4096, 1, 1]> attention_output_3_cast_fp16 = mul(x = var_412_cast_fp16, y = blocks_1_attn_proj_output_scales_to_fp16)[name = string("attention_output_3_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_25_cast_fp16 = add(x = attention_output_3_cast_fp16, y = x_15_cast_fp16)[name = string("x_25_cast_fp16")];
-            tensor<int32, [1]> var_431_axes_0 = const()[name = string("op_431_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_431_cast_fp16 = squeeze(axes = var_431_axes_0, x = x_25_cast_fp16)[name = string("op_431_cast_fp16")];
-            bool var_433_interleave_0 = const()[name = string("op_433_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_7_to_fp16 = const()[name = string("eps_chan_7_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_433_cast_fp16 = concat(axis = var_261, interleave = var_433_interleave_0, values = (var_431_cast_fp16, eps_chan_7_to_fp16))[name = string("op_433_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> attention_output_3_cast_fp16 = mul(x = var_424_cast_fp16, y = blocks_1_attn_proj_output_scales_to_fp16)[name = string("attention_output_3_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_25_cast_fp16 = add(x = attention_output_3_cast_fp16, y = x_15_cast_fp16)[name = string("x_25_cast_fp16")];
+            tensor<int32, [1]> var_443_axes_0 = const()[name = string("op_443_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_443_cast_fp16 = squeeze(axes = var_443_axes_0, x = x_25_cast_fp16)[name = string("op_443_cast_fp16")];
+            bool var_445_interleave_0 = const()[name = string("op_445_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_7_to_fp16 = const()[name = string("eps_chan_7_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_445_cast_fp16 = concat(axis = var_267, interleave = var_445_interleave_0, values = (var_443_cast_fp16, eps_chan_7_to_fp16))[name = string("op_445_cast_fp16")];
             tensor<int32, [1]> x_eps_7_axes_0 = const()[name = string("x_eps_7_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_7_cast_fp16 = expand_dims(axes = x_eps_7_axes_0, x = var_433_cast_fp16)[name = string("x_eps_7_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_7_cast_fp16 = expand_dims(axes = x_eps_7_axes_0, x = var_445_cast_fp16)[name = string("x_eps_7_cast_fp16")];
             tensor<int32, [1]> norm_x_7_axes_0 = const()[name = string("norm_x_7_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_7_cast_fp16 = reduce_l2_norm(axes = norm_x_7_axes_0, keep_dims = var_265, x = x_eps_7_cast_fp16)[name = string("norm_x_7_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_19_cast_fp16 = real_div(x = x_25_cast_fp16, y = norm_x_7_cast_fp16)[name = string("x_normed_19_cast_fp16")];
-            fp16 var_438_to_fp16 = const()[name = string("op_438_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_21_cast_fp16 = mul(x = x_normed_19_cast_fp16, y = var_438_to_fp16)[name = string("x_normed_21_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_7_cast_fp16 = reduce_l2_norm(axes = norm_x_7_axes_0, keep_dims = var_271, x = x_eps_7_cast_fp16)[name = string("norm_x_7_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_19_cast_fp16 = real_div(x = x_25_cast_fp16, y = norm_x_7_cast_fp16)[name = string("x_normed_19_cast_fp16")];
+            fp16 var_450_to_fp16 = const()[name = string("op_450_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_21_cast_fp16 = mul(x = x_normed_19_cast_fp16, y = var_450_to_fp16)[name = string("x_normed_21_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_2_weight_to_fp16 = const()[name = string("blocks_1_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303711168)))];
-            tensor<fp16, [1, 4096, 1, 1]> input_11_cast_fp16 = mul(x = x_normed_21_cast_fp16, y = blocks_1_norm_2_weight_to_fp16)[name = string("input_11_cast_fp16")];
-            tensor<int32, [2]> var_450 = const()[name = string("op_450"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_452 = const()[name = string("op_452"), val = tensor<int32, [2]>([1, 1])];
-            string var_454_pad_type_0 = const()[name = string("op_454_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_454_pad_0 = const()[name = string("op_454_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_454_cast_fp16 = conv(dilations = var_452, groups = var_261, pad = var_454_pad_0, pad_type = var_454_pad_type_0, strides = var_450, weight = blocks_1_mlp_fc_1_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_454_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> input_11_cast_fp16 = mul(x = x_normed_21_cast_fp16, y = blocks_1_norm_2_weight_to_fp16)[name = string("input_11_cast_fp16")];
+            tensor<int32, [2]> var_462 = const()[name = string("op_462"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_464 = const()[name = string("op_464"), val = tensor<int32, [2]>([1, 1])];
+            string var_466_pad_type_0 = const()[name = string("op_466_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_466_pad_0 = const()[name = string("op_466_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_466_cast_fp16 = conv(dilations = var_464, groups = var_267, pad = var_466_pad_0, pad_type = var_466_pad_type_0, strides = var_462, weight = blocks_1_mlp_fc_1_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_466_cast_fp16")];
             tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303719424)))];
-            tensor<fp16, [1, 11008, 1, 1]> input_13_cast_fp16 = mul(x = var_454_cast_fp16, y = blocks_1_mlp_fc_1_output_scales_to_fp16)[name = string("input_13_cast_fp16")];
-            tensor<int32, [2]> var_458 = const()[name = string("op_458"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_460 = const()[name = string("op_460"), val = tensor<int32, [2]>([1, 1])];
-            string var_462_pad_type_0 = const()[name = string("op_462_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_462_pad_0 = const()[name = string("op_462_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_462_cast_fp16 = conv(dilations = var_460, groups = var_261, pad = var_462_pad_0, pad_type = var_462_pad_type_0, strides = var_458, weight = blocks_1_mlp_fc_2_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_462_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303741504)))];
-            tensor<fp16, [1, 11008, 1, 1]> x_fc_2_3_cast_fp16 = mul(x = var_462_cast_fp16, y = blocks_1_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_3_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> var_464_cast_fp16 = silu(x = input_13_cast_fp16)[name = string("op_464_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> input_15_cast_fp16 = mul(x = var_464_cast_fp16, y = x_fc_2_3_cast_fp16)[name = string("input_15_cast_fp16")];
-            tensor<int32, [2]> var_468 = const()[name = string("op_468"), val = tensor<int32, [2]>([1, 1])];
+            tensor<fp16, [1, 11008, 1, 4]> input_13_cast_fp16 = mul(x = var_466_cast_fp16, y = blocks_1_mlp_fc_1_output_scales_to_fp16)[name = string("input_13_cast_fp16")];
             tensor<int32, [2]> var_470 = const()[name = string("op_470"), val = tensor<int32, [2]>([1, 1])];
-            string var_472_pad_type_0 = const()[name = string("op_472_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_472_pad_0 = const()[name = string("op_472_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_472_cast_fp16 = conv(dilations = var_470, groups = var_261, pad = var_472_pad_0, pad_type = var_472_pad_type_0, strides = var_468, weight = blocks_1_mlp_proj_weight_palettized_cast_fp16, x = input_15_cast_fp16)[name = string("op_472_cast_fp16")];
+            tensor<int32, [2]> var_472 = const()[name = string("op_472"), val = tensor<int32, [2]>([1, 1])];
+            string var_474_pad_type_0 = const()[name = string("op_474_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_474_pad_0 = const()[name = string("op_474_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_474_cast_fp16 = conv(dilations = var_472, groups = var_267, pad = var_474_pad_0, pad_type = var_474_pad_type_0, strides = var_470, weight = blocks_1_mlp_fc_2_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_474_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303741504)))];
+            tensor<fp16, [1, 11008, 1, 4]> x_fc_2_3_cast_fp16 = mul(x = var_474_cast_fp16, y = blocks_1_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_3_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> var_476_cast_fp16 = silu(x = input_13_cast_fp16)[name = string("op_476_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> input_15_cast_fp16 = mul(x = var_476_cast_fp16, y = x_fc_2_3_cast_fp16)[name = string("input_15_cast_fp16")];
+            tensor<int32, [2]> var_480 = const()[name = string("op_480"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_482 = const()[name = string("op_482"), val = tensor<int32, [2]>([1, 1])];
+            string var_484_pad_type_0 = const()[name = string("op_484_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_484_pad_0 = const()[name = string("op_484_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_484_cast_fp16 = conv(dilations = var_482, groups = var_267, pad = var_484_pad_0, pad_type = var_484_pad_type_0, strides = var_480, weight = blocks_1_mlp_proj_weight_palettized_cast_fp16, x = input_15_cast_fp16)[name = string("op_484_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303763584)))];
-            tensor<fp16, [1, 4096, 1, 1]> var_473_cast_fp16 = mul(x = var_472_cast_fp16, y = blocks_1_mlp_proj_output_scales_to_fp16)[name = string("op_473_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_29_cast_fp16 = add(x = var_473_cast_fp16, y = x_25_cast_fp16)[name = string("x_29_cast_fp16")];
-            int32 var_484 = const()[name = string("op_484"), val = int32(-1)];
-            int32 var_492 = const()[name = string("op_492"), val = int32(3)];
-            int32 var_493 = const()[name = string("op_493"), val = int32(1)];
-            int32 var_496 = const()[name = string("op_496"), val = int32(-2)];
-            bool var_497 = const()[name = string("op_497"), val = bool(true)];
-            tensor<int32, [1]> var_514_axes_0 = const()[name = string("op_514_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_514_cast_fp16 = squeeze(axes = var_514_axes_0, x = x_29_cast_fp16)[name = string("op_514_cast_fp16")];
-            bool var_516_interleave_0 = const()[name = string("op_516_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_9_to_fp16 = const()[name = string("eps_chan_9_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_516_cast_fp16 = concat(axis = var_493, interleave = var_516_interleave_0, values = (var_514_cast_fp16, eps_chan_9_to_fp16))[name = string("op_516_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> var_485_cast_fp16 = mul(x = var_484_cast_fp16, y = blocks_1_mlp_proj_output_scales_to_fp16)[name = string("op_485_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_29_cast_fp16 = add(x = var_485_cast_fp16, y = x_25_cast_fp16)[name = string("x_29_cast_fp16")];
+            int32 var_496 = const()[name = string("op_496"), val = int32(-1)];
+            int32 var_504 = const()[name = string("op_504"), val = int32(3)];
+            int32 var_505 = const()[name = string("op_505"), val = int32(1)];
+            int32 var_508 = const()[name = string("op_508"), val = int32(-2)];
+            bool var_509 = const()[name = string("op_509"), val = bool(true)];
+            tensor<int32, [1]> var_526_axes_0 = const()[name = string("op_526_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_526_cast_fp16 = squeeze(axes = var_526_axes_0, x = x_29_cast_fp16)[name = string("op_526_cast_fp16")];
+            bool var_528_interleave_0 = const()[name = string("op_528_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_9_to_fp16 = const()[name = string("eps_chan_9_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_528_cast_fp16 = concat(axis = var_505, interleave = var_528_interleave_0, values = (var_526_cast_fp16, eps_chan_9_to_fp16))[name = string("op_528_cast_fp16")];
             tensor<int32, [1]> x_eps_9_axes_0 = const()[name = string("x_eps_9_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_9_cast_fp16 = expand_dims(axes = x_eps_9_axes_0, x = var_516_cast_fp16)[name = string("x_eps_9_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_9_cast_fp16 = expand_dims(axes = x_eps_9_axes_0, x = var_528_cast_fp16)[name = string("x_eps_9_cast_fp16")];
             tensor<int32, [1]> norm_x_9_axes_0 = const()[name = string("norm_x_9_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_9_cast_fp16 = reduce_l2_norm(axes = norm_x_9_axes_0, keep_dims = var_497, x = x_eps_9_cast_fp16)[name = string("norm_x_9_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_25_cast_fp16 = real_div(x = x_29_cast_fp16, y = norm_x_9_cast_fp16)[name = string("x_normed_25_cast_fp16")];
-            fp16 var_521_to_fp16 = const()[name = string("op_521_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_27_cast_fp16 = mul(x = x_normed_25_cast_fp16, y = var_521_to_fp16)[name = string("x_normed_27_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_9_cast_fp16 = reduce_l2_norm(axes = norm_x_9_axes_0, keep_dims = var_509, x = x_eps_9_cast_fp16)[name = string("norm_x_9_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_25_cast_fp16 = real_div(x = x_29_cast_fp16, y = norm_x_9_cast_fp16)[name = string("x_normed_25_cast_fp16")];
+            fp16 var_533_to_fp16 = const()[name = string("op_533_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_27_cast_fp16 = mul(x = x_normed_25_cast_fp16, y = var_533_to_fp16)[name = string("x_normed_27_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_norm_1_weight_to_fp16 = const()[name = string("blocks_2_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303771840)))];
-            tensor<fp16, [1, 4096, 1, 1]> x_33_cast_fp16 = mul(x = x_normed_27_cast_fp16, y = blocks_2_norm_1_weight_to_fp16)[name = string("x_33_cast_fp16")];
-            tensor<int32, [2]> var_536 = const()[name = string("op_536"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_538 = const()[name = string("op_538"), val = tensor<int32, [2]>([1, 1])];
-            string var_540_pad_type_0 = const()[name = string("op_540_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_540_pad_0 = const()[name = string("op_540_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_540_cast_fp16 = conv(dilations = var_538, groups = var_493, pad = var_540_pad_0, pad_type = var_540_pad_type_0, strides = var_536, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_540_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_33_cast_fp16 = mul(x = x_normed_27_cast_fp16, y = blocks_2_norm_1_weight_to_fp16)[name = string("x_33_cast_fp16")];
+            tensor<int32, [2]> var_549 = const()[name = string("op_549"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_551 = const()[name = string("op_551"), val = tensor<int32, [2]>([1, 1])];
+            string var_553_pad_type_0 = const()[name = string("op_553_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_553_pad_0 = const()[name = string("op_553_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_553_cast_fp16 = conv(dilations = var_551, groups = var_505, pad = var_553_pad_0, pad_type = var_553_pad_type_0, strides = var_549, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_553_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303780096)))];
-            tensor<fp16, [1, 4096, 1, 1]> q_13_cast_fp16 = mul(x = var_540_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = string("q_13_cast_fp16")];
-            tensor<int32, [2]> var_544 = const()[name = string("op_544"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_546 = const()[name = string("op_546"), val = tensor<int32, [2]>([1, 1])];
-            string var_548_pad_type_0 = const()[name = string("op_548_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_548_pad_0 = const()[name = string("op_548_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_548_cast_fp16 = conv(dilations = var_546, groups = var_493, pad = var_548_pad_0, pad_type = var_548_pad_type_0, strides = var_544, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_548_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> q_13_cast_fp16 = mul(x = var_553_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = string("q_13_cast_fp16")];
+            tensor<int32, [2]> var_557 = const()[name = string("op_557"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_559 = const()[name = string("op_559"), val = tensor<int32, [2]>([1, 1])];
+            string var_561_pad_type_0 = const()[name = string("op_561_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_561_pad_0 = const()[name = string("op_561_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_561_cast_fp16 = conv(dilations = var_559, groups = var_505, pad = var_561_pad_0, pad_type = var_561_pad_type_0, strides = var_557, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_561_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303788352)))];
-            tensor<fp16, [1, 4096, 1, 1]> k_17_cast_fp16 = mul(x = var_548_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = string("k_17_cast_fp16")];
-            tensor<int32, [2]> var_552 = const()[name = string("op_552"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_554 = const()[name = string("op_554"), val = tensor<int32, [2]>([1, 1])];
-            string var_556_pad_type_0 = const()[name = string("op_556_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_556_pad_0 = const()[name = string("op_556_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_556_cast_fp16 = conv(dilations = var_554, groups = var_493, pad = var_556_pad_0, pad_type = var_556_pad_type_0, strides = var_552, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_556_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> k_17_cast_fp16 = mul(x = var_561_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = string("k_17_cast_fp16")];
+            tensor<int32, [2]> var_565 = const()[name = string("op_565"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_567 = const()[name = string("op_567"), val = tensor<int32, [2]>([1, 1])];
+            string var_569_pad_type_0 = const()[name = string("op_569_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_569_pad_0 = const()[name = string("op_569_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_569_cast_fp16 = conv(dilations = var_567, groups = var_505, pad = var_569_pad_0, pad_type = var_569_pad_type_0, strides = var_565, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_569_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303796608)))];
-            tensor<fp16, [1, 4096, 1, 1]> v_13_cast_fp16 = mul(x = var_556_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = string("v_13_cast_fp16")];
-            tensor<int32, [4]> var_558 = const()[name = string("op_558"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> q_15_cast_fp16 = reshape(shape = var_558, x = q_13_cast_fp16)[name = string("q_15_cast_fp16")];
-            tensor<int32, [4]> var_560 = const()[name = string("op_560"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> k_19_cast_fp16 = reshape(shape = var_560, x = k_17_cast_fp16)[name = string("k_19_cast_fp16")];
-            tensor<int32, [4]> var_562 = const()[name = string("op_562"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<fp16, [1, 32, 128, 1]> v_15_cast_fp16 = reshape(shape = var_562, x = v_13_cast_fp16)[name = string("v_15_cast_fp16")];
-            tensor<int32, [4]> var_574_begin_0 = const()[name = string("op_574_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_574_end_0 = const()[name = string("op_574_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_574_end_mask_0 = const()[name = string("op_574_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_574_cast_fp16 = slice_by_index(begin = var_574_begin_0, end = var_574_end_0, end_mask = var_574_end_mask_0, x = q_15_cast_fp16)[name = string("op_574_cast_fp16")];
-            tensor<int32, [4]> var_580_begin_0 = const()[name = string("op_580_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_580_end_0 = const()[name = string("op_580_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_580_end_mask_0 = const()[name = string("op_580_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_580_cast_fp16 = slice_by_index(begin = var_580_begin_0, end = var_580_end_0, end_mask = var_580_end_mask_0, x = q_15_cast_fp16)[name = string("op_580_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> v_17_cast_fp16 = mul(x = var_569_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = string("v_17_cast_fp16")];
+            tensor<int32, [4]> var_571 = const()[name = string("op_571"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> q_15_cast_fp16 = reshape(shape = var_571, x = q_13_cast_fp16)[name = string("q_15_cast_fp16")];
+            tensor<int32, [4]> var_573 = const()[name = string("op_573"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> k_19_cast_fp16 = reshape(shape = var_573, x = k_17_cast_fp16)[name = string("k_19_cast_fp16")];
+            tensor<int32, [4]> var_575 = const()[name = string("op_575"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<fp16, [1, 32, 128, 4]> v_19_cast_fp16 = reshape(shape = var_575, x = v_17_cast_fp16)[name = string("v_19_cast_fp16")];
+            tensor<int32, [4]> var_587_begin_0 = const()[name = string("op_587_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_587_end_0 = const()[name = string("op_587_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_587_end_mask_0 = const()[name = string("op_587_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_587_cast_fp16 = slice_by_index(begin = var_587_begin_0, end = var_587_end_0, end_mask = var_587_end_mask_0, x = q_15_cast_fp16)[name = string("op_587_cast_fp16")];
+            tensor<int32, [4]> var_593_begin_0 = const()[name = string("op_593_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_593_end_0 = const()[name = string("op_593_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_593_end_mask_0 = const()[name = string("op_593_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_593_cast_fp16 = slice_by_index(begin = var_593_begin_0, end = var_593_end_0, end_mask = var_593_end_mask_0, x = q_15_cast_fp16)[name = string("op_593_cast_fp16")];
             fp16 const_32_promoted_to_fp16 = const()[name = string("const_32_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_582_cast_fp16 = mul(x = var_580_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_582_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_595_cast_fp16 = mul(x = var_593_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_595_cast_fp16")];
             bool rotated_9_interleave_0 = const()[name = string("rotated_9_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_9_cast_fp16 = concat(axis = var_496, interleave = rotated_9_interleave_0, values = (var_582_cast_fp16, var_574_cast_fp16))[name = string("rotated_9_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_585_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = string("op_585_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_586_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = string("op_586_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_9_cast_fp16 = add(x = var_585_cast_fp16, y = var_586_cast_fp16)[name = string("roped_9_cast_fp16")];
-            tensor<int32, [4]> var_599_begin_0 = const()[name = string("op_599_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_599_end_0 = const()[name = string("op_599_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
-            tensor<bool, [4]> var_599_end_mask_0 = const()[name = string("op_599_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_599_cast_fp16 = slice_by_index(begin = var_599_begin_0, end = var_599_end_0, end_mask = var_599_end_mask_0, x = k_19_cast_fp16)[name = string("op_599_cast_fp16")];
-            tensor<int32, [4]> var_605_begin_0 = const()[name = string("op_605_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_605_end_0 = const()[name = string("op_605_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
-            tensor<bool, [4]> var_605_end_mask_0 = const()[name = string("op_605_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 1]> var_605_cast_fp16 = slice_by_index(begin = var_605_begin_0, end = var_605_end_0, end_mask = var_605_end_mask_0, x = k_19_cast_fp16)[name = string("op_605_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_9_cast_fp16 = concat(axis = var_508, interleave = rotated_9_interleave_0, values = (var_595_cast_fp16, var_587_cast_fp16))[name = string("rotated_9_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_598_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = string("op_598_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_599_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = string("op_599_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_9_cast_fp16 = add(x = var_598_cast_fp16, y = var_599_cast_fp16)[name = string("roped_9_cast_fp16")];
+            tensor<int32, [4]> var_612_begin_0 = const()[name = string("op_612_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_612_end_0 = const()[name = string("op_612_end_0"), val = tensor<int32, [4]>([1, 32, 64, 4])];
+            tensor<bool, [4]> var_612_end_mask_0 = const()[name = string("op_612_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_612_cast_fp16 = slice_by_index(begin = var_612_begin_0, end = var_612_end_0, end_mask = var_612_end_mask_0, x = k_19_cast_fp16)[name = string("op_612_cast_fp16")];
+            tensor<int32, [4]> var_618_begin_0 = const()[name = string("op_618_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_618_end_0 = const()[name = string("op_618_end_0"), val = tensor<int32, [4]>([1, 32, 128, 4])];
+            tensor<bool, [4]> var_618_end_mask_0 = const()[name = string("op_618_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 4]> var_618_cast_fp16 = slice_by_index(begin = var_618_begin_0, end = var_618_end_0, end_mask = var_618_end_mask_0, x = k_19_cast_fp16)[name = string("op_618_cast_fp16")];
             fp16 const_34_promoted_to_fp16 = const()[name = string("const_34_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 1]> var_607_cast_fp16 = mul(x = var_605_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_607_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 4]> var_620_cast_fp16 = mul(x = var_618_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_620_cast_fp16")];
             bool rotated_interleave_0 = const()[name = string("rotated_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 1]> rotated_cast_fp16 = concat(axis = var_496, interleave = rotated_interleave_0, values = (var_607_cast_fp16, var_599_cast_fp16))[name = string("rotated_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_610_cast_fp16 = mul(x = k_19_cast_fp16, y = cos)[name = string("op_610_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> var_611_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_611_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 1]> roped_cast_fp16 = add(x = var_610_cast_fp16, y = var_611_cast_fp16)[name = string("roped_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> rotated_cast_fp16 = concat(axis = var_508, interleave = rotated_interleave_0, values = (var_620_cast_fp16, var_612_cast_fp16))[name = string("rotated_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_623_cast_fp16 = mul(x = k_19_cast_fp16, y = cos)[name = string("op_623_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> var_624_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_624_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 4]> roped_cast_fp16 = add(x = var_623_cast_fp16, y = var_624_cast_fp16)[name = string("roped_cast_fp16")];
+            tensor<int32, [4]> v_21_perm_0 = const()[name = string("v_21_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             bool k_interleave_0 = const()[name = string("k_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_cast_fp16 = concat(axis = var_484, interleave = k_interleave_0, values = (k_cache_2, roped_cast_fp16))[name = string("k_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> k_cast_fp16 = concat(axis = var_496, interleave = k_interleave_0, values = (k_cache_2, roped_cast_fp16))[name = string("k_cast_fp16")];
             bool v_interleave_0 = const()[name = string("v_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> v_cast_fp16 = concat(axis = var_484, interleave = v_interleave_0, values = (v_cache_2, v_15_cast_fp16))[name = string("v_cast_fp16")];
-            tensor<int32, [4]> var_618_begin_0 = const()[name = string("op_618_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_618_end_0 = const()[name = string("op_618_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_618_end_mask_0 = const()[name = string("op_618_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_2 = slice_by_index(begin = var_618_begin_0, end = var_618_end_0, end_mask = var_618_end_mask_0, x = k_cast_fp16)[name = string("op_618_cast_fp16")];
-            tensor<int32, [4]> var_619_begin_0 = const()[name = string("op_619_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_619_end_0 = const()[name = string("op_619_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_619_end_mask_0 = const()[name = string("op_619_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_2 = slice_by_index(begin = var_619_begin_0, end = var_619_end_0, end_mask = var_619_end_mask_0, x = v_cast_fp16)[name = string("op_619_cast_fp16")];
-            fp16 var_623_to_fp16 = const()[name = string("op_623_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 1]> var_624_cast_fp16 = mul(x = roped_9_cast_fp16, y = var_623_to_fp16)[name = string("op_624_cast_fp16")];
-            bool attn_weights_9_transpose_x_0 = const()[name = string("attn_weights_9_transpose_x_0"), val = bool(true)];
-            bool attn_weights_9_transpose_y_0 = const()[name = string("attn_weights_9_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_9_cast_fp16 = matmul(transpose_x = attn_weights_9_transpose_x_0, transpose_y = attn_weights_9_transpose_y_0, x = var_624_cast_fp16, y = k_cast_fp16)[name = string("attn_weights_9_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> attn_weights_cast_fp16 = add(x = attn_weights_9_cast_fp16, y = mask)[name = string("attn_weights_cast_fp16")];
-            tensor<fp16, [1, 32, 1, 512]> var_632_cast_fp16 = softmax(axis = var_492, x = attn_weights_cast_fp16)[name = string("op_632_cast_fp16")];
-            bool attn_9_transpose_x_0 = const()[name = string("attn_9_transpose_x_0"), val = bool(false)];
-            bool attn_9_transpose_y_0 = const()[name = string("attn_9_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 1]> attn_9_cast_fp16 = matmul(transpose_x = attn_9_transpose_x_0, transpose_y = attn_9_transpose_y_0, x = v_cast_fp16, y = var_632_cast_fp16)[name = string("attn_9_cast_fp16")];
-            tensor<int32, [4]> var_636 = const()[name = string("op_636"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
-            tensor<fp16, [1, 4096, 1, 1]> input_17_cast_fp16 = reshape(shape = var_636, x = attn_9_cast_fp16)[name = string("input_17_cast_fp16")];
-            tensor<int32, [2]> var_640 = const()[name = string("op_640"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_642 = const()[name = string("op_642"), val = tensor<int32, [2]>([1, 1])];
-            string var_644_pad_type_0 = const()[name = string("op_644_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_644_pad_0 = const()[name = string("op_644_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_644_cast_fp16 = conv(dilations = var_642, groups = var_493, pad = var_644_pad_0, pad_type = var_644_pad_type_0, strides = var_640, weight = blocks_2_attn_proj_weight_palettized_cast_fp16, x = input_17_cast_fp16)[name = string("op_644_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 128]> v_21_cast_fp16 = transpose(perm = v_21_perm_0, x = v_19_cast_fp16)[name = string("transpose_1")];
+            tensor<fp16, [1, 32, 512, 128]> v_cast_fp16 = concat(axis = var_508, interleave = v_interleave_0, values = (v_cache_2, v_21_cast_fp16))[name = string("v_cast_fp16")];
+            tensor<int32, [4]> var_635_begin_0 = const()[name = string("op_635_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_635_end_0 = const()[name = string("op_635_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_635_end_mask_0 = const()[name = string("op_635_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_2 = slice_by_index(begin = var_635_begin_0, end = var_635_end_0, end_mask = var_635_end_mask_0, x = k_cast_fp16)[name = string("op_635_cast_fp16")];
+            tensor<int32, [4]> var_636_begin_0 = const()[name = string("op_636_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_636_end_0 = const()[name = string("op_636_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_636_end_mask_0 = const()[name = string("op_636_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_2 = slice_by_index(begin = var_636_begin_0, end = var_636_end_0, end_mask = var_636_end_mask_0, x = v_cast_fp16)[name = string("op_636_cast_fp16")];
+            fp16 var_641_to_fp16 = const()[name = string("op_641_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 32, 128, 4]> var_642_cast_fp16 = mul(x = roped_9_cast_fp16, y = var_641_to_fp16)[name = string("op_642_cast_fp16")];
+            bool attn_weights_13_transpose_x_0 = const()[name = string("attn_weights_13_transpose_x_0"), val = bool(true)];
+            bool attn_weights_13_transpose_y_0 = const()[name = string("attn_weights_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_13_cast_fp16 = matmul(transpose_x = attn_weights_13_transpose_x_0, transpose_y = attn_weights_13_transpose_y_0, x = var_642_cast_fp16, y = k_cast_fp16)[name = string("attn_weights_13_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_15_cast_fp16 = add(x = attn_weights_13_cast_fp16, y = mask)[name = string("attn_weights_15_cast_fp16")];
+            tensor<fp16, [1, 32, 4, 512]> attn_weights_cast_fp16 = softmax(axis = var_504, x = attn_weights_15_cast_fp16)[name = string("attn_weights_cast_fp16")];
+            bool var_651_transpose_x_0 = const()[name = string("op_651_transpose_x_0"), val = bool(false)];
+            bool var_651_transpose_y_0 = const()[name = string("op_651_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 4, 128]> var_651_cast_fp16 = matmul(transpose_x = var_651_transpose_x_0, transpose_y = var_651_transpose_y_0, x = attn_weights_cast_fp16, y = v_cast_fp16)[name = string("op_651_cast_fp16")];
+            tensor<int32, [4]> attn_5_perm_0 = const()[name = string("attn_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_654 = const()[name = string("op_654"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 4]> attn_5_cast_fp16 = transpose(perm = attn_5_perm_0, x = var_651_cast_fp16)[name = string("transpose_0")];
+            tensor<fp16, [1, 4096, 1, 4]> input_17_cast_fp16 = reshape(shape = var_654, x = attn_5_cast_fp16)[name = string("input_17_cast_fp16")];
+            tensor<int32, [2]> var_658 = const()[name = string("op_658"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_660 = const()[name = string("op_660"), val = tensor<int32, [2]>([1, 1])];
+            string var_662_pad_type_0 = const()[name = string("op_662_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_662_pad_0 = const()[name = string("op_662_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_662_cast_fp16 = conv(dilations = var_660, groups = var_505, pad = var_662_pad_0, pad_type = var_662_pad_type_0, strides = var_658, weight = blocks_2_attn_proj_weight_palettized_cast_fp16, x = input_17_cast_fp16)[name = string("op_662_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303804864)))];
-            tensor<fp16, [1, 4096, 1, 1]> attention_output_cast_fp16 = mul(x = var_644_cast_fp16, y = blocks_2_attn_proj_output_scales_to_fp16)[name = string("attention_output_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_39_cast_fp16 = add(x = attention_output_cast_fp16, y = x_29_cast_fp16)[name = string("x_39_cast_fp16")];
-            tensor<int32, [1]> var_663_axes_0 = const()[name = string("op_663_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4096, 1]> var_663_cast_fp16 = squeeze(axes = var_663_axes_0, x = x_39_cast_fp16)[name = string("op_663_cast_fp16")];
-            bool var_665_interleave_0 = const()[name = string("op_665_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 1, 1]> eps_chan_to_fp16 = const()[name = string("eps_chan_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
-            tensor<fp16, [1, 4097, 1]> var_665_cast_fp16 = concat(axis = var_493, interleave = var_665_interleave_0, values = (var_663_cast_fp16, eps_chan_to_fp16))[name = string("op_665_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> attention_output_cast_fp16 = mul(x = var_662_cast_fp16, y = blocks_2_attn_proj_output_scales_to_fp16)[name = string("attention_output_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_39_cast_fp16 = add(x = attention_output_cast_fp16, y = x_29_cast_fp16)[name = string("x_39_cast_fp16")];
+            tensor<int32, [1]> var_681_axes_0 = const()[name = string("op_681_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 4]> var_681_cast_fp16 = squeeze(axes = var_681_axes_0, x = x_39_cast_fp16)[name = string("op_681_cast_fp16")];
+            bool var_683_interleave_0 = const()[name = string("op_683_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 4]> eps_chan_to_fp16 = const()[name = string("eps_chan_to_fp16"), val = tensor<fp16, [1, 1, 4]>([[[0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3, 0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 4]> var_683_cast_fp16 = concat(axis = var_505, interleave = var_683_interleave_0, values = (var_681_cast_fp16, eps_chan_to_fp16))[name = string("op_683_cast_fp16")];
             tensor<int32, [1]> x_eps_axes_0 = const()[name = string("x_eps_axes_0"), val = tensor<int32, [1]>([-2])];
-            tensor<fp16, [1, 4097, 1, 1]> x_eps_cast_fp16 = expand_dims(axes = x_eps_axes_0, x = var_665_cast_fp16)[name = string("x_eps_cast_fp16")];
+            tensor<fp16, [1, 4097, 1, 4]> x_eps_cast_fp16 = expand_dims(axes = x_eps_axes_0, x = var_683_cast_fp16)[name = string("x_eps_cast_fp16")];
             tensor<int32, [1]> norm_x_axes_0 = const()[name = string("norm_x_axes_0"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 1]> norm_x_cast_fp16 = reduce_l2_norm(axes = norm_x_axes_0, keep_dims = var_497, x = x_eps_cast_fp16)[name = string("norm_x_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_31_cast_fp16 = real_div(x = x_39_cast_fp16, y = norm_x_cast_fp16)[name = string("x_normed_31_cast_fp16")];
-            fp16 var_670_to_fp16 = const()[name = string("op_670_to_fp16"), val = fp16(0x1p+6)];
-            tensor<fp16, [1, 4096, 1, 1]> x_normed_33_cast_fp16 = mul(x = x_normed_31_cast_fp16, y = var_670_to_fp16)[name = string("x_normed_33_cast_fp16")];
+            tensor<fp16, [1, 1, 1, 4]> norm_x_cast_fp16 = reduce_l2_norm(axes = norm_x_axes_0, keep_dims = var_509, x = x_eps_cast_fp16)[name = string("norm_x_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_31_cast_fp16 = real_div(x = x_39_cast_fp16, y = norm_x_cast_fp16)[name = string("x_normed_31_cast_fp16")];
+            fp16 var_688_to_fp16 = const()[name = string("op_688_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 4]> x_normed_33_cast_fp16 = mul(x = x_normed_31_cast_fp16, y = var_688_to_fp16)[name = string("x_normed_33_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_norm_2_weight_to_fp16 = const()[name = string("blocks_2_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303813120)))];
-            tensor<fp16, [1, 4096, 1, 1]> input_19_cast_fp16 = mul(x = x_normed_33_cast_fp16, y = blocks_2_norm_2_weight_to_fp16)[name = string("input_19_cast_fp16")];
-            tensor<int32, [2]> var_682 = const()[name = string("op_682"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_684 = const()[name = string("op_684"), val = tensor<int32, [2]>([1, 1])];
-            string var_686_pad_type_0 = const()[name = string("op_686_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_686_pad_0 = const()[name = string("op_686_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_686_cast_fp16 = conv(dilations = var_684, groups = var_493, pad = var_686_pad_0, pad_type = var_686_pad_type_0, strides = var_682, weight = blocks_2_mlp_fc_1_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_686_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303821376)))];
-            tensor<fp16, [1, 11008, 1, 1]> input_21_cast_fp16 = mul(x = var_686_cast_fp16, y = blocks_2_mlp_fc_1_output_scales_to_fp16)[name = string("input_21_cast_fp16")];
-            tensor<int32, [2]> var_690 = const()[name = string("op_690"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_692 = const()[name = string("op_692"), val = tensor<int32, [2]>([1, 1])];
-            string var_694_pad_type_0 = const()[name = string("op_694_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_694_pad_0 = const()[name = string("op_694_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 1]> var_694_cast_fp16 = conv(dilations = var_692, groups = var_493, pad = var_694_pad_0, pad_type = var_694_pad_type_0, strides = var_690, weight = blocks_2_mlp_fc_2_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_694_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303843456)))];
-            tensor<fp16, [1, 11008, 1, 1]> x_fc_2_cast_fp16 = mul(x = var_694_cast_fp16, y = blocks_2_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> var_696_cast_fp16 = silu(x = input_21_cast_fp16)[name = string("op_696_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> input_cast_fp16 = mul(x = var_696_cast_fp16, y = x_fc_2_cast_fp16)[name = string("input_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> input_19_cast_fp16 = mul(x = x_normed_33_cast_fp16, y = blocks_2_norm_2_weight_to_fp16)[name = string("input_19_cast_fp16")];
             tensor<int32, [2]> var_700 = const()[name = string("op_700"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_702 = const()[name = string("op_702"), val = tensor<int32, [2]>([1, 1])];
             string var_704_pad_type_0 = const()[name = string("op_704_pad_type_0"), val = string("custom")];
             tensor<int32, [4]> var_704_pad_0 = const()[name = string("op_704_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 1]> var_704_cast_fp16 = conv(dilations = var_702, groups = var_493, pad = var_704_pad_0, pad_type = var_704_pad_type_0, strides = var_700, weight = blocks_2_mlp_proj_weight_palettized_cast_fp16, x = input_cast_fp16)[name = string("op_704_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> var_704_cast_fp16 = conv(dilations = var_702, groups = var_505, pad = var_704_pad_0, pad_type = var_704_pad_type_0, strides = var_700, weight = blocks_2_mlp_fc_1_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_704_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303821376)))];
+            tensor<fp16, [1, 11008, 1, 4]> input_21_cast_fp16 = mul(x = var_704_cast_fp16, y = blocks_2_mlp_fc_1_output_scales_to_fp16)[name = string("input_21_cast_fp16")];
+            tensor<int32, [2]> var_708 = const()[name = string("op_708"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_710 = const()[name = string("op_710"), val = tensor<int32, [2]>([1, 1])];
+            string var_712_pad_type_0 = const()[name = string("op_712_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_712_pad_0 = const()[name = string("op_712_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 4]> var_712_cast_fp16 = conv(dilations = var_710, groups = var_505, pad = var_712_pad_0, pad_type = var_712_pad_type_0, strides = var_708, weight = blocks_2_mlp_fc_2_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_712_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303843456)))];
+            tensor<fp16, [1, 11008, 1, 4]> x_fc_2_cast_fp16 = mul(x = var_712_cast_fp16, y = blocks_2_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> var_714_cast_fp16 = silu(x = input_21_cast_fp16)[name = string("op_714_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 4]> input_cast_fp16 = mul(x = var_714_cast_fp16, y = x_fc_2_cast_fp16)[name = string("input_cast_fp16")];
+            tensor<int32, [2]> var_718 = const()[name = string("op_718"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_720 = const()[name = string("op_720"), val = tensor<int32, [2]>([1, 1])];
+            string var_722_pad_type_0 = const()[name = string("op_722_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_722_pad_0 = const()[name = string("op_722_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 4]> var_722_cast_fp16 = conv(dilations = var_720, groups = var_505, pad = var_722_pad_0, pad_type = var_722_pad_type_0, strides = var_718, weight = blocks_2_mlp_proj_weight_palettized_cast_fp16, x = input_cast_fp16)[name = string("op_722_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303865536)))];
-            tensor<fp16, [1, 4096, 1, 1]> var_705_cast_fp16 = mul(x = var_704_cast_fp16, y = blocks_2_mlp_proj_output_scales_to_fp16)[name = string("op_705_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> new_x = add(x = var_705_cast_fp16, y = x_39_cast_fp16)[name = string("op_706_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> var_723_cast_fp16 = mul(x = var_722_cast_fp16, y = blocks_2_mlp_proj_output_scales_to_fp16)[name = string("op_723_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 4]> new_x = add(x = var_723_cast_fp16, y = x_39_cast_fp16)[name = string("op_724_cast_fp16")];
         } -> (new_x, new_k_cache_0, new_k_cache_1, new_k_cache_2, new_v_cache_0, new_v_cache_1, new_v_cache_2);
     func input_512_context_512<ios18>(tensor<fp16, [128, 512]> cos, tensor<fp16, [1, 1, 512, 512]> mask, tensor<fp16, [128, 512]> sin, tensor<fp16, [1, 4096, 1, 512]> x) {
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_q_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8388736))))[name = string("blocks_0_attn_q_proj_weight_palettized_cast_fp16")];
@@ -502,86 +514,86 @@ program(1.3)
             tensor<fp16, [1, 4096, 1, 512]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_54_to_fp16)[name = string("x_normed_3_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_1_weight_to_fp16 = const()[name = string("blocks_0_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303567936)))];
             tensor<fp16, [1, 4096, 1, 512]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = string("x_5_cast_fp16")];
-            tensor<int32, [2]> var_66 = const()[name = string("op_66"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_68 = const()[name = string("op_68"), val = tensor<int32, [2]>([1, 1])];
-            string var_70_pad_type_0 = const()[name = string("op_70_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_70_pad_0 = const()[name = string("op_70_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_70_cast_fp16 = conv(dilations = var_68, groups = var_25, pad = var_70_pad_0, pad_type = var_70_pad_type_0, strides = var_66, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_70_cast_fp16")];
+            tensor<int32, [2]> var_67 = const()[name = string("op_67"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_69 = const()[name = string("op_69"), val = tensor<int32, [2]>([1, 1])];
+            string var_71_pad_type_0 = const()[name = string("op_71_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_71_pad_0 = const()[name = string("op_71_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_71_cast_fp16 = conv(dilations = var_69, groups = var_25, pad = var_71_pad_0, pad_type = var_71_pad_type_0, strides = var_67, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_71_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303576192)))];
-            tensor<fp16, [1, 4096, 1, 512]> q_1_cast_fp16 = mul(x = var_70_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
-            tensor<int32, [2]> var_74 = const()[name = string("op_74"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_76 = const()[name = string("op_76"), val = tensor<int32, [2]>([1, 1])];
-            string var_78_pad_type_0 = const()[name = string("op_78_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_78_pad_0 = const()[name = string("op_78_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_78_cast_fp16 = conv(dilations = var_76, groups = var_25, pad = var_78_pad_0, pad_type = var_78_pad_type_0, strides = var_74, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_78_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> q_1_cast_fp16 = mul(x = var_71_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
+            tensor<int32, [2]> var_75 = const()[name = string("op_75"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_77 = const()[name = string("op_77"), val = tensor<int32, [2]>([1, 1])];
+            string var_79_pad_type_0 = const()[name = string("op_79_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_79_pad_0 = const()[name = string("op_79_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_79_cast_fp16 = conv(dilations = var_77, groups = var_25, pad = var_79_pad_0, pad_type = var_79_pad_type_0, strides = var_75, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_79_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303584448)))];
-            tensor<fp16, [1, 4096, 1, 512]> k_1_cast_fp16 = mul(x = var_78_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
-            tensor<int32, [2]> var_82 = const()[name = string("op_82"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_84 = const()[name = string("op_84"), val = tensor<int32, [2]>([1, 1])];
-            string var_86_pad_type_0 = const()[name = string("op_86_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_86_pad_0 = const()[name = string("op_86_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_86_cast_fp16 = conv(dilations = var_84, groups = var_25, pad = var_86_pad_0, pad_type = var_86_pad_type_0, strides = var_82, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_86_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> k_1_cast_fp16 = mul(x = var_79_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
+            tensor<int32, [2]> var_83 = const()[name = string("op_83"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_85 = const()[name = string("op_85"), val = tensor<int32, [2]>([1, 1])];
+            string var_87_pad_type_0 = const()[name = string("op_87_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_87_pad_0 = const()[name = string("op_87_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_87_cast_fp16 = conv(dilations = var_85, groups = var_25, pad = var_87_pad_0, pad_type = var_87_pad_type_0, strides = var_83, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_87_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303592704)))];
-            tensor<fp16, [1, 4096, 1, 512]> v_1_cast_fp16 = mul(x = var_86_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
-            tensor<int32, [4]> var_88 = const()[name = string("op_88"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> q_3_cast_fp16 = reshape(shape = var_88, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
-            tensor<int32, [4]> var_90 = const()[name = string("op_90"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> k_3_cast_fp16 = reshape(shape = var_90, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
-            tensor<int32, [4]> var_92 = const()[name = string("op_92"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> v_3_cast_fp16 = reshape(shape = var_92, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
-            tensor<int32, [4]> var_104_begin_0 = const()[name = string("op_104_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_104_end_0 = const()[name = string("op_104_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_104_end_mask_0 = const()[name = string("op_104_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_104_cast_fp16 = slice_by_index(begin = var_104_begin_0, end = var_104_end_0, end_mask = var_104_end_mask_0, x = q_3_cast_fp16)[name = string("op_104_cast_fp16")];
-            tensor<int32, [4]> var_110_begin_0 = const()[name = string("op_110_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_110_end_0 = const()[name = string("op_110_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_110_end_mask_0 = const()[name = string("op_110_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_110_cast_fp16 = slice_by_index(begin = var_110_begin_0, end = var_110_end_0, end_mask = var_110_end_mask_0, x = q_3_cast_fp16)[name = string("op_110_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> v_1_cast_fp16 = mul(x = var_87_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
+            tensor<int32, [4]> var_89 = const()[name = string("op_89"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> q_3_cast_fp16 = reshape(shape = var_89, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
+            tensor<int32, [4]> var_91 = const()[name = string("op_91"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> k_3_cast_fp16 = reshape(shape = var_91, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
+            tensor<int32, [4]> var_93 = const()[name = string("op_93"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> v_3_cast_fp16 = reshape(shape = var_93, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
+            tensor<int32, [4]> var_105_begin_0 = const()[name = string("op_105_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_105_end_0 = const()[name = string("op_105_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_105_end_mask_0 = const()[name = string("op_105_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_105_cast_fp16 = slice_by_index(begin = var_105_begin_0, end = var_105_end_0, end_mask = var_105_end_mask_0, x = q_3_cast_fp16)[name = string("op_105_cast_fp16")];
+            tensor<int32, [4]> var_111_begin_0 = const()[name = string("op_111_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_111_end_0 = const()[name = string("op_111_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_111_end_mask_0 = const()[name = string("op_111_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_111_cast_fp16 = slice_by_index(begin = var_111_begin_0, end = var_111_end_0, end_mask = var_111_end_mask_0, x = q_3_cast_fp16)[name = string("op_111_cast_fp16")];
             fp16 const_6_promoted_to_fp16 = const()[name = string("const_6_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_112_cast_fp16 = mul(x = var_110_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_112_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_113_cast_fp16 = mul(x = var_111_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_113_cast_fp16")];
             bool rotated_1_interleave_0 = const()[name = string("rotated_1_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_1_cast_fp16 = concat(axis = var_28, interleave = rotated_1_interleave_0, values = (var_112_cast_fp16, var_104_cast_fp16))[name = string("rotated_1_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_115_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_115_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_116_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_116_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_1_cast_fp16 = add(x = var_115_cast_fp16, y = var_116_cast_fp16)[name = string("roped_1_cast_fp16")];
-            tensor<int32, [4]> var_129_begin_0 = const()[name = string("op_129_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_129_end_0 = const()[name = string("op_129_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_129_end_mask_0 = const()[name = string("op_129_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_129_cast_fp16 = slice_by_index(begin = var_129_begin_0, end = var_129_end_0, end_mask = var_129_end_mask_0, x = k_3_cast_fp16)[name = string("op_129_cast_fp16")];
-            tensor<int32, [4]> var_135_begin_0 = const()[name = string("op_135_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_135_end_0 = const()[name = string("op_135_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_135_end_mask_0 = const()[name = string("op_135_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_135_cast_fp16 = slice_by_index(begin = var_135_begin_0, end = var_135_end_0, end_mask = var_135_end_mask_0, x = k_3_cast_fp16)[name = string("op_135_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_1_cast_fp16 = concat(axis = var_28, interleave = rotated_1_interleave_0, values = (var_113_cast_fp16, var_105_cast_fp16))[name = string("rotated_1_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_116_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_116_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_117_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_117_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_1_cast_fp16 = add(x = var_116_cast_fp16, y = var_117_cast_fp16)[name = string("roped_1_cast_fp16")];
+            tensor<int32, [4]> var_130_begin_0 = const()[name = string("op_130_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_130_end_0 = const()[name = string("op_130_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_130_end_mask_0 = const()[name = string("op_130_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_130_cast_fp16 = slice_by_index(begin = var_130_begin_0, end = var_130_end_0, end_mask = var_130_end_mask_0, x = k_3_cast_fp16)[name = string("op_130_cast_fp16")];
+            tensor<int32, [4]> var_136_begin_0 = const()[name = string("op_136_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_136_end_0 = const()[name = string("op_136_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_136_end_mask_0 = const()[name = string("op_136_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_136_cast_fp16 = slice_by_index(begin = var_136_begin_0, end = var_136_end_0, end_mask = var_136_end_mask_0, x = k_3_cast_fp16)[name = string("op_136_cast_fp16")];
             fp16 const_8_promoted_to_fp16 = const()[name = string("const_8_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_137_cast_fp16 = mul(x = var_135_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_137_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_138_cast_fp16 = mul(x = var_136_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_138_cast_fp16")];
             bool rotated_3_interleave_0 = const()[name = string("rotated_3_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_3_cast_fp16 = concat(axis = var_28, interleave = rotated_3_interleave_0, values = (var_137_cast_fp16, var_129_cast_fp16))[name = string("rotated_3_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_140_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_140_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_141_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_141_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_3_cast_fp16 = add(x = var_140_cast_fp16, y = var_141_cast_fp16)[name = string("roped_3_cast_fp16")];
-            bool q_5_interleave_0 = const()[name = string("q_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> q_5_cast_fp16 = concat(axis = var_28, interleave = q_5_interleave_0, values = roped_1_cast_fp16)[name = string("q_5_cast_fp16")];
-            bool k_5_interleave_0 = const()[name = string("k_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_5_cast_fp16 = concat(axis = var_28, interleave = k_5_interleave_0, values = roped_3_cast_fp16)[name = string("k_5_cast_fp16")];
-            tensor<int32, [4]> var_156_begin_0 = const()[name = string("op_156_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_156_end_0 = const()[name = string("op_156_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_156_end_mask_0 = const()[name = string("op_156_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_0 = slice_by_index(begin = var_156_begin_0, end = var_156_end_0, end_mask = var_156_end_mask_0, x = k_5_cast_fp16)[name = string("op_156_cast_fp16")];
-            tensor<int32, [4]> var_157_begin_0 = const()[name = string("op_157_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_157_end_0 = const()[name = string("op_157_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_157_end_mask_0 = const()[name = string("op_157_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_0 = slice_by_index(begin = var_157_begin_0, end = var_157_end_0, end_mask = var_157_end_mask_0, x = v_3_cast_fp16)[name = string("op_157_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_3_cast_fp16 = concat(axis = var_28, interleave = rotated_3_interleave_0, values = (var_138_cast_fp16, var_130_cast_fp16))[name = string("rotated_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_141_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_141_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_142_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_142_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_3_cast_fp16 = add(x = var_141_cast_fp16, y = var_142_cast_fp16)[name = string("roped_3_cast_fp16")];
+            tensor<int32, [4]> v_5_perm_0 = const()[name = string("v_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_146_begin_0 = const()[name = string("op_146_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_146_end_0 = const()[name = string("op_146_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_146_end_mask_0 = const()[name = string("op_146_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_0 = slice_by_index(begin = var_146_begin_0, end = var_146_end_0, end_mask = var_146_end_mask_0, x = roped_3_cast_fp16)[name = string("op_146_cast_fp16")];
+            tensor<int32, [4]> var_147_begin_0 = const()[name = string("op_147_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_147_end_0 = const()[name = string("op_147_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_147_end_mask_0 = const()[name = string("op_147_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 512, 128]> v_5_cast_fp16 = transpose(perm = v_5_perm_0, x = v_3_cast_fp16)[name = string("transpose_5")];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_0 = slice_by_index(begin = var_147_begin_0, end = var_147_end_0, end_mask = var_147_end_mask_0, x = v_5_cast_fp16)[name = string("op_147_cast_fp16")];
             fp16 var_161_to_fp16 = const()[name = string("op_161_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 512]> var_162_cast_fp16 = mul(x = q_5_cast_fp16, y = var_161_to_fp16)[name = string("op_162_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_162_cast_fp16 = mul(x = roped_1_cast_fp16, y = var_161_to_fp16)[name = string("op_162_cast_fp16")];
             bool attn_weights_1_transpose_x_0 = const()[name = string("attn_weights_1_transpose_x_0"), val = bool(true)];
             bool attn_weights_1_transpose_y_0 = const()[name = string("attn_weights_1_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_162_cast_fp16, y = k_5_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_162_cast_fp16, y = roped_3_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
             tensor<fp16, [1, 32, 512, 512]> attn_weights_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = mask)[name = string("attn_weights_3_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> var_170_cast_fp16 = softmax(axis = var_24, x = attn_weights_3_cast_fp16)[name = string("op_170_cast_fp16")];
-            bool attn_1_transpose_x_0 = const()[name = string("attn_1_transpose_x_0"), val = bool(false)];
-            bool attn_1_transpose_y_0 = const()[name = string("attn_1_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 512]> attn_1_cast_fp16 = matmul(transpose_x = attn_1_transpose_x_0, transpose_y = attn_1_transpose_y_0, x = v_3_cast_fp16, y = var_170_cast_fp16)[name = string("attn_1_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_5_cast_fp16 = softmax(axis = var_24, x = attn_weights_3_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
+            bool var_171_transpose_x_1 = const()[name = string("op_171_transpose_x_1"), val = bool(false)];
+            bool var_171_transpose_y_1 = const()[name = string("op_171_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 32, 512, 128]> var_171_cast_fp16 = matmul(transpose_x = var_171_transpose_x_1, transpose_y = var_171_transpose_y_1, x = attn_weights_5_cast_fp16, y = v_3_cast_fp16)[name = string("op_171_cast_fp16")];
+            tensor<int32, [4]> attn_1_perm_0 = const()[name = string("attn_1_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             tensor<int32, [4]> var_174 = const()[name = string("op_174"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 512]> attn_1_cast_fp16 = transpose(perm = attn_1_perm_0, x = var_171_cast_fp16)[name = string("transpose_4")];
             tensor<fp16, [1, 4096, 1, 512]> input_1_cast_fp16 = reshape(shape = var_174, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
             tensor<int32, [2]> var_178 = const()[name = string("op_178"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_180 = const()[name = string("op_180"), val = tensor<int32, [2]>([1, 1])];
@@ -645,86 +657,86 @@ program(1.3)
             tensor<fp16, [1, 4096, 1, 512]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_291_to_fp16)[name = string("x_normed_15_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_1_weight_to_fp16 = const()[name = string("blocks_1_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303669888)))];
             tensor<fp16, [1, 4096, 1, 512]> x_19_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = string("x_19_cast_fp16")];
-            tensor<int32, [2]> var_306 = const()[name = string("op_306"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_308 = const()[name = string("op_308"), val = tensor<int32, [2]>([1, 1])];
-            string var_310_pad_type_0 = const()[name = string("op_310_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_310_pad_0 = const()[name = string("op_310_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_310_cast_fp16 = conv(dilations = var_308, groups = var_263, pad = var_310_pad_0, pad_type = var_310_pad_type_0, strides = var_306, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_310_cast_fp16")];
+            tensor<int32, [2]> var_307 = const()[name = string("op_307"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_309 = const()[name = string("op_309"), val = tensor<int32, [2]>([1, 1])];
+            string var_311_pad_type_0 = const()[name = string("op_311_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_311_pad_0 = const()[name = string("op_311_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_311_cast_fp16 = conv(dilations = var_309, groups = var_263, pad = var_311_pad_0, pad_type = var_311_pad_type_0, strides = var_307, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_311_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303678144)))];
-            tensor<fp16, [1, 4096, 1, 512]> q_7_cast_fp16 = mul(x = var_310_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
-            tensor<int32, [2]> var_314 = const()[name = string("op_314"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_316 = const()[name = string("op_316"), val = tensor<int32, [2]>([1, 1])];
-            string var_318_pad_type_0 = const()[name = string("op_318_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_318_pad_0 = const()[name = string("op_318_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_318_cast_fp16 = conv(dilations = var_316, groups = var_263, pad = var_318_pad_0, pad_type = var_318_pad_type_0, strides = var_314, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_318_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> q_7_cast_fp16 = mul(x = var_311_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
+            tensor<int32, [2]> var_315 = const()[name = string("op_315"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_317 = const()[name = string("op_317"), val = tensor<int32, [2]>([1, 1])];
+            string var_319_pad_type_0 = const()[name = string("op_319_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_319_pad_0 = const()[name = string("op_319_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_319_cast_fp16 = conv(dilations = var_317, groups = var_263, pad = var_319_pad_0, pad_type = var_319_pad_type_0, strides = var_315, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_319_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303686400)))];
-            tensor<fp16, [1, 4096, 1, 512]> k_7_cast_fp16 = mul(x = var_318_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_7_cast_fp16")];
-            tensor<int32, [2]> var_322 = const()[name = string("op_322"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_324 = const()[name = string("op_324"), val = tensor<int32, [2]>([1, 1])];
-            string var_326_pad_type_0 = const()[name = string("op_326_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_326_pad_0 = const()[name = string("op_326_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_326_cast_fp16 = conv(dilations = var_324, groups = var_263, pad = var_326_pad_0, pad_type = var_326_pad_type_0, strides = var_322, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_326_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> k_7_cast_fp16 = mul(x = var_319_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_7_cast_fp16")];
+            tensor<int32, [2]> var_323 = const()[name = string("op_323"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_325 = const()[name = string("op_325"), val = tensor<int32, [2]>([1, 1])];
+            string var_327_pad_type_0 = const()[name = string("op_327_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_327_pad_0 = const()[name = string("op_327_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_327_cast_fp16 = conv(dilations = var_325, groups = var_263, pad = var_327_pad_0, pad_type = var_327_pad_type_0, strides = var_323, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_327_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303694656)))];
-            tensor<fp16, [1, 4096, 1, 512]> v_5_cast_fp16 = mul(x = var_326_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_5_cast_fp16")];
-            tensor<int32, [4]> var_328 = const()[name = string("op_328"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> q_9_cast_fp16 = reshape(shape = var_328, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
-            tensor<int32, [4]> var_330 = const()[name = string("op_330"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> k_9_cast_fp16 = reshape(shape = var_330, x = k_7_cast_fp16)[name = string("k_9_cast_fp16")];
-            tensor<int32, [4]> var_332 = const()[name = string("op_332"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> v_7_cast_fp16 = reshape(shape = var_332, x = v_5_cast_fp16)[name = string("v_7_cast_fp16")];
-            tensor<int32, [4]> var_344_begin_0 = const()[name = string("op_344_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_344_end_0 = const()[name = string("op_344_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_344_end_mask_0 = const()[name = string("op_344_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_344_cast_fp16 = slice_by_index(begin = var_344_begin_0, end = var_344_end_0, end_mask = var_344_end_mask_0, x = q_9_cast_fp16)[name = string("op_344_cast_fp16")];
-            tensor<int32, [4]> var_350_begin_0 = const()[name = string("op_350_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_350_end_0 = const()[name = string("op_350_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_350_end_mask_0 = const()[name = string("op_350_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_350_cast_fp16 = slice_by_index(begin = var_350_begin_0, end = var_350_end_0, end_mask = var_350_end_mask_0, x = q_9_cast_fp16)[name = string("op_350_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> v_7_cast_fp16 = mul(x = var_327_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_7_cast_fp16")];
+            tensor<int32, [4]> var_329 = const()[name = string("op_329"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> q_9_cast_fp16 = reshape(shape = var_329, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
+            tensor<int32, [4]> var_331 = const()[name = string("op_331"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> k_9_cast_fp16 = reshape(shape = var_331, x = k_7_cast_fp16)[name = string("k_9_cast_fp16")];
+            tensor<int32, [4]> var_333 = const()[name = string("op_333"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> v_9_cast_fp16 = reshape(shape = var_333, x = v_7_cast_fp16)[name = string("v_9_cast_fp16")];
+            tensor<int32, [4]> var_345_begin_0 = const()[name = string("op_345_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_345_end_0 = const()[name = string("op_345_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_345_end_mask_0 = const()[name = string("op_345_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_345_cast_fp16 = slice_by_index(begin = var_345_begin_0, end = var_345_end_0, end_mask = var_345_end_mask_0, x = q_9_cast_fp16)[name = string("op_345_cast_fp16")];
+            tensor<int32, [4]> var_351_begin_0 = const()[name = string("op_351_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_351_end_0 = const()[name = string("op_351_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_351_end_mask_0 = const()[name = string("op_351_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_351_cast_fp16 = slice_by_index(begin = var_351_begin_0, end = var_351_end_0, end_mask = var_351_end_mask_0, x = q_9_cast_fp16)[name = string("op_351_cast_fp16")];
             fp16 const_19_promoted_to_fp16 = const()[name = string("const_19_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_352_cast_fp16 = mul(x = var_350_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_352_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_353_cast_fp16 = mul(x = var_351_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_353_cast_fp16")];
             bool rotated_5_interleave_0 = const()[name = string("rotated_5_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_5_cast_fp16 = concat(axis = var_266, interleave = rotated_5_interleave_0, values = (var_352_cast_fp16, var_344_cast_fp16))[name = string("rotated_5_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_355_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_355_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_356_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_356_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_5_cast_fp16 = add(x = var_355_cast_fp16, y = var_356_cast_fp16)[name = string("roped_5_cast_fp16")];
-            tensor<int32, [4]> var_369_begin_0 = const()[name = string("op_369_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_369_end_0 = const()[name = string("op_369_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_369_end_mask_0 = const()[name = string("op_369_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_369_cast_fp16 = slice_by_index(begin = var_369_begin_0, end = var_369_end_0, end_mask = var_369_end_mask_0, x = k_9_cast_fp16)[name = string("op_369_cast_fp16")];
-            tensor<int32, [4]> var_375_begin_0 = const()[name = string("op_375_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_375_end_0 = const()[name = string("op_375_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_375_end_mask_0 = const()[name = string("op_375_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_375_cast_fp16 = slice_by_index(begin = var_375_begin_0, end = var_375_end_0, end_mask = var_375_end_mask_0, x = k_9_cast_fp16)[name = string("op_375_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_5_cast_fp16 = concat(axis = var_266, interleave = rotated_5_interleave_0, values = (var_353_cast_fp16, var_345_cast_fp16))[name = string("rotated_5_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_356_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_356_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_357_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_357_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_5_cast_fp16 = add(x = var_356_cast_fp16, y = var_357_cast_fp16)[name = string("roped_5_cast_fp16")];
+            tensor<int32, [4]> var_370_begin_0 = const()[name = string("op_370_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_370_end_0 = const()[name = string("op_370_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_370_end_mask_0 = const()[name = string("op_370_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_370_cast_fp16 = slice_by_index(begin = var_370_begin_0, end = var_370_end_0, end_mask = var_370_end_mask_0, x = k_9_cast_fp16)[name = string("op_370_cast_fp16")];
+            tensor<int32, [4]> var_376_begin_0 = const()[name = string("op_376_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_376_end_0 = const()[name = string("op_376_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_376_end_mask_0 = const()[name = string("op_376_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_376_cast_fp16 = slice_by_index(begin = var_376_begin_0, end = var_376_end_0, end_mask = var_376_end_mask_0, x = k_9_cast_fp16)[name = string("op_376_cast_fp16")];
             fp16 const_21_promoted_to_fp16 = const()[name = string("const_21_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_377_cast_fp16 = mul(x = var_375_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_377_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_378_cast_fp16 = mul(x = var_376_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_378_cast_fp16")];
             bool rotated_7_interleave_0 = const()[name = string("rotated_7_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_7_cast_fp16 = concat(axis = var_266, interleave = rotated_7_interleave_0, values = (var_377_cast_fp16, var_369_cast_fp16))[name = string("rotated_7_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_380_cast_fp16 = mul(x = k_9_cast_fp16, y = cos)[name = string("op_380_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_381_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = string("op_381_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_7_cast_fp16 = add(x = var_380_cast_fp16, y = var_381_cast_fp16)[name = string("roped_7_cast_fp16")];
-            bool q_11_interleave_0 = const()[name = string("q_11_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> q_11_cast_fp16 = concat(axis = var_266, interleave = q_11_interleave_0, values = roped_5_cast_fp16)[name = string("q_11_cast_fp16")];
-            bool k_11_interleave_0 = const()[name = string("k_11_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_11_cast_fp16 = concat(axis = var_266, interleave = k_11_interleave_0, values = roped_7_cast_fp16)[name = string("k_11_cast_fp16")];
-            tensor<int32, [4]> var_396_begin_0 = const()[name = string("op_396_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_396_end_0 = const()[name = string("op_396_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_396_end_mask_0 = const()[name = string("op_396_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_1 = slice_by_index(begin = var_396_begin_0, end = var_396_end_0, end_mask = var_396_end_mask_0, x = k_11_cast_fp16)[name = string("op_396_cast_fp16")];
-            tensor<int32, [4]> var_397_begin_0 = const()[name = string("op_397_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_397_end_0 = const()[name = string("op_397_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_397_end_mask_0 = const()[name = string("op_397_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_1 = slice_by_index(begin = var_397_begin_0, end = var_397_end_0, end_mask = var_397_end_mask_0, x = v_7_cast_fp16)[name = string("op_397_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_7_cast_fp16 = concat(axis = var_266, interleave = rotated_7_interleave_0, values = (var_378_cast_fp16, var_370_cast_fp16))[name = string("rotated_7_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_381_cast_fp16 = mul(x = k_9_cast_fp16, y = cos)[name = string("op_381_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_382_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = string("op_382_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_7_cast_fp16 = add(x = var_381_cast_fp16, y = var_382_cast_fp16)[name = string("roped_7_cast_fp16")];
+            tensor<int32, [4]> v_11_perm_0 = const()[name = string("v_11_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_386_begin_0 = const()[name = string("op_386_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_386_end_0 = const()[name = string("op_386_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_386_end_mask_0 = const()[name = string("op_386_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_1 = slice_by_index(begin = var_386_begin_0, end = var_386_end_0, end_mask = var_386_end_mask_0, x = roped_7_cast_fp16)[name = string("op_386_cast_fp16")];
+            tensor<int32, [4]> var_387_begin_0 = const()[name = string("op_387_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_387_end_0 = const()[name = string("op_387_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_387_end_mask_0 = const()[name = string("op_387_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 512, 128]> v_11_cast_fp16 = transpose(perm = v_11_perm_0, x = v_9_cast_fp16)[name = string("transpose_3")];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_1 = slice_by_index(begin = var_387_begin_0, end = var_387_end_0, end_mask = var_387_end_mask_0, x = v_11_cast_fp16)[name = string("op_387_cast_fp16")];
             fp16 var_401_to_fp16 = const()[name = string("op_401_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 512]> var_402_cast_fp16 = mul(x = q_11_cast_fp16, y = var_401_to_fp16)[name = string("op_402_cast_fp16")];
-            bool attn_weights_5_transpose_x_0 = const()[name = string("attn_weights_5_transpose_x_0"), val = bool(true)];
-            bool attn_weights_5_transpose_y_0 = const()[name = string("attn_weights_5_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_5_cast_fp16 = matmul(transpose_x = attn_weights_5_transpose_x_0, transpose_y = attn_weights_5_transpose_y_0, x = var_402_cast_fp16, y = k_11_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_7_cast_fp16 = add(x = attn_weights_5_cast_fp16, y = mask)[name = string("attn_weights_7_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> var_410_cast_fp16 = softmax(axis = var_262, x = attn_weights_7_cast_fp16)[name = string("op_410_cast_fp16")];
-            bool attn_3_transpose_x_0 = const()[name = string("attn_3_transpose_x_0"), val = bool(false)];
-            bool attn_3_transpose_y_0 = const()[name = string("attn_3_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 512]> attn_3_cast_fp16 = matmul(transpose_x = attn_3_transpose_x_0, transpose_y = attn_3_transpose_y_0, x = v_7_cast_fp16, y = var_410_cast_fp16)[name = string("attn_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_402_cast_fp16 = mul(x = roped_5_cast_fp16, y = var_401_to_fp16)[name = string("op_402_cast_fp16")];
+            bool attn_weights_7_transpose_x_0 = const()[name = string("attn_weights_7_transpose_x_0"), val = bool(true)];
+            bool attn_weights_7_transpose_y_0 = const()[name = string("attn_weights_7_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_7_cast_fp16 = matmul(transpose_x = attn_weights_7_transpose_x_0, transpose_y = attn_weights_7_transpose_y_0, x = var_402_cast_fp16, y = roped_7_cast_fp16)[name = string("attn_weights_7_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_9_cast_fp16 = add(x = attn_weights_7_cast_fp16, y = mask)[name = string("attn_weights_9_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_11_cast_fp16 = softmax(axis = var_262, x = attn_weights_9_cast_fp16)[name = string("attn_weights_11_cast_fp16")];
+            bool var_411_transpose_x_1 = const()[name = string("op_411_transpose_x_1"), val = bool(false)];
+            bool var_411_transpose_y_1 = const()[name = string("op_411_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 32, 512, 128]> var_411_cast_fp16 = matmul(transpose_x = var_411_transpose_x_1, transpose_y = var_411_transpose_y_1, x = attn_weights_11_cast_fp16, y = v_9_cast_fp16)[name = string("op_411_cast_fp16")];
+            tensor<int32, [4]> attn_3_perm_0 = const()[name = string("attn_3_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             tensor<int32, [4]> var_414 = const()[name = string("op_414"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 512]> attn_3_cast_fp16 = transpose(perm = attn_3_perm_0, x = var_411_cast_fp16)[name = string("transpose_2")];
             tensor<fp16, [1, 4096, 1, 512]> input_9_cast_fp16 = reshape(shape = var_414, x = attn_3_cast_fp16)[name = string("input_9_cast_fp16")];
             tensor<int32, [2]> var_418 = const()[name = string("op_418"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_420 = const()[name = string("op_420"), val = tensor<int32, [2]>([1, 1])];
@@ -788,86 +800,86 @@ program(1.3)
             tensor<fp16, [1, 4096, 1, 512]> x_normed_27_cast_fp16 = mul(x = x_normed_25_cast_fp16, y = var_531_to_fp16)[name = string("x_normed_27_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_norm_1_weight_to_fp16 = const()[name = string("blocks_2_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303771840)))];
             tensor<fp16, [1, 4096, 1, 512]> x_33_cast_fp16 = mul(x = x_normed_27_cast_fp16, y = blocks_2_norm_1_weight_to_fp16)[name = string("x_33_cast_fp16")];
-            tensor<int32, [2]> var_546 = const()[name = string("op_546"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_548 = const()[name = string("op_548"), val = tensor<int32, [2]>([1, 1])];
-            string var_550_pad_type_0 = const()[name = string("op_550_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_550_pad_0 = const()[name = string("op_550_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_550_cast_fp16 = conv(dilations = var_548, groups = var_503, pad = var_550_pad_0, pad_type = var_550_pad_type_0, strides = var_546, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_550_cast_fp16")];
+            tensor<int32, [2]> var_547 = const()[name = string("op_547"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_549 = const()[name = string("op_549"), val = tensor<int32, [2]>([1, 1])];
+            string var_551_pad_type_0 = const()[name = string("op_551_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_551_pad_0 = const()[name = string("op_551_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_551_cast_fp16 = conv(dilations = var_549, groups = var_503, pad = var_551_pad_0, pad_type = var_551_pad_type_0, strides = var_547, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_551_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303780096)))];
-            tensor<fp16, [1, 4096, 1, 512]> q_13_cast_fp16 = mul(x = var_550_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = string("q_13_cast_fp16")];
-            tensor<int32, [2]> var_554 = const()[name = string("op_554"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_556 = const()[name = string("op_556"), val = tensor<int32, [2]>([1, 1])];
-            string var_558_pad_type_0 = const()[name = string("op_558_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_558_pad_0 = const()[name = string("op_558_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_558_cast_fp16 = conv(dilations = var_556, groups = var_503, pad = var_558_pad_0, pad_type = var_558_pad_type_0, strides = var_554, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_558_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> q_13_cast_fp16 = mul(x = var_551_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = string("q_13_cast_fp16")];
+            tensor<int32, [2]> var_555 = const()[name = string("op_555"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_557 = const()[name = string("op_557"), val = tensor<int32, [2]>([1, 1])];
+            string var_559_pad_type_0 = const()[name = string("op_559_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_559_pad_0 = const()[name = string("op_559_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_559_cast_fp16 = conv(dilations = var_557, groups = var_503, pad = var_559_pad_0, pad_type = var_559_pad_type_0, strides = var_555, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_559_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303788352)))];
-            tensor<fp16, [1, 4096, 1, 512]> k_13_cast_fp16 = mul(x = var_558_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = string("k_13_cast_fp16")];
-            tensor<int32, [2]> var_562 = const()[name = string("op_562"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_564 = const()[name = string("op_564"), val = tensor<int32, [2]>([1, 1])];
-            string var_566_pad_type_0 = const()[name = string("op_566_pad_type_0"), val = string("custom")];
-            tensor<int32, [4]> var_566_pad_0 = const()[name = string("op_566_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 512]> var_566_cast_fp16 = conv(dilations = var_564, groups = var_503, pad = var_566_pad_0, pad_type = var_566_pad_type_0, strides = var_562, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_566_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> k_13_cast_fp16 = mul(x = var_559_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = string("k_13_cast_fp16")];
+            tensor<int32, [2]> var_563 = const()[name = string("op_563"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_565 = const()[name = string("op_565"), val = tensor<int32, [2]>([1, 1])];
+            string var_567_pad_type_0 = const()[name = string("op_567_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_567_pad_0 = const()[name = string("op_567_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_567_cast_fp16 = conv(dilations = var_565, groups = var_503, pad = var_567_pad_0, pad_type = var_567_pad_type_0, strides = var_563, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_567_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303796608)))];
-            tensor<fp16, [1, 4096, 1, 512]> v_9_cast_fp16 = mul(x = var_566_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = string("v_9_cast_fp16")];
-            tensor<int32, [4]> var_568 = const()[name = string("op_568"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> q_15_cast_fp16 = reshape(shape = var_568, x = q_13_cast_fp16)[name = string("q_15_cast_fp16")];
-            tensor<int32, [4]> var_570 = const()[name = string("op_570"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = reshape(shape = var_570, x = k_13_cast_fp16)[name = string("k_15_cast_fp16")];
-            tensor<int32, [4]> var_572 = const()[name = string("op_572"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<fp16, [1, 32, 128, 512]> v_cast_fp16 = reshape(shape = var_572, x = v_9_cast_fp16)[name = string("v_cast_fp16")];
-            tensor<int32, [4]> var_584_begin_0 = const()[name = string("op_584_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_584_end_0 = const()[name = string("op_584_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_584_end_mask_0 = const()[name = string("op_584_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_584_cast_fp16 = slice_by_index(begin = var_584_begin_0, end = var_584_end_0, end_mask = var_584_end_mask_0, x = q_15_cast_fp16)[name = string("op_584_cast_fp16")];
-            tensor<int32, [4]> var_590_begin_0 = const()[name = string("op_590_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_590_end_0 = const()[name = string("op_590_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_590_end_mask_0 = const()[name = string("op_590_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_590_cast_fp16 = slice_by_index(begin = var_590_begin_0, end = var_590_end_0, end_mask = var_590_end_mask_0, x = q_15_cast_fp16)[name = string("op_590_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> v_13_cast_fp16 = mul(x = var_567_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = string("v_13_cast_fp16")];
+            tensor<int32, [4]> var_569 = const()[name = string("op_569"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> q_15_cast_fp16 = reshape(shape = var_569, x = q_13_cast_fp16)[name = string("q_15_cast_fp16")];
+            tensor<int32, [4]> var_571 = const()[name = string("op_571"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = reshape(shape = var_571, x = k_13_cast_fp16)[name = string("k_15_cast_fp16")];
+            tensor<int32, [4]> var_573 = const()[name = string("op_573"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> v_15_cast_fp16 = reshape(shape = var_573, x = v_13_cast_fp16)[name = string("v_15_cast_fp16")];
+            tensor<int32, [4]> var_585_begin_0 = const()[name = string("op_585_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_585_end_0 = const()[name = string("op_585_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_585_end_mask_0 = const()[name = string("op_585_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_585_cast_fp16 = slice_by_index(begin = var_585_begin_0, end = var_585_end_0, end_mask = var_585_end_mask_0, x = q_15_cast_fp16)[name = string("op_585_cast_fp16")];
+            tensor<int32, [4]> var_591_begin_0 = const()[name = string("op_591_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_591_end_0 = const()[name = string("op_591_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_591_end_mask_0 = const()[name = string("op_591_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_591_cast_fp16 = slice_by_index(begin = var_591_begin_0, end = var_591_end_0, end_mask = var_591_end_mask_0, x = q_15_cast_fp16)[name = string("op_591_cast_fp16")];
             fp16 const_32_promoted_to_fp16 = const()[name = string("const_32_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_592_cast_fp16 = mul(x = var_590_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_592_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_593_cast_fp16 = mul(x = var_591_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_593_cast_fp16")];
             bool rotated_9_interleave_0 = const()[name = string("rotated_9_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_9_cast_fp16 = concat(axis = var_506, interleave = rotated_9_interleave_0, values = (var_592_cast_fp16, var_584_cast_fp16))[name = string("rotated_9_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_595_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = string("op_595_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_596_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = string("op_596_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_9_cast_fp16 = add(x = var_595_cast_fp16, y = var_596_cast_fp16)[name = string("roped_9_cast_fp16")];
-            tensor<int32, [4]> var_609_begin_0 = const()[name = string("op_609_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_609_end_0 = const()[name = string("op_609_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
-            tensor<bool, [4]> var_609_end_mask_0 = const()[name = string("op_609_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_609_cast_fp16 = slice_by_index(begin = var_609_begin_0, end = var_609_end_0, end_mask = var_609_end_mask_0, x = k_15_cast_fp16)[name = string("op_609_cast_fp16")];
-            tensor<int32, [4]> var_615_begin_0 = const()[name = string("op_615_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_615_end_0 = const()[name = string("op_615_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_615_end_mask_0 = const()[name = string("op_615_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 512]> var_615_cast_fp16 = slice_by_index(begin = var_615_begin_0, end = var_615_end_0, end_mask = var_615_end_mask_0, x = k_15_cast_fp16)[name = string("op_615_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_9_cast_fp16 = concat(axis = var_506, interleave = rotated_9_interleave_0, values = (var_593_cast_fp16, var_585_cast_fp16))[name = string("rotated_9_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_596_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = string("op_596_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_597_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = string("op_597_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_9_cast_fp16 = add(x = var_596_cast_fp16, y = var_597_cast_fp16)[name = string("roped_9_cast_fp16")];
+            tensor<int32, [4]> var_610_begin_0 = const()[name = string("op_610_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_610_end_0 = const()[name = string("op_610_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_610_end_mask_0 = const()[name = string("op_610_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_610_cast_fp16 = slice_by_index(begin = var_610_begin_0, end = var_610_end_0, end_mask = var_610_end_mask_0, x = k_15_cast_fp16)[name = string("op_610_cast_fp16")];
+            tensor<int32, [4]> var_616_begin_0 = const()[name = string("op_616_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_616_end_0 = const()[name = string("op_616_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_616_end_mask_0 = const()[name = string("op_616_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_616_cast_fp16 = slice_by_index(begin = var_616_begin_0, end = var_616_end_0, end_mask = var_616_end_mask_0, x = k_15_cast_fp16)[name = string("op_616_cast_fp16")];
             fp16 const_34_promoted_to_fp16 = const()[name = string("const_34_promoted_to_fp16"), val = fp16(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 512]> var_617_cast_fp16 = mul(x = var_615_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_617_cast_fp16")];
+            tensor<fp16, [1, 32, 64, 512]> var_618_cast_fp16 = mul(x = var_616_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_618_cast_fp16")];
             bool rotated_interleave_0 = const()[name = string("rotated_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> rotated_cast_fp16 = concat(axis = var_506, interleave = rotated_interleave_0, values = (var_617_cast_fp16, var_609_cast_fp16))[name = string("rotated_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_620_cast_fp16 = mul(x = k_15_cast_fp16, y = cos)[name = string("op_620_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> var_621_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_621_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 512]> roped_cast_fp16 = add(x = var_620_cast_fp16, y = var_621_cast_fp16)[name = string("roped_cast_fp16")];
-            bool q_interleave_0 = const()[name = string("q_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> q_cast_fp16 = concat(axis = var_506, interleave = q_interleave_0, values = roped_9_cast_fp16)[name = string("q_cast_fp16")];
-            bool k_interleave_0 = const()[name = string("k_interleave_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_cast_fp16 = concat(axis = var_506, interleave = k_interleave_0, values = roped_cast_fp16)[name = string("k_cast_fp16")];
-            tensor<int32, [4]> var_636_begin_0 = const()[name = string("op_636_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_636_end_0 = const()[name = string("op_636_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_636_end_mask_0 = const()[name = string("op_636_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_k_cache_2 = slice_by_index(begin = var_636_begin_0, end = var_636_end_0, end_mask = var_636_end_mask_0, x = k_cast_fp16)[name = string("op_636_cast_fp16")];
-            tensor<int32, [4]> var_637_begin_0 = const()[name = string("op_637_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
-            tensor<int32, [4]> var_637_end_0 = const()[name = string("op_637_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
-            tensor<bool, [4]> var_637_end_mask_0 = const()[name = string("op_637_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 128, 511]> new_v_cache_2 = slice_by_index(begin = var_637_begin_0, end = var_637_end_0, end_mask = var_637_end_mask_0, x = v_cast_fp16)[name = string("op_637_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> rotated_cast_fp16 = concat(axis = var_506, interleave = rotated_interleave_0, values = (var_618_cast_fp16, var_610_cast_fp16))[name = string("rotated_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_621_cast_fp16 = mul(x = k_15_cast_fp16, y = cos)[name = string("op_621_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_622_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_622_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_cast_fp16 = add(x = var_621_cast_fp16, y = var_622_cast_fp16)[name = string("roped_cast_fp16")];
+            tensor<int32, [4]> v_perm_0 = const()[name = string("v_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
+            tensor<int32, [4]> var_626_begin_0 = const()[name = string("op_626_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_626_end_0 = const()[name = string("op_626_end_0"), val = tensor<int32, [4]>([1, 32, 128, 509])];
+            tensor<bool, [4]> var_626_end_mask_0 = const()[name = string("op_626_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 32, 128, 508]> new_k_cache_2 = slice_by_index(begin = var_626_begin_0, end = var_626_end_0, end_mask = var_626_end_mask_0, x = roped_cast_fp16)[name = string("op_626_cast_fp16")];
+            tensor<int32, [4]> var_627_begin_0 = const()[name = string("op_627_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_627_end_0 = const()[name = string("op_627_end_0"), val = tensor<int32, [4]>([1, 32, 509, 128])];
+            tensor<bool, [4]> var_627_end_mask_0 = const()[name = string("op_627_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 512, 128]> v_cast_fp16 = transpose(perm = v_perm_0, x = v_15_cast_fp16)[name = string("transpose_1")];
+            tensor<fp16, [1, 32, 508, 128]> new_v_cache_2 = slice_by_index(begin = var_627_begin_0, end = var_627_end_0, end_mask = var_627_end_mask_0, x = v_cast_fp16)[name = string("op_627_cast_fp16")];
             fp16 var_641_to_fp16 = const()[name = string("op_641_to_fp16"), val = fp16(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 512]> var_642_cast_fp16 = mul(x = q_cast_fp16, y = var_641_to_fp16)[name = string("op_642_cast_fp16")];
-            bool attn_weights_9_transpose_x_0 = const()[name = string("attn_weights_9_transpose_x_0"), val = bool(true)];
-            bool attn_weights_9_transpose_y_0 = const()[name = string("attn_weights_9_transpose_y_0"), val = bool(false)];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_9_cast_fp16 = matmul(transpose_x = attn_weights_9_transpose_x_0, transpose_y = attn_weights_9_transpose_y_0, x = var_642_cast_fp16, y = k_cast_fp16)[name = string("attn_weights_9_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> attn_weights_cast_fp16 = add(x = attn_weights_9_cast_fp16, y = mask)[name = string("attn_weights_cast_fp16")];
-            tensor<fp16, [1, 32, 512, 512]> var_650_cast_fp16 = softmax(axis = var_502, x = attn_weights_cast_fp16)[name = string("op_650_cast_fp16")];
-            bool attn_5_transpose_x_0 = const()[name = string("attn_5_transpose_x_0"), val = bool(false)];
-            bool attn_5_transpose_y_0 = const()[name = string("attn_5_transpose_y_0"), val = bool(true)];
-            tensor<fp16, [1, 32, 128, 512]> attn_5_cast_fp16 = matmul(transpose_x = attn_5_transpose_x_0, transpose_y = attn_5_transpose_y_0, x = v_cast_fp16, y = var_650_cast_fp16)[name = string("attn_5_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_642_cast_fp16 = mul(x = roped_9_cast_fp16, y = var_641_to_fp16)[name = string("op_642_cast_fp16")];
+            bool attn_weights_13_transpose_x_0 = const()[name = string("attn_weights_13_transpose_x_0"), val = bool(true)];
+            bool attn_weights_13_transpose_y_0 = const()[name = string("attn_weights_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_13_cast_fp16 = matmul(transpose_x = attn_weights_13_transpose_x_0, transpose_y = attn_weights_13_transpose_y_0, x = var_642_cast_fp16, y = roped_cast_fp16)[name = string("attn_weights_13_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_15_cast_fp16 = add(x = attn_weights_13_cast_fp16, y = mask)[name = string("attn_weights_15_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_cast_fp16 = softmax(axis = var_502, x = attn_weights_15_cast_fp16)[name = string("attn_weights_cast_fp16")];
+            bool var_651_transpose_x_1 = const()[name = string("op_651_transpose_x_1"), val = bool(false)];
+            bool var_651_transpose_y_1 = const()[name = string("op_651_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 32, 512, 128]> var_651_cast_fp16 = matmul(transpose_x = var_651_transpose_x_1, transpose_y = var_651_transpose_y_1, x = attn_weights_cast_fp16, y = v_15_cast_fp16)[name = string("op_651_cast_fp16")];
+            tensor<int32, [4]> attn_5_perm_0 = const()[name = string("attn_5_perm_0"), val = tensor<int32, [4]>([0, 1, -1, -2])];
             tensor<int32, [4]> var_654 = const()[name = string("op_654"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 32, 128, 512]> attn_5_cast_fp16 = transpose(perm = attn_5_perm_0, x = var_651_cast_fp16)[name = string("transpose_0")];
             tensor<fp16, [1, 4096, 1, 512]> input_17_cast_fp16 = reshape(shape = var_654, x = attn_5_cast_fp16)[name = string("input_17_cast_fp16")];
             tensor<int32, [2]> var_658 = const()[name = string("op_658"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> var_660 = const()[name = string("op_660"), val = tensor<int32, [2]>([1, 1])];
diff --git a/sequoia/logit-processor.mlmodelc/analytics/coremldata.bin b/sequoia/logit-processor.mlmodelc/analytics/coremldata.bin
index 6e39062898bd063fccc3e6b0e38fe35b6f75a17e..ac67b25bf8f1252645fd24f3b5a86e2cd35dc28d 100644
--- a/sequoia/logit-processor.mlmodelc/analytics/coremldata.bin
+++ b/sequoia/logit-processor.mlmodelc/analytics/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:74983de62535c92d990c8a01bc57449459a6ac9a8263109616a338cdee40e8a2
+oid sha256:72d9433176e6a80c761281219743bd081fc1f935801c88c62bfff49d064c7d7c
 size 243
diff --git a/sequoia/logit-processor.mlmodelc/coremldata.bin b/sequoia/logit-processor.mlmodelc/coremldata.bin
index 18676e410d83b0edd69b9ce4a4768f95b8302c4b..058f6c8faf65e124a39ea511e78d878ce43c49f6 100644
--- a/sequoia/logit-processor.mlmodelc/coremldata.bin
+++ b/sequoia/logit-processor.mlmodelc/coremldata.bin
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ded243adbc20b8605e5c93a187ed40179a5ad97480ca2cb273db2fc61e51605a
-size 351
+oid sha256:9e2213a03579f9bc30440e7174c6efb6a4913fed86ea9dd4d195018629debce9
+size 369
diff --git a/sequoia/logit-processor.mlmodelc/metadata.json b/sequoia/logit-processor.mlmodelc/metadata.json
index 1a46015410315c4b10080c9e6ab7c702e160c0c2..9b36e3d6fcfca536a5b4172135a32dc1fdf1cb56 100644
--- a/sequoia/logit-processor.mlmodelc/metadata.json
+++ b/sequoia/logit-processor.mlmodelc/metadata.json
@@ -47,15 +47,15 @@
         "dataType" : "Float16",
         "hasShapeFlexibility" : "1",
         "isOptional" : "0",
-        "shapeFlexibility" : "1 × 511 × 32000 | 1 × 1 × 32000 | 1 × 64 × 32000 | 1 × 512 × 32000",
+        "shapeFlexibility" : "1 × 511 × 32000 | 1 × 1 × 32000 | 1 × 2 × 32000 | 1 × 4 × 32000 | 1 × 64 × 32000 | 1 × 512 × 32000",
         "formattedType" : "MultiArray (Float16 1 × 511 × 32000)",
         "type" : "MultiArray",
         "shape" : "[1, 511, 32000]",
         "name" : "logits",
-        "enumeratedShapes" : "[[1, 511, 32000], [1, 1, 32000], [1, 64, 32000], [1, 512, 32000]]"
+        "enumeratedShapes" : "[[1, 511, 32000], [1, 1, 32000], [1, 2, 32000], [1, 4, 32000], [1, 64, 32000], [1, 512, 32000]]"
       }
     ],
-    "generatedClassName" : "logit_processor",
+    "generatedClassName" : "logit_processor_2",
     "method" : "predict"
   }
 ]
\ No newline at end of file
diff --git a/sequoia/logit-processor.mlmodelc/model.mil b/sequoia/logit-processor.mlmodelc/model.mil
index 2ac50535e8b3608d4110244e4365d552d3efab0f..9ae6d9427ced6dea75e53b3cc38563441e225bd2 100644
--- a/sequoia/logit-processor.mlmodelc/model.mil
+++ b/sequoia/logit-processor.mlmodelc/model.mil
@@ -1,7 +1,7 @@
 program(1.3)
-[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3400.34.1"}, {"coremlc-version", "3400.42.1"}, {"coremltools-component-torch", "2.1.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.0b1"}})]
+[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3400.42.1"}, {"coremlc-version", "3400.51.1"}, {"coremltools-component-torch", "2.1.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.0b1"}})]
 {
-    func main<ios16>(tensor<fp16, [1, ?, 32000]> logits) [FlexibleShapeInformation = tuple<tuple<string, dict<string, tensor<int32, [?]>>>, tuple<string, dict<string, dict<string, tensor<int32, [?]>>>>>((("DefaultShapes", {{"logits", [1, 511, 32000]}}), ("EnumeratedShapes", {{"logits_1_1_1_1_32000_", {{"logits", [1, 1, 32000]}}}, {"logits_1_1_1_511_32000_", {{"logits", [1, 511, 32000]}}}, {"logits_1_1_1_512_32000_", {{"logits", [1, 512, 32000]}}}, {"logits_1_1_1_64_32000_", {{"logits", [1, 64, 32000]}}}})))] {
+    func main<ios16>(tensor<fp16, [1, ?, 32000]> logits) [FlexibleShapeInformation = tuple<tuple<string, dict<string, tensor<int32, [?]>>>, tuple<string, dict<string, dict<string, tensor<int32, [?]>>>>>((("DefaultShapes", {{"logits", [1, 511, 32000]}}), ("EnumeratedShapes", {{"logits_1_1_1_1_32000_", {{"logits", [1, 1, 32000]}}}, {"logits_1_1_1_2_32000_", {{"logits", [1, 2, 32000]}}}, {"logits_1_1_1_4_32000_", {{"logits", [1, 4, 32000]}}}, {"logits_1_1_1_511_32000_", {{"logits", [1, 511, 32000]}}}, {"logits_1_1_1_512_32000_", {{"logits", [1, 512, 32000]}}}, {"logits_1_1_1_64_32000_", {{"logits", [1, 64, 32000]}}}})))] {
             int32 var_2 = const()[name = string("op_2"), val = int32(-1)];
             bool var_3 = const()[name = string("op_3"), val = bool(false)];
             tensor<int32, [1, ?]> argmax = reduce_argmax(axis = var_2, keep_dims = var_3, x = logits)[name = string("op_4_cast_fp16")];