diff --git "a/sequoia/Llama-2-7b-hf_chunk11.mlmodelc/model.mil" "b/sequoia/Llama-2-7b-hf_chunk11.mlmodelc/model.mil"
new file mode 100644--- /dev/null
+++ "b/sequoia/Llama-2-7b-hf_chunk11.mlmodelc/model.mil"
@@ -0,0 +1,918 @@
+program(1.3)
+[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3400.34.1"}, {"coremlc-version", "3400.42.1"}})]
+{
+    func input_1_context_512<ios18>(tensor<fp16, [128, 1]> cos, tensor<fp16, [1, 32, 128, 511]> k_cache_0, tensor<fp16, [1, 32, 128, 511]> k_cache_1, tensor<fp16, [1, 32, 128, 511]> k_cache_2, tensor<fp16, [1, 1, 1, 512]> mask, tensor<fp16, [128, 1]> sin, tensor<fp16, [1, 32, 128, 511]> v_cache_0, tensor<fp16, [1, 32, 128, 511]> v_cache_1, tensor<fp16, [1, 32, 128, 511]> v_cache_2, tensor<fp16, [1, 4096, 1, 1]> x) [CoreML_InputDefaultValues = dict<string, fp32>({{"k_cache_0", 0}, {"k_cache_1", 0}, {"k_cache_2", 0}, {"v_cache_0", 0}, {"v_cache_1", 0}, {"v_cache_2", 0}})] {
+            tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_q_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303873792))))[name = string("blocks_0_attn_q_proj_weight_palettized_cast_fp16")];
+            tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_k_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8388864))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303873920))))[name = string("blocks_0_attn_k_proj_weight_palettized_cast_fp16")];
+            tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_v_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(16777664))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303874048))))[name = string("blocks_0_attn_v_proj_weight_palettized_cast_fp16")];
+            tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(25166464))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303874176))))[name = string("blocks_0_attn_proj_weight_palettized_cast_fp16")];
+            tensor<fp16, [11008, 4096, 1, 1]> blocks_0_mlp_fc_1_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [11008, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(33555264))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303874304))))[name = string("blocks_0_mlp_fc_1_weight_palettized_cast_fp16")];
+            tensor<fp16, [11008, 4096, 1, 1]> blocks_0_mlp_fc_2_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [11008, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(56099840))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303874432))))[name = string("blocks_0_mlp_fc_2_weight_palettized_cast_fp16")];
+            tensor<fp16, [4096, 11008, 1, 1]> blocks_0_mlp_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(78644416))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303874560))))[name = string("blocks_0_mlp_proj_weight_palettized_cast_fp16")];
+            tensor<fp16, [4096, 4096, 1, 1]> blocks_1_attn_q_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(101188992))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303874688))))[name = string("blocks_1_attn_q_proj_weight_palettized_cast_fp16")];
+            tensor<fp16, [4096, 4096, 1, 1]> blocks_1_attn_k_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(109577792))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303874816))))[name = string("blocks_1_attn_k_proj_weight_palettized_cast_fp16")];
+            tensor<fp16, [4096, 4096, 1, 1]> blocks_1_attn_v_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(117966592))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303874944))))[name = string("blocks_1_attn_v_proj_weight_palettized_cast_fp16")];
+            tensor<fp16, [4096, 4096, 1, 1]> blocks_1_attn_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(126355392))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303875072))))[name = string("blocks_1_attn_proj_weight_palettized_cast_fp16")];
+            tensor<fp16, [11008, 4096, 1, 1]> blocks_1_mlp_fc_1_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [11008, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(134744192))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303875200))))[name = string("blocks_1_mlp_fc_1_weight_palettized_cast_fp16")];
+            tensor<fp16, [11008, 4096, 1, 1]> blocks_1_mlp_fc_2_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [11008, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(157288768))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303875328))))[name = string("blocks_1_mlp_fc_2_weight_palettized_cast_fp16")];
+            tensor<fp16, [4096, 11008, 1, 1]> blocks_1_mlp_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(179833344))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303875456))))[name = string("blocks_1_mlp_proj_weight_palettized_cast_fp16")];
+            tensor<fp16, [4096, 4096, 1, 1]> blocks_2_attn_q_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202377920))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303875584))))[name = string("blocks_2_attn_q_proj_weight_palettized_cast_fp16")];
+            tensor<fp16, [4096, 4096, 1, 1]> blocks_2_attn_k_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(210766720))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303875712))))[name = string("blocks_2_attn_k_proj_weight_palettized_cast_fp16")];
+            tensor<fp16, [4096, 4096, 1, 1]> blocks_2_attn_v_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(219155520))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303875840))))[name = string("blocks_2_attn_v_proj_weight_palettized_cast_fp16")];
+            tensor<fp16, [4096, 4096, 1, 1]> blocks_2_attn_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(227544320))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303875968))))[name = string("blocks_2_attn_proj_weight_palettized_cast_fp16")];
+            tensor<fp16, [11008, 4096, 1, 1]> blocks_2_mlp_fc_1_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [11008, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(235933120))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303876096))))[name = string("blocks_2_mlp_fc_1_weight_palettized_cast_fp16")];
+            tensor<fp16, [11008, 4096, 1, 1]> blocks_2_mlp_fc_2_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [11008, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(258477696))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303876224))))[name = string("blocks_2_mlp_fc_2_weight_palettized_cast_fp16")];
+            tensor<fp16, [4096, 11008, 1, 1]> blocks_2_mlp_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(281022272))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303876352))))[name = string("blocks_2_mlp_proj_weight_palettized_cast_fp16")];
+            int32 var_22 = const()[name = string("op_22"), val = int32(-1)];
+            int32 var_30 = const()[name = string("op_30"), val = int32(3)];
+            int32 var_31 = const()[name = string("op_31"), val = int32(1)];
+            int32 var_34 = const()[name = string("op_34"), val = int32(-2)];
+            bool var_35 = const()[name = string("op_35"), val = bool(true)];
+            tensor<int32, [1]> var_53_axes_0 = const()[name = string("op_53_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 1]> var_53_cast_fp16 = squeeze(axes = var_53_axes_0, x = x)[name = string("op_53_cast_fp16")];
+            bool var_55_interleave_0 = const()[name = string("op_55_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 1]> eps_chan_1_to_fp16 = const()[name = string("eps_chan_1_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 1]> var_55_cast_fp16 = concat(axis = var_31, interleave = var_55_interleave_0, values = (var_53_cast_fp16, eps_chan_1_to_fp16))[name = string("op_55_cast_fp16")];
+            tensor<int32, [1]> x_eps_1_axes_0 = const()[name = string("x_eps_1_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4097, 1, 1]> x_eps_1_cast_fp16 = expand_dims(axes = x_eps_1_axes_0, x = var_55_cast_fp16)[name = string("x_eps_1_cast_fp16")];
+            tensor<int32, [1]> norm_x_1_axes_0 = const()[name = string("norm_x_1_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 1, 1]> norm_x_1_cast_fp16 = reduce_l2_norm(axes = norm_x_1_axes_0, keep_dims = var_35, x = x_eps_1_cast_fp16)[name = string("norm_x_1_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> x_normed_1_cast_fp16 = real_div(x = x, y = norm_x_1_cast_fp16)[name = string("x_normed_1_cast_fp16")];
+            fp16 var_60_to_fp16 = const()[name = string("op_60_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 1]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_60_to_fp16)[name = string("x_normed_3_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_1_weight_to_fp16 = const()[name = string("blocks_0_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303567936)))];
+            tensor<fp16, [1, 4096, 1, 1]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = string("x_5_cast_fp16")];
+            tensor<int32, [2]> var_72 = const()[name = string("op_72"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_74 = const()[name = string("op_74"), val = tensor<int32, [2]>([1, 1])];
+            string var_76_pad_type_0 = const()[name = string("op_76_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_76_pad_0 = const()[name = string("op_76_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 1]> var_76_cast_fp16 = conv(dilations = var_74, groups = var_31, pad = var_76_pad_0, pad_type = var_76_pad_type_0, strides = var_72, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_76_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303576192)))];
+            tensor<fp16, [1, 4096, 1, 1]> q_1_cast_fp16 = mul(x = var_76_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
+            tensor<int32, [2]> var_80 = const()[name = string("op_80"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_82 = const()[name = string("op_82"), val = tensor<int32, [2]>([1, 1])];
+            string var_84_pad_type_0 = const()[name = string("op_84_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_84_pad_0 = const()[name = string("op_84_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 1]> var_84_cast_fp16 = conv(dilations = var_82, groups = var_31, pad = var_84_pad_0, pad_type = var_84_pad_type_0, strides = var_80, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_84_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303584448)))];
+            tensor<fp16, [1, 4096, 1, 1]> k_1_cast_fp16 = mul(x = var_84_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
+            tensor<int32, [2]> var_88 = const()[name = string("op_88"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_90 = const()[name = string("op_90"), val = tensor<int32, [2]>([1, 1])];
+            string var_92_pad_type_0 = const()[name = string("op_92_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_92_pad_0 = const()[name = string("op_92_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 1]> var_92_cast_fp16 = conv(dilations = var_90, groups = var_31, pad = var_92_pad_0, pad_type = var_92_pad_type_0, strides = var_88, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_92_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303592704)))];
+            tensor<fp16, [1, 4096, 1, 1]> v_1_cast_fp16 = mul(x = var_92_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
+            tensor<int32, [4]> var_94 = const()[name = string("op_94"), val = tensor<int32, [4]>([1, 32, 128, 1])];
+            tensor<fp16, [1, 32, 128, 1]> q_3_cast_fp16 = reshape(shape = var_94, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
+            tensor<int32, [4]> var_96 = const()[name = string("op_96"), val = tensor<int32, [4]>([1, 32, 128, 1])];
+            tensor<fp16, [1, 32, 128, 1]> k_3_cast_fp16 = reshape(shape = var_96, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
+            tensor<int32, [4]> var_98 = const()[name = string("op_98"), val = tensor<int32, [4]>([1, 32, 128, 1])];
+            tensor<fp16, [1, 32, 128, 1]> v_3_cast_fp16 = reshape(shape = var_98, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
+            tensor<int32, [4]> var_110_begin_0 = const()[name = string("op_110_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_110_end_0 = const()[name = string("op_110_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
+            tensor<bool, [4]> var_110_end_mask_0 = const()[name = string("op_110_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 1]> var_110_cast_fp16 = slice_by_index(begin = var_110_begin_0, end = var_110_end_0, end_mask = var_110_end_mask_0, x = q_3_cast_fp16)[name = string("op_110_cast_fp16")];
+            tensor<int32, [4]> var_116_begin_0 = const()[name = string("op_116_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_116_end_0 = const()[name = string("op_116_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
+            tensor<bool, [4]> var_116_end_mask_0 = const()[name = string("op_116_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 1]> var_116_cast_fp16 = slice_by_index(begin = var_116_begin_0, end = var_116_end_0, end_mask = var_116_end_mask_0, x = q_3_cast_fp16)[name = string("op_116_cast_fp16")];
+            fp16 const_6_promoted_to_fp16 = const()[name = string("const_6_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 32, 64, 1]> var_118_cast_fp16 = mul(x = var_116_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_118_cast_fp16")];
+            bool rotated_1_interleave_0 = const()[name = string("rotated_1_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 128, 1]> rotated_1_cast_fp16 = concat(axis = var_34, interleave = rotated_1_interleave_0, values = (var_118_cast_fp16, var_110_cast_fp16))[name = string("rotated_1_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 1]> var_121_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_121_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 1]> var_122_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_122_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 1]> roped_1_cast_fp16 = add(x = var_121_cast_fp16, y = var_122_cast_fp16)[name = string("roped_1_cast_fp16")];
+            tensor<int32, [4]> var_135_begin_0 = const()[name = string("op_135_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_135_end_0 = const()[name = string("op_135_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
+            tensor<bool, [4]> var_135_end_mask_0 = const()[name = string("op_135_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 1]> var_135_cast_fp16 = slice_by_index(begin = var_135_begin_0, end = var_135_end_0, end_mask = var_135_end_mask_0, x = k_3_cast_fp16)[name = string("op_135_cast_fp16")];
+            tensor<int32, [4]> var_141_begin_0 = const()[name = string("op_141_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_141_end_0 = const()[name = string("op_141_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
+            tensor<bool, [4]> var_141_end_mask_0 = const()[name = string("op_141_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 1]> var_141_cast_fp16 = slice_by_index(begin = var_141_begin_0, end = var_141_end_0, end_mask = var_141_end_mask_0, x = k_3_cast_fp16)[name = string("op_141_cast_fp16")];
+            fp16 const_8_promoted_to_fp16 = const()[name = string("const_8_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 32, 64, 1]> var_143_cast_fp16 = mul(x = var_141_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_143_cast_fp16")];
+            bool rotated_3_interleave_0 = const()[name = string("rotated_3_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 128, 1]> rotated_3_cast_fp16 = concat(axis = var_34, interleave = rotated_3_interleave_0, values = (var_143_cast_fp16, var_135_cast_fp16))[name = string("rotated_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 1]> var_146_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_146_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 1]> var_147_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_147_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 1]> roped_3_cast_fp16 = add(x = var_146_cast_fp16, y = var_147_cast_fp16)[name = string("roped_3_cast_fp16")];
+            bool k_7_interleave_0 = const()[name = string("k_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 128, 512]> k_7_cast_fp16 = concat(axis = var_22, interleave = k_7_interleave_0, values = (k_cache_0, roped_3_cast_fp16))[name = string("k_7_cast_fp16")];
+            bool v_5_interleave_0 = const()[name = string("v_5_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 128, 512]> v_5_cast_fp16 = concat(axis = var_22, interleave = v_5_interleave_0, values = (v_cache_0, v_3_cast_fp16))[name = string("v_5_cast_fp16")];
+            tensor<int32, [4]> var_154_begin_0 = const()[name = string("op_154_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_154_end_0 = const()[name = string("op_154_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_154_end_mask_0 = const()[name = string("op_154_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 128, 511]> new_k_cache_0 = slice_by_index(begin = var_154_begin_0, end = var_154_end_0, end_mask = var_154_end_mask_0, x = k_7_cast_fp16)[name = string("op_154_cast_fp16")];
+            tensor<int32, [4]> var_155_begin_0 = const()[name = string("op_155_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_155_end_0 = const()[name = string("op_155_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_155_end_mask_0 = const()[name = string("op_155_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 128, 511]> new_v_cache_0 = slice_by_index(begin = var_155_begin_0, end = var_155_end_0, end_mask = var_155_end_mask_0, x = v_5_cast_fp16)[name = string("op_155_cast_fp16")];
+            fp16 var_159_to_fp16 = const()[name = string("op_159_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 32, 128, 1]> var_160_cast_fp16 = mul(x = roped_1_cast_fp16, y = var_159_to_fp16)[name = string("op_160_cast_fp16")];
+            bool attn_weights_1_transpose_x_0 = const()[name = string("attn_weights_1_transpose_x_0"), val = bool(true)];
+            bool attn_weights_1_transpose_y_0 = const()[name = string("attn_weights_1_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 1, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_160_cast_fp16, y = k_7_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
+            tensor<fp16, [1, 32, 1, 512]> attn_weights_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = mask)[name = string("attn_weights_3_cast_fp16")];
+            tensor<fp16, [1, 32, 1, 512]> var_168_cast_fp16 = softmax(axis = var_30, x = attn_weights_3_cast_fp16)[name = string("op_168_cast_fp16")];
+            bool attn_1_transpose_x_0 = const()[name = string("attn_1_transpose_x_0"), val = bool(false)];
+            bool attn_1_transpose_y_0 = const()[name = string("attn_1_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 32, 128, 1]> attn_1_cast_fp16 = matmul(transpose_x = attn_1_transpose_x_0, transpose_y = attn_1_transpose_y_0, x = v_5_cast_fp16, y = var_168_cast_fp16)[name = string("attn_1_cast_fp16")];
+            tensor<int32, [4]> var_172 = const()[name = string("op_172"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 4096, 1, 1]> input_1_cast_fp16 = reshape(shape = var_172, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
+            tensor<int32, [2]> var_176 = const()[name = string("op_176"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_178 = const()[name = string("op_178"), val = tensor<int32, [2]>([1, 1])];
+            string var_180_pad_type_0 = const()[name = string("op_180_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_180_pad_0 = const()[name = string("op_180_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 1]> var_180_cast_fp16 = conv(dilations = var_178, groups = var_31, pad = var_180_pad_0, pad_type = var_180_pad_type_0, strides = var_176, weight = blocks_0_attn_proj_weight_palettized_cast_fp16, x = input_1_cast_fp16)[name = string("op_180_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303600960)))];
+            tensor<fp16, [1, 4096, 1, 1]> attention_output_1_cast_fp16 = mul(x = var_180_cast_fp16, y = blocks_0_attn_proj_output_scales_to_fp16)[name = string("attention_output_1_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> x_11_cast_fp16 = add(x = attention_output_1_cast_fp16, y = x)[name = string("x_11_cast_fp16")];
+            tensor<int32, [1]> var_199_axes_0 = const()[name = string("op_199_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 1]> var_199_cast_fp16 = squeeze(axes = var_199_axes_0, x = x_11_cast_fp16)[name = string("op_199_cast_fp16")];
+            bool var_201_interleave_0 = const()[name = string("op_201_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 1]> eps_chan_3_to_fp16 = const()[name = string("eps_chan_3_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 1]> var_201_cast_fp16 = concat(axis = var_31, interleave = var_201_interleave_0, values = (var_199_cast_fp16, eps_chan_3_to_fp16))[name = string("op_201_cast_fp16")];
+            tensor<int32, [1]> x_eps_3_axes_0 = const()[name = string("x_eps_3_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4097, 1, 1]> x_eps_3_cast_fp16 = expand_dims(axes = x_eps_3_axes_0, x = var_201_cast_fp16)[name = string("x_eps_3_cast_fp16")];
+            tensor<int32, [1]> norm_x_3_axes_0 = const()[name = string("norm_x_3_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 1, 1]> norm_x_3_cast_fp16 = reduce_l2_norm(axes = norm_x_3_axes_0, keep_dims = var_35, x = x_eps_3_cast_fp16)[name = string("norm_x_3_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> x_normed_7_cast_fp16 = real_div(x = x_11_cast_fp16, y = norm_x_3_cast_fp16)[name = string("x_normed_7_cast_fp16")];
+            fp16 var_206_to_fp16 = const()[name = string("op_206_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 1]> x_normed_9_cast_fp16 = mul(x = x_normed_7_cast_fp16, y = var_206_to_fp16)[name = string("x_normed_9_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_2_weight_to_fp16 = const()[name = string("blocks_0_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303609216)))];
+            tensor<fp16, [1, 4096, 1, 1]> input_3_cast_fp16 = mul(x = x_normed_9_cast_fp16, y = blocks_0_norm_2_weight_to_fp16)[name = string("input_3_cast_fp16")];
+            tensor<int32, [2]> var_218 = const()[name = string("op_218"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_220 = const()[name = string("op_220"), val = tensor<int32, [2]>([1, 1])];
+            string var_222_pad_type_0 = const()[name = string("op_222_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_222_pad_0 = const()[name = string("op_222_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 1]> var_222_cast_fp16 = conv(dilations = var_220, groups = var_31, pad = var_222_pad_0, pad_type = var_222_pad_type_0, strides = var_218, weight = blocks_0_mlp_fc_1_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_222_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303617472)))];
+            tensor<fp16, [1, 11008, 1, 1]> input_5_cast_fp16 = mul(x = var_222_cast_fp16, y = blocks_0_mlp_fc_1_output_scales_to_fp16)[name = string("input_5_cast_fp16")];
+            tensor<int32, [2]> var_226 = const()[name = string("op_226"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_228 = const()[name = string("op_228"), val = tensor<int32, [2]>([1, 1])];
+            string var_230_pad_type_0 = const()[name = string("op_230_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_230_pad_0 = const()[name = string("op_230_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 1]> var_230_cast_fp16 = conv(dilations = var_228, groups = var_31, pad = var_230_pad_0, pad_type = var_230_pad_type_0, strides = var_226, weight = blocks_0_mlp_fc_2_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_230_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303639552)))];
+            tensor<fp16, [1, 11008, 1, 1]> x_fc_2_1_cast_fp16 = mul(x = var_230_cast_fp16, y = blocks_0_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_1_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> var_232_cast_fp16 = silu(x = input_5_cast_fp16)[name = string("op_232_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> input_7_cast_fp16 = mul(x = var_232_cast_fp16, y = x_fc_2_1_cast_fp16)[name = string("input_7_cast_fp16")];
+            tensor<int32, [2]> var_236 = const()[name = string("op_236"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_238 = const()[name = string("op_238"), val = tensor<int32, [2]>([1, 1])];
+            string var_240_pad_type_0 = const()[name = string("op_240_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_240_pad_0 = const()[name = string("op_240_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 1]> var_240_cast_fp16 = conv(dilations = var_238, groups = var_31, pad = var_240_pad_0, pad_type = var_240_pad_type_0, strides = var_236, weight = blocks_0_mlp_proj_weight_palettized_cast_fp16, x = input_7_cast_fp16)[name = string("op_240_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_0_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303661632)))];
+            tensor<fp16, [1, 4096, 1, 1]> var_241_cast_fp16 = mul(x = var_240_cast_fp16, y = blocks_0_mlp_proj_output_scales_to_fp16)[name = string("op_241_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> x_15_cast_fp16 = add(x = var_241_cast_fp16, y = x_11_cast_fp16)[name = string("x_15_cast_fp16")];
+            int32 var_252 = const()[name = string("op_252"), val = int32(-1)];
+            int32 var_260 = const()[name = string("op_260"), val = int32(3)];
+            int32 var_261 = const()[name = string("op_261"), val = int32(1)];
+            int32 var_264 = const()[name = string("op_264"), val = int32(-2)];
+            bool var_265 = const()[name = string("op_265"), val = bool(true)];
+            tensor<int32, [1]> var_282_axes_0 = const()[name = string("op_282_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 1]> var_282_cast_fp16 = squeeze(axes = var_282_axes_0, x = x_15_cast_fp16)[name = string("op_282_cast_fp16")];
+            bool var_284_interleave_0 = const()[name = string("op_284_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 1]> eps_chan_5_to_fp16 = const()[name = string("eps_chan_5_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 1]> var_284_cast_fp16 = concat(axis = var_261, interleave = var_284_interleave_0, values = (var_282_cast_fp16, eps_chan_5_to_fp16))[name = string("op_284_cast_fp16")];
+            tensor<int32, [1]> x_eps_5_axes_0 = const()[name = string("x_eps_5_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4097, 1, 1]> x_eps_5_cast_fp16 = expand_dims(axes = x_eps_5_axes_0, x = var_284_cast_fp16)[name = string("x_eps_5_cast_fp16")];
+            tensor<int32, [1]> norm_x_5_axes_0 = const()[name = string("norm_x_5_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 1, 1]> norm_x_5_cast_fp16 = reduce_l2_norm(axes = norm_x_5_axes_0, keep_dims = var_265, x = x_eps_5_cast_fp16)[name = string("norm_x_5_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> x_normed_13_cast_fp16 = real_div(x = x_15_cast_fp16, y = norm_x_5_cast_fp16)[name = string("x_normed_13_cast_fp16")];
+            fp16 var_289_to_fp16 = const()[name = string("op_289_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 1]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_289_to_fp16)[name = string("x_normed_15_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_1_weight_to_fp16 = const()[name = string("blocks_1_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303669888)))];
+            tensor<fp16, [1, 4096, 1, 1]> x_19_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = string("x_19_cast_fp16")];
+            tensor<int32, [2]> var_304 = const()[name = string("op_304"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_306 = const()[name = string("op_306"), val = tensor<int32, [2]>([1, 1])];
+            string var_308_pad_type_0 = const()[name = string("op_308_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_308_pad_0 = const()[name = string("op_308_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 1]> var_308_cast_fp16 = conv(dilations = var_306, groups = var_261, pad = var_308_pad_0, pad_type = var_308_pad_type_0, strides = var_304, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_308_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303678144)))];
+            tensor<fp16, [1, 4096, 1, 1]> q_7_cast_fp16 = mul(x = var_308_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
+            tensor<int32, [2]> var_312 = const()[name = string("op_312"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_314 = const()[name = string("op_314"), val = tensor<int32, [2]>([1, 1])];
+            string var_316_pad_type_0 = const()[name = string("op_316_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_316_pad_0 = const()[name = string("op_316_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 1]> var_316_cast_fp16 = conv(dilations = var_314, groups = var_261, pad = var_316_pad_0, pad_type = var_316_pad_type_0, strides = var_312, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_316_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303686400)))];
+            tensor<fp16, [1, 4096, 1, 1]> k_9_cast_fp16 = mul(x = var_316_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_9_cast_fp16")];
+            tensor<int32, [2]> var_320 = const()[name = string("op_320"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_322 = const()[name = string("op_322"), val = tensor<int32, [2]>([1, 1])];
+            string var_324_pad_type_0 = const()[name = string("op_324_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_324_pad_0 = const()[name = string("op_324_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 1]> var_324_cast_fp16 = conv(dilations = var_322, groups = var_261, pad = var_324_pad_0, pad_type = var_324_pad_type_0, strides = var_320, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_324_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303694656)))];
+            tensor<fp16, [1, 4096, 1, 1]> v_7_cast_fp16 = mul(x = var_324_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_7_cast_fp16")];
+            tensor<int32, [4]> var_326 = const()[name = string("op_326"), val = tensor<int32, [4]>([1, 32, 128, 1])];
+            tensor<fp16, [1, 32, 128, 1]> q_9_cast_fp16 = reshape(shape = var_326, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
+            tensor<int32, [4]> var_328 = const()[name = string("op_328"), val = tensor<int32, [4]>([1, 32, 128, 1])];
+            tensor<fp16, [1, 32, 128, 1]> k_11_cast_fp16 = reshape(shape = var_328, x = k_9_cast_fp16)[name = string("k_11_cast_fp16")];
+            tensor<int32, [4]> var_330 = const()[name = string("op_330"), val = tensor<int32, [4]>([1, 32, 128, 1])];
+            tensor<fp16, [1, 32, 128, 1]> v_9_cast_fp16 = reshape(shape = var_330, x = v_7_cast_fp16)[name = string("v_9_cast_fp16")];
+            tensor<int32, [4]> var_342_begin_0 = const()[name = string("op_342_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_342_end_0 = const()[name = string("op_342_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
+            tensor<bool, [4]> var_342_end_mask_0 = const()[name = string("op_342_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 1]> var_342_cast_fp16 = slice_by_index(begin = var_342_begin_0, end = var_342_end_0, end_mask = var_342_end_mask_0, x = q_9_cast_fp16)[name = string("op_342_cast_fp16")];
+            tensor<int32, [4]> var_348_begin_0 = const()[name = string("op_348_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_348_end_0 = const()[name = string("op_348_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
+            tensor<bool, [4]> var_348_end_mask_0 = const()[name = string("op_348_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 1]> var_348_cast_fp16 = slice_by_index(begin = var_348_begin_0, end = var_348_end_0, end_mask = var_348_end_mask_0, x = q_9_cast_fp16)[name = string("op_348_cast_fp16")];
+            fp16 const_19_promoted_to_fp16 = const()[name = string("const_19_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 32, 64, 1]> var_350_cast_fp16 = mul(x = var_348_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_350_cast_fp16")];
+            bool rotated_5_interleave_0 = const()[name = string("rotated_5_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 128, 1]> rotated_5_cast_fp16 = concat(axis = var_264, interleave = rotated_5_interleave_0, values = (var_350_cast_fp16, var_342_cast_fp16))[name = string("rotated_5_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 1]> var_353_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_353_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 1]> var_354_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_354_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 1]> roped_5_cast_fp16 = add(x = var_353_cast_fp16, y = var_354_cast_fp16)[name = string("roped_5_cast_fp16")];
+            tensor<int32, [4]> var_367_begin_0 = const()[name = string("op_367_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_367_end_0 = const()[name = string("op_367_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
+            tensor<bool, [4]> var_367_end_mask_0 = const()[name = string("op_367_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 1]> var_367_cast_fp16 = slice_by_index(begin = var_367_begin_0, end = var_367_end_0, end_mask = var_367_end_mask_0, x = k_11_cast_fp16)[name = string("op_367_cast_fp16")];
+            tensor<int32, [4]> var_373_begin_0 = const()[name = string("op_373_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_373_end_0 = const()[name = string("op_373_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
+            tensor<bool, [4]> var_373_end_mask_0 = const()[name = string("op_373_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 1]> var_373_cast_fp16 = slice_by_index(begin = var_373_begin_0, end = var_373_end_0, end_mask = var_373_end_mask_0, x = k_11_cast_fp16)[name = string("op_373_cast_fp16")];
+            fp16 const_21_promoted_to_fp16 = const()[name = string("const_21_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 32, 64, 1]> var_375_cast_fp16 = mul(x = var_373_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_375_cast_fp16")];
+            bool rotated_7_interleave_0 = const()[name = string("rotated_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 128, 1]> rotated_7_cast_fp16 = concat(axis = var_264, interleave = rotated_7_interleave_0, values = (var_375_cast_fp16, var_367_cast_fp16))[name = string("rotated_7_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 1]> var_378_cast_fp16 = mul(x = k_11_cast_fp16, y = cos)[name = string("op_378_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 1]> var_379_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = string("op_379_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 1]> roped_7_cast_fp16 = add(x = var_378_cast_fp16, y = var_379_cast_fp16)[name = string("roped_7_cast_fp16")];
+            bool k_15_interleave_0 = const()[name = string("k_15_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = concat(axis = var_252, interleave = k_15_interleave_0, values = (k_cache_1, roped_7_cast_fp16))[name = string("k_15_cast_fp16")];
+            bool v_11_interleave_0 = const()[name = string("v_11_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 128, 512]> v_11_cast_fp16 = concat(axis = var_252, interleave = v_11_interleave_0, values = (v_cache_1, v_9_cast_fp16))[name = string("v_11_cast_fp16")];
+            tensor<int32, [4]> var_386_begin_0 = const()[name = string("op_386_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_386_end_0 = const()[name = string("op_386_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_386_end_mask_0 = const()[name = string("op_386_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 128, 511]> new_k_cache_1 = slice_by_index(begin = var_386_begin_0, end = var_386_end_0, end_mask = var_386_end_mask_0, x = k_15_cast_fp16)[name = string("op_386_cast_fp16")];
+            tensor<int32, [4]> var_387_begin_0 = const()[name = string("op_387_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_387_end_0 = const()[name = string("op_387_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_387_end_mask_0 = const()[name = string("op_387_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 128, 511]> new_v_cache_1 = slice_by_index(begin = var_387_begin_0, end = var_387_end_0, end_mask = var_387_end_mask_0, x = v_11_cast_fp16)[name = string("op_387_cast_fp16")];
+            fp16 var_391_to_fp16 = const()[name = string("op_391_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 32, 128, 1]> var_392_cast_fp16 = mul(x = roped_5_cast_fp16, y = var_391_to_fp16)[name = string("op_392_cast_fp16")];
+            bool attn_weights_5_transpose_x_0 = const()[name = string("attn_weights_5_transpose_x_0"), val = bool(true)];
+            bool attn_weights_5_transpose_y_0 = const()[name = string("attn_weights_5_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 1, 512]> attn_weights_5_cast_fp16 = matmul(transpose_x = attn_weights_5_transpose_x_0, transpose_y = attn_weights_5_transpose_y_0, x = var_392_cast_fp16, y = k_15_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
+            tensor<fp16, [1, 32, 1, 512]> attn_weights_7_cast_fp16 = add(x = attn_weights_5_cast_fp16, y = mask)[name = string("attn_weights_7_cast_fp16")];
+            tensor<fp16, [1, 32, 1, 512]> var_400_cast_fp16 = softmax(axis = var_260, x = attn_weights_7_cast_fp16)[name = string("op_400_cast_fp16")];
+            bool attn_5_transpose_x_0 = const()[name = string("attn_5_transpose_x_0"), val = bool(false)];
+            bool attn_5_transpose_y_0 = const()[name = string("attn_5_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 32, 128, 1]> attn_5_cast_fp16 = matmul(transpose_x = attn_5_transpose_x_0, transpose_y = attn_5_transpose_y_0, x = v_11_cast_fp16, y = var_400_cast_fp16)[name = string("attn_5_cast_fp16")];
+            tensor<int32, [4]> var_404 = const()[name = string("op_404"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 4096, 1, 1]> input_9_cast_fp16 = reshape(shape = var_404, x = attn_5_cast_fp16)[name = string("input_9_cast_fp16")];
+            tensor<int32, [2]> var_408 = const()[name = string("op_408"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_410 = const()[name = string("op_410"), val = tensor<int32, [2]>([1, 1])];
+            string var_412_pad_type_0 = const()[name = string("op_412_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_412_pad_0 = const()[name = string("op_412_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 1]> var_412_cast_fp16 = conv(dilations = var_410, groups = var_261, pad = var_412_pad_0, pad_type = var_412_pad_type_0, strides = var_408, weight = blocks_1_attn_proj_weight_palettized_cast_fp16, x = input_9_cast_fp16)[name = string("op_412_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303702912)))];
+            tensor<fp16, [1, 4096, 1, 1]> attention_output_3_cast_fp16 = mul(x = var_412_cast_fp16, y = blocks_1_attn_proj_output_scales_to_fp16)[name = string("attention_output_3_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> x_25_cast_fp16 = add(x = attention_output_3_cast_fp16, y = x_15_cast_fp16)[name = string("x_25_cast_fp16")];
+            tensor<int32, [1]> var_431_axes_0 = const()[name = string("op_431_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 1]> var_431_cast_fp16 = squeeze(axes = var_431_axes_0, x = x_25_cast_fp16)[name = string("op_431_cast_fp16")];
+            bool var_433_interleave_0 = const()[name = string("op_433_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 1]> eps_chan_7_to_fp16 = const()[name = string("eps_chan_7_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 1]> var_433_cast_fp16 = concat(axis = var_261, interleave = var_433_interleave_0, values = (var_431_cast_fp16, eps_chan_7_to_fp16))[name = string("op_433_cast_fp16")];
+            tensor<int32, [1]> x_eps_7_axes_0 = const()[name = string("x_eps_7_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4097, 1, 1]> x_eps_7_cast_fp16 = expand_dims(axes = x_eps_7_axes_0, x = var_433_cast_fp16)[name = string("x_eps_7_cast_fp16")];
+            tensor<int32, [1]> norm_x_7_axes_0 = const()[name = string("norm_x_7_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 1, 1]> norm_x_7_cast_fp16 = reduce_l2_norm(axes = norm_x_7_axes_0, keep_dims = var_265, x = x_eps_7_cast_fp16)[name = string("norm_x_7_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> x_normed_19_cast_fp16 = real_div(x = x_25_cast_fp16, y = norm_x_7_cast_fp16)[name = string("x_normed_19_cast_fp16")];
+            fp16 var_438_to_fp16 = const()[name = string("op_438_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 1]> x_normed_21_cast_fp16 = mul(x = x_normed_19_cast_fp16, y = var_438_to_fp16)[name = string("x_normed_21_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_2_weight_to_fp16 = const()[name = string("blocks_1_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303711168)))];
+            tensor<fp16, [1, 4096, 1, 1]> input_11_cast_fp16 = mul(x = x_normed_21_cast_fp16, y = blocks_1_norm_2_weight_to_fp16)[name = string("input_11_cast_fp16")];
+            tensor<int32, [2]> var_450 = const()[name = string("op_450"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_452 = const()[name = string("op_452"), val = tensor<int32, [2]>([1, 1])];
+            string var_454_pad_type_0 = const()[name = string("op_454_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_454_pad_0 = const()[name = string("op_454_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 1]> var_454_cast_fp16 = conv(dilations = var_452, groups = var_261, pad = var_454_pad_0, pad_type = var_454_pad_type_0, strides = var_450, weight = blocks_1_mlp_fc_1_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_454_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303719424)))];
+            tensor<fp16, [1, 11008, 1, 1]> input_13_cast_fp16 = mul(x = var_454_cast_fp16, y = blocks_1_mlp_fc_1_output_scales_to_fp16)[name = string("input_13_cast_fp16")];
+            tensor<int32, [2]> var_458 = const()[name = string("op_458"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_460 = const()[name = string("op_460"), val = tensor<int32, [2]>([1, 1])];
+            string var_462_pad_type_0 = const()[name = string("op_462_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_462_pad_0 = const()[name = string("op_462_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 1]> var_462_cast_fp16 = conv(dilations = var_460, groups = var_261, pad = var_462_pad_0, pad_type = var_462_pad_type_0, strides = var_458, weight = blocks_1_mlp_fc_2_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_462_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303741504)))];
+            tensor<fp16, [1, 11008, 1, 1]> x_fc_2_3_cast_fp16 = mul(x = var_462_cast_fp16, y = blocks_1_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_3_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> var_464_cast_fp16 = silu(x = input_13_cast_fp16)[name = string("op_464_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> input_15_cast_fp16 = mul(x = var_464_cast_fp16, y = x_fc_2_3_cast_fp16)[name = string("input_15_cast_fp16")];
+            tensor<int32, [2]> var_468 = const()[name = string("op_468"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_470 = const()[name = string("op_470"), val = tensor<int32, [2]>([1, 1])];
+            string var_472_pad_type_0 = const()[name = string("op_472_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_472_pad_0 = const()[name = string("op_472_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 1]> var_472_cast_fp16 = conv(dilations = var_470, groups = var_261, pad = var_472_pad_0, pad_type = var_472_pad_type_0, strides = var_468, weight = blocks_1_mlp_proj_weight_palettized_cast_fp16, x = input_15_cast_fp16)[name = string("op_472_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_1_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303763584)))];
+            tensor<fp16, [1, 4096, 1, 1]> var_473_cast_fp16 = mul(x = var_472_cast_fp16, y = blocks_1_mlp_proj_output_scales_to_fp16)[name = string("op_473_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> x_29_cast_fp16 = add(x = var_473_cast_fp16, y = x_25_cast_fp16)[name = string("x_29_cast_fp16")];
+            int32 var_484 = const()[name = string("op_484"), val = int32(-1)];
+            int32 var_492 = const()[name = string("op_492"), val = int32(3)];
+            int32 var_493 = const()[name = string("op_493"), val = int32(1)];
+            int32 var_496 = const()[name = string("op_496"), val = int32(-2)];
+            bool var_497 = const()[name = string("op_497"), val = bool(true)];
+            tensor<int32, [1]> var_514_axes_0 = const()[name = string("op_514_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 1]> var_514_cast_fp16 = squeeze(axes = var_514_axes_0, x = x_29_cast_fp16)[name = string("op_514_cast_fp16")];
+            bool var_516_interleave_0 = const()[name = string("op_516_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 1]> eps_chan_9_to_fp16 = const()[name = string("eps_chan_9_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 1]> var_516_cast_fp16 = concat(axis = var_493, interleave = var_516_interleave_0, values = (var_514_cast_fp16, eps_chan_9_to_fp16))[name = string("op_516_cast_fp16")];
+            tensor<int32, [1]> x_eps_9_axes_0 = const()[name = string("x_eps_9_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4097, 1, 1]> x_eps_9_cast_fp16 = expand_dims(axes = x_eps_9_axes_0, x = var_516_cast_fp16)[name = string("x_eps_9_cast_fp16")];
+            tensor<int32, [1]> norm_x_9_axes_0 = const()[name = string("norm_x_9_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 1, 1]> norm_x_9_cast_fp16 = reduce_l2_norm(axes = norm_x_9_axes_0, keep_dims = var_497, x = x_eps_9_cast_fp16)[name = string("norm_x_9_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> x_normed_25_cast_fp16 = real_div(x = x_29_cast_fp16, y = norm_x_9_cast_fp16)[name = string("x_normed_25_cast_fp16")];
+            fp16 var_521_to_fp16 = const()[name = string("op_521_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 1]> x_normed_27_cast_fp16 = mul(x = x_normed_25_cast_fp16, y = var_521_to_fp16)[name = string("x_normed_27_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_2_norm_1_weight_to_fp16 = const()[name = string("blocks_2_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303771840)))];
+            tensor<fp16, [1, 4096, 1, 1]> x_33_cast_fp16 = mul(x = x_normed_27_cast_fp16, y = blocks_2_norm_1_weight_to_fp16)[name = string("x_33_cast_fp16")];
+            tensor<int32, [2]> var_536 = const()[name = string("op_536"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_538 = const()[name = string("op_538"), val = tensor<int32, [2]>([1, 1])];
+            string var_540_pad_type_0 = const()[name = string("op_540_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_540_pad_0 = const()[name = string("op_540_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 1]> var_540_cast_fp16 = conv(dilations = var_538, groups = var_493, pad = var_540_pad_0, pad_type = var_540_pad_type_0, strides = var_536, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_540_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303780096)))];
+            tensor<fp16, [1, 4096, 1, 1]> q_13_cast_fp16 = mul(x = var_540_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = string("q_13_cast_fp16")];
+            tensor<int32, [2]> var_544 = const()[name = string("op_544"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_546 = const()[name = string("op_546"), val = tensor<int32, [2]>([1, 1])];
+            string var_548_pad_type_0 = const()[name = string("op_548_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_548_pad_0 = const()[name = string("op_548_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 1]> var_548_cast_fp16 = conv(dilations = var_546, groups = var_493, pad = var_548_pad_0, pad_type = var_548_pad_type_0, strides = var_544, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_548_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303788352)))];
+            tensor<fp16, [1, 4096, 1, 1]> k_17_cast_fp16 = mul(x = var_548_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = string("k_17_cast_fp16")];
+            tensor<int32, [2]> var_552 = const()[name = string("op_552"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_554 = const()[name = string("op_554"), val = tensor<int32, [2]>([1, 1])];
+            string var_556_pad_type_0 = const()[name = string("op_556_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_556_pad_0 = const()[name = string("op_556_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 1]> var_556_cast_fp16 = conv(dilations = var_554, groups = var_493, pad = var_556_pad_0, pad_type = var_556_pad_type_0, strides = var_552, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_556_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303796608)))];
+            tensor<fp16, [1, 4096, 1, 1]> v_13_cast_fp16 = mul(x = var_556_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = string("v_13_cast_fp16")];
+            tensor<int32, [4]> var_558 = const()[name = string("op_558"), val = tensor<int32, [4]>([1, 32, 128, 1])];
+            tensor<fp16, [1, 32, 128, 1]> q_15_cast_fp16 = reshape(shape = var_558, x = q_13_cast_fp16)[name = string("q_15_cast_fp16")];
+            tensor<int32, [4]> var_560 = const()[name = string("op_560"), val = tensor<int32, [4]>([1, 32, 128, 1])];
+            tensor<fp16, [1, 32, 128, 1]> k_19_cast_fp16 = reshape(shape = var_560, x = k_17_cast_fp16)[name = string("k_19_cast_fp16")];
+            tensor<int32, [4]> var_562 = const()[name = string("op_562"), val = tensor<int32, [4]>([1, 32, 128, 1])];
+            tensor<fp16, [1, 32, 128, 1]> v_15_cast_fp16 = reshape(shape = var_562, x = v_13_cast_fp16)[name = string("v_15_cast_fp16")];
+            tensor<int32, [4]> var_574_begin_0 = const()[name = string("op_574_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_574_end_0 = const()[name = string("op_574_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
+            tensor<bool, [4]> var_574_end_mask_0 = const()[name = string("op_574_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 1]> var_574_cast_fp16 = slice_by_index(begin = var_574_begin_0, end = var_574_end_0, end_mask = var_574_end_mask_0, x = q_15_cast_fp16)[name = string("op_574_cast_fp16")];
+            tensor<int32, [4]> var_580_begin_0 = const()[name = string("op_580_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_580_end_0 = const()[name = string("op_580_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
+            tensor<bool, [4]> var_580_end_mask_0 = const()[name = string("op_580_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 1]> var_580_cast_fp16 = slice_by_index(begin = var_580_begin_0, end = var_580_end_0, end_mask = var_580_end_mask_0, x = q_15_cast_fp16)[name = string("op_580_cast_fp16")];
+            fp16 const_32_promoted_to_fp16 = const()[name = string("const_32_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 32, 64, 1]> var_582_cast_fp16 = mul(x = var_580_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_582_cast_fp16")];
+            bool rotated_9_interleave_0 = const()[name = string("rotated_9_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 128, 1]> rotated_9_cast_fp16 = concat(axis = var_496, interleave = rotated_9_interleave_0, values = (var_582_cast_fp16, var_574_cast_fp16))[name = string("rotated_9_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 1]> var_585_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = string("op_585_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 1]> var_586_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = string("op_586_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 1]> roped_9_cast_fp16 = add(x = var_585_cast_fp16, y = var_586_cast_fp16)[name = string("roped_9_cast_fp16")];
+            tensor<int32, [4]> var_599_begin_0 = const()[name = string("op_599_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_599_end_0 = const()[name = string("op_599_end_0"), val = tensor<int32, [4]>([1, 32, 64, 1])];
+            tensor<bool, [4]> var_599_end_mask_0 = const()[name = string("op_599_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 1]> var_599_cast_fp16 = slice_by_index(begin = var_599_begin_0, end = var_599_end_0, end_mask = var_599_end_mask_0, x = k_19_cast_fp16)[name = string("op_599_cast_fp16")];
+            tensor<int32, [4]> var_605_begin_0 = const()[name = string("op_605_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_605_end_0 = const()[name = string("op_605_end_0"), val = tensor<int32, [4]>([1, 32, 128, 1])];
+            tensor<bool, [4]> var_605_end_mask_0 = const()[name = string("op_605_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 1]> var_605_cast_fp16 = slice_by_index(begin = var_605_begin_0, end = var_605_end_0, end_mask = var_605_end_mask_0, x = k_19_cast_fp16)[name = string("op_605_cast_fp16")];
+            fp16 const_34_promoted_to_fp16 = const()[name = string("const_34_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 32, 64, 1]> var_607_cast_fp16 = mul(x = var_605_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_607_cast_fp16")];
+            bool rotated_interleave_0 = const()[name = string("rotated_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 128, 1]> rotated_cast_fp16 = concat(axis = var_496, interleave = rotated_interleave_0, values = (var_607_cast_fp16, var_599_cast_fp16))[name = string("rotated_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 1]> var_610_cast_fp16 = mul(x = k_19_cast_fp16, y = cos)[name = string("op_610_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 1]> var_611_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_611_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 1]> roped_cast_fp16 = add(x = var_610_cast_fp16, y = var_611_cast_fp16)[name = string("roped_cast_fp16")];
+            bool k_interleave_0 = const()[name = string("k_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 128, 512]> k_cast_fp16 = concat(axis = var_484, interleave = k_interleave_0, values = (k_cache_2, roped_cast_fp16))[name = string("k_cast_fp16")];
+            bool v_interleave_0 = const()[name = string("v_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 128, 512]> v_cast_fp16 = concat(axis = var_484, interleave = v_interleave_0, values = (v_cache_2, v_15_cast_fp16))[name = string("v_cast_fp16")];
+            tensor<int32, [4]> var_618_begin_0 = const()[name = string("op_618_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_618_end_0 = const()[name = string("op_618_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_618_end_mask_0 = const()[name = string("op_618_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 128, 511]> new_k_cache_2 = slice_by_index(begin = var_618_begin_0, end = var_618_end_0, end_mask = var_618_end_mask_0, x = k_cast_fp16)[name = string("op_618_cast_fp16")];
+            tensor<int32, [4]> var_619_begin_0 = const()[name = string("op_619_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_619_end_0 = const()[name = string("op_619_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_619_end_mask_0 = const()[name = string("op_619_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 128, 511]> new_v_cache_2 = slice_by_index(begin = var_619_begin_0, end = var_619_end_0, end_mask = var_619_end_mask_0, x = v_cast_fp16)[name = string("op_619_cast_fp16")];
+            fp16 var_623_to_fp16 = const()[name = string("op_623_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 32, 128, 1]> var_624_cast_fp16 = mul(x = roped_9_cast_fp16, y = var_623_to_fp16)[name = string("op_624_cast_fp16")];
+            bool attn_weights_9_transpose_x_0 = const()[name = string("attn_weights_9_transpose_x_0"), val = bool(true)];
+            bool attn_weights_9_transpose_y_0 = const()[name = string("attn_weights_9_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 1, 512]> attn_weights_9_cast_fp16 = matmul(transpose_x = attn_weights_9_transpose_x_0, transpose_y = attn_weights_9_transpose_y_0, x = var_624_cast_fp16, y = k_cast_fp16)[name = string("attn_weights_9_cast_fp16")];
+            tensor<fp16, [1, 32, 1, 512]> attn_weights_cast_fp16 = add(x = attn_weights_9_cast_fp16, y = mask)[name = string("attn_weights_cast_fp16")];
+            tensor<fp16, [1, 32, 1, 512]> var_632_cast_fp16 = softmax(axis = var_492, x = attn_weights_cast_fp16)[name = string("op_632_cast_fp16")];
+            bool attn_9_transpose_x_0 = const()[name = string("attn_9_transpose_x_0"), val = bool(false)];
+            bool attn_9_transpose_y_0 = const()[name = string("attn_9_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 32, 128, 1]> attn_9_cast_fp16 = matmul(transpose_x = attn_9_transpose_x_0, transpose_y = attn_9_transpose_y_0, x = v_cast_fp16, y = var_632_cast_fp16)[name = string("attn_9_cast_fp16")];
+            tensor<int32, [4]> var_636 = const()[name = string("op_636"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 4096, 1, 1]> input_17_cast_fp16 = reshape(shape = var_636, x = attn_9_cast_fp16)[name = string("input_17_cast_fp16")];
+            tensor<int32, [2]> var_640 = const()[name = string("op_640"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_642 = const()[name = string("op_642"), val = tensor<int32, [2]>([1, 1])];
+            string var_644_pad_type_0 = const()[name = string("op_644_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_644_pad_0 = const()[name = string("op_644_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 1]> var_644_cast_fp16 = conv(dilations = var_642, groups = var_493, pad = var_644_pad_0, pad_type = var_644_pad_type_0, strides = var_640, weight = blocks_2_attn_proj_weight_palettized_cast_fp16, x = input_17_cast_fp16)[name = string("op_644_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303804864)))];
+            tensor<fp16, [1, 4096, 1, 1]> attention_output_cast_fp16 = mul(x = var_644_cast_fp16, y = blocks_2_attn_proj_output_scales_to_fp16)[name = string("attention_output_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> x_39_cast_fp16 = add(x = attention_output_cast_fp16, y = x_29_cast_fp16)[name = string("x_39_cast_fp16")];
+            tensor<int32, [1]> var_663_axes_0 = const()[name = string("op_663_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 1]> var_663_cast_fp16 = squeeze(axes = var_663_axes_0, x = x_39_cast_fp16)[name = string("op_663_cast_fp16")];
+            bool var_665_interleave_0 = const()[name = string("op_665_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 1]> eps_chan_to_fp16 = const()[name = string("eps_chan_to_fp16"), val = tensor<fp16, [1, 1, 1]>([[[0x1.9e8p-3]]])];
+            tensor<fp16, [1, 4097, 1]> var_665_cast_fp16 = concat(axis = var_493, interleave = var_665_interleave_0, values = (var_663_cast_fp16, eps_chan_to_fp16))[name = string("op_665_cast_fp16")];
+            tensor<int32, [1]> x_eps_axes_0 = const()[name = string("x_eps_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4097, 1, 1]> x_eps_cast_fp16 = expand_dims(axes = x_eps_axes_0, x = var_665_cast_fp16)[name = string("x_eps_cast_fp16")];
+            tensor<int32, [1]> norm_x_axes_0 = const()[name = string("norm_x_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 1, 1]> norm_x_cast_fp16 = reduce_l2_norm(axes = norm_x_axes_0, keep_dims = var_497, x = x_eps_cast_fp16)[name = string("norm_x_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> x_normed_31_cast_fp16 = real_div(x = x_39_cast_fp16, y = norm_x_cast_fp16)[name = string("x_normed_31_cast_fp16")];
+            fp16 var_670_to_fp16 = const()[name = string("op_670_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 1]> x_normed_33_cast_fp16 = mul(x = x_normed_31_cast_fp16, y = var_670_to_fp16)[name = string("x_normed_33_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_2_norm_2_weight_to_fp16 = const()[name = string("blocks_2_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303813120)))];
+            tensor<fp16, [1, 4096, 1, 1]> input_19_cast_fp16 = mul(x = x_normed_33_cast_fp16, y = blocks_2_norm_2_weight_to_fp16)[name = string("input_19_cast_fp16")];
+            tensor<int32, [2]> var_682 = const()[name = string("op_682"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_684 = const()[name = string("op_684"), val = tensor<int32, [2]>([1, 1])];
+            string var_686_pad_type_0 = const()[name = string("op_686_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_686_pad_0 = const()[name = string("op_686_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 1]> var_686_cast_fp16 = conv(dilations = var_684, groups = var_493, pad = var_686_pad_0, pad_type = var_686_pad_type_0, strides = var_682, weight = blocks_2_mlp_fc_1_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_686_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303821376)))];
+            tensor<fp16, [1, 11008, 1, 1]> input_21_cast_fp16 = mul(x = var_686_cast_fp16, y = blocks_2_mlp_fc_1_output_scales_to_fp16)[name = string("input_21_cast_fp16")];
+            tensor<int32, [2]> var_690 = const()[name = string("op_690"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_692 = const()[name = string("op_692"), val = tensor<int32, [2]>([1, 1])];
+            string var_694_pad_type_0 = const()[name = string("op_694_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_694_pad_0 = const()[name = string("op_694_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 1]> var_694_cast_fp16 = conv(dilations = var_692, groups = var_493, pad = var_694_pad_0, pad_type = var_694_pad_type_0, strides = var_690, weight = blocks_2_mlp_fc_2_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_694_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303843456)))];
+            tensor<fp16, [1, 11008, 1, 1]> x_fc_2_cast_fp16 = mul(x = var_694_cast_fp16, y = blocks_2_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> var_696_cast_fp16 = silu(x = input_21_cast_fp16)[name = string("op_696_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> input_cast_fp16 = mul(x = var_696_cast_fp16, y = x_fc_2_cast_fp16)[name = string("input_cast_fp16")];
+            tensor<int32, [2]> var_700 = const()[name = string("op_700"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_702 = const()[name = string("op_702"), val = tensor<int32, [2]>([1, 1])];
+            string var_704_pad_type_0 = const()[name = string("op_704_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_704_pad_0 = const()[name = string("op_704_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 1]> var_704_cast_fp16 = conv(dilations = var_702, groups = var_493, pad = var_704_pad_0, pad_type = var_704_pad_type_0, strides = var_700, weight = blocks_2_mlp_proj_weight_palettized_cast_fp16, x = input_cast_fp16)[name = string("op_704_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_2_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303865536)))];
+            tensor<fp16, [1, 4096, 1, 1]> var_705_cast_fp16 = mul(x = var_704_cast_fp16, y = blocks_2_mlp_proj_output_scales_to_fp16)[name = string("op_705_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> new_x = add(x = var_705_cast_fp16, y = x_39_cast_fp16)[name = string("op_706_cast_fp16")];
+        } -> (new_x, new_k_cache_0, new_k_cache_1, new_k_cache_2, new_v_cache_0, new_v_cache_1, new_v_cache_2);
+    func input_512_context_512<ios18>(tensor<fp16, [128, 512]> cos, tensor<fp16, [1, 1, 512, 512]> mask, tensor<fp16, [128, 512]> sin, tensor<fp16, [1, 4096, 1, 512]> x) {
+            tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_q_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8388736))))[name = string("blocks_0_attn_q_proj_weight_palettized_cast_fp16")];
+            tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_k_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8388864))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(16777536))))[name = string("blocks_0_attn_k_proj_weight_palettized_cast_fp16")];
+            tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_v_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(16777664))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(25166336))))[name = string("blocks_0_attn_v_proj_weight_palettized_cast_fp16")];
+            tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(25166464))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(33555136))))[name = string("blocks_0_attn_proj_weight_palettized_cast_fp16")];
+            tensor<fp16, [11008, 4096, 1, 1]> blocks_0_mlp_fc_1_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [11008, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(33555264))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(56099712))))[name = string("blocks_0_mlp_fc_1_weight_palettized_cast_fp16")];
+            tensor<fp16, [11008, 4096, 1, 1]> blocks_0_mlp_fc_2_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [11008, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(56099840))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(78644288))))[name = string("blocks_0_mlp_fc_2_weight_palettized_cast_fp16")];
+            tensor<fp16, [4096, 11008, 1, 1]> blocks_0_mlp_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(78644416))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(101188864))))[name = string("blocks_0_mlp_proj_weight_palettized_cast_fp16")];
+            tensor<fp16, [4096, 4096, 1, 1]> blocks_1_attn_q_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(101188992))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(109577664))))[name = string("blocks_1_attn_q_proj_weight_palettized_cast_fp16")];
+            tensor<fp16, [4096, 4096, 1, 1]> blocks_1_attn_k_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(109577792))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(117966464))))[name = string("blocks_1_attn_k_proj_weight_palettized_cast_fp16")];
+            tensor<fp16, [4096, 4096, 1, 1]> blocks_1_attn_v_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(117966592))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(126355264))))[name = string("blocks_1_attn_v_proj_weight_palettized_cast_fp16")];
+            tensor<fp16, [4096, 4096, 1, 1]> blocks_1_attn_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(126355392))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(134744064))))[name = string("blocks_1_attn_proj_weight_palettized_cast_fp16")];
+            tensor<fp16, [11008, 4096, 1, 1]> blocks_1_mlp_fc_1_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [11008, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(134744192))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(157288640))))[name = string("blocks_1_mlp_fc_1_weight_palettized_cast_fp16")];
+            tensor<fp16, [11008, 4096, 1, 1]> blocks_1_mlp_fc_2_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [11008, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(157288768))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(179833216))))[name = string("blocks_1_mlp_fc_2_weight_palettized_cast_fp16")];
+            tensor<fp16, [4096, 11008, 1, 1]> blocks_1_mlp_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(179833344))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202377792))))[name = string("blocks_1_mlp_proj_weight_palettized_cast_fp16")];
+            tensor<fp16, [4096, 4096, 1, 1]> blocks_2_attn_q_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202377920))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(210766592))))[name = string("blocks_2_attn_q_proj_weight_palettized_cast_fp16")];
+            tensor<fp16, [4096, 4096, 1, 1]> blocks_2_attn_k_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(210766720))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(219155392))))[name = string("blocks_2_attn_k_proj_weight_palettized_cast_fp16")];
+            tensor<fp16, [4096, 4096, 1, 1]> blocks_2_attn_v_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(219155520))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(227544192))))[name = string("blocks_2_attn_v_proj_weight_palettized_cast_fp16")];
+            tensor<fp16, [4096, 4096, 1, 1]> blocks_2_attn_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(227544320))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(235932992))))[name = string("blocks_2_attn_proj_weight_palettized_cast_fp16")];
+            tensor<fp16, [11008, 4096, 1, 1]> blocks_2_mlp_fc_1_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [11008, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(235933120))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(258477568))))[name = string("blocks_2_mlp_fc_1_weight_palettized_cast_fp16")];
+            tensor<fp16, [11008, 4096, 1, 1]> blocks_2_mlp_fc_2_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [11008, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(258477696))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(281022144))))[name = string("blocks_2_mlp_fc_2_weight_palettized_cast_fp16")];
+            tensor<fp16, [4096, 11008, 1, 1]> blocks_2_mlp_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(281022272))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303566720))))[name = string("blocks_2_mlp_proj_weight_palettized_cast_fp16")];
+            int32 var_24 = const()[name = string("op_24"), val = int32(3)];
+            int32 var_25 = const()[name = string("op_25"), val = int32(1)];
+            int32 var_28 = const()[name = string("op_28"), val = int32(-2)];
+            bool var_29 = const()[name = string("op_29"), val = bool(true)];
+            tensor<int32, [1]> var_47_axes_0 = const()[name = string("op_47_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 512]> var_47_cast_fp16 = squeeze(axes = var_47_axes_0, x = x)[name = string("op_47_cast_fp16")];
+            bool var_49_interleave_0 = const()[name = string("op_49_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 512]> eps_chan_1_to_fp16 = const()[name = string("eps_chan_1_to_fp16"), val = tensor<fp16, [1, 1, 512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303566848)))];
+            tensor<fp16, [1, 4097, 512]> var_49_cast_fp16 = concat(axis = var_25, interleave = var_49_interleave_0, values = (var_47_cast_fp16, eps_chan_1_to_fp16))[name = string("op_49_cast_fp16")];
+            tensor<int32, [1]> x_eps_1_axes_0 = const()[name = string("x_eps_1_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4097, 1, 512]> x_eps_1_cast_fp16 = expand_dims(axes = x_eps_1_axes_0, x = var_49_cast_fp16)[name = string("x_eps_1_cast_fp16")];
+            tensor<int32, [1]> norm_x_1_axes_0 = const()[name = string("norm_x_1_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 1, 512]> norm_x_1_cast_fp16 = reduce_l2_norm(axes = norm_x_1_axes_0, keep_dims = var_29, x = x_eps_1_cast_fp16)[name = string("norm_x_1_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> x_normed_1_cast_fp16 = real_div(x = x, y = norm_x_1_cast_fp16)[name = string("x_normed_1_cast_fp16")];
+            fp16 var_54_to_fp16 = const()[name = string("op_54_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 512]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_54_to_fp16)[name = string("x_normed_3_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_1_weight_to_fp16 = const()[name = string("blocks_0_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303567936)))];
+            tensor<fp16, [1, 4096, 1, 512]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = string("x_5_cast_fp16")];
+            tensor<int32, [2]> var_66 = const()[name = string("op_66"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_68 = const()[name = string("op_68"), val = tensor<int32, [2]>([1, 1])];
+            string var_70_pad_type_0 = const()[name = string("op_70_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_70_pad_0 = const()[name = string("op_70_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_70_cast_fp16 = conv(dilations = var_68, groups = var_25, pad = var_70_pad_0, pad_type = var_70_pad_type_0, strides = var_66, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_70_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303576192)))];
+            tensor<fp16, [1, 4096, 1, 512]> q_1_cast_fp16 = mul(x = var_70_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = string("q_1_cast_fp16")];
+            tensor<int32, [2]> var_74 = const()[name = string("op_74"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_76 = const()[name = string("op_76"), val = tensor<int32, [2]>([1, 1])];
+            string var_78_pad_type_0 = const()[name = string("op_78_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_78_pad_0 = const()[name = string("op_78_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_78_cast_fp16 = conv(dilations = var_76, groups = var_25, pad = var_78_pad_0, pad_type = var_78_pad_type_0, strides = var_74, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_78_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303584448)))];
+            tensor<fp16, [1, 4096, 1, 512]> k_1_cast_fp16 = mul(x = var_78_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = string("k_1_cast_fp16")];
+            tensor<int32, [2]> var_82 = const()[name = string("op_82"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_84 = const()[name = string("op_84"), val = tensor<int32, [2]>([1, 1])];
+            string var_86_pad_type_0 = const()[name = string("op_86_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_86_pad_0 = const()[name = string("op_86_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_86_cast_fp16 = conv(dilations = var_84, groups = var_25, pad = var_86_pad_0, pad_type = var_86_pad_type_0, strides = var_82, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = string("op_86_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303592704)))];
+            tensor<fp16, [1, 4096, 1, 512]> v_1_cast_fp16 = mul(x = var_86_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = string("v_1_cast_fp16")];
+            tensor<int32, [4]> var_88 = const()[name = string("op_88"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> q_3_cast_fp16 = reshape(shape = var_88, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
+            tensor<int32, [4]> var_90 = const()[name = string("op_90"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> k_3_cast_fp16 = reshape(shape = var_90, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
+            tensor<int32, [4]> var_92 = const()[name = string("op_92"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> v_3_cast_fp16 = reshape(shape = var_92, x = v_1_cast_fp16)[name = string("v_3_cast_fp16")];
+            tensor<int32, [4]> var_104_begin_0 = const()[name = string("op_104_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_104_end_0 = const()[name = string("op_104_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_104_end_mask_0 = const()[name = string("op_104_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_104_cast_fp16 = slice_by_index(begin = var_104_begin_0, end = var_104_end_0, end_mask = var_104_end_mask_0, x = q_3_cast_fp16)[name = string("op_104_cast_fp16")];
+            tensor<int32, [4]> var_110_begin_0 = const()[name = string("op_110_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_110_end_0 = const()[name = string("op_110_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_110_end_mask_0 = const()[name = string("op_110_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_110_cast_fp16 = slice_by_index(begin = var_110_begin_0, end = var_110_end_0, end_mask = var_110_end_mask_0, x = q_3_cast_fp16)[name = string("op_110_cast_fp16")];
+            fp16 const_6_promoted_to_fp16 = const()[name = string("const_6_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 32, 64, 512]> var_112_cast_fp16 = mul(x = var_110_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_112_cast_fp16")];
+            bool rotated_1_interleave_0 = const()[name = string("rotated_1_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 128, 512]> rotated_1_cast_fp16 = concat(axis = var_28, interleave = rotated_1_interleave_0, values = (var_112_cast_fp16, var_104_cast_fp16))[name = string("rotated_1_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_115_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = string("op_115_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_116_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = string("op_116_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_1_cast_fp16 = add(x = var_115_cast_fp16, y = var_116_cast_fp16)[name = string("roped_1_cast_fp16")];
+            tensor<int32, [4]> var_129_begin_0 = const()[name = string("op_129_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_129_end_0 = const()[name = string("op_129_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_129_end_mask_0 = const()[name = string("op_129_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_129_cast_fp16 = slice_by_index(begin = var_129_begin_0, end = var_129_end_0, end_mask = var_129_end_mask_0, x = k_3_cast_fp16)[name = string("op_129_cast_fp16")];
+            tensor<int32, [4]> var_135_begin_0 = const()[name = string("op_135_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_135_end_0 = const()[name = string("op_135_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_135_end_mask_0 = const()[name = string("op_135_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_135_cast_fp16 = slice_by_index(begin = var_135_begin_0, end = var_135_end_0, end_mask = var_135_end_mask_0, x = k_3_cast_fp16)[name = string("op_135_cast_fp16")];
+            fp16 const_8_promoted_to_fp16 = const()[name = string("const_8_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 32, 64, 512]> var_137_cast_fp16 = mul(x = var_135_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_137_cast_fp16")];
+            bool rotated_3_interleave_0 = const()[name = string("rotated_3_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 128, 512]> rotated_3_cast_fp16 = concat(axis = var_28, interleave = rotated_3_interleave_0, values = (var_137_cast_fp16, var_129_cast_fp16))[name = string("rotated_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_140_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = string("op_140_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_141_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = string("op_141_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_3_cast_fp16 = add(x = var_140_cast_fp16, y = var_141_cast_fp16)[name = string("roped_3_cast_fp16")];
+            bool q_5_interleave_0 = const()[name = string("q_5_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 128, 512]> q_5_cast_fp16 = concat(axis = var_28, interleave = q_5_interleave_0, values = roped_1_cast_fp16)[name = string("q_5_cast_fp16")];
+            bool k_5_interleave_0 = const()[name = string("k_5_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 128, 512]> k_5_cast_fp16 = concat(axis = var_28, interleave = k_5_interleave_0, values = roped_3_cast_fp16)[name = string("k_5_cast_fp16")];
+            tensor<int32, [4]> var_156_begin_0 = const()[name = string("op_156_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_156_end_0 = const()[name = string("op_156_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_156_end_mask_0 = const()[name = string("op_156_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 128, 511]> new_k_cache_0 = slice_by_index(begin = var_156_begin_0, end = var_156_end_0, end_mask = var_156_end_mask_0, x = k_5_cast_fp16)[name = string("op_156_cast_fp16")];
+            tensor<int32, [4]> var_157_begin_0 = const()[name = string("op_157_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_157_end_0 = const()[name = string("op_157_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_157_end_mask_0 = const()[name = string("op_157_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 128, 511]> new_v_cache_0 = slice_by_index(begin = var_157_begin_0, end = var_157_end_0, end_mask = var_157_end_mask_0, x = v_3_cast_fp16)[name = string("op_157_cast_fp16")];
+            fp16 var_161_to_fp16 = const()[name = string("op_161_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 32, 128, 512]> var_162_cast_fp16 = mul(x = q_5_cast_fp16, y = var_161_to_fp16)[name = string("op_162_cast_fp16")];
+            bool attn_weights_1_transpose_x_0 = const()[name = string("attn_weights_1_transpose_x_0"), val = bool(true)];
+            bool attn_weights_1_transpose_y_0 = const()[name = string("attn_weights_1_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_162_cast_fp16, y = k_5_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = mask)[name = string("attn_weights_3_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> var_170_cast_fp16 = softmax(axis = var_24, x = attn_weights_3_cast_fp16)[name = string("op_170_cast_fp16")];
+            bool attn_1_transpose_x_0 = const()[name = string("attn_1_transpose_x_0"), val = bool(false)];
+            bool attn_1_transpose_y_0 = const()[name = string("attn_1_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 32, 128, 512]> attn_1_cast_fp16 = matmul(transpose_x = attn_1_transpose_x_0, transpose_y = attn_1_transpose_y_0, x = v_3_cast_fp16, y = var_170_cast_fp16)[name = string("attn_1_cast_fp16")];
+            tensor<int32, [4]> var_174 = const()[name = string("op_174"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 4096, 1, 512]> input_1_cast_fp16 = reshape(shape = var_174, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
+            tensor<int32, [2]> var_178 = const()[name = string("op_178"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_180 = const()[name = string("op_180"), val = tensor<int32, [2]>([1, 1])];
+            string var_182_pad_type_0 = const()[name = string("op_182_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_182_pad_0 = const()[name = string("op_182_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_182_cast_fp16 = conv(dilations = var_180, groups = var_25, pad = var_182_pad_0, pad_type = var_182_pad_type_0, strides = var_178, weight = blocks_0_attn_proj_weight_palettized_cast_fp16, x = input_1_cast_fp16)[name = string("op_182_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_0_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303600960)))];
+            tensor<fp16, [1, 4096, 1, 512]> attention_output_1_cast_fp16 = mul(x = var_182_cast_fp16, y = blocks_0_attn_proj_output_scales_to_fp16)[name = string("attention_output_1_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> x_11_cast_fp16 = add(x = attention_output_1_cast_fp16, y = x)[name = string("x_11_cast_fp16")];
+            tensor<int32, [1]> var_201_axes_0 = const()[name = string("op_201_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 512]> var_201_cast_fp16 = squeeze(axes = var_201_axes_0, x = x_11_cast_fp16)[name = string("op_201_cast_fp16")];
+            bool var_203_interleave_0 = const()[name = string("op_203_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 4097, 512]> var_203_cast_fp16 = concat(axis = var_25, interleave = var_203_interleave_0, values = (var_201_cast_fp16, eps_chan_1_to_fp16))[name = string("op_203_cast_fp16")];
+            tensor<int32, [1]> x_eps_3_axes_0 = const()[name = string("x_eps_3_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4097, 1, 512]> x_eps_3_cast_fp16 = expand_dims(axes = x_eps_3_axes_0, x = var_203_cast_fp16)[name = string("x_eps_3_cast_fp16")];
+            tensor<int32, [1]> norm_x_3_axes_0 = const()[name = string("norm_x_3_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 1, 512]> norm_x_3_cast_fp16 = reduce_l2_norm(axes = norm_x_3_axes_0, keep_dims = var_29, x = x_eps_3_cast_fp16)[name = string("norm_x_3_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> x_normed_7_cast_fp16 = real_div(x = x_11_cast_fp16, y = norm_x_3_cast_fp16)[name = string("x_normed_7_cast_fp16")];
+            fp16 var_208_to_fp16 = const()[name = string("op_208_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 512]> x_normed_9_cast_fp16 = mul(x = x_normed_7_cast_fp16, y = var_208_to_fp16)[name = string("x_normed_9_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_2_weight_to_fp16 = const()[name = string("blocks_0_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303609216)))];
+            tensor<fp16, [1, 4096, 1, 512]> input_3_cast_fp16 = mul(x = x_normed_9_cast_fp16, y = blocks_0_norm_2_weight_to_fp16)[name = string("input_3_cast_fp16")];
+            tensor<int32, [2]> var_220 = const()[name = string("op_220"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_222 = const()[name = string("op_222"), val = tensor<int32, [2]>([1, 1])];
+            string var_224_pad_type_0 = const()[name = string("op_224_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_224_pad_0 = const()[name = string("op_224_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 512]> var_224_cast_fp16 = conv(dilations = var_222, groups = var_25, pad = var_224_pad_0, pad_type = var_224_pad_type_0, strides = var_220, weight = blocks_0_mlp_fc_1_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_224_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303617472)))];
+            tensor<fp16, [1, 11008, 1, 512]> input_5_cast_fp16 = mul(x = var_224_cast_fp16, y = blocks_0_mlp_fc_1_output_scales_to_fp16)[name = string("input_5_cast_fp16")];
+            tensor<int32, [2]> var_228 = const()[name = string("op_228"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_230 = const()[name = string("op_230"), val = tensor<int32, [2]>([1, 1])];
+            string var_232_pad_type_0 = const()[name = string("op_232_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_232_pad_0 = const()[name = string("op_232_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 512]> var_232_cast_fp16 = conv(dilations = var_230, groups = var_25, pad = var_232_pad_0, pad_type = var_232_pad_type_0, strides = var_228, weight = blocks_0_mlp_fc_2_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = string("op_232_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303639552)))];
+            tensor<fp16, [1, 11008, 1, 512]> x_fc_2_1_cast_fp16 = mul(x = var_232_cast_fp16, y = blocks_0_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_1_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 512]> var_234_cast_fp16 = silu(x = input_5_cast_fp16)[name = string("op_234_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 512]> input_7_cast_fp16 = mul(x = var_234_cast_fp16, y = x_fc_2_1_cast_fp16)[name = string("input_7_cast_fp16")];
+            tensor<int32, [2]> var_238 = const()[name = string("op_238"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_240 = const()[name = string("op_240"), val = tensor<int32, [2]>([1, 1])];
+            string var_242_pad_type_0 = const()[name = string("op_242_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_242_pad_0 = const()[name = string("op_242_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_242_cast_fp16 = conv(dilations = var_240, groups = var_25, pad = var_242_pad_0, pad_type = var_242_pad_type_0, strides = var_238, weight = blocks_0_mlp_proj_weight_palettized_cast_fp16, x = input_7_cast_fp16)[name = string("op_242_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_0_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_0_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303661632)))];
+            tensor<fp16, [1, 4096, 1, 512]> var_243_cast_fp16 = mul(x = var_242_cast_fp16, y = blocks_0_mlp_proj_output_scales_to_fp16)[name = string("op_243_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> x_15_cast_fp16 = add(x = var_243_cast_fp16, y = x_11_cast_fp16)[name = string("x_15_cast_fp16")];
+            int32 var_262 = const()[name = string("op_262"), val = int32(3)];
+            int32 var_263 = const()[name = string("op_263"), val = int32(1)];
+            int32 var_266 = const()[name = string("op_266"), val = int32(-2)];
+            bool var_267 = const()[name = string("op_267"), val = bool(true)];
+            tensor<int32, [1]> var_284_axes_0 = const()[name = string("op_284_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 512]> var_284_cast_fp16 = squeeze(axes = var_284_axes_0, x = x_15_cast_fp16)[name = string("op_284_cast_fp16")];
+            bool var_286_interleave_0 = const()[name = string("op_286_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 4097, 512]> var_286_cast_fp16 = concat(axis = var_263, interleave = var_286_interleave_0, values = (var_284_cast_fp16, eps_chan_1_to_fp16))[name = string("op_286_cast_fp16")];
+            tensor<int32, [1]> x_eps_5_axes_0 = const()[name = string("x_eps_5_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4097, 1, 512]> x_eps_5_cast_fp16 = expand_dims(axes = x_eps_5_axes_0, x = var_286_cast_fp16)[name = string("x_eps_5_cast_fp16")];
+            tensor<int32, [1]> norm_x_5_axes_0 = const()[name = string("norm_x_5_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 1, 512]> norm_x_5_cast_fp16 = reduce_l2_norm(axes = norm_x_5_axes_0, keep_dims = var_267, x = x_eps_5_cast_fp16)[name = string("norm_x_5_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> x_normed_13_cast_fp16 = real_div(x = x_15_cast_fp16, y = norm_x_5_cast_fp16)[name = string("x_normed_13_cast_fp16")];
+            fp16 var_291_to_fp16 = const()[name = string("op_291_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 512]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_291_to_fp16)[name = string("x_normed_15_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_1_weight_to_fp16 = const()[name = string("blocks_1_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303669888)))];
+            tensor<fp16, [1, 4096, 1, 512]> x_19_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = string("x_19_cast_fp16")];
+            tensor<int32, [2]> var_306 = const()[name = string("op_306"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_308 = const()[name = string("op_308"), val = tensor<int32, [2]>([1, 1])];
+            string var_310_pad_type_0 = const()[name = string("op_310_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_310_pad_0 = const()[name = string("op_310_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_310_cast_fp16 = conv(dilations = var_308, groups = var_263, pad = var_310_pad_0, pad_type = var_310_pad_type_0, strides = var_306, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_310_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303678144)))];
+            tensor<fp16, [1, 4096, 1, 512]> q_7_cast_fp16 = mul(x = var_310_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = string("q_7_cast_fp16")];
+            tensor<int32, [2]> var_314 = const()[name = string("op_314"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_316 = const()[name = string("op_316"), val = tensor<int32, [2]>([1, 1])];
+            string var_318_pad_type_0 = const()[name = string("op_318_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_318_pad_0 = const()[name = string("op_318_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_318_cast_fp16 = conv(dilations = var_316, groups = var_263, pad = var_318_pad_0, pad_type = var_318_pad_type_0, strides = var_314, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_318_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303686400)))];
+            tensor<fp16, [1, 4096, 1, 512]> k_7_cast_fp16 = mul(x = var_318_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = string("k_7_cast_fp16")];
+            tensor<int32, [2]> var_322 = const()[name = string("op_322"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_324 = const()[name = string("op_324"), val = tensor<int32, [2]>([1, 1])];
+            string var_326_pad_type_0 = const()[name = string("op_326_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_326_pad_0 = const()[name = string("op_326_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_326_cast_fp16 = conv(dilations = var_324, groups = var_263, pad = var_326_pad_0, pad_type = var_326_pad_type_0, strides = var_322, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = string("op_326_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303694656)))];
+            tensor<fp16, [1, 4096, 1, 512]> v_5_cast_fp16 = mul(x = var_326_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = string("v_5_cast_fp16")];
+            tensor<int32, [4]> var_328 = const()[name = string("op_328"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> q_9_cast_fp16 = reshape(shape = var_328, x = q_7_cast_fp16)[name = string("q_9_cast_fp16")];
+            tensor<int32, [4]> var_330 = const()[name = string("op_330"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> k_9_cast_fp16 = reshape(shape = var_330, x = k_7_cast_fp16)[name = string("k_9_cast_fp16")];
+            tensor<int32, [4]> var_332 = const()[name = string("op_332"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> v_7_cast_fp16 = reshape(shape = var_332, x = v_5_cast_fp16)[name = string("v_7_cast_fp16")];
+            tensor<int32, [4]> var_344_begin_0 = const()[name = string("op_344_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_344_end_0 = const()[name = string("op_344_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_344_end_mask_0 = const()[name = string("op_344_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_344_cast_fp16 = slice_by_index(begin = var_344_begin_0, end = var_344_end_0, end_mask = var_344_end_mask_0, x = q_9_cast_fp16)[name = string("op_344_cast_fp16")];
+            tensor<int32, [4]> var_350_begin_0 = const()[name = string("op_350_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_350_end_0 = const()[name = string("op_350_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_350_end_mask_0 = const()[name = string("op_350_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_350_cast_fp16 = slice_by_index(begin = var_350_begin_0, end = var_350_end_0, end_mask = var_350_end_mask_0, x = q_9_cast_fp16)[name = string("op_350_cast_fp16")];
+            fp16 const_19_promoted_to_fp16 = const()[name = string("const_19_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 32, 64, 512]> var_352_cast_fp16 = mul(x = var_350_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_352_cast_fp16")];
+            bool rotated_5_interleave_0 = const()[name = string("rotated_5_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 128, 512]> rotated_5_cast_fp16 = concat(axis = var_266, interleave = rotated_5_interleave_0, values = (var_352_cast_fp16, var_344_cast_fp16))[name = string("rotated_5_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_355_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = string("op_355_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_356_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = string("op_356_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_5_cast_fp16 = add(x = var_355_cast_fp16, y = var_356_cast_fp16)[name = string("roped_5_cast_fp16")];
+            tensor<int32, [4]> var_369_begin_0 = const()[name = string("op_369_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_369_end_0 = const()[name = string("op_369_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_369_end_mask_0 = const()[name = string("op_369_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_369_cast_fp16 = slice_by_index(begin = var_369_begin_0, end = var_369_end_0, end_mask = var_369_end_mask_0, x = k_9_cast_fp16)[name = string("op_369_cast_fp16")];
+            tensor<int32, [4]> var_375_begin_0 = const()[name = string("op_375_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_375_end_0 = const()[name = string("op_375_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_375_end_mask_0 = const()[name = string("op_375_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_375_cast_fp16 = slice_by_index(begin = var_375_begin_0, end = var_375_end_0, end_mask = var_375_end_mask_0, x = k_9_cast_fp16)[name = string("op_375_cast_fp16")];
+            fp16 const_21_promoted_to_fp16 = const()[name = string("const_21_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 32, 64, 512]> var_377_cast_fp16 = mul(x = var_375_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_377_cast_fp16")];
+            bool rotated_7_interleave_0 = const()[name = string("rotated_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 128, 512]> rotated_7_cast_fp16 = concat(axis = var_266, interleave = rotated_7_interleave_0, values = (var_377_cast_fp16, var_369_cast_fp16))[name = string("rotated_7_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_380_cast_fp16 = mul(x = k_9_cast_fp16, y = cos)[name = string("op_380_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_381_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = string("op_381_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_7_cast_fp16 = add(x = var_380_cast_fp16, y = var_381_cast_fp16)[name = string("roped_7_cast_fp16")];
+            bool q_11_interleave_0 = const()[name = string("q_11_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 128, 512]> q_11_cast_fp16 = concat(axis = var_266, interleave = q_11_interleave_0, values = roped_5_cast_fp16)[name = string("q_11_cast_fp16")];
+            bool k_11_interleave_0 = const()[name = string("k_11_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 128, 512]> k_11_cast_fp16 = concat(axis = var_266, interleave = k_11_interleave_0, values = roped_7_cast_fp16)[name = string("k_11_cast_fp16")];
+            tensor<int32, [4]> var_396_begin_0 = const()[name = string("op_396_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_396_end_0 = const()[name = string("op_396_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_396_end_mask_0 = const()[name = string("op_396_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 128, 511]> new_k_cache_1 = slice_by_index(begin = var_396_begin_0, end = var_396_end_0, end_mask = var_396_end_mask_0, x = k_11_cast_fp16)[name = string("op_396_cast_fp16")];
+            tensor<int32, [4]> var_397_begin_0 = const()[name = string("op_397_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_397_end_0 = const()[name = string("op_397_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_397_end_mask_0 = const()[name = string("op_397_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 128, 511]> new_v_cache_1 = slice_by_index(begin = var_397_begin_0, end = var_397_end_0, end_mask = var_397_end_mask_0, x = v_7_cast_fp16)[name = string("op_397_cast_fp16")];
+            fp16 var_401_to_fp16 = const()[name = string("op_401_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 32, 128, 512]> var_402_cast_fp16 = mul(x = q_11_cast_fp16, y = var_401_to_fp16)[name = string("op_402_cast_fp16")];
+            bool attn_weights_5_transpose_x_0 = const()[name = string("attn_weights_5_transpose_x_0"), val = bool(true)];
+            bool attn_weights_5_transpose_y_0 = const()[name = string("attn_weights_5_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_5_cast_fp16 = matmul(transpose_x = attn_weights_5_transpose_x_0, transpose_y = attn_weights_5_transpose_y_0, x = var_402_cast_fp16, y = k_11_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_7_cast_fp16 = add(x = attn_weights_5_cast_fp16, y = mask)[name = string("attn_weights_7_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> var_410_cast_fp16 = softmax(axis = var_262, x = attn_weights_7_cast_fp16)[name = string("op_410_cast_fp16")];
+            bool attn_3_transpose_x_0 = const()[name = string("attn_3_transpose_x_0"), val = bool(false)];
+            bool attn_3_transpose_y_0 = const()[name = string("attn_3_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 32, 128, 512]> attn_3_cast_fp16 = matmul(transpose_x = attn_3_transpose_x_0, transpose_y = attn_3_transpose_y_0, x = v_7_cast_fp16, y = var_410_cast_fp16)[name = string("attn_3_cast_fp16")];
+            tensor<int32, [4]> var_414 = const()[name = string("op_414"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 4096, 1, 512]> input_9_cast_fp16 = reshape(shape = var_414, x = attn_3_cast_fp16)[name = string("input_9_cast_fp16")];
+            tensor<int32, [2]> var_418 = const()[name = string("op_418"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_420 = const()[name = string("op_420"), val = tensor<int32, [2]>([1, 1])];
+            string var_422_pad_type_0 = const()[name = string("op_422_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_422_pad_0 = const()[name = string("op_422_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_422_cast_fp16 = conv(dilations = var_420, groups = var_263, pad = var_422_pad_0, pad_type = var_422_pad_type_0, strides = var_418, weight = blocks_1_attn_proj_weight_palettized_cast_fp16, x = input_9_cast_fp16)[name = string("op_422_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_1_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303702912)))];
+            tensor<fp16, [1, 4096, 1, 512]> attention_output_3_cast_fp16 = mul(x = var_422_cast_fp16, y = blocks_1_attn_proj_output_scales_to_fp16)[name = string("attention_output_3_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> x_25_cast_fp16 = add(x = attention_output_3_cast_fp16, y = x_15_cast_fp16)[name = string("x_25_cast_fp16")];
+            tensor<int32, [1]> var_441_axes_0 = const()[name = string("op_441_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 512]> var_441_cast_fp16 = squeeze(axes = var_441_axes_0, x = x_25_cast_fp16)[name = string("op_441_cast_fp16")];
+            bool var_443_interleave_0 = const()[name = string("op_443_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 4097, 512]> var_443_cast_fp16 = concat(axis = var_263, interleave = var_443_interleave_0, values = (var_441_cast_fp16, eps_chan_1_to_fp16))[name = string("op_443_cast_fp16")];
+            tensor<int32, [1]> x_eps_7_axes_0 = const()[name = string("x_eps_7_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4097, 1, 512]> x_eps_7_cast_fp16 = expand_dims(axes = x_eps_7_axes_0, x = var_443_cast_fp16)[name = string("x_eps_7_cast_fp16")];
+            tensor<int32, [1]> norm_x_7_axes_0 = const()[name = string("norm_x_7_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 1, 512]> norm_x_7_cast_fp16 = reduce_l2_norm(axes = norm_x_7_axes_0, keep_dims = var_267, x = x_eps_7_cast_fp16)[name = string("norm_x_7_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> x_normed_19_cast_fp16 = real_div(x = x_25_cast_fp16, y = norm_x_7_cast_fp16)[name = string("x_normed_19_cast_fp16")];
+            fp16 var_448_to_fp16 = const()[name = string("op_448_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 512]> x_normed_21_cast_fp16 = mul(x = x_normed_19_cast_fp16, y = var_448_to_fp16)[name = string("x_normed_21_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_2_weight_to_fp16 = const()[name = string("blocks_1_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303711168)))];
+            tensor<fp16, [1, 4096, 1, 512]> input_11_cast_fp16 = mul(x = x_normed_21_cast_fp16, y = blocks_1_norm_2_weight_to_fp16)[name = string("input_11_cast_fp16")];
+            tensor<int32, [2]> var_460 = const()[name = string("op_460"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_462 = const()[name = string("op_462"), val = tensor<int32, [2]>([1, 1])];
+            string var_464_pad_type_0 = const()[name = string("op_464_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_464_pad_0 = const()[name = string("op_464_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 512]> var_464_cast_fp16 = conv(dilations = var_462, groups = var_263, pad = var_464_pad_0, pad_type = var_464_pad_type_0, strides = var_460, weight = blocks_1_mlp_fc_1_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_464_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303719424)))];
+            tensor<fp16, [1, 11008, 1, 512]> input_13_cast_fp16 = mul(x = var_464_cast_fp16, y = blocks_1_mlp_fc_1_output_scales_to_fp16)[name = string("input_13_cast_fp16")];
+            tensor<int32, [2]> var_468 = const()[name = string("op_468"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_470 = const()[name = string("op_470"), val = tensor<int32, [2]>([1, 1])];
+            string var_472_pad_type_0 = const()[name = string("op_472_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_472_pad_0 = const()[name = string("op_472_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 512]> var_472_cast_fp16 = conv(dilations = var_470, groups = var_263, pad = var_472_pad_0, pad_type = var_472_pad_type_0, strides = var_468, weight = blocks_1_mlp_fc_2_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = string("op_472_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303741504)))];
+            tensor<fp16, [1, 11008, 1, 512]> x_fc_2_3_cast_fp16 = mul(x = var_472_cast_fp16, y = blocks_1_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_3_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 512]> var_474_cast_fp16 = silu(x = input_13_cast_fp16)[name = string("op_474_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 512]> input_15_cast_fp16 = mul(x = var_474_cast_fp16, y = x_fc_2_3_cast_fp16)[name = string("input_15_cast_fp16")];
+            tensor<int32, [2]> var_478 = const()[name = string("op_478"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_480 = const()[name = string("op_480"), val = tensor<int32, [2]>([1, 1])];
+            string var_482_pad_type_0 = const()[name = string("op_482_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_482_pad_0 = const()[name = string("op_482_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_482_cast_fp16 = conv(dilations = var_480, groups = var_263, pad = var_482_pad_0, pad_type = var_482_pad_type_0, strides = var_478, weight = blocks_1_mlp_proj_weight_palettized_cast_fp16, x = input_15_cast_fp16)[name = string("op_482_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_1_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_1_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303763584)))];
+            tensor<fp16, [1, 4096, 1, 512]> var_483_cast_fp16 = mul(x = var_482_cast_fp16, y = blocks_1_mlp_proj_output_scales_to_fp16)[name = string("op_483_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> x_29_cast_fp16 = add(x = var_483_cast_fp16, y = x_25_cast_fp16)[name = string("x_29_cast_fp16")];
+            int32 var_502 = const()[name = string("op_502"), val = int32(3)];
+            int32 var_503 = const()[name = string("op_503"), val = int32(1)];
+            int32 var_506 = const()[name = string("op_506"), val = int32(-2)];
+            bool var_507 = const()[name = string("op_507"), val = bool(true)];
+            tensor<int32, [1]> var_524_axes_0 = const()[name = string("op_524_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 512]> var_524_cast_fp16 = squeeze(axes = var_524_axes_0, x = x_29_cast_fp16)[name = string("op_524_cast_fp16")];
+            bool var_526_interleave_0 = const()[name = string("op_526_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 4097, 512]> var_526_cast_fp16 = concat(axis = var_503, interleave = var_526_interleave_0, values = (var_524_cast_fp16, eps_chan_1_to_fp16))[name = string("op_526_cast_fp16")];
+            tensor<int32, [1]> x_eps_9_axes_0 = const()[name = string("x_eps_9_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4097, 1, 512]> x_eps_9_cast_fp16 = expand_dims(axes = x_eps_9_axes_0, x = var_526_cast_fp16)[name = string("x_eps_9_cast_fp16")];
+            tensor<int32, [1]> norm_x_9_axes_0 = const()[name = string("norm_x_9_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 1, 512]> norm_x_9_cast_fp16 = reduce_l2_norm(axes = norm_x_9_axes_0, keep_dims = var_507, x = x_eps_9_cast_fp16)[name = string("norm_x_9_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> x_normed_25_cast_fp16 = real_div(x = x_29_cast_fp16, y = norm_x_9_cast_fp16)[name = string("x_normed_25_cast_fp16")];
+            fp16 var_531_to_fp16 = const()[name = string("op_531_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 512]> x_normed_27_cast_fp16 = mul(x = x_normed_25_cast_fp16, y = var_531_to_fp16)[name = string("x_normed_27_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_2_norm_1_weight_to_fp16 = const()[name = string("blocks_2_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303771840)))];
+            tensor<fp16, [1, 4096, 1, 512]> x_33_cast_fp16 = mul(x = x_normed_27_cast_fp16, y = blocks_2_norm_1_weight_to_fp16)[name = string("x_33_cast_fp16")];
+            tensor<int32, [2]> var_546 = const()[name = string("op_546"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_548 = const()[name = string("op_548"), val = tensor<int32, [2]>([1, 1])];
+            string var_550_pad_type_0 = const()[name = string("op_550_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_550_pad_0 = const()[name = string("op_550_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_550_cast_fp16 = conv(dilations = var_548, groups = var_503, pad = var_550_pad_0, pad_type = var_550_pad_type_0, strides = var_546, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_550_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_q_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303780096)))];
+            tensor<fp16, [1, 4096, 1, 512]> q_13_cast_fp16 = mul(x = var_550_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = string("q_13_cast_fp16")];
+            tensor<int32, [2]> var_554 = const()[name = string("op_554"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_556 = const()[name = string("op_556"), val = tensor<int32, [2]>([1, 1])];
+            string var_558_pad_type_0 = const()[name = string("op_558_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_558_pad_0 = const()[name = string("op_558_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_558_cast_fp16 = conv(dilations = var_556, groups = var_503, pad = var_558_pad_0, pad_type = var_558_pad_type_0, strides = var_554, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_558_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_k_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303788352)))];
+            tensor<fp16, [1, 4096, 1, 512]> k_13_cast_fp16 = mul(x = var_558_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = string("k_13_cast_fp16")];
+            tensor<int32, [2]> var_562 = const()[name = string("op_562"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_564 = const()[name = string("op_564"), val = tensor<int32, [2]>([1, 1])];
+            string var_566_pad_type_0 = const()[name = string("op_566_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_566_pad_0 = const()[name = string("op_566_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_566_cast_fp16 = conv(dilations = var_564, groups = var_503, pad = var_566_pad_0, pad_type = var_566_pad_type_0, strides = var_562, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = string("op_566_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_v_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303796608)))];
+            tensor<fp16, [1, 4096, 1, 512]> v_9_cast_fp16 = mul(x = var_566_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = string("v_9_cast_fp16")];
+            tensor<int32, [4]> var_568 = const()[name = string("op_568"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> q_15_cast_fp16 = reshape(shape = var_568, x = q_13_cast_fp16)[name = string("q_15_cast_fp16")];
+            tensor<int32, [4]> var_570 = const()[name = string("op_570"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = reshape(shape = var_570, x = k_13_cast_fp16)[name = string("k_15_cast_fp16")];
+            tensor<int32, [4]> var_572 = const()[name = string("op_572"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<fp16, [1, 32, 128, 512]> v_cast_fp16 = reshape(shape = var_572, x = v_9_cast_fp16)[name = string("v_cast_fp16")];
+            tensor<int32, [4]> var_584_begin_0 = const()[name = string("op_584_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_584_end_0 = const()[name = string("op_584_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_584_end_mask_0 = const()[name = string("op_584_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_584_cast_fp16 = slice_by_index(begin = var_584_begin_0, end = var_584_end_0, end_mask = var_584_end_mask_0, x = q_15_cast_fp16)[name = string("op_584_cast_fp16")];
+            tensor<int32, [4]> var_590_begin_0 = const()[name = string("op_590_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_590_end_0 = const()[name = string("op_590_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_590_end_mask_0 = const()[name = string("op_590_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_590_cast_fp16 = slice_by_index(begin = var_590_begin_0, end = var_590_end_0, end_mask = var_590_end_mask_0, x = q_15_cast_fp16)[name = string("op_590_cast_fp16")];
+            fp16 const_32_promoted_to_fp16 = const()[name = string("const_32_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 32, 64, 512]> var_592_cast_fp16 = mul(x = var_590_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_592_cast_fp16")];
+            bool rotated_9_interleave_0 = const()[name = string("rotated_9_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 128, 512]> rotated_9_cast_fp16 = concat(axis = var_506, interleave = rotated_9_interleave_0, values = (var_592_cast_fp16, var_584_cast_fp16))[name = string("rotated_9_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_595_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = string("op_595_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_596_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = string("op_596_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_9_cast_fp16 = add(x = var_595_cast_fp16, y = var_596_cast_fp16)[name = string("roped_9_cast_fp16")];
+            tensor<int32, [4]> var_609_begin_0 = const()[name = string("op_609_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_609_end_0 = const()[name = string("op_609_end_0"), val = tensor<int32, [4]>([1, 32, 64, 512])];
+            tensor<bool, [4]> var_609_end_mask_0 = const()[name = string("op_609_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_609_cast_fp16 = slice_by_index(begin = var_609_begin_0, end = var_609_end_0, end_mask = var_609_end_mask_0, x = k_15_cast_fp16)[name = string("op_609_cast_fp16")];
+            tensor<int32, [4]> var_615_begin_0 = const()[name = string("op_615_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_615_end_0 = const()[name = string("op_615_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_615_end_mask_0 = const()[name = string("op_615_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 512]> var_615_cast_fp16 = slice_by_index(begin = var_615_begin_0, end = var_615_end_0, end_mask = var_615_end_mask_0, x = k_15_cast_fp16)[name = string("op_615_cast_fp16")];
+            fp16 const_34_promoted_to_fp16 = const()[name = string("const_34_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 32, 64, 512]> var_617_cast_fp16 = mul(x = var_615_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_617_cast_fp16")];
+            bool rotated_interleave_0 = const()[name = string("rotated_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 128, 512]> rotated_cast_fp16 = concat(axis = var_506, interleave = rotated_interleave_0, values = (var_617_cast_fp16, var_609_cast_fp16))[name = string("rotated_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_620_cast_fp16 = mul(x = k_15_cast_fp16, y = cos)[name = string("op_620_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> var_621_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = string("op_621_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 512]> roped_cast_fp16 = add(x = var_620_cast_fp16, y = var_621_cast_fp16)[name = string("roped_cast_fp16")];
+            bool q_interleave_0 = const()[name = string("q_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 128, 512]> q_cast_fp16 = concat(axis = var_506, interleave = q_interleave_0, values = roped_9_cast_fp16)[name = string("q_cast_fp16")];
+            bool k_interleave_0 = const()[name = string("k_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 128, 512]> k_cast_fp16 = concat(axis = var_506, interleave = k_interleave_0, values = roped_cast_fp16)[name = string("k_cast_fp16")];
+            tensor<int32, [4]> var_636_begin_0 = const()[name = string("op_636_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_636_end_0 = const()[name = string("op_636_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_636_end_mask_0 = const()[name = string("op_636_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 128, 511]> new_k_cache_2 = slice_by_index(begin = var_636_begin_0, end = var_636_end_0, end_mask = var_636_end_mask_0, x = k_cast_fp16)[name = string("op_636_cast_fp16")];
+            tensor<int32, [4]> var_637_begin_0 = const()[name = string("op_637_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1])];
+            tensor<int32, [4]> var_637_end_0 = const()[name = string("op_637_end_0"), val = tensor<int32, [4]>([1, 32, 128, 512])];
+            tensor<bool, [4]> var_637_end_mask_0 = const()[name = string("op_637_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 128, 511]> new_v_cache_2 = slice_by_index(begin = var_637_begin_0, end = var_637_end_0, end_mask = var_637_end_mask_0, x = v_cast_fp16)[name = string("op_637_cast_fp16")];
+            fp16 var_641_to_fp16 = const()[name = string("op_641_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 32, 128, 512]> var_642_cast_fp16 = mul(x = q_cast_fp16, y = var_641_to_fp16)[name = string("op_642_cast_fp16")];
+            bool attn_weights_9_transpose_x_0 = const()[name = string("attn_weights_9_transpose_x_0"), val = bool(true)];
+            bool attn_weights_9_transpose_y_0 = const()[name = string("attn_weights_9_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_9_cast_fp16 = matmul(transpose_x = attn_weights_9_transpose_x_0, transpose_y = attn_weights_9_transpose_y_0, x = var_642_cast_fp16, y = k_cast_fp16)[name = string("attn_weights_9_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> attn_weights_cast_fp16 = add(x = attn_weights_9_cast_fp16, y = mask)[name = string("attn_weights_cast_fp16")];
+            tensor<fp16, [1, 32, 512, 512]> var_650_cast_fp16 = softmax(axis = var_502, x = attn_weights_cast_fp16)[name = string("op_650_cast_fp16")];
+            bool attn_5_transpose_x_0 = const()[name = string("attn_5_transpose_x_0"), val = bool(false)];
+            bool attn_5_transpose_y_0 = const()[name = string("attn_5_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 32, 128, 512]> attn_5_cast_fp16 = matmul(transpose_x = attn_5_transpose_x_0, transpose_y = attn_5_transpose_y_0, x = v_cast_fp16, y = var_650_cast_fp16)[name = string("attn_5_cast_fp16")];
+            tensor<int32, [4]> var_654 = const()[name = string("op_654"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 4096, 1, 512]> input_17_cast_fp16 = reshape(shape = var_654, x = attn_5_cast_fp16)[name = string("input_17_cast_fp16")];
+            tensor<int32, [2]> var_658 = const()[name = string("op_658"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_660 = const()[name = string("op_660"), val = tensor<int32, [2]>([1, 1])];
+            string var_662_pad_type_0 = const()[name = string("op_662_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_662_pad_0 = const()[name = string("op_662_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_662_cast_fp16 = conv(dilations = var_660, groups = var_503, pad = var_662_pad_0, pad_type = var_662_pad_type_0, strides = var_658, weight = blocks_2_attn_proj_weight_palettized_cast_fp16, x = input_17_cast_fp16)[name = string("op_662_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_proj_output_scales_to_fp16 = const()[name = string("blocks_2_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303804864)))];
+            tensor<fp16, [1, 4096, 1, 512]> attention_output_cast_fp16 = mul(x = var_662_cast_fp16, y = blocks_2_attn_proj_output_scales_to_fp16)[name = string("attention_output_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> x_39_cast_fp16 = add(x = attention_output_cast_fp16, y = x_29_cast_fp16)[name = string("x_39_cast_fp16")];
+            tensor<int32, [1]> var_681_axes_0 = const()[name = string("op_681_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4096, 512]> var_681_cast_fp16 = squeeze(axes = var_681_axes_0, x = x_39_cast_fp16)[name = string("op_681_cast_fp16")];
+            bool var_683_interleave_0 = const()[name = string("op_683_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 4097, 512]> var_683_cast_fp16 = concat(axis = var_503, interleave = var_683_interleave_0, values = (var_681_cast_fp16, eps_chan_1_to_fp16))[name = string("op_683_cast_fp16")];
+            tensor<int32, [1]> x_eps_axes_0 = const()[name = string("x_eps_axes_0"), val = tensor<int32, [1]>([-2])];
+            tensor<fp16, [1, 4097, 1, 512]> x_eps_cast_fp16 = expand_dims(axes = x_eps_axes_0, x = var_683_cast_fp16)[name = string("x_eps_cast_fp16")];
+            tensor<int32, [1]> norm_x_axes_0 = const()[name = string("norm_x_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 1, 512]> norm_x_cast_fp16 = reduce_l2_norm(axes = norm_x_axes_0, keep_dims = var_507, x = x_eps_cast_fp16)[name = string("norm_x_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> x_normed_31_cast_fp16 = real_div(x = x_39_cast_fp16, y = norm_x_cast_fp16)[name = string("x_normed_31_cast_fp16")];
+            fp16 var_688_to_fp16 = const()[name = string("op_688_to_fp16"), val = fp16(0x1p+6)];
+            tensor<fp16, [1, 4096, 1, 512]> x_normed_33_cast_fp16 = mul(x = x_normed_31_cast_fp16, y = var_688_to_fp16)[name = string("x_normed_33_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_2_norm_2_weight_to_fp16 = const()[name = string("blocks_2_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303813120)))];
+            tensor<fp16, [1, 4096, 1, 512]> input_19_cast_fp16 = mul(x = x_normed_33_cast_fp16, y = blocks_2_norm_2_weight_to_fp16)[name = string("input_19_cast_fp16")];
+            tensor<int32, [2]> var_700 = const()[name = string("op_700"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_702 = const()[name = string("op_702"), val = tensor<int32, [2]>([1, 1])];
+            string var_704_pad_type_0 = const()[name = string("op_704_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_704_pad_0 = const()[name = string("op_704_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 512]> var_704_cast_fp16 = conv(dilations = var_702, groups = var_503, pad = var_704_pad_0, pad_type = var_704_pad_type_0, strides = var_700, weight = blocks_2_mlp_fc_1_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_704_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_1_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303821376)))];
+            tensor<fp16, [1, 11008, 1, 512]> input_21_cast_fp16 = mul(x = var_704_cast_fp16, y = blocks_2_mlp_fc_1_output_scales_to_fp16)[name = string("input_21_cast_fp16")];
+            tensor<int32, [2]> var_708 = const()[name = string("op_708"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_710 = const()[name = string("op_710"), val = tensor<int32, [2]>([1, 1])];
+            string var_712_pad_type_0 = const()[name = string("op_712_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_712_pad_0 = const()[name = string("op_712_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 1, 512]> var_712_cast_fp16 = conv(dilations = var_710, groups = var_503, pad = var_712_pad_0, pad_type = var_712_pad_type_0, strides = var_708, weight = blocks_2_mlp_fc_2_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = string("op_712_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_2_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303843456)))];
+            tensor<fp16, [1, 11008, 1, 512]> x_fc_2_cast_fp16 = mul(x = var_712_cast_fp16, y = blocks_2_mlp_fc_2_output_scales_to_fp16)[name = string("x_fc_2_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 512]> var_714_cast_fp16 = silu(x = input_21_cast_fp16)[name = string("op_714_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 512]> input_cast_fp16 = mul(x = var_714_cast_fp16, y = x_fc_2_cast_fp16)[name = string("input_cast_fp16")];
+            tensor<int32, [2]> var_718 = const()[name = string("op_718"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_720 = const()[name = string("op_720"), val = tensor<int32, [2]>([1, 1])];
+            string var_722_pad_type_0 = const()[name = string("op_722_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> var_722_pad_0 = const()[name = string("op_722_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 512]> var_722_cast_fp16 = conv(dilations = var_720, groups = var_503, pad = var_722_pad_0, pad_type = var_722_pad_type_0, strides = var_718, weight = blocks_2_mlp_proj_weight_palettized_cast_fp16, x = input_cast_fp16)[name = string("op_722_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_2_mlp_proj_output_scales_to_fp16 = const()[name = string("blocks_2_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303865536)))];
+            tensor<fp16, [1, 4096, 1, 512]> var_723_cast_fp16 = mul(x = var_722_cast_fp16, y = blocks_2_mlp_proj_output_scales_to_fp16)[name = string("op_723_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 512]> new_x = add(x = var_723_cast_fp16, y = x_39_cast_fp16)[name = string("op_724_cast_fp16")];
+        } -> (new_x, new_k_cache_0, new_k_cache_1, new_k_cache_2, new_v_cache_0, new_v_cache_1, new_v_cache_2);
+}
\ No newline at end of file