Add new joint prefill+generation cache processor

Browse files

Files changed (4) hide show

cache-processor.mlmodelc/analytics/coremldata.bin +3 -0
cache-processor.mlmodelc/coremldata.bin +3 -0
cache-processor.mlmodelc/metadata.json +109 -0
cache-processor.mlmodelc/model.mil +24 -0

cache-processor.mlmodelc/analytics/coremldata.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:12adcd9eb610f08f550a8914fe8c71a5748b2ad96ca8e099ffe4ad2a40ded079
+size 243

cache-processor.mlmodelc/coremldata.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c9b26e05e4ae0f96f84c27589c4f0466340377ba6827456f8fcc6414539b5718
+size 516

cache-processor.mlmodelc/metadata.json ADDED Viewed

	@@ -0,0 +1,109 @@

+[
+  {
+    "metadataOutputVersion" : "3.0",
+    "outputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 448 × 1 × 4096)",
+        "shortDescription" : "",
+        "shape" : "[1, 448, 1, 4096]",
+        "name" : "updated_k_cache",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 448)",
+        "shortDescription" : "",
+        "shape" : "[1, 4096, 1, 448]",
+        "name" : "updated_v_cache",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16)",
+        "shortDescription" : "",
+        "shape" : "[]",
+        "name" : "ignore_me_im_only_here_so_this_runs_on_the_ane",
+        "type" : "MultiArray"
+      }
+    ],
+    "modelParameters" : [
+    ],
+    "specificationVersion" : 7,
+    "mlProgramOperationTypeHistogram" : {
+      "SliceByIndex" : 2,
+      "Ios16.mul" : 1,
+      "Concat" : 2,
+      "Ios16.reduceMin" : 1
+    },
+    "computePrecision" : "Mixed (Float16, Int32)",
+    "isUpdatable" : "0",
+    "availability" : {
+      "macOS" : "13.0",
+      "tvOS" : "16.0",
+      "visionOS" : "1.0",
+      "watchOS" : "9.0",
+      "iOS" : "16.0",
+      "macCatalyst" : "16.0"
+    },
+    "modelType" : {
+      "name" : "MLModelType_mlProgram"
+    },
+    "userDefinedMetadata" : {
+      "com.github.apple.coremltools.source_dialect" : "TorchScript",
+      "com.github.apple.coremltools.source" : "torch==2.1.0",
+      "com.github.apple.coremltools.version" : "8.0b1"
+    },
+    "inputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 448 × 1 × 4096)",
+        "shortDescription" : "",
+        "shape" : "[1, 448, 1, 4096]",
+        "name" : "old_k_cache",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 1 × 4096)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 1, 4096]",
+        "name" : "new_k_cache",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 448)",
+        "shortDescription" : "",
+        "shape" : "[1, 4096, 1, 448]",
+        "name" : "old_v_cache",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 4096, 1, 64]",
+        "name" : "new_v_cache",
+        "type" : "MultiArray"
+      }
+    ],
+    "generatedClassName" : "cache_processor",
+    "method" : "predict"
+  }
+]

cache-processor.mlmodelc/model.mil ADDED Viewed

	@@ -0,0 +1,24 @@

+program(1.0)
+[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3304.5.2"}, {"coremlc-version", "3304.6.2"}, {"coremltools-component-torch", "2.1.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.0b1"}})]
+{
+    func main<ios16>(tensor<fp16, [1, 64, 1, 4096]> new_k_cache, tensor<fp16, [1, 4096, 1, 64]> new_v_cache, tensor<fp16, [1, 448, 1, 4096]> old_k_cache, tensor<fp16, [1, 4096, 1, 448]> old_v_cache) {
+            tensor<int32, []> var_6 = const()[name = tensor<string, []>("op_6"), val = tensor<int32, []>(-3)];
+            tensor<bool, []> cat_k_1_interleave_0 = const()[name = tensor<string, []>("cat_k_1_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 512, 1, 4096]> cat_k_1_cast_fp16 = concat(axis = var_6, interleave = cat_k_1_interleave_0, values = (old_k_cache, new_k_cache))[name = tensor<string, []>("cat_k_1_cast_fp16")];
+            tensor<int32, []> var_9 = const()[name = tensor<string, []>("op_9"), val = tensor<int32, []>(-1)];
+            tensor<bool, []> cat_v_interleave_0 = const()[name = tensor<string, []>("cat_v_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 4096, 1, 512]> cat_v_cast_fp16 = concat(axis = var_9, interleave = cat_v_interleave_0, values = (old_v_cache, new_v_cache))[name = tensor<string, []>("cat_v_cast_fp16")];
+            tensor<int32, [4]> var_20_begin_0 = const()[name = tensor<string, []>("op_20_begin_0"), val = tensor<int32, [4]>([0, 64, 0, 0])];
+            tensor<int32, [4]> var_20_end_0 = const()[name = tensor<string, []>("op_20_end_0"), val = tensor<int32, [4]>([1, 512, 1, 4096])];
+            tensor<bool, [4]> var_20_end_mask_0 = const()[name = tensor<string, []>("op_20_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 448, 1, 4096]> updated_k_cache = slice_by_index(begin = var_20_begin_0, end = var_20_end_0, end_mask = var_20_end_mask_0, x = cat_k_1_cast_fp16)[name = tensor<string, []>("op_20_cast_fp16")];
+            tensor<int32, [4]> var_50_begin_0 = const()[name = tensor<string, []>("op_50_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 64])];
+            tensor<int32, [4]> var_50_end_0 = const()[name = tensor<string, []>("op_50_end_0"), val = tensor<int32, [4]>([1, 4096, 1, 512])];
+            tensor<bool, [4]> var_50_end_mask_0 = const()[name = tensor<string, []>("op_50_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 4096, 1, 448]> updated_v_cache = slice_by_index(begin = var_50_begin_0, end = var_50_end_0, end_mask = var_50_end_mask_0, x = cat_v_cast_fp16)[name = tensor<string, []>("op_50_cast_fp16")];
+            tensor<fp16, []> var_51_promoted_to_fp16 = const()[name = tensor<string, []>("op_51_promoted_to_fp16"), val = tensor<fp16, []>(0x1p+1)];
+            tensor<fp16, [1, 448, 1, 4096]> prod_cast_fp16 = mul(x = updated_k_cache, y = var_51_promoted_to_fp16)[name = tensor<string, []>("prod_cast_fp16")];
+            tensor<bool, []> var_53_keep_dims_0 = const()[name = tensor<string, []>("op_53_keep_dims_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, []> ignore_me_im_only_here_so_this_runs_on_the_ane = reduce_min(keep_dims = var_53_keep_dims_0, x = prod_cast_fp16)[name = tensor<string, []>("op_53_cast_fp16")];
+        } -> (updated_k_cache, updated_v_cache, ignore_me_im_only_here_so_this_runs_on_the_ane);
+}