smpanaro commited on
Commit
07183f4
1 Parent(s): dba673f

Add new joint prefill+generation cache processor

Browse files
cache-processor.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12adcd9eb610f08f550a8914fe8c71a5748b2ad96ca8e099ffe4ad2a40ded079
3
+ size 243
cache-processor.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9b26e05e4ae0f96f84c27589c4f0466340377ba6827456f8fcc6414539b5718
3
+ size 516
cache-processor.mlmodelc/metadata.json ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "metadataOutputVersion" : "3.0",
4
+ "outputSchema" : [
5
+ {
6
+ "hasShapeFlexibility" : "0",
7
+ "isOptional" : "0",
8
+ "dataType" : "Float16",
9
+ "formattedType" : "MultiArray (Float16 1 × 448 × 1 × 4096)",
10
+ "shortDescription" : "",
11
+ "shape" : "[1, 448, 1, 4096]",
12
+ "name" : "updated_k_cache",
13
+ "type" : "MultiArray"
14
+ },
15
+ {
16
+ "hasShapeFlexibility" : "0",
17
+ "isOptional" : "0",
18
+ "dataType" : "Float16",
19
+ "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 448)",
20
+ "shortDescription" : "",
21
+ "shape" : "[1, 4096, 1, 448]",
22
+ "name" : "updated_v_cache",
23
+ "type" : "MultiArray"
24
+ },
25
+ {
26
+ "hasShapeFlexibility" : "0",
27
+ "isOptional" : "0",
28
+ "dataType" : "Float16",
29
+ "formattedType" : "MultiArray (Float16)",
30
+ "shortDescription" : "",
31
+ "shape" : "[]",
32
+ "name" : "ignore_me_im_only_here_so_this_runs_on_the_ane",
33
+ "type" : "MultiArray"
34
+ }
35
+ ],
36
+ "modelParameters" : [
37
+
38
+ ],
39
+ "specificationVersion" : 7,
40
+ "mlProgramOperationTypeHistogram" : {
41
+ "SliceByIndex" : 2,
42
+ "Ios16.mul" : 1,
43
+ "Concat" : 2,
44
+ "Ios16.reduceMin" : 1
45
+ },
46
+ "computePrecision" : "Mixed (Float16, Int32)",
47
+ "isUpdatable" : "0",
48
+ "availability" : {
49
+ "macOS" : "13.0",
50
+ "tvOS" : "16.0",
51
+ "visionOS" : "1.0",
52
+ "watchOS" : "9.0",
53
+ "iOS" : "16.0",
54
+ "macCatalyst" : "16.0"
55
+ },
56
+ "modelType" : {
57
+ "name" : "MLModelType_mlProgram"
58
+ },
59
+ "userDefinedMetadata" : {
60
+ "com.github.apple.coremltools.source_dialect" : "TorchScript",
61
+ "com.github.apple.coremltools.source" : "torch==2.1.0",
62
+ "com.github.apple.coremltools.version" : "8.0b1"
63
+ },
64
+ "inputSchema" : [
65
+ {
66
+ "hasShapeFlexibility" : "0",
67
+ "isOptional" : "0",
68
+ "dataType" : "Float16",
69
+ "formattedType" : "MultiArray (Float16 1 × 448 × 1 × 4096)",
70
+ "shortDescription" : "",
71
+ "shape" : "[1, 448, 1, 4096]",
72
+ "name" : "old_k_cache",
73
+ "type" : "MultiArray"
74
+ },
75
+ {
76
+ "hasShapeFlexibility" : "0",
77
+ "isOptional" : "0",
78
+ "dataType" : "Float16",
79
+ "formattedType" : "MultiArray (Float16 1 × 64 × 1 × 4096)",
80
+ "shortDescription" : "",
81
+ "shape" : "[1, 64, 1, 4096]",
82
+ "name" : "new_k_cache",
83
+ "type" : "MultiArray"
84
+ },
85
+ {
86
+ "hasShapeFlexibility" : "0",
87
+ "isOptional" : "0",
88
+ "dataType" : "Float16",
89
+ "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 448)",
90
+ "shortDescription" : "",
91
+ "shape" : "[1, 4096, 1, 448]",
92
+ "name" : "old_v_cache",
93
+ "type" : "MultiArray"
94
+ },
95
+ {
96
+ "hasShapeFlexibility" : "0",
97
+ "isOptional" : "0",
98
+ "dataType" : "Float16",
99
+ "formattedType" : "MultiArray (Float16 1 × 4096 × 1 × 64)",
100
+ "shortDescription" : "",
101
+ "shape" : "[1, 4096, 1, 64]",
102
+ "name" : "new_v_cache",
103
+ "type" : "MultiArray"
104
+ }
105
+ ],
106
+ "generatedClassName" : "cache_processor",
107
+ "method" : "predict"
108
+ }
109
+ ]
cache-processor.mlmodelc/model.mil ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ program(1.0)
2
+ [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3304.5.2"}, {"coremlc-version", "3304.6.2"}, {"coremltools-component-torch", "2.1.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.0b1"}})]
3
+ {
4
+ func main<ios16>(tensor<fp16, [1, 64, 1, 4096]> new_k_cache, tensor<fp16, [1, 4096, 1, 64]> new_v_cache, tensor<fp16, [1, 448, 1, 4096]> old_k_cache, tensor<fp16, [1, 4096, 1, 448]> old_v_cache) {
5
+ tensor<int32, []> var_6 = const()[name = tensor<string, []>("op_6"), val = tensor<int32, []>(-3)];
6
+ tensor<bool, []> cat_k_1_interleave_0 = const()[name = tensor<string, []>("cat_k_1_interleave_0"), val = tensor<bool, []>(false)];
7
+ tensor<fp16, [1, 512, 1, 4096]> cat_k_1_cast_fp16 = concat(axis = var_6, interleave = cat_k_1_interleave_0, values = (old_k_cache, new_k_cache))[name = tensor<string, []>("cat_k_1_cast_fp16")];
8
+ tensor<int32, []> var_9 = const()[name = tensor<string, []>("op_9"), val = tensor<int32, []>(-1)];
9
+ tensor<bool, []> cat_v_interleave_0 = const()[name = tensor<string, []>("cat_v_interleave_0"), val = tensor<bool, []>(false)];
10
+ tensor<fp16, [1, 4096, 1, 512]> cat_v_cast_fp16 = concat(axis = var_9, interleave = cat_v_interleave_0, values = (old_v_cache, new_v_cache))[name = tensor<string, []>("cat_v_cast_fp16")];
11
+ tensor<int32, [4]> var_20_begin_0 = const()[name = tensor<string, []>("op_20_begin_0"), val = tensor<int32, [4]>([0, 64, 0, 0])];
12
+ tensor<int32, [4]> var_20_end_0 = const()[name = tensor<string, []>("op_20_end_0"), val = tensor<int32, [4]>([1, 512, 1, 4096])];
13
+ tensor<bool, [4]> var_20_end_mask_0 = const()[name = tensor<string, []>("op_20_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
14
+ tensor<fp16, [1, 448, 1, 4096]> updated_k_cache = slice_by_index(begin = var_20_begin_0, end = var_20_end_0, end_mask = var_20_end_mask_0, x = cat_k_1_cast_fp16)[name = tensor<string, []>("op_20_cast_fp16")];
15
+ tensor<int32, [4]> var_50_begin_0 = const()[name = tensor<string, []>("op_50_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 64])];
16
+ tensor<int32, [4]> var_50_end_0 = const()[name = tensor<string, []>("op_50_end_0"), val = tensor<int32, [4]>([1, 4096, 1, 512])];
17
+ tensor<bool, [4]> var_50_end_mask_0 = const()[name = tensor<string, []>("op_50_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
18
+ tensor<fp16, [1, 4096, 1, 448]> updated_v_cache = slice_by_index(begin = var_50_begin_0, end = var_50_end_0, end_mask = var_50_end_mask_0, x = cat_v_cast_fp16)[name = tensor<string, []>("op_50_cast_fp16")];
19
+ tensor<fp16, []> var_51_promoted_to_fp16 = const()[name = tensor<string, []>("op_51_promoted_to_fp16"), val = tensor<fp16, []>(0x1p+1)];
20
+ tensor<fp16, [1, 448, 1, 4096]> prod_cast_fp16 = mul(x = updated_k_cache, y = var_51_promoted_to_fp16)[name = tensor<string, []>("prod_cast_fp16")];
21
+ tensor<bool, []> var_53_keep_dims_0 = const()[name = tensor<string, []>("op_53_keep_dims_0"), val = tensor<bool, []>(false)];
22
+ tensor<fp16, []> ignore_me_im_only_here_so_this_runs_on_the_ane = reduce_min(keep_dims = var_53_keep_dims_0, x = prod_cast_fp16)[name = tensor<string, []>("op_53_cast_fp16")];
23
+ } -> (updated_k_cache, updated_v_cache, ignore_me_im_only_here_so_this_runs_on_the_ane);
24
+ }