|
{ |
|
"measurement": { |
|
"model.layers.0": { |
|
"accuracy": 0.7972698211669922, |
|
"total_bits": 36303648, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.1": { |
|
"accuracy": 0.8311243057250977, |
|
"total_bits": 28563648, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.2": { |
|
"accuracy": 0.8292713165283203, |
|
"total_bits": 28563648, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.3": { |
|
"accuracy": 0.8550562858581543, |
|
"total_bits": 28563648, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.4": { |
|
"accuracy": 0.8643341064453125, |
|
"total_bits": 28563648, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.5": { |
|
"accuracy": 0.8951015472412109, |
|
"total_bits": 34239648, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.6": { |
|
"accuracy": 0.8700046539306641, |
|
"total_bits": 28563648, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.7": { |
|
"accuracy": 0.8551735877990723, |
|
"total_bits": 28563648, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.8": { |
|
"accuracy": 0.8605437278747559, |
|
"total_bits": 28563648, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.9": { |
|
"accuracy": 0.8751955032348633, |
|
"total_bits": 28563648, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.10": { |
|
"accuracy": 0.8752927780151367, |
|
"total_bits": 30627648, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.11": { |
|
"accuracy": 0.9034562110900879, |
|
"total_bits": 38367648, |
|
"q_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.12": { |
|
"accuracy": 0.901634693145752, |
|
"total_bits": 36303648, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.13": { |
|
"accuracy": 0.9090676307678223, |
|
"total_bits": 36303648, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.14": { |
|
"accuracy": 0.9031753540039062, |
|
"total_bits": 36303648, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.15": { |
|
"accuracy": 0.9214200973510742, |
|
"total_bits": 41980320, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.16": { |
|
"accuracy": 0.9262242317199707, |
|
"total_bits": 41980320, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.17": { |
|
"accuracy": 0.9712269306182861, |
|
"total_bits": 51784992, |
|
"q_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.18": { |
|
"accuracy": 0.9706757068634033, |
|
"total_bits": 51784992, |
|
"q_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.19": { |
|
"accuracy": 0.9724410772323608, |
|
"total_bits": 51784992, |
|
"q_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.20": { |
|
"accuracy": 0.9736490249633789, |
|
"total_bits": 51784992, |
|
"q_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.21": { |
|
"accuracy": 0.9740629196166992, |
|
"total_bits": 51784992, |
|
"q_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.22": { |
|
"accuracy": 0.9723896980285645, |
|
"total_bits": 51784992, |
|
"q_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.23": { |
|
"accuracy": 0.9673030376434326, |
|
"total_bits": 51784992, |
|
"q_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
} |
|
} |
|
} |