# gpt2 / FALLBACK.yaml
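# d-matrix "FALLBACK" numerics configuration for a GPT-2-style causal LM.
# Each top-level key names a module in the Hugging Face model tree
# (lm_head, transformer.h.<i>.*, transformer.ln_f, ...) and the nested keys
# describe how that module is executed. The glossary below is a best-effort
# reading of the d-matrix format notation, not an authoritative spec:
#   - SAME                  : keep the tensor in its incoming/original format (no cast)
#   - DENSE                 : no weight sparsity is applied
#   - BFP[8|8]{64,-1}(SN)   : block floating point, apparently an 8-bit mantissa with an
#                             8-bit shared exponent over blocks of 64 elements along the
#                             given dimension (-1 for activations, 0 for weights); the
#                             (SN) suffix presumably selects a rounding/scaling variant
#   - approximation_function: NONE means the exact kernel; SOFTMAX(base2,float16),
#                             LAYERNORM(fallback,4,float16), and GELU(poly2,float16)
#                             appear to select approximate float16 kernels for the
#                             corresponding nonlinearities
# The lm_head projection below is left entirely in SAME formats, i.e. it is not
# quantized by this configuration.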
lm_head:
accum_format: SAME
approximation_function: NONE
input_format: SAME
instance: Linear
output_format: SAME
weight_format: SAME
weight_sparseness: DENSE
transformer.drop:
approximation_function: NONE
input_format: SAME
instance: Dropout
output_format: SAME
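# Per-block policy for transformer block 0 (blocks 1-5 repeat the same settings).
# The linear projections are HFTransformersConv1D wrappers, presumably around the
# Conv1D layers GPT-2 uses for its projections: they take BFP8 activations and BFP8
# weights, while biases and most outputs stay in SAME format, and the nonlinearities
# use the approximate float16 kernels noted above. attn_dropout re-casts the attention
# probabilities to BFP8, presumably so both operands of the probs @ V matmul are in
# block floating point.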
transformer.h.0.attn.attn_dropout:
approximation_function: NONE
input_format: SAME
instance: Dropout
output_format: BFP[8|8]{64,-1}(SN)
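# c_attn is the fused Q/K/V projection: BFP8 inputs, BFP8 weights (blocked along
# dim 0), and a BFP8 output, so the subsequent Q.K^T attention matmul presumably
# also runs on block floating-point operands.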
transformer.h.0.attn.c_attn:
approximation_function: NONE
bias_format: SAME
input_format: BFP[8|8]{64,-1}(SN)
instance: HFTransformersConv1D
output_format: BFP[8|8]{64,-1}(SN)
weight_format: BFP[8|8]{64,0}(SN)
weight_sparseness: DENSE
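# c_proj is the attention output projection; its output stays in SAME format, so
# the residual addition is presumably carried out in the original precision.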
transformer.h.0.attn.c_proj:
approximation_function: NONE
bias_format: SAME
input_format: BFP[8|8]{64,-1}(SN)
instance: HFTransformersConv1D
output_format: SAME
weight_format: BFP[8|8]{64,0}(SN)
weight_sparseness: DENSE
transformer.h.0.attn.resid_dropout:
approximation_function: NONE
input_format: SAME
instance: Dropout
output_format: SAME
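# Attention softmax: approximated with what appears to be a base-2 exponential
# evaluated in float16, per SOFTMAX(base2,float16).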
transformer.h.0.attn.softmax:
approximation_function: SOFTMAX(base2,float16)
input_format: SAME
instance: Softmax
output_format: SAME
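# Pre-attention (ln_1) and pre-MLP (ln_2) LayerNorms use the
# LAYERNORM(fallback,4,float16) approximation; the "4" is presumably an internal
# parameter of that fallback kernel.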
transformer.h.0.ln_1:
approximation_function: LAYERNORM(fallback,4,float16)
bias_format: SAME
input_format: SAME
instance: LayerNorm
output_format: SAME
weight_format: SAME
transformer.h.0.ln_2:
approximation_function: LAYERNORM(fallback,4,float16)
bias_format: SAME
input_format: SAME
instance: LayerNorm
output_format: SAME
weight_format: SAME
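# MLP activation: GELU approximated by what appears to be a low-order polynomial
# (poly2) evaluated in float16.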
transformer.h.0.mlp.act:
approximation_function: GELU(poly2,float16)
input_format: SAME
instance: GELU
output_format: SAME
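# MLP projections (c_fc expand, c_proj contract): BFP8 inputs and weights,
# SAME-format outputs, dense (unpruned) weights.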
transformer.h.0.mlp.c_fc:
approximation_function: NONE
bias_format: SAME
input_format: BFP[8|8]{64,-1}(SN)
instance: HFTransformersConv1D
output_format: SAME
weight_format: BFP[8|8]{64,0}(SN)
weight_sparseness: DENSE
transformer.h.0.mlp.c_proj:
approximation_function: NONE
bias_format: SAME
input_format: BFP[8|8]{64,-1}(SN)
instance: HFTransformersConv1D
output_format: SAME
weight_format: BFP[8|8]{64,0}(SN)
weight_sparseness: DENSE
transformer.h.0.mlp.dropout:
approximation_function: NONE
input_format: SAME
instance: Dropout
output_format: SAME
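# Blocks transformer.h.1 through transformer.h.5 below repeat the block-0 policy
# verbatim; no per-layer tuning is applied in this configuration.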
transformer.h.1.attn.attn_dropout:
approximation_function: NONE
input_format: SAME
instance: Dropout
output_format: BFP[8|8]{64,-1}(SN)
transformer.h.1.attn.c_attn:
approximation_function: NONE
bias_format: SAME
input_format: BFP[8|8]{64,-1}(SN)
instance: HFTransformersConv1D
output_format: BFP[8|8]{64,-1}(SN)
weight_format: BFP[8|8]{64,0}(SN)
weight_sparseness: DENSE
transformer.h.1.attn.c_proj:
approximation_function: NONE
bias_format: SAME
input_format: BFP[8|8]{64,-1}(SN)
instance: HFTransformersConv1D
output_format: SAME
weight_format: BFP[8|8]{64,0}(SN)
weight_sparseness: DENSE
transformer.h.1.attn.resid_dropout:
approximation_function: NONE
input_format: SAME
instance: Dropout
output_format: SAME
transformer.h.1.attn.softmax:
approximation_function: SOFTMAX(base2,float16)
input_format: SAME
instance: Softmax
output_format: SAME
transformer.h.1.ln_1:
approximation_function: LAYERNORM(fallback,4,float16)
bias_format: SAME
input_format: SAME
instance: LayerNorm
output_format: SAME
weight_format: SAME
transformer.h.1.ln_2:
approximation_function: LAYERNORM(fallback,4,float16)
bias_format: SAME
input_format: SAME
instance: LayerNorm
output_format: SAME
weight_format: SAME
transformer.h.1.mlp.act:
approximation_function: GELU(poly2,float16)
input_format: SAME
instance: GELU
output_format: SAME
transformer.h.1.mlp.c_fc:
approximation_function: NONE
bias_format: SAME
input_format: BFP[8|8]{64,-1}(SN)
instance: HFTransformersConv1D
output_format: SAME
weight_format: BFP[8|8]{64,0}(SN)
weight_sparseness: DENSE
transformer.h.1.mlp.c_proj:
approximation_function: NONE
bias_format: SAME
input_format: BFP[8|8]{64,-1}(SN)
instance: HFTransformersConv1D
output_format: SAME
weight_format: BFP[8|8]{64,0}(SN)
weight_sparseness: DENSE
transformer.h.1.mlp.dropout:
approximation_function: NONE
input_format: SAME
instance: Dropout
output_format: SAME
transformer.h.2.attn.attn_dropout:
approximation_function: NONE
input_format: SAME
instance: Dropout
output_format: BFP[8|8]{64,-1}(SN)
transformer.h.2.attn.c_attn:
approximation_function: NONE
bias_format: SAME
input_format: BFP[8|8]{64,-1}(SN)
instance: HFTransformersConv1D
output_format: BFP[8|8]{64,-1}(SN)
weight_format: BFP[8|8]{64,0}(SN)
weight_sparseness: DENSE
transformer.h.2.attn.c_proj:
approximation_function: NONE
bias_format: SAME
input_format: BFP[8|8]{64,-1}(SN)
instance: HFTransformersConv1D
output_format: SAME
weight_format: BFP[8|8]{64,0}(SN)
weight_sparseness: DENSE
transformer.h.2.attn.resid_dropout:
approximation_function: NONE
input_format: SAME
instance: Dropout
output_format: SAME
transformer.h.2.attn.softmax:
approximation_function: SOFTMAX(base2,float16)
input_format: SAME
instance: Softmax
output_format: SAME
transformer.h.2.ln_1:
approximation_function: LAYERNORM(fallback,4,float16)
bias_format: SAME
input_format: SAME
instance: LayerNorm
output_format: SAME
weight_format: SAME
transformer.h.2.ln_2:
approximation_function: LAYERNORM(fallback,4,float16)
bias_format: SAME
input_format: SAME
instance: LayerNorm
output_format: SAME
weight_format: SAME
transformer.h.2.mlp.act:
approximation_function: GELU(poly2,float16)
input_format: SAME
instance: GELU
output_format: SAME
transformer.h.2.mlp.c_fc:
approximation_function: NONE
bias_format: SAME
input_format: BFP[8|8]{64,-1}(SN)
instance: HFTransformersConv1D
output_format: SAME
weight_format: BFP[8|8]{64,0}(SN)
weight_sparseness: DENSE
transformer.h.2.mlp.c_proj:
approximation_function: NONE
bias_format: SAME
input_format: BFP[8|8]{64,-1}(SN)
instance: HFTransformersConv1D
output_format: SAME
weight_format: BFP[8|8]{64,0}(SN)
weight_sparseness: DENSE
transformer.h.2.mlp.dropout:
approximation_function: NONE
input_format: SAME
instance: Dropout
output_format: SAME
transformer.h.3.attn.attn_dropout:
approximation_function: NONE
input_format: SAME
instance: Dropout
output_format: BFP[8|8]{64,-1}(SN)
transformer.h.3.attn.c_attn:
approximation_function: NONE
bias_format: SAME
input_format: BFP[8|8]{64,-1}(SN)
instance: HFTransformersConv1D
output_format: BFP[8|8]{64,-1}(SN)
weight_format: BFP[8|8]{64,0}(SN)
weight_sparseness: DENSE
transformer.h.3.attn.c_proj:
approximation_function: NONE
bias_format: SAME
input_format: BFP[8|8]{64,-1}(SN)
instance: HFTransformersConv1D
output_format: SAME
weight_format: BFP[8|8]{64,0}(SN)
weight_sparseness: DENSE
transformer.h.3.attn.resid_dropout:
approximation_function: NONE
input_format: SAME
instance: Dropout
output_format: SAME
transformer.h.3.attn.softmax:
approximation_function: SOFTMAX(base2,float16)
input_format: SAME
instance: Softmax
output_format: SAME
transformer.h.3.ln_1:
approximation_function: LAYERNORM(fallback,4,float16)
bias_format: SAME
input_format: SAME
instance: LayerNorm
output_format: SAME
weight_format: SAME
transformer.h.3.ln_2:
approximation_function: LAYERNORM(fallback,4,float16)
bias_format: SAME
input_format: SAME
instance: LayerNorm
output_format: SAME
weight_format: SAME
transformer.h.3.mlp.act:
approximation_function: GELU(poly2,float16)
input_format: SAME
instance: GELU
output_format: SAME
transformer.h.3.mlp.c_fc:
approximation_function: NONE
bias_format: SAME
input_format: BFP[8|8]{64,-1}(SN)
instance: HFTransformersConv1D
output_format: SAME
weight_format: BFP[8|8]{64,0}(SN)
weight_sparseness: DENSE
transformer.h.3.mlp.c_proj:
approximation_function: NONE
bias_format: SAME
input_format: BFP[8|8]{64,-1}(SN)
instance: HFTransformersConv1D
output_format: SAME
weight_format: BFP[8|8]{64,0}(SN)
weight_sparseness: DENSE
transformer.h.3.mlp.dropout:
approximation_function: NONE
input_format: SAME
instance: Dropout
output_format: SAME
transformer.h.4.attn.attn_dropout:
approximation_function: NONE
input_format: SAME
instance: Dropout
output_format: BFP[8|8]{64,-1}(SN)
transformer.h.4.attn.c_attn:
approximation_function: NONE
bias_format: SAME
input_format: BFP[8|8]{64,-1}(SN)
instance: HFTransformersConv1D
output_format: BFP[8|8]{64,-1}(SN)
weight_format: BFP[8|8]{64,0}(SN)
weight_sparseness: DENSE
transformer.h.4.attn.c_proj:
approximation_function: NONE
bias_format: SAME
input_format: BFP[8|8]{64,-1}(SN)
instance: HFTransformersConv1D
output_format: SAME
weight_format: BFP[8|8]{64,0}(SN)
weight_sparseness: DENSE
transformer.h.4.attn.resid_dropout:
approximation_function: NONE
input_format: SAME
instance: Dropout
output_format: SAME
transformer.h.4.attn.softmax:
approximation_function: SOFTMAX(base2,float16)
input_format: SAME
instance: Softmax
output_format: SAME
transformer.h.4.ln_1:
approximation_function: LAYERNORM(fallback,4,float16)
bias_format: SAME
input_format: SAME
instance: LayerNorm
output_format: SAME
weight_format: SAME
transformer.h.4.ln_2:
approximation_function: LAYERNORM(fallback,4,float16)
bias_format: SAME
input_format: SAME
instance: LayerNorm
output_format: SAME
weight_format: SAME
transformer.h.4.mlp.act:
approximation_function: GELU(poly2,float16)
input_format: SAME
instance: GELU
output_format: SAME
transformer.h.4.mlp.c_fc:
approximation_function: NONE
bias_format: SAME
input_format: BFP[8|8]{64,-1}(SN)
instance: HFTransformersConv1D
output_format: SAME
weight_format: BFP[8|8]{64,0}(SN)
weight_sparseness: DENSE
transformer.h.4.mlp.c_proj:
approximation_function: NONE
bias_format: SAME
input_format: BFP[8|8]{64,-1}(SN)
instance: HFTransformersConv1D
output_format: SAME
weight_format: BFP[8|8]{64,0}(SN)
weight_sparseness: DENSE
transformer.h.4.mlp.dropout:
approximation_function: NONE
input_format: SAME
instance: Dropout
output_format: SAME
transformer.h.5.attn.attn_dropout:
approximation_function: NONE
input_format: SAME
instance: Dropout
output_format: BFP[8|8]{64,-1}(SN)
transformer.h.5.attn.c_attn:
approximation_function: NONE
bias_format: SAME
input_format: BFP[8|8]{64,-1}(SN)
instance: HFTransformersConv1D
output_format: BFP[8|8]{64,-1}(SN)
weight_format: BFP[8|8]{64,0}(SN)
weight_sparseness: DENSE
transformer.h.5.attn.c_proj:
approximation_function: NONE
bias_format: SAME
input_format: BFP[8|8]{64,-1}(SN)
instance: HFTransformersConv1D
output_format: SAME
weight_format: BFP[8|8]{64,0}(SN)
weight_sparseness: DENSE
transformer.h.5.attn.resid_dropout:
approximation_function: NONE
input_format: SAME
instance: Dropout
output_format: SAME
transformer.h.5.attn.softmax:
approximation_function: SOFTMAX(base2,float16)
input_format: SAME
instance: Softmax
output_format: SAME
transformer.h.5.ln_1:
approximation_function: LAYERNORM(fallback,4,float16)
bias_format: SAME
input_format: SAME
instance: LayerNorm
output_format: SAME
weight_format: SAME
transformer.h.5.ln_2:
approximation_function: LAYERNORM(fallback,4,float16)
bias_format: SAME
input_format: SAME
instance: LayerNorm
output_format: SAME
weight_format: SAME
transformer.h.5.mlp.act:
approximation_function: GELU(poly2,float16)
input_format: SAME
instance: GELU
output_format: SAME
transformer.h.5.mlp.c_fc:
approximation_function: NONE
bias_format: SAME
input_format: BFP[8|8]{64,-1}(SN)
instance: HFTransformersConv1D
output_format: SAME
weight_format: BFP[8|8]{64,0}(SN)
weight_sparseness: DENSE
transformer.h.5.mlp.c_proj:
approximation_function: NONE
bias_format: SAME
input_format: BFP[8|8]{64,-1}(SN)
instance: HFTransformersConv1D
output_format: SAME
weight_format: BFP[8|8]{64,0}(SN)
weight_sparseness: DENSE
transformer.h.5.mlp.dropout:
approximation_function: NONE
input_format: SAME
instance: Dropout
output_format: SAME
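# Final LayerNorm (transformer.ln_f) uses the same fallback float16 approximation
# as the per-block LayerNorms; its weights and activations stay in SAME format.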
transformer.ln_f:
approximation_function: LAYERNORM(fallback,4,float16)
bias_format: SAME
input_format: SAME
instance: LayerNorm
output_format: SAME
weight_format: SAME
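# ---------------------------------------------------------------------------
# Usage sketch (not part of the configuration, kept as comments so the file
# stays valid YAML): one way to sanity-check that the module names above line
# up with a Hugging Face GPT-2-family checkpoint. The checkpoint name and the
# use of plain PyYAML/transformers are assumptions; the d-matrix tooling that
# actually consumes this file is not shown here, and names such as
# transformer.h.<i>.attn.softmax only exist after that instrumentation is applied.
#
#   import yaml
#   from transformers import AutoModelForCausalLM
#
#   with open("FALLBACK.yaml") as f:
#       cfg = yaml.safe_load(f)            # top-level keys are module names
#
#   # Assumption: a 6-block GPT-2-family checkpoint, e.g. distilgpt2.
#   model = AutoModelForCausalLM.from_pretrained("distilgpt2")
#   modules = dict(model.named_modules())
#
#   missing = [name for name in cfg if name not in modules]
#   print("configured modules not present in the vanilla model:", missing or "none")
# ---------------------------------------------------------------------------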