Dan Fu
		
	committed on
		
		
					Commit 
							
							·
						
						afc7050
	
1
								Parent(s):
							
							6d98b24
								
32K partial checkpoint
Browse files
- README.md +13 -0
- config.json +4 -0
- config.yaml +38 -0
- model.bin +3 -0
- model.pt +3 -0
- version.txt +1 -0
    	
        README.md
    CHANGED
    
    | @@ -1,3 +1,16 @@ | |
| 1 | 
             
            ---
         | 
| 2 | 
             
            license: apache-2.0
         | 
|  | |
|  | |
|  | |
| 3 | 
             
            ---
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
             
            ---
         | 
| 2 | 
             
            license: apache-2.0
         | 
| 3 | 
            +
            language:
         | 
| 4 | 
            +
            - en
         | 
| 5 | 
            +
            pipeline_tag: text-classification
         | 
| 6 | 
             
            ---
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            # Monarch Mixer-BERT
         | 
| 9 | 
            +
             | 
| 10 | 
            +
            The 80M checkpoint for M2-BERT-base from the paper [Monarch Mixer: A Simple Sub-Quadratic GEMM-Based Architecture](https://arxiv.org/abs/2310.12109).
         | 
| 11 | 
            +
            This model has been pretrained with sequence length 32K.
         | 
| 12 | 
            +
            Note (11/3 evening): this is a partial checkpoint; training had not finished when it was uploaded.
         | 
| 13 | 
            +
             | 
| 14 | 
            +
            This model was trained by Dan Fu, Jon Saad-Falcon, and Simran Arora.
         | 
| 15 | 
            +
             | 
| 16 | 
            +
            Check out our [GitHub](https://github.com/HazyResearch/m2/tree/main) for instructions on how to download and fine-tune it!
         | 
    	
        config.json
    ADDED
    
    | @@ -0,0 +1,4 @@ | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
                "model_type": "m2_bert"
         | 
| 3 | 
            +
            }
         | 
| 4 | 
            +
             | 
    	
        config.yaml
    ADDED
    
    | @@ -0,0 +1,38 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            # Note that some of the fields in this template haven't been filled in yet.
         | 
| 2 | 
            +
            # Please resolve any `null` fields before launching!
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            precision: amp_bf16 
         | 
| 5 | 
            +
            max_seq_len: 32768
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            # Tokenizer for dataset creation
         | 
| 8 | 
            +
            tokenizer_name: bert-base-uncased
         | 
| 9 | 
            +
             | 
| 10 | 
            +
            # Base model config
         | 
| 11 | 
            +
            model:
         | 
| 12 | 
            +
              name: bert
         | 
| 13 | 
            +
              pretrained_model_name: ${tokenizer_name}
         | 
| 14 | 
            +
              tokenizer_name: ${tokenizer_name}
         | 
| 15 | 
            +
              model_config:
         | 
| 16 | 
            +
                num_attention_heads: 12 
         | 
| 17 | 
            +
                num_hidden_layers: 12 
         | 
| 18 | 
            +
                attention_probs_dropout_prob: 0.0 
         | 
| 19 | 
            +
                max_position_embeddings: 32768
         | 
| 20 | 
            +
             | 
| 21 | 
            +
                monarch_mixer_sequence_mixing: True
         | 
| 22 | 
            +
                long_conv_l_max: 32768
         | 
| 23 | 
            +
                long_conv_kernel_learning_rate: 1e-3
         | 
| 24 | 
            +
                hyena_lr_pos_emb: 1e-5
         | 
| 25 | 
            +
                hyena_w: 10
         | 
| 26 | 
            +
                hyena_wd: 0.1
         | 
| 27 | 
            +
                hyena_emb_dim: 5
         | 
| 28 | 
            +
                hyena_filter_order: 128
         | 
| 29 | 
            +
                hyena_training_additions: False
         | 
| 30 | 
            +
             | 
| 31 | 
            +
                bidirectional: true
         | 
| 32 | 
            +
                residual_long_conv: true
         | 
| 33 | 
            +
             | 
| 34 | 
            +
                use_glu_mlp: True
         | 
| 35 | 
            +
                use_monarch_mlp: True
         | 
| 36 | 
            +
                monarch_mlp_nblocks: 4
         | 
| 37 | 
            +
                use_positional_encodings: True
         | 
| 38 | 
            +
             | 
    	
        model.bin
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:f5f6da4ea57ab1b407363530efba622552c121dc29b439cc9b202f042108d7d2
         | 
| 3 | 
            +
            size 440736801
         | 
    	
        model.pt
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:069a59ecf30222fa1e67f68b76f7155966a875ac2ab060f1cb2d1213015e3596
         | 
| 3 | 
            +
            size 1315397236
         | 
    	
        version.txt
    ADDED
    
    | @@ -0,0 +1 @@ | |
|  | 
|  | |
| 1 | 
            +
            1
         | 
