x54-729
		
	committed on
		
		
					Commit 
							
							·
						
						35f91cd
	
1
								Parent(s):
							
							454e418
								
Fix missing whitespace when using stream_chat with the fast tokenizer
Browse files
    	
        configuration_internlm2.py
    CHANGED
    
    | @@ -148,4 +148,4 @@ class InternLM2Config(PretrainedConfig): | |
| 148 | 
             
                            f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
         | 
| 149 | 
             
                        )
         | 
| 150 | 
             
                    if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor < 1.0:
         | 
| 151 | 
            -
                        raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {rope_scaling_factor}")
         | 
|  | |
| 148 | 
             
                            f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
         | 
| 149 | 
             
                        )
         | 
| 150 | 
             
                    if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor < 1.0:
         | 
| 151 | 
            +
                        raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {rope_scaling_factor}")
         | 
    	
        tokenization_internlm2.py
    CHANGED
    
    | @@ -233,4 +233,4 @@ class InternLM2Tokenizer(PreTrainedTokenizer): | |
| 233 |  | 
| 234 | 
             
                    if token_ids_1 is None:
         | 
| 235 | 
             
                        return len(token_ids_0 + eos) * [0]
         | 
| 236 | 
            -
                    return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
         | 
|  | |
| 233 |  | 
| 234 | 
             
                    if token_ids_1 is None:
         | 
| 235 | 
             
                        return len(token_ids_0 + eos) * [0]
         | 
| 236 | 
            +
                    return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
         | 
    	
        tokenization_internlm2_fast.py
    CHANGED
    
    | @@ -56,14 +56,14 @@ class InternLM2Converter(SpmConverter): | |
| 56 | 
             
                    return unk_id
         | 
| 57 |  | 
| 58 | 
             
                def decoder(self, replacement, add_prefix_space):
         | 
| 59 | 
            -
                     | 
| 60 | 
            -
                         | 
| 61 | 
            -
             | 
| 62 | 
            -
             | 
| 63 | 
            -
             | 
| 64 | 
            -
             | 
| 65 | 
            -
                         | 
| 66 | 
            -
                    )
         | 
| 67 |  | 
| 68 | 
             
                def tokenizer(self, proto):
         | 
| 69 | 
             
                    model_type = proto.trainer_spec.model_type
         | 
| @@ -211,4 +211,4 @@ class InternLM2TokenizerFast(PreTrainedTokenizerFast): | |
| 211 | 
             
                    if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
         | 
| 212 | 
             
                        copyfile(self.vocab_file, out_vocab_file)
         | 
| 213 |  | 
| 214 | 
            -
                    return (out_vocab_file,)
         | 
|  | |
| 56 | 
             
                    return unk_id
         | 
| 57 |  | 
| 58 | 
             
                def decoder(self, replacement, add_prefix_space):
         | 
| 59 | 
            +
                    decoders_sequence = [
         | 
| 60 | 
            +
                        decoders.Replace("▁", " "),
         | 
| 61 | 
            +
                        decoders.ByteFallback(),
         | 
| 62 | 
            +
                        decoders.Fuse(),
         | 
| 63 | 
            +
                    ]
         | 
| 64 | 
            +
                    if self.proto.normalizer_spec.add_dummy_prefix:
         | 
| 65 | 
            +
                        decoders_sequence.append(decoders.Strip(content=" ", left=1))
         | 
| 66 | 
            +
                    return decoders.Sequence(decoders_sequence)
         | 
| 67 |  | 
| 68 | 
             
                def tokenizer(self, proto):
         | 
| 69 | 
             
                    model_type = proto.trainer_spec.model_type
         | 
|  | |
| 211 | 
             
                    if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
         | 
| 212 | 
             
                        copyfile(self.vocab_file, out_vocab_file)
         | 
| 213 |  | 
| 214 | 
            +
                    return (out_vocab_file,)
         | 
