Training in progress, epoch 1
Browse files- adapter_config.json +36 -0
- adapter_model.safetensors +3 -0
- added_tokens.json +11 -0
- runs/Nov06_00-49-17_0411-032836-q3c3gg0s-10-129-61-251/events.out.tfevents.1730854159.0411-032836-q3c3gg0s-10-129-61-251.2778.5 +3 -0
- runs/Nov06_01-26-20_0411-032836-q3c3gg0s-10-129-61-251/events.out.tfevents.1730856382.0411-032836-q3c3gg0s-10-129-61-251.2753.2 +3 -0
- runs/Nov06_01-39-19_0411-032836-q3c3gg0s-10-129-61-251/events.out.tfevents.1730857168.0411-032836-q3c3gg0s-10-129-61-251.13398.0 +3 -0
- runs/Nov06_23-46-05_0411-032836-q3c3gg0s-10-129-59-142/events.out.tfevents.1730936774.0411-032836-q3c3gg0s-10-129-59-142.9804.0 +3 -0
- runs/Nov08_04-15-34_0411-032836-q3c3gg0s-10-129-49-189/events.out.tfevents.1731039345.0411-032836-q3c3gg0s-10-129-49-189.3304.0 +3 -0
- special_tokens_map.json +47 -0
- tokenization_internlm2.py +257 -0
- tokenizer.model +3 -0
- tokenizer_config.json +179 -0
- training_args.bin +3 -0
    	
        adapter_config.json
    ADDED
    
    | @@ -0,0 +1,36 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "alpha_pattern": {},
         | 
| 3 | 
            +
              "auto_mapping": {
         | 
| 4 | 
            +
                "base_model_class": "InternVLChatModel",
         | 
| 5 | 
            +
                "parent_library": "modeling_internvl_chat"
         | 
| 6 | 
            +
              },
         | 
| 7 | 
            +
              "base_model_name_or_path": "OpenGVLab/InternVL2-8B",
         | 
| 8 | 
            +
              "bias": "lora_only",
         | 
| 9 | 
            +
              "fan_in_fan_out": false,
         | 
| 10 | 
            +
              "inference_mode": true,
         | 
| 11 | 
            +
              "init_lora_weights": true,
         | 
| 12 | 
            +
              "layer_replication": null,
         | 
| 13 | 
            +
              "layers_pattern": null,
         | 
| 14 | 
            +
              "layers_to_transform": null,
         | 
| 15 | 
            +
              "loftq_config": {},
         | 
| 16 | 
            +
              "lora_alpha": 64,
         | 
| 17 | 
            +
              "lora_dropout": 0.05,
         | 
| 18 | 
            +
              "megatron_config": null,
         | 
| 19 | 
            +
              "megatron_core": "megatron.core",
         | 
| 20 | 
            +
              "modules_to_save": null,
         | 
| 21 | 
            +
              "peft_type": "LORA",
         | 
| 22 | 
            +
              "r": 16,
         | 
| 23 | 
            +
              "rank_pattern": {},
         | 
| 24 | 
            +
              "revision": null,
         | 
| 25 | 
            +
              "target_modules": [
         | 
| 26 | 
            +
                "attention.wo",
         | 
| 27 | 
            +
                "attention.wqkv",
         | 
| 28 | 
            +
                "output",
         | 
| 29 | 
            +
                "feed_forward.w1",
         | 
| 30 | 
            +
                "feed_forward.w3",
         | 
| 31 | 
            +
                "feed_forward.w2"
         | 
| 32 | 
            +
              ],
         | 
| 33 | 
            +
              "task_type": null,
         | 
| 34 | 
            +
              "use_dora": false,
         | 
| 35 | 
            +
              "use_rslora": false
         | 
| 36 | 
            +
            }
         | 
    	
        adapter_model.safetensors
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:1e2adfee65052aa3a0bf5d2266f2e1a0f0aa4b5eca765d57979ccb8b63999e4f
         | 
| 3 | 
            +
            size 157228352
         | 
    	
        added_tokens.json
    ADDED
    
    | @@ -0,0 +1,11 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "</box>": 92552,
         | 
| 3 | 
            +
              "</img>": 92545,
         | 
| 4 | 
            +
              "</quad>": 92548,
         | 
| 5 | 
            +
              "</ref>": 92550,
         | 
| 6 | 
            +
              "<IMG_CONTEXT>": 92546,
         | 
| 7 | 
            +
              "<box>": 92551,
         | 
| 8 | 
            +
              "<img>": 92544,
         | 
| 9 | 
            +
              "<quad>": 92547,
         | 
| 10 | 
            +
              "<ref>": 92549
         | 
| 11 | 
            +
            }
         | 
    	
        runs/Nov06_00-49-17_0411-032836-q3c3gg0s-10-129-61-251/events.out.tfevents.1730854159.0411-032836-q3c3gg0s-10-129-61-251.2778.5
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:5367560de6cf45eec20656030322c193268c6c70be632da37f0e03dd17e36349
         | 
| 3 | 
            +
            size 13070
         | 
    	
        runs/Nov06_01-26-20_0411-032836-q3c3gg0s-10-129-61-251/events.out.tfevents.1730856382.0411-032836-q3c3gg0s-10-129-61-251.2753.2
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:5788eb92d49c28eac3f11358f22dd845e7f10bef2a295ec5f285eabae197a46e
         | 
| 3 | 
            +
            size 13070
         | 
    	
        runs/Nov06_01-39-19_0411-032836-q3c3gg0s-10-129-61-251/events.out.tfevents.1730857168.0411-032836-q3c3gg0s-10-129-61-251.13398.0
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:efd314f6665d0a968c6745891e265137f22f38c205dd237f332d42c3a8e5d993
         | 
| 3 | 
            +
            size 13070
         | 
    	
        runs/Nov06_23-46-05_0411-032836-q3c3gg0s-10-129-59-142/events.out.tfevents.1730936774.0411-032836-q3c3gg0s-10-129-59-142.9804.0
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:6712353a9f8af1f0a7493a35ddd3b189e77048d76fb24e4bfeeb31e861f59373
         | 
| 3 | 
            +
            size 13096
         | 
    	
        runs/Nov08_04-15-34_0411-032836-q3c3gg0s-10-129-49-189/events.out.tfevents.1731039345.0411-032836-q3c3gg0s-10-129-49-189.3304.0
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:e89072a87b3dac0f7854935a00703c0e46db61a9b984943d678cb1349e31820a
         | 
| 3 | 
            +
            size 13096
         | 
    	
        special_tokens_map.json
    ADDED
    
    | @@ -0,0 +1,47 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "additional_special_tokens": [
         | 
| 3 | 
            +
                "<|im_start|>",
         | 
| 4 | 
            +
                "<|im_end|>",
         | 
| 5 | 
            +
                "<|action_start|>",
         | 
| 6 | 
            +
                "<|action_end|>",
         | 
| 7 | 
            +
                "<|interpreter|>",
         | 
| 8 | 
            +
                "<|plugin|>",
         | 
| 9 | 
            +
                "<img>",
         | 
| 10 | 
            +
                "</img>",
         | 
| 11 | 
            +
                "<IMG_CONTEXT>",
         | 
| 12 | 
            +
                "<quad>",
         | 
| 13 | 
            +
                "</quad>",
         | 
| 14 | 
            +
                "<ref>",
         | 
| 15 | 
            +
                "</ref>",
         | 
| 16 | 
            +
                "<box>",
         | 
| 17 | 
            +
                "</box>"
         | 
| 18 | 
            +
              ],
         | 
| 19 | 
            +
              "bos_token": {
         | 
| 20 | 
            +
                "content": "<s>",
         | 
| 21 | 
            +
                "lstrip": false,
         | 
| 22 | 
            +
                "normalized": false,
         | 
| 23 | 
            +
                "rstrip": false,
         | 
| 24 | 
            +
                "single_word": false
         | 
| 25 | 
            +
              },
         | 
| 26 | 
            +
              "eos_token": {
         | 
| 27 | 
            +
                "content": "</s>",
         | 
| 28 | 
            +
                "lstrip": false,
         | 
| 29 | 
            +
                "normalized": false,
         | 
| 30 | 
            +
                "rstrip": false,
         | 
| 31 | 
            +
                "single_word": false
         | 
| 32 | 
            +
              },
         | 
| 33 | 
            +
              "pad_token": {
         | 
| 34 | 
            +
                "content": "</s>",
         | 
| 35 | 
            +
                "lstrip": false,
         | 
| 36 | 
            +
                "normalized": false,
         | 
| 37 | 
            +
                "rstrip": false,
         | 
| 38 | 
            +
                "single_word": false
         | 
| 39 | 
            +
              },
         | 
| 40 | 
            +
              "unk_token": {
         | 
| 41 | 
            +
                "content": "<unk>",
         | 
| 42 | 
            +
                "lstrip": false,
         | 
| 43 | 
            +
                "normalized": false,
         | 
| 44 | 
            +
                "rstrip": false,
         | 
| 45 | 
            +
                "single_word": false
         | 
| 46 | 
            +
              }
         | 
| 47 | 
            +
            }
         | 
    	
        tokenization_internlm2.py
    ADDED
    
    | @@ -0,0 +1,257 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            # Code copied from https://huggingface.co/OpenGVLab/Mini-InternVL-Chat-2B-V1-5
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            # Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
         | 
| 4 | 
            +
            #
         | 
| 5 | 
            +
            # This code is based on transformers/src/transformers/models/llama/tokenization_llama.py
         | 
| 6 | 
            +
            #
         | 
| 7 | 
            +
            # Licensed under the Apache License, Version 2.0 (the "License");
         | 
| 8 | 
            +
            # you may not use this file except in compliance with the License.
         | 
| 9 | 
            +
            # You may obtain a copy of the License at
         | 
| 10 | 
            +
            #
         | 
| 11 | 
            +
            #     http://www.apache.org/licenses/LICENSE-2.0
         | 
| 12 | 
            +
            #
         | 
| 13 | 
            +
            # Unless required by applicable law or agreed to in writing, software
         | 
| 14 | 
            +
            # distributed under the License is distributed on an "AS IS" BASIS,
         | 
| 15 | 
            +
            # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
         | 
| 16 | 
            +
            # See the License for the specific language governing permissions and
         | 
| 17 | 
            +
            # limitations under the License.
         | 
| 18 | 
            +
             | 
| 19 | 
            +
            import sys
         | 
| 20 | 
            +
            import os
         | 
| 21 | 
            +
             | 
| 22 | 
            +
            # SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
         | 
| 23 | 
            +
            sys.path.append(os.path.dirname('/Workspace/Users/[email protected]/doc-llm-master/doc_llm/'))
         | 
| 24 | 
            +
             | 
| 25 | 
            +
            """Tokenization classes for InternLM."""
         | 
| 26 | 
            +
             | 
| 27 | 
            +
            import os
         | 
| 28 | 
            +
            from shutil import copyfile
         | 
| 29 | 
            +
            from typing import Any, Dict, List, Optional, Tuple
         | 
| 30 | 
            +
             | 
| 31 | 
            +
            import sentencepiece as spm
         | 
| 32 | 
            +
            from transformers.tokenization_utils import PreTrainedTokenizer
         | 
| 33 | 
            +
            from transformers.utils import logging
         | 
| 34 | 
            +
             | 
| 35 | 
            +
            logger = logging.get_logger(__name__)
         | 
| 36 | 
            +
             | 
| 37 | 
            +
            VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"}
         | 
| 38 | 
            +
             | 
| 39 | 
            +
            PRETRAINED_VOCAB_FILES_MAP = {}
         | 
| 40 | 
            +
             | 
| 41 | 
            +
             | 
| 42 | 
            +
            # Modified from transformers.models.llama.tokenization_llama.LlamaTokenizer
         | 
| 43 | 
            +
            class InternLM2Tokenizer(PreTrainedTokenizer):
         | 
| 44 | 
            +
                """
         | 
| 45 | 
            +
                Construct a InternLM2 tokenizer. Based on byte-level Byte-Pair-Encoding.
         | 
| 46 | 
            +
             | 
| 47 | 
            +
                Args:
         | 
| 48 | 
            +
                    vocab_file (`str`):
         | 
| 49 | 
            +
                        Path to the vocabulary file.
         | 
| 50 | 
            +
                """
         | 
| 51 | 
            +
             | 
| 52 | 
            +
                vocab_files_names = VOCAB_FILES_NAMES
         | 
| 53 | 
            +
                pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
         | 
| 54 | 
            +
                model_input_names = ["input_ids", "attention_mask"]
         | 
| 55 | 
            +
                _auto_class = "AutoTokenizer"
         | 
| 56 | 
            +
             | 
| 57 | 
            +
                def __init__(
                    self,
                    vocab_file,
                    unk_token="<unk>",
                    bos_token="<s>",
                    eos_token="</s>",
                    pad_token="</s>",
                    sp_model_kwargs: Optional[Dict[str, Any]] = None,
                    add_bos_token=True,
                    add_eos_token=False,
                    decode_with_prefix_space=False,
                    clean_up_tokenization_spaces=False,
                    **kwargs,
                ):
                    """
                    Load the SentencePiece model and initialise the tokenizer.

                    Args:
                        vocab_file (`str`):
                            Path to the SentencePiece model file passed to `Load`.
                        unk_token, bos_token, eos_token, pad_token:
                            Special tokens forwarded to the base-class constructor.
                        sp_model_kwargs (`dict`, *optional*):
                            Keyword arguments for `spm.SentencePieceProcessor`; `None` means `{}`.
                        add_bos_token (`bool`):
                            Stored on the instance; read by `build_inputs_with_special_tokens`.
                        add_eos_token (`bool`):
                            Stored on the instance; read by `build_inputs_with_special_tokens`.
                        decode_with_prefix_space (`bool`):
                            Stored on the instance.
                        clean_up_tokenization_spaces (`bool`):
                            Forwarded to the base-class constructor.
                    """
                    # Plain configuration flags.
                    self.vocab_file = vocab_file
                    self.add_bos_token = add_bos_token
                    self.add_eos_token = add_eos_token
                    self.decode_with_prefix_space = decode_with_prefix_space
                    self.sp_model_kwargs = sp_model_kwargs if sp_model_kwargs is not None else {}

                    # Build and load the SentencePiece processor before the base
                    # constructor is invoked (same ordering as the original code).
                    processor = spm.SentencePieceProcessor(**self.sp_model_kwargs)
                    processor.Load(vocab_file)
                    self.sp_model = processor

                    # Cache for the `no_prefix_space_tokens` property; filled lazily.
                    self._no_prefix_space_tokens = None

                    super().__init__(
                        bos_token=bos_token,
                        eos_token=eos_token,
                        unk_token=unk_token,
                        pad_token=pad_token,
                        clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                        **kwargs,
                    )
         | 
| 87 | 
            +
             | 
| 88 | 
            +
                @property
                def no_prefix_space_tokens(self):
                    """Set of vocabulary ids whose piece does not start with "▁".

                    Computed on first access from `convert_ids_to_tokens` over
                    `range(self.vocab_size)` and cached in `self._no_prefix_space_tokens`.
                    """
                    cached = self._no_prefix_space_tokens
                    if cached is not None:
                        return cached

                    pieces = self.convert_ids_to_tokens(list(range(self.vocab_size)))
                    cached = set()
                    for idx, piece in enumerate(pieces):
                        if not piece.startswith("▁"):
                            cached.add(idx)
                    self._no_prefix_space_tokens = cached
                    return cached
         | 
| 96 | 
            +
             | 
| 97 | 
            +
                @property
                def vocab_size(self):
                    """Number of pieces reported by the loaded SentencePiece model."""
                    piece_count = self.sp_model.get_piece_size()
                    return piece_count
         | 
| 101 | 
            +
             | 
| 102 | 
            +
                @property
                def bos_token_id(self) -> Optional[int]:
                    """Beginning-of-sequence id as reported by the SentencePiece model."""
                    bos = self.sp_model.bos_id()
                    return bos
         | 
| 105 | 
            +
             | 
| 106 | 
            +
                @property
                def eos_token_id(self) -> Optional[int]:
                    """End-of-sequence id as reported by the SentencePiece model."""
                    eos = self.sp_model.eos_id()
                    return eos
         | 
| 109 | 
            +
             | 
| 110 | 
            +
                def get_vocab(self):
                    """Return a token -> id dict.

                    Entries for ids `0..vocab_size-1` are filled first, then
                    `self.added_tokens_encoder` is merged in (its entries win on clashes).
                    """
                    token_to_id = {}
                    for idx in range(self.vocab_size):
                        token_to_id[self.convert_ids_to_tokens(idx)] = idx
                    token_to_id.update(self.added_tokens_encoder)
                    return token_to_id
         | 
| 115 | 
            +
             | 
| 116 | 
            +
                def _tokenize(self, text):
         | 
| 117 | 
            +
                    """Returns a tokenized string."""
         | 
| 118 | 
            +
                    return self.sp_model.encode(text, out_type=str)
         | 
| 119 | 
            +
             | 
| 120 | 
            +
                def _convert_token_to_id(self, token):
         | 
| 121 | 
            +
                    """Converts a token (str) in an id using the vocab."""
         | 
| 122 | 
            +
                    return self.sp_model.piece_to_id(token)
         | 
| 123 | 
            +
             | 
| 124 | 
            +
                def _convert_id_to_token(self, index):
         | 
| 125 | 
            +
                    """Converts an index (integer) in a token (str) using the vocab."""
         | 
| 126 | 
            +
                    token = self.sp_model.IdToPiece(index)
         | 
| 127 | 
            +
                    return token
         | 
| 128 | 
            +
             | 
| 129 | 
            +
                def _maybe_add_prefix_space(self, tokens, decoded):
         | 
| 130 | 
            +
                    if tokens and tokens[0] not in self.no_prefix_space_tokens:
         | 
| 131 | 
            +
                        return " " + decoded
         | 
| 132 | 
            +
                    else:
         | 
| 133 | 
            +
                        return decoded
         | 
| 134 | 
            +
             | 
| 135 | 
            +
                def convert_tokens_to_string(self, tokens):
                    """Join a sequence of tokens (str) into one string.

                    Runs of ordinary tokens are decoded with the SentencePiece model; tokens
                    found in `self.all_special_tokens` are copied verbatim. The result goes
                    through `clean_up_tokenization` and `_maybe_add_prefix_space`, and its
                    first character is then dropped.
                    """
                    pending = []  # ordinary pieces waiting to be decoded together
                    chunks = []
                    last_was_special = False

                    for tok in tokens:
                        if tok not in self.all_special_tokens:
                            pending.append(tok)
                            last_was_special = False
                            continue

                        # Special tokens must not be decoded by the SentencePiece model.
                        if not last_was_special:
                            chunks.append(" ")
                        chunks.append(self.sp_model.decode(pending) + tok)
                        pending = []
                        last_was_special = True

                    chunks.append(self.sp_model.decode(pending))
                    text = self.clean_up_tokenization("".join(chunks))
                    text = self._maybe_add_prefix_space(tokens=tokens, decoded=text)
                    return text[1:]
         | 
| 155 | 
            +
             | 
| 156 | 
            +
                def save_vocabulary(
                    self, save_directory, filename_prefix: Optional[str] = None
                ) -> Tuple[str]:
                    """
                    Save the SentencePiece vocabulary file into a directory.

                    Args:
                        save_directory (`str`):
                            Existing directory in which to save the vocabulary.
                        filename_prefix (`str`, *optional*):
                            When given, the file is written as `<prefix>-<vocab file name>`.

                    Returns:
                        `Tuple(str)`: One-element tuple with the path of the saved file. If
                        `save_directory` is not a directory, an error is logged and `None`
                        is returned.
                    """
                    if not os.path.isdir(save_directory):
                        logger.error(f"Vocabulary path ({save_directory}) should be a directory")
                        return

                    # VOCAB_FILES_NAMES["vocab_file"] is "./tokenizer.model". Use only the bare
                    # file name so that a prefix gives "<prefix>-tokenizer.model" instead of
                    # "<prefix>-./tokenizer.model", which points into a non-existent directory.
                    vocab_name = os.path.basename(VOCAB_FILES_NAMES["vocab_file"])
                    prefix = filename_prefix + "-" if filename_prefix else ""
                    out_vocab_file = os.path.join(save_directory, prefix + vocab_name)

                    source_exists = os.path.isfile(self.vocab_file)
                    if source_exists:
                        # Skip the copy when source and destination are the same file.
                        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
                            copyfile(self.vocab_file, out_vocab_file)
                    else:
                        # Original file is gone: dump the in-memory model instead.
                        with open(out_vocab_file, "wb") as fi:
                            fi.write(self.sp_model.serialized_model_proto())

                    return (out_vocab_file,)
         | 
| 188 | 
            +
             | 
| 189 | 
            +
                def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
                    """Assemble model input ids from one or two id lists.

                    Layout: `[bos]` (only if `add_bos_token`) + `token_ids_0` + `token_ids_1`
                    (if given) + `[eos]` (only if `add_eos_token`). No separator is inserted
                    between the two sequences.
                    """
                    prefix = [self.bos_token_id] if self.add_bos_token else []
                    ids = prefix + token_ids_0
                    if token_ids_1 is not None:
                        ids = ids + token_ids_1
                    if self.add_eos_token:
                        ids = ids + [self.eos_token_id]
                    return ids
         | 
| 204 | 
            +
             | 
| 205 | 
            +
                def get_special_tokens_mask(
                    self,
                    token_ids_0: List[int],
                    token_ids_1: Optional[List[int]] = None,
                    already_has_special_tokens: bool = False,
                ) -> List[int]:
                    """
                    Build the special-token mask for a token list that has no special tokens added yet.

                    The mask mirrors the layout produced by `build_inputs_with_special_tokens`:
                    an optional leading BOS (when `add_bos_token`), the tokens of both
                    sequences with no separator between them, and an optional trailing EOS
                    (when `add_eos_token`).

                    Args:
                        token_ids_0 (`List[int]`):
                            List of IDs.
                        token_ids_1 (`List[int]`, *optional*):
                            Optional second list of IDs for sequence pairs.
                        already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                            Whether or not the token list is already formatted with special tokens for the model.
                            When `True`, the base-class implementation is used.

                    Returns:
                        `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
                    """
                    if already_has_special_tokens:
                        return super().get_special_tokens_mask(
                            token_ids_0=token_ids_0,
                            token_ids_1=token_ids_1,
                            already_has_special_tokens=True,
                        )

                    # Only mark the special tokens that build_inputs_with_special_tokens
                    # actually inserts; otherwise the mask length differs from the input.
                    bos_mask = [1] if self.add_bos_token else []
                    eos_mask = [1] if self.add_eos_token else []

                    n_sequence_tokens = len(token_ids_0)
                    if token_ids_1 is not None:
                        n_sequence_tokens += len(token_ids_1)

                    return bos_mask + [0] * n_sequence_tokens + eos_mask
         | 
| 236 | 
            +
             | 
| 237 | 
            +
                def create_token_type_ids_from_sequences(
         | 
| 238 | 
            +
                    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
         | 
| 239 | 
            +
                ) -> List[int]:
         | 
| 240 | 
            +
                    """
         | 
| 241 | 
            +
                    Create a mask from the two sequences passed to be used in a sequence-pair classification task. InternLM2 does not make
         | 
| 242 | 
            +
                    use of token type ids, therefore a list of zeros is returned.
         | 
| 243 | 
            +
             | 
| 244 | 
            +
                    Args:
         | 
| 245 | 
            +
                        token_ids_0 (`List[int]`):
         | 
| 246 | 
            +
                            List of IDs.
         | 
| 247 | 
            +
                        token_ids_1 (`List[int]`, *optional*):
         | 
| 248 | 
            +
                            Optional second list of IDs for sequence pairs.
         | 
| 249 | 
            +
             | 
| 250 | 
            +
                    Returns:
         | 
| 251 | 
            +
                        `List[int]`: List of zeros.
         | 
| 252 | 
            +
                    """
         | 
| 253 | 
            +
                    eos = [self.eos_token_id]
         | 
| 254 | 
            +
             | 
| 255 | 
            +
                    if token_ids_1 is None:
         | 
| 256 | 
            +
                        return len(token_ids_0 + eos) * [0]
         | 
| 257 | 
            +
                    return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
         | 
    	
        tokenizer.model
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:f868398fc4e05ee1e8aeba95ddf18ddcc45b8bce55d5093bead5bbf80429b48b
         | 
| 3 | 
            +
            size 1477754
         | 
    	
        tokenizer_config.json
    ADDED
    
    | @@ -0,0 +1,179 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "added_tokens_decoder": {
         | 
| 3 | 
            +
                "0": {
         | 
| 4 | 
            +
                  "content": "<unk>",
         | 
| 5 | 
            +
                  "lstrip": false,
         | 
| 6 | 
            +
                  "normalized": false,
         | 
| 7 | 
            +
                  "rstrip": false,
         | 
| 8 | 
            +
                  "single_word": false,
         | 
| 9 | 
            +
                  "special": true
         | 
| 10 | 
            +
                },
         | 
| 11 | 
            +
                "1": {
         | 
| 12 | 
            +
                  "content": "<s>",
         | 
| 13 | 
            +
                  "lstrip": false,
         | 
| 14 | 
            +
                  "normalized": false,
         | 
| 15 | 
            +
                  "rstrip": false,
         | 
| 16 | 
            +
                  "single_word": false,
         | 
| 17 | 
            +
                  "special": true
         | 
| 18 | 
            +
                },
         | 
| 19 | 
            +
                "2": {
         | 
| 20 | 
            +
                  "content": "</s>",
         | 
| 21 | 
            +
                  "lstrip": false,
         | 
| 22 | 
            +
                  "normalized": false,
         | 
| 23 | 
            +
                  "rstrip": false,
         | 
| 24 | 
            +
                  "single_word": false,
         | 
| 25 | 
            +
                  "special": true
         | 
| 26 | 
            +
                },
         | 
| 27 | 
            +
                "92538": {
         | 
| 28 | 
            +
                  "content": "<|plugin|>",
         | 
| 29 | 
            +
                  "lstrip": false,
         | 
| 30 | 
            +
                  "normalized": false,
         | 
| 31 | 
            +
                  "rstrip": false,
         | 
| 32 | 
            +
                  "single_word": false,
         | 
| 33 | 
            +
                  "special": true
         | 
| 34 | 
            +
                },
         | 
| 35 | 
            +
                "92539": {
         | 
| 36 | 
            +
                  "content": "<|interpreter|>",
         | 
| 37 | 
            +
                  "lstrip": false,
         | 
| 38 | 
            +
                  "normalized": false,
         | 
| 39 | 
            +
                  "rstrip": false,
         | 
| 40 | 
            +
                  "single_word": false,
         | 
| 41 | 
            +
                  "special": true
         | 
| 42 | 
            +
                },
         | 
| 43 | 
            +
                "92540": {
         | 
| 44 | 
            +
                  "content": "<|action_end|>",
         | 
| 45 | 
            +
                  "lstrip": false,
         | 
| 46 | 
            +
                  "normalized": false,
         | 
| 47 | 
            +
                  "rstrip": false,
         | 
| 48 | 
            +
                  "single_word": false,
         | 
| 49 | 
            +
                  "special": true
         | 
| 50 | 
            +
                },
         | 
| 51 | 
            +
                "92541": {
         | 
| 52 | 
            +
                  "content": "<|action_start|>",
         | 
| 53 | 
            +
                  "lstrip": false,
         | 
| 54 | 
            +
                  "normalized": false,
         | 
| 55 | 
            +
                  "rstrip": false,
         | 
| 56 | 
            +
                  "single_word": false,
         | 
| 57 | 
            +
                  "special": true
         | 
| 58 | 
            +
                },
         | 
| 59 | 
            +
                "92542": {
         | 
| 60 | 
            +
                  "content": "<|im_end|>",
         | 
| 61 | 
            +
                  "lstrip": false,
         | 
| 62 | 
            +
                  "normalized": false,
         | 
| 63 | 
            +
                  "rstrip": false,
         | 
| 64 | 
            +
                  "single_word": false,
         | 
| 65 | 
            +
                  "special": true
         | 
| 66 | 
            +
                },
         | 
| 67 | 
            +
                "92543": {
         | 
| 68 | 
            +
                  "content": "<|im_start|>",
         | 
| 69 | 
            +
                  "lstrip": false,
         | 
| 70 | 
            +
                  "normalized": false,
         | 
| 71 | 
            +
                  "rstrip": false,
         | 
| 72 | 
            +
                  "single_word": false,
         | 
| 73 | 
            +
                  "special": true
         | 
| 74 | 
            +
                },
         | 
| 75 | 
            +
                "92544": {
         | 
| 76 | 
            +
                  "content": "<img>",
         | 
| 77 | 
            +
                  "lstrip": false,
         | 
| 78 | 
            +
                  "normalized": false,
         | 
| 79 | 
            +
                  "rstrip": false,
         | 
| 80 | 
            +
                  "single_word": false,
         | 
| 81 | 
            +
                  "special": true
         | 
| 82 | 
            +
                },
         | 
| 83 | 
            +
                "92545": {
         | 
| 84 | 
            +
                  "content": "</img>",
         | 
| 85 | 
            +
                  "lstrip": false,
         | 
| 86 | 
            +
                  "normalized": false,
         | 
| 87 | 
            +
                  "rstrip": false,
         | 
| 88 | 
            +
                  "single_word": false,
         | 
| 89 | 
            +
                  "special": true
         | 
| 90 | 
            +
                },
         | 
| 91 | 
            +
                "92546": {
         | 
| 92 | 
            +
                  "content": "<IMG_CONTEXT>",
         | 
| 93 | 
            +
                  "lstrip": false,
         | 
| 94 | 
            +
                  "normalized": false,
         | 
| 95 | 
            +
                  "rstrip": false,
         | 
| 96 | 
            +
                  "single_word": false,
         | 
| 97 | 
            +
                  "special": true
         | 
| 98 | 
            +
                },
         | 
| 99 | 
            +
                "92547": {
         | 
| 100 | 
            +
                  "content": "<quad>",
         | 
| 101 | 
            +
                  "lstrip": false,
         | 
| 102 | 
            +
                  "normalized": false,
         | 
| 103 | 
            +
                  "rstrip": false,
         | 
| 104 | 
            +
                  "single_word": false,
         | 
| 105 | 
            +
                  "special": true
         | 
| 106 | 
            +
                },
         | 
| 107 | 
            +
                "92548": {
         | 
| 108 | 
            +
                  "content": "</quad>",
         | 
| 109 | 
            +
                  "lstrip": false,
         | 
| 110 | 
            +
                  "normalized": false,
         | 
| 111 | 
            +
                  "rstrip": false,
         | 
| 112 | 
            +
                  "single_word": false,
         | 
| 113 | 
            +
                  "special": true
         | 
| 114 | 
            +
                },
         | 
| 115 | 
            +
                "92549": {
         | 
| 116 | 
            +
                  "content": "<ref>",
         | 
| 117 | 
            +
                  "lstrip": false,
         | 
| 118 | 
            +
                  "normalized": false,
         | 
| 119 | 
            +
                  "rstrip": false,
         | 
| 120 | 
            +
                  "single_word": false,
         | 
| 121 | 
            +
                  "special": true
         | 
| 122 | 
            +
                },
         | 
| 123 | 
            +
                "92550": {
         | 
| 124 | 
            +
                  "content": "</ref>",
         | 
| 125 | 
            +
                  "lstrip": false,
         | 
| 126 | 
            +
                  "normalized": false,
         | 
| 127 | 
            +
                  "rstrip": false,
         | 
| 128 | 
            +
                  "single_word": false,
         | 
| 129 | 
            +
                  "special": true
         | 
| 130 | 
            +
                },
         | 
| 131 | 
            +
                "92551": {
         | 
| 132 | 
            +
                  "content": "<box>",
         | 
| 133 | 
            +
                  "lstrip": false,
         | 
| 134 | 
            +
                  "normalized": false,
         | 
| 135 | 
            +
                  "rstrip": false,
         | 
| 136 | 
            +
                  "single_word": false,
         | 
| 137 | 
            +
                  "special": true
         | 
| 138 | 
            +
                },
         | 
| 139 | 
            +
                "92552": {
         | 
| 140 | 
            +
                  "content": "</box>",
         | 
| 141 | 
            +
                  "lstrip": false,
         | 
| 142 | 
            +
                  "normalized": false,
         | 
| 143 | 
            +
                  "rstrip": false,
         | 
| 144 | 
            +
                  "single_word": false,
         | 
| 145 | 
            +
                  "special": true
         | 
| 146 | 
            +
                }
         | 
| 147 | 
            +
              },
         | 
| 148 | 
            +
              "additional_special_tokens": [
         | 
| 149 | 
            +
                "<|im_start|>",
         | 
| 150 | 
            +
                "<|im_end|>",
         | 
| 151 | 
            +
                "<|action_start|>",
         | 
| 152 | 
            +
                "<|action_end|>",
         | 
| 153 | 
            +
                "<|interpreter|>",
         | 
| 154 | 
            +
                "<|plugin|>",
         | 
| 155 | 
            +
                "<img>",
         | 
| 156 | 
            +
                "</img>",
         | 
| 157 | 
            +
                "<IMG_CONTEXT>",
         | 
| 158 | 
            +
                "<quad>",
         | 
| 159 | 
            +
                "</quad>",
         | 
| 160 | 
            +
                "<ref>",
         | 
| 161 | 
            +
                "</ref>",
         | 
| 162 | 
            +
                "<box>",
         | 
| 163 | 
            +
                "</box>"
         | 
| 164 | 
            +
              ],
         | 
| 165 | 
            +
              "auto_map": {
         | 
| 166 | 
            +
                "AutoTokenizer": [
         | 
| 167 | 
            +
                  "tokenization_internlm2.InternLM2Tokenizer",
         | 
| 168 | 
            +
                  null
         | 
| 169 | 
            +
                ]
         | 
| 170 | 
            +
              },
         | 
| 171 | 
            +
              "bos_token": "<s>",
         | 
| 172 | 
            +
              "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
         | 
| 173 | 
            +
              "clean_up_tokenization_spaces": false,
         | 
| 174 | 
            +
              "eos_token": "</s>",
         | 
| 175 | 
            +
              "model_max_length": 8192,
         | 
| 176 | 
            +
              "pad_token": "</s>",
         | 
| 177 | 
            +
              "tokenizer_class": "InternLM2Tokenizer",
         | 
| 178 | 
            +
              "unk_token": "<unk>"
         | 
| 179 | 
            +
            }
         | 
    	
        training_args.bin
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:0ba603c4a27340cfd9ca068e61012f1ef5c85e9db79e06b08355b0289e5e68ce
         | 
| 3 | 
            +
            size 5179
         |