feat(model): update model parameters
- README.md +8 -0
- config.json +49 -0
- configuration_bert.py +5 -0
- modeling_bert.py +97 -0
- pytorch_model.bin +3 -0
- similar.py +83 -0
- tokenizer_config.json +6 -0
- vocab.txt +0 -0
README.md
CHANGED
@@ -1,3 +1,11 @@
 ---
+language:
+- zh
 license: apache-2.0
+tags:
+- bert
+- similar
+pipeline_tag: other
 ---
+
+### BERT Chinese Similarity Calculation
config.json
ADDED
@@ -0,0 +1,49 @@
{
  "_name_or_path": "minskiter/cossim-bert-chinese-wwm-ext",
  "architectures": [
    "CosSimBertModel"
  ],
  "tokenizer_class": "BertTokenizer",
  "custom_pipelines": {
    "sentences_sim": {
      "impl": "minskiter/cossim-bert-chinese-wwm-ext--similar.SimilarPipeline",
      "pt": "AutoModel",
      "tf": []
    },
    "textencode": {
      "impl": "minskiter/cossim-bert-chinese-wwm-ext--similar.EncodePipeline",
      "pt": "AutoModel",
      "tf": []
    }
  },
  "auto_map": {
    "AutoModel": "minskiter/cossim-bert-chinese-wwm-ext--modeling_bert.CosSimBertModel",
    "AutoConfig": "minskiter/cossim-bert-chinese-wwm-ext--configuration_bert.SimBertConfig"
  },
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "simbert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.30.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21128
}
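The "auto_map" and "custom_pipelines" blocks above are what let the stock Auto* loaders resolve to this repo's own classes when remote code is trusted. A hedged sketch of that loading path (not part of this commit):

# Hedged sketch, not part of this commit: with trust_remote_code=True, the auto_map
# entries above make the generic Auto classes resolve to the repo's custom classes.
from transformers import AutoConfig, AutoModel, AutoTokenizer

repo_id = "minskiter/cossim-bert-chinese-wwm-ext"

config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)   # -> SimBertConfig
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)     # -> CosSimBertModel
tokenizer = AutoTokenizer.from_pretrained(repo_id)                     # BertTokenizer per tokenizer_class

print(type(config).__name__, type(model).__name__)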
configuration_bert.py
ADDED
@@ -0,0 +1,5 @@
from transformers import BertConfig

class SimBertConfig(BertConfig):

    model_type = "simbert"
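For purely local use (a hedged sketch, not included in this commit), the custom config type could also be registered with the Auto factories directly; on the Hub the auto_map entry in config.json performs this resolution instead:

# Hedged sketch, not part of this commit: registering the custom config type locally.
from transformers import AutoConfig
from configuration_bert import SimBertConfig   # assumes configuration_bert.py is on the import path

AutoConfig.register("simbert", SimBertConfig)  # key must match SimBertConfig.model_type
cfg = AutoConfig.for_model("simbert")          # now resolves to SimBertConfig
print(type(cfg).__name__)                      # SimBertConfig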
modeling_bert.py
ADDED
@@ -0,0 +1,97 @@
from transformers import PretrainedConfig, PreTrainedModel, BertModel, BertConfig
from .configuration_bert import SimBertConfig
from torch import nn

class SimBertModel(PreTrainedModel):
    """ SimBert Model
    """

    config_class = SimBertConfig

    def __init__(
        self,
        config: PretrainedConfig
    ) -> None:
        super().__init__(config)
        self.bert = BertModel(config=config, add_pooling_layer=True)
        self.fc = nn.Linear(config.hidden_size, 2)
        # self.loss_fct = nn.CrossEntropyLoss()
        self.loss_fct = nn.MSELoss()
        self.softmax = nn.Softmax(dim=1)

    def forward(
        self,
        input_ids,
        token_type_ids,
        attention_mask,
        labels=None
    ):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        pooled_output = outputs.pooler_output
        logits = self.fc(pooled_output)
        logits = self.softmax(logits)[:,1]
        if labels is not None:
            loss = self.loss_fct(logits.view(-1), labels.view(-1))
            return loss, logits
        return None, logits

class CosSimBertModel(PreTrainedModel):
    """ CosSimBert Model
    """

    config_class = SimBertConfig

    def __init__(
        self,
        config: PretrainedConfig
    ) -> None:
        super().__init__(config)
        self.bert = BertModel(config=config, add_pooling_layer=True)
        self.loss_fct = nn.MSELoss()
        self.softmax = nn.Softmax(dim=1)

    def forward(
        self,
        input_ids,
        token_type_ids,
        attention_mask,
        labels=None
    ):
        seq_length = input_ids.size(-1)
        a = {
            "input_ids": input_ids[:,:seq_length//2],
            "token_type_ids": token_type_ids[:,:seq_length//2],
            "attention_mask": attention_mask[:,:seq_length//2]
        }
        b = {
            "input_ids": input_ids[:,seq_length//2:],
            "token_type_ids": token_type_ids[:,seq_length//2:],
            "attention_mask": attention_mask[:,seq_length//2:]
        }
        outputs_a = self.bert(**a)
        outputs_b = self.bert(**b)
        pooled_a_output = outputs_a.pooler_output
        pooled_b_output = outputs_b.pooler_output
        logits = nn.functional.cosine_similarity(pooled_a_output, pooled_b_output)
        if labels is not None:
            loss = self.loss_fct(logits.view(-1), labels.view(-1))
            return loss, logits
        return None, logits

    def encode(
        self,
        input_ids,
        token_type_ids,
        attention_mask,
    ):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        pooled_output = outputs.pooler_output
        return pooled_output
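CosSimBertModel.forward expects the two sentences of a pair packed into one tensor along the sequence axis: it splits that axis in half, runs each half through the shared BERT encoder, and returns the cosine similarity of the two pooled vectors, while encode embeds a single sentence. A minimal usage sketch, not part of this commit, assuming the checkpoint loads as CosSimBertModel via AutoModel with trust_remote_code=True and both sentences are padded to the same fixed length:

# Hedged usage sketch, not part of this commit.
import torch
from transformers import AutoModel, AutoTokenizer

repo_id = "minskiter/cossim-bert-chinese-wwm-ext"
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)

# Example pair: "The weather is nice today" / "The weather is pretty good today".
enc_a = tokenizer("今天天气很好", max_length=128, padding="max_length", truncation=True, return_tensors="pt")
enc_b = tokenizer("今天天气不错", max_length=128, padding="max_length", truncation=True, return_tensors="pt")

# forward() splits the sequence axis in half, so sentence A occupies the first
# 128 positions and sentence B the last 128.
batch = {k: torch.cat((enc_a[k], enc_b[k]), dim=-1) for k in enc_a}

with torch.no_grad():
    _, similarity = model(**batch)   # loss slot is None when no labels are passed
print(similarity)                    # one cosine score per pair in the batch

# encode() embeds one (padded) sentence into the pooled vector.
with torch.no_grad():
    vec = model.encode(**enc_a)
print(vec.shape)                     # torch.Size([1, 768]) for hidden_size=768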
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a89f77c91c7e9b1bdce180a136bc257fedaf753168ffffb47be07736b01ab80d
size 409142765
similar.py
ADDED
@@ -0,0 +1,83 @@
from typing import Any, Dict, Tuple
from transformers import Pipeline
from transformers.pipelines.base import GenericTensor
from transformers.utils import ModelOutput
from typing import Union, List
import torch

class EncodePipeline(Pipeline):
    def __init__(self, max_length=256, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.max_length = max_length

    def _sanitize_parameters(self, **pipeline_parameters):
        return {}, {}, {}

    def preprocess(self, input: Union[Tuple[str], List[Tuple[str]]], **preprocess_parameters: Dict) -> Dict[str, GenericTensor]:
        tensors = self.tokenizer(
            input,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return tensors

    def _forward(self, input_tensors: Dict[str, GenericTensor], **forward_parameters: Dict) -> ModelOutput:
        logits = self.model.encode(**input_tensors)
        return logits.tolist()

    def postprocess(
        self,
        model_outputs: ModelOutput,
        **postprocess_parameters: Dict
    ) -> Any:
        return model_outputs


class SimilarPipeline(Pipeline):
    def __init__(self, max_length=256, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.max_length = max_length

    def _sanitize_parameters(self, **pipeline_parameters):
        return {}, {}, {}

    def preprocess(self, input: Union[Tuple[str], List[Tuple[str]]], **preprocess_parameters: Dict) -> Dict[str, GenericTensor]:
        if isinstance(input, list):
            a = list(map(lambda x: x[0], input))
            b = list(map(lambda x: x[1], input))
        else:
            a = input[0]
            b = input[1]
        tensors = self.tokenizer(
            a,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        tensors_b = self.tokenizer(
            b,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        for key in tensors:
            tensors[key] = torch.cat((tensors[key], tensors_b[key]), dim=0)
        return tensors

    def _forward(self, input_tensors: Dict[str, GenericTensor], **forward_parameters: Dict) -> ModelOutput:
        _, logits = self.model(**input_tensors)
        logits_a = logits[:logits.size(0)//2]
        logits_b = logits[logits.size(0)//2:]
        logits = torch.nn.functional.cosine_similarity(logits_a, logits_b)
        return logits.tolist()

    def postprocess(
        self,
        model_outputs: ModelOutput,
        **postprocess_parameters: Dict
    ) -> Any:
        return model_outputs
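These two classes are the implementations behind the "textencode" and "sentences_sim" tasks registered in config.json's custom_pipelines block, so they would normally be addressed through transformers.pipeline with trust_remote_code=True rather than instantiated by hand. A hedged sketch of that invocation pattern (not part of this commit; task names come from config.json):

# Hedged usage sketch, not part of this commit.
from transformers import pipeline

repo_id = "minskiter/cossim-bert-chinese-wwm-ext"

# EncodePipeline is registered as the "textencode" task: it tokenizes one sentence
# and returns the pooled sentence embedding as a nested Python list.
encoder = pipeline("textencode", model=repo_id, trust_remote_code=True)
embedding = encoder("今天天气很好")   # "The weather is nice today"
print(len(embedding[0]))              # 768 for this checkpoint's hidden size

# SimilarPipeline is registered the same way as "sentences_sim" and is written to
# take a (sentence_a, sentence_b) tuple, or a list of such tuples, e.g.:
#   scorer = pipeline("sentences_sim", model=repo_id, trust_remote_code=True)
#   scorer(("今天天气很好", "今天天气不错"))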
tokenizer_config.json
ADDED
@@ -0,0 +1,6 @@
{
  "padding": "max_length",
  "max_length": 512,
  "name_or_path": "hfl/chinese-bert-wwm-ext",
  "tokenizer_class": "BertTokenizer"
}
vocab.txt
ADDED
The diff for this file is too large to render.