emanuelaboros committed
Commit ffbff05 · 1 Parent(s): 075c00d
README.md ADDED
@@ -0,0 +1,66 @@
1
+ ---
2
+ library_name: transformers
3
+ language:
4
+ - en
5
+ - fr
6
+ - de
7
+ tags:
8
+ - v1.0.0
9
+ ---
10
+
11
+ The **Impresso NER model** is based on the stacked Transformer architecture published at [CoNLL 2020](https://aclanthology.org/2020.conll-1.35/) and was trained on the Impresso HIPE-2020 portion of the [HIPE-2022 dataset](https://github.com/hipe-eval/HIPE-2022-data). It recognizes entity types such as person, location, and organization, and supports the complete [HIPE typology](https://github.com/hipe-eval/HIPE-2022-data/blob/main/documentation/README-hipe2020.md), including coarse and fine-grained entity types as well as components such as names, titles, and roles. The model's backbone ([dbmdz/bert-medium-historic-multilingual-cased](https://huggingface.co/dbmdz/bert-medium-historic-multilingual-cased)) was trained on historical collections from Europeana and the British Library covering German, French, English, Finnish, and Swedish, which gives it broader language coverage. Thanks to this multilingual backbone, the model may also recognize entities in languages beyond French and German.
12
+
13
+ #### How to use
14
+
15
+ You can use this model with the Transformers *pipeline* for NER.
16
+
18
+ ```python
19
+ # Import necessary Python modules from the Transformers library
20
+ from transformers import AutoModelForTokenClassification, AutoTokenizer
21
+ from transformers import pipeline
22
+
23
+ # Define the model name for token classification; we use the Impresso NER model,
24
+ # available at https://huggingface.co/impresso-project/ner-stacked-bert-multilingual
25
+ MODEL_NAME = "impresso-project/ner-stacked-bert-multilingual"
26
+
27
+ # Load the tokenizer corresponding to the specified model name
28
+ ner_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
29
+
30
+ ner_pipeline = pipeline("generic-ner", model=MODEL_NAME,
31
+ tokenizer=ner_tokenizer,
32
+ trust_remote_code=True,
33
+ device='cpu')
34
+
35
+ sentence = "En l'an 1348, au plus fort des ravages de la peste noire à travers l'Europe, le Royaume de France se trouvait à la fois au bord du désespoir et face à une opportunité. À la cour du roi Philippe VI, les murs du Louvre étaient animés par les rapports sombres venus de Paris et des villes environnantes. La peste ne montrait aucun signe de répit, et le chancelier Guillaume de Nogaret, le conseiller le plus fidèle du roi, portait le lourd fardeau de gérer la survie du royaume."
36
+
37
+ entities = ner_pipeline(sentence)
38
+ print(entities)
39
+ ```
40
+
41
+ ```
42
+ [
43
+ {'type': 'time', 'confidence_ner': 85.0, 'surface': 'an 1348', 'lOffset': 0, 'rOffset': 12},
44
+ {'type': 'loc', 'confidence_ner': 90.75, 'surface': 'Europe', 'lOffset': 69, 'rOffset': 75},
45
+ {'type': 'loc', 'confidence_ner': 75.45, 'surface': 'Royaume de France', 'lOffset': 80, 'rOffset': 97},
46
+ {'type': 'pers', 'confidence_ner': 85.27, 'surface': 'roi Philippe VI', 'lOffset': 181, 'rOffset': 196, 'title': 'roi', 'name': 'roi Philippe VI'},
47
+ {'type': 'loc', 'confidence_ner': 30.59, 'surface': 'Louvre', 'lOffset': 210, 'rOffset': 216},
48
+ {'type': 'loc', 'confidence_ner': 94.46, 'surface': 'Paris', 'lOffset': 266, 'rOffset': 271},
49
+ {'type': 'pers', 'confidence_ner': 96.1, 'surface': 'chancelier Guillaume de Nogaret', 'lOffset': 350, 'rOffset': 381, 'title': 'chancelier', 'name': 'chancelier Guillaume de Nogaret'},
50
+ {'type': 'loc', 'confidence_ner': 49.35, 'surface': 'Royaume', 'lOffset': 80, 'rOffset': 87},
51
+ {'type': 'loc', 'confidence_ner': 24.18, 'surface': 'France', 'lOffset': 91, 'rOffset': 97}
52
+ ]
53
+ ```
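+ Each prediction is a plain dictionary, so the output can be post-processed with standard Python. Below is a minimal sketch (assuming the list-of-dicts format shown above, with `entities` produced by the snippet higher up) that keeps only higher-confidence predictions and groups their surface forms by coarse type:
+
+ ```python
+ from collections import defaultdict
+
+ # Group surface forms by entity type, dropping low-confidence predictions.
+ def group_entities(entities, min_confidence=50.0):
+     grouped = defaultdict(list)
+     for entity in entities:
+         if entity["confidence_ner"] >= min_confidence:
+             grouped[entity["type"]].append(entity["surface"])
+     return dict(grouped)
+
+ print(group_entities(entities))
+ # e.g. {'time': ['an 1348'], 'loc': ['Europe', 'Royaume de France', 'Paris'], 'pers': [...]}
+ ```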
54
+
55
+
56
+ #### BibTeX entry and citation info
57
+
58
+ ```bibtex
59
+ @inproceedings{boros2020alleviating,
60
+ title={Alleviating digitization errors in named entity recognition for historical documents},
61
+ author={Boros, Emanuela and Hamdi, Ahmed and Pontes, Elvys Linhares and Cabrera-Diego, Luis-Adri{\'a}n and Moreno, Jose G and Sidere, Nicolas and Doucet, Antoine},
62
+ booktitle={Proceedings of the 24th Conference on Computational Natural Language Learning},
63
+ pages={431--441},
64
+ year={2020}
65
+ }
66
+ ```
__init__.py ADDED
File without changes
config.json ADDED
@@ -0,0 +1,233 @@
1
+ {
2
+ "_name_or_path": "experiments_final/model_dbmdz_bert_medium_historic_multilingual_cased_max_sequence_length_512_epochs_5_run_extended_suffix_baseline/checkpoint-450",
3
+ "architectures": [
4
+ "ExtendedMultitaskModelForTokenClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "auto_map": {
8
+ "AutoConfig": "configuration_stacked.ImpressoConfig",
9
+ "AutoModelForTokenClassification": "modeling_stacked.ExtendedMultitaskModelForTokenClassification"
10
+ },
11
+ "classifier_dropout": null,
12
+ "custom_pipelines": {
13
+ "generic-ner": {
14
+ "impl": "generic_ner.MultitaskTokenClassificationPipeline",
15
+ "pt": "AutoModelForTokenClassification"
16
+ }
17
+ },
18
+ "hidden_act": "gelu",
19
+ "hidden_dropout_prob": 0.1,
20
+ "hidden_size": 512,
21
+ "initializer_range": 0.02,
22
+ "intermediate_size": 2048,
23
+ "label_map": {
24
+ "NE-COARSE-LIT": {
25
+ "B-loc": 8,
26
+ "B-org": 0,
27
+ "B-pers": 7,
28
+ "B-prod": 4,
29
+ "B-time": 5,
30
+ "I-loc": 1,
31
+ "I-org": 2,
32
+ "I-pers": 9,
33
+ "I-prod": 10,
34
+ "I-time": 6,
35
+ "O": 3
36
+ },
37
+ "NE-COARSE-METO": {
38
+ "B-loc": 3,
39
+ "B-org": 0,
40
+ "B-time": 5,
41
+ "I-loc": 4,
42
+ "I-org": 2,
43
+ "O": 1
44
+ },
45
+ "NE-FINE-COMP": {
46
+ "B-comp.demonym": 8,
47
+ "B-comp.function": 5,
48
+ "B-comp.name": 1,
49
+ "B-comp.qualifier": 9,
50
+ "B-comp.title": 2,
51
+ "I-comp.demonym": 7,
52
+ "I-comp.function": 3,
53
+ "I-comp.name": 0,
54
+ "I-comp.qualifier": 10,
55
+ "I-comp.title": 4,
56
+ "O": 6
57
+ },
58
+ "NE-FINE-LIT": {
59
+ "B-loc.add.elec": 32,
60
+ "B-loc.add.phys": 5,
61
+ "B-loc.adm.nat": 34,
62
+ "B-loc.adm.reg": 39,
63
+ "B-loc.adm.sup": 12,
64
+ "B-loc.adm.town": 33,
65
+ "B-loc.fac": 36,
66
+ "B-loc.oro": 19,
67
+ "B-loc.phys.geo": 13,
68
+ "B-loc.phys.hydro": 28,
69
+ "B-loc.unk": 4,
70
+ "B-org.adm": 3,
71
+ "B-org.ent": 24,
72
+ "B-org.ent.pressagency": 37,
73
+ "B-pers.coll": 9,
74
+ "B-pers.ind": 0,
75
+ "B-pers.ind.articleauthor": 20,
76
+ "B-prod.doctr": 2,
77
+ "B-prod.media": 10,
78
+ "B-time.date.abs": 23,
79
+ "I-loc.add.elec": 22,
80
+ "I-loc.add.phys": 6,
81
+ "I-loc.adm.nat": 11,
82
+ "I-loc.adm.reg": 35,
83
+ "I-loc.adm.sup": 15,
84
+ "I-loc.adm.town": 8,
85
+ "I-loc.fac": 27,
86
+ "I-loc.oro": 21,
87
+ "I-loc.phys.geo": 25,
88
+ "I-loc.phys.hydro": 17,
89
+ "I-loc.unk": 40,
90
+ "I-org.adm": 29,
91
+ "I-org.ent": 1,
92
+ "I-org.ent.pressagency": 14,
93
+ "I-pers.coll": 26,
94
+ "I-pers.ind": 16,
95
+ "I-pers.ind.articleauthor": 31,
96
+ "I-prod.doctr": 30,
97
+ "I-prod.media": 38,
98
+ "I-time.date.abs": 7,
99
+ "O": 18
100
+ },
101
+ "NE-FINE-METO": {
102
+ "B-loc.adm.town": 6,
103
+ "B-loc.fac": 3,
104
+ "B-loc.oro": 5,
105
+ "B-org.adm": 1,
106
+ "B-org.ent": 7,
107
+ "B-time.date.abs": 9,
108
+ "I-loc.fac": 8,
109
+ "I-org.adm": 2,
110
+ "I-org.ent": 0,
111
+ "O": 4
112
+ },
113
+ "NE-NESTED": {
114
+ "B-loc.adm.nat": 13,
115
+ "B-loc.adm.reg": 15,
116
+ "B-loc.adm.sup": 10,
117
+ "B-loc.adm.town": 9,
118
+ "B-loc.fac": 18,
119
+ "B-loc.oro": 17,
120
+ "B-loc.phys.geo": 11,
121
+ "B-loc.phys.hydro": 1,
122
+ "B-org.adm": 4,
123
+ "B-org.ent": 20,
124
+ "B-pers.coll": 7,
125
+ "B-pers.ind": 2,
126
+ "B-prod.media": 23,
127
+ "I-loc.adm.nat": 8,
128
+ "I-loc.adm.reg": 14,
129
+ "I-loc.adm.town": 6,
130
+ "I-loc.fac": 0,
131
+ "I-loc.oro": 19,
132
+ "I-loc.phys.geo": 21,
133
+ "I-loc.phys.hydro": 22,
134
+ "I-org.adm": 5,
135
+ "I-org.ent": 3,
136
+ "I-pers.ind": 12,
137
+ "I-prod.media": 24,
138
+ "O": 16
139
+ }
140
+ },
141
+ "layer_norm_eps": 1e-12,
142
+ "max_position_embeddings": 512,
143
+ "model_type": "stacked_bert",
144
+ "num_attention_heads": 8,
145
+ "num_hidden_layers": 8,
146
+ "pad_token_id": 0,
147
+ "position_embedding_type": "absolute",
148
+ "pretrained_config": {
149
+ "_name_or_path": "dbmdz/bert-medium-historic-multilingual-cased",
150
+ "add_cross_attention": false,
151
+ "architectures": [
152
+ "BertForMaskedLM"
153
+ ],
154
+ "attention_probs_dropout_prob": 0.1,
155
+ "bad_words_ids": null,
156
+ "begin_suppress_tokens": null,
157
+ "bos_token_id": null,
158
+ "chunk_size_feed_forward": 0,
159
+ "classifier_dropout": null,
160
+ "cross_attention_hidden_size": null,
161
+ "decoder_start_token_id": null,
162
+ "diversity_penalty": 0.0,
163
+ "do_sample": false,
164
+ "early_stopping": false,
165
+ "encoder_no_repeat_ngram_size": 0,
166
+ "eos_token_id": null,
167
+ "exponential_decay_length_penalty": null,
168
+ "finetuning_task": null,
169
+ "forced_bos_token_id": null,
170
+ "forced_eos_token_id": null,
171
+ "hidden_act": "gelu",
172
+ "hidden_dropout_prob": 0.1,
173
+ "hidden_size": 512,
174
+ "id2label": {
175
+ "0": "LABEL_0",
176
+ "1": "LABEL_1"
177
+ },
178
+ "initializer_range": 0.02,
179
+ "intermediate_size": 2048,
180
+ "is_decoder": false,
181
+ "is_encoder_decoder": false,
182
+ "label2id": {
183
+ "LABEL_0": 0,
184
+ "LABEL_1": 1
185
+ },
186
+ "layer_norm_eps": 1e-12,
187
+ "length_penalty": 1.0,
188
+ "max_length": 20,
189
+ "max_position_embeddings": 512,
190
+ "min_length": 0,
191
+ "model_type": "bert",
192
+ "no_repeat_ngram_size": 0,
193
+ "num_attention_heads": 8,
194
+ "num_beam_groups": 1,
195
+ "num_beams": 1,
196
+ "num_hidden_layers": 8,
197
+ "num_return_sequences": 1,
198
+ "output_attentions": false,
199
+ "output_hidden_states": false,
200
+ "output_scores": false,
201
+ "pad_token_id": 0,
202
+ "position_embedding_type": "absolute",
203
+ "prefix": null,
204
+ "problem_type": null,
205
+ "pruned_heads": {},
206
+ "remove_invalid_values": false,
207
+ "repetition_penalty": 1.0,
208
+ "return_dict": true,
209
+ "return_dict_in_generate": false,
210
+ "sep_token_id": null,
211
+ "suppress_tokens": null,
212
+ "task_specific_params": null,
213
+ "temperature": 1.0,
214
+ "tf_legacy_loss": false,
215
+ "tie_encoder_decoder": false,
216
+ "tie_word_embeddings": true,
217
+ "tokenizer_class": null,
218
+ "top_k": 50,
219
+ "top_p": 1.0,
220
+ "torch_dtype": null,
221
+ "torchscript": false,
222
+ "type_vocab_size": 2,
223
+ "typical_p": 1.0,
224
+ "use_bfloat16": false,
225
+ "use_cache": true,
226
+ "vocab_size": 32000
227
+ },
228
+ "torch_dtype": "float32",
229
+ "transformers_version": "4.40.0.dev0",
230
+ "type_vocab_size": 2,
231
+ "use_cache": true,
232
+ "vocab_size": 32000
233
+ }
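The `auto_map` and `custom_pipelines` entries above are what let Transformers load this repository's custom code at runtime. A minimal sketch (assuming the remote-code files ship with the model, as in this commit) of resolving the custom classes through the Auto API:

```python
from transformers import AutoConfig, AutoModelForTokenClassification

MODEL_NAME = "impresso-project/ner-stacked-bert-multilingual"

# auto_map points AutoConfig at configuration_stacked.ImpressoConfig and
# AutoModelForTokenClassification at modeling_stacked.ExtendedMultitaskModelForTokenClassification.
config = AutoConfig.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, trust_remote_code=True)

print(type(config).__name__)  # ImpressoConfig
print(type(model).__name__)   # ExtendedMultitaskModelForTokenClassification
```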
configuration_stacked.py ADDED
@@ -0,0 +1,99 @@
1
+ from transformers import PretrainedConfig
2
+ import torch
3
+
4
+ class ImpressoConfig(PretrainedConfig):
5
+ model_type = "stacked_bert"
6
+
7
+ def __init__(
8
+ self,
9
+ vocab_size=30522,
10
+ hidden_size=768,
11
+ num_hidden_layers=12,
12
+ num_attention_heads=12,
13
+ intermediate_size=3072,
14
+ hidden_act="gelu",
15
+ hidden_dropout_prob=0.1,
16
+ attention_probs_dropout_prob=0.1,
17
+ max_position_embeddings=512,
18
+ type_vocab_size=2,
19
+ initializer_range=0.02,
20
+ layer_norm_eps=1e-12,
21
+ pad_token_id=0,
22
+ position_embedding_type="absolute",
23
+ use_cache=True,
24
+ classifier_dropout=None,
25
+ pretrained_config=None,
26
+ values_override=None,
27
+ label_map=None,
28
+ **kwargs,
29
+ ):
30
+ super().__init__(pad_token_id=pad_token_id, **kwargs)
31
+
32
+ self.vocab_size = vocab_size
33
+ self.hidden_size = hidden_size
34
+ self.num_hidden_layers = num_hidden_layers
35
+ self.num_attention_heads = num_attention_heads
36
+ self.hidden_act = hidden_act
37
+ self.intermediate_size = intermediate_size
38
+ self.hidden_dropout_prob = hidden_dropout_prob
39
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
40
+ self.max_position_embeddings = max_position_embeddings
41
+ self.type_vocab_size = type_vocab_size
42
+ self.initializer_range = initializer_range
43
+ self.layer_norm_eps = layer_norm_eps
44
+ self.position_embedding_type = position_embedding_type
45
+ self.use_cache = use_cache
46
+ self.classifier_dropout = classifier_dropout
47
+ self.pretrained_config = pretrained_config
48
+ self.label_map = label_map
49
+
50
+ self.values_override = values_override or {}
51
+ self.outputs = {
52
+ "logits": {"shape": [None, None, self.hidden_size], "dtype": "float32"}
53
+ }
54
+
55
+ @classmethod
56
+ def is_torch_support_available(cls):
57
+ """
58
+ Indicate whether Torch support is available for this configuration.
59
+ Required for compatibility with certain parts of the Transformers library.
60
+ """
61
+ return True
62
+
63
+ @classmethod
64
+ def patch_ops(cls):
65
+ """
66
+ A no-op hook expected by some Hugging Face utilities that patch operator mappings.
67
+ It is kept for compatibility only; it takes no arguments and returns None.
72
+ """
73
+ return None
74
+
75
+ def generate_dummy_inputs(self, tokenizer, batch_size=1, seq_length=8, framework="pt"):
76
+ """
77
+ Generate dummy inputs for testing or export.
78
+ Args:
79
+ tokenizer: The tokenizer used to tokenize inputs.
80
+ batch_size: Number of input samples in the batch.
81
+ seq_length: Length of each sequence.
82
+ framework: Framework ("pt" for PyTorch, "tf" for TensorFlow).
83
+ Returns:
84
+ Dummy inputs as a dictionary.
85
+ """
86
+ if framework == "pt":
87
+ input_ids = torch.randint(
88
+ low=0,
89
+ high=self.vocab_size,
90
+ size=(batch_size, seq_length),
91
+ dtype=torch.long
92
+ )
93
+ attention_mask = torch.ones((batch_size, seq_length), dtype=torch.long)
94
+ return {"input_ids": input_ids, "attention_mask": attention_mask}
95
+ else:
96
+ raise ValueError("Framework '{}' not supported.".format(framework))
97
+
98
+ # Register the configuration with the transformers library
99
+ ImpressoConfig.register_for_auto_class()
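For reference, a small usage sketch for `generate_dummy_inputs` (hypothetical values; note that the `tokenizer` argument is accepted for interface compatibility, while the input ids themselves are drawn at random from the configured vocabulary):

```python
from transformers import AutoTokenizer

# Assumes this file is importable as `configuration_stacked`, as in this repository.
from configuration_stacked import ImpressoConfig

# Hypothetical example: instantiate the config with the backbone's dimensions.
config = ImpressoConfig(vocab_size=32000, hidden_size=512)
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-medium-historic-multilingual-cased")

dummy = config.generate_dummy_inputs(tokenizer, batch_size=2, seq_length=16)
print(dummy["input_ids"].shape)       # torch.Size([2, 16])
print(dummy["attention_mask"].shape)  # torch.Size([2, 16])
```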
generic_ner.py CHANGED
@@ -4,7 +4,6 @@ import numpy as np
4
  import torch
5
  import nltk
6
 
7
- # new test
8
  nltk.download("averaged_perceptron_tagger")
9
  nltk.download("averaged_perceptron_tagger_eng")
10
  nltk.download("stopwords")
@@ -688,7 +687,6 @@ def remove_trailing_stopwords(entities):
688
  print(f"Remained entities: {len(new_entities)}")
689
  return new_entities
690
 
691
-
692
  class MultitaskTokenClassificationPipeline(Pipeline):
693
 
694
  def _sanitize_parameters(self, **kwargs):
label_map.json ADDED
@@ -0,0 +1 @@
1
+ {"NE-COARSE-LIT": {"B-org": 0, "I-loc": 1, "I-org": 2, "O": 3, "B-prod": 4, "B-time": 5, "I-time": 6, "B-pers": 7, "B-loc": 8, "I-pers": 9, "I-prod": 10}, "NE-COARSE-METO": {"B-org": 0, "O": 1, "I-org": 2, "B-loc": 3, "I-loc": 4, "B-time": 5}, "NE-FINE-LIT": {"B-pers.ind": 0, "I-org.ent": 1, "B-prod.doctr": 2, "B-org.adm": 3, "B-loc.unk": 4, "B-loc.add.phys": 5, "I-loc.add.phys": 6, "I-time.date.abs": 7, "I-loc.adm.town": 8, "B-pers.coll": 9, "B-prod.media": 10, "I-loc.adm.nat": 11, "B-loc.adm.sup": 12, "B-loc.phys.geo": 13, "I-org.ent.pressagency": 14, "I-loc.adm.sup": 15, "I-pers.ind": 16, "I-loc.phys.hydro": 17, "O": 18, "B-loc.oro": 19, "B-pers.ind.articleauthor": 20, "I-loc.oro": 21, "I-loc.add.elec": 22, "B-time.date.abs": 23, "B-org.ent": 24, "I-loc.phys.geo": 25, "I-pers.coll": 26, "I-loc.fac": 27, "B-loc.phys.hydro": 28, "I-org.adm": 29, "I-prod.doctr": 30, "I-pers.ind.articleauthor": 31, "B-loc.add.elec": 32, "B-loc.adm.town": 33, "B-loc.adm.nat": 34, "I-loc.adm.reg": 35, "B-loc.fac": 36, "B-org.ent.pressagency": 37, "I-prod.media": 38, "B-loc.adm.reg": 39, "I-loc.unk": 40}, "NE-FINE-METO": {"I-org.ent": 0, "B-org.adm": 1, "I-org.adm": 2, "B-loc.fac": 3, "O": 4, "B-loc.oro": 5, "B-loc.adm.town": 6, "B-org.ent": 7, "I-loc.fac": 8, "B-time.date.abs": 9}, "NE-FINE-COMP": {"I-comp.name": 0, "B-comp.name": 1, "B-comp.title": 2, "I-comp.function": 3, "I-comp.title": 4, "B-comp.function": 5, "O": 6, "I-comp.demonym": 7, "B-comp.demonym": 8, "B-comp.qualifier": 9, "I-comp.qualifier": 10}, "NE-NESTED": {"I-loc.fac": 0, "B-loc.phys.hydro": 1, "B-pers.ind": 2, "I-org.ent": 3, "B-org.adm": 4, "I-org.adm": 5, "I-loc.adm.town": 6, "B-pers.coll": 7, "I-loc.adm.nat": 8, "B-loc.adm.town": 9, "B-loc.adm.sup": 10, "B-loc.phys.geo": 11, "I-pers.ind": 12, "B-loc.adm.nat": 13, "I-loc.adm.reg": 14, "B-loc.adm.reg": 15, "O": 16, "B-loc.oro": 17, "B-loc.fac": 18, "I-loc.oro": 19, "B-org.ent": 20, "I-loc.phys.geo": 21, "I-loc.phys.hydro": 22, "B-prod.media": 23, "I-prod.media": 24}}
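The label map is keyed task → label → id. To decode per-task predictions from the model you typically need the inverse mapping; a minimal sketch (assuming the file is read from disk as `label_map.json`):

```python
import json

with open("label_map.json") as f:
    label_map = json.load(f)

# Invert each task's label->id mapping into id->label for decoding logits.
id2label = {
    task: {idx: label for label, idx in labels.items()}
    for task, labels in label_map.items()
}

print(id2label["NE-COARSE-LIT"][8])  # B-loc
print(id2label["NE-FINE-COMP"][2])   # B-comp.title
```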
model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03a807b124debff782406c816eacb7ced1f2e25b9a5198b27e1616a41faa0662
3
+ size 193971960
modeling_stacked.py ADDED
@@ -0,0 +1,136 @@
1
+ from transformers.modeling_outputs import TokenClassifierOutput
2
+ import torch
3
+ import torch.nn as nn
4
+ from transformers import PreTrainedModel, AutoModel, AutoConfig, BertConfig
5
+ from torch.nn import CrossEntropyLoss
6
+ from typing import Optional, Tuple, Union
7
+ import logging
8
+
9
+ from .configuration_stacked import ImpressoConfig
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
+ def get_info(label_map):
15
+ num_token_labels_dict = {task: len(labels) for task, labels in label_map.items()}
16
+ return num_token_labels_dict
17
+
18
+
19
+ class ExtendedMultitaskModelForTokenClassification(PreTrainedModel):
20
+
21
+ config_class = ImpressoConfig
22
+ _keys_to_ignore_on_load_missing = [r"position_ids"]
23
+
24
+ def __init__(self, config):
25
+ super().__init__(config)
26
+ self.num_token_labels_dict = get_info(config.label_map)
27
+ self.config = config
28
+
29
+ self.bert = AutoModel.from_pretrained(
30
+ config.pretrained_config["_name_or_path"], config=config.pretrained_config
31
+ )
32
+ if "classifier_dropout" not in config.__dict__:
33
+ classifier_dropout = 0.1
34
+ else:
35
+ classifier_dropout = (
36
+ config.classifier_dropout
37
+ if config.classifier_dropout is not None
38
+ else config.hidden_dropout_prob
39
+ )
40
+ self.dropout = nn.Dropout(classifier_dropout)
41
+
42
+ # Additional transformer layers
43
+ self.transformer_encoder = nn.TransformerEncoder(
44
+ nn.TransformerEncoderLayer(
45
+ d_model=config.hidden_size, nhead=config.num_attention_heads
46
+ ),
47
+ num_layers=2,
48
+ )
49
+
50
+ # For token classification, create a classifier for each task
51
+ self.token_classifiers = nn.ModuleDict(
52
+ {
53
+ task: nn.Linear(config.hidden_size, num_labels)
54
+ for task, num_labels in self.num_token_labels_dict.items()
55
+ }
56
+ )
57
+
58
+ # Initialize weights and apply final processing
59
+ self.post_init()
60
+
61
+ def forward(
62
+ self,
63
+ input_ids: Optional[torch.Tensor] = None,
64
+ attention_mask: Optional[torch.Tensor] = None,
65
+ token_type_ids: Optional[torch.Tensor] = None,
66
+ position_ids: Optional[torch.Tensor] = None,
67
+ head_mask: Optional[torch.Tensor] = None,
68
+ inputs_embeds: Optional[torch.Tensor] = None,
69
+ labels: Optional[torch.Tensor] = None,
70
+ token_labels: Optional[dict] = None,
71
+ output_attentions: Optional[bool] = None,
72
+ output_hidden_states: Optional[bool] = None,
73
+ return_dict: Optional[bool] = None,
74
+ ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
75
+ r"""
76
+ token_labels (`dict` of `torch.LongTensor` of shape `(batch_size, seq_length)`, *optional*):
77
+ Labels for computing the token classification loss. Keys should match the tasks.
78
+ """
79
+ return_dict = (
80
+ return_dict if return_dict is not None else self.config.use_return_dict
81
+ )
82
+
83
+ bert_kwargs = {
84
+ "input_ids": input_ids,
85
+ "attention_mask": attention_mask,
86
+ "token_type_ids": token_type_ids,
87
+ "position_ids": position_ids,
88
+ "head_mask": head_mask,
89
+ "inputs_embeds": inputs_embeds,
90
+ "output_attentions": output_attentions,
91
+ "output_hidden_states": output_hidden_states,
92
+ "return_dict": return_dict,
93
+ }
94
+
95
+ if any(
96
+ keyword in self.config.name_or_path.lower()
97
+ for keyword in ["llama", "deberta"]
98
+ ):
99
+ bert_kwargs.pop("token_type_ids")
100
+ bert_kwargs.pop("head_mask")
101
+
102
+ outputs = self.bert(**bert_kwargs)
103
+
104
+ # For token classification
105
+ token_output = outputs[0]
106
+ token_output = self.dropout(token_output)
107
+
108
+ # Pass through additional transformer layers
109
+ token_output = self.transformer_encoder(token_output.transpose(0, 1)).transpose(
110
+ 0, 1
111
+ )
112
+
113
+ # Collect the logits and compute the loss for each task
114
+ task_logits = {}
115
+ total_loss = 0
116
+ for task, classifier in self.token_classifiers.items():
117
+ logits = classifier(token_output)
118
+ task_logits[task] = logits
119
+ if token_labels and task in token_labels:
120
+ loss_fct = CrossEntropyLoss()
121
+ loss = loss_fct(
122
+ logits.view(-1, self.num_token_labels_dict[task]),
123
+ token_labels[task].view(-1),
124
+ )
125
+ total_loss += loss
126
+
127
+ if not return_dict:
128
+ output = (task_logits,) + outputs[2:]
129
+ return ((total_loss,) + output) if total_loss != 0 else output
130
+
131
+ return TokenClassifierOutput(
132
+ loss=total_loss,
133
+ logits=task_logits,
134
+ hidden_states=outputs.hidden_states,
135
+ attentions=outputs.attentions,
136
+ )
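Unlike a standard token-classification head, `forward` returns `logits` as a dictionary keyed by task rather than a single tensor. A minimal decoding sketch (assuming `model` and `tokenizer` were loaded with `AutoModelForTokenClassification` / `AutoTokenizer` and `trust_remote_code=True`, and ignoring subword-to-word alignment, which the packaged pipeline handles):

```python
import torch

text = "Le Royaume de France"
inputs = tokenizer(text, return_tensors="pt")

model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# outputs.logits maps each task to a tensor of shape (batch, seq_len, num_labels_for_task).
for task, logits in outputs.logits.items():
    pred_ids = logits.argmax(dim=-1)[0].tolist()
    id2label = {idx: label for label, idx in model.config.label_map[task].items()}
    print(task, [id2label[idx] for idx in pred_ids])
```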
push_to_hf.py ADDED
@@ -0,0 +1,145 @@
1
+ import os
2
+ import shutil
3
+ import argparse
4
+ from transformers import (
5
+ AutoTokenizer,
6
+ AutoConfig,
7
+ AutoModelForTokenClassification,
8
+ BertConfig,
9
+ )
10
+ from huggingface_hub import HfApi, Repository
11
+
13
+ from .configuration_stacked import ImpressoConfig
14
+ from .modeling_stacked import ExtendedMultitaskModelForTokenClassification
15
+ import subprocess
16
+
17
+
18
+ def get_latest_checkpoint(checkpoint_dir):
19
+ checkpoints = [
20
+ d
21
+ for d in os.listdir(checkpoint_dir)
22
+ if os.path.isdir(os.path.join(checkpoint_dir, d))
23
+ and d.startswith("checkpoint-")
24
+ ]
25
+ checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[-1]), reverse=True)
26
+ return os.path.join(checkpoint_dir, checkpoints[0])
27
+
28
+
29
+ def get_info(label_map):
30
+ num_token_labels_dict = {task: len(labels) for task, labels in label_map.items()}
31
+ return num_token_labels_dict
32
+
33
+
34
+ def push_model_to_hub(checkpoint_dir, repo_name, script_path):
35
+ checkpoint_path = get_latest_checkpoint(checkpoint_dir)
36
+ config = ImpressoConfig.from_pretrained(checkpoint_path)
37
+ config.pretrained_config = AutoConfig.from_pretrained(config.name_or_path)
38
+ config.save_pretrained("stacked_bert")
39
+ config = ImpressoConfig.from_pretrained("stacked_bert")
40
+
41
+ model = ExtendedMultitaskModelForTokenClassification.from_pretrained(
42
+ checkpoint_path, config=config
43
+ )
44
+ tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
45
+ local_repo_path = "./repo"
46
+ repo_url = HfApi().create_repo(repo_id=repo_name, exist_ok=True)
47
+ repo = Repository(local_dir=local_repo_path, clone_from=repo_url)
48
+
49
+ try:
50
+ # Try to pull the latest changes from the remote repository using subprocess
51
+ subprocess.run(["git", "pull"], check=True, cwd=local_repo_path)
52
+ except subprocess.CalledProcessError as e:
53
+ # If fast-forward is not possible, reset the local branch to match the remote branch
54
+ subprocess.run(
55
+ ["git", "reset", "--hard", "origin/main"],
56
+ check=True,
57
+ cwd=local_repo_path,
58
+ )
59
+
60
+ # Copy all Python files to the local repository directory
61
+ current_dir = os.path.dirname(os.path.abspath(__file__))
62
+ for filename in os.listdir(current_dir):
63
+ if filename.endswith(".py") or filename.endswith(".json"):
64
+ shutil.copy(
65
+ os.path.join(current_dir, filename),
66
+ os.path.join(local_repo_path, filename),
67
+ )
68
+
69
+ ImpressoConfig.register_for_auto_class()
70
+ AutoConfig.register("stacked_bert", ImpressoConfig)
71
+ AutoModelForTokenClassification.register(
72
+ ImpressoConfig, ExtendedMultitaskModelForTokenClassification
73
+ )
74
+ ExtendedMultitaskModelForTokenClassification.register_for_auto_class(
75
+ "AutoModelForTokenClassification"
76
+ )
77
+
78
+ model.save_pretrained(local_repo_path)
79
+ tokenizer.save_pretrained(local_repo_path)
80
+
81
+ # Add, commit and push the changes to the repository
82
+ subprocess.run(["git", "add", "."], check=True, cwd=local_repo_path)
83
+ subprocess.run(
84
+ ["git", "commit", "-m", "Initial commit including model and configuration"],
85
+ check=True,
86
+ cwd=local_repo_path,
87
+ )
88
+ subprocess.run(["git", "push"], check=True, cwd=local_repo_path)
89
+
90
+ # Push the model to the hub (this includes the README template)
91
+ model.push_to_hub(repo_name)
92
+ tokenizer.push_to_hub(repo_name)
93
+
94
+ print(f"Model and repo pushed to: {repo_url}")
95
+
96
+
97
+ if __name__ == "__main__":
98
+ parser = argparse.ArgumentParser(description="Push NER model to Hugging Face Hub")
99
+ parser.add_argument(
100
+ "--model_type",
101
+ type=str,
102
+ required=True,
103
+ help="Type of the model (e.g., stacked-bert)",
104
+ )
105
+ parser.add_argument(
106
+ "--language",
107
+ type=str,
108
+ required=True,
109
+ help="Language of the model (e.g., multilingual)",
110
+ )
111
+ parser.add_argument(
112
+ "--checkpoint_dir",
113
+ type=str,
114
+ required=True,
115
+ help="Directory containing checkpoint folders",
116
+ )
117
+ parser.add_argument(
118
+ "--script_path", type=str, required=True, help="Path to the models.py script"
119
+ )
120
+ args = parser.parse_args()
121
+ repo_name = f"impresso-project/ner-{args.model_type}-{args.language}"
122
+ push_model_to_hub(args.checkpoint_dir, repo_name, args.script_path)
123
+ # PIPELINE_REGISTRY.register_pipeline(
124
+ # "generic-ner",
125
+ # pipeline_class=MultitaskTokenClassificationPipeline,
126
+ # pt_model=ExtendedMultitaskModelForTokenClassification,
127
+ # )
128
+ # model.config.custom_pipelines = {
129
+ # "generic-ner": {
130
+ # "impl": "generic_ner.MultitaskTokenClassificationPipeline",
131
+ # "pt": ["ExtendedMultitaskModelForTokenClassification"],
132
+ # "tf": [],
133
+ # }
134
+ # }
135
+ # classifier = pipeline(
136
+ # "generic-ner", model=model, tokenizer=tokenizer, label_map=label_map
137
+ # )
138
+ # from pprint import pprint
139
+ #
140
+ # pprint(
141
+ # classifier(
142
+ # "1. Le public est averti que Charlotte née Bourgoin, femme-de Joseph Digiez, et Maurice Bourgoin, enfant mineur représenté par le sieur Jaques Charles Gicot son curateur, ont été admis par arrêt du Conseil d'Etat du 5 décembre 1797, à solliciter une renonciation générale et absolue aux biens et aux dettes présentes et futures de Jean-Baptiste Bourgoin leur père."
143
+ # )
144
+ # )
145
+ # repo.push_to_hub(commit_message="Initial commit of the trained NER model with code")
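For reference, the script expects four arguments (`--model_type`, `--language`, `--checkpoint_dir`, `--script_path`) and assembles the target repository as `impresso-project/ner-{model_type}-{language}`; with the example values from the argument help, `--model_type stacked-bert --language multilingual`, this resolves to `impresso-project/ner-stacked-bert-multilingual`. Note that the relative imports at the top mean the script has to be run as a module from within its package.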
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
1
+ {
2
+ "cls_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "mask_token": {
10
+ "content": "[MASK]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "sep_token": {
24
+ "content": "[SEP]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "unk_token": {
31
+ "content": "[UNK]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
test.py ADDED
@@ -0,0 +1,46 @@
1
+ # Import necessary modules from the transformers library
2
+ from transformers import pipeline
3
+ from transformers import AutoModelForTokenClassification, AutoTokenizer
4
+
5
+ # Define the model name for token classification; we use the Impresso NER model,
6
+ # available at https://huggingface.co/impresso-project/ner-stacked-bert-multilingual
7
+ MODEL_NAME = "impresso-project/ner-stacked-bert-multilingual"
8
+
9
+ # Load the tokenizer corresponding to the specified model name
10
+ ner_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
11
+
12
+ ner_pipeline = pipeline(
13
+ "generic-ner",
14
+ model=MODEL_NAME,
15
+ tokenizer=ner_tokenizer,
16
+ trust_remote_code=True,
17
+ device="cpu",
18
+ )
19
+ sentences = [
20
+ """In the year 1789, King Louis XVI, ruler of France, convened the Estates-General at the Palace of Versailles,
21
+ where Marie Antoinette, the Queen of France, alongside Maximilien Robespierre, a leading member of the National Assembly,
22
+ debated with Jean-Jacques Rousseau, the famous philosopher, and Charles de Talleyrand, the Bishop of Autun,
23
+ regarding the future of the French monarchy. At the same time, across the Atlantic in Philadelphia,
24
+ George Washington, the first President of the United States, and Thomas Jefferson, the nation's Secretary of State,
25
+ were drafting policies for the newly established American government following the signing of the Constitution."""
26
+ ]
27
+
28
+ print(sentences[0])
29
+
30
+
31
+ # Helper function to print entities one per row
32
+ def print_nicely(entities):
33
+ for entity in entities:
34
+ print(
35
+ f"Entity: {entity['entity']} | Confidence: {entity['score']:.2f}% | Text: {entity['word'].strip()} | Start: {entity['start']} | End: {entity['end']}"
36
+ )
37
+
38
+
39
+ # Visualize stacked entities for each sentence
40
+ for sentence in sentences:
41
+ results = ner_pipeline(sentence)
42
+
43
+ # Extract coarse and fine entities
44
+ for key in results.keys():
45
+ # Visualize the coarse entities
46
+ print_nicely(results[key])
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_basic_tokenize": true,
47
+ "do_lower_case": false,
48
+ "mask_token": "[MASK]",
49
+ "max_len": 512,
50
+ "model_max_length": 512,
51
+ "never_split": null,
52
+ "pad_token": "[PAD]",
53
+ "sep_token": "[SEP]",
54
+ "strip_accents": false,
55
+ "tokenize_chinese_chars": true,
56
+ "tokenizer_class": "BertTokenizer",
57
+ "unk_token": "[UNK]"
58
+ }
vocab.txt ADDED
The diff for this file is too large to render. See raw diff