emanuelaboros committed on
Commit
6f4cd0c
·
1 Parent(s): da75bdd

Let's try to change the pipeline

Browse files
Files changed (2) hide show
  1. modeling_stacked.py +14 -16
  2. test.py +10 -35
modeling_stacked.py CHANGED
@@ -41,22 +41,20 @@ class ExtendedMultitaskModelForTokenClassification(PreTrainedModel):
41
 
42
  def forward(self, input_ids, attention_mask=None, **kwargs):
43
  # Convert input_ids to strings using tokenizer
44
- if input_ids is not None:
45
- tokenizer = kwargs.get("tokenizer")
46
- texts = tokenizer.batch_decode(input_ids, skip_special_tokens=True)
47
- else:
48
- texts = kwargs.get("text", None)
49
-
50
- if texts:
51
- # Floret expects strings, not tensors
52
- predictions = [self.model_floret(text) for text in texts]
53
- # Convert predictions to tensors for Hugging Face compatibility
54
- return torch.tensor(predictions)
55
- else:
56
- # If no text is found, return dummy output
57
- return torch.zeros(
58
- (1, 2)
59
- ) # Dummy tensor with shape (batch_size, num_classes)
60
 
61
  def state_dict(self, *args, **kwargs):
62
  # Return an empty state dictionary
 
41
 
42
  def forward(self, input_ids, attention_mask=None, **kwargs):
43
  # Convert input_ids to strings using tokenizer
44
+ # if input_ids is not None:
45
+ # tokenizer = kwargs.get("tokenizer")
46
+ # texts = tokenizer.batch_decode(input_ids, skip_special_tokens=True)
47
+ # else:
48
+ # texts = kwargs.get("text", None)
49
+ #
50
+ # if texts:
51
+ # # Floret expects strings, not tensors
52
+ # predictions = [self.model_floret(text) for text in texts]
53
+ # # Convert predictions to tensors for Hugging Face compatibility
54
+ # return torch.tensor(predictions)
55
+ # else:
56
+ # If no text is found, return dummy output
57
+ return torch.zeros((1, 2)) # Dummy tensor with shape (batch_size, num_classes)
 
 
58
 
59
  def state_dict(self, *args, **kwargs):
60
  # Return an empty state dictionary
test.py CHANGED
@@ -1,46 +1,21 @@
1
- # Import necessary modules from the transformers library
2
- from transformers import pipeline
3
  from transformers import AutoModelForTokenClassification, AutoTokenizer
 
4
 
5
  # Define the model name to be used for token classification, we use the Impresso NER
6
  # that can be found at "https://huggingface.co/impresso-project/ner-stacked-bert-multilingual"
7
- MODEL_NAME = "impresso-project/ner-stacked-bert-multilingual"
8
 
9
  # Load the tokenizer corresponding to the specified model name
10
  ner_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
11
 
12
- ner_pipeline = pipeline(
13
- "generic-ner",
14
- model=MODEL_NAME,
15
- tokenizer=ner_tokenizer,
16
- trust_remote_code=True,
17
- device="cpu",
18
- )
19
- sentences = [
20
- """In the year 1789, King Louis XVI, ruler of France, convened the Estates-General at the Palace of Versailles,
21
- where Marie Antoinette, the Queen of France, alongside Maximilien Robespierre, a leading member of the National Assembly,
22
- debated with Jean-Jacques Rousseau, the famous philosopher, and Charles de Talleyrand, the Bishop of Autun,
23
- regarding the future of the French monarchy. At the same time, across the Atlantic in Philadelphia,
24
- George Washington, the first President of the United States, and Thomas Jefferson, the nation's Secretary of State,
25
- were drafting policies for the newly established American government following the signing of the Constitution."""
26
- ]
27
-
28
- print(sentences[0])
29
-
30
-
31
- # Helper function to print entities one per row
32
- def print_nicely(entities):
33
- for entity in entities:
34
- print(
35
- f"Entity: {entity['entity']} | Confidence: {entity['score']:.2f}% | Text: {entity['word'].strip()} | Start: {entity['start']} | End: {entity['end']}"
36
- )
37
 
 
38
 
39
- # Visualize stacked entities for each sentence
40
- for sentence in sentences:
41
- results = ner_pipeline(sentence)
42
 
43
- # Extract coarse and fine entities
44
- for key in results.keys():
45
- # Visualize the coarse entities
46
- print_nicely(results[key])
 
1
+ # Import necessary Python modules from the Transformers library
 
2
  from transformers import AutoModelForTokenClassification, AutoTokenizer
3
+ from transformers import pipeline
4
 
5
  # Define the model name to be used for token classification, we use the Impresso NER
6
  # that can be found at "https://huggingface.co/impresso-project/ner-stacked-bert-multilingual"
7
+ MODEL_NAME = "emanuelaboros/ner-stacked-bert-multilingual"
8
 
9
  # Load the tokenizer corresponding to the specified model name
10
  ner_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
11
 
12
+ ner_pipeline = pipeline("lang-detect", model=MODEL_NAME,
13
+ # tokenizer=ner_tokenizer,
14
+ trust_remote_code=True,
15
+ device='cpu')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
+ sentence = "En l'an 1348, au plus fort des ravages de la peste noire à travers l'Europe, le Royaume de France se trouvait à la fois au bord du désespoir et face à une opportunité. À la cour du roi Philippe VI, les murs du Louvre étaient animés par les rapports sombres venus de Paris et des villes environnantes. La peste ne montrait aucun signe de répit, et le chancelier Guillaume de Nogaret, le conseiller le plus fidèle du roi, portait le lourd fardeau de gérer la survie du royaume."
18
 
19
+ entities = ner_pipeline(sentence)
20
+ entities
 
21