emanuelaboros committed on
Commit
6f4cd0c
·
1 Parent(s): da75bdd

Let's try to change the pipeline

Browse files
Files changed (2) hide show
  1. modeling_stacked.py +14 -16
  2. test.py +10 -35
modeling_stacked.py CHANGED
@@ -41,22 +41,20 @@ class ExtendedMultitaskModelForTokenClassification(PreTrainedModel):
41
 
42
  def forward(self, input_ids, attention_mask=None, **kwargs):
43
  # Convert input_ids to strings using tokenizer
44
- if input_ids is not None:
45
- tokenizer = kwargs.get("tokenizer")
46
- texts = tokenizer.batch_decode(input_ids, skip_special_tokens=True)
47
- else:
48
- texts = kwargs.get("text", None)
49
-
50
- if texts:
51
- # Floret expects strings, not tensors
52
- predictions = [self.model_floret(text) for text in texts]
53
- # Convert predictions to tensors for Hugging Face compatibility
54
- return torch.tensor(predictions)
55
- else:
56
- # If no text is found, return dummy output
57
- return torch.zeros(
58
- (1, 2)
59
- ) # Dummy tensor with shape (batch_size, num_classes)
60
 
61
  def state_dict(self, *args, **kwargs):
62
  # Return an empty state dictionary
 
41
 
42
  def forward(self, input_ids, attention_mask=None, **kwargs):
43
  # Convert input_ids to strings using tokenizer
44
+ # if input_ids is not None:
45
+ # tokenizer = kwargs.get("tokenizer")
46
+ # texts = tokenizer.batch_decode(input_ids, skip_special_tokens=True)
47
+ # else:
48
+ # texts = kwargs.get("text", None)
49
+ #
50
+ # if texts:
51
+ # # Floret expects strings, not tensors
52
+ # predictions = [self.model_floret(text) for text in texts]
53
+ # # Convert predictions to tensors for Hugging Face compatibility
54
+ # return torch.tensor(predictions)
55
+ # else:
56
+ # If no text is found, return dummy output
57
+ return torch.zeros((1, 2)) # Dummy tensor with shape (batch_size, num_classes)
 
 
58
 
59
  def state_dict(self, *args, **kwargs):
60
  # Return an empty state dictionary
test.py CHANGED
@@ -1,46 +1,21 @@
1
- # Import necessary modules from the transformers library
2
- from transformers import pipeline
3
  from transformers import AutoModelForTokenClassification, AutoTokenizer
 
4
 
5
  # Define the model name to be used for token classification, we use the Impresso NER
6
  # that can be found at "https://huggingface.co/impresso-project/ner-stacked-bert-multilingual"
7
- MODEL_NAME = "impresso-project/ner-stacked-bert-multilingual"
8
 
9
  # Load the tokenizer corresponding to the specified model name
10
  ner_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
11
 
12
- ner_pipeline = pipeline(
13
- "generic-ner",
14
- model=MODEL_NAME,
15
- tokenizer=ner_tokenizer,
16
- trust_remote_code=True,
17
- device="cpu",
18
- )
19
- sentences = [
20
- """In the year 1789, King Louis XVI, ruler of France, convened the Estates-General at the Palace of Versailles,
21
- where Marie Antoinette, the Queen of France, alongside Maximilien Robespierre, a leading member of the National Assembly,
22
- debated with Jean-Jacques Rousseau, the famous philosopher, and Charles de Talleyrand, the Bishop of Autun,
23
- regarding the future of the French monarchy. At the same time, across the Atlantic in Philadelphia,
24
- George Washington, the first President of the United States, and Thomas Jefferson, the nation's Secretary of State,
25
- were drafting policies for the newly established American government following the signing of the Constitution."""
26
- ]
27
-
28
- print(sentences[0])
29
-
30
-
31
- # Helper function to print entities one per row
32
- def print_nicely(entities):
33
- for entity in entities:
34
- print(
35
- f"Entity: {entity['entity']} | Confidence: {entity['score']:.2f}% | Text: {entity['word'].strip()} | Start: {entity['start']} | End: {entity['end']}"
36
- )
37
 
 
38
 
39
- # Visualize stacked entities for each sentence
40
- for sentence in sentences:
41
- results = ner_pipeline(sentence)
42
 
43
- # Extract coarse and fine entities
44
- for key in results.keys():
45
- # Visualize the coarse entities
46
- print_nicely(results[key])
 
1
+ # Import necessary Python modules from the Transformers library
 
2
  from transformers import AutoModelForTokenClassification, AutoTokenizer
3
+ from transformers import pipeline
4
 
5
  # Define the model name to be used for token classification, we use the Impresso NER
6
  # that can be found at "https://huggingface.co/impresso-project/ner-stacked-bert-multilingual"
7
+ MODEL_NAME = "emanuelaboros/ner-stacked-bert-multilingual"
8
 
9
  # Load the tokenizer corresponding to the specified model name
10
  ner_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
11
 
12
+ ner_pipeline = pipeline("lang-detect", model=MODEL_NAME,
13
+ # tokenizer=ner_tokenizer,
14
+ trust_remote_code=True,
15
+ device='cpu')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
+ sentence = "En l'an 1348, au plus fort des ravages de la peste noire à travers l'Europe, le Royaume de France se trouvait à la fois au bord du désespoir et face à une opportunité. À la cour du roi Philippe VI, les murs du Louvre étaient animés par les rapports sombres venus de Paris et des villes environnantes. La peste ne montrait aucun signe de répit, et le chancelier Guillaume de Nogaret, le conseiller le plus fidèle du roi, portait le lourd fardeau de gérer la survie du royaume."
18
 
19
+ entities = ner_pipeline(sentence)
20
+ entities
 
21