File size: 4,069 Bytes
7120e17 776fd18 7120e17 776fd18 7120e17 2249e1f 7120e17 2249e1f 7120e17 776fd18 7120e17 776fd18 7120e17 9112af4 7120e17 9112af4 7120e17 9112af4 7120e17 9112af4 7120e17 776fd18 7120e17 776fd18 7120e17 776fd18 7120e17 9112af4 7120e17 9112af4 7120e17 9112af4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 |
import torch
from transformers import Pipeline
from transformers import AutoTokenizer
from transformers.pipelines import PIPELINE_REGISTRY
from transformers import pipeline
from transformers import AutoModelForTokenClassification
from huggingface_hub import Repository
import sys
import os
class TokenizeAndAlignLabelsStep():
    """Tokenize pre-split words and flag the first sub-token of every word.

    Adapted from:
    https://huggingface.co/docs/transformers/tasks/token_classification
    """

    def tokenize_and_align_labels(self, examples, tokenizer, max_length=128):
        """Tokenize ``examples`` and attach a ``labels_mask`` entry.

        Args:
            examples: The words of one sentence; passed to the tokenizer with
                ``is_split_into_words=True``.
            tokenizer: Callable tokenizer whose result exposes ``word_ids()``
                and supports item assignment.
            max_length: Length the sequence is padded/truncated to. Defaults
                to 128, the value that used to be hard-coded.

        Returns:
            The tokenizer output with an extra ``"labels_mask"`` key: a list
            with one boolean per token, ``True`` only for the first token of
            each word.
        """
        tokenized_inputs = tokenizer(
            examples,
            padding='max_length',
            truncation=True,
            max_length=max_length,
            is_split_into_words=True,
        )

        # Map tokens to their respective word.
        word_ids = tokenized_inputs.word_ids()

        labels_mask = []
        previous_word_idx = None
        for word_idx in word_ids:
            # Special tokens have no word (None) and are masked out, as are
            # the continuation sub-tokens of a word: only the first token of
            # a given word is labelled.
            labels_mask.append(
                word_idx is not None and word_idx != previous_word_idx
            )
            previous_word_idx = word_idx

        tokenized_inputs["labels_mask"] = labels_mask
        return tokenized_inputs
class BERT_CRF_Pipeline(Pipeline):
    """Custom pipeline that tags pre-split words with a BERT-CRF model.

    The input is a mapping with a ``'tokens'`` entry (the words of one
    sentence); the output is the sequence of label names obtained by mapping
    the model's first output through ``model.config.id2label``.
    """

    # Checkpoint whose tokenizer is used to encode the input words.
    _TOKENIZER_NAME = "neuralmind/bert-base-portuguese-cased"
    # Tokenizer cache: filled on first use so that the tokenizer is not
    # reloaded on every call to ``preprocess``.
    _alignment_tokenizer = None

    def _sanitize_parameters(self, **kwargs):
        """Return empty preprocess/forward/postprocess parameter dicts.

        The pipeline accepts no call-time options, so all ``kwargs`` are
        ignored.
        """
        return {}, {}, {}

    def _get_alignment_tokenizer(self):
        """Load the tokenizer on first use and reuse it afterwards."""
        if self._alignment_tokenizer is None:
            self._alignment_tokenizer = AutoTokenizer.from_pretrained(
                self._TOKENIZER_NAME, do_lower_case=False)
        return self._alignment_tokenizer

    def preprocess(self, inputs):
        """Tokenize ``inputs['tokens']`` and compute the first-sub-token mask."""
        tokens = inputs['tokens']
        return TokenizeAndAlignLabelsStep().tokenize_and_align_labels(
            examples=tokens, tokenizer=self._get_alignment_tokenizer())

    def _forward(self, tokenizer_results):
        """Run the model on a single tokenized example.

        Each field is converted to a tensor and given a leading batch
        dimension of size one via ``unsqueeze(0)``.
        """
        # Create the tensors on the device this pipeline runs on rather than
        # re-deriving "cuda if available": a pipeline explicitly created on
        # CPU would otherwise receive CUDA tensors on a CUDA-capable host.
        device = self.device

        def as_batch(key, dtype):
            return torch.tensor(
                tokenizer_results[key], dtype=dtype, device=device).unsqueeze(0)

        # input_ids, token_type_ids, attention_mask, labels, labels_mask
        return self.model(
            input_ids=as_batch('input_ids', torch.long),
            token_type_ids=as_batch('token_type_ids', torch.long),
            attention_mask=as_batch('attention_mask', torch.bool),
            labels=None,
            labels_mask=as_batch('labels_mask', torch.bool),
        )

    def postprocess(self, model_outputs):
        """Translate the tag ids in ``model_outputs[0]`` into label names.

        Returns a new list instead of overwriting the model output in place.
        """
        # From Ner_tags to Ner_labels
        id2label = self.model.config.id2label
        return [id2label[label] for label in model_outputs[0]]
def main():
    """Register the custom pipeline, build it, and push it to the Hub.

    The pipeline is saved into ``<sys.path[0]>/out/pipeline``, a local clone
    of the model repository, and that clone is then pushed.
    """
    task_name = "PT-BERT-Large-CRF-HAREM-Selective-pipeline"
    repo_id = "arubenruben/PT-BERT-Large-CRF-HAREM-Selective"
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    PIPELINE_REGISTRY.register_pipeline(
        task_name,
        pipeline_class=BERT_CRF_Pipeline,
        pt_model=AutoModelForTokenClassification,
    )

    # NOTE: trust_remote_code=True lets code from the model repository run
    # locally.
    classifier = pipeline(
        task_name,
        model=repo_id,
        device=device,
        trust_remote_code=True,
    )

    out_path = os.path.join(sys.path[0], 'out', 'pipeline')
    repo = Repository(out_path, clone_from=repo_id, use_auth_token=True)
    # repo.git_pull()
    classifier.save_pretrained(out_path)
    repo.push_to_hub()