"""Build Presidio NLP engines together with their recognizer registries."""

import logging
from typing import Tuple

import phonenumbers
import spacy
from presidio_analyzer import RecognizerRegistry
from presidio_analyzer.nlp_engine import NlpEngine, NlpEngineProvider
from presidio_analyzer.predefined_recognizers import (
    CreditCardRecognizer,
    CryptoRecognizer,
    DateRecognizer,
    EmailRecognizer,
    IbanRecognizer,
    IpRecognizer,
    PhoneRecognizer,
    UrlRecognizer,
)

from transformers_class import TransformerRecognizer

logger = logging.getLogger("presidio-streamlit")

# HuggingFace model wrapped by the TransformerRecognizer.
# Alternatives that were tried: "Jean-Baptiste/camembert-ner",
# "AliaeAI/camembert_anonymizer_production".
_TRANSFORMER_MODEL_NAME = "AliaeAI/camembert_anonymizer_production_v2"

# Labels emitted by the transformer model -> entity names passed to the
# TransformerRecognizer.
_TRANSFORMER_LABEL_MAPPING = {"PER": "PERSON", "LOC": "LOCATION"}


def _create_spacy_engine(model_path: str) -> NlpEngine:
    """
    Create a spaCy-backed NlpEngine, downloading the model when needed.

    The language code registered for the model is the part of ``model_path``
    before its first underscore (e.g. ``"fr"`` for ``"fr_core_news_md"``).

    :param model_path: spaCy model name.
    :return: The engine created by ``NlpEngineProvider``.
    """
    if not spacy.util.is_package(model_path):
        spacy.cli.download(model_path)

    nlp_configuration = {
        "nlp_engine_name": "spacy",
        "models": [
            {"lang_code": model_path.split("_")[0], "model_name": model_path}
        ],
    }
    return NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()


def create_nlp_engine_with_spacy(
    model_path: str,
) -> Tuple[NlpEngine, RecognizerRegistry]:
    """
    Instantiate an NlpEngine with a spaCy model.

    The registry is filled with Presidio's predefined recognizers for French
    and English, followed by the recognizers declared in ``recognizers.yaml``.

    :param model_path: spaCy model path.
    :return: The NLP engine and the populated recognizer registry.
    """
    nlp_engine = _create_spacy_engine(model_path)

    registry = RecognizerRegistry()
    registry.load_predefined_recognizers(
        nlp_engine=nlp_engine, languages=["fr", "en"]
    )
    registry.add_recognizers_from_yaml("recognizers.yaml")

    return nlp_engine, registry


def create_nlp_engine_with_transformers(
    model_path: str,
) -> Tuple[NlpEngine, RecognizerRegistry]:
    """
    Instantiate an NlpEngine with a TransformerRecognizer and a spaCy model.

    The TransformerRecognizer returns results from a Transformers model; the
    spaCy model supplies NlpArtifacts such as POS and lemmas.

    NOTE: despite the function name, ``model_path`` is used as the *spaCy*
    model. The HuggingFace model is fixed to ``_TRANSFORMER_MODEL_NAME``.
    An earlier variant that loaded per-model transformer configurations was
    commented out and has been removed.

    :param model_path: spaCy model name used to build the NlpEngine.
    :return: The NLP engine and the populated recognizer registry.
    """
    # The transformers model is used here as a recognizer, not as an
    # NlpEngine, so a spaCy engine is still required.
    nlp_engine = _create_spacy_engine(model_path)

    registry = load_predefined_recognizers(RecognizerRegistry())

    # Pass a copy so the recognizer can never mutate the shared constant.
    transformers_recognizer = TransformerRecognizer(
        _TRANSFORMER_MODEL_NAME, dict(_TRANSFORMER_LABEL_MAPPING)
    )
    registry.add_recognizer(transformers_recognizer)
    registry.remove_recognizer("SpacyRecognizer")

    return nlp_engine, registry


def load_predefined_recognizers(
    registry: RecognizerRegistry, lang: str = "fr"
) -> RecognizerRegistry:
    """
    Add a fixed set of predefined recognizers, with French context words.

    Recognizers are added in this order: phone, email, credit card, crypto,
    date, IP, IBAN, URL. The recognizers from ``recognizers.yaml`` are added
    last.

    :param registry: Registry to populate; it is modified in place.
    :param lang: Language passed as ``supported_language`` to every recognizer.
    :return: The same ``registry`` object, for call chaining.
    """
    # (recognizer class, keyword arguments besides ``supported_language``)
    recognizer_specs = (
        (
            PhoneRecognizer,
            {
                "supported_regions": phonenumbers.SUPPORTED_REGIONS,
                "context": ["téléphone"],
            },
        ),
        (EmailRecognizer, {"context": ["email", "mail", "e-mail"]}),
        (
            CreditCardRecognizer,
            {"context": ["crédit", "carte", "carte de crédit"]},
        ),
        (CryptoRecognizer, {"context": ["crypto"]}),
        (DateRecognizer, {"context": ["mois", "date", "jour", "année"]}),
        (IpRecognizer, {"context": ["IP", "ip"]}),
        (
            IbanRecognizer,
            {"context": ["IBAN", "iban", "bancaire", "compte"]},
        ),
        (UrlRecognizer, {"context": ["site", "web"]}),
    )

    # Build and register one recognizer at a time, in the listed order.
    for recognizer_cls, extra_kwargs in recognizer_specs:
        registry.add_recognizer(
            recognizer_cls(supported_language=lang, **extra_kwargs)
        )

    # load from yaml
    registry.add_recognizers_from_yaml("recognizers.yaml")

    return registry