import spacy
from spacy.tokenizer import Tokenizer


@spacy.registry.tokenizers("custom_tokenizer")
def create_custom_tokenizer():
    def create_tokenizer(nlp):
        # Extend the language defaults so the tokenizer also splits on
        # these characters inside a word (infixes) and at word edges
        # (prefixes/suffixes).
        infixes = nlp.Defaults.infixes + [r"/", r"-", r",", r":", r"\+"]
        prefixes = nlp.Defaults.prefixes + [r"-", r"\("]
        suffixes = nlp.Defaults.suffixes + [r"\)", r"-"]

        # Compile the extended rule sets into the regex objects the
        # Tokenizer constructor expects.
        prefix_regex = spacy.util.compile_prefix_regex(prefixes)
        infix_regex = spacy.util.compile_infix_regex(infixes)
        suffix_regex = spacy.util.compile_suffix_regex(suffixes)

        return Tokenizer(
            nlp.vocab,
            prefix_search=prefix_regex.search,
            suffix_search=suffix_regex.search,
            infix_finditer=infix_regex.finditer,
        )

    return create_tokenizer
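

# Usage sketch (assumptions: a blank English pipeline and an illustrative
# sample string, neither from the original). The registered factory returns
# a callback that takes the nlp object and builds the Tokenizer, so it can
# be applied directly as below, or referenced from a training config via a
# [nlp.tokenizer] block with @tokenizers = "custom_tokenizer".
nlp = spacy.blank("en")
nlp.tokenizer = create_custom_tokenizer()(nlp)

doc = nlp("dose: 10mg/kg (twice-daily)")
print([token.text for token in doc])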