en_tech / custom_functions.py
magepol's picture
Update spaCy pipeline
a4e98a4 verified
raw
history blame contribute delete
914 Bytes
import spacy
from spacy.tokenizer import Tokenizer
@spacy.registry.tokenizers("custom_tokenizer")
def create_custom_tokenizer():
    """Registered tokenizer factory for the "custom_tokenizer" entry point.

    Returns a callable that, given an ``nlp`` pipeline, builds a
    ``Tokenizer`` which additionally splits on ``/``, ``-``, ``,``, ``:``
    and ``+`` (infix), and treats ``-`` / parentheses as extra prefix and
    suffix punctuation — on top of the language defaults.
    """

    def create_tokenizer(nlp):
        # Extend (not replace) the language defaults so standard
        # punctuation handling is preserved alongside the custom rules.
        infixes = nlp.Defaults.infixes + [
            r"/",
            r"-",
            r",",
            r":",
            r"\+",
        ]
        prefixes = nlp.Defaults.prefixes + [
            r"-",
            r"\(",
        ]
        suffixes = nlp.Defaults.suffixes + [
            r"\)",
            r"-",
        ]
        prefix_regex = spacy.util.compile_prefix_regex(prefixes)
        infix_regex = spacy.util.compile_infix_regex(infixes)
        suffix_regex = spacy.util.compile_suffix_regex(suffixes)
        return Tokenizer(
            nlp.vocab,
            # Fix: the original omitted `rules`, which silently dropped all
            # tokenizer exceptions (contractions, abbreviations, etc.).
            rules=nlp.Defaults.tokenizer_exceptions,
            prefix_search=prefix_regex.search,
            suffix_search=suffix_regex.search,
            infix_finditer=infix_regex.finditer,
            # Fix: restore default special-token and URL matching, also
            # dropped by the original constructor call.
            token_match=nlp.Defaults.token_match,
            url_match=nlp.Defaults.url_match,
        )

    return create_tokenizer