import spacy
from spacy.tokenizer import Tokenizer


@spacy.registry.tokenizers("custom_tokenizer")
def create_custom_tokenizer():
    def create_tokenizer(nlp):
        # Extend the language defaults rather than replacing them, so the
        # standard punctuation handling is preserved.
        infixes = nlp.Defaults.infixes + [
            r"/",
            r"-",
            r",",
            r":",
            r"\+",
        ]
        prefixes = nlp.Defaults.prefixes + [
            r"-",
            r"\(",
        ]
        suffixes = nlp.Defaults.suffixes + [
            r"\)",
            r"-",
        ]
        prefix_regex = spacy.util.compile_prefix_regex(prefixes)
        infix_regex = spacy.util.compile_infix_regex(infixes)
        suffix_regex = spacy.util.compile_suffix_regex(suffixes)

        # Note: no `rules` or `token_match` are passed, so the defaults'
        # tokenizer exceptions (e.g. contractions) are not applied here.
        return Tokenizer(
            nlp.vocab,
            infix_finditer=infix_regex.finditer,
            prefix_search=prefix_regex.search,
            suffix_search=suffix_regex.search,
        )

    return create_tokenizer
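
# Usage sketch: a minimal, assumed example, not part of the original listing.
# It assumes this module has been imported so the registration above has run,
# and uses a blank English pipeline. spaCy resolves the "@tokenizers" entry
# from its registry when the pipeline is created, so the factory above
# replaces the default tokenizer. The sample sentence is illustrative only.
if __name__ == "__main__":
    config = {"nlp": {"tokenizer": {"@tokenizers": "custom_tokenizer"}}}
    nlp = spacy.blank("en", config=config)

    doc = nlp("Take 10mg/kg (twice-daily) at 8:30")
    # With the extra rules, "/", "-", ":", "(" and ")" become separate tokens.
    print([token.text for token in doc])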