from spacy.tokenizer import Tokenizer
from spacy.util import registry
import unicodedata


@registry.tokenizers("latin_core_tokenizer")
def create_latin_tokenizer():
    def create_tokenizer(nlp):
        return LatinTokenizer(nlp.vocab)

    return create_tokenizer


class LatinTokenizer(Tokenizer):
    def separate_ligatures(self, text: str) -> str:
        """Convert ligatures while preserving case"""
        result = text
        result = result.replace("Æ", "Ae").replace("Œ", "Oe")
        result = result.replace("æ", "ae").replace("œ", "oe")
        return result

    def normalize_chars(self, text: str) -> str:
        """Normalize v/u and j/i while preserving case"""
        result = text
        result = result.replace("V", "U").replace("J", "I")
        result = result.replace("v", "u").replace("j", "i")
        return result

    def remove_macrons(self, text: str) -> str:
        """Remove macrons while preserving case"""
        macron_map = str.maketrans("āēīōūȳĀĒĪŌŪȲ", "aeiouyAEIOUY")
        return text.translate(macron_map)

    def remove_accents(self, text: str) -> str:
        """Remove diacritical marks"""
        return "".join(
            c
            for c in unicodedata.normalize("NFD", text)
            if unicodedata.category(c) != "Mn"
        )

    def preprocess(self, text: str) -> str:
        """Apply all preprocessing steps in sequence"""
        text = self.separate_ligatures(text)
        text = self.normalize_chars(text)
        text = self.remove_macrons(text)
        text = self.remove_accents(text)
        return text

    def __call__(self, text):
        """Process text before tokenization"""
        processed_text = self.preprocess(text)
        return super().__call__(processed_text)


if __name__ == "__main__":
    # Test the tokenizer
    import spacy

    nlp = spacy.blank("la")
    nlp.tokenizer = LatinTokenizer(nlp.vocab)
    test = "Hæc nārrantur ā poētīs"
    doc = nlp(test)
    print([token.text for token in doc])
    # Should print: ['Haec', 'narrantur', 'a', 'poetis']
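
# Usage via the registry (a minimal sketch, assuming spaCy 3.x): because the
# factory above is registered under "latin_core_tokenizer", a pipeline can
# select it through its config instead of assigning nlp.tokenizer by hand:
#
#     import spacy
#
#     config = {"nlp": {"tokenizer": {"@tokenizers": "latin_core_tokenizer"}}}
#     nlp = spacy.blank("la", config=config)
#
# The module defining create_latin_tokenizer must be imported first so the
# registration decorator actually runs.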