import unicodedata

from spacy.tokenizer import Tokenizer
from spacy.util import registry


@registry.tokenizers("latin_core_tokenizer")
def create_latin_tokenizer():
    def create_tokenizer(nlp):
        return LatinTokenizer(nlp.vocab)

    return create_tokenizer
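
# Illustrative note (an assumption, not part of the original file): because the
# factory above is registered under "latin_core_tokenizer", a pipeline config
# can select this tokenizer declaratively instead of assigning it in code:
#
#   [nlp.tokenizer]
#   @tokenizers = "latin_core_tokenizer"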


class LatinTokenizer(Tokenizer):
    """Tokenizer that normalizes Latin orthography before tokenizing."""
    def separate_ligatures(self, text: str) -> str:
        """Convert ligatures while preserving case"""
        result = text
        result = result.replace("Æ", "Ae").replace("Œ", "Oe")
        result = result.replace("æ", "ae").replace("œ", "oe")
        return result
    def normalize_chars(self, text: str) -> str:
        """Normalize v/u and j/i while preserving case"""
        result = text
        result = result.replace("V", "U").replace("J", "I")
        result = result.replace("v", "u").replace("j", "i")
        return result
    def remove_macrons(self, text: str) -> str:
        """Remove macrons while preserving case"""
        macron_map = str.maketrans("āēīōūȳĀĒĪŌŪȲ", "aeiouyAEIOUY")
        return text.translate(macron_map)
    def remove_accents(self, text: str) -> str:
        """Remove diacritical marks"""
        # NFD splits each accented character into a base character plus
        # combining marks (Unicode category "Mn"), which are then dropped.
        return "".join(
            c
            for c in unicodedata.normalize("NFD", text)
            if unicodedata.category(c) != "Mn"
        )
    def preprocess(self, text: str) -> str:
        """Apply all preprocessing steps in sequence"""
        text = self.separate_ligatures(text)
        text = self.normalize_chars(text)
        text = self.remove_macrons(text)
        # remove_accents runs last as a catch-all: after NFD it strips any
        # combining mark the explicit macron table above did not cover.
        text = self.remove_accents(text)
        return text
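
    # Illustrative end-to-end example (an assumption, not from the original file):
    #   preprocess("Jūppiter æquē") -> "Iuppiter aeque"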
    def __call__(self, text):
        """Process text before tokenization"""
        processed_text = self.preprocess(text)
        return super().__call__(processed_text)


if __name__ == "__main__":
    # Test the tokenizer
    import spacy

    nlp = spacy.blank("la")
    nlp.tokenizer = LatinTokenizer(nlp.vocab)
    test = "Hæc nārrantur ā poētīs"
    doc = nlp(test)
    print(
        [token.text for token in doc]
    )  # Should print: ['Haec', 'narrantur', 'a', 'poetis']
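
    # Hedged extra check (assumption: this module has been imported, so the
    # "latin_core_tokenizer" entry above is registered): the same tokenizer
    # can be pulled back out of spaCy's registry instead of built directly.
    nlp2 = spacy.blank("la")
    nlp2.tokenizer = registry.tokenizers.get("latin_core_tokenizer")()(nlp2)
    print(
        [token.text for token in nlp2("Iūlius et Vergilius")]
    )  # Expected: ['Iulius', 'et', 'Uergilius']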