import unicodedata

from spacy.language import Language
from spacy.tokenizer import Tokenizer
from spacy.util import registry


@registry.tokenizers("latin_core_tokenizer")
def create_latin_tokenizer():
    """Factory registered under the name "latin_core_tokenizer"."""

    def create_tokenizer(nlp: Language) -> "LatinTokenizer":
        return LatinTokenizer(nlp.vocab)

    return create_tokenizer
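
# A minimal sketch of how the registered factory could be referenced from a
# spaCy training config (assuming this module is importable when the config
# is loaded, e.g. passed via `spacy train --code this_module.py`):
#
#   [nlp.tokenizer]
#   @tokenizers = "latin_core_tokenizer"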


class LatinTokenizer(Tokenizer):
    """Tokenizer that normalizes Latin orthography before tokenizing.

    As constructed in this module (from a bare vocab, with no
    prefix/suffix/infix rules), it splits on whitespace only and leaves
    punctuation attached to tokens.
    """

    def separate_ligatures(self, text: str) -> str:
        """Expand ligatures while preserving case."""
        result = text
        result = result.replace("Æ", "Ae").replace("Œ", "Oe")
        result = result.replace("æ", "ae").replace("œ", "oe")
        return result

    def normalize_chars(self, text: str) -> str:
        """Normalize v -> u and j -> i while preserving case."""
        result = text
        result = result.replace("V", "U").replace("J", "I")
        result = result.replace("v", "u").replace("j", "i")
        return result

    def remove_macrons(self, text: str) -> str:
        """Remove macrons while preserving case."""
        macron_map = str.maketrans("āēīōūȳĀĒĪŌŪȲ", "aeiouyAEIOUY")
        return text.translate(macron_map)

    def remove_accents(self, text: str) -> str:
        """Strip combining marks (category Mn) after NFD decomposition."""
        return "".join(
            c
            for c in unicodedata.normalize("NFD", text)
            if unicodedata.category(c) != "Mn"
        )
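
    # Note: NFD stripping above also removes macrons (ā decomposes to
    # a + U+0304, a Mn combining mark), so remove_macrons() is technically
    # redundant when remove_accents() follows it; keeping both makes each
    # normalization step usable and testable on its own.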

    def preprocess(self, text: str) -> str:
        """Apply all preprocessing steps in sequence."""
        text = self.separate_ligatures(text)
        text = self.normalize_chars(text)
        text = self.remove_macrons(text)
        text = self.remove_accents(text)
        return text

    def __call__(self, text: str):
        """Normalize the raw text, then delegate to spaCy's Tokenizer."""
        processed_text = self.preprocess(text)
        return super().__call__(processed_text)
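
    # Caveat: because normalization runs before tokenization, the resulting
    # Doc (its .text and all character offsets) reflects the normalized
    # string, not the original input.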


if __name__ == "__main__":
    import spacy

    nlp = spacy.blank("la")
    nlp.tokenizer = LatinTokenizer(nlp.vocab)

    test = "Hæc nārrantur ā poētīs"
    doc = nlp(test)
    print([token.text for token in doc])
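
    # Expected output (whitespace-only splitting, since the tokenizer was
    # built without affix rules): ['Haec', 'narrantur', 'a', 'poetis']

    # Quick sanity check of the normalization alone; the sentence here is
    # an illustrative example, not from the original script.
    assert nlp.tokenizer.preprocess("Jūlius vīvit") == "Iulius uiuit"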