import unicodedata

from spacy.tokenizer import Tokenizer
from spacy.util import registry


@registry.tokenizers("latin_core_tokenizer")
def create_latin_tokenizer():
    def create_tokenizer(nlp):
        return LatinTokenizer(nlp.vocab)

    return create_tokenizer
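
# Illustrative note (an assumption, not part of the original file): because the
# factory above is registered under "latin_core_tokenizer", a pipeline config
# can select this tokenizer declaratively instead of assigning it in code:
#
#   [nlp.tokenizer]
#   @tokenizers = "latin_core_tokenizer"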


class LatinTokenizer(Tokenizer):
    """Tokenizer that normalizes Latin orthography before tokenizing."""
    def separate_ligatures(self, text: str) -> str:
        """Convert ligatures while preserving case"""
        result = text
        result = result.replace("Æ", "Ae").replace("Œ", "Oe")
        result = result.replace("æ", "ae").replace("œ", "oe")
        return result
    def normalize_chars(self, text: str) -> str:
        """Normalize v/u and j/i while preserving case"""
        result = text
        result = result.replace("V", "U").replace("J", "I")
        result = result.replace("v", "u").replace("j", "i")
        return result
    def remove_macrons(self, text: str) -> str:
        """Remove macrons while preserving case"""
        macron_map = str.maketrans("āēīōūȳĀĒĪŌŪȲ", "aeiouyAEIOUY")
        return text.translate(macron_map)
    def remove_accents(self, text: str) -> str:
        """Remove diacritical marks"""
        # NFD splits each accented character into a base character plus
        # combining marks (Unicode category "Mn"), which are then dropped.
        return "".join(
            c
            for c in unicodedata.normalize("NFD", text)
            if unicodedata.category(c) != "Mn"
        )
    def preprocess(self, text: str) -> str:
        """Apply all preprocessing steps in sequence"""
        text = self.separate_ligatures(text)
        text = self.normalize_chars(text)
        text = self.remove_macrons(text)
        # remove_accents runs last as a catch-all: after NFD it strips any
        # combining mark the explicit macron table above did not cover.
        text = self.remove_accents(text)
        return text
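
    # Illustrative end-to-end example (an assumption, not from the original file):
    #   preprocess("Jūppiter æquē") -> "Iuppiter aeque"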
    def __call__(self, text):
        """Process text before tokenization"""
        processed_text = self.preprocess(text)
        return super().__call__(processed_text)


if __name__ == "__main__":
    # Test the tokenizer
    import spacy

    nlp = spacy.blank("la")
    nlp.tokenizer = LatinTokenizer(nlp.vocab)
    test = "Hæc nārrantur ā poētīs"
    doc = nlp(test)
    print(
        [token.text for token in doc]
    )  # Should print: ['Haec', 'narrantur', 'a', 'poetis']
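
    # Hedged extra check (assumption: this module has been imported, so the
    # "latin_core_tokenizer" entry above is registered): the same tokenizer
    # can be pulled back out of spaCy's registry instead of built directly.
    nlp2 = spacy.blank("la")
    nlp2.tokenizer = registry.tokenizers.get("latin_core_tokenizer")()(nlp2)
    print(
        [token.text for token in nlp2("Iūlius et Vergilius")]
    )  # Expected: ['Iulius', 'et', 'Uergilius']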