import unicodedata

from spacy.language import Language
from spacy.tokenizer import Tokenizer
from spacy.util import registry


@registry.tokenizers("latin_core_tokenizer")
def create_latin_tokenizer():
    """Factory registered under the name "latin_core_tokenizer"."""

    def create_tokenizer(nlp: Language) -> "LatinTokenizer":
        return LatinTokenizer(nlp.vocab)

    return create_tokenizer
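
# A minimal sketch of how the registered factory could be referenced from a
# spaCy training config (assuming this module is importable when the config
# is loaded, e.g. passed via `spacy train --code this_module.py`):
#
#   [nlp.tokenizer]
#   @tokenizers = "latin_core_tokenizer"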


class LatinTokenizer(Tokenizer):
    """Tokenizer that normalizes Latin orthography before tokenizing.

    As constructed in this module (from a bare vocab, with no
    prefix/suffix/infix rules), it splits on whitespace only and leaves
    punctuation attached to tokens.
    """

    def separate_ligatures(self, text: str) -> str:
        """Expand ligatures while preserving case."""
        result = text
        result = result.replace("Æ", "Ae").replace("Œ", "Oe")
        result = result.replace("æ", "ae").replace("œ", "oe")
        return result

    def normalize_chars(self, text: str) -> str:
        """Normalize v -> u and j -> i while preserving case."""
        result = text
        result = result.replace("V", "U").replace("J", "I")
        result = result.replace("v", "u").replace("j", "i")
        return result

    def remove_macrons(self, text: str) -> str:
        """Remove macrons while preserving case."""
        macron_map = str.maketrans("āēīōūȳĀĒĪŌŪȲ", "aeiouyAEIOUY")
        return text.translate(macron_map)

    def remove_accents(self, text: str) -> str:
        """Strip combining marks (category Mn) after NFD decomposition."""
        return "".join(
            c
            for c in unicodedata.normalize("NFD", text)
            if unicodedata.category(c) != "Mn"
        )
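
    # Note: NFD stripping above also removes macrons (ā decomposes to
    # a + U+0304, a Mn combining mark), so remove_macrons() is technically
    # redundant when remove_accents() follows it; keeping both makes each
    # normalization step usable and testable on its own.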

    def preprocess(self, text: str) -> str:
        """Apply all preprocessing steps in sequence."""
        text = self.separate_ligatures(text)
        text = self.normalize_chars(text)
        text = self.remove_macrons(text)
        text = self.remove_accents(text)
        return text

    def __call__(self, text: str):
        """Normalize the raw text, then delegate to spaCy's Tokenizer."""
        processed_text = self.preprocess(text)
        return super().__call__(processed_text)
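
    # Caveat: because normalization runs before tokenization, the resulting
    # Doc (its .text and all character offsets) reflects the normalized
    # string, not the original input.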


if __name__ == "__main__":
    import spacy

    nlp = spacy.blank("la")
    nlp.tokenizer = LatinTokenizer(nlp.vocab)

    test = "Hæc nārrantur ā poētīs"
    doc = nlp(test)
    print([token.text for token in doc])
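
    # Expected output (whitespace-only splitting, since the tokenizer was
    # built without affix rules): ['Haec', 'narrantur', 'a', 'poetis']

    # Quick sanity check of the normalization alone; the sentence here is
    # an illustrative example, not from the original script.
    assert nlp.tokenizer.preprocess("Jūlius vīvit") == "Iulius uiuit"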