Spaces:

sunnychenxiwang
/

EasyDetect

Sleeping

App Files Files Community

EasyDetect / pipeline /nltk /stem /lancaster.py

sunnychenxiwang

update nltk

d916065 12 months ago

raw

history blame

12.6 kB

	# Natural Language Toolkit: Stemmers
	#
	# Copyright (C) 2001-2023 NLTK Project
	# Author: Steven Tomcavage <[email protected]>
	# URL: <https://www.nltk.org/>
	# For license information, see LICENSE.TXT

	"""
	A word stemmer based on the Lancaster (Paice/Husk) stemming algorithm.
	Paice, Chris D. "Another Stemmer." ACM SIGIR Forum 24.3 (1990): 56-61.
	"""
	import re

	from nltk.stem.api import StemmerI


	class LancasterStemmer(StemmerI):
	    """
	    Lancaster (Paice/Husk) stemmer.

	    Stemming is driven by a table of rule strings.  Each rule has the form
	    ``<reversed ending>[*]<n>[<append>][>|.]``:

	    - ``<reversed ending>``: the word ending to match, written backwards
	    - ``*``: optional "intact" flag; the rule only fires while the word
	      has not yet been modified by another rule
	    - ``<n>``: a single digit, the number of trailing characters to remove
	    - ``<append>``: optional letters appended after the removal
	    - ``>`` means keep stemming after this rule, ``.`` means stop

	    >>> from nltk.stem.lancaster import LancasterStemmer
	    >>> st = LancasterStemmer()
	    >>> st.stem('maximum')     # Remove "-um" when word is intact
	    'maxim'
	    >>> st.stem('presumably')  # Don't remove "-um" when word is not intact
	    'presum'
	    >>> st.stem('multiply')    # No action taken if word ends with "-ply"
	    'multiply'
	    >>> st.stem('provision')   # Replace "-sion" with "-j" to trigger "j" set of rules
	    'provid'
	    >>> st.stem('owed')        # Word starting with vowel must contain at least 2 letters
	    'ow'
	    >>> st.stem('ear')         # ditto
	    'ear'
	    >>> st.stem('saying')      # Words starting with consonant must contain at least 3
	    'say'
	    >>> st.stem('crying')      # letters and one of those letters must be a vowel
	    'cry'
	    >>> st.stem('string')      # ditto
	    'string'
	    >>> st.stem('meant')       # ditto
	    'meant'
	    >>> st.stem('cement')      # ditto
	    'cem'
	    >>> st_pre = LancasterStemmer(strip_prefix_flag=True)
	    >>> st_pre.stem('kilometer') # Test Prefix
	    'met'
	    >>> st_custom = LancasterStemmer(rule_tuple=("ssen4>", "s1t."))
	    >>> st_custom.stem("ness") # Change s to t
	    'nest'
	    """

	    # The rule list is static since it doesn't change between instances
	    default_rule_tuple = (
	        "ai*2.",  # -ia > - if intact
	        "a*1.",  # -a > - if intact
	        "bb1.",  # -bb > -b
	        "city3s.",  # -ytic > -ys
	        "ci2>",  # -ic > -
	        "cn1t>",  # -nc > -nt
	        "dd1.",  # -dd > -d
	        "dei3y>",  # -ied > -y
	        "deec2ss.",  # -ceed > -cess
	        "dee1.",  # -eed > -ee
	        "de2>",  # -ed > -
	        "dooh4>",  # -hood > -
	        "e1>",  # -e > -
	        "feil1v.",  # -lief > -liev
	        "fi2>",  # -if > -
	        "gni3>",  # -ing > -
	        "gai3y.",  # -iag > -y
	        "ga2>",  # -ag > -
	        "gg1.",  # -gg > -g
	        "ht*2.",  # -th > - if intact
	        "hsiug5ct.",  # -guish > -ct
	        "hsi3>",  # -ish > -
	        "i*1.",  # -i > - if intact
	        "i1y>",  # -i > -y
	        "ji1d.",  # -ij > -id -- see nois4j> & vis3j>
	        "juf1s.",  # -fuj > -fus
	        "ju1d.",  # -uj > -ud
	        "jo1d.",  # -oj > -od
	        "jeh1r.",  # -hej > -her
	        "jrev1t.",  # -verj > -vert
	        "jsim2t.",  # -misj > -mit
	        "jn1d.",  # -nj > -nd
	        "j1s.",  # -j > -s
	        "lbaifi6.",  # -ifiabl > -
	        "lbai4y.",  # -iabl > -y
	        "lba3>",  # -abl > -
	        "lbi3.",  # -ibl > -
	        "lib2l>",  # -bil > -bl
	        "lc1.",  # -cl > -c
	        "lufi4y.",  # -iful > -y
	        "luf3>",  # -ful > -
	        "lu2.",  # -ul > -
	        "lai3>",  # -ial > -
	        "lau3>",  # -ual > -
	        "la2>",  # -al > -
	        "ll1.",  # -ll > -l
	        "mui3.",  # -ium > -
	        "mu*2.",  # -um > - if intact
	        "msi3>",  # -ism > -
	        "mm1.",  # -mm > -m
	        "nois4j>",  # -sion > -j
	        "noix4ct.",  # -xion > -ct
	        "noi3>",  # -ion > -
	        "nai3>",  # -ian > -
	        "na2>",  # -an > -
	        "nee0.",  # protect -een
	        "ne2>",  # -en > -
	        "nn1.",  # -nn > -n
	        "pihs4>",  # -ship > -
	        "pp1.",  # -pp > -p
	        "re2>",  # -er > -
	        "rae0.",  # protect -ear
	        "ra2.",  # -ar > -
	        "ro2>",  # -or > -
	        "ru2>",  # -ur > -
	        "rr1.",  # -rr > -r
	        "rt1>",  # -tr > -t
	        "rei3y>",  # -ier > -y
	        "sei3y>",  # -ies > -y
	        "sis2.",  # -sis > -s
	        "si2>",  # -is > -
	        "ssen4>",  # -ness > -
	        "ss0.",  # protect -ss
	        "suo3>",  # -ous > -
	        "su*2.",  # -us > - if intact
	        "s*1>",  # -s > - if intact
	        "s0.",  # -s > -s
	        "tacilp4y.",  # -plicat > -ply
	        "ta2>",  # -at > -
	        "tnem4>",  # -ment > -
	        "tne3>",  # -ent > -
	        "tna3>",  # -ant > -
	        "tpir2b.",  # -ript > -rib
	        "tpro2b.",  # -orpt > -orb
	        "tcud1.",  # -duct > -duc
	        "tpmus2.",  # -sumpt > -sum
	        "tpec2iv.",  # -cept > -ceiv
	        "tulo2v.",  # -olut > -olv
	        "tsis0.",  # protect -sist
	        "tsi3>",  # -ist > -
	        "tt1.",  # -tt > -t
	        "uqi3.",  # -iqu > -
	        "ugo1.",  # -ogu > -og
	        "vis3j>",  # -siv > -j
	        "vie0.",  # protect -eiv
	        "vi2>",  # -iv > -
	        "ylb1>",  # -bly > -bl
	        "yli3y>",  # -ily > -y
	        "ylp0.",  # protect -ply
	        "yl2>",  # -ly > -
	        "ygo1.",  # -ogy > -og
	        "yhp1.",  # -phy > -ph
	        "ymo1.",  # -omy > -om
	        "ypo1.",  # -opy > -op
	        "yti3>",  # -ity > -
	        "yte3>",  # -ety > -
	        "ytl2.",  # -lty > -l
	        "yrtsi5.",  # -istry > -
	        "yra3>",  # -ary > -
	        "yro3>",  # -ory > -
	        "yfi3.",  # -ify > -
	        "ycn2t>",  # -ncy > -nt
	        "yca3>",  # -acy > -
	        "zi2>",  # -iz > -
	        "zy1s.",  # -yz > -ys
	    )

	    # One rule = reversed ending, optional "*" intact flag, a removal digit,
	    # optional letters to append, optional ">" (continue) / "." (stop).
	    # The intact flag must be a literal asterisk (``\*?``) and the append
	    # string may be empty or several letters (``[a-z]*``); otherwise rules
	    # such as "ai*2." or "bb1." would be rejected.  Compiled once here
	    # because the same pattern both validates and decomposes every rule.
	    _RULE_PATTERN = re.compile(r"^([a-z]+)(\*?)(\d)([a-z]*)([>\.]?)$")

	    # Letters treated as vowels by the acceptability check.
	    _VOWELS = "aeiouy"

	    # Prefixes removed (first match only) when prefix stripping is enabled.
	    _PREFIXES = (
	        "kilo",
	        "micro",
	        "milli",
	        "intra",
	        "ultra",
	        "mega",
	        "nano",
	        "pico",
	        "pseudo",
	    )

	    def __init__(self, rule_tuple=None, strip_prefix_flag=False):
	        """Create an instance of the Lancaster stemmer.

	        :param rule_tuple: optional iterable of rule strings replacing
	            ``default_rule_tuple``; a falsy value selects the defaults.
	        :param strip_prefix_flag: when true, ``stem`` removes a known
	            prefix (e.g. "kilo") before applying the rules.
	        """
	        # Filled lazily by parseRules() on the first call to stem().
	        self.rule_dictionary = {}
	        self._strip_prefix = strip_prefix_flag
	        self._rule_tuple = rule_tuple if rule_tuple else self.default_rule_tuple

	    def parseRules(self, rule_tuple=None):
	        """Validate the rules and index them by their first character.

	        If called directly with ``rule_tuple``, those rules are compiled
	        into ``self.rule_dictionary``.  If called without an argument (as
	        ``stem`` does), the rules given at construction time are used.

	        The first character of a rule is the last letter of the ending it
	        matches, so the dictionary maps "last letter of the word" to the
	        ordered list of rules that may apply.

	        :raises ValueError: if any rule does not match the rule syntax.
	            In that case ``self.rule_dictionary`` is left unchanged, so a
	            half-parsed rule set is never used by later ``stem`` calls.
	        """
	        rule_tuple = rule_tuple if rule_tuple else self._rule_tuple

	        # Build into a local dict and only publish it once every rule has
	        # been validated.
	        parsed_rules = {}
	        for rule in rule_tuple:
	            if not self._RULE_PATTERN.match(rule):
	                raise ValueError(f"The rule {rule} is invalid")
	            parsed_rules.setdefault(rule[0], []).append(rule)

	        self.rule_dictionary = parsed_rules

	    def stem(self, word):
	        """Stem a word using the Lancaster stemmer.

	        :param word: the word to stem; it is lower-cased first because all
	            rules are lower-case.
	        :return: the stemmed word.
	        :raises ValueError: if the configured rule set is invalid.
	        """
	        word = word.lower()
	        if self._strip_prefix:
	            word = self.__stripPrefix(word)

	        # Parse the rule tuple on first use.
	        if not self.rule_dictionary:
	            self.parseRules()

	        # The second argument is the unmodified word, used by "intact" rules.
	        return self.__doStemming(word, word)

	    def __doStemming(self, word, intact_word):
	        """Repeatedly apply the first matching rule until none applies.

	        :param word: the (lower-cased, prefix-stripped) word to stem.
	        :param intact_word: the word before any rule was applied; rules
	            carrying the ``*`` flag only fire while ``word`` still equals it.
	        :return: the stemmed word.
	        """
	        while True:
	            last_letter_position = self.__getLastLetter(word)
	            if last_letter_position < 0:
	                return word

	            # Only rules indexed under the word's last letter can match.
	            candidate_rules = self.rule_dictionary.get(word[last_letter_position])
	            if not candidate_rules:
	                return word

	            for rule in candidate_rules:
	                rule_match = self._RULE_PATTERN.match(rule)
	                if not rule_match:
	                    continue
	                (
	                    ending_string,
	                    intact_flag,
	                    remove_total,
	                    append_string,
	                    cont_flag,
	                ) = rule_match.groups()
	                remove_total = int(remove_total)

	                # Endings are stored reversed in the rule.
	                if not word.endswith(ending_string[::-1]):
	                    continue
	                # Intact-only rules are skipped once the word was modified.
	                if intact_flag and word != intact_word:
	                    continue
	                if not self.__isAcceptable(word, remove_total):
	                    continue

	                word = self.__applyRule(word, remove_total, append_string)
	                if cont_flag == ".":
	                    return word
	                # Restart rule lookup with the new last letter.
	                break
	            else:
	                # No rule applied: the word needs no more stemming.
	                return word

	    def __getLastLetter(self, word):
	        """Return the index of the last character of the leading run of letters.

	        Scanning stops at the first non-alphabetic character, so for a word
	        made only of letters this is ``len(word) - 1``.  Returns -1 when the
	        word is empty or does not start with a letter.
	        """
	        last_letter = -1
	        for position, character in enumerate(word):
	            if not character.isalpha():
	                break
	            last_letter = position
	        return last_letter

	    def __isAcceptable(self, word, remove_total):
	        """Determine whether removing ``remove_total`` characters is allowed.

	        - A word starting with a vowel must keep at least 2 characters.
	        - A word starting with a consonant must keep at least 3 characters,
	          and its second or third letter must be a vowel.
	        """
	        remaining_length = len(word) - remove_total
	        if word[0] in self._VOWELS:
	            return remaining_length >= 2
	        if remaining_length >= 3:
	            return word[1] in self._VOWELS or word[2] in self._VOWELS
	        return False

	    def __applyRule(self, word, remove_total, append_string):
	        """Drop the last ``remove_total`` characters, then add ``append_string``."""
	        return word[0 : len(word) - remove_total] + append_string

	    def __stripPrefix(self, word):
	        """Remove the first matching known prefix from a word.

	        This function originally taken from Whoosh.
	        """
	        for prefix in self._PREFIXES:
	            if word.startswith(prefix):
	                return word[len(prefix) :]
	        return word

	    def __repr__(self):
	        return "<LancasterStemmer>"