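# NOTE: the following lines are stray page text captured along with this
# file; they are not part of the module and are kept only as comments.
#   sunnychenxiwang's picture
#   update nltk
#   d916065
#   raw
#   history blame
#   1.58 kB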
# Natural Language Toolkit: Stemmers
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Trevor Cohn <[email protected]>
# Edward Loper <[email protected]>
# Steven Bird <[email protected]>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import re
from nltk.stem.api import StemmerI
class RegexpStemmer(StemmerI):
    """
    Stem words by deleting whatever a regular expression matches.

    The expression describes the morphological affixes to strip; every
    substring of a word that matches it is replaced with the empty
    string.  Words shorter than ``min`` characters are returned
    unchanged.

        >>> from nltk.stem import RegexpStemmer
        >>> st = RegexpStemmer('ing$|s$|e$|able$', min=4)
        >>> st.stem('cars')
        'car'
        >>> st.stem('mass')
        'mas'
        >>> st.stem('was')
        'was'
        >>> st.stem('bee')
        'bee'
        >>> st.stem('compute')
        'comput'
        >>> st.stem('advisable')
        'advis'

    :type regexp: str or regexp
    :param regexp: The regular expression used to identify the
        morphological affixes.  An object that already exposes a
        ``pattern`` attribute is used as-is; anything else is passed
        to ``re.compile``.
    :type min: int
    :param min: The minimum length a word must have to be stemmed.
    """

    def __init__(self, regexp, min=0):
        # An object with a ``pattern`` attribute is treated as an
        # already-compiled expression; anything else is compiled here.
        already_compiled = hasattr(regexp, "pattern")
        self._regexp = regexp if already_compiled else re.compile(regexp)
        self._min = min

    def stem(self, word):
        """
        Return ``word`` with every match of the stemmer's expression
        removed, or ``word`` itself when it is shorter than the
        configured minimum length.
        """
        # Guard clause: words below the length threshold pass through.
        if len(word) < self._min:
            return word
        return self._regexp.sub("", word)

    def __repr__(self):
        pattern = self._regexp.pattern
        return f"<RegexpStemmer: {pattern!r}>"