# (Hosting-page status residue, not part of the module: "Spaces: Sleeping / Sleeping")
# Natural Language Toolkit: Transformation-based learning
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Marcus Uneson <[email protected]>
#   based on previous (nltk2) version by
#   Christopher Maloof, Edward Loper, Steven Bird
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
from abc import ABCMeta, abstractmethod
class Feature(metaclass=ABCMeta):
    """
    An abstract base class for Features. A Feature is a combination of
    a specific property-computing method and a list of relative positions
    to apply that method to.

    The property-computing method, ``extract_property(tokens, index)``,
    must be implemented by every subclass. It extracts or computes a specific
    property for the token at the current index. Typical extract_property()
    methods return features such as the token text or tag; but more involved
    methods may consider the entire sequence ``tokens`` and
    for instance compute the length of the sentence the token belongs to.

    In addition, the subclass may have a PROPERTY_NAME, which is how
    it will be printed (in Rules and Templates, etc). If not given, defaults
    to the classname.

    Instances compare equal when they are of exactly the same class and
    look at the same positions; they are hashable consistently with that
    equality and are totally ordered by (class name, positions).
    """

    json_tag = "nltk.tbl.Feature"
    PROPERTY_NAME = None

    def __init__(self, positions, end=None):
        """
        Construct a Feature which may apply at ``positions``.

        >>> # For instance, importing some concrete subclasses (Feature is abstract)
        >>> from nltk.tag.brill import Word, Pos

        >>> # Feature Word, applying at one of [-2, -1]
        >>> Word([-2,-1])
        Word([-2, -1])

        >>> # Positions need not be contiguous
        >>> Word([-2,-1, 1])
        Word([-2, -1, 1])

        >>> # Contiguous ranges can alternatively be specified giving the
        >>> # two endpoints (inclusive)
        >>> Pos(-3, -1)
        Pos([-3, -2, -1])

        >>> # In two-arg form, start <= end is enforced
        >>> Pos(2, 1)
        Traceback (most recent call last):
          ...
        ValueError: illegal interval specification: (start=2, end=1)

        :type positions: list of int
        :param positions: the positions at which this features should apply
        :raises ValueError: illegal position specifications

        An alternative calling convention, for contiguous positions only,
        is Feature(start, end):

        :type start: int
        :param start: start of range where this feature should apply
        :type end: int
        :param end: end of range (NOTE: inclusive!) where this feature should apply
        """
        self.positions = None  # to avoid warnings
        if end is None:
            # De-duplicate and sort so that equal position sets always yield
            # the same tuple (equality, hashing and ordering rely on this).
            self.positions = tuple(sorted({int(i) for i in positions}))
        else:  # positions was actually not a list, but only the start index
            try:
                if positions > end:
                    # Funnel an inverted interval into the handler below.
                    raise TypeError
                self.positions = tuple(range(positions, end + 1))
            except TypeError as e:
                # let any kind of erroneous spec raise ValueError
                raise ValueError(
                    f"illegal interval specification: (start={positions}, end={end})"
                ) from e

        # set property name given in subclass, or otherwise name of subclass
        self.PROPERTY_NAME = self.__class__.PROPERTY_NAME or self.__class__.__name__

    def encode_json_obj(self):
        """Return the JSON-serialisable state of this Feature: its positions."""
        return self.positions

    @classmethod
    def decode_json_obj(cls, obj):
        """Rebuild a Feature of this class from the output of encode_json_obj()."""
        positions = obj
        return cls(positions)

    def __repr__(self):
        return f"{self.__class__.__name__}({list(self.positions)!r})"

    @classmethod
    def expand(cls, starts, winlens, excludezero=False):
        """
        Return a list of features, one for each start point in starts
        and for each window length in winlen. If excludezero is True,
        no Features containing 0 in its positions will be generated
        (many tbl trainers have a special representation for the
        target feature at [0])

        For instance, importing a concrete subclass (Feature is abstract)

        >>> from nltk.tag.brill import Word

        First argument gives the possible start positions, second the
        possible window lengths

        >>> Word.expand([-3,-2,-1], [1])
        [Word([-3]), Word([-2]), Word([-1])]

        >>> Word.expand([-2,-1], [1])
        [Word([-2]), Word([-1])]

        >>> Word.expand([-3,-2,-1], [1,2])
        [Word([-3]), Word([-2]), Word([-1]), Word([-3, -2]), Word([-2, -1])]

        >>> Word.expand([-2,-1], [1])
        [Word([-2]), Word([-1])]

        A third optional argument excludes all Features whose positions contain zero

        >>> Word.expand([-2,-1,0], [1,2], excludezero=False)
        [Word([-2]), Word([-1]), Word([0]), Word([-2, -1]), Word([-1, 0])]

        >>> Word.expand([-2,-1,0], [1,2], excludezero=True)
        [Word([-2]), Word([-1]), Word([-2, -1])]

        All window lengths must be positive

        >>> Word.expand([-2,-1], [0])
        Traceback (most recent call last):
          ...
        ValueError: non-positive window length in [0]

        :param starts: where to start looking for Feature
        :type starts: list of ints
        :param winlens: window lengths where to look for Feature
        :type winlens: list of ints
        :param excludezero: do not output any Feature with 0 in any of its positions.
        :type excludezero: bool
        :returns: list of Features
        :raises ValueError: for non-positive window lengths
        """
        if not all(x > 0 for x in winlens):
            raise ValueError(f"non-positive window length in {winlens}")
        # Slide a window of each requested length over the start positions.
        xs = (starts[i : i + w] for w in winlens for i in range(len(starts) - w + 1))
        return [cls(x) for x in xs if not (excludezero and 0 in x)]

    def issuperset(self, other):
        """
        Return True if this Feature always returns True when other does

        More precisely, return True if this feature refers to the same property as other;
        and this Feature looks at all positions that other does (and possibly
        other positions in addition).

        #For instance, importing a concrete subclass (Feature is abstract)
        >>> from nltk.tag.brill import Word, Pos

        >>> Word([-3,-2,-1]).issuperset(Word([-3,-2]))
        True

        >>> Word([-3,-2,-1]).issuperset(Word([-3,-2, 0]))
        False

        #Feature subclasses must agree
        >>> Word([-3,-2,-1]).issuperset(Pos([-3,-2]))
        False

        :param other: feature with which to compare
        :type other: (subclass of) Feature
        :return: True if this feature is superset, otherwise False
        :rtype: bool
        """
        return self.__class__ is other.__class__ and set(self.positions) >= set(
            other.positions
        )

    def intersects(self, other):
        """
        Return True if the positions of this Feature intersects with those of other

        More precisely, return True if this feature refers to the same property as other;
        and there is some overlap in the positions they look at.

        #For instance, importing a concrete subclass (Feature is abstract)
        >>> from nltk.tag.brill import Word, Pos

        >>> Word([-3,-2,-1]).intersects(Word([-3,-2]))
        True

        >>> Word([-3,-2,-1]).intersects(Word([-3,-2, 0]))
        True

        >>> Word([-3,-2,-1]).intersects(Word([0]))
        False

        #Feature subclasses must agree
        >>> Word([-3,-2,-1]).intersects(Pos([-3,-2]))
        False

        :param other: feature with which to compare
        :type other: (subclass of) Feature
        :return: True if feature classes agree and there is some overlap in the positions they look at
        :rtype: bool
        """
        return bool(
            self.__class__ is other.__class__
            and set(self.positions) & set(other.positions)
        )

    # Rich comparisons for Features. All six are spelled out explicitly;
    # the derived ones are defined in terms of __lt__ and __eq__.
    def __eq__(self, other):
        return self.__class__ is other.__class__ and self.positions == other.positions

    def __hash__(self):
        # Defining __eq__ would otherwise make instances unhashable; hash the
        # same pair that __eq__ compares so equal Features hash alike.
        return hash((self.__class__, self.positions))

    def __lt__(self, other):
        # Lexicographic order: class name first, positions as tie-breaker.
        # (Comparing the two with a plain `or` lets both a < b and b < a hold.)
        # self.positions is a sorted tuple of ints
        return (self.__class__.__name__, self.positions) < (
            other.__class__.__name__,
            other.positions,
        )

    def __ne__(self, other):
        return not (self == other)

    def __gt__(self, other):
        return other < self

    def __ge__(self, other):
        return not self < other

    def __le__(self, other):
        return self < other or self == other

    @staticmethod
    @abstractmethod
    def extract_property(tokens, index):
        """
        Any subclass of Feature must define static method extract_property(tokens, index)

        :param tokens: the sequence of tokens
        :type tokens: list of tokens
        :param index: the current index
        :type index: int
        :return: feature value
        :rtype: any (but usually scalar)
        """