Spaces:

sunnychenxiwang
/

EasyDetect

Sleeping

App Files Files Community

EasyDetect / pipeline /nltk /tbl /feature.py

sunnychenxiwang

update nltk

d916065 12 months ago

raw

history blame

9.69 kB

	# Natural Language Toolkit: Transformation-based learning
	#
	# Copyright (C) 2001-2023 NLTK Project
	# Author: Marcus Uneson <[email protected]>
	# based on previous (nltk2) version by
	# Christopher Maloof, Edward Loper, Steven Bird
	# URL: <https://www.nltk.org/>
	# For license information, see LICENSE.TXT

	from abc import ABCMeta, abstractmethod


	class Feature(metaclass=ABCMeta):
	    """
	    An abstract base class for Features. A Feature is a combination of
	    a specific property-computing method and a list of relative positions
	    to apply that method to.

	    The property-computing method, ``extract_property(tokens, index)``,
	    must be implemented by every subclass. It extracts or computes a specific
	    property for the token at the current index. Typical extract_property()
	    methods return features such as the token text or tag; but more involved
	    methods may consider the entire sequence ``tokens`` and
	    for instance compute the length of the sentence the token belongs to.

	    In addition, the subclass may have a PROPERTY_NAME, which is how
	    it will be printed (in Rules and Templates, etc). If not given, defaults
	    to the classname.

	    Instances compare equal when they are of exactly the same class and
	    cover the same positions; they are hashable consistently with that
	    equality and are ordered by (class name, positions).
	    """

	    # Identifier for this class in serialized form (presumably consumed by a
	    # JSON tag registry defined elsewhere -- not visible in this file).
	    json_tag = "nltk.tbl.Feature"
	    # Display name; subclasses may override. Falls back to the class name.
	    PROPERTY_NAME = None

	    def __init__(self, positions, end=None):
	        """
	        Construct a Feature which may apply at ``positions``.

	        >>> # For instance, importing some concrete subclasses (Feature is abstract)
	        >>> from nltk.tag.brill import Word, Pos

	        >>> # Feature Word, applying at one of [-2, -1]
	        >>> Word([-2,-1])
	        Word([-2, -1])

	        >>> # Positions need not be contiguous
	        >>> Word([-2,-1, 1])
	        Word([-2, -1, 1])

	        >>> # Contiguous ranges can alternatively be specified giving the
	        >>> # two endpoints (inclusive)
	        >>> Pos(-3, -1)
	        Pos([-3, -2, -1])

	        >>> # In two-arg form, start <= end is enforced
	        >>> Pos(2, 1)
	        Traceback (most recent call last):
	          File "<stdin>", line 1, in <module>
	          File "nltk/tbl/template.py", line 306, in __init__
	            raise TypeError
	        ValueError: illegal interval specification: (start=2, end=1)

	        :type positions: list of int
	        :param positions: the positions at which this feature should apply
	        :raises ValueError: illegal position specifications

	        An alternative calling convention, for contiguous positions only,
	        is Feature(start, end):

	        :type start: int
	        :param start: start of range where this feature should apply
	        :type end: int
	        :param end: end of range (NOTE: inclusive!) where this feature should apply
	        """
	        if end is None:
	            # One-argument form: coerce to int, drop duplicates and store the
	            # positions in canonical (sorted) order as an immutable tuple.
	            self.positions = tuple(sorted({int(i) for i in positions}))
	        else:
	            # Two-argument form: ``positions`` is actually the start index.
	            try:
	                if positions > end:
	                    # Routed through the handler below so that an inverted
	                    # interval and a non-integer endpoint are reported alike.
	                    raise TypeError
	                self.positions = tuple(range(positions, end + 1))
	            except TypeError as e:
	                # let any kind of erroneous spec raise ValueError
	                raise ValueError(
	                    f"illegal interval specification: (start={positions}, end={end})"
	                ) from e

	        # set property name given in subclass, or otherwise name of subclass
	        self.PROPERTY_NAME = self.__class__.PROPERTY_NAME or self.__class__.__name__

	    def encode_json_obj(self):
	        """Return the serializable state of this Feature: its positions tuple."""
	        return self.positions

	    @classmethod
	    def decode_json_obj(cls, obj):
	        """
	        Rebuild a Feature from the value produced by ``encode_json_obj``.

	        :param obj: an iterable of positions
	        :return: a new instance of ``cls`` covering those positions
	        """
	        return cls(obj)

	    def __repr__(self):
	        return f"{self.__class__.__name__}({list(self.positions)!r})"

	    @classmethod
	    def expand(cls, starts, winlens, excludezero=False):
	        """
	        Return a list of features, one for each start point in starts
	        and for each window length in winlens. If excludezero is True,
	        no Features containing 0 in its positions will be generated
	        (many tbl trainers have a special representation for the
	        target feature at [0])

	        For instance, importing a concrete subclass (Feature is abstract)

	        >>> from nltk.tag.brill import Word

	        First argument gives the possible start positions, second the
	        possible window lengths

	        >>> Word.expand([-3,-2,-1], [1])
	        [Word([-3]), Word([-2]), Word([-1])]

	        >>> Word.expand([-2,-1], [1])
	        [Word([-2]), Word([-1])]

	        >>> Word.expand([-3,-2,-1], [1,2])
	        [Word([-3]), Word([-2]), Word([-1]), Word([-3, -2]), Word([-2, -1])]

	        >>> Word.expand([-2,-1], [1])
	        [Word([-2]), Word([-1])]

	        A third optional argument excludes all Features whose positions contain zero

	        >>> Word.expand([-2,-1,0], [1,2], excludezero=False)
	        [Word([-2]), Word([-1]), Word([0]), Word([-2, -1]), Word([-1, 0])]

	        >>> Word.expand([-2,-1,0], [1,2], excludezero=True)
	        [Word([-2]), Word([-1]), Word([-2, -1])]

	        All window lengths must be positive

	        >>> Word.expand([-2,-1], [0])
	        Traceback (most recent call last):
	          File "<stdin>", line 1, in <module>
	          File "nltk/tag/tbl/template.py", line 371, in expand
	            raise ValueError(f"non-positive window length in {winlens}")
	        ValueError: non-positive window length in [0]

	        :param starts: where to start looking for Feature
	        :type starts: list of ints
	        :param winlens: window lengths where to look for Feature
	        :type winlens: list of ints
	        :param excludezero: do not output any Feature with 0 in any of its positions.
	        :type excludezero: bool
	        :returns: list of Features
	        :raises ValueError: for non-positive window lengths
	        """
	        if not all(x > 0 for x in winlens):
	            raise ValueError(f"non-positive window length in {winlens}")
	        # Slide a window of each length over ``starts``; windows are taken
	        # from the list as given (by index), not from a numeric range.
	        xs = (starts[i : i + w] for w in winlens for i in range(len(starts) - w + 1))
	        return [cls(x) for x in xs if not (excludezero and 0 in x)]

	    def issuperset(self, other):
	        """
	        Return True if this Feature always returns True when other does

	        More precisely, return True if this feature refers to the same property as other;
	        and this Feature looks at all positions that other does (and possibly
	        other positions in addition).

	        #For instance, importing a concrete subclass (Feature is abstract)
	        >>> from nltk.tag.brill import Word, Pos

	        >>> Word([-3,-2,-1]).issuperset(Word([-3,-2]))
	        True

	        >>> Word([-3,-2,-1]).issuperset(Word([-3,-2, 0]))
	        False

	        #Feature subclasses must agree
	        >>> Word([-3,-2,-1]).issuperset(Pos([-3,-2]))
	        False

	        :param other: feature with which to compare
	        :type other: (subclass of) Feature
	        :return: True if this feature is superset, otherwise False
	        :rtype: bool
	        """
	        return self.__class__ is other.__class__ and set(self.positions) >= set(
	            other.positions
	        )

	    def intersects(self, other):
	        """
	        Return True if the positions of this Feature intersects with those of other

	        More precisely, return True if this feature refers to the same property as other;
	        and there is some overlap in the positions they look at.

	        #For instance, importing a concrete subclass (Feature is abstract)
	        >>> from nltk.tag.brill import Word, Pos

	        >>> Word([-3,-2,-1]).intersects(Word([-3,-2]))
	        True

	        >>> Word([-3,-2,-1]).intersects(Word([-3,-2, 0]))
	        True

	        >>> Word([-3,-2,-1]).intersects(Word([0]))
	        False

	        #Feature subclasses must agree
	        >>> Word([-3,-2,-1]).intersects(Pos([-3,-2]))
	        False

	        :param other: feature with which to compare
	        :type other: (subclass of) Feature
	        :return: True if feature classes agree and there is some overlap in the positions they look at
	        :rtype: bool
	        """
	        return bool(
	            self.__class__ is other.__class__
	            and set(self.positions) & set(other.positions)
	        )

	    # Rich comparisons for Features. Equality requires the exact same class
	    # and the same positions; ordering is lexicographic on
	    # (class name, positions). Comparing against a non-Feature yields
	    # NotImplemented so Python raises its usual TypeError.
	    def __eq__(self, other):
	        return self.__class__ is other.__class__ and self.positions == other.positions

	    def __hash__(self):
	        # Defining __eq__ alone would make instances unhashable; hash on the
	        # same data that __eq__ compares.
	        return hash((self.__class__, self.positions))

	    def __lt__(self, other):
	        if not isinstance(other, Feature):
	            return NotImplemented
	        # Tuple comparison gives a proper strict ordering: positions only
	        # break ties between equally-named classes. (self.positions is a
	        # sorted tuple of ints.)
	        return (self.__class__.__name__, self.positions) < (
	            other.__class__.__name__,
	            other.positions,
	        )

	    def __ne__(self, other):
	        return not (self == other)

	    def __gt__(self, other):
	        if not isinstance(other, Feature):
	            return NotImplemented
	        return other < self

	    def __ge__(self, other):
	        if not isinstance(other, Feature):
	            return NotImplemented
	        return not self < other

	    def __le__(self, other):
	        if not isinstance(other, Feature):
	            return NotImplemented
	        return self < other or self == other

	    @staticmethod
	    @abstractmethod
	    def extract_property(tokens, index):
	        """
	        Any subclass of Feature must define static method extract_property(tokens, index)

	        :param tokens: the sequence of tokens
	        :type tokens: list of tokens
	        :param index: the current index
	        :type index: int
	        :return: feature value
	        :rtype: any (but usually scalar)
	        """