Spaces:
Sleeping
Sleeping
File size: 11,515 Bytes
d916065 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 |
# Natural Language Toolkit: Transformation-based learning
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Marcus Uneson <[email protected]>
# based on previous (nltk2) version by
# Christopher Maloof, Edward Loper, Steven Bird
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
from abc import ABCMeta, abstractmethod
from nltk import jsontags
######################################################################
# Tag Rules
######################################################################
class TagRule(metaclass=ABCMeta):
    """
    Abstract interface for a single tag transformation, as used by
    transformation-based (tbl) taggers.

    A transformation looks at tokens carrying a fixed *original tag*,
    checks a fixed condition (which may inspect the token itself as well
    as other tokens in the sentence), and rewrites the tag of every
    matching token to a fixed *replacement tag*.

    Concrete rules must be comparable and hashable: the default
    ``__eq__``, ``__ne__`` and ``__hash__`` defined here raise
    ``TypeError`` until a subclass overrides them.
    """

    def __init__(self, original_tag, replacement_tag):
        """
        :param original_tag: the tag which this rule may cause to be replaced
        :param replacement_tag: the tag with which this rule may replace
            another tag
        """
        # The tag which this TagRule may cause to be replaced.
        self.original_tag = original_tag
        # The tag with which this TagRule may replace another tag.
        self.replacement_tag = replacement_tag

    def apply(self, tokens, positions=None):
        """
        Apply this rule, in place, at every requested position of the
        given sentence where it is applicable.

        For each position ``p`` in *positions* for which
        ``self.applies(tokens, p)`` is true, ``tokens[p]`` is replaced by
        a pair holding the same word and this rule's replacement tag.

        :param tokens: The tagged sentence (modified in place)
        :type tokens: list(tuple(str, str))
        :param positions: The positions where the transformation is to
            be tried.  If not specified, try it at all positions.
        :type positions: list(int)
        :return: The indices of tokens whose tags were changed by this
            rule.
        :rtype: list(int)
        """
        candidates = range(len(tokens)) if positions is None else positions

        # First pass: collect every index where the rule fires.  This is
        # kept separate from the rewriting pass so that a change made at
        # one position cannot influence whether the rule fires at another.
        changed = [idx for idx in candidates if self.applies(tokens, idx)]

        # Second pass: rewrite the tags, keeping each word untouched.
        for idx in changed:
            word = tokens[idx][0]
            tokens[idx] = (word, self.replacement_tag)

        return changed

    @abstractmethod
    def applies(self, tokens, index):
        """
        :return: True if the rule would change the tag of
            ``tokens[index]``, False otherwise
        :rtype: bool
        :param tokens: A tagged sentence
        :type tokens: list(tuple(str, str))
        :param index: The index to check
        :type index: int
        """

    # The learning algorithm stores rules in sets and dicts, so concrete
    # subclasses have to supply real comparison and hashing; the
    # placeholders below fail loudly if they forget to.

    def __eq__(self, other):
        raise TypeError("Rules must implement __eq__()")

    def __ne__(self, other):
        raise TypeError("Rules must implement __ne__()")

    def __hash__(self):
        raise TypeError("Rules must implement __hash__()")
@jsontags.register_tag
class Rule(TagRule):
    """
    A Rule checks the current corpus position for a certain set of
    conditions; if they are all fulfilled, the Rule is triggered, meaning
    that it will change tag A to tag B.  For other tags than A, nothing
    happens.

    The conditions are parameters to the Rule instance.  Each condition is
    a (feature, value) pair, where the feature carries a set of relative
    positions at which to look for the value.  Conceptually, the positions
    of one feature are joined by logical OR, and the conditions themselves
    by logical AND.

    More formally, the Rule is applicable to the ``n``-th token iff:

      - the ``n``-th token is tagged with the Rule's original tag; and
      - for each ``(feature, value)`` condition, the property extracted by
        ``feature`` from at least one in-range token at ``n + p``, for
        ``p`` in ``feature.positions``, equals ``value``.

    Equality and hashing are both based on the class, the original tag,
    the replacement tag and the conditions; the template id takes no part
    in either.
    """

    json_tag = "nltk.tbl.Rule"

    def __init__(self, templateid, original_tag, replacement_tag, conditions):
        """
        Construct a new Rule that changes a token's tag from
        ``original_tag`` to ``replacement_tag`` if all of the properties
        specified in ``conditions`` hold.

        :param templateid: the template id (a zero-padded string, '001' etc,
            so it will sort nicely)
        :type templateid: str
        :param original_tag: the tag this rule replaces
        :param replacement_tag: the tag this rule assigns
        :param conditions: (feature, value) pairs, each of which specifies
            that the property computed by ``feature.extract_property()``
            for at least one token at ``n + p``, ``p`` in
            ``feature.positions``, is ``value``.
        :type conditions: iterable of (Feature, value) pairs
        """
        super().__init__(original_tag, replacement_tag)
        # Snapshot the conditions as a tuple of tuples.  This lets callers
        # pass any iterable (including one-shot generators), protects the
        # cached repr/hash from later mutation of the caller's container,
        # and makes a rule compare equal to its JSON-decoded copy, which
        # is always built from tuples (see decode_json_obj).
        self._conditions = tuple(tuple(cond) for cond in conditions)
        self.templateid = templateid

    def encode_json_obj(self):
        """Return the dict of fields from which this rule can be rebuilt."""
        return {
            "templateid": self.templateid,
            "original": self.original_tag,
            "replacement": self.replacement_tag,
            "conditions": self._conditions,
        }

    @classmethod
    def decode_json_obj(cls, obj):
        """Rebuild a rule from a dict shaped like ``encode_json_obj()``'s."""
        return cls(
            obj["templateid"],
            obj["original"],
            obj["replacement"],
            tuple(tuple(feat) for feat in obj["conditions"]),
        )

    def applies(self, tokens, index):
        # Inherit docs from TagRule

        # Does the given token have this Rule's "original tag"?
        if tokens[index][1] != self.original_tag:
            return False

        n_tokens = len(tokens)

        # Check to make sure that every condition holds.
        for (feature, val) in self._conditions:
            # Look for *any* token that satisfies the condition.
            for pos in feature.positions:
                # Positions falling outside the sentence are ignored.
                if not (0 <= index + pos < n_tokens):
                    continue
                if feature.extract_property(tokens, index + pos) == val:
                    break
            else:
                # No token satisfied the condition; return false.
                return False

        # Every condition checked out, so the Rule is applicable.
        return True

    def __eq__(self, other):
        return self is other or (
            other is not None
            and other.__class__ == self.__class__
            and self.original_tag == other.original_tag
            and self.replacement_tag == other.replacement_tag
            and self._conditions == other._conditions
        )

    def __ne__(self, other):
        return not (self == other)

    def __hash__(self):
        # Cache our hash value (justified by profiling.)
        try:
            return self.__hash
        except AttributeError:
            # Hash only what __eq__ compares.  The template id must stay
            # out: __eq__ ignores it, and objects that compare equal are
            # required to have equal hashes.  reprs are hashed rather than
            # the objects themselves so that unhashable tags, features or
            # values keep working.
            self.__hash = hash(
                (
                    self.__class__.__name__,
                    repr(self.original_tag),
                    repr(self.replacement_tag),
                    tuple((repr(f), repr(v)) for (f, v) in self._conditions),
                )
            )
            return self.__hash

    def __repr__(self):
        # Cache the repr (justified by profiling -- this is used as
        # a sort key when deterministic=True.)
        try:
            return self.__repr
        except AttributeError:
            self.__repr = "{}('{}', {}, {}, [{}])".format(
                self.__class__.__name__,
                self.templateid,
                repr(self.original_tag),
                repr(self.replacement_tag),
                # list(self._conditions) would be simpler but will not generate
                # the same Rule.__repr__ in python 2 and 3 and thus break some tests
                ", ".join(f"({f},{repr(v)})" for (f, v) in self._conditions),
            )
            return self.__repr

    def __str__(self):
        def _condition_to_logic(feature, value):
            """
            Return a compact, predicate-logic styled string representation
            of the given condition.
            """
            return "{}:{}@[{}]".format(
                feature.PROPERTY_NAME,
                value,
                ",".join(str(w) for w in feature.positions),
            )

        conditions = " & ".join(
            _condition_to_logic(f, v) for (f, v) in self._conditions
        )
        return f"{self.original_tag}->{self.replacement_tag} if {conditions}"

    def format(self, fmt):
        """
        Return a string representation of this rule.

        >>> from nltk.tbl.rule import Rule
        >>> from nltk.tag.brill import Pos

        >>> r = Rule("23", "VB", "NN", [(Pos([-2,-1]), 'DT')])

        >>> r.format("str") == str(r)
        True
        >>> r.format("str")
        'VB->NN if Pos:DT@[-2,-1]'

        >>> r.format("repr") == repr(r)
        True
        >>> r.format("repr")
        "Rule('23', 'VB', 'NN', [(Pos([-2, -1]),'DT')])"

        >>> r.format("verbose")
        'VB -> NN if the Pos of words i-2...i-1 is "DT"'

        >>> r.format("not_found")
        Traceback (most recent call last):
          ...
        ValueError: unknown rule format spec: not_found

        :param fmt: format specification; one of "str", "repr" or "verbose"
        :type fmt: str
        :return: string representation
        :rtype: str
        :raises ValueError: if *fmt* is not a known format specification
        """
        if fmt == "str":
            return self.__str__()
        elif fmt == "repr":
            return self.__repr__()
        elif fmt == "verbose":
            return self._verbose_format()
        else:
            raise ValueError(f"unknown rule format spec: {fmt}")

    def _verbose_format(self):
        """
        Return a wordy, human-readable string representation
        of the given rule.

        Not sure how useful this is.
        """

        def condition_to_str(feature, value):
            return 'the {} of {} is "{}"'.format(
                feature.PROPERTY_NAME,
                range_to_str(feature.positions),
                value,
            )

        def range_to_str(positions):
            if len(positions) == 1:
                p = positions[0]
                if p == 0:
                    return "this word"
                if p == -1:
                    return "the preceding word"
                elif p == 1:
                    return "the following word"
                elif p < 0:
                    return "word i-%d" % -p
                elif p > 0:
                    return "word i+%d" % p
            else:
                # for complete compatibility with the wordy format of nltk2
                mx = max(positions)
                mn = min(positions)
                if mx - mn == len(positions) - 1:
                    # The positions form one contiguous run.
                    return "words i%+d...i%+d" % (mn, mx)
                else:
                    return "words {{{}}}".format(
                        ",".join("i%+d" % d for d in positions)
                    )

        replacement = f"{self.original_tag} -> {self.replacement_tag}"
        conditions = (" if " if self._conditions else "") + ", and ".join(
            condition_to_str(f, v) for (f, v) in self._conditions
        )
        return replacement + conditions
|