File size: 12,461 Bytes
d916065 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 |
# Natural Language Toolkit: Classifier Utility Functions
# Copyright (C) 2001-2023 NLTK Project
# Author: Edward Loper <[email protected]>
# Steven Bird <[email protected]> (minor additions)
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Utility functions and classes for classifiers.
"""
import math
# from nltk.util import Deprecated
import nltk.classify.util # for accuracy & log_likelihood
from nltk.util import LazyMap
# { Helper Functions
# alternative name possibility: 'map_featurefunc()'?
# alternative name possibility: 'detect_features()'?
# alternative name possibility: 'map_featuredetect()'?
# or.. just have users use LazyMap directly?
def apply_features(feature_func, toks, labeled=None):
    """
    Use the ``LazyMap`` class to construct a lazy list-like
    object that is analogous to ``map(feature_func, toks)``.  In
    particular, if ``labeled=False``, then the returned list-like
    object's values are equal to::

        [feature_func(tok) for tok in toks]

    If ``labeled=True``, then the returned list-like object's values
    are equal to::

        [(feature_func(tok), label) for (tok, label) in toks]

    The primary purpose of this function is to avoid the memory
    overhead involved in storing all the featuresets for every token
    in a corpus.  Instead, these featuresets are constructed lazily,
    as-needed.  The reduction in memory overhead can be especially
    significant when the underlying list of tokens is itself lazy (as
    is the case with many corpus readers).

    :param feature_func: The function that will be applied to each
        token.  It should return a featureset -- i.e., a dict
        mapping feature names to feature values.
    :param toks: The list of tokens to which ``feature_func`` should be
        applied.  If ``labeled=False``, then the list elements will be
        passed directly to ``feature_func()``.  If ``labeled=True``,
        then the list elements should be tuples ``(tok,label)``, and
        ``tok`` will be passed to ``feature_func()``.
    :param labeled: If true, then ``toks`` contains labeled tokens --
        i.e., tuples of the form ``(tok, label)``.  (Default:
        auto-detect based on types.)
    :return: A ``LazyMap`` over ``toks``.
    """
    if labeled is None:
        # Auto-detect: a non-empty ``toks`` whose first element is a
        # tuple or list is treated as labeled.  An empty ``toks`` is
        # falsy and therefore takes the unlabeled path.
        labeled = toks and isinstance(toks[0], (tuple, list))

    if labeled:

        def lazy_func(labeled_token):
            # Apply the feature function to the token only; the label
            # is passed through untouched.
            return (feature_func(labeled_token[0]), labeled_token[1])

        return LazyMap(lazy_func, toks)

    return LazyMap(feature_func, toks)
def attested_labels(tokens):
    """
    :return: A tuple of all labels that are attested in the given list
        of tokens.  Each distinct label appears exactly once, in order
        of first occurrence.
    :rtype: tuple of (immutable)
    :param tokens: The list of classified tokens from which to extract
        labels.  A classified token has the form ``(token, label)``.
    :type tokens: list
    """
    # ``dict.fromkeys`` de-duplicates like a set would, but keeps the
    # insertion order, so the result is reproducible between runs.
    return tuple(dict.fromkeys(label for (_tok, label) in tokens))
def log_likelihood(classifier, gold):
    """
    Return the log of the mean probability that ``classifier`` assigns
    to the gold label of each item in ``gold``.

    :param classifier: An object providing
        ``prob_classify_many(featuresets)``, which returns one
        distribution per featureset; each distribution must provide
        ``prob(label)``.
    :param gold: An iterable of ``(featureset, label)`` pairs.  It is
        traversed exactly once, so one-shot iterables are accepted.
    :return: ``math.log`` of the average gold-label probability, or
        ``-inf`` when that average is zero.
    :raise ZeroDivisionError: If ``gold`` contains no items.
    """
    # Split the pairs in a single pass so that ``gold`` is not iterated
    # a second time just to recover the labels.
    featuresets = []
    labels = []
    for featureset, label in gold:
        featuresets.append(featureset)
        labels.append(label)

    pdists = classifier.prob_classify_many(featuresets)
    probs = [pdist.prob(label) for (label, pdist) in zip(labels, pdists)]
    mean_prob = sum(probs) / len(probs)
    if mean_prob == 0:
        # math.log(0) raises a domain error; the limit value is -inf.
        return float("-inf")
    return math.log(mean_prob)
def accuracy(classifier, gold):
    """
    Return the fraction of items in ``gold`` whose label is predicted
    correctly by ``classifier``.

    :param classifier: An object providing
        ``classify_many(featuresets)``, which returns one predicted
        label per featureset.
    :param gold: An iterable of ``(featureset, label)`` pairs.  It is
        traversed exactly once, so one-shot iterables are accepted.
    :return: The proportion of correct predictions, or ``0`` when
        there is nothing to score.
    """
    # Split the pairs in a single pass so that ``gold`` is not iterated
    # a second time just to recover the labels.
    featuresets = []
    labels = []
    for featureset, label in gold:
        featuresets.append(featureset)
        labels.append(label)

    predictions = classifier.classify_many(featuresets)
    correct = [label == guess for (label, guess) in zip(labels, predictions)]
    if correct:
        return sum(correct) / len(correct)
    return 0
class CutoffChecker:
    """
    A helper class that implements cutoff checks based on number of
    iterations and log likelihood.

    Accuracy cutoffs are also implemented, but they're almost never
    a good idea to use.
    """

    def __init__(self, cutoffs):
        """
        :param cutoffs: A dict of cutoff settings.  The keys consulted
            by this class are ``max_iter``, ``min_ll``,
            ``min_lldelta``, ``max_acc`` and ``min_accdelta``.  The
            dict is copied; the caller's dict is left unmodified.
        """
        self.cutoffs = cutoffs.copy()
        # Normalize signs on our own copy (not on the caller's dict):
        # ``min_ll`` is stored as a non-positive value and
        # ``min_lldelta`` as a non-negative one.
        if "min_ll" in self.cutoffs:
            self.cutoffs["min_ll"] = -abs(self.cutoffs["min_ll"])
        if "min_lldelta" in self.cutoffs:
            self.cutoffs["min_lldelta"] = abs(self.cutoffs["min_lldelta"])
        self.ll = None  # log likelihood seen by the previous check
        self.acc = None  # accuracy seen by the previous check
        self.iter = 1

    def check(self, classifier, train_toks):
        """
        Advance the iteration counter and report whether any configured
        cutoff has been reached.

        :param classifier: The classifier to evaluate; passed through
            to ``log_likelihood`` and ``accuracy``.
        :param train_toks: The ``(featureset, label)`` pairs on which
            the classifier is evaluated.
        :return: True if a cutoff was reached (or the log likelihood is
            NaN), False otherwise.
        """
        cutoffs = self.cutoffs
        self.iter += 1
        if "max_iter" in cutoffs and self.iter >= cutoffs["max_iter"]:
            return True  # iteration cutoff.

        # The log likelihood is computed on every call that gets past
        # the iteration cutoff, so that a NaN value always stops.
        new_ll = nltk.classify.util.log_likelihood(classifier, train_toks)
        if math.isnan(new_ll):
            return True

        if "min_ll" in cutoffs or "min_lldelta" in cutoffs:
            if "min_ll" in cutoffs and new_ll >= cutoffs["min_ll"]:
                return True  # log likelihood cutoff
            if (
                "min_lldelta" in cutoffs
                # Compare against None explicitly: 0.0 is a legitimate
                # previous value and must not disable the delta check.
                and self.ll is not None
                and (new_ll - self.ll) <= abs(cutoffs["min_lldelta"])
            ):
                return True  # log likelihood delta cutoff
            self.ll = new_ll

        if "max_acc" in cutoffs or "min_accdelta" in cutoffs:
            new_acc = nltk.classify.util.accuracy(classifier, train_toks)
            if "max_acc" in cutoffs and new_acc >= cutoffs["max_acc"]:
                return True  # accuracy cutoff
            if (
                "min_accdelta" in cutoffs
                and self.acc is not None
                and (new_acc - self.acc) <= abs(cutoffs["min_accdelta"])
            ):
                return True  # accuracy delta cutoff
            self.acc = new_acc

        return False  # no cutoff reached.
# { Demos
def names_demo_features(name):
    """
    Build the featureset used by the names demos.

    :param name: A non-empty name string.
    :return: A dict containing ``alwayson`` (always True), the
        lowercased first and last characters (``startswith``,
        ``endswith``) and, for each letter a-z, ``count(<letter>)`` and
        ``has(<letter>)`` computed on the lowercased name.
    :raise IndexError: If ``name`` is empty.
    """
    # Lowercase once instead of once per feature inside the loop.
    lowered = name.lower()
    features = {
        "alwayson": True,
        "startswith": name[0].lower(),
        "endswith": name[-1].lower(),
    }
    for letter in "abcdefghijklmnopqrstuvwxyz":
        features["count(%s)" % letter] = lowered.count(letter)
        features["has(%s)" % letter] = letter in lowered
    return features
def binary_names_demo_features(name):
    """
    Build a featureset for a name using mostly boolean features.

    :param name: A non-empty name string.
    :return: A dict containing ``alwayson`` (always True), whether the
        lowercased first/last character is one of ``aeiouy``
        (``startswith(vowel)``, ``endswith(vowel)``) and, for each
        letter a-z, ``count(<letter>)``, ``has(<letter>)``,
        ``startswith(<letter>)`` and ``endswith(<letter>)``.
    :raise IndexError: If ``name`` is empty.
    """
    # Compute the loop-invariant pieces once.
    lowered = name.lower()
    first = name[0].lower()
    last = name[-1].lower()

    features = {
        "alwayson": True,
        "startswith(vowel)": first in "aeiouy",
        "endswith(vowel)": last in "aeiouy",
    }
    for letter in "abcdefghijklmnopqrstuvwxyz":
        features["count(%s)" % letter] = lowered.count(letter)
        features["has(%s)" % letter] = letter in lowered
        features["startswith(%s)" % letter] = letter == first
        features["endswith(%s)" % letter] = letter == last
    return features
def names_demo(trainer, features=names_demo_features):
    """
    Train and evaluate a classifier on the ``names`` corpus, printing
    the results.

    :param trainer: A callable that receives a list of
        ``(featureset, label)`` pairs and returns a classifier.
    :param features: A function mapping a name to a featureset.
    :return: The trained classifier.
    """
    import random

    from nltk.corpus import names

    # Construct a list of classified names, using the names corpus.
    namelist = [(name, "male") for name in names.words("male.txt")] + [
        (name, "female") for name in names.words("female.txt")
    ]

    # Randomly split the names into a test & train set.  Without the
    # shuffle the list is still ordered "all male, then all female",
    # which makes the slices below unrepresentative.  A private,
    # fixed-seed generator keeps the split reproducible without
    # touching the global ``random`` state.
    random.Random(123456).shuffle(namelist)
    train = namelist[:5000]
    test = namelist[5000:5500]

    # Train up a classifier.
    print("Training classifier...")
    classifier = trainer([(features(n), g) for (n, g) in train])

    # Run the classifier on the test data.
    print("Testing classifier...")
    acc = accuracy(classifier, [(features(n), g) for (n, g) in test])
    print("Accuracy: %6.4f" % acc)

    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        test_featuresets = [features(n) for (n, g) in test]
        pdists = classifier.prob_classify_many(test_featuresets)
        ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
        print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
        print("Unseen Names P(Male) P(Female)\n" + "-" * 40)
        for ((name, gender), pdist) in list(zip(test, pdists))[:5]:
            # The asterisk marks the column of the gold label.
            if gender == "male":
                fmt = " %-15s *%6.4f %6.4f"
            else:
                fmt = " %-15s %6.4f *%6.4f"
            print(fmt % (name, pdist.prob("male"), pdist.prob("female")))
    except NotImplementedError:
        # The classifier does not support probability estimates.
        pass

    # Return the classifier
    return classifier
def partial_names_demo(trainer, features=names_demo_features):
    """
    Train and evaluate a classifier from positive and unlabeled
    examples drawn from the ``names`` corpus, printing the results.

    :param trainer: A callable invoked as
        ``trainer(positive, unlabeled)``, where both arguments are
        iterables of featuresets, and returning a classifier.
    :param features: A function mapping a name to a featureset.
    :return: The trained classifier.
    """
    from nltk.corpus import names

    male_names = names.words("male.txt")
    female_names = names.words("female.txt")
    # NOTE(review): the slices below are taken in whatever order the
    # corpus reader returns the names; no shuffling is done here.

    # Create a list of male names to be used as positive-labeled examples for training
    positive = map(features, male_names[:2000])

    # Create a list of male and female names to be used as unlabeled examples
    unlabeled = map(features, male_names[2000:2500] + female_names[:500])

    # Create a test set with correctly-labeled male and female names
    test = [(name, True) for name in male_names[2500:2750]] + [
        (name, False) for name in female_names[500:750]
    ]

    # Train up a classifier.
    print("Training classifier...")
    classifier = trainer(positive, unlabeled)

    # Run the classifier on the test data.
    print("Testing classifier...")
    acc = accuracy(classifier, [(features(n), m) for (n, m) in test])
    print("Accuracy: %6.4f" % acc)

    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        test_featuresets = [features(n) for (n, m) in test]
        pdists = classifier.prob_classify_many(test_featuresets)
        ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
        print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
        print("Unseen Names P(Male) P(Female)\n" + "-" * 40)
        # ``zip`` returns an iterator, which cannot be sliced directly.
        for ((name, is_male), pdist) in list(zip(test, pdists))[:5]:
            # The asterisk marks the column of the gold label.
            if is_male:
                fmt = " %-15s *%6.4f %6.4f"
            else:
                fmt = " %-15s %6.4f *%6.4f"
            print(fmt % (name, pdist.prob(True), pdist.prob(False)))
    except NotImplementedError:
        # The classifier does not support probability estimates.
        pass

    # Return the classifier
    return classifier
# Cache of senseval instances, keyed by word, so that repeated demo
# runs do not re-read the corpus.
_inst_cache = {}


def wsd_demo(trainer, word, features, n=1000):
    """
    Train and evaluate a word-sense classifier on the ``senseval``
    corpus, printing the results.

    :param trainer: A callable that receives a list of
        ``(featureset, label)`` pairs and returns a classifier.
    :param word: The senseval item whose instances are used.
    :param features: A function mapping an instance to a featureset.
    :param n: Maximum number of instances to use; 80% of them are used
        for training and the remainder for testing.
    :return: The trained classifier.
    """
    import random

    from nltk.corpus import senseval

    # Get the instances.
    print("Reading data...")
    if word not in _inst_cache:
        _inst_cache[word] = [(i, i.senses[0]) for i in senseval.instances(word)]
    # Work on a copy so that the cached list keeps its original order.
    instances = _inst_cache[word][:]
    if n > len(instances):
        n = len(instances)
    # Sorted so that the printed sense list is stable between runs.
    senses = sorted({sense for (_inst, sense) in instances})
    print(" Senses: " + " ".join(senses))

    # Randomly split the names into a test & train set.  A private,
    # fixed-seed generator keeps the split reproducible without
    # touching the global ``random`` state.
    print("Splitting into test & train...")
    random.Random(123456).shuffle(instances)
    split = int(0.8 * n)
    train = instances[:split]
    test = instances[split:n]

    # Train up a classifier.
    print("Training classifier...")
    classifier = trainer([(features(i), l) for (i, l) in train])

    # Run the classifier on the test data.
    print("Testing classifier...")
    acc = accuracy(classifier, [(features(i), l) for (i, l) in test])
    print("Accuracy: %6.4f" % acc)

    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        test_featuresets = [features(inst) for (inst, _sense) in test]
        pdists = classifier.prob_classify_many(test_featuresets)
        ll = [pdist.logprob(gold) for ((_inst, gold), pdist) in zip(test, pdists)]
        print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
    except NotImplementedError:
        # The classifier does not support probability estimates.
        pass

    # Return the classifier
    return classifier
def check_megam_config():
    """
    Checks whether the MEGAM binary is configured.

    :raise NameError: If the megam binary has not been configured; the
        message explains how to configure it.
    """
    try:
        # NOTE(review): the guarded statement was missing from this
        # copy of the file.  A bare reference to ``_megam_bin`` is the
        # presumed original -- it raises NameError while that name is
        # unset.  ``_megam_bin`` is not defined in the visible part of
        # this module; confirm where it is expected to come from.
        _megam_bin
    except NameError as e:
        err_msg = (
            "Please configure your megam binary first, e.g.\n"
            ">>> nltk.config_megam('/usr/bin/local/megam')"
        )
        raise NameError(err_msg) from e