Spaces:

sunnychenxiwang
/

EasyDetect

Sleeping

App Files Files Community

EasyDetect / pipeline /nltk /classify /util.py

sunnychenxiwang

update nltk

d916065 12 months ago

raw

history blame

12.5 kB

	# Natural Language Toolkit: Classifier Utility Functions
	#
	# Copyright (C) 2001-2023 NLTK Project
	# Author: Edward Loper <edloper@gmail.com>
	#         Steven Bird <stevenbird1@gmail.com> (minor additions)
	# URL: <https://www.nltk.org/>
	# For license information, see LICENSE.TXT

	"""
	Utility functions and classes for classifiers.
	"""

	import math

	# from nltk.util import Deprecated
	import nltk.classify.util # for accuracy & log_likelihood
	from nltk.util import LazyMap

	######################################################################
	# { Helper Functions
	######################################################################

	# alternative name possibility: 'map_featurefunc()'?
	# alternative name possibility: 'detect_features()'?
	# alternative name possibility: 'map_featuredetect()'?
	# or.. just have users use LazyMap directly?
	def apply_features(feature_func, toks, labeled=None):
	    """
	    Use the ``LazyMap`` class to construct a lazy list-like
	    object that is analogous to ``map(feature_func, toks)``.  In
	    particular, if ``labeled=False``, then the returned list-like
	    object's values are equal to::

	        [feature_func(tok) for tok in toks]

	    If ``labeled=True``, then the returned list-like object's values
	    are equal to::

	        [(feature_func(tok), label) for (tok, label) in toks]

	    The primary purpose of this function is to avoid the memory
	    overhead involved in storing all the featuresets for every token
	    in a corpus.  Instead, these featuresets are constructed lazily,
	    as-needed.  The reduction in memory overhead can be especially
	    significant when the underlying list of tokens is itself lazy (as
	    is the case with many corpus readers).

	    :param feature_func: The function that will be applied to each
	        token.  It should return a featureset -- i.e., a dict
	        mapping feature names to feature values.
	    :param toks: The list of tokens to which ``feature_func`` should be
	        applied.  If ``labeled=False``, then the list elements will be
	        passed directly to ``feature_func()``.  If ``labeled=True``,
	        then the list elements should be tuples ``(tok, label)``, and
	        only ``tok`` will be passed to ``feature_func()``.
	    :param labeled: If true, then ``toks`` contains labeled tokens --
	        i.e., tuples of the form ``(tok, label)``.  (Default:
	        auto-detect: ``toks`` is treated as labeled when it is non-empty
	        and its first element is a tuple or a list.)
	    :return: A ``LazyMap`` over ``toks``.
	    """
	    if labeled is None:
	        # Only the first element is inspected; an empty ``toks`` is
	        # treated as unlabeled.
	        labeled = bool(toks) and isinstance(toks[0], (tuple, list))

	    if not labeled:
	        return LazyMap(feature_func, toks)

	    def lazy_func(labeled_token):
	        # Transform the token, pass the label through untouched.
	        return (feature_func(labeled_token[0]), labeled_token[1])

	    return LazyMap(lazy_func, toks)


	def attested_labels(tokens):
	    """
	    Collect the distinct labels that occur in a list of classified
	    tokens.

	    :return: Every label that is attested in ``tokens``, each one
	        appearing exactly once.  The order comes from iterating a
	        set, so it should not be relied upon.
	    :rtype: tuple of (immutable)
	    :param tokens: The list of classified tokens from which to extract
	        labels.  A classified token has the form ``(token, label)``.
	    :type tokens: list
	    """
	    distinct = set()
	    for _token, label in tokens:
	        distinct.add(label)
	    return tuple(distinct)


	def log_likelihood(classifier, gold):
	    """
	    Return the natural log of the mean probability that ``classifier``
	    assigns to the gold label of each item in ``gold``.

	    :param classifier: An object providing ``prob_classify_many()``,
	        whose results provide ``prob(label)``.
	    :param gold: A sequence of ``(featureset, label)`` pairs.  It is
	        iterated twice, so it must not be a one-shot iterator.
	    :return: ``math.log`` of the average gold-label probability.
	    """
	    featuresets = [featureset for (featureset, _label) in gold]
	    pdists = classifier.prob_classify_many(featuresets)
	    gold_probs = [
	        pdist.prob(label) for ((_featureset, label), pdist) in zip(gold, pdists)
	    ]
	    mean_prob = sum(gold_probs) / len(gold_probs)
	    return math.log(mean_prob)


	def accuracy(classifier, gold):
	    """
	    Return the fraction of items in ``gold`` for which ``classifier``
	    predicts the gold label.

	    :param classifier: An object providing ``classify_many()``.
	    :param gold: A sequence of ``(featureset, label)`` pairs.  It is
	        iterated twice, so it must not be a one-shot iterator.
	    :return: The proportion of correct predictions, or ``0`` when there
	        is nothing to score.
	    """
	    featuresets = [featureset for (featureset, _label) in gold]
	    predictions = classifier.classify_many(featuresets)
	    outcomes = [
	        label == guess for ((_featureset, label), guess) in zip(gold, predictions)
	    ]
	    if not outcomes:
	        return 0
	    return sum(outcomes) / len(outcomes)


	class CutoffChecker:
	    """
	    A helper class that implements cutoff checks based on number of
	    iterations and log likelihood.

	    Accuracy cutoffs are also implemented, but they're almost never
	    a good idea to use.

	    The recognised cutoff keys are ``max_iter``, ``min_ll``,
	    ``min_lldelta``, ``max_acc`` and ``min_accdelta``.
	    """

	    def __init__(self, cutoffs):
	        """
	        :param cutoffs: A dict mapping cutoff names to threshold values.
	            It is copied; the caller's dict is left untouched.
	        """
	        # Normalise the signs on our private copy, not on the argument:
	        # ``min_ll`` is stored as a non-positive number and
	        # ``min_lldelta`` as a non-negative one.
	        self.cutoffs = cutoffs.copy()
	        if "min_ll" in self.cutoffs:
	            self.cutoffs["min_ll"] = -abs(self.cutoffs["min_ll"])
	        if "min_lldelta" in self.cutoffs:
	            self.cutoffs["min_lldelta"] = abs(self.cutoffs["min_lldelta"])
	        self.ll = None  # log likelihood seen by the previous check()
	        self.acc = None  # accuracy seen by the previous check()
	        self.iter = 1

	    def check(self, classifier, train_toks):
	        """
	        Advance the iteration counter and test every configured cutoff.

	        :param classifier: The classifier to evaluate on ``train_toks``.
	        :param train_toks: The ``(featureset, label)`` pairs handed to
	            ``log_likelihood()`` and ``accuracy()``.
	        :return: True if a cutoff has been reached (or the log
	            likelihood is NaN), False otherwise.
	        """
	        cutoffs = self.cutoffs
	        self.iter += 1
	        if "max_iter" in cutoffs and self.iter >= cutoffs["max_iter"]:
	            return True  # iteration cutoff.

	        # The log likelihood is computed unconditionally so that a NaN
	        # stops the caller whichever cutoffs are configured.
	        new_ll = nltk.classify.util.log_likelihood(classifier, train_toks)
	        if math.isnan(new_ll):
	            return True

	        if "min_ll" in cutoffs or "min_lldelta" in cutoffs:
	            if "min_ll" in cutoffs and new_ll >= cutoffs["min_ll"]:
	                return True  # log likelihood cutoff
	            if (
	                "min_lldelta" in cutoffs
	                # ``is not None`` so that a previous value of exactly 0
	                # still takes part in the delta comparison.
	                and self.ll is not None
	                and ((new_ll - self.ll) <= abs(cutoffs["min_lldelta"]))
	            ):
	                return True  # log likelihood delta cutoff
	            self.ll = new_ll

	        if "max_acc" in cutoffs or "min_accdelta" in cutoffs:
	            # Accuracy cutoffs must be measured with accuracy(), not
	            # with the log likelihood.
	            new_acc = nltk.classify.util.accuracy(classifier, train_toks)
	            if "max_acc" in cutoffs and new_acc >= cutoffs["max_acc"]:
	                return True  # accuracy cutoff
	            if (
	                "min_accdelta" in cutoffs
	                and self.acc is not None
	                and ((new_acc - self.acc) <= abs(cutoffs["min_accdelta"]))
	            ):
	                return True  # accuracy delta cutoff
	            self.acc = new_acc

	        return False  # no cutoff reached.


	######################################################################
	# { Demos
	######################################################################


	def names_demo_features(name):
	    """
	    Build the demo featureset for a single name.

	    The result holds an always-true ``alwayson`` flag, the lower-cased
	    first and last characters of ``name``, and, for each letter a-z,
	    how often it occurs in the lower-cased name (``count(x)``) and
	    whether it occurs at all (``has(x)``).

	    :param name: A non-empty string.
	    :return: A dict mapping feature names to feature values.
	    """
	    lowered = name.lower()
	    features = {
	        "alwayson": True,
	        "startswith": name[0].lower(),
	        "endswith": name[-1].lower(),
	    }
	    for letter in "abcdefghijklmnopqrstuvwxyz":
	        features[f"count({letter})"] = lowered.count(letter)
	        features[f"has({letter})"] = letter in lowered
	    return features


	def binary_names_demo_features(name):
	    """
	    Build the demo featureset for a single name, with the first and
	    last characters expressed as per-letter boolean features.

	    Besides the always-true ``alwayson`` flag, the result says whether
	    the lower-cased first/last character is found in ``"aeiouy"``, and
	    for each letter a-z it records ``count(x)``, ``has(x)``,
	    ``startswith(x)`` and ``endswith(x)``.

	    :param name: A non-empty string.
	    :return: A dict mapping feature names to feature values.
	    """
	    vowels = "aeiouy"
	    lowered = name.lower()
	    first = name[0].lower()
	    last = name[-1].lower()

	    features = {
	        "alwayson": True,
	        "startswith(vowel)": first in vowels,
	        "endswith(vowel)": last in vowels,
	    }
	    for letter in "abcdefghijklmnopqrstuvwxyz":
	        features[f"count({letter})"] = lowered.count(letter)
	        features[f"has({letter})"] = letter in lowered
	        features[f"startswith({letter})"] = letter == first
	        features[f"endswith({letter})"] = letter == last
	    return features


	def names_demo(trainer, features=names_demo_features):
	    """
	    Train and evaluate a classifier on the ``names`` corpus, printing
	    progress, accuracy and (when available) probability information.

	    :param trainer: A callable that takes a list of
	        ``(featureset, label)`` pairs and returns a classifier.
	    :param features: The function used to turn a name into a
	        featureset.
	    :return: The trained classifier.
	    """
	    import random

	    from nltk.corpus import names

	    # Pair every name in the corpus with its gender label.
	    labeled_names = [(name, "male") for name in names.words("male.txt")]
	    labeled_names += [(name, "female") for name in names.words("female.txt")]

	    # Fixed seed, so the train/test split is the same on every run.
	    random.seed(123456)
	    random.shuffle(labeled_names)
	    train, test = labeled_names[:5000], labeled_names[5000:5500]

	    print("Training classifier...")
	    train_set = [(features(name), gender) for (name, gender) in train]
	    classifier = trainer(train_set)

	    print("Testing classifier...")
	    test_set = [(features(name), gender) for (name, gender) in test]
	    acc = accuracy(classifier, test_set)
	    print("Accuracy: %6.4f" % acc)

	    # Probability output is optional: it is skipped when this section
	    # raises NotImplementedError.
	    try:
	        pdists = classifier.prob_classify_many(
	            [features(name) for (name, _gender) in test]
	        )
	        logprobs = [
	            pdist.logprob(gender) for ((_name, gender), pdist) in zip(test, pdists)
	        ]
	        print("Avg. log likelihood: %6.4f" % (sum(logprobs) / len(test)))
	        print()
	        print("Unseen Names P(Male) P(Female)\n" + "-" * 40)
	        for (name, gender), pdist in list(zip(test, pdists))[:5]:
	            # The asterisk marks the column of the gold label.
	            fmt = (
	                " %-15s *%6.4f %6.4f"
	                if gender == "male"
	                else " %-15s %6.4f *%6.4f"
	            )
	            print(fmt % (name, pdist.prob("male"), pdist.prob("female")))
	    except NotImplementedError:
	        pass

	    return classifier


	def partial_names_demo(trainer, features=names_demo_features):
	    """
	    Train and evaluate a classifier on the ``names`` corpus using only
	    positive (male) and unlabeled examples, printing progress, accuracy
	    and (when available) probability information.

	    :param trainer: A callable invoked as
	        ``trainer(positive_featuresets, unlabeled_featuresets)`` that
	        returns a classifier whose labels are ``True`` (male) and
	        ``False``.
	    :param features: The function used to turn a name into a
	        featureset.
	    :return: The trained classifier.
	    """
	    import random

	    from nltk.corpus import names

	    male_names = names.words("male.txt")
	    female_names = names.words("female.txt")

	    # Fixed seed, so the split is the same on every run.
	    random.seed(654321)
	    random.shuffle(male_names)
	    random.shuffle(female_names)

	    # Male names used as positive-labeled training examples.  A real list
	    # (not a one-shot ``map`` iterator) so the trainer may take its
	    # length or iterate it more than once.
	    positive = [features(name) for name in male_names[:2000]]

	    # Male and female names used as unlabeled examples.
	    unlabeled = [
	        features(name) for name in male_names[2000:2500] + female_names[:500]
	    ]

	    # Test set with correctly-labeled male and female names.
	    test = [(name, True) for name in male_names[2500:2750]] + [
	        (name, False) for name in female_names[500:750]
	    ]

	    random.shuffle(test)

	    # Train up a classifier.
	    print("Training classifier...")
	    classifier = trainer(positive, unlabeled)

	    # Run the classifier on the test data.
	    print("Testing classifier...")
	    acc = accuracy(classifier, [(features(n), m) for (n, m) in test])
	    print("Accuracy: %6.4f" % acc)

	    # Probability output is optional: it is skipped when this section
	    # raises NotImplementedError.
	    try:
	        test_featuresets = [features(n) for (n, m) in test]
	        pdists = classifier.prob_classify_many(test_featuresets)
	        ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
	        print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
	        print()
	        print("Unseen Names P(Male) P(Female)\n" + "-" * 40)
	        # zip() returns an iterator on Python 3 and cannot be sliced
	        # directly, so materialise it first.
	        for ((name, is_male), pdist) in list(zip(test, pdists))[:5]:
	            # The asterisk marks the column of the gold label.
	            if is_male:
	                fmt = " %-15s *%6.4f %6.4f"
	            else:
	                fmt = " %-15s %6.4f *%6.4f"
	            print(fmt % (name, pdist.prob(True), pdist.prob(False)))
	    except NotImplementedError:
	        pass

	    # Return the classifier
	    return classifier


	# Maps a word to its list of (instance, first sense) pairs, so the
	# senseval corpus is only read once per word.
	_inst_cache = {}


	def wsd_demo(trainer, word, features, n=1000):
	    """
	    Train and evaluate a word-sense classifier for ``word`` on the
	    ``senseval`` corpus, printing progress, accuracy and (when
	    available) the average log likelihood.

	    :param trainer: A callable that takes a list of
	        ``(featureset, label)`` pairs and returns a classifier.
	    :param word: The word passed to ``senseval.instances()``.
	    :param features: The function used to turn an instance into a
	        featureset.
	    :param n: The maximum number of instances to use; the first 80%
	        of them (after shuffling) are used for training and the rest
	        for testing.
	    :return: The trained classifier.
	    """
	    import random

	    from nltk.corpus import senseval

	    # Fetch the instances, labelling each with its first listed sense.
	    print("Reading data...")
	    if word not in _inst_cache:
	        _inst_cache[word] = [
	            (inst, inst.senses[0]) for inst in senseval.instances(word)
	        ]
	    # Work on a copy so that shuffling leaves the cached list alone.
	    instances = list(_inst_cache[word])
	    n = min(n, len(instances))
	    senses = list({sense for (_inst, sense) in instances})
	    print(" Senses: " + " ".join(senses))

	    # Randomly split the instances into a test & train set.
	    print("Splitting into test & train...")
	    random.seed(123456)
	    random.shuffle(instances)
	    split = int(0.8 * n)
	    train = instances[:split]
	    test = instances[split:n]

	    print("Training classifier...")
	    train_set = [(features(inst), sense) for (inst, sense) in train]
	    classifier = trainer(train_set)

	    print("Testing classifier...")
	    test_set = [(features(inst), sense) for (inst, sense) in test]
	    acc = accuracy(classifier, test_set)
	    print("Accuracy: %6.4f" % acc)

	    # Probability output is optional: it is skipped when this section
	    # raises NotImplementedError.
	    try:
	        pdists = classifier.prob_classify_many(
	            [features(inst) for (inst, _sense) in test]
	        )
	        logprobs = [
	            pdist.logprob(sense) for ((_inst, sense), pdist) in zip(test, pdists)
	        ]
	        print("Avg. log likelihood: %6.4f" % (sum(logprobs) / len(test)))
	    except NotImplementedError:
	        pass

	    return classifier


	def check_megam_config():
	    """
	    Checks whether the MEGAM binary is configured.

	    The check consists of evaluating the name ``_megam_bin``.
	    NOTE(review): no assignment to ``_megam_bin`` is visible in this
	    part of the file -- confirm where it is expected to be defined.

	    :raise NameError: If ``_megam_bin`` cannot be resolved.  The
	        original lookup failure is chained as the cause.
	    """
	    message = (
	        "Please configure your megam binary first, e.g.\n"
	        ">>> nltk.config_megam('/usr/bin/local/megam')"
	    )
	    try:
	        _megam_bin
	    except NameError as lookup_error:
	        raise NameError(message) from lookup_error