Spaces:

sunnychenxiwang
/

EasyDetect

Sleeping

App Files Files Community

EasyDetect / pipeline /nltk /classify /naivebayes.py

sunnychenxiwang

update nltk

d916065 12 months ago

raw

history blame

10.7 kB

	# Natural Language Toolkit: Naive Bayes Classifiers
	#
	# Copyright (C) 2001-2023 NLTK Project
	# Author: Edward Loper <[email protected]>
	# URL: <https://www.nltk.org/>
	# For license information, see LICENSE.TXT

	"""
	A classifier based on the Naive Bayes algorithm. In order to find the
	probability for a label, this algorithm first uses the Bayes rule to
	express P(label\|features) in terms of P(label) and P(features\|label):

	\| P(label) * P(features\|label)
	\| P(label\|features) = ------------------------------
	\| P(features)

	The algorithm then makes the 'naive' assumption that all features are
	independent, given the label:

	\| P(label) * P(f1\|label) * ... * P(fn\|label)
	\| P(label\|features) = --------------------------------------------
	\| P(features)

	Rather than computing P(features) explicitly, the algorithm just
	calculates the numerator for each label, and normalizes them so they
	sum to one:

	\| P(label) * P(f1\|label) * ... * P(fn\|label)
	\| P(label\|features) = --------------------------------------------
	\| SUM[l]( P(l) * P(f1\|l) * ... * P(fn\|l) )
	"""

	from collections import defaultdict

	from nltk.classify.api import ClassifierI
	from nltk.probability import DictionaryProbDist, ELEProbDist, FreqDist, sum_logs

	##//////////////////////////////////////////////////////
	## Naive Bayes Classifier
	##//////////////////////////////////////////////////////


	class NaiveBayesClassifier(ClassifierI):
	"""
	A Naive Bayes classifier. Naive Bayes classifiers are
	paramaterized by two probability distributions:

	- P(label) gives the probability that an input will receive each
	label, given no information about the input's features.

	- P(fname=fval\|label) gives the probability that a given feature
	(fname) will receive a given value (fval), given that the
	label (label).

	If the classifier encounters an input with a feature that has
	never been seen with any label, then rather than assigning a
	probability of 0 to all labels, it will ignore that feature.

	The feature value 'None' is reserved for unseen feature values;
	you generally should not use 'None' as a feature value for one of
	your own features.
	"""

	def __init__(self, label_probdist, feature_probdist):
	"""
	:param label_probdist: P(label), the probability distribution
	over labels. It is expressed as a ``ProbDistI`` whose
	samples are labels. I.e., P(label) =
	``label_probdist.prob(label)``.

	:param feature_probdist: P(fname=fval\|label), the probability
	distribution for feature values, given labels. It is
	expressed as a dictionary whose keys are ``(label, fname)``
	pairs and whose values are ``ProbDistI`` objects over feature
	values. I.e., P(fname=fval\|label) =
	``feature_probdist[label,fname].prob(fval)``. If a given
	``(label,fname)`` is not a key in ``feature_probdist``, then
	it is assumed that the corresponding P(fname=fval\|label)
	is 0 for all values of ``fval``.
	"""
	self._label_probdist = label_probdist
	self._feature_probdist = feature_probdist
	self._labels = list(label_probdist.samples())

	def labels(self):
	return self._labels

	def classify(self, featureset):
	return self.prob_classify(featureset).max()

	def prob_classify(self, featureset):
	# Discard any feature names that we've never seen before.
	# Otherwise, we'll just assign a probability of 0 to
	# everything.
	featureset = featureset.copy()
	for fname in list(featureset.keys()):
	for label in self._labels:
	if (label, fname) in self._feature_probdist:
	break
	else:
	# print('Ignoring unseen feature %s' % fname)
	del featureset[fname]

	# Find the log probability of each label, given the features.
	# Start with the log probability of the label itself.
	logprob = {}
	for label in self._labels:
	logprob[label] = self._label_probdist.logprob(label)

	# Then add in the log probability of features given labels.
	for label in self._labels:
	for (fname, fval) in featureset.items():
	if (label, fname) in self._feature_probdist:
	feature_probs = self._feature_probdist[label, fname]
	logprob[label] += feature_probs.logprob(fval)
	else:
	# nb: This case will never come up if the
	# classifier was created by
	# NaiveBayesClassifier.train().
	logprob[label] += sum_logs([]) # = -INF.

	return DictionaryProbDist(logprob, normalize=True, log=True)

	def show_most_informative_features(self, n=10):
	# Determine the most relevant features, and display them.
	cpdist = self._feature_probdist
	print("Most Informative Features")

	for (fname, fval) in self.most_informative_features(n):

	def labelprob(l):
	return cpdist[l, fname].prob(fval)

	labels = sorted(
	(l for l in self._labels if fval in cpdist[l, fname].samples()),
	key=lambda element: (-labelprob(element), element),
	reverse=True,
	)
	if len(labels) == 1:
	continue
	l0 = labels[0]
	l1 = labels[-1]
	if cpdist[l0, fname].prob(fval) == 0:
	ratio = "INF"
	else:
	ratio = "%8.1f" % (
	cpdist[l1, fname].prob(fval) / cpdist[l0, fname].prob(fval)
	)
	print(
	"%24s = %-14r %6s : %-6s = %s : 1.0"
	% (fname, fval, ("%s" % l1)[:6], ("%s" % l0)[:6], ratio)
	)

	def most_informative_features(self, n=100):
	"""
	Return a list of the 'most informative' features used by this
	classifier. For the purpose of this function, the
	informativeness of a feature ``(fname,fval)`` is equal to the
	highest value of P(fname=fval\|label), for any label, divided by
	the lowest value of P(fname=fval\|label), for any label:

	\| max[ P(fname=fval\|label1) / P(fname=fval\|label2) ]
	"""
	if hasattr(self, "_most_informative_features"):
	return self._most_informative_features[:n]
	else:
	# The set of (fname, fval) pairs used by this classifier.
	features = set()
	# The max & min probability associated w/ each (fname, fval)
	# pair. Maps (fname,fval) -> float.
	maxprob = defaultdict(lambda: 0.0)
	minprob = defaultdict(lambda: 1.0)

	for (label, fname), probdist in self._feature_probdist.items():
	for fval in probdist.samples():
	feature = (fname, fval)
	features.add(feature)
	p = probdist.prob(fval)
	maxprob[feature] = max(p, maxprob[feature])
	minprob[feature] = min(p, minprob[feature])
	if minprob[feature] == 0:
	features.discard(feature)

	# Convert features to a list, & sort it by how informative
	# features are.
	self._most_informative_features = sorted(
	features,
	key=lambda feature_: (
	minprob[feature_] / maxprob[feature_],
	feature_[0],
	feature_[1] in [None, False, True],
	str(feature_[1]).lower(),
	),
	)
	return self._most_informative_features[:n]

	@classmethod
	def train(cls, labeled_featuresets, estimator=ELEProbDist):
	"""
	:param labeled_featuresets: A list of classified featuresets,
	i.e., a list of tuples ``(featureset, label)``.
	"""
	label_freqdist = FreqDist()
	feature_freqdist = defaultdict(FreqDist)
	feature_values = defaultdict(set)
	fnames = set()

	# Count up how many times each feature value occurred, given
	# the label and featurename.
	for featureset, label in labeled_featuresets:
	label_freqdist[label] += 1
	for fname, fval in featureset.items():
	# Increment freq(fval\|label, fname)
	feature_freqdist[label, fname][fval] += 1
	# Record that fname can take the value fval.
	feature_values[fname].add(fval)
	# Keep a list of all feature names.
	fnames.add(fname)

	# If a feature didn't have a value given for an instance, then
	# we assume that it gets the implicit value 'None.' This loop
	# counts up the number of 'missing' feature values for each
	# (label,fname) pair, and increments the count of the fval
	# 'None' by that amount.
	for label in label_freqdist:
	num_samples = label_freqdist[label]
	for fname in fnames:
	count = feature_freqdist[label, fname].N()
	# Only add a None key when necessary, i.e. if there are
	# any samples with feature 'fname' missing.
	if num_samples - count > 0:
	feature_freqdist[label, fname][None] += num_samples - count
	feature_values[fname].add(None)

	# Create the P(label) distribution
	label_probdist = estimator(label_freqdist)

	# Create the P(fval\|label, fname) distribution
	feature_probdist = {}
	for ((label, fname), freqdist) in feature_freqdist.items():
	probdist = estimator(freqdist, bins=len(feature_values[fname]))
	feature_probdist[label, fname] = probdist

	return cls(label_probdist, feature_probdist)


	##//////////////////////////////////////////////////////
	## Demo
	##//////////////////////////////////////////////////////


	def demo():
	from nltk.classify.util import names_demo

	classifier = names_demo(NaiveBayesClassifier.train)
	classifier.show_most_informative_features()


	if __name__ == "__main__":
	demo()