Spaces:

sunnychenxiwang
/

EasyDetect

Sleeping

App Files Files Community

EasyDetect / pipeline /nltk /metrics /association.py

sunnychenxiwang

update nltk

d916065 12 months ago

raw

history blame

16.6 kB

	# Natural Language Toolkit: Ngram Association Measures
	#
	# Copyright (C) 2001-2023 NLTK Project
	# Author: Joel Nothman <[email protected]>
	# URL: <https://www.nltk.org/>
	# For license information, see LICENSE.TXT

	"""
	Provides scoring functions for a number of association measures through a
	generic, abstract implementation in ``NgramAssocMeasures``, and n-specific
	``BigramAssocMeasures`` and ``TrigramAssocMeasures``.
	"""

	import math as _math
	from abc import ABCMeta, abstractmethod
	from functools import reduce

	_log2 = lambda x: _math.log2(x)
	_ln = _math.log

	_product = lambda s: reduce(lambda x, y: x * y, s)

	_SMALL = 1e-20

	try:
	from scipy.stats import fisher_exact
	except ImportError:

	def fisher_exact(_args, *_kwargs):
	raise NotImplementedError


	### Indices to marginals arguments:

	NGRAM = 0
	"""Marginals index for the ngram count"""

	UNIGRAMS = -2
	"""Marginals index for a tuple of each unigram count"""

	TOTAL = -1
	"""Marginals index for the number of words in the data"""


	class NgramAssocMeasures(metaclass=ABCMeta):
	"""
	An abstract class defining a collection of generic association measures.
	Each public method returns a score, taking the following arguments::

	score_fn(count_of_ngram,
	(count_of_n-1gram_1, ..., count_of_n-1gram_j),
	(count_of_n-2gram_1, ..., count_of_n-2gram_k),
	...,
	(count_of_1gram_1, ..., count_of_1gram_n),
	count_of_total_words)

	See ``BigramAssocMeasures`` and ``TrigramAssocMeasures``

	Inheriting classes should define a property _n, and a method _contingency
	which calculates contingency values from marginals in order for all
	association measures defined here to be usable.
	"""

	_n = 0

	@staticmethod
	@abstractmethod
	def _contingency(*marginals):
	"""Calculates values of a contingency table from marginal values."""
	raise NotImplementedError(
	"The contingency table is not available" "in the general ngram case"
	)

	@staticmethod
	@abstractmethod
	def _marginals(*contingency):
	"""Calculates values of contingency table marginals from its values."""
	raise NotImplementedError(
	"The contingency table is not available" "in the general ngram case"
	)

	@classmethod
	def _expected_values(cls, cont):
	"""Calculates expected values for a contingency table."""
	n_all = sum(cont)
	bits = [1 << i for i in range(cls._n)]

	# For each contingency table cell
	for i in range(len(cont)):
	# Yield the expected value
	yield (
	_product(
	sum(cont[x] for x in range(2**cls._n) if (x & j) == (i & j))
	for j in bits
	)
	/ (n_all ** (cls._n - 1))
	)

	@staticmethod
	def raw_freq(*marginals):
	"""Scores ngrams by their frequency"""
	return marginals[NGRAM] / marginals[TOTAL]

	@classmethod
	def student_t(cls, *marginals):
	"""Scores ngrams using Student's t test with independence hypothesis
	for unigrams, as in Manning and Schutze 5.3.1.
	"""
	return (
	marginals[NGRAM]
	- _product(marginals[UNIGRAMS]) / (marginals[TOTAL] ** (cls._n - 1))
	) / (marginals[NGRAM] + _SMALL) ** 0.5

	@classmethod
	def chi_sq(cls, *marginals):
	"""Scores ngrams using Pearson's chi-square as in Manning and Schutze
	5.3.3.
	"""
	cont = cls._contingency(*marginals)
	exps = cls._expected_values(cont)
	return sum((obs - exp) ** 2 / (exp + _SMALL) for obs, exp in zip(cont, exps))

	@staticmethod
	def mi_like(marginals, *kwargs):
	"""Scores ngrams using a variant of mutual information. The keyword
	argument power sets an exponent (default 3) for the numerator. No
	logarithm of the result is calculated.
	"""
	return marginals[NGRAM] ** kwargs.get("power", 3) / _product(
	marginals[UNIGRAMS]
	)

	@classmethod
	def pmi(cls, *marginals):
	"""Scores ngrams by pointwise mutual information, as in Manning and
	Schutze 5.4.
	"""
	return _log2(marginals[NGRAM] * marginals[TOTAL] ** (cls._n - 1)) - _log2(
	_product(marginals[UNIGRAMS])
	)

	@classmethod
	def likelihood_ratio(cls, *marginals):
	"""Scores ngrams using likelihood ratios as in Manning and Schutze 5.3.4."""
	cont = cls._contingency(*marginals)
	return 2 * sum(
	obs * _ln(obs / (exp + _SMALL) + _SMALL)
	for obs, exp in zip(cont, cls._expected_values(cont))
	)

	@classmethod
	def poisson_stirling(cls, *marginals):
	"""Scores ngrams using the Poisson-Stirling measure."""
	exp = _product(marginals[UNIGRAMS]) / (marginals[TOTAL] ** (cls._n - 1))
	return marginals[NGRAM] * (_log2(marginals[NGRAM] / exp) - 1)

	@classmethod
	def jaccard(cls, *marginals):
	"""Scores ngrams using the Jaccard index."""
	cont = cls._contingency(*marginals)
	return cont[0] / sum(cont[:-1])


	class BigramAssocMeasures(NgramAssocMeasures):
	"""
	A collection of bigram association measures. Each association measure
	is provided as a function with three arguments::

	bigram_score_fn(n_ii, (n_ix, n_xi), n_xx)

	The arguments constitute the marginals of a contingency table, counting
	the occurrences of particular events in a corpus. The letter i in the
	suffix refers to the appearance of the word in question, while x indicates
	the appearance of any word. Thus, for example:

	- n_ii counts ``(w1, w2)``, i.e. the bigram being scored
	- n_ix counts ``(w1, *)``
	- n_xi counts ``(*, w2)``
	- n_xx counts ``(, )``, i.e. any bigram

	This may be shown with respect to a contingency table::

	w1 ~w1
	------ ------
	w2 \| n_ii \| n_oi \| = n_xi
	------ ------
	~w2 \| n_io \| n_oo \|
	------ ------
	= n_ix TOTAL = n_xx
	"""

	_n = 2

	@staticmethod
	def _contingency(n_ii, n_ix_xi_tuple, n_xx):
	"""Calculates values of a bigram contingency table from marginal values."""
	(n_ix, n_xi) = n_ix_xi_tuple
	n_oi = n_xi - n_ii
	n_io = n_ix - n_ii
	return (n_ii, n_oi, n_io, n_xx - n_ii - n_oi - n_io)

	@staticmethod
	def _marginals(n_ii, n_oi, n_io, n_oo):
	"""Calculates values of contingency table marginals from its values."""
	return (n_ii, (n_oi + n_ii, n_io + n_ii), n_oo + n_oi + n_io + n_ii)

	@staticmethod
	def _expected_values(cont):
	"""Calculates expected values for a contingency table."""
	n_xx = sum(cont)
	# For each contingency table cell
	for i in range(4):
	yield (cont[i] + cont[i ^ 1]) * (cont[i] + cont[i ^ 2]) / n_xx

	@classmethod
	def phi_sq(cls, *marginals):
	"""Scores bigrams using phi-square, the square of the Pearson correlation
	coefficient.
	"""
	n_ii, n_io, n_oi, n_oo = cls._contingency(*marginals)

	return (n_ii * n_oo - n_io * n_oi) ** 2 / (
	(n_ii + n_io) * (n_ii + n_oi) * (n_io + n_oo) * (n_oi + n_oo)
	)

	@classmethod
	def chi_sq(cls, n_ii, n_ix_xi_tuple, n_xx):
	"""Scores bigrams using chi-square, i.e. phi-sq multiplied by the number
	of bigrams, as in Manning and Schutze 5.3.3.
	"""
	(n_ix, n_xi) = n_ix_xi_tuple
	return n_xx * cls.phi_sq(n_ii, (n_ix, n_xi), n_xx)

	@classmethod
	def fisher(cls, *marginals):
	"""Scores bigrams using Fisher's Exact Test (Pedersen 1996). Less
	sensitive to small counts than PMI or Chi Sq, but also more expensive
	to compute. Requires scipy.
	"""

	n_ii, n_io, n_oi, n_oo = cls._contingency(*marginals)

	(odds, pvalue) = fisher_exact([[n_ii, n_io], [n_oi, n_oo]], alternative="less")
	return pvalue

	@staticmethod
	def dice(n_ii, n_ix_xi_tuple, n_xx):
	"""Scores bigrams using Dice's coefficient."""
	(n_ix, n_xi) = n_ix_xi_tuple
	return 2 * n_ii / (n_ix + n_xi)


	class TrigramAssocMeasures(NgramAssocMeasures):
	"""
	A collection of trigram association measures. Each association measure
	is provided as a function with four arguments::

	trigram_score_fn(n_iii,
	(n_iix, n_ixi, n_xii),
	(n_ixx, n_xix, n_xxi),
	n_xxx)

	The arguments constitute the marginals of a contingency table, counting
	the occurrences of particular events in a corpus. The letter i in the
	suffix refers to the appearance of the word in question, while x indicates
	the appearance of any word. Thus, for example:

	- n_iii counts ``(w1, w2, w3)``, i.e. the trigram being scored
	- n_ixx counts ``(w1, , )``
	- n_xxx counts ``(, , *)``, i.e. any trigram
	"""

	_n = 3

	@staticmethod
	def _contingency(n_iii, n_iix_tuple, n_ixx_tuple, n_xxx):
	"""Calculates values of a trigram contingency table (or cube) from
	marginal values.
	>>> TrigramAssocMeasures._contingency(1, (1, 1, 1), (1, 73, 1), 2000)
	(1, 0, 0, 0, 0, 72, 0, 1927)
	"""
	(n_iix, n_ixi, n_xii) = n_iix_tuple
	(n_ixx, n_xix, n_xxi) = n_ixx_tuple
	n_oii = n_xii - n_iii
	n_ioi = n_ixi - n_iii
	n_iio = n_iix - n_iii
	n_ooi = n_xxi - n_iii - n_oii - n_ioi
	n_oio = n_xix - n_iii - n_oii - n_iio
	n_ioo = n_ixx - n_iii - n_ioi - n_iio
	n_ooo = n_xxx - n_iii - n_oii - n_ioi - n_iio - n_ooi - n_oio - n_ioo

	return (n_iii, n_oii, n_ioi, n_ooi, n_iio, n_oio, n_ioo, n_ooo)

	@staticmethod
	def _marginals(*contingency):
	"""Calculates values of contingency table marginals from its values.
	>>> TrigramAssocMeasures._marginals(1, 0, 0, 0, 0, 72, 0, 1927)
	(1, (1, 1, 1), (1, 73, 1), 2000)
	"""
	n_iii, n_oii, n_ioi, n_ooi, n_iio, n_oio, n_ioo, n_ooo = contingency
	return (
	n_iii,
	(n_iii + n_iio, n_iii + n_ioi, n_iii + n_oii),
	(
	n_iii + n_ioi + n_iio + n_ioo,
	n_iii + n_oii + n_iio + n_oio,
	n_iii + n_oii + n_ioi + n_ooi,
	),
	sum(contingency),
	)


	class QuadgramAssocMeasures(NgramAssocMeasures):
	"""
	A collection of quadgram association measures. Each association measure
	is provided as a function with five arguments::

	trigram_score_fn(n_iiii,
	(n_iiix, n_iixi, n_ixii, n_xiii),
	(n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix),
	(n_ixxx, n_xixx, n_xxix, n_xxxi),
	n_all)

	The arguments constitute the marginals of a contingency table, counting
	the occurrences of particular events in a corpus. The letter i in the
	suffix refers to the appearance of the word in question, while x indicates
	the appearance of any word. Thus, for example:

	- n_iiii counts ``(w1, w2, w3, w4)``, i.e. the quadgram being scored
	- n_ixxi counts ``(w1, , , w4)``
	- n_xxxx counts ``(, , , )``, i.e. any quadgram
	"""

	_n = 4

	@staticmethod
	def _contingency(n_iiii, n_iiix_tuple, n_iixx_tuple, n_ixxx_tuple, n_xxxx):
	"""Calculates values of a quadgram contingency table from
	marginal values.
	"""
	(n_iiix, n_iixi, n_ixii, n_xiii) = n_iiix_tuple
	(n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix) = n_iixx_tuple
	(n_ixxx, n_xixx, n_xxix, n_xxxi) = n_ixxx_tuple
	n_oiii = n_xiii - n_iiii
	n_ioii = n_ixii - n_iiii
	n_iioi = n_iixi - n_iiii
	n_ooii = n_xxii - n_iiii - n_oiii - n_ioii
	n_oioi = n_xixi - n_iiii - n_oiii - n_iioi
	n_iooi = n_ixxi - n_iiii - n_ioii - n_iioi
	n_oooi = n_xxxi - n_iiii - n_oiii - n_ioii - n_iioi - n_ooii - n_iooi - n_oioi
	n_iiio = n_iiix - n_iiii
	n_oiio = n_xiix - n_iiii - n_oiii - n_iiio
	n_ioio = n_ixix - n_iiii - n_ioii - n_iiio
	n_ooio = n_xxix - n_iiii - n_oiii - n_ioii - n_iiio - n_ooii - n_ioio - n_oiio
	n_iioo = n_iixx - n_iiii - n_iioi - n_iiio
	n_oioo = n_xixx - n_iiii - n_oiii - n_iioi - n_iiio - n_oioi - n_oiio - n_iioo
	n_iooo = n_ixxx - n_iiii - n_ioii - n_iioi - n_iiio - n_iooi - n_iioo - n_ioio
	n_oooo = (
	n_xxxx
	- n_iiii
	- n_oiii
	- n_ioii
	- n_iioi
	- n_ooii
	- n_oioi
	- n_iooi
	- n_oooi
	- n_iiio
	- n_oiio
	- n_ioio
	- n_ooio
	- n_iioo
	- n_oioo
	- n_iooo
	)

	return (
	n_iiii,
	n_oiii,
	n_ioii,
	n_ooii,
	n_iioi,
	n_oioi,
	n_iooi,
	n_oooi,
	n_iiio,
	n_oiio,
	n_ioio,
	n_ooio,
	n_iioo,
	n_oioo,
	n_iooo,
	n_oooo,
	)

	@staticmethod
	def _marginals(*contingency):
	"""Calculates values of contingency table marginals from its values.
	QuadgramAssocMeasures._marginals(1, 0, 2, 46, 552, 825, 2577, 34967, 1, 0, 2, 48, 7250, 9031, 28585, 356653)
	(1, (2, 553, 3, 1), (7804, 6, 3132, 1378, 49, 2), (38970, 17660, 100, 38970), 440540)
	"""
	(
	n_iiii,
	n_oiii,
	n_ioii,
	n_ooii,
	n_iioi,
	n_oioi,
	n_iooi,
	n_oooi,
	n_iiio,
	n_oiio,
	n_ioio,
	n_ooio,
	n_iioo,
	n_oioo,
	n_iooo,
	n_oooo,
	) = contingency

	n_iiix = n_iiii + n_iiio
	n_iixi = n_iiii + n_iioi
	n_ixii = n_iiii + n_ioii
	n_xiii = n_iiii + n_oiii

	n_iixx = n_iiii + n_iioi + n_iiio + n_iioo
	n_ixix = n_iiii + n_ioii + n_iiio + n_ioio
	n_ixxi = n_iiii + n_ioii + n_iioi + n_iooi
	n_xixi = n_iiii + n_oiii + n_iioi + n_oioi
	n_xxii = n_iiii + n_oiii + n_ioii + n_ooii
	n_xiix = n_iiii + n_oiii + n_iiio + n_oiio

	n_ixxx = n_iiii + n_ioii + n_iioi + n_iiio + n_iooi + n_iioo + n_ioio + n_iooo
	n_xixx = n_iiii + n_oiii + n_iioi + n_iiio + n_oioi + n_oiio + n_iioo + n_oioo
	n_xxix = n_iiii + n_oiii + n_ioii + n_iiio + n_ooii + n_ioio + n_oiio + n_ooio
	n_xxxi = n_iiii + n_oiii + n_ioii + n_iioi + n_ooii + n_iooi + n_oioi + n_oooi

	n_all = sum(contingency)

	return (
	n_iiii,
	(n_iiix, n_iixi, n_ixii, n_xiii),
	(n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix),
	(n_ixxx, n_xixx, n_xxix, n_xxxi),
	n_all,
	)


	class ContingencyMeasures:
	"""Wraps NgramAssocMeasures classes such that the arguments of association
	measures are contingency table values rather than marginals.
	"""

	def __init__(self, measures):
	"""Constructs a ContingencyMeasures given a NgramAssocMeasures class"""
	self.__class__.__name__ = "Contingency" + measures.__class__.__name__
	for k in dir(measures):
	if k.startswith("__"):
	continue
	v = getattr(measures, k)
	if not k.startswith("_"):
	v = self._make_contingency_fn(measures, v)
	setattr(self, k, v)

	@staticmethod
	def _make_contingency_fn(measures, old_fn):
	"""From an association measure function, produces a new function which
	accepts contingency table values as its arguments.
	"""

	def res(*contingency):
	return old_fn(measures._marginals(contingency))

	res.__doc__ = old_fn.__doc__
	res.__name__ = old_fn.__name__
	return res