Spaces:

sunnychenxiwang
/

EasyDetect

Running

App Files Files Community

EasyDetect / pipeline /nltk /collocations.py

sunnychenxiwang

update nltk

d916065 over 1 year ago

raw

history blame

15 kB

	# Natural Language Toolkit: Collocations and Association Measures
	#
	# Copyright (C) 2001-2023 NLTK Project
	# Author: Joel Nothman <[email protected]>
	# URL: <https://www.nltk.org/>
	# For license information, see LICENSE.TXT
	#
	"""
	Tools to identify collocations --- words that often appear consecutively
	--- within corpora. They may also be used to find other associations between
	word occurrences.
	See Manning and Schutze ch. 5 at https://nlp.stanford.edu/fsnlp/promo/colloc.pdf
	and the Text::NSP Perl package at http://ngram.sourceforge.net

	Finding collocations requires first calculating the frequencies of words and
	their appearance in the context of other words. Often the collection of words
	will then requiring filtering to only retain useful content terms. Each ngram
	of words may then be scored according to some association measure, in order
	to determine the relative likelihood of each ngram being a collocation.

	The ``BigramCollocationFinder`` and ``TrigramCollocationFinder`` classes provide
	these functionalities, dependent on being provided a function which scores a
	ngram given appropriate frequency counts. A number of standard association
	measures are provided in bigram_measures and trigram_measures.
	"""

	# Possible TODOs:
	# - consider the distinction between f(x,_) and f(x) and whether our
	# approximation is good enough for fragmented data, and mention it
	# - add a n-gram collocation finder with measures which only utilise n-gram
	# and unigram counts (raw_freq, pmi, student_t)

	import itertools as _itertools

	# these two unused imports are referenced in collocations.doctest
	from nltk.metrics import (
	BigramAssocMeasures,
	ContingencyMeasures,
	QuadgramAssocMeasures,
	TrigramAssocMeasures,
	)
	from nltk.metrics.spearman import ranks_from_scores, spearman_correlation
	from nltk.probability import FreqDist
	from nltk.util import ngrams


	class AbstractCollocationFinder:
	"""
	An abstract base class for collocation finders whose purpose is to
	collect collocation candidate frequencies, filter and rank them.

	As a minimum, collocation finders require the frequencies of each
	word in a corpus, and the joint frequency of word tuples. This data
	should be provided through nltk.probability.FreqDist objects or an
	identical interface.
	"""

	def __init__(self, word_fd, ngram_fd):
	self.word_fd = word_fd
	self.N = word_fd.N()
	self.ngram_fd = ngram_fd

	@classmethod
	def _build_new_documents(
	cls, documents, window_size, pad_left=False, pad_right=False, pad_symbol=None
	):
	"""
	Pad the document with the place holder according to the window_size
	"""
	padding = (pad_symbol,) * (window_size - 1)
	if pad_right:
	return _itertools.chain.from_iterable(
	_itertools.chain(doc, padding) for doc in documents
	)
	if pad_left:
	return _itertools.chain.from_iterable(
	_itertools.chain(padding, doc) for doc in documents
	)

	@classmethod
	def from_documents(cls, documents):
	"""Constructs a collocation finder given a collection of documents,
	each of which is a list (or iterable) of tokens.
	"""
	# return cls.from_words(_itertools.chain(*documents))
	return cls.from_words(
	cls._build_new_documents(documents, cls.default_ws, pad_right=True)
	)

	@staticmethod
	def _ngram_freqdist(words, n):
	return FreqDist(tuple(words[i : i + n]) for i in range(len(words) - 1))

	def _apply_filter(self, fn=lambda ngram, freq: False):
	"""Generic filter removes ngrams from the frequency distribution
	if the function returns True when passed an ngram tuple.
	"""
	tmp_ngram = FreqDist()
	for ngram, freq in self.ngram_fd.items():
	if not fn(ngram, freq):
	tmp_ngram[ngram] = freq
	self.ngram_fd = tmp_ngram

	def apply_freq_filter(self, min_freq):
	"""Removes candidate ngrams which have frequency less than min_freq."""
	self._apply_filter(lambda ng, freq: freq < min_freq)

	def apply_ngram_filter(self, fn):
	"""Removes candidate ngrams (w1, w2, ...) where fn(w1, w2, ...)
	evaluates to True.
	"""
	self._apply_filter(lambda ng, f: fn(*ng))

	def apply_word_filter(self, fn):
	"""Removes candidate ngrams (w1, w2, ...) where any of (fn(w1), fn(w2),
	...) evaluates to True.
	"""
	self._apply_filter(lambda ng, f: any(fn(w) for w in ng))

	def _score_ngrams(self, score_fn):
	"""Generates of (ngram, score) pairs as determined by the scoring
	function provided.
	"""
	for tup in self.ngram_fd:
	score = self.score_ngram(score_fn, *tup)
	if score is not None:
	yield tup, score

	def score_ngrams(self, score_fn):
	"""Returns a sequence of (ngram, score) pairs ordered from highest to
	lowest score, as determined by the scoring function provided.
	"""
	return sorted(self._score_ngrams(score_fn), key=lambda t: (-t[1], t[0]))

	def nbest(self, score_fn, n):
	"""Returns the top n ngrams when scored by the given function."""
	return [p for p, s in self.score_ngrams(score_fn)[:n]]

	def above_score(self, score_fn, min_score):
	"""Returns a sequence of ngrams, ordered by decreasing score, whose
	scores each exceed the given minimum score.
	"""
	for ngram, score in self.score_ngrams(score_fn):
	if score > min_score:
	yield ngram
	else:
	break


	class BigramCollocationFinder(AbstractCollocationFinder):
	"""A tool for the finding and ranking of bigram collocations or other
	association measures. It is often useful to use from_words() rather than
	constructing an instance directly.
	"""

	default_ws = 2

	def __init__(self, word_fd, bigram_fd, window_size=2):
	"""Construct a BigramCollocationFinder, given FreqDists for
	appearances of words and (possibly non-contiguous) bigrams.
	"""
	AbstractCollocationFinder.__init__(self, word_fd, bigram_fd)
	self.window_size = window_size

	@classmethod
	def from_words(cls, words, window_size=2):
	"""Construct a BigramCollocationFinder for all bigrams in the given
	sequence. When window_size > 2, count non-contiguous bigrams, in the
	style of Church and Hanks's (1990) association ratio.
	"""
	wfd = FreqDist()
	bfd = FreqDist()

	if window_size < 2:
	raise ValueError("Specify window_size at least 2")

	for window in ngrams(words, window_size, pad_right=True):
	w1 = window[0]
	if w1 is None:
	continue
	wfd[w1] += 1
	for w2 in window[1:]:
	if w2 is not None:
	bfd[(w1, w2)] += 1
	return cls(wfd, bfd, window_size=window_size)

	def score_ngram(self, score_fn, w1, w2):
	"""Returns the score for a given bigram using the given scoring
	function. Following Church and Hanks (1990), counts are scaled by
	a factor of 1/(window_size - 1).
	"""
	n_all = self.N
	n_ii = self.ngram_fd[(w1, w2)] / (self.window_size - 1.0)
	if not n_ii:
	return
	n_ix = self.word_fd[w1]
	n_xi = self.word_fd[w2]
	return score_fn(n_ii, (n_ix, n_xi), n_all)


	class TrigramCollocationFinder(AbstractCollocationFinder):
	"""A tool for the finding and ranking of trigram collocations or other
	association measures. It is often useful to use from_words() rather than
	constructing an instance directly.
	"""

	default_ws = 3

	def __init__(self, word_fd, bigram_fd, wildcard_fd, trigram_fd):
	"""Construct a TrigramCollocationFinder, given FreqDists for
	appearances of words, bigrams, two words with any word between them,
	and trigrams.
	"""
	AbstractCollocationFinder.__init__(self, word_fd, trigram_fd)
	self.wildcard_fd = wildcard_fd
	self.bigram_fd = bigram_fd

	@classmethod
	def from_words(cls, words, window_size=3):
	"""Construct a TrigramCollocationFinder for all trigrams in the given
	sequence.
	"""
	if window_size < 3:
	raise ValueError("Specify window_size at least 3")

	wfd = FreqDist()
	wildfd = FreqDist()
	bfd = FreqDist()
	tfd = FreqDist()
	for window in ngrams(words, window_size, pad_right=True):
	w1 = window[0]
	if w1 is None:
	continue
	for w2, w3 in _itertools.combinations(window[1:], 2):
	wfd[w1] += 1
	if w2 is None:
	continue
	bfd[(w1, w2)] += 1
	if w3 is None:
	continue
	wildfd[(w1, w3)] += 1
	tfd[(w1, w2, w3)] += 1
	return cls(wfd, bfd, wildfd, tfd)

	def bigram_finder(self):
	"""Constructs a bigram collocation finder with the bigram and unigram
	data from this finder. Note that this does not include any filtering
	applied to this finder.
	"""
	return BigramCollocationFinder(self.word_fd, self.bigram_fd)

	def score_ngram(self, score_fn, w1, w2, w3):
	"""Returns the score for a given trigram using the given scoring
	function.
	"""
	n_all = self.N
	n_iii = self.ngram_fd[(w1, w2, w3)]
	if not n_iii:
	return
	n_iix = self.bigram_fd[(w1, w2)]
	n_ixi = self.wildcard_fd[(w1, w3)]
	n_xii = self.bigram_fd[(w2, w3)]
	n_ixx = self.word_fd[w1]
	n_xix = self.word_fd[w2]
	n_xxi = self.word_fd[w3]
	return score_fn(n_iii, (n_iix, n_ixi, n_xii), (n_ixx, n_xix, n_xxi), n_all)


	class QuadgramCollocationFinder(AbstractCollocationFinder):
	"""A tool for the finding and ranking of quadgram collocations or other association measures.
	It is often useful to use from_words() rather than constructing an instance directly.
	"""

	default_ws = 4

	def __init__(self, word_fd, quadgram_fd, ii, iii, ixi, ixxi, iixi, ixii):
	"""Construct a QuadgramCollocationFinder, given FreqDists for appearances of words,
	bigrams, trigrams, two words with one word and two words between them, three words
	with a word between them in both variations.
	"""
	AbstractCollocationFinder.__init__(self, word_fd, quadgram_fd)
	self.iii = iii
	self.ii = ii
	self.ixi = ixi
	self.ixxi = ixxi
	self.iixi = iixi
	self.ixii = ixii

	@classmethod
	def from_words(cls, words, window_size=4):
	if window_size < 4:
	raise ValueError("Specify window_size at least 4")
	ixxx = FreqDist()
	iiii = FreqDist()
	ii = FreqDist()
	iii = FreqDist()
	ixi = FreqDist()
	ixxi = FreqDist()
	iixi = FreqDist()
	ixii = FreqDist()

	for window in ngrams(words, window_size, pad_right=True):
	w1 = window[0]
	if w1 is None:
	continue
	for w2, w3, w4 in _itertools.combinations(window[1:], 3):
	ixxx[w1] += 1
	if w2 is None:
	continue
	ii[(w1, w2)] += 1
	if w3 is None:
	continue
	iii[(w1, w2, w3)] += 1
	ixi[(w1, w3)] += 1
	if w4 is None:
	continue
	iiii[(w1, w2, w3, w4)] += 1
	ixxi[(w1, w4)] += 1
	ixii[(w1, w3, w4)] += 1
	iixi[(w1, w2, w4)] += 1

	return cls(ixxx, iiii, ii, iii, ixi, ixxi, iixi, ixii)

	def score_ngram(self, score_fn, w1, w2, w3, w4):
	n_all = self.N
	n_iiii = self.ngram_fd[(w1, w2, w3, w4)]
	if not n_iiii:
	return
	n_iiix = self.iii[(w1, w2, w3)]
	n_xiii = self.iii[(w2, w3, w4)]
	n_iixi = self.iixi[(w1, w2, w4)]
	n_ixii = self.ixii[(w1, w3, w4)]

	n_iixx = self.ii[(w1, w2)]
	n_xxii = self.ii[(w3, w4)]
	n_xiix = self.ii[(w2, w3)]
	n_ixix = self.ixi[(w1, w3)]
	n_ixxi = self.ixxi[(w1, w4)]
	n_xixi = self.ixi[(w2, w4)]

	n_ixxx = self.word_fd[w1]
	n_xixx = self.word_fd[w2]
	n_xxix = self.word_fd[w3]
	n_xxxi = self.word_fd[w4]
	return score_fn(
	n_iiii,
	(n_iiix, n_iixi, n_ixii, n_xiii),
	(n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix),
	(n_ixxx, n_xixx, n_xxix, n_xxxi),
	n_all,
	)


	def demo(scorer=None, compare_scorer=None):
	"""Finds bigram collocations in the files of the WebText corpus."""
	from nltk.metrics import (
	BigramAssocMeasures,
	ranks_from_scores,
	spearman_correlation,
	)

	if scorer is None:
	scorer = BigramAssocMeasures.likelihood_ratio
	if compare_scorer is None:
	compare_scorer = BigramAssocMeasures.raw_freq

	from nltk.corpus import stopwords, webtext

	ignored_words = stopwords.words("english")
	word_filter = lambda w: len(w) < 3 or w.lower() in ignored_words

	for file in webtext.fileids():
	words = [word.lower() for word in webtext.words(file)]

	cf = BigramCollocationFinder.from_words(words)
	cf.apply_freq_filter(3)
	cf.apply_word_filter(word_filter)

	corr = spearman_correlation(
	ranks_from_scores(cf.score_ngrams(scorer)),
	ranks_from_scores(cf.score_ngrams(compare_scorer)),
	)
	print(file)
	print("\t", [" ".join(tup) for tup in cf.nbest(scorer, 15)])
	print(f"\t Correlation to {compare_scorer.__name__}: {corr:0.4f}")


	# Slows down loading too much
	# bigram_measures = BigramAssocMeasures()
	# trigram_measures = TrigramAssocMeasures()

	if __name__ == "__main__":
	import sys

	from nltk.metrics import BigramAssocMeasures

	try:
	scorer = eval("BigramAssocMeasures." + sys.argv[1])
	except IndexError:
	scorer = None
	try:
	compare_scorer = eval("BigramAssocMeasures." + sys.argv[2])
	except IndexError:
	compare_scorer = None

	demo(scorer, compare_scorer)

	__all__ = [
	"BigramCollocationFinder",
	"TrigramCollocationFinder",
	"QuadgramCollocationFinder",
	]