File size: 16,569 Bytes
d916065
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
# Natural Language Toolkit: Ngram Association Measures
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Joel Nothman <[email protected]>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""

Provides scoring functions for a number of association measures through a

generic, abstract implementation in ``NgramAssocMeasures``, and n-specific

``BigramAssocMeasures`` and ``TrigramAssocMeasures``.

"""

import math as _math
from abc import ABCMeta, abstractmethod
from functools import reduce

_log2 = lambda x: _math.log2(x)
_ln = _math.log

_product = lambda s: reduce(lambda x, y: x * y, s)

_SMALL = 1e-20

try:
    from scipy.stats import fisher_exact
except ImportError:

    def fisher_exact(*_args, **_kwargs):
        raise NotImplementedError


### Indices to marginals arguments:

NGRAM = 0
"""Marginals index for the ngram count"""

UNIGRAMS = -2
"""Marginals index for a tuple of each unigram count"""

TOTAL = -1
"""Marginals index for the number of words in the data"""


class NgramAssocMeasures(metaclass=ABCMeta):
    """

    An abstract class defining a collection of generic association measures.

    Each public method returns a score, taking the following arguments::



        score_fn(count_of_ngram,

                 (count_of_n-1gram_1, ..., count_of_n-1gram_j),

                 (count_of_n-2gram_1, ..., count_of_n-2gram_k),

                 ...,

                 (count_of_1gram_1, ..., count_of_1gram_n),

                 count_of_total_words)



    See ``BigramAssocMeasures`` and ``TrigramAssocMeasures``



    Inheriting classes should define a property _n, and a method _contingency

    which calculates contingency values from marginals in order for all

    association measures defined here to be usable.

    """

    _n = 0

    @staticmethod
    @abstractmethod
    def _contingency(*marginals):
        """Calculates values of a contingency table from marginal values."""
        raise NotImplementedError(
            "The contingency table is not available" "in the general ngram case"
        )

    @staticmethod
    @abstractmethod
    def _marginals(*contingency):
        """Calculates values of contingency table marginals from its values."""
        raise NotImplementedError(
            "The contingency table is not available" "in the general ngram case"
        )

    @classmethod
    def _expected_values(cls, cont):
        """Calculates expected values for a contingency table."""
        n_all = sum(cont)
        bits = [1 << i for i in range(cls._n)]

        # For each contingency table cell
        for i in range(len(cont)):
            # Yield the expected value
            yield (
                _product(
                    sum(cont[x] for x in range(2**cls._n) if (x & j) == (i & j))
                    for j in bits
                )
                / (n_all ** (cls._n - 1))
            )

    @staticmethod
    def raw_freq(*marginals):
        """Scores ngrams by their frequency"""
        return marginals[NGRAM] / marginals[TOTAL]

    @classmethod
    def student_t(cls, *marginals):
        """Scores ngrams using Student's t test with independence hypothesis

        for unigrams, as in Manning and Schutze 5.3.1.

        """
        return (
            marginals[NGRAM]
            - _product(marginals[UNIGRAMS]) / (marginals[TOTAL] ** (cls._n - 1))
        ) / (marginals[NGRAM] + _SMALL) ** 0.5

    @classmethod
    def chi_sq(cls, *marginals):
        """Scores ngrams using Pearson's chi-square as in Manning and Schutze

        5.3.3.

        """
        cont = cls._contingency(*marginals)
        exps = cls._expected_values(cont)
        return sum((obs - exp) ** 2 / (exp + _SMALL) for obs, exp in zip(cont, exps))

    @staticmethod
    def mi_like(*marginals, **kwargs):
        """Scores ngrams using a variant of mutual information. The keyword

        argument power sets an exponent (default 3) for the numerator. No

        logarithm of the result is calculated.

        """
        return marginals[NGRAM] ** kwargs.get("power", 3) / _product(
            marginals[UNIGRAMS]
        )

    @classmethod
    def pmi(cls, *marginals):
        """Scores ngrams by pointwise mutual information, as in Manning and

        Schutze 5.4.

        """
        return _log2(marginals[NGRAM] * marginals[TOTAL] ** (cls._n - 1)) - _log2(
            _product(marginals[UNIGRAMS])
        )

    @classmethod
    def likelihood_ratio(cls, *marginals):
        """Scores ngrams using likelihood ratios as in Manning and Schutze 5.3.4."""
        cont = cls._contingency(*marginals)
        return 2 * sum(
            obs * _ln(obs / (exp + _SMALL) + _SMALL)
            for obs, exp in zip(cont, cls._expected_values(cont))
        )

    @classmethod
    def poisson_stirling(cls, *marginals):
        """Scores ngrams using the Poisson-Stirling measure."""
        exp = _product(marginals[UNIGRAMS]) / (marginals[TOTAL] ** (cls._n - 1))
        return marginals[NGRAM] * (_log2(marginals[NGRAM] / exp) - 1)

    @classmethod
    def jaccard(cls, *marginals):
        """Scores ngrams using the Jaccard index."""
        cont = cls._contingency(*marginals)
        return cont[0] / sum(cont[:-1])


class BigramAssocMeasures(NgramAssocMeasures):
    """

    A collection of bigram association measures. Each association measure

    is provided as a function with three arguments::



        bigram_score_fn(n_ii, (n_ix, n_xi), n_xx)



    The arguments constitute the marginals of a contingency table, counting

    the occurrences of particular events in a corpus. The letter i in the

    suffix refers to the appearance of the word in question, while x indicates

    the appearance of any word. Thus, for example:



    - n_ii counts ``(w1, w2)``, i.e. the bigram being scored

    - n_ix counts ``(w1, *)``

    - n_xi counts ``(*, w2)``

    - n_xx counts ``(*, *)``, i.e. any bigram



    This may be shown with respect to a contingency table::



                w1    ~w1

             ------ ------

         w2 | n_ii | n_oi | = n_xi

             ------ ------

        ~w2 | n_io | n_oo |

             ------ ------

             = n_ix        TOTAL = n_xx

    """

    _n = 2

    @staticmethod
    def _contingency(n_ii, n_ix_xi_tuple, n_xx):
        """Calculates values of a bigram contingency table from marginal values."""
        (n_ix, n_xi) = n_ix_xi_tuple
        n_oi = n_xi - n_ii
        n_io = n_ix - n_ii
        return (n_ii, n_oi, n_io, n_xx - n_ii - n_oi - n_io)

    @staticmethod
    def _marginals(n_ii, n_oi, n_io, n_oo):
        """Calculates values of contingency table marginals from its values."""
        return (n_ii, (n_oi + n_ii, n_io + n_ii), n_oo + n_oi + n_io + n_ii)

    @staticmethod
    def _expected_values(cont):
        """Calculates expected values for a contingency table."""
        n_xx = sum(cont)
        # For each contingency table cell
        for i in range(4):
            yield (cont[i] + cont[i ^ 1]) * (cont[i] + cont[i ^ 2]) / n_xx

    @classmethod
    def phi_sq(cls, *marginals):
        """Scores bigrams using phi-square, the square of the Pearson correlation

        coefficient.

        """
        n_ii, n_io, n_oi, n_oo = cls._contingency(*marginals)

        return (n_ii * n_oo - n_io * n_oi) ** 2 / (
            (n_ii + n_io) * (n_ii + n_oi) * (n_io + n_oo) * (n_oi + n_oo)
        )

    @classmethod
    def chi_sq(cls, n_ii, n_ix_xi_tuple, n_xx):
        """Scores bigrams using chi-square, i.e. phi-sq multiplied by the number

        of bigrams, as in Manning and Schutze 5.3.3.

        """
        (n_ix, n_xi) = n_ix_xi_tuple
        return n_xx * cls.phi_sq(n_ii, (n_ix, n_xi), n_xx)

    @classmethod
    def fisher(cls, *marginals):
        """Scores bigrams using Fisher's Exact Test (Pedersen 1996).  Less

        sensitive to small counts than PMI or Chi Sq, but also more expensive

        to compute. Requires scipy.

        """

        n_ii, n_io, n_oi, n_oo = cls._contingency(*marginals)

        (odds, pvalue) = fisher_exact([[n_ii, n_io], [n_oi, n_oo]], alternative="less")
        return pvalue

    @staticmethod
    def dice(n_ii, n_ix_xi_tuple, n_xx):
        """Scores bigrams using Dice's coefficient."""
        (n_ix, n_xi) = n_ix_xi_tuple
        return 2 * n_ii / (n_ix + n_xi)


class TrigramAssocMeasures(NgramAssocMeasures):
    """

    A collection of trigram association measures. Each association measure

    is provided as a function with four arguments::



        trigram_score_fn(n_iii,

                         (n_iix, n_ixi, n_xii),

                         (n_ixx, n_xix, n_xxi),

                         n_xxx)



    The arguments constitute the marginals of a contingency table, counting

    the occurrences of particular events in a corpus. The letter i in the

    suffix refers to the appearance of the word in question, while x indicates

    the appearance of any word. Thus, for example:



    - n_iii counts ``(w1, w2, w3)``, i.e. the trigram being scored

    - n_ixx counts ``(w1, *, *)``

    - n_xxx counts ``(*, *, *)``, i.e. any trigram

    """

    _n = 3

    @staticmethod
    def _contingency(n_iii, n_iix_tuple, n_ixx_tuple, n_xxx):
        """Calculates values of a trigram contingency table (or cube) from

        marginal values.

        >>> TrigramAssocMeasures._contingency(1, (1, 1, 1), (1, 73, 1), 2000)

        (1, 0, 0, 0, 0, 72, 0, 1927)

        """
        (n_iix, n_ixi, n_xii) = n_iix_tuple
        (n_ixx, n_xix, n_xxi) = n_ixx_tuple
        n_oii = n_xii - n_iii
        n_ioi = n_ixi - n_iii
        n_iio = n_iix - n_iii
        n_ooi = n_xxi - n_iii - n_oii - n_ioi
        n_oio = n_xix - n_iii - n_oii - n_iio
        n_ioo = n_ixx - n_iii - n_ioi - n_iio
        n_ooo = n_xxx - n_iii - n_oii - n_ioi - n_iio - n_ooi - n_oio - n_ioo

        return (n_iii, n_oii, n_ioi, n_ooi, n_iio, n_oio, n_ioo, n_ooo)

    @staticmethod
    def _marginals(*contingency):
        """Calculates values of contingency table marginals from its values.

        >>> TrigramAssocMeasures._marginals(1, 0, 0, 0, 0, 72, 0, 1927)

        (1, (1, 1, 1), (1, 73, 1), 2000)

        """
        n_iii, n_oii, n_ioi, n_ooi, n_iio, n_oio, n_ioo, n_ooo = contingency
        return (
            n_iii,
            (n_iii + n_iio, n_iii + n_ioi, n_iii + n_oii),
            (
                n_iii + n_ioi + n_iio + n_ioo,
                n_iii + n_oii + n_iio + n_oio,
                n_iii + n_oii + n_ioi + n_ooi,
            ),
            sum(contingency),
        )


class QuadgramAssocMeasures(NgramAssocMeasures):
    """

    A collection of quadgram association measures. Each association measure

    is provided as a function with five arguments::



        trigram_score_fn(n_iiii,

                        (n_iiix, n_iixi, n_ixii, n_xiii),

                        (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix),

                        (n_ixxx, n_xixx, n_xxix, n_xxxi),

                        n_all)



    The arguments constitute the marginals of a contingency table, counting

    the occurrences of particular events in a corpus. The letter i in the

    suffix refers to the appearance of the word in question, while x indicates

    the appearance of any word. Thus, for example:



    - n_iiii counts ``(w1, w2, w3, w4)``, i.e. the quadgram being scored

    - n_ixxi counts ``(w1, *, *, w4)``

    - n_xxxx counts ``(*, *, *, *)``, i.e. any quadgram

    """

    _n = 4

    @staticmethod
    def _contingency(n_iiii, n_iiix_tuple, n_iixx_tuple, n_ixxx_tuple, n_xxxx):
        """Calculates values of a quadgram contingency table from

        marginal values.

        """
        (n_iiix, n_iixi, n_ixii, n_xiii) = n_iiix_tuple
        (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix) = n_iixx_tuple
        (n_ixxx, n_xixx, n_xxix, n_xxxi) = n_ixxx_tuple
        n_oiii = n_xiii - n_iiii
        n_ioii = n_ixii - n_iiii
        n_iioi = n_iixi - n_iiii
        n_ooii = n_xxii - n_iiii - n_oiii - n_ioii
        n_oioi = n_xixi - n_iiii - n_oiii - n_iioi
        n_iooi = n_ixxi - n_iiii - n_ioii - n_iioi
        n_oooi = n_xxxi - n_iiii - n_oiii - n_ioii - n_iioi - n_ooii - n_iooi - n_oioi
        n_iiio = n_iiix - n_iiii
        n_oiio = n_xiix - n_iiii - n_oiii - n_iiio
        n_ioio = n_ixix - n_iiii - n_ioii - n_iiio
        n_ooio = n_xxix - n_iiii - n_oiii - n_ioii - n_iiio - n_ooii - n_ioio - n_oiio
        n_iioo = n_iixx - n_iiii - n_iioi - n_iiio
        n_oioo = n_xixx - n_iiii - n_oiii - n_iioi - n_iiio - n_oioi - n_oiio - n_iioo
        n_iooo = n_ixxx - n_iiii - n_ioii - n_iioi - n_iiio - n_iooi - n_iioo - n_ioio
        n_oooo = (
            n_xxxx
            - n_iiii
            - n_oiii
            - n_ioii
            - n_iioi
            - n_ooii
            - n_oioi
            - n_iooi
            - n_oooi
            - n_iiio
            - n_oiio
            - n_ioio
            - n_ooio
            - n_iioo
            - n_oioo
            - n_iooo
        )

        return (
            n_iiii,
            n_oiii,
            n_ioii,
            n_ooii,
            n_iioi,
            n_oioi,
            n_iooi,
            n_oooi,
            n_iiio,
            n_oiio,
            n_ioio,
            n_ooio,
            n_iioo,
            n_oioo,
            n_iooo,
            n_oooo,
        )

    @staticmethod
    def _marginals(*contingency):
        """Calculates values of contingency table marginals from its values.

        QuadgramAssocMeasures._marginals(1, 0, 2, 46, 552, 825, 2577, 34967, 1, 0, 2, 48, 7250, 9031, 28585, 356653)

        (1, (2, 553, 3, 1), (7804, 6, 3132, 1378, 49, 2), (38970, 17660, 100, 38970), 440540)

        """
        (
            n_iiii,
            n_oiii,
            n_ioii,
            n_ooii,
            n_iioi,
            n_oioi,
            n_iooi,
            n_oooi,
            n_iiio,
            n_oiio,
            n_ioio,
            n_ooio,
            n_iioo,
            n_oioo,
            n_iooo,
            n_oooo,
        ) = contingency

        n_iiix = n_iiii + n_iiio
        n_iixi = n_iiii + n_iioi
        n_ixii = n_iiii + n_ioii
        n_xiii = n_iiii + n_oiii

        n_iixx = n_iiii + n_iioi + n_iiio + n_iioo
        n_ixix = n_iiii + n_ioii + n_iiio + n_ioio
        n_ixxi = n_iiii + n_ioii + n_iioi + n_iooi
        n_xixi = n_iiii + n_oiii + n_iioi + n_oioi
        n_xxii = n_iiii + n_oiii + n_ioii + n_ooii
        n_xiix = n_iiii + n_oiii + n_iiio + n_oiio

        n_ixxx = n_iiii + n_ioii + n_iioi + n_iiio + n_iooi + n_iioo + n_ioio + n_iooo
        n_xixx = n_iiii + n_oiii + n_iioi + n_iiio + n_oioi + n_oiio + n_iioo + n_oioo
        n_xxix = n_iiii + n_oiii + n_ioii + n_iiio + n_ooii + n_ioio + n_oiio + n_ooio
        n_xxxi = n_iiii + n_oiii + n_ioii + n_iioi + n_ooii + n_iooi + n_oioi + n_oooi

        n_all = sum(contingency)

        return (
            n_iiii,
            (n_iiix, n_iixi, n_ixii, n_xiii),
            (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix),
            (n_ixxx, n_xixx, n_xxix, n_xxxi),
            n_all,
        )


class ContingencyMeasures:
    """Wraps NgramAssocMeasures classes such that the arguments of association

    measures are contingency table values rather than marginals.

    """

    def __init__(self, measures):
        """Constructs a ContingencyMeasures given a NgramAssocMeasures class"""
        self.__class__.__name__ = "Contingency" + measures.__class__.__name__
        for k in dir(measures):
            if k.startswith("__"):
                continue
            v = getattr(measures, k)
            if not k.startswith("_"):
                v = self._make_contingency_fn(measures, v)
            setattr(self, k, v)

    @staticmethod
    def _make_contingency_fn(measures, old_fn):
        """From an association measure function, produces a new function which

        accepts contingency table values as its arguments.

        """

        def res(*contingency):
            return old_fn(*measures._marginals(*contingency))

        res.__doc__ = old_fn.__doc__
        res.__name__ = old_fn.__name__
        return res