# Natural Language Toolkit: Language Models
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Ilia Kurenkov <[email protected]>
#         Manu Joseph <[email protected]>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""Language Models"""

from nltk.lm.api import LanguageModel, Smoothing
from nltk.lm.smoothing import AbsoluteDiscounting, KneserNey, WittenBell


class MLE(LanguageModel):
    """Class for providing MLE ngram model scores.



    Inherits initialization from BaseNgramModel.

    """

    def unmasked_score(self, word, context=None):
        """Returns the MLE score for a word given a context.



        Args:

        - word is expected to be a string

        - context is expected to be something reasonably convertible to a tuple

        """
        return self.context_counts(context).freq(word)
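
# A minimal usage sketch (illustrative, not part of the original module):
# train an MLE bigram model on a toy corpus, then score a word in context.
# ``padded_everygram_pipeline`` is the preprocessing helper from
# ``nltk.lm.preprocessing``.
#
#     >>> from nltk.lm.preprocessing import padded_everygram_pipeline
#     >>> train, vocab = padded_everygram_pipeline(2, [["a", "b", "b", "a"]])
#     >>> lm = MLE(2)
#     >>> lm.fit(train, vocab)
#     >>> lm.score("b", ["a"])  # count("a b") / count of all bigrams "a *"
#     0.5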


class Lidstone(LanguageModel):
    """Provides Lidstone-smoothed scores.



    In addition to initialization arguments from BaseNgramModel also requires

    a number by which to increase the counts, gamma.

    """

    def __init__(self, gamma, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.gamma = gamma

    def unmasked_score(self, word, context=None):
        """Add-one smoothing: Lidstone or Laplace.



        To see what kind, look at `gamma` attribute on the class.



        """
        counts = self.context_counts(context)
        word_count = counts[word]
        norm_count = counts.N()
        return (word_count + self.gamma) / (norm_count + len(self.vocab) * self.gamma)
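
# Worked example (illustrative numbers): suppose a context was seen twice,
# both times followed by "b", ``len(self.vocab)`` is 4, and gamma = 0.5.
# Then the formula above gives
#     score("b")    = (2 + 0.5) / (2 + 4 * 0.5) = 0.625
#     score(unseen) = (0 + 0.5) / (2 + 4 * 0.5) = 0.125
# and the smoothed scores still sum to one: 0.625 + 3 * 0.125 = 1.0.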


class Laplace(Lidstone):
    """Implements Laplace (add one) smoothing.



    Initialization identical to BaseNgramModel because gamma is always 1.

    """

    def __init__(self, *args, **kwargs):
        super().__init__(1, *args, **kwargs)
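
# Equivalently: ``Laplace(order)`` behaves exactly like ``Lidstone(1, order)``;
# only the fixed gamma distinguishes the two classes.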


class StupidBackoff(LanguageModel):
    """Provides StupidBackoff scores.



    In addition to initialization arguments from BaseNgramModel also requires

    a parameter alpha with which we scale the lower order probabilities.

    Note that this is not a true probability distribution as scores for ngrams

    of the same order do not sum up to unity.

    """

    def __init__(self, alpha=0.4, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.alpha = alpha

    def unmasked_score(self, word, context=None):
        if not context:
            # Base case of the recursion: no context left, so fall back to
            # the unigram relative frequency.
            return self.counts.unigrams.freq(word)
        counts = self.context_counts(context)
        word_count = counts[word]
        norm_count = counts.N()
        if word_count > 0:
            return word_count / norm_count
        else:
            # Unseen in this context: back off to the shorter context,
            # scaled by alpha.
            return self.alpha * self.unmasked_score(word, context[1:])
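
# Backoff sketch (illustrative): in a trigram model, a "z" that was never
# seen after ("x", "y") is scored as
#     alpha * score("z" | ("y",))
# and, if "z" was never seen after ("y",) either, as
#     alpha * alpha * unigram_freq("z")
# so each backoff step scales the score by alpha (0.4 by default).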


class InterpolatedLanguageModel(LanguageModel):
    """Logic common to all interpolated language models.



    The idea to abstract this comes from Chen & Goodman 1995.

    Do not instantiate this class directly!

    """

    def __init__(self, smoothing_cls, order, **kwargs):
        params = kwargs.pop("params", {})
        super().__init__(order, **kwargs)
        self.estimator = smoothing_cls(self.vocab, self.counts, **params)

    def unmasked_score(self, word, context=None):
        if not context:
            # The base recursion case: no context, we only have a unigram.
            return self.estimator.unigram_score(word)
        if not self.counts[context]:
            # It can also happen that we have no data for this context.
            # In that case we defer to the lower-order ngram.
            # This is the same as setting alpha to 0 and gamma to 1.
            alpha, gamma = 0, 1
        else:
            alpha, gamma = self.estimator.alpha_gamma(word, context)
        return alpha + gamma * self.unmasked_score(word, context[1:])
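
# The recursion above implements the interpolation form
#     P(w | context) = alpha(w, context) + gamma(context) * P(w | context[1:])
# so a trigram score unfolds, schematically, to
#     alpha_3 + gamma_3 * (alpha_2 + gamma_2 * unigram_score(w))
# and a concrete smoothing class only has to supply ``alpha_gamma`` and
# ``unigram_score``.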


class WittenBellInterpolated(InterpolatedLanguageModel):
    """Interpolated version of Witten-Bell smoothing."""

    def __init__(self, order, **kwargs):
        super().__init__(WittenBell, order, **kwargs)


class AbsoluteDiscountingInterpolated(InterpolatedLanguageModel):
    """Interpolated version of smoothing with absolute discount."""

    def __init__(self, order, discount=0.75, **kwargs):
        super().__init__(
            AbsoluteDiscounting, order, params={"discount": discount}, **kwargs
        )
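
# Note: smoothing-specific parameters travel through the ``params`` dict, so
# ``AbsoluteDiscountingInterpolated(3, discount=0.9)`` ends up constructing
# ``AbsoluteDiscounting(self.vocab, self.counts, discount=0.9)`` inside
# ``InterpolatedLanguageModel.__init__``.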


class KneserNeyInterpolated(InterpolatedLanguageModel):
    """Interpolated version of Kneser-Ney smoothing."""

    def __init__(self, order, discount=0.1, **kwargs):
        if not (0 <= discount <= 1):
            raise ValueError(
                "Discount must be between 0 and 1 for probabilities to sum to unity."
            )
        super().__init__(
            KneserNey, order, params={"discount": discount, "order": order}, **kwargs
        )
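
# Usage sketch (illustrative, not part of the module): interpolated models
# train exactly like the unsmoothed ones, with smoothing parameters supplied
# at construction time.
#
#     >>> from nltk.lm.preprocessing import padded_everygram_pipeline
#     >>> train, vocab = padded_everygram_pipeline(3, [["a", "b", "b", "a"]])
#     >>> lm = KneserNeyInterpolated(3, discount=0.1)
#     >>> lm.fit(train, vocab)
#     >>> 0 <= lm.score("b", ["a", "b"]) <= 1
#     True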