File size: 8,732 Bytes
d916065
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
# Natural Language Toolkit: Gale-Church Aligner
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Torsten Marek <[email protected]>
# Contributor: Cassidy Laidlaw, Liling Tan
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""



A port of the Gale-Church Aligner.



Gale & Church (1993), A Program for Aligning Sentences in Bilingual Corpora.

https://aclweb.org/anthology/J93-1004.pdf



"""

import math

try:
    from norm import logsf as norm_logsf
    from scipy.stats import norm
except ImportError:

    def erfcc(x):
        """Complementary error function."""
        z = abs(x)
        t = 1 / (1 + 0.5 * z)
        r = t * math.exp(
            -z * z
            - 1.26551223
            + t
            * (
                1.00002368
                + t
                * (
                    0.37409196
                    + t
                    * (
                        0.09678418
                        + t
                        * (
                            -0.18628806
                            + t
                            * (
                                0.27886807
                                + t
                                * (
                                    -1.13520398
                                    + t
                                    * (1.48851587 + t * (-0.82215223 + t * 0.17087277))
                                )
                            )
                        )
                    )
                )
            )
        )
        if x >= 0.0:
            return r
        else:
            return 2.0 - r

    def norm_cdf(x):
        """Return the area under the normal distribution from M{-∞..x}."""
        return 1 - 0.5 * erfcc(x / math.sqrt(2))

    def norm_logsf(x):
        try:
            return math.log(1 - norm_cdf(x))
        except ValueError:
            return float("-inf")


LOG2 = math.log(2)


class LanguageIndependent:
    # These are the language-independent probabilities and parameters
    # given in Gale & Church

    # for the computation, l_1 is always the language with less characters
    PRIORS = {
        (1, 0): 0.0099,
        (0, 1): 0.0099,
        (1, 1): 0.89,
        (2, 1): 0.089,
        (1, 2): 0.089,
        (2, 2): 0.011,
    }

    AVERAGE_CHARACTERS = 1
    VARIANCE_CHARACTERS = 6.8


def trace(backlinks, source_sents_lens, target_sents_lens):
    """

    Traverse the alignment cost from the tracebacks and retrieves

    appropriate sentence pairs.



    :param backlinks: A dictionary where the key is the alignment points and value is the cost (referencing the LanguageIndependent.PRIORS)

    :type backlinks: dict

    :param source_sents_lens: A list of target sentences' lengths

    :type source_sents_lens: list(int)

    :param target_sents_lens: A list of target sentences' lengths

    :type target_sents_lens: list(int)

    """
    links = []
    position = (len(source_sents_lens), len(target_sents_lens))
    while position != (0, 0) and all(p >= 0 for p in position):
        try:
            s, t = backlinks[position]
        except TypeError:
            position = (position[0] - 1, position[1] - 1)
            continue
        for i in range(s):
            for j in range(t):
                links.append((position[0] - i - 1, position[1] - j - 1))
        position = (position[0] - s, position[1] - t)

    return links[::-1]


def align_log_prob(i, j, source_sents, target_sents, alignment, params):
    """Returns the log probability of the two sentences C{source_sents[i]}, C{target_sents[j]}

    being aligned with a specific C{alignment}.



    @param i: The offset of the source sentence.

    @param j: The offset of the target sentence.

    @param source_sents: The list of source sentence lengths.

    @param target_sents: The list of target sentence lengths.

    @param alignment: The alignment type, a tuple of two integers.

    @param params: The sentence alignment parameters.



    @returns: The log probability of a specific alignment between the two sentences, given the parameters.

    """
    l_s = sum(source_sents[i - offset - 1] for offset in range(alignment[0]))
    l_t = sum(target_sents[j - offset - 1] for offset in range(alignment[1]))
    try:
        # actually, the paper says l_s * params.VARIANCE_CHARACTERS, this is based on the C
        # reference implementation. With l_s in the denominator, insertions are impossible.
        m = (l_s + l_t / params.AVERAGE_CHARACTERS) / 2
        delta = (l_s * params.AVERAGE_CHARACTERS - l_t) / math.sqrt(
            m * params.VARIANCE_CHARACTERS
        )
    except ZeroDivisionError:
        return float("-inf")

    return -(LOG2 + norm_logsf(abs(delta)) + math.log(params.PRIORS[alignment]))


def align_blocks(source_sents_lens, target_sents_lens, params=LanguageIndependent):
    """Return the sentence alignment of two text blocks (usually paragraphs).



        >>> align_blocks([5,5,5], [7,7,7])

        [(0, 0), (1, 1), (2, 2)]

        >>> align_blocks([10,5,5], [12,20])

        [(0, 0), (1, 1), (2, 1)]

        >>> align_blocks([12,20], [10,5,5])

        [(0, 0), (1, 1), (1, 2)]

        >>> align_blocks([10,2,10,10,2,10], [12,3,20,3,12])

        [(0, 0), (1, 1), (2, 2), (3, 2), (4, 3), (5, 4)]



    @param source_sents_lens: The list of source sentence lengths.

    @param target_sents_lens: The list of target sentence lengths.

    @param params: the sentence alignment parameters.

    @return: The sentence alignments, a list of index pairs.

    """

    alignment_types = list(params.PRIORS.keys())

    # there are always three rows in the history (with the last of them being filled)
    D = [[]]

    backlinks = {}

    for i in range(len(source_sents_lens) + 1):
        for j in range(len(target_sents_lens) + 1):
            min_dist = float("inf")
            min_align = None
            for a in alignment_types:
                prev_i = -1 - a[0]
                prev_j = j - a[1]
                if prev_i < -len(D) or prev_j < 0:
                    continue
                p = D[prev_i][prev_j] + align_log_prob(
                    i, j, source_sents_lens, target_sents_lens, a, params
                )
                if p < min_dist:
                    min_dist = p
                    min_align = a

            if min_dist == float("inf"):
                min_dist = 0

            backlinks[(i, j)] = min_align
            D[-1].append(min_dist)

        if len(D) > 2:
            D.pop(0)
        D.append([])

    return trace(backlinks, source_sents_lens, target_sents_lens)


def align_texts(source_blocks, target_blocks, params=LanguageIndependent):
    """Creates the sentence alignment of two texts.



    Texts can consist of several blocks. Block boundaries cannot be crossed by sentence

    alignment links.



    Each block consists of a list that contains the lengths (in characters) of the sentences

    in this block.



    @param source_blocks: The list of blocks in the source text.

    @param target_blocks: The list of blocks in the target text.

    @param params: the sentence alignment parameters.



    @returns: A list of sentence alignment lists

    """
    if len(source_blocks) != len(target_blocks):
        raise ValueError(
            "Source and target texts do not have the same number of blocks."
        )

    return [
        align_blocks(source_block, target_block, params)
        for source_block, target_block in zip(source_blocks, target_blocks)
    ]


# File I/O functions; may belong in a corpus reader


def split_at(it, split_value):
    """Splits an iterator C{it} at values of C{split_value}.



    Each instance of C{split_value} is swallowed. The iterator produces

    subiterators which need to be consumed fully before the next subiterator

    can be used.

    """

    def _chunk_iterator(first):
        v = first
        while v != split_value:
            yield v
            v = it.next()

    while True:
        yield _chunk_iterator(it.next())


def parse_token_stream(stream, soft_delimiter, hard_delimiter):
    """Parses a stream of tokens and splits it into sentences (using C{soft_delimiter} tokens)

    and blocks (using C{hard_delimiter} tokens) for use with the L{align_texts} function.

    """
    return [
        [
            sum(len(token) for token in sentence_it)
            for sentence_it in split_at(block_it, soft_delimiter)
        ]
        for block_it in split_at(stream, hard_delimiter)
    ]