File size: 6,246 Bytes
d916065
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# Natural Language Toolkit: GDFA word alignment symmetrization
#
# Copyright (C) 2001-2023 NLTK Project
# Authors: Liling Tan
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

from collections import defaultdict


def grow_diag_final_and(srclen, trglen, e2f, f2e):
    """

    This module symmetrisatizes the source-to-target and target-to-source

    word alignment output and produces, aka. GDFA algorithm (Koehn, 2005).



    Step 1: Find the intersection of the bidirectional alignment.



    Step 2: Search for additional neighbor alignment points to be added, given

            these criteria: (i) neighbor alignments points are not in the

            intersection and (ii) neighbor alignments are in the union.



    Step 3: Add all other alignment points that are not in the intersection, not in

            the neighboring alignments that met the criteria but in the original

            forward/backward alignment outputs.



        >>> forw = ('0-0 2-1 9-2 21-3 10-4 7-5 11-6 9-7 12-8 1-9 3-10 '

        ...         '4-11 17-12 17-13 25-14 13-15 24-16 11-17 28-18')

        >>> back = ('0-0 1-9 2-9 3-10 4-11 5-12 6-6 7-5 8-6 9-7 10-4 '

        ...         '11-6 12-8 13-12 15-12 17-13 18-13 19-12 20-13 '

        ...         '21-3 22-12 23-14 24-17 25-15 26-17 27-18 28-18')

        >>> srctext = ("この よう な ハロー 白色 わい 星 の L 関数 "

        ...            "は L と 共 に 不連続 に 増加 する こと が "

        ...            "期待 さ れる こと を 示し た 。")

        >>> trgtext = ("Therefore , we expect that the luminosity function "

        ...            "of such halo white dwarfs increases discontinuously "

        ...            "with the luminosity .")

        >>> srclen = len(srctext.split())

        >>> trglen = len(trgtext.split())

        >>>

        >>> gdfa = grow_diag_final_and(srclen, trglen, forw, back)

        >>> gdfa == sorted(set([(28, 18), (6, 6), (24, 17), (2, 1), (15, 12), (13, 12),

        ...         (2, 9), (3, 10), (26, 17), (25, 15), (8, 6), (9, 7), (20,

        ...         13), (18, 13), (0, 0), (10, 4), (13, 15), (23, 14), (7, 5),

        ...         (25, 14), (1, 9), (17, 13), (4, 11), (11, 17), (9, 2), (22,

        ...         12), (27, 18), (24, 16), (21, 3), (19, 12), (17, 12), (5,

        ...         12), (11, 6), (12, 8)]))

        True



    References:

    Koehn, P., A. Axelrod, A. Birch, C. Callison, M. Osborne, and D. Talbot.

    2005. Edinburgh System Description for the 2005 IWSLT Speech

    Translation Evaluation. In MT Eval Workshop.



    :type srclen: int

    :param srclen: the number of tokens in the source language

    :type trglen: int

    :param trglen: the number of tokens in the target language

    :type e2f: str

    :param e2f: the forward word alignment outputs from source-to-target

                language (in pharaoh output format)

    :type f2e: str

    :param f2e: the backward word alignment outputs from target-to-source

                language (in pharaoh output format)

    :rtype: set(tuple(int))

    :return: the symmetrized alignment points from the GDFA algorithm

    """

    # Converts pharaoh text format into list of tuples.
    e2f = [tuple(map(int, a.split("-"))) for a in e2f.split()]
    f2e = [tuple(map(int, a.split("-"))) for a in f2e.split()]

    neighbors = [(-1, 0), (0, -1), (1, 0), (0, 1), (-1, -1), (-1, 1), (1, -1), (1, 1)]
    alignment = set(e2f).intersection(set(f2e))  # Find the intersection.
    union = set(e2f).union(set(f2e))

    # *aligned* is used to check if neighbors are aligned in grow_diag()
    aligned = defaultdict(set)
    for i, j in alignment:
        aligned["e"].add(i)
        aligned["f"].add(j)

    def grow_diag():
        """

        Search for the neighbor points and them to the intersected alignment

        points if criteria are met.

        """
        prev_len = len(alignment) - 1
        # iterate until no new points added
        while prev_len < len(alignment):
            no_new_points = True
            # for english word e = 0 ... en
            for e in range(srclen):
                # for foreign word f = 0 ... fn
                for f in range(trglen):
                    # if ( e aligned with f)
                    if (e, f) in alignment:
                        # for each neighboring point (e-new, f-new)
                        for neighbor in neighbors:
                            neighbor = tuple(i + j for i, j in zip((e, f), neighbor))
                            e_new, f_new = neighbor
                            # if ( ( e-new not aligned and f-new not aligned)
                            # and (e-new, f-new in union(e2f, f2e) )
                            if (
                                e_new not in aligned and f_new not in aligned
                            ) and neighbor in union:
                                alignment.add(neighbor)
                                aligned["e"].add(e_new)
                                aligned["f"].add(f_new)
                                prev_len += 1
                                no_new_points = False
            # iterate until no new points added
            if no_new_points:
                break

    def final_and(a):
        """

        Adds remaining points that are not in the intersection, not in the

        neighboring alignments but in the original *e2f* and *f2e* alignments

        """
        # for english word e = 0 ... en
        for e_new in range(srclen):
            # for foreign word f = 0 ... fn
            for f_new in range(trglen):
                # if ( ( e-new not aligned and f-new not aligned)
                # and (e-new, f-new in union(e2f, f2e) )
                if (
                    e_new not in aligned
                    and f_new not in aligned
                    and (e_new, f_new) in union
                ):
                    alignment.add((e_new, f_new))
                    aligned["e"].add(e_new)
                    aligned["f"].add(f_new)

    grow_diag()
    final_and(e2f)
    final_and(f2e)
    return sorted(alignment)