Spaces:
Sleeping
Sleeping
# Natural Language Toolkit: GDFA word alignment symmetrization | |
# | |
# Copyright (C) 2001-2023 NLTK Project | |
# Authors: Liling Tan | |
# URL: <https://www.nltk.org/> | |
# For license information, see LICENSE.TXT | |
from collections import defaultdict | |
def grow_diag_final_and(srclen, trglen, e2f, f2e): | |
""" | |
This module symmetrisatizes the source-to-target and target-to-source | |
word alignment output and produces, aka. GDFA algorithm (Koehn, 2005). | |
Step 1: Find the intersection of the bidirectional alignment. | |
Step 2: Search for additional neighbor alignment points to be added, given | |
these criteria: (i) neighbor alignments points are not in the | |
intersection and (ii) neighbor alignments are in the union. | |
Step 3: Add all other alignment points that are not in the intersection, not in | |
the neighboring alignments that met the criteria but in the original | |
forward/backward alignment outputs. | |
>>> forw = ('0-0 2-1 9-2 21-3 10-4 7-5 11-6 9-7 12-8 1-9 3-10 ' | |
... '4-11 17-12 17-13 25-14 13-15 24-16 11-17 28-18') | |
>>> back = ('0-0 1-9 2-9 3-10 4-11 5-12 6-6 7-5 8-6 9-7 10-4 ' | |
... '11-6 12-8 13-12 15-12 17-13 18-13 19-12 20-13 ' | |
... '21-3 22-12 23-14 24-17 25-15 26-17 27-18 28-18') | |
>>> srctext = ("この よう な ハロー 白色 わい 星 の L 関数 " | |
... "は L と 共 に 不連続 に 増加 する こと が " | |
... "期待 さ れる こと を 示し た 。") | |
>>> trgtext = ("Therefore , we expect that the luminosity function " | |
... "of such halo white dwarfs increases discontinuously " | |
... "with the luminosity .") | |
>>> srclen = len(srctext.split()) | |
>>> trglen = len(trgtext.split()) | |
>>> | |
>>> gdfa = grow_diag_final_and(srclen, trglen, forw, back) | |
>>> gdfa == sorted(set([(28, 18), (6, 6), (24, 17), (2, 1), (15, 12), (13, 12), | |
... (2, 9), (3, 10), (26, 17), (25, 15), (8, 6), (9, 7), (20, | |
... 13), (18, 13), (0, 0), (10, 4), (13, 15), (23, 14), (7, 5), | |
... (25, 14), (1, 9), (17, 13), (4, 11), (11, 17), (9, 2), (22, | |
... 12), (27, 18), (24, 16), (21, 3), (19, 12), (17, 12), (5, | |
... 12), (11, 6), (12, 8)])) | |
True | |
References: | |
Koehn, P., A. Axelrod, A. Birch, C. Callison, M. Osborne, and D. Talbot. | |
2005. Edinburgh System Description for the 2005 IWSLT Speech | |
Translation Evaluation. In MT Eval Workshop. | |
:type srclen: int | |
:param srclen: the number of tokens in the source language | |
:type trglen: int | |
:param trglen: the number of tokens in the target language | |
:type e2f: str | |
:param e2f: the forward word alignment outputs from source-to-target | |
language (in pharaoh output format) | |
:type f2e: str | |
:param f2e: the backward word alignment outputs from target-to-source | |
language (in pharaoh output format) | |
:rtype: set(tuple(int)) | |
:return: the symmetrized alignment points from the GDFA algorithm | |
""" | |
# Converts pharaoh text format into list of tuples. | |
e2f = [tuple(map(int, a.split("-"))) for a in e2f.split()] | |
f2e = [tuple(map(int, a.split("-"))) for a in f2e.split()] | |
neighbors = [(-1, 0), (0, -1), (1, 0), (0, 1), (-1, -1), (-1, 1), (1, -1), (1, 1)] | |
alignment = set(e2f).intersection(set(f2e)) # Find the intersection. | |
union = set(e2f).union(set(f2e)) | |
# *aligned* is used to check if neighbors are aligned in grow_diag() | |
aligned = defaultdict(set) | |
for i, j in alignment: | |
aligned["e"].add(i) | |
aligned["f"].add(j) | |
def grow_diag(): | |
""" | |
Search for the neighbor points and them to the intersected alignment | |
points if criteria are met. | |
""" | |
prev_len = len(alignment) - 1 | |
# iterate until no new points added | |
while prev_len < len(alignment): | |
no_new_points = True | |
# for english word e = 0 ... en | |
for e in range(srclen): | |
# for foreign word f = 0 ... fn | |
for f in range(trglen): | |
# if ( e aligned with f) | |
if (e, f) in alignment: | |
# for each neighboring point (e-new, f-new) | |
for neighbor in neighbors: | |
neighbor = tuple(i + j for i, j in zip((e, f), neighbor)) | |
e_new, f_new = neighbor | |
# if ( ( e-new not aligned and f-new not aligned) | |
# and (e-new, f-new in union(e2f, f2e) ) | |
if ( | |
e_new not in aligned and f_new not in aligned | |
) and neighbor in union: | |
alignment.add(neighbor) | |
aligned["e"].add(e_new) | |
aligned["f"].add(f_new) | |
prev_len += 1 | |
no_new_points = False | |
# iterate until no new points added | |
if no_new_points: | |
break | |
def final_and(a): | |
""" | |
Adds remaining points that are not in the intersection, not in the | |
neighboring alignments but in the original *e2f* and *f2e* alignments | |
""" | |
# for english word e = 0 ... en | |
for e_new in range(srclen): | |
# for foreign word f = 0 ... fn | |
for f_new in range(trglen): | |
# if ( ( e-new not aligned and f-new not aligned) | |
# and (e-new, f-new in union(e2f, f2e) ) | |
if ( | |
e_new not in aligned | |
and f_new not in aligned | |
and (e_new, f_new) in union | |
): | |
alignment.add((e_new, f_new)) | |
aligned["e"].add(e_new) | |
aligned["f"].add(f_new) | |
grow_diag() | |
final_and(e2f) | |
final_and(f2e) | |
return sorted(alignment) | |