# Page header captured with the file (not part of the module):
# sunnychenxiwang's picture · update nltk · d916065 · raw · history blame · 9.68 kB
"""
Tests for common methods of IBM translation models
"""
import unittest
from collections import defaultdict
from nltk.translate import AlignedSent, IBMModel
from nltk.translate.ibm_model import AlignmentInfo
class TestIBMModel(unittest.TestCase):
    """Unit tests for the methods shared by the IBM translation models.

    Exercises vocabulary initialisation, ``best_model2_alignment``,
    ``neighboring``, ``hillclimb`` and ``sample`` on ``IBMModel``.

    Throughout, ``AlignedSent`` is built as ``AlignedSent(target, source)``
    and position 0 of an alignment tuple is unused.
    """

    __TEST_SRC_SENTENCE = ["j'", "aime", "bien", "jambon"]
    __TEST_TRG_SENTENCE = ["i", "love", "ham"]

    # ------------------------------------------------------------------
    # Fixture helpers (names do not start with "test", so they are not
    # collected as test cases)
    # ------------------------------------------------------------------
    @staticmethod
    def _translation_table(extra=None):
        """Return a fresh translation table for the French/English fixture.

        The table is indexed as ``table[target_word][source_word]``.

        :param extra: optional mapping of additional target-word rows that
            is merged into the base table.
        :return: a new dict on every call, so tests never share state.
        """
        table = {
            "i": {"j'": 0.9, "aime": 0.05, "bien": 0.02, "jambon": 0.03, None: 0},
            "love": {
                "j'": 0.05,
                "aime": 0.9,
                "bien": 0.01,
                "jambon": 0.01,
                None: 0.03,
            },
            "ham": {"j'": 0, "aime": 0.01, "bien": 0, "jambon": 0.99, None: 0},
        }
        if extra:
            table.update(extra)
        return table

    @staticmethod
    def _model_with_tables(translation_table):
        """Return an ``IBMModel`` over an empty corpus with preset tables.

        The alignment table is a four-level ``defaultdict`` that yields 0.2
        for every lookup, so only the translation table differentiates
        candidate alignments.

        :param translation_table: table assigned to
            ``ibm_model.translation_table``.
        """
        ibm_model = IBMModel([])
        ibm_model.translation_table = translation_table
        ibm_model.alignment_table = defaultdict(
            lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.2)))
        )
        return ibm_model

    @staticmethod
    def _green_eggs_alignment_info():
        """Return the ``AlignmentInfo`` fixture used by the neighboring tests.

        'green' is aligned to 'verts' (3) and 'eggs' to 'œufs' (2).
        """
        return AlignmentInfo(
            (0, 3, 2),
            (None, "des", "œufs", "verts"),
            ("UNUSED", "green", "eggs"),
            [[], [], [2], [1]],
        )

    # ------------------------------------------------------------------
    # Vocabulary initialisation
    # ------------------------------------------------------------------
    def test_vocabularies_are_initialized(self):
        """Vocabularies contain every distinct word of the corpus."""
        parallel_corpora = [
            AlignedSent(["one", "two", "three", "four"], ["un", "deux", "trois"]),
            AlignedSent(["five", "one", "six"], ["quatre", "cinq", "six"]),
            AlignedSent([], ["sept"]),
        ]

        ibm_model = IBMModel(parallel_corpora)

        # 7 distinct source words plus the NULL token
        self.assertEqual(len(ibm_model.src_vocab), 8)
        # 'one' occurs twice but is counted once
        self.assertEqual(len(ibm_model.trg_vocab), 6)

    def test_vocabularies_are_initialized_even_with_empty_corpora(self):
        """An empty corpus still yields vocabularies."""
        parallel_corpora = []

        ibm_model = IBMModel(parallel_corpora)

        self.assertEqual(len(ibm_model.src_vocab), 1)  # addition of NULL token
        self.assertEqual(len(ibm_model.trg_vocab), 0)

    # ------------------------------------------------------------------
    # best_model2_alignment
    # ------------------------------------------------------------------
    def test_best_model2_alignment(self):
        """Each target word is aligned to its most probable source word."""
        # arrange
        sentence_pair = AlignedSent(
            TestIBMModel.__TEST_TRG_SENTENCE, TestIBMModel.__TEST_SRC_SENTENCE
        )
        # None and 'bien' have zero fertility
        ibm_model = self._model_with_tables(self._translation_table())

        # act
        a_info = ibm_model.best_model2_alignment(sentence_pair)

        # assert
        self.assertEqual(a_info.alignment[1:], (1, 2, 4))  # 0th element unused
        self.assertEqual(a_info.cepts, [[], [1], [2], [], [3]])

    def test_best_model2_alignment_does_not_change_pegged_alignment(self):
        """A pegged alignment point is kept even if it is not the best one."""
        # arrange
        sentence_pair = AlignedSent(
            TestIBMModel.__TEST_TRG_SENTENCE, TestIBMModel.__TEST_SRC_SENTENCE
        )
        ibm_model = self._model_with_tables(self._translation_table())

        # act: force 'love' to be pegged to 'jambon'
        a_info = ibm_model.best_model2_alignment(sentence_pair, 2, 4)

        # assert
        self.assertEqual(a_info.alignment[1:], (1, 4, 4))
        self.assertEqual(a_info.cepts, [[], [1], [], [], [2, 3]])

    def test_best_model2_alignment_handles_fertile_words(self):
        """One source word may be aligned to several target words."""
        # arrange
        sentence_pair = AlignedSent(
            ["i", "really", ",", "really", "love", "ham"],
            TestIBMModel.__TEST_SRC_SENTENCE,
        )
        # 'bien' produces 2 target words: 'really' and another 'really'
        translation_table = self._translation_table(
            {
                "really": {
                    "j'": 0,
                    "aime": 0,
                    "bien": 0.9,
                    "jambon": 0.01,
                    None: 0.09,
                },
                ",": {"j'": 0, "aime": 0, "bien": 0.3, "jambon": 0, None: 0.7},
            }
        )
        ibm_model = self._model_with_tables(translation_table)

        # act
        a_info = ibm_model.best_model2_alignment(sentence_pair)

        # assert
        self.assertEqual(a_info.alignment[1:], (1, 3, 0, 3, 2, 4))
        self.assertEqual(a_info.cepts, [[3], [1], [5], [2, 4], [6]])

    def test_best_model2_alignment_handles_empty_src_sentence(self):
        """With no source words, every target word aligns to position 0."""
        # arrange
        sentence_pair = AlignedSent(TestIBMModel.__TEST_TRG_SENTENCE, [])
        ibm_model = IBMModel([])

        # act
        a_info = ibm_model.best_model2_alignment(sentence_pair)

        # assert
        self.assertEqual(a_info.alignment[1:], (0, 0, 0))
        self.assertEqual(a_info.cepts, [[1, 2, 3]])

    def test_best_model2_alignment_handles_empty_trg_sentence(self):
        """With no target words, the alignment is empty and so is every cept."""
        # arrange
        sentence_pair = AlignedSent([], TestIBMModel.__TEST_SRC_SENTENCE)
        ibm_model = IBMModel([])

        # act
        a_info = ibm_model.best_model2_alignment(sentence_pair)

        # assert
        self.assertEqual(a_info.alignment[1:], ())
        self.assertEqual(a_info.cepts, [[], [], [], [], []])

    # ------------------------------------------------------------------
    # neighboring
    # ------------------------------------------------------------------
    def test_neighboring_finds_neighbor_alignments(self):
        """Neighbors are all single moves, all swaps, and the original."""
        # arrange
        a_info = self._green_eggs_alignment_info()
        ibm_model = IBMModel([])

        # act
        neighbors = ibm_model.neighboring(a_info)

        # assert
        neighbor_alignments = {neighbor.alignment for neighbor in neighbors}
        expected_alignments = {
            # moves
            (0, 0, 2),
            (0, 1, 2),
            (0, 2, 2),
            (0, 3, 0),
            (0, 3, 1),
            (0, 3, 3),
            # swaps
            (0, 2, 3),
            # original alignment
            (0, 3, 2),
        }
        self.assertEqual(neighbor_alignments, expected_alignments)

    def test_neighboring_sets_neighbor_alignment_info(self):
        """Each neighbor carries cepts consistent with its alignment."""
        # arrange
        a_info = self._green_eggs_alignment_info()
        ibm_model = IBMModel([])

        # act
        neighbors = ibm_model.neighboring(a_info)

        # assert: select a few particular alignments
        by_alignment = {neighbor.alignment: neighbor for neighbor in neighbors}
        moved, original, swapped = (0, 2, 2), (0, 3, 2), (0, 2, 3)
        # Fail with a readable assertion (not an unbound-name error) when an
        # expected neighbor is missing.
        for alignment in (moved, original, swapped):
            self.assertIn(alignment, by_alignment)

        # 'green' moved from 'verts' to 'œufs'
        self.assertEqual(by_alignment[moved].cepts, [[], [], [1, 2], []])
        # unchanged alignment keeps the input cepts
        self.assertEqual(by_alignment[original].cepts, [[], [], [2], [1]])
        # 'green' and 'eggs' exchanged their source positions
        self.assertEqual(by_alignment[swapped].cepts, [[], [], [1], [2]])

    def test_neighboring_returns_neighbors_with_pegged_alignment(self):
        """A pegged target position is never moved or swapped."""
        # arrange
        a_info = self._green_eggs_alignment_info()
        ibm_model = IBMModel([])

        # act: peg 'eggs' to align with 'œufs'
        neighbors = ibm_model.neighboring(a_info, 2)

        # assert
        neighbor_alignments = {neighbor.alignment for neighbor in neighbors}
        expected_alignments = {
            # moves
            (0, 0, 2),
            (0, 1, 2),
            (0, 2, 2),
            # no swaps
            # original alignment
            (0, 3, 2),
        }
        self.assertEqual(neighbor_alignments, expected_alignments)

    # ------------------------------------------------------------------
    # hillclimb / sample
    # ------------------------------------------------------------------
    def test_hillclimb(self):
        """Hill climbing follows the most probable neighbor until none is better."""
        # arrange
        initial_alignment = AlignmentInfo((0, 3, 2), None, None, None)

        def neighboring_mock(a, j):
            if a.alignment == (0, 3, 2):
                return {
                    AlignmentInfo((0, 2, 2), None, None, None),
                    AlignmentInfo((0, 1, 1), None, None, None),
                }
            elif a.alignment == (0, 2, 2):
                return {
                    AlignmentInfo((0, 3, 3), None, None, None),
                    AlignmentInfo((0, 4, 4), None, None, None),
                }
            return set()

        def prob_t_a_given_s_mock(a):
            prob_values = {
                (0, 3, 2): 0.5,
                (0, 2, 2): 0.6,
                (0, 1, 1): 0.4,
                (0, 3, 3): 0.6,
                (0, 4, 4): 0.7,
            }
            return prob_values.get(a.alignment, 0.01)

        ibm_model = IBMModel([])
        # Replace the model's own methods on this instance so the search
        # space and scores are fully controlled by the test.
        ibm_model.neighboring = neighboring_mock
        ibm_model.prob_t_a_given_s = prob_t_a_given_s_mock

        # act
        best_alignment = ibm_model.hillclimb(initial_alignment)

        # assert: hill climbing goes from (0, 3, 2) -> (0, 2, 2) -> (0, 4, 4)
        self.assertEqual(best_alignment.alignment, (0, 4, 4))

    def test_sample(self):
        """Sampling returns the expected number of alignments."""
        # arrange
        sentence_pair = AlignedSent(
            TestIBMModel.__TEST_TRG_SENTENCE, TestIBMModel.__TEST_SRC_SENTENCE
        )
        ibm_model = IBMModel([])
        # Constant probability: no alignment scores better than another.
        ibm_model.prob_t_a_given_s = lambda x: 0.001

        # act: the best alignment is returned too but is not checked here
        samples, _ = ibm_model.sample(sentence_pair)

        # assert
        self.assertEqual(len(samples), 61)