Spaces:
Sleeping
Sleeping
# Natural Language Toolkit: IBM Model 1 | |
# | |
# Copyright (C) 2001-2013 NLTK Project | |
# Author: Chin Yee Lee <[email protected]> | |
# Hengfeng Li <[email protected]> | |
# Ruxin Hou <[email protected]> | |
# Calvin Tanujaya Lim <[email protected]> | |
# Based on earlier version by: | |
# Will Zhang <[email protected]> | |
# Guan Gui <[email protected]> | |
# URL: <https://www.nltk.org/> | |
# For license information, see LICENSE.TXT | |
""" | |
Lexical translation model that ignores word order. | |
In IBM Model 1, word order is ignored for simplicity. As long as the | |
word alignments are equivalent, it doesn't matter where the word occurs | |
in the source or target sentence. Thus, the following three alignments | |
are equally likely:: | |
Source: je mange du jambon | |
Target: i eat some ham | |
Alignment: (0,0) (1,1) (2,2) (3,3) | |
Source: je mange du jambon | |
Target: some ham eat i | |
Alignment: (0,2) (1,3) (2,1) (3,1) | |
Source: du jambon je mange | |
Target: eat i some ham | |
Alignment: (0,3) (1,2) (2,0) (3,1) | |
Note that an alignment is represented here as | |
(word_index_in_target, word_index_in_source). | |
The EM algorithm used in Model 1 is: | |
:E step: In the training data, count how many times a source language | |
word is translated into a target language word, weighted by | |
the prior probability of the translation. | |
:M step: Estimate the new probability of translation based on the | |
counts from the Expectation step. | |
Notations | |
--------- | |
:i: Position in the source sentence | |
Valid values are 0 (for NULL), 1, 2, ..., length of source sentence | |
:j: Position in the target sentence | |
Valid values are 1, 2, ..., length of target sentence | |
:s: A word in the source language | |
:t: A word in the target language | |
References | |
---------- | |
Philipp Koehn. 2010. Statistical Machine Translation. | |
Cambridge University Press, New York. | |
Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and | |
Robert L. Mercer. 1993. The Mathematics of Statistical Machine | |
Translation: Parameter Estimation. Computational Linguistics, 19 (2), | |
263-311. | |
""" | |
import warnings | |
from collections import defaultdict | |
from nltk.translate import AlignedSent, Alignment, IBMModel | |
from nltk.translate.ibm_model import Counts | |
class IBMModel1(IBMModel): | |
""" | |
Lexical translation model that ignores word order | |
>>> bitext = [] | |
>>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small'])) | |
>>> bitext.append(AlignedSent(['das', 'haus', 'ist', 'ja', 'groß'], ['the', 'house', 'is', 'big'])) | |
>>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small'])) | |
>>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house'])) | |
>>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book'])) | |
>>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book'])) | |
>>> ibm1 = IBMModel1(bitext, 5) | |
>>> print(round(ibm1.translation_table['buch']['book'], 3)) | |
0.889 | |
>>> print(round(ibm1.translation_table['das']['book'], 3)) | |
0.062 | |
>>> print(round(ibm1.translation_table['buch'][None], 3)) | |
0.113 | |
>>> print(round(ibm1.translation_table['ja'][None], 3)) | |
0.073 | |
>>> test_sentence = bitext[2] | |
>>> test_sentence.words | |
['das', 'buch', 'ist', 'ja', 'klein'] | |
>>> test_sentence.mots | |
['the', 'book', 'is', 'small'] | |
>>> test_sentence.alignment | |
Alignment([(0, 0), (1, 1), (2, 2), (3, 2), (4, 3)]) | |
""" | |
def __init__(self, sentence_aligned_corpus, iterations, probability_tables=None): | |
""" | |
Train on ``sentence_aligned_corpus`` and create a lexical | |
translation model. | |
Translation direction is from ``AlignedSent.mots`` to | |
``AlignedSent.words``. | |
:param sentence_aligned_corpus: Sentence-aligned parallel corpus | |
:type sentence_aligned_corpus: list(AlignedSent) | |
:param iterations: Number of iterations to run training algorithm | |
:type iterations: int | |
:param probability_tables: Optional. Use this to pass in custom | |
probability values. If not specified, probabilities will be | |
set to a uniform distribution, or some other sensible value. | |
If specified, the following entry must be present: | |
``translation_table``. | |
See ``IBMModel`` for the type and purpose of this table. | |
:type probability_tables: dict[str]: object | |
""" | |
super().__init__(sentence_aligned_corpus) | |
if probability_tables is None: | |
self.set_uniform_probabilities(sentence_aligned_corpus) | |
else: | |
# Set user-defined probabilities | |
self.translation_table = probability_tables["translation_table"] | |
for n in range(0, iterations): | |
self.train(sentence_aligned_corpus) | |
self.align_all(sentence_aligned_corpus) | |
def set_uniform_probabilities(self, sentence_aligned_corpus): | |
initial_prob = 1 / len(self.trg_vocab) | |
if initial_prob < IBMModel.MIN_PROB: | |
warnings.warn( | |
"Target language vocabulary is too large (" | |
+ str(len(self.trg_vocab)) | |
+ " words). " | |
"Results may be less accurate." | |
) | |
for t in self.trg_vocab: | |
self.translation_table[t] = defaultdict(lambda: initial_prob) | |
def train(self, parallel_corpus): | |
counts = Counts() | |
for aligned_sentence in parallel_corpus: | |
trg_sentence = aligned_sentence.words | |
src_sentence = [None] + aligned_sentence.mots | |
# E step (a): Compute normalization factors to weigh counts | |
total_count = self.prob_all_alignments(src_sentence, trg_sentence) | |
# E step (b): Collect counts | |
for t in trg_sentence: | |
for s in src_sentence: | |
count = self.prob_alignment_point(s, t) | |
normalized_count = count / total_count[t] | |
counts.t_given_s[t][s] += normalized_count | |
counts.any_t_given_s[s] += normalized_count | |
# M step: Update probabilities with maximum likelihood estimate | |
self.maximize_lexical_translation_probabilities(counts) | |
def prob_all_alignments(self, src_sentence, trg_sentence): | |
""" | |
Computes the probability of all possible word alignments, | |
expressed as a marginal distribution over target words t | |
Each entry in the return value represents the contribution to | |
the total alignment probability by the target word t. | |
To obtain probability(alignment | src_sentence, trg_sentence), | |
simply sum the entries in the return value. | |
:return: Probability of t for all s in ``src_sentence`` | |
:rtype: dict(str): float | |
""" | |
alignment_prob_for_t = defaultdict(lambda: 0.0) | |
for t in trg_sentence: | |
for s in src_sentence: | |
alignment_prob_for_t[t] += self.prob_alignment_point(s, t) | |
return alignment_prob_for_t | |
def prob_alignment_point(self, s, t): | |
""" | |
Probability that word ``t`` in the target sentence is aligned to | |
word ``s`` in the source sentence | |
""" | |
return self.translation_table[t][s] | |
def prob_t_a_given_s(self, alignment_info): | |
""" | |
Probability of target sentence and an alignment given the | |
source sentence | |
""" | |
prob = 1.0 | |
for j, i in enumerate(alignment_info.alignment): | |
if j == 0: | |
continue # skip the dummy zeroeth element | |
trg_word = alignment_info.trg_sentence[j] | |
src_word = alignment_info.src_sentence[i] | |
prob *= self.translation_table[trg_word][src_word] | |
return max(prob, IBMModel.MIN_PROB) | |
def align_all(self, parallel_corpus): | |
for sentence_pair in parallel_corpus: | |
self.align(sentence_pair) | |
def align(self, sentence_pair): | |
""" | |
Determines the best word alignment for one sentence pair from | |
the corpus that the model was trained on. | |
The best alignment will be set in ``sentence_pair`` when the | |
method returns. In contrast with the internal implementation of | |
IBM models, the word indices in the ``Alignment`` are zero- | |
indexed, not one-indexed. | |
:param sentence_pair: A sentence in the source language and its | |
counterpart sentence in the target language | |
:type sentence_pair: AlignedSent | |
""" | |
best_alignment = [] | |
for j, trg_word in enumerate(sentence_pair.words): | |
# Initialize trg_word to align with the NULL token | |
best_prob = max(self.translation_table[trg_word][None], IBMModel.MIN_PROB) | |
best_alignment_point = None | |
for i, src_word in enumerate(sentence_pair.mots): | |
align_prob = self.translation_table[trg_word][src_word] | |
if align_prob >= best_prob: # prefer newer word in case of tie | |
best_prob = align_prob | |
best_alignment_point = i | |
best_alignment.append((j, best_alignment_point)) | |
sentence_pair.alignment = Alignment(best_alignment) | |