Spaces:

sunnychenxiwang
/

EasyDetect

Sleeping

App Files Files Community

EasyDetect / pipeline /nltk /translate /ibm1.py

sunnychenxiwang

update nltk

d916065 over 1 year ago

raw

history blame

9.52 kB

	# Natural Language Toolkit: IBM Model 1
	#
	# Copyright (C) 2001-2013 NLTK Project
	# Author: Chin Yee Lee <[email protected]>
	# Hengfeng Li <[email protected]>
	# Ruxin Hou <[email protected]>
	# Calvin Tanujaya Lim <[email protected]>
	# Based on earlier version by:
	# Will Zhang <[email protected]>
	# Guan Gui <[email protected]>
	# URL: <https://www.nltk.org/>
	# For license information, see LICENSE.TXT

	"""
	Lexical translation model that ignores word order.

	In IBM Model 1, word order is ignored for simplicity. As long as the
	word alignments are equivalent, it doesn't matter where the word occurs
	in the source or target sentence. Thus, the following three alignments
	are equally likely::

	Source: je mange du jambon
	Target: i eat some ham
	Alignment: (0,0) (1,1) (2,2) (3,3)

	Source: je mange du jambon
	Target: some ham eat i
	Alignment: (0,2) (1,3) (2,1) (3,1)

	Source: du jambon je mange
	Target: eat i some ham
	Alignment: (0,3) (1,2) (2,0) (3,1)

	Note that an alignment is represented here as
	(word_index_in_target, word_index_in_source).

	The EM algorithm used in Model 1 is:

	:E step: In the training data, count how many times a source language
	word is translated into a target language word, weighted by
	the prior probability of the translation.

	:M step: Estimate the new probability of translation based on the
	counts from the Expectation step.

	Notations
	---------

	:i: Position in the source sentence
	Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
	:j: Position in the target sentence
	Valid values are 1, 2, ..., length of target sentence
	:s: A word in the source language
	:t: A word in the target language

	References
	----------

	Philipp Koehn. 2010. Statistical Machine Translation.
	Cambridge University Press, New York.

	Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
	Robert L. Mercer. 1993. The Mathematics of Statistical Machine
	Translation: Parameter Estimation. Computational Linguistics, 19 (2),
	263-311.
	"""

	import warnings
	from collections import defaultdict

	from nltk.translate import AlignedSent, Alignment, IBMModel
	from nltk.translate.ibm_model import Counts


	class IBMModel1(IBMModel):
	"""
	Lexical translation model that ignores word order

	>>> bitext = []
	>>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
	>>> bitext.append(AlignedSent(['das', 'haus', 'ist', 'ja', 'groß'], ['the', 'house', 'is', 'big']))
	>>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
	>>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
	>>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
	>>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))

	>>> ibm1 = IBMModel1(bitext, 5)

	>>> print(round(ibm1.translation_table['buch']['book'], 3))
	0.889
	>>> print(round(ibm1.translation_table['das']['book'], 3))
	0.062
	>>> print(round(ibm1.translation_table['buch'][None], 3))
	0.113
	>>> print(round(ibm1.translation_table['ja'][None], 3))
	0.073

	>>> test_sentence = bitext[2]
	>>> test_sentence.words
	['das', 'buch', 'ist', 'ja', 'klein']
	>>> test_sentence.mots
	['the', 'book', 'is', 'small']
	>>> test_sentence.alignment
	Alignment([(0, 0), (1, 1), (2, 2), (3, 2), (4, 3)])

	"""

	def __init__(self, sentence_aligned_corpus, iterations, probability_tables=None):
	"""
	Train on ``sentence_aligned_corpus`` and create a lexical
	translation model.

	Translation direction is from ``AlignedSent.mots`` to
	``AlignedSent.words``.

	:param sentence_aligned_corpus: Sentence-aligned parallel corpus
	:type sentence_aligned_corpus: list(AlignedSent)

	:param iterations: Number of iterations to run training algorithm
	:type iterations: int

	:param probability_tables: Optional. Use this to pass in custom
	probability values. If not specified, probabilities will be
	set to a uniform distribution, or some other sensible value.
	If specified, the following entry must be present:
	``translation_table``.
	See ``IBMModel`` for the type and purpose of this table.
	:type probability_tables: dict[str]: object
	"""
	super().__init__(sentence_aligned_corpus)

	if probability_tables is None:
	self.set_uniform_probabilities(sentence_aligned_corpus)
	else:
	# Set user-defined probabilities
	self.translation_table = probability_tables["translation_table"]

	for n in range(0, iterations):
	self.train(sentence_aligned_corpus)

	self.align_all(sentence_aligned_corpus)

	def set_uniform_probabilities(self, sentence_aligned_corpus):
	initial_prob = 1 / len(self.trg_vocab)
	if initial_prob < IBMModel.MIN_PROB:
	warnings.warn(
	"Target language vocabulary is too large ("
	+ str(len(self.trg_vocab))
	+ " words). "
	"Results may be less accurate."
	)

	for t in self.trg_vocab:
	self.translation_table[t] = defaultdict(lambda: initial_prob)

	def train(self, parallel_corpus):
	counts = Counts()
	for aligned_sentence in parallel_corpus:
	trg_sentence = aligned_sentence.words
	src_sentence = [None] + aligned_sentence.mots

	# E step (a): Compute normalization factors to weigh counts
	total_count = self.prob_all_alignments(src_sentence, trg_sentence)

	# E step (b): Collect counts
	for t in trg_sentence:
	for s in src_sentence:
	count = self.prob_alignment_point(s, t)
	normalized_count = count / total_count[t]
	counts.t_given_s[t][s] += normalized_count
	counts.any_t_given_s[s] += normalized_count

	# M step: Update probabilities with maximum likelihood estimate
	self.maximize_lexical_translation_probabilities(counts)

	def prob_all_alignments(self, src_sentence, trg_sentence):
	"""
	Computes the probability of all possible word alignments,
	expressed as a marginal distribution over target words t

	Each entry in the return value represents the contribution to
	the total alignment probability by the target word t.

	To obtain probability(alignment \| src_sentence, trg_sentence),
	simply sum the entries in the return value.

	:return: Probability of t for all s in ``src_sentence``
	:rtype: dict(str): float
	"""
	alignment_prob_for_t = defaultdict(lambda: 0.0)
	for t in trg_sentence:
	for s in src_sentence:
	alignment_prob_for_t[t] += self.prob_alignment_point(s, t)
	return alignment_prob_for_t

	def prob_alignment_point(self, s, t):
	"""
	Probability that word ``t`` in the target sentence is aligned to
	word ``s`` in the source sentence
	"""
	return self.translation_table[t][s]

	def prob_t_a_given_s(self, alignment_info):
	"""
	Probability of target sentence and an alignment given the
	source sentence
	"""
	prob = 1.0

	for j, i in enumerate(alignment_info.alignment):
	if j == 0:
	continue # skip the dummy zeroeth element
	trg_word = alignment_info.trg_sentence[j]
	src_word = alignment_info.src_sentence[i]
	prob *= self.translation_table[trg_word][src_word]

	return max(prob, IBMModel.MIN_PROB)

	def align_all(self, parallel_corpus):
	for sentence_pair in parallel_corpus:
	self.align(sentence_pair)

	def align(self, sentence_pair):
	"""
	Determines the best word alignment for one sentence pair from
	the corpus that the model was trained on.

	The best alignment will be set in ``sentence_pair`` when the
	method returns. In contrast with the internal implementation of
	IBM models, the word indices in the ``Alignment`` are zero-
	indexed, not one-indexed.

	:param sentence_pair: A sentence in the source language and its
	counterpart sentence in the target language
	:type sentence_pair: AlignedSent
	"""
	best_alignment = []

	for j, trg_word in enumerate(sentence_pair.words):
	# Initialize trg_word to align with the NULL token
	best_prob = max(self.translation_table[trg_word][None], IBMModel.MIN_PROB)
	best_alignment_point = None
	for i, src_word in enumerate(sentence_pair.mots):
	align_prob = self.translation_table[trg_word][src_word]
	if align_prob >= best_prob: # prefer newer word in case of tie
	best_prob = align_prob
	best_alignment_point = i

	best_alignment.append((j, best_alignment_point))

	sentence_pair.alignment = Alignment(best_alignment)