# Natural Language Toolkit: Viterbi Probabilistic Parser
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Edward Loper <[email protected]>
#         Steven Bird <[email protected]>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

from functools import reduce

from nltk.parse.api import ParserI
from nltk.tree import ProbabilisticTree, Tree

##//////////////////////////////////////////////////////
##  Viterbi PCFG Parser
##//////////////////////////////////////////////////////


class ViterbiParser(ParserI):
    """
    A bottom-up ``PCFG`` parser that uses dynamic programming to find
    the single most likely parse for a text. The ``ViterbiParser`` parser
    parses texts by filling in a "most likely constituent table".
    This table records the most probable tree representation for any
    given span and node value. In particular, it has an entry for
    every start index, end index, and node value, recording the most
    likely subtree that spans from the start index to the end index,
    and has the given node value.

    The ``ViterbiParser`` parser fills in this table incrementally. It starts
    by filling in all entries for constituents that span one element
    of text (i.e., entries where the end index is one greater than the
    start index). After it has filled in all table entries for
    constituents that span one element of text, it fills in the
    entries for constituents that span two elements of text. It
    continues filling in the entries for constituents spanning larger
    and larger portions of the text, until the entire table has been
    filled. Finally, it returns the table entry for a constituent
    spanning the entire text, whose node value is the grammar's start
    symbol.

    In order to find the most likely constituent with a given span and
    node value, the ``ViterbiParser`` parser considers all productions that
    could produce that node value. For each production, it finds all
    children that collectively cover the span and have the node values
    specified by the production's right hand side. If the probability
    of the tree formed by applying the production to the children is
    greater than the probability of the current entry in the table,
    then the table is updated with this new tree.

    A pseudo-code description of the algorithm used by
    ``ViterbiParser`` is:

    | Create an empty most likely constituent table, *MLC*.
    | For width in 1...len(text):
    |   For start in 1...len(text)-width:
    |     For prod in grammar.productions:
    |       For each sequence of subtrees [t[1], t[2], ..., t[n]] in MLC,
    |         where t[i].label()==prod.rhs[i],
    |         and the sequence covers [start:start+width]:
    |           old_p = MLC[start, start+width, prod.lhs]
    |           new_p = P(t[1])P(t[2])...P(t[n])P(prod)
    |           if new_p > old_p:
    |             new_tree = Tree(prod.lhs, t[1], t[2], ..., t[n])
    |             MLC[start, start+width, prod.lhs] = new_tree
    | Return MLC[0, len(text), start_symbol]
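
    A minimal usage sketch (this toy grammar is illustrative only,
    not something the parser provides):

        >>> from nltk.grammar import PCFG
        >>> grammar = PCFG.fromstring('''
        ...     S -> NP VP [1.0]
        ...     NP -> 'I' [0.5] | 'Bob' [0.5]
        ...     VP -> V NP [1.0]
        ...     V -> 'saw' [1.0]
        ... ''')
        >>> parser = ViterbiParser(grammar)
        >>> trees = list(parser.parse(['I', 'saw', 'Bob']))
        >>> trees[0].prob()
        0.25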

    :type _grammar: PCFG
    :ivar _grammar: The grammar used to parse sentences.
    :type _trace: int
    :ivar _trace: The level of tracing output that should be generated
        when parsing a text.
    """

    def __init__(self, grammar, trace=0):
        """
        Create a new ``ViterbiParser`` parser, that uses ``grammar`` to
        parse texts.

        :type grammar: PCFG
        :param grammar: The grammar used to parse texts.
        :type trace: int
        :param trace: The level of tracing that should be used when
            parsing a text. ``0`` will generate no tracing output;
            and higher numbers will produce more verbose tracing
            output.
        """
        self._grammar = grammar
        self._trace = trace

    def grammar(self):
        return self._grammar

    def trace(self, trace=2):
        """
        Set the level of tracing output that should be generated when
        parsing a text.

        :type trace: int
        :param trace: The trace level. A trace level of ``0`` will
            generate no tracing output; and higher trace levels will
            produce more verbose tracing output.
        :rtype: None
        """
        self._trace = trace

    def parse(self, tokens):
        # Inherit docs from ParserI

        tokens = list(tokens)
        self._grammar.check_coverage(tokens)

        # The most likely constituent table. This table specifies the
        # most likely constituent for a given span and type.
        # Constituents can be either Trees or tokens. For Trees,
        # the "type" is the Nonterminal for the tree's root node
        # value. For Tokens, the "type" is the token's type.
        # The table is stored as a dictionary, since it is sparse.
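        # For example (a sketch; keys are (start, end, type) triples), a
        # partially filled table might contain entries such as:
        #     constituents[0, 1, 'the'] = 'the'                # a token
        #     constituents[0, 2, Nonterminal('NP')] = <ProbabilisticTree NP>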
        constituents = {}

        # Initialize the constituents dictionary with the words from
        # the text.
        if self._trace:
            print("Inserting tokens into the most likely constituents table...")
        for index in range(len(tokens)):
            token = tokens[index]
            constituents[index, index + 1, token] = token
            if self._trace > 1:
                self._trace_lexical_insertion(token, index, len(tokens))

        # Consider each span of length 1, 2, ..., n; and add any trees
        # that might cover that span to the constituents dictionary.
        for length in range(1, len(tokens) + 1):
            if self._trace:
                print(
                    "Finding the most likely constituents"
                    " spanning %d text elements..." % length
                )
            for start in range(len(tokens) - length + 1):
                span = (start, start + length)
                self._add_constituents_spanning(span, constituents, tokens)

        # Return the tree that spans the entire text and has the
        # grammar's start symbol as its node value.
        tree = constituents.get((0, len(tokens), self._grammar.start()))
        if tree is not None:
            yield tree

    def _add_constituents_spanning(self, span, constituents, tokens):
        """
        Find any constituents that might cover ``span``, and add them
        to the most likely constituents table.

        :rtype: None
        :type span: tuple(int, int)
        :param span: The section of the text for which we are
            trying to find possible constituents. The span is
            specified as a pair of integers, where the first integer
            is the index of the first token that should be included in
            the constituent; and the second integer is the index of
            the first token that should not be included in the
            constituent. I.e., the constituent should cover
            ``text[span[0]:span[1]]``, where ``text`` is the text
            that we are parsing.
        :type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree)
        :param constituents: The most likely constituents table. This
            table records the most probable tree representation for
            any given span and node value. In particular,
            ``constituents(s,e,nv)`` is the most likely
            ``ProbabilisticTree`` that covers ``text[s:e]``
            and has a node value ``nv.symbol()``, where ``text``
            is the text that we are parsing. When
            ``_add_constituents_spanning`` is called, ``constituents``
            should contain all possible constituents that are shorter
            than ``span``.
        :type tokens: list of tokens
        :param tokens: The text we are parsing. This is only used for
            trace output.
        """
        # Since some of the grammar productions may be unary, we need to
        # repeatedly try all of the productions until none of them add any
        # new constituents.
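        # For example, with productions N -> 'dog' and NP -> N, the first
        # pass over the productions builds the N constituent from the
        # token; only a subsequent pass can build NP on top of that N.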
        changed = True
        while changed:
            changed = False

            # Find all instantiations of the grammar productions that
            # cover the span.
            instantiations = self._find_instantiations(span, constituents)

            # For each production instantiation, add a new
            # ProbabilisticTree whose probability is the product
            # of the children's probabilities and the production's
            # probability.
            for (production, children) in instantiations:
                subtrees = [c for c in children if isinstance(c, Tree)]
                p = reduce(lambda pr, t: pr * t.prob(), subtrees, production.prob())
                node = production.lhs().symbol()
                tree = ProbabilisticTree(node, children, prob=p)

                # If it's a new constituent, then add it to the
                # constituents dictionary.
                c = constituents.get((span[0], span[1], production.lhs()))
                if self._trace > 1:
                    if c is None or c != tree:
                        if c is None or c.prob() < tree.prob():
                            print("   Insert:", end=" ")
                        else:
                            print("  Discard:", end=" ")
                        self._trace_production(production, p, span, len(tokens))
                if c is None or c.prob() < tree.prob():
                    constituents[span[0], span[1], production.lhs()] = tree
                    changed = True

    def _find_instantiations(self, span, constituents):
        """
        :return: a list of the production instantiations that cover a
            given span of the text. A "production instantiation" is
            a tuple containing a production and a list of children,
            where the production's right hand side matches the list of
            children; and the children cover ``span``.
        :rtype: list of tuple(Production, list of (ProbabilisticTree or token))
        :type span: tuple(int, int)
        :param span: The section of the text for which we are
            trying to find production instantiations. The span is
            specified as a pair of integers, where the first integer
            is the index of the first token that should be covered by
            the production instantiation; and the second integer is
            the index of the first token that should not be covered by
            the production instantiation.
        :type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree)
        :param constituents: The most likely constituents table. This
            table records the most probable tree representation for
            any given span and node value. See the module
            documentation for more information.
        """
        rv = []
        for production in self._grammar.productions():
            childlists = self._match_rhs(production.rhs(), span, constituents)
            for childlist in childlists:
                rv.append((production, childlist))
        return rv

    def _match_rhs(self, rhs, span, constituents):
        """
        :return: a set of all the lists of children that cover ``span``
            and that match ``rhs``.
        :rtype: list(list(ProbabilisticTree or token))
        :type rhs: list(Nonterminal or any)
        :param rhs: The list specifying what kinds of children need to
            cover ``span``. Each nonterminal in ``rhs`` specifies
            that the corresponding child should be a tree whose node
            value is that nonterminal's symbol. Each terminal in ``rhs``
            specifies that the corresponding child should be a token
            whose type is that terminal.
        :type span: tuple(int, int)
        :param span: The section of the text for which we are
            trying to find child lists. The span is specified as a
            pair of integers, where the first integer is the index of
            the first token that should be covered by the child list;
            and the second integer is the index of the first token
            that should not be covered by the child list.
        :type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree)
        :param constituents: The most likely constituents table. This
            table records the most probable tree representation for
            any given span and node value. See the module
            documentation for more information.
        """
        (start, end) = span

        # Base case
        if start >= end and rhs == ():
            return [[]]
        if start >= end or rhs == ():
            return []

        # Find everything that matches the 1st symbol of the RHS
        childlists = []
        for split in range(start, end + 1):
            l = constituents.get((start, split, rhs[0]))
            if l is not None:
                rights = self._match_rhs(rhs[1:], (split, end), constituents)
                childlists += [[l] + r for r in rights]
        return childlists

    def _trace_production(self, production, p, span, width):
        """
        Print trace output indicating that a given production has been
        applied at a given location.

        :param production: The production that has been applied
        :type production: Production
        :param p: The probability of the tree produced by the production.
        :type p: float
        :param span: The span of the production
        :type span: tuple
        :rtype: None
        """
        output = "|" + "." * span[0]
        output += "=" * (span[1] - span[0])
        output += "." * (width - span[1]) + "| "
        output += "%s" % production
        if self._trace > 2:
            output = f"{output:<40} {p:12.10f} "
        print(output)

    def _trace_lexical_insertion(self, token, index, width):
        output = "   Insert: |" + "." * index + "=" + "." * (width - index - 1) + "| "
        output += f"{token}"
        print(output)

    def __repr__(self):
        return "<ViterbiParser for %r>" % self._grammar


##//////////////////////////////////////////////////////
##  Test Code
##//////////////////////////////////////////////////////


def demo():
    """
    A demonstration of the Viterbi probabilistic parser. The user is
    prompted to select which demo to run; the parser is then run on
    the selected demo, and a summary of the results is displayed.
    """
    import sys
    import time

    from nltk.grammar import PCFG
    from nltk.parse import ViterbiParser

    toy_pcfg1 = PCFG.fromstring(
        """
        S -> NP VP [1.0]
        NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
        Det -> 'the' [0.8] | 'my' [0.2]
        N -> 'man' [0.5] | 'telescope' [0.5]
        VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
        V -> 'ate' [0.35] | 'saw' [0.65]
        PP -> P NP [1.0]
        P -> 'with' [0.61] | 'under' [0.39]
        """
    )

    toy_pcfg2 = PCFG.fromstring(
        """
        S -> NP VP [1.0]
        VP -> V NP [.59]
        VP -> V [.40]
        VP -> VP PP [.01]
        NP -> Det N [.41]
        NP -> Name [.28]
        NP -> NP PP [.31]
        PP -> P NP [1.0]
        V -> 'saw' [.21]
        V -> 'ate' [.51]
        V -> 'ran' [.28]
        N -> 'boy' [.11]
        N -> 'cookie' [.12]
        N -> 'table' [.13]
        N -> 'telescope' [.14]
        N -> 'hill' [.5]
        Name -> 'Jack' [.52]
        Name -> 'Bob' [.48]
        P -> 'with' [.61]
        P -> 'under' [.39]
        Det -> 'the' [.41]
        Det -> 'a' [.31]
        Det -> 'my' [.28]
        """
    )

    # Define two demos. Each demo has a sentence and a grammar.
    demos = [
        ("I saw the man with my telescope", toy_pcfg1),
        ("the boy saw Jack with Bob under the table with a telescope", toy_pcfg2),
    ]

    # Ask the user which demo they want to use.
    print()
    for i in range(len(demos)):
        print(f"{i + 1:>3}: {demos[i][0]}")
        print("     %r" % demos[i][1])
        print()
    print("Which demo (%d-%d)? " % (1, len(demos)), end=" ")
    try:
        snum = int(sys.stdin.readline().strip()) - 1
        sent, grammar = demos[snum]
    except (ValueError, IndexError):
        print("Bad sentence number")
        return

    # Tokenize the sentence.
    tokens = sent.split()

    parser = ViterbiParser(grammar)
    all_parses = {}

    print(f"\nsent: {sent}\nparser: {parser}\ngrammar: {grammar}")
    parser.trace(3)
    t = time.time()
    parses = parser.parse_all(tokens)
    elapsed = time.time() - t
    average = (
        reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses) if parses else 0
    )
    num_parses = len(parses)
    for p in parses:
        all_parses[p.freeze()] = 1

    # Print some summary statistics
    print()
    print("Time (secs)   # Parses   Average P(parse)")
    print("-----------------------------------------")
    print("%11.4f%11d%19.14f" % (elapsed, num_parses, average))
    parses = all_parses.keys()
    if parses:
        p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
    else:
        p = 0
    print("-----------------------------------------")
    print("%11s%11d%19.14f" % ("n/a", len(parses), p))

    # Ask the user if we should draw the parses.
    print()
    print("Draw parses (y/n)? ", end=" ")
    if sys.stdin.readline().strip().lower().startswith("y"):
        from nltk.draw.tree import draw_trees

        print("  please wait...")
        draw_trees(*parses)

    # Ask the user if we should print the parses.
    print()
    print("Print parses (y/n)? ", end=" ")
    if sys.stdin.readline().strip().lower().startswith("y"):
        for parse in parses:
            print(parse)


if __name__ == "__main__":
    demo()