Spaces:
Sleeping
Sleeping
# Natural Language Toolkit: Shift-Reduce Parser | |
# | |
# Copyright (C) 2001-2023 NLTK Project | |
# Author: Edward Loper <[email protected]> | |
# Steven Bird <[email protected]> | |
# URL: <https://www.nltk.org/> | |
# For license information, see LICENSE.TXT | |
from nltk.grammar import Nonterminal | |
from nltk.parse.api import ParserI | |
from nltk.tree import Tree | |
##////////////////////////////////////////////////////// | |
## Shift/Reduce Parser | |
##////////////////////////////////////////////////////// | |
class ShiftReduceParser(ParserI): | |
""" | |
A simple bottom-up CFG parser that uses two operations, "shift" | |
and "reduce", to find a single parse for a text. | |
``ShiftReduceParser`` maintains a stack, which records the | |
structure of a portion of the text. This stack is a list of | |
strings and Trees that collectively cover a portion of | |
the text. For example, while parsing the sentence "the dog saw | |
the man" with a typical grammar, ``ShiftReduceParser`` will produce | |
the following stack, which covers "the dog saw":: | |
[(NP: (Det: 'the') (N: 'dog')), (V: 'saw')] | |
``ShiftReduceParser`` attempts to extend the stack to cover the | |
entire text, and to combine the stack elements into a single tree, | |
producing a complete parse for the sentence. | |
Initially, the stack is empty. It is extended to cover the text, | |
from left to right, by repeatedly applying two operations: | |
- "shift" moves a token from the beginning of the text to the | |
end of the stack. | |
- "reduce" uses a CFG production to combine the rightmost stack | |
elements into a single Tree. | |
Often, more than one operation can be performed on a given stack. | |
In this case, ``ShiftReduceParser`` uses the following heuristics | |
to decide which operation to perform: | |
- Only shift if no reductions are available. | |
- If multiple reductions are available, then apply the reduction | |
whose CFG production is listed earliest in the grammar. | |
Note that these heuristics are not guaranteed to choose an | |
operation that leads to a parse of the text. Also, if multiple | |
parses exists, ``ShiftReduceParser`` will return at most one of | |
them. | |
:see: ``nltk.grammar`` | |
""" | |
def __init__(self, grammar, trace=0): | |
""" | |
Create a new ``ShiftReduceParser``, that uses ``grammar`` to | |
parse texts. | |
:type grammar: Grammar | |
:param grammar: The grammar used to parse texts. | |
:type trace: int | |
:param trace: The level of tracing that should be used when | |
parsing a text. ``0`` will generate no tracing output; | |
and higher numbers will produce more verbose tracing | |
output. | |
""" | |
self._grammar = grammar | |
self._trace = trace | |
self._check_grammar() | |
def grammar(self): | |
return self._grammar | |
def parse(self, tokens): | |
tokens = list(tokens) | |
self._grammar.check_coverage(tokens) | |
# initialize the stack. | |
stack = [] | |
remaining_text = tokens | |
# Trace output. | |
if self._trace: | |
print("Parsing %r" % " ".join(tokens)) | |
self._trace_stack(stack, remaining_text) | |
# iterate through the text, pushing the token onto | |
# the stack, then reducing the stack. | |
while len(remaining_text) > 0: | |
self._shift(stack, remaining_text) | |
while self._reduce(stack, remaining_text): | |
pass | |
# Did we reduce everything? | |
if len(stack) == 1: | |
# Did we end up with the right category? | |
if stack[0].label() == self._grammar.start().symbol(): | |
yield stack[0] | |
def _shift(self, stack, remaining_text): | |
""" | |
Move a token from the beginning of ``remaining_text`` to the | |
end of ``stack``. | |
:type stack: list(str and Tree) | |
:param stack: A list of strings and Trees, encoding | |
the structure of the text that has been parsed so far. | |
:type remaining_text: list(str) | |
:param remaining_text: The portion of the text that is not yet | |
covered by ``stack``. | |
:rtype: None | |
""" | |
stack.append(remaining_text[0]) | |
remaining_text.remove(remaining_text[0]) | |
if self._trace: | |
self._trace_shift(stack, remaining_text) | |
def _match_rhs(self, rhs, rightmost_stack): | |
""" | |
:rtype: bool | |
:return: true if the right hand side of a CFG production | |
matches the rightmost elements of the stack. ``rhs`` | |
matches ``rightmost_stack`` if they are the same length, | |
and each element of ``rhs`` matches the corresponding | |
element of ``rightmost_stack``. A nonterminal element of | |
``rhs`` matches any Tree whose node value is equal | |
to the nonterminal's symbol. A terminal element of ``rhs`` | |
matches any string whose type is equal to the terminal. | |
:type rhs: list(terminal and Nonterminal) | |
:param rhs: The right hand side of a CFG production. | |
:type rightmost_stack: list(string and Tree) | |
:param rightmost_stack: The rightmost elements of the parser's | |
stack. | |
""" | |
if len(rightmost_stack) != len(rhs): | |
return False | |
for i in range(len(rightmost_stack)): | |
if isinstance(rightmost_stack[i], Tree): | |
if not isinstance(rhs[i], Nonterminal): | |
return False | |
if rightmost_stack[i].label() != rhs[i].symbol(): | |
return False | |
else: | |
if isinstance(rhs[i], Nonterminal): | |
return False | |
if rightmost_stack[i] != rhs[i]: | |
return False | |
return True | |
def _reduce(self, stack, remaining_text, production=None): | |
""" | |
Find a CFG production whose right hand side matches the | |
rightmost stack elements; and combine those stack elements | |
into a single Tree, with the node specified by the | |
production's left-hand side. If more than one CFG production | |
matches the stack, then use the production that is listed | |
earliest in the grammar. The new Tree replaces the | |
elements in the stack. | |
:rtype: Production or None | |
:return: If a reduction is performed, then return the CFG | |
production that the reduction is based on; otherwise, | |
return false. | |
:type stack: list(string and Tree) | |
:param stack: A list of strings and Trees, encoding | |
the structure of the text that has been parsed so far. | |
:type remaining_text: list(str) | |
:param remaining_text: The portion of the text that is not yet | |
covered by ``stack``. | |
""" | |
if production is None: | |
productions = self._grammar.productions() | |
else: | |
productions = [production] | |
# Try each production, in order. | |
for production in productions: | |
rhslen = len(production.rhs()) | |
# check if the RHS of a production matches the top of the stack | |
if self._match_rhs(production.rhs(), stack[-rhslen:]): | |
# combine the tree to reflect the reduction | |
tree = Tree(production.lhs().symbol(), stack[-rhslen:]) | |
stack[-rhslen:] = [tree] | |
# We reduced something | |
if self._trace: | |
self._trace_reduce(stack, production, remaining_text) | |
return production | |
# We didn't reduce anything | |
return None | |
def trace(self, trace=2): | |
""" | |
Set the level of tracing output that should be generated when | |
parsing a text. | |
:type trace: int | |
:param trace: The trace level. A trace level of ``0`` will | |
generate no tracing output; and higher trace levels will | |
produce more verbose tracing output. | |
:rtype: None | |
""" | |
# 1: just show shifts. | |
# 2: show shifts & reduces | |
# 3: display which tokens & productions are shifed/reduced | |
self._trace = trace | |
def _trace_stack(self, stack, remaining_text, marker=" "): | |
""" | |
Print trace output displaying the given stack and text. | |
:rtype: None | |
:param marker: A character that is printed to the left of the | |
stack. This is used with trace level 2 to print 'S' | |
before shifted stacks and 'R' before reduced stacks. | |
""" | |
s = " " + marker + " [ " | |
for elt in stack: | |
if isinstance(elt, Tree): | |
s += repr(Nonterminal(elt.label())) + " " | |
else: | |
s += repr(elt) + " " | |
s += "* " + " ".join(remaining_text) + "]" | |
print(s) | |
def _trace_shift(self, stack, remaining_text): | |
""" | |
Print trace output displaying that a token has been shifted. | |
:rtype: None | |
""" | |
if self._trace > 2: | |
print("Shift %r:" % stack[-1]) | |
if self._trace == 2: | |
self._trace_stack(stack, remaining_text, "S") | |
elif self._trace > 0: | |
self._trace_stack(stack, remaining_text) | |
def _trace_reduce(self, stack, production, remaining_text): | |
""" | |
Print trace output displaying that ``production`` was used to | |
reduce ``stack``. | |
:rtype: None | |
""" | |
if self._trace > 2: | |
rhs = " ".join(production.rhs()) | |
print(f"Reduce {production.lhs()!r} <- {rhs}") | |
if self._trace == 2: | |
self._trace_stack(stack, remaining_text, "R") | |
elif self._trace > 1: | |
self._trace_stack(stack, remaining_text) | |
def _check_grammar(self): | |
""" | |
Check to make sure that all of the CFG productions are | |
potentially useful. If any productions can never be used, | |
then print a warning. | |
:rtype: None | |
""" | |
productions = self._grammar.productions() | |
# Any production whose RHS is an extension of another production's RHS | |
# will never be used. | |
for i in range(len(productions)): | |
for j in range(i + 1, len(productions)): | |
rhs1 = productions[i].rhs() | |
rhs2 = productions[j].rhs() | |
if rhs1[: len(rhs2)] == rhs2: | |
print("Warning: %r will never be used" % productions[i]) | |
##////////////////////////////////////////////////////// | |
## Stepping Shift/Reduce Parser | |
##////////////////////////////////////////////////////// | |
class SteppingShiftReduceParser(ShiftReduceParser): | |
""" | |
A ``ShiftReduceParser`` that allows you to setp through the parsing | |
process, performing a single operation at a time. It also allows | |
you to change the parser's grammar midway through parsing a text. | |
The ``initialize`` method is used to start parsing a text. | |
``shift`` performs a single shift operation, and ``reduce`` performs | |
a single reduce operation. ``step`` will perform a single reduce | |
operation if possible; otherwise, it will perform a single shift | |
operation. ``parses`` returns the set of parses that have been | |
found by the parser. | |
:ivar _history: A list of ``(stack, remaining_text)`` pairs, | |
containing all of the previous states of the parser. This | |
history is used to implement the ``undo`` operation. | |
:see: ``nltk.grammar`` | |
""" | |
def __init__(self, grammar, trace=0): | |
super().__init__(grammar, trace) | |
self._stack = None | |
self._remaining_text = None | |
self._history = [] | |
def parse(self, tokens): | |
tokens = list(tokens) | |
self.initialize(tokens) | |
while self.step(): | |
pass | |
return self.parses() | |
def stack(self): | |
""" | |
:return: The parser's stack. | |
:rtype: list(str and Tree) | |
""" | |
return self._stack | |
def remaining_text(self): | |
""" | |
:return: The portion of the text that is not yet covered by the | |
stack. | |
:rtype: list(str) | |
""" | |
return self._remaining_text | |
def initialize(self, tokens): | |
""" | |
Start parsing a given text. This sets the parser's stack to | |
``[]`` and sets its remaining text to ``tokens``. | |
""" | |
self._stack = [] | |
self._remaining_text = tokens | |
self._history = [] | |
def step(self): | |
""" | |
Perform a single parsing operation. If a reduction is | |
possible, then perform that reduction, and return the | |
production that it is based on. Otherwise, if a shift is | |
possible, then perform it, and return True. Otherwise, | |
return False. | |
:return: False if no operation was performed; True if a shift was | |
performed; and the CFG production used to reduce if a | |
reduction was performed. | |
:rtype: Production or bool | |
""" | |
return self.reduce() or self.shift() | |
def shift(self): | |
""" | |
Move a token from the beginning of the remaining text to the | |
end of the stack. If there are no more tokens in the | |
remaining text, then do nothing. | |
:return: True if the shift operation was successful. | |
:rtype: bool | |
""" | |
if len(self._remaining_text) == 0: | |
return False | |
self._history.append((self._stack[:], self._remaining_text[:])) | |
self._shift(self._stack, self._remaining_text) | |
return True | |
def reduce(self, production=None): | |
""" | |
Use ``production`` to combine the rightmost stack elements into | |
a single Tree. If ``production`` does not match the | |
rightmost stack elements, then do nothing. | |
:return: The production used to reduce the stack, if a | |
reduction was performed. If no reduction was performed, | |
return None. | |
:rtype: Production or None | |
""" | |
self._history.append((self._stack[:], self._remaining_text[:])) | |
return_val = self._reduce(self._stack, self._remaining_text, production) | |
if not return_val: | |
self._history.pop() | |
return return_val | |
def undo(self): | |
""" | |
Return the parser to its state before the most recent | |
shift or reduce operation. Calling ``undo`` repeatedly return | |
the parser to successively earlier states. If no shift or | |
reduce operations have been performed, ``undo`` will make no | |
changes. | |
:return: true if an operation was successfully undone. | |
:rtype: bool | |
""" | |
if len(self._history) == 0: | |
return False | |
(self._stack, self._remaining_text) = self._history.pop() | |
return True | |
def reducible_productions(self): | |
""" | |
:return: A list of the productions for which reductions are | |
available for the current parser state. | |
:rtype: list(Production) | |
""" | |
productions = [] | |
for production in self._grammar.productions(): | |
rhslen = len(production.rhs()) | |
if self._match_rhs(production.rhs(), self._stack[-rhslen:]): | |
productions.append(production) | |
return productions | |
def parses(self): | |
""" | |
:return: An iterator of the parses that have been found by this | |
parser so far. | |
:rtype: iter(Tree) | |
""" | |
if ( | |
len(self._remaining_text) == 0 | |
and len(self._stack) == 1 | |
and self._stack[0].label() == self._grammar.start().symbol() | |
): | |
yield self._stack[0] | |
# copied from nltk.parser | |
def set_grammar(self, grammar): | |
""" | |
Change the grammar used to parse texts. | |
:param grammar: The new grammar. | |
:type grammar: CFG | |
""" | |
self._grammar = grammar | |
##////////////////////////////////////////////////////// | |
## Demonstration Code | |
##////////////////////////////////////////////////////// | |
def demo(): | |
""" | |
A demonstration of the shift-reduce parser. | |
""" | |
from nltk import CFG, parse | |
grammar = CFG.fromstring( | |
""" | |
S -> NP VP | |
NP -> Det N | Det N PP | |
VP -> V NP | V NP PP | |
PP -> P NP | |
NP -> 'I' | |
N -> 'man' | 'park' | 'telescope' | 'dog' | |
Det -> 'the' | 'a' | |
P -> 'in' | 'with' | |
V -> 'saw' | |
""" | |
) | |
sent = "I saw a man in the park".split() | |
parser = parse.ShiftReduceParser(grammar, trace=2) | |
for p in parser.parse(sent): | |
print(p) | |
if __name__ == "__main__": | |
demo() | |