.. Copyright (C) 2001-2023 NLTK Project
.. For license information, see LICENSE.TXT
========== | |
Chunking | |
========== | |
>>> from nltk.chunk import * | |
>>> from nltk.chunk.util import * | |
>>> from nltk.chunk.regexp import * | |
>>> from nltk import Tree | |
>>> tagged_text = "[ The/DT cat/NN ] sat/VBD on/IN [ the/DT mat/NN ] [ the/DT dog/NN ] chewed/VBD ./." | |
>>> gold_chunked_text = tagstr2tree(tagged_text) | |
>>> unchunked_text = gold_chunked_text.flatten() | |
Chunking uses a special regexp syntax for rules that delimit the chunks. These | |
rules must be converted to 'regular' regular expressions before a sentence can | |
be chunked. | |
>>> tag_pattern = "<DT>?<JJ>*<NN.*>" | |
>>> regexp_pattern = tag_pattern2re_pattern(tag_pattern) | |
>>> regexp_pattern | |
'(<(DT)>)?(<(JJ)>)*(<(NN[^\\{\\}<>]*)>)' | |
Construct some new chunking rules. | |
>>> chunk_rule = ChunkRule(r"<.*>+", "Chunk everything") | |
>>> strip_rule = StripRule(r"<VBD|IN|\.>", "Strip on verbs/prepositions") | |
>>> split_rule = SplitRule("<DT><NN>", "<DT><NN>", | |
... "Split successive determiner/noun pairs") | |
Create and score a series of chunk parsers, successively more complex. | |
>>> chunk_parser = RegexpChunkParser([chunk_rule], chunk_label='NP') | |
>>> chunked_text = chunk_parser.parse(unchunked_text) | |
>>> print(chunked_text) | |
(S | |
(NP | |
The/DT | |
cat/NN | |
sat/VBD | |
on/IN | |
the/DT | |
mat/NN | |
the/DT | |
dog/NN | |
chewed/VBD | |
./.)) | |
>>> chunkscore = ChunkScore() | |
>>> chunkscore.score(gold_chunked_text, chunked_text) | |
>>> print(chunkscore.precision()) | |
0.0 | |
>>> print(chunkscore.recall()) | |
0.0 | |
>>> print(chunkscore.f_measure()) | |
0 | |
>>> for chunk in sorted(chunkscore.missed()): print(chunk) | |
(NP The/DT cat/NN) | |
(NP the/DT dog/NN) | |
(NP the/DT mat/NN) | |
>>> for chunk in chunkscore.incorrect(): print(chunk) | |
(NP | |
The/DT | |
cat/NN | |
sat/VBD | |
on/IN | |
the/DT | |
mat/NN | |
the/DT | |
dog/NN | |
chewed/VBD | |
./.) | |
>>> chunk_parser = RegexpChunkParser([chunk_rule, strip_rule], | |
... chunk_label='NP') | |
>>> chunked_text = chunk_parser.parse(unchunked_text) | |
>>> print(chunked_text) | |
(S | |
(NP The/DT cat/NN) | |
sat/VBD | |
on/IN | |
(NP the/DT mat/NN the/DT dog/NN) | |
chewed/VBD | |
./.) | |
>>> assert chunked_text == chunk_parser.parse(list(unchunked_text)) | |
>>> chunkscore = ChunkScore() | |
>>> chunkscore.score(gold_chunked_text, chunked_text) | |
>>> chunkscore.precision() | |
0.5 | |
>>> print(chunkscore.recall()) | |
0.33333333... | |
>>> print(chunkscore.f_measure()) | |
0.4 | |
>>> for chunk in sorted(chunkscore.missed()): print(chunk) | |
(NP the/DT dog/NN) | |
(NP the/DT mat/NN) | |
>>> for chunk in chunkscore.incorrect(): print(chunk) | |
(NP the/DT mat/NN the/DT dog/NN) | |
>>> chunk_parser = RegexpChunkParser([chunk_rule, strip_rule, split_rule], | |
... chunk_label='NP') | |
>>> chunked_text = chunk_parser.parse(unchunked_text, trace=True) | |
# Input: | |
<DT> <NN> <VBD> <IN> <DT> <NN> <DT> <NN> <VBD> <.> | |
# Chunk everything: | |
{<DT> <NN> <VBD> <IN> <DT> <NN> <DT> <NN> <VBD> <.>} | |
# Strip on verbs/prepositions: | |
{<DT> <NN>} <VBD> <IN> {<DT> <NN> <DT> <NN>} <VBD> <.> | |
# Split successive determiner/noun pairs: | |
{<DT> <NN>} <VBD> <IN> {<DT> <NN>}{<DT> <NN>} <VBD> <.> | |
>>> print(chunked_text) | |
(S | |
(NP The/DT cat/NN) | |
sat/VBD | |
on/IN | |
(NP the/DT mat/NN) | |
(NP the/DT dog/NN) | |
chewed/VBD | |
./.) | |
>>> chunkscore = ChunkScore() | |
>>> chunkscore.score(gold_chunked_text, chunked_text) | |
>>> chunkscore.precision() | |
1.0 | |
>>> chunkscore.recall() | |
1.0 | |
>>> chunkscore.f_measure() | |
1.0 | |
>>> chunkscore.missed() | |
[] | |
>>> chunkscore.incorrect() | |
[] | |
>>> chunk_parser.rules() | |
[<ChunkRule: '<.*>+'>, <StripRule: '<VBD|IN|\\.>'>, | |
<SplitRule: '<DT><NN>', '<DT><NN>'>] | |
Printing parsers: | |
>>> print(repr(chunk_parser)) | |
<RegexpChunkParser with 3 rules> | |
>>> print(chunk_parser) | |
RegexpChunkParser with 3 rules: | |
Chunk everything | |
<ChunkRule: '<.*>+'> | |
Strip on verbs/prepositions | |
<StripRule: '<VBD|IN|\\.>'> | |
Split successive determiner/noun pairs | |
<SplitRule: '<DT><NN>', '<DT><NN>'> | |
Regression Tests | |
~~~~~~~~~~~~~~~~ | |
ChunkParserI | |
------------ | |
`ChunkParserI` is an abstract interface -- it is not meant to be | |
instantiated directly. | |
>>> ChunkParserI().parse([]) | |
Traceback (most recent call last): | |
. . . | |
NotImplementedError | |
ChunkString | |
----------- | |
A ChunkString can be built from a tree of tagged tuples, a tree of
trees, or a tree whose children are a mixture of both:
>>> t1 = Tree('S', [('w%d' % i, 't%d' % i) for i in range(10)]) | |
>>> t2 = Tree('S', [Tree('t0', []), Tree('t1', ['c1'])]) | |
>>> t3 = Tree('S', [('w0', 't0'), Tree('t1', ['c1'])]) | |
>>> ChunkString(t1) | |
<ChunkString: '<t0><t1><t2><t3><t4><t5><t6><t7><t8><t9>'> | |
>>> ChunkString(t2) | |
<ChunkString: '<t0><t1>'> | |
>>> ChunkString(t3) | |
<ChunkString: '<t0><t1>'> | |
Other values generate an error: | |
>>> ChunkString(Tree('S', ['x'])) | |
Traceback (most recent call last): | |
. . . | |
ValueError: chunk structures must contain tagged tokens or trees | |
Calling `str()` on a chunk string inserts padding spaces, so that the
result lines up with the `str()` output of other chunk strings over the
same underlying input.
>>> cs = ChunkString(t1) | |
>>> print(cs) | |
<t0> <t1> <t2> <t3> <t4> <t5> <t6> <t7> <t8> <t9> | |
>>> cs.xform('<t3>', '{<t3>}') | |
>>> print(cs) | |
<t0> <t1> <t2> {<t3>} <t4> <t5> <t6> <t7> <t8> <t9> | |
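The `_verify()` method makes sure that our transforms don't corrupt
the chunk string. Setting `debug_level` to 2 or higher causes
`_verify()` to be called at the end of every call to `xform`. The
example below uses `debug_level=3`, which additionally checks that the
tags themselves have not been changed by the transformation.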
>>> cs = ChunkString(t1, debug_level=3) | |
>>> # tag not marked with <...>: | |
>>> cs.xform('<t3>', 't3') | |
Traceback (most recent call last): | |
. . . | |
ValueError: Transformation generated invalid chunkstring: | |
<t0><t1><t2>t3<t4><t5><t6><t7><t8><t9> | |
>>> # brackets not balanced: | |
>>> cs.xform('<t3>', '{<t3>') | |
Traceback (most recent call last): | |
. . . | |
ValueError: Transformation generated invalid chunkstring: | |
<t0><t1><t2>{<t3><t4><t5><t6><t7><t8><t9> | |
>>> # nested brackets: | |
>>> cs.xform('<t3><t4><t5>', '{<t3>{<t4>}<t5>}') | |
Traceback (most recent call last): | |
. . . | |
ValueError: Transformation generated invalid chunkstring: | |
<t0><t1><t2>{<t3>{<t4>}<t5>}<t6><t7><t8><t9> | |
>>> # modified tags: | |
>>> cs.xform('<t3>', '<t9>') | |
Traceback (most recent call last): | |
. . . | |
ValueError: Transformation generated invalid chunkstring: tag changed | |
>>> # added tags: | |
>>> cs.xform('<t9>', '<t9><t10>') | |
Traceback (most recent call last): | |
. . . | |
ValueError: Transformation generated invalid chunkstring: tag changed | |
Chunking Rules | |
-------------- | |
Test the different rule constructors & __repr__ methods: | |
>>> r1 = RegexpChunkRule('<a|b>'+ChunkString.IN_STRIP_PATTERN, | |
... '{<a|b>}', 'chunk <a> and <b>') | |
>>> r2 = RegexpChunkRule(re.compile('<a|b>'+ChunkString.IN_STRIP_PATTERN), | |
... '{<a|b>}', 'chunk <a> and <b>') | |
>>> r3 = ChunkRule('<a|b>', 'chunk <a> and <b>') | |
>>> r4 = StripRule('<a|b>', 'strip <a> and <b>') | |
>>> r5 = UnChunkRule('<a|b>', 'unchunk <a> and <b>') | |
>>> r6 = MergeRule('<a>', '<b>', 'merge <a> w/ <b>') | |
>>> r7 = SplitRule('<a>', '<b>', 'split <a> from <b>') | |
>>> r8 = ExpandLeftRule('<a>', '<b>', 'expand left <a> <b>') | |
>>> r9 = ExpandRightRule('<a>', '<b>', 'expand right <a> <b>') | |
>>> for rule in r1, r2, r3, r4, r5, r6, r7, r8, r9: | |
... print(rule) | |
<RegexpChunkRule: '<a|b>(?=[^\\}]*(\\{|$))'->'{<a|b>}'> | |
<RegexpChunkRule: '<a|b>(?=[^\\}]*(\\{|$))'->'{<a|b>}'> | |
<ChunkRule: '<a|b>'> | |
<StripRule: '<a|b>'> | |
<UnChunkRule: '<a|b>'> | |
<MergeRule: '<a>', '<b>'> | |
<SplitRule: '<a>', '<b>'> | |
<ExpandLeftRule: '<a>', '<b>'> | |
<ExpandRightRule: '<a>', '<b>'> | |
`tag_pattern2re_pattern()` complains if the tag pattern looks problematic: | |
>>> tag_pattern2re_pattern('{}') | |
Traceback (most recent call last): | |
. . . | |
ValueError: Bad tag pattern: '{}' | |
RegexpChunkParser | |
----------------- | |
A warning is printed when parsing an empty sentence: | |
>>> parser = RegexpChunkParser([ChunkRule('<a>', '')]) | |
>>> parser.parse(Tree('S', [])) | |
Warning: parsing empty text | |
Tree('S', []) | |
RegexpParser | |
------------ | |
>>> parser = RegexpParser(''' | |
... NP: {<DT>? <JJ>* <NN>*} # NP | |
... P: {<IN>} # Preposition | |
... V: {<V.*>} # Verb | |
... PP: {<P> <NP>} # PP -> P NP | |
... VP: {<V> <NP|PP>*} # VP -> V (NP|PP)* | |
... ''') | |
>>> print(repr(parser)) | |
<chunk.RegexpParser with 5 stages> | |
>>> print(parser) | |
chunk.RegexpParser with 5 stages: | |
RegexpChunkParser with 1 rules: | |
NP <ChunkRule: '<DT>? <JJ>* <NN>*'> | |
RegexpChunkParser with 1 rules: | |
Preposition <ChunkRule: '<IN>'> | |
RegexpChunkParser with 1 rules: | |
Verb <ChunkRule: '<V.*>'> | |
RegexpChunkParser with 1 rules: | |
PP -> P NP <ChunkRule: '<P> <NP>'> | |
RegexpChunkParser with 1 rules: | |
VP -> V (NP|PP)* <ChunkRule: '<V> <NP|PP>*'> | |
>>> print(parser.parse(unchunked_text, trace=True)) | |
# Input: | |
<DT> <NN> <VBD> <IN> <DT> <NN> <DT> <NN> <VBD> <.> | |
# NP: | |
{<DT> <NN>} <VBD> <IN> {<DT> <NN>}{<DT> <NN>} <VBD> <.> | |
# Input: | |
<NP> <VBD> <IN> <NP> <NP> <VBD> <.> | |
# Preposition: | |
<NP> <VBD> {<IN>} <NP> <NP> <VBD> <.> | |
# Input: | |
<NP> <VBD> <P> <NP> <NP> <VBD> <.> | |
# Verb: | |
<NP> {<VBD>} <P> <NP> <NP> {<VBD>} <.> | |
# Input: | |
<NP> <V> <P> <NP> <NP> <V> <.> | |
# PP -> P NP: | |
<NP> <V> {<P> <NP>} <NP> <V> <.> | |
# Input: | |
<NP> <V> <PP> <NP> <V> <.> | |
# VP -> V (NP|PP)*: | |
<NP> {<V> <PP> <NP>}{<V>} <.> | |
(S | |
(NP The/DT cat/NN) | |
(VP | |
(V sat/VBD) | |
(PP (P on/IN) (NP the/DT mat/NN)) | |
(NP the/DT dog/NN)) | |
(VP (V chewed/VBD)) | |
./.) | |
Test parsing of other rule types: | |
>>> print(RegexpParser(''' | |
... X: | |
... }<a><b>{ # strip rule | |
... <a>}{<b> # split rule | |
... <a>{}<b> # merge rule | |
... <a>{<b>}<c> # chunk rule w/ context | |
... ''')) | |
chunk.RegexpParser with 1 stages: | |
RegexpChunkParser with 4 rules: | |
strip rule <StripRule: '<a><b>'> | |
split rule <SplitRule: '<a>', '<b>'> | |
merge rule <MergeRule: '<a>', '<b>'> | |
chunk rule w/ context <ChunkRuleWithContext: '<a>', '<b>', '<c>'> | |
Illegal patterns give an error message: | |
>>> print(RegexpParser('X: {<foo>} {<bar>}')) | |
Traceback (most recent call last): | |
. . . | |
ValueError: Illegal chunk pattern: {<foo>} {<bar>} | |