# Natural Language Toolkit: Penn Treebank Reader
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Steven Bird <[email protected]>
# Edward Loper <[email protected]>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Corpus reader for corpora that consist of parenthesis-delineated parse trees.
"""
import sys
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.tag import map_tag
from nltk.tree import Tree
# we use [^\s()]+ instead of \S+? to avoid matching ()
# Matches "(<order> <tag> <word>)" triples, e.g. "(0 nn hond)"; the leading
# integer is a sentence-position index (used to restore word order).
SORTTAGWRD = re.compile(r"\((\d+) ([^\s()]+) ([^\s()]+)\)")
# Matches "(<tag> <word>)" leaves, capturing both the tag and the word.
TAGWORD = re.compile(r"\(([^\s()]+) ([^\s()]+)\)")
# Matches "(<tag> <word>)" leaves, capturing only the word.
WORD = re.compile(r"\([^\s()]+ ([^\s()]+)\)")
# Matches text that opens with two left brackets, e.g. "( (S ..." — a tree
# whose top node has an empty label.  (Not referenced in this chunk;
# presumably used by other readers importing this module.)
EMPTY_BRACKETS = re.compile(r"\s*\(\s*\(")
class BracketParseCorpusReader(SyntaxCorpusReader):
    """
    Reader for corpora that consist of parenthesis-delineated parse trees,
    like those found in the "combined" section of the Penn Treebank,
    e.g. "(S (NP (DT the) (JJ little) (NN dog)) (VP (VBD barked)))".
    """

    def __init__(
        self,
        root,
        fileids,
        comment_char=None,
        detect_blocks="unindented_paren",
        encoding="utf8",
        tagset=None,
    ):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param comment_char: The character which can appear at the start of
            a line to indicate that the rest of the line is a comment.
        :param detect_blocks: The method that is used to find blocks
            in the corpus; can be 'unindented_paren' (every unindented
            parenthesis starts a new parse), 'sexpr' (brackets are
            matched), or 'blankline' (blocks are separated by blank lines).
        :param encoding: The encoding used by the files in this corpus.
        :param tagset: The name of the tagset used by this corpus, to be used
            for normalizing or converting the POS tags returned by the
            ``tagged_...()`` methods.
        """
        SyntaxCorpusReader.__init__(self, root, fileids, encoding)
        self._comment_char = comment_char
        self._detect_blocks = detect_blocks
        self._tagset = tagset

    def _read_block(self, stream):
        """Read one block (one parse) from ``stream``, using the block
        detection strategy chosen at construction time."""
        if self._detect_blocks == "sexpr":
            return read_sexpr_block(stream, comment_char=self._comment_char)
        elif self._detect_blocks == "blankline":
            return read_blankline_block(stream)
        elif self._detect_blocks == "unindented_paren":
            # Tokens start with unindented left parens.
            toks = read_regexp_block(stream, start_re=r"^\(")
            # Strip any comments out of the tokens.
            if self._comment_char:
                toks = [
                    re.sub("(?m)^%s.*" % re.escape(self._comment_char), "", tok)
                    for tok in toks
                ]
            return toks
        else:
            # A bad value can only come from the constructor argument.
            # Raise rather than assert so the check survives `python -O`.
            raise ValueError("bad block type %r" % self._detect_blocks)

    def _normalize(self, t):
        """Canonicalize a raw bracketed string before tree parsing."""
        # Replace leaves of the form (!), (,), with (! !), (, ,)
        t = re.sub(r"\((.)\)", r"(\1 \1)", t)
        # Replace leaves of the form (tag word root) with (tag word)
        t = re.sub(r"\(([^\s()]+) ([^\s()]+) [^\s()]+\)", r"(\1 \2)", t)
        return t

    def _parse(self, t):
        """Parse one block into a ``Tree``, attempting simple repairs
        (extra close parens, then a flat parse) when the block is bad."""
        try:
            tree = Tree.fromstring(self._normalize(t))
            # If there's an empty node at the top, strip it off
            if tree.label() == "" and len(tree) == 1:
                return tree[0]
            else:
                return tree
        except ValueError as e:
            sys.stderr.write("Bad tree detected; trying to recover...\n")
            # Try to recover, if we can:
            # NOTE(review): this args check targets an older Tree parser's
            # message; with current fromstring messages it may never match
            # — confirm against nltk.tree.
            if e.args == ("mismatched parens",):
                for n in range(1, 5):
                    try:
                        # Bug fix: use Tree.fromstring() here.  The old
                        # code called Tree(...) with a single string, which
                        # raises TypeError (the constructor expects a label
                        # and a child list), so this recovery never worked.
                        v = Tree.fromstring(self._normalize(t + ")" * n))
                        sys.stderr.write(
                            " Recovered by adding %d close paren(s)\n" % n
                        )
                        return v
                    except ValueError:
                        pass
            # Try something else:
            sys.stderr.write(" Recovered by returning a flat parse.\n")
            # sys.stderr.write(' '.join(t.split())+'\n')
            return Tree("S", self._tag(t))

    def _tag(self, t, tagset=None):
        """Return the block as a list of ``(word, tag)`` pairs, mapping
        tags to ``tagset`` if it differs from the corpus's own tagset."""
        tagged_sent = [(w, p) for (p, w) in TAGWORD.findall(self._normalize(t))]
        if tagset and tagset != self._tagset:
            tagged_sent = [
                (w, map_tag(self._tagset, tagset, p)) for (w, p) in tagged_sent
            ]
        return tagged_sent

    def _word(self, t):
        """Return the block as a plain list of words."""
        return WORD.findall(self._normalize(t))
class CategorizedBracketParseCorpusReader(
    CategorizedCorpusReader, BracketParseCorpusReader
):
    """
    A reader for parsed corpora whose documents are
    divided into categories based on their file identifiers.

    @author: Nathan Schneider <[email protected]>
    """

    def __init__(self, *args, **kwargs):
        """
        Initialize the corpus reader.  The categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are consumed by
        ``CategorizedCorpusReader.__init__``; everything left over is
        forwarded to ``BracketParseCorpusReader.__init__``.
        """
        CategorizedCorpusReader.__init__(self, kwargs)
        BracketParseCorpusReader.__init__(self, *args, **kwargs)

    # Each accessor below maps (fileids, categories) to a concrete fileid
    # list via _resolve(), then defers to the BracketParseCorpusReader
    # implementation.

    def tagged_words(self, fileids=None, categories=None, tagset=None):
        resolved = self._resolve(fileids, categories)
        return super().tagged_words(resolved, tagset)

    def tagged_sents(self, fileids=None, categories=None, tagset=None):
        resolved = self._resolve(fileids, categories)
        return super().tagged_sents(resolved, tagset)

    def tagged_paras(self, fileids=None, categories=None, tagset=None):
        resolved = self._resolve(fileids, categories)
        return super().tagged_paras(resolved, tagset)

    def parsed_words(self, fileids=None, categories=None):
        resolved = self._resolve(fileids, categories)
        return super().parsed_words(resolved)

    def parsed_sents(self, fileids=None, categories=None):
        resolved = self._resolve(fileids, categories)
        return super().parsed_sents(resolved)

    def parsed_paras(self, fileids=None, categories=None):
        resolved = self._resolve(fileids, categories)
        return super().parsed_paras(resolved)
class AlpinoCorpusReader(BracketParseCorpusReader):
    """
    Reader for the Alpino Dutch Treebank.
    This corpus has a lexical breakdown structure embedded, as read by `_parse`
    Unfortunately this puts punctuation and some other words out of the sentence
    order in the xml element tree. This is no good for `tag_` and `word_`
    `_tag` and `_word` will be overridden to use a non-default new parameter 'ordered'
    to the overridden _normalize function. The _parse function can then remain
    untouched.
    """

    def __init__(self, root, encoding="ISO-8859-1", tagset=None):
        """
        :param root: The root directory for this corpus.
        :param encoding: The encoding of the corpus file (Latin-1 by default).
        :param tagset: The name of the tagset used by this corpus, for the
            ``tagged_...()`` methods.
        """
        BracketParseCorpusReader.__init__(
            self,
            root,
            r"alpino\.xml",
            detect_blocks="blankline",
            encoding=encoding,
            tagset=tagset,
        )

    def _normalize(self, t, ordered=False):
        """Normalize the xml sentence element in t.
        The sentence elements <alpino_ds>, although embedded in a few overall
        xml elements, are separated by blank lines. That's how the reader can
        deliver them one at a time.
        Each sentence has a few category subnodes that are of no use to us.
        The remaining word nodes may or may not appear in the proper order.
        Each word node has attributes, among which:
        - begin : the position of the word in the sentence
        - pos : Part of Speech: the Tag
        - word : the actual word
        The return value is a string with all xml elementes replaced by
        clauses: either a cat clause with nested clauses, or a word clause.
        The order of the bracket clauses closely follows the xml.
        If ordered == True, the word clauses include an order sequence number.
        If ordered == False, the word clauses only have pos and word parts.
        """
        if t[:10] != "<alpino_ds":
            return ""
        # convert XML to sexpr notation
        t = re.sub(r' <node .*? cat="(\w+)".*>', r"(\1", t)
        if ordered:
            # Bug fix: the original pattern read "<node. *?begin", which
            # only matched when "begin" was the node's first attribute;
            # "<node .*?begin" matches it anywhere, consistent with the
            # cat/pos patterns above and below.
            t = re.sub(
                r' <node .*?begin="(\d+)".*? pos="(\w+)".*? word="([^"]+)".*?/>',
                r"(\1 \2 \3)",
                t,
            )
        else:
            t = re.sub(r' <node .*?pos="(\w+)".*? word="([^"]+)".*?/>', r"(\1 \2)", t)
        t = re.sub(r" </node>", r")", t)
        t = re.sub(r"<sentence>.*</sentence>", r"", t)
        t = re.sub(r"</?alpino_ds.*>", r"", t)
        return t

    def _tag(self, t, tagset=None):
        """Return ``(word, tag)`` pairs restored to true sentence order by
        sorting on each word node's ``begin`` position."""
        tagged_sent = [
            (int(o), w, p)
            for (o, p, w) in SORTTAGWRD.findall(self._normalize(t, ordered=True))
        ]
        # Sort on the leading position index to recover sentence order.
        tagged_sent.sort()
        if tagset and tagset != self._tagset:
            tagged_sent = [
                (w, map_tag(self._tagset, tagset, p)) for (o, w, p) in tagged_sent
            ]
        else:
            tagged_sent = [(w, p) for (o, w, p) in tagged_sent]
        return tagged_sent

    def _word(self, t):
        """Return a correctly ordered list of words"""
        tagged_sent = self._tag(t)
        return [w for (w, p) in tagged_sent]