# Natural Language Toolkit: CONLL Corpus Reader
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Steven Bird <[email protected]>
# Edward Loper <[email protected]>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Read CoNLL-style chunk files.
"""

import textwrap

from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.tag import map_tag
from nltk.tree import Tree
from nltk.util import LazyConcatenation, LazyMap


class ConllCorpusReader(CorpusReader):
"""
A corpus reader for CoNLL-style files. These files consist of a
series of sentences, separated by blank lines. Each sentence is
encoded using a table (or "grid") of values, where each line
corresponds to a single word, and each column corresponds to an
annotation type. The set of columns used by CoNLL-style files can
vary from corpus to corpus; the ``ConllCorpusReader`` constructor
therefore takes an argument, ``columntypes``, which is used to
    specify the columns that are used by a given corpus. By default,
    columns are split on runs of whitespace; use the ``separator``
    argument to set an explicit delimiter string (e.g. ``'\t'``).
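
    A minimal usage sketch; the corpus root and fileids below are
    illustrative only::

        >>> from nltk.corpus.reader import ConllCorpusReader
        >>> reader = ConllCorpusReader(
        ...     'corpora/conll2000', ['train.txt'],
        ...     columntypes=('words', 'pos', 'chunk'))
        >>> reader.tagged_sents()  # doctest: +SKIP
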
@todo: Add support for reading from corpora where different
parallel files contain different columns.
@todo: Possibly add caching of the grid corpus view? This would
allow the same grid view to be used by different data access
methods (eg words() and parsed_sents() could both share the
same grid corpus view object).
@todo: Better support for -DOCSTART-. Currently, we just ignore
it, but it could be used to define methods that retrieve a
document at a time (eg parsed_documents()).
"""

    # /////////////////////////////////////////////////////////////////
# Column Types
# /////////////////////////////////////////////////////////////////
WORDS = "words" #: column type for words
POS = "pos" #: column type for part-of-speech tags
TREE = "tree" #: column type for parse trees
CHUNK = "chunk" #: column type for chunk structures
NE = "ne" #: column type for named entities
SRL = "srl" #: column type for semantic role labels
IGNORE = "ignore" #: column type for column that should be ignored
#: A list of all column types supported by the conll corpus reader.
COLUMN_TYPES = (WORDS, POS, TREE, CHUNK, NE, SRL, IGNORE)

    # /////////////////////////////////////////////////////////////////
# Constructor
# /////////////////////////////////////////////////////////////////
def __init__(
self,
root,
fileids,
columntypes,
chunk_types=None,
root_label="S",
pos_in_tree=False,
srl_includes_roleset=True,
encoding="utf8",
tree_class=Tree,
tagset=None,
separator=None,
):
for columntype in columntypes:
if columntype not in self.COLUMN_TYPES:
raise ValueError("Bad column type %r" % columntype)
if isinstance(chunk_types, str):
chunk_types = [chunk_types]
self._chunk_types = chunk_types
self._colmap = {c: i for (i, c) in enumerate(columntypes)}
self._pos_in_tree = pos_in_tree
self._root_label = root_label # for chunks
self._srl_includes_roleset = srl_includes_roleset
self._tree_class = tree_class
CorpusReader.__init__(self, root, fileids, encoding)
self._tagset = tagset
self.sep = separator

    # /////////////////////////////////////////////////////////////////
# Data Access Methods
# /////////////////////////////////////////////////////////////////
def words(self, fileids=None):
self._require(self.WORDS)
return LazyConcatenation(LazyMap(self._get_words, self._grids(fileids)))
def sents(self, fileids=None):
self._require(self.WORDS)
return LazyMap(self._get_words, self._grids(fileids))
def tagged_words(self, fileids=None, tagset=None):
self._require(self.WORDS, self.POS)
def get_tagged_words(grid):
return self._get_tagged_words(grid, tagset)
return LazyConcatenation(LazyMap(get_tagged_words, self._grids(fileids)))
def tagged_sents(self, fileids=None, tagset=None):
self._require(self.WORDS, self.POS)
def get_tagged_words(grid):
return self._get_tagged_words(grid, tagset)
return LazyMap(get_tagged_words, self._grids(fileids))
def chunked_words(self, fileids=None, chunk_types=None, tagset=None):
self._require(self.WORDS, self.POS, self.CHUNK)
if chunk_types is None:
chunk_types = self._chunk_types
def get_chunked_words(grid): # capture chunk_types as local var
return self._get_chunked_words(grid, chunk_types, tagset)
return LazyConcatenation(LazyMap(get_chunked_words, self._grids(fileids)))
def chunked_sents(self, fileids=None, chunk_types=None, tagset=None):
self._require(self.WORDS, self.POS, self.CHUNK)
if chunk_types is None:
chunk_types = self._chunk_types
def get_chunked_words(grid): # capture chunk_types as local var
return self._get_chunked_words(grid, chunk_types, tagset)
return LazyMap(get_chunked_words, self._grids(fileids))
def parsed_sents(self, fileids=None, pos_in_tree=None, tagset=None):
self._require(self.WORDS, self.POS, self.TREE)
if pos_in_tree is None:
pos_in_tree = self._pos_in_tree
def get_parsed_sent(grid): # capture pos_in_tree as local var
return self._get_parsed_sent(grid, pos_in_tree, tagset)
return LazyMap(get_parsed_sent, self._grids(fileids))
def srl_spans(self, fileids=None):
self._require(self.SRL)
return LazyMap(self._get_srl_spans, self._grids(fileids))
def srl_instances(self, fileids=None, pos_in_tree=None, flatten=True):
self._require(self.WORDS, self.POS, self.TREE, self.SRL)
if pos_in_tree is None:
pos_in_tree = self._pos_in_tree
def get_srl_instances(grid): # capture pos_in_tree as local var
return self._get_srl_instances(grid, pos_in_tree)
result = LazyMap(get_srl_instances, self._grids(fileids))
if flatten:
result = LazyConcatenation(result)
return result
def iob_words(self, fileids=None, tagset=None):
"""
:return: a list of word/tag/IOB tuples
:rtype: list(tuple)
:param fileids: the list of fileids that make up this corpus
:type fileids: None or str or list
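
        Each element is a ``(word, pos_tag, iob_tag)`` tuple, e.g.
        ``('Confidence', 'NN', 'B-NP')`` (values illustrative).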
"""
self._require(self.WORDS, self.POS, self.CHUNK)
def get_iob_words(grid):
return self._get_iob_words(grid, tagset)
return LazyConcatenation(LazyMap(get_iob_words, self._grids(fileids)))
def iob_sents(self, fileids=None, tagset=None):
"""
:return: a list of lists of word/tag/IOB tuples
:rtype: list(list)
:param fileids: the list of fileids that make up this corpus
:type fileids: None or str or list
"""
self._require(self.WORDS, self.POS, self.CHUNK)
def get_iob_words(grid):
return self._get_iob_words(grid, tagset)
return LazyMap(get_iob_words, self._grids(fileids))

    # /////////////////////////////////////////////////////////////////
# Grid Reading
# /////////////////////////////////////////////////////////////////
def _grids(self, fileids=None):
# n.b.: we could cache the object returned here (keyed on
# fileids), which would let us reuse the same corpus view for
# different things (eg srl and parse trees).
return concat(
[
StreamBackedCorpusView(fileid, self._read_grid_block, encoding=enc)
for (fileid, enc) in self.abspaths(fileids, True)
]
)
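
    # Each grid is a list of rows, one per token; each row is a list of
    # column values.  For example (values illustrative), the two-line block
    #
    #     Confidence NN B-NP
    #     in IN B-PP
    #
    # is read as [['Confidence', 'NN', 'B-NP'], ['in', 'IN', 'B-PP']].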
def _read_grid_block(self, stream):
grids = []
for block in read_blankline_block(stream):
block = block.strip()
if not block:
continue
grid = [line.split(self.sep) for line in block.split("\n")]
# If there's a docstart row, then discard. ([xx] eventually it
# would be good to actually use it)
if grid[0][self._colmap.get("words", 0)] == "-DOCSTART-":
del grid[0]
# Check that the grid is consistent.
for row in grid:
if len(row) != len(grid[0]):
raise ValueError("Inconsistent number of columns:\n%s" % block)
grids.append(grid)
return grids

    # /////////////////////////////////////////////////////////////////
# Transforms
# /////////////////////////////////////////////////////////////////
# given a grid, transform it into some representation (e.g.,
# a list of words or a parse tree).
def _get_words(self, grid):
return self._get_column(grid, self._colmap["words"])
def _get_tagged_words(self, grid, tagset=None):
pos_tags = self._get_column(grid, self._colmap["pos"])
if tagset and tagset != self._tagset:
pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
return list(zip(self._get_column(grid, self._colmap["words"]), pos_tags))
def _get_iob_words(self, grid, tagset=None):
pos_tags = self._get_column(grid, self._colmap["pos"])
if tagset and tagset != self._tagset:
pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
return list(
zip(
self._get_column(grid, self._colmap["words"]),
pos_tags,
self._get_column(grid, self._colmap["chunk"]),
)
)
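
    # Chunk tags follow the IOB scheme: "B-TYPE" begins a chunk, "I-TYPE"
    # continues one, and "O" marks a token outside any chunk.  For example
    # (values illustrative), the tags B-NP I-NP O over "the cat sat" yield
    # the tree (S (NP the/DT cat/NN) sat/VBD).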
def _get_chunked_words(self, grid, chunk_types, tagset=None):
# n.b.: this method is very similar to conllstr2tree.
words = self._get_column(grid, self._colmap["words"])
pos_tags = self._get_column(grid, self._colmap["pos"])
if tagset and tagset != self._tagset:
pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
chunk_tags = self._get_column(grid, self._colmap["chunk"])
stack = [Tree(self._root_label, [])]
for (word, pos_tag, chunk_tag) in zip(words, pos_tags, chunk_tags):
if chunk_tag == "O":
state, chunk_type = "O", ""
else:
(state, chunk_type) = chunk_tag.split("-")
# If it's a chunk we don't care about, treat it as O.
if chunk_types is not None and chunk_type not in chunk_types:
state = "O"
# Treat a mismatching I like a B.
if state == "I" and chunk_type != stack[-1].label():
state = "B"
            # For B or O: close any open chunks
if state in "BO" and len(stack) == 2:
stack.pop()
# For B: start a new chunk.
if state == "B":
new_chunk = Tree(chunk_type, [])
stack[-1].append(new_chunk)
stack.append(new_chunk)
# Add the word token.
stack[-1].append((word, pos_tag))
return stack[0]
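
    # The "tree" column spreads a Treebank parse across the tokens, with a
    # "*" placeholder standing in for each word.  For example (values
    # illustrative), the tags "(S(NP*" and "*)" for a two-word sentence
    # become "(S (NP (DT the) (NN cat)))" once each "*" is replaced by
    # "(POS word)" and the pieces are concatenated.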
def _get_parsed_sent(self, grid, pos_in_tree, tagset=None):
words = self._get_column(grid, self._colmap["words"])
pos_tags = self._get_column(grid, self._colmap["pos"])
if tagset and tagset != self._tagset:
pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
parse_tags = self._get_column(grid, self._colmap["tree"])
treestr = ""
for (word, pos_tag, parse_tag) in zip(words, pos_tags, parse_tags):
if word == "(":
word = "-LRB-"
if word == ")":
word = "-RRB-"
if pos_tag == "(":
pos_tag = "-LRB-"
if pos_tag == ")":
pos_tag = "-RRB-"
(left, right) = parse_tag.split("*")
right = right.count(")") * ")" # only keep ')'.
treestr += f"{left} ({pos_tag} {word}) {right}"
try:
tree = self._tree_class.fromstring(treestr)
except (ValueError, IndexError):
tree = self._tree_class.fromstring(f"({self._root_label} {treestr})")
if not pos_in_tree:
for subtree in tree.subtrees():
for i, child in enumerate(subtree):
if (
isinstance(child, Tree)
and len(child) == 1
and isinstance(child[0], str)
):
subtree[i] = (child[0], child.label())
return tree
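
    # SRL columns use the same bracketed-star notation, one column per
    # predicate.  For example (values illustrative), the tags "(A0*", "*",
    # "*)" down one column mark an A0 argument covering words 0 through 2.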
def _get_srl_spans(self, grid):
"""
        Return a list of lists of ``((start, end), tag)`` tuples, one
        list per predicate in the sentence.
"""
if self._srl_includes_roleset:
predicates = self._get_column(grid, self._colmap["srl"] + 1)
start_col = self._colmap["srl"] + 2
else:
predicates = self._get_column(grid, self._colmap["srl"])
start_col = self._colmap["srl"] + 1
# Count how many predicates there are. This tells us how many
# columns to expect for SRL data.
num_preds = len([p for p in predicates if p != "-"])
spanlists = []
for i in range(num_preds):
col = self._get_column(grid, start_col + i)
spanlist = []
stack = []
for wordnum, srl_tag in enumerate(col):
(left, right) = srl_tag.split("*")
for tag in left.split("("):
if tag:
stack.append((tag, wordnum))
                for _ in range(right.count(")")):
(tag, start) = stack.pop()
spanlist.append(((start, wordnum + 1), tag))
spanlists.append(spanlist)
return spanlists
def _get_srl_instances(self, grid, pos_in_tree):
tree = self._get_parsed_sent(grid, pos_in_tree)
spanlists = self._get_srl_spans(grid)
if self._srl_includes_roleset:
predicates = self._get_column(grid, self._colmap["srl"] + 1)
rolesets = self._get_column(grid, self._colmap["srl"])
else:
predicates = self._get_column(grid, self._colmap["srl"])
rolesets = [None] * len(predicates)
instances = ConllSRLInstanceList(tree)
for wordnum, predicate in enumerate(predicates):
if predicate == "-":
continue
# Decide which spanlist to use. Don't assume that they're
# sorted in the same order as the predicates (even though
# they usually are).
for spanlist in spanlists:
for (start, end), tag in spanlist:
if wordnum in range(start, end) and tag in ("V", "C-V"):
break
else:
continue
break
else:
raise ValueError("No srl column found for %r" % predicate)
instances.append(
ConllSRLInstance(tree, wordnum, predicate, rolesets[wordnum], spanlist)
)
return instances

    # /////////////////////////////////////////////////////////////////
# Helper Methods
# /////////////////////////////////////////////////////////////////
def _require(self, *columntypes):
for columntype in columntypes:
if columntype not in self._colmap:
                raise ValueError(
                    "This corpus does not contain a %s column." % columntype
                )
@staticmethod
def _get_column(grid, column_index):
return [grid[i][column_index] for i in range(len(grid))]


class ConllSRLInstance:
    """
    An SRL instance from a CoNLL corpus, which identifies and
    provides labels for the arguments of a single verb.
    """
# [xx] add inst.core_arguments, inst.argm_arguments?
def __init__(self, tree, verb_head, verb_stem, roleset, tagged_spans):
self.verb = []
"""A list of the word indices of the words that compose the
verb whose arguments are identified by this instance.
This will contain multiple word indices when multi-word
verbs are used (e.g. 'turn on')."""
self.verb_head = verb_head
"""The word index of the head word of the verb whose arguments
are identified by this instance. E.g., for a sentence that
uses the verb 'turn on,' ``verb_head`` will be the word index
of the word 'turn'."""
self.verb_stem = verb_stem
self.roleset = roleset
self.arguments = []
"""A list of ``(argspan, argid)`` tuples, specifying the location
and type for each of the arguments identified by this
        instance. ``argspan`` is a ``(start, end)`` tuple, indicating
        that the argument consists of ``words[start:end]``."""
self.tagged_spans = tagged_spans
"""A list of ``(span, id)`` tuples, specifying the location and
type for each of the arguments, as well as the verb pieces,
that make up this instance."""
self.tree = tree
"""The parse tree for the sentence containing this instance."""
self.words = tree.leaves()
"""A list of the words in the sentence containing this
instance."""
# Fill in the self.verb and self.arguments values.
for (start, end), tag in tagged_spans:
if tag in ("V", "C-V"):
self.verb += list(range(start, end))
else:
self.arguments.append(((start, end), tag))
def __repr__(self):
        plural = "s" if len(self.arguments) != 1 else ""
        return "<ConllSRLInstance for %r with %d argument%s>" % (
            self.verb_stem,
            len(self.arguments),
            plural,
        )
def pprint(self):
verbstr = " ".join(self.words[i][0] for i in self.verb)
hdr = f"SRL for {verbstr!r} (stem={self.verb_stem!r}):\n"
s = ""
for i, word in enumerate(self.words):
if isinstance(word, tuple):
word = word[0]
            for (start, end), argid in self.arguments:
                if i == start:
                    s += "[%s " % argid
            if i in self.verb:
                word = "<<%s>>" % word
            s += word + " "
            # Spans are end-exclusive, so a span is closed after its last
            # word; this also closes spans that end at the final word.
            for (start, end), argid in self.arguments:
                if i == end - 1:
                    s += "] "
return hdr + textwrap.fill(
s.replace(" ]", "]"), initial_indent=" ", subsequent_indent=" "
)


class ConllSRLInstanceList(list):
    """
    The set of SRL instances for a single sentence.
    """
def __init__(self, tree, instances=()):
self.tree = tree
list.__init__(self, instances)
def __str__(self):
return self.pprint()
def pprint(self, include_tree=False):
# Sanity check: trees should be the same
for inst in self:
if inst.tree != self.tree:
raise ValueError("Tree mismatch!")
# If desired, add trees:
if include_tree:
words = self.tree.leaves()
pos = [None] * len(words)
synt = ["*"] * len(words)
self._tree2conll(self.tree, 0, words, pos, synt)
s = ""
for i in range(len(words)):
# optional tree columns
if include_tree:
s += "%-20s " % words[i]
s += "%-8s " % pos[i]
s += "%15s*%-8s " % tuple(synt[i].split("*"))
# verb head column
for inst in self:
if i == inst.verb_head:
s += "%-20s " % inst.verb_stem
break
else:
s += "%-20s " % "-"
            # Remaining columns: one argument-span column per instance.
for inst in self:
argstr = "*"
for (start, end), argid in inst.tagged_spans:
if i == start:
argstr = f"({argid}{argstr}"
if i == (end - 1):
argstr += ")"
s += "%-12s " % argstr
s += "\n"
return s
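
    # _tree2conll walks the tree left to right, recording each leaf's POS
    # tag in pos[] and building the bracketed-star syntax column in synt[],
    # e.g. (illustrative) "(S(NP*" for a word that opens both an S and an
    # NP constituent.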
def _tree2conll(self, tree, wordnum, words, pos, synt):
assert isinstance(tree, Tree)
if len(tree) == 1 and isinstance(tree[0], str):
pos[wordnum] = tree.label()
assert words[wordnum] == tree[0]
return wordnum + 1
        elif len(tree) == 1 and isinstance(tree[0], tuple):
            assert len(tree[0]) == 2
            # The leaf is a (word, tag) tuple; record each in its column.
            words[wordnum], pos[wordnum] = tree[0]
            return wordnum + 1
else:
synt[wordnum] = f"({tree.label()}{synt[wordnum]}"
for child in tree:
wordnum = self._tree2conll(child, wordnum, words, pos, synt)
synt[wordnum - 1] += ")"
return wordnum


class ConllChunkCorpusReader(ConllCorpusReader):
"""
A ConllCorpusReader whose data file contains three columns: words,
pos, and chunk.
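
    A minimal usage sketch; the corpus root, fileids, and chunk types
    below are illustrative only::

        >>> from nltk.corpus.reader import ConllChunkCorpusReader
        >>> reader = ConllChunkCorpusReader(
        ...     'corpora/conll2000', ['train.txt'], ('NP', 'VP', 'PP'))
        >>> reader.chunked_sents()  # doctest: +SKIP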
"""
def __init__(
self, root, fileids, chunk_types, encoding="utf8", tagset=None, separator=None
):
ConllCorpusReader.__init__(
self,
root,
fileids,
("words", "pos", "chunk"),
chunk_types=chunk_types,
encoding=encoding,
tagset=tagset,
separator=separator,
)