# Natural Language Toolkit: SemCor Corpus Reader
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Nathan Schneider <[email protected]>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Corpus reader for the SemCor Corpus.
"""

__docformat__ = "epytext en"

from nltk.corpus.reader.api import *
from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView
from nltk.tree import Tree


class SemcorCorpusReader(XMLCorpusReader):
    """
    Corpus reader for the SemCor Corpus.
    For access to the complete XML data structure, use the ``xml()``
    method. For access to simple word lists and tagged word lists, use
    ``words()``, ``chunks()``, ``sents()``, ``chunk_sents()``,
    ``tagged_chunks()``, and ``tagged_sents()``.
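
    A brief usage sketch (hedged: it assumes the SemCor data has been
    installed, e.g. via ``nltk.download('semcor')``, so the lines are
    marked to be skipped by doctest):

        >>> from nltk.corpus import semcor                # doctest: +SKIP
        >>> flat_words = semcor.words()                   # doctest: +SKIP
        >>> sense_trees = semcor.tagged_sents(tag='sem')  # doctest: +SKIP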
""" | |
def __init__(self, root, fileids, wordnet, lazy=True): | |
XMLCorpusReader.__init__(self, root, fileids) | |
self._lazy = lazy | |
self._wordnet = wordnet | |

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
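
        A sketch (hedged: assumes the SemCor data is installed, and the
        fileid shown is only an example of the ``brown1/tagfiles/...`` layout):

            >>> from nltk.corpus import semcor                   # doctest: +SKIP
            >>> ws = semcor.words('brown1/tagfiles/br-a01.xml')  # doctest: +SKIP
            >>> isinstance(ws[0], str)                           # doctest: +SKIP
            True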
""" | |
return self._items(fileids, "word", False, False, False) | |

    def chunks(self, fileids=None):
        """
        :return: the given file(s) as a list of chunks,
            each of which is a list of words and punctuation symbols
            that form a unit.
        :rtype: list(list(str))
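
        Sketch (same assumptions as in ``words()``; each chunk is a list of
        token strings):

            >>> from nltk.corpus import semcor                        # doctest: +SKIP
            >>> ch = semcor.chunks('brown1/tagfiles/br-a01.xml')[0]   # doctest: +SKIP
            >>> isinstance(ch, list) and isinstance(ch[0], str)       # doctest: +SKIP
            True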
""" | |
return self._items(fileids, "chunk", False, False, False) | |

    def tagged_chunks(self, fileids=None, tag="pos"):
        """
        :return: the given file(s) as a list of tagged chunks, represented
            in tree form.
        :rtype: list(Tree)
        :param tag: `'pos'` (part of speech, the default), `'sem'` (semantic),
            or `'both'` to indicate the kind of tags to include. Semantic tags
            consist of WordNet lemma IDs, plus an `'NE'` node if the chunk is a
            named entity without a specific entry in WordNet. (Named entities
            of type 'other' have no lemma. Other chunks not in WordNet have no
            semantic tag. Punctuation tokens have `None` for their part of
            speech tag.)
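
        Sketch (hedged: assumes both the SemCor and WordNet data are
        installed; with ``tag='both'`` every chunk comes back as a ``Tree``):

            >>> from nltk.corpus import semcor            # doctest: +SKIP
            >>> from nltk.tree import Tree                # doctest: +SKIP
            >>> t = semcor.tagged_chunks(tag='both')[0]   # doctest: +SKIP
            >>> isinstance(t, Tree)                       # doctest: +SKIP
            True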
""" | |
return self._items(fileids, "chunk", False, tag != "sem", tag != "pos") | |

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of sentences, each encoded
            as a list of word strings.
        :rtype: list(list(str))
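
        Sketch (same assumptions as in ``words()``):

            >>> from nltk.corpus import semcor                         # doctest: +SKIP
            >>> sent = semcor.sents('brown1/tagfiles/br-a01.xml')[0]   # doctest: +SKIP
            >>> isinstance(sent, list) and isinstance(sent[0], str)    # doctest: +SKIP
            True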
""" | |
return self._items(fileids, "word", True, False, False) | |

    def chunk_sents(self, fileids=None):
        """
        :return: the given file(s) as a list of sentences, each encoded
            as a list of chunks.
        :rtype: list(list(list(str)))
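
        Sketch (same assumptions as in ``words()``; each sentence is a list
        of chunks, each chunk a list of token strings):

            >>> from nltk.corpus import semcor                               # doctest: +SKIP
            >>> sent = semcor.chunk_sents('brown1/tagfiles/br-a01.xml')[0]   # doctest: +SKIP
            >>> isinstance(sent[0], list)                                    # doctest: +SKIP
            True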
""" | |
return self._items(fileids, "chunk", True, False, False) | |

    def tagged_sents(self, fileids=None, tag="pos"):
        """
        :return: the given file(s) as a list of sentences. Each sentence
            is represented as a list of tagged chunks (in tree form).
        :rtype: list(list(Tree))
        :param tag: `'pos'` (part of speech, the default), `'sem'` (semantic),
            or `'both'` to indicate the kind of tags to include. Semantic tags
            consist of WordNet lemma IDs, plus an `'NE'` node if the chunk is a
            named entity without a specific entry in WordNet. (Named entities
            of type 'other' have no lemma. Other chunks not in WordNet have no
            semantic tag. Punctuation tokens have `None` for their part of
            speech tag.)
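
        Sketch (hedged: assumes the SemCor and WordNet data are installed;
        with ``tag='sem'``, WordNet-mapped chunks are labelled with
        ``wordnet.Lemma`` objects where the lookup succeeds):

            >>> from nltk.corpus import semcor              # doctest: +SKIP
            >>> sent = semcor.tagged_sents(tag='sem')[0]    # doctest: +SKIP
            >>> isinstance(sent, list)                      # doctest: +SKIP
            True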
""" | |
return self._items(fileids, "chunk", True, tag != "sem", tag != "pos") | |

    def _items(self, fileids, unit, bracket_sent, pos_tag, sem_tag):
        if unit == "word" and not bracket_sent:
            # the result of the SemcorWordView may be a multiword unit, so the
            # LazyConcatenation will make sure the sentence is flattened
            _ = lambda *args: LazyConcatenation(
                (SemcorWordView if self._lazy else self._words)(*args)
            )
        else:
            _ = SemcorWordView if self._lazy else self._words
        return concat(
            [
                _(fileid, unit, bracket_sent, pos_tag, sem_tag, self._wordnet)
                for fileid in self.abspaths(fileids)
            ]
        )

    def _words(self, fileid, unit, bracket_sent, pos_tag, sem_tag):
        """
        Helper used to implement the view methods -- returns a list of
        tokens, (segmented) words, chunks, or sentences. The tokens
        and chunks may optionally be tagged (with POS and sense
        information).

        :param fileid: The name of the underlying file.
        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param bracket_sent: If true, include sentence bracketing.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags, namely WordNet lemma
            and OOV named entity status.
        """
        assert unit in ("token", "word", "chunk")
        result = []
        xmldoc = ElementTree.parse(fileid).getroot()
        for xmlsent in xmldoc.findall(".//s"):
            sent = []
            for xmlword in _all_xmlwords_in(xmlsent):
                itm = SemcorCorpusReader._word(
                    xmlword, unit, pos_tag, sem_tag, self._wordnet
                )
                if unit == "word":
                    sent.extend(itm)
                else:
                    sent.append(itm)
            if bracket_sent:
                result.append(SemcorSentence(xmlsent.attrib["snum"], sent))
            else:
                result.extend(sent)
        assert None not in result
        return result
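
    # What _word() returns for a single <wf>/<punc> element, by argument
    # combination (the tokens shown are illustrative only):
    #   unit='token', no tags      -> 'reached'                     (plain string)
    #   unit='token', with tags    -> ('reached', 'VB', 'reach', 'v', '2', False)
    #   unit='word'                -> ['reached']   (multiwords split on '_')
    #   unit='chunk', pos_tag only -> Tree('VB', ['reached'])
    #   unit='chunk', with sem_tag -> Tree(<Lemma or 'reach.v.02'>,
    #                                      [Tree('VB', ['reached'])]);
    #       OOV named entities get an extra 'NE' node, and 'other' NEs are
    #       returned as a bare Tree('NE', ...).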
    @staticmethod
    def _word(xmlword, unit, pos_tag, sem_tag, wordnet):
        tkn = xmlword.text
        if not tkn:
            tkn = ""  # fixes issue 337?

        lemma = xmlword.get("lemma", tkn)  # lemma or NE class
        lexsn = xmlword.get("lexsn")  # lex_sense (locator for the lemma's sense)
        if lexsn is not None:
            sense_key = lemma + "%" + lexsn
            wnpos = ("n", "v", "a", "r", "s")[
                int(lexsn.split(":")[0]) - 1
            ]  # see http://wordnet.princeton.edu/man/senseidx.5WN.html
        else:
            sense_key = wnpos = None

        # A redefinition ("rdf") indicates that the lookup string does not
        # exactly match the enclosed string, e.g. due to typographical
        # adjustments or discontinuity of a multiword expression. If a
        # redefinition has occurred, the "rdf" attribute holds its inflected
        # form and "lemma" holds its lemma. For NEs, "rdf", "lemma", and "pn"
        # all hold the same value (the NE class).
        redef = xmlword.get("rdf", tkn)

        sensenum = xmlword.get("wnsn")  # WordNet sense number
        isOOVEntity = "pn" in xmlword.keys()  # a "personal name" (NE) not in WordNet
        # part of speech for the whole chunk (None for punctuation)
        pos = xmlword.get("pos")
if unit == "token": | |
if not pos_tag and not sem_tag: | |
itm = tkn | |
else: | |
itm = ( | |
(tkn,) | |
+ ((pos,) if pos_tag else ()) | |
+ ((lemma, wnpos, sensenum, isOOVEntity) if sem_tag else ()) | |
) | |
return itm | |
else: | |
ww = tkn.split("_") # TODO: case where punctuation intervenes in MWE | |
if unit == "word": | |
return ww | |
else: | |
if sensenum is not None: | |
try: | |
sense = wordnet.lemma_from_key(sense_key) # Lemma object | |
except Exception: | |
# cannot retrieve the wordnet.Lemma object. possible reasons: | |
# (a) the wordnet corpus is not downloaded; | |
# (b) a nonexistent sense is annotated: e.g., such.s.00 triggers: | |
# nltk.corpus.reader.wordnet.WordNetError: No synset found for key u'such%5:00:01:specified:00' | |
# solution: just use the lemma name as a string | |
try: | |
sense = "%s.%s.%02d" % ( | |
lemma, | |
wnpos, | |
int(sensenum), | |
) # e.g.: reach.v.02 | |
except ValueError: | |
sense = ( | |
lemma + "." + wnpos + "." + sensenum | |
) # e.g. the sense number may be "2;1" | |
bottom = [Tree(pos, ww)] if pos_tag else ww | |
if sem_tag and isOOVEntity: | |
if sensenum is not None: | |
return Tree(sense, [Tree("NE", bottom)]) | |
else: # 'other' NE | |
return Tree("NE", bottom) | |
elif sem_tag and sensenum is not None: | |
return Tree(sense, bottom) | |
elif pos_tag: | |
return bottom[0] | |
else: | |
return bottom # chunk as a list | |
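

# Recursively collect every <wf> (word form) and <punc> (punctuation) element
# below ``elt``, in document order, descending through any wrapper elements
# in between.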
def _all_xmlwords_in(elt, result=None):
    if result is None:
        result = []
    for child in elt:
        if child.tag in ("wf", "punc"):
            result.append(child)
        else:
            _all_xmlwords_in(child, result)
    return result


class SemcorSentence(list):
    """
    A list of words, augmented by an attribute ``num`` used to record
    the sentence identifier (the ``snum`` attribute from the XML).
    """

    def __init__(self, num, items):
        self.num = num
        list.__init__(self, items)


class SemcorWordView(XMLCorpusView):
    """
    A stream-backed corpus view specialized for use with the SemCor corpus.
    """

    def __init__(self, fileid, unit, bracket_sent, pos_tag, sem_tag, wordnet):
        """
        :param fileid: The name of the underlying file.
        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param bracket_sent: If true, include sentence bracketing.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags, namely WordNet lemma
            and OOV named entity status.
        """
        if bracket_sent:
            tagspec = ".*/s"
        else:
            tagspec = ".*/s/(punc|wf)"

        self._unit = unit
        self._sent = bracket_sent
        self._pos_tag = pos_tag
        self._sem_tag = sem_tag
        self._wordnet = wordnet

        XMLCorpusView.__init__(self, fileid, tagspec)

    def handle_elt(self, elt, context):
        if self._sent:
            return self.handle_sent(elt)
        else:
            return self.handle_word(elt)

    def handle_word(self, elt):
        return SemcorCorpusReader._word(
            elt, self._unit, self._pos_tag, self._sem_tag, self._wordnet
        )

    def handle_sent(self, elt):
        sent = []
        for child in elt:
            if child.tag in ("wf", "punc"):
                itm = self.handle_word(child)
                if self._unit == "word":
                    sent.extend(itm)
                else:
                    sent.append(itm)
            else:
                raise ValueError("Unexpected element %s" % child.tag)
        return SemcorSentence(elt.attrib["snum"], sent)