# Natural Language Toolkit: Dependency Corpus Reader
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Kepa Sarasola <[email protected]>
#         Iker Manterola <[email protected]>
#
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
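
"""
Corpus reader for dependency-annotated corpora: one token per line of
tab-separated fields, with blank lines separating sentences.  Simple
3/4-column files and 10-column CoNLL files are both supported.
"""
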
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.parse import DependencyGraph
from nltk.tokenize import *


class DependencyCorpusReader(SyntaxCorpusReader):
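    r"""
    Corpus reader for dependency-parsed corpora in which each token is a
    line of tab-separated fields and sentences are separated by blank
    lines.  Rows may carry 3 or 4 fields (word and tag first) or the 10
    fields of the CoNLL format.

    A usage sketch (``dep_corpus/`` and the ``.dep`` suffix below are
    hypothetical names, not shipped with NLTK)::

        reader = DependencyCorpusReader('dep_corpus/', r'.*\.dep')
        reader.words()         # all words, lazily concatenated
        reader.tagged_sents()  # sentences of (word, tag) pairs
        reader.parsed_sents()  # one DependencyGraph per sentence
    """
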
def __init__(
self,
root,
fileids,
encoding="utf8",
word_tokenizer=TabTokenizer(),
sent_tokenizer=RegexpTokenizer("\n", gaps=True),
para_block_reader=read_blankline_block,
):
        # Note: the tokenizer and block-reader arguments are accepted
        # (presumably for API compatibility with other corpus readers)
        # but are never stored or used by this reader; block parsing is
        # handled by DependencyCorpusView below.
        SyntaxCorpusReader.__init__(self, root, fileids, encoding)

    #########################################################

    def words(self, fileids=None):
        """Return all words from the given files, lazily concatenated."""
        return concat(
            [
                DependencyCorpusView(fileid, False, False, False, encoding=enc)
                for fileid, enc in self.abspaths(fileids, include_encoding=True)
            ]
        )

    def tagged_words(self, fileids=None):
        """Return all (word, tag) pairs from the given files."""
        return concat(
            [
                DependencyCorpusView(fileid, True, False, False, encoding=enc)
                for fileid, enc in self.abspaths(fileids, include_encoding=True)
            ]
        )

    def sents(self, fileids=None):
        """Return sentences, each as a list of words."""
        return concat(
            [
                DependencyCorpusView(fileid, False, True, False, encoding=enc)
                for fileid, enc in self.abspaths(fileids, include_encoding=True)
            ]
        )

    def tagged_sents(self, fileids=None):
        """Return sentences, each as a list of (word, tag) pairs."""
        return concat(
            [
                DependencyCorpusView(fileid, True, True, False, encoding=enc)
                for fileid, enc in self.abspaths(fileids, include_encoding=True)
            ]
        )

    def parsed_sents(self, fileids=None):
        """Return one DependencyGraph per sentence."""
        sents = concat(
            [
                DependencyCorpusView(fileid, False, True, True, encoding=enc)
                for fileid, enc in self.abspaths(fileids, include_encoding=True)
            ]
        )
        return [DependencyGraph(sent) for sent in sents]


class DependencyCorpusView(StreamBackedCorpusView):
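    """
    A lazy, stream-backed view of a single dependency corpus file.  The
    ``tagged``, ``group_by_sent`` and ``dependencies`` flags control what
    ``read_block`` returns: words, (word, tag) pairs, whole sentences, or
    raw sentence strings ready for DependencyGraph parsing.
    """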
    _DOCSTART = "-DOCSTART- -DOCSTART- O\n"  # marks the beginning of a document
def __init__(
self,
corpus_file,
tagged,
group_by_sent,
dependencies,
chunk_types=None,
encoding="utf8",
):
self._tagged = tagged
self._dependencies = dependencies
self._group_by_sent = group_by_sent
self._chunk_types = chunk_types
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

def read_block(self, stream):
# Read the next sentence.
sent = read_blankline_block(stream)[0].strip()
# Strip off the docstart marker, if present.
if sent.startswith(self._DOCSTART):
sent = sent[len(self._DOCSTART) :].lstrip()
        # Extract the word and tag from any of the supported formats.
        if not self._dependencies:
            lines = [line.split("\t") for line in sent.split("\n")]
            if len(lines[0]) in (3, 4):
                # Simple format: word and tag are the first two fields.
                sent = [(line[0], line[1]) for line in lines]
            elif len(lines[0]) == 10:
                # 10-column CoNLL format: FORM is field 2, POSTAG field 5.
                sent = [(line[1], line[4]) for line in lines]
            else:
                raise ValueError("Unexpected number of fields in dependency tree file")

            # Discard the tags if they weren't requested.
            if not self._tagged:
                sent = [word for (word, tag) in sent]
# Return the result.
if self._group_by_sent:
return [sent]
else:
return list(sent)
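

if __name__ == "__main__":
    # Minimal self-contained demo (an illustrative sketch, not part of the
    # original module).  The three-column sample sentence below is invented:
    # each line is word <TAB> tag <TAB> head-index.
    import os
    import tempfile

    sample = "the\tDT\t2\ncat\tNN\t3\nsleeps\tVBZ\t0\n"
    with tempfile.TemporaryDirectory() as tmpdir:
        with open(os.path.join(tmpdir, "sample.dep"), "w", encoding="utf8") as fp:
            fp.write(sample)
        reader = DependencyCorpusReader(tmpdir, ["sample.dep"])
        print(reader.words())         # ['the', 'cat', 'sleeps']
        print(reader.tagged_sents())  # [[('the', 'DT'), ('cat', 'NN'), ('sleeps', 'VBZ')]]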