# Natural Language Toolkit: Dependency Corpus Reader
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Kepa Sarasola <[email protected]>
#         Iker Manterola <[email protected]>
#
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
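
r"""
Corpus reader for dependency-annotated corpora.

Usage sketch (the corpus root and file pattern below are illustrative,
not a corpus shipped with NLTK):

    >>> from nltk.corpus.reader import DependencyCorpusReader  # doctest: +SKIP
    >>> reader = DependencyCorpusReader("corpora/deps", r".*\.dep")  # doctest: +SKIP
    >>> reader.tagged_sents()[0]  # doctest: +SKIP
    [('The', 'DT'), ('cat', 'NN'), ...]
"""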

from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.parse import DependencyGraph
from nltk.tokenize import *


class DependencyCorpusReader(SyntaxCorpusReader):
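    """
    Corpus reader for dependency-annotated corpora: one token per line
    with tab-separated fields, and blank lines separating sentences.
    """
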
    def __init__(
        self,
        root,
        fileids,
        encoding="utf8",
        word_tokenizer=TabTokenizer(),
        sent_tokenizer=RegexpTokenizer("\n", gaps=True),
        para_block_reader=read_blankline_block,
    ):
        # The tokenizer and block-reader arguments are accepted but not
        # currently used; tokenization is handled by DependencyCorpusView.
        SyntaxCorpusReader.__init__(self, root, fileids, encoding)

    #########################################################
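    # Each view below is constructed with three positional boolean flags,
    # in order: tagged, group_by_sent, dependencies.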

    def words(self, fileids=None):
        return concat(
            [
                DependencyCorpusView(fileid, False, False, False, encoding=enc)
                for fileid, enc in self.abspaths(fileids, include_encoding=True)
            ]
        )

    def tagged_words(self, fileids=None):
        return concat(
            [
                DependencyCorpusView(fileid, True, False, False, encoding=enc)
                for fileid, enc in self.abspaths(fileids, include_encoding=True)
            ]
        )

    def sents(self, fileids=None):
        return concat(
            [
                DependencyCorpusView(fileid, False, True, False, encoding=enc)
                for fileid, enc in self.abspaths(fileids, include_encoding=True)
            ]
        )

    def tagged_sents(self, fileids=None):
        return concat(
            [
                DependencyCorpusView(fileid, True, True, False, encoding=enc)
                for fileid, enc in self.abspaths(fileids, include_encoding=True)
            ]
        )

    def parsed_sents(self, fileids=None):
        sents = concat(
            [
                DependencyCorpusView(fileid, False, True, True, encoding=enc)
                for fileid, enc in self.abspaths(fileids, include_encoding=True)
            ]
        )
        return [DependencyGraph(sent) for sent in sents]


class DependencyCorpusView(StreamBackedCorpusView):
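    """
    A ``StreamBackedCorpusView`` that reads one blank-line-delimited
    sentence block at a time and yields words, (word, tag) pairs, or the
    raw dependency block, depending on the constructor flags.
    """
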
    _DOCSTART = "-DOCSTART- -DOCSTART- O\n"  # marks the start of a document

    def __init__(
        self,
        corpus_file,
        tagged,
        group_by_sent,
        dependencies,
        chunk_types=None,
        encoding="utf8",
    ):
        self._tagged = tagged
        self._dependencies = dependencies
        self._group_by_sent = group_by_sent
        self._chunk_types = chunk_types
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        # Read the next sentence.
        sent = read_blankline_block(stream)[0].strip()
        # Strip off the docstart marker, if present.
        if sent.startswith(self._DOCSTART):
            sent = sent[len(self._DOCSTART) :].lstrip()

        # Extract the word and tag from any of the supported formats:
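        #   * 3 or 4 tab-separated columns: columns 0 and 1 are taken as
        #     the word and its tag (any remaining columns are ignored here);
        #   * 10 tab-separated columns (CoNLL field order): column 1 is the
        #     word form and column 4 is the POS tag.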
        if not self._dependencies:
            lines = [line.split("\t") for line in sent.split("\n")]
            if len(lines[0]) == 3 or len(lines[0]) == 4:
                sent = [(line[0], line[1]) for line in lines]
            elif len(lines[0]) == 10:
                sent = [(line[1], line[4]) for line in lines]
            else:
                raise ValueError("Unexpected number of fields in dependency tree file")

            # discard tags if they weren't requested
            if not self._tagged:
                sent = [word for (word, tag) in sent]

        # Return the result.
        if self._group_by_sent:
            return [sent]
        else:
            return list(sent)