#! /usr/bin/env python
# KNB Corpus reader
# Copyright (C) 2001-2023 NLTK Project
# Author: Masato Hagiwara <[email protected]>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

# For more information, see http://lilyx.net/pages/nltkjapanesecorpus.html

import re

from nltk.corpus.reader.api import CorpusReader, SyntaxCorpusReader
from nltk.corpus.reader.util import (
    FileSystemPathPointer,
    find_corpus_fileids,
    read_blankline_block,
)
from nltk.parse import DependencyGraph

# default function to convert a morph list to a string for the tree
# representation
_morphs2str_default = lambda morphs: "/".join(m[0] for m in morphs if m[0] != "EOS")
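
# For illustration, with made-up morph tuples (not taken from the corpus):
#
#     morphs = [("猫", "ねこ 猫 名詞 ..."), ("だ", "だ だ 判定詞 ..."), ("EOS", "")]
#     _morphs2str_default(morphs)  # -> "猫/だ"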


class KNBCorpusReader(SyntaxCorpusReader):
    """

    This class implements:

      - ``__init__``, which specifies the location of the corpus

        and a method for detecting the sentence blocks in corpus files.

      - ``_read_block``, which reads a block from the input stream.

      - ``_word``, which takes a block and returns a list of list of words.

      - ``_tag``, which takes a block and returns a list of list of tagged

        words.

      - ``_parse``, which takes a block and returns a list of parsed

        sentences.



    The structure of tagged words:

      tagged_word = (word(str), tags(tuple))

      tags = (surface, reading, lemma, pos1, posid1, pos2, posid2, pos3, posid3, others ...)



    Usage example



    >>> from nltk.corpus.util import LazyCorpusLoader

    >>> knbc = LazyCorpusLoader(

    ...     'knbc/corpus1',

    ...     KNBCorpusReader,

    ...     r'.*/KN.*',

    ...     encoding='euc-jp',

    ... )



    >>> len(knbc.sents()[0])

    9



    """

    def __init__(self, root, fileids, encoding="utf8", morphs2str=_morphs2str_default):
        """

        Initialize KNBCorpusReader

        morphs2str is a function to convert morphlist to str for tree representation

        for _parse()

        """
        SyntaxCorpusReader.__init__(self, root, fileids, encoding)
        self.morphs2str = morphs2str

    def _read_block(self, stream):
        # blocks are split by blankline (or EOF) - default
        return read_blankline_block(stream)
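
    # For orientation, one block looks roughly like the sketch below
    # (contents invented for illustration; '#' lines are comments/headers,
    # '*' opens a bunsetsu, '+' opens a tag, morph lines give the surface
    # form followed by space-separated tags, and "EOS" ends the sentence):
    #
    #     # S-ID:...
    #     * 1D
    #     + 1D
    #     猫 ねこ 猫 名詞 ...
    #     EOS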

    def _word(self, t):
        res = []
        for line in t.splitlines():
            # skip EOS markers and header lines (bunsetsu '*', comment '#', tag '+')
            if not re.match(r"EOS|\*|\#|\+", line):
                cells = line.strip().split(" ")
                res.append(cells[0])

        return res

    # the tagset argument is accepted for API compatibility and ignored
    def _tag(self, t, tagset=None):
        res = []
        for line in t.splitlines():
            # skip EOS markers and header lines (bunsetsu '*', comment '#', tag '+')
            if not re.match(r"EOS|\*|\#|\+", line):
                cells = line.strip().split(" ")
                # convert cells to morph tuples
                res.append((cells[0], " ".join(cells[1:])))
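                # e.g. a morph line "猫 ねこ 猫 名詞 ..." (illustrative)
                # becomes ("猫", "ねこ 猫 名詞 ..."): the surface form plus
                # the remaining cells rejoined into one tag string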

        return res

    def _parse(self, t):
        dg = DependencyGraph()
        i = 0
        for line in t.splitlines():
            if line[0] in "*+":
                # start of a bunsetsu ('*') or tag ('+') header
                cells = line.strip().split(" ", 3)
                m = re.match(r"([\-0-9]*)([ADIP])", cells[1])
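                # cells[1] looks like "1D" or "-1D": a parent bunsetsu
                # address followed by a one-letter relation type (A, D, I
                # or P); a parent address of -1 marks the root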

                assert m is not None

                node = dg.nodes[i]
                node.update({"address": i, "rel": m.group(2), "word": []})

                dep_parent = int(m.group(1))

                if dep_parent == -1:
                    dg.root = node
                else:
                    dg.nodes[dep_parent]["deps"].append(i)

                i += 1
            elif line[0] != "#":
                # normal morph
                cells = line.strip().split(" ")
                # convert cells to morph tuples
                morph = cells[0], " ".join(cells[1:])
                dg.nodes[i - 1]["word"].append(morph)
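                # morphs attach to the most recently opened header node,
                # hence i - 1 (i has already been advanced past it)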

        if self.morphs2str:
            for node in dg.nodes.values():
                node["word"] = self.morphs2str(node["word"])

        return dg.tree()
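
# A short sketch of using the reader directly, without LazyCorpusLoader
# (the root path is hypothetical; point it at a local copy of the corpus):
#
#     reader = KNBCorpusReader("/path/to/knbc/corpus1", r".*/KN.*", encoding="euc-jp")
#     reader.sents()[0]         # -> list of word strings
#     reader.tagged_sents()[0]  # -> list of (word, tag-string) tuples
#     reader.parsed_sents()[0]  # -> dependency tree for the first sentence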


######################################################################
# Demo
######################################################################


def demo():
    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    root = nltk.data.find("corpora/knbc/corpus1")
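    # keep only fileids carrying a numeric "1-1-1-01"-style suffix
    # (example name illustrative; the pattern is the regex below)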
    fileids = [
        f
        for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
        if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)
    ]

    def _knbc_fileids_sort(x):
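        # sort key: compare the three trailing dash-separated fields of a
        # fileid numerically rather than lexicographically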
        cells = x.split("-")
        return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))

    knbc = LazyCorpusLoader(
        "knbc/corpus1",
        KNBCorpusReader,
        sorted(fileids, key=_knbc_fileids_sort),
        encoding="euc-jp",
    )

    print(knbc.fileids()[:10])
    print("".join(knbc.words()[:100]))

    print("\n\n".join(str(tree) for tree in knbc.parsed_sents()[:2]))

    # customize the morph-to-string conversion to show the third tag field
    # alongside the surface form; the result stays a str so that the trees
    # print cleanly under Python 3
    knbc.morphs2str = lambda morphs: "/".join(
        "{}({})".format(m[0], m[1].split(" ")[2]) for m in morphs if m[0] != "EOS"
    )

    print("\n\n".join("%s" % tree for tree in knbc.parsed_sents()[:2]))

    print(
        "\n".join(
            " ".join("{}/{}".format(w[0], w[1].split(" ")[2]) for w in sent)
            for sent in knbc.tagged_sents()[0:2]
        )
    )


def test():
    from nltk.corpus.util import LazyCorpusLoader

    knbc = LazyCorpusLoader(
        "knbc/corpus1", KNBCorpusReader, r".*/KN.*", encoding="euc-jp"
    )
    assert isinstance(knbc.words()[0], str)
    assert isinstance(knbc.sents()[0][0], str)
    assert isinstance(knbc.tagged_words()[0], tuple)
    assert isinstance(knbc.tagged_sents()[0][0], tuple)


if __name__ == "__main__":
    demo()