#! /usr/bin/env python
# KNB Corpus reader
# Copyright (C) 2001-2023 NLTK Project
# Author: Masato Hagiwara <[email protected]>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
# For more information, see http://lilyx.net/pages/nltkjapanesecorpus.html
import re
from nltk.corpus.reader.api import CorpusReader, SyntaxCorpusReader
from nltk.corpus.reader.util import (
FileSystemPathPointer,
find_corpus_fileids,
read_blankline_block,
)
from nltk.parse import DependencyGraph
# default function to convert morphlist to str for tree representation
_morphs2str_default = lambda morphs: "/".join(m[0] for m in morphs if m[0] != "EOS")
class KNBCorpusReader(SyntaxCorpusReader):
    """
    Reader for the KNB corpus (Kyoto-University and NTT Blog corpus).

    This class implements:
    - ``__init__``, which specifies the location of the corpus
      and a method for detecting the sentence blocks in corpus files.
    - ``_read_block``, which reads a block from the input stream.
    - ``_word``, which takes a block and returns a list of list of words.
    - ``_tag``, which takes a block and returns a list of list of tagged
      words.
    - ``_parse``, which takes a block and returns a list of parsed
      sentences.
    The structure of tagged words:
    tagged_word = (word(str), tags(tuple))
    tags = (surface, reading, lemma, pos1, posid1, pos2, posid2, pos3, posid3, others ...)
    Usage example
    >>> from nltk.corpus.util import LazyCorpusLoader
    >>> knbc = LazyCorpusLoader(
    ...     'knbc/corpus1',
    ...     KNBCorpusReader,
    ...     r'.*/KN.*',
    ...     encoding='euc-jp',
    ... )
    >>> len(knbc.sents()[0])
    9
    """

    def __init__(self, root, fileids, encoding="utf8", morphs2str=_morphs2str_default):
        """
        Initialize KNBCorpusReader.

        :param morphs2str: function converting a morph list to a string for
            the tree representation produced by ``_parse()``.
        """
        super().__init__(root, fileids, encoding)
        self.morphs2str = morphs2str

    def _read_block(self, stream):
        """Return the next corpus block; blocks are separated by blank lines."""
        return read_blankline_block(stream)

    def _word(self, t):
        """Extract the surface words from block *t*, skipping header lines."""
        words = []
        for line in t.splitlines():
            # Skip EOS markers and bunsetsu/tag header lines ("*", "#", "+").
            if re.match(r"EOS|\*|\#|\+", line):
                continue
            words.append(line.strip().split(" ")[0])
        return words

    def _tag(self, t, tagset=None):
        """Extract (word, tag-string) pairs from block *t*.

        The ``tagset`` argument is accepted for API compatibility but ignored.
        """
        tagged = []
        for line in t.splitlines():
            # Skip EOS markers and bunsetsu/tag header lines ("*", "#", "+").
            if re.match(r"EOS|\*|\#|\+", line):
                continue
            fields = line.strip().split(" ")
            # First field is the surface form; the rest become the tag string.
            tagged.append((fields[0], " ".join(fields[1:])))
        return tagged

    def _parse(self, t):
        """Build a dependency tree for block *t* and return it as a Tree."""
        dg = DependencyGraph()
        address = 0
        for line in t.splitlines():
            if line[0] in "*+":
                # Header line: start of a bunsetsu ("*") or tag unit ("+"),
                # e.g. "* 1D ..." — field 1 holds "<parent-index><rel-letter>".
                fields = line.strip().split(" ", 3)
                match = re.match(r"([\-0-9]*)([ADIP])", fields[1])
                assert match is not None
                node = dg.nodes[address]
                node.update({"address": address, "rel": match.group(2), "word": []})
                parent = int(match.group(1))
                if parent == -1:
                    # Parent index -1 marks the root of the sentence.
                    dg.root = node
                else:
                    dg.nodes[parent]["deps"].append(address)
                address += 1
            elif line[0] != "#":
                # Morpheme line: attach to the most recently opened unit.
                fields = line.strip().split(" ")
                morph = fields[0], " ".join(fields[1:])
                dg.nodes[address - 1]["word"].append(morph)

        if self.morphs2str:
            # Collapse each node's morph list into a single display string.
            for node in dg.nodes.values():
                node["word"] = self.morphs2str(node["word"])

        return dg.tree()
######################################################################
# Demo
######################################################################
def demo():
    """Demonstrate KNBCorpusReader on the KNB corpus.

    Prints file ids, raw words, plain parse trees, trees with word(POS)
    labels, and tagged sentences. Requires the ``knbc`` corpus to be
    installed under nltk_data.
    """
    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    root = nltk.data.find("corpora/knbc/corpus1")
    # Keep only the actual corpus files, named like "d-d-dd-dd".
    fileids = [
        f
        for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
        if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)
    ]

    def _knbc_fileids_sort(x):
        # Sort key: leading id string, then the three numeric components.
        cells = x.split("-")
        return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))

    knbc = LazyCorpusLoader(
        "knbc/corpus1",
        KNBCorpusReader,
        sorted(fileids, key=_knbc_fileids_sort),
        encoding="euc-jp",
    )

    print(knbc.fileids()[:10])
    print("".join(knbc.words()[:100]))
    print("\n\n".join(str(tree) for tree in knbc.parsed_sents()[:2]))

    # Render tree labels as "surface(pos)". BUG FIX: the original ended this
    # lambda with .encode("utf-8"), a Python 2 leftover that made morphs2str
    # return bytes, so tree labels printed as b'...' on Python 3. Return str.
    knbc.morphs2str = lambda morphs: "/".join(
        "{}({})".format(m[0], m[1].split(" ")[2]) for m in morphs if m[0] != "EOS"
    )

    print("\n\n".join("%s" % tree for tree in knbc.parsed_sents()[:2]))

    print(
        "\n".join(
            " ".join("{}/{}".format(w[0], w[1].split(" ")[2]) for w in sent)
            for sent in knbc.tagged_sents()[0:2]
        )
    )
def test():
    """Smoke-test that reader outputs are plain ``str`` / ``tuple`` values."""
    from nltk.corpus.util import LazyCorpusLoader

    knbc = LazyCorpusLoader(
        "knbc/corpus1", KNBCorpusReader, r".*/KN.*", encoding="euc-jp"
    )

    # Each (value, expected type) pair covers one reader access path.
    checks = [
        (knbc.words()[0], str),
        (knbc.sents()[0][0], str),
        (knbc.tagged_words()[0], tuple),
        (knbc.tagged_sents()[0][0], tuple),
    ]
    for value, expected_type in checks:
        assert isinstance(value, expected_type)
# Run the demo when executed as a script (requires the knbc corpus data).
if __name__ == "__main__":
    demo()