# Natural Language Toolkit: Dependency Corpus Reader
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Kepa Sarasola <[email protected]>
#         Iker Manterola <[email protected]>
#
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.parse import DependencyGraph
from nltk.tokenize import *


class DependencyCorpusReader(SyntaxCorpusReader):
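    """
    A corpus reader for dependency-annotated corpora: plain-text files in
    which each blank-line-separated block is one sentence, with one word
    per tab-delimited line (either word/tag columns or 10-column
    CoNLL-style rows).
    """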
    def __init__(
        self,
        root,
        fileids,
        encoding="utf8",
        word_tokenizer=TabTokenizer(),
        sent_tokenizer=RegexpTokenizer("\n", gaps=True),
        para_block_reader=read_blankline_block,
    ):
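        # The tokenizer and block-reader arguments are accepted for API
        # compatibility but are not stored: the base class is initialized
        # with only the root, fileids, and encoding.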
        SyntaxCorpusReader.__init__(self, root, fileids, encoding)

    #########################################################
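    # Each accessor below concatenates one DependencyCorpusView per file.
    # The three positional flags passed to each view are, in order:
    # tagged, group_by_sent, dependencies.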
    def words(self, fileids=None):
        return concat(
            [
                DependencyCorpusView(fileid, False, False, False, encoding=enc)
                for fileid, enc in self.abspaths(fileids, include_encoding=True)
            ]
        )

    def tagged_words(self, fileids=None):
        return concat(
            [
                DependencyCorpusView(fileid, True, False, False, encoding=enc)
                for fileid, enc in self.abspaths(fileids, include_encoding=True)
            ]
        )

    def sents(self, fileids=None):
        return concat(
            [
                DependencyCorpusView(fileid, False, True, False, encoding=enc)
                for fileid, enc in self.abspaths(fileids, include_encoding=True)
            ]
        )

    def tagged_sents(self, fileids=None):
        return concat(
            [
                DependencyCorpusView(fileid, True, True, False, encoding=enc)
                for fileid, enc in self.abspaths(fileids, include_encoding=True)
            ]
        )
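
    # Unlike the lazy accessors above, parsed_sents() materializes its
    # result as a list of DependencyGraph objects.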
    def parsed_sents(self, fileids=None):
        sents = concat(
            [
                DependencyCorpusView(fileid, False, True, True, encoding=enc)
                for fileid, enc in self.abspaths(fileids, include_encoding=True)
            ]
        )
        return [DependencyGraph(sent) for sent in sents]


class DependencyCorpusView(StreamBackedCorpusView):
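    """
    A lazy, stream-backed view over a single dependency file.
    ``read_block`` returns one sentence per blank-line-separated block; the
    constructor flags control whether tokens keep their tags, whether
    tokens are grouped into sentences, and whether the raw dependency
    block is passed through intact.
    """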
    _DOCSTART = "-DOCSTART- -DOCSTART- O\n"  # marks the start of a document
    def __init__(
        self,
        corpus_file,
        tagged,
        group_by_sent,
        dependencies,
        chunk_types=None,
        encoding="utf8",
    ):
        self._tagged = tagged
        self._dependencies = dependencies
        self._group_by_sent = group_by_sent
        self._chunk_types = chunk_types
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)
    def read_block(self, stream):
        # Read the next sentence.
        sent = read_blankline_block(stream)[0].strip()

        # Strip off the docstart marker, if present.
        if sent.startswith(self._DOCSTART):
            sent = sent[len(self._DOCSTART) :].lstrip()

        # Extract the word and tag from either supported format: 3- or
        # 4-column word/tag rows, or 10-column CoNLL-style rows, where the
        # word form is column 2 and the POS tag is column 5.
        if not self._dependencies:
            lines = [line.split("\t") for line in sent.split("\n")]
            if len(lines[0]) == 3 or len(lines[0]) == 4:
                sent = [(line[0], line[1]) for line in lines]
            elif len(lines[0]) == 10:
                sent = [(line[1], line[4]) for line in lines]
            else:
                raise ValueError(
                    "Unexpected number of fields in dependency tree file "
                    "(expected 3, 4, or 10 tab-separated columns)"
                )

            # Discard the tags if they weren't requested.
            if not self._tagged:
                sent = [word for (word, tag) in sent]

        # Return the result.
        if self._group_by_sent:
            return [sent]
        else:
            return list(sent)
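
# A minimal usage sketch (assumes the NLTK "dependency_treebank" corpus has
# been fetched with nltk.download("dependency_treebank"); any directory of
# files in a compatible format would work equally well):
#
#     from nltk.corpus import dependency_treebank
#
#     print(dependency_treebank.words()[:5])
#     print(dependency_treebank.tagged_sents()[0])
#     graph = dependency_treebank.parsed_sents()[0]
#     print(graph.tree())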