Spaces:

sunnychenxiwang
/

EasyDetect

Sleeping

App Files Files Community

EasyDetect / pipeline /nltk /corpus /reader /chunked.py

sunnychenxiwang

update nltk

d916065 about 1 year ago

raw

history blame

9.37 kB

	# Natural Language Toolkit: Chunked Corpus Reader
	#
	# Copyright (C) 2001-2023 NLTK Project
	# Author: Steven Bird <stevenbird1@gmail.com>
	#         Edward Loper <edloper@gmail.com>
	# URL: <https://www.nltk.org/>
	# For license information, see LICENSE.TXT

	"""
	A reader for corpora that contain chunked (and optionally tagged)
	documents.
	"""

	import codecs
	import os.path

	import nltk
	from nltk.chunk import tagstr2tree
	from nltk.corpus.reader.api import *
	from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
	from nltk.corpus.reader.util import *
	from nltk.tokenize import *
	from nltk.tree import Tree


	class ChunkedCorpusReader(CorpusReader):
	    """
	    Reader for chunked (and optionally tagged) corpora.  Paragraphs
	    are split using a block reader.  They are then tokenized into
	    sentences using a sentence tokenizer.  Finally, these sentences
	    are parsed into chunk trees using a string-to-chunktree conversion
	    function.  Each of these steps can be performed using a default
	    function or a custom function.  By default, paragraphs are split
	    on blank lines; sentences are listed one per line; and sentences
	    are parsed into chunk trees using ``nltk.chunk.tagstr2tree``.
	    """

	    def __init__(
	        self,
	        root,
	        fileids,
	        extension="",
	        str2chunktree=tagstr2tree,
	        sent_tokenizer=RegexpTokenizer("\n", gaps=True),
	        para_block_reader=read_blankline_block,
	        encoding="utf8",
	        tagset=None,
	    ):
	        """
	        :param root: The root directory for this corpus.
	        :param fileids: A list or regexp specifying the fileids in this corpus.
	        :param extension: Not referenced by this class; retained so that
	            existing positional and keyword callers keep working.
	        :param str2chunktree: Callable that converts one sentence string
	            into a chunk tree.  The corpus view calls it with
	            ``source_tagset`` and ``target_tagset`` keyword arguments.
	        :param sent_tokenizer: Object whose ``tokenize`` method splits a
	            paragraph string into sentence strings.
	        :param para_block_reader: Callable that reads paragraph strings
	            from a stream.
	        :param encoding: Encoding forwarded to ``CorpusReader.__init__``.
	        :param tagset: Handed to every corpus view as its ``source_tagset``.
	        """
	        CorpusReader.__init__(self, root, fileids, encoding)
	        # Trailing positional arguments shared by every corpus view this
	        # reader creates, in the order ChunkedCorpusView expects them:
	        # (str2chunktree, sent_tokenizer, para_block_reader, source_tagset).
	        self._cv_args = (str2chunktree, sent_tokenizer, para_block_reader, tagset)

	    def _concat_views(
	        self,
	        fileids,
	        tagged,
	        group_by_sent,
	        group_by_para,
	        chunked,
	        target_tagset=None,
	    ):
	        """
	        Build one ``ChunkedCorpusView`` per selected file and concatenate
	        the views.  All public accessors delegate to this helper and
	        differ only in the flags they pass.

	        :param fileids: File selection forwarded to ``self.abspaths``.
	        :param tagged: Keep ``(word, tag)`` tuples instead of bare words.
	        :param group_by_sent: Keep each sentence as its own item.
	        :param group_by_para: Keep each paragraph as its own item.
	        :param chunked: Keep chunk trees instead of flattening to leaves.
	        :param target_tagset: Forwarded to the view as ``target_tagset``.
	        :return: The result of ``concat`` over the per-file views.
	        """
	        return concat(
	            [
	                ChunkedCorpusView(
	                    path,
	                    enc,
	                    tagged,
	                    group_by_sent,
	                    group_by_para,
	                    chunked,
	                    *self._cv_args,
	                    target_tagset=target_tagset,
	                )
	                # The second argument asks abspaths for (path, encoding) pairs.
	                for (path, enc) in self.abspaths(fileids, True)
	            ]
	        )

	    def words(self, fileids=None):
	        """
	        :return: the given file(s) as a list of words
	            and punctuation symbols.
	        :rtype: list(str)
	        """
	        return self._concat_views(
	            fileids,
	            tagged=False,
	            group_by_sent=False,
	            group_by_para=False,
	            chunked=False,
	        )

	    def sents(self, fileids=None):
	        """
	        :return: the given file(s) as a list of
	            sentences or utterances, each encoded as a list of word
	            strings.
	        :rtype: list(list(str))
	        """
	        return self._concat_views(
	            fileids,
	            tagged=False,
	            group_by_sent=True,
	            group_by_para=False,
	            chunked=False,
	        )

	    def paras(self, fileids=None):
	        """
	        :return: the given file(s) as a list of
	            paragraphs, each encoded as a list of sentences, which are
	            in turn encoded as lists of word strings.
	        :rtype: list(list(list(str)))
	        """
	        return self._concat_views(
	            fileids,
	            tagged=False,
	            group_by_sent=True,
	            group_by_para=True,
	            chunked=False,
	        )

	    def tagged_words(self, fileids=None, tagset=None):
	        """
	        :param tagset: Forwarded to the corpus view as ``target_tagset``.
	        :return: the given file(s) as a list of tagged
	            words and punctuation symbols, encoded as tuples
	            ``(word,tag)``.
	        :rtype: list(tuple(str,str))
	        """
	        return self._concat_views(
	            fileids,
	            tagged=True,
	            group_by_sent=False,
	            group_by_para=False,
	            chunked=False,
	            target_tagset=tagset,
	        )

	    def tagged_sents(self, fileids=None, tagset=None):
	        """
	        :param tagset: Forwarded to the corpus view as ``target_tagset``.
	        :return: the given file(s) as a list of
	            sentences, each encoded as a list of ``(word,tag)`` tuples.

	        :rtype: list(list(tuple(str,str)))
	        """
	        return self._concat_views(
	            fileids,
	            tagged=True,
	            group_by_sent=True,
	            group_by_para=False,
	            chunked=False,
	            target_tagset=tagset,
	        )

	    def tagged_paras(self, fileids=None, tagset=None):
	        """
	        :param tagset: Forwarded to the corpus view as ``target_tagset``.
	        :return: the given file(s) as a list of
	            paragraphs, each encoded as a list of sentences, which are
	            in turn encoded as lists of ``(word,tag)`` tuples.
	        :rtype: list(list(list(tuple(str,str))))
	        """
	        return self._concat_views(
	            fileids,
	            tagged=True,
	            group_by_sent=True,
	            group_by_para=True,
	            chunked=False,
	            target_tagset=tagset,
	        )

	    def chunked_words(self, fileids=None, tagset=None):
	        """
	        :param tagset: Forwarded to the corpus view as ``target_tagset``.
	        :return: the given file(s) as a list of tagged
	            words and chunks.  Words are encoded as ``(word, tag)``
	            tuples (if the corpus has tags) or word strings (if the
	            corpus has no tags).  Chunks are encoded as depth-one
	            trees over ``(word,tag)`` tuples or word strings.
	        :rtype: list(tuple(str,str) and Tree)
	        """
	        return self._concat_views(
	            fileids,
	            tagged=True,
	            group_by_sent=False,
	            group_by_para=False,
	            chunked=True,
	            target_tagset=tagset,
	        )

	    def chunked_sents(self, fileids=None, tagset=None):
	        """
	        :param tagset: Forwarded to the corpus view as ``target_tagset``.
	        :return: the given file(s) as a list of
	            sentences, each encoded as a shallow Tree.  The leaves
	            of these trees are encoded as ``(word, tag)`` tuples (if
	            the corpus has tags) or word strings (if the corpus has no
	            tags).
	        :rtype: list(Tree)
	        """
	        return self._concat_views(
	            fileids,
	            tagged=True,
	            group_by_sent=True,
	            group_by_para=False,
	            chunked=True,
	            target_tagset=tagset,
	        )

	    def chunked_paras(self, fileids=None, tagset=None):
	        """
	        :param tagset: Forwarded to the corpus view as ``target_tagset``.
	        :return: the given file(s) as a list of
	            paragraphs, each encoded as a list of sentences, which are
	            in turn encoded as a shallow Tree.  The leaves of these
	            trees are encoded as ``(word, tag)`` tuples (if the corpus
	            has tags) or word strings (if the corpus has no tags).
	        :rtype: list(list(Tree))
	        """
	        return self._concat_views(
	            fileids,
	            tagged=True,
	            group_by_sent=True,
	            group_by_para=True,
	            chunked=True,
	            target_tagset=tagset,
	        )

	    def _read_block(self, stream):
	        """
	        Parse one blank-line-delimited block of ``stream`` into chunk trees.

	        NOTE(review): this always uses the module-level ``tagstr2tree`` and
	        ``read_blankline_block`` rather than the callables passed to
	        ``__init__``, and no method of this class calls it -- confirm
	        whether external callers still rely on it before changing it.
	        """
	        return [tagstr2tree(t) for t in read_blankline_block(stream)]


	class ChunkedCorpusView(StreamBackedCorpusView):
	    """
	    Corpus view over a single chunked corpus file.  Each block read from
	    the stream is split into paragraphs and sentences, every sentence is
	    parsed into a chunk tree, and the result is shaped by four flags:
	    ``tagged``, ``group_by_sent``, ``group_by_para`` and ``chunked``.
	    """

	    def __init__(
	        self,
	        fileid,
	        encoding,
	        tagged,
	        group_by_sent,
	        group_by_para,
	        chunked,
	        str2chunktree,
	        sent_tokenizer,
	        para_block_reader,
	        source_tagset=None,
	        target_tagset=None,
	    ):
	        """
	        :param fileid: File to read; forwarded to the base view.
	        :param encoding: Encoding forwarded to the base view.
	        :param tagged: If false, tags are stripped from every sentence.
	        :param group_by_sent: If true, each sentence is kept as one item.
	        :param group_by_para: If true, each paragraph is kept as one item.
	        :param chunked: If false, each sentence is flattened to its leaves.
	        :param str2chunktree: Callable turning a sentence string into a tree.
	        :param sent_tokenizer: Object whose ``tokenize`` splits a paragraph.
	        :param para_block_reader: Callable reading paragraphs from a stream.
	        :param source_tagset: Passed through to ``str2chunktree``.
	        :param target_tagset: Passed through to ``str2chunktree``.
	        """
	        StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)

	        # Parsing pipeline.
	        self._para_block_reader = para_block_reader
	        self._sent_tokenizer = sent_tokenizer
	        self._str2chunktree = str2chunktree

	        # Tagset conversion settings handed to the sentence parser.
	        self._source_tagset = source_tagset
	        self._target_tagset = target_tagset

	        # Output-shaping flags.
	        self._tagged = tagged
	        self._chunked = chunked
	        self._group_by_sent = group_by_sent
	        self._group_by_para = group_by_para

	    def read_block(self, stream):
	        """
	        Read the next paragraphs from ``stream`` and return their contents
	        as a list, nested according to the grouping flags.
	        """
	        collected = []
	        for paragraph_text in self._para_block_reader(stream):
	            sentences = []
	            for sentence_text in self._sent_tokenizer.tokenize(paragraph_text):
	                parsed = self._str2chunktree(
	                    sentence_text,
	                    source_tagset=self._source_tagset,
	                    target_tagset=self._target_tagset,
	                )

	                # Strip tags first, while the sentence is still a tree.
	                if not self._tagged:
	                    parsed = self._untag(parsed)

	                # Flatten away the chunk structure when it is not wanted.
	                if not self._chunked:
	                    parsed = parsed.leaves()

	                # Keep the sentence whole, or splice its items into the
	                # paragraph.
	                store_sentence = (
	                    sentences.append if self._group_by_sent else sentences.extend
	                )
	                store_sentence(parsed)

	            # Keep the paragraph whole, or splice its items into the block.
	            store_paragraph = (
	                collected.append if self._group_by_para else collected.extend
	            )
	            store_paragraph(sentences)

	        return collected

	    def _untag(self, tree):
	        """
	        Replace, in place, every tuple found in ``tree`` (recursing into
	        sub-trees) with its first element, and return ``tree``.

	        :raise ValueError: if a child is neither a ``Tree`` nor a tuple.
	        """
	        for position, node in enumerate(tree):
	            if isinstance(node, Tree):
	                self._untag(node)
	                continue
	            if not isinstance(node, tuple):
	                raise ValueError("expected child to be Tree or tuple")
	            tree[position] = node[0]
	        return tree