Spaces:

sunnychenxiwang
/

EasyDetect

Sleeping

App Files Files Community

EasyDetect / pipeline /nltk /corpus /__init__.py

sunnychenxiwang

update nltk

d916065 over 1 year ago

raw

history blame

17.4 kB

	# Natural Language Toolkit: Corpus Readers
	#
	# Copyright (C) 2001-2023 NLTK Project
	# Author: Edward Loper <[email protected]>
	# URL: <https://www.nltk.org/>
	# For license information, see LICENSE.TXT

	# TODO this docstring isn't up-to-date!
	"""
	NLTK corpus readers. The modules in this package provide functions
	that can be used to read corpus files in a variety of formats. These
	functions can be used to read both the corpus files that are
	distributed in the NLTK corpus package, and corpus files that are part
	of external corpora.

	Available Corpora
	=================

	Please see https://www.nltk.org/nltk_data/ for a complete list.
	Install corpora using nltk.download().

	Corpus Reader Functions
	=======================
	Each corpus module defines one or more "corpus reader functions",
	which can be used to read documents from that corpus. These functions
	take an argument, ``item``, which is used to indicate which document
	should be read from the corpus:

	- If ``item`` is one of the unique identifiers listed in the corpus
	module's ``items`` variable, then the corresponding document will
	be loaded from the NLTK corpus package.
	- If ``item`` is a filename, then that file will be read.

	Additionally, corpus reader functions can be given lists of item
	names; in which case, they will return a concatenation of the
	corresponding documents.

	Corpus reader functions are named based on the type of information
	they return. Some common examples, and their return types, are:

	- words(): list of str
	- sents(): list of (list of str)
	- paras(): list of (list of (list of str))
	- tagged_words(): list of (str,str) tuple
	- tagged_sents(): list of (list of (str,str))
	- tagged_paras(): list of (list of (list of (str,str)))
	- chunked_sents(): list of (Tree w/ (str,str) leaves)
	- parsed_sents(): list of (Tree with str leaves)
	- parsed_paras(): list of (list of (Tree with str leaves))
	- xml(): A single xml ElementTree
	- raw(): unprocessed corpus contents

	For example, to read a list of the words in the Brown Corpus, use
	``nltk.corpus.brown.words()``:

	>>> from nltk.corpus import brown
	>>> print(", ".join(brown.words())) # doctest: +ELLIPSIS
	The, Fulton, County, Grand, Jury, said, ...

	"""

	import re

	from nltk.corpus.reader import *
	from nltk.corpus.util import LazyCorpusLoader
	from nltk.tokenize import RegexpTokenizer

	abc: PlaintextCorpusReader = LazyCorpusLoader(
	"abc",
	PlaintextCorpusReader,
	r"(?!\.).*\.txt",
	encoding=[("science", "latin_1"), ("rural", "utf8")],
	)
	alpino: AlpinoCorpusReader = LazyCorpusLoader(
	"alpino", AlpinoCorpusReader, tagset="alpino"
	)
	bcp47: BCP47CorpusReader = LazyCorpusLoader(
	"bcp47", BCP47CorpusReader, r"(cldr\|iana)/*"
	)
	brown: CategorizedTaggedCorpusReader = LazyCorpusLoader(
	"brown",
	CategorizedTaggedCorpusReader,
	r"c[a-z]\d\d",
	cat_file="cats.txt",
	tagset="brown",
	encoding="ascii",
	)
	cess_cat: BracketParseCorpusReader = LazyCorpusLoader(
	"cess_cat",
	BracketParseCorpusReader,
	r"(?!\.).*\.tbf",
	tagset="unknown",
	encoding="ISO-8859-15",
	)
	cess_esp: BracketParseCorpusReader = LazyCorpusLoader(
	"cess_esp",
	BracketParseCorpusReader,
	r"(?!\.).*\.tbf",
	tagset="unknown",
	encoding="ISO-8859-15",
	)
	cmudict: CMUDictCorpusReader = LazyCorpusLoader(
	"cmudict", CMUDictCorpusReader, ["cmudict"]
	)
	comtrans: AlignedCorpusReader = LazyCorpusLoader(
	"comtrans", AlignedCorpusReader, r"(?!\.).*\.txt"
	)
	comparative_sentences: ComparativeSentencesCorpusReader = LazyCorpusLoader(
	"comparative_sentences",
	ComparativeSentencesCorpusReader,
	r"labeledSentences\.txt",
	encoding="latin-1",
	)
	conll2000: ConllChunkCorpusReader = LazyCorpusLoader(
	"conll2000",
	ConllChunkCorpusReader,
	["train.txt", "test.txt"],
	("NP", "VP", "PP"),
	tagset="wsj",
	encoding="ascii",
	)
	conll2002: ConllChunkCorpusReader = LazyCorpusLoader(
	"conll2002",
	ConllChunkCorpusReader,
	r".\.(test\|train).",
	("LOC", "PER", "ORG", "MISC"),
	encoding="utf-8",
	)
	conll2007: DependencyCorpusReader = LazyCorpusLoader(
	"conll2007",
	DependencyCorpusReader,
	r".\.(test\|train).",
	encoding=[("eus", "ISO-8859-2"), ("esp", "utf8")],
	)
	crubadan: CrubadanCorpusReader = LazyCorpusLoader(
	"crubadan", CrubadanCorpusReader, r".*\.txt"
	)
	dependency_treebank: DependencyCorpusReader = LazyCorpusLoader(
	"dependency_treebank", DependencyCorpusReader, r".*\.dp", encoding="ascii"
	)
	extended_omw: CorpusReader = LazyCorpusLoader(
	"extended_omw", CorpusReader, r"./wn-[a-z\-]\.tab", encoding="utf8"
	)
	floresta: BracketParseCorpusReader = LazyCorpusLoader(
	"floresta",
	BracketParseCorpusReader,
	r"(?!\.).*\.ptb",
	"#",
	tagset="unknown",
	encoding="ISO-8859-15",
	)
	framenet15: FramenetCorpusReader = LazyCorpusLoader(
	"framenet_v15",
	FramenetCorpusReader,
	[
	"frRelation.xml",
	"frameIndex.xml",
	"fulltextIndex.xml",
	"luIndex.xml",
	"semTypes.xml",
	],
	)
	framenet: FramenetCorpusReader = LazyCorpusLoader(
	"framenet_v17",
	FramenetCorpusReader,
	[
	"frRelation.xml",
	"frameIndex.xml",
	"fulltextIndex.xml",
	"luIndex.xml",
	"semTypes.xml",
	],
	)
	gazetteers: WordListCorpusReader = LazyCorpusLoader(
	"gazetteers", WordListCorpusReader, r"(?!LICENSE\|\.).*\.txt", encoding="ISO-8859-2"
	)
	genesis: PlaintextCorpusReader = LazyCorpusLoader(
	"genesis",
	PlaintextCorpusReader,
	r"(?!\.).*\.txt",
	encoding=[
	("finnish\|french\|german", "latin_1"),
	("swedish", "cp865"),
	(".*", "utf_8"),
	],
	)
	gutenberg: PlaintextCorpusReader = LazyCorpusLoader(
	"gutenberg", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="latin1"
	)
	ieer: IEERCorpusReader = LazyCorpusLoader("ieer", IEERCorpusReader, r"(?!README\|\.).*")
	inaugural: PlaintextCorpusReader = LazyCorpusLoader(
	"inaugural", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="latin1"
	)
	# [XX] This should probably just use TaggedCorpusReader:
	indian: IndianCorpusReader = LazyCorpusLoader(
	"indian", IndianCorpusReader, r"(?!\.).*\.pos", tagset="unknown", encoding="utf8"
	)

	jeita: ChasenCorpusReader = LazyCorpusLoader(
	"jeita", ChasenCorpusReader, r".*\.chasen", encoding="utf-8"
	)
	knbc: KNBCorpusReader = LazyCorpusLoader(
	"knbc/corpus1", KNBCorpusReader, r"./KN.", encoding="euc-jp"
	)
	lin_thesaurus: LinThesaurusCorpusReader = LazyCorpusLoader(
	"lin_thesaurus", LinThesaurusCorpusReader, r".*\.lsp"
	)
	mac_morpho: MacMorphoCorpusReader = LazyCorpusLoader(
	"mac_morpho",
	MacMorphoCorpusReader,
	r"(?!\.).*\.txt",
	tagset="unknown",
	encoding="latin-1",
	)
	machado: PortugueseCategorizedPlaintextCorpusReader = LazyCorpusLoader(
	"machado",
	PortugueseCategorizedPlaintextCorpusReader,
	r"(?!\.).*\.txt",
	cat_pattern=r"([a-z])/.",
	encoding="latin-1",
	)
	masc_tagged: CategorizedTaggedCorpusReader = LazyCorpusLoader(
	"masc_tagged",
	CategorizedTaggedCorpusReader,
	r"(spoken\|written)/.*\.txt",
	cat_file="categories.txt",
	tagset="wsj",
	encoding="utf-8",
	sep="_",
	)
	movie_reviews: CategorizedPlaintextCorpusReader = LazyCorpusLoader(
	"movie_reviews",
	CategorizedPlaintextCorpusReader,
	r"(?!\.).*\.txt",
	cat_pattern=r"(neg\|pos)/.*",
	encoding="ascii",
	)
	multext_east: MTECorpusReader = LazyCorpusLoader(
	"mte_teip5", MTECorpusReader, r"(oana).*\.xml", encoding="utf-8"
	)
	names: WordListCorpusReader = LazyCorpusLoader(
	"names", WordListCorpusReader, r"(?!\.).*\.txt", encoding="ascii"
	)
	nps_chat: NPSChatCorpusReader = LazyCorpusLoader(
	"nps_chat", NPSChatCorpusReader, r"(?!README\|\.).*\.xml", tagset="wsj"
	)
	opinion_lexicon: OpinionLexiconCorpusReader = LazyCorpusLoader(
	"opinion_lexicon",
	OpinionLexiconCorpusReader,
	r"(\w+)\-words\.txt",
	encoding="ISO-8859-2",
	)
	ppattach: PPAttachmentCorpusReader = LazyCorpusLoader(
	"ppattach", PPAttachmentCorpusReader, ["training", "test", "devset"]
	)
	product_reviews_1: ReviewsCorpusReader = LazyCorpusLoader(
	"product_reviews_1", ReviewsCorpusReader, r"^(?!Readme).*\.txt", encoding="utf8"
	)
	product_reviews_2: ReviewsCorpusReader = LazyCorpusLoader(
	"product_reviews_2", ReviewsCorpusReader, r"^(?!Readme).*\.txt", encoding="utf8"
	)
	pros_cons: ProsConsCorpusReader = LazyCorpusLoader(
	"pros_cons",
	ProsConsCorpusReader,
	r"Integrated(Cons\|Pros)\.txt",
	cat_pattern=r"Integrated(Cons\|Pros)\.txt",
	encoding="ISO-8859-2",
	)
	ptb: CategorizedBracketParseCorpusReader = (
	LazyCorpusLoader( # Penn Treebank v3: WSJ and Brown portions
	"ptb",
	CategorizedBracketParseCorpusReader,
	r"(WSJ/\d\d/WSJ_\d\d\|BROWN/C[A-Z]/C[A-Z])\d\d.MRG",
	cat_file="allcats.txt",
	tagset="wsj",
	)
	)
	qc: StringCategoryCorpusReader = LazyCorpusLoader(
	"qc", StringCategoryCorpusReader, ["train.txt", "test.txt"], encoding="ISO-8859-2"
	)
	reuters: CategorizedPlaintextCorpusReader = LazyCorpusLoader(
	"reuters",
	CategorizedPlaintextCorpusReader,
	"(training\|test).*",
	cat_file="cats.txt",
	encoding="ISO-8859-2",
	)
	rte: RTECorpusReader = LazyCorpusLoader("rte", RTECorpusReader, r"(?!\.).*\.xml")
	senseval: SensevalCorpusReader = LazyCorpusLoader(
	"senseval", SensevalCorpusReader, r"(?!\.).*\.pos"
	)
	sentence_polarity: CategorizedSentencesCorpusReader = LazyCorpusLoader(
	"sentence_polarity",
	CategorizedSentencesCorpusReader,
	r"rt-polarity\.(neg\|pos)",
	cat_pattern=r"rt-polarity\.(neg\|pos)",
	encoding="utf-8",
	)
	sentiwordnet: SentiWordNetCorpusReader = LazyCorpusLoader(
	"sentiwordnet", SentiWordNetCorpusReader, "SentiWordNet_3.0.0.txt", encoding="utf-8"
	)
	shakespeare: XMLCorpusReader = LazyCorpusLoader(
	"shakespeare", XMLCorpusReader, r"(?!\.).*\.xml"
	)
	sinica_treebank: SinicaTreebankCorpusReader = LazyCorpusLoader(
	"sinica_treebank",
	SinicaTreebankCorpusReader,
	["parsed"],
	tagset="unknown",
	encoding="utf-8",
	)
	state_union: PlaintextCorpusReader = LazyCorpusLoader(
	"state_union", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="ISO-8859-2"
	)
	stopwords: WordListCorpusReader = LazyCorpusLoader(
	"stopwords", WordListCorpusReader, r"(?!README\|\.).*", encoding="utf8"
	)
	subjectivity: CategorizedSentencesCorpusReader = LazyCorpusLoader(
	"subjectivity",
	CategorizedSentencesCorpusReader,
	r"(quote.tok.gt9\|plot.tok.gt9)\.5000",
	cat_map={"quote.tok.gt9.5000": ["subj"], "plot.tok.gt9.5000": ["obj"]},
	encoding="latin-1",
	)
	swadesh: SwadeshCorpusReader = LazyCorpusLoader(
	"swadesh", SwadeshCorpusReader, r"(?!README\|\.).*", encoding="utf8"
	)
	swadesh110: PanlexSwadeshCorpusReader = LazyCorpusLoader(
	"panlex_swadesh", PanlexSwadeshCorpusReader, r"swadesh110/.*\.txt", encoding="utf8"
	)
	swadesh207: PanlexSwadeshCorpusReader = LazyCorpusLoader(
	"panlex_swadesh", PanlexSwadeshCorpusReader, r"swadesh207/.*\.txt", encoding="utf8"
	)
	switchboard: SwitchboardCorpusReader = LazyCorpusLoader(
	"switchboard", SwitchboardCorpusReader, tagset="wsj"
	)
	timit: TimitCorpusReader = LazyCorpusLoader("timit", TimitCorpusReader)
	timit_tagged: TimitTaggedCorpusReader = LazyCorpusLoader(
	"timit", TimitTaggedCorpusReader, r".+\.tags", tagset="wsj", encoding="ascii"
	)
	toolbox: ToolboxCorpusReader = LazyCorpusLoader(
	"toolbox", ToolboxCorpusReader, r"(?!.(README\|\.)).\.(dic\|txt)"
	)
	treebank: BracketParseCorpusReader = LazyCorpusLoader(
	"treebank/combined",
	BracketParseCorpusReader,
	r"wsj_.*\.mrg",
	tagset="wsj",
	encoding="ascii",
	)
	treebank_chunk: ChunkedCorpusReader = LazyCorpusLoader(
	"treebank/tagged",
	ChunkedCorpusReader,
	r"wsj_.*\.pos",
	sent_tokenizer=RegexpTokenizer(r"(?<=/\.)\s(?![^\[]\])", gaps=True),
	para_block_reader=tagged_treebank_para_block_reader,
	tagset="wsj",
	encoding="ascii",
	)
	treebank_raw: PlaintextCorpusReader = LazyCorpusLoader(
	"treebank/raw", PlaintextCorpusReader, r"wsj_.*", encoding="ISO-8859-2"
	)
	twitter_samples: TwitterCorpusReader = LazyCorpusLoader(
	"twitter_samples", TwitterCorpusReader, r".*\.json"
	)
	udhr: UdhrCorpusReader = LazyCorpusLoader("udhr", UdhrCorpusReader)
	udhr2: PlaintextCorpusReader = LazyCorpusLoader(
	"udhr2", PlaintextCorpusReader, r".*\.txt", encoding="utf8"
	)
	universal_treebanks: ConllCorpusReader = LazyCorpusLoader(
	"universal_treebanks_v20",
	ConllCorpusReader,
	r".*\.conll",
	columntypes=(
	"ignore",
	"words",
	"ignore",
	"ignore",
	"pos",
	"ignore",
	"ignore",
	"ignore",
	"ignore",
	"ignore",
	),
	)
	verbnet: VerbnetCorpusReader = LazyCorpusLoader(
	"verbnet", VerbnetCorpusReader, r"(?!\.).*\.xml"
	)
	webtext: PlaintextCorpusReader = LazyCorpusLoader(
	"webtext", PlaintextCorpusReader, r"(?!README\|\.).*\.txt", encoding="ISO-8859-2"
	)
	wordnet: WordNetCorpusReader = LazyCorpusLoader(
	"wordnet",
	WordNetCorpusReader,
	LazyCorpusLoader("omw-1.4", CorpusReader, r"./wn-data-.\.tab", encoding="utf8"),
	)
	wordnet31: WordNetCorpusReader = LazyCorpusLoader(
	"wordnet31",
	WordNetCorpusReader,
	LazyCorpusLoader("omw-1.4", CorpusReader, r"./wn-data-.\.tab", encoding="utf8"),
	)
	wordnet2021: WordNetCorpusReader = LazyCorpusLoader(
	"wordnet2021",
	WordNetCorpusReader,
	LazyCorpusLoader("omw-1.4", CorpusReader, r"./wn-data-.\.tab", encoding="utf8"),
	)
	wordnet_ic: WordNetICCorpusReader = LazyCorpusLoader(
	"wordnet_ic", WordNetICCorpusReader, r".*\.dat"
	)
	words: WordListCorpusReader = LazyCorpusLoader(
	"words", WordListCorpusReader, r"(?!README\|\.).*", encoding="ascii"
	)

	# defined after treebank
	propbank: PropbankCorpusReader = LazyCorpusLoader(
	"propbank",
	PropbankCorpusReader,
	"prop.txt",
	r"frames/.*\.xml",
	"verbs.txt",
	lambda filename: re.sub(r"^wsj/\d\d/", "", filename),
	treebank,
	) # Must be defined after treebank corpus.
	nombank: NombankCorpusReader = LazyCorpusLoader(
	"nombank.1.0",
	NombankCorpusReader,
	"nombank.1.0",
	r"frames/.*\.xml",
	"nombank.1.0.words",
	lambda filename: re.sub(r"^wsj/\d\d/", "", filename),
	treebank,
	) # Must be defined after treebank corpus.
	propbank_ptb: PropbankCorpusReader = LazyCorpusLoader(
	"propbank",
	PropbankCorpusReader,
	"prop.txt",
	r"frames/.*\.xml",
	"verbs.txt",
	lambda filename: filename.upper(),
	ptb,
	) # Must be defined after ptb corpus.
	nombank_ptb: NombankCorpusReader = LazyCorpusLoader(
	"nombank.1.0",
	NombankCorpusReader,
	"nombank.1.0",
	r"frames/.*\.xml",
	"nombank.1.0.words",
	lambda filename: filename.upper(),
	ptb,
	) # Must be defined after ptb corpus.
	semcor: SemcorCorpusReader = LazyCorpusLoader(
	"semcor", SemcorCorpusReader, r"brown./tagfiles/br-.*\.xml", wordnet
	) # Must be defined after wordnet corpus.

	nonbreaking_prefixes: NonbreakingPrefixesCorpusReader = LazyCorpusLoader(
	"nonbreaking_prefixes",
	NonbreakingPrefixesCorpusReader,
	r"(?!README\|\.).*",
	encoding="utf8",
	)
	perluniprops: UnicharsCorpusReader = LazyCorpusLoader(
	"perluniprops",
	UnicharsCorpusReader,
	r"(?!README\|\.).*",
	nltk_data_subdir="misc",
	encoding="utf8",
	)

	# mwa_ppdb = LazyCorpusLoader(
	# 'mwa_ppdb', MWAPPDBCorpusReader, r'(?!README\|\.).*', nltk_data_subdir='misc', encoding='utf8')

	# See https://github.com/nltk/nltk/issues/1579
	# and https://github.com/nltk/nltk/issues/1716
	#
	# pl196x = LazyCorpusLoader(
	# 'pl196x', Pl196xCorpusReader, r'[a-z]-.*\.xml',
	# cat_file='cats.txt', textid_file='textids.txt', encoding='utf8')
	#
	# ipipan = LazyCorpusLoader(
	# 'ipipan', IPIPANCorpusReader, r'(?!\.).*morph\.xml')
	#
	# nkjp = LazyCorpusLoader(
	# 'nkjp', NKJPCorpusReader, r'', encoding='utf8')
	#
	# panlex_lite = LazyCorpusLoader(
	# 'panlex_lite', PanLexLiteCorpusReader)
	#
	# ycoe = LazyCorpusLoader(
	# 'ycoe', YCOECorpusReader)
	#
	# corpus not available with NLTK; these lines caused help(nltk.corpus) to break
	# hebrew_treebank = LazyCorpusLoader(
	# 'hebrew_treebank', BracketParseCorpusReader, r'.*\.txt')

	# FIXME: override any imported demo from various corpora, see https://github.com/nltk/nltk/issues/2116
	def demo():
	# This is out-of-date:
	abc.demo()
	brown.demo()
	# chat80.demo()
	cmudict.demo()
	conll2000.demo()
	conll2002.demo()
	genesis.demo()
	gutenberg.demo()
	ieer.demo()
	inaugural.demo()
	indian.demo()
	names.demo()
	ppattach.demo()
	senseval.demo()
	shakespeare.demo()
	sinica_treebank.demo()
	state_union.demo()
	stopwords.demo()
	timit.demo()
	toolbox.demo()
	treebank.demo()
	udhr.demo()
	webtext.demo()
	words.demo()


	# ycoe.demo()

	if __name__ == "__main__":
	# demo()
	pass