# CHILDES XML Corpus Reader
# Copyright (C) 2001-2023 NLTK Project
# Author: Tomonori Nagano <[email protected]>
#         Alexis Dimitriadis <[email protected]>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Corpus reader for the XML version of the CHILDES corpus.
"""

__docformat__ = "epytext en"

import re
from collections import defaultdict

from nltk.corpus.reader.util import concat
from nltk.corpus.reader.xmldocs import ElementTree, XMLCorpusReader
from nltk.util import LazyConcatenation, LazyMap, flatten

# to resolve the namespace issue
NS = "http://www.talkbank.org/ns/talkbank"


class CHILDESCorpusReader(XMLCorpusReader):
    """
    Corpus reader for the XML version of the CHILDES corpus.

    The CHILDES corpus is available at ``https://childes.talkbank.org/``. The XML
    version of CHILDES is located at ``https://childes.talkbank.org/data-xml/``.
    Copy the needed parts of the CHILDES XML corpus into the NLTK data directory
    (``nltk_data/corpora/CHILDES/``).

    For access to the file text use the usual nltk functions,
    ``words()``, ``sents()``, ``tagged_words()`` and ``tagged_sents()``.
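
    A minimal usage sketch (illustrative; it assumes, hypothetically, that
    some English CHILDES data has been copied into the directory above)::

        import nltk
        from nltk.corpus.reader import CHILDESCorpusReader

        corpus_root = nltk.data.find("corpora/childes/data-xml/Eng-USA/")
        childes = CHILDESCorpusReader(corpus_root, r".*\.xml")
        childes.words(childes.fileids()[0])[:7]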
""" | |
def __init__(self, root, fileids, lazy=True): | |
XMLCorpusReader.__init__(self, root, fileids) | |
self._lazy = lazy | |

    def words(
        self,
        fileids=None,
        speaker="ALL",
        stem=False,
        relation=False,
        strip_space=True,
        replace=False,
    ):
        """
        :return: the given file(s) as a list of words
        :rtype: list(str)

        :param speaker: If specified, select specific speaker(s) defined
            in the corpus. Default is 'ALL' (all participants). Common choices
            are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
            researchers)
        :param stem: If true, then use word stems instead of word strings.
        :param relation: If true, then return tuples of (stem, index,
            dependent_index)
        :param strip_space: If true, then strip trailing spaces from word
            tokens. Otherwise, leave the spaces on the tokens.
        :param replace: If true, then use the replaced (intended) word instead
            of the original word (e.g., 'wat' will be replaced with 'watch')
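
        Illustrative calls (a sketch; ``fileid`` stands for any id returned
        by ``fileids()``)::

            childes.words(fileid, speaker="CHI")  # the child's tokens only
            childes.words(fileid, stem=True)      # stems, e.g. 'read-PAST'
            childes.words(fileid, replace=True)   # intended (replaced) forms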
""" | |
sent = None | |
pos = False | |
if not self._lazy: | |
return [ | |
self._get_words( | |
fileid, speaker, sent, stem, relation, pos, strip_space, replace | |
) | |
for fileid in self.abspaths(fileids) | |
] | |
get_words = lambda fileid: self._get_words( | |
fileid, speaker, sent, stem, relation, pos, strip_space, replace | |
) | |
return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids))) | |

    def tagged_words(
        self,
        fileids=None,
        speaker="ALL",
        stem=False,
        relation=False,
        strip_space=True,
        replace=False,
    ):
        """
        :return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            ``(word,tag)``.
        :rtype: list(tuple(str,str))

        :param speaker: If specified, select specific speaker(s) defined
            in the corpus. Default is 'ALL' (all participants). Common choices
            are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
            researchers)
        :param stem: If true, then use word stems instead of word strings.
        :param relation: If true, then return tuples of (stem, index,
            dependent_index)
        :param strip_space: If true, then strip trailing spaces from word
            tokens. Otherwise, leave the spaces on the tokens.
        :param replace: If true, then use the replaced (intended) word instead
            of the original word (e.g., 'wat' will be replaced with 'watch')
        """
        sent = None
        pos = True
        if not self._lazy:
            return [
                self._get_words(
                    fileid, speaker, sent, stem, relation, pos, strip_space, replace
                )
                for fileid in self.abspaths(fileids)
            ]
        get_words = lambda fileid: self._get_words(
            fileid, speaker, sent, stem, relation, pos, strip_space, replace
        )
        return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))

    def sents(
        self,
        fileids=None,
        speaker="ALL",
        stem=False,
        relation=None,
        strip_space=True,
        replace=False,
    ):
        """
        :return: the given file(s) as a list of sentences or utterances, each
            encoded as a list of word strings.
        :rtype: list(list(str))

        :param speaker: If specified, select specific speaker(s) defined
            in the corpus. Default is 'ALL' (all participants). Common choices
            are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
            researchers)
        :param stem: If true, then use word stems instead of word strings.
        :param relation: If true, then return tuples of ``(str,pos,relation_list)``.
            If there is manually-annotated relation info, it will return
            tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
        :param strip_space: If true, then strip trailing spaces from word
            tokens. Otherwise, leave the spaces on the tokens.
        :param replace: If true, then use the replaced (intended) word instead
            of the original word (e.g., 'wat' will be replaced with 'watch')
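
        For example, with ``relation=True`` a token may come back as a triple
        like ``('ball', 'n', '3|2|OBJ')``, where the last element packs the
        dependency as ``'index|head|relation'`` (the values shown here are
        hypothetical).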
""" | |
sent = True | |
pos = False | |
if not self._lazy: | |
return [ | |
self._get_words( | |
fileid, speaker, sent, stem, relation, pos, strip_space, replace | |
) | |
for fileid in self.abspaths(fileids) | |
] | |
get_words = lambda fileid: self._get_words( | |
fileid, speaker, sent, stem, relation, pos, strip_space, replace | |
) | |
return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids))) | |

    def tagged_sents(
        self,
        fileids=None,
        speaker="ALL",
        stem=False,
        relation=None,
        strip_space=True,
        replace=False,
    ):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a list of ``(word,tag)`` tuples.
        :rtype: list(list(tuple(str,str)))

        :param speaker: If specified, select specific speaker(s) defined
            in the corpus. Default is 'ALL' (all participants). Common choices
            are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
            researchers)
        :param stem: If true, then use word stems instead of word strings.
        :param relation: If true, then return tuples of ``(str,pos,relation_list)``.
            If there is manually-annotated relation info, it will return
            tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
        :param strip_space: If true, then strip trailing spaces from word
            tokens. Otherwise, leave the spaces on the tokens.
        :param replace: If true, then use the replaced (intended) word instead
            of the original word (e.g., 'wat' will be replaced with 'watch')
        """
        sent = True
        pos = True
        if not self._lazy:
            return [
                self._get_words(
                    fileid, speaker, sent, stem, relation, pos, strip_space, replace
                )
                for fileid in self.abspaths(fileids)
            ]
        get_words = lambda fileid: self._get_words(
            fileid, speaker, sent, stem, relation, pos, strip_space, replace
        )
        return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))

    def corpus(self, fileids=None):
        """
        :return: the given file(s) as a dict of ``(corpus_property_key, value)``
        :rtype: list(dict)
        """
        if not self._lazy:
            return [self._get_corpus(fileid) for fileid in self.abspaths(fileids)]
        return LazyMap(self._get_corpus, self.abspaths(fileids))

    def _get_corpus(self, fileid):
        results = dict()
        xmldoc = ElementTree.parse(fileid).getroot()
        for key, value in xmldoc.items():
            results[key] = value
        return results
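
    # Illustrative output (the keys are whatever attributes appear on the
    # CHAT root element of the file; the values below are hypothetical):
    #
    #   childes.corpus(childes.fileids()[0])[0]
    #   # -> {'Corpus': 'valian', 'Id': '01a', 'Lang': 'eng', ...}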

    def participants(self, fileids=None):
        """
        :return: the given file(s) as a dict of
            ``(participant_property_key, value)``
        :rtype: list(dict)
        """
        if not self._lazy:
            return [self._get_participants(fileid) for fileid in self.abspaths(fileids)]
        return LazyMap(self._get_participants, self.abspaths(fileids))

    def _get_participants(self, fileid):
        # multidimensional dicts
        def dictOfDicts():
            return defaultdict(dictOfDicts)

        xmldoc = ElementTree.parse(fileid).getroot()
        # getting participants' data
        pat = dictOfDicts()
        for participant in xmldoc.findall(
            f".//{{{NS}}}Participants/{{{NS}}}participant"
        ):
            for (key, value) in participant.items():
                pat[participant.get("id")][key] = value
        return pat
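
    # Illustrative output (attribute names and values are hypothetical; the
    # keys come from the participant elements of the file):
    #
    #   childes.participants(childes.fileids()[0])[0]["CHI"]
    #   # -> {'id': 'CHI', 'role': 'Target_Child', 'age': 'P2Y6M14D', ...}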

    def age(self, fileids=None, speaker="CHI", month=False):
        """
        :return: the given file(s) as a list of ages, each a CHILDES age
            string (or an int number of months if ``month=True``)
        :rtype: list(str) or list(int)

        :param month: If true, return months instead of the year-month-day
            age string
        """
        if not self._lazy:
            return [
                self._get_age(fileid, speaker, month)
                for fileid in self.abspaths(fileids)
            ]
        get_age = lambda fileid: self._get_age(fileid, speaker, month)
        return LazyMap(get_age, self.abspaths(fileids))

    def _get_age(self, fileid, speaker, month):
        xmldoc = ElementTree.parse(fileid).getroot()
        for pat in xmldoc.findall(f".//{{{NS}}}Participants/{{{NS}}}participant"):
            try:
                if pat.get("id") == speaker:
                    age = pat.get("age")
                    if month:
                        age = self.convert_age(age)
                    return age
            # some files don't have age data
            except (TypeError, AttributeError) as e:
                return None

    def convert_age(self, age_year):
        "Calculate age in months from a string in CHILDES format"
        m = re.match(r"P(\d+)Y(\d+)M?(\d?\d?)D?", age_year)
        age_month = int(m.group(1)) * 12 + int(m.group(2))
        try:
            if int(m.group(3)) > 15:
                age_month += 1
        # some corpora don't have age information?
        except ValueError as e:
            pass
        return age_month
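
    # Worked example: "P2Y6M14D" (2 years, 6 months, 14 days) yields
    # 2 * 12 + 6 = 30 months; the day field (14) is not > 15, so the result
    # is not rounded up. "P2Y6M20D" would instead yield 31.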

    def MLU(self, fileids=None, speaker="CHI"):
        """
        :return: the given file(s) as a list of MLU (mean length of
            utterance) values, one float per file
        :rtype: list(float)
        """
        if not self._lazy:
            return [
                self._getMLU(fileid, speaker=speaker)
                for fileid in self.abspaths(fileids)
            ]
        get_MLU = lambda fileid: self._getMLU(fileid, speaker=speaker)
        return LazyMap(get_MLU, self.abspaths(fileids))

    def _getMLU(self, fileid, speaker):
        sents = self._get_words(
            fileid,
            speaker=speaker,
            sent=True,
            stem=True,
            relation=False,
            pos=True,
            strip_space=True,
            replace=True,
        )
        results = []
        lastSent = []
        numFillers = 0
        sentDiscount = 0
        for sent in sents:
            posList = [pos for (word, pos) in sent]
            # if any part of the sentence is unintelligible
            if any(pos == "unk" for pos in posList):
                continue
            # if the sentence is null
            elif sent == []:
                continue
            # if the sentence is the same as the last sent
            elif sent == lastSent:
                continue
            else:
                results.append([word for (word, pos) in sent])
                # count number of fillers
                if len({"co", None}.intersection(posList)) > 0:
                    numFillers += posList.count("co")
                    numFillers += posList.count(None)
                    sentDiscount += 1
            lastSent = sent
        try:
            thisWordList = flatten(results)
            # count number of morphemes
            # (e.g., 'read' = 1 morpheme but 'read-PAST' is 2 morphemes)
            numWords = (
                len(flatten([word.split("-") for word in thisWordList])) - numFillers
            )
            numSents = len(results) - sentDiscount
            mlu = numWords / numSents
        except ZeroDivisionError:
            mlu = 0
        # return {'mlu':mlu,'wordNum':numWords,'sentNum':numSents}
        return mlu
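
    # Example of the morpheme count above: the stemmed token 'read-PAST'
    # splits on '-' into two morphemes, while 'read' counts as one; tokens
    # tagged 'co' (fillers) or left untagged are subtracted via numFillers.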

    def _get_words(
        self, fileid, speaker, sent, stem, relation, pos, strip_space, replace
    ):
        if (
            isinstance(speaker, str) and speaker != "ALL"
        ):  # ensure we have a list of speakers
            speaker = [speaker]
        xmldoc = ElementTree.parse(fileid).getroot()
        # processing each xml doc
        results = []
        for xmlsent in xmldoc.findall(".//{%s}u" % NS):
            sents = []
            # select speakers
            if speaker == "ALL" or xmlsent.get("who") in speaker:
                for xmlword in xmlsent.findall(".//{%s}w" % NS):
                    infl = None
                    suffixStem = None
                    suffixTag = None
                    # getting replaced words
                    # (test find() results against None: Element truthiness
                    # is deprecated and misleading for childless elements)
                    if (
                        replace
                        and xmlsent.find(f".//{{{NS}}}w/{{{NS}}}replacement")
                        is not None
                    ):
                        xmlword = xmlsent.find(
                            f".//{{{NS}}}w/{{{NS}}}replacement/{{{NS}}}w"
                        )
                    elif (
                        replace
                        and xmlsent.find(f".//{{{NS}}}w/{{{NS}}}wk") is not None
                    ):
                        xmlword = xmlsent.find(f".//{{{NS}}}w/{{{NS}}}wk")
                    # get text
                    if xmlword.text:
                        word = xmlword.text
                    else:
                        word = ""
                    # strip trailing space
                    if strip_space:
                        word = word.strip()
                    # stem
                    if relation or stem:
                        try:
                            xmlstem = xmlword.find(".//{%s}stem" % NS)
                            word = xmlstem.text
                        except AttributeError as e:
                            pass
                        # if there is an inflection
                        try:
                            xmlinfl = xmlword.find(
                                f".//{{{NS}}}mor/{{{NS}}}mw/{{{NS}}}mk"
                            )
                            word += "-" + xmlinfl.text
                        except (AttributeError, TypeError):
                            pass
                        # if there is a suffix
                        try:
                            xmlsuffix = xmlword.find(
                                ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}stem"
                                % (NS, NS, NS, NS)
                            )
                            suffixStem = xmlsuffix.text
                        except AttributeError:
                            suffixStem = ""
                        if suffixStem:
                            word += "~" + suffixStem
                    # pos
                    if relation or pos:
                        try:
                            xmlpos = xmlword.findall(".//{%s}c" % NS)
                            xmlpos2 = xmlword.findall(".//{%s}s" % NS)
                            if xmlpos2 != []:
                                tag = xmlpos[0].text + ":" + xmlpos2[0].text
                            else:
                                tag = xmlpos[0].text
                        except (AttributeError, IndexError) as e:
                            tag = ""
                        try:
                            xmlsuffixpos = xmlword.findall(
                                ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}c"
                                % (NS, NS, NS, NS, NS)
                            )
                            xmlsuffixpos2 = xmlword.findall(
                                ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}s"
                                % (NS, NS, NS, NS, NS)
                            )
                            if xmlsuffixpos2:
                                suffixTag = (
                                    xmlsuffixpos[0].text + ":" + xmlsuffixpos2[0].text
                                )
                            else:
                                suffixTag = xmlsuffixpos[0].text
                        except (AttributeError, IndexError, TypeError):
                            pass
                        if suffixTag:
                            tag += "~" + suffixTag
                        word = (word, tag)
                    # relational
                    # the gold standard is stored in
                    # <mor></mor><mor type="trn"><gra type="grt">
                    if relation:
                        for xmlstem_rel in xmlword.findall(
                            f".//{{{NS}}}mor/{{{NS}}}gra"
                        ):
                            if not xmlstem_rel.get("type") == "grt":
                                word = (
                                    word[0],
                                    word[1],
                                    xmlstem_rel.get("index")
                                    + "|"
                                    + xmlstem_rel.get("head")
                                    + "|"
                                    + xmlstem_rel.get("relation"),
                                )
                            else:
                                word = (
                                    word[0],
                                    word[1],
                                    word[2],
                                    word[0],
                                    word[1],
                                    xmlstem_rel.get("index")
                                    + "|"
                                    + xmlstem_rel.get("head")
                                    + "|"
                                    + xmlstem_rel.get("relation"),
                                )
                        try:
                            for xmlpost_rel in xmlword.findall(
                                f".//{{{NS}}}mor/{{{NS}}}mor-post/{{{NS}}}gra"
                            ):
                                if not xmlpost_rel.get("type") == "grt":
                                    suffixStem = (
                                        suffixStem[0],
                                        suffixStem[1],
                                        xmlpost_rel.get("index")
                                        + "|"
                                        + xmlpost_rel.get("head")
                                        + "|"
                                        + xmlpost_rel.get("relation"),
                                    )
                                else:
                                    suffixStem = (
                                        suffixStem[0],
                                        suffixStem[1],
                                        suffixStem[2],
                                        suffixStem[0],
                                        suffixStem[1],
                                        xmlpost_rel.get("index")
                                        + "|"
                                        + xmlpost_rel.get("head")
                                        + "|"
                                        + xmlpost_rel.get("relation"),
                                    )
                        except Exception:
                            pass
                    sents.append(word)
                if sent or relation:
                    results.append(sents)
                else:
                    results.extend(sents)
        return LazyMap(lambda x: x, results)

    # Ready-to-use browser opener

    """
    The base URL for viewing files on the childes website. This
    shouldn't need to be changed, unless CHILDES changes the configuration
    of their server or unless the user sets up their own corpus webserver.
    """
    childes_url_base = r"https://childes.talkbank.org/browser/index.php?url="

    def webview_file(self, fileid, urlbase=None):
        """Map a corpus file to its web version on the CHILDES website,
        and open it in a web browser.

        The complete URL to be used is:
            childes.childes_url_base + urlbase + fileid.replace('.xml', '.cha')

        If no urlbase is passed, we try to calculate it.  This
        requires that the childes corpus was set up to mirror the
        folder hierarchy under childes.talkbank.org/data-xml/, e.g.:
            nltk_data/corpora/childes/Eng-USA/Cornell/??? or
            nltk_data/corpora/childes/Romance/Spanish/Aguirre/???

        The function first looks (as a special case) if "Eng-USA" is
        on the path consisting of <corpus root>+fileid; then if
        "childes", possibly followed by "data-xml", appears. If neither
        one is found, we use the unmodified fileid and hope for the best.
        If this is not right, specify urlbase explicitly, e.g., if the
        corpus root points to the Cornell folder, urlbase='Eng-USA/Cornell'.
        """
        import webbrowser

        if urlbase:
            path = urlbase + "/" + fileid
        else:
            full = self.root + "/" + fileid
            full = re.sub(r"\\", "/", full)
            if "/childes/" in full.lower():
                # Discard /data-xml/ if present
                path = re.findall(r"(?i)/childes(?:/data-xml)?/(.*)\.xml", full)[0]
            elif "eng-usa" in full.lower():
                # the (?i) flag must lead the pattern (an error elsewhere in
                # Python 3.11+)
                path = "Eng-USA/" + re.findall(r"(?i)/Eng-USA/(.*)\.xml", full)[0]
            else:
                path = fileid

        # Strip ".xml" and add ".cha", as necessary:
        if path.endswith(".xml"):
            path = path[:-4]
        if not path.endswith(".cha"):
            path = path + ".cha"

        url = self.childes_url_base + path

        webbrowser.open_new_tab(url)
        print("Opening in browser:", url)
        # Pausing is a good idea, but it's up to the user...
        # raw_input("Hit Return to continue")
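
    # Illustrative call (the fileid is hypothetical; it assumes the corpus
    # root mirrors the CHILDES folder hierarchy described above):
    #
    #   childes.webview_file("Valian/01a.xml")
    #   # opens https://childes.talkbank.org/browser/index.php?url=Eng-USA/Valian/01a.cha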


def demo(corpus_root=None):
    """
    The CHILDES corpus should be manually downloaded and saved
    to ``[NLTK_Data_Dir]/corpora/childes/``
    """
    if not corpus_root:
        from nltk.data import find

        corpus_root = find("corpora/childes/data-xml/Eng-USA/")

    try:
        childes = CHILDESCorpusReader(corpus_root, r".*\.xml")
        # describe the first five files of the corpus
        for file in childes.fileids()[:5]:
            corpus = ""
            corpus_id = ""
            for (key, value) in childes.corpus(file)[0].items():
                if key == "Corpus":
                    corpus = value
                if key == "Id":
                    corpus_id = value
            print("Reading", corpus, corpus_id, " .....")
            print("words:", childes.words(file)[:7], "...")
            print(
                "words with replaced words:",
                childes.words(file, replace=True)[:7],
                " ...",
            )
            print("words with pos tags:", childes.tagged_words(file)[:7], " ...")
            print("words (only MOT):", childes.words(file, speaker="MOT")[:7], "...")
            print("words (only CHI):", childes.words(file, speaker="CHI")[:7], "...")
            print("stemmed words:", childes.words(file, stem=True)[:7], " ...")
            print(
                "words with relations and pos-tag:",
                childes.words(file, relation=True)[:5],
                " ...",
            )
            print("sentence:", childes.sents(file)[:2], " ...")
            for (participant, values) in childes.participants(file)[0].items():
                for (key, value) in values.items():
                    print("\tparticipant", participant, key, ":", value)
            print("num of sent:", len(childes.sents(file)))
            print("num of morphemes:", len(childes.words(file, stem=True)))
            print("age:", childes.age(file))
            print("age in month:", childes.age(file, month=True))
            print("MLU:", childes.MLU(file))
            print()

    except LookupError as e:
        print(
            """The CHILDES corpus, or the parts you need, should be manually
        downloaded from https://childes.talkbank.org/data-xml/ and saved at
        [NLTK_Data_Dir]/corpora/childes/
            Alternately, you can call the demo with the path to a portion of the CHILDES corpus, e.g.:
        demo('/path/to/childes/data-xml/Eng-USA/')
        """
        )
        # corpus_root_http = urllib2.urlopen('https://childes.talkbank.org/data-xml/Eng-USA/Bates.zip')
        # corpus_root_http_bates = zipfile.ZipFile(cStringIO.StringIO(corpus_root_http.read()))
        ## this fails
        # childes = CHILDESCorpusReader(corpus_root_http_bates,corpus_root_http_bates.namelist())


if __name__ == "__main__":
    demo()